Question 1
library(dplyr)
pacman::p_load(nycflights13)
View(flights) # this View() function opens lets you directly view the whole dataset
glimpse(flights) # this glimpse() function provides a quick overview of the dataset
Rows: 336,776
Columns: 19
$ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
$ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, 558, 558, 558, 559, 559,…
$ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, 600, 600, 600, 600, 559,…
$ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1, 0, -1, 0, 0, 1, -8, -3…
$ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849, 853, 924, 923, 941, 702…
$ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851, 856, 917, 937, 910, 706…
$ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -14, 31, -4, -8, -7, 12, -…
$ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "AA", "B6", "B6", "UA", "…
$ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 49, 71, 194, 1124, 707, 1…
$ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N39463", "N516JB", "N829AS",…
$ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA", "JFK", "LGA", "JFK", "J…
$ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD", "MCO", "ORD", "PBI", "T…
$ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 158, 345, 361, 257, 44, 3…
$ distance <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, 1028, 1005, 2475, 2565, …
$ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6…
$ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0, 0, 0, 0, 10, 5, 10, 10…
$ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 05:0…
summary(flights)
year month day dep_time sched_dep_time dep_delay
Min. :2013 Min. : 1.000 Min. : 1.00 Min. : 1 Min. : 106 Min. : -43.00
1st Qu.:2013 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.: 907 1st Qu.: 906 1st Qu.: -5.00
Median :2013 Median : 7.000 Median :16.00 Median :1401 Median :1359 Median : -2.00
Mean :2013 Mean : 6.549 Mean :15.71 Mean :1349 Mean :1344 Mean : 12.64
3rd Qu.:2013 3rd Qu.:10.000 3rd Qu.:23.00 3rd Qu.:1744 3rd Qu.:1729 3rd Qu.: 11.00
Max. :2013 Max. :12.000 Max. :31.00 Max. :2400 Max. :2359 Max. :1301.00
NA's :8255 NA's :8255
arr_time sched_arr_time arr_delay carrier flight tailnum
Min. : 1 Min. : 1 Min. : -86.000 Length:336776 Min. : 1 Length:336776
1st Qu.:1104 1st Qu.:1124 1st Qu.: -17.000 Class :character 1st Qu.: 553 Class :character
Median :1535 Median :1556 Median : -5.000 Mode :character Median :1496 Mode :character
Mean :1502 Mean :1536 Mean : 6.895 Mean :1972
3rd Qu.:1940 3rd Qu.:1945 3rd Qu.: 14.000 3rd Qu.:3465
Max. :2400 Max. :2359 Max. :1272.000 Max. :8500
NA's :8713 NA's :9430
origin dest air_time distance hour minute
Length:336776 Length:336776 Min. : 20.0 Min. : 17 Min. : 1.00 Min. : 0.00
Class :character Class :character 1st Qu.: 82.0 1st Qu.: 502 1st Qu.: 9.00 1st Qu.: 8.00
Mode :character Mode :character Median :129.0 Median : 872 Median :13.00 Median :29.00
Mean :150.7 Mean :1040 Mean :13.18 Mean :26.23
3rd Qu.:192.0 3rd Qu.:1389 3rd Qu.:17.00 3rd Qu.:44.00
Max. :695.0 Max. :4983 Max. :23.00 Max. :59.00
NA's :9430
time_hour
Min. :2013-01-01 05:00:00.00
1st Qu.:2013-04-04 13:00:00.00
Median :2013-07-03 10:00:00.00
Mean :2013-07-03 05:22:54.64
3rd Qu.:2013-10-01 07:00:00.00
Max. :2013-12-31 23:00:00.00
Question 2
sortf <- arrange(flights,desc(dep_delay))
select(sortf, carrier, flight, tailnum, everything())
maxdep <- max(flights$dep_delay, na.rm=TRUE)
maxdep_id <- which(flights$dep_delay==maxdep)
flights[maxdep_id, 10:12]
select(flights, starts_with("dep"))
summarise(flights, delay=mean(dep_delay,na.rm=TRUE))
Question 3
not_cancelled <- flights %>%
filter(!is.na(dep_delay))
not_cancelled %>%
group_by(year, month, day) %>%
summarise(mean = mean(dep_delay))
`summarise()` has grouped output by 'year', 'month'. You can override using the `.groups` argument.
flights %>%
group_by(year, month, day) %>%
summarise(mean = mean(dep_delay, na.rm = TRUE))
`summarise()` has grouped output by 'year', 'month'. You can override using the `.groups` argument.
delays <- not_cancelled %>%
group_by(tailnum) %>%
summarise(
delay = mean(arr_delay)
)
not_cancelled <- flights %>%
filter(!is.na(dep_delay), !is.na(arr_delay))
Question 4
# Step 1: Compute the average arrival delay per tailnum
avg_arr_delay <- flights %>%
filter(!is.na(arr_delay)) %>%
group_by(tailnum) %>%
summarise(avg_arr_delay = mean(arr_delay, na.rm = TRUE))
# Step 2: Get the tailnum with the lowest average arrival delay
min_tailnum <- avg_arr_delay %>%
filter(avg_arr_delay == min(avg_arr_delay)) %>%
select(tailnum)
# Print the result
print(min_tailnum)
Question 5
not_cancelled %>%
group_by(year, month, day) %>%
summarise(
first = min(dep_time),
last = max(dep_time)
)
`summarise()` has grouped output by 'year', 'month'. You can override using the `.groups` argument.
Question 6
# Compute the proportion of flights delayed by more than an hour for each month
delay_proportion <- flights %>%
filter(!is.na(dep_delay)) %>% # Remove NA values (cancelled flights)
group_by(month) %>%
summarise(
total_flights = n(),
delayed_flights = sum(dep_delay > 60),
proportion = delayed_flights / total_flights
)
print(delay_proportion)
Question 7
not_cancelled %>%
group_by(dest) %>%
summarise(carriers = n_distinct(carrier)) %>%
arrange(desc(carriers))
Question 8 not needed in Code
Question 9
delays <- flights %>%
group_by(dest) %>%
summarise(
count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE)
) %>%
filter(count > 20, dest != "HNL")