# Question 1 ----
library(nycflights13)
library(tidyverse)
pacman::p_load(nycflights13)
# Print per-column summary statistics for the flights table.
flights %>%
  summary()
#> year month day dep_time sched_dep_time
#> Min. :2013 Min. : 1.000 Min. : 1.00 Min. : 1 Min. : 106
#> 1st Qu.:2013 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.: 907 1st Qu.: 906
#> Median :2013 Median : 7.000 Median :16.00 Median :1401 Median :1359
#> Mean :2013 Mean : 6.549 Mean :15.71 Mean :1349 Mean :1344
#> 3rd Qu.:2013 3rd Qu.:10.000 3rd Qu.:23.00 3rd Qu.:1744 3rd Qu.:1729
#> Max. :2013 Max. :12.000 Max. :31.00 Max. :2400 Max. :2359
#> NA's :8255
#> dep_delay arr_time sched_arr_time arr_delay carrier
#> Min. : -43.00 Min. : 1 Min. : 1 Min. : -86.000 Length:336776
#> 1st Qu.: -5.00 1st Qu.:1104 1st Qu.:1124 1st Qu.: -17.000 Class :character
#> Median : -2.00 Median :1535 Median :1556 Median : -5.000 Mode :character
#> Mean : 12.64 Mean :1502 Mean :1536 Mean : 6.895
#> 3rd Qu.: 11.00 3rd Qu.:1940 3rd Qu.:1945 3rd Qu.: 14.000
#> Max. :1301.00 Max. :2400 Max. :2359 Max. :1272.000
#> NA's :8255 NA's :8713 NA's :9430
#> flight tailnum origin dest air_time
#> Min. : 1 Length:336776 Length:336776 Length:336776 Min. : 20.0
#> 1st Qu.: 553 Class :character Class :character Class :character 1st Qu.: 82.0
#> Median :1496 Mode :character Mode :character Mode :character Median :129.0
#> Mean :1972 Mean :150.7
#> 3rd Qu.:3465 3rd Qu.:192.0
#> Max. :8500 Max. :695.0
#> NA's :9430
#> distance hour minute time_hour
#> Min. : 17 Min. : 1.00 Min. : 0.00 Min. :2013-01-01 05:00:00.00
#> 1st Qu.: 502 1st Qu.: 9.00 1st Qu.: 8.00 1st Qu.:2013-04-04 13:00:00.00
#> Median : 872 Median :13.00 Median :29.00 Median :2013-07-03 10:00:00.00
#> Mean :1040 Mean :13.18 Mean :26.23 Mean :2013-07-03 05:22:54.64
#> 3rd Qu.:1389 3rd Qu.:17.00 3rd Qu.:44.00 3rd Qu.:2013-10-01 07:00:00.00
#> Max. :4983 Max. :23.00 Max. :59.00 Max. :2013-12-31 23:00:00.00
# Open the flights table in the data viewer. View() needs an interactive
# session, so skip the call when the script is run non-interactively
# (e.g. via Rscript or while knitting).
if (interactive()) {
  View(flights)
}
# Question 2 ----
# Largest recorded departure delay; missing delays are ignored.
maxdep <- max(flights$dep_delay, na.rm = TRUE)
# Row position(s) of the flight(s) with that delay. which() drops the NA
# results produced by comparing missing dep_delay values.
maxdep_id <- which(flights$dep_delay == maxdep)
# Pick the identifying columns by name rather than by position (10:12), so the
# result does not silently change if the column order of the table changes.
flights[maxdep_id, c("carrier", "flight", "tailnum")]
# Flights that have both a departure delay and an arrival delay recorded.
not_cancelled <- filter(
  flights,
  !(is.na(dep_delay) | is.na(arr_delay))
)
# Show only the columns whose names begin with "dep".
flights %>%
  select(starts_with("dep"))
# Order flights from the largest departure delay to the smallest.
sortf <- flights %>%
  arrange(desc(dep_delay))
# Show the sorted table with carrier, flight and tailnum moved to the front,
# followed by every remaining column.
sortf %>%
  select(carrier, flight, tailnum, everything())
# Smallest and largest dep_time value for each (year, month, day) among the
# not_cancelled flights.
# `.groups = "drop"` returns an ungrouped result, so no grouping lingers on the
# output, and it silences the "grouped output" message that summarise() would
# otherwise emit.
not_cancelled %>%
  group_by(year, month, day) %>%
  summarise(
    first = min(dep_time),
    last = max(dep_time),
    .groups = "drop"
  )
#> `summarise()` has grouped output by 'year', 'month'. You can override using the `.groups`
#> argument.
# Question 3 ----
# Rebind not_cancelled to the flights that have a departure delay recorded
# (only dep_delay is checked here).
not_cancelled <- filter(flights, !is.na(dep_delay))
# Mean departure delay for each (year, month, day) among the not_cancelled
# flights.
# `.groups = "drop"` returns an ungrouped result and silences the
# "grouped output" message that summarise() would otherwise emit.
not_cancelled %>%
  group_by(year, month, day) %>%
  summarise(
    mean = mean(dep_delay),
    .groups = "drop"
  )
# Rebind not_cancelled to the flights that have both a departure delay and an
# arrival delay recorded.
not_cancelled <- filter(
  flights,
  !is.na(dep_delay) & !is.na(arr_delay)
)
# Mean arrival delay per tail number, computed over the not_cancelled flights.
delays <- summarise(
  group_by(not_cancelled, tailnum),
  delay = mean(arr_delay)
)
# Mean departure delay for each (year, month, day) over all flights; missing
# dep_delay values are skipped via na.rm = TRUE.
# `.groups = "drop"` returns an ungrouped result and silences the
# "grouped output" message that summarise() would otherwise emit.
flights %>%
  group_by(year, month, day) %>%
  summarise(
    mean = mean(dep_delay, na.rm = TRUE),
    .groups = "drop"
  )
#> `summarise()` has grouped output by 'year', 'month'. You can override using the `.groups` argument.
# Show flights with the tailnum column renamed to tail_num. The result is not
# assigned, so flights itself keeps its original column name.
flights %>%
  rename(tail_num = tailnum)
NA
# Mean arrival delay per tail number over all flights, skipping missing
# arr_delay values.
avg_arr_delay <- summarise(
  group_by(flights, tailnum),
  avg_arr_delay = mean(arr_delay, na.rm = TRUE)
)
# Keep the row(s) whose mean arrival delay equals the smallest mean. Inside
# filter() the name avg_arr_delay refers to the column of the same name.
min_delay_tailnum <- filter(
  avg_arr_delay,
  avg_arr_delay == min(avg_arr_delay, na.rm = TRUE)
)
# Question 4 ----
# Display the tail number(s) with the smallest mean arrival delay.
min_delay_tailnum %>%
  print()
# Question 5 ----
# Smallest and largest dep_time value for each (year, month, day) among the
# not_cancelled flights.
# `.groups = "drop"` returns an ungrouped result, so no grouping lingers on the
# output, and it silences the "grouped output" message that summarise() would
# otherwise emit.
not_cancelled %>%
  group_by(year, month, day) %>%
  summarise(
    first = min(dep_time),
    last = max(dep_time),
    .groups = "drop"
  )
#> `summarise()` has grouped output by 'year', 'month'. You can override using the `.groups` argument.
# Question 6 ----
# Per-month share of flights with dep_delay > 60, largest share first.
# total_flights is n(), i.e. every row of the month, including rows whose
# dep_delay is NA; those rows are skipped only when counting delayed_flights.
monthly_delay_proportion <- flights %>%
  group_by(month) %>%
  summarise(
    total_flights = n(),
    delayed_flights = sum(dep_delay > 60, na.rm = TRUE)
  ) %>%
  mutate(proportion_delayed = delayed_flights / total_flights) %>%
  arrange(desc(proportion_delayed))
monthly_delay_proportion %>%
  print()
# Question 7 ----
# Number of distinct carrier values recorded for each dest, sorted so the
# destinations with the most carriers come first.
dest_carrier_counts <- arrange(
  summarise(
    group_by(flights, dest),
    num_carriers = n_distinct(carrier)
  ),
  desc(num_carriers)
)
dest_carrier_counts %>%
  print()