pacman::p_load(nycflights13)
library(nycflights13)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
glimpse(flights)
## Rows: 336,776
## Columns: 19
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
## $ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
## $ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
## $ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
## $ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
## $ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
## $ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
## $ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
## $ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
## $ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
## $ distance <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
## $ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
## $ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
## $ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…
maxdep <- max(flights$dep_delay, na.rm = TRUE)
maxdep_id <- which(flights$dep_delay == maxdep)
flights[maxdep_id, 10:12]
flights %>%
group_by(year, month, day) %>%
summarise(mean = mean(dep_delay, na.rm = TRUE))
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
not_cancelled <- flights %>%
filter(!is.na(dep_delay))
not_cancelled %>%
group_by(year, month, day) %>%
summarise(mean = mean(dep_delay))
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
avg_arr_delay_by_tailnum <- flights %>%
filter(!is.na(arr_delay)) %>%
group_by(tailnum) %>%
summarise(avg_arr_delay = mean(arr_delay, na.rm = TRUE)) %>%
arrange(avg_arr_delay)
lowest_delay_tailnum <- avg_arr_delay_by_tailnum %>%
slice_min(avg_arr_delay, n = 1)
print(lowest_delay_tailnum)
## # A tibble: 1 × 2
## tailnum avg_arr_delay
## <chr> <dbl>
## 1 N560AS -53
not_cancelled %>%
group_by(year, month, day) %>%
summarise(
first = min(dep_time),
last = max(dep_time)
)
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
not_cancelled <- flights %>%
filter(!is.na(dep_time))
not_cancelled %>%
group_by(year, month, day) %>%
summarise(
first = min(dep_time),
last = max(dep_time)
) %>%
arrange(desc(last))
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
delay_proportion <- flights %>%
filter(!is.na(dep_delay)) %>%
group_by(month) %>%
summarise(
total_flights = n(),
delayed_over_hour = sum(dep_delay > 60, na.rm = TRUE),
proportion_delayed = delayed_over_hour / total_flights
) %>%
arrange(desc(proportion_delayed))
print(delay_proportion)
## # A tibble: 12 × 4
## month total_flights delayed_over_hour proportion_delayed
## <int> <int> <int> <dbl>
## 1 7 28485 3820 0.134
## 2 6 27234 3494 0.128
## 3 12 27110 2553 0.0942
## 4 4 27662 2535 0.0916
## 5 3 27973 2340 0.0837
## 6 5 28233 2309 0.0818
## 7 8 28841 2295 0.0796
## 8 2 23690 1654 0.0698
## 9 1 26483 1821 0.0688
## 10 9 27122 1330 0.0490
## 11 10 28653 1344 0.0469
## 12 11 27035 1086 0.0402
dest_carriers <- flights %>%
group_by(dest) %>%
summarise(num_carriers = n_distinct(carrier, na.rm = TRUE)) %>%
arrange(desc(num_carriers))
print(dest_carriers)
## # A tibble: 105 × 2
## dest num_carriers
## <chr> <int>
## 1 ATL 7
## 2 BOS 7
## 3 CLT 7
## 4 ORD 7
## 5 TPA 7
## 6 AUS 6
## 7 DCA 6
## 8 DTW 6
## 9 IAD 6
## 10 MSP 6
## # ℹ 95 more rows