pacman::p_load(nycflights13)
library(nycflights13)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
glimpse(flights)
## Rows: 336,776
## Columns: 19
## $ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
## $ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
## $ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
## $ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
## $ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
## $ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
## $ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
## $ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
## $ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
## $ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
## $ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
## $ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
## $ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
## $ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…
maxdep <- max(flights$dep_delay, na.rm = TRUE)
maxdep_id <- which(flights$dep_delay == maxdep)
flights[maxdep_id, 10:12]
flights %>%
  group_by(year, month, day) %>%
  summarise(mean = mean(dep_delay, na.rm = TRUE))
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
not_cancelled <- flights %>%
  filter(!is.na(dep_delay))
not_cancelled %>%
  group_by(year, month, day) %>%
  summarise(mean = mean(dep_delay))
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
avg_arr_delay_by_tailnum <- flights %>%
  filter(!is.na(arr_delay)) %>% 
  group_by(tailnum) %>%
  summarise(avg_arr_delay = mean(arr_delay, na.rm = TRUE)) %>%
  arrange(avg_arr_delay)  

lowest_delay_tailnum <- avg_arr_delay_by_tailnum %>%
  slice_min(avg_arr_delay, n = 1) 

print(lowest_delay_tailnum)
## # A tibble: 1 × 2
##   tailnum avg_arr_delay
##   <chr>           <dbl>
## 1 N560AS            -53
not_cancelled %>% 
  group_by(year, month, day) %>% 
  summarise(
    first = min(dep_time),
    last = max(dep_time)
  )
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
not_cancelled <- flights %>%
  filter(!is.na(dep_time)) 

not_cancelled %>%
  group_by(year, month, day) %>%
  summarise(
    first = min(dep_time),
    last = max(dep_time)
  ) %>%
  arrange(desc(last))
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
delay_proportion <- flights %>%
  filter(!is.na(dep_delay)) %>%  
  group_by(month) %>%
  summarise(
    total_flights = n(),  
    delayed_over_hour = sum(dep_delay > 60, na.rm = TRUE),  
    proportion_delayed = delayed_over_hour / total_flights  
  ) %>%
  arrange(desc(proportion_delayed))  

print(delay_proportion)
## # A tibble: 12 × 4
##    month total_flights delayed_over_hour proportion_delayed
##    <int>         <int>             <int>              <dbl>
##  1     7         28485              3820             0.134 
##  2     6         27234              3494             0.128 
##  3    12         27110              2553             0.0942
##  4     4         27662              2535             0.0916
##  5     3         27973              2340             0.0837
##  6     5         28233              2309             0.0818
##  7     8         28841              2295             0.0796
##  8     2         23690              1654             0.0698
##  9     1         26483              1821             0.0688
## 10     9         27122              1330             0.0490
## 11    10         28653              1344             0.0469
## 12    11         27035              1086             0.0402
dest_carriers <- flights %>%
  group_by(dest) %>%
  summarise(num_carriers = n_distinct(carrier, na.rm = TRUE)) %>%
  arrange(desc(num_carriers))  

print(dest_carriers)
## # A tibble: 105 × 2
##    dest  num_carriers
##    <chr>        <int>
##  1 ATL              7
##  2 BOS              7
##  3 CLT              7
##  4 ORD              7
##  5 TPA              7
##  6 AUS              6
##  7 DCA              6
##  8 DTW              6
##  9 IAD              6
## 10 MSP              6
## # ℹ 95 more rows