pacman::p_load(nycflights13, dplyr)

# Print a compact, column-by-column overview of the flights data
dplyr::glimpse(flights)
## Rows: 336,776
## Columns: 19
## $ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
## $ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
## $ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
## $ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
## $ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
## $ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
## $ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
## $ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
## $ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
## $ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
## $ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
## $ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
## $ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
## $ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…
# Report the size of flights as c(number of rows, number of columns)
c(nrow(flights), ncol(flights))
## [1] 336776     19
# Option C: locate the flight(s) with the largest departure delay.
# na.rm = TRUE is needed because dep_delay contains missing values; without
# it max() would return NA.
maxdep <- max(flights$dep_delay, na.rm = TRUE)
# which() ignores NA comparisons and keeps every row tied for the maximum.
maxdep_id <- which(flights$dep_delay == maxdep)
# Pick the identifying columns by name instead of by position (10:12), so the
# lookup keeps returning carrier/flight/tailnum even if the column order of
# the data changes.
flights[maxdep_id, c("carrier", "flight", "tailnum")]
## # A tibble: 1 × 3
##   carrier flight tailnum
##   <chr>    <int> <chr>  
## 1 HA          51 N384HA
# Option D: order all flights from the longest departure delay downwards,
# then move the identifying columns to the front and show the first rows
sortf <- flights %>% arrange(desc(dep_delay))
sortf %>%
  relocate(carrier, flight, tailnum) %>%
  head()
## # A tibble: 6 × 19
##   carrier flight tailnum  year month   day dep_time sched_dep_time dep_delay
##   <chr>    <int> <chr>   <int> <int> <int>    <int>          <int>     <dbl>
## 1 HA          51 N384HA   2013     1     9      641            900      1301
## 2 MQ        3535 N504MQ   2013     6    15     1432           1935      1137
## 3 MQ        3695 N517MQ   2013     1    10     1121           1635      1126
## 4 AA         177 N338AA   2013     9    20     1139           1845      1014
## 5 MQ        3075 N665MQ   2013     7    22      845           1600      1005
## 6 DL        2391 N959DL   2013     4    10     1100           1900       960
## # ℹ 10 more variables: arr_time <int>, sched_arr_time <int>, arr_delay <dbl>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
# Option B: average departure delay per calendar day; na.rm = TRUE makes
# mean() skip missing delays instead of filtering the rows beforehand
summarise(
  group_by(flights, year, month, day),
  mean = mean(dep_delay, na.rm = TRUE)
)
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 4
## # Groups:   year, month [12]
##     year month   day  mean
##    <int> <int> <int> <dbl>
##  1  2013     1     1 11.5 
##  2  2013     1     2 13.9 
##  3  2013     1     3 11.0 
##  4  2013     1     4  8.95
##  5  2013     1     5  5.73
##  6  2013     1     6  7.15
##  7  2013     1     7  5.42
##  8  2013     1     8  2.55
##  9  2013     1     9  2.28
## 10  2013     1    10  2.84
## # ℹ 355 more rows
# Option C: keep a flight only when both dep_delay and arr_delay are recorded
not_cancelled <- filter(flights, !(is.na(dep_delay) | is.na(arr_delay)))

# Option D: same idea keyed on dep_delay alone; this assignment replaces the
# Option C result above
not_cancelled <- filter(flights, !is.na(dep_delay))

# Average departure delay per calendar day, computed on not_cancelled;
# mean() is called without na.rm here
summarise(
  group_by(not_cancelled, year, month, day),
  mean = mean(dep_delay)
)
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 4
## # Groups:   year, month [12]
##     year month   day  mean
##    <int> <int> <int> <dbl>
##  1  2013     1     1 11.5 
##  2  2013     1     2 13.9 
##  3  2013     1     3 11.0 
##  4  2013     1     4  8.95
##  5  2013     1     5  5.73
##  6  2013     1     6  7.15
##  7  2013     1     7  5.42
##  8  2013     1     8  2.55
##  9  2013     1     9  2.28
## 10  2013     1    10  2.84
## # ℹ 355 more rows
# Keep only flights with a recorded arrival delay
not_cancelled <- filter(flights, !is.na(arr_delay))

# Mean arrival delay per tail number, sorted from the lowest average upwards
lowest_arr_delay <- arrange(
  summarise(
    group_by(not_cancelled, tailnum),
    avg_arr_delay = mean(arr_delay)
  ),
  avg_arr_delay
)

# Show the planes with the lowest average arrival delay
head(lowest_arr_delay)
## # A tibble: 6 × 2
##   tailnum avg_arr_delay
##   <chr>           <dbl>
## 1 N560AS          -53  
## 2 N315AS          -51  
## 3 N517AS          -40.5
## 4 N592AS          -35.5
## 5 N7AYAA          -35  
## 6 N585AS          -34.5
library(nycflights13)
library(dplyr)

# Drop flights whose arrival delay is missing
not_cancelled <- filter(flights, !is.na(arr_delay))

# Step 1: mean arrival delay for each plane (one row per tailnum)
avg_delay_per_tailnum <- summarise(
  group_by(not_cancelled, tailnum),
  avg_arr_delay = mean(arr_delay)
)

# Show the first few rows of the per-plane averages
avg_delay_per_tailnum %>% head()
## # A tibble: 6 × 2
##   tailnum avg_arr_delay
##   <chr>           <dbl>
## 1 D942DN          31.5 
## 2 N0EGMQ           9.98
## 3 N10156          12.7 
## 4 N102UW           2.94
## 5 N103US          -6.93
## 6 N104UW           1.80
# Smallest and largest dep_time per calendar day, listing the days with the
# largest "last" value first. head() (not slice_head()) is used on purpose:
# the summarised result is still grouped by year and month, and head() takes
# the first rows of the whole table rather than of each group.
not_cancelled %>%
  group_by(year, month, day) %>%
  summarise(first = min(dep_time), last = max(dep_time)) %>%
  arrange(desc(last)) %>%
  head(n = 6)
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 6 × 5
## # Groups:   year, month [3]
##    year month   day first  last
##   <int> <int> <int> <int> <int>
## 1  2013     2     7    27  2400
## 2  2013     2    11     1  2400
## 3  2013     3    15    11  2400
## 4  2013     3    22    37  2400
## 5  2013     3    25    13  2400
## 6  2013     4     2     9  2400
# Share of flights per month whose departure delay exceeded 60 minutes,
# highest share first. The mean of the logical dep_delay > 60 is that share;
# na.rm = TRUE leaves flights with a missing dep_delay out of it.
flights %>%
  group_by(month) %>%
  summarise(prop_over_hour = mean(dep_delay > 60, na.rm = TRUE)) %>%
  arrange(desc(prop_over_hour))
## # A tibble: 12 × 2
##    month prop_over_hour
##    <int>          <dbl>
##  1     7         0.134 
##  2     6         0.128 
##  3    12         0.0942
##  4     4         0.0916
##  5     3         0.0837
##  6     5         0.0818
##  7     8         0.0796
##  8     2         0.0698
##  9     1         0.0688
## 10     9         0.0490
## 11    10         0.0469
## 12    11         0.0402
# Count the distinct carriers appearing for each destination and show the
# ten destinations with the highest counts
flights %>%
  group_by(dest) %>%
  summarise(n_carriers = length(unique(carrier))) %>%
  arrange(desc(n_carriers)) %>%
  head(n = 10)
## # A tibble: 10 × 2
##    dest  n_carriers
##    <chr>      <int>
##  1 ATL            7
##  2 BOS            7
##  3 CLT            7
##  4 ORD            7
##  5 TPA            7
##  6 AUS            6
##  7 DCA            6
##  8 DTW            6
##  9 IAD            6
## 10 MSP            6
# Per-destination summary: number of flights, mean distance and mean arrival
# delay (missing values are ignored in both means). Only destinations with
# more than 20 flights are kept, and "HNL" is excluded.
delays <- flights %>%
  group_by(dest) %>%
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  filter(count > 20 & dest != "HNL")

delays %>% head()
## # A tibble: 6 × 4
##   dest  count  dist delay
##   <chr> <int> <dbl> <dbl>
## 1 ABQ     254 1826   4.38
## 2 ACK     265  199   4.85
## 3 ALB     439  143  14.4 
## 4 ATL   17215  757. 11.3 
## 5 AUS    2439 1514.  6.02
## 6 AVL     275  584.  8.00
# Plane with the lowest average arrival delay:
#   1. drop flights with no recorded arrival delay
#   2. average arr_delay per tailnum
#   3. sort ascending and keep only the first row
flights %>%
  filter(!is.na(arr_delay)) %>%
  group_by(tailnum) %>%
  summarise(avg_delay = mean(arr_delay)) %>%
  arrange(avg_delay) %>%
  slice_head(n = 1)
## # A tibble: 1 × 2
##   tailnum avg_delay
##   <chr>       <dbl>
## 1 N560AS        -53