---
title: "Homework 5"
author: "Aidan Buchanan"
date: "2025-10-07"
output: pdf_document
---

Question 1

library(nycflights13); library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Row and column counts of `flights` as a named vector (expect 336776, 19).
setNames(dim(flights), c("nrow", "ncol"))
##   nrow   ncol 
## 336776     19

Question 2

# Sort by departure delay, largest first, and keep only the top row.
flights %>%
  arrange(desc(dep_delay)) %>%
  slice_head(n = 1)
## # A tibble: 1 × 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     1     9      641            900      1301     1242           1530
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# Largest observed departure delay, ignoring flights with no recorded delay.
maxdep <- with(flights, max(dep_delay, na.rm = TRUE))

# Every flight that hit that maximum, with the identifying columns moved to
# the front and all remaining columns kept after them.
filter(flights, dep_delay == maxdep) %>%
  select(c(carrier, flight, tailnum), everything())
## # A tibble: 1 × 19
##   carrier flight tailnum  year month   day dep_time sched_dep_time dep_delay
##   <chr>    <int> <chr>   <int> <int> <int>    <int>          <int>     <dbl>
## 1 HA          51 N384HA   2013     1     9      641            900      1301
## # ℹ 10 more variables: arr_time <int>, sched_arr_time <int>, arr_delay <dbl>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

Question 3

# Average departure delay for each calendar day; missing delays are skipped
# inside mean(). Rows are sorted by date so the result is in day order.
flights %>%
  summarise(
    mean_dep_delay = mean(dep_delay, na.rm = TRUE),
    .by = c(year, month, day)
  ) %>%
  arrange(year, month, day)
## # A tibble: 365 × 4
##     year month   day mean_dep_delay
##    <int> <int> <int>          <dbl>
##  1  2013     1     1          11.5 
##  2  2013     1     2          13.9 
##  3  2013     1     3          11.0 
##  4  2013     1     4           8.95
##  5  2013     1     5           5.73
##  6  2013     1     6           7.15
##  7  2013     1     7           5.42
##  8  2013     1     8           2.55
##  9  2013     1     9           2.28
## 10  2013     1    10           2.84
## # ℹ 355 more rows
# Same daily average, but flights without a departure delay are removed
# up front, so mean() needs no na.rm argument.
flights %>%
  filter(!is.na(dep_delay)) %>%
  summarise(
    mean_dep_delay = mean(dep_delay),
    .by = c(year, month, day)
  ) %>%
  arrange(year, month, day)
## # A tibble: 365 × 4
##     year month   day mean_dep_delay
##    <int> <int> <int>          <dbl>
##  1  2013     1     1          11.5 
##  2  2013     1     2          13.9 
##  3  2013     1     3          11.0 
##  4  2013     1     4           8.95
##  5  2013     1     5           5.73
##  6  2013     1     6           7.15
##  7  2013     1     7           5.42
##  8  2013     1     8           2.55
##  9  2013     1     9           2.28
## 10  2013     1    10           2.84
## # ℹ 355 more rows

Question 4

# Per tail number: number of flights with a recorded arrival delay and the
# mean of those delays. The single row with the smallest mean is returned.
flights %>%
  filter(!is.na(arr_delay)) %>%
  group_by(tailnum) %>%
  summarise(
    n = n(),
    avg_arr_delay = mean(arr_delay),
    .groups = "drop"
  ) %>%
  arrange(avg_arr_delay) %>%
  slice_head(n = 1)
## # A tibble: 1 × 3
##   tailnum     n avg_arr_delay
##   <chr>   <int>         <dbl>
## 1 N560AS      1           -53

Question 5

# Sanity check on dep_time: take the latest recorded departure of every
# calendar day and confirm that none of them exceeds 2400.
# Flights with a missing dep_time are removed first so max() never sees NA.
# Only the per-day maximum is needed; all() does not depend on row order.
flights %>%
  filter(!is.na(dep_time)) %>%
  group_by(year, month, day) %>%
  summarise(last = max(dep_time), .groups = "drop") %>%
  summarise(all(last <= 2400))
## # A tibble: 1 × 1
##   `all(last <= 2400)`
##   <lgl>              
## 1 TRUE

Question 6

# Attach the packages this question uses. They are attached with library()
# instead of being installed on the fly: a conditional install.packages()
# call changes the user's package library as a side effect of running the
# analysis, and it can fail in a non-interactive knit when no CRAN mirror
# is configured. library() stops with a clear error if a package is missing.
library(nycflights13)
library(dplyr)


# Flights that have a recorded departure delay.
not_cancelled <- filter(flights, !is.na(dep_delay))

# For each month: number of such flights and the share delayed by more than
# 60 minutes (the mean of a logical vector is the proportion of TRUE values).
# Months are listed from the highest share to the lowest; equal shares stay
# in calendar order.
prop_by_month <- not_cancelled %>%
  summarise(
    n = n(),
    prop_over_60 = mean(dep_delay > 60),
    .by = month
  ) %>%
  arrange(desc(prop_over_60), month)

# Caption printed above the monthly table.
cat("Proportion of dep_delay > 60 minutes by month (descending):\n", sep = "")
## Proportion of dep_delay > 60 minutes by month (descending):
# Show the monthly proportions computed above.
print(x = prop_by_month)
## # A tibble: 12 × 3
##    month     n prop_over_60
##    <int> <int>        <dbl>
##  1     7 28485       0.134 
##  2     6 27234       0.128 
##  3    12 27110       0.0942
##  4     4 27662       0.0916
##  5     3 27973       0.0837
##  6     5 28233       0.0818
##  7     8 28841       0.0796
##  8     2 23690       0.0698
##  9     1 26483       0.0688
## 10     9 27122       0.0490
## 11    10 28653       0.0469
## 12    11 27035       0.0402

Question 7

# Number of different carriers serving each destination: reduce the flights
# to unique (dest, carrier) pairs, then count the pairs per destination.
dest_carriers <- flights %>%
  filter(!is.na(dest), !is.na(carrier)) %>%
  distinct(dest, carrier) %>%
  count(dest, name = "n_carriers")

# Destinations tied for the largest carrier count, labelled with the airport
# name looked up by FAA code and listed alphabetically by code.
dest_carriers %>%
  filter(n_carriers == max(n_carriers)) %>%
  left_join(select(airports, faa, name), by = c("dest" = "faa")) %>%
  select(dest, name, n_carriers) %>%
  arrange(dest)
## # A tibble: 5 × 3
##   dest  name                               n_carriers
##   <chr> <chr>                                   <int>
## 1 ATL   Hartsfield Jackson Atlanta Intl             7
## 2 BOS   General Edward Lawrence Logan Intl          7
## 3 CLT   Charlotte Douglas Intl                      7
## 4 ORD   Chicago Ohare Intl                          7
## 5 TPA   Tampa Intl                                  7

Question 9

# Attach the packages this question uses. They are attached with library()
# instead of being installed on the fly: a conditional install.packages()
# call changes the user's package library as a side effect of running the
# analysis, and it can fail in a non-interactive knit when no CRAN mirror
# is configured. library() stops with a clear error if a package is missing.
library(nycflights13)
library(dplyr)

# Per-destination summary: number of flights, mean distance and mean arrival
# delay (missing values ignored in both means). HNL is excluded, and only
# destinations with more than 20 flights are kept.
delays <- flights %>%
  filter(dest != "HNL") %>%
  group_by(dest) %>%
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  filter(count > 20)

# Preview the first six destinations of the summary.
head(delays, n = 6L)
## # A tibble: 6 × 4
##   dest  count  dist delay
##   <chr> <int> <dbl> <dbl>
## 1 ABQ     254 1826   4.38
## 2 ACK     265  199   4.85
## 3 ALB     439  143  14.4 
## 4 ATL   17215  757. 11.3 
## 5 AUS    2439 1514.  6.02
## 6 AVL     275  584.  8.00