---
title: "Homework 5"
author: "Aidan Buchanan"
date: "2025-10-07"
output: pdf_document
---
library(nycflights13); library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Dimensions of the flights table as a named vector (rows first, then columns).
setNames(dim(flights), c("nrow", "ncol")) # expect 336776, 19
## nrow ncol
## 336776 19
# Sort by departure delay, largest first, and keep only the top row.
flights %>%
  arrange(desc(dep_delay)) %>%
  slice_head(n = 1)
## # A tibble: 1 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 9 641 900 1301 1242 1530
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# Largest recorded departure delay, ignoring flights with a missing delay.
maxdep <- max(flights[["dep_delay"]], na.rm = TRUE)

# Every flight that hit that maximum, with the identifying columns
# (carrier, flight number, tail number) moved to the front.
flights %>%
  filter(dep_delay == maxdep) %>%
  select(carrier, flight, tailnum, everything())
## # A tibble: 1 × 19
## carrier flight tailnum year month day dep_time sched_dep_time dep_delay
## <chr> <int> <chr> <int> <int> <int> <int> <int> <dbl>
## 1 HA 51 N384HA 2013 1 9 641 900 1301
## # ℹ 10 more variables: arr_time <int>, sched_arr_time <int>, arr_delay <dbl>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Mean departure delay for each calendar day. Missing delays are skipped
# inside mean() rather than being filtered out beforehand.
flights %>%
  summarise(
    mean_dep_delay = mean(dep_delay, na.rm = TRUE),
    .by = c(year, month, day)
  ) %>%
  # .by keeps groups in order of first appearance; sort explicitly so the
  # result is in calendar order.
  arrange(year, month, day)
## # A tibble: 365 × 4
## year month day mean_dep_delay
## <int> <int> <int> <dbl>
## 1 2013 1 1 11.5
## 2 2013 1 2 13.9
## 3 2013 1 3 11.0
## 4 2013 1 4 8.95
## 5 2013 1 5 5.73
## 6 2013 1 6 7.15
## 7 2013 1 7 5.42
## 8 2013 1 8 2.55
## 9 2013 1 9 2.28
## 10 2013 1 10 2.84
## # ℹ 355 more rows
# Mean departure delay per calendar day, this time removing rows with a
# missing dep_delay up front so mean() needs no na.rm argument.
flights %>%
  filter(!is.na(dep_delay)) %>%
  summarise(
    mean_dep_delay = mean(dep_delay),
    .by = c(year, month, day)
  ) %>%
  # Sort explicitly: .by does not reorder groups on its own.
  arrange(year, month, day)
## # A tibble: 365 × 4
## year month day mean_dep_delay
## <int> <int> <int> <dbl>
## 1 2013 1 1 11.5
## 2 2013 1 2 13.9
## 3 2013 1 3 11.0
## 4 2013 1 4 8.95
## 5 2013 1 5 5.73
## 6 2013 1 6 7.15
## 7 2013 1 7 5.42
## 8 2013 1 8 2.55
## 9 2013 1 9 2.28
## 10 2013 1 10 2.84
## # ℹ 355 more rows
# Tail number with the lowest average arrival delay.
# Flights without a recorded arr_delay are excluded first; `n` reports how
# many flights each average is based on.
flights %>%
  filter(!is.na(arr_delay)) %>%
  group_by(tailnum) %>%
  summarise(
    n = n(),
    avg_arr_delay = mean(arr_delay),
    .groups = "drop"
  ) %>%
  arrange(avg_arr_delay) %>%
  slice_head(n = 1)
## # A tibble: 1 × 3
## tailnum n avg_arr_delay
## <chr> <int> <dbl>
## 1 N560AS 1 -53
# Sanity check on dep_time: compute the earliest and latest departure time
# for every calendar day, then confirm that no daily maximum exceeds 2400.
# NOTE(review): dep_time appears to be a clock time stored as an HHMM
# integer (see the nycflights13 `flights` documentation) -- confirm.
# Rows with a missing dep_time are dropped first so min()/max() never see NA.
flights %>%
  filter(!is.na(dep_time)) %>%
  group_by(year, month, day) %>%
  summarise(
    first = min(dep_time),
    last = max(dep_time),
    .groups = "drop"
  ) %>%
  # No sort is needed before this step: all() does not depend on row order.
  summarise(all(last <= 2400))
## # A tibble: 1 × 1
## `all(last <= 2400)`
## <lgl>
## 1 TRUE
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman", quiet = TRUE)
pacman::p_load(nycflights13, dplyr)
# Flights that have a recorded departure delay.
not_cancelled <- filter(flights, !is.na(dep_delay))

# Per month: number of such flights and the share delayed by more than an
# hour. The mean of a logical vector is the proportion of TRUE values.
prop_by_month <- not_cancelled %>%
  summarise(
    n = n(),
    prop_over_60 = mean(dep_delay > 60),
    .by = month
  ) %>%
  # Highest proportion first; equal proportions fall back to month order.
  arrange(desc(prop_over_60), month)

cat("Proportion of dep_delay > 60 minutes by month (descending):\n")
## Proportion of dep_delay > 60 minutes by month (descending):
print(prop_by_month)
## # A tibble: 12 × 3
## month n prop_over_60
## <int> <int> <dbl>
## 1 7 28485 0.134
## 2 6 27234 0.128
## 3 12 27110 0.0942
## 4 4 27662 0.0916
## 5 3 27973 0.0837
## 6 5 28233 0.0818
## 7 8 28841 0.0796
## 8 2 23690 0.0698
## 9 1 26483 0.0688
## 10 9 27122 0.0490
## 11 10 28653 0.0469
## 12 11 27035 0.0402
# Number of different carriers flying to each destination.
# The first count() collapses the data to one row per (dest, carrier) pair;
# the second counts those rows per destination.
dest_carriers <- flights %>%
  filter(!is.na(dest), !is.na(carrier)) %>%
  count(dest, carrier) %>%
  count(dest, name = "n_carriers")

# Destinations tied for the largest carrier count, labelled with the airport
# name looked up from `airports` (matched on the FAA code).
dest_carriers %>%
  filter(n_carriers == max(n_carriers)) %>%
  left_join(airports, by = c("dest" = "faa")) %>%
  select(dest, name, n_carriers) %>%
  arrange(dest)
## # A tibble: 5 × 3
## dest name n_carriers
## <chr> <chr> <int>
## 1 ATL Hartsfield Jackson Atlanta Intl 7
## 2 BOS General Edward Lawrence Logan Intl 7
## 3 CLT Charlotte Douglas Intl 7
## 4 ORD Chicago Ohare Intl 7
## 5 TPA Tampa Intl 7
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman", quiet = TRUE)
pacman::p_load(nycflights13, dplyr)
# Per-destination summary: number of flights, mean distance and mean arrival
# delay. HNL is excluded, and only destinations with more than 20 flights
# are kept.
delays <- flights %>%
  filter(dest != "HNL") %>%
  group_by(dest) %>%
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  filter(count > 20)

# Preview the first rows of the summary.
head(delays)
## # A tibble: 6 × 4
## dest count dist delay
## <chr> <int> <dbl> <dbl>
## 1 ABQ 254 1826 4.38
## 2 ACK 265 199 4.85
## 3 ALB 439 143 14.4
## 4 ATL 17215 757. 11.3
## 5 AUS 2439 1514. 6.02
## 6 AVL 275 584. 8.00