Load necessary libraries (as shown in standard R practice)
library(nycflights13)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
— Question 1: Dataset Dimensions —
Check the number of rows and columns
dim(flights)
## [1] 336776 19
Result: 336776 rows and 19 columns.
— Question 2: Largest Departure Delay —
flights %>%
arrange(desc(dep_delay)) %>%
select(carrier, flight, tailnum, dep_delay) %>%
head(1)
## # A tibble: 1 × 4
## carrier flight tailnum dep_delay
## <chr> <int> <chr> <dbl>
## 1 HA 51 N384HA 1301
— Question 3: Mean Departure Delay (Handling NAs) —
flights %>%
group_by(year, month, day) %>%
summarise(mean_delay = mean(dep_delay, na.rm = TRUE))
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 4
## # Groups: year, month [12]
## year month day mean_delay
## <int> <int> <int> <dbl>
## 1 2013 1 1 11.5
## 2 2013 1 2 13.9
## 3 2013 1 3 11.0
## 4 2013 1 4 8.95
## 5 2013 1 5 5.73
## 6 2013 1 6 7.15
## 7 2013 1 7 5.42
## 8 2013 1 8 2.55
## 9 2013 1 9 2.28
## 10 2013 1 10 2.84
## # ℹ 355 more rows
— Question 4: Tailnum with lowest average arrival delay —
filter out NAs, group by tailnum, then find the minimum average
flights %>%
filter(!is.na(arr_delay)) %>%
group_by(tailnum) %>%
summarise(avg_arr_delay = mean(arr_delay)) %>%
arrange(avg_arr_delay) %>%
head(1)
## # A tibble: 1 × 2
## tailnum avg_arr_delay
## <chr> <dbl>
## 1 N560AS -53
— Question 5: Last flight of the day —
Checking if max dep_time is <= 2400 (midnight)
flights %>%
filter(!is.na(dep_time)) %>%
group_by(year, month, day) %>%
summarise(last_flight = max(dep_time)) %>%
arrange(desc(last_flight))
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 4
## # Groups: year, month [12]
## year month day last_flight
## <int> <int> <int> <int>
## 1 2013 2 7 2400
## 2 2013 2 11 2400
## 3 2013 3 15 2400
## 4 2013 3 22 2400
## 5 2013 3 25 2400
## 6 2013 4 2 2400
## 7 2013 4 4 2400
## 8 2013 4 20 2400
## 9 2013 5 21 2400
## 10 2013 6 17 2400
## # ℹ 355 more rows
— Question 6: Proportion of flights delayed > 60 mins per month
—
Calculate the logical mean (proportion) for each month
flights %>%
group_by(month) %>%
summarise(prop_over_60 = mean(dep_delay > 60, na.rm = TRUE)) %>%
arrange(desc(prop_over_60))
## # A tibble: 12 × 2
## month prop_over_60
## <int> <dbl>
## 1 7 0.134
## 2 6 0.128
## 3 12 0.0942
## 4 4 0.0916
## 5 3 0.0837
## 6 5 0.0818
## 7 8 0.0796
## 8 2 0.0698
## 9 1 0.0688
## 10 9 0.0490
## 11 10 0.0469
## 12 11 0.0402
— Question 7: Destinations with the most carriers —
Using n_distinct to count unique carriers per destination
flights %>%
group_by(dest) %>%
summarise(n_carrier = n_distinct(carrier)) %>%
arrange(desc(n_carrier))
## # A tibble: 105 × 2
## dest n_carrier
## <chr> <int>
## 1 ATL 7
## 2 BOS 7
## 3 CLT 7
## 4 ORD 7
## 5 TPA 7
## 6 AUS 6
## 7 DCA 6
## 8 DTW 6
## 9 IAD 6
## 10 MSP 6
## # ℹ 95 more rows
— Question 9: Combined Logic Verification —
This matches the “Function Chaining” logic on slide 17
delays <- flights %>%
group_by(dest) %>%
summarise(
count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE)
) %>%
filter(count > 20, dest != "HNL")