Load necessary libraries (as shown in standard R practice)

library(nycflights13)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

— Question 1: Dataset Dimensions —

Check the number of rows and columns

dim(flights) 
## [1] 336776     19

Result: 336776 rows and 19 columns.

— Question 2: Largest Departure Delay —

flights %>%
  arrange(desc(dep_delay)) %>%
  select(carrier, flight, tailnum, dep_delay) %>%
  head(1)
## # A tibble: 1 × 4
##   carrier flight tailnum dep_delay
##   <chr>    <int> <chr>       <dbl>
## 1 HA          51 N384HA       1301

— Question 3: Mean Departure Delay (Handling NAs) —

flights %>%
  group_by(year, month, day) %>%
  summarise(mean_delay = mean(dep_delay, na.rm = TRUE))
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 4
## # Groups:   year, month [12]
##     year month   day mean_delay
##    <int> <int> <int>      <dbl>
##  1  2013     1     1      11.5 
##  2  2013     1     2      13.9 
##  3  2013     1     3      11.0 
##  4  2013     1     4       8.95
##  5  2013     1     5       5.73
##  6  2013     1     6       7.15
##  7  2013     1     7       5.42
##  8  2013     1     8       2.55
##  9  2013     1     9       2.28
## 10  2013     1    10       2.84
## # ℹ 355 more rows

— Question 4: Tailnum with lowest average arrival delay —

filter out NAs, group by tailnum, then find the minimum average

flights %>%
  filter(!is.na(arr_delay)) %>%
  group_by(tailnum) %>%
  summarise(avg_arr_delay = mean(arr_delay)) %>%
  arrange(avg_arr_delay) %>%
  head(1)
## # A tibble: 1 × 2
##   tailnum avg_arr_delay
##   <chr>           <dbl>
## 1 N560AS            -53

— Question 5: Last flight of the day —

Checking if max dep_time is <= 2400 (midnight)

flights %>%
  filter(!is.na(dep_time)) %>%
  group_by(year, month, day) %>%
  summarise(last_flight = max(dep_time)) %>%
  arrange(desc(last_flight))
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 4
## # Groups:   year, month [12]
##     year month   day last_flight
##    <int> <int> <int>       <int>
##  1  2013     2     7        2400
##  2  2013     2    11        2400
##  3  2013     3    15        2400
##  4  2013     3    22        2400
##  5  2013     3    25        2400
##  6  2013     4     2        2400
##  7  2013     4     4        2400
##  8  2013     4    20        2400
##  9  2013     5    21        2400
## 10  2013     6    17        2400
## # ℹ 355 more rows

— Question 6: Proportion of flights delayed > 60 mins per month —

Calculate the logical mean (proportion) for each month

flights %>%
  group_by(month) %>%
  summarise(prop_over_60 = mean(dep_delay > 60, na.rm = TRUE)) %>%
  arrange(desc(prop_over_60))
## # A tibble: 12 × 2
##    month prop_over_60
##    <int>        <dbl>
##  1     7       0.134 
##  2     6       0.128 
##  3    12       0.0942
##  4     4       0.0916
##  5     3       0.0837
##  6     5       0.0818
##  7     8       0.0796
##  8     2       0.0698
##  9     1       0.0688
## 10     9       0.0490
## 11    10       0.0469
## 12    11       0.0402

— Question 7: Destinations with the most carriers —

Using n_distinct to count unique carriers per destination

flights %>%
  group_by(dest) %>%
  summarise(n_carrier = n_distinct(carrier)) %>%
  arrange(desc(n_carrier))
## # A tibble: 105 × 2
##    dest  n_carrier
##    <chr>     <int>
##  1 ATL           7
##  2 BOS           7
##  3 CLT           7
##  4 ORD           7
##  5 TPA           7
##  6 AUS           6
##  7 DCA           6
##  8 DTW           6
##  9 IAD           6
## 10 MSP           6
## # ℹ 95 more rows

— Question 9: Combined Logic Verification —

This matches the “Function Chaining” logic on slide 17

delays <- flights %>% 
  group_by(dest) %>% 
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>% 
  filter(count > 20, dest != "HNL")