Question 1

Does the dataset contain 336,776 records and 19 fields?

# Number of rows followed by number of columns in `flights`.
c(nrow(flights), ncol(flights))
## [1] 336776     19

Question 2

Flight with the largest departure delay:

# Narrow to the identifying columns first, then sort by departure delay
# (largest first) and keep only the top row.
flights %>%
  select(carrier, flight, tailnum, dep_delay) %>%
  arrange(desc(dep_delay)) %>%
  slice_head(n = 1)
## # A tibble: 1 × 4
##   carrier flight tailnum dep_delay
##   <chr>    <int> <chr>       <dbl>
## 1 HA          51 N384HA       1301

Question 3

Average departure delay per date (rows with a missing departure delay, such as cancelled flights, are ignored via `na.rm = TRUE`):

# One row per (year, month, day) holding the mean departure delay;
# missing delays are dropped before averaging and the result is ungrouped.
group_by(flights, year, month, day) %>%
  summarise(
    mean_dep_delay = mean(dep_delay, na.rm = TRUE),
    .groups = "drop"
  )
## # A tibble: 365 × 4
##     year month   day mean_dep_delay
##    <int> <int> <int>          <dbl>
##  1  2013     1     1          11.5 
##  2  2013     1     2          13.9 
##  3  2013     1     3          11.0 
##  4  2013     1     4           8.95
##  5  2013     1     5           5.73
##  6  2013     1     6           7.15
##  7  2013     1     7           5.42
##  8  2013     1     8           2.55
##  9  2013     1     9           2.28
## 10  2013     1    10           2.84
## # ℹ 355 more rows

Question 4

Tail number with the lowest average arrival delay (note that the top result below is based on a single flight):

# Drop rows lacking an arrival delay or a tail number up front, average the
# arrival delay per tail number (also recording how many flights back each
# average), then keep the row with the smallest average.
flights %>%
  filter(!is.na(arr_delay), !is.na(tailnum)) %>%
  group_by(tailnum) %>%
  summarise(
    avg_arr_delay = mean(arr_delay),
    n_flights = n(),
    .groups = "drop"
  ) %>%
  arrange(avg_arr_delay) %>%
  slice_head(n = 1)
## # A tibble: 1 × 3
##   tailnum avg_arr_delay n_flights
##   <chr>           <dbl>     <int>
## 1 N560AS            -53         1

Question 5

First and last departure time each day:

# For each date, the smallest and largest recorded `dep_time`, considering
# only rows where `dep_time` is present; dates with the largest `last`
# value are listed first.
filter(flights, !is.na(dep_time)) %>%
  group_by(year, month, day) %>%
  summarise(
    first = min(dep_time),
    last = max(dep_time),
    .groups = "drop"
  ) %>%
  arrange(desc(last))
## # A tibble: 365 × 5
##     year month   day first  last
##    <int> <int> <int> <int> <int>
##  1  2013     2     7    27  2400
##  2  2013     2    11     1  2400
##  3  2013     3    15    11  2400
##  4  2013     3    22    37  2400
##  5  2013     3    25    13  2400
##  6  2013     4     2     9  2400
##  7  2013     4     4    14  2400
##  8  2013     4    20     7  2400
##  9  2013     5    21   110  2400
## 10  2013     6    17     2  2400
## # ℹ 355 more rows

Question 6

Proportion of flights with departure delays greater than 60 minutes by month (computed over flights with a recorded departure delay):

# Share of flights per month whose departure delay exceeds 60; the mean of
# the logical comparison is that share, with missing delays left out.
# Months are listed from highest share to lowest.
group_by(flights, month) %>%
  summarise(
    prop_over_60 = mean(dep_delay > 60, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(prop_over_60))
## # A tibble: 12 × 2
##    month prop_over_60
##    <int>        <dbl>
##  1     7       0.134 
##  2     6       0.128 
##  3    12       0.0942
##  4     4       0.0916
##  5     3       0.0837
##  6     5       0.0818
##  7     8       0.0796
##  8     2       0.0698
##  9     1       0.0688
## 10     9       0.0490
## 11    10       0.0469
## 12    11       0.0402

Question 7

Destinations with the most carriers:

# Count the distinct carriers seen for every destination and list the
# destinations with the most carriers first.
group_by(flights, dest) %>%
  summarise(
    n_carriers = n_distinct(carrier),
    .groups = "drop"
  ) %>%
  arrange(desc(n_carriers))
## # A tibble: 105 × 2
##    dest  n_carriers
##    <chr>      <int>
##  1 ATL            7
##  2 BOS            7
##  3 CLT            7
##  4 ORD            7
##  5 TPA            7
##  6 AUS            6
##  7 DCA            6
##  8 DTW            6
##  9 IAD            6
## 10 MSP            6
## # ℹ 95 more rows

Questions 8 & 9

Distance vs average arrival delay by destination (keeping only destinations with more than 20 flights and excluding HNL):

# Per-destination summary: number of flights, mean distance, and mean
# arrival delay (missing values ignored in both means). Only destinations
# with more than 20 flights, other than HNL, are kept.
delay_by_dest <- group_by(flights, dest) %>%
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  filter(count > 20 & dest != "HNL")

delay_by_dest
## # A tibble: 96 × 4
##    dest  count  dist delay
##    <chr> <int> <dbl> <dbl>
##  1 ABQ     254 1826   4.38
##  2 ACK     265  199   4.85
##  3 ALB     439  143  14.4 
##  4 ATL   17215  757. 11.3 
##  5 AUS    2439 1514.  6.02
##  6 AVL     275  584.  8.00
##  7 BDL     443  116   7.05
##  8 BGR     375  378   8.03
##  9 BHM     297  866. 16.9 
## 10 BNA    6333  758. 11.8 
## # ℹ 86 more rows

Plot relationship:

# Scatter of mean arrival delay against mean distance per destination, with
# point size mapped to the flight count, plus a smoothed trend line drawn
# without its standard-error band.
ggplot(data = delay_by_dest, mapping = aes(x = dist, y = delay)) +
  geom_point(mapping = aes(size = count), alpha = 1 / 3) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'