# install.packages(c("readr", "dplyr", "tidyr"))

library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)

# Import data
flights <- read_csv("flights.csv")
## Rows: 336776 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (4): carrier, tailnum, origin, dest
## dbl  (14): year, month, day, dep_time, sched_dep_time, dep_delay, arr_time, ...
## dttm  (1): time_hour
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Data cleaning
flights_clean <- flights %>%
  filter(!is.na(dep_delay))

# Feature creation
flights_with_total_delay <- flights_clean %>%
  mutate(total_delay = dep_delay + arr_delay)

# Summarisation
delay_summary <- flights_with_total_delay %>%
  group_by(carrier, month) %>%
  summarise(
    avg_dep_delay = mean(dep_delay, na.rm = TRUE),
    avg_arr_delay = mean(arr_delay, na.rm = TRUE),
    avg_total_delay = mean(total_delay, na.rm = TRUE),
    .groups = "drop"
  )

# Top 10 delayed airline-month combinations
top_10_delays <- delay_summary %>%
  arrange(desc(avg_total_delay)) %>%
  slice_head(n = 10)

top_10_delays
## # A tibble: 10 × 5
##    carrier month avg_dep_delay avg_arr_delay avg_total_delay
##    <chr>   <dbl>         <dbl>         <dbl>           <dbl>
##  1 OO          1          67           107             174  
##  2 OO          6          61            68.5           130. 
##  3 OO          8          64            64.5           128. 
##  4 YV          6          42.8          44.6            87.4
##  5 FL          7          41.2          45.0            86.7
##  6 HA          1          54.4          27.5            81.9
##  7 FL          6          38.8          42.0            80.0
##  8 F9          7          31.8          36.4            68.2
##  9 F9          5          35.9          27.1            63.0
## 10 F9          2          29.8          31.1            60.9
# Reshape summary (Part b)
delay_wide <- delay_summary %>%
  pivot_wider(
    names_from = month,
    values_from = avg_total_delay,
    names_prefix = "Month_"
  )

delay_wide
## # A tibble: 185 × 15
##    carrier avg_dep_delay avg_arr_delay Month_1 Month_2 Month_3 Month_4 Month_5
##    <chr>           <dbl>         <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1 9E              16.9          10.2     26.8    NA      NA      NA      NA  
##  2 9E              16.5           8.28    NA      24.7    NA      NA      NA  
##  3 9E              13.4           2.03    NA      NA      15.3    NA      NA  
##  4 9E              13.6           5.47    NA      NA      NA      18.9    NA  
##  5 9E              22.7          10.4     NA      NA      NA      NA      33.1
##  6 9E              29.0          22.5     NA      NA      NA      NA      NA  
##  7 9E              31.4          23.8     NA      NA      NA      NA      NA  
##  8 9E              17.3           5.31    NA      NA      NA      NA      NA  
##  9 9E               7.75         -7.14    NA      NA      NA      NA      NA  
## 10 9E               9.33         -1.35    NA      NA      NA      NA      NA  
## # ℹ 175 more rows
## # ℹ 7 more variables: Month_6 <dbl>, Month_7 <dbl>, Month_8 <dbl>,
## #   Month_9 <dbl>, Month_10 <dbl>, Month_11 <dbl>, Month_12 <dbl>
# Highest delay in July
july_highest_delay <- delay_wide %>%
  arrange(desc(Month_7)) %>%
  slice_head(n = 1) %>%
  select(carrier, Month_7)

july_highest_delay
## # A tibble: 1 × 2
##   carrier Month_7
##   <chr>     <dbl>
## 1 FL         86.7

Explanation of Reshaped Format Utility

This wide format enables managers to compare airlines across all months side by side, making it easy to detect seasonal delay patterns and identify top or worst-performing carriers without filtering or grouping repeatedly.