# install.packages(c("readr", "dplyr", "tidyr"))
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
# Import data
flights <- read_csv("flights.csv")
## Rows: 336776 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): carrier, tailnum, origin, dest
## dbl (14): year, month, day, dep_time, sched_dep_time, dep_delay, arr_time, ...
## dttm (1): time_hour
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Data cleaning
flights_clean <- flights %>%
filter(!is.na(dep_delay))
# Feature creation
flights_with_total_delay <- flights_clean %>%
mutate(total_delay = dep_delay + arr_delay)
# Summarisation
delay_summary <- flights_with_total_delay %>%
group_by(carrier, month) %>%
summarise(
avg_dep_delay = mean(dep_delay, na.rm = TRUE),
avg_arr_delay = mean(arr_delay, na.rm = TRUE),
avg_total_delay = mean(total_delay, na.rm = TRUE),
.groups = "drop"
)
# Top 10 delayed airline-month combinations
top_10_delays <- delay_summary %>%
arrange(desc(avg_total_delay)) %>%
slice_head(n = 10)
top_10_delays
## # A tibble: 10 × 5
## carrier month avg_dep_delay avg_arr_delay avg_total_delay
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 OO 1 67 107 174
## 2 OO 6 61 68.5 130.
## 3 OO 8 64 64.5 128.
## 4 YV 6 42.8 44.6 87.4
## 5 FL 7 41.2 45.0 86.7
## 6 HA 1 54.4 27.5 81.9
## 7 FL 6 38.8 42.0 80.0
## 8 F9 7 31.8 36.4 68.2
## 9 F9 5 35.9 27.1 63.0
## 10 F9 2 29.8 31.1 60.9
# Reshape summary (Part b)
delay_wide <- delay_summary %>%
pivot_wider(
names_from = month,
values_from = avg_total_delay,
names_prefix = "Month_"
)
delay_wide
## # A tibble: 185 × 15
## carrier avg_dep_delay avg_arr_delay Month_1 Month_2 Month_3 Month_4 Month_5
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 9E 16.9 10.2 26.8 NA NA NA NA
## 2 9E 16.5 8.28 NA 24.7 NA NA NA
## 3 9E 13.4 2.03 NA NA 15.3 NA NA
## 4 9E 13.6 5.47 NA NA NA 18.9 NA
## 5 9E 22.7 10.4 NA NA NA NA 33.1
## 6 9E 29.0 22.5 NA NA NA NA NA
## 7 9E 31.4 23.8 NA NA NA NA NA
## 8 9E 17.3 5.31 NA NA NA NA NA
## 9 9E 7.75 -7.14 NA NA NA NA NA
## 10 9E 9.33 -1.35 NA NA NA NA NA
## # ℹ 175 more rows
## # ℹ 7 more variables: Month_6 <dbl>, Month_7 <dbl>, Month_8 <dbl>,
## # Month_9 <dbl>, Month_10 <dbl>, Month_11 <dbl>, Month_12 <dbl>
# Highest delay in July
july_highest_delay <- delay_wide %>%
arrange(desc(Month_7)) %>%
slice_head(n = 1) %>%
select(carrier, Month_7)
july_highest_delay
## # A tibble: 1 × 2
## carrier Month_7
## <chr> <dbl>
## 1 FL 86.7
This wide format enables managers to compare airlines across all months side by side, making it easy to detect seasonal delay patterns and identify top or worst-performing carriers without filtering or grouping repeatedly.