Wrangling
# Load the raw flight records and clean them in a single pass:
#   * normalise column names with clean_names()
#   * re-parse both delay columns as numeric after trimming whitespace
#   * upper-case and trim the airline code
#   * keep rows whose delays both lie in [-100, 600] and whose airline code
#     is exactly two characters long (rows where a check yields NA are
#     dropped by filter()).
flights <- readr::read_csv("data/flights_dirty.csv") %>%
  clean_names() %>%
  mutate(
    across(
      c(dep_delay, arr_delay),
      function(delay) as.numeric(trimws(as.character(delay)))
    ),
    airline_code = toupper(str_trim(airline_code))
  ) %>%
  filter(
    between(dep_delay, -100, 600),
    between(arr_delay, -100, 600),
    nchar(airline_code) == 2
  )
## Rows: 140 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): airline_code, tailnum, destination
## dbl (4): flight, dep_delay, arr_delay, distance
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Airline lookup table (columns: airline_code, airline_name).
airlines <- readr::read_csv(file = "data/airlines_lookup.csv")
## Rows: 6 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): airline_code, airline_name
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Attach the airline name to every cleaned flight (left join on airline_code,
# so flights without a lookup match are kept with a missing airline_name),
# then keep only the columns used downstream, in a fixed order.
flights_cleaned <- select(
  left_join(flights, airlines, by = "airline_code"),
  flight, airline_code, airline_name, destination, distance,
  dep_delay, arr_delay
)

# Write the cleaned table to the working directory.
readr::write_csv(flights_cleaned, "flights_cleaned.csv")
Visualizations
# A1: Departure delay distribution by airline
# The plot is bound to a name and handed to ggsave() explicitly, so the saved
# file is always this figure rather than whatever ggplot2's implicit
# last_plot() happens to hold when ggsave() runs.
delay_by_airline_plot <- ggplot(
  flights_cleaned,
  aes(x = airline_name, y = dep_delay)
) +
  geom_boxplot(outlier.alpha = 0.3) +
  coord_flip() + # swap axes so the airlines run down the vertical axis
  labs(
    title = "Departure Delay by Airline",
    x = NULL,
    y = "Departure delay (min)"
  )

# Render the figure so interactive / knitted output still shows it.
print(delay_by_airline_plot)

ggsave(
  "fig_flights_delay_by_airline.png",
  plot = delay_by_airline_plot,
  width = 8, height = 5, dpi = 150
)
# A2: Mean arrival delay by destination
# One row per destination with its mean arrival delay (NA delays ignored),
# sorted from the largest mean delay to the smallest.
flights_dest <- flights_cleaned %>%
  group_by(destination) %>%
  summarise(mean_arr = mean(arr_delay, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(mean_arr))

# The plot is bound to a name and handed to ggsave() explicitly, so the saved
# file is always this figure rather than whatever ggplot2's implicit
# last_plot() happens to hold when ggsave() runs.
# reorder() orders the bars by mean_arr; the row order of flights_dest does
# not affect the bar order.
arr_delay_by_dest_plot <- ggplot(
  flights_dest,
  aes(x = reorder(destination, mean_arr), y = mean_arr)
) +
  geom_col() +
  coord_flip() + # swap axes so the destinations run down the vertical axis
  labs(
    title = "Mean Arrival Delay by Destination",
    x = "Destination",
    y = "Mean arrival delay (min)"
  )

# Render the figure so interactive / knitted output still shows it.
print(arr_delay_by_dest_plot)

ggsave(
  "fig_flights_arr_delay_by_dest.png",
  plot = arr_delay_by_dest_plot,
  width = 8, height = 5, dpi = 150
)