flights <- read_csv("arrival_delays.csv")
flights
## # A tibble: 4 × 7
## Airline Status `Los Angeles` Phoenix `San Diego` `San Francisco` Seattle
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Alaska on time 497 221 212 503 1841
## 2 Alaska delayed 62 12 20 102 305
## 3 AM West on time 694 4840 383 320 201
## 4 AM West delayed 117 415 65 129 61
## Transform from wide to long
flights_long <- flights %>%
pivot_longer(
cols = c(`Los Angeles`, Phoenix, `San Diego`, `San Francisco`, Seattle),
names_to = "City",
values_to = "Count"
)
flights_long
## # A tibble: 20 × 4
## Airline Status City Count
## <chr> <chr> <chr> <dbl>
## 1 Alaska on time Los Angeles 497
## 2 Alaska on time Phoenix 221
## 3 Alaska on time San Diego 212
## 4 Alaska on time San Francisco 503
## 5 Alaska on time Seattle 1841
## 6 Alaska delayed Los Angeles 62
## 7 Alaska delayed Phoenix 12
## 8 Alaska delayed San Diego 20
## 9 Alaska delayed San Francisco 102
## 10 Alaska delayed Seattle 305
## 11 AM West on time Los Angeles 694
## 12 AM West on time Phoenix 4840
## 13 AM West on time San Diego 383
## 14 AM West on time San Francisco 320
## 15 AM West on time Seattle 201
## 16 AM West delayed Los Angeles 117
## 17 AM West delayed Phoenix 415
## 18 AM West delayed San Diego 65
## 19 AM West delayed San Francisco 129
## 20 AM West delayed Seattle 61
## Count analysis
flights_long %>%
group_by(Airline, Status) %>%
summarise(Total = sum(Count), .groups="drop")
## # A tibble: 4 × 3
## Airline Status Total
## <chr> <chr> <dbl>
## 1 AM West delayed 787
## 2 AM West on time 6438
## 3 Alaska delayed 501
## 4 Alaska on time 3274
## Overall delay percentage by airline
overall <- flights_long %>%
group_by(Airline, Status) %>%
summarise(Total = sum(Count), .groups="drop") %>%
pivot_wider(names_from = Status, values_from = Total) %>%
mutate(
AllFlights = `on time` + delayed,
DelayRate = delayed / AllFlights
)
ggplot(overall, aes(x = Airline, y = DelayRate)) +
geom_col() +
scale_y_continuous(labels = scales::percent) +
labs(title="Overall Delay Rate by Airline", y="Delay %", x=NULL)

- Overall, AM West has a lower delay percentage than Alaska.
## Delay percentage by city
by_city <- flights_long %>%
group_by(Airline, City, Status) %>%
summarise(Total = sum(Count), .groups="drop") %>%
pivot_wider(names_from = Status, values_from = Total) %>%
mutate(
AllFlights = `on time` + delayed,
DelayRate = delayed / AllFlights
)
ggplot(by_city, aes(x = City, y = DelayRate, group = Airline, color = Airline)) +
geom_line() +
geom_point(size = 3) +
scale_y_continuous(labels = scales::percent) +
theme(axis.text.x = element_text(angle = 30, hjust = 1)) +
labs(title="Delay Rate by City", y="Delay %", x=NULL)

- By city, Alaska has a lower delay % in Los Angeles, Phoenix, San
Diego, San Francisco, and Seattle
- Even though AM West looks better overall The overall comparison
differs from the city-by-city comparison because the overall delay rate
is a weighted average. AM West has a much larger number of flights in
Phoenix, which has a relatively low delay rate, so Phoenix heavily
influences AM West’s overall delay percentage. When looking within each
city, Alaska can appear better, but when the data is combined, the
weighting across cities can reverse the conclusion. This is an example
of Simpson’s Paradox.