This analysis examines airline arrival performance data for Alaska Airlines and AM West across five U.S. destination cities. The dataset summarizes counts of on-time and delayed arrivals and is commonly used to illustrate how aggregated statistics can lead to misleading conclusions when underlying group sizes differ. The original data source is adapted from Numbersense by Kaiser Fung (2013).
The dataset is recreated directly in R to match the original table structure exactly, using a wide format. This approach avoids reliance on local file paths and ensures full reproducibility.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Create the data in wide format
airline_data_wide <- tibble(
Airline = c("Alaska", "Alaska", "AM West", "AM West"),
Status = c("On Time", "Delayed", "On Time", "Delayed"),
Los_Angeles = c(497, 62, 694, 117),
Phoenix = c(221, 12, 4840, 415),
San_Diego = c(212, 20, 383, 65),
San_Francisco = c(503, 102, 320, 129),
Seattle = c(1841, 305, 201, 61)
)
airline_data_wide
## # A tibble: 4 × 7
## Airline Status Los_Angeles Phoenix San_Diego San_Francisco Seattle
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Alaska On Time 497 221 212 503 1841
## 2 Alaska Delayed 62 12 20 102 305
## 3 AM West On Time 694 4840 383 320 201
## 4 AM West Delayed 117 415 65 129 61
airline_data_long <- airline_data_wide %>%
pivot_longer(
cols = Los_Angeles:Seattle,
names_to = "City",
values_to = "Count"
)
airline_data_long
## # A tibble: 20 × 4
## Airline Status City Count
## <chr> <chr> <chr> <dbl>
## 1 Alaska On Time Los_Angeles 497
## 2 Alaska On Time Phoenix 221
## 3 Alaska On Time San_Diego 212
## 4 Alaska On Time San_Francisco 503
## 5 Alaska On Time Seattle 1841
## 6 Alaska Delayed Los_Angeles 62
## 7 Alaska Delayed Phoenix 12
## 8 Alaska Delayed San_Diego 20
## 9 Alaska Delayed San_Francisco 102
## 10 Alaska Delayed Seattle 305
## 11 AM West On Time Los_Angeles 694
## 12 AM West On Time Phoenix 4840
## 13 AM West On Time San_Diego 383
## 14 AM West On Time San_Francisco 320
## 15 AM West On Time Seattle 201
## 16 AM West Delayed Los_Angeles 117
## 17 AM West Delayed Phoenix 415
## 18 AM West Delayed San_Diego 65
## 19 AM West Delayed San_Francisco 129
## 20 AM West Delayed Seattle 61
overall_performance <- airline_data_long %>%
group_by(Airline, Status) %>%
summarise(Total = sum(Count), .groups = "drop") %>%
group_by(Airline) %>%
mutate(Percent = Total / sum(Total) * 100)
overall_performance
## # A tibble: 4 × 4
## # Groups: Airline [2]
## Airline Status Total Percent
## <chr> <chr> <dbl> <dbl>
## 1 AM West Delayed 787 10.9
## 2 AM West On Time 6438 89.1
## 3 Alaska Delayed 501 13.3
## 4 Alaska On Time 3274 86.7
city_performance <- airline_data_long %>%
group_by(Airline, City, Status) %>%
summarise(Total = sum(Count), .groups = "drop") %>%
group_by(Airline, City) %>%
mutate(Percent = Total / sum(Total) * 100)
city_performance
## # A tibble: 20 × 5
## # Groups: Airline, City [10]
## Airline City Status Total Percent
## <chr> <chr> <chr> <dbl> <dbl>
## 1 AM West Los_Angeles Delayed 117 14.4
## 2 AM West Los_Angeles On Time 694 85.6
## 3 AM West Phoenix Delayed 415 7.90
## 4 AM West Phoenix On Time 4840 92.1
## 5 AM West San_Diego Delayed 65 14.5
## 6 AM West San_Diego On Time 383 85.5
## 7 AM West San_Francisco Delayed 129 28.7
## 8 AM West San_Francisco On Time 320 71.3
## 9 AM West Seattle Delayed 61 23.3
## 10 AM West Seattle On Time 201 76.7
## 11 Alaska Los_Angeles Delayed 62 11.1
## 12 Alaska Los_Angeles On Time 497 88.9
## 13 Alaska Phoenix Delayed 12 5.15
## 14 Alaska Phoenix On Time 221 94.8
## 15 Alaska San_Diego Delayed 20 8.62
## 16 Alaska San_Diego On Time 212 91.4
## 17 Alaska San_Francisco Delayed 102 16.9
## 18 Alaska San_Francisco On Time 503 83.1
## 19 Alaska Seattle Delayed 305 14.2
## 20 Alaska Seattle On Time 1841 85.8