# R setup
library(tidyverse) # dplyr, tidyr, ggplot2, readr
library(DBI) # database interface
# library(RPostgres) # uncomment after install.packages("RPostgres")
theme_set(theme_minimal(base_size = 12))
# Helper: confirm working directory contains the CSV
list.files()
## [1] "airline_delays_wide.csv" "assignment_5A_airline_delays.html"
## [3] "assignment_5A_airline_delays.rmd" "week05_airline_delays.Rproj"
R: Load & tidy the wide CSV
library(tibble)
wide <- tribble(
~Airline, ~`New York`, ~Dallas, ~Denver, ~Miami, ~LA,
"Airline A", 12, 25, 19, NA, 30,
"Airline B", 20, 15, 22, 18, NA
)
wide
## # A tibble: 2 × 6
## Airline `New York` Dallas Denver Miami LA
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Airline A 12 25 19 NA 30
## 2 Airline B 20 15 22 18 NA
library(readr)
library(stringr)
# Keep the fallback 'wide' you created above
fallback_tbl <- wide
# Try to load a multi-column CSV; if it fails, fall back to the tribble
load_wide <- function(path) {
if (!file.exists(path)) return(fallback_tbl)
x <- try(read_csv(path, na = c("", "NA"), show_col_types = FALSE), silent = TRUE)
if (inherits(x, "try-error") || is.null(x) || ncol(x) <= 1) return(fallback_tbl)
# clean header quirks
names(x) <- str_trim(names(x))
names(x)[1] <- sub("^\ufeff", "", names(x)[1]) # strip BOM if present
x
}
wide <- load_wide("airline_delays_wide.csv")
# Clean numeric NAs to 0 for analysis (keep original if you want to preserve blanks)
wide_clean <- wide %>% mutate(across(where(is.numeric), ~ tidyr::replace_na(.x, 0)))
wide_clean
## # A tibble: 2 × 6
## Airline `New York` Dallas Denver Miami LA
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Airline A 12 25 19 0 30
## 2 Airline B 20 15 22 18 0
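A one-line check (not part of the original chunk) can confirm the replacement removed every missing value before pivoting:
# Sanity check: no NAs should remain after replace_na()
stopifnot(!anyNA(wide_clean))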
2) Replace your wide-to-long chunk with this safer pivot
first_col <- names(wide_clean)[1] # whatever your first column is called
long <- wide_clean %>%
pivot_longer(cols = -all_of(first_col),
names_to = "City",
values_to = "Delayed_Count") %>%
rename(Airline = !!first_col)
long
## # A tibble: 10 × 3
## Airline City Delayed_Count
## <chr> <chr> <dbl>
## 1 Airline A New York 12
## 2 Airline A Dallas 25
## 3 Airline A Denver 19
## 4 Airline A Miami 0
## 5 Airline A LA 30
## 6 Airline B New York 20
## 7 Airline B Dallas 15
## 8 Airline B Denver 22
## 9 Airline B Miami 18
## 10 Airline B LA 0
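As a quick sanity check (a sketch, not in the original output), the long table should contain one row per airline-city pair and the same grand total of delays as the cleaned wide table:
# One row per airline-city combination
stopifnot(nrow(long) == nrow(wide_clean) * (ncol(wide_clean) - 1))
# Grand total of delays is unchanged by the pivot
# (assumes every column after the first holds numeric counts)
stopifnot(isTRUE(all.equal(sum(long$Delayed_Count), sum(wide_clean[, -1]))))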
Overall share of delays by airline
| Airline   | Total_Delays | Percent_of_Total |
|-----------|--------------|------------------|
| Airline A | 86           | 53.4             |
| Airline B | 75           | 46.6             |

Summary: Airline A = 53.4%, Airline B = 46.6%.
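The chunk that produced the table above is not shown; a minimal sketch that reproduces it from the `long` table (the column names Total_Delays and Percent_of_Total are my own labels) would be:
# Overall delays and share of all delays, per airline (sketch)
overall_share <- long %>%
  group_by(Airline) %>%
  summarise(Total_Delays = sum(Delayed_Count), .groups = "drop") %>%
  mutate(Percent_of_Total = round(100 * Total_Delays / sum(Total_Delays), 1))
overall_share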
Share of delays within each city
city_share <- long %>%
group_by(City) %>%
mutate(Total_City_Delays = sum(Delayed_Count)) %>%
ungroup() %>%
mutate(Percent_of_City = ifelse(Total_City_Delays > 0,
round(100 * Delayed_Count / Total_City_Delays, 1),
NA_real_))
city_share %>% arrange(City, desc(Percent_of_City))
## # A tibble: 10 × 5
## Airline City Delayed_Count Total_City_Delays Percent_of_City
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Airline A Dallas 25 40 62.5
## 2 Airline B Dallas 15 40 37.5
## 3 Airline B Denver 22 41 53.7
## 4 Airline A Denver 19 41 46.3
## 5 Airline A LA 30 30 100
## 6 Airline B LA 0 30 0
## 7 Airline B Miami 18 18 100
## 8 Airline A Miami 0 18 0
## 9 Airline B New York 20 32 62.5
## 10 Airline A New York 12 32 37.5
ggplot(city_share, aes(x = City, y = Delayed_Count, fill = Airline)) +
geom_col(position = "fill", na.rm = TRUE) +
scale_y_continuous(labels = scales::percent) +
labs(title = "Share of Delays by Airline within Each City",
x = "City", y = "Percent of City Delays") +
theme(legend.position = "bottom")
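Because `position = "fill"` normalizes every bar to 100%, it hides how many delays each city contributes overall. A companion plot of the raw counts (a sketch using the same data, not part of the original assignment) makes the volume differences visible:
ggplot(city_share, aes(x = City, y = Delayed_Count, fill = Airline)) +
  geom_col(position = "dodge") +
  labs(title = "Delayed Flights by Airline and City",
       x = "City", y = "Delayed Flights") +
  theme(legend.position = "bottom")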

Step 3: Describe and Explain the Discrepancy
Looking at the totals, Airline A appears slightly worse, accounting for 53.4% of all delays compared to Airline B's 46.6%. But the city-by-city breakdown tells a different story: Airline B has more delays in New York and Denver, Airline A has the larger share in Dallas and LA, and the Miami delays come entirely from Airline B.

The discrepancy arises because overall totals hide where each airline flies the most. Airline A may operate more flights in busy or congested cities, which drives up its total delay count even if it performs better in some places. It is like judging a restaurant chain by total complaints without considering how many locations it has in each area. Statisticians call this Simpson's Paradox: aggregating data across groups can obscure, or even reverse, the trends seen within the individual groups.
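One way to make this concrete (a sketch, not part of the original write-up) is to compute each airline's own mix of delays across cities: Airline A's total is dominated by Dallas and LA, while Airline B's delays are spread more evenly across four cities.
# Each airline's share of its own delays, by city (sketch)
long %>%
  group_by(Airline) %>%
  mutate(Percent_of_Airline = round(100 * Delayed_Count / sum(Delayed_Count), 1)) %>%
  ungroup() %>%
  arrange(Airline, desc(Percent_of_Airline))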