# R setup
library(tidyverse) # dplyr, tidyr, ggplot2, readr
library(DBI) # database interface
# library(RPostgres) # uncomment after install.packages("RPostgres")
theme_set(theme_minimal(base_size = 12))
# Helper: confirm working directory contains the CSV
list.files()
## [1] "airline_delays_wide.csv" "assignment_5A_airline_delays.html"
## [3] "assignment_5A_airline_delays.rmd" "week05_airline_delays.Rproj"
R: Load & tidy the wide CSV
library(tibble)
wide <- tribble(
~Airline, ~`New York`, ~Dallas, ~Denver, ~Miami, ~LA,
"Airline A", 12, 25, 19, NA, 30,
"Airline B", 20, 15, 22, 18, NA
)
wide
## # A tibble: 2 × 6
## Airline `New York` Dallas Denver Miami LA
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Airline A 12 25 19 NA 30
## 2 Airline B 20 15 22 18 NA
library(readr)
library(stringr)
# Keep the fallback 'wide' you created above
fallback_tbl <- wide
# Try to load a multi-column CSV; if it fails, fall back to the tribble
load_wide <- function(path) {
if (!file.exists(path)) return(fallback_tbl)
x <- try(read_csv(path, na = c("", "NA"), show_col_types = FALSE), silent = TRUE)
if (inherits(x, "try-error") || is.null(x) || ncol(x) <= 1) return(fallback_tbl)
# clean header quirks
names(x) <- str_trim(names(x))
names(x)[1] <- sub("^\ufeff", "", names(x)[1]) # strip BOM if present
x
}
wide <- load_wide("airline_delays_wide.csv")
# Clean numeric NAs to 0 for analysis (keep original if you want to preserve blanks)
wide_clean <- wide %>% mutate(across(where(is.numeric), ~ tidyr::replace_na(.x, 0)))
wide_clean
## # A tibble: 2 × 6
## Airline `New York` Dallas Denver Miami LA
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Airline A 12 25 19 0 30
## 2 Airline B 20 15 22 18 0
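A one-line check (not part of the original chunk) can confirm the replacement removed every missing value before pivoting:
# Sanity check: no NAs should remain after replace_na()
stopifnot(!anyNA(wide_clean))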
2) Replace your wide-to-long chunk with this safer pivot
first_col <- names(wide_clean)[1] # whatever your first column is called
long <- wide_clean %>%
pivot_longer(cols = -all_of(first_col),
names_to = "City",
values_to = "Delayed_Count") %>%
rename(Airline = !!first_col)
long
## # A tibble: 10 × 3
## Airline City Delayed_Count
## <chr> <chr> <dbl>
## 1 Airline A New York 12
## 2 Airline A Dallas 25
## 3 Airline A Denver 19
## 4 Airline A Miami 0
## 5 Airline A LA 30
## 6 Airline B New York 20
## 7 Airline B Dallas 15
## 8 Airline B Denver 22
## 9 Airline B Miami 18
## 10 Airline B LA 0
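As a quick sanity check (a sketch, not in the original output), the long table should contain one row per airline-city pair and the same grand total of delays as the cleaned wide table:
# One row per airline-city combination
stopifnot(nrow(long) == nrow(wide_clean) * (ncol(wide_clean) - 1))
# Grand total of delays is unchanged by the pivot
# (assumes every column after the first holds numeric counts)
stopifnot(isTRUE(all.equal(sum(long$Delayed_Count), sum(wide_clean[, -1]))))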
Overall share of delays by airline
| Airline   | Total_Delays | Percent_of_Total |
|-----------|--------------|------------------|
| Airline A | 86           | 53.4             |
| Airline B | 75           | 46.6             |

Summary: Airline A = 53.4%, Airline B = 46.6%.
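The chunk that produced the table above is not shown; a minimal sketch that reproduces it from the `long` table (the column names Total_Delays and Percent_of_Total are my own labels) would be:
# Overall delays and share of all delays, per airline (sketch)
overall_share <- long %>%
  group_by(Airline) %>%
  summarise(Total_Delays = sum(Delayed_Count), .groups = "drop") %>%
  mutate(Percent_of_Total = round(100 * Total_Delays / sum(Total_Delays), 1))
overall_share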
Share of delays within each city
city_share <- long %>%
group_by(City) %>%
mutate(Total_City_Delays = sum(Delayed_Count)) %>%
ungroup() %>%
mutate(Percent_of_City = ifelse(Total_City_Delays > 0,
round(100 * Delayed_Count / Total_City_Delays, 1),
NA_real_))
city_share %>% arrange(City, desc(Percent_of_City))
## # A tibble: 10 × 5
## Airline City Delayed_Count Total_City_Delays Percent_of_City
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Airline A Dallas 25 40 62.5
## 2 Airline B Dallas 15 40 37.5
## 3 Airline B Denver 22 41 53.7
## 4 Airline A Denver 19 41 46.3
## 5 Airline A LA 30 30 100
## 6 Airline B LA 0 30 0
## 7 Airline B Miami 18 18 100
## 8 Airline A Miami 0 18 0
## 9 Airline B New York 20 32 62.5
## 10 Airline A New York 12 32 37.5
ggplot(city_share, aes(x = City, y = Delayed_Count, fill = Airline)) +
geom_col(position = "fill", na.rm = TRUE) +
scale_y_continuous(labels = scales::percent) +
labs(title = "Share of Delays by Airline within Each City",
x = "City", y = "Percent of City Delays") +
theme(legend.position = "bottom")
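Because `position = "fill"` normalizes every bar to 100%, it hides how many delays each city contributes overall. A companion plot of the raw counts (a sketch using the same data, not part of the original assignment) makes the volume differences visible:
ggplot(city_share, aes(x = City, y = Delayed_Count, fill = Airline)) +
  geom_col(position = "dodge") +
  labs(title = "Delayed Flights by Airline and City",
       x = "City", y = "Delayed Flights") +
  theme(legend.position = "bottom")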

Step 3: Describe and Explain the Discrepancy
Looking at the totals, Airline A appears slightly worse, accounting for 53.4% of all delays compared to Airline B's 46.6%. But the city-by-city breakdown tells a different story: Airline B has more delays in New York and Denver, Airline A has the larger share in Dallas and LA, and the Miami delays come entirely from Airline B.

The discrepancy arises because overall totals hide where each airline flies the most. Airline A may operate more flights in busy or congested cities, which drives up its total delay count even if it performs better in some places. It is like judging a restaurant chain by total complaints without considering how many locations it has in each area. Statisticians call this Simpson's Paradox: aggregating data across groups can obscure, or even reverse, the trends seen within the individual groups.
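One way to make this concrete (a sketch, not part of the original write-up) is to compute each airline's own mix of delays across cities: Airline A's total is dominated by Dallas and LA, while Airline B's delays are spread more evenly across four cities.
# Each airline's share of its own delays, by city (sketch)
long %>%
  group_by(Airline) %>%
  mutate(Percent_of_Airline = round(100 * Delayed_Count / sum(Delayed_Count), 1)) %>%
  ungroup() %>%
  arrange(Airline, desc(Percent_of_Airline))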