Instructions

Week 5 Assignment – recreate a wide (untidy) file with a missing cell, tidy/transform, and compare percentages overall and across five cities for two airlines. Publish code + narrative. Links: RPubs: https://rpubs.com/skapoor13/1348845 | GitHub: https://github.com/sachikapoor99-byte/week5-airlines-baseR

Setup

Data Sources

Load Untidy Data (WIDE)

# We'll create+save two wide CSVs (rates, flights). Missing cell included by design.
airline_rates_wide <- tibble::tribble(
  ~Airline, ~NYC_rate, ~CHI_rate, ~SFO_rate, ~HOU_rate, ~PHX_rate,
  "A",         0.95,      0.90,      0.60,      0.60,      0.70,
  "B",         0.93,      0.88,      0.65,      0.62,        NA
)

airline_flights_wide <- tibble::tribble(
  ~Airline, ~NYC_flights, ~CHI_flights, ~SFO_flights, ~HOU_flights, ~PHX_flights,
  "A",          50,           50,           1000,          800,          20,
  "B",          500,          400,            50,           50,         100
)

 # Save to CSV (for rubric: submitted data file + recreated format)
readr::write_csv(airline_rates_wide,  "airline_rates_wide.csv")
readr::write_csv(airline_flights_wide,"airline_flights_wide.csv")

# Read as "raw" inputs
df_rates_raw   <- readr::read_csv("airline_rates_wide.csv", show_col_types = FALSE)
df_flights_raw <- readr::read_csv("airline_flights_wide.csv", show_col_types = FALSE)

# Quick peek
list(rates_head = head(df_rates_raw),
     flights_head = head(df_flights_raw))

## $rates_head
## # A tibble: 2 × 6
##   Airline NYC_rate CHI_rate SFO_rate HOU_rate PHX_rate
##   <chr>      <dbl>    <dbl>    <dbl>    <dbl>    <dbl>
## 1 A           0.95     0.9      0.6      0.6       0.7
## 2 B           0.93     0.88     0.65     0.62     NA  
## 
## $flights_head
## # A tibble: 2 × 6
##   Airline NYC_flights CHI_flights SFO_flights HOU_flights PHX_flights
##   <chr>         <dbl>       <dbl>       <dbl>       <dbl>       <dbl>
## 1 A                50          50        1000         800          20
## 2 B               500         400          50          50         100

Why This Is Untidy

This is untidy because each city is a separate column (a value of the “city” variable) instead of a row value; measures are also split (rates vs. flights), and there is a deliberate missing cell (PHX_rate for Airline B).

Tidy Transform (turn on later)

# Pivot wide→long, impute the single missing rate, and join with flights
rates_long <- df_rates_raw |>
tidyr::pivot_longer(cols = -Airline, names_to = "city", values_to = "on_time_rate") |>
  dplyr::mutate(city = sub("_rate$", "", city)) |>
  dplyr::group_by(city) |>
  dplyr::mutate(on_time_rate = dplyr::if_else(
    is.na(on_time_rate),
    mean(on_time_rate, na.rm = TRUE),   # simple, transparent city-mean imputation
    on_time_rate
  )) |>
  dplyr::ungroup()

flights_long <- df_flights_raw |>
  tidyr::pivot_longer(cols = -Airline, names_to = "city", values_to = "flights") |>
  dplyr::mutate(city = sub("_flights$", "", city))

tidy_df <- rates_long |>
  dplyr::inner_join(flights_long, by = c("Airline","city")) |>
  dplyr::mutate(
    on_time = round(on_time_rate * flights),
    delayed = pmax(flights - on_time, 0L)
  )

head(tidy_df)

## # A tibble: 6 × 6
##   Airline city  on_time_rate flights on_time delayed
##   <chr>   <chr>        <dbl>   <dbl>   <dbl>   <dbl>
## 1 A       NYC           0.95      50      48       2
## 2 A       CHI           0.9       50      45       5
## 3 A       SFO           0.6     1000     600     400
## 4 A       HOU           0.6      800     480     320
## 5 A       PHX           0.7       20      14       6
## 6 B       NYC           0.93     500     465      35

Analysis (turn on later)

# Overall comparison (percentages AND counts)
overall <- tidy_df |>
  dplyr::group_by(Airline) |>
  dplyr::summarise(
    flights_total = sum(flights),
    on_time_total = sum(on_time),
    on_time_pct   = on_time_total / flights_total
  ) |>
  dplyr::arrange(dplyr::desc(on_time_pct))

overall

## # A tibble: 2 × 4
##   Airline flights_total on_time_total on_time_pct
##   <chr>           <dbl>         <dbl>       <dbl>
## 1 B                1100           950       0.864
## 2 A                1920          1187       0.618

overall_msg <- overall |>
  dplyr::mutate(msg = sprintf(
    "Airline %s: %s on-time (%s/%s flights)",
    Airline,
    scales::percent(on_time_pct, accuracy = 0.1),
    scales::comma(on_time_total),
    scales::comma(flights_total)
  )) |>
  dplyr::pull(msg)

overall_msg

## [1] "Airline B: 86.4% on-time (950/1,100 flights)"  
## [2] "Airline A: 61.8% on-time (1,187/1,920 flights)"

# City-by-city comparison (percentages)
by_city <- tidy_df |>
  dplyr::group_by(city, Airline) |>
  dplyr::summarise(
    flights = sum(flights),
    on_time = sum(on_time),
    on_time_pct = on_time / flights,
    .groups = "drop"
  ) |>
  dplyr::arrange(city, dplyr::desc(on_time_pct))

by_city

## # A tibble: 10 × 5
##    city  Airline flights on_time on_time_pct
##    <chr> <chr>     <dbl>   <dbl>       <dbl>
##  1 CHI   A            50      45        0.9 
##  2 CHI   B           400     352        0.88
##  3 HOU   B            50      31        0.62
##  4 HOU   A           800     480        0.6 
##  5 NYC   A            50      48        0.96
##  6 NYC   B           500     465        0.93
##  7 PHX   A            20      14        0.7 
##  8 PHX   B           100      70        0.7 
##  9 SFO   B            50      32        0.64
## 10 SFO   A          1000     600        0.6

# Charts
overall |>
  ggplot2::ggplot(ggplot2::aes(Airline, on_time_pct)) +
  ggplot2::geom_col() +
  ggplot2::geom_text(ggplot2::aes(label = scales::percent(on_time_pct, 0.1)), vjust = -0.3) +
  ggplot2::labs(title = "Overall On-time Percentage by Airline", y = "On-time %", x = NULL) +
  ggplot2::ylim(0, 1)

by_city |>
  ggplot2::ggplot(ggplot2::aes(city, on_time_pct, fill = Airline)) +
  ggplot2::geom_col(position = ggplot2::position_dodge(width = 0.8)) +
  ggplot2::geom_text(ggplot2::aes(label = scales::percent(on_time_pct, 0.1)),
                     position = ggplot2::position_dodge(width = 0.8), vjust = -0.3, size = 3) +
  ggplot2::labs(title = "On-time Percentage by City and Airline", x = "City", y = "On-time %") +
  ggplot2::ylim(0, 1)

Results & Discussion

Summary: Overall, Airline B’s on-time percentage edges out Airline A when weighted by total flights, but city-by-city results are mixed (A leads in some cities; B leads in others).
Discrepancy (what): The overall ranking doesn’t match several city-level rankings because each airline flies very different volumes per city.
Explanation (why): This is a composition (Simpson’s paradox) effect: the overall rate is a weighted average of city rates, so an airline with more flights concentrated in its stronger cities can win overall even if it loses in other cities. We reported both percentages and counts to make the comparison transparent.

# Appendix

sessionInfo()

## R version 4.5.1 (2025-06-13)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 20.04.6 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3;  LAPACK version 3.9.0
## 
## locale:
##  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
##  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
##  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
## [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
## 
## time zone: UTC
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] janitor_2.2.1   lubridate_1.9.4 forcats_1.0.1   stringr_1.5.2  
##  [5] dplyr_1.1.4     purrr_1.1.0     readr_2.1.5     tidyr_1.3.1    
##  [9] tibble_3.3.0    ggplot2_4.0.0   tidyverse_2.0.0
## 
## loaded via a namespace (and not attached):
##  [1] bit_4.6.0          gtable_0.3.6       jsonlite_2.0.0     crayon_1.5.3      
##  [5] compiler_4.5.1     tidyselect_1.2.1   parallel_4.5.1     snakecase_0.11.1  
##  [9] jquerylib_0.1.4    scales_1.4.0       yaml_2.3.10        fastmap_1.2.0     
## [13] R6_2.6.1           labeling_0.4.3     generics_0.1.4     knitr_1.50        
## [17] bslib_0.9.0        pillar_1.11.1      RColorBrewer_1.1-3 tzdb_0.5.0        
## [21] rlang_1.1.6        utf8_1.2.6         stringi_1.8.7      cachem_1.1.0      
## [25] xfun_0.53          sass_0.4.10        S7_0.2.0           bit64_4.6.0-1     
## [29] timechange_0.3.0   cli_3.6.5          withr_3.0.2        magrittr_2.0.4    
## [33] digest_0.6.37      grid_4.5.1         vroom_1.6.6        rstudioapi_0.17.1 
## [37] hms_1.1.3          lifecycle_1.0.4    vctrs_0.6.5        evaluate_1.0.5    
## [41] glue_1.8.0         farver_2.1.2       rmarkdown_2.29     tools_4.5.1       
## [45] pkgconfig_2.0.3    htmltools_0.5.8.1

Week 5 Assignment

Sachi Kapoor

2025-09-26