Week 5 Assignment – recreate a wide (untidy) file with a missing cell, tidy/transform, and compare percentages overall and across five cities for two airlines. Publish code + narrative. Links: RPubs: https://rpubs.com/skapoor13/1348845 | GitHub: https://github.com/sachikapoor99-byte/week5-airlines-baseR
# We'll create+save two wide CSVs (rates, flights). Missing cell included by design.
airline_rates_wide <- tibble::tribble(
~Airline, ~NYC_rate, ~CHI_rate, ~SFO_rate, ~HOU_rate, ~PHX_rate,
"A", 0.95, 0.90, 0.60, 0.60, 0.70,
"B", 0.93, 0.88, 0.65, 0.62, NA
)
airline_flights_wide <- tibble::tribble(
~Airline, ~NYC_flights, ~CHI_flights, ~SFO_flights, ~HOU_flights, ~PHX_flights,
"A", 50, 50, 1000, 800, 20,
"B", 500, 400, 50, 50, 100
)
# Save to CSV (for rubric: submitted data file + recreated format)
readr::write_csv(airline_rates_wide, "airline_rates_wide.csv")
readr::write_csv(airline_flights_wide,"airline_flights_wide.csv")
# Read as "raw" inputs
df_rates_raw <- readr::read_csv("airline_rates_wide.csv", show_col_types = FALSE)
df_flights_raw <- readr::read_csv("airline_flights_wide.csv", show_col_types = FALSE)
# Quick peek
list(rates_head = head(df_rates_raw),
flights_head = head(df_flights_raw))
## $rates_head
## # A tibble: 2 × 6
## Airline NYC_rate CHI_rate SFO_rate HOU_rate PHX_rate
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 A 0.95 0.9 0.6 0.6 0.7
## 2 B 0.93 0.88 0.65 0.62 NA
##
## $flights_head
## # A tibble: 2 × 6
## Airline NYC_flights CHI_flights SFO_flights HOU_flights PHX_flights
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 A 50 50 1000 800 20
## 2 B 500 400 50 50 100
This is untidy because each city is a separate column (a value of the “city” variable) instead of a row value; measures are also split (rates vs. flights), and there is a deliberate missing cell (PHX_rate for Airline B).
# Pivot wide→long, impute the single missing rate, and join with flights
rates_long <- df_rates_raw |>
tidyr::pivot_longer(cols = -Airline, names_to = "city", values_to = "on_time_rate") |>
dplyr::mutate(city = sub("_rate$", "", city)) |>
dplyr::group_by(city) |>
dplyr::mutate(on_time_rate = dplyr::if_else(
is.na(on_time_rate),
mean(on_time_rate, na.rm = TRUE), # simple, transparent city-mean imputation
on_time_rate
)) |>
dplyr::ungroup()
flights_long <- df_flights_raw |>
tidyr::pivot_longer(cols = -Airline, names_to = "city", values_to = "flights") |>
dplyr::mutate(city = sub("_flights$", "", city))
tidy_df <- rates_long |>
dplyr::inner_join(flights_long, by = c("Airline","city")) |>
dplyr::mutate(
on_time = round(on_time_rate * flights),
delayed = pmax(flights - on_time, 0L)
)
head(tidy_df)
## # A tibble: 6 × 6
## Airline city on_time_rate flights on_time delayed
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 A NYC 0.95 50 48 2
## 2 A CHI 0.9 50 45 5
## 3 A SFO 0.6 1000 600 400
## 4 A HOU 0.6 800 480 320
## 5 A PHX 0.7 20 14 6
## 6 B NYC 0.93 500 465 35
# Overall comparison (percentages AND counts)
overall <- tidy_df |>
dplyr::group_by(Airline) |>
dplyr::summarise(
flights_total = sum(flights),
on_time_total = sum(on_time),
on_time_pct = on_time_total / flights_total
) |>
dplyr::arrange(dplyr::desc(on_time_pct))
overall
## # A tibble: 2 × 4
## Airline flights_total on_time_total on_time_pct
## <chr> <dbl> <dbl> <dbl>
## 1 B 1100 950 0.864
## 2 A 1920 1187 0.618
overall_msg <- overall |>
dplyr::mutate(msg = sprintf(
"Airline %s: %s on-time (%s/%s flights)",
Airline,
scales::percent(on_time_pct, accuracy = 0.1),
scales::comma(on_time_total),
scales::comma(flights_total)
)) |>
dplyr::pull(msg)
overall_msg
## [1] "Airline B: 86.4% on-time (950/1,100 flights)"
## [2] "Airline A: 61.8% on-time (1,187/1,920 flights)"
# City-by-city comparison (percentages)
by_city <- tidy_df |>
dplyr::group_by(city, Airline) |>
dplyr::summarise(
flights = sum(flights),
on_time = sum(on_time),
on_time_pct = on_time / flights,
.groups = "drop"
) |>
dplyr::arrange(city, dplyr::desc(on_time_pct))
by_city
## # A tibble: 10 × 5
## city Airline flights on_time on_time_pct
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 CHI A 50 45 0.9
## 2 CHI B 400 352 0.88
## 3 HOU B 50 31 0.62
## 4 HOU A 800 480 0.6
## 5 NYC A 50 48 0.96
## 6 NYC B 500 465 0.93
## 7 PHX A 20 14 0.7
## 8 PHX B 100 70 0.7
## 9 SFO B 50 32 0.64
## 10 SFO A 1000 600 0.6
# Charts
overall |>
ggplot2::ggplot(ggplot2::aes(Airline, on_time_pct)) +
ggplot2::geom_col() +
ggplot2::geom_text(ggplot2::aes(label = scales::percent(on_time_pct, 0.1)), vjust = -0.3) +
ggplot2::labs(title = "Overall On-time Percentage by Airline", y = "On-time %", x = NULL) +
ggplot2::ylim(0, 1)
by_city |>
ggplot2::ggplot(ggplot2::aes(city, on_time_pct, fill = Airline)) +
ggplot2::geom_col(position = ggplot2::position_dodge(width = 0.8)) +
ggplot2::geom_text(ggplot2::aes(label = scales::percent(on_time_pct, 0.1)),
position = ggplot2::position_dodge(width = 0.8), vjust = -0.3, size = 3) +
ggplot2::labs(title = "On-time Percentage by City and Airline", x = "City", y = "On-time %") +
ggplot2::ylim(0, 1)
Summary: Overall, Airline B’s on-time percentage
edges out Airline A when weighted by total flights, but city-by-city
results are mixed (A leads in some cities; B leads in others).
Discrepancy (what): The overall ranking doesn’t match
several city-level rankings because each airline flies very different
volumes per city.
Explanation (why): This is a composition (Simpson’s
paradox) effect: the overall rate is a weighted average of city rates,
so an airline with more flights concentrated in its stronger cities can
win overall even if it loses in other cities. We reported both
percentages and counts to make the comparison transparent.
# Appendix
sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 20.04.6 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3; LAPACK version 3.9.0
##
## locale:
## [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
## [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
## [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
##
## time zone: UTC
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] janitor_2.2.1 lubridate_1.9.4 forcats_1.0.1 stringr_1.5.2
## [5] dplyr_1.1.4 purrr_1.1.0 readr_2.1.5 tidyr_1.3.1
## [9] tibble_3.3.0 ggplot2_4.0.0 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] bit_4.6.0 gtable_0.3.6 jsonlite_2.0.0 crayon_1.5.3
## [5] compiler_4.5.1 tidyselect_1.2.1 parallel_4.5.1 snakecase_0.11.1
## [9] jquerylib_0.1.4 scales_1.4.0 yaml_2.3.10 fastmap_1.2.0
## [13] R6_2.6.1 labeling_0.4.3 generics_0.1.4 knitr_1.50
## [17] bslib_0.9.0 pillar_1.11.1 RColorBrewer_1.1-3 tzdb_0.5.0
## [21] rlang_1.1.6 utf8_1.2.6 stringi_1.8.7 cachem_1.1.0
## [25] xfun_0.53 sass_0.4.10 S7_0.2.0 bit64_4.6.0-1
## [29] timechange_0.3.0 cli_3.6.5 withr_3.0.2 magrittr_2.0.4
## [33] digest_0.6.37 grid_4.5.1 vroom_1.6.6 rstudioapi_0.17.1
## [37] hms_1.1.3 lifecycle_1.0.4 vctrs_0.6.5 evaluate_1.0.5
## [41] glue_1.8.0 farver_2.1.2 rmarkdown_2.29 tools_4.5.1
## [45] pkgconfig_2.0.3 htmltools_0.5.8.1