library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ purrr 1.1.0
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
#Create data in a tibble and clean row of na's with dplyr
flights_wide_data <- tribble(
~Airline, ~Status, ~`Los Angeles`, ~Phoenix, ~`San Diego`, ~`San Francisco`, ~Seattle,
"Alaska", "On Time", 497, 221, 212, 503, 1841,
"Alaska", "Delayed", 62, 12, 20, 102, 305,
NA_character_, NA_character_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
"AM WEST", "On Time", 694, 4840, 383, 320, 201,
"AM WEST", "Delayed", 117, 415, 65, 129, 61
)
#removed rows that have completely filled NA values, anything with a single populated value will be kept
flights_wide <- flights_wide_data %>%
filter(!if_all(everything(), is.na))
flights_wide
## # A tibble: 4 × 7
## Airline Status `Los Angeles` Phoenix `San Diego` `San Francisco` Seattle
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Alaska On Time 497 221 212 503 1841
## 2 Alaska Delayed 62 12 20 102 305
## 3 AM WEST On Time 694 4840 383 320 201
## 4 AM WEST Delayed 117 415 65 129 61
write.csv(flights_wide, "flights_wide.csv", row.names = FALSE)
flights_wide_gh <- read_csv("https://raw.githubusercontent.com/lher96/MSDS-Assignments/refs/heads/main/flights_wide.csv",
show_col_types = FALSE)
#pivot tibble from wide form to a long
flights_long <- flights_wide_gh %>%
pivot_longer(cols = c(`Los Angeles`, Phoenix, `San Diego`, `San Francisco`, Seattle),
names_to = "City", values_to = "Flights")
#Make a total and percent on time column in a table
flights_long_pct<- flights_long %>%
group_by(Airline, City) %>%
mutate(
total_flights = sum(Flights, na.rm = TRUE),
pct_on_time = round(sum(Flights[Status == "On Time"], na.rm = TRUE) / total_flights,2)
)%>%
ungroup()
flights_long_pct
## # A tibble: 20 × 6
## Airline Status City Flights total_flights pct_on_time
## <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 Alaska On Time Los Angeles 497 559 0.89
## 2 Alaska On Time Phoenix 221 233 0.95
## 3 Alaska On Time San Diego 212 232 0.91
## 4 Alaska On Time San Francisco 503 605 0.83
## 5 Alaska On Time Seattle 1841 2146 0.86
## 6 Alaska Delayed Los Angeles 62 559 0.89
## 7 Alaska Delayed Phoenix 12 233 0.95
## 8 Alaska Delayed San Diego 20 232 0.91
## 9 Alaska Delayed San Francisco 102 605 0.83
## 10 Alaska Delayed Seattle 305 2146 0.86
## 11 AM WEST On Time Los Angeles 694 811 0.86
## 12 AM WEST On Time Phoenix 4840 5255 0.92
## 13 AM WEST On Time San Diego 383 448 0.85
## 14 AM WEST On Time San Francisco 320 449 0.71
## 15 AM WEST On Time Seattle 201 262 0.77
## 16 AM WEST Delayed Los Angeles 117 811 0.86
## 17 AM WEST Delayed Phoenix 415 5255 0.92
## 18 AM WEST Delayed San Diego 65 448 0.85
## 19 AM WEST Delayed San Francisco 129 449 0.71
## 20 AM WEST Delayed Seattle 61 262 0.77
#plot by airline by city and compare the outcomes
ggplot(
filter(flights_long_pct, Status == "On Time"),
aes(x = City, y = 1 - pct_on_time, fill = Airline)
) +
geom_col(position = "dodge") +
scale_y_continuous(labels = scales::percent_format()) +
labs(x = "City", y = "% delayed", fill = "Airline",
title = "Percent Delayed by Airline by City")+
theme_grey()

#Here we can see that by city Alaska is always having less delays than AM West. It is also clear that because it is consistent, Alaska must also have a higher rate of on time flights.
flights_long_pct_airline<- flights_long %>%
group_by(Airline) %>%
mutate(
total_flights = sum(Flights, na.rm = TRUE),
pct_on_time = round(sum(Flights[Status == "On Time"], na.rm = TRUE) / total_flights,3)
)%>%
slice_head(n = 1) %>%
ungroup() %>%
select(Airline, total_flights, pct_on_time)
flights_long_pct_airline
## # A tibble: 2 × 3
## Airline total_flights pct_on_time
## <chr> <dbl> <dbl>
## 1 AM WEST 7225 0.891
## 2 Alaska 3775 0.867
ggplot(flights_long_pct_airline, aes(x = Airline, y = pct_on_time)) +
geom_col() +
labs(x= "Airline", y = "On Time Rate")

#group by airline and city to calculate relative weights for locations.
pct_flights_city_alt <- flights_long %>%
group_by(Airline, City) %>%
summarise(Flights = sum(Flights, na.rm = TRUE), .groups = "drop_last") %>% # stays grouped by Airline
mutate(
total_airline = sum(Flights, na.rm = TRUE),
pct_cit = round(Flights / total_airline, 2)
) %>%
ungroup()
pct_flights_city_alt
## # A tibble: 10 × 5
## Airline City Flights total_airline pct_cit
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 AM WEST Los Angeles 811 7225 0.11
## 2 AM WEST Phoenix 5255 7225 0.73
## 3 AM WEST San Diego 448 7225 0.06
## 4 AM WEST San Francisco 449 7225 0.06
## 5 AM WEST Seattle 262 7225 0.04
## 6 Alaska Los Angeles 559 3775 0.15
## 7 Alaska Phoenix 233 3775 0.06
## 8 Alaska San Diego 232 3775 0.06
## 9 Alaska San Francisco 605 3775 0.16
## 10 Alaska Seattle 2146 3775 0.57
#Here we show that having a higher rate of on time flights in every city, AM West has significantly more flights to Phoenix(73%) where they have the highest rate of on time flights at 92%. This weighting is the reason we see that AM West still manages to have a higher rate of on time flights than Alaska since Alaska has 57% of flights going to Seattle at an on time rate of 86%. This phenomenon is known as Simpson's Paradox which shows that a trend can appear in grouped data but when that data is summed up, that trend might not be realized.