library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ purrr     1.1.0
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)

#Create data in a tibble and clean row of na's with dplyr

flights_wide_data <- tribble(
  ~Airline,   ~Status,    ~`Los Angeles`, ~Phoenix, ~`San Diego`, ~`San Francisco`, ~Seattle,
  "Alaska",   "On Time",  497,            221,      212,          503,              1841,
  "Alaska",   "Delayed",  62,             12,       20,           102,              305,
  NA_character_, NA_character_,  NA_integer_, NA_integer_, NA_integer_,  NA_integer_, NA_integer_,
  "AM WEST",  "On Time",  694,            4840,     383,          320,              201,
  "AM WEST",  "Delayed",  117,            415,      65,           129,              61
)

#removed rows that have completely filled NA values, anything with a single populated value will be kept
flights_wide <- flights_wide_data %>%
  filter(!if_all(everything(), is.na))

flights_wide
## # A tibble: 4 × 7
##   Airline Status  `Los Angeles` Phoenix `San Diego` `San Francisco` Seattle
##   <chr>   <chr>           <dbl>   <dbl>       <dbl>           <dbl>   <dbl>
## 1 Alaska  On Time           497     221         212             503    1841
## 2 Alaska  Delayed            62      12          20             102     305
## 3 AM WEST On Time           694    4840         383             320     201
## 4 AM WEST Delayed           117     415          65             129      61
write.csv(flights_wide, "flights_wide.csv", row.names = FALSE)
flights_wide_gh <- read_csv("https://raw.githubusercontent.com/lher96/MSDS-Assignments/refs/heads/main/flights_wide.csv",
                            show_col_types = FALSE)

#pivot tibble from wide form to a long 
flights_long <- flights_wide_gh %>%
  pivot_longer(cols = c(`Los Angeles`, Phoenix, `San Diego`, `San Francisco`, Seattle),
               names_to = "City", values_to = "Flights")
#Make a total and percent on time column in a table
flights_long_pct<- flights_long %>%
  group_by(Airline, City) %>%
  mutate(
    total_flights = sum(Flights, na.rm = TRUE),
    pct_on_time   = round(sum(Flights[Status == "On Time"], na.rm = TRUE) / total_flights,2)
  )%>%
  ungroup()

flights_long_pct 
## # A tibble: 20 × 6
##    Airline Status  City          Flights total_flights pct_on_time
##    <chr>   <chr>   <chr>           <dbl>         <dbl>       <dbl>
##  1 Alaska  On Time Los Angeles       497           559        0.89
##  2 Alaska  On Time Phoenix           221           233        0.95
##  3 Alaska  On Time San Diego         212           232        0.91
##  4 Alaska  On Time San Francisco     503           605        0.83
##  5 Alaska  On Time Seattle          1841          2146        0.86
##  6 Alaska  Delayed Los Angeles        62           559        0.89
##  7 Alaska  Delayed Phoenix            12           233        0.95
##  8 Alaska  Delayed San Diego          20           232        0.91
##  9 Alaska  Delayed San Francisco     102           605        0.83
## 10 Alaska  Delayed Seattle           305          2146        0.86
## 11 AM WEST On Time Los Angeles       694           811        0.86
## 12 AM WEST On Time Phoenix          4840          5255        0.92
## 13 AM WEST On Time San Diego         383           448        0.85
## 14 AM WEST On Time San Francisco     320           449        0.71
## 15 AM WEST On Time Seattle           201           262        0.77
## 16 AM WEST Delayed Los Angeles       117           811        0.86
## 17 AM WEST Delayed Phoenix           415          5255        0.92
## 18 AM WEST Delayed San Diego          65           448        0.85
## 19 AM WEST Delayed San Francisco     129           449        0.71
## 20 AM WEST Delayed Seattle            61           262        0.77
#plot by airline by city and compare the outcomes 
ggplot(
  filter(flights_long_pct, Status == "On Time"),         
  aes(x = City, y = 1 - pct_on_time, fill = Airline)     
) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = scales::percent_format()) +
  labs(x = "City", y = "% delayed", fill = "Airline",
       title = "Percent Delayed by Airline by City")+
  theme_grey()

#Here we can see that by city Alaska is always having less delays than AM West. It is also clear that because it is consistent, Alaska must also have a higher rate of on time flights.
flights_long_pct_airline<- flights_long %>%
  group_by(Airline) %>%
  mutate(
    total_flights = sum(Flights, na.rm = TRUE),
    pct_on_time   = round(sum(Flights[Status == "On Time"], na.rm = TRUE) / total_flights,3)
  )%>%
  slice_head(n = 1) %>%               
  ungroup() %>%
  select(Airline, total_flights, pct_on_time)


flights_long_pct_airline
## # A tibble: 2 × 3
##   Airline total_flights pct_on_time
##   <chr>           <dbl>       <dbl>
## 1 AM WEST          7225       0.891
## 2 Alaska           3775       0.867
ggplot(flights_long_pct_airline, aes(x = Airline, y = pct_on_time)) +
         geom_col() +
         labs(x= "Airline", y = "On Time Rate")

#group by airline and city to calculate relative weights for locations. 
pct_flights_city_alt <- flights_long %>%
  group_by(Airline, City) %>%
  summarise(Flights = sum(Flights, na.rm = TRUE), .groups = "drop_last") %>% # stays grouped by Airline
  mutate(
    total_airline = sum(Flights, na.rm = TRUE),
    pct_cit       = round(Flights / total_airline, 2)
  ) %>%
  ungroup()

pct_flights_city_alt
## # A tibble: 10 × 5
##    Airline City          Flights total_airline pct_cit
##    <chr>   <chr>           <dbl>         <dbl>   <dbl>
##  1 AM WEST Los Angeles       811          7225    0.11
##  2 AM WEST Phoenix          5255          7225    0.73
##  3 AM WEST San Diego         448          7225    0.06
##  4 AM WEST San Francisco     449          7225    0.06
##  5 AM WEST Seattle           262          7225    0.04
##  6 Alaska  Los Angeles       559          3775    0.15
##  7 Alaska  Phoenix           233          3775    0.06
##  8 Alaska  San Diego         232          3775    0.06
##  9 Alaska  San Francisco     605          3775    0.16
## 10 Alaska  Seattle          2146          3775    0.57
#Here we show that having a higher rate of on time flights in every city, AM West has significantly more flights to Phoenix(73%) where they have the highest rate of on time flights at 92%. This weighting is the reason we see that AM West still manages to have a higher rate of on time flights than Alaska since Alaska has 57% of flights going to Seattle at an on time rate of 86%. This phenomenon is known as Simpson's Paradox which shows that a trend can appear in grouped data but when that data is summed up, that trend might not be realized.