Reading in the Library

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.5.1     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Looking at the data in the dplyr package

##      Package LibPath                                         
## [1,] "dplyr" "C:/Users/johnf/AppData/Local/R/win-library/4.4"
## [2,] "dplyr" "C:/Users/johnf/AppData/Local/R/win-library/4.4"
## [3,] "dplyr" "C:/Users/johnf/AppData/Local/R/win-library/4.4"
## [4,] "dplyr" "C:/Users/johnf/AppData/Local/R/win-library/4.4"
## [5,] "dplyr" "C:/Users/johnf/AppData/Local/R/win-library/4.4"
##      Item                Title                
## [1,] "band_instruments"  "Band membership"    
## [2,] "band_instruments2" "Band membership"    
## [3,] "band_members"      "Band membership"    
## [4,] "starwars"          "Starwars characters"
## [5,] "storms"            "Storm tracks data"

Reading in the created CSV file

#Alternatvie Github data ingest (for points)
df <-read.table("https://raw.githubusercontent.com/jhnboyy/CUNYSPS_DATA607/refs/heads/main/Week5/flight_data.csv",
                      header=TRUE,sep=',')
df
##   Airline  Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2         delayed          62      12        20           102     305
## 3                          NA      NA        NA            NA      NA
## 4  AMWEST on time         694    4840       383           320     201
## 5         delayed         117     415        65           129      61
## I dont know what the missing data is supposed to be, making the assumption that its formatting that needs fixing
df[2, 'Airline'] <- "ALASKA"
df[5, 'Airline'] <- "AMWEST"
df <-df[!df$Airline == '',]

df
##   Airline  Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2  ALASKA delayed          62      12        20           102     305
## 4  AMWEST on time         694    4840       383           320     201
## 5  AMWEST delayed         117     415        65           129      61

Working to Transform the data in order to plot and compare the delays

## First parsing out the delayed and the ontime using filter function in dplyr
ontime <- df %>% filter(Status =="on time")
delayed <- df %>% filter(Status =="delayed")

## Using Tidyr  to pivot each one longways for analysis

##Citation: https://tidyr.tidyverse.org/reference/pivot_longer.html
long_ontime <- pivot_longer(ontime,
             cols = c("Los.Angeles","Phoenix","San.Diego","San.Francisco","Seattle"), names_to = "arrival_city", values_to ="instance_count")
long_delay <- pivot_longer(delayed,
             cols = c("Los.Angeles","Phoenix","San.Diego","San.Francisco","Seattle"), names_to = "arrival_city", values_to ="instance_count")

## before placing in long df i want to calc percentages for delays for each city (referenced later on in analysis text)
temp_pct_work <- inner_join(long_ontime,long_delay, by=c("arrival_city"="arrival_city","Airline"="Airline"),su)
temp_pct_work$pct_delay <- round((temp_pct_work$instance_count.y/(temp_pct_work$instance_count.x+temp_pct_work$instance_count.y))*100,2)
temp_pct_work_lim <- temp_pct_work[,c("Airline","arrival_city","pct_delay")]
temp_pct_work_lim
## # A tibble: 10 × 3
##    Airline arrival_city  pct_delay
##    <chr>   <chr>             <dbl>
##  1 ALASKA  Los.Angeles       11.1 
##  2 ALASKA  Phoenix            5.15
##  3 ALASKA  San.Diego          8.62
##  4 ALASKA  San.Francisco     16.9 
##  5 ALASKA  Seattle           14.2 
##  6 AMWEST  Los.Angeles       14.4 
##  7 AMWEST  Phoenix            7.9 
##  8 AMWEST  San.Diego         14.5 
##  9 AMWEST  San.Francisco     28.7 
## 10 AMWEST  Seattle           23.3
#Concatenatining them together for one coherent DF 
# CITATIOn: https://dplyr.tidyverse.org/reference/bind_rows.html
long_flights <- bind_rows(long_ontime,long_delay)
long_flights
## # A tibble: 20 × 4
##    Airline Status  arrival_city  instance_count
##    <chr>   <chr>   <chr>                  <int>
##  1 ALASKA  on time Los.Angeles              497
##  2 ALASKA  on time Phoenix                  221
##  3 ALASKA  on time San.Diego                212
##  4 ALASKA  on time San.Francisco            503
##  5 ALASKA  on time Seattle                 1841
##  6 AMWEST  on time Los.Angeles              694
##  7 AMWEST  on time Phoenix                 4840
##  8 AMWEST  on time San.Diego                383
##  9 AMWEST  on time San.Francisco            320
## 10 AMWEST  on time Seattle                  201
## 11 ALASKA  delayed Los.Angeles               62
## 12 ALASKA  delayed Phoenix                   12
## 13 ALASKA  delayed San.Diego                 20
## 14 ALASKA  delayed San.Francisco            102
## 15 ALASKA  delayed Seattle                  305
## 16 AMWEST  delayed Los.Angeles              117
## 17 AMWEST  delayed Phoenix                  415
## 18 AMWEST  delayed San.Diego                 65
## 19 AMWEST  delayed San.Francisco            129
## 20 AMWEST  delayed Seattle                   61

Transformation Part 2 (Alaska)

## Mean and Median for Alaska Airlines
alaska_del <- long_flights %>%
  filter(Airline=='ALASKA' & Status=='delayed') %>%
  summarize(
    total_del_count = sum(instance_count),
    avg_del_count = mean(instance_count),
    med_del_count = median(instance_count)
  ) 

alaska_del$airline = "ALASKA"
  
alaska_ot <-long_flights %>%
  filter(Airline=='ALASKA' & Status=='on time')%>%
  summarize(
    total_ot_count = sum(instance_count),
    avg_ot_count = mean(instance_count),
    med_ot_count = median(instance_count)
  ) 
alaska_ot$airline = "ALASKA"

alaska <- merge(alaska_del, alaska_ot, by = "airline")
alaska$pct_del <- (alaska$total_del_count / (alaska$total_del_count + alaska$total_ot_count))*100

alaska
##   airline total_del_count avg_del_count med_del_count total_ot_count
## 1  ALASKA             501         100.2            62           3274
##   avg_ot_count med_ot_count  pct_del
## 1        654.8          497 13.27152

Transformation Part 2 (AMWEST)

# Mean and Median for AMWEST Airlines
## Mean and Median for Alaska Airlines
amwest_del <- long_flights %>%
  filter(Airline=='AMWEST' & Status=='delayed') %>%
  summarize(
    total_del_count = sum(instance_count),
    avg_del_count = mean(instance_count),
    med_del_count = median(instance_count)
  ) 

amwest_del$airline = "AMWEST"

amwest_ot <-long_flights %>%
  filter(Airline=='AMWEST' & Status=='on time')%>%
  summarize(
    total_ot_count = sum(instance_count),
    avg_ot_count = mean(instance_count),
    med_ot_count = median(instance_count)
  ) 
amwest_ot$airline = "AMWEST"

amwest <- merge(amwest_del, amwest_ot, by = "airline")
amwest$pct_del <- (amwest$total_del_count / (alaska$total_del_count + amwest$total_ot_count))*100

amwest
##   airline total_del_count avg_del_count med_del_count total_ot_count
## 1  AMWEST             787         157.4           117           6438
##   avg_ot_count med_ot_count  pct_del
## 1       1287.6          383 11.34169
#Combining both descriptive wide dfs for analysis
 stat_table <- bind_rows(alaska,amwest)
 stat_table
##   airline total_del_count avg_del_count med_del_count total_ot_count
## 1  ALASKA             501         100.2            62           3274
## 2  AMWEST             787         157.4           117           6438
##   avg_ot_count med_ot_count  pct_del
## 1        654.8          497 13.27152
## 2       1287.6          383 11.34169

Analysis

When all flights are considered Alaska airlines has a total of 13.27% of arrival delays, which is a bit more than the 11.34 percent arrival delays of AMWest. However, when looking at the COUNT of delays by city on the second plot, the arrival delays vary widely depending on location. The best airline with the lesser amount of delays for arrivals in Los Angeles, Phoenix, and San Diego is Alaska airlines. While AMwest is the best airline for arrivals of the two for in Seatle. Arrival delays are San Francisco are more evenly distributed between the airlines, but AMWEST has a little more of them. Further analysis with percent delays shows the the largest performenace in delays between airlines are in Seattle, SF, and San Diego arrivals. With performance being similar in Phoenix. That being said Alaska has consistantly less percent delays for their flights regardless of city than AMWEST.

#Plotting the main overview stats for relative percent delays performamce
ggplot(data = stat_table, aes(x=airline, y=pct_del))+
  geom_bar(stat = "identity")

#Looking at the long df again
head(long_flights)
## # A tibble: 6 × 4
##   Airline Status  arrival_city  instance_count
##   <chr>   <chr>   <chr>                  <int>
## 1 ALASKA  on time Los.Angeles              497
## 2 ALASKA  on time Phoenix                  221
## 3 ALASKA  on time San.Diego                212
## 4 ALASKA  on time San.Francisco            503
## 5 ALASKA  on time Seattle                 1841
## 6 AMWEST  on time Los.Angeles              694
ggplot(data = long_flights%>%filter(Status=='delayed'),aes(x=arrival_city,y=instance_count, fill=Airline))+
  geom_bar(position="dodge",stat = "identity")+
  geom_text(aes(label=instance_count),position = position_dodge(width = 0.8))

amwest$pct_del <- (amwest$total_del_count / (alaska$total_del_count + amwest$total_ot_count))*100

## Charting the earlier table for city-specific delays
ggplot(data =temp_pct_work_lim,aes(x=arrival_city,y=pct_delay, fill=Airline))+
  geom_bar(position="dodge", stat="identity")+
  geom_text(aes(label=pct_delay),position = position_dodge(width = 0.8))

#Note the labels are off, but can be assumed based on the chart colors and sizes. 

Conclusion

Overall, the airlines are very similar in performance when looked at as a percentage of arrival delays compared to all of their respective flights. Alaska airlines does have a few more arriaval delays than AMWEST. However, using percent of total flights that get delayed Alaska performs better in every city. THe degree to which this is true, varies by city. The city where Alaska performs the best relative to AMWEST is San Fransisco, while the discrepency in performance is smallest in arrivales to Phoenix. Cities with a larger discrepency in performance between the two airlines will be those where a passenger will want to choose Alaska airlines over AMwest. In the cities with less of a descrepency, then it matters less.