library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.5.1 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Package LibPath
## [1,] "dplyr" "C:/Users/johnf/AppData/Local/R/win-library/4.4"
## [2,] "dplyr" "C:/Users/johnf/AppData/Local/R/win-library/4.4"
## [3,] "dplyr" "C:/Users/johnf/AppData/Local/R/win-library/4.4"
## [4,] "dplyr" "C:/Users/johnf/AppData/Local/R/win-library/4.4"
## [5,] "dplyr" "C:/Users/johnf/AppData/Local/R/win-library/4.4"
## Item Title
## [1,] "band_instruments" "Band membership"
## [2,] "band_instruments2" "Band membership"
## [3,] "band_members" "Band membership"
## [4,] "starwars" "Starwars characters"
## [5,] "storms" "Storm tracks data"
#Alternatvie Github data ingest (for points)
df <-read.table("https://raw.githubusercontent.com/jhnboyy/CUNYSPS_DATA607/refs/heads/main/Week5/flight_data.csv",
header=TRUE,sep=',')
df
## Airline Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AMWEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
## I dont know what the missing data is supposed to be, making the assumption that its formatting that needs fixing
df[2, 'Airline'] <- "ALASKA"
df[5, 'Airline'] <- "AMWEST"
df <-df[!df$Airline == '',]
df
## Airline Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 4 AMWEST on time 694 4840 383 320 201
## 5 AMWEST delayed 117 415 65 129 61
## First parsing out the delayed and the ontime using filter function in dplyr
ontime <- df %>% filter(Status =="on time")
delayed <- df %>% filter(Status =="delayed")
## Using Tidyr to pivot each one longways for analysis
##Citation: https://tidyr.tidyverse.org/reference/pivot_longer.html
long_ontime <- pivot_longer(ontime,
cols = c("Los.Angeles","Phoenix","San.Diego","San.Francisco","Seattle"), names_to = "arrival_city", values_to ="instance_count")
long_delay <- pivot_longer(delayed,
cols = c("Los.Angeles","Phoenix","San.Diego","San.Francisco","Seattle"), names_to = "arrival_city", values_to ="instance_count")
## before placing in long df i want to calc percentages for delays for each city (referenced later on in analysis text)
temp_pct_work <- inner_join(long_ontime,long_delay, by=c("arrival_city"="arrival_city","Airline"="Airline"),su)
temp_pct_work$pct_delay <- round((temp_pct_work$instance_count.y/(temp_pct_work$instance_count.x+temp_pct_work$instance_count.y))*100,2)
temp_pct_work_lim <- temp_pct_work[,c("Airline","arrival_city","pct_delay")]
temp_pct_work_lim
## # A tibble: 10 × 3
## Airline arrival_city pct_delay
## <chr> <chr> <dbl>
## 1 ALASKA Los.Angeles 11.1
## 2 ALASKA Phoenix 5.15
## 3 ALASKA San.Diego 8.62
## 4 ALASKA San.Francisco 16.9
## 5 ALASKA Seattle 14.2
## 6 AMWEST Los.Angeles 14.4
## 7 AMWEST Phoenix 7.9
## 8 AMWEST San.Diego 14.5
## 9 AMWEST San.Francisco 28.7
## 10 AMWEST Seattle 23.3
#Concatenatining them together for one coherent DF
# CITATIOn: https://dplyr.tidyverse.org/reference/bind_rows.html
long_flights <- bind_rows(long_ontime,long_delay)
long_flights
## # A tibble: 20 × 4
## Airline Status arrival_city instance_count
## <chr> <chr> <chr> <int>
## 1 ALASKA on time Los.Angeles 497
## 2 ALASKA on time Phoenix 221
## 3 ALASKA on time San.Diego 212
## 4 ALASKA on time San.Francisco 503
## 5 ALASKA on time Seattle 1841
## 6 AMWEST on time Los.Angeles 694
## 7 AMWEST on time Phoenix 4840
## 8 AMWEST on time San.Diego 383
## 9 AMWEST on time San.Francisco 320
## 10 AMWEST on time Seattle 201
## 11 ALASKA delayed Los.Angeles 62
## 12 ALASKA delayed Phoenix 12
## 13 ALASKA delayed San.Diego 20
## 14 ALASKA delayed San.Francisco 102
## 15 ALASKA delayed Seattle 305
## 16 AMWEST delayed Los.Angeles 117
## 17 AMWEST delayed Phoenix 415
## 18 AMWEST delayed San.Diego 65
## 19 AMWEST delayed San.Francisco 129
## 20 AMWEST delayed Seattle 61
## Mean and Median for Alaska Airlines
alaska_del <- long_flights %>%
filter(Airline=='ALASKA' & Status=='delayed') %>%
summarize(
total_del_count = sum(instance_count),
avg_del_count = mean(instance_count),
med_del_count = median(instance_count)
)
alaska_del$airline = "ALASKA"
alaska_ot <-long_flights %>%
filter(Airline=='ALASKA' & Status=='on time')%>%
summarize(
total_ot_count = sum(instance_count),
avg_ot_count = mean(instance_count),
med_ot_count = median(instance_count)
)
alaska_ot$airline = "ALASKA"
alaska <- merge(alaska_del, alaska_ot, by = "airline")
alaska$pct_del <- (alaska$total_del_count / (alaska$total_del_count + alaska$total_ot_count))*100
alaska
## airline total_del_count avg_del_count med_del_count total_ot_count
## 1 ALASKA 501 100.2 62 3274
## avg_ot_count med_ot_count pct_del
## 1 654.8 497 13.27152
# Mean and Median for AMWEST Airlines
## Mean and Median for Alaska Airlines
amwest_del <- long_flights %>%
filter(Airline=='AMWEST' & Status=='delayed') %>%
summarize(
total_del_count = sum(instance_count),
avg_del_count = mean(instance_count),
med_del_count = median(instance_count)
)
amwest_del$airline = "AMWEST"
amwest_ot <-long_flights %>%
filter(Airline=='AMWEST' & Status=='on time')%>%
summarize(
total_ot_count = sum(instance_count),
avg_ot_count = mean(instance_count),
med_ot_count = median(instance_count)
)
amwest_ot$airline = "AMWEST"
amwest <- merge(amwest_del, amwest_ot, by = "airline")
amwest$pct_del <- (amwest$total_del_count / (alaska$total_del_count + amwest$total_ot_count))*100
amwest
## airline total_del_count avg_del_count med_del_count total_ot_count
## 1 AMWEST 787 157.4 117 6438
## avg_ot_count med_ot_count pct_del
## 1 1287.6 383 11.34169
#Combining both descriptive wide dfs for analysis
stat_table <- bind_rows(alaska,amwest)
stat_table
## airline total_del_count avg_del_count med_del_count total_ot_count
## 1 ALASKA 501 100.2 62 3274
## 2 AMWEST 787 157.4 117 6438
## avg_ot_count med_ot_count pct_del
## 1 654.8 497 13.27152
## 2 1287.6 383 11.34169
When all flights are considered Alaska airlines has a total of 13.27% of arrival delays, which is a bit more than the 11.34 percent arrival delays of AMWest. However, when looking at the COUNT of delays by city on the second plot, the arrival delays vary widely depending on location. The best airline with the lesser amount of delays for arrivals in Los Angeles, Phoenix, and San Diego is Alaska airlines. While AMwest is the best airline for arrivals of the two for in Seatle. Arrival delays are San Francisco are more evenly distributed between the airlines, but AMWEST has a little more of them. Further analysis with percent delays shows the the largest performenace in delays between airlines are in Seattle, SF, and San Diego arrivals. With performance being similar in Phoenix. That being said Alaska has consistantly less percent delays for their flights regardless of city than AMWEST.
#Plotting the main overview stats for relative percent delays performamce
ggplot(data = stat_table, aes(x=airline, y=pct_del))+
geom_bar(stat = "identity")
#Looking at the long df again
head(long_flights)
## # A tibble: 6 × 4
## Airline Status arrival_city instance_count
## <chr> <chr> <chr> <int>
## 1 ALASKA on time Los.Angeles 497
## 2 ALASKA on time Phoenix 221
## 3 ALASKA on time San.Diego 212
## 4 ALASKA on time San.Francisco 503
## 5 ALASKA on time Seattle 1841
## 6 AMWEST on time Los.Angeles 694
ggplot(data = long_flights%>%filter(Status=='delayed'),aes(x=arrival_city,y=instance_count, fill=Airline))+
geom_bar(position="dodge",stat = "identity")+
geom_text(aes(label=instance_count),position = position_dodge(width = 0.8))
amwest$pct_del <- (amwest$total_del_count / (alaska$total_del_count + amwest$total_ot_count))*100
## Charting the earlier table for city-specific delays
ggplot(data =temp_pct_work_lim,aes(x=arrival_city,y=pct_delay, fill=Airline))+
geom_bar(position="dodge", stat="identity")+
geom_text(aes(label=pct_delay),position = position_dodge(width = 0.8))
#Note the labels are off, but can be assumed based on the chart colors and sizes.
Overall, the airlines are very similar in performance when looked at as a percentage of arrival delays compared to all of their respective flights. Alaska airlines does have a few more arriaval delays than AMWEST. However, using percent of total flights that get delayed Alaska performs better in every city. THe degree to which this is true, varies by city. The city where Alaska performs the best relative to AMWEST is San Fransisco, while the discrepency in performance is smallest in arrivales to Phoenix. Cities with a larger discrepency in performance between the two airlines will be those where a passenger will want to choose Alaska airlines over AMwest. In the cities with less of a descrepency, then it matters less.