Introduction

This analysis compare performance of Alaska and Am West airlines across five cities: Los Angeles, Phoenix, San Diego, San Francisco and Seattle. The dataset is tidied from a wide format into a long format.

al <- read.csv('https://raw.githubusercontent.com/vincent-usny/Week5A-607/refs/heads/main/airline.csv', header = FALSE, stringsAsFactors = FALSE)

# manually reorganize data. Remove null.
al_df <- data.frame(
  airline = c('ALASKA','ALASKA','AM WEST','AM WEST'),
  status = c('on_time','delayed','on_time','delayed'),
  Los_Angeles = c(497, 62, 694, 117),
  Phoenix = c(221, 12, 4840, 415),
  San_Diego = c(212, 20, 383, 65),
  San_Francisco = c(503, 102, 320, 129),
  Seattle = c(1841, 305, 201, 61)
)
al_df
##   airline  status Los_Angeles Phoenix San_Diego San_Francisco Seattle
## 1  ALASKA on_time         497     221       212           503    1841
## 2  ALASKA delayed          62      12        20           102     305
## 3 AM WEST on_time         694    4840       383           320     201
## 4 AM WEST delayed         117     415        65           129      61
al_long <- al_df %>%
  pivot_longer(
    cols = Los_Angeles:Seattle,
    names_to = 'City',
    values_to = 'Count'
  )
al_long
## # A tibble: 20 × 4
##    airline status  City          Count
##    <chr>   <chr>   <chr>         <dbl>
##  1 ALASKA  on_time Los_Angeles     497
##  2 ALASKA  on_time Phoenix         221
##  3 ALASKA  on_time San_Diego       212
##  4 ALASKA  on_time San_Francisco   503
##  5 ALASKA  on_time Seattle        1841
##  6 ALASKA  delayed Los_Angeles      62
##  7 ALASKA  delayed Phoenix          12
##  8 ALASKA  delayed San_Diego        20
##  9 ALASKA  delayed San_Francisco   102
## 10 ALASKA  delayed Seattle         305
## 11 AM WEST on_time Los_Angeles     694
## 12 AM WEST on_time Phoenix        4840
## 13 AM WEST on_time San_Diego       383
## 14 AM WEST on_time San_Francisco   320
## 15 AM WEST on_time Seattle         201
## 16 AM WEST delayed Los_Angeles     117
## 17 AM WEST delayed Phoenix         415
## 18 AM WEST delayed San_Diego        65
## 19 AM WEST delayed San_Francisco   129
## 20 AM WEST delayed Seattle          61
# create a table for percent of delayed or on time
# for both airlines
percent_table <- al_long %>%
  group_by(airline, status) %>%
  summarise(total = sum(Count)) %>%
  pivot_wider(names_from = status, values_from = total) %>%
  mutate(
    percent_delayed = delayed / (delayed + on_time) * 100,
    percent_ontime = on_time /(delayed + on_time)* 100
  )
## `summarise()` has grouped output by 'airline'. You can override using the
## `.groups` argument.
percent_table
## # A tibble: 2 × 5
## # Groups:   airline [2]
##   airline delayed on_time percent_delayed percent_ontime
##   <chr>     <dbl>   <dbl>           <dbl>          <dbl>
## 1 ALASKA      501    3274            13.3           86.7
## 2 AM WEST     787    6438            10.9           89.1
# plot for comparison of both airlines
ggplot(data = percent_table, aes(x = airline, y = percent_delayed, fill = airline)) + geom_bar(stat = 'identity', width = 0.5) + ylab('percent_delayed') + xlab('Airline') + ggtitle('Percent Delayed by Airlines')

# Overall, AM West performed slightly better than Alaska
# with a lower delay rate, which suggests Am west is more
# reliable.
# Plot for comparison of both airlines across cities
city_percent <- al_long %>%
  group_by(airline, status, City) %>%
  summarise(total = sum(Count), .group = 'drop') %>%
  pivot_wider(
    names_from = status,
    values_from = total
  ) %>%
  mutate(percent_city_delayed = delayed / (delayed + on_time))
## `summarise()` has grouped output by 'airline', 'status'. You can override using
## the `.groups` argument.
  city_percent
## # A tibble: 10 × 6
## # Groups:   airline [2]
##    airline City          .group delayed on_time percent_city_delayed
##    <chr>   <chr>         <chr>    <dbl>   <dbl>                <dbl>
##  1 ALASKA  Los_Angeles   drop        62     497               0.111 
##  2 ALASKA  Phoenix       drop        12     221               0.0515
##  3 ALASKA  San_Diego     drop        20     212               0.0862
##  4 ALASKA  San_Francisco drop       102     503               0.169 
##  5 ALASKA  Seattle       drop       305    1841               0.142 
##  6 AM WEST Los_Angeles   drop       117     694               0.144 
##  7 AM WEST Phoenix       drop       415    4840               0.0790
##  8 AM WEST San_Diego     drop        65     383               0.145 
##  9 AM WEST San_Francisco drop       129     320               0.287 
## 10 AM WEST Seattle       drop        61     201               0.233
# Both airlines perform best in Phoenix. SF and Seattle 
# are worst for both, while AM is higher that Alaska
# (28.7% and 23.3%) vs (16.9% and 14.2%).
# Overall, AM is slightly better on average, but Alaska
# has lower delayed rate across cities.
ggplot(city_percent, aes(x = City, y = percent_city_delayed, fill = airline)) + geom_bar(stat = 'identity', position = 'dodge', width = 0.7) + ylab('percent_city_delayed') + xlab('City')+ ggtitle('Percent Delayed Across Cities')

Conclusion

Although Am West performs slightly better than Alaska on average, Alaska has lower delayed rate on each destination individually, especially SF and Seattle. However, in this dataset, the sample size of Alaska is only 3775 while Am West is 7225. More data is needed for Alaska in order to compare more fairly.