This analysis compare performance of Alaska and Am West airlines across five cities: Los Angeles, Phoenix, San Diego, San Francisco and Seattle. The dataset is tidied from a wide format into a long format.
al <- read.csv('https://raw.githubusercontent.com/vincent-usny/Week5A-607/refs/heads/main/airline.csv', header = FALSE, stringsAsFactors = FALSE)
# manually reorganize data. Remove null.
al_df <- data.frame(
airline = c('ALASKA','ALASKA','AM WEST','AM WEST'),
status = c('on_time','delayed','on_time','delayed'),
Los_Angeles = c(497, 62, 694, 117),
Phoenix = c(221, 12, 4840, 415),
San_Diego = c(212, 20, 383, 65),
San_Francisco = c(503, 102, 320, 129),
Seattle = c(1841, 305, 201, 61)
)
al_df
## airline status Los_Angeles Phoenix San_Diego San_Francisco Seattle
## 1 ALASKA on_time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AM WEST on_time 694 4840 383 320 201
## 4 AM WEST delayed 117 415 65 129 61
al_long <- al_df %>%
pivot_longer(
cols = Los_Angeles:Seattle,
names_to = 'City',
values_to = 'Count'
)
al_long
## # A tibble: 20 × 4
## airline status City Count
## <chr> <chr> <chr> <dbl>
## 1 ALASKA on_time Los_Angeles 497
## 2 ALASKA on_time Phoenix 221
## 3 ALASKA on_time San_Diego 212
## 4 ALASKA on_time San_Francisco 503
## 5 ALASKA on_time Seattle 1841
## 6 ALASKA delayed Los_Angeles 62
## 7 ALASKA delayed Phoenix 12
## 8 ALASKA delayed San_Diego 20
## 9 ALASKA delayed San_Francisco 102
## 10 ALASKA delayed Seattle 305
## 11 AM WEST on_time Los_Angeles 694
## 12 AM WEST on_time Phoenix 4840
## 13 AM WEST on_time San_Diego 383
## 14 AM WEST on_time San_Francisco 320
## 15 AM WEST on_time Seattle 201
## 16 AM WEST delayed Los_Angeles 117
## 17 AM WEST delayed Phoenix 415
## 18 AM WEST delayed San_Diego 65
## 19 AM WEST delayed San_Francisco 129
## 20 AM WEST delayed Seattle 61
# create a table for percent of delayed or on time
# for both airlines
percent_table <- al_long %>%
group_by(airline, status) %>%
summarise(total = sum(Count)) %>%
pivot_wider(names_from = status, values_from = total) %>%
mutate(
percent_delayed = delayed / (delayed + on_time) * 100,
percent_ontime = on_time /(delayed + on_time)* 100
)
## `summarise()` has grouped output by 'airline'. You can override using the
## `.groups` argument.
percent_table
## # A tibble: 2 × 5
## # Groups: airline [2]
## airline delayed on_time percent_delayed percent_ontime
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 ALASKA 501 3274 13.3 86.7
## 2 AM WEST 787 6438 10.9 89.1
# plot for comparison of both airlines
ggplot(data = percent_table, aes(x = airline, y = percent_delayed, fill = airline)) + geom_bar(stat = 'identity', width = 0.5) + ylab('percent_delayed') + xlab('Airline') + ggtitle('Percent Delayed by Airlines')
# Overall, AM West performed slightly better than Alaska
# with a lower delay rate, which suggests Am west is more
# reliable.
# Plot for comparison of both airlines across cities
city_percent <- al_long %>%
group_by(airline, status, City) %>%
summarise(total = sum(Count), .group = 'drop') %>%
pivot_wider(
names_from = status,
values_from = total
) %>%
mutate(percent_city_delayed = delayed / (delayed + on_time))
## `summarise()` has grouped output by 'airline', 'status'. You can override using
## the `.groups` argument.
city_percent
## # A tibble: 10 × 6
## # Groups: airline [2]
## airline City .group delayed on_time percent_city_delayed
## <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 ALASKA Los_Angeles drop 62 497 0.111
## 2 ALASKA Phoenix drop 12 221 0.0515
## 3 ALASKA San_Diego drop 20 212 0.0862
## 4 ALASKA San_Francisco drop 102 503 0.169
## 5 ALASKA Seattle drop 305 1841 0.142
## 6 AM WEST Los_Angeles drop 117 694 0.144
## 7 AM WEST Phoenix drop 415 4840 0.0790
## 8 AM WEST San_Diego drop 65 383 0.145
## 9 AM WEST San_Francisco drop 129 320 0.287
## 10 AM WEST Seattle drop 61 201 0.233
# Both airlines perform best in Phoenix. SF and Seattle
# are worst for both, while AM is higher that Alaska
# (28.7% and 23.3%) vs (16.9% and 14.2%).
# Overall, AM is slightly better on average, but Alaska
# has lower delayed rate across cities.
ggplot(city_percent, aes(x = City, y = percent_city_delayed, fill = airline)) + geom_bar(stat = 'identity', position = 'dodge', width = 0.7) + ylab('percent_city_delayed') + xlab('City')+ ggtitle('Percent Delayed Across Cities')
Although Am West performs slightly better than Alaska on average, Alaska has lower delayed rate on each destination individually, especially SF and Seattle. However, in this dataset, the sample size of Alaska is only 3775 while Am West is 7225. More data is needed for Alaska in order to compare more fairly.