library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ----------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
read.csv("https://raw.githubusercontent.com/jnataky/DATA-607/master/A5_Data_transformation/airlines_dest.csv") airlines <-
# Get rid of the empty row & reset the index
na.omit(airlines)
airlines <-row.names(airlines) <- NULL
# Replace rows 2 and 4 by airlines names
2, "X"] <- "ALASKA"
airlines[4, "X"] <- "AM WEST" airlines[
# Rename the columns names
airlines %>%
airlines <- rename(carrier = X, status = X.1, los_angeles = Los.Angeles, phoenix = Phoenix, san_diego = San.Diego,
san_francisco = San.Francisco, seattle = Seattle)
# create new variable for destination and number of flights
airlines %>%
airlines <- gather("dest", "n_flights", 3:7)
# Wider the data frame by adding variables "on time" and "delayed"
airlines %>%
airlines <- pivot_wider(names_from = "status", values_from = "n_flights")
# Table for tidy data
%>%
airlines kbl(caption = "Airlines info", align = 'c') %>%
kable_material(c("striped", "hover")) %>%
row_spec(0, color = "indigo")
carrier | dest | on time | delayed |
---|---|---|---|
ALASKA | los_angeles | 497 | 62 |
AM WEST | los_angeles | 694 | 117 |
ALASKA | phoenix | 221 | 12 |
AM WEST | phoenix | 4840 | 415 |
ALASKA | san_diego | 212 | 20 |
AM WEST | san_diego | 383 | 65 |
ALASKA | san_francisco | 503 | 102 |
AM WEST | san_francisco | 320 | 129 |
ALASKA | seattle | 1841 | 305 |
AM WEST | seattle | 201 | 61 |
# Create news variables
airlines$delayed + airlines$`on time`
n_total <- round((airlines$`on time` / n_total), 3)
ontime_percent <- round((airlines$delayed / n_total), 3)
delayed_percent <-
airlines %>%
airlines_df <- mutate(n_total, ontime_percent, delayed_percent)
%>%
airlines_df kbl(caption = "Airlines more info", align = 'c') %>%
kable_material(c("striped", "hover")) %>%
row_spec(0, color = "indigo")
carrier | dest | on time | delayed | n_total | ontime_percent | delayed_percent |
---|---|---|---|---|---|---|
ALASKA | los_angeles | 497 | 62 | 559 | 0.889 | 0.111 |
AM WEST | los_angeles | 694 | 117 | 811 | 0.856 | 0.144 |
ALASKA | phoenix | 221 | 12 | 233 | 0.948 | 0.052 |
AM WEST | phoenix | 4840 | 415 | 5255 | 0.921 | 0.079 |
ALASKA | san_diego | 212 | 20 | 232 | 0.914 | 0.086 |
AM WEST | san_diego | 383 | 65 | 448 | 0.855 | 0.145 |
ALASKA | san_francisco | 503 | 102 | 605 | 0.831 | 0.169 |
AM WEST | san_francisco | 320 | 129 | 449 | 0.713 | 0.287 |
ALASKA | seattle | 1841 | 305 | 2146 | 0.858 | 0.142 |
AM WEST | seattle | 201 | 61 | 262 | 0.767 | 0.233 |
airlines_df%>%
on_time1 <- group_by(dest, carrier) %>%
summarise(ontime_percent)
## `summarise()` regrouping output by 'dest' (override with `.groups` argument)
%>%
on_time1 kbl(caption = "On time performance per city", align = 'c') %>%
kable_material(c("striped", "hover")) %>%
row_spec(0, color = "indigo")
dest | carrier | ontime_percent |
---|---|---|
los_angeles | ALASKA | 0.889 |
los_angeles | AM WEST | 0.856 |
phoenix | ALASKA | 0.948 |
phoenix | AM WEST | 0.921 |
san_diego | ALASKA | 0.914 |
san_diego | AM WEST | 0.855 |
san_francisco | ALASKA | 0.831 |
san_francisco | AM WEST | 0.713 |
seattle | ALASKA | 0.858 |
seattle | AM WEST | 0.767 |
# Plotting on time performance
ggplot(data = on_time1, aes(x = dest, y = ontime_percent, fill = carrier)) +
geom_bar(stat = "identity", position = "dodge") +
xlab("City") + ylab("On time % ") + ggtitle("Carriers on time performance per city")
Alaska performs better than AM West in every single of the five cities.
airlines_df%>%
on_time2 <- group_by(carrier) %>%
summarise(ontime_wg_mean = mean(ontime_percent))
## `summarise()` ungrouping output (override with `.groups` argument)
%>%
on_time2 kbl(caption = "Overall on-time performance", align = 'c') %>%
kable_material(c("striped", "hover")) %>%
row_spec(0, color = "indigo")
carrier | ontime_wg_mean |
---|---|
ALASKA | 0.8880 |
AM WEST | 0.8224 |
airlines_df%>%
on_time2 <- group_by(carrier) %>%
summarise(ontime_wg_mean = sum(`on time`) / sum(`on time`, delayed))
## `summarise()` ungrouping output (override with `.groups` argument)
%>%
on_time2 kbl(caption = "Overall on-time performance", align = 'c') %>%
kable_material(c("striped", "hover")) %>%
row_spec(0, color = "indigo")
carrier | ontime_wg_mean |
---|---|
ALASKA | 0.8672848 |
AM WEST | 0.8910727 |
# Plotting overall on-time performance
ggplot(data = on_time2, aes(x = carrier, y = ontime_wg_mean)) +
geom_bar(stat = "identity", width = 0.5, fill = "#b629a0") +
xlab("Carrier") + ylab("On-time mean % ") + ggtitle("Overall on-time performance per carrier")
Overall AM West has more on-time flights than Alaska Air.
When it comes to on-time performance per city for both airlines, Alaska Airlines performs better but it is worse in overall on-time performance. In per city performance, AM West doesn’t perform better than Alaska might be due to the number of flights its operates.
The analyze has shown some discrepancy when comparing the performance between per city and overall performance.
Before digging into the conclusion, let have a look on delays and analyze the overall number of flights per city, and see how it goes!
airlines_df%>%
on_time3 <- group_by(dest, carrier) %>%
summarise(ontime_percent)
## `summarise()` regrouping output by 'dest' (override with `.groups` argument)
# Plotting on time performance
ggplot(data = on_time3, aes(x = dest, y = delayed_percent, fill = carrier)) +
geom_bar(stat = "identity", position = "dodge") +
xlab("City") + ylab("Delayed % ") + ggtitle("Carriers delay per city")
airlines_df%>%
on_time4 <- group_by(dest, carrier) %>%
summarise(n_total)
## `summarise()` regrouping output by 'dest' (override with `.groups` argument)
# Plotting on time performance
ggplot(data = on_time4, aes(x = dest, y = n_total, fill = carrier)) +
geom_bar(stat = "identity", position = "dodge") +
xlab("City") + ylab("Number of flights ") + ggtitle("Number of flights per city")
Here’s: Looking at the graphs above, we can see that Alaska operates more flights in Seattle and San Francisco. In San Francisco, there is not much of difference in number of flights. This explain why Alaska Airlines beat AM West in per city performance. In the city where AM West operates more flights, it has significantly more flights than Alaska, this could explain why there are more delays. But taking into consideration the number of flights AM West operates in these 5 cities, I will recommend AM West to review its reservation system. With that, it will perform way better than Alaska. In the other hand, Alaska Airlines needs to review the internal problem that causes flights to delay.