file = "/Users/euniceok/PycharmProjects/cuny/spring2019/Week5/data/airline_delays.csv"
df = read.csv(file, na.strings=c(""," "))
df
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 <NA> delayed 62 12 20 102 305
## 3 <NA> <NA> NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 <NA> delayed 117 415 65 129 61
df <- df %>%
# convert wide dataset to long on all cols except X and X.1
gather(key, value, -X, -X.1, na.rm=TRUE) %>%
# rename all cols to be intuitive
rename(city = key, counts = value, airline = X, status = X.1) %>%
# fill out missing airline names
fill(airline)
head(df)
## airline status city counts
## 1 ALASKA on time Los.Angeles 497
## 2 ALASKA delayed Los.Angeles 62
## 4 AM WEST on time Los.Angeles 694
## 5 AM WEST delayed Los.Angeles 117
## 6 ALASKA on time Phoenix 221
## 7 ALASKA delayed Phoenix 12
# another version of tidy df for easier analysis
tmp <- df %>%
spread(status, counts) %>%
rename(ontime = 'on time') %>%
mutate(total = delayed + ontime)
tmp
## airline city delayed ontime total
## 1 ALASKA Los.Angeles 62 497 559
## 2 ALASKA Phoenix 12 221 233
## 3 ALASKA San.Diego 20 212 232
## 4 ALASKA San.Francisco 102 503 605
## 5 ALASKA Seattle 305 1841 2146
## 6 AM WEST Los.Angeles 117 694 811
## 7 AM WEST Phoenix 415 4840 5255
## 8 AM WEST San.Diego 65 383 448
## 9 AM WEST San.Francisco 129 320 449
## 10 AM WEST Seattle 61 201 262
# what is the relative rate of delayed flights for each airline?
airline_delay <- tmp %>%
group_by(airline) %>%
summarise(delays = sum(delayed),totals = sum(total)) %>%
mutate(del_rate = delays / totals)
# what is the on time rate for flights for each airline?
airline_ontime <- tmp %>%
group_by(airline) %>%
summarise(ontime = sum(ontime),totals = sum(total)) %>%
mutate(ontime_rate = ontime / totals)
airline_tbl <- inner_join(airline_delay, airline_ontime, by="airline")
airline_tbl %>% select(airline,del_rate,ontime_rate)
## # A tibble: 2 x 3
## airline del_rate ontime_rate
## <fct> <dbl> <dbl>
## 1 ALASKA 0.133 0.867
## 2 AM WEST 0.109 0.891
airline_tbl
## # A tibble: 2 x 7
## airline delays totals.x del_rate ontime totals.y ontime_rate
## <fct> <int> <int> <dbl> <int> <int> <dbl>
## 1 ALASKA 501 3775 0.133 3274 3775 0.867
## 2 AM WEST 787 7225 0.109 6438 7225 0.891
atplot <- airline_tbl %>%
select(airline,delays, ontime) %>%
gather(delays, ontime, -airline) %>%
rename(status = delays, flightcts = ontime)
atplot
## # A tibble: 4 x 3
## airline status flightcts
## <fct> <chr> <int>
## 1 ALASKA delays 501
## 2 AM WEST delays 787
## 3 ALASKA ontime 3274
## 4 AM WEST ontime 6438
g <- ggplot(atplot, aes(airline, flightcts))
g + geom_bar(aes(fill=status), width = 0.5, stat="identity") +
theme(axis.text.x = element_text(angle=65, vjust=0.6)) + theme_tufte()
calculate delay and ontime rates for each airline and city
plot rates for each airline and city
# ALASKA
alctplot <-
aircttbl %>%
filter(airline == "ALASKA") %>%
ungroup() %>%
select(city, del_rate, ontime_rate) %>%
gather(del_rate, ontime_rate, -city) %>%
rename(status = del_rate, flightcts = ontime_rate)
# AM WEST
amctplot <-
aircttbl %>%
filter(airline == "AM WEST") %>%
ungroup() %>%
select(city, del_rate, ontime_rate) %>%
gather(del_rate, ontime_rate, -city) %>%
rename(status = del_rate, flightcts = ontime_rate)
ALASKA PLOT
ggplot(alctplot,aes(x=city,y=flightcts,fill=factor(status)))+
geom_bar(stat="identity",position="dodge")+
# scale_fill_discrete(name="status",
# breaks=c(1, 2),
# labels=c("delay", "ontime"))+
xlab("airline")+ylab("rates") + theme_tufte()
AM WEST PLOT
ggplot(amctplot,aes(x=city,y=flightcts,fill=factor(status)))+
geom_bar(stat="identity",position="dodge")+
# scale_fill_discrete(name="status",
# breaks=c(1, 2),
# labels=c("delay", "ontime"))+
xlab("airline")+ylab("rates") + theme_tufte()
delctplot <- aircttbl %>%
select(airline, city, del_rate)
delctplot
## # A tibble: 10 x 3
## # Groups: airline [2]
## airline city del_rate
## <fct> <chr> <dbl>
## 1 ALASKA Los.Angeles 0.111
## 2 ALASKA Phoenix 0.0515
## 3 ALASKA San.Diego 0.0862
## 4 ALASKA San.Francisco 0.169
## 5 ALASKA Seattle 0.142
## 6 AM WEST Los.Angeles 0.144
## 7 AM WEST Phoenix 0.0790
## 8 AM WEST San.Diego 0.145
## 9 AM WEST San.Francisco 0.287
## 10 AM WEST Seattle 0.233
ggplot(delctplot,aes(x=city,y=del_rate,fill=factor(airline)))+
geom_bar(stat="identity",position="dodge")+
#scale_fill_discrete(name="status",
# breaks=c(1, 2),
# labels=c("alaska", "am west"))+
xlab("city")+ylab("rates") + theme_tufte()
Because these airlines may have different overall volume of flights, I converted the data into delay/on time rates.
Delay rates were more useful, and since they were the inverse of on time rates, I used delay rates as the main metric for comparison.
Overall, the delay trends for each city were similar between both airlines.
Both airlines had the highest delay rates in San Francisco and lowest delay rates in Phoenix.
This indicates that the primary cause of the delay is related to the location as opposed to the airline.
However, the magnitude of the delay in each differed between the two airlines.
Interestingly, at a high level across all cities, Alaska had a higher delay rate. But within each city, AM West had higher delay rates. Perhaps this is an example of Simpson’s Paradox.
The biggest gap is in San Francisco, where AM West has a delay rate 12 percentage points higher than Alaska.
There is also a sizeable gap in Seattle (9 pct pts), which indicates that AM West may have some inefficiencies - whether it is due to slower operations, equipment, etc that is an opportunity for AM West to address to catch up to Alaska.