raw <- read.csv('./data.csv')
raw
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
raw <-raw[-c(3),]
names(raw) = c('Airline','Arrival',"Los.Angeles","Phoenix","San.Diego","San.Francisco","Seattle")
raw
## Airline Arrival Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
raw[2,'Airline'] = 'ALASKA'
raw[4,'Airline'] = 'AM WEST'
raw
## Airline Arrival Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 4 AM WEST on time 694 4840 383 320 201
## 5 AM WEST delayed 117 415 65 129 61
df <- gather(raw, City, Number, 3:7, factor_key = TRUE)%>%
spread(Arrival,'Number')
head(df,10)
## Airline City delayed on time
## 1 ALASKA Los.Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San.Diego 20 212
## 4 ALASKA San.Francisco 102 503
## 5 ALASKA Seattle 305 1841
## 6 AM WEST Los.Angeles 117 694
## 7 AM WEST Phoenix 415 4840
## 8 AM WEST San.Diego 65 383
## 9 AM WEST San.Francisco 129 320
## 10 AM WEST Seattle 61 201
df = df%>%
mutate(total_number = delayed + `on time`
, ontime_perc = `on time` / total_number
,delayed_perc = delayed / total_number)
df
## Airline City delayed on time total_number ontime_perc
## 1 ALASKA Los.Angeles 62 497 559 0.8890877
## 2 ALASKA Phoenix 12 221 233 0.9484979
## 3 ALASKA San.Diego 20 212 232 0.9137931
## 4 ALASKA San.Francisco 102 503 605 0.8314050
## 5 ALASKA Seattle 305 1841 2146 0.8578751
## 6 AM WEST Los.Angeles 117 694 811 0.8557337
## 7 AM WEST Phoenix 415 4840 5255 0.9210276
## 8 AM WEST San.Diego 65 383 448 0.8549107
## 9 AM WEST San.Francisco 129 320 449 0.7126949
## 10 AM WEST Seattle 61 201 262 0.7671756
## delayed_perc
## 1 0.11091234
## 2 0.05150215
## 3 0.08620690
## 4 0.16859504
## 5 0.14212488
## 6 0.14426634
## 7 0.07897241
## 8 0.14508929
## 9 0.28730512
## 10 0.23282443
library(ggplot2)
ggplot(df, aes(x = Airline, y=delayed_perc, fill = City)) +
geom_bar(stat="identity",position="dodge") +
xlab("Airlines") + ylab("Delays_percentage")
df%>%
group_by(City,Airline)%>%
summarise(delay_rate=sum(delayed)/sum(total_number))%>%
spread(Airline, delay_rate)%>%
mutate(delay_rate_diff = `AM WEST` - ALASKA)
## # A tibble: 5 x 4
## # Groups: City [5]
## City ALASKA `AM WEST` delay_rate_diff
## <fct> <dbl> <dbl> <dbl>
## 1 Los.Angeles 0.111 0.144 0.0334
## 2 Phoenix 0.0515 0.0790 0.0275
## 3 San.Diego 0.0862 0.145 0.0589
## 4 San.Francisco 0.169 0.287 0.119
## 5 Seattle 0.142 0.233 0.0907
According to this bar chart, we can conclude that the delay rate of AM WEST in these 5 cities are higher than ALASKA.
df%>%
group_by(Airline)%>%
summarize(avg_delay_rate = sum(delayed)/sum(total_number))%>%
spread(Airline, avg_delay_rate)%>%
mutate(Avg_delay_diff=`AM WEST` - ALASKA)
## # A tibble: 1 x 3
## ALASKA `AM WEST` Avg_delay_diff
## <dbl> <dbl> <dbl>
## 1 0.133 0.109 -0.0238
However, when comparing the the average delay rate, AM West is 2.4% lower than Alaska (13.3% v.s. 10.9%). This is because AM West’s majority business is in Phoenix and it is maintaining relatively low delay rate.