Read Data

raw <- read.csv('./data.csv')
raw
##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2         delayed          62      12        20           102     305
## 3                          NA      NA        NA            NA      NA
## 4 AM WEST on time         694    4840       383           320     201
## 5         delayed         117     415        65           129      61

Data Transformation

Remove empty line

raw <-raw[-c(3),]
names(raw) = c('Airline','Arrival',"Los.Angeles","Phoenix","San.Diego","San.Francisco","Seattle")
raw
##   Airline Arrival Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2         delayed          62      12        20           102     305
## 4 AM WEST on time         694    4840       383           320     201
## 5         delayed         117     415        65           129      61

Fill empty cell

raw[2,'Airline'] = 'ALASKA'
raw[4,'Airline'] = 'AM WEST'
raw 
##   Airline Arrival Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2  ALASKA delayed          62      12        20           102     305
## 4 AM WEST on time         694    4840       383           320     201
## 5 AM WEST delayed         117     415        65           129      61

Spead Arrival Status

df <- gather(raw, City, Number, 3:7, factor_key = TRUE)%>%
       spread(Arrival,'Number')
head(df,10)
##    Airline          City delayed on time
## 1   ALASKA   Los.Angeles      62     497
## 2   ALASKA       Phoenix      12     221
## 3   ALASKA     San.Diego      20     212
## 4   ALASKA San.Francisco     102     503
## 5   ALASKA       Seattle     305    1841
## 6  AM WEST   Los.Angeles     117     694
## 7  AM WEST       Phoenix     415    4840
## 8  AM WEST     San.Diego      65     383
## 9  AM WEST San.Francisco     129     320
## 10 AM WEST       Seattle      61     201

Calculate delay / on time percentage

df = df%>%
  mutate(total_number = delayed + `on time`
        , ontime_perc = `on time` / total_number
         ,delayed_perc = delayed / total_number)
df
##    Airline          City delayed on time total_number ontime_perc
## 1   ALASKA   Los.Angeles      62     497          559   0.8890877
## 2   ALASKA       Phoenix      12     221          233   0.9484979
## 3   ALASKA     San.Diego      20     212          232   0.9137931
## 4   ALASKA San.Francisco     102     503          605   0.8314050
## 5   ALASKA       Seattle     305    1841         2146   0.8578751
## 6  AM WEST   Los.Angeles     117     694          811   0.8557337
## 7  AM WEST       Phoenix     415    4840         5255   0.9210276
## 8  AM WEST     San.Diego      65     383          448   0.8549107
## 9  AM WEST San.Francisco     129     320          449   0.7126949
## 10 AM WEST       Seattle      61     201          262   0.7671756
##    delayed_perc
## 1    0.11091234
## 2    0.05150215
## 3    0.08620690
## 4    0.16859504
## 5    0.14212488
## 6    0.14426634
## 7    0.07897241
## 8    0.14508929
## 9    0.28730512
## 10   0.23282443

Analysis

Compare airline overall delay percentage

library(ggplot2)
ggplot(df, aes(x = Airline, y=delayed_perc, fill = City)) +
    geom_bar(stat="identity",position="dodge") + 
    xlab("Airlines") + ylab("Delays_percentage") 

df%>%
  group_by(City,Airline)%>%
  summarise(delay_rate=sum(delayed)/sum(total_number))%>%
  spread(Airline, delay_rate)%>%
  mutate(delay_rate_diff = `AM WEST` - ALASKA)
## # A tibble: 5 x 4
## # Groups:   City [5]
##   City          ALASKA `AM WEST` delay_rate_diff
##   <fct>          <dbl>     <dbl>           <dbl>
## 1 Los.Angeles   0.111     0.144           0.0334
## 2 Phoenix       0.0515    0.0790          0.0275
## 3 San.Diego     0.0862    0.145           0.0589
## 4 San.Francisco 0.169     0.287           0.119 
## 5 Seattle       0.142     0.233           0.0907

According to this bar chart, we can conclude that the delay rate of AM WEST in these 5 cities are higher than ALASKA.

Calculate Average delay rate

df%>%
  group_by(Airline)%>%
  summarize(avg_delay_rate = sum(delayed)/sum(total_number))%>%
  spread(Airline, avg_delay_rate)%>%
  mutate(Avg_delay_diff=`AM WEST` - ALASKA)
## # A tibble: 1 x 3
##   ALASKA `AM WEST` Avg_delay_diff
##    <dbl>     <dbl>          <dbl>
## 1  0.133     0.109        -0.0238

However, when comparing the the average delay rate, AM West is 2.4% lower than Alaska (13.3% v.s. 10.9%). This is because AM West’s majority business is in Phoenix and it is maintaining relatively low delay rate.