Basic Setup

Set up libraries

library(stringr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)

Read Data and Clean up headers

AirlineData <- read.csv("AirlineData1.csv", header=TRUE, blank.lines.skip = TRUE, )
colnames(AirlineData)[1]<-"Airline"
colnames(AirlineData)[2]<-"Status"
AirlineData
##   Airline  Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2  ALASKA delayed          62      12        20           102     305
## 3 AM WEST on time         694    4840       383           320     201
## 4 AM WEST delayed         117     415        65           129      61

Tidy the data

AirlineDataTidy1 <- gather(AirlineData, Destination, Number, Los.Angeles:Seattle)
AirlineDataTidy <- spread(AirlineDataTidy1, Status, Number)
colnames(AirlineDataTidy)[3:4]<- c('Delayed','OnTime')
AirlineDataTidy
##    Airline   Destination Delayed OnTime
## 1   ALASKA   Los.Angeles      62    497
## 2   ALASKA       Phoenix      12    221
## 3   ALASKA     San.Diego      20    212
## 4   ALASKA San.Francisco     102    503
## 5   ALASKA       Seattle     305   1841
## 6  AM WEST   Los.Angeles     117    694
## 7  AM WEST       Phoenix     415   4840
## 8  AM WEST     San.Diego      65    383
## 9  AM WEST San.Francisco     129    320
## 10 AM WEST       Seattle      61    201

Analysis of Data

Analyse the data for sector specific delays and overall delays

Sector-wise comparison

delaySectorWise <- mutate(group_by(AirlineDataTidy, Airline), DelayPercent=(Delayed/(Delayed+OnTime))*100)
delaySectorWise <- arrange(delaySectorWise, Destination)
delaySectorWise
## Source: local data frame [10 x 5]
## Groups: Airline [2]
## 
##    Airline   Destination Delayed OnTime DelayPercent
##     <fctr>         <chr>   <int>  <int>        <dbl>
## 1   ALASKA   Los.Angeles      62    497    11.091234
## 2  AM WEST   Los.Angeles     117    694    14.426634
## 3   ALASKA       Phoenix      12    221     5.150215
## 4  AM WEST       Phoenix     415   4840     7.897241
## 5   ALASKA     San.Diego      20    212     8.620690
## 6  AM WEST     San.Diego      65    383    14.508929
## 7   ALASKA San.Francisco     102    503    16.859504
## 8  AM WEST San.Francisco     129    320    28.730512
## 9   ALASKA       Seattle     305   1841    14.212488
## 10 AM WEST       Seattle      61    201    23.282443
ggplot(data=delaySectorWise, aes(x=Destination, y=DelayPercent, group=Airline, color=Airline)) + geom_line() +
    geom_point()

Conclusion: ALASKA airlines has a lower delay in all sectors compared to AM WEST. This is also confirmed by the chart.

Sector-wise comparison

OverallDelays<-mutate(summarise(group_by(AirlineDataTidy, Airline), DelayedAll=sum(Delayed), OnTimeAll=sum(OnTime)), DelayPercentAll=100*DelayedAll/(DelayedAll+OnTimeAll))
OverallDelays
## # A tibble: 2 × 4
##   Airline DelayedAll OnTimeAll DelayPercentAll
##    <fctr>      <int>     <int>           <dbl>
## 1  ALASKA        501      3274        13.27152
## 2 AM WEST        787      6438        10.89273

Conclusion: AM WEST has a lower delay percent compared to ALASKA when all flights are considered. This is because they have some sectors where their volume is very high and the percentage of delays in those sectors is lower than average delay.