Reading data

library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
# Load the flight counts from the CSV in the working directory. The first two
# columns come through as X and X.1 (see the printed output below).
TTData <- read.csv(file = "TTData.csv", header = TRUE)
## Warning in read.table(file = file, header = header, sep = sep, quote =
## quote, : incomplete final line found by readTableHeader on 'TTData.csv'
print(TTData)
##         X     X.1 Los.Angeles Phoenix San.Diego San.Franscisco Seattle
## 1  ALASKA on time         497     221       212            503    1841
## 2  ALASKA delayed          62      12        20            102     305
## 3 AM WEST on time         694    4840       383            320     201
## 4 AM WEST delayed         117     415        65            129      61

Rearrangement of column names.

# Give every column a descriptive, syntactic name: the two identifier columns
# first, then one column per destination city.
names(TTData) <- c(
  "Airlines", "time_check",
  "Los_Angeles", "Phoenix", "San_Diego", "San_Franscisco", "Seattle"
)
print(TTData)
##   Airlines time_check Los_Angeles Phoenix San_Diego San_Franscisco Seattle
## 1   ALASKA    on time         497     221       212            503    1841
## 2   ALASKA    delayed          62      12        20            102     305
## 3  AM WEST    on time         694    4840       383            320     201
## 4  AM WEST    delayed         117     415        65            129      61

Analysis for “Tidy” Set

# Reshape wide -> long: fold the five city columns (Los_Angeles through Seattle)
# into Destination / arrival_delay pairs, then pass the result through
# data.frame(). Note that arrival_delay holds the flight counts for each
# airline / status / destination combination.
tidyset <- TTData %>%
  gather(key = Destination, value = arrival_delay, Los_Angeles:Seattle) %>%
  data.frame()
print(tidyset)
##    Airlines time_check    Destination arrival_delay
## 1    ALASKA    on time    Los_Angeles           497
## 2    ALASKA    delayed    Los_Angeles            62
## 3   AM WEST    on time    Los_Angeles           694
## 4   AM WEST    delayed    Los_Angeles           117
## 5    ALASKA    on time        Phoenix           221
## 6    ALASKA    delayed        Phoenix            12
## 7   AM WEST    on time        Phoenix          4840
## 8   AM WEST    delayed        Phoenix           415
## 9    ALASKA    on time      San_Diego           212
## 10   ALASKA    delayed      San_Diego            20
## 11  AM WEST    on time      San_Diego           383
## 12  AM WEST    delayed      San_Diego            65
## 13   ALASKA    on time San_Franscisco           503
## 14   ALASKA    delayed San_Franscisco           102
## 15  AM WEST    on time San_Franscisco           320
## 16  AM WEST    delayed San_Franscisco           129
## 17   ALASKA    on time        Seattle          1841
## 18   ALASKA    delayed        Seattle           305
## 19  AM WEST    on time        Seattle           201
## 20  AM WEST    delayed        Seattle            61

Analysis to compare arrival delays of two airlines

We will use “spread” (from the tidyr package) and “mutate” (from the dplyr package), both of which are part of the tidyverse.

# Reshape long -> wide on arrival status: each distinct time_check value
# becomes its own column of counts, leaving one row per airline / destination.
tidyset1 <- tidyset %>%
  spread(key = time_check, value = arrival_delay)
print(tidyset1)
##    Airlines    Destination delayed on time
## 1    ALASKA    Los_Angeles      62     497
## 2    ALASKA        Phoenix      12     221
## 3    ALASKA      San_Diego      20     212
## 4    ALASKA San_Franscisco     102     503
## 5    ALASKA        Seattle     305    1841
## 6   AM WEST    Los_Angeles     117     694
## 7   AM WEST        Phoenix     415    4840
## 8   AM WEST      San_Diego      65     383
## 9   AM WEST San_Franscisco     129     320
## 10  AM WEST        Seattle      61     201

Check for total flight delays

# Stack the `delayed` and `on time` count columns back into a single
# Number_flights column, keyed by time_check, for the stacked bar chart.
total_delays <- gather(
  tidyset1,
  key = time_check, value = Number_flights,
  delayed, `on time`
)
print(total_delays)
##    Airlines    Destination time_check Number_flights
## 1    ALASKA    Los_Angeles    delayed             62
## 2    ALASKA        Phoenix    delayed             12
## 3    ALASKA      San_Diego    delayed             20
## 4    ALASKA San_Franscisco    delayed            102
## 5    ALASKA        Seattle    delayed            305
## 6   AM WEST    Los_Angeles    delayed            117
## 7   AM WEST        Phoenix    delayed            415
## 8   AM WEST      San_Diego    delayed             65
## 9   AM WEST San_Franscisco    delayed            129
## 10  AM WEST        Seattle    delayed             61
## 11   ALASKA    Los_Angeles    on time            497
## 12   ALASKA        Phoenix    on time            221
## 13   ALASKA      San_Diego    on time            212
## 14   ALASKA San_Franscisco    on time            503
## 15   ALASKA        Seattle    on time           1841
## 16  AM WEST    Los_Angeles    on time            694
## 17  AM WEST        Phoenix    on time           4840
## 18  AM WEST      San_Diego    on time            383
## 19  AM WEST San_Franscisco    on time            320
## 20  AM WEST        Seattle    on time            201
# Stacked bars of flight counts: one bar per airline inside each destination
# facet, filled by arrival status. The x aesthetic is Airlines (the cities are
# the facet strips), so the axis is labelled "Airlines" rather than "Cities".
ggplot(total_delays, aes(x = Airlines, y = Number_flights, fill = time_check)) +
  geom_bar(stat = "identity") +
  facet_grid(~Destination) +
  labs(x = "Airlines", y = "Flights", fill = "time_check")

The graph shows that AM WEST has the most flights to Phoenix, where delays are comparatively low compared with San Francisco, where a larger share of flights is delayed.

Finding the “Delay ratio” will be the next step.

# Per-route delay ratio: delayed / (on time + delayed).
# Columns are referenced by bare name inside mutate() rather than through
# `tidyset1$...`; the `$` form reaches around dplyr's data mask and would
# silently use the full, ungrouped columns if the input were ever grouped or
# filtered upstream. Airlines is converted to character in the same step.
tidyset2 <- tidyset1 %>%
  mutate(
    DRatio = delayed / (`on time` + delayed),
    Airlines = as.character(Airlines)
  )
print(tidyset2)
##    Airlines    Destination delayed on time     DRatio
## 1    ALASKA    Los_Angeles      62     497 0.11091234
## 2    ALASKA        Phoenix      12     221 0.05150215
## 3    ALASKA      San_Diego      20     212 0.08620690
## 4    ALASKA San_Franscisco     102     503 0.16859504
## 5    ALASKA        Seattle     305    1841 0.14212488
## 6   AM WEST    Los_Angeles     117     694 0.14426634
## 7   AM WEST        Phoenix     415    4840 0.07897241
## 8   AM WEST      San_Diego      65     383 0.14508929
## 9   AM WEST San_Franscisco     129     320 0.28730512
## 10  AM WEST        Seattle      61     201 0.23282443
# Line-and-point chart of the delay ratio by destination, one series per
# airline. Destinations are ordered along x by reorder(Destination, DRatio).
ggplot(
  tidyset2,
  aes(
    x = reorder(Destination, DRatio),
    y = DRatio,
    color = Airlines,
    group = Airlines
  )
) +
  geom_line(linetype = "dotted") +
  geom_point(shape = 20, size = 4) +
  ggtitle("Delay Comparision") +
  labs(x = "Destinations", y = "Delay Ratio")

The delay ratio comparison for both airlines shows that delays are higher for ‘AM WEST’ than for ‘ALASKA’ at every destination. Delays are most frequent for ‘San_Franscisco’ and ‘Seattle’.

Distribution analysis

# Per-airline distribution of the route-level delay ratios (mean, sd, median)
# alongside the pooled delay rate: total delayed flights / total flights.
# The first three summaries are deliberately left unnamed so their column
# headers remain the literal expressions.
analysis <- tidyset2 %>%
  group_by(Airlines) %>%
  summarise(
    mean(DRatio),
    sd(DRatio),
    median(DRatio),
    aggregate_delay = sum(delayed) / sum(delayed + `on time`)
  )
print(analysis)
## # A tibble: 2 x 5
##   Airlines `mean(DRatio)` `sd(DRatio)` `median(DRatio)` aggregate_delay
##      <chr>          <dbl>        <dbl>            <dbl>           <dbl>
## 1   ALASKA      0.1118683   0.04592624        0.1109123       0.1327152
## 2  AM WEST      0.1776915   0.08212854        0.1450893       0.1089273

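The distribution analysis shows that ALASKA has the lower mean per-destination delay ratio, while AM WEST has the lower proportion of delayed arrivals overall. The two measures disagree because most of AM WEST's flights go to Phoenix, where its delay ratio is low, so that single route dominates its overall figure.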