Reading data
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Load the raw airline arrival counts from TTData.csv (path is relative to
# the working directory). The first row of the file is used as the header.
TTData <- read.csv(file = "TTData.csv", header = TRUE)
## Warning in read.table(file = file, header = header, sep = sep, quote =
## quote, : incomplete final line found by readTableHeader on 'TTData.csv'
# The warning above only reports that the file does not end with a newline.
# Show the table exactly as it was read.
TTData
## X X.1 Los.Angeles Phoenix San.Diego San.Franscisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 AM WEST delayed 117 415 65 129 61
Rearrangement of column names.
# Give every column a descriptive, syntactic name: the first two columns
# identify the airline and the arrival status, the rest are destinations.
names(TTData) <- c(
  "Airlines",
  "time_check",
  "Los_Angeles",
  "Phoenix",
  "San_Diego",
  "San_Franscisco",
  "Seattle"
)
# Show the renamed table.
TTData
## Airlines time_check Los_Angeles Phoenix San_Diego San_Franscisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 AM WEST delayed 117 415 65 129 61
Analysis for “Tidy” Set
# Reshape from wide to long: the five destination columns are stacked into a
# Destination / arrival_delay pair, giving one row per airline, arrival
# status and destination. The result is then passed through data.frame().
tidyset <- TTData %>%
  gather(key = Destination, value = arrival_delay, Los_Angeles:Seattle) %>%
  data.frame()
tidyset
## Airlines time_check Destination arrival_delay
## 1 ALASKA on time Los_Angeles 497
## 2 ALASKA delayed Los_Angeles 62
## 3 AM WEST on time Los_Angeles 694
## 4 AM WEST delayed Los_Angeles 117
## 5 ALASKA on time Phoenix 221
## 6 ALASKA delayed Phoenix 12
## 7 AM WEST on time Phoenix 4840
## 8 AM WEST delayed Phoenix 415
## 9 ALASKA on time San_Diego 212
## 10 ALASKA delayed San_Diego 20
## 11 AM WEST on time San_Diego 383
## 12 AM WEST delayed San_Diego 65
## 13 ALASKA on time San_Franscisco 503
## 14 ALASKA delayed San_Franscisco 102
## 15 AM WEST on time San_Franscisco 320
## 16 AM WEST delayed San_Franscisco 129
## 17 ALASKA on time Seattle 1841
## 18 ALASKA delayed Seattle 305
## 19 AM WEST on time Seattle 201
## 20 AM WEST delayed Seattle 61
Analysis to compare arrival delays of two airlines
We will use “spread” (from the tidyr package) and “mutate” (from the dplyr package); both packages are part of the tidyverse.
# Spread the arrival status back into columns so that each airline /
# destination row carries both its `delayed` and its `on time` count.
tidyset1 <- tidyset %>%
  spread(key = time_check, value = arrival_delay)
tidyset1
## Airlines Destination delayed on time
## 1 ALASKA Los_Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San_Diego 20 212
## 4 ALASKA San_Franscisco 102 503
## 5 ALASKA Seattle 305 1841
## 6 AM WEST Los_Angeles 117 694
## 7 AM WEST Phoenix 415 4840
## 8 AM WEST San_Diego 65 383
## 9 AM WEST San_Franscisco 129 320
## 10 AM WEST Seattle 61 201
Check for total flight delays
# Stack the `delayed` and `on time` counts into a single Number_flights
# column, keyed by time_check, so they can be plotted as stacked bars.
total_delays <- gather(
  tidyset1,
  key = time_check,
  value = Number_flights,
  delayed, `on time`
)
total_delays
## Airlines Destination time_check Number_flights
## 1 ALASKA Los_Angeles delayed 62
## 2 ALASKA Phoenix delayed 12
## 3 ALASKA San_Diego delayed 20
## 4 ALASKA San_Franscisco delayed 102
## 5 ALASKA Seattle delayed 305
## 6 AM WEST Los_Angeles delayed 117
## 7 AM WEST Phoenix delayed 415
## 8 AM WEST San_Diego delayed 65
## 9 AM WEST San_Franscisco delayed 129
## 10 AM WEST Seattle delayed 61
## 11 ALASKA Los_Angeles on time 497
## 12 ALASKA Phoenix on time 221
## 13 ALASKA San_Diego on time 212
## 14 ALASKA San_Franscisco on time 503
## 15 ALASKA Seattle on time 1841
## 16 AM WEST Los_Angeles on time 694
## 17 AM WEST Phoenix on time 4840
## 18 AM WEST San_Diego on time 383
## 19 AM WEST San_Franscisco on time 320
## 20 AM WEST Seattle on time 201
# Stacked bar chart of flight counts: one bar per airline, split by arrival
# status, with one facet per destination.
# The x-axis shows the airlines (the destinations are the facet panels), so
# it is labelled "Airlines" rather than "Cities".
ggplot(total_delays, aes(x = Airlines, y = Number_flights, fill = time_check)) +
  geom_bar(stat = "identity") +
  facet_grid(~Destination) +
  xlab("Airlines") +
  ylab("Flights") +
  labs(fill = "time_check")

The graph shows that AM WEST operates by far the most flights into Phoenix, where its delays are comparatively low; in San Francisco, by contrast, a noticeably larger share of its flights is delayed.
The next step is to compute the “Delay ratio”, i.e. delayed / (delayed + on time), for each airline and destination.
# Add the delay ratio (share of delayed flights among all flights) for every
# airline / destination row, and store Airlines as a character column.
# Columns are referenced by bare name inside mutate() instead of through
# `tidyset1$...`, so the expressions are evaluated on the data frame that
# mutate() is actually working on.
tidyset2 <- tidyset1 %>%
  mutate(
    DRatio = delayed / (`on time` + delayed),
    Airlines = as.character(Airlines)
  )
tidyset2
## Airlines Destination delayed on time DRatio
## 1 ALASKA Los_Angeles 62 497 0.11091234
## 2 ALASKA Phoenix 12 221 0.05150215
## 3 ALASKA San_Diego 20 212 0.08620690
## 4 ALASKA San_Franscisco 102 503 0.16859504
## 5 ALASKA Seattle 305 1841 0.14212488
## 6 AM WEST Los_Angeles 117 694 0.14426634
## 7 AM WEST Phoenix 415 4840 0.07897241
## 8 AM WEST San_Diego 65 383 0.14508929
## 9 AM WEST San_Franscisco 129 320 0.28730512
## 10 AM WEST Seattle 61 201 0.23282443
# Delay ratio per destination, one dotted line per airline. Destinations are
# ordered along the x-axis by reorder(Destination, DRatio).
ggplot(
  tidyset2,
  aes(
    x = reorder(Destination, DRatio),
    y = DRatio,
    color = Airlines,
    group = Airlines
  )
) +
  geom_line(linetype = "dotted") +
  geom_point(shape = 20, size = 4) +
  ggtitle("Delay Comparision") +
  labs(x = "Destinations", y = "Delay Ratio")

The delay-ratio comparison shows that AM WEST has a higher delay ratio than ALASKA at every destination. For both airlines, delays are most frequent in San Francisco and Seattle.
Distribution analysis
# Per-airline summary: mean, standard deviation and median of the
# destination-level delay ratios, plus the overall share of delayed flights
# (total delayed / total flights) in aggregate_delay.
analysis <- tidyset2 %>%
  group_by(Airlines) %>%
  summarise(
    mean(DRatio),
    sd(DRatio),
    median(DRatio),
    aggregate_delay = sum(delayed) / sum(delayed + `on time`)
  )
analysis
## # A tibble: 2 x 5
## Airlines `mean(DRatio)` `sd(DRatio)` `median(DRatio)` aggregate_delay
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 ALASKA 0.1118683 0.04592624 0.1109123 0.1327152
## 2 AM WEST 0.1776915 0.08212854 0.1450893 0.1089273
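The distribution analysis shows that ALASKA has the lower mean delay ratio across destinations, while AM WEST has the lower proportion of delayed arrivals overall. This apparent contradiction is an instance of Simpson's paradox: AM WEST flies most of its flights into Phoenix, where delays are rare, whereas ALASKA flies most of its flights into Seattle, where delays are more common.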