library(tidyr)
## Warning: package 'tidyr' was built under R version 3.2.3
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
flightdata<- read.csv("https://raw.githubusercontent.com/gpsingh12/IS-607-MSDA/master/flightdata.csv")
names(flightdata) <- c("Airline", "Arrival", "Los Angeles", "Phoenix", "San Diego",
"San Francisco", "Seatlle")
# row 3 is empty, we need to remove row 3.
flightdata <- na.omit(flightdata)
# variables at row 2 column 1 and row 4 column 1 are empty, we need to add the names
flightdata[2,1]<-c("ALASKA")
flightdata[4,1]<-c("AM WEST")
flight <- flightdata%>%
gather(City,Data, 3:ncol(flightdata))
flight%>%
spread(Arrival,Data )%>%
mutate(Total = delayed +`on time`)
## Airline City delayed on time Total
## 1 ALASKA Los Angeles 62 497 559
## 2 ALASKA Phoenix 12 221 233
## 3 ALASKA San Diego 20 212 232
## 4 ALASKA San Francisco 102 503 605
## 5 ALASKA Seatlle 305 1841 2146
## 6 AM WEST Los Angeles 117 694 811
## 7 AM WEST Phoenix 415 4840 5255
## 8 AM WEST San Diego 65 383 448
## 9 AM WEST San Francisco 129 320 449
## 10 AM WEST Seatlle 61 201 262
p<-subset(flight, Airline == 'ALASKA')
p
## Airline Arrival City Data
## 1 ALASKA on time Los Angeles 497
## 2 ALASKA delayed Los Angeles 62
## 5 ALASKA on time Phoenix 221
## 6 ALASKA delayed Phoenix 12
## 9 ALASKA on time San Diego 212
## 10 ALASKA delayed San Diego 20
## 13 ALASKA on time San Francisco 503
## 14 ALASKA delayed San Francisco 102
## 17 ALASKA on time Seatlle 1841
## 18 ALASKA delayed Seatlle 305
sum(p$Data)
## [1] 3775
q<-subset(flight, Airline == 'AM WEST')
q
## Airline Arrival City Data
## 3 AM WEST on time Los Angeles 694
## 4 AM WEST delayed Los Angeles 117
## 7 AM WEST on time Phoenix 4840
## 8 AM WEST delayed Phoenix 415
## 11 AM WEST on time San Diego 383
## 12 AM WEST delayed San Diego 65
## 15 AM WEST on time San Francisco 320
## 16 AM WEST delayed San Francisco 129
## 19 AM WEST on time Seatlle 201
## 20 AM WEST delayed Seatlle 61
sum(q$Data)
## [1] 7225
Analysis
airline_delay = flight%>%
group_by(Airline)
d<-subset(airline_delay, Arrival == 'delayed' & Airline == 'ALASKA')
d
## Source: local data frame [5 x 4]
## Groups: Airline [1]
##
## Airline Arrival City Data
## (fctr) (fctr) (chr) (int)
## 1 ALASKA delayed Los Angeles 62
## 2 ALASKA delayed Phoenix 12
## 3 ALASKA delayed San Diego 20
## 4 ALASKA delayed San Francisco 102
## 5 ALASKA delayed Seatlle 305
sum(d$Data)
## [1] 501
a<-subset(airline_delay, Arrival == 'delayed' & Airline == 'AM WEST')
a
## Source: local data frame [5 x 4]
## Groups: Airline [1]
##
## Airline Arrival City Data
## (fctr) (fctr) (chr) (int)
## 1 AM WEST delayed Los Angeles 117
## 2 AM WEST delayed Phoenix 415
## 3 AM WEST delayed San Diego 65
## 4 AM WEST delayed San Francisco 129
## 5 AM WEST delayed Seatlle 61
sum(a$Data)
## [1] 787
#Delay Percentage
#ALASKA
sum(((d$Data)/sum(p$Data)) * 100)
## [1] 13.27152
#AM WEST
sum(((a$Data)/sum(q$Data)) * 100)
## [1] 10.89273
ALASKA has more delays than AM WEST