library(tidyr)
## Warning: package 'tidyr' was built under R version 3.2.3
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
flightdata<- read.csv("https://raw.githubusercontent.com/gpsingh12/IS-607-MSDA/master/flightdata.csv")


names(flightdata) <- c("Airline", "Arrival", "Los Angeles", "Phoenix", "San Diego",
                       "San Francisco", "Seatlle")

# row 3 is empty, we need to remove row 3.


flightdata <- na.omit(flightdata)


# variables at row 2 column 1 and row 4 column 1 are empty, we need to add the names


flightdata[2,1]<-c("ALASKA")
flightdata[4,1]<-c("AM WEST")
flight <- flightdata%>%
  gather(City,Data, 3:ncol(flightdata))
flight%>%
  spread(Arrival,Data )%>%
   mutate(Total = delayed +`on time`)
##    Airline          City delayed on time Total
## 1   ALASKA   Los Angeles      62     497   559
## 2   ALASKA       Phoenix      12     221   233
## 3   ALASKA     San Diego      20     212   232
## 4   ALASKA San Francisco     102     503   605
## 5   ALASKA       Seatlle     305    1841  2146
## 6  AM WEST   Los Angeles     117     694   811
## 7  AM WEST       Phoenix     415    4840  5255
## 8  AM WEST     San Diego      65     383   448
## 9  AM WEST San Francisco     129     320   449
## 10 AM WEST       Seatlle      61     201   262
p<-subset(flight,  Airline == 'ALASKA')
p
##    Airline Arrival          City Data
## 1   ALASKA on time   Los Angeles  497
## 2   ALASKA delayed   Los Angeles   62
## 5   ALASKA on time       Phoenix  221
## 6   ALASKA delayed       Phoenix   12
## 9   ALASKA on time     San Diego  212
## 10  ALASKA delayed     San Diego   20
## 13  ALASKA on time San Francisco  503
## 14  ALASKA delayed San Francisco  102
## 17  ALASKA on time       Seatlle 1841
## 18  ALASKA delayed       Seatlle  305
sum(p$Data)
## [1] 3775
q<-subset(flight, Airline == 'AM WEST')
q
##    Airline Arrival          City Data
## 3  AM WEST on time   Los Angeles  694
## 4  AM WEST delayed   Los Angeles  117
## 7  AM WEST on time       Phoenix 4840
## 8  AM WEST delayed       Phoenix  415
## 11 AM WEST on time     San Diego  383
## 12 AM WEST delayed     San Diego   65
## 15 AM WEST on time San Francisco  320
## 16 AM WEST delayed San Francisco  129
## 19 AM WEST on time       Seatlle  201
## 20 AM WEST delayed       Seatlle   61
sum(q$Data)
## [1] 7225

Analysis

airline_delay = flight%>%
  group_by(Airline)
d<-subset(airline_delay, Arrival == 'delayed' & Airline == 'ALASKA')
d
## Source: local data frame [5 x 4]
## Groups: Airline [1]
## 
##   Airline Arrival          City  Data
##    (fctr)  (fctr)         (chr) (int)
## 1  ALASKA delayed   Los Angeles    62
## 2  ALASKA delayed       Phoenix    12
## 3  ALASKA delayed     San Diego    20
## 4  ALASKA delayed San Francisco   102
## 5  ALASKA delayed       Seatlle   305
sum(d$Data)
## [1] 501
a<-subset(airline_delay, Arrival == 'delayed' & Airline == 'AM WEST')
a
## Source: local data frame [5 x 4]
## Groups: Airline [1]
## 
##   Airline Arrival          City  Data
##    (fctr)  (fctr)         (chr) (int)
## 1 AM WEST delayed   Los Angeles   117
## 2 AM WEST delayed       Phoenix   415
## 3 AM WEST delayed     San Diego    65
## 4 AM WEST delayed San Francisco   129
## 5 AM WEST delayed       Seatlle    61
sum(a$Data)
## [1] 787
#Delay Percentage

#ALASKA
sum(((d$Data)/sum(p$Data)) * 100)
## [1] 13.27152
#AM WEST
sum(((a$Data)/sum(q$Data)) * 100)
## [1] 10.89273

ALASKA has more delays than AM WEST