Data description: Flights that departed Houston in 2011. Metadata: https://cran.r-project.org/web/packages/hflights/index.html
The following is an investigation of the hflights data package for the CUNY MSDA Bridge Course R program.
The following is a step-by-step representation of the creation of a dataset compiling a time series of cancellations and diversions associated with the major airlines operating out of both Houston airports.
Process and reformat the data. Setting up the environment.
# Load the Houston 2011 flights data. library() stops with an error when the
# package is missing, whereas require() only returns FALSE and lets the script
# fail later with a less helpful message.
library(hflights)
# NOTE(review): this silences every warning for the rest of the session.
options(warn = -1)

# Keep only the columns used by the cancellation/diversion analysis.
m1 <- hflights
m2 <- m1[, c("Month", "DayofMonth", "Cancelled", "Diverted", "UniqueCarrier")]

# Lookup table from two-letter carrier code to a readable carrier name.
lut <- c(
  "AA" = "American", "AS" = "Alaska", "B6" = "JetBlue", "CO" = "Continental",
  "DL" = "Delta", "OO" = "SkyWest", "UA" = "United", "US" = "US_Airways",
  "WN" = "Southwest", "EV" = "Atlantic_Southeast", "F9" = "Frontier",
  "FL" = "AirTran", "MQ" = "American_Eagle", "XE" = "ExpressJet", "YV" = "Mesa"
)
# Replace each code by its name; unname() stops the codes from being stored
# alongside the column as element names.
m2$UniqueCarrier <- unname(lut[m2$UniqueCarrier])
The lookup table (lut) was taken from https://campus.datacamp.com
# Tabulate how many flights each carrier operated.
carrier <- m2$UniqueCarrier
carrier.freq <- table(carrier)
# Eight colour names; presumably the palette for the frequency chart.
colors <- c(
  "red", "yellow", "green", "violet",
  "orange", "blue", "pink", "cyan"
)
For the purposes of simplicity and maintaining sample size equivalency, we drop the smaller carriers. This is also a point where our analysis diverges. We will return to the unique carrier subsets (m2.1, m2.2, m2.3, m2.4) a bit later in the analysis. We split the data, then bind it back together and rerun the frequency histogram.
# One data frame per retained carrier; which() skips any NA comparison, so
# the selected rows are the same ones subset() would return.
m2.1 <- m2[which(m2$UniqueCarrier == "Southwest"), , drop = FALSE]
m2.2 <- m2[which(m2$UniqueCarrier == "Continental"), , drop = FALSE]
m2.3 <- m2[which(m2$UniqueCarrier == "ExpressJet"), , drop = FALSE]
m2.4 <- m2[which(m2$UniqueCarrier == "SkyWest"), , drop = FALSE]

# Stack the four carriers back into one table and recount flights per carrier.
m3 <- do.call(rbind, list(m2.1, m2.2, m2.3, m2.4))
carrier2 <- m3$UniqueCarrier
carrier2.freq <- table(carrier2)
The following charts list, in order: the probability of a flight being cancelled, the probability of it being diverted, and the probability of it being either cancelled or diverted.
## UniqueCarrier Cancelled
## 1 Continental 0.006782614
## 2 ExpressJet 0.015495599
## 3 SkyWest 0.013946828
## 4 Southwest 0.015504047
## UniqueCarrier Diverted
## 1 Continental 0.002627370
## 2 ExpressJet 0.003449550
## 3 SkyWest 0.003486707
## 4 Southwest 0.002293629
## UniqueCarrier (Diverted + Cancelled)
## 1 Continental 0.009409984
## 2 ExpressJet 0.018945149
## 3 SkyWest 0.017433535
## 4 Southwest 0.017797675
At this point it would be nice to analyze this data over time, but this requires some minor data modification to get the time data in a useful format.
library(stringr)
# Build the flight date as an ISO "YYYY-MM-DD" string from the month and
# day-of-month columns. Pasting with an explicit separator and parsing as a
# Date needs no space-stripping pass, no POSIXlt round trip and no scratch
# columns to delete afterwards.
m3$date3 <- format(as.Date(
  paste(2011, m3$Month, m3$DayofMonth, sep = "-"),
  format = "%Y-%m-%d"
))
The following is primarily code for reshaping and renaming the data. The column order also has to be inverted so that the date comes first.
# Daily total of cancelled plus diverted flights over all rows of m3.
m4 <- tapply(m3$Cancelled + m3$Diverted, m3$date3, FUN = sum)
# Turn the named totals into a two-column data frame (values, ind). Stacking
# the whole vector keeps every date; stacking m4[-1] dropped the first day.
m5 <- stack(m4)
# Put the date label (ind) before the count (values).
m6 <- m5[, c("ind", "values")]
library(ggplot2)
The following shows a February winter storm and your expected peak travel times (i.e. New Year’s).
This is interesting, but it’d be more interesting to see how our individual airlines relate to this data. The following code returns to an earlier subset and processes the data again per carrier. There is also some general housekeeping.
# Return `flights` with an added `date3` column holding the flight date as an
# ISO "YYYY-MM-DD" string built from the Month and DayofMonth columns.
add_flight_date <- function(flights, year = 2011) {
  flights$date3 <- format(as.Date(
    paste(year, flights$Month, flights$DayofMonth, sep = "-"),
    format = "%Y-%m-%d"
  ))
  flights
}

# Sum cancelled plus diverted flights per day of `flights` (which must carry
# a `date3` column). Returns a data frame with one row per date:
#   Date      - the date label as a factor
#   delays    - cancelled + diverted flights on that date
#   converted - the same date as a Date object
# Every date is kept, including the first one.
daily_disruptions <- function(flights) {
  totals <- tapply(
    flights$Cancelled + flights$Diverted, flights$date3,
    FUN = sum
  )
  daily <- setNames(stack(totals)[, c("ind", "values")], c("Date", "delays"))
  daily$converted <- as.Date(daily$Date, format = "%Y-%m-%d")
  daily
}

# Per-carrier flight tables, each with its date3 column.
m2.1 <- add_flight_date(subset(m2, UniqueCarrier == "Southwest"))
m2.2 <- add_flight_date(subset(m2, UniqueCarrier == "Continental"))
m2.3 <- add_flight_date(subset(m2, UniqueCarrier == "ExpressJet"))
m2.4 <- add_flight_date(subset(m2, UniqueCarrier == "SkyWest"))

# Per-carrier daily series.
m6.1 <- daily_disruptions(m2.1)
m6.2 <- daily_disruptions(m2.2)
m6.3 <- daily_disruptions(m2.3)
m6.4 <- daily_disruptions(m2.4)

# Combine the four series into one wide table. Joining on the date, rather
# than binding columns by position, keeps the rows aligned even when a carrier
# has no flights on some day; that carrier's count is NA for such a day.
carrier_daily <- list(
  southwest_delays = m6.1,
  continental_delays = m6.2,
  expressjet_delays = m6.3,
  skywest_delays = m6.4
)
per_carrier <- Map(
  function(daily, column) {
    setNames(daily[, c("converted", "delays")], c("date", column))
  },
  carrier_daily, names(carrier_daily)
)
m7 <- Reduce(
  function(left, right) merge(left, right, by = "date", all = TRUE),
  per_carrier
)
# Same column order as before: the four carrier counts, then the date.
m7 <- m7[, c(names(carrier_daily), "date")]
At this point, we can see how the cancellations and diversions are distributed across each of the 4 major carriers. We once again notice the winter storm, but we can also pick out some moments where individual carriers experienced operational/technical problems. For instance, Continental in August.
Skywest
ExpressJet
Continental
Southwest