Data Description: Flights that departed Houston in 2011. Metadata: https://cran.r-project.org/web/packages/hflights/index.html

The following is an investigation of the hflights data package for the CUNY MSDA Bridge Course R program.

The following is a step-by-step walkthrough of building a dataset that compiles a time series of cancellations and diversions for the major airlines operating out of both Houston airports.

Initialization

Process and reformat the data. Setting up the environment.

# Attach the hflights data package. library() stops with an error when the
# package is not installed, whereas require() only returns FALSE.
library(hflights)
# NOTE(review): this silences every warning for the rest of the session.
# It is kept so the report's output does not change; consider removing it.
options(warn = -1)
m1 <- hflights
# Keep only the columns used by the cancellation/diversion analysis.
m2 <- m1[, c("Month", "DayofMonth", "Cancelled", "Diverted", "UniqueCarrier")]
# Lookup table from carrier code to a readable carrier name.
lut <- c(
  "AA" = "American", "AS" = "Alaska", "B6" = "JetBlue", "CO" = "Continental",
  "DL" = "Delta", "OO" = "SkyWest", "UA" = "United", "US" = "US_Airways",
  "WN" = "Southwest", "EV" = "Atlantic_Southeast", "F9" = "Frontier",
  "FL" = "AirTran", "MQ" = "American_Eagle", "XE" = "ExpressJet", "YV" = "Mesa"
)
# Replace each carrier code with its name (codes missing from `lut` become
# NA). unname() keeps the column from carrying the code as an element name.
m2$UniqueCarrier <- unname(lut[m2$UniqueCarrier])

Flights Histogram

The carrier lookup table (lut) defined above was taken from https://campus.datacamp.com

# Eight-colour palette (presumably for the carrier plot -- confirm against
# the plotting code, which is not part of this listing).
colors <- c(
  "red", "yellow", "green", "violet",
  "orange", "blue", "pink", "cyan"
)

# Number of flights per carrier name.
carrier <- m2$UniqueCarrier
carrier.freq <- table(carrier)

Data Reduction

For the purposes of simplicity and maintaining sample size equivalency, we drop the smaller carriers. This is also a point where our analysis diverges. We will return to the unique carrier subsets (m2.1, m2.2, m2.3, m2.4) a bit later in the analysis. We split the data, then bind it back together and rerun the frequency histogram.

# One data frame per retained carrier. which() drops rows whose carrier is
# NA, matching what subset() does.
m2.1 <- m2[which(m2$UniqueCarrier == "Southwest"), ]
m2.2 <- m2[which(m2$UniqueCarrier == "Continental"), ]
m2.3 <- m2[which(m2$UniqueCarrier == "ExpressJet"), ]
m2.4 <- m2[which(m2$UniqueCarrier == "SkyWest"), ]

# Stack the four carriers back together and recount flights per carrier.
m3 <- rbind(m2.1, m2.2, m2.3, m2.4)
carrier2 <- m3$UniqueCarrier
carrier2.freq <- table(carrier2)

Important Statistics

The following tables list, in order, the probability of a flight being cancelled, being diverted, and being either cancelled or diverted.

##   UniqueCarrier   Cancelled
## 1   Continental 0.006782614
## 2    ExpressJet 0.015495599
## 3       SkyWest 0.013946828
## 4     Southwest 0.015504047
##   UniqueCarrier    Diverted
## 1   Continental 0.002627370
## 2    ExpressJet 0.003449550
## 3       SkyWest 0.003486707
## 4     Southwest 0.002293629
##   UniqueCarrier (Diverted + Cancelled)
## 1   Continental            0.009409984
## 2    ExpressJet            0.018945149
## 3       SkyWest            0.017433535
## 4     Southwest            0.017797675

Time-series Analysis

At this point it would be nice to analyze this data over time, but this requires some minor data modification to get the time data in a useful format.

# Attach stringr; kept so any code relying on it elsewhere keeps working.
# library() fails loudly if the package is missing, unlike require().
library(stringr)
# Build an ISO "YYYY-MM-DD" date string for every flight (the data set only
# covers 2011). Going straight through as.Date() avoids pasting in spaces
# that then have to be stripped, a time-zone-dependent POSIXlt round trip,
# and the two scratch columns the old code created and deleted.
m3$date3 <- format(
  as.Date(paste(2011, m3$Month, m3$DayofMonth, sep = "-"), format = "%Y-%m-%d")
)

Further reduction

The following is primarily code for data shaping and data renaming. The two columns also have to be swapped so that the date comes first.

# Total number of cancelled plus diverted flights for each calendar day.
m4 <- tapply(m3$Cancelled + m3$Diverted, m3$date3, FUN = sum)
# Long format: one row per day with columns `values` and `ind` (the date).
# The whole of `m4` is stacked; the previous cbind(m4[1], stack(m4[-1]))
# silently dropped the first day of the series.
m5 <- stack(m4)
# Put the date column first.
m6 <- m5[, c("ind", "values")]
# library() fails loudly if ggplot2 is missing, unlike require().
library(ggplot2)

Time-series Graph

The following shows a February winter storm and the expected peak travel times (e.g. New Year’s).

image attach

This is interesting, but it’d be more interesting to see how our individual airlines relate to this data. The following code returns to an earlier subset and processes the data again per carrier. There is also some general housekeeping.

Further analysis

# Per-carrier daily totals of cancelled plus diverted flights, followed by
# one combined table (`m7`) with a column per carrier.
#
# The four carriers previously went through four copy-pasted stanzas. Those
# dropped the first day of every series (cbind(x[1], stack(x[-1]))), cleared
# scratch columns on the wrong data frame for Continental, and combined the
# results with a positional cbind() that only works when every carrier has
# exactly the same dates. The helpers below remove all three problems.

# Add `date3`, an ISO "YYYY-MM-DD" character date, to a flights data frame
# that has `Month` and `DayofMonth` columns (the data set only covers 2011).
add_flight_date <- function(flights) {
  flights$date3 <- format(
    as.Date(
      paste(2011, flights$Month, flights$DayofMonth, sep = "-"),
      format = "%Y-%m-%d"
    )
  )
  flights
}

# Sum Cancelled + Diverted for each value of `date3`.
sum_disruptions_by_day <- function(flights) {
  tapply(flights$Cancelled + flights$Diverted, flights$date3, FUN = sum)
}

# Turn the named totals into a data frame with columns `Date` (factor),
# `delays` (the daily total) and `converted` (the same date as a Date).
as_daily_frame <- function(totals) {
  daily <- data.frame(
    Date = factor(names(totals), levels = names(totals)),
    delays = as.vector(totals)
  )
  daily$converted <- as.Date(daily$Date, format = "%Y-%m-%d")
  daily
}

m2.1 <- add_flight_date(subset(m2, UniqueCarrier == "Southwest"))
m4.1 <- sum_disruptions_by_day(m2.1)
m6.1 <- as_daily_frame(m4.1)

m2.2 <- add_flight_date(subset(m2, UniqueCarrier == "Continental"))
m4.2 <- sum_disruptions_by_day(m2.2)
m6.2 <- as_daily_frame(m4.2)

m2.3 <- add_flight_date(subset(m2, UniqueCarrier == "ExpressJet"))
m4.3 <- sum_disruptions_by_day(m2.3)
m6.3 <- as_daily_frame(m4.3)

m2.4 <- add_flight_date(subset(m2, UniqueCarrier == "SkyWest"))
m4.4 <- sum_disruptions_by_day(m2.4)
m6.4 <- as_daily_frame(m4.4)

# Combine the carriers by date rather than by row position, so a carrier
# with no flights on some day gets NA instead of shifting or breaking rows.
carrier_daily <- list(
  southwest_delays = m6.1,
  continental_delays = m6.2,
  expressjet_delays = m6.3,
  skywest_delays = m6.4
)
per_carrier <- Map(
  function(daily, column) {
    setNames(daily[, c("converted", "delays")], c("date", column))
  },
  carrier_daily,
  names(carrier_daily)
)
m7 <- Reduce(
  function(left, right) merge(left, right, by = "date", all = TRUE),
  per_carrier
)
# Same column layout as before: the four carrier columns, then the date.
m7 <- m7[, c(names(carrier_daily), "date")]

Final representations.

At this point, we can see how the cancellations and diversions are distributed across each of the 4 major carriers. We once again notice the winter storm, but we can also pick out some moments where individual carriers experienced operational/technical problems. For instance, Continental in August.

image attach

Skywest

ExpressJet

Continental

Southwest