# Data: We intend to study patterns in flight delays using an extensive data-set of over 17 million flights covering the whole aviation landscape of USA. Our dataset covers domestic US flights for the quarter Oct-Dec 2018.
# We first set the working directory and then use haven package to read a .sav file
setwd("C:/Users/swapa/OneDrive/Desktop/Airline Delay Data")
library(haven)
airlinedelay.df<-read_sav("Aggregate.sav")
# Exploring the data: Before proceeding on statistically establishing relationships between airline delay and certain variables, it is pertinent to perform preliminary analysis to understand structure of the different data variables and their distribution.
# We check the dimensions, variable names
dim(airlinedelay.df)
## [1] 1774818 121
colnames(airlinedelay.df)
## [1] "Year" "Quarter"
## [3] "Month" "DayofMonth"
## [5] "DayOfWeek" "FlightDate"
## [7] "Reporting_Airline" "DOT_ID_Reporting_Airline"
## [9] "IATA_CODE_Reporting_Airline" "Tail_Number"
## [11] "Flight_Number_Reporting_Airline" "OriginAirportID"
## [13] "OriginAirportSeqID" "OriginCityMarketID"
## [15] "Origin" "OriginCityName"
## [17] "OriginState" "OriginStateFips"
## [19] "OriginStateName" "OriginWac"
## [21] "DestAirportID" "DestAirportSeqID"
## [23] "DestCityMarketID" "Dest"
## [25] "DestCityName" "DestState"
## [27] "DestStateFips" "DestStateName"
## [29] "DestWac" "CRSDepTime"
## [31] "DepTime" "DepDelay"
## [33] "DepDelayMinutes" "DepDel15"
## [35] "DepartureDelayGroups" "DepTimeBlk"
## [37] "TaxiOut" "WheelsOff"
## [39] "WheelsOn" "TaxiIn"
## [41] "CRSArrTime" "ArrTime"
## [43] "ArrDelay" "ArrDelayMinutes"
## [45] "ArrDel15" "ArrivalDelayGroups"
## [47] "ArrTimeBlk" "Cancelled"
## [49] "CancellationCode" "Diverted"
## [51] "CRSElapsedTime" "ActualElapsedTime"
## [53] "AirTime" "Flights"
## [55] "Distance" "DistanceGroup"
## [57] "CarrierDelay" "WeatherDelay"
## [59] "NASDelay" "SecurityDelay"
## [61] "LateAircraftDelay" "FirstDepTime"
## [63] "TotalAddGTime" "LongestAddGTime"
## [65] "DivAirportLandings" "DivReachedDest"
## [67] "DivActualElapsedTime" "DivArrDelay"
## [69] "DivDistance" "Div1Airport"
## [71] "Div1AirportID" "Div1AirportSeqID"
## [73] "Div1WheelsOn" "Div1TotalGTime"
## [75] "Div1LongestGTime" "Div1WheelsOff"
## [77] "Div1TailNum" "Div2Airport"
## [79] "Div2AirportID" "Div2AirportSeqID"
## [81] "Div2WheelsOn" "Div2TotalGTime"
## [83] "Div2LongestGTime" "Div2WheelsOff"
## [85] "Div2TailNum" "Div3Airport"
## [87] "Div3AirportID" "Div3AirportSeqID"
## [89] "Div3WheelsOn" "Div3TotalGTime"
## [91] "Div3LongestGTime" "Div3WheelsOff"
## [93] "Div3TailNum" "Div4Airport"
## [95] "Div4AirportID" "Div4AirportSeqID"
## [97] "Div4WheelsOn" "Div4TotalGTime"
## [99] "Div4LongestGTime" "Div4WheelsOff"
## [101] "Div4TailNum" "Div5Airport"
## [103] "Div5AirportID" "Div5AirportSeqID"
## [105] "Div5WheelsOn" "Div5TotalGTime"
## [107] "Div5LongestGTime" "Div5WheelsOff"
## [109] "Div5TailNum" "filter_$"
## [111] "WEEKEND" "PMDEP"
## [113] "PMARR" "Southwest"
## [115] "American" "Delta"
## [117] "United" "Alaska"
## [119] "Jetblue" "Skywest"
## [121] "Others"
# Before proceeding further, we find that the dataset contains certain columns, not necessary in the context of proposed analysis. We prune the dataset by removing unnecessary columns, and again check dimensionality and variable names
modifiedairlinedelay.df<-airlinedelay.df[,c(6,7,15,24,30:36,41:48,50:53,55,57:61,111:121)]
dim(modifiedairlinedelay.df)
## [1] 1774818 40
colnames(modifiedairlinedelay.df)
## [1] "FlightDate" "Reporting_Airline" "Origin"
## [4] "Dest" "CRSDepTime" "DepTime"
## [7] "DepDelay" "DepDelayMinutes" "DepDel15"
## [10] "DepartureDelayGroups" "DepTimeBlk" "CRSArrTime"
## [13] "ArrTime" "ArrDelay" "ArrDelayMinutes"
## [16] "ArrDel15" "ArrivalDelayGroups" "ArrTimeBlk"
## [19] "Cancelled" "Diverted" "CRSElapsedTime"
## [22] "ActualElapsedTime" "AirTime" "Distance"
## [25] "CarrierDelay" "WeatherDelay" "NASDelay"
## [28] "SecurityDelay" "LateAircraftDelay" "WEEKEND"
## [31] "PMDEP" "PMARR" "Southwest"
## [34] "American" "Delta" "United"
## [37] "Alaska" "Jetblue" "Skywest"
## [40] "Others"
View(modifiedairlinedelay.df)
FlightDate: Date of departure
Reporting Airline: Two character airline code
Origin: Three character origin Airport code
Dest: Three character destination Airport code
CRSDepTime: Scheduled departure time (mentioned as a continuous number e.g. 1105 for 11:05 hrs and 1823 for 18:23 hrs)
DepTime:Actual departure time (mentioned as a continuous number e.g. 1105 for 11:05 hrs and 1823 for 18:23 hrs)
DepDelay: DepTime-CRSDepTime (negative for before-time departures)
DepDelayMinutes: DepDelay if DepDelay>=0, 0 otherwise
DepDel15: 1 if DepDelayMinutes>=15, 0 otherwise
DepartureDelayGroups: Departure delay intervals every 15 min from <-15 to >180 e.g. -1 for -7 min, 3 for 55 min, 12 for both 207 min as well as 2109 min
DepTimeBlk: departure time hourly blocks
CRSArrTime: Scheduled arrival time (mentioned as a continuous number e.g. 1105 for 11:05 hrs and 1823 for 18:23 hrs)
ArrTime:Actual arrival time (mentioned as a continuous number e.g. 1105 for 11:05 hrs and 1823 for 18:23 hrs)
ArrDelay: ArrTime-CRSArrTime (negative for before-time arrivals)
ArrDelayMinutes: Arrival delay in minutes( Zero for before-time or on-time flights)
ArrDel15: 1 if ArrDelayMinutes>=15, 0 otherwise
ArrivalDelayGroups: Arrival delay intervals every 15 min from <-15 to >180 e.g. -1 for -7 min, 3 for 55 min, 12 for both 207 min as well as 2109 min
ArrTimeBlk: arrival time hourly blocks
Cancelled: 1 if flight cancelled, 0 otherwise
Diverted: 1 if diverted, 0 otherwise
CRSElapsedTime: CRSArrTime-CRSDepTime
ActualElapsedTime: ArrTime-DepTime
Airtime: Time that the flight was in air
Distance: origin to destination distance
CarrierDelay: Delay attributable to carrier in Min
WeatherDelay: Delay attributable to carrier in Min
NASDelay: Delay attributable to carrier in Min
SecurityDelay: Delay attributable to carrier in Min
LateAircraftDelay: Delay attributable to carrier in Min
WEEKEND: 1 for weekend, 0 for weekday
PMDEP: 1 if scheduled departure time after 1200 hrs, 0 otherwise
PMARR: 1 if scheduled arrival time after 1200 hrs, 0 otherwise
Southwest: 1, if specified carrier, 0 otherwise
American: 1, if specified carrier, 0 otherwise
Delta: 1, if specified carrier, 0 otherwise
United: 1, if specified carrier, 0 otherwise
Alaska: 1, if specified carrier, 0 otherwise
Jetblue: 1, if specified carrier, 0 otherwise
Skywest: 1, if specified carrier, 0 otherwise
Others: 1, if specified carrier, 0 otherwise
# Description of data
str(modifiedairlinedelay.df)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1774818 obs. of 40 variables:
## $ FlightDate : Date, format: "2018-10-14" "2018-10-15" ...
## $ Reporting_Airline : chr "AA" "AA" "AA" "AA" ...
## ..- attr(*, "format.spss")= chr "A2"
## ..- attr(*, "display_width")= int 2
## $ Origin : chr "LAX" "LAX" "LAX" "LAX" ...
## ..- attr(*, "format.spss")= chr "A3"
## ..- attr(*, "display_width")= int 3
## $ Dest : chr "CLT" "CLT" "CLT" "CLT" ...
## ..- attr(*, "format.spss")= chr "A3"
## ..- attr(*, "display_width")= int 3
## $ CRSDepTime : num 1123 1123 1123 1123 1123 ...
## ..- attr(*, "format.spss")= chr "F4.0"
## ..- attr(*, "display_width")= int 12
## $ DepTime : num 1127 1119 1119 1120 1125 ...
## ..- attr(*, "format.spss")= chr "F4.0"
## ..- attr(*, "display_width")= int 12
## $ DepDelay : num 4 -4 -4 -3 2 -1 1 -6 -4 -3 ...
## ..- attr(*, "format.spss")= chr "F4.0"
## ..- attr(*, "display_width")= int 12
## $ DepDelayMinutes : num 4 0 0 0 2 0 1 0 0 0 ...
## ..- attr(*, "format.spss")= chr "F4.0"
## ..- attr(*, "display_width")= int 12
## $ DepDel15 : num 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "format.spss")= chr "F1.0"
## ..- attr(*, "display_width")= int 12
## $ DepartureDelayGroups: num 0 -1 -1 -1 0 -1 0 -1 -1 -1 ...
## ..- attr(*, "format.spss")= chr "F2.0"
## ..- attr(*, "display_width")= int 12
## $ DepTimeBlk : chr "1100-1159" "1100-1159" "1100-1159" "1100-1159" ...
## ..- attr(*, "format.spss")= chr "A9"
## ..- attr(*, "display_width")= int 9
## $ CRSArrTime : num 1910 1910 1910 1910 1910 1910 1910 1910 1910 1910 ...
## ..- attr(*, "format.spss")= chr "F4.0"
## ..- attr(*, "display_width")= int 12
## $ ArrTime : num 1854 1849 1856 1902 1906 ...
## ..- attr(*, "format.spss")= chr "F4.0"
## ..- attr(*, "display_width")= int 12
## $ ArrDelay : num -16 -21 -14 -8 -4 -8 9 17 -9 -13 ...
## ..- attr(*, "format.spss")= chr "F4.0"
## ..- attr(*, "display_width")= int 12
## $ ArrDelayMinutes : num 0 0 0 0 0 0 9 17 0 0 ...
## ..- attr(*, "format.spss")= chr "F4.0"
## ..- attr(*, "display_width")= int 12
## $ ArrDel15 : num 0 0 0 0 0 0 0 1 0 0 ...
## ..- attr(*, "format.spss")= chr "F1.0"
## ..- attr(*, "display_width")= int 12
## $ ArrivalDelayGroups : num -2 -2 -1 -1 -1 -1 0 1 -1 -1 ...
## ..- attr(*, "format.spss")= chr "F2.0"
## ..- attr(*, "display_width")= int 12
## $ ArrTimeBlk : chr "1900-1959" "1900-1959" "1900-1959" "1900-1959" ...
## ..- attr(*, "format.spss")= chr "A9"
## ..- attr(*, "display_width")= int 9
## $ Cancelled : num 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "format.spss")= chr "F1.0"
## ..- attr(*, "display_width")= int 12
## $ Diverted : num 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "format.spss")= chr "F1.0"
## ..- attr(*, "display_width")= int 12
## $ CRSElapsedTime : num 287 287 287 287 287 287 287 287 287 287 ...
## ..- attr(*, "format.spss")= chr "F3.0"
## ..- attr(*, "display_width")= int 12
## $ ActualElapsedTime : num 267 270 277 282 281 280 295 310 282 277 ...
## ..- attr(*, "format.spss")= chr "F3.0"
## ..- attr(*, "display_width")= int 12
## $ AirTime : num 232 235 248 256 249 244 250 244 249 256 ...
## ..- attr(*, "format.spss")= chr "F3.0"
## ..- attr(*, "display_width")= int 12
## $ Distance : num 2125 2125 2125 2125 2125 ...
## ..- attr(*, "format.spss")= chr "F4.0"
## ..- attr(*, "display_width")= int 12
## $ CarrierDelay : num NA NA NA NA NA NA NA 0 NA NA ...
## ..- attr(*, "format.spss")= chr "F4.0"
## ..- attr(*, "display_width")= int 12
## $ WeatherDelay : num NA NA NA NA NA NA NA 0 NA NA ...
## ..- attr(*, "format.spss")= chr "F4.0"
## ..- attr(*, "display_width")= int 12
## $ NASDelay : num NA NA NA NA NA NA NA 17 NA NA ...
## ..- attr(*, "format.spss")= chr "F4.0"
## ..- attr(*, "display_width")= int 12
## $ SecurityDelay : num NA NA NA NA NA NA NA 0 NA NA ...
## ..- attr(*, "format.spss")= chr "F3.0"
## ..- attr(*, "display_width")= int 12
## $ LateAircraftDelay : num NA NA NA NA NA NA NA 0 NA NA ...
## ..- attr(*, "format.spss")= chr "F4.0"
## ..- attr(*, "display_width")= int 12
## $ WEEKEND : num 1 0 0 0 0 0 1 0 0 0 ...
## ..- attr(*, "label")= chr "WEEKEND"
## ..- attr(*, "format.spss")= chr "F8.2"
## ..- attr(*, "display_width")= int 10
## $ PMDEP : num 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "label")= chr "PMDEP"
## ..- attr(*, "format.spss")= chr "F8.2"
## ..- attr(*, "display_width")= int 10
## $ PMARR : num 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "label")= chr "PMARR"
## ..- attr(*, "format.spss")= chr "F8.2"
## ..- attr(*, "display_width")= int 10
## $ Southwest : num 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "label")= chr "Southwest"
## ..- attr(*, "format.spss")= chr "F8.2"
## ..- attr(*, "display_width")= int 11
## $ American : num 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "label")= chr "American"
## ..- attr(*, "format.spss")= chr "F8.2"
## ..- attr(*, "display_width")= int 10
## $ Delta : num 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "label")= chr "Delta"
## ..- attr(*, "format.spss")= chr "F8.2"
## ..- attr(*, "display_width")= int 10
## $ United : num 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "label")= chr "United"
## ..- attr(*, "format.spss")= chr "F8.2"
## ..- attr(*, "display_width")= int 10
## $ Alaska : num 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "label")= chr "Alaska"
## ..- attr(*, "format.spss")= chr "F8.2"
## ..- attr(*, "display_width")= int 10
## $ Jetblue : num 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "label")= chr "Jetblue"
## ..- attr(*, "format.spss")= chr "F8.2"
## ..- attr(*, "display_width")= int 10
## $ Skywest : num 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "label")= chr "Skywest"
## ..- attr(*, "format.spss")= chr "F8.2"
## ..- attr(*, "display_width")= int 10
## $ Others : num 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "label")= chr "Others"
## ..- attr(*, "format.spss")= chr "F8.2"
## ..- attr(*, "display_width")= int 10
# summary of selected variables
summary(modifiedairlinedelay.df [,c(8,9,15,16,19:40)])
## DepDelayMinutes DepDel15 ArrDelayMinutes ArrDel15
## Min. : 0.00 Min. :0.0000 Min. : 0.00 Min. :0.0000
## 1st Qu.: 0.00 1st Qu.:0.0000 1st Qu.: 0.00 1st Qu.:0.0000
## Median : 0.00 Median :0.0000 Median : 0.00 Median :0.0000
## Mean : 11.69 Mean :0.1694 Mean : 12.14 Mean :0.1815
## 3rd Qu.: 5.00 3rd Qu.:0.0000 3rd Qu.: 7.00 3rd Qu.:0.0000
## Max. :2109.00 Max. :1.0000 Max. :2153.00 Max. :1.0000
##
## Cancelled Diverted CRSElapsedTime ActualElapsedTime AirTime
## Min. :0 Min. :0 Min. :-99.0 Min. : 14.0 Min. : 7.0
## 1st Qu.:0 1st Qu.:0 1st Qu.: 88.0 1st Qu.: 84.0 1st Qu.: 60.0
## Median :0 Median :0 Median :122.0 Median :118.0 Median : 92.0
## Mean :0 Mean :0 Mean :140.9 Mean :136.2 Mean :111.3
## 3rd Qu.:0 3rd Qu.:0 3rd Qu.:170.0 3rd Qu.:166.0 3rd Qu.:141.0
## Max. :0 Max. :0 Max. :703.0 Max. :723.0 Max. :695.0
##
## Distance CarrierDelay WeatherDelay NASDelay
## Min. : 31.0 Min. : 0.0 Min. : 0 Min. : 0.0
## 1st Qu.: 363.0 1st Qu.: 0.0 1st Qu.: 0 1st Qu.: 0.0
## Median : 631.0 Median : 0.0 Median : 0 Median : 4.0
## Mean : 797.1 Mean : 19.1 Mean : 3 Mean : 14.9
## 3rd Qu.:1028.0 3rd Qu.: 17.0 3rd Qu.: 0 3rd Qu.: 19.0
## Max. :4983.0 Max. :2109.0 Max. :2098 Max. :1498.0
## NA's :1452702 NA's :1452702 NA's :1452702
## SecurityDelay LateAircraftDelay WEEKEND PMDEP
## Min. : 0.0 Min. : 0.0 Min. :0.000 Min. :0.0000
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.:0.000 1st Qu.:0.0000
## Median : 0.0 Median : 1.0 Median :0.000 Median :1.0000
## Mean : 0.1 Mean : 23.8 Mean :0.266 Mean :0.5826
## 3rd Qu.: 0.0 3rd Qu.: 29.0 3rd Qu.:1.000 3rd Qu.:1.0000
## Max. :738.0 Max. :1411.0 Max. :1.000 Max. :1.0000
## NA's :1452702 NA's :1452702
## PMARR Southwest American Delta
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :1.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.6953 Mean :0.1898 Mean :0.1277 Mean :0.1312
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
##
## United Alaska Jetblue Skywest
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.00000 Median :0.00000 Median :0.00000 Median :0.0000
## Mean :0.08894 Mean :0.03506 Mean :0.04188 Mean :0.1075
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.0000
##
## Others
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.278
## 3rd Qu.:1.000
## Max. :1.000
##
# Important hints from this summary
# DepDel15 shows that around 17% flights delayed > 15 min on departure
# Similarly ArrDel15 shows around 18% flights delayed > 15 min on arrival
# Categorical dummies WEEKEND, PMDEP, PMARR and Airline name dummies give respective percentages on first sight e.g. 26.6 % flights departed on weekend and 10.75 % of total flights were run by Skywest.
# Based on above summary results, we see that variables cancelled and diverted are zero in all records. Hence these variables can't discriminate between records. The five attributable delays- CarrierDelay, WeatherDelay, NASDelay, SecurityDelay, LateAircraftDelay have 1452702 missing values which based on total no. of records, can't be approximated. Hence these seven variables are also excluded from the data
finalairlinedelay.df<-modifiedairlinedelay.df[,c(1:18,21:24,30:40)]
View(finalairlinedelay.df)
# summary statistics of important metric data variables
summary(finalairlinedelay.df$DepDelayMinutes)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 0.00 11.69 5.00 2109.00
summary(finalairlinedelay.df$ArrDelayMinutes)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 0.00 12.14 7.00 2153.00
# the mean departure delay is 11.69 min
# the mean departure delay is 12.14 min
# summary statistics of important metric data variables
# skewness and kurtosis are distribution-shape related characteristics. skewness indicates symmetry about mean, whereas kurtosis measures relative peakedness or flatness of distribution. negative skewness shows left skew i.e. left tail is longer and most of distribution is at the right, whereas positive means right skew. A positive kurtosis indicates that distribution is more peaked than normal distribution, whereas negative kurtosis shows that distribution is flatter than normal distribution.
library(psych)
describe(finalairlinedelay.df$DepDelayMinutes)[,c(1:5,8:9,11:12)]
## vars n mean sd median min max skew kurtosis
## X1 1 1774818 11.69 41.45 0 0 2109 11.52 232.62
describe(finalairlinedelay.df$ArrDelayMinutes)[,c(1:5,8:9,11:12)]
## vars n mean sd median min max skew kurtosis
## X1 1 1774818 12.14 41.51 0 0 2153 11.51 233.53
# summary statistics of important metric data variables- continued for independent variables
library(psych)
describe(finalairlinedelay.df$CRSElapsedTime)[,c(1:5,8:9,11:12)]
## vars n mean sd median min max skew kurtosis
## X1 1 1774818 140.85 72.91 122 -99 703 1.44 2.34
describe(finalairlinedelay.df$ActualElapsedTime)[,c(1:5,8:9,11:12)]
## vars n mean sd median min max skew kurtosis
## X1 1 1774818 136.25 72.5 118 14 723 1.42 2.29
describe(finalairlinedelay.df$AirTime)[,c(1:5,8:9,11:12)]
## vars n mean sd median min max skew kurtosis
## X1 1 1774818 111.31 70.74 92 7 695 1.45 2.33
describe(finalairlinedelay.df$Distance)[,c(1:5,8:9,11:12)]
## vars n mean sd median min max skew kurtosis
## X1 1 1774818 797.11 593.98 631 31 4983 1.48 2.44
# Preliminary idea of outliers by quantile analysis
quantile(finalairlinedelay.df$DepDelayMinutes,probs=seq(0,1,0.05))
## 0% 5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 55% 60% 65% 70%
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2
## 75% 80% 85% 90% 95% 100%
## 5 10 18 32 63 2109
quantile(finalairlinedelay.df$ArrDelayMinutes,probs=seq(0,1,0.05))
## 0% 5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 55% 60% 65% 70%
## 0 0 0 0 0 0 0 0 0 0 0 0 0 1 4
## 75% 80% 85% 90% 95% 100%
## 7 12 20 33 63 2153
quantile(finalairlinedelay.df$CRSElapsedTime,probs=seq(0,1,0.05))
## 0% 5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 55% 60% 65% 70%
## -99 61 70 76 83 88 94 100 107 115 122 130 140 150 160
## 75% 80% 85% 90% 95% 100%
## 170 184 208 243 302 703
quantile(finalairlinedelay.df$ActualElapsedTime,probs=seq(0,1,0.05))
## 0% 5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 55% 60% 65% 70%
## 14 56 65 72 78 84 90 96 103 110 118 126 135 145 155
## 75% 80% 85% 90% 95% 100%
## 166 181 203 238 295 723
quantile(finalairlinedelay.df$AirTime,probs=seq(0,1,0.05))
## 0% 5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 55% 60% 65% 70%
## 7 35 43 48 54 60 65 71 78 85 92 100 110 119 129
## 75% 80% 85% 90% 95% 100%
## 141 154 176 211 268 695
quantile(finalairlinedelay.df$Distance,probs=seq(0,1,0.05))
## 0% 5% 10% 15% 20% 25% 30% 35% 40% 45% 50% 55% 60% 65% 70%
## 31 168 223 270 319 363 408 456 519 583 631 693 776 864 948
## 75% 80% 85% 90% 95% 100%
## 1028 1142 1368 1626 2173 4983
# The analysis of skewness and 5% quantiles shows that DepDelayMinutes and ArrDelayMinutes are heavily right-skewed i.e. most of the data points are at the left. This is understandable as arounf 65-70% flights are on time i.e. these variables are zero for 65-70% of cases. What is more improtant is that 95% quantile of both these variables is at 63 Min whereas max is 2109/2153 min. This huge difference is an indication of outlier effect, which may have to be adjusted. We leave this decision to later statistical analysis
# So far our preliminary analysis was based on creating dummy variables in SPSS/Excel and then exporting data to R. To use the factor variable operations in-built in R and to ease data analysis, we add a new column to indicate whether flight departed ontime or late and similarly whether it arrived ontime or late
finalairlinedelay.df$DepStatus<- ifelse(finalairlinedelay.df$DepDelayMinutes > 0, "DelayedonDeparture", "OntimeDep")
finalairlinedelay.df$ArrStatus<- ifelse(finalairlinedelay.df$ArrDelayMinutes > 0, "DelayedonArrival", "OntimeArr")
# To use the factor variable operations in-built in R and to ease data analysis, we add a new column to indicate names of seven major airlines and amalgamate all other airlines as 'Others'.
finalairlinedelay.df$airline<- ifelse(finalairlinedelay.df$Reporting_Airline == "AA" , "American",
ifelse(finalairlinedelay.df$Reporting_Airline == "AS" , "Alaska",
ifelse(finalairlinedelay.df$Reporting_Airline == "WN" , "Southwest",
ifelse(finalairlinedelay.df$Reporting_Airline == "DL" , "Delta", ifelse(finalairlinedelay.df$Reporting_Airline == "UA" , "United", ifelse(finalairlinedelay.df$Reporting_Airline == "B6" , "Jetblue", ifelse(finalairlinedelay.df$Reporting_Airline == "OO" , "Skywest", "Others"
)))))))
#To use the factor variable operations in-built in R and to ease data analysis, we add a new column to indicate time window of departure
finalairlinedelay.df$timewindowdep<- ifelse(finalairlinedelay.df$CRSDepTime >= 1200 , "PM", "AM")
#To use the factor variable operations in-built in R and to ease data analysis, we add a new column to indicate time window of arrival
finalairlinedelay.df$timewindowarr<- ifelse(finalairlinedelay.df$CRSArrTime >= 1200 , "PM", "AM")
#To use the factor variable operations in-built in R and to ease data analysis, we add a new column to indicate day of week. We use the initial file imported from SPSS as it contained dayofweek variable, which we later removed from data file.
finalairlinedelay.df$daywindow<- ifelse(airlinedelay.df$DayOfWeek>= 6 , "Weekend", "Weekday")
# To understand the respective frequency of delay on departure and arrival in absolute count as well as by factor variables. This is to give an initial idea, whether delays deserve to be studies futher.
# We start with one-way frequency tables and percentages
library(data.table)
delay.dt<-data.table(finalairlinedelay.df)
table(delay.dt$DepStatus)
##
## DelayedonDeparture OntimeDep
## 588618 1186200
percentDepstatus<-round(prop.table(table(delay.dt$DepStatus))*100,2)
percentDepstatus
##
## DelayedonDeparture OntimeDep
## 33.16 66.84
table(delay.dt$ArrStatus)
##
## DelayedonArrival OntimeArr
## 629201 1145617
percentArrstatus<-round(prop.table(table(delay.dt$ArrStatus))*100,2)
percentArrstatus
##
## DelayedonArrival OntimeArr
## 35.45 64.55
table(delay.dt$daywindow)
##
## Weekday Weekend
## 1302690 472128
percentday<-round(prop.table(table(delay.dt$daywindow))*100,2)
percentday
##
## Weekday Weekend
## 73.4 26.6
table(delay.dt$timewindowdep)
##
## AM PM
## 740818 1034000
percenttimedep<-round(prop.table(table(delay.dt$timewindowdep))*100,2)
percenttimedep
##
## AM PM
## 41.74 58.26
table(delay.dt$timewindowarr)
##
## AM PM
## 540856 1233962
percenttimearr<-round(prop.table(table(delay.dt$timewindowarr))*100,2)
percenttimearr
##
## AM PM
## 30.47 69.53
# Now we check for two-way cross tabulation by factor variables
table(delay.dt$daywindow,delay.dt$DepStatus)
##
## DelayedonDeparture OntimeDep
## Weekday 435543 867147
## Weekend 153075 319053
library(tigerstats)
## Loading required package: abd
## Loading required package: nlme
## Loading required package: lattice
## Loading required package: grid
## Loading required package: mosaic
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:nlme':
##
## collapse
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: ggformula
## Loading required package: ggplot2
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
## Loading required package: ggstance
##
## Attaching package: 'ggstance'
## The following objects are masked from 'package:ggplot2':
##
## geom_errorbarh, GeomErrorbarh
##
## New to ggformula? Try the tutorials:
## learnr::run_tutorial("introduction", package = "ggformula")
## learnr::run_tutorial("refining", package = "ggformula")
## Loading required package: mosaicData
## Loading required package: Matrix
## Registered S3 method overwritten by 'mosaic':
## method from
## fortify.SpatialPolygonsDataFrame ggplot2
##
## The 'mosaic' package masks several functions from core packages in order to add
## additional features. The original behavior of these functions should not be affected by this.
##
## Note: If you use the Matrix package, be sure to load it BEFORE loading mosaic.
##
## Attaching package: 'mosaic'
## The following object is masked from 'package:Matrix':
##
## mean
## The following object is masked from 'package:ggplot2':
##
## stat
## The following objects are masked from 'package:dplyr':
##
## count, do, tally
## The following objects are masked from 'package:psych':
##
## logit, read.file, rescale
## The following objects are masked from 'package:stats':
##
## binom.test, cor, cor.test, cov, fivenum, IQR, median,
## prop.test, quantile, sd, t.test, var
## The following objects are masked from 'package:base':
##
## max, mean, min, prod, range, sample, sum
## Welcome to tigerstats!
## To learn more about this package, consult its website:
## http://homerhanumat.github.io/tigerstats
##
## Attaching package: 'tigerstats'
## The following object is masked from 'package:psych':
##
## galton
rowPerc(table(delay.dt$daywindow,delay.dt$DepStatus))
##
## DelayedonDeparture OntimeDep Total
## Weekday 33.43 66.57 100.00
## Weekend 32.42 67.58 100.00
table(delay.dt$daywindow,delay.dt$ArrStatus)
##
## DelayedonArrival OntimeArr
## Weekday 472952 829738
## Weekend 156249 315879
library(tigerstats)
rowPerc(table(delay.dt$daywindow,delay.dt$ArrStatus))
##
## DelayedonArrival OntimeArr Total
## Weekday 36.31 63.69 100.00
## Weekend 33.09 66.91 100.00
table(delay.dt$timewindowdep,delay.dt$DepStatus)
##
## DelayedonDeparture OntimeDep
## AM 183690 557128
## PM 404928 629072
rowPerc(table(delay.dt$timewindowdep,delay.dt$DepStatus))
##
## DelayedonDeparture OntimeDep Total
## AM 24.80 75.20 100.00
## PM 39.16 60.84 100.00
table(delay.dt$timewindowdep,delay.dt$ArrStatus)
##
## DelayedonArrival OntimeArr
## AM 226877 513941
## PM 402324 631676
rowPerc(table(delay.dt$timewindowdep,delay.dt$ArrStatus))
##
## DelayedonArrival OntimeArr Total
## AM 30.63 69.37 100.00
## PM 38.91 61.09 100.00
# Now we check for two-way cross tabulation by factor variables
table(delay.dt$timewindowarr,delay.dt$ArrStatus)
##
## DelayedonArrival OntimeArr
## AM 162460 378396
## PM 466741 767221
rowPerc(table(delay.dt$timewindowarr,delay.dt$ArrStatus))
##
## DelayedonArrival OntimeArr Total
## AM 30.04 69.96 100.00
## PM 37.82 62.18 100.00
# using data.table to tabulate descriptive statistics by factor variables
library(data.table)
delay.dt<-data.table(finalairlinedelay.df)
tab1<-delay.dt[,.(N=.N,MeanDepDel = mean(DepDelayMinutes),SDDepDel = sd(DepDelayMinutes), MeanArrDel = mean(ArrDelayMinutes),SDArrDel = sd(ArrDelayMinutes)), by = (airline)][order(MeanArrDel)]
tab1
## airline N MeanDepDel SDDepDel MeanArrDel SDArrDel
## 1: Delta 232804 7.703600 33.50740 7.655350 33.31172
## 2: Southwest 336886 11.191578 26.75957 9.772324 26.00900
## 3: Alaska 62218 8.993603 28.32736 10.178501 28.53498
## 4: American 226578 11.714182 44.12548 12.777105 44.60255
## 5: United 157853 12.003491 43.97220 13.307894 44.17327
## 6: Others 493338 12.526353 44.56514 13.509697 44.66248
## 7: Skywest 190815 13.832293 56.46266 15.003385 56.54742
## 8: Jetblue 74326 17.004130 45.72818 17.714380 45.76497
# The mean departure and arrival delay for the whole dataset was 11.69 Min and 12.14 min respectively, which at first sight shows considerable variation by Airline.
# Repeating the process for checking timeofday and dayofweek variation. Similar variations in mean departure and arrival delay are seen for timeofday and dayofweek
tab2<-delay.dt[,.(N=.N,MeanDepDel = mean(DepDelayMinutes),SDDepDel = sd(DepDelayMinutes), MeanArrDel = mean(ArrDelayMinutes),SDArrDel = sd(ArrDelayMinutes)), by = (timewindowdep)][order(MeanArrDel)]
tab2
## timewindowdep N MeanDepDel SDDepDel MeanArrDel SDArrDel
## 1: AM 740818 8.17897 36.13149 9.248073 36.33691
## 2: PM 1034000 14.21286 44.71249 14.213309 44.72860
tab3<-delay.dt[,.(N=.N, MeanArrDel = mean(ArrDelayMinutes),SDArrDel = sd(ArrDelayMinutes)), by = (timewindowarr)][order(MeanArrDel)]
tab3
## timewindowarr N MeanArrDel SDArrDel
## 1: AM 540856 9.065136 37.02452
## 2: PM 1233962 13.488881 43.25458
tab4<-delay.dt[,.(N=.N,MeanDepDel = mean(DepDelayMinutes),SDDepDel = sd(DepDelayMinutes), MeanArrDel = mean(ArrDelayMinutes),SDArrDel = sd(ArrDelayMinutes)), by = (daywindow)][order(MeanArrDel)]
tab4
## daywindow N MeanDepDel SDDepDel MeanArrDel SDArrDel
## 1: Weekend 472128 10.77776 40.38421 10.8459 40.16021
## 2: Weekday 1302690 12.02646 41.83103 12.6101 41.97233
# summary statistics for above six continuous variables using Aggregate function
aggregate(finalairlinedelay.df[,c(8,15,19:22)],
by = list(finalairlinedelay.df$airline),mean)
## Group.1 DepDelayMinutes ArrDelayMinutes CRSElapsedTime
## 1 Alaska 8.993603 10.178501 206.5684
## 2 American 11.714182 12.777105 167.1867
## 3 Delta 7.703600 7.655350 151.1464
## 4 Jetblue 17.004130 17.714380 174.5919
## 5 Others 12.526353 13.509697 118.1208
## 6 Skywest 13.832293 15.003385 108.2800
## 7 Southwest 11.191578 9.772324 128.3607
## 8 United 12.003491 13.307894 183.1772
## ActualElapsedTime AirTime Distance
## 1 202.7015 174.99381 1333.3650
## 2 163.5417 135.61272 998.3643
## 3 143.4273 118.97884 872.1344
## 4 171.3222 146.69514 1080.3446
## 5 114.7034 88.27521 592.8487
## 6 104.8274 76.29302 488.6364
## 7 121.6868 104.07573 747.2839
## 8 180.1530 153.10035 1170.4063