# Data: We intend to study patterns in flight delays using an extensive data-set of over 17 million flights covering the whole aviation landscape of USA. Our dataset covers domestic US flights for the quarter Oct-Dec 2018.
# We first set the working directory and then use haven package to read a .sav file 
setwd("C:/Users/swapa/OneDrive/Desktop/Airline Delay Data")
library(haven)
airlinedelay.df<-read_sav("Aggregate.sav")# Exploring the data: Before proceeding on statistically establishing relationships between airline delay and certain variables, it is pertinent to perform preliminary analysis to understand structure of the different data variables and their distribution.
# We check the dimensions, variable names
dim(airlinedelay.df)## [1] 1774818     121colnames(airlinedelay.df)##   [1] "Year"                            "Quarter"                        
##   [3] "Month"                           "DayofMonth"                     
##   [5] "DayOfWeek"                       "FlightDate"                     
##   [7] "Reporting_Airline"               "DOT_ID_Reporting_Airline"       
##   [9] "IATA_CODE_Reporting_Airline"     "Tail_Number"                    
##  [11] "Flight_Number_Reporting_Airline" "OriginAirportID"                
##  [13] "OriginAirportSeqID"              "OriginCityMarketID"             
##  [15] "Origin"                          "OriginCityName"                 
##  [17] "OriginState"                     "OriginStateFips"                
##  [19] "OriginStateName"                 "OriginWac"                      
##  [21] "DestAirportID"                   "DestAirportSeqID"               
##  [23] "DestCityMarketID"                "Dest"                           
##  [25] "DestCityName"                    "DestState"                      
##  [27] "DestStateFips"                   "DestStateName"                  
##  [29] "DestWac"                         "CRSDepTime"                     
##  [31] "DepTime"                         "DepDelay"                       
##  [33] "DepDelayMinutes"                 "DepDel15"                       
##  [35] "DepartureDelayGroups"            "DepTimeBlk"                     
##  [37] "TaxiOut"                         "WheelsOff"                      
##  [39] "WheelsOn"                        "TaxiIn"                         
##  [41] "CRSArrTime"                      "ArrTime"                        
##  [43] "ArrDelay"                        "ArrDelayMinutes"                
##  [45] "ArrDel15"                        "ArrivalDelayGroups"             
##  [47] "ArrTimeBlk"                      "Cancelled"                      
##  [49] "CancellationCode"                "Diverted"                       
##  [51] "CRSElapsedTime"                  "ActualElapsedTime"              
##  [53] "AirTime"                         "Flights"                        
##  [55] "Distance"                        "DistanceGroup"                  
##  [57] "CarrierDelay"                    "WeatherDelay"                   
##  [59] "NASDelay"                        "SecurityDelay"                  
##  [61] "LateAircraftDelay"               "FirstDepTime"                   
##  [63] "TotalAddGTime"                   "LongestAddGTime"                
##  [65] "DivAirportLandings"              "DivReachedDest"                 
##  [67] "DivActualElapsedTime"            "DivArrDelay"                    
##  [69] "DivDistance"                     "Div1Airport"                    
##  [71] "Div1AirportID"                   "Div1AirportSeqID"               
##  [73] "Div1WheelsOn"                    "Div1TotalGTime"                 
##  [75] "Div1LongestGTime"                "Div1WheelsOff"                  
##  [77] "Div1TailNum"                     "Div2Airport"                    
##  [79] "Div2AirportID"                   "Div2AirportSeqID"               
##  [81] "Div2WheelsOn"                    "Div2TotalGTime"                 
##  [83] "Div2LongestGTime"                "Div2WheelsOff"                  
##  [85] "Div2TailNum"                     "Div3Airport"                    
##  [87] "Div3AirportID"                   "Div3AirportSeqID"               
##  [89] "Div3WheelsOn"                    "Div3TotalGTime"                 
##  [91] "Div3LongestGTime"                "Div3WheelsOff"                  
##  [93] "Div3TailNum"                     "Div4Airport"                    
##  [95] "Div4AirportID"                   "Div4AirportSeqID"               
##  [97] "Div4WheelsOn"                    "Div4TotalGTime"                 
##  [99] "Div4LongestGTime"                "Div4WheelsOff"                  
## [101] "Div4TailNum"                     "Div5Airport"                    
## [103] "Div5AirportID"                   "Div5AirportSeqID"               
## [105] "Div5WheelsOn"                    "Div5TotalGTime"                 
## [107] "Div5LongestGTime"                "Div5WheelsOff"                  
## [109] "Div5TailNum"                     "filter_$"                       
## [111] "WEEKEND"                         "PMDEP"                          
## [113] "PMARR"                           "Southwest"                      
## [115] "American"                        "Delta"                          
## [117] "United"                          "Alaska"                         
## [119] "Jetblue"                         "Skywest"                        
## [121] "Others"# Before proceeding further, we find that the dataset contains certain columns, not necessary in the context of proposed analysis. We prune the dataset by removing unnecessary columns, and again check dimensionality and variable names
modifiedairlinedelay.df<-airlinedelay.df[,c(6,7,15,24,30:36,41:48,50:53,55,57:61,111:121)]
dim(modifiedairlinedelay.df)## [1] 1774818      40colnames(modifiedairlinedelay.df)##  [1] "FlightDate"           "Reporting_Airline"    "Origin"              
##  [4] "Dest"                 "CRSDepTime"           "DepTime"             
##  [7] "DepDelay"             "DepDelayMinutes"      "DepDel15"            
## [10] "DepartureDelayGroups" "DepTimeBlk"           "CRSArrTime"          
## [13] "ArrTime"              "ArrDelay"             "ArrDelayMinutes"     
## [16] "ArrDel15"             "ArrivalDelayGroups"   "ArrTimeBlk"          
## [19] "Cancelled"            "Diverted"             "CRSElapsedTime"      
## [22] "ActualElapsedTime"    "AirTime"              "Distance"            
## [25] "CarrierDelay"         "WeatherDelay"         "NASDelay"            
## [28] "SecurityDelay"        "LateAircraftDelay"    "WEEKEND"             
## [31] "PMDEP"                "PMARR"                "Southwest"           
## [34] "American"             "Delta"                "United"              
## [37] "Alaska"               "Jetblue"              "Skywest"             
## [40] "Others"View(modifiedairlinedelay.df)FlightDate: Date of departure
Reporting Airline: Two character airline code
Origin: Three character origin Airport code
Dest: Three character destination Airport code
CRSDepTime: Scheduled departure time (mentioned as a continuous number e.g. 1105 for 11:05 hrs and 1823 for 18:23 hrs)
DepTime:Actual departure time (mentioned as a continuous number e.g. 1105 for 11:05 hrs and 1823 for 18:23 hrs)
DepDelay: DepTime-CRSDepTime (negative for before-time departures)
DepDelayMinutes: DepDelay if DepDelay>=0, 0 otherwise
DepDel15: 1 if DepDelayMinutes>=15, 0 otherwise
DepartureDelayGroups: Departure delay intervals every 15 min from <-15 to >180 e.g. -1 for -7 min, 3 for 55 min, 12 for both 207 min as well as 2109 min
DepTimeBlk: departure time hourly blocks
CRSArrTime: Scheduled arrival time (mentioned as a continuous number e.g. 1105 for 11:05 hrs and 1823 for 18:23 hrs)
ArrTime:Actual arrival time (mentioned as a continuous number e.g. 1105 for 11:05 hrs and 1823 for 18:23 hrs)
ArrDelay: ArrTime-CRSArrTime (negative for before-time arrivals)
ArrDelayMinutes: Arrival delay in minutes( Zero for before-time or on-time flights)
ArrDel15: 1 if ArrDelayMinutes>=15, 0 otherwise
ArrivalDelayGroups: Arrival delay intervals every 15 min from <-15 to >180 e.g. -1 for -7 min, 3 for 55 min, 12 for both 207 min as well as 2109 min
ArrTimeBlk: arrival time hourly blocks
Cancelled: 1 if flight cancelled, 0 otherwise
Diverted: 1 if diverted, 0 otherwise
CRSElapsedTime: CRSArrTime-CRSDepTime
ActualElapsedTime: ArrTime-DepTime
Airtime: Time that the flight was in air
Distance: origin to destination distance
CarrierDelay: Delay attributable to carrier in Min
WeatherDelay: Delay attributable to carrier in Min
NASDelay: Delay attributable to carrier in Min
SecurityDelay: Delay attributable to carrier in Min
LateAircraftDelay: Delay attributable to carrier in Min
WEEKEND: 1 for weekend, 0 for weekday
PMDEP: 1 if scheduled departure time after 1200 hrs, 0 otherwise
PMARR: 1 if scheduled arrival time after 1200 hrs, 0 otherwise
Southwest: 1, if specified carrier, 0 otherwise
American: 1, if specified carrier, 0 otherwise
Delta: 1, if specified carrier, 0 otherwise
United: 1, if specified carrier, 0 otherwise
Alaska: 1, if specified carrier, 0 otherwise
Jetblue: 1, if specified carrier, 0 otherwise
Skywest: 1, if specified carrier, 0 otherwise
Others: 1, if specified carrier, 0 otherwise
# Description of data
str(modifiedairlinedelay.df)## Classes 'tbl_df', 'tbl' and 'data.frame':    1774818 obs. of  40 variables:
##  $ FlightDate          : Date, format: "2018-10-14" "2018-10-15" ...
##  $ Reporting_Airline   : chr  "AA" "AA" "AA" "AA" ...
##   ..- attr(*, "format.spss")= chr "A2"
##   ..- attr(*, "display_width")= int 2
##  $ Origin              : chr  "LAX" "LAX" "LAX" "LAX" ...
##   ..- attr(*, "format.spss")= chr "A3"
##   ..- attr(*, "display_width")= int 3
##  $ Dest                : chr  "CLT" "CLT" "CLT" "CLT" ...
##   ..- attr(*, "format.spss")= chr "A3"
##   ..- attr(*, "display_width")= int 3
##  $ CRSDepTime          : num  1123 1123 1123 1123 1123 ...
##   ..- attr(*, "format.spss")= chr "F4.0"
##   ..- attr(*, "display_width")= int 12
##  $ DepTime             : num  1127 1119 1119 1120 1125 ...
##   ..- attr(*, "format.spss")= chr "F4.0"
##   ..- attr(*, "display_width")= int 12
##  $ DepDelay            : num  4 -4 -4 -3 2 -1 1 -6 -4 -3 ...
##   ..- attr(*, "format.spss")= chr "F4.0"
##   ..- attr(*, "display_width")= int 12
##  $ DepDelayMinutes     : num  4 0 0 0 2 0 1 0 0 0 ...
##   ..- attr(*, "format.spss")= chr "F4.0"
##   ..- attr(*, "display_width")= int 12
##  $ DepDel15            : num  0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "format.spss")= chr "F1.0"
##   ..- attr(*, "display_width")= int 12
##  $ DepartureDelayGroups: num  0 -1 -1 -1 0 -1 0 -1 -1 -1 ...
##   ..- attr(*, "format.spss")= chr "F2.0"
##   ..- attr(*, "display_width")= int 12
##  $ DepTimeBlk          : chr  "1100-1159" "1100-1159" "1100-1159" "1100-1159" ...
##   ..- attr(*, "format.spss")= chr "A9"
##   ..- attr(*, "display_width")= int 9
##  $ CRSArrTime          : num  1910 1910 1910 1910 1910 1910 1910 1910 1910 1910 ...
##   ..- attr(*, "format.spss")= chr "F4.0"
##   ..- attr(*, "display_width")= int 12
##  $ ArrTime             : num  1854 1849 1856 1902 1906 ...
##   ..- attr(*, "format.spss")= chr "F4.0"
##   ..- attr(*, "display_width")= int 12
##  $ ArrDelay            : num  -16 -21 -14 -8 -4 -8 9 17 -9 -13 ...
##   ..- attr(*, "format.spss")= chr "F4.0"
##   ..- attr(*, "display_width")= int 12
##  $ ArrDelayMinutes     : num  0 0 0 0 0 0 9 17 0 0 ...
##   ..- attr(*, "format.spss")= chr "F4.0"
##   ..- attr(*, "display_width")= int 12
##  $ ArrDel15            : num  0 0 0 0 0 0 0 1 0 0 ...
##   ..- attr(*, "format.spss")= chr "F1.0"
##   ..- attr(*, "display_width")= int 12
##  $ ArrivalDelayGroups  : num  -2 -2 -1 -1 -1 -1 0 1 -1 -1 ...
##   ..- attr(*, "format.spss")= chr "F2.0"
##   ..- attr(*, "display_width")= int 12
##  $ ArrTimeBlk          : chr  "1900-1959" "1900-1959" "1900-1959" "1900-1959" ...
##   ..- attr(*, "format.spss")= chr "A9"
##   ..- attr(*, "display_width")= int 9
##  $ Cancelled           : num  0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "format.spss")= chr "F1.0"
##   ..- attr(*, "display_width")= int 12
##  $ Diverted            : num  0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "format.spss")= chr "F1.0"
##   ..- attr(*, "display_width")= int 12
##  $ CRSElapsedTime      : num  287 287 287 287 287 287 287 287 287 287 ...
##   ..- attr(*, "format.spss")= chr "F3.0"
##   ..- attr(*, "display_width")= int 12
##  $ ActualElapsedTime   : num  267 270 277 282 281 280 295 310 282 277 ...
##   ..- attr(*, "format.spss")= chr "F3.0"
##   ..- attr(*, "display_width")= int 12
##  $ AirTime             : num  232 235 248 256 249 244 250 244 249 256 ...
##   ..- attr(*, "format.spss")= chr "F3.0"
##   ..- attr(*, "display_width")= int 12
##  $ Distance            : num  2125 2125 2125 2125 2125 ...
##   ..- attr(*, "format.spss")= chr "F4.0"
##   ..- attr(*, "display_width")= int 12
##  $ CarrierDelay        : num  NA NA NA NA NA NA NA 0 NA NA ...
##   ..- attr(*, "format.spss")= chr "F4.0"
##   ..- attr(*, "display_width")= int 12
##  $ WeatherDelay        : num  NA NA NA NA NA NA NA 0 NA NA ...
##   ..- attr(*, "format.spss")= chr "F4.0"
##   ..- attr(*, "display_width")= int 12
##  $ NASDelay            : num  NA NA NA NA NA NA NA 17 NA NA ...
##   ..- attr(*, "format.spss")= chr "F4.0"
##   ..- attr(*, "display_width")= int 12
##  $ SecurityDelay       : num  NA NA NA NA NA NA NA 0 NA NA ...
##   ..- attr(*, "format.spss")= chr "F3.0"
##   ..- attr(*, "display_width")= int 12
##  $ LateAircraftDelay   : num  NA NA NA NA NA NA NA 0 NA NA ...
##   ..- attr(*, "format.spss")= chr "F4.0"
##   ..- attr(*, "display_width")= int 12
##  $ WEEKEND             : num  1 0 0 0 0 0 1 0 0 0 ...
##   ..- attr(*, "label")= chr "WEEKEND"
##   ..- attr(*, "format.spss")= chr "F8.2"
##   ..- attr(*, "display_width")= int 10
##  $ PMDEP               : num  0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "label")= chr "PMDEP"
##   ..- attr(*, "format.spss")= chr "F8.2"
##   ..- attr(*, "display_width")= int 10
##  $ PMARR               : num  1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "label")= chr "PMARR"
##   ..- attr(*, "format.spss")= chr "F8.2"
##   ..- attr(*, "display_width")= int 10
##  $ Southwest           : num  0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "label")= chr "Southwest"
##   ..- attr(*, "format.spss")= chr "F8.2"
##   ..- attr(*, "display_width")= int 11
##  $ American            : num  1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "label")= chr "American"
##   ..- attr(*, "format.spss")= chr "F8.2"
##   ..- attr(*, "display_width")= int 10
##  $ Delta               : num  0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "label")= chr "Delta"
##   ..- attr(*, "format.spss")= chr "F8.2"
##   ..- attr(*, "display_width")= int 10
##  $ United              : num  0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "label")= chr "United"
##   ..- attr(*, "format.spss")= chr "F8.2"
##   ..- attr(*, "display_width")= int 10
##  $ Alaska              : num  0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "label")= chr "Alaska"
##   ..- attr(*, "format.spss")= chr "F8.2"
##   ..- attr(*, "display_width")= int 10
##  $ Jetblue             : num  0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "label")= chr "Jetblue"
##   ..- attr(*, "format.spss")= chr "F8.2"
##   ..- attr(*, "display_width")= int 10
##  $ Skywest             : num  0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "label")= chr "Skywest"
##   ..- attr(*, "format.spss")= chr "F8.2"
##   ..- attr(*, "display_width")= int 10
##  $ Others              : num  0 0 0 0 0 0 0 0 0 0 ...
##   ..- attr(*, "label")= chr "Others"
##   ..- attr(*, "format.spss")= chr "F8.2"
##   ..- attr(*, "display_width")= int 10# summary of selected variables 
summary(modifiedairlinedelay.df [,c(8,9,15,16,19:40)])##  DepDelayMinutes      DepDel15      ArrDelayMinutes      ArrDel15     
##  Min.   :   0.00   Min.   :0.0000   Min.   :   0.00   Min.   :0.0000  
##  1st Qu.:   0.00   1st Qu.:0.0000   1st Qu.:   0.00   1st Qu.:0.0000  
##  Median :   0.00   Median :0.0000   Median :   0.00   Median :0.0000  
##  Mean   :  11.69   Mean   :0.1694   Mean   :  12.14   Mean   :0.1815  
##  3rd Qu.:   5.00   3rd Qu.:0.0000   3rd Qu.:   7.00   3rd Qu.:0.0000  
##  Max.   :2109.00   Max.   :1.0000   Max.   :2153.00   Max.   :1.0000  
##                                                                       
##    Cancelled    Diverted CRSElapsedTime  ActualElapsedTime    AirTime     
##  Min.   :0   Min.   :0   Min.   :-99.0   Min.   : 14.0     Min.   :  7.0  
##  1st Qu.:0   1st Qu.:0   1st Qu.: 88.0   1st Qu.: 84.0     1st Qu.: 60.0  
##  Median :0   Median :0   Median :122.0   Median :118.0     Median : 92.0  
##  Mean   :0   Mean   :0   Mean   :140.9   Mean   :136.2     Mean   :111.3  
##  3rd Qu.:0   3rd Qu.:0   3rd Qu.:170.0   3rd Qu.:166.0     3rd Qu.:141.0  
##  Max.   :0   Max.   :0   Max.   :703.0   Max.   :723.0     Max.   :695.0  
##                                                                           
##     Distance       CarrierDelay      WeatherDelay        NASDelay      
##  Min.   :  31.0   Min.   :   0.0    Min.   :   0      Min.   :   0.0   
##  1st Qu.: 363.0   1st Qu.:   0.0    1st Qu.:   0      1st Qu.:   0.0   
##  Median : 631.0   Median :   0.0    Median :   0      Median :   4.0   
##  Mean   : 797.1   Mean   :  19.1    Mean   :   3      Mean   :  14.9   
##  3rd Qu.:1028.0   3rd Qu.:  17.0    3rd Qu.:   0      3rd Qu.:  19.0   
##  Max.   :4983.0   Max.   :2109.0    Max.   :2098      Max.   :1498.0   
##                   NA's   :1452702   NA's   :1452702   NA's   :1452702  
##  SecurityDelay     LateAircraftDelay    WEEKEND          PMDEP       
##  Min.   :  0.0     Min.   :   0.0    Min.   :0.000   Min.   :0.0000  
##  1st Qu.:  0.0     1st Qu.:   0.0    1st Qu.:0.000   1st Qu.:0.0000  
##  Median :  0.0     Median :   1.0    Median :0.000   Median :1.0000  
##  Mean   :  0.1     Mean   :  23.8    Mean   :0.266   Mean   :0.5826  
##  3rd Qu.:  0.0     3rd Qu.:  29.0    3rd Qu.:1.000   3rd Qu.:1.0000  
##  Max.   :738.0     Max.   :1411.0    Max.   :1.000   Max.   :1.0000  
##  NA's   :1452702   NA's   :1452702                                   
##      PMARR          Southwest         American          Delta       
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :1.0000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.6953   Mean   :0.1898   Mean   :0.1277   Mean   :0.1312  
##  3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##                                                                     
##      United            Alaska           Jetblue           Skywest      
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000  
##  Median :0.00000   Median :0.00000   Median :0.00000   Median :0.0000  
##  Mean   :0.08894   Mean   :0.03506   Mean   :0.04188   Mean   :0.1075  
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.0000  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.00000   Max.   :1.0000  
##                                                                        
##      Others     
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :0.000  
##  Mean   :0.278  
##  3rd Qu.:1.000  
##  Max.   :1.000  
## # Important hints from this summary
# DepDel15 shows that around 17% flights delayed > 15 min on departure
# Similarly ArrDel15 shows around 18% flights delayed > 15 min on arrival
# Categorical dummies WEEKEND, PMDEP, PMARR and Airline name dummies give respective percentages on first sight e.g. 26.6 % flights departed on weekend and 10.75 % of total flights were run by Skywest.# Based on above summary results, we see that variables cancelled and diverted are zero in all records. Hence these variables can't discriminate between records. The five attributable delays- CarrierDelay, WeatherDelay, NASDelay, SecurityDelay, LateAircraftDelay have 1452702 missing values which based on total no. of records, can't be approximated. Hence these seven variables are also excluded from the data
finalairlinedelay.df<-modifiedairlinedelay.df[,c(1:18,21:24,30:40)]
View(finalairlinedelay.df)# summary statistics of important metric data variables 
summary(finalairlinedelay.df$DepDelayMinutes)##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00    0.00   11.69    5.00 2109.00summary(finalairlinedelay.df$ArrDelayMinutes)##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00    0.00   12.14    7.00 2153.00# the mean departure delay is 11.69 min
# the mean departure delay is 12.14 min# summary statistics of important metric data variables 
# skewness and kurtosis are distribution-shape related characteristics. skewness indicates symmetry about mean, whereas kurtosis measures relative peakedness or flatness of distribution. negative skewness shows left skew i.e. left tail is longer and most of distribution is at the right, whereas positive means right skew. A positive kurtosis indicates that distribution is more peaked than normal distribution, whereas negative kurtosis shows that distribution is flatter than normal distribution. 
library(psych)
describe(finalairlinedelay.df$DepDelayMinutes)[,c(1:5,8:9,11:12)]##    vars       n  mean    sd median min  max  skew kurtosis
## X1    1 1774818 11.69 41.45      0   0 2109 11.52   232.62describe(finalairlinedelay.df$ArrDelayMinutes)[,c(1:5,8:9,11:12)]##    vars       n  mean    sd median min  max  skew kurtosis
## X1    1 1774818 12.14 41.51      0   0 2153 11.51   233.53# summary statistics of important metric data variables- continued for independent variables
library(psych)
describe(finalairlinedelay.df$CRSElapsedTime)[,c(1:5,8:9,11:12)]##    vars       n   mean    sd median min max skew kurtosis
## X1    1 1774818 140.85 72.91    122 -99 703 1.44     2.34describe(finalairlinedelay.df$ActualElapsedTime)[,c(1:5,8:9,11:12)]##    vars       n   mean   sd median min max skew kurtosis
## X1    1 1774818 136.25 72.5    118  14 723 1.42     2.29describe(finalairlinedelay.df$AirTime)[,c(1:5,8:9,11:12)]##    vars       n   mean    sd median min max skew kurtosis
## X1    1 1774818 111.31 70.74     92   7 695 1.45     2.33describe(finalairlinedelay.df$Distance)[,c(1:5,8:9,11:12)]##    vars       n   mean     sd median min  max skew kurtosis
## X1    1 1774818 797.11 593.98    631  31 4983 1.48     2.44# Preliminary idea of outliers by quantile analysis
quantile(finalairlinedelay.df$DepDelayMinutes,probs=seq(0,1,0.05))##   0%   5%  10%  15%  20%  25%  30%  35%  40%  45%  50%  55%  60%  65%  70% 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    2 
##  75%  80%  85%  90%  95% 100% 
##    5   10   18   32   63 2109quantile(finalairlinedelay.df$ArrDelayMinutes,probs=seq(0,1,0.05))##   0%   5%  10%  15%  20%  25%  30%  35%  40%  45%  50%  55%  60%  65%  70% 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    1    4 
##  75%  80%  85%  90%  95% 100% 
##    7   12   20   33   63 2153quantile(finalairlinedelay.df$CRSElapsedTime,probs=seq(0,1,0.05))##   0%   5%  10%  15%  20%  25%  30%  35%  40%  45%  50%  55%  60%  65%  70% 
##  -99   61   70   76   83   88   94  100  107  115  122  130  140  150  160 
##  75%  80%  85%  90%  95% 100% 
##  170  184  208  243  302  703quantile(finalairlinedelay.df$ActualElapsedTime,probs=seq(0,1,0.05))##   0%   5%  10%  15%  20%  25%  30%  35%  40%  45%  50%  55%  60%  65%  70% 
##   14   56   65   72   78   84   90   96  103  110  118  126  135  145  155 
##  75%  80%  85%  90%  95% 100% 
##  166  181  203  238  295  723quantile(finalairlinedelay.df$AirTime,probs=seq(0,1,0.05))##   0%   5%  10%  15%  20%  25%  30%  35%  40%  45%  50%  55%  60%  65%  70% 
##    7   35   43   48   54   60   65   71   78   85   92  100  110  119  129 
##  75%  80%  85%  90%  95% 100% 
##  141  154  176  211  268  695quantile(finalairlinedelay.df$Distance,probs=seq(0,1,0.05))##   0%   5%  10%  15%  20%  25%  30%  35%  40%  45%  50%  55%  60%  65%  70% 
##   31  168  223  270  319  363  408  456  519  583  631  693  776  864  948 
##  75%  80%  85%  90%  95% 100% 
## 1028 1142 1368 1626 2173 4983# The analysis of skewness and 5% quantiles shows that DepDelayMinutes and ArrDelayMinutes are heavily right-skewed i.e. most of the data points are at the left. This is understandable as arounf 65-70% flights are on time i.e. these variables are zero for 65-70% of cases. What is more improtant is that 95% quantile of both these variables is at 63 Min whereas max is 2109/2153 min. This huge difference is an indication of outlier effect, which may have to be adjusted. We leave this decision to later statistical analysis# So far our preliminary analysis was based on creating dummy variables in SPSS/Excel and then exporting data to R. To use the factor variable operations in-built in R and to ease data analysis, we add a new column to indicate whether flight departed ontime or late and similarly whether it arrived ontime or late
finalairlinedelay.df$DepStatus<- ifelse(finalairlinedelay.df$DepDelayMinutes > 0, "DelayedonDeparture", "OntimeDep")
finalairlinedelay.df$ArrStatus<- ifelse(finalairlinedelay.df$ArrDelayMinutes > 0, "DelayedonArrival", "OntimeArr")#  To use the factor variable operations in-built in R and to ease data analysis, we add a new column to indicate names of seven major airlines and amalgamate all other airlines as 'Others'. 
finalairlinedelay.df$airline<- ifelse(finalairlinedelay.df$Reporting_Airline == "AA" , "American",
                  ifelse(finalairlinedelay.df$Reporting_Airline == "AS" , "Alaska",
                         ifelse(finalairlinedelay.df$Reporting_Airline == "WN" , "Southwest",
                                ifelse(finalairlinedelay.df$Reporting_Airline == "DL" , "Delta", ifelse(finalairlinedelay.df$Reporting_Airline == "UA" , "United", ifelse(finalairlinedelay.df$Reporting_Airline == "B6" , "Jetblue", ifelse(finalairlinedelay.df$Reporting_Airline == "OO" , "Skywest", "Others"
)))))))#To use the factor variable operations in-built in R and to ease data analysis, we add a new column to indicate time window of departure
finalairlinedelay.df$timewindowdep<- ifelse(finalairlinedelay.df$CRSDepTime >= 1200 , "PM", "AM")#To use the factor variable operations in-built in R and to ease data analysis, we add a new column to indicate time window of arrival
finalairlinedelay.df$timewindowarr<- ifelse(finalairlinedelay.df$CRSArrTime >= 1200 , "PM", "AM")#To use the factor variable operations in-built in R and to ease data analysis, we add a new column to indicate day of week. We use the initial file imported from SPSS as it contained dayofweek variable, which we later removed from data file.
finalairlinedelay.df$daywindow<- ifelse(airlinedelay.df$DayOfWeek>= 6 , "Weekend", "Weekday")# To understand the respective frequency of delay on departure and arrival in absolute count as well as by factor variables. This is to give an initial idea, whether delays deserve to be studies futher.
# We start with one-way frequency tables and percentages
library(data.table)
delay.dt<-data.table(finalairlinedelay.df)
table(delay.dt$DepStatus)## 
## DelayedonDeparture          OntimeDep 
##             588618            1186200percentDepstatus<-round(prop.table(table(delay.dt$DepStatus))*100,2)
percentDepstatus## 
## DelayedonDeparture          OntimeDep 
##              33.16              66.84table(delay.dt$ArrStatus)## 
## DelayedonArrival        OntimeArr 
##           629201          1145617percentArrstatus<-round(prop.table(table(delay.dt$ArrStatus))*100,2)
percentArrstatus## 
## DelayedonArrival        OntimeArr 
##            35.45            64.55table(delay.dt$daywindow)## 
## Weekday Weekend 
## 1302690  472128percentday<-round(prop.table(table(delay.dt$daywindow))*100,2)
percentday## 
## Weekday Weekend 
##    73.4    26.6table(delay.dt$timewindowdep)## 
##      AM      PM 
##  740818 1034000percenttimedep<-round(prop.table(table(delay.dt$timewindowdep))*100,2)
percenttimedep## 
##    AM    PM 
## 41.74 58.26table(delay.dt$timewindowarr)## 
##      AM      PM 
##  540856 1233962percenttimearr<-round(prop.table(table(delay.dt$timewindowarr))*100,2)
percenttimearr## 
##    AM    PM 
## 30.47 69.53# Now we check for two-way cross tabulation by factor variables
table(delay.dt$daywindow,delay.dt$DepStatus)##          
##           DelayedonDeparture OntimeDep
##   Weekday             435543    867147
##   Weekend             153075    319053library(tigerstats)## Loading required package: abd## Loading required package: nlme## Loading required package: lattice## Loading required package: grid## Loading required package: mosaic## Loading required package: dplyr## 
## Attaching package: 'dplyr'## The following object is masked from 'package:nlme':
## 
##     collapse## The following objects are masked from 'package:data.table':
## 
##     between, first, last## The following objects are masked from 'package:stats':
## 
##     filter, lag## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union## Loading required package: ggformula## Loading required package: ggplot2## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang## 
## Attaching package: 'ggplot2'## The following objects are masked from 'package:psych':
## 
##     %+%, alpha## Loading required package: ggstance## 
## Attaching package: 'ggstance'## The following objects are masked from 'package:ggplot2':
## 
##     geom_errorbarh, GeomErrorbarh## 
## New to ggformula?  Try the tutorials: 
##  learnr::run_tutorial("introduction", package = "ggformula")
##  learnr::run_tutorial("refining", package = "ggformula")## Loading required package: mosaicData## Loading required package: Matrix## Registered S3 method overwritten by 'mosaic':
##   method                           from   
##   fortify.SpatialPolygonsDataFrame ggplot2## 
## The 'mosaic' package masks several functions from core packages in order to add 
## additional features.  The original behavior of these functions should not be affected by this.
## 
## Note: If you use the Matrix package, be sure to load it BEFORE loading mosaic.## 
## Attaching package: 'mosaic'## The following object is masked from 'package:Matrix':
## 
##     mean## The following object is masked from 'package:ggplot2':
## 
##     stat## The following objects are masked from 'package:dplyr':
## 
##     count, do, tally## The following objects are masked from 'package:psych':
## 
##     logit, read.file, rescale## The following objects are masked from 'package:stats':
## 
##     binom.test, cor, cor.test, cov, fivenum, IQR, median,
##     prop.test, quantile, sd, t.test, var## The following objects are masked from 'package:base':
## 
##     max, mean, min, prod, range, sample, sum## Welcome to tigerstats!
## To learn more about this package, consult its website:
##  http://homerhanumat.github.io/tigerstats## 
## Attaching package: 'tigerstats'## The following object is masked from 'package:psych':
## 
##     galtonrowPerc(table(delay.dt$daywindow,delay.dt$DepStatus))##          
##           DelayedonDeparture OntimeDep  Total
##   Weekday              33.43     66.57 100.00
##   Weekend              32.42     67.58 100.00table(delay.dt$daywindow,delay.dt$ArrStatus)##          
##           DelayedonArrival OntimeArr
##   Weekday           472952    829738
##   Weekend           156249    315879library(tigerstats)
rowPerc(table(delay.dt$daywindow,delay.dt$ArrStatus))##          
##           DelayedonArrival OntimeArr  Total
##   Weekday            36.31     63.69 100.00
##   Weekend            33.09     66.91 100.00table(delay.dt$timewindowdep,delay.dt$DepStatus)##     
##      DelayedonDeparture OntimeDep
##   AM             183690    557128
##   PM             404928    629072rowPerc(table(delay.dt$timewindowdep,delay.dt$DepStatus))##     
##      DelayedonDeparture OntimeDep  Total
##   AM              24.80     75.20 100.00
##   PM              39.16     60.84 100.00table(delay.dt$timewindowdep,delay.dt$ArrStatus)##     
##      DelayedonArrival OntimeArr
##   AM           226877    513941
##   PM           402324    631676rowPerc(table(delay.dt$timewindowdep,delay.dt$ArrStatus))##     
##      DelayedonArrival OntimeArr  Total
##   AM            30.63     69.37 100.00
##   PM            38.91     61.09 100.00# Now we check for two-way cross tabulation by factor variables
table(delay.dt$timewindowarr,delay.dt$ArrStatus)##     
##      DelayedonArrival OntimeArr
##   AM           162460    378396
##   PM           466741    767221rowPerc(table(delay.dt$timewindowarr,delay.dt$ArrStatus))##     
##      DelayedonArrival OntimeArr  Total
##   AM            30.04     69.96 100.00
##   PM            37.82     62.18 100.00# using data.table to tabulate descriptive statistics by factor variables
library(data.table)
delay.dt<-data.table(finalairlinedelay.df)
tab1<-delay.dt[,.(N=.N,MeanDepDel = mean(DepDelayMinutes),SDDepDel = sd(DepDelayMinutes), MeanArrDel = mean(ArrDelayMinutes),SDArrDel = sd(ArrDelayMinutes)), by = (airline)][order(MeanArrDel)]
tab1##      airline      N MeanDepDel SDDepDel MeanArrDel SDArrDel
## 1:     Delta 232804   7.703600 33.50740   7.655350 33.31172
## 2: Southwest 336886  11.191578 26.75957   9.772324 26.00900
## 3:    Alaska  62218   8.993603 28.32736  10.178501 28.53498
## 4:  American 226578  11.714182 44.12548  12.777105 44.60255
## 5:    United 157853  12.003491 43.97220  13.307894 44.17327
## 6:    Others 493338  12.526353 44.56514  13.509697 44.66248
## 7:   Skywest 190815  13.832293 56.46266  15.003385 56.54742
## 8:   Jetblue  74326  17.004130 45.72818  17.714380 45.76497# The mean departure and arrival delay for the whole dataset was 11.69 Min and 12.14 min respectively, which at first sight shows considerable variation by Airline.
# Repeating the process for checking timeofday and dayofweek variation. Similar variations in mean departure and arrival delay are seen for timeofday and dayofweek
tab2<-delay.dt[,.(N=.N,MeanDepDel = mean(DepDelayMinutes),SDDepDel = sd(DepDelayMinutes), MeanArrDel = mean(ArrDelayMinutes),SDArrDel = sd(ArrDelayMinutes)), by = (timewindowdep)][order(MeanArrDel)]
tab2##    timewindowdep       N MeanDepDel SDDepDel MeanArrDel SDArrDel
## 1:            AM  740818    8.17897 36.13149   9.248073 36.33691
## 2:            PM 1034000   14.21286 44.71249  14.213309 44.72860tab3<-delay.dt[,.(N=.N, MeanArrDel = mean(ArrDelayMinutes),SDArrDel = sd(ArrDelayMinutes)), by = (timewindowarr)][order(MeanArrDel)]
tab3##    timewindowarr       N MeanArrDel SDArrDel
## 1:            AM  540856   9.065136 37.02452
## 2:            PM 1233962  13.488881 43.25458tab4<-delay.dt[,.(N=.N,MeanDepDel = mean(DepDelayMinutes),SDDepDel = sd(DepDelayMinutes), MeanArrDel = mean(ArrDelayMinutes),SDArrDel = sd(ArrDelayMinutes)), by = (daywindow)][order(MeanArrDel)]
tab4##    daywindow       N MeanDepDel SDDepDel MeanArrDel SDArrDel
## 1:   Weekend  472128   10.77776 40.38421    10.8459 40.16021
## 2:   Weekday 1302690   12.02646 41.83103    12.6101 41.97233# summary statistics for above six continuous variables using Aggregate function
aggregate(finalairlinedelay.df[,c(8,15,19:22)],
          by = list(finalairlinedelay.df$airline),mean)##     Group.1 DepDelayMinutes ArrDelayMinutes CRSElapsedTime
## 1    Alaska        8.993603       10.178501       206.5684
## 2  American       11.714182       12.777105       167.1867
## 3     Delta        7.703600        7.655350       151.1464
## 4   Jetblue       17.004130       17.714380       174.5919
## 5    Others       12.526353       13.509697       118.1208
## 6   Skywest       13.832293       15.003385       108.2800
## 7 Southwest       11.191578        9.772324       128.3607
## 8    United       12.003491       13.307894       183.1772
##   ActualElapsedTime   AirTime  Distance
## 1          202.7015 174.99381 1333.3650
## 2          163.5417 135.61272  998.3643
## 3          143.4273 118.97884  872.1344
## 4          171.3222 146.69514 1080.3446
## 5          114.7034  88.27521  592.8487
## 6          104.8274  76.29302  488.6364
## 7          121.6868 104.07573  747.2839
## 8          180.1530 153.10035 1170.4063