Reading the Data

library(psych)
library(dplyr)
library(data.table)

# load() restores the saved objects into the current environment and returns
# only their *names* (a character vector), so its result is not the data.
# The rows used below come from the `FDelay.dt` object; fail fast if the
# file did not provide it.
loaded_objects <- load("FDelay.rda")
stopifnot("FDelay.dt" %in% loaded_objects)

# Keep at most the first 150,000 rows. Unlike `[1:150000, ]`, head() does not
# pad the result with NA rows when the table is shorter than requested.
delay_df <- head(FDelay.dt, 150000)

Dimensions of the Data

# Number of rows and columns of the working table.
c(nrow(delay_df), ncol(delay_df))
## [1] 150000     40

Column Names

# List the column names of the working table.
names(delay_df)
##  [1] "V1"                   "FlightDate"           "Reporting_Airline"   
##  [4] "Origin"               "Dest"                 "CRSDepTime"          
##  [7] "DepTime"              "DepDelay"             "DepDelayMinutes"     
## [10] "DepDel15"             "DepartureDelayGroups" "DepTimeBlk"          
## [13] "CRSArrTime"           "ArrTime"              "ArrDelay"            
## [16] "ArrDelayMinutes"      "ArrDel15"             "ArrivalDelayGroups"  
## [19] "ArrTimeBlk"           "CRSElapsedTime"       "ActualElapsedTime"   
## [22] "AirTime"              "Distance"             "WEEKEND"             
## [25] "PMDEP"                "PMARR"                "Southwest"           
## [28] "American"             "Delta"                "United"              
## [31] "Alaska"               "Jetblue"              "Skywest"             
## [34] "Others"               "DepStatus"            "ArrStatus"           
## [37] "airline"              "timewindowdep"        "timewindowarr"       
## [40] "daywindow"

Summary Statistics

# Compact overview of every column: storage type plus the first few values.
str(delay_df)
## Classes 'data.table' and 'data.frame':   150000 obs. of  40 variables:
##  $ V1                  : int  7 8 11 12 13 14 16 18 20 25 ...
##  $ FlightDate          : Factor w/ 92 levels "2018-10-01","2018-10-02",..: 21 22 25 26 28 29 31 2 4 10 ...
##  $ Reporting_Airline   : Factor w/ 17 levels "9E","AA","AS",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Origin              : Factor w/ 350 levels "ABE","ABI","ABQ",..: 183 183 183 183 183 183 183 215 215 215 ...
##  $ Dest                : Factor w/ 350 levels "ABE","ABI","ABQ",..: 71 71 71 71 71 71 71 71 71 71 ...
##  $ CRSDepTime          : int  1123 1123 1123 1123 1123 1123 1123 846 846 846 ...
##  $ DepTime             : int  1124 1117 1358 1125 1248 1211 1133 841 846 844 ...
##  $ DepDelay            : int  1 -6 155 2 85 48 10 -5 0 -2 ...
##  $ DepDelayMinutes     : int  1 0 155 2 85 48 10 0 0 0 ...
##  $ DepDel15            : int  0 0 1 0 1 1 0 0 0 0 ...
##  $ DepartureDelayGroups: int  0 -1 10 0 5 3 0 -1 0 -1 ...
##  $ DepTimeBlk          : Factor w/ 19 levels "0001-0559","0600-0659",..: 7 7 7 7 7 7 7 4 4 4 ...
##  $ CRSArrTime          : int  1910 1910 1910 1910 1910 1910 1910 1053 1050 1050 ...
##  $ ArrTime             : int  1919 1927 2133 1922 2009 1940 1913 1054 1059 1118 ...
##  $ ArrDelay            : int  9 17 143 12 59 30 3 1 9 28 ...
##  $ ArrDelayMinutes     : int  9 17 143 12 59 30 3 1 9 28 ...
##  $ ArrDel15            : int  0 1 1 0 1 1 0 0 0 1 ...
##  $ ArrivalDelayGroups  : int  0 1 9 0 3 2 0 0 0 1 ...
##  $ ArrTimeBlk          : Factor w/ 19 levels "0001-0559","0600-0659",..: 15 15 15 15 15 15 15 6 6 6 ...
##  $ CRSElapsedTime      : int  287 287 287 287 287 287 287 127 124 124 ...
##  $ ActualElapsedTime   : int  295 310 275 297 261 269 280 133 133 154 ...
##  $ AirTime             : int  250 244 246 257 234 241 256 96 97 100 ...
##  $ Distance            : int  2125 2125 2125 2125 2125 2125 2125 650 650 650 ...
##  $ WEEKEND             : int  1 0 0 0 1 0 0 0 0 0 ...
##  $ PMDEP               : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PMARR               : int  1 1 1 1 1 1 1 0 0 0 ...
##  $ Southwest           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ American            : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Delta               : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ United              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Alaska              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Jetblue             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Skywest             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Others              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ DepStatus           : Factor w/ 2 levels "DelayedonDeparture",..: 1 2 1 1 1 1 1 2 2 2 ...
##  $ ArrStatus           : Factor w/ 2 levels "DelayedonArrival",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ airline             : Factor w/ 11 levels "Alaska","American",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ timewindowdep       : Factor w/ 2 levels "AM","PM": 1 1 1 1 1 1 1 1 1 1 ...
##  $ timewindowarr       : Factor w/ 2 levels "AM","PM": 2 2 2 2 2 2 2 1 1 1 ...
##  $ daywindow           : Factor w/ 2 levels "Weekday","Weekend": 2 1 1 1 2 1 1 1 1 1 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# Per-column summary: quartiles/mean for numeric columns, level counts for
# factor columns.
summary(delay_df)
##        V1              FlightDate     Reporting_Airline     Origin      
##  Min.   :     7   2018-10-15:  9236   WN     :20576     ATL    :  7971  
##  1st Qu.:115988   2018-10-12:  6547   AA     :20539     ORD    :  7884  
##  Median :227201   2018-10-18:  6223   DL     :20071     DFW    :  7412  
##  Mean   :226170   2018-10-05:  6192   OO     :15721     CLT    :  5120  
##  3rd Qu.:340176   2018-10-19:  6103   UA     :10987     BOS    :  4907  
##  Max.   :449631   2018-10-16:  6046   B6     : 9063     DEN    :  4562  
##                   (Other)   :109653   (Other):53043     (Other):112144  
##       Dest          CRSDepTime      DepTime        DepDelay      
##  ATL    :  7979   Min.   :   1   Min.   :   1   Min.   : -28.00  
##  ORD    :  6881   1st Qu.:1000   1st Qu.:1017   1st Qu.:  -2.00  
##  DFW    :  6694   Median :1405   Median :1429   Median :   9.00  
##  LAX    :  4996   Mean   :1378   Mean   :1407   Mean   :  27.99  
##  DEN    :  4929   3rd Qu.:1754   3rd Qu.:1816   3rd Qu.:  34.00  
##  IAH    :  4895   Max.   :2359   Max.   :2400   Max.   :2109.00  
##  (Other):113626                                                  
##  DepDelayMinutes      DepDel15      DepartureDelayGroups     DepTimeBlk   
##  Min.   :   0.00   Min.   :0.0000   Min.   :-2.000       1700-1759:10795  
##  1st Qu.:   0.00   1st Qu.:0.0000   1st Qu.:-1.000       1800-1859:10199  
##  Median :   9.00   Median :0.0000   Median : 0.000       1200-1259:10178  
##  Mean   :  29.46   Mean   :0.4279   Mean   : 1.152       1600-1659: 9831  
##  3rd Qu.:  34.00   3rd Qu.:1.0000   3rd Qu.: 2.000       1500-1559: 9713  
##  Max.   :2109.00   Max.   :1.0000   Max.   :12.000       1400-1459: 9640  
##                                                          (Other)  :89644  
##    CRSArrTime      ArrTime        ArrDelay       ArrDelayMinutes  
##  Min.   :   1   Min.   :   1   Min.   :   1.00   Min.   :   1.00  
##  1st Qu.:1150   1st Qu.:1144   1st Qu.:   6.00   1st Qu.:   6.00  
##  Median :1608   Median :1610   Median :  14.00   Median :  14.00  
##  Mean   :1547   Mean   :1529   Mean   :  32.81   Mean   :  32.81  
##  3rd Qu.:1944   3rd Qu.:1953   3rd Qu.:  35.00   3rd Qu.:  35.00  
##  Max.   :2400   Max.   :2400   Max.   :2153.00   Max.   :2153.00  
##                                                                   
##     ArrDel15     ArrivalDelayGroups     ArrTimeBlk    CRSElapsedTime 
##  Min.   :0.000   Min.   : 0.000     1900-1959:10704   Min.   :-99.0  
##  1st Qu.:0.000   1st Qu.: 0.000     1800-1859:10561   1st Qu.: 89.0  
##  Median :0.000   Median : 0.000     2100-2159:10306   Median :125.0  
##  Mean   :0.493   Mean   : 1.562     1700-1759:10088   Mean   :143.2  
##  3rd Qu.:1.000   3rd Qu.: 2.000     1600-1659: 9814   3rd Qu.:172.0  
##  Max.   :1.000   Max.   :12.000     2000-2059: 9553   Max.   :655.0  
##                                     (Other)  :88974                  
##  ActualElapsedTime    AirTime         Distance         WEEKEND      
##  Min.   : 17       Min.   :  8.0   Min.   :  31.0   Min.   :0.0000  
##  1st Qu.: 93       1st Qu.: 63.0   1st Qu.: 383.0   1st Qu.:0.0000  
##  Median :129       Median : 99.0   Median : 667.0   Median :0.0000  
##  Mean   :148       Mean   :117.9   Mean   : 826.8   Mean   :0.2293  
##  3rd Qu.:180       3rd Qu.:149.0   3rd Qu.:1061.0   3rd Qu.:0.0000  
##  Max.   :684       Max.   :653.0   Max.   :4983.0   Max.   :1.0000  
##                                                                     
##      PMDEP            PMARR          Southwest         American     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :1.0000   Median :1.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.6361   Mean   :0.7395   Mean   :0.1372   Mean   :0.1369  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##                                                                     
##      Delta            United            Alaska           Jetblue       
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.0000   Median :0.00000   Median :0.00000   Median :0.00000  
##  Mean   :0.1338   Mean   :0.07325   Mean   :0.05351   Mean   :0.06042  
##  3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.00000   Max.   :1.00000  
##                                                                        
##     Skywest           Others                    DepStatus    
##  Min.   :0.0000   Min.   :0.0000   DelayedonDeparture:96727  
##  1st Qu.:0.0000   1st Qu.:0.0000   OntimeDep         :53273  
##  Median :0.0000   Median :0.0000                             
##  Mean   :0.1048   Mean   :0.3001                             
##  3rd Qu.:0.0000   3rd Qu.:1.0000                             
##  Max.   :1.0000   Max.   :1.0000                             
##                                                              
##             ArrStatus           airline      timewindowdep timewindowarr
##  DelayedonArrival:150000   Others   :35995   AM:54590      AM: 39082    
##  OntimeArr       :     0   Southwest:20576   PM:95410      PM:110918    
##                            American :20539                              
##                            Delta    :20071                              
##                            Skywest  :15721                              
##                            United   :10987                              
##                            (Other)  :26111                              
##    daywindow     
##  Weekday:115602  
##  Weekend: 34398  
##                  
##                  
##                  
##                  
## 

Descriptive Statistics

# Descriptive statistics, one row per column of delay_df.
# psych:: is spelled out because Hmisc, attached later in this script, also
# exports describe() and would mask psych's version if this line is re-run in
# the same session. The statistics are selected by name instead of by
# position so the selection does not depend on the column order of the result.
describe_cols <- c("vars", "n", "mean", "sd", "median", "min", "max")
psych::describe(delay_df)[, describe_cols]
##                      vars      n      mean        sd median min    max
## V1                      1 150000 226169.94 130786.75 227201   7 449631
## FlightDate*             2 150000     15.36      8.55     15   1     31
## Reporting_Airline*      3 150000      9.17      5.39      9   1     17
## Origin*                 4 150000    167.91     93.63    173   1    350
## Dest*                   5 150000    171.07     93.92    181   1    350
## CRSDepTime              6 150000   1378.39    471.41   1405   1   2359
## DepTime                 7 150000   1406.70    490.03   1429   1   2400
## DepDelay                8 150000     27.99     64.56      9 -28   2109
## DepDelayMinutes         9 150000     29.46     63.83      9   0   2109
## DepDel15               10 150000      0.43      0.49      0   0      1
## DepartureDelayGroups   11 150000      1.15      2.84      0  -2     12
## DepTimeBlk*            12 150000      9.53      4.67     10   1     19
## CRSArrTime             13 150000   1546.62    501.90   1608   1   2400
## ArrTime                14 150000   1529.05    550.74   1610   1   2400
## ArrDelay               15 150000     32.81     62.78     14   1   2153
## ArrDelayMinutes        16 150000     32.81     62.78     14   1   2153
## ArrDel15               17 150000      0.49      0.50      0   0      1
## ArrivalDelayGroups     18 150000      1.56      2.63      0   0     12
## ArrTimeBlk*            19 150000     11.24      4.84     12   1     19
## CRSElapsedTime         20 150000    143.16     74.03    125 -99    655
## ActualElapsedTime      21 150000    147.99     76.03    129  17    684
## AirTime                22 150000    117.91     74.18     99   8    653
## Distance               23 150000    826.75    606.55    667  31   4983
## WEEKEND                24 150000      0.23      0.42      0   0      1
## PMDEP                  25 150000      0.64      0.48      1   0      1
## PMARR                  26 150000      0.74      0.44      1   0      1
## Southwest              27 150000      0.14      0.34      0   0      1
## American               28 150000      0.14      0.34      0   0      1
## Delta                  29 150000      0.13      0.34      0   0      1
## United                 30 150000      0.07      0.26      0   0      1
## Alaska                 31 150000      0.05      0.23      0   0      1
## Jetblue                32 150000      0.06      0.24      0   0      1
## Skywest                33 150000      0.10      0.31      0   0      1
## Others                 34 150000      0.30      0.46      0   0      1
## DepStatus*             35 150000      1.36      0.48      1   1      2
## ArrStatus*             36 150000      1.00      0.00      1   1      1
## airline*               37 150000      5.99      2.97      7   1     11
## timewindowdep*         38 150000      1.64      0.48      2   1      2
## timewindowarr*         39 150000      1.74      0.44      2   1      2
## daywindow*             40 150000      1.23      0.42      1   1      2

Percentage of Delayed flights at Departure

# Counts of flights per value of the 0/1 departure-delay flag (DepDel15),
# then the same counts expressed as percentages with two decimals.
table.dep.delay <- table(delay_df$DepDel15)
round(100 * prop.table(table.dep.delay), 2)
## 
##     0     1 
## 57.21 42.79

Bar Chart of Flight Delay at Departure

# Departure-delay shares in percent, rounded to two decimals.
tab1 <- round(100 * prop.table(table.dep.delay), 2)

# Bar chart of the shares. barplot() returns the bar midpoints, which are
# kept in `bp` so the value labels can be centred on each bar.
bp <- barplot(
  tab1,
  main = "Percentage of delay",
  xlab = "Departure Delay (No/Yes)",
  ylab = "Percent (%)",
  col = c("lightblue", "red"),
  legend.text = rownames(tab1),
  beside = TRUE,
  ylim = c(0, 90)
)
# pos = 3 places each label just above y = 0, i.e. at the foot of its bar.
text(bp, 0, labels = round(tab1, 1), pos = 3, cex = 1)

Percentage of Delayed flights at Arrival

# Counts of flights per value of the 0/1 arrival-delay flag (ArrDel15),
# then the same counts expressed as percentages with two decimals.
table.arr.delay <- table(delay_df$ArrDel15)
round(100 * prop.table(table.arr.delay), 2)
## 
##    0    1 
## 50.7 49.3

Bar Chart of Flight Delay at Arrival

# Arrival-delay shares in percent, rounded to two decimals.
tab2 <- round(prop.table(table.arr.delay) * 100, 2)

# Bar chart of the arrival shares. The bars and the legend are both taken
# from tab2: the previous version plotted tab1 (the departure shares) while
# labelling the bars with the arrival values, so bar heights and labels
# disagreed.
bp <- barplot(
  tab2,
  xlab = "Arrival Delay (No/Yes)", ylab = "Percent (%)",
  main = "Percentage of arrival",
  col = c("lightblue", "red"),
  legend.text = rownames(tab2),
  beside = TRUE,
  ylim = c(0, 90)
)
# `bp` holds the bar midpoints; pos = 3 puts each label just above y = 0.
text(bp, 0, round(tab2, 1), cex = 1, pos = 3)

Percentage of Flight Delay at Departure on weekend vs weekdays

# Row-wise percentages of the departure-delay flag within each WEEKEND value
# (rows: WEEKEND 0/1, columns: DepDel15 0/1).
t1 <- table(delay_df$WEEKEND, delay_df$DepDel15)
t1 <- prop.table(t1, 1)
t1 <- round(t1 * 100, 2)

# bar-plot of the "delayed" column (DepDel15 == 1) for weekday vs weekend.
# The legend labels come from t1's own row names (the WEEKEND values) rather
# than from the unrelated table tab1, so this chunk no longer depends on an
# earlier section having been run.
bp <- barplot(t1[, 2],
              xlab = "Is weekend (No/Yes)", ylab = "Percent dept delay(%)",
              main = "Percentage of dept delay split by is weekend",
              col = c("lightblue", "red"),
              legend.text = rownames(t1),
              beside = TRUE,
              ylim = c(0, 90))
# `bp` holds the bar midpoints; pos = 3 puts each label just above y = 0.
text(bp, 0, round(t1[, 2], 1), cex = 1, pos = 3)

Percentage of Flight Delay at Arrival on weekend vs weekdays

# Row-wise percentages of the arrival-delay flag within each WEEKEND value
# (rows: WEEKEND 0/1, columns: ArrDel15 0/1).
t2 <- table(delay_df$WEEKEND, delay_df$ArrDel15)
t2 <- prop.table(t2, 1)
t2 <- round(t2 * 100, 2)
t2
##    
##         0     1
##   0 50.80 49.20
##   1 50.35 49.65
# bar-plot of the "delayed" column (ArrDel15 == 1) for weekday vs weekend.
# The legend labels come from t2's own row names (the WEEKEND values) rather
# than from the unrelated table tab1.
bp <- barplot(t2[, 2],
              xlab = "Is weekend (No/Yes)", ylab = "Percent arrival delay(%)",
              main = "Percentage of arrival delay split by is weekend",
              col = c("lightblue", "red"),
              legend.text = rownames(t2),
              beside = TRUE,
              ylim = c(0, 90))
# `bp` holds the bar midpoints; pos = 3 puts each label just above y = 0.
text(bp, 0, round(t2[, 2], 1), cex = 1, pos = 3)

Table of Delay at Departure and Arrival respectively

# Cross-tabulation of the departure flag (rows) against the arrival flag
# (columns), as row percentages rounded to two decimals.
dep_by_arr <- table(delay_df$DepDel15, delay_df$ArrDel15)
t3 <- round(100 * prop.table(dep_by_arr, margin = 1), 2)
t3
##    
##         0     1
##   0 76.37 23.63
##   1 16.38 83.62

Bar Chart of % of Departure Delays by Airlines

# Row-wise percentages of the departure-delay flag per airline
# (rows: airline, columns: DepDel15 0/1).
t4 <- table(delay_df$airline, delay_df$DepDel15)
t4 <- prop.table(t4, 1)
tab5 <- round(t4 * 100, 2)
tab5
##            
##                 0     1
##   Alaska    62.91 37.09
##   American  56.52 43.48
##   Delta     63.91 36.09
##   Frontier  37.28 62.72
##   Hawaiian  76.25 23.75
##   Jetblue   50.86 49.14
##   Others    56.23 43.77
##   Skywest   60.64 39.36
##   Southwest 52.35 47.65
##   Spirit    57.59 42.41
##   United    60.21 39.79
# One bar per airline for the "delayed" column (DepDel15 == 1).
# The former `args.legend` (legend titled "Joined") and `beside` arguments
# were dropped: no legend text is supplied and the heights are a plain
# vector, so neither had any effect on the plot.
# las = 2 turns the axis labels perpendicular to the axis so the airline
# names do not overlap.
bp <- barplot(tab5[, 2],
              main = "Bar Chart for % of departure delays by Airline",
              col = c("lightblue"),
              xlab = "Airline",
              ylab = "Percent (%)",
              ylim = c(0, 100),
              las = 2)
# `bp` holds the bar midpoints; pos = 3 puts each label just above y = 0.
text(bp, 0, round(tab5[, 2], 1), cex = 1, pos = 3)

Bar Chart of % of Arrival Delays by Airlines

# Row-wise percentages of the arrival-delay flag per airline
# (rows: airline, columns: ArrDel15 0/1), rounded to two decimals.
tab6 <- round(
  prop.table(table(delay_df$airline, delay_df$ArrDel15), 1) * 100,
  2
)
# One bar per airline for the "delayed" column (ArrDel15 == 1).
# The former `args.legend` (legend titled "Joined") and `beside` arguments
# were dropped: no legend text is supplied and the heights are a plain
# vector, so neither had any effect on the plot.
bp <- barplot(tab6[, 2],
              main = "Bar Chart for % of arrival delays by Airline",
              col = c("pink"),
              xlab = "Airline",
              ylab = "Percent (%)",
              ylim = c(0, 100),
              las = 2)
# `bp` holds the bar midpoints; pos = 3 puts each label just above y = 0.
text(bp, 0, round(tab6[, 2], 1), cex = 1, pos = 3)

Bar Chart for % of departure delays by PM dept

# Row-wise percentages of the departure-delay flag within each PMDEP value
# (rows: PMDEP 0/1, columns: DepDel15 0/1).
t7 <- table(delay_df$PMDEP, delay_df$DepDel15)
t7 <- prop.table(t7, 1)
tab7 <- round(t7 * 100, 2)
tab7
##    
##         0     1
##   0 67.97 32.03
##   1 51.06 48.94
# One bar per PMDEP value for the "delayed" column (DepDel15 == 1).
# The former `args.legend` (legend titled "Joined") and `beside` arguments
# were dropped: no legend text is supplied and the heights are a plain
# vector, so neither had any effect on the plot.
bp <- barplot(tab7[, 2],
              main = "Bar Chart for % of departure delays by PM dept",
              col = c("lightgreen"),
              xlab = "Is Dept in PM",
              ylab = "Percent (%)",
              ylim = c(0, 100))
# `bp` holds the bar midpoints; pos = 3 puts each label just above y = 0.
text(bp, 0, round(tab7[, 2], 1), cex = 1, pos = 3)

Bar Chart for % of arrival delays by PM arrival

# Row-wise percentages of the arrival-delay flag within each PMARR value
# (rows: PMARR 0/1, columns: ArrDel15 0/1).
t8 <- table(delay_df$PMARR, delay_df$ArrDel15)
t8 <- prop.table(t8, 1)
# Built from t8. The previous version rounded t7 (the PM-departure table)
# here, so the "arrival" table and chart silently repeated the departure
# figures.
tab8 <- round(t8 * 100, 2)
tab8
# NOTE(review): the captured output below predates the fix above and is a
# copy of the PM-departure table; re-run this chunk to refresh it.
##    
##         0     1
##   0 67.97 32.03
##   1 51.06 48.94
# One bar per PMARR value for the "delayed" column (ArrDel15 == 1).
# The former `args.legend` (legend titled "Joined") and `beside` arguments
# were dropped: no legend text is supplied and the heights are a plain
# vector, so neither had any effect on the plot.
bp <- barplot(tab8[, 2],
              main = "Bar Chart for % of arrival delays by PM arrival",
              col = c("yellow"),
              xlab = "Is Arr in PM",
              ylab = "Percent (%)",
              ylim = c(0, 100))
# `bp` holds the bar midpoints; pos = 3 puts each label just above y = 0.
text(bp, 0, round(tab8[, 2], 1), cex = 1, pos = 3)

Mean of Delay (in mins) at Departure grouped by Airlines

# Average of DepDelayMinutes for each airline, one row per airline.
summarise(
  group_by(delay_df, airline),
  mean_Dep_delay = mean(DepDelayMinutes)
)
## # A tibble: 11 x 2
##    airline   mean_Dep_delay
##    <fct>              <dbl>
##  1 Alaska              21.3
##  2 American            31.9
##  3 Delta               22.6
##  4 Frontier            44.8
##  5 Hawaiian            15.4
##  6 Jetblue             32.8
##  7 Others              33.9
##  8 Skywest             33.3
##  9 Southwest           23.6
## 10 Spirit              30.2
## 11 United              26.9

Mean of Delay (in mins) at Arrival grouped by Airlines

# Average of ArrDelayMinutes for each airline, one row per airline.
summarise(
  group_by(delay_df, airline),
  mean_Arr_delay = mean(ArrDelayMinutes)
)
## # A tibble: 11 x 2
##    airline   mean_Arr_delay
##    <fct>              <dbl>
##  1 Alaska              27.3
##  2 American            36.0
##  3 Delta               25.0
##  4 Frontier            45.2
##  5 Hawaiian            18.4
##  6 Jetblue             36.4
##  7 Others              38.3
##  8 Skywest             37.7
##  9 Southwest           23.0
## 10 Spirit              33.3
## 11 United              32.3

Box Plots and Mean Plots of Average distance Split by Delay at Departure and Arrival Respectively

# Mean Distance for each value of the departure-delay flag; the result is
# printed with readable column headers (the stored object keeps its default
# names).
dist_deptDelay <- aggregate(
  delay_df$Distance,
  by = list(delay_df$DepDel15),
  FUN = mean
)
setNames(dist_deptDelay, c("Is Delayed", "Av. distance"))
##   Is Delayed Av. distance
## 1          0     837.4344
## 2          1     812.4697
library(gplots)
## Warning: package 'gplots' was built under R version 3.5.2
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
# Mean plots and box plots of Distance, split first by the departure-delay
# flag and then by the arrival-delay flag.
# The formulas use bare column names resolved through `data = delay_df`
# instead of repeating `delay_df$` on both sides; labels derived from the
# formula then read as the plain column names.
plotmeans(Distance ~ DepDel15, data = delay_df,
          mean.labels = TRUE, col = "Red",
          main = "Mean Plot for average distance, Split by Departure delay ")

boxplot(Distance ~ DepDel15, data = delay_df,
        main = "Boxplot for distance Split by Departure Delay",
        col = c("red", "light blue"))

plotmeans(Distance ~ ArrDel15, data = delay_df,
          mean.labels = TRUE, col = "Red",
          main = "Mean Plot for average distance, Split by Arrival delay ")

boxplot(Distance ~ ArrDel15, data = delay_df,
        main = "Boxplot for distance Split by Arrival Delayed",
        col = c("red", "light blue"))

Box Plots and Mean Plots of Average airtime Split by Delay at Departure and Arrival Respectively

# Mean AirTime for each value of the departure-delay flag; the result is
# printed with readable column headers (the stored object keeps its default
# names).
airtime_deptDelay <- aggregate(
  delay_df$AirTime,
  by = list(delay_df$DepDel15),
  FUN = mean
)
setNames(airtime_deptDelay, c("Is Delayed", "Av. airtime (min)"))
##   Is Delayed Av. airtime (min)
## 1          0          122.0722
## 2          1          112.3394
# Mean plots and box plots of AirTime, split first by the departure-delay
# flag and then by the arrival-delay flag.
# The formulas use bare column names resolved through `data = delay_df`
# instead of repeating `delay_df$` on both sides; labels derived from the
# formula then read as the plain column names.
plotmeans(AirTime ~ DepDel15, data = delay_df,
          mean.labels = TRUE, col = "Red",
          main = "Mean Plot for average airtime, Split by Departure delay ")

boxplot(AirTime ~ DepDel15, data = delay_df,
        main = "Boxplot for airtime Split by Departure delay",
        col = c("red", "light blue"))

plotmeans(AirTime ~ ArrDel15, data = delay_df,
          mean.labels = TRUE, col = "Red",
          main = "Mean Plot for average airtime, Split by Arrival delay ")

boxplot(AirTime ~ ArrDel15, data = delay_df,
        main = "Boxplot for airtime Split by arrival delay",
        col = c("red", "light blue"))

Correlation Matrix for all the continuous variables

library(Hmisc)
## Warning: package 'Hmisc' was built under R version 3.5.2
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.2
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following object is masked from 'package:psych':
## 
##     describe
## The following objects are masked from 'package:base':
## 
##     format.pval, units
# Pairwise correlations between the numeric delay, duration, distance and
# 0/1 indicator columns.
# The columns are listed by name (the same fifteen columns that positions
# 8:11, 15:18 and 20:26 selected) so the result does not silently change if
# the column order of delay_df does.
cor_vars <- c(
  "DepDelay", "DepDelayMinutes", "DepDel15", "DepartureDelayGroups",
  "ArrDelay", "ArrDelayMinutes", "ArrDel15", "ArrivalDelayGroups",
  "CRSElapsedTime", "ActualElapsedTime", "AirTime", "Distance",
  "WEEKEND", "PMDEP", "PMARR"
)
# delay_df is a data.table; `with = FALSE` makes `cor_vars` be read as a
# vector of column names rather than as an expression to evaluate.
cormatrix <- cor(delay_df[, cor_vars, with = FALSE])
cormatrix
##                          DepDelay DepDelayMinutes     DepDel15
## DepDelay              1.000000000     0.999145232  0.498637497
## DepDelayMinutes       0.999145232     1.000000000  0.484430181
## DepDel15              0.498637497     0.484430181  1.000000000
## DepartureDelayGroups  0.834132288     0.826162116  0.691874332
## ArrDelay              0.971283267     0.974144521  0.408441370
## ArrDelayMinutes       0.971283267     0.974144521  0.408441370
## ArrDel15              0.394078628     0.386050026  0.593690464
## ArrivalDelayGroups    0.800479548     0.798676773  0.543869864
## CRSElapsedTime       -0.016983434    -0.016088038 -0.020575366
## ActualElapsedTime    -0.063492528    -0.059531305 -0.106137424
## AirTime              -0.042818798    -0.040864837 -0.064918918
## Distance             -0.019136106    -0.018767724 -0.020364160
## WEEKEND               0.009658715     0.009635365  0.005054569
## PMDEP                 0.068367134     0.062745627  0.164463444
## PMARR                 0.061630591     0.055796584  0.150615342
##                      DepartureDelayGroups     ArrDelay ArrDelayMinutes
## DepDelay                      0.834132288  0.971283267     0.971283267
## DepDelayMinutes               0.826162116  0.974144521     0.974144521
## DepDel15                      0.691874332  0.408441370     0.408441370
## DepartureDelayGroups          1.000000000  0.776933514     0.776933514
## ArrDelay                      0.776933514  1.000000000     1.000000000
## ArrDelayMinutes               0.776933514  1.000000000     1.000000000
## ArrDel15                      0.528994065  0.429015037     0.429015037
## ArrivalDelayGroups            0.924335190  0.827853634     0.827853634
## CRSElapsedTime               -0.016000497 -0.010534275    -0.010534275
## ActualElapsedTime            -0.082238284 -0.009117905    -0.009117905
## AirTime                      -0.051596505 -0.019277170    -0.019277170
## Distance                     -0.017806462 -0.015473569    -0.015473569
## WEEKEND                       0.008850742  0.010543277     0.010543277
## PMDEP                         0.108670171  0.041549168     0.041549168
## PMARR                         0.100819019  0.037355466     0.037355466
##                         ArrDel15 ArrivalDelayGroups CRSElapsedTime
## DepDelay             0.394078628        0.800479548   -0.016983434
## DepDelayMinutes      0.386050026        0.798676773   -0.016088038
## DepDel15             0.593690464        0.543869864   -0.020575366
## DepartureDelayGroups 0.528994065        0.924335190   -0.016000497
## ArrDelay             0.429015037        0.827853634   -0.010534275
## ArrDelayMinutes      0.429015037        0.827853634   -0.010534275
## ArrDel15             1.000000000        0.603536524    0.024062663
## ArrivalDelayGroups   0.603536524        1.000000000   -0.007515994
## CRSElapsedTime       0.024062663       -0.007515994    1.000000000
## ActualElapsedTime    0.043094394       -0.003349743    0.979377903
## AirTime              0.018436956       -0.018606153    0.988311246
## Distance             0.015607520       -0.013820487    0.986639258
## WEEKEND              0.003799525        0.011259646    0.020933702
## PMDEP                0.093064263        0.065319341   -0.057943194
## PMARR                0.084134268        0.061051118   -0.002190770
##                      ActualElapsedTime     AirTime     Distance
## DepDelay                  -0.063492528 -0.04281880 -0.019136106
## DepDelayMinutes           -0.059531305 -0.04086484 -0.018767724
## DepDel15                  -0.106137424 -0.06491892 -0.020364160
## DepartureDelayGroups      -0.082238284 -0.05159651 -0.017806462
## ArrDelay                  -0.009117905 -0.01927717 -0.015473569
## ArrDelayMinutes           -0.009117905 -0.01927717 -0.015473569
## ArrDel15                   0.043094394  0.01843696  0.015607520
## ArrivalDelayGroups        -0.003349743 -0.01860615 -0.013820487
## CRSElapsedTime             0.979377903  0.98831125  0.986639258
## ActualElapsedTime          1.000000000  0.98275624  0.964160018
## AirTime                    0.982756243  1.00000000  0.982260314
## Distance                   0.964160018  0.98226031  1.000000000
## WEEKEND                    0.020892406  0.02271114  0.024233121
## PMDEP                     -0.080159605 -0.06494807 -0.053403405
## PMARR                     -0.023620865 -0.01248622  0.002529857
##                           WEEKEND        PMDEP        PMARR
## DepDelay              0.009658715  0.068367134  0.061630591
## DepDelayMinutes       0.009635365  0.062745627  0.055796584
## DepDel15              0.005054569  0.164463444  0.150615342
## DepartureDelayGroups  0.008850742  0.108670171  0.100819019
## ArrDelay              0.010543277  0.041549168  0.037355466
## ArrDelayMinutes       0.010543277  0.041549168  0.037355466
## ArrDel15              0.003799525  0.093064263  0.084134268
## ArrivalDelayGroups    0.011259646  0.065319341  0.061051118
## CRSElapsedTime        0.020933702 -0.057943194 -0.002190770
## ActualElapsedTime     0.020892406 -0.080159605 -0.023620865
## AirTime               0.022711138 -0.064948066 -0.012486220
## Distance              0.024233121 -0.053403405  0.002529857
## WEEKEND               1.000000000 -0.005221578  0.002683795
## PMDEP                -0.005221578  1.000000000  0.661975271
## PMARR                 0.002683795  0.661975271  1.000000000
# Plot the correlation matrix computed above (`cormatrix`) with corrplot.
library(corrplot)
## corrplot 0.84 loaded
# Disabled alternative: a pairs plot of the raw columns with reduced margins.
#par(mar=rep(2,4))
#plot(delay_df[,c(1,6:11,13:18,20:34)])
corrplot(cormatrix)

Scatter Plots

  1. Actual Elapsed time vs Airlines by Departure Delay (Delay/No Delay : 1/0)
library(ggplot2)
# DepDel15 is stored as an integer 0/1 flag; wrapping it in factor() makes
# ggplot2 use two discrete colours (no delay / delay) instead of a
# continuous colour gradient. labs() keeps the legend title readable.
ggplot(delay_df, aes(airline, ActualElapsedTime, color = factor(DepDel15))) +
  geom_point() +
  labs(color = "DepDel15")

  1. Actual Elapsed time vs Airlines by Arrival Delay (Delay/No Delay : 1/0)
library(ggplot2)
# ArrDel15 is stored as an integer 0/1 flag; wrapping it in factor() makes
# ggplot2 use two discrete colours (no delay / delay) instead of a
# continuous colour gradient. labs() keeps the legend title readable.
ggplot(delay_df, aes(airline, ActualElapsedTime, color = factor(ArrDel15))) +
  geom_point() +
  labs(color = "ArrDel15")

  1. Flying Distance vs Airlines by Departure Delay (Delay/No Delay : 1/0)
library(ggplot2)
# DepDel15 is stored as an integer 0/1 flag; wrapping it in factor() makes
# ggplot2 use two discrete colours (no delay / delay) instead of a
# continuous colour gradient. labs() keeps the legend title readable.
ggplot(delay_df, aes(airline, Distance, color = factor(DepDel15))) +
  geom_point() +
  labs(color = "DepDel15")

  1. Flying Distance vs Airlines by Arrival Delay (Delay/No Delay : 1/0)
library(ggplot2)
# ArrDel15 is stored as an integer 0/1 flag; wrapping it in factor() makes
# ggplot2 use two discrete colours (no delay / delay) instead of a
# continuous colour gradient. labs() keeps the legend title readable.
ggplot(delay_df, aes(airline, Distance, color = factor(ArrDel15))) +
  geom_point() +
  labs(color = "ArrDel15")