1.Loading Library

library(psych)
library(lattice)
library(corrplot)
library(corrgram)
library(car)
library(arm)

2.Reading Data

# Load the three raw CSV extracts from the local data directory.
# NOTE(review): the absolute path is machine-specific; adjust before running
# on another machine.
# The working directory only needs to be set once for all three reads.
setwd("C:/Users/vaibhav/Desktop/DataSets")
flights.df <- read.csv("flights.csv")
airlines.df <- read.csv("airlines.csv")
airport.df <- read.csv("airports.csv")

3.Cleaning Data

Removing unwanted columns

# Keep only the 15 columns used by the analysis, in this exact order.
# Each column keeps the literal "flights.df$" prefix in its name because the
# rest of the script addresses columns by these names and by position.
flights_new.df <- local({
  kept <- c(
    "YEAR", "MONTH", "DAY", "DAY_OF_WEEK", "AIRLINE", "FLIGHT_NUMBER",
    "ORIGIN_AIRPORT", "DESTINATION_AIRPORT", "DEPARTURE_DELAY", "AIR_TIME",
    "DISTANCE", "ARRIVAL_DELAY", "DIVERTED", "CANCELLED",
    "CANCELLATION_REASON"
  )
  setNames(flights.df[kept], paste0("flights.df$", kept))
})
# Recode cancellation reason letters A-D as factor labels 1-4; any other
# value becomes NA.
flights_new.df[["flights.df$CANCELLATION_REASON"]] <- factor(
  flights_new.df[["flights.df$CANCELLATION_REASON"]],
  levels = c("A", "B", "C", "D"),
  labels = c(1, 2, 3, 4)
)

4Finding Reasons for Cancellation

Finding % of cancelled flights

# Rows flagged as cancelled (CANCELLED == 1); rows with a missing flag are
# dropped by subset().
cancelled_flights.df <- subset(
  flights_new.df,
  flights_new.df[["flights.df$CANCELLED"]] == 1
)
# Share of all flights that were cancelled, in percent.
nrow(cancelled_flights.df) / nrow(flights_new.df) * 100
[1] 1.544643

Percentage of flights cancelled: 1.54%

### Finding which reasons lead to the most cancellations

# Count cancelled flights per cancellation-reason code and show the shares
# in percent.
mytable1 <- table(cancelled_flights.df[["flights.df$CANCELLATION_REASON"]])
prop.table(mytable1) * 100

          1           2           3           4 
28.10511326 54.34893863 17.52147212  0.02447599 
# Bar chart of the cancellation-reason codes (1-4) among cancelled flights.
barchart(cancelled_flights.df$`flights.df$CANCELLATION_REASON`,data=cancelled_flights.df,col="black")

28% of cancellations were due to airline or carrier reasons, 54% were due to weather, 17.5% were due to National Air System reasons, and 0.02% were due to security reasons. Weather is therefore the leading cause, accounting for 54% of cancellations.

Analyzing which month got maximum cancellations

# Count cancelled flights per calendar month and show the shares in percent.
mytable2 <- table(cancelled_flights.df[["flights.df$MONTH"]])
prop.table(mytable2) * 100

        1         2         3         4         5         6         7 
13.330515 22.826087 12.240221  5.028704  6.334832 10.146411  5.346892 
        8         9        10        11        12 
 5.620578  2.308531  2.730186  5.116595  8.970451 
# Histogram of cancelled flights by calendar month.
# hist() draws the data values (months) on the x-axis and the frequency on
# the y-axis, so the x label is "Month" and the y label is "Count".
hist(cancelled_flights.df$`flights.df$MONTH`,
     xlab = "Month",
     ylab = "Count",
     xlim = c(0, 12),
     main = "Cancellations per month",
     col = "black")

The top 5 months with the most cancellations are: 1) February, 2) January, 3) March, 4) June, 5) December. Most cancellations clearly fall in the winter season, due to heavy snowfall. This can be seen in the following table of cancellation reasons by month:

# Cross-tabulate cancellation reason (rows) against month (columns).
mytable3 <- table(
  cancelled_flights.df[["flights.df$CANCELLATION_REASON"]],
  cancelled_flights.df[["flights.df$MONTH"]]
)
# margin = 2: each month's column sums to 100%.
prop.table(mytable3, 2) * 100
   
               1            2            3            4            5
  1 23.985978968 13.720329483 22.668605708 39.734513274 35.247629083
  2 58.587881823 75.288784910 62.388656608 39.579646018 48.823322796
  3 17.417793357 10.986011600 14.897291402 20.685840708 15.911485774
  4  0.008345852  0.004874007  0.045446282  0.000000000  0.017562346
   
               6            7            8            9           10
  1 39.682017544 53.849354973 46.793349169 52.433734940 38.834555827
  2 36.458333333 18.352059925 25.930324624 24.337349398 39.812550937
  3 23.859649123 27.777777778 27.236737926 23.228915663 21.352893236
  4  0.000000000  0.020807324  0.039588282  0.000000000  0.000000000
   
              11           12
  1 23.592085236 19.583281657
  2 50.858882366 69.614287486
  3 25.309849967 10.802430857
  4  0.239182431  0.000000000
# Plot month (numeric, y) against cancellation-reason code (factor, x) for
# the cancelled flights.
plot(cancelled_flights.df$`flights.df$CANCELLATION_REASON`,cancelled_flights.df$`flights.df$MONTH`)

Analyzing Which Airlines got maximum cancellations

# Count cancelled flights per airline code and show the shares in percent.
mytable4 <- table(cancelled_flights.df[["flights.df$AIRLINE"]])
prop.table(mytable4) * 100
## 
##         AA         AS         B6         DL         EV         F9 
## 12.1478795  0.7442926  4.7572427  4.2543723 16.9451738  0.6541765 
##         HA         MQ         NK         OO         UA         US 
##  0.1902452 16.7159895  2.2295403 11.0809488  7.3127587  4.5247208 
##         VX         WN 
##  0.5940991 17.8485604
# Bar plot of the number of cancelled flights per airline.
# as.factor() is a no-op when AIRLINE is already a factor, and makes the call
# work when read.csv() returned it as character (the default since R 4.0.0),
# where plot() on a bare character vector fails.
plot(as.factor(cancelled_flights.df$`flights.df$AIRLINE`),
     main = "Maximum cancellations by which airlines",
     col = "black")

Top 5 Airlines with maximum cancellations are 1)WN :Southwest Airlines Co. 2)EV :Atlantic Southeast Airlines 3)MQ :American Eagle Airlines Inc. 4)AA :American Airlines Inc. 5)OO : Skywest Airlines Inc.

Analyzing on which day flights got maximum cancellations

# Count cancelled flights per day of week (1-7) and show the shares in percent.
mytable5 <- table(cancelled_flights.df[["flights.df$DAY_OF_WEEK"]])
prop.table(mytable5) * 100

        1         2         3         4         5         6         7 
23.444662 16.768279 11.936496 13.674291  9.795959  9.733657 14.646656 
# Histogram of cancelled flights by day of week (coded 1-7).
# Explicit axis labels replace the default x label, which would otherwise be
# the raw R expression passed to hist().
hist(cancelled_flights.df$`flights.df$DAY_OF_WEEK`,
     main = "Cancellation of flights based on which day of week",
     xlab = "Day of week",
     ylab = "Count",
     col = "black")

In decreasing order of cancellations 1)Monday 2)Tuesday 3)Sunday 4)Thursday 5)Wednesday 6)Friday 7)Saturday

Correlation between independent variables

# Pairwise correlations among MONTH, DAY, DAY_OF_WEEK, FLIGHT_NUMBER, DISTANCE
# and CANCELLED (columns 2, 3, 4, 6, 11, 14) for the cancelled flights.
# CANCELLED is constant (== 1) in this subset, which is why every
# correlation involving it is reported as NA.
corr.test(cancelled_flights.df[,c(2,3,4,6,11,14)])
## Call:corr.test(x = cancelled_flights.df[, c(2, 3, 4, 6, 11, 14)])
## Correlation matrix 
##                          flights.df$MONTH flights.df$DAY
## flights.df$MONTH                     1.00           0.19
## flights.df$DAY                       0.19           1.00
## flights.df$DAY_OF_WEEK               0.01          -0.06
## flights.df$FLIGHT_NUMBER             0.06          -0.05
## flights.df$DISTANCE                 -0.05           0.04
## flights.df$CANCELLED                   NA             NA
##                          flights.df$DAY_OF_WEEK flights.df$FLIGHT_NUMBER
## flights.df$MONTH                           0.01                     0.06
## flights.df$DAY                            -0.06                    -0.05
## flights.df$DAY_OF_WEEK                     1.00                     0.02
## flights.df$FLIGHT_NUMBER                   0.02                     1.00
## flights.df$DISTANCE                        0.01                    -0.38
## flights.df$CANCELLED                         NA                       NA
##                          flights.df$DISTANCE flights.df$CANCELLED
## flights.df$MONTH                       -0.05                   NA
## flights.df$DAY                          0.04                   NA
## flights.df$DAY_OF_WEEK                  0.01                   NA
## flights.df$FLIGHT_NUMBER               -0.38                   NA
## flights.df$DISTANCE                     1.00                   NA
## flights.df$CANCELLED                      NA                   NA
## Sample Size 
## [1] 89884
## Probability values (Entries above the diagonal are adjusted for multiple tests.) 
##                          flights.df$MONTH flights.df$DAY
## flights.df$MONTH                        0              0
## flights.df$DAY                          0              0
## flights.df$DAY_OF_WEEK                  0              0
## flights.df$FLIGHT_NUMBER                0              0
## flights.df$DISTANCE                     0              0
## flights.df$CANCELLED                   NA             NA
##                          flights.df$DAY_OF_WEEK flights.df$FLIGHT_NUMBER
## flights.df$MONTH                           0.01                        0
## flights.df$DAY                             0.00                        0
## flights.df$DAY_OF_WEEK                     0.00                        0
## flights.df$FLIGHT_NUMBER                   0.00                        0
## flights.df$DISTANCE                        0.01                        0
## flights.df$CANCELLED                         NA                       NA
##                          flights.df$DISTANCE flights.df$CANCELLED
## flights.df$MONTH                        0.00                   NA
## flights.df$DAY                          0.00                   NA
## flights.df$DAY_OF_WEEK                  0.01                   NA
## flights.df$FLIGHT_NUMBER                0.00                   NA
## flights.df$DISTANCE                     0.00                   NA
## flights.df$CANCELLED                      NA                   NA
## 
##  To see confidence intervals of the correlations, print with the short=FALSE option
# Correlation plot for the cancelled flights.
# CANCELLED (column 14) is left out: it is always 1 in this subset, so its
# correlation with every other variable is undefined (NA) and carries no
# information.
corrplot.mixed(
  cor(cancelled_flights.df[, c(2, 3, 4, 6, 11)], use = "complete.obs"),
  upper = "circle",
  tl.pos = "lt"
)

5 DELAY RATIOS

Distributing data in terms of few factors

Creating a new column that records each flight's delay or cancellation status

# Label every flight with one status. The rules run in order and a later
# rule overwrites an earlier one.
flights_new.df[["status"]] <- "ON TIME"

# Left late.
flights_new.df[["status"]][
  flights_new.df[["flights.df$DEPARTURE_DELAY"]] > 0
] <- "DEPARTURE DELAY"

# Cancelled.
flights_new.df[["status"]][
  flights_new.df[["flights.df$CANCELLED"]] == 1
] <- "CANCELLED FLIGHTS"

# Left late, arrived late, and the arrival delay exceeds the departure delay.
flights_new.df[["status"]][
  flights_new.df[["flights.df$ARRIVAL_DELAY"]] > 0 &
    (flights_new.df[["flights.df$ARRIVAL_DELAY"]] -
       flights_new.df[["flights.df$DEPARTURE_DELAY"]]) > 0 &
    flights_new.df[["flights.df$DEPARTURE_DELAY"]] > 0
] <- "DEPR & ARVL_DELAY"

# Left on time or early but arrived late.
flights_new.df[["status"]][
  flights_new.df[["flights.df$ARRIVAL_DELAY"]] > 0 &
    flights_new.df[["flights.df$DEPARTURE_DELAY"]] <= 0
] <- "ARRIVAL_DELAY"

flights_new.df[["status"]] <- factor(flights_new.df[["status"]])

Seeing which delays occurred the most

# Share of flights per status label, in percent.
# Note: this reuses the name mytable4, overwriting the airline table stored
# under that name earlier.
mytable4 <- table(flights_new.df[["status"]])
prop.table(mytable4) * 100
## 
##     ARRIVAL_DELAY CANCELLED FLIGHTS   DEPARTURE DELAY DEPR & ARVL_DELAY 
##          9.945715          1.544643         26.854490          9.630287 
##           ON TIME 
##         52.024865
# Bar plot of the number of flights per status label.
plot(flights_new.df[["status"]])

52% flights were on time 26% flights had departure delays 9% flights had arrival delays 9% flights had arrival and departure delay 1.54% flights were cancelled

6 DEPARTURE DELAY ANALYSIS

Summarizing departure delay

# Five-number summary, mean and NA count of the departure delay.
summary(flights_new.df[["flights.df$DEPARTURE_DELAY"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  -82.00   -5.00   -2.00    9.37    7.00 1988.00   86153

Breaking departure delay as per the time

# Flights that departed late (delay > 0); rows with a missing delay are
# dropped by subset().
flightsDepDelay <- subset(
  flights_new.df,
  flights_new.df[["flights.df$DEPARTURE_DELAY"]] > 0
)
# Bin the delay into right-closed intervals. The top break, 1988, is the
# maximum departure delay reported by summary() above.
flightsDepDelay[["DelayTimeInterval"]] <- cut(
  flightsDepDelay[["flights.df$DEPARTURE_DELAY"]],
  breaks = c(0, 10, 30, 60, 90, 120, 180, 1988),
  labels = c(
    "(0-10]", "(10-30]", "(30-60]", "(60-90]",
    "(90-120]", "(120-180]", "(180-1988]"
  )
)

Tabulating departure delays

# Share of late departures per delay interval, in percent.
# Note: this reuses the name mytable5 from the day-of-week table.
mytable5 <- table(flightsDepDelay[["DelayTimeInterval"]])
prop.table(mytable5) * 100
## 
##     (0-10]    (10-30]    (30-60]    (60-90]   (90-120]  (120-180] 
##  41.680866  28.286926  14.781536   6.385813   3.406303   3.195353 
## (180-1988] 
##   2.263201
# Bar plot of the number of late departures per delay interval.
plot(flightsDepDelay[["DelayTimeInterval"]], col = "Black")

Hence, of all the departure delays: 41% of flights were delayed by 10 minutes or less, 28% by 10 to 30 minutes, 14% by 30 to 60 minutes, 6% by 60 to 90 minutes, 3% by 90 to 120 minutes, 3% by 120 to 180 minutes, and 2% by more than 180 minutes.

# Cross-tabulate delay interval (rows) against month (columns).
mytable6 <- table(
  flightsDepDelay[["DelayTimeInterval"]],
  flightsDepDelay[["flights.df$MONTH"]]
)
# margin = 1: each interval's row sums to 100%.
prop.table(mytable6, 1) * 100
##             
##                      1         2         3         4         5         6
##   (0-10]      7.949200  7.592192  9.158939  8.194240  8.340632  9.140993
##   (10-30]     8.511788  8.455075  9.415206  8.045776  8.352792  9.996807
##   (30-60]     8.906776  8.754006  9.181124  7.413773  8.306201 10.916012
##   (60-90]     8.736684  8.815512  8.587131  7.046663  8.601129 11.665856
##   (90-120]    8.507700  8.763207  8.296388  7.194255  8.803259 12.319591
##   (120-180]   8.106477  8.395047  8.165369  7.003725  9.020774 13.221242
##   (180-1988]  7.296235  7.942711  8.323113  7.302472  9.277236 12.998108
##             
##                      7         8         9        10        11        12
##   (0-10]      9.274292  8.863333  7.085745  8.109023  7.828542  8.462870
##   (10-30]     9.827665  8.775895  5.947225  6.566246  7.130384  8.975139
##   (30-60]    10.516902  9.178896  5.526116  5.459597  6.604731  9.235866
##   (60-90]    11.123635  9.580957  5.420000  5.355170  5.914335  9.152927
##   (90-120]   11.218838  9.597403  5.280022  5.025896  5.855949  9.137491
##   (120-180]  11.108494  9.497799  4.833557  4.745219  5.853860 10.048439
##   (180-1988]  9.456004  8.940487  4.631343  5.180119  7.196458 11.455713

Running Correlation tests

# Correlation plot for late departures over columns 2, 3, 4, 6, 9, 11, 13,
# selected here by name instead of by position.
corrplot.mixed(
  cor(flightsDepDelay[, paste0(
    "flights.df$",
    c("MONTH", "DAY", "DAY_OF_WEEK", "FLIGHT_NUMBER",
      "DEPARTURE_DELAY", "DISTANCE", "DIVERTED")
  )]),
  upper = "ellipse",
  tl.pos = "lt"
)

Distributing data in terms of 3 factors for easier evaluation

# Collapse the departure delay into three bands:
#   0 < delay < 5   -> "On time"
#   5 <= delay < 15 -> "Small Delay"
#   delay >= 15     -> "Long Delay"
flightsDepDelay[["Latency"]][
  flightsDepDelay[["flights.df$DEPARTURE_DELAY"]] > 0 &
    flightsDepDelay[["flights.df$DEPARTURE_DELAY"]] < 5
] <- "On time"
flightsDepDelay[["Latency"]][
  flightsDepDelay[["flights.df$DEPARTURE_DELAY"]] >= 5 &
    flightsDepDelay[["flights.df$DEPARTURE_DELAY"]] < 15
] <- "Small Delay"
flightsDepDelay[["Latency"]][
  flightsDepDelay[["flights.df$DEPARTURE_DELAY"]] >= 15
] <- "Long Delay"
flightsDepDelay[["Latency"]] <- factor(flightsDepDelay[["Latency"]])

# Share of late departures per band, in percent.
mytable7 <- table(flightsDepDelay[["Latency"]])
prop.table(mytable7) * 100
## 
##  Long Delay     On time Small Delay 
##    49.75278    22.53646    27.71076

Seeing these 3 factors with respect to month

# Month (rows) against departure-delay band (columns).
mytable8 <- table(
  flightsDepDelay[["flights.df$MONTH"]],
  flightsDepDelay[["Latency"]]
)
# margin = 2: each band's column sums to 100%.
prop.table(mytable8, 2) * 100
##     
##      Long Delay   On time Small Delay
##   1    8.602681  7.820866    8.180298
##   2    8.631994  7.415263    7.916812
##   3    9.038309  9.009079    9.350197
##   4    7.554130  8.129401    8.230890
##   5    8.492805  8.354852    8.321718
##   6   11.049838  8.905955    9.483468
##   7   10.390391  9.145393    9.494504
##   8    9.126626  8.940191    8.742244
##   9    5.530498  7.456387    6.516532
##   10   5.705902  8.468830    7.502228
##   11   6.598623  8.002689    7.566742
##   12   9.278202  8.351095    8.694368

Of all the long delays Maximum were in the month of: 1)June 2)July 3)March,August,December

Seeing these 3 factors with respect to day of week

# Day of week (rows) against departure-delay band (columns).
mytable9 <- table(
  flightsDepDelay[["flights.df$DAY_OF_WEEK"]],
  flightsDepDelay[["Latency"]]
)
# margin = 2: each band's column sums to 100%.
prop.table(mytable9, 2) * 100
##    
##     Long Delay  On time Small Delay
##   1   15.66454 14.95557    14.66916
##   2   14.17062 13.65004    13.67022
##   3   14.28580 14.25667    14.20466
##   4   15.75229 15.52713    15.85960
##   5   15.28745 15.53026    15.75094
##   6   10.74602 11.81177    11.54314
##   7   14.09328 14.26857    14.30228

All percentages are almost same so no evident conclusion can be drawn

Seeing these 3 factors with respect to Airlines

# Airline (rows) against departure-delay band (columns).
mytable10 <- table(
  flightsDepDelay[["flights.df$AIRLINE"]],
  flightsDepDelay[["Latency"]]
)
# margin = 2: each band's column sums to 100%.
prop.table(mytable10, 2) * 100
##     
##      Long Delay    On time Small Delay
##   AA 11.7004900 12.0503759  10.9399431
##   AS  1.7685149  2.5354929   2.1589916
##   B6  5.4551352  3.6963588   4.5266330
##   DL 11.6650308 15.5926762  14.3294427
##   EV  9.1718248  6.4155528   7.1587793
##   F9  1.9650060  1.1427045   1.4664912
##   HA  0.5281054  1.4923628   1.2583507
##   MQ  5.3229433  3.0730275   3.8558635
##   NK  2.9882162  1.6457950   2.1396375
##   OO  9.1335289  6.5322448   7.4170027
##   UA 11.4028220 13.2888971  12.2745215
##   US  2.7557931  3.3558854   2.9446967
##   VX  1.0428782  1.2038686   1.1176096
##   WN 25.0997112 27.9747578  28.4120368

- Maximum long delays are seen in Southwest Airlines Co. (25%), followed by American Airlines Inc., Delta Air Lines Inc. and United Air Lines Inc.
- A similar trend is seen in small delays and on-time flights too.

Some insights(Comparing average delay time of each Airline)

# Mean departure delay per airline, computed over late departures only.
# The result has columns Group.1 (airline code) and x (mean delay).
var1 <- aggregate(flightsDepDelay$`flights.df$DEPARTURE_DELAY`,
                  list(flightsDepDelay$`flights.df$AIRLINE`),
                  mean)
var1
# Plot the means exactly as computed: there is one value per airline, so
# jitter() would only add random noise and make the figure non-reproducible.
# as.factor() keeps the call working when the airline code is character
# (the read.csv() default since R 4.0.0); it is a no-op for a factor.
plot(x = as.factor(var1$Group.1), y = var1$x,
     xlab = "Airline", ylab = "Mean departure delay (minutes)")

This shows that when the delay time is averaged over each airline's delayed flights, rather than just counting delayed flights, the airline with the highest average delay is F9, followed by NK and EV.

Hence the ranking of airlines changes drastically once the number of flights is taken into account.

7 ARRIVAL DELAYS

Summarizing arrival delay

# Five-number summary, mean and NA count of the arrival delay.
summary(flights_new.df[["flights.df$ARRIVAL_DELAY"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  -87.00  -13.00   -5.00    4.41    8.00 1971.00  105071

Breaking arrival delay as per the time

# Flights with a recorded arrival delay.
flightsArrDelay <- subset(
  flights_new.df,
  !is.na(flights_new.df[["flights.df$ARRIVAL_DELAY"]])
)
# Of those, the flights that arrived late (delay > 0).
flights_ArrDelay <- subset(
  flightsArrDelay,
  flightsArrDelay[["flights.df$ARRIVAL_DELAY"]] > 0
)
# Bin the arrival delay with the same right-closed breaks used for the
# departure delay.
flights_ArrDelay[["DelayTimeInterval"]] <- cut(
  flights_ArrDelay[["flights.df$ARRIVAL_DELAY"]],
  breaks = c(0, 10, 30, 60, 90, 120, 180, 1988),
  labels = c(
    "(0-10]", "(10-30]", "(30-60]", "(60-90]",
    "(90-120]", "(120-180]", "(180-1988]"
  )
)

Tabulating Arrival Delay

# Share of late arrivals per delay interval, in percent.
mytable11 <- table(flights_ArrDelay[["DelayTimeInterval"]])
prop.table(mytable11) * 100
## 
##     (0-10]    (10-30]    (30-60]    (60-90]   (90-120]  (120-180] 
##  39.734611  29.947779  15.027342   6.374539   3.432945   3.222393 
## (180-1988] 
##   2.260391
# Bar plot of the number of late arrivals per delay interval.
plot(flights_ArrDelay[["DelayTimeInterval"]], col = "Black")

Hence, of all the arrival delays: 39% of flights were delayed by 10 minutes or less, 29% by 10 to 30 minutes, 15% by 30 to 60 minutes, 6% by 60 to 90 minutes, 3% by 90 to 120 minutes, 3% by 120 to 180 minutes, and 2% by more than 180 minutes.

This is very similar to Departure delays distribution

Running Correlation tests

# Correlation plot for late arrivals over MONTH, DAY, DAY_OF_WEEK,
# FLIGHT_NUMBER, DEPARTURE_DELAY and ARRIVAL_DELAY.
# DIVERTED (column 13) is left out. With it included, cor() warned that
# "the standard deviation is zero" and produced NA correlations; DIVERTED is
# the one selected column that can be constant in this subset.
# NOTE(review): presumably diverted flights have no recorded arrival delay
# and are therefore absent here -- confirm against the data.
corrplot.mixed(
  cor(flights_ArrDelay[, c(2, 3, 4, 6, 9, 12)]),
  upper = "ellipse",
  tl.pos = "lt"
)

Distributing data in terms of 3 factors for easier evaluation

# Collapse the arrival delay into three bands:
#   0 < delay < 5   -> "On time"
#   5 <= delay < 15 -> "Small Delay"
#   delay >= 15     -> "Long Delay"
flights_ArrDelay[["Latency"]][
  flights_ArrDelay[["flights.df$ARRIVAL_DELAY"]] > 0 &
    flights_ArrDelay[["flights.df$ARRIVAL_DELAY"]] < 5
] <- "On time"
flights_ArrDelay[["Latency"]][
  flights_ArrDelay[["flights.df$ARRIVAL_DELAY"]] >= 5 &
    flights_ArrDelay[["flights.df$ARRIVAL_DELAY"]] < 15
] <- "Small Delay"
flights_ArrDelay[["Latency"]][
  flights_ArrDelay[["flights.df$ARRIVAL_DELAY"]] >= 15
] <- "Long Delay"
flights_ArrDelay[["Latency"]] <- factor(flights_ArrDelay[["Latency"]])

# Share of late arrivals per band, in percent.
mytable12 <- table(flights_ArrDelay[["Latency"]])
prop.table(mytable12) * 100
## 
##  Long Delay     On time Small Delay 
##    50.95793    19.61368    29.42840

50% Flights have long delay 30% Flights have small delay

Seeing these 3 factors with respect to month

# Month (rows) against arrival-delay band (columns).
mytable13 <- table(
  flights_ArrDelay[["flights.df$MONTH"]],
  flights_ArrDelay[["Latency"]]
)
# margin = 2: each band's column sums to 100%.
prop.table(mytable13, 2) * 100
##     
##      Long Delay   On time Small Delay
##   1    9.022708  8.266454    8.682548
##   2    8.950114  7.477823    8.085453
##   3    8.975785  9.142547    9.323444
##   4    7.734059  8.815661    8.709578
##   5    8.429727  8.403756    8.326277
##   6   10.883746  8.744079    9.029863
##   7   10.120656  8.982769    9.008044
##   8    8.849873  8.598714    8.399062
##   9    5.647809  7.574569    6.898590
##   10   5.649501  8.421834    7.603967
##   11   6.636112  7.733859    7.687498
##   12   9.099911  7.837935    8.245677

Of all the long delays Maximum were in the month of: 1)June 2)July 3)Jan,December

Seeing these 3 factors with respect to day of week

# Day of week (rows) against arrival-delay band (columns).
mytable14 <- table(
  flights_ArrDelay[["flights.df$DAY_OF_WEEK"]],
  flights_ArrDelay[["Latency"]]
)
# margin = 2: each band's column sums to 100%.
prop.table(mytable14, 2) * 100
##    
##     Long Delay  On time Small Delay
##   1   15.63475 14.62900    14.46462
##   2   14.21642 14.07320    14.06585
##   3   14.44408 14.84253    14.64357
##   4   16.14498 15.72888    16.19940
##   5   15.36910 15.78850    16.01475
##   6   10.37342 11.44150    10.99586
##   7   13.81725 13.49639    13.61595

All percentages are almost same so no evident conclusion can be drawn

Seeing these 3 factors with respect to Airlines

# Airline (rows) against arrival-delay band (columns).
mytable15 <- table(
  flights_ArrDelay[["flights.df$AIRLINE"]],
  flights_ArrDelay[["Latency"]]
)
# margin = 2: each band's column sums to 100%.
prop.table(mytable15, 2) * 100
##     
##      Long Delay    On time Small Delay
##   AA 12.2507262 11.6239980  12.1035920
##   AS  2.1018601  3.5397992   3.2748233
##   B6  5.5644941  3.9438870   4.3442863
##   DL 11.0982388 13.5467132  12.5977790
##   EV 10.2670675 10.1703570  10.1612010
##   F9  2.2163942  1.6534862   1.7738626
##   HA  0.8103897  2.4362536   1.8870290
##   MQ  5.6935095  4.0215774   4.3144886
##   NK  3.2179561  2.0365633   2.3333442
##   OO 10.1364535 11.5172348  10.9906536
##   UA  9.8474854  7.6649638   8.1627968
##   US  3.4368685  3.8016989   3.9363989
##   VX  1.1075388  1.1648673   1.2430390
##   WN 22.2510177 22.8786002  22.8767056

- Maximum long delays are seen in Southwest Airlines Co. (22%), followed by American Airlines Inc. (12%), Delta Air Lines Inc. (11%) and Atlantic Southeast Airlines (10%).
- A similar trend is seen in small delays and on-time flights too.

Some insights(Comparing average delay time of each Airline)

# Mean arrival delay per airline, computed over late arrivals only.
# The result has columns Group.1 (airline code) and x (mean delay).
var3 <- aggregate(flights_ArrDelay$`flights.df$ARRIVAL_DELAY`,
                  list(flights_ArrDelay$`flights.df$AIRLINE`),
                  mean)
var3
# Plot the means exactly as computed: there is one value per airline, so
# jitter() would only add random noise and make the figure non-reproducible.
# as.factor() keeps the call working when the airline code is character
# (the read.csv() default since R 4.0.0); it is a no-op for a factor.
plot(x = as.factor(var3$Group.1), y = var3$x,
     xlab = "Airline", ylab = "Mean arrival delay (minutes)")

This shows that when the delay time is averaged over each airline's delayed flights, rather than just counting delayed flights, the airline with the highest average delay is F9, followed by NK, UA and MQ.

8 REGRESSION MODELS

Running a correlation test

# Correlation test between arrival delay and departure delay over all
# flights in flights_new.df.
cor.test(flights_new.df$`flights.df$ARRIVAL_DELAY`,flights_new.df$`flights.df$DEPARTURE_DELAY`)
## 
##  Pearson's product-moment correlation
## 
## data:  flights_new.df$`flights.df$ARRIVAL_DELAY` and flights_new.df$`flights.df$DEPARTURE_DELAY`
## t = 6884.2, df = 5714000, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9445832 0.9447597
## sample estimates:
##       cor 
## 0.9446715

Predicting Departure delay based on various factors

# Linear model of departure delay on the five recorded delay components.
# Variables are named in the formula and supplied through `data =` so that
# predict(fit1, newdata = ...) works; with `flights.df$col` terms, newdata is
# not used. The fit is unchanged; only the coefficient labels lose the
# "flights.df$" prefix.
# Rows with a missing value in any of these columns are dropped by lm().
fit1 <- lm(
  DEPARTURE_DELAY ~ AIR_SYSTEM_DELAY + WEATHER_DELAY + LATE_AIRCRAFT_DELAY +
    AIRLINE_DELAY + SECURITY_DELAY,
  data = flights.df
)
summary(fit1)
## 
## Call:
## lm(formula = flights.df$DEPARTURE_DELAY ~ flights.df$AIR_SYSTEM_DELAY + 
##     flights.df$WEATHER_DELAY + flights.df$LATE_AIRCRAFT_DELAY + 
##     flights.df$AIRLINE_DELAY + flights.df$SECURITY_DELAY)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -226.04   -8.90    0.27    9.07  396.36 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -1.8010139  0.0222124  -81.08   <2e-16 ***
## flights.df$AIR_SYSTEM_DELAY     0.6846885  0.0005808 1178.87   <2e-16 ***
## flights.df$WEATHER_DELAY        0.9755493  0.0007812 1248.80   <2e-16 ***
## flights.df$LATE_AIRCRAFT_DELAY  1.0613214  0.0003760 2822.32   <2e-16 ***
## flights.df$AIRLINE_DELAY        1.0375459  0.0003365 3083.53   <2e-16 ***
## flights.df$SECURITY_DELAY       1.0626763  0.0074362  142.91   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.43 on 1063433 degrees of freedom
##   (4755640 observations deleted due to missingness)
## Multiple R-squared:  0.9402, Adjusted R-squared:  0.9402 
## F-statistic: 3.346e+06 on 5 and 1063433 DF,  p-value: < 2.2e-16

Hence we can predict departure delay from the other delay components. This model is quite good at predicting delays, as the adjusted R-squared is 0.9402.

Predicting Arrival delay based on various factors

# Linear model of arrival delay on the five recorded delay components.
# Variables are named in the formula and supplied through `data =` so that
# predict(fit2, newdata = ...) works; with `flights.df$col` terms, newdata is
# not used. The fit is unchanged; only the coefficient labels lose the
# "flights.df$" prefix.
# NOTE(review): every coefficient is 1 and R-squared is 1, i.e. on the rows
# used, the arrival delay equals the sum of the five components. The model
# restates that identity rather than predicting anything.
fit2 <- lm(
  ARRIVAL_DELAY ~ AIR_SYSTEM_DELAY + WEATHER_DELAY + LATE_AIRCRAFT_DELAY +
    AIRLINE_DELAY + SECURITY_DELAY,
  data = flights.df
)
summary(fit2)
## 
## Call:
## lm(formula = flights.df$ARRIVAL_DELAY ~ flights.df$AIR_SYSTEM_DELAY + 
##     flights.df$WEATHER_DELAY + flights.df$LATE_AIRCRAFT_DELAY + 
##     flights.df$AIRLINE_DELAY + flights.df$SECURITY_DELAY)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -7.789e-08  0.000e+00  0.000e+00  0.000e+00  1.121e-07 
## 
## Coefficients:
##                                  Estimate Std. Error    t value Pr(>|t|)
## (Intercept)                    -7.321e-11  2.010e-13 -3.642e+02   <2e-16
## flights.df$AIR_SYSTEM_DELAY     1.000e+00  5.256e-15  1.902e+14   <2e-16
## flights.df$WEATHER_DELAY        1.000e+00  7.070e-15  1.414e+14   <2e-16
## flights.df$LATE_AIRCRAFT_DELAY  1.000e+00  3.403e-15  2.938e+14   <2e-16
## flights.df$AIRLINE_DELAY        1.000e+00  3.045e-15  3.284e+14   <2e-16
## flights.df$SECURITY_DELAY       1.000e+00  6.730e-14  1.486e+13   <2e-16
##                                   
## (Intercept)                    ***
## flights.df$AIR_SYSTEM_DELAY    ***
## flights.df$WEATHER_DELAY       ***
## flights.df$LATE_AIRCRAFT_DELAY ***
## flights.df$AIRLINE_DELAY       ***
## flights.df$SECURITY_DELAY      ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.487e-10 on 1063433 degrees of freedom
##   (4755640 observations deleted due to missingness)
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 3.97e+28 on 5 and 1063433 DF,  p-value: < 2.2e-16

Predicting Arrival delay based on Departure delay

# Simple linear model of arrival delay on departure delay.
# The columns of flights_new.df are literally named "flights.df$...", hence
# the backticks. Passing the data frame through `data =` lets
# predict(fit, newdata = ...) work; with `flights_new.df$col` terms, newdata
# is not used. The fit is unchanged; only the coefficient label changes.
fit <- lm(
  `flights.df$ARRIVAL_DELAY` ~ `flights.df$DEPARTURE_DELAY`,
  data = flights_new.df
)
summary(fit)
## 
## Call:
## lm(formula = flights_new.df$`flights.df$ARRIVAL_DELAY` ~ flights_new.df$`flights.df$DEPARTURE_DELAY`)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -197.10   -7.10   -1.04    5.95  334.97 
## 
## Coefficients:
##                                               Estimate Std. Error t value
## (Intercept)                                 -4.9403823  0.0055574    -889
## flights_new.df$`flights.df$DEPARTURE_DELAY`  1.0056588  0.0001461    6884
##                                             Pr(>|t|)    
## (Intercept)                                   <2e-16 ***
## flights_new.df$`flights.df$DEPARTURE_DELAY`   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.88 on 5714006 degrees of freedom
##   (105071 observations deleted due to missingness)
## Multiple R-squared:  0.8924, Adjusted R-squared:  0.8924 
## F-statistic: 4.739e+07 on 1 and 5714006 DF,  p-value: < 2.2e-16

Hence arrival delay can be predicted from departure delay, as the multiple R-squared is 0.8924.