library(psych)
library(lattice)
library(corrplot)
library(corrgram)
library(car)
library(arm)
# Load the three raw CSV files from the local data directory.
# setwd() is called once; the original repeated the identical call before
# reading airlines.csv, which had no effect.
setwd("C:/Users/vaibhav/Desktop/DataSets")
# stringsAsFactors = TRUE pins the pre-R-4.0 read.csv() default. Later steps
# call plot() directly on text columns such as AIRLINE, which only works when
# those columns are factors.
flights.df <- read.csv("flights.csv", stringsAsFactors = TRUE)
airlines.df <- read.csv("airlines.csv", stringsAsFactors = TRUE)
airport.df <- read.csv("airports.csv", stringsAsFactors = TRUE)
# Keep only the columns used in this analysis. cbind.data.frame() names each
# column after the expression passed in (e.g. `flights.df$MONTH`); the rest of
# the script refers to the columns by those names.
flights_new.df <- cbind.data.frame(
  flights.df$YEAR,
  flights.df$MONTH,
  flights.df$DAY,
  flights.df$DAY_OF_WEEK,
  flights.df$AIRLINE,
  flights.df$FLIGHT_NUMBER,
  flights.df$ORIGIN_AIRPORT,
  flights.df$DESTINATION_AIRPORT,
  flights.df$DEPARTURE_DELAY,
  flights.df$AIR_TIME,
  flights.df$DISTANCE,
  flights.df$ARRIVAL_DELAY,
  flights.df$DIVERTED,
  flights.df$CANCELLED,
  flights.df$CANCELLATION_REASON
)

# Recode the cancellation reason letters A-D as the factor labels 1-4.
flights_new.df[["flights.df$CANCELLATION_REASON"]] <- factor(
  flights_new.df[["flights.df$CANCELLATION_REASON"]],
  levels = c("A", "B", "C", "D"),
  labels = c("1", "2", "3", "4")
)

# Rows flagged as cancelled (rows where the flag is NA are dropped).
cancelled_flights.df <- flights_new.df[
  which(flights_new.df[["flights.df$CANCELLED"]] == 1), ,
  drop = FALSE
]

# Percentage of all flights that were cancelled.
nrow(cancelled_flights.df) / nrow(flights_new.df) * 100
[1] 1.544643
Percentage of flights cancelled: 1.54% ### Finding which reasons lead to the most cancellations
# Share (%) of cancelled flights for each cancellation reason code (1-4).
mytable1 <- table(cancelled_flights.df[["flights.df$CANCELLATION_REASON"]])
100 * prop.table(mytable1)
1 2 3 4
28.10511326 54.34893863 17.52147212 0.02447599
# Lattice bar chart of the cancellation-reason factor for cancelled flights.
barchart(cancelled_flights.df$`flights.df$CANCELLATION_REASON`,data=cancelled_flights.df,col="black")
28% of cancellations were due to airline or carrier reasons; 54% were due to weather; 17.5% were due to the National Air System; 0.02% were due to security reasons. Hence weather is the leading cause, accounting for 54% of cancellations.
# Share (%) of cancelled flights falling in each calendar month.
mytable2 <- table(cancelled_flights.df[["flights.df$MONTH"]])
100 * prop.table(mytable2)
1 2 3 4 5 6 7
13.330515 22.826087 12.240221 5.028704 6.334832 10.146411 5.346892
8 9 10 11 12
5.620578 2.308531 2.730186 5.116595 8.970451
# Histogram of cancelled flights by month.
# The axis labels were swapped in the original: hist() puts the variable
# (month) on the x-axis and the frequency (count) on the y-axis.
hist(
  cancelled_flights.df$`flights.df$MONTH`,
  xlab = "Month",
  ylab = "Count",
  xlim = c(0, 12),
  main = "Cancellations per month",
  col = "black"
)
The top 5 months by cancellations are 1) February 2) January 3) March 4) June 5) December. Cancellations clearly peak in winter, most likely because of heavy snowfall. The following table of cancellation reasons by month supports this:
# Cross-tabulate cancellation reason (rows) against month (columns) and turn
# each column into percentages, i.e. the mix of reasons within each month.
mytable3 <- table(
  cancelled_flights.df[["flights.df$CANCELLATION_REASON"]],
  cancelled_flights.df[["flights.df$MONTH"]]
)
100 * prop.table(mytable3, margin = 2)
1 2 3 4 5
1 23.985978968 13.720329483 22.668605708 39.734513274 35.247629083
2 58.587881823 75.288784910 62.388656608 39.579646018 48.823322796
3 17.417793357 10.986011600 14.897291402 20.685840708 15.911485774
4 0.008345852 0.004874007 0.045446282 0.000000000 0.017562346
6 7 8 9 10
1 39.682017544 53.849354973 46.793349169 52.433734940 38.834555827
2 36.458333333 18.352059925 25.930324624 24.337349398 39.812550937
3 23.859649123 27.777777778 27.236737926 23.228915663 21.352893236
4 0.000000000 0.020807324 0.039588282 0.000000000 0.000000000
11 12
1 23.592085236 19.583281657
2 50.858882366 69.614287486
3 25.309849967 10.802430857
4 0.239182431 0.000000000
# Plot month (y) against cancellation reason (x); the reason column was
# converted to a factor when flights_new.df was built.
plot(cancelled_flights.df$`flights.df$CANCELLATION_REASON`,cancelled_flights.df$`flights.df$MONTH`)
# Share (%) of cancelled flights per airline code.
mytable4 <- table(cancelled_flights.df[["flights.df$AIRLINE"]])
100 * prop.table(mytable4)
##
## AA AS B6 DL EV F9
## 12.1478795 0.7442926 4.7572427 4.2543723 16.9451738 0.6541765
## HA MQ NK OO UA US
## 0.1902452 16.7159895 2.2295403 11.0809488 7.3127587 4.5247208
## VX WN
## 0.5940991 17.8485604
# Plot the airline column of the cancelled flights (plot() draws one bar per
# level when the column is a factor).
plot(
  x = cancelled_flights.df$`flights.df$AIRLINE`,
  col = "black",
  main = "Maximum cancellations by which airlines"
)
The top 5 airlines by cancellations are 1) WN: Southwest Airlines Co. 2) EV: Atlantic Southeast Airlines 3) MQ: American Eagle Airlines Inc. 4) AA: American Airlines Inc. 5) OO: Skywest Airlines Inc.
# Share (%) of cancelled flights per day-of-week code (1-7).
mytable5 <- table(cancelled_flights.df[["flights.df$DAY_OF_WEEK"]])
100 * prop.table(mytable5)
1 2 3 4 5 6 7
23.444662 16.768279 11.936496 13.674291 9.795959 9.733657 14.646656
# Histogram of cancelled flights by day of week (DAY_OF_WEEK codes 1-7).
hist(cancelled_flights.df$`flights.df$DAY_OF_WEEK`,
main="Cancellation of flights based on which day of week",
col=c("black"))
In decreasing order of cancellations: 1) Monday 2) Tuesday 3) Sunday 4) Thursday 5) Wednesday 6) Friday 7) Saturday
# Pairwise correlations between MONTH, DAY, DAY_OF_WEEK, FLIGHT_NUMBER,
# DISTANCE and CANCELLED (columns 2, 3, 4, 6, 11, 14) for cancelled flights.
# Column 14 (CANCELLED) equals 1 on every row of this subset, which is why its
# correlations are reported as NA in the output below.
corr.test(cancelled_flights.df[,c(2,3,4,6,11,14)])
## Call:corr.test(x = cancelled_flights.df[, c(2, 3, 4, 6, 11, 14)])
## Correlation matrix
## flights.df$MONTH flights.df$DAY
## flights.df$MONTH 1.00 0.19
## flights.df$DAY 0.19 1.00
## flights.df$DAY_OF_WEEK 0.01 -0.06
## flights.df$FLIGHT_NUMBER 0.06 -0.05
## flights.df$DISTANCE -0.05 0.04
## flights.df$CANCELLED NA NA
## flights.df$DAY_OF_WEEK flights.df$FLIGHT_NUMBER
## flights.df$MONTH 0.01 0.06
## flights.df$DAY -0.06 -0.05
## flights.df$DAY_OF_WEEK 1.00 0.02
## flights.df$FLIGHT_NUMBER 0.02 1.00
## flights.df$DISTANCE 0.01 -0.38
## flights.df$CANCELLED NA NA
## flights.df$DISTANCE flights.df$CANCELLED
## flights.df$MONTH -0.05 NA
## flights.df$DAY 0.04 NA
## flights.df$DAY_OF_WEEK 0.01 NA
## flights.df$FLIGHT_NUMBER -0.38 NA
## flights.df$DISTANCE 1.00 NA
## flights.df$CANCELLED NA NA
## Sample Size
## [1] 89884
## Probability values (Entries above the diagonal are adjusted for multiple tests.)
## flights.df$MONTH flights.df$DAY
## flights.df$MONTH 0 0
## flights.df$DAY 0 0
## flights.df$DAY_OF_WEEK 0 0
## flights.df$FLIGHT_NUMBER 0 0
## flights.df$DISTANCE 0 0
## flights.df$CANCELLED NA NA
## flights.df$DAY_OF_WEEK flights.df$FLIGHT_NUMBER
## flights.df$MONTH 0.01 0
## flights.df$DAY 0.00 0
## flights.df$DAY_OF_WEEK 0.00 0
## flights.df$FLIGHT_NUMBER 0.00 0
## flights.df$DISTANCE 0.01 0
## flights.df$CANCELLED NA NA
## flights.df$DISTANCE flights.df$CANCELLED
## flights.df$MONTH 0.00 NA
## flights.df$DAY 0.00 NA
## flights.df$DAY_OF_WEEK 0.01 NA
## flights.df$FLIGHT_NUMBER 0.00 NA
## flights.df$DISTANCE 0.00 NA
## flights.df$CANCELLED NA NA
##
## To see confidence intervals of the correlations, print with the short=FALSE option
# Correlation plot of the numeric columns for cancelled flights.
# CANCELLED is left out: every row of cancelled_flights.df has CANCELLED == 1,
# so its standard deviation is zero and cor() can only return NA for it.
corrplot.mixed(
  cor(
    cancelled_flights.df[, c(
      "flights.df$MONTH",
      "flights.df$DAY",
      "flights.df$DAY_OF_WEEK",
      "flights.df$FLIGHT_NUMBER",
      "flights.df$DISTANCE"
    )],
    use = "complete.obs"
  ),
  upper = "circle",
  tl.pos = "lt"
)
Creating a new column that classifies each flight by its delay or cancellation status
# Classify every flight into one status label. The rules are applied in order
# and later rules overwrite earlier ones; rows where a comparison is NA are
# left untouched by that rule.
flights_new.df$status <- local({
  dep_delay <- flights_new.df[["flights.df$DEPARTURE_DELAY"]]
  arr_delay <- flights_new.df[["flights.df$ARRIVAL_DELAY"]]
  cancelled <- flights_new.df[["flights.df$CANCELLED"]]

  label <- rep("ON TIME", nrow(flights_new.df))
  label[which(dep_delay > 0)] <- "DEPARTURE DELAY"
  label[which(cancelled == 1)] <- "CANCELLED FLIGHTS"
  # Late on both ends, with the arrival delay larger than the departure delay.
  label[which(arr_delay > 0 & (arr_delay - dep_delay) > 0 & dep_delay > 0)] <-
    "DEPR & ARVL_DELAY"
  # Left on time (or early) but arrived late.
  label[which(arr_delay > 0 & dep_delay <= 0)] <- "ARRIVAL_DELAY"

  factor(label)
})

# Share (%) of flights in each status class.
mytable4 <- table(flights_new.df[["status"]])
100 * prop.table(mytable4)
##
## ARRIVAL_DELAY CANCELLED FLIGHTS DEPARTURE DELAY DEPR & ARVL_DELAY
## 9.945715 1.544643 26.854490 9.630287
## ON TIME
## 52.024865
# Bar plot of the number of flights in each status class.
plot(x = flights_new.df[["status"]])
52% of flights were on time; 27% had departure delays; 10% had arrival delays; 10% had both departure and arrival delays; 1.54% were cancelled.
# Five-number summary, mean and NA count of the departure delay.
summary(flights_new.df[["flights.df$DEPARTURE_DELAY"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -82.00 -5.00 -2.00 9.37 7.00 1988.00 86153
# Flights that departed late (departure delay > 0; NA delays are dropped).
flightsDepDelay <- subset(
  flights_new.df,
  flights_new.df$`flights.df$DEPARTURE_DELAY` > 0
)

# Bin the delay into intervals. The top break is Inf rather than the observed
# maximum (1988), so a longer delay in refreshed data still lands in the last
# bin instead of silently becoming NA. The labels are kept unchanged.
flightsDepDelay$DelayTimeInterval <- cut(
  flightsDepDelay$`flights.df$DEPARTURE_DELAY`,
  breaks = c(0, 10, 30, 60, 90, 120, 180, Inf),
  labels = c(
    "(0-10]", "(10-30]", "(30-60]", "(60-90]",
    "(90-120]", "(120-180]", "(180-1988]"
  )
)

# Share (%) of late departures in each interval. The original wrapped this in
# with(flights_new.df, ...), which was unrelated to the table being built.
mytable5 <- table(flightsDepDelay$DelayTimeInterval)
prop.table(mytable5) * 100
##
## (0-10] (10-30] (30-60] (60-90] (90-120] (120-180]
## 41.680866 28.286926 14.781536 6.385813 3.406303 3.195353
## (180-1988]
## 2.263201
# Bar plot of late departures per delay interval.
plot(x = flightsDepDelay[["DelayTimeInterval"]], col = "Black")
Hence, of all departure delays: 41% of flights were delayed by 10 minutes or less; 28% by 10 to 30 minutes; 14% by 30 to 60 minutes; 6% by 60 to 90 minutes; 3% by 90 to 120 minutes; 3% by 120 to 180 minutes; 2% by more than 180 minutes.
# Delay interval (rows) by month (columns); each row is turned into
# percentages, i.e. how each interval's flights spread across the months.
mytable6 <- table(
  flightsDepDelay[["DelayTimeInterval"]],
  flightsDepDelay[["flights.df$MONTH"]]
)
100 * prop.table(mytable6, margin = 1)
##
## 1 2 3 4 5 6
## (0-10] 7.949200 7.592192 9.158939 8.194240 8.340632 9.140993
## (10-30] 8.511788 8.455075 9.415206 8.045776 8.352792 9.996807
## (30-60] 8.906776 8.754006 9.181124 7.413773 8.306201 10.916012
## (60-90] 8.736684 8.815512 8.587131 7.046663 8.601129 11.665856
## (90-120] 8.507700 8.763207 8.296388 7.194255 8.803259 12.319591
## (120-180] 8.106477 8.395047 8.165369 7.003725 9.020774 13.221242
## (180-1988] 7.296235 7.942711 8.323113 7.302472 9.277236 12.998108
##
## 7 8 9 10 11 12
## (0-10] 9.274292 8.863333 7.085745 8.109023 7.828542 8.462870
## (10-30] 9.827665 8.775895 5.947225 6.566246 7.130384 8.975139
## (30-60] 10.516902 9.178896 5.526116 5.459597 6.604731 9.235866
## (60-90] 11.123635 9.580957 5.420000 5.355170 5.914335 9.152927
## (90-120] 11.218838 9.597403 5.280022 5.025896 5.855949 9.137491
## (120-180] 11.108494 9.497799 4.833557 4.745219 5.853860 10.048439
## (180-1988] 9.456004 8.940487 4.631343 5.180119 7.196458 11.455713
# Correlation plot for late departures, over columns 2, 3, 4, 6, 9, 11 and 13
# of flightsDepDelay, selected here by name.
corrplot.mixed(
  cor(
    flightsDepDelay[, c(
      "flights.df$MONTH",
      "flights.df$DAY",
      "flights.df$DAY_OF_WEEK",
      "flights.df$FLIGHT_NUMBER",
      "flights.df$DEPARTURE_DELAY",
      "flights.df$DISTANCE",
      "flights.df$DIVERTED"
    )]
  ),
  upper = "ellipse",
  tl.pos = "lt"
)
# Grade each late departure by its delay in minutes:
# (0, 5) -> "On time", [5, 15) -> "Small Delay", 15 or more -> "Long Delay".
flightsDepDelay$Latency <- local({
  delay <- flightsDepDelay[["flights.df$DEPARTURE_DELAY"]]
  grade <- ifelse(
    delay >= 15,
    "Long Delay",
    ifelse(
      delay >= 5,
      "Small Delay",
      ifelse(delay > 0, "On time", NA_character_)
    )
  )
  factor(grade)
})

# Share (%) of late departures in each grade.
mytable7 <- table(flightsDepDelay[["Latency"]])
100 * prop.table(mytable7)
##
## Long Delay On time Small Delay
## 49.75278 22.53646 27.71076
# Month (rows) by delay grade (columns); column percentages show how each
# grade's flights spread across the months.
mytable8 <- table(
  flightsDepDelay[["flights.df$MONTH"]],
  flightsDepDelay[["Latency"]]
)
100 * prop.table(mytable8, margin = 2)
##
## Long Delay On time Small Delay
## 1 8.602681 7.820866 8.180298
## 2 8.631994 7.415263 7.916812
## 3 9.038309 9.009079 9.350197
## 4 7.554130 8.129401 8.230890
## 5 8.492805 8.354852 8.321718
## 6 11.049838 8.905955 9.483468
## 7 10.390391 9.145393 9.494504
## 8 9.126626 8.940191 8.742244
## 9 5.530498 7.456387 6.516532
## 10 5.705902 8.468830 7.502228
## 11 6.598623 8.002689 7.566742
## 12 9.278202 8.351095 8.694368
Of all the long delays, the months with the most were: 1) June 2) July 3) March, August and December (roughly tied)
# Day of week (rows) by delay grade (columns), as column percentages.
mytable9 <- table(
  flightsDepDelay[["flights.df$DAY_OF_WEEK"]],
  flightsDepDelay[["Latency"]]
)
100 * prop.table(mytable9, margin = 2)
##
## Long Delay On time Small Delay
## 1 15.66454 14.95557 14.66916
## 2 14.17062 13.65004 13.67022
## 3 14.28580 14.25667 14.20466
## 4 15.75229 15.52713 15.85960
## 5 15.28745 15.53026 15.75094
## 6 10.74602 11.81177 11.54314
## 7 14.09328 14.26857 14.30228
The percentages are almost the same across days, so no clear conclusion can be drawn.
# Airline (rows) by delay grade (columns), as column percentages.
mytable10 <- table(
  flightsDepDelay[["flights.df$AIRLINE"]],
  flightsDepDelay[["Latency"]]
)
100 * prop.table(mytable10, margin = 2)
##
## Long Delay On time Small Delay
## AA 11.7004900 12.0503759 10.9399431
## AS 1.7685149 2.5354929 2.1589916
## B6 5.4551352 3.6963588 4.5266330
## DL 11.6650308 15.5926762 14.3294427
## EV 9.1718248 6.4155528 7.1587793
## F9 1.9650060 1.1427045 1.4664912
## HA 0.5281054 1.4923628 1.2583507
## MQ 5.3229433 3.0730275 3.8558635
## NK 2.9882162 1.6457950 2.1396375
## OO 9.1335289 6.5322448 7.4170027
## UA 11.4028220 13.2888971 12.2745215
## US 2.7557931 3.3558854 2.9446967
## VX 1.0428782 1.2038686 1.1176096
## WN 25.0997112 27.9747578 28.4120368
- The largest share of long delays belongs to Southwest Airlines Co. (25%), followed by American Airlines Inc., Delta Air Lines Inc. and United Air Lines Inc. - A similar pattern is seen for small delays and on-time flights.
# Mean departure delay per airline, computed over late departures only.
var1 <- aggregate(
  flightsDepDelay$`flights.df$DEPARTURE_DELAY`,
  list(flightsDepDelay$`flights.df$AIRLINE`),
  mean
)
var1

# Plot the actual means. The original passed them through jitter(), which adds
# random noise, so the plotted values differed from the table printed above
# and changed on every run. factor() lets plot() accept the airline codes
# whether they are stored as factor or character.
plot(
  x = factor(var1$Group.1),
  y = var1$x,
  xlab = "Airline",
  ylab = "Mean departure delay (minutes)"
)
This shows that, once the number of flights is taken into account along with the delay time, the airline with the highest average delay is F9, followed by NK and EV.
Hence the ranking of airlines changes drastically after accounting for the number of flights.
# Five-number summary, mean and NA count of the arrival delay.
summary(flights_new.df[["flights.df$ARRIVAL_DELAY"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -87.00 -13.00 -5.00 4.41 8.00 1971.00 105071
# Flights with a recorded arrival delay, then the subset that arrived late.
flightsArrDelay <- subset(
  flights_new.df,
  !is.na(flights_new.df$`flights.df$ARRIVAL_DELAY`)
)
flights_ArrDelay <- subset(
  flightsArrDelay,
  flightsArrDelay$`flights.df$ARRIVAL_DELAY` > 0
)

# Bin the delay into intervals. The top break is Inf rather than a hard-coded
# maximum (1988), so a longer delay in refreshed data still lands in the last
# bin instead of silently becoming NA. The labels are kept unchanged.
flights_ArrDelay$DelayTimeInterval <- cut(
  flights_ArrDelay$`flights.df$ARRIVAL_DELAY`,
  breaks = c(0, 10, 30, 60, 90, 120, 180, Inf),
  labels = c(
    "(0-10]", "(10-30]", "(30-60]", "(60-90]",
    "(90-120]", "(120-180]", "(180-1988]"
  )
)

# Share (%) of late arrivals in each interval. The original wrapped this in
# with(flights_new.df, ...), which was unrelated to the table being built.
mytable11 <- table(flights_ArrDelay$DelayTimeInterval)
prop.table(mytable11) * 100
##
## (0-10] (10-30] (30-60] (60-90] (90-120] (120-180]
## 39.734611 29.947779 15.027342 6.374539 3.432945 3.222393
## (180-1988]
## 2.260391
# Bar plot of late arrivals per delay interval.
plot(x = flights_ArrDelay[["DelayTimeInterval"]], col = "Black")
Hence, of all arrival delays: 39% of flights were delayed by 10 minutes or less; 29% by 10 to 30 minutes; 15% by 30 to 60 minutes; 6% by 60 to 90 minutes; 3% by 90 to 120 minutes; 3% by 120 to 180 minutes; 2% by more than 180 minutes.
This is very similar to the distribution of departure delays.
# Correlation plot for flights that arrived late.
# DIVERTED (column 13) is left out. The original call made cor() warn that
# "the standard deviation is zero", and DIVERTED is the only selected column
# that can be constant in this subset; a constant column yields only NA
# correlations.
corrplot.mixed(
  cor(
    flights_ArrDelay[, c(
      "flights.df$MONTH",
      "flights.df$DAY",
      "flights.df$DAY_OF_WEEK",
      "flights.df$FLIGHT_NUMBER",
      "flights.df$DEPARTURE_DELAY",
      "flights.df$ARRIVAL_DELAY"
    )]
  ),
  upper = "ellipse",
  tl.pos = "lt"
)
# Grade each late arrival by its delay in minutes:
# (0, 5) -> "On time", [5, 15) -> "Small Delay", 15 or more -> "Long Delay".
flights_ArrDelay$Latency <- local({
  delay <- flights_ArrDelay[["flights.df$ARRIVAL_DELAY"]]
  grade <- ifelse(
    delay >= 15,
    "Long Delay",
    ifelse(
      delay >= 5,
      "Small Delay",
      ifelse(delay > 0, "On time", NA_character_)
    )
  )
  factor(grade)
})

# Share (%) of late arrivals in each grade.
mytable12 <- table(flights_ArrDelay[["Latency"]])
100 * prop.table(mytable12)
##
## Long Delay On time Small Delay
## 50.95793 19.61368 29.42840
About 51% of late arrivals have a long delay and about 29% have a small delay.
# Month (rows) by arrival delay grade (columns), as column percentages.
mytable13 <- table(
  flights_ArrDelay[["flights.df$MONTH"]],
  flights_ArrDelay[["Latency"]]
)
100 * prop.table(mytable13, margin = 2)
##
## Long Delay On time Small Delay
## 1 9.022708 8.266454 8.682548
## 2 8.950114 7.477823 8.085453
## 3 8.975785 9.142547 9.323444
## 4 7.734059 8.815661 8.709578
## 5 8.429727 8.403756 8.326277
## 6 10.883746 8.744079 9.029863
## 7 10.120656 8.982769 9.008044
## 8 8.849873 8.598714 8.399062
## 9 5.647809 7.574569 6.898590
## 10 5.649501 8.421834 7.603967
## 11 6.636112 7.733859 7.687498
## 12 9.099911 7.837935 8.245677
Of all the long delays, the months with the most were: 1) June 2) July 3) December and January (roughly tied)
# Day of week (rows) by arrival delay grade (columns), as column percentages.
mytable14 <- table(
  flights_ArrDelay[["flights.df$DAY_OF_WEEK"]],
  flights_ArrDelay[["Latency"]]
)
100 * prop.table(mytable14, margin = 2)
##
## Long Delay On time Small Delay
## 1 15.63475 14.62900 14.46462
## 2 14.21642 14.07320 14.06585
## 3 14.44408 14.84253 14.64357
## 4 16.14498 15.72888 16.19940
## 5 15.36910 15.78850 16.01475
## 6 10.37342 11.44150 10.99586
## 7 13.81725 13.49639 13.61595
The percentages are almost the same across days, so no clear conclusion can be drawn.
# Airline (rows) by arrival delay grade (columns), as column percentages.
mytable15 <- table(
  flights_ArrDelay[["flights.df$AIRLINE"]],
  flights_ArrDelay[["Latency"]]
)
100 * prop.table(mytable15, margin = 2)
##
## Long Delay On time Small Delay
## AA 12.2507262 11.6239980 12.1035920
## AS 2.1018601 3.5397992 3.2748233
## B6 5.5644941 3.9438870 4.3442863
## DL 11.0982388 13.5467132 12.5977790
## EV 10.2670675 10.1703570 10.1612010
## F9 2.2163942 1.6534862 1.7738626
## HA 0.8103897 2.4362536 1.8870290
## MQ 5.6935095 4.0215774 4.3144886
## NK 3.2179561 2.0365633 2.3333442
## OO 10.1364535 11.5172348 10.9906536
## UA 9.8474854 7.6649638 8.1627968
## US 3.4368685 3.8016989 3.9363989
## VX 1.1075388 1.1648673 1.2430390
## WN 22.2510177 22.8786002 22.8767056
- The largest share of long arrival delays belongs to Southwest Airlines Co. (22%), followed by American Airlines Inc. (12%), Delta Air Lines Inc. (11%) and Atlantic Southeast Airlines (10%). - A similar pattern is seen for small delays and on-time flights.
# Mean arrival delay per airline, computed over late arrivals only.
var3 <- aggregate(
  flights_ArrDelay$`flights.df$ARRIVAL_DELAY`,
  list(flights_ArrDelay$`flights.df$AIRLINE`),
  mean
)
var3

# Plot the actual means. The original passed them through jitter(), which adds
# random noise, so the plotted values differed from the table printed above
# and changed on every run. factor() lets plot() accept the airline codes
# whether they are stored as factor or character.
plot(
  x = factor(var3$Group.1),
  y = var3$x,
  xlab = "Airline",
  ylab = "Mean arrival delay (minutes)"
)
This shows that, once the number of flights is taken into account along with the delay time, the airline with the highest average delay is F9, followed by NK, UA and MQ.
# Pearson correlation test between arrival delay and departure delay over all
# flights in flights_new.df.
cor.test(flights_new.df$`flights.df$ARRIVAL_DELAY`,flights_new.df$`flights.df$DEPARTURE_DELAY`)
##
## Pearson's product-moment correlation
##
## data: flights_new.df$`flights.df$ARRIVAL_DELAY` and flights_new.df$`flights.df$DEPARTURE_DELAY`
## t = 6884.2, df = 5714000, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9445832 0.9447597
## sample estimates:
## cor
## 0.9446715
# Linear model of departure delay on the five delay-cause columns of the raw
# flights data. lm() drops rows where any of these columns is NA (see the
# "observations deleted due to missingness" line in the summary).
fit1<-lm(flights.df$DEPARTURE_DELAY~flights.df$AIR_SYSTEM_DELAY+flights.df$WEATHER_DELAY+flights.df$LATE_AIRCRAFT_DELAY+flights.df$AIRLINE_DELAY+flights.df$SECURITY_DELAY)
summary(fit1)
##
## Call:
## lm(formula = flights.df$DEPARTURE_DELAY ~ flights.df$AIR_SYSTEM_DELAY +
## flights.df$WEATHER_DELAY + flights.df$LATE_AIRCRAFT_DELAY +
## flights.df$AIRLINE_DELAY + flights.df$SECURITY_DELAY)
##
## Residuals:
## Min 1Q Median 3Q Max
## -226.04 -8.90 0.27 9.07 396.36
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.8010139 0.0222124 -81.08 <2e-16 ***
## flights.df$AIR_SYSTEM_DELAY 0.6846885 0.0005808 1178.87 <2e-16 ***
## flights.df$WEATHER_DELAY 0.9755493 0.0007812 1248.80 <2e-16 ***
## flights.df$LATE_AIRCRAFT_DELAY 1.0613214 0.0003760 2822.32 <2e-16 ***
## flights.df$AIRLINE_DELAY 1.0375459 0.0003365 3083.53 <2e-16 ***
## flights.df$SECURITY_DELAY 1.0626763 0.0074362 142.91 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.43 on 1063433 degrees of freedom
## (4755640 observations deleted due to missingness)
## Multiple R-squared: 0.9402, Adjusted R-squared: 0.9402
## F-statistic: 3.346e+06 on 5 and 1063433 DF, p-value: < 2.2e-16
Hence departure delay can be predicted from the other delay components. The model fits well, with an adjusted R-squared of 0.9402.
# Same five delay-cause predictors, with arrival delay as the response.
# NOTE(review): the summary shows an exact fit (every slope 1, R-squared 1,
# residuals around 1e-8), which suggests ARRIVAL_DELAY is simply the sum of
# these five columns in the data -- confirm against the data source before
# reading this as a predictive model.
fit2<-lm(flights.df$ARRIVAL_DELAY~flights.df$AIR_SYSTEM_DELAY+flights.df$WEATHER_DELAY+flights.df$LATE_AIRCRAFT_DELAY+flights.df$AIRLINE_DELAY+flights.df$SECURITY_DELAY)
summary(fit2)
##
## Call:
## lm(formula = flights.df$ARRIVAL_DELAY ~ flights.df$AIR_SYSTEM_DELAY +
## flights.df$WEATHER_DELAY + flights.df$LATE_AIRCRAFT_DELAY +
## flights.df$AIRLINE_DELAY + flights.df$SECURITY_DELAY)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.789e-08 0.000e+00 0.000e+00 0.000e+00 1.121e-07
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.321e-11 2.010e-13 -3.642e+02 <2e-16
## flights.df$AIR_SYSTEM_DELAY 1.000e+00 5.256e-15 1.902e+14 <2e-16
## flights.df$WEATHER_DELAY 1.000e+00 7.070e-15 1.414e+14 <2e-16
## flights.df$LATE_AIRCRAFT_DELAY 1.000e+00 3.403e-15 2.938e+14 <2e-16
## flights.df$AIRLINE_DELAY 1.000e+00 3.045e-15 3.284e+14 <2e-16
## flights.df$SECURITY_DELAY 1.000e+00 6.730e-14 1.486e+13 <2e-16
##
## (Intercept) ***
## flights.df$AIR_SYSTEM_DELAY ***
## flights.df$WEATHER_DELAY ***
## flights.df$LATE_AIRCRAFT_DELAY ***
## flights.df$AIRLINE_DELAY ***
## flights.df$SECURITY_DELAY ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.487e-10 on 1063433 degrees of freedom
## (4755640 observations deleted due to missingness)
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 3.97e+28 on 5 and 1063433 DF, p-value: < 2.2e-16
# Simple linear regression of arrival delay on departure delay over all
# flights in flights_new.df; rows with an NA in either column are dropped.
fit<-lm(flights_new.df$`flights.df$ARRIVAL_DELAY`~flights_new.df$`flights.df$DEPARTURE_DELAY`)
summary(fit)
##
## Call:
## lm(formula = flights_new.df$`flights.df$ARRIVAL_DELAY` ~ flights_new.df$`flights.df$DEPARTURE_DELAY`)
##
## Residuals:
## Min 1Q Median 3Q Max
## -197.10 -7.10 -1.04 5.95 334.97
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) -4.9403823 0.0055574 -889
## flights_new.df$`flights.df$DEPARTURE_DELAY` 1.0056588 0.0001461 6884
## Pr(>|t|)
## (Intercept) <2e-16 ***
## flights_new.df$`flights.df$DEPARTURE_DELAY` <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.88 on 5714006 degrees of freedom
## (105071 observations deleted due to missingness)
## Multiple R-squared: 0.8924, Adjusted R-squared: 0.8924
## F-statistic: 4.739e+07 on 1 and 5714006 DF, p-value: < 2.2e-16
Hence arrival delay can be predicted from departure delay, with a multiple R-squared of 0.8924.