# Load the 2015 flight records from the CSV export.
# read.csv() is used rather than read.table(sep = ","): read.table() also
# treats "'" as a quote character and "#" as a comment marker, and stops on
# rows with missing trailing fields, any of which can corrupt or abort the
# parse of an ordinary comma-separated file.
Flights.data <- read.csv("~/Desktop/Flight Delays 2015/flights.csv")
# Preview the first rows to check the column layout.
head(Flights.data)
## YEAR MONTH DAY DAY_OF_WEEK AIRLINE FLIGHT_NUMBER TAIL_NUMBER
## 1 2015 1 1 4 AS 98 N407AS
## 2 2015 1 1 4 AA 2336 N3KUAA
## 3 2015 1 1 4 US 840 N171US
## 4 2015 1 1 4 AA 258 N3HYAA
## 5 2015 1 1 4 AS 135 N527AS
## 6 2015 1 1 4 DL 806 N3730B
## ORIGIN_AIRPORT DESTINATION_AIRPORT SCHEDULED_DEPARTURE DEPARTURE_TIME
## 1 ANC SEA 5 2354
## 2 LAX PBI 10 2
## 3 SFO CLT 20 18
## 4 LAX MIA 20 15
## 5 SEA ANC 25 24
## 6 SFO MSP 25 20
## DEPARTURE_DELAY TAXI_OUT WHEELS_OFF SCHEDULED_TIME ELAPSED_TIME AIR_TIME
## 1 -11 21 15 205 194 169
## 2 -8 12 14 280 279 263
## 3 -2 16 34 286 293 266
## 4 -5 15 30 285 281 258
## 5 -1 11 35 235 215 199
## 6 -5 18 38 217 230 206
## DISTANCE WHEELS_ON TAXI_IN SCHEDULED_ARRIVAL ARRIVAL_TIME ARRIVAL_DELAY
## 1 1448 404 4 430 408 -22
## 2 2330 737 4 750 741 -9
## 3 2296 800 11 806 811 5
## 4 2342 748 8 805 756 -9
## 5 1448 254 5 320 259 -21
## 6 1589 604 6 602 610 8
## DIVERTED CANCELLED CANCELLATION_REASON AIR_SYSTEM_DELAY SECURITY_DELAY
## 1 0 0 NA NA
## 2 0 0 NA NA
## 3 0 0 NA NA
## 4 0 0 NA NA
## 5 0 0 NA NA
## 6 0 0 NA NA
## AIRLINE_DELAY LATE_AIRCRAFT_DELAY WEATHER_DELAY
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## 6 NA NA NA
require(dplyr)
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 3.4.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
require(repr)
## Loading required package: repr
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'repr'
require(ggplot2)
## Loading required package: ggplot2
require(gridExtra)
## Loading required package: gridExtra
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
require(glmnet)
## Loading required package: glmnet
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'glmnet'
# Keep only the date, carrier, route, schedule and delay columns.
flights.delay.sub <- Flights.data[c(
  "YEAR", "MONTH", "DAY", "DAY_OF_WEEK", "AIRLINE",
  "ORIGIN_AIRPORT", "DESTINATION_AIRPORT",
  "SCHEDULED_DEPARTURE", "SCHEDULED_ARRIVAL",
  "DEPARTURE_DELAY", "ARRIVAL_DELAY",
  "AIR_SYSTEM_DELAY", "SECURITY_DELAY", "AIRLINE_DELAY",
  "LATE_AIRCRAFT_DELAY", "WEATHER_DELAY"
)]

# Drop every row that has an NA in any of the selected columns.
flights.delay.sub <- flights.delay.sub[
  complete.cases(flights.delay.sub), ,
  drop = FALSE
]

# Restrict to the LAX -> SEA route (rows where the test is NA are excluded).
flights.delay.sub.orig.lax.dest.sea <- subset(
  flights.delay.sub,
  ORIGIN_AIRPORT == "LAX" & DESTINATION_AIRPORT == "SEA"
)

# Numeric delay columns used by the correlation / covariance calls below.
numcols <- c(
  "DEPARTURE_DELAY", "ARRIVAL_DELAY", "AIR_SYSTEM_DELAY", "SECURITY_DELAY",
  "AIRLINE_DELAY", "LATE_AIRCRAFT_DELAY", "WEATHER_DELAY"
)

head(flights.delay.sub.orig.lax.dest.sea)
## YEAR MONTH DAY DAY_OF_WEEK AIRLINE ORIGIN_AIRPORT
## 6135 2015 1 1 4 AS LAX
## 9942 2015 1 1 4 VX LAX
## 13707 2015 1 1 4 AS LAX
## 17449 2015 1 2 5 AS LAX
## 19327 2015 1 2 5 AS LAX
## 21837 2015 1 2 5 AS LAX
## DESTINATION_AIRPORT SCHEDULED_DEPARTURE SCHEDULED_ARRIVAL
## 6135 SEA 1300 1534
## 9942 SEA 1705 1945
## 13707 SEA 2225 106
## 17449 SEA 840 1127
## 19327 SEA 1030 1314
## 21837 SEA 1300 1534
## DEPARTURE_DELAY ARRIVAL_DELAY AIR_SYSTEM_DELAY SECURITY_DELAY
## 6135 82 96 14 0
## 9942 20 16 0 0
## 13707 62 60 0 0
## 17449 104 105 1 0
## 19327 8 28 20 0
## 21837 0 19 19 0
## AIRLINE_DELAY LATE_AIRCRAFT_DELAY WEATHER_DELAY
## 6135 0 82 0
## 9942 16 0 0
## 13707 0 60 0
## 17449 104 0 0
## 19327 0 8 0
## 21837 0 0 0
# Per-column summary of the LAX -> SEA subset.
summary(flights.delay.sub.orig.lax.dest.sea)
## YEAR MONTH DAY DAY_OF_WEEK
## Min. :2015 Min. : 1.000 Min. : 1.00 Min. :1.000
## 1st Qu.:2015 1st Qu.: 4.000 1st Qu.: 9.00 1st Qu.:2.000
## Median :2015 Median : 8.000 Median :16.00 Median :4.000
## Mean :2015 Mean : 7.185 Mean :15.82 Mean :3.898
## 3rd Qu.:2015 3rd Qu.:11.000 3rd Qu.:22.00 3rd Qu.:5.000
## Max. :2015 Max. :12.000 Max. :31.00 Max. :7.000
##
## AIRLINE ORIGIN_AIRPORT DESTINATION_AIRPORT SCHEDULED_DEPARTURE
## AS :618 LAX :1221 SEA :1221 Min. : 600
## DL :250 10135 : 0 10135 : 0 1st Qu.:1050
## VX :212 10136 : 0 10136 : 0 Median :1515
## OO :123 10140 : 0 10140 : 0 Mean :1460
## UA : 17 10141 : 0 10141 : 0 3rd Qu.:1840
## B6 : 1 10146 : 0 10146 : 0 Max. :2355
## (Other): 0 (Other): 0 (Other): 0
## SCHEDULED_ARRIVAL DEPARTURE_DELAY ARRIVAL_DELAY AIR_SYSTEM_DELAY
## Min. : 1 Min. :-12.00 Min. : 15.00 Min. : 0.00
## 1st Qu.:1255 1st Qu.: 8.00 1st Qu.: 21.00 1st Qu.: 0.00
## Median :1719 Median : 27.00 Median : 31.00 Median : 6.00
## Mean :1621 Mean : 37.68 Mean : 44.03 Mean : 11.26
## 3rd Qu.:2100 3rd Qu.: 52.00 3rd Qu.: 52.00 3rd Qu.: 18.00
## Max. :2358 Max. :444.00 Max. :451.00 Max. :103.00
##
## SECURITY_DELAY AIRLINE_DELAY LATE_AIRCRAFT_DELAY WEATHER_DELAY
## Min. : 0.000 Min. : 0.00 Min. : 0.0 Min. : 0.0000
## 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 0.0 1st Qu.: 0.0000
## Median : 0.000 Median : 0.00 Median : 0.0 Median : 0.0000
## Mean : 0.086 Mean : 11.61 Mean : 20.3 Mean : 0.7682
## 3rd Qu.: 0.000 3rd Qu.: 8.00 3rd Qu.: 29.0 3rd Qu.: 0.0000
## Max. :38.000 Max. :334.00 Max. :444.0 Max. :128.0000
##
# Compact display of the subset's structure (column types and leading values).
str(flights.delay.sub.orig.lax.dest.sea)
## 'data.frame': 1221 obs. of 16 variables:
## $ YEAR : int 2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
## $ MONTH : int 1 1 1 1 1 1 1 1 1 1 ...
## $ DAY : int 1 1 1 2 2 2 2 2 2 2 ...
## $ DAY_OF_WEEK : int 4 4 4 5 5 5 5 5 5 5 ...
## $ AIRLINE : Factor w/ 14 levels "AA","AS","B6",..: 2 13 2 2 2 2 13 11 2 13 ...
## $ ORIGIN_AIRPORT : Factor w/ 628 levels "10135","10136",..: 483 483 483 483 483 483 483 483 483 483 ...
## $ DESTINATION_AIRPORT: Factor w/ 629 levels "10135","10136",..: 585 585 585 585 585 585 585 585 585 585 ...
## $ SCHEDULED_DEPARTURE: int 1300 1705 2225 840 1030 1300 1705 1935 1940 1955 ...
## $ SCHEDULED_ARRIVAL : int 1534 1945 106 1127 1314 1534 1945 2221 2220 2230 ...
## $ DEPARTURE_DELAY : int 82 20 62 104 8 0 20 32 12 6 ...
## $ ARRIVAL_DELAY : int 96 16 60 105 28 19 20 41 31 24 ...
## $ AIR_SYSTEM_DELAY : int 14 0 0 1 20 19 4 9 19 0 ...
## $ SECURITY_DELAY : int 0 0 0 0 0 0 0 0 0 0 ...
## $ AIRLINE_DELAY : int 0 16 0 104 0 0 0 8 12 24 ...
## $ LATE_AIRCRAFT_DELAY: int 82 0 60 0 8 0 16 24 0 0 ...
## $ WEATHER_DELAY : int 0 0 0 0 0 0 0 0 0 0 ...
# Correlation matrix of the delay columns listed in `numcols`
# (cor() default method).
cor(flights.delay.sub.orig.lax.dest.sea[numcols])
## DEPARTURE_DELAY ARRIVAL_DELAY AIR_SYSTEM_DELAY
## DEPARTURE_DELAY 1.000000000 0.944965755 -0.26801529
## ARRIVAL_DELAY 0.944965755 1.000000000 -0.06030859
## AIR_SYSTEM_DELAY -0.268015286 -0.060308594 1.00000000
## SECURITY_DELAY -0.005092528 -0.006099835 -0.02887722
## AIRLINE_DELAY 0.549427204 0.549944125 -0.18845956
## LATE_AIRCRAFT_DELAY 0.680104218 0.652664761 -0.30548226
## WEATHER_DELAY 0.048874405 0.064121013 -0.05445003
## SECURITY_DELAY AIRLINE_DELAY LATE_AIRCRAFT_DELAY
## DEPARTURE_DELAY -0.005092528 0.54942720 0.68010422
## ARRIVAL_DELAY -0.006099835 0.54994412 0.65266476
## AIR_SYSTEM_DELAY -0.028877223 -0.18845956 -0.30548226
## SECURITY_DELAY 1.000000000 -0.01734383 -0.02433607
## AIRLINE_DELAY -0.017343826 1.00000000 -0.16159407
## LATE_AIRCRAFT_DELAY -0.024336074 -0.16159407 1.00000000
## WEATHER_DELAY -0.006465666 -0.04121311 -0.05107731
## WEATHER_DELAY
## DEPARTURE_DELAY 0.048874405
## ARRIVAL_DELAY 0.064121013
## AIR_SYSTEM_DELAY -0.054450030
## SECURITY_DELAY -0.006465666
## AIRLINE_DELAY -0.041213113
## LATE_AIRCRAFT_DELAY -0.051077308
## WEATHER_DELAY 1.000000000
# Covariance matrix of the same delay columns.
cov(flights.delay.sub.orig.lax.dest.sea[numcols])
## DEPARTURE_DELAY ARRIVAL_DELAY AIR_SYSTEM_DELAY
## DEPARTURE_DELAY 1974.2148763 1646.9518636 -172.9264463
## ARRIVAL_DELAY 1646.9518636 1538.6337388 -34.3519562
## AIR_SYSTEM_DELAY -172.9264463 -34.3519562 210.8673232
## SECURITY_DELAY -0.3633524 -0.3842228 -0.6733758
## AIRLINE_DELAY 739.7570293 653.6841033 -82.9286388
## LATE_AIRCRAFT_DELAY 1066.6558270 903.6690794 -156.5821572
## WEATHER_DELAY 13.8288060 16.0167351 -5.0351076
## SECURITY_DELAY AIRLINE_DELAY LATE_AIRCRAFT_DELAY
## DEPARTURE_DELAY -0.36335240 739.7570293 1066.65583
## ARRIVAL_DELAY -0.38422282 653.6841033 903.66908
## AIR_SYSTEM_DELAY -0.67337576 -82.9286388 -156.58216
## SECURITY_DELAY 2.57866436 -0.8439642 -1.37943
## AIRLINE_DELAY -0.84396423 918.2553403 -172.84578
## LATE_AIRCRAFT_DELAY -1.37942965 -172.8457774 1245.95760
## WEATHER_DELAY -0.06611753 -7.9528564 -11.48116
## WEATHER_DELAY
## DEPARTURE_DELAY 13.82880600
## ARRIVAL_DELAY 16.01673514
## AIR_SYSTEM_DELAY -5.03510761
## SECURITY_DELAY -0.06611753
## AIRLINE_DELAY -7.95285643
## LATE_AIRCRAFT_DELAY -11.48115627
## WEATHER_DELAY 40.55197299
# Request a 12 x 12 plot area via the repr.plot.* options.
options(repr.plot.height = 12, repr.plot.width = 12)
require(car)
## Loading required package: car
## Warning: package 'car' was built under R version 3.4.3
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
# Pairwise scatterplot matrix of the seven delay measures.
# Columns are named bare in the formula so that `data =` resolves them and the
# panel labels are the short column names rather than full `df$column`
# expressions. Warnings from the call are suppressed.
suppressWarnings(
  scatterplotMatrix(
    ~ DEPARTURE_DELAY + ARRIVAL_DELAY + AIR_SYSTEM_DELAY + SECURITY_DELAY +
      AIRLINE_DELAY + LATE_AIRCRAFT_DELAY + WEATHER_DELAY,
    data = flights.delay.sub.orig.lax.dest.sea
  )
)
# Request a 12 x 6 plot area via the repr.plot.* options.
options(repr.plot.height = 6, repr.plot.width = 12)
library(ellipse)
##
## Attaching package: 'ellipse'
## The following object is masked from 'package:car':
##
## ellipse
# Kendall rank correlation between the delay columns in `numcols`.
# The method name is written out in full instead of relying on partial
# matching of a misspelt abbreviation.
R <- cor(flights.delay.sub.orig.lax.dest.sea[, numcols], method = "kendall")
print(R)
## DEPARTURE_DELAY ARRIVAL_DELAY AIR_SYSTEM_DELAY
## DEPARTURE_DELAY 1.00000000 0.638419794 -0.395051900
## ARRIVAL_DELAY 0.63841979 1.000000000 -0.069167017
## AIR_SYSTEM_DELAY -0.39505190 -0.069167017 1.000000000
## SECURITY_DELAY 0.00494000 0.012005478 -0.002923588
## AIRLINE_DELAY 0.25712412 0.161440788 -0.294412287
## LATE_AIRCRAFT_DELAY 0.49181928 0.372501994 -0.377238395
## WEATHER_DELAY -0.01706855 0.004183648 -0.067211387
## SECURITY_DELAY AIRLINE_DELAY LATE_AIRCRAFT_DELAY
## DEPARTURE_DELAY 0.004940000 0.257124119 0.49181928
## ARRIVAL_DELAY 0.012005478 0.161440788 0.37250199
## AIR_SYSTEM_DELAY -0.002923588 -0.294412287 -0.37723840
## SECURITY_DELAY 1.000000000 -0.004308466 -0.01642175
## AIRLINE_DELAY -0.004308466 1.000000000 -0.20565278
## LATE_AIRCRAFT_DELAY -0.016421746 -0.205652777 1.00000000
## WEATHER_DELAY -0.012135734 -0.060608298 -0.06160339
## WEATHER_DELAY
## DEPARTURE_DELAY -0.017068551
## ARRIVAL_DELAY 0.004183648
## AIR_SYSTEM_DELAY -0.067211387
## SECURITY_DELAY -0.012135734
## AIRLINE_DELAY -0.060608298
## LATE_AIRCRAFT_DELAY -0.061603394
## WEATHER_DELAY 1.000000000
# Draw the correlation matrix as ellipses coloured by correlation value.
# plotcorr() recycles `col` over the cells in order, so a bare palette would
# colour cells by position. Indexing an 11-step palette with round(5 * R + 6)
# maps -1 to firebrick3, 0 to white and +1 to navy.
plotcorr(
  R,
  col = colorRampPalette(c("firebrick3", "white", "navy"))(11)[round(5 * R + 6)]
)
# Second working set: date, carrier, route and the two overall delay columns.
flights.delay.sub.new <- Flights.data[c(
  "YEAR", "MONTH", "DAY", "DAY_OF_WEEK", "AIRLINE",
  "ORIGIN_AIRPORT", "DESTINATION_AIRPORT",
  "DEPARTURE_DELAY", "ARRIVAL_DELAY"
)]
head(flights.delay.sub.new)
## YEAR MONTH DAY DAY_OF_WEEK AIRLINE ORIGIN_AIRPORT DESTINATION_AIRPORT
## 1 2015 1 1 4 AS ANC SEA
## 2 2015 1 1 4 AA LAX PBI
## 3 2015 1 1 4 US SFO CLT
## 4 2015 1 1 4 AA LAX MIA
## 5 2015 1 1 4 AS SEA ANC
## 6 2015 1 1 4 DL SFO MSP
## DEPARTURE_DELAY ARRIVAL_DELAY
## 1 -11 -22
## 2 -8 -9
## 3 -2 5
## 4 -5 -9
## 5 -1 -21
## 6 -5 8
# Drop rows with an NA in any of the selected columns.
flights.delay.sub.new <- flights.delay.sub.new[
  complete.cases(flights.delay.sub.new), ,
  drop = FALSE
]

# Rebuild the LAX -> SEA subset from the new working set.
flights.delay.sub.orig.lax.dest.sea <- subset(
  flights.delay.sub.new,
  ORIGIN_AIRPORT == "LAX" & DESTINATION_AIRPORT == "SEA"
)

# Only the two overall delay columns remain numeric measures here.
numcols <- c("DEPARTURE_DELAY", "ARRIVAL_DELAY")

# Assemble a Date column from the separate year / month / day integers.
flights.delay.sub.orig.lax.dest.sea$DATE <- with(
  flights.delay.sub.orig.lax.dest.sea,
  as.Date(paste(YEAR, MONTH, DAY, sep = "."), format = "%Y.%m.%d")
)
## Warning in strptime(x, format, tz = "GMT"): unknown timezone 'zone/tz/
## 2017c.1.0/zoneinfo/Asia/Shanghai'
# Preview the rebuilt LAX -> SEA subset, now including the DATE column.
head(flights.delay.sub.orig.lax.dest.sea)
## YEAR MONTH DAY DAY_OF_WEEK AIRLINE ORIGIN_AIRPORT DESTINATION_AIRPORT
## 233 2015 1 1 4 AS LAX SEA
## 896 2015 1 1 4 AS LAX SEA
## 2183 2015 1 1 4 AS LAX SEA
## 2258 2015 1 1 4 DL LAX SEA
## 3202 2015 1 1 4 OO LAX SEA
## 3799 2015 1 1 4 AS LAX SEA
## DEPARTURE_DELAY ARRIVAL_DELAY DATE
## 233 -5 -9 2015-01-01
## 896 -5 -15 2015-01-01
## 2183 -1 -7 2015-01-01
## 2258 -3 -13 2015-01-01
## 3202 -13 -12 2015-01-01
## 3799 -7 -18 2015-01-01
# Per-column summary of the rebuilt LAX -> SEA subset.
summary(flights.delay.sub.orig.lax.dest.sea)
## YEAR MONTH DAY DAY_OF_WEEK
## Min. :2015 Min. : 1.000 Min. : 1.00 Min. :1.000
## 1st Qu.:2015 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.:2.000
## Median :2015 Median : 7.000 Median :16.00 Median :4.000
## Mean :2015 Mean : 6.611 Mean :15.69 Mean :3.911
## 3rd Qu.:2015 3rd Qu.: 9.000 3rd Qu.:23.00 3rd Qu.:6.000
## Max. :2015 Max. :12.000 Max. :31.00 Max. :7.000
##
## AIRLINE ORIGIN_AIRPORT DESTINATION_AIRPORT DEPARTURE_DELAY
## AS :4582 LAX :7699 SEA :7699 Min. :-25.00
## DL :1644 10135 : 0 10135 : 0 1st Qu.: -6.00
## VX : 889 10136 : 0 10136 : 0 Median : -3.00
## OO : 495 10140 : 0 10140 : 0 Mean : 3.79
## UA : 88 10141 : 0 10141 : 0 3rd Qu.: 2.00
## B6 : 1 10146 : 0 10146 : 0 Max. :444.00
## (Other): 0 (Other): 0 (Other): 0
## ARRIVAL_DELAY DATE
## Min. :-48.000 Min. :2015-01-01
## 1st Qu.:-12.000 1st Qu.:2015-04-14
## Median : -4.000 Median :2015-07-03
## Mean : 1.495 Mean :2015-07-04
## 3rd Qu.: 7.000 3rd Qu.:2015-09-17
## Max. :451.000 Max. :2015-12-31
##
# Compact display of the rebuilt subset's structure.
str(flights.delay.sub.orig.lax.dest.sea)
## 'data.frame': 7699 obs. of 10 variables:
## $ YEAR : int 2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
## $ MONTH : int 1 1 1 1 1 1 1 1 1 1 ...
## $ DAY : int 1 1 1 1 1 1 1 1 1 1 ...
## $ DAY_OF_WEEK : int 4 4 4 4 4 4 4 4 4 4 ...
## $ AIRLINE : Factor w/ 14 levels "AA","AS","B6",..: 2 2 2 4 10 2 13 2 4 2 ...
## $ ORIGIN_AIRPORT : Factor w/ 628 levels "10135","10136",..: 483 483 483 483 483 483 483 483 483 483 ...
## $ DESTINATION_AIRPORT: Factor w/ 629 levels "10135","10136",..: 585 585 585 585 585 585 585 585 585 585 ...
## $ DEPARTURE_DELAY : int -5 -5 -1 -3 -13 -7 -1 82 6 -2 ...
## $ ARRIVAL_DELAY : int -9 -15 -7 -13 -12 -18 -7 96 -3 -7 ...
## $ DATE : Date, format: "2015-01-01" "2015-01-01" ...
library(ggplot2)
# Density of arrival delay over all LAX -> SEA flights.
# The title describes what is drawn: one pooled density curve, with no
# per-airline split.
ggplot(flights.delay.sub.orig.lax.dest.sea, aes(ARRIVAL_DELAY)) +
  geom_density() +
  xlab("Arrival Delay in minutes") +
  ylab("Density of Arrival Delay") +
  ggtitle("Density chart of arrival delay for 2015 flights from LAX to SEA")
# Request a 12 x 8 plot area via the repr.plot.* options.
options(repr.plot.height = 8, repr.plot.width = 12)

# Box plot of arrival delay, one box per airline.
ggplot(
  flights.delay.sub.orig.lax.dest.sea,
  aes(x = factor(AIRLINE), y = ARRIVAL_DELAY)
) +
  geom_boxplot() +
  labs(
    x = "Airline Abbreviations",
    y = "Arrival Delay in minutes",
    title = "Arrival delay by Airline for 2015 flights from LAX to SEA"
  )
# Violin plot of arrival delay per airline, with quartile lines drawn inside
# each violin.
ggplot(
  flights.delay.sub.orig.lax.dest.sea,
  aes(x = factor(AIRLINE), y = ARRIVAL_DELAY)
) +
  geom_violin(draw_quantiles = c(0.25, 0.5, 0.75), trim = TRUE) +
  labs(
    x = "Airline Abbreviations",
    y = "Arrival Delay in minutes",
    title = "Arrival delay by Airline for 2015 flights from LAX to SEA"
  )
# Departure delay against arrival delay, coloured by airline and sized by
# arrival delay.
# `alpha` is a fixed setting, so it is passed outside aes(); inside aes() it
# would be treated as a mapped variable and add an "alpha 0.5" legend.
ggplot(
  flights.delay.sub.orig.lax.dest.sea,
  aes(DEPARTURE_DELAY, ARRIVAL_DELAY)
) +
  geom_point(
    aes(color = factor(AIRLINE), size = ARRIVAL_DELAY),
    alpha = 0.5
  ) +
  xlab("Departure delay in minutes") +
  ylab("Arrival delay in minutes") +
  ggtitle("Relationship between Departure delay and Arrival delay with Airline shown")
# Arrival delay by day of week, with airline shown through both colour and
# point shape and point size following arrival delay.
# `alpha` is a fixed setting, so it is passed outside aes() to avoid a
# spurious legend entry. The title names the variables actually plotted
# (day of week on x, arrival delay on y).
ggplot(
  flights.delay.sub.orig.lax.dest.sea,
  aes(DAY_OF_WEEK, ARRIVAL_DELAY)
) +
  geom_point(
    aes(
      color = factor(AIRLINE),
      shape = factor(AIRLINE),
      size = ARRIVAL_DELAY
    ),
    alpha = 0.5
  ) +
  xlab("Day of week in numbers (MON=1...SUN=7)") +
  ylab("Arrival delay in minutes") +
  ggtitle("Relationship between Day of week and Arrival delay with Airline shown")