The following code examines the hflights data and looks for variables that are most strongly linked to departure delays
require(hflights)
## Loading required package: hflights
## Warning: package 'hflights' was built under R version 3.1.3
require(ggplot2)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.1.3
require(Rcpp)
## Loading required package: Rcpp
## Warning: package 'Rcpp' was built under R version 3.1.3
require(dplyr)
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 3.1.3
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
data(hflights)
summary(hflights)
## Year Month DayofMonth DayOfWeek
## Min. :2011 Min. : 1.000 Min. : 1.00 Min. :1.000
## 1st Qu.:2011 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.:2.000
## Median :2011 Median : 7.000 Median :16.00 Median :4.000
## Mean :2011 Mean : 6.514 Mean :15.74 Mean :3.948
## 3rd Qu.:2011 3rd Qu.: 9.000 3rd Qu.:23.00 3rd Qu.:6.000
## Max. :2011 Max. :12.000 Max. :31.00 Max. :7.000
##
## DepTime ArrTime UniqueCarrier FlightNum
## Min. : 1 Min. : 1 Length:227496 Min. : 1
## 1st Qu.:1021 1st Qu.:1215 Class :character 1st Qu.: 855
## Median :1416 Median :1617 Mode :character Median :1696
## Mean :1396 Mean :1578 Mean :1962
## 3rd Qu.:1801 3rd Qu.:1953 3rd Qu.:2755
## Max. :2400 Max. :2400 Max. :7290
## NA's :2905 NA's :3066
## TailNum ActualElapsedTime AirTime ArrDelay
## Length:227496 Min. : 34.0 Min. : 11.0 Min. :-70.000
## Class :character 1st Qu.: 77.0 1st Qu.: 58.0 1st Qu.: -8.000
## Mode :character Median :128.0 Median :107.0 Median : 0.000
## Mean :129.3 Mean :108.1 Mean : 7.094
## 3rd Qu.:165.0 3rd Qu.:141.0 3rd Qu.: 11.000
## Max. :575.0 Max. :549.0 Max. :978.000
## NA's :3622 NA's :3622 NA's :3622
## DepDelay Origin Dest Distance
## Min. :-33.000 Length:227496 Length:227496 Min. : 79.0
## 1st Qu.: -3.000 Class :character Class :character 1st Qu.: 376.0
## Median : 0.000 Mode :character Mode :character Median : 809.0
## Mean : 9.445 Mean : 787.8
## 3rd Qu.: 9.000 3rd Qu.:1042.0
## Max. :981.000 Max. :3904.0
## NA's :2905
## TaxiIn TaxiOut Cancelled CancellationCode
## Min. : 1.000 Min. : 1.00 Min. :0.00000 Length:227496
## 1st Qu.: 4.000 1st Qu.: 10.00 1st Qu.:0.00000 Class :character
## Median : 5.000 Median : 14.00 Median :0.00000 Mode :character
## Mean : 6.099 Mean : 15.09 Mean :0.01307
## 3rd Qu.: 7.000 3rd Qu.: 18.00 3rd Qu.:0.00000
## Max. :165.000 Max. :163.00 Max. :1.00000
## NA's :3066 NA's :2947
## Diverted
## Min. :0.000000
## 1st Qu.:0.000000
## Median :0.000000
## Mean :0.002853
## 3rd Qu.:0.000000
## Max. :1.000000
##
head(hflights)
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## 5424 2011 1 1 6 1400 1500 AA
## 5425 2011 1 2 7 1401 1501 AA
## 5426 2011 1 3 1 1352 1502 AA
## 5427 2011 1 4 2 1403 1513 AA
## 5428 2011 1 5 3 1405 1507 AA
## 5429 2011 1 6 4 1359 1503 AA
## FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin
## 5424 428 N576AA 60 40 -10 0 IAH
## 5425 428 N557AA 60 45 -9 1 IAH
## 5426 428 N541AA 70 48 -8 -8 IAH
## 5427 428 N403AA 70 39 3 3 IAH
## 5428 428 N492AA 62 44 -3 5 IAH
## 5429 428 N262AA 64 45 -7 -1 IAH
## Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 5424 DFW 224 7 13 0 0
## 5425 DFW 224 6 9 0 0
## 5426 DFW 224 5 17 0 0
## 5427 DFW 224 9 22 0 0
## 5428 DFW 224 9 9 0 0
## 5429 DFW 224 6 13 0 0
#boxplot DepDelay: a lot of outliers
boxplot(hflights$DepDelay)
#histogram DepDelay: right tail visible to about 200 minutes
ggplot(data = hflights) + geom_histogram(aes(x = DepDelay), bin = 10, fill = 'grey30')
#create a data.frame with data less than 200 minute departure delays and no negatives (departed early)
hf_depdelay <- hflights[which(hflights$DepDelay < 200 & hflights$DepDelay > 0), ]
#check summary data again
summary(hf_depdelay)
## Year Month DayofMonth DayOfWeek
## Min. :2011 Min. : 1.000 Min. : 1.00 Min. :1.000
## 1st Qu.:2011 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.:2.000
## Median :2011 Median : 6.000 Median :16.00 Median :4.000
## Mean :2011 Mean : 6.423 Mean :15.89 Mean :3.975
## 3rd Qu.:2011 3rd Qu.: 9.000 3rd Qu.:23.00 3rd Qu.:6.000
## Max. :2011 Max. :12.000 Max. :31.00 Max. :7.000
##
## DepTime ArrTime UniqueCarrier FlightNum
## Min. : 1 Min. : 1 Length:109204 Min. : 1
## 1st Qu.:1150 1st Qu.:1348 Class :character 1st Qu.: 667
## Median :1531 Median :1713 Mode :character Median :1552
## Mean :1497 Mean :1660 Mean :1745
## 3rd Qu.:1853 3rd Qu.:2032 3rd Qu.:2481
## Max. :2359 Max. :2400 Max. :7240
## NA's :94
## TailNum ActualElapsedTime AirTime ArrDelay
## Length:109204 Min. : 34.0 Min. : 22.0 Min. :-51.00
## Class :character 1st Qu.: 80.0 1st Qu.: 61.0 1st Qu.: -1.00
## Mode :character Median :133.0 Median :112.0 Median : 9.00
## Mean :135.5 Mean :114.3 Mean : 17.96
## 3rd Qu.:177.0 3rd Qu.:153.0 3rd Qu.: 25.00
## Max. :575.0 Max. :549.0 Max. :290.00
## NA's :417 NA's :417 NA's :417
## DepDelay Origin Dest Distance
## Min. : 1.00 Length:109204 Length:109204 Min. : 79
## 1st Qu.: 4.00 Class :character Class :character 1st Qu.: 419
## Median : 10.00 Mode :character Mode :character Median : 853
## Mean : 20.77 Mean : 839
## 3rd Qu.: 25.00 3rd Qu.:1195
## Max. :199.00 Max. :3904
##
## TaxiIn TaxiOut Cancelled CancellationCode
## Min. : 1.000 Min. : 2.00 Min. :0.0000000 Length:109204
## 1st Qu.: 4.000 1st Qu.: 10.00 1st Qu.:0.0000000 Class :character
## Median : 5.000 Median : 14.00 Median :0.0000000 Mode :character
## Mean : 6.121 Mean : 15.09 Mean :0.0003388
## 3rd Qu.: 7.000 3rd Qu.: 18.00 3rd Qu.:0.0000000
## Max. :165.000 Max. :163.00 Max. :1.0000000
## NA's :94 NA's :19
## Diverted
## Min. :0.00000
## 1st Qu.:0.00000
## Median :0.00000
## Mean :0.00348
## 3rd Qu.:0.00000
## Max. :1.00000
##
str(hf_depdelay)
## 'data.frame': 109204 obs. of 21 variables:
## $ Year : int 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
## $ Month : int 1 1 1 1 1 1 1 1 1 1 ...
## $ DayofMonth : int 2 4 5 9 10 11 12 17 18 20 ...
## $ DayOfWeek : int 7 2 3 7 1 2 3 1 2 4 ...
## $ DepTime : int 1401 1403 1405 1443 1443 1429 1419 1530 1408 1507 ...
## $ ArrTime : int 1501 1513 1507 1554 1553 1539 1515 1634 1508 1622 ...
## $ UniqueCarrier : chr "AA" "AA" "AA" "AA" ...
## $ FlightNum : int 428 428 428 428 428 428 428 428 428 428 ...
## $ TailNum : chr "N557AA" "N403AA" "N492AA" "N476AA" ...
## $ ActualElapsedTime: int 60 70 62 71 70 70 56 64 60 75 ...
## $ AirTime : int 45 39 44 41 45 42 41 48 42 42 ...
## $ ArrDelay : int -9 3 -3 44 43 29 5 84 -2 72 ...
## $ DepDelay : int 1 3 5 43 43 29 19 90 8 67 ...
## $ Origin : chr "IAH" "IAH" "IAH" "IAH" ...
## $ Dest : chr "DFW" "DFW" "DFW" "DFW" ...
## $ Distance : int 224 224 224 224 224 224 224 224 224 224 ...
## $ TaxiIn : int 6 9 9 8 6 8 4 8 7 9 ...
## $ TaxiOut : int 9 22 9 22 19 20 11 8 11 24 ...
## $ Cancelled : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CancellationCode : chr "" "" "" "" ...
## $ Diverted : int 0 0 0 0 0 0 0 0 0 0 ...
#check out new histogram
ggplot(data = hf_depdelay) + geom_histogram(aes(x = DepDelay), bin = 10, fill = 'grey30')
#Find some leads by checking correlations among other int variables:
#ArrDelay has an obvious strong correlation, with weaker correlations among DepTime, ActualElapsedTime, and Airtime
cor(hf_depdelay$DayofMonth, hf_depdelay$DepDelay, use = "na.or.complete")
## [1] 0.02045733
cor(hf_depdelay$DayOfWeek, hf_depdelay$DepDelay, use = "na.or.complete")
## [1] -0.009090004
cor(hf_depdelay$TaxiIn, hf_depdelay$DepDelay, use = "na.or.complete")
## [1] 0.01204868
cor(hf_depdelay$DepTime, hf_depdelay$DepDelay, use = "na.or.complete")
## [1] 0.2151914
cor(hf_depdelay$ArrTime, hf_depdelay$DepDelay, use = "na.or.complete")
## [1] 0.0202626
cor(hf_depdelay$ActualElapsedTime, hf_depdelay$DepDelay, use = "na.or.complete")
## [1] -0.04395686
cor(hf_depdelay$AirTime, hf_depdelay$DepDelay, use = "na.or.complete")
## [1] -0.04771528
cor(hf_depdelay$ActualElapsedTime, hf_depdelay$DepDelay, use = "na.or.complete")
## [1] -0.04395686
cor(hf_depdelay$ArrDelay, hf_depdelay$DepDelay, use = "na.or.complete")
## [1] 0.9224706
cor(hf_depdelay$Distance, hf_depdelay$DepDelay, use = "na.or.complete")
## [1] -0.04398316
cor(hf_depdelay$TaxiIn, hf_depdelay$DepDelay, use = "na.or.complete")
## [1] 0.01204868
cor(hf_depdelay$TaxiOut, hf_depdelay$DepDelay, use = "na.or.complete")
## [1] 0.005344066
#Scatterplot of ArrDelay by DepDelay
ggplot(hf_depdelay) + geom_point(aes(x = DepDelay, y = ArrDelay))
## Warning: Removed 417 rows containing missing values (geom_point).
#Now check out categorical variables: carrier, destination, and origin
#Carrier first: there's a few main carriers: Continental, Skywest, Southwest, and ExpressJet
ggplot(data = hf_depdelay) + geom_histogram(aes(x = UniqueCarrier), fill = 'grey30')
#density charts by airline: looks like continental is responsible for many of the delays, considering its size
ggplot(data = hf_depdelay) + geom_density(aes(x=DepDelay, fill = UniqueCarrier)) + facet_wrap(~UniqueCarrier)
#Check out just the main airlines: create a data.frame with only the main carriers
hf_mcarriers <- hf_depdelay[which(hf_depdelay$UniqueCarrier == 'CO' | hf_depdelay$UniqueCarrier == 'OO' | hf_depdelay$UniqueCarrier == 'WN' | hf_depdelay$UniqueCarrier == 'XE'), ]
#still only less than 1% of its flights, but continental has a higher percentage of (at least short) departure delays than others
ggplot(data = hf_mcarriers) + geom_density(aes(x=DepDelay, fill = UniqueCarrier)) + facet_wrap(~UniqueCarrier)
#check out delays by destination: there are a lot
ggplot(data = hf_depdelay) + geom_histogram(aes(x = Dest), fill = 'grey30')
#there's a lot of destinations here, so quickly using sql to get only destinations with over 2500 trips (should have done this in r, but couldn't find code in time.)
#(select * from depdelay where dest in (select dest from (select dest, count(dest) as count from depdelay group by dest) s where s.count > 2500 order by count desc))
#reload the data from sql
thedest <- "C:/Users/Andrew/Desktop/Bridge Courses/R/Final/largdest.csv"
largedests <- read.table (file = thedest, header = TRUE, sep = ";")
head(largedests)
## depdelay origin dest
## 1 1 IAH DFW
## 2 3 IAH DFW
## 3 5 IAH DFW
## 4 43 IAH DFW
## 5 43 IAH DFW
## 6 29 IAH DFW
#easier to visualize histogram
ggplot(data = largedests) + geom_histogram(aes(x = dest), fill = 'grey30')
#looking at departure delay density charts, it looks like DEN and LAS have the highest proportion of short-term delays
ggplot(data = largedests) + geom_density(aes(x=depdelay, fill = dest)) + facet_wrap(~dest)