The following code examines the hflights data and looks for variables that are most strongly linked to departure delays

require(hflights)
## Loading required package: hflights
## Warning: package 'hflights' was built under R version 3.1.3
require(ggplot2)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.1.3
require(Rcpp)
## Loading required package: Rcpp
## Warning: package 'Rcpp' was built under R version 3.1.3
require(dplyr)
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 3.1.3
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
data(hflights)
summary(hflights)
##       Year          Month          DayofMonth      DayOfWeek    
##  Min.   :2011   Min.   : 1.000   Min.   : 1.00   Min.   :1.000  
##  1st Qu.:2011   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.:2.000  
##  Median :2011   Median : 7.000   Median :16.00   Median :4.000  
##  Mean   :2011   Mean   : 6.514   Mean   :15.74   Mean   :3.948  
##  3rd Qu.:2011   3rd Qu.: 9.000   3rd Qu.:23.00   3rd Qu.:6.000  
##  Max.   :2011   Max.   :12.000   Max.   :31.00   Max.   :7.000  
##                                                                 
##     DepTime        ArrTime     UniqueCarrier        FlightNum   
##  Min.   :   1   Min.   :   1   Length:227496      Min.   :   1  
##  1st Qu.:1021   1st Qu.:1215   Class :character   1st Qu.: 855  
##  Median :1416   Median :1617   Mode  :character   Median :1696  
##  Mean   :1396   Mean   :1578                      Mean   :1962  
##  3rd Qu.:1801   3rd Qu.:1953                      3rd Qu.:2755  
##  Max.   :2400   Max.   :2400                      Max.   :7290  
##  NA's   :2905   NA's   :3066                                    
##    TailNum          ActualElapsedTime    AirTime         ArrDelay      
##  Length:227496      Min.   : 34.0     Min.   : 11.0   Min.   :-70.000  
##  Class :character   1st Qu.: 77.0     1st Qu.: 58.0   1st Qu.: -8.000  
##  Mode  :character   Median :128.0     Median :107.0   Median :  0.000  
##                     Mean   :129.3     Mean   :108.1   Mean   :  7.094  
##                     3rd Qu.:165.0     3rd Qu.:141.0   3rd Qu.: 11.000  
##                     Max.   :575.0     Max.   :549.0   Max.   :978.000  
##                     NA's   :3622      NA's   :3622    NA's   :3622     
##     DepDelay          Origin              Dest              Distance     
##  Min.   :-33.000   Length:227496      Length:227496      Min.   :  79.0  
##  1st Qu.: -3.000   Class :character   Class :character   1st Qu.: 376.0  
##  Median :  0.000   Mode  :character   Mode  :character   Median : 809.0  
##  Mean   :  9.445                                         Mean   : 787.8  
##  3rd Qu.:  9.000                                         3rd Qu.:1042.0  
##  Max.   :981.000                                         Max.   :3904.0  
##  NA's   :2905                                                            
##      TaxiIn           TaxiOut         Cancelled       CancellationCode  
##  Min.   :  1.000   Min.   :  1.00   Min.   :0.00000   Length:227496     
##  1st Qu.:  4.000   1st Qu.: 10.00   1st Qu.:0.00000   Class :character  
##  Median :  5.000   Median : 14.00   Median :0.00000   Mode  :character  
##  Mean   :  6.099   Mean   : 15.09   Mean   :0.01307                     
##  3rd Qu.:  7.000   3rd Qu.: 18.00   3rd Qu.:0.00000                     
##  Max.   :165.000   Max.   :163.00   Max.   :1.00000                     
##  NA's   :3066      NA's   :2947                                         
##     Diverted       
##  Min.   :0.000000  
##  1st Qu.:0.000000  
##  Median :0.000000  
##  Mean   :0.002853  
##  3rd Qu.:0.000000  
##  Max.   :1.000000  
## 
head(hflights)
##      Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## 5424 2011     1          1         6    1400    1500            AA
## 5425 2011     1          2         7    1401    1501            AA
## 5426 2011     1          3         1    1352    1502            AA
## 5427 2011     1          4         2    1403    1513            AA
## 5428 2011     1          5         3    1405    1507            AA
## 5429 2011     1          6         4    1359    1503            AA
##      FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin
## 5424       428  N576AA                60      40      -10        0    IAH
## 5425       428  N557AA                60      45       -9        1    IAH
## 5426       428  N541AA                70      48       -8       -8    IAH
## 5427       428  N403AA                70      39        3        3    IAH
## 5428       428  N492AA                62      44       -3        5    IAH
## 5429       428  N262AA                64      45       -7       -1    IAH
##      Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 5424  DFW      224      7      13         0                         0
## 5425  DFW      224      6       9         0                         0
## 5426  DFW      224      5      17         0                         0
## 5427  DFW      224      9      22         0                         0
## 5428  DFW      224      9       9         0                         0
## 5429  DFW      224      6      13         0                         0
#boxplot DepDelay: a lot of outliers
boxplot(hflights$DepDelay)

#histogram DepDelay: right tail visible to about 200 minutes
ggplot(data = hflights) + geom_histogram(aes(x = DepDelay), bin = 10, fill = 'grey30')

#create a data.frame with data less than 200 minute departure delays and no negatives (departed early)
hf_depdelay <- hflights[which(hflights$DepDelay < 200 & hflights$DepDelay > 0),  ]

#check summary data again
summary(hf_depdelay)
##       Year          Month          DayofMonth      DayOfWeek    
##  Min.   :2011   Min.   : 1.000   Min.   : 1.00   Min.   :1.000  
##  1st Qu.:2011   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.:2.000  
##  Median :2011   Median : 6.000   Median :16.00   Median :4.000  
##  Mean   :2011   Mean   : 6.423   Mean   :15.89   Mean   :3.975  
##  3rd Qu.:2011   3rd Qu.: 9.000   3rd Qu.:23.00   3rd Qu.:6.000  
##  Max.   :2011   Max.   :12.000   Max.   :31.00   Max.   :7.000  
##                                                                 
##     DepTime        ArrTime     UniqueCarrier        FlightNum   
##  Min.   :   1   Min.   :   1   Length:109204      Min.   :   1  
##  1st Qu.:1150   1st Qu.:1348   Class :character   1st Qu.: 667  
##  Median :1531   Median :1713   Mode  :character   Median :1552  
##  Mean   :1497   Mean   :1660                      Mean   :1745  
##  3rd Qu.:1853   3rd Qu.:2032                      3rd Qu.:2481  
##  Max.   :2359   Max.   :2400                      Max.   :7240  
##                 NA's   :94                                      
##    TailNum          ActualElapsedTime    AirTime         ArrDelay     
##  Length:109204      Min.   : 34.0     Min.   : 22.0   Min.   :-51.00  
##  Class :character   1st Qu.: 80.0     1st Qu.: 61.0   1st Qu.: -1.00  
##  Mode  :character   Median :133.0     Median :112.0   Median :  9.00  
##                     Mean   :135.5     Mean   :114.3   Mean   : 17.96  
##                     3rd Qu.:177.0     3rd Qu.:153.0   3rd Qu.: 25.00  
##                     Max.   :575.0     Max.   :549.0   Max.   :290.00  
##                     NA's   :417       NA's   :417     NA's   :417     
##     DepDelay         Origin              Dest              Distance   
##  Min.   :  1.00   Length:109204      Length:109204      Min.   :  79  
##  1st Qu.:  4.00   Class :character   Class :character   1st Qu.: 419  
##  Median : 10.00   Mode  :character   Mode  :character   Median : 853  
##  Mean   : 20.77                                         Mean   : 839  
##  3rd Qu.: 25.00                                         3rd Qu.:1195  
##  Max.   :199.00                                         Max.   :3904  
##                                                                       
##      TaxiIn           TaxiOut         Cancelled         CancellationCode  
##  Min.   :  1.000   Min.   :  2.00   Min.   :0.0000000   Length:109204     
##  1st Qu.:  4.000   1st Qu.: 10.00   1st Qu.:0.0000000   Class :character  
##  Median :  5.000   Median : 14.00   Median :0.0000000   Mode  :character  
##  Mean   :  6.121   Mean   : 15.09   Mean   :0.0003388                     
##  3rd Qu.:  7.000   3rd Qu.: 18.00   3rd Qu.:0.0000000                     
##  Max.   :165.000   Max.   :163.00   Max.   :1.0000000                     
##  NA's   :94        NA's   :19                                             
##     Diverted      
##  Min.   :0.00000  
##  1st Qu.:0.00000  
##  Median :0.00000  
##  Mean   :0.00348  
##  3rd Qu.:0.00000  
##  Max.   :1.00000  
## 
str(hf_depdelay)
## 'data.frame':    109204 obs. of  21 variables:
##  $ Year             : int  2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
##  $ Month            : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ DayofMonth       : int  2 4 5 9 10 11 12 17 18 20 ...
##  $ DayOfWeek        : int  7 2 3 7 1 2 3 1 2 4 ...
##  $ DepTime          : int  1401 1403 1405 1443 1443 1429 1419 1530 1408 1507 ...
##  $ ArrTime          : int  1501 1513 1507 1554 1553 1539 1515 1634 1508 1622 ...
##  $ UniqueCarrier    : chr  "AA" "AA" "AA" "AA" ...
##  $ FlightNum        : int  428 428 428 428 428 428 428 428 428 428 ...
##  $ TailNum          : chr  "N557AA" "N403AA" "N492AA" "N476AA" ...
##  $ ActualElapsedTime: int  60 70 62 71 70 70 56 64 60 75 ...
##  $ AirTime          : int  45 39 44 41 45 42 41 48 42 42 ...
##  $ ArrDelay         : int  -9 3 -3 44 43 29 5 84 -2 72 ...
##  $ DepDelay         : int  1 3 5 43 43 29 19 90 8 67 ...
##  $ Origin           : chr  "IAH" "IAH" "IAH" "IAH" ...
##  $ Dest             : chr  "DFW" "DFW" "DFW" "DFW" ...
##  $ Distance         : int  224 224 224 224 224 224 224 224 224 224 ...
##  $ TaxiIn           : int  6 9 9 8 6 8 4 8 7 9 ...
##  $ TaxiOut          : int  9 22 9 22 19 20 11 8 11 24 ...
##  $ Cancelled        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CancellationCode : chr  "" "" "" "" ...
##  $ Diverted         : int  0 0 0 0 0 0 0 0 0 0 ...
#check out new histogram
ggplot(data = hf_depdelay) + geom_histogram(aes(x = DepDelay), bin = 10, fill = 'grey30')

#Find some leads by checking correlations among other int variables: 
#ArrDelay has an obvious strong correlation, with weaker correlations among DepTime, ActualElapsedTime, and Airtime 
cor(hf_depdelay$DayofMonth, hf_depdelay$DepDelay, use = "na.or.complete")
## [1] 0.02045733
cor(hf_depdelay$DayOfWeek, hf_depdelay$DepDelay, use = "na.or.complete")
## [1] -0.009090004
cor(hf_depdelay$TaxiIn, hf_depdelay$DepDelay, use = "na.or.complete")
## [1] 0.01204868
cor(hf_depdelay$DepTime, hf_depdelay$DepDelay, use = "na.or.complete")
## [1] 0.2151914
cor(hf_depdelay$ArrTime, hf_depdelay$DepDelay, use = "na.or.complete")
## [1] 0.0202626
cor(hf_depdelay$ActualElapsedTime, hf_depdelay$DepDelay, use = "na.or.complete")
## [1] -0.04395686
cor(hf_depdelay$AirTime, hf_depdelay$DepDelay, use = "na.or.complete")
## [1] -0.04771528
cor(hf_depdelay$ActualElapsedTime, hf_depdelay$DepDelay, use = "na.or.complete")
## [1] -0.04395686
cor(hf_depdelay$ArrDelay, hf_depdelay$DepDelay, use = "na.or.complete")
## [1] 0.9224706
cor(hf_depdelay$Distance, hf_depdelay$DepDelay, use = "na.or.complete")
## [1] -0.04398316
cor(hf_depdelay$TaxiIn, hf_depdelay$DepDelay, use = "na.or.complete")
## [1] 0.01204868
cor(hf_depdelay$TaxiOut, hf_depdelay$DepDelay, use = "na.or.complete")
## [1] 0.005344066
#Scatterplot of ArrDelay by DepDelay
ggplot(hf_depdelay) + geom_point(aes(x = DepDelay, y = ArrDelay))
## Warning: Removed 417 rows containing missing values (geom_point).

#Now check out categorical variables: carrier, destination, and origin
#Carrier first: there's a few main carriers: Continental, Skywest, Southwest, and ExpressJet
ggplot(data = hf_depdelay) + geom_histogram(aes(x = UniqueCarrier), fill = 'grey30')

#density charts by airline: looks like continental is responsible for many of the delays, considering its size
ggplot(data = hf_depdelay) + geom_density(aes(x=DepDelay, fill = UniqueCarrier)) + facet_wrap(~UniqueCarrier)

#Check out just the main airlines: create a data.frame with only the main carriers
hf_mcarriers <- hf_depdelay[which(hf_depdelay$UniqueCarrier == 'CO' | hf_depdelay$UniqueCarrier == 'OO' | hf_depdelay$UniqueCarrier == 'WN' | hf_depdelay$UniqueCarrier == 'XE'),  ]

#still only less than 1% of its flights, but continental has a higher percentage of (at least short) departure delays than others
ggplot(data = hf_mcarriers) + geom_density(aes(x=DepDelay, fill = UniqueCarrier)) + facet_wrap(~UniqueCarrier)

#check out delays by destination: there are a lot
ggplot(data = hf_depdelay) + geom_histogram(aes(x = Dest), fill = 'grey30')

#there's a lot of destinations here, so quickly using sql to get only destinations with over 2500 trips (should have done this in r, but couldn't find code in time.)
#(select * from depdelay where dest in (select dest from (select dest, count(dest) as count from depdelay group by dest) s where s.count > 2500 order by count desc))
#reload the data from sql

thedest <- "C:/Users/Andrew/Desktop/Bridge Courses/R/Final/largdest.csv"
largedests <- read.table (file = thedest, header = TRUE, sep = ";")
head(largedests)
##   depdelay origin dest
## 1        1    IAH  DFW
## 2        3    IAH  DFW
## 3        5    IAH  DFW
## 4       43    IAH  DFW
## 5       43    IAH  DFW
## 6       29    IAH  DFW
#easier to visualize histogram
ggplot(data = largedests) + geom_histogram(aes(x = dest), fill = 'grey30')

#looking at departure delay density charts, it looks like DEN and LAS have the highest proportion of short-term delays
ggplot(data = largedests) + geom_density(aes(x=depdelay, fill = dest)) + facet_wrap(~dest)