The final project is to perform exploratory data analysis on a dataset named hflights (https://cran.r-project.org/web/packages/hflights/hflights.pdf). It contains data of commercial domestic flights that departed Houston (IAH and HOU) in 2011. The purpose of this project is to show the ability to (1) use R packages, (2) change the shape of data in a data frame, and (3) provide basic summary statistics and graphics as part of my exploratory data analysis.

First of all, I need to install all the packages that I am going to use in this markdown file.

install.packages("ggplot2", repos="http://cran.rstudio.com/")
## Installing package into 'C:/Users/blin261/Documents/R/win-library/3.3'
## (as 'lib' is unspecified)
## package 'ggplot2' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\blin261\AppData\Local\Temp\RtmpiubFXI\downloaded_packages
library(ggplot2)
install.packages("hflights", repos="http://cran.rstudio.com/")
## Installing package into 'C:/Users/blin261/Documents/R/win-library/3.3'
## (as 'lib' is unspecified)
## package 'hflights' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\blin261\AppData\Local\Temp\RtmpiubFXI\downloaded_packages
library(hflights)
install.packages("dplyr", repos="http://cran.rstudio.com/")
## Installing package into 'C:/Users/blin261/Documents/R/win-library/3.3'
## (as 'lib' is unspecified)
## package 'dplyr' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\blin261\AppData\Local\Temp\RtmpiubFXI\downloaded_packages
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

I take a look at the data first before I get started.

head(hflights)
##      Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## 5424 2011     1          1         6    1400    1500            AA
## 5425 2011     1          2         7    1401    1501            AA
## 5426 2011     1          3         1    1352    1502            AA
## 5427 2011     1          4         2    1403    1513            AA
## 5428 2011     1          5         3    1405    1507            AA
## 5429 2011     1          6         4    1359    1503            AA
##      FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin
## 5424       428  N576AA                60      40      -10        0    IAH
## 5425       428  N557AA                60      45       -9        1    IAH
## 5426       428  N541AA                70      48       -8       -8    IAH
## 5427       428  N403AA                70      39        3        3    IAH
## 5428       428  N492AA                62      44       -3        5    IAH
## 5429       428  N262AA                64      45       -7       -1    IAH
##      Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 5424  DFW      224      7      13         0                         0
## 5425  DFW      224      6       9         0                         0
## 5426  DFW      224      5      17         0                         0
## 5427  DFW      224      9      22         0                         0
## 5428  DFW      224      9       9         0                         0
## 5429  DFW      224      6      13         0                         0
str(hflights)
## 'data.frame':    227496 obs. of  21 variables:
##  $ Year             : int  2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
##  $ Month            : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ DayofMonth       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ DayOfWeek        : int  6 7 1 2 3 4 5 6 7 1 ...
##  $ DepTime          : int  1400 1401 1352 1403 1405 1359 1359 1355 1443 1443 ...
##  $ ArrTime          : int  1500 1501 1502 1513 1507 1503 1509 1454 1554 1553 ...
##  $ UniqueCarrier    : chr  "AA" "AA" "AA" "AA" ...
##  $ FlightNum        : int  428 428 428 428 428 428 428 428 428 428 ...
##  $ TailNum          : chr  "N576AA" "N557AA" "N541AA" "N403AA" ...
##  $ ActualElapsedTime: int  60 60 70 70 62 64 70 59 71 70 ...
##  $ AirTime          : int  40 45 48 39 44 45 43 40 41 45 ...
##  $ ArrDelay         : int  -10 -9 -8 3 -3 -7 -1 -16 44 43 ...
##  $ DepDelay         : int  0 1 -8 3 5 -1 -1 -5 43 43 ...
##  $ Origin           : chr  "IAH" "IAH" "IAH" "IAH" ...
##  $ Dest             : chr  "DFW" "DFW" "DFW" "DFW" ...
##  $ Distance         : int  224 224 224 224 224 224 224 224 224 224 ...
##  $ TaxiIn           : int  7 6 5 9 9 6 12 7 8 6 ...
##  $ TaxiOut          : int  13 9 17 22 9 13 15 12 22 19 ...
##  $ Cancelled        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CancellationCode : chr  "" "" "" "" ...
##  $ Diverted         : int  0 0 0 0 0 0 0 0 0 0 ...
summary(hflights)
##       Year          Month          DayofMonth      DayOfWeek    
##  Min.   :2011   Min.   : 1.000   Min.   : 1.00   Min.   :1.000  
##  1st Qu.:2011   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.:2.000  
##  Median :2011   Median : 7.000   Median :16.00   Median :4.000  
##  Mean   :2011   Mean   : 6.514   Mean   :15.74   Mean   :3.948  
##  3rd Qu.:2011   3rd Qu.: 9.000   3rd Qu.:23.00   3rd Qu.:6.000  
##  Max.   :2011   Max.   :12.000   Max.   :31.00   Max.   :7.000  
##                                                                 
##     DepTime        ArrTime     UniqueCarrier        FlightNum   
##  Min.   :   1   Min.   :   1   Length:227496      Min.   :   1  
##  1st Qu.:1021   1st Qu.:1215   Class :character   1st Qu.: 855  
##  Median :1416   Median :1617   Mode  :character   Median :1696  
##  Mean   :1396   Mean   :1578                      Mean   :1962  
##  3rd Qu.:1801   3rd Qu.:1953                      3rd Qu.:2755  
##  Max.   :2400   Max.   :2400                      Max.   :7290  
##  NA's   :2905   NA's   :3066                                    
##    TailNum          ActualElapsedTime    AirTime         ArrDelay      
##  Length:227496      Min.   : 34.0     Min.   : 11.0   Min.   :-70.000  
##  Class :character   1st Qu.: 77.0     1st Qu.: 58.0   1st Qu.: -8.000  
##  Mode  :character   Median :128.0     Median :107.0   Median :  0.000  
##                     Mean   :129.3     Mean   :108.1   Mean   :  7.094  
##                     3rd Qu.:165.0     3rd Qu.:141.0   3rd Qu.: 11.000  
##                     Max.   :575.0     Max.   :549.0   Max.   :978.000  
##                     NA's   :3622      NA's   :3622    NA's   :3622     
##     DepDelay          Origin              Dest              Distance     
##  Min.   :-33.000   Length:227496      Length:227496      Min.   :  79.0  
##  1st Qu.: -3.000   Class :character   Class :character   1st Qu.: 376.0  
##  Median :  0.000   Mode  :character   Mode  :character   Median : 809.0  
##  Mean   :  9.445                                         Mean   : 787.8  
##  3rd Qu.:  9.000                                         3rd Qu.:1042.0  
##  Max.   :981.000                                         Max.   :3904.0  
##  NA's   :2905                                                            
##      TaxiIn           TaxiOut         Cancelled       CancellationCode  
##  Min.   :  1.000   Min.   :  1.00   Min.   :0.00000   Length:227496     
##  1st Qu.:  4.000   1st Qu.: 10.00   1st Qu.:0.00000   Class :character  
##  Median :  5.000   Median : 14.00   Median :0.00000   Mode  :character  
##  Mean   :  6.099   Mean   : 15.09   Mean   :0.01307                     
##  3rd Qu.:  7.000   3rd Qu.: 18.00   3rd Qu.:0.00000                     
##  Max.   :165.000   Max.   :163.00   Max.   :1.00000                     
##  NA's   :3066      NA's   :2947                                         
##     Diverted       
##  Min.   :0.000000  
##  1st Qu.:0.000000  
##  Median :0.000000  
##  Mean   :0.002853  
##  3rd Qu.:0.000000  
##  Max.   :1.000000  
## 

I only care about flights that are delayed, therefore, I just extract out the data that have ArrDelay greater than 0. Then I subset the variables that I am interested to do more research.

hf <- subset(hflights, ArrDelay > 0)
sub_df <-subset(hf, select = c(Month, UniqueCarrier, ArrDelay, DepDelay), na.rm = TRUE, stringsAsFactors = FALSE)
head(sub_df)
##      Month UniqueCarrier ArrDelay DepDelay
## 5427     1            AA        3        3
## 5432     1            AA       44       43
## 5433     1            AA       43       43
## 5434     1            AA       29       29
## 5435     1            AA        5       19
## 5440     1            AA       84       90

The following code will concatenate the year, month and date with hyphens. Then I convert the string into a date class. Afterwards, I conbine the vector with the subset of data I created earlier.

flight_date <- paste (hf$Year, hf$Month, hf$DayofMonth, sep = "-")
flight_date <- as.Date(flight_date)
sub_df <- cbind(sub_df, flight_date)
head(sub_df)
##      Month UniqueCarrier ArrDelay DepDelay flight_date
## 5427     1            AA        3        3  2011-01-04
## 5432     1            AA       44       43  2011-01-09
## 5433     1            AA       43       43  2011-01-10
## 5434     1            AA       29       29  2011-01-11
## 5435     1            AA        5       19  2011-01-12
## 5440     1            AA       84       90  2011-01-17

I am curious about if the months during the year will have any impacts on the ArrDelay. The histogram I draw shows two peak of arrival delay. One near the December and January and one near May and June. My assumption is that because of the winter break and summer break, more people will go travel, therefore causing delayed flights. However, the scatter plot I draw did not show any relationship between months and average arrival delays on the same month.

ggplot(data = sub_df) + geom_histogram(aes(x = Month), binwidth =1, fill='blue') + scale_x_continuous(breaks = 0:12)

plot(aggregate(ArrDelay ~ Month, sub_df, mean))

I also want to investigate if any carriers have more occurences of arrival delays compared to other carriers. The bar graph shows, CO, MN, XE have much higher frequencies of arrival delays. However, this might be because they fly more often, so the more delays are going to happen. Then, I calculate the average arrival delay for each carrier and draw a graph. The scatterplot shows the previous three carriers even have relatively smaller average arrival delay. The one that has the highest is carrier called B6.

ggplot(data = sub_df) + geom_bar(aes(x = UniqueCarrier), fill='blue')

aggregate(ArrDelay ~ UniqueCarrier, sub_df, mean)
##    UniqueCarrier ArrDelay
## 1             AA 28.49740
## 2             AS 22.91195
## 3             B6 45.47744
## 4             CO 22.13374
## 5             DL 32.12463
## 6             EV 40.24231
## 7             F9 18.68683
## 8             FL 27.85693
## 9             MQ 38.75135
## 10            OO 24.14663
## 11            UA 32.48067
## 12            US 20.70235
## 13            WN 25.27750
## 14            XE 24.19337
## 15            YV 18.67568
UC <- group_by(sub_df, UniqueCarrier)
sub_df_uc<-summarize(UC, UniqueCarrier_ArrDelay_mean = mean(ArrDelay))
head(sub_df_uc)
## # A tibble: 6 x 2
##   UniqueCarrier UniqueCarrier_ArrDelay_mean
##           <chr>                       <dbl>
## 1            AA                    28.49740
## 2            AS                    22.91195
## 3            B6                    45.47744
## 4            CO                    22.13374
## 5            DL                    32.12463
## 6            EV                    40.24231
qplot(data = sub_df_uc, x = sub_df_uc$UniqueCarrier, y = sub_df_uc$UniqueCarrier_ArrDelay_mean, ylab = "Average Arrival Delay (Minutes)", xlab = "UniqueCarrier") 

In the end, I want to any relationship exist between DepDelay and ArrDelay. The scatterplot clealy shows strong positive relationship. The greater departure delay, the greater arrival delay. The correlation of two variables is 0.9453, which means 94.5% of the arrival delay can be explained by departure delay. Only 5.5% is due to some other reasons.

ggplot(data = sub_df, aes(x = DepDelay, y = ArrDelay)) + geom_point()

with(sub_df, cor.test(DepDelay, ArrDelay, method="pearson"))
## 
##  Pearson's product-moment correlation
## 
## data:  DepDelay and ArrDelay
## t = 947.99, df = 106920, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9447045 0.9459791
## sample estimates:
##       cor 
## 0.9453454