Data Plotting

The graphs below are based on the scheduled departure time and departure delay data.

# Departure delay plotted against scheduled departure time, for every flight.
# NOTE(review): ggplot2 and the flights data must already be attached here; the
# library() calls visible in this file appear further down — confirm.

ggplot(data = flights, mapping = aes(x = sched_dep_time, y = dep_delay)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 8255 rows containing non-finite values (stat_smooth).
## Warning: Removed 8255 rows containing missing values (geom_point).

# Same plot, narrowed to one airport (EWR) and one day (12 September)
flights %>%
  filter(origin == "EWR", month == 9, day == 12) %>%
  ggplot(mapping = aes(x = sched_dep_time, y = dep_delay)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 68 rows containing non-finite values (stat_smooth).
## Warning: Removed 68 rows containing missing values (geom_point).

# EWR departures on 12 September bound for SFO or LAX only (points, no smoother)

flights %>%
  filter(
    origin == "EWR",
    month == 9,
    day == 12,
    dest %in% c("SFO", "LAX")
  ) %>%
  ggplot(mapping = aes(x = sched_dep_time, y = dep_delay)) +
  geom_point()
## Warning: Removed 1 rows containing missing values (geom_point).

# 12 September across every origin airport, one facet panel per origin
flights %>%
  filter(month == 9, day == 12) %>%
  ggplot(mapping = aes(x = sched_dep_time, y = dep_delay)) +
  geom_point() +
  geom_smooth() +
  facet_wrap(~ origin)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 192 rows containing non-finite values (stat_smooth).
## Warning: Removed 192 rows containing missing values (geom_point).

Data Analysis

Based on these graphs, scheduled departure time and departure delay appear to be correlated: typically, the later a flight is scheduled to depart, the longer its delay. Overall, the delay peaks around 1800 hours and then gradually decreases. EWR has a steep peak, while JFK plateaus after 2000 hours. LGA keeps rising and therefore doesn’t have a clear peak.

library(tidyverse)
library(nycflights13)

# First rows of the weather table
head(weather)
# Wind gust by hour on 12 September, one facet panel per origin
weather %>%
  filter(month == 9, day == 12) %>%
  ggplot(mapping = aes(x = hour, y = wind_gust)) +
  geom_point() +
  geom_line() +
  facet_wrap(~ origin)
## Warning: Removed 66 rows containing missing values (geom_point).
## Warning: Removed 20 row(s) containing missing values (geom_path).

# Get details about the nycflights13 package and the datasets it provides.
# The package help page is requested once; repeating `?nycflights13` or
# `help(nycflights13)` only asks for the same page again.
?nycflights13
ls("package:nycflights13")
## [1] "airlines" "airports" "flights"  "planes"   "weather"
# Help page for the flights dataset
?flights


# Keep a local copy of each table exported by nycflights13
airlines_data <- nycflights13::airlines
airports_data <- nycflights13::airports
flights_data  <- nycflights13::flights
planes_data   <- nycflights13::planes
weather_data  <- nycflights13::weather

# Inspect the flights dataset: class of every column, returned as a named list
# (a list is required because time_hour carries two classes)
lapply(flights_data, class)
## $year
## [1] "integer"
## 
## $month
## [1] "integer"
## 
## $day
## [1] "integer"
## 
## $dep_time
## [1] "integer"
## 
## $sched_dep_time
## [1] "integer"
## 
## $dep_delay
## [1] "numeric"
## 
## $arr_time
## [1] "integer"
## 
## $sched_arr_time
## [1] "integer"
## 
## $arr_delay
## [1] "numeric"
## 
## $carrier
## [1] "character"
## 
## $flight
## [1] "integer"
## 
## $tailnum
## [1] "character"
## 
## $origin
## [1] "character"
## 
## $dest
## [1] "character"
## 
## $air_time
## [1] "numeric"
## 
## $distance
## [1] "numeric"
## 
## $hour
## [1] "numeric"
## 
## $minute
## [1] "numeric"
## 
## $time_hour
## [1] "POSIXct" "POSIXt"
# Peek at the first rows and the last five rows
head(flights_data)
tail(flights_data, n = 5)
# Sort the rows by month, then by day within each month
flights_newdata <- flights_data[with(flights_data, order(month, day)), ]
head(flights_newdata, n = 10)
# Dimensions (rows, columns) of the unsorted table
dim(flights_data)
## [1] 336776     19
# Per-column summary of flights_data (quartiles, means and NA counts for the
# numeric columns; length/class/mode for the character columns)
summary(flights_data)
##       year          month             day           dep_time    sched_dep_time
##  Min.   :2013   Min.   : 1.000   Min.   : 1.00   Min.   :   1   Min.   : 106  
##  1st Qu.:2013   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.: 907   1st Qu.: 906  
##  Median :2013   Median : 7.000   Median :16.00   Median :1401   Median :1359  
##  Mean   :2013   Mean   : 6.549   Mean   :15.71   Mean   :1349   Mean   :1344  
##  3rd Qu.:2013   3rd Qu.:10.000   3rd Qu.:23.00   3rd Qu.:1744   3rd Qu.:1729  
##  Max.   :2013   Max.   :12.000   Max.   :31.00   Max.   :2400   Max.   :2359  
##                                                  NA's   :8255                 
##    dep_delay          arr_time    sched_arr_time   arr_delay          carrier         
##  Min.   : -43.00   Min.   :   1   Min.   :   1   Min.   : -86.000   Length:336776     
##  1st Qu.:  -5.00   1st Qu.:1104   1st Qu.:1124   1st Qu.: -17.000   Class :character  
##  Median :  -2.00   Median :1535   Median :1556   Median :  -5.000   Mode  :character  
##  Mean   :  12.64   Mean   :1502   Mean   :1536   Mean   :   6.895                     
##  3rd Qu.:  11.00   3rd Qu.:1940   3rd Qu.:1945   3rd Qu.:  14.000                     
##  Max.   :1301.00   Max.   :2400   Max.   :2359   Max.   :1272.000                     
##  NA's   :8255      NA's   :8713                  NA's   :9430                         
##      flight       tailnum             origin              dest              air_time    
##  Min.   :   1   Length:336776      Length:336776      Length:336776      Min.   : 20.0  
##  1st Qu.: 553   Class :character   Class :character   Class :character   1st Qu.: 82.0  
##  Median :1496   Mode  :character   Mode  :character   Mode  :character   Median :129.0  
##  Mean   :1972                                                            Mean   :150.7  
##  3rd Qu.:3465                                                            3rd Qu.:192.0  
##  Max.   :8500                                                            Max.   :695.0  
##                                                                          NA's   :9430   
##     distance         hour           minute        time_hour                  
##  Min.   :  17   Min.   : 1.00   Min.   : 0.00   Min.   :2013-01-01 05:00:00  
##  1st Qu.: 502   1st Qu.: 9.00   1st Qu.: 8.00   1st Qu.:2013-04-04 13:00:00  
##  Median : 872   Median :13.00   Median :29.00   Median :2013-07-03 10:00:00  
##  Mean   :1040   Mean   :13.18   Mean   :26.23   Mean   :2013-07-03 05:22:54  
##  3rd Qu.:1389   3rd Qu.:17.00   3rd Qu.:44.00   3rd Qu.:2013-10-01 07:00:00  
##  Max.   :4983   Max.   :23.00   Max.   :59.00   Max.   :2013-12-31 23:00:00  
## 
# Compact column-wise view: row/column counts, then each column's type and
# its first few values
glimpse(flights_data)
## Rows: 336,776
## Columns: 19
## $ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2...
## $ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, 558, 5...
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, 600, 6...
## $ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1, 0, -...
## $ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849, 853, ...
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851, 856, ...
## $ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -14, 31,...
## $ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "AA", "...
## $ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 49, 71,...
## $ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N39463", "...
## $ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA", "JFK"...
## $ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD", "MCO"...
## $ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 158, 34...
## $ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, 1028, ...
## $ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6...
## $ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0, 0, 0...
## $ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 05:00:0...
# Distinct carrier codes, and how many of them there are
unique(flights_data[["carrier"]])
##  [1] "UA" "AA" "B6" "DL" "EV" "MQ" "US" "WN" "VX" "FL" "AS" "9E" "F9" "HA" "YV" "OO"
n_distinct(flights_data[["carrier"]])
## [1] 16
# Distinct origin airports
unique(flights_data[["origin"]])
## [1] "EWR" "LGA" "JFK"
# Distinct destination airports
unique(flights_data[["dest"]])
##   [1] "IAH" "MIA" "BQN" "ATL" "ORD" "FLL" "IAD" "MCO" "PBI" "TPA" "LAX" "SFO" "DFW" "BOS"
##  [15] "LAS" "MSP" "DTW" "RSW" "SJU" "PHX" "BWI" "CLT" "BUF" "DEN" "SNA" "MSY" "SLC" "XNA"
##  [29] "MKE" "SEA" "ROC" "SYR" "SRQ" "RDU" "CMH" "JAX" "CHS" "MEM" "PIT" "SAN" "DCA" "CLE"
##  [43] "STL" "MYR" "JAC" "MDW" "HNL" "BNA" "AUS" "BTV" "PHL" "STT" "EGE" "AVL" "PWM" "IND"
##  [57] "SAV" "CAK" "HOU" "LGB" "DAY" "ALB" "BDL" "MHT" "MSN" "GSO" "CVG" "BUR" "RIC" "GSP"
##  [71] "GRR" "MCI" "ORF" "SAT" "SDF" "PDX" "SJC" "OMA" "CRW" "OAK" "SMF" "TUL" "TYS" "OKC"
##  [85] "PVD" "DSM" "PSE" "BHM" "CAE" "HDN" "BZN" "MTJ" "EYW" "PSP" "ACK" "BGR" "ABQ" "ILM"
##  [99] "MVY" "SBN" "LEX" "CHO" "TVC" "ANC" "LGA"
n_distinct(flights_data[["dest"]])
## [1] 105
# Count flights whose dep_time is missing (treated here as cancelled departures)
sum(is.na(flights_data[["dep_time"]]))
## [1] 8255
# Number of flights per carrier
table(carrier = flights[["carrier"]])
## carrier
##    9E    AA    AS    B6    DL    EV    F9    FL    HA    MQ    OO    UA    US    VX 
## 18460 32729   714 54635 48110 54173   685  3260   342 26397    32 58665 20536  5162 
##    WN    YV 
## 12275   601
# Number of flights per origin airport
table(origin = flights[["origin"]])
## origin
##    EWR    JFK    LGA 
## 120835 111279 104662
### group_by and summarize: mean departure delay per scheduled hour at EWR.
### print(..., n = Inf) is used only so that every row of the result is shown.

by_hour_ewr <- flights %>%
  filter(origin == "EWR") %>%
  group_by(hour)

by_hour_ewr %>%
  summarize(mean = mean(dep_delay, na.rm = TRUE)) %>%
  print(n = Inf)
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 20 x 2
##     hour    mean
##    <dbl>   <dbl>
##  1     1 NaN    
##  2     5   0.649
##  3     6   3.50 
##  4     7   4.04 
##  5     8   5.50 
##  6     9   5.84 
##  7    10   7.79 
##  8    11   7.96 
##  9    12  11.6  
## 10    13  13.6  
## 11    14  17.4  
## 12    15  20.6  
## 13    16  23.1  
## 14    17  25.3  
## 15    18  25.1  
## 16    19  31.1  
## 17    20  27.5  
## 18    21  26.1  
## 19    22  26.6  
## 20    23  19.4
# Mean and maximum departure delay per hour at EWR. The maximum is divided by
# 60 (presumably minutes -> hours; confirm the units of dep_delay). An hour
# whose dep_delay values are all missing yields NaN / -Inf plus a warning.
by_hour_ewr %>%
  summarize(
    mean = mean(dep_delay, na.rm = TRUE),
    max = max(dep_delay, na.rm = TRUE) / 60
  ) %>%
  print(n = Inf)
## Warning in max(dep_delay, na.rm = TRUE): no non-missing arguments to max; returning -Inf
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 20 x 3
##     hour    mean     max
##    <dbl>   <dbl>   <dbl>
##  1     1 NaN     -Inf   
##  2     5   0.649    3.13
##  3     6   3.50    13.1 
##  4     7   4.04     6.37
##  5     8   5.50     8.37
##  6     9   5.84    11.7 
##  7    10   7.79    10.9 
##  8    11   7.96     6.37
##  9    12  11.6      6.62
## 10    13  13.6      7.18
## 11    14  17.4      6.85
## 12    15  20.6      7.38
## 13    16  23.1     18.8 
## 14    17  25.3     14.9 
## 15    18  25.1      7.57
## 16    19  31.1      6.48
## 17    20  27.5     14.6 
## 18    21  26.1      4.98
## 19    22  26.6      2.62
## 20    23  19.4      1.7
# 
# Mean departure delay per month at EWR
by_month_ewr <- flights %>%
  filter(origin == "EWR") %>%
  group_by(month)

by_month_ewr %>%
  summarize(mean = mean(dep_delay, na.rm = TRUE)) %>%
  print(n = Inf)
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 12 x 2
##    month  mean
##    <int> <dbl>
##  1     1 14.9 
##  2     2 13.1 
##  3     3 18.1 
##  4     4 17.4 
##  5     5 15.4 
##  6     6 22.5 
##  7     7 22.0 
##  8     8 13.5 
##  9     9  7.29
## 10    10  8.64
## 11    11  6.72
## 12    12 21.0
# EWR departures sorted from the largest departure delay downwards
flights %>%
  filter(origin == "EWR") %>%
  arrange(desc(dep_delay))