Here are the graphs produced from the scheduled departure time and departure delay data.
# Departure delay against scheduled departure time for every flight:
# raw points with a smoothed trend line on top.
ggplot(
  data = flights,
  mapping = aes(x = sched_dep_time, y = dep_delay)
) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 8255 rows containing non-finite values (stat_smooth).
## Warning: Removed 8255 rows containing missing values (geom_point).
# Same plot, narrowed to one airport (EWR) and one day (September 12).
flights %>%
  filter(origin == "EWR", month == 9, day == 12) %>%
  ggplot(mapping = aes(x = sched_dep_time, y = dep_delay)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 68 rows containing non-finite values (stat_smooth).
## Warning: Removed 68 rows containing missing values (geom_point).
# EWR departures on September 12 bound for SFO or LAX only (points, no trend).
flights %>%
  filter(
    origin == "EWR",
    month == 9,
    day == 12,
    dest %in% c("SFO", "LAX")
  ) %>%
  ggplot(mapping = aes(x = sched_dep_time, y = dep_delay)) +
  geom_point()
## Warning: Removed 1 rows containing missing values (geom_point).
# September 12 departures from every origin airport, one panel per origin,
# each with its own smoothed trend line.
flights %>%
  filter(month == 9, day == 12) %>%
  ggplot(mapping = aes(x = sched_dep_time, y = dep_delay)) +
  geom_point() +
  facet_wrap(~ origin) +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 192 rows containing non-finite values (stat_smooth).
## Warning: Removed 192 rows containing missing values (geom_point).
Based on these graphs, scheduled departure time and departure delay tend to be correlated: typically, the later a flight is scheduled to depart, the longer its delay. Overall, the delay peaks around 1800 hours and then gradually decreases. EWR has a steep peak, while JFK plateaus after 2000 hours. LGA keeps rising and therefore has no clear peak.
library(tidyverse)
library(nycflights13)
# Preview the weather table, then plot wind gust by hour for September 12,
# drawing points joined by a line with one panel per origin airport.
head(weather)
weather %>%
  filter(month == 9, day == 12) %>%
  ggplot(mapping = aes(x = hour, y = wind_gust)) +
  geom_point() +
  geom_line() +
  facet_wrap(~ origin)
## Warning: Removed 66 rows containing missing values (geom_point).
## Warning: Removed 20 row(s) containing missing values (geom_path).
# Get details about the nycflights13 package and the objects it provides.
# The package help topic is looked up once; the original repeated the same
# lookup three times (`?nycflights13` twice plus `help(nycflights13)`).
?nycflights13
ls("package:nycflights13")
## [1] "airlines" "airports" "flights" "planes" "weather"
# Documentation for the flights table used throughout this analysis.
?flights
# Bind each nycflights13 table to a local working name, referring to the
# package explicitly so the source of every table is visible.
airlines_data <- nycflights13::airlines
airports_data <- nycflights13::airports
flights_data <- nycflights13::flights
planes_data <- nycflights13::planes
weather_data <- nycflights13::weather
# Inspect the class of every column in the flights dataset.
# lapply() always returns a list. sapply() only returned a list here because
# time_hour carries two classes ("POSIXct", "POSIXt"), which prevents
# simplification to a character vector; the printed result is unchanged.
lapply(flights_data, class)
## $year
## [1] "integer"
##
## $month
## [1] "integer"
##
## $day
## [1] "integer"
##
## $dep_time
## [1] "integer"
##
## $sched_dep_time
## [1] "integer"
##
## $dep_delay
## [1] "numeric"
##
## $arr_time
## [1] "integer"
##
## $sched_arr_time
## [1] "integer"
##
## $arr_delay
## [1] "numeric"
##
## $carrier
## [1] "character"
##
## $flight
## [1] "integer"
##
## $tailnum
## [1] "character"
##
## $origin
## [1] "character"
##
## $dest
## [1] "character"
##
## $air_time
## [1] "numeric"
##
## $distance
## [1] "numeric"
##
## $hour
## [1] "numeric"
##
## $minute
## [1] "numeric"
##
## $time_hour
## [1] "POSIXct" "POSIXt"
# First six and last five rows of the flights table.
head(flights_data, n = 6)
tail(flights_data, n = 5)
# Reorder the rows by month, then by day within each month.
row_order <- with(flights_data, order(month, day))
flights_newdata <- flights_data[row_order, ]
head(flights_newdata, n = 10)
# Number of rows and columns.
dim(flights_data)
## [1] 336776 19
# Per-column summary of the flights table: quartiles, mean and NA counts for
# numeric columns; length/class/mode for character columns.
summary(flights_data)
## year month day dep_time sched_dep_time
## Min. :2013 Min. : 1.000 Min. : 1.00 Min. : 1 Min. : 106
## 1st Qu.:2013 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.: 907 1st Qu.: 906
## Median :2013 Median : 7.000 Median :16.00 Median :1401 Median :1359
## Mean :2013 Mean : 6.549 Mean :15.71 Mean :1349 Mean :1344
## 3rd Qu.:2013 3rd Qu.:10.000 3rd Qu.:23.00 3rd Qu.:1744 3rd Qu.:1729
## Max. :2013 Max. :12.000 Max. :31.00 Max. :2400 Max. :2359
## NA's :8255
## dep_delay arr_time sched_arr_time arr_delay carrier
## Min. : -43.00 Min. : 1 Min. : 1 Min. : -86.000 Length:336776
## 1st Qu.: -5.00 1st Qu.:1104 1st Qu.:1124 1st Qu.: -17.000 Class :character
## Median : -2.00 Median :1535 Median :1556 Median : -5.000 Mode :character
## Mean : 12.64 Mean :1502 Mean :1536 Mean : 6.895
## 3rd Qu.: 11.00 3rd Qu.:1940 3rd Qu.:1945 3rd Qu.: 14.000
## Max. :1301.00 Max. :2400 Max. :2359 Max. :1272.000
## NA's :8255 NA's :8713 NA's :9430
## flight tailnum origin dest air_time
## Min. : 1 Length:336776 Length:336776 Length:336776 Min. : 20.0
## 1st Qu.: 553 Class :character Class :character Class :character 1st Qu.: 82.0
## Median :1496 Mode :character Mode :character Mode :character Median :129.0
## Mean :1972 Mean :150.7
## 3rd Qu.:3465 3rd Qu.:192.0
## Max. :8500 Max. :695.0
## NA's :9430
## distance hour minute time_hour
## Min. : 17 Min. : 1.00 Min. : 0.00 Min. :2013-01-01 05:00:00
## 1st Qu.: 502 1st Qu.: 9.00 1st Qu.: 8.00 1st Qu.:2013-04-04 13:00:00
## Median : 872 Median :13.00 Median :29.00 Median :2013-07-03 10:00:00
## Mean :1040 Mean :13.18 Mean :26.23 Mean :2013-07-03 05:22:54
## 3rd Qu.:1389 3rd Qu.:17.00 3rd Qu.:44.00 3rd Qu.:2013-10-01 07:00:00
## Max. :4983 Max. :23.00 Max. :59.00 Max. :2013-12-31 23:00:00
##
# Compact overview of the flights table: row/column counts, then each
# column's type and its first few values.
glimpse(flights_data)
## Rows: 336,776
## Columns: 19
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2...
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, 558, 5...
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, 600, 6...
## $ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1, 0, -...
## $ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849, 853, ...
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851, 856, ...
## $ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -14, 31,...
## $ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "AA", "...
## $ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 49, 71,...
## $ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N39463", "...
## $ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA", "JFK"...
## $ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD", "MCO"...
## $ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 158, 34...
## $ distance <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, 1028, ...
## $ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6...
## $ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0, 0, 0...
## $ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 05:00:0...
# Distinct carrier codes, and how many there are.
carriers <- unique(flights_data[["carrier"]])
carriers
## [1] "UA" "AA" "B6" "DL" "EV" "MQ" "US" "WN" "VX" "FL" "AS" "9E" "F9" "HA" "YV" "OO"
length(carriers)
## [1] 16
# Distinct origin airports.
unique(flights_data[["origin"]])
## [1] "EWR" "LGA" "JFK"
# Distinct destination airports, and how many there are.
destinations <- unique(flights_data[["dest"]])
destinations
## [1] "IAH" "MIA" "BQN" "ATL" "ORD" "FLL" "IAD" "MCO" "PBI" "TPA" "LAX" "SFO" "DFW" "BOS"
## [15] "LAS" "MSP" "DTW" "RSW" "SJU" "PHX" "BWI" "CLT" "BUF" "DEN" "SNA" "MSY" "SLC" "XNA"
## [29] "MKE" "SEA" "ROC" "SYR" "SRQ" "RDU" "CMH" "JAX" "CHS" "MEM" "PIT" "SAN" "DCA" "CLE"
## [43] "STL" "MYR" "JAC" "MDW" "HNL" "BNA" "AUS" "BTV" "PHL" "STT" "EGE" "AVL" "PWM" "IND"
## [57] "SAV" "CAK" "HOU" "LGB" "DAY" "ALB" "BDL" "MHT" "MSN" "GSO" "CVG" "BUR" "RIC" "GSP"
## [71] "GRR" "MCI" "ORF" "SAT" "SDF" "PDX" "SJC" "OMA" "CRW" "OAK" "SMF" "TUL" "TYS" "OKC"
## [85] "PVD" "DSM" "PSE" "BHM" "CAE" "HDN" "BZN" "MTJ" "EYW" "PSP" "ACK" "BGR" "ABQ" "ILM"
## [99] "MVY" "SBN" "LEX" "CHO" "TVC" "ANC" "LGA"
length(destinations)
## [1] 105
# Count flights with no recorded departure time (treated here as cancelled
# departures).
with(flights_data, sum(is.na(dep_time)))
## [1] 8255
# Number of flights per carrier.
table(carrier = flights[["carrier"]])
## carrier
## 9E AA AS B6 DL EV F9 FL HA MQ OO UA US VX
## 18460 32729 714 54635 48110 54173 685 3260 342 26397 32 58665 20536 5162
## WN YV
## 12275 601
# Number of flights per origin airport.
table(origin = flights[["origin"]])
## origin
## EWR JFK LGA
## 120835 111279 104662
### Grouping and summarizing: mean departure delay per hour for EWR flights.
### print(..., n = Inf) is used only so that every row of the result is shown.
by_hour_ewr <- flights %>%
  filter(origin == "EWR") %>%
  group_by(hour)
hourly_mean_delay <- summarize(
  by_hour_ewr,
  mean = mean(dep_delay, na.rm = TRUE)
)
print(hourly_mean_delay, n = Inf)
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 20 x 2
## hour mean
## <dbl> <dbl>
## 1 1 NaN
## 2 5 0.649
## 3 6 3.50
## 4 7 4.04
## 5 8 5.50
## 6 9 5.84
## 7 10 7.79
## 8 11 7.96
## 9 12 11.6
## 10 13 13.6
## 11 14 17.4
## 12 15 20.6
## 13 16 23.1
## 14 17 25.3
## 15 18 25.1
## 16 19 31.1
## 17 20 27.5
## 18 21 26.1
## 19 22 26.6
## 20 23 19.4
# Mean and maximum departure delay per hour for EWR flights. The maximum is
# divided by 60 (presumably minutes -> hours; confirm units in ?flights).
# A group whose dep_delay values are all NA would make
# max(..., na.rm = TRUE) warn and return -Inf, so such groups report NA.
print(
  summarize(
    by_hour_ewr,
    mean = mean(dep_delay, na.rm = TRUE),
    max = if (all(is.na(dep_delay))) {
      NA_real_
    } else {
      max(dep_delay, na.rm = TRUE) / 60
    }
  ),
  n = Inf
)
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 20 x 3
## hour mean max
## <dbl> <dbl> <dbl>
## 1 1 NaN NA
## 2 5 0.649 3.13
## 3 6 3.50 13.1
## 4 7 4.04 6.37
## 5 8 5.50 8.37
## 6 9 5.84 11.7
## 7 10 7.79 10.9
## 8 11 7.96 6.37
## 9 12 11.6 6.62
## 10 13 13.6 7.18
## 11 14 17.4 6.85
## 12 15 20.6 7.38
## 13 16 23.1 18.8
## 14 17 25.3 14.9
## 15 18 25.1 7.57
## 16 19 31.1 6.48
## 17 20 27.5 14.6
## 18 21 26.1 4.98
## 19 22 26.6 2.62
## 20 23 19.4 1.7
#
# Mean departure delay per month for EWR flights (all rows printed).
by_month_ewr <- flights %>%
  filter(origin == "EWR") %>%
  group_by(month)
monthly_mean_delay <- summarize(
  by_month_ewr,
  mean = mean(dep_delay, na.rm = TRUE)
)
print(monthly_mean_delay, n = Inf)
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 12 x 2
## month mean
## <int> <dbl>
## 1 1 14.9
## 2 2 13.1
## 3 3 18.1
## 4 4 17.4
## 5 5 15.4
## 6 6 22.5
## 7 7 22.0
## 8 8 13.5
## 9 9 7.29
## 10 10 8.64
## 11 11 6.72
## 12 12 21.0
# EWR flights ordered from the largest departure delay downwards.
flights %>%
  filter(origin == "EWR") %>%
  arrange(desc(dep_delay))