library (nycflights13)
library (tidyverse)
library (lubridate)
library (modelr)
library (ggthemes)
str(flights)## Classes 'tbl_df', 'tbl' and 'data.frame': 336776 obs. of 19 variables:
## $ year : int 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
## $ month : int 1 1 1 1 1 1 1 1 1 1 ...
## $ day : int 1 1 1 1 1 1 1 1 1 1 ...
## $ dep_time : int 517 533 542 544 554 554 555 557 557 558 ...
## $ sched_dep_time: int 515 529 540 545 600 558 600 600 600 600 ...
## $ dep_delay : num 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
## $ arr_time : int 830 850 923 1004 812 740 913 709 838 753 ...
## $ sched_arr_time: int 819 830 850 1022 837 728 854 723 846 745 ...
## $ arr_delay : num 11 20 33 -18 -25 12 19 -14 -8 8 ...
## $ carrier : chr "UA" "UA" "AA" "B6" ...
## $ flight : int 1545 1714 1141 725 461 1696 507 5708 79 301 ...
## $ tailnum : chr "N14228" "N24211" "N619AA" "N804JB" ...
## $ origin : chr "EWR" "LGA" "JFK" "JFK" ...
## $ dest : chr "IAH" "IAH" "MIA" "BQN" ...
## $ air_time : num 227 227 160 183 116 150 158 53 140 138 ...
## $ distance : num 1400 1416 1089 1576 762 ...
## $ hour : num 5 5 5 5 6 5 6 6 6 6 ...
## $ minute : num 15 29 40 45 0 58 0 0 0 0 ...
## $ time_hour : POSIXct, format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
## year month day dep_time
## Min. :2013 Min. : 1.000 Min. : 1.00 Min. : 1
## 1st Qu.:2013 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.: 907
## Median :2013 Median : 7.000 Median :16.00 Median :1401
## Mean :2013 Mean : 6.549 Mean :15.71 Mean :1349
## 3rd Qu.:2013 3rd Qu.:10.000 3rd Qu.:23.00 3rd Qu.:1744
## Max. :2013 Max. :12.000 Max. :31.00 Max. :2400
## NA's :8255
## sched_dep_time dep_delay arr_time sched_arr_time
## Min. : 106 Min. : -43.00 Min. : 1 Min. : 1
## 1st Qu.: 906 1st Qu.: -5.00 1st Qu.:1104 1st Qu.:1124
## Median :1359 Median : -2.00 Median :1535 Median :1556
## Mean :1344 Mean : 12.64 Mean :1502 Mean :1536
## 3rd Qu.:1729 3rd Qu.: 11.00 3rd Qu.:1940 3rd Qu.:1945
## Max. :2359 Max. :1301.00 Max. :2400 Max. :2359
## NA's :8255 NA's :8713
## arr_delay carrier flight tailnum
## Min. : -86.000 Length:336776 Min. : 1 Length:336776
## 1st Qu.: -17.000 Class :character 1st Qu.: 553 Class :character
## Median : -5.000 Mode :character Median :1496 Mode :character
## Mean : 6.895 Mean :1972
## 3rd Qu.: 14.000 3rd Qu.:3465
## Max. :1272.000 Max. :8500
## NA's :9430
## origin dest air_time distance
## Length:336776 Length:336776 Min. : 20.0 Min. : 17
## Class :character Class :character 1st Qu.: 82.0 1st Qu.: 502
## Mode :character Mode :character Median :129.0 Median : 872
## Mean :150.7 Mean :1040
## 3rd Qu.:192.0 3rd Qu.:1389
## Max. :695.0 Max. :4983
## NA's :9430
## hour minute time_hour
## Min. : 1.00 Min. : 0.00 Min. :2013-01-01 05:00:00
## 1st Qu.: 9.00 1st Qu.: 8.00 1st Qu.:2013-04-04 13:00:00
## Median :13.00 Median :29.00 Median :2013-07-03 10:00:00
## Mean :13.18 Mean :26.23 Mean :2013-07-03 05:22:54
## 3rd Qu.:17.00 3rd Qu.:44.00 3rd Qu.:2013-10-01 07:00:00
## Max. :23.00 Max. :59.00 Max. :2013-12-31 23:00:00
##
fpd <- flights %>%
mutate(date = make_date(year, month, day)) %>%
group_by(date) %>%
summarise(n = n())
fpd## # A tibble: 365 x 2
## date n
## <date> <int>
## 1 2013-01-01 842
## 2 2013-01-02 943
## 3 2013-01-03 914
## 4 2013-01-04 915
## 5 2013-01-05 720
## 6 2013-01-06 832
## 7 2013-01-07 933
## 8 2013-01-08 899
## 9 2013-01-09 902
## 10 2013-01-10 932
## # ... with 355 more rows
fpd <- fpd %>%
mutate(wday = wday(date, label = TRUE))
ggplot(fpd, aes(wday, n)) +
geom_boxplot() +
theme_fivethirtyeight()mod <- lm(n ~ wday, data = fpd)
grid <- fpd %>%
data_grid(wday) %>%
add_predictions(mod, "n")
ggplot(fpd, aes(wday, n)) +
geom_boxplot() +
geom_point(data = grid, colour = "red", size = 4) +
theme_fivethirtyeight()fpd <- fpd %>%
add_residuals(mod)
fpd %>%
ggplot(aes(date, resid)) +
geom_ref_line(h = 0) +
geom_line() +
theme_fivethirtyeight() ggplot(fpd, aes(date, resid, colour = wday)) +
geom_ref_line(h = 0) +
geom_line() +
theme_fivethirtyeight()Big downward spikes for Sundays, Fridays, Thursday - These spikes reflect days with significantly fewer flight than predicted by the model. Some of these spikes appear to align with U.S. holidays - 4th of July, Memorial Day, Christmas to name a few.
The model does not predict Saturday’s (yellow line) well - too low in the summer, too high in the fall and too low in what appears to be December.
Despite the noise / variation mentioned above, there does appear to be a trend in the data. We can use ggplot and geom_smooth to highlight this pattern.
fpd %>%
ggplot(aes(date, resid)) +
geom_ref_line(h = 0) +
geom_line(colour = "grey50") +
geom_smooth(se = FALSE, span = 0.20) +
theme_fivethirtyeight()fpd %>%
filter(wday == "Sat") %>%
ggplot(aes(date, n)) +
geom_point() +
geom_line() +
scale_x_date(NULL, date_breaks = "1 month", date_labels = "%b") +
theme_fivethirtyeight()Let’s add a new variable to the model that seeks to capture the four seasons
season <- function(date) {
cut(date,
breaks = ymd(20130101, 20130301, 20130601, 20130831, 20140101),
labels = c("winter","spring", "summer", "fall")
)
}
fpd <- fpd %>%
mutate(season = season(date))
fpd %>%
filter(wday == "Sat") %>%
ggplot(aes(date, n, colour = season)) +
geom_point(alpha = 1/3) +
geom_line() +
scale_x_date(NULL, date_breaks = "1 month", date_labels = "%b") +
theme_fivethirtyeight()mod1 <- lm(n ~ wday, data = fpd)
mod2 <- lm(n ~ wday * season, data = fpd)
fpd %>%
gather_residuals(without_seasib = mod1, with_season = mod2) %>%
ggplot(aes(date, resid, colour = model)) +
geom_line(alpha = 0.75) +
theme_fivethirtyeight()grid <- fpd %>%
data_grid(wday, season) %>%
add_predictions(mod2, "n")
ggplot(fpd, aes(wday, n)) +
geom_boxplot() +
geom_point(data = grid, colour = "red") +
facet_wrap(~ season) +
theme_fivethirtyeight()library(splines)
mod3 <- MASS::rlm(n ~ wday * ns(date, 5), data = fpd)
fpd %>%
data_grid(wday, date = seq_range(date, n = 13)) %>%
add_predictions(mod3) %>%
ggplot(aes(date, pred, colour = wday)) +
geom_line() +
geom_point() +
theme_fivethirtyeight()mod1 <- lm(n ~ wday, data = fpd)
mod2 <- lm(n ~ wday * season, data = fpd)
mod3 <- MASS::rlm(n ~ wday * ns(date, 5), data = fpd)
fpd %>%
gather_residuals(without_seasib = mod1, with_season = mod2, with_spline = mod3) %>%
ggplot(aes(date, resid, colour = model)) +
geom_line(alpha = 0.75) +
theme_fivethirtyeight()