daily <- flights %>%
mutate(date = make_date(year, month, day)) %>%
group_by(date) %>%
summarize(n = n())
ggplot(daily, aes(date, n)) +
geom_line()
daily <- daily %>%
mutate(wday = wday(date, label = TRUE))
ggplot(daily, aes(wday,n)) +
geom_boxplot()
mod = lm(n ~ wday, data = daily)
grid <- daily %>%
data_grid(wday) %>%
add_predictions(mod, "n")
ggplot(daily, aes(wday, n)) +
geom_boxplot() +
geom_point(data = grid, color = "orange", size = 4)
daily <- daily %>%
add_residuals(mod)
daily %>%
ggplot(aes(date, resid)) +
geom_ref_line(h = 0) +
geom_line()
ggplot(daily, aes(date, resid, color = wday)) +
geom_ref_line(h = 0, colour = "red") +
geom_line()
daily %>%
filter(resid < -100)
## # A tibble: 11 x 4
## date n wday resid
## <date> <int> <ord> <dbl>
## 1 2013-01-01 842 Tue -109.
## 2 2013-01-20 786 Sun -105.
## 3 2013-05-26 729 Sun -162.
## 4 2013-07-04 737 Thu -229.
## 5 2013-07-05 822 Fri -145.
## 6 2013-09-01 718 Sun -173.
## 7 2013-11-28 634 Thu -332.
## 8 2013-11-29 661 Fri -306.
## 9 2013-12-24 761 Tue -190.
## 10 2013-12-25 719 Wed -244.
## 11 2013-12-31 776 Tue -175.
daily %>%
ggplot(aes(date, resid)) +
geom_ref_line(h = 0, colour = "red", size = 1) +
geom_line(color = "grey50") +
geom_smooth(se = FALSE, span = 0.20)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
daily %>%
filter(wday == "Sat") %>%
ggplot(aes(date, n)) +
geom_point()+
geom_line() +
scale_x_date(
NULL,
date_breaks = "1 month",
date_labels = "%b"
)
term <- function(date) {
cut(date,
breaks = ymd(20130101, 20130605, 20130825, 20140101),
labels = c("spring", "summer", "fall")
)
}
daily <- daily %>%
mutate(term = term(date))
daily %>%
filter(wday == "Sat") %>%
ggplot(aes(date, n, color = term)) +
geom_point(alpha = 1/3)+
geom_line() +
scale_x_date(
NULL,
date_breaks = "1 month",
date_labels = "%b"
)
daily %>%
ggplot(aes(wday, n, color = term)) +
geom_boxplot()
mod1 <- lm(n ~ wday, data = daily)
mod2 <- lm(n ~ wday * term, data = daily)
daily %>%
gather_residuals(without_term = mod1, with_term = mod2) %>%
ggplot(aes(date, resid, color = model)) +
geom_line(alpha = 0.75)
grid <- daily %>%
data_grid(wday, term) %>%
add_predictions(mod2, "n")
ggplot(daily, aes(wday, n)) +
geom_boxplot() +
geom_point(data = grid, color = "red") +
facet_wrap(~ term)
mod3 <- MASS::rlm(n ~ wday * term, data = daily)
daily %>%
add_residuals(mod3, "resid") %>%
ggplot(aes(date, resid)) +
geom_hline(yintercept = 0, size = 2, color = "red") +
geom_line()
# If you are creating variables it might be a good idea to bundle the creation of the variables up into a function
compute_vars <- function(data) {
data %>%
mutate(term = term(date),
wday = wday(date, label = TRUE)
)
}
# Another option would be to put the transformations directly in the model formula:
wday2 <- function(x) wday(x, label = TRUE)
mod3 <- lm(n ~ wday2(date) * term(date), data = daily)
# We could use a more flexible model to capture the pattern of school term in the data
library(splines)
mod <- MASS::rlm(n ~ wday * ns(date, 5), data = daily)
daily %>%
data_grid(wday, date = seq_range(date, n = 13)) %>%
add_predictions(mod) %>%
ggplot(aes(date, pred, color = wday)) +
geom_line() +
geom_point()
# We see a strong pattern in the numbers of Sat flights. This is reassuring, because we also saw that pattern in the raw data. It's a good sign when you get the same signal from different approaches.
Why are there fewer than expected flights on January 20, May 26 and September 1? (Hint: they all have the same explanation.) How would these days generalize into another year?
Answer: These 3 dates are the Sunday before a Monday holiday. MLK Jr Day, Memorial Day, Labor Day. There will obviously be less travel on that Sunday than normal, since people would be coming home the following day (similar to a Saturday). On other years, you would just use whatever date that holiday falls on (these holidays aren’t the same every year).
What do the three days with high positive residuals represent? How would these days generalize to another year?
daily %>%
top_n(3, resid)
## # A tibble: 3 x 5
## date n wday resid term
## <date> <int> <ord> <dbl> <fct>
## 1 2013-11-30 857 Sat 112. fall
## 2 2013-12-01 987 Sun 95.5 fall
## 3 2013-12-28 814 Sat 69.4 fall
Answer: These 3 days have high residuals because they fall right after Thanksgiving and Christmas. The high residual means there is more travel on those weekdays (Sat,Sun,Sat) than expected. This is because these are high travel dates (regardles of day of week).
Create a new variable that splits the “wday” variable into terms, but only for Saturdays, i.e., it should have Thurs, Fri, but Sat-summer, Sat-spring, Sat-fall. How does this model compare with the model with every combination of “wday” and “term”?
daily <- daily %>%
mutate(wday_term =
case_when(
wday == "Sat" & term == "summer" ~ "Sat-summer",
wday == "Sat" & term == "fall" ~ "Sat-fall",
wday == "Sat" & term == "spring" ~ "Sat-spring",
TRUE ~ as.character(wday)
)
)
mod3 = lm(n ~ wday_term, data = daily)
summary(mod3)
##
## Call:
## lm(formula = n ~ wday_term, data = daily)
##
## Residuals:
## Min 1Q Median 3Q Max
## -331.75 -8.81 11.31 23.64 141.00
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 967.462 6.568 147.308 < 2e-16 ***
## wday_termMon 7.346 9.288 0.791 0.4295
## wday_termSat-fall -251.462 12.951 -19.416 < 2e-16 ***
## wday_termSat-spring -230.143 12.045 -19.107 < 2e-16 ***
## wday_termSat-summer -166.545 15.167 -10.981 < 2e-16 ***
## wday_termSun -75.981 9.288 -8.181 5.06e-15 ***
## wday_termThu -1.712 9.288 -0.184 0.8539
## wday_termTue -16.103 9.244 -1.742 0.0824 .
## wday_termWed -4.769 9.288 -0.513 0.6079
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 47.36 on 356 degrees of freedom
## Multiple R-squared: 0.7357, Adjusted R-squared: 0.7297
## F-statistic: 123.8 on 8 and 356 DF, p-value: < 2.2e-16
summary(mod2)
##
## Call:
## lm(formula = n ~ wday * term, data = daily)
##
## Residuals:
## Min 1Q Median 3Q Max
## -326.33 -8.00 9.82 20.67 141.00
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 912.8684 3.7086 246.151 < 2e-16 ***
## wday.L -71.4674 9.8349 -7.267 2.49e-12 ***
## wday.Q -161.0112 9.8196 -16.397 < 2e-16 ***
## wday.C -65.6215 9.8068 -6.691 8.98e-11 ***
## wday^4 -82.0997 9.8412 -8.342 1.78e-15 ***
## wday^5 -1.3425 9.7787 -0.137 0.8909
## wday^6 -12.4501 9.7903 -1.272 0.2043
## termsummer 37.6922 6.3336 5.951 6.56e-09 ***
## termfall 3.7598 5.5033 0.683 0.4949
## wday.L:termsummer -6.2215 16.8049 -0.370 0.7114
## wday.Q:termsummer 26.4121 16.7501 1.577 0.1158
## wday.C:termsummer 26.7451 16.7885 1.593 0.1121
## wday^4:termsummer 23.5933 16.7523 1.408 0.1599
## wday^5:termsummer -13.1343 16.7720 -0.783 0.4341
## wday^6:termsummer -5.0026 16.6744 -0.300 0.7643
## wday.L:termfall -31.9048 14.5607 -2.191 0.0291 *
## wday.Q:termfall -0.6959 14.5707 -0.048 0.9619
## wday.C:termfall -8.8349 14.5417 -0.608 0.5439
## wday^4:termfall -9.8570 14.5899 -0.676 0.4997
## wday^5:termfall -3.2107 14.5228 -0.221 0.8252
## wday^6:termfall -8.2061 14.5769 -0.563 0.5738
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 46.17 on 344 degrees of freedom
## Multiple R-squared: 0.7573, Adjusted R-squared: 0.7432
## F-statistic: 53.67 on 20 and 344 DF, p-value: < 2.2e-16
Answer: Based on the resulting statistics, it would appear that the new model doesn’t explain as much of the variation as the previous model. Therefore, I would not conclude that it is a better fit.
Create a new “wday” variable that combines the day of week, term(for Saturdays), and public holidays. What do the residuals of the model look like?
daily <- daily %>%
mutate(wday2 = case_when(
date == "2013-01-01" | date == "2013-01-21" | date == "2013-05-31" | date == "2013-07-04" | date == "2013-11-24" | date == "2013-12-25" ~ "holiday",
wday == "Sat" & term == "summer" ~ "Sat-summer",
wday == "Sat" & term == "fall" ~ "Sat-fall",
wday == "Sat" & term == "spring" ~ "Sat-spring",
TRUE ~ as.character(wday)
))
mod4 = lm(n ~ wday2, data = daily)
daily <- daily %>%
add_residuals(mod4)
daily %>%
ggplot(aes(date, resid)) +
geom_ref_line(h=0) +
geom_line()
What happens if you fit a day-of-week effect that varies by month (i.e.m n ~ wday*month)? Why is this not very helpful?
Answer: If you fit a day-of-the-week effect that varies by month, you will end up with 84 different parameters (12 months x 7 days). You will likely end up with a lot of error in the model.
What would you expect the model n ~ wday + ns(date,5) to look like? Knowing what you know about the data, why would you expect it not to be particularly effective?
Answer: As shown in the examples above, a natural spline only highlights the fact that the model will underestimate the weekends in the summer and overestimate them in the fall (specifically Saturdays).
We hypothesized that people leaving on Sundays are more likely to be business travelers who need to be somewhere on Monday. Explore the hypothesis by seeing how if breaks down based on distance and time: if it’s true, you’d expect to see more Sunday evening flights to places that are far away.
flights %>%
mutate(
date = make_date(year, month, day),
wday = wday(date, label = T)
) %>%
ggplot(aes(wday, distance)) +
stat_summary() +
labs(x = "Day of Week", y = "Distance")
## No summary function supplied, defaulting to `mean_se()
Answer: By looking at the mean distances for each day of the week, we can see that on Sundays, the distnace traveled is a little longer than the rest of the days of the week, though not by much.