daily <- flights %>%
mutate(date = make_date(year, month, day)) %>%
group_by(date) %>%
summarize(n = n())
ggplot(daily, aes(date, n)) +
geom_line()
daily <- daily %>%
mutate(wday = wday(date, label = TRUE))
ggplot(daily, aes(wday,n)) +
geom_boxplot()
mod = lm(n ~ wday, data = daily, na.action = na.warn)
grid <- daily %>%
data_grid(wday) %>%
add_predictions(mod, "n")
ggplot(daily, aes(wday, n)) +
geom_boxplot() +
geom_point(data = grid, color = "orange", size = 4)
daily <- daily %>%
add_residuals(mod)
daily %>%
ggplot(aes(date, resid)) +
geom_ref_line(h = 0) +
geom_line()
ggplot(daily, aes(date, resid, color = wday)) +
geom_ref_line(h = 0, colour = "red") +
geom_line()
daily %>%
filter(resid < -100)
## # A tibble: 11 x 4
## date n wday resid
## <date> <int> <ord> <dbl>
## 1 2013-01-01 842 Tue -109.
## 2 2013-01-20 786 Sun -105.
## 3 2013-05-26 729 Sun -162.
## 4 2013-07-04 737 Thu -229.
## 5 2013-07-05 822 Fri -145.
## 6 2013-09-01 718 Sun -173.
## 7 2013-11-28 634 Thu -332.
## 8 2013-11-29 661 Fri -306.
## 9 2013-12-24 761 Tue -190.
## 10 2013-12-25 719 Wed -244.
## 11 2013-12-31 776 Tue -175.
daily %>%
ggplot(aes(date, resid)) +
geom_ref_line(h = 0, colour = "red", size = 1) +
geom_line(color = "grey50") +
geom_smooth(se = FALSE, span = 0.20)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
daily %>%
filter(wday == "Sat") %>%
ggplot(aes(date, n)) +
geom_point()+
geom_line() +
scale_x_date(
NULL,
date_breaks = "1 month",
date_labels = "%b"
)
term <- function(date) {
cut(date,
breaks = ymd(20130101, 20130605, 20130825, 20140101),
labels = c("spring", "summer", "fall")
)
}
daily <- daily %>%
mutate(term = term(date))
daily %>%
filter(wday == "Sat") %>%
ggplot(aes(date, n, color = term)) +
geom_point(alpha = 1/3)+
geom_line() +
scale_x_date(
NULL,
date_breaks = "1 month",
date_labels = "%b"
)
daily %>%
ggplot(aes(wday, n, color = term)) +
geom_boxplot()
mod1 <- lm(n ~ wday, data = daily, na.action = na.warn)
mod2 <- lm(n ~ wday * term, data = daily, na.action = na.warn)
daily %>%
gather_residuals(without_term = mod1, with_term = mod2) %>%
ggplot(aes(date, resid, color = model)) +
geom_line(alpha = 0.75)
grid <- daily %>%
data_grid(wday, term) %>%
add_predictions(mod2, "n")
ggplot(daily, aes(wday, n)) +
geom_boxplot() +
geom_point(data = grid, color = "red") +
facet_wrap(~ term)
mod3 <- MASS::rlm(n ~ wday * term, data = daily, na.action = na.warn)
daily %>%
add_residuals(mod3, "resid") %>%
ggplot(aes(date, resid)) +
geom_hline(yintercept = 0, size = 2, color = "red") +
geom_line()
# If you are creating variables it might be a good idea to bundle the creation of the variables up into a function
compute_vars <- function(data) {
data %>%
mutate(term = term(date),
wday = wday(date, label = TRUE)
)
}
# Another option would be to put the transformations directly in the model formula:
wday2 <- function(x) wday(x, label = TRUE)
mod3 <- lm(n ~ wday2(date) * term(date), data = daily, na.action = na.warn)
# We could use a more flexible model to capture the pattern of school term in the data
library(splines)
mod <- MASS::rlm(n ~ wday * ns(date, 5), data = daily, na.action = na.warn)
daily %>%
data_grid(wday, date = seq_range(date, n = 13)) %>%
add_predictions(mod) %>%
ggplot(aes(date, pred, color = wday)) +
geom_line() +
geom_point()
# We see a strong pattern in the numbers of Sat flights. This is reassuring, because we also saw that pattern in the raw data. It's a good sign when you get the same signal from different approaches.
Why are there fewer than expected flights on January 20, May 26 and September 1? (Hint: they all have the same explanation.) How would these days generalize into another year?
# In 2013, January 20st (before the January 21st-Martin Luther King Day), May 26th (before the May 27th-Memorial Day), and September 1st (before the September 2nd-Labor Day) were sundays. The holidays change in to different days in another year.
What do the three days with high positive residuals represent? How would these days generalize to another year?
# Use this chunk to answer question 2
daily %>%
top_n(3, resid)
## # A tibble: 3 x 5
## date n wday resid term
## <date> <int> <ord> <dbl> <fct>
## 1 2013-11-30 857 Sat 112. fall
## 2 2013-12-01 987 Sun 95.5 fall
## 3 2013-12-28 814 Sat 69.4 fall
# The top three days correspond to the Saturday after Thanksgiving (November 30th), the Sunday after Thanksgiving (December 1st), and the Saturday after Christmas (December 28th) andt these specific days in another year might be different, so these days also can not generalize to another year.
Create a new variable that splits the “wday” variable into terms, but only for Saturdays, i.e., it should have Thurs, Fri, but Sat-summer, Sat-spring, Sat-fall. How does this model compare with the model with every combination of “wday” and “term”?
# Use this chunk to answer question 3
daily <- daily %>%
mutate(
wday2 =
case_when(
wday == "Sat" & term == "summer" ~ "Sat-summer",
wday == "Sat" & term == "fall" ~ "Sat-fall",
wday == "Sat" & term == "spring" ~ "Sat-spring",
TRUE ~ as.character(wday)
)
)
mod3 <- lm(n ~ wday2, data = daily)
daily %>%
gather_residuals(sat_term = mod3, all_interact = mod2) %>%
ggplot(aes(date, resid, colour = model)) +
geom_line(alpha = 0.75)
daily %>%
spread_residuals(sat_term = mod3, all_interact = mod2) %>%
mutate(resid_diff = sat_term - all_interact) %>%
ggplot(aes(date, resid_diff)) +
geom_line(alpha = 0.75)
# The model has higher residuals in the fall and lower residuals in the spring than the model with all interactions.
Create a new “wday” variable that combines the day of week, term(for Saturdays), and public holidays. What do the residuals of the model look like?
# Use this chunk to answer question 4
holidays_2013 <-
tribble(
~holiday, ~date,
"New Year's Day", 20130101,
"Martin Luther King Jr. Day", 20130121,
"Washington's Birthday", 20130218,
"Memorial Day", 20130527,
"Independence Day", 20130704,
"Labor Day", 20130902,
"Columbus Day", 20131028,
"Veteran's Day", 20131111,
"Thanksgiving", 20131128,
"Christmas", 20131225
) %>%
mutate(date = lubridate::ymd(date))
daily <- daily %>%
mutate(
wday3 =
case_when(
date %in% (holidays_2013$date - 1L) ~ "day before holiday",
date %in% (holidays_2013$date + 1L) ~ "day after holiday",
date %in% holidays_2013$date ~ "holiday",
.$wday == "Sat" & .$term == "summer" ~ "Sat-summer",
.$wday == "Sat" & .$term == "fall" ~ "Sat-fall",
.$wday == "Sat" & .$term == "spring" ~ "Sat-spring",
TRUE ~ as.character(.$wday)
)
)
mod4 <- lm(n ~ wday3, data = daily)
daily %>%
spread_residuals(resid_sat_terms = mod3, resid_holidays = mod4) %>%
mutate(resid_diff = resid_holidays - resid_sat_terms) %>%
ggplot(aes(date, resid_diff)) +
geom_line(alpha = 0.75)
# I would expect that Veteran's Day and Washington's Birthday have a different effect on travel than Thanksgiving, Christmas, and New Year's Day. Travel could be lighter on the day of the holiday, but heavier the day before or after.
What happens if you fit a day-of-week effect that varies by month (i.e.m n ~ wday*month)? Why is this not very helpful?
# Use this chunk to answer question 5
daily <- mutate(daily, month = factor(lubridate::month(date)))
mod6 <- lm(n ~ wday * month, data = daily)
print(summary(mod6))
##
## Call:
## lm(formula = n ~ wday * month, data = daily)
##
## Residuals:
## Min 1Q Median 3Q Max
## -269.2 -5.0 1.5 8.8 113.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 867.4000 7.5983 114.157 < 2e-16 ***
## wday.L -64.0744 20.8737 -3.070 0.002353 **
## wday.Q -165.6001 20.1555 -8.216 7.77e-15 ***
## wday.C -68.2591 20.3115 -3.361 0.000885 ***
## wday^4 -92.0814 20.4991 -4.492 1.03e-05 ***
## wday^5 9.7925 19.7334 0.496 0.620111
## wday^6 -20.4376 18.9922 -1.076 0.282802
## month2 23.7071 10.9946 2.156 0.031912 *
## month3 67.8857 10.7456 6.318 1.04e-09 ***
## month4 74.5929 10.8292 6.888 3.70e-11 ***
## month5 56.2786 10.7456 5.237 3.20e-07 ***
## month6 80.3071 10.8292 7.416 1.43e-12 ***
## month7 77.1143 10.7456 7.176 6.39e-12 ***
## month8 81.6357 10.7456 7.597 4.52e-13 ***
## month9 51.3714 10.8292 4.744 3.34e-06 ***
## month10 60.1357 10.7456 5.596 5.20e-08 ***
## month11 46.9143 10.8292 4.332 2.06e-05 ***
## month12 38.7786 10.7456 3.609 0.000364 ***
## wday.L:month2 -3.7230 29.6267 -0.126 0.900089
## wday.Q:month2 -3.8188 29.1251 -0.131 0.895776
## wday.C:month2 0.4899 29.2333 0.017 0.986641
## wday^4:month2 4.5690 29.3639 0.156 0.876460
## wday^5:month2 -4.2552 28.8346 -0.148 0.882784
## wday^6:month2 12.0570 28.3325 0.426 0.670760
## wday.L:month3 -14.5705 28.4302 -0.513 0.608703
## wday.Q:month3 15.4389 28.2073 0.547 0.584581
## wday.C:month3 8.2262 28.4672 0.289 0.772817
## wday^4:month3 22.7202 28.7015 0.792 0.429261
## wday^5:month3 -15.3298 28.5042 -0.538 0.591135
## wday^6:month3 11.3727 28.2682 0.402 0.687759
## wday.L:month4 -16.6682 29.3590 -0.568 0.570667
## wday.Q:month4 10.7254 28.9620 0.370 0.711418
## wday.C:month4 -0.2449 28.7249 -0.009 0.993202
## wday^4:month4 23.2883 28.8711 0.807 0.420561
## wday^5:month4 -17.8720 28.0764 -0.637 0.524935
## wday^6:month4 5.3524 27.8883 0.192 0.847940
## wday.L:month5 3.6663 29.3590 0.125 0.900711
## wday.Q:month5 -20.6652 28.6699 -0.721 0.471632
## wday.C:month5 4.6336 28.7249 0.161 0.871965
## wday^4:month5 5.9994 28.5109 0.210 0.833490
## wday^5:month5 -16.9119 28.0764 -0.602 0.547425
## wday^6:month5 12.7643 27.1935 0.469 0.639158
## wday.L:month6 -4.5261 28.6515 -0.158 0.874593
## wday.Q:month6 23.8130 28.2073 0.844 0.399267
## wday.C:month6 13.7580 28.7249 0.479 0.632342
## wday^4:month6 24.1183 29.1875 0.826 0.409322
## wday^5:month6 -17.6484 28.7981 -0.613 0.540483
## wday^6:month6 10.5256 28.3291 0.372 0.710510
## wday.L:month7 -28.7914 29.3590 -0.981 0.327601
## wday.Q:month7 49.5846 28.6699 1.730 0.084818 .
## wday.C:month7 54.5011 28.7249 1.897 0.058807 .
## wday^4:month7 50.8474 28.5109 1.783 0.075594 .
## wday^5:month7 -33.6983 28.0764 -1.200 0.231058
## wday^6:month7 -13.8943 27.1935 -0.511 0.609793
## wday.L:month8 -20.4479 28.8711 -0.708 0.479378
## wday.Q:month8 6.7648 28.5042 0.237 0.812578
## wday.C:month8 6.0012 28.4672 0.211 0.833186
## wday^4:month8 19.0738 28.7814 0.663 0.508058
## wday^5:month8 -19.3123 28.0576 -0.688 0.491827
## wday^6:month8 9.5074 27.8866 0.341 0.733410
## wday.L:month9 -30.3411 28.9257 -1.049 0.295110
## wday.Q:month9 -42.0342 28.6699 -1.466 0.143726
## wday.C:month9 -20.7186 28.7249 -0.721 0.471338
## wday^4:month9 -20.3752 28.7914 -0.708 0.479728
## wday^5:month9 -18.2376 28.5226 -0.639 0.523079
## wday^6:month9 11.7263 28.2699 0.415 0.678606
## wday.L:month10 -61.0507 29.5199 -2.068 0.039544 *
## wday.Q:month10 -26.2352 28.5042 -0.920 0.358153
## wday.C:month10 -32.4353 28.7249 -1.129 0.259788
## wday^4:month10 -12.2122 28.9901 -0.421 0.673890
## wday^5:month10 -27.6864 27.9072 -0.992 0.322008
## wday^6:month10 0.1234 26.8590 0.005 0.996339
## wday.L:month11 -54.9466 28.9257 -1.900 0.058512 .
## wday.Q:month11 16.0117 28.6699 0.558 0.576957
## wday.C:month11 54.9502 28.7249 1.913 0.056766 .
## wday^4:month11 47.2857 28.7914 1.642 0.101635
## wday^5:month11 -44.7401 28.5226 -1.569 0.117871
## wday^6:month11 -20.6876 28.2699 -0.732 0.464907
## wday.L:month12 -9.5058 28.8711 -0.329 0.742212
## wday.Q:month12 75.2088 28.5042 2.639 0.008791 **
## wday.C:month12 -25.0256 28.4672 -0.879 0.380097
## wday^4:month12 -23.7798 28.7814 -0.826 0.409380
## wday^5:month12 20.4470 28.0576 0.729 0.466761
## wday^6:month12 9.5864 27.8866 0.344 0.731282
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 42.05 on 281 degrees of freedom
## Multiple R-squared: 0.8355, Adjusted R-squared: 0.7869
## F-statistic: 17.2 on 83 and 281 DF, p-value: < 2.2e-16
# Since each month has only four to five weeks, each of these day of week vs month effects is the average of only four or five observations. These estimates have large standard errors and likely not generalize well beyond the data.
What would you expect the model n ~ wday + ns(date,5) to look like? Knowing what you know about the data, why would you expect it not to be particularly effective?
# Use this chunk to answer question 6
mod7 <- lm(n ~ wday + ns(date, 5), data = daily)
mod8 <- lm(n ~ wday * ns(date, 5), data = daily)
daily %>%
gather_residuals(mod7, mod8) %>%
ggplot(aes(x = date, y = resid, color = model)) +
geom_line(alpha = 0.75)
# The residuals of the model that does not interact day of week with time of year (mod7) are larger than those of the model that does (mod8). The model mod7 underestimates weekends during the summer and overestimates weekends during the autumn.
We hypothesized that people leaving on Sundays are more likely to be business travelers who need to be somewhere on Monday. Explore the hypothesis by seeing how if breaks down based on distance and time: if it’s true, you’d expect to see more Sunday evening flights to places that are far away.
# Use this chunk to answer question 7
flights %>%
mutate(date = make_date(year, month, day),
wday = wday(date, label = TRUE)) %>%
group_by(wday) %>%
summarise(dist_mean = mean(distance),
dist_median = median(distance)) %>%
ggplot(aes(y = dist_mean, x = wday)) +
geom_point()
# Sunday flights are the second longest when compared the average distances of flights by day of week. I would expect that Saturday may have the longest flights on average because regularly scheduled short flights on the weekends are very few.