daily <- flights %>%
mutate(date = make_date(year, month, day)) %>%
group_by(date) %>%
summarize(n = n())
ggplot(daily, aes(date, n)) +
geom_line()
daily <- daily %>%
mutate(wday = wday(date, label = TRUE))
ggplot(daily, aes(wday,n)) +
geom_boxplot()
mod = lm(n ~ wday, data = daily, na.action = na.warn)
grid <- daily %>%
data_grid(wday) %>%
add_predictions(mod, "n")
ggplot(daily, aes(wday, n)) +
geom_boxplot() +
geom_point(data = grid, color = "orange", size = 4)
daily <- daily %>%
add_residuals(mod)
daily %>%
ggplot(aes(date, resid)) +
geom_ref_line(h = 0) +
geom_line()
ggplot(daily, aes(date, resid, color = wday)) +
geom_ref_line(h = 0, colour = "red") +
geom_line()
daily %>%
filter(resid < -100)
## # A tibble: 11 x 4
## date n wday resid
## <date> <int> <ord> <dbl>
## 1 2013-01-01 842 Tue -109.
## 2 2013-01-20 786 Sun -105.
## 3 2013-05-26 729 Sun -162.
## 4 2013-07-04 737 Thu -229.
## 5 2013-07-05 822 Fri -145.
## 6 2013-09-01 718 Sun -173.
## 7 2013-11-28 634 Thu -332.
## 8 2013-11-29 661 Fri -306.
## 9 2013-12-24 761 Tue -190.
## 10 2013-12-25 719 Wed -244.
## 11 2013-12-31 776 Tue -175.
daily %>%
ggplot(aes(date, resid)) +
geom_ref_line(h = 0, colour = "red", size = 1) +
geom_line(color = "grey50") +
geom_smooth(se = FALSE, span = 0.20)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
daily %>%
filter(wday == "Sat") %>%
ggplot(aes(date, n)) +
geom_point()+
geom_line() +
scale_x_date(
NULL,
date_breaks = "1 month",
date_labels = "%b"
)
term <- function(date) {
cut(date,
breaks = ymd(20130101, 20130605, 20130825, 20140101),
labels = c("spring", "summer", "fall")
)
}
daily <- daily %>%
mutate(term = term(date))
daily %>%
filter(wday == "Sat") %>%
ggplot(aes(date, n, color = term)) +
geom_point(alpha = 1/3)+
geom_line() +
scale_x_date(
NULL,
date_breaks = "1 month",
date_labels = "%b"
)
daily %>%
ggplot(aes(wday, n, color = term)) +
geom_boxplot()
mod1 <- lm(n ~ wday, data = daily, na.action = na.warn)
mod2 <- lm(n ~ wday * term, data = daily, na.action = na.warn)
daily %>%
gather_residuals(without_term = mod1, with_term = mod2) %>%
ggplot(aes(date, resid, color = model)) +
geom_line(alpha = 0.75)
grid <- daily %>%
data_grid(wday, term) %>%
add_predictions(mod2, "n")
ggplot(daily, aes(wday, n)) +
geom_boxplot() +
geom_point(data = grid, color = "red") +
facet_wrap(~ term)
mod3 <- MASS::rlm(n ~ wday * term, data = daily, na.action = na.warn)
daily %>%
add_residuals(mod3, "resid") %>%
ggplot(aes(date, resid)) +
geom_hline(yintercept = 0, size = 2, color = "red") +
geom_line()
# If you are creating variables it might be a good idea to bundle the creation of the variables up into a function
compute_vars <- function(data) {
data %>%
mutate(term = term(date),
wday = wday(date, label = TRUE)
)
}
# Another option would be to put the transformations directly in the model formula:
wday2 <- function(x) wday(x, label = TRUE)
mod3 <- lm(n ~ wday2(date) * term(date), data = daily, na.action = na.warn)
# We could use a more flexible model to capture the pattern of school term in the data
library(splines)
mod <- MASS::rlm(n ~ wday * ns(date, 5), data = daily, na.action = na.warn)
daily %>%
data_grid(wday, date = seq_range(date, n = 13)) %>%
add_predictions(mod) %>%
ggplot(aes(date, pred, color = wday)) +
geom_line() +
geom_point()
# We see a strong pattern in the numbers of Sat flights. This is reassuring, because we also saw that pattern in the raw data. It's a good sign when you get the same signal from different approaches.
Why are there fewer than expected flights on January 20, May 26 and September 1? (Hint: they all have the same explanation.) How would these days generalize into another year?
# Use this chunk to answer question 1
#They are all around 2013 holidays: Martin Luther King Day, Religious holiday and Labor day. This explains why there are less people flying since all the business trips are not taking place. Given that these holidays are moving, next year they fill fall into a different calendar day, which will lead to a similar patter but with different dates
What do the three days with high positive residuals represent? How would these days generalize to another year?
# Use this chunk to answer question 2
daily %>% top_n(3, resid)
## # A tibble: 3 x 5
## date n wday resid term
## <date> <int> <ord> <dbl> <fct>
## 1 2013-11-30 857 Sat 112. fall
## 2 2013-12-01 987 Sun 95.5 fall
## 3 2013-12-28 814 Sat 69.4 fall
#They represent the fact that the model significantly underestimated the number of people traveling on those days. Given that these are floating holidays, a similar pattern would occure on different days. However, it is important to note that in that year (2013), the holidays fell to weekend days. As a result, if the holidays fall on weekdays next time, patterns might be different since in this case there is a combination of the holiday + weekend effects on travel.
Create a new variable that splits the “wday” variable into terms, but only for Saturdays, i.e., it should have Thurs, Fri, but Sat-summer, Sat-spring, Sat-fall. How does this model compare with the model with every combination of “wday” and “term”?
# Use this chunk to answer question 3
daily <- flights %>% mutate(date = make_date(year, month, day)) %>% group_by(date) %>% summarize(n = n()) %>% mutate(wday = wday(date, label = TRUE))
mod <- lm(n ~ wday, data = daily)
summary(mod)
##
## Call:
## lm(formula = n ~ wday, data = daily)
##
## Residuals:
## Min 1Q Median 3Q Max
## -331.75 -8.69 12.31 25.19 112.38
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 922.595 2.554 361.209 <2e-16 ***
## wday.L -83.322 6.765 -12.317 <2e-16 ***
## wday.Q -155.111 6.760 -22.945 <2e-16 ***
## wday.C -62.834 6.756 -9.300 <2e-16 ***
## wday^4 -80.126 6.766 -11.842 <2e-16 ***
## wday^5 -4.967 6.748 -0.736 0.4622
## wday^6 -16.934 6.751 -2.508 0.0126 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 48.8 on 358 degrees of freedom
## Multiple R-squared: 0.7178, Adjusted R-squared: 0.7131
## F-statistic: 151.8 on 6 and 358 DF, p-value: < 2.2e-16
daily <- add_residuals(daily, mod)
term <- function(date) {
cut(date,
breaks = ymd(20130101, 20130605, 20130825, 20140101),
labels = c("spring", "summer", "fall")
)
}
daily <- daily %>% mutate(term = term(date))
q3_daily <- daily %>%
mutate(wday = as.character(wday),
term_sat = ifelse(wday == "Sat", paste0(wday, "-", term), wday))
mod_1 <- MASS::rlm(n ~ term_sat, data = q3_daily)
q3_daily %>% add_residuals(mod1) %>% ggplot(aes(date, resid)) + geom_line()
mod_1
## Call:
## rlm(formula = n ~ term_sat, data = q3_daily)
## Converged in 7 iterations
##
## Coefficients:
## (Intercept) term_satMon term_satSat-fall
## 978.4105441 0.1863088 -280.1270484
## term_satSat-spring term_satSat-summer term_satSun
## -233.2025950 -177.4938775 -75.2507079
## term_satThu term_satTue term_satWed
## 1.2817145 -14.9240340 -7.3207258
##
## Degrees of freedom: 365 total; 356 residual
## Scale estimate: 21.2
Create a new “wday” variable that combines the day of week, term(for Saturdays), and public holidays. What do the residuals of the model look like?
# Use this chunk to answer question 4
q4_daily <- q3_daily %>% mutate(holidays = case_when(date %in% ymd(c(20130101, 20130121, 20130218, 20130527, 20130704, 20130902, 20131028, 20131111, 20131128, 20131225)) ~ "holiday", TRUE ~ "None")) %>% unite(new_term, term_sat, holidays)
mod_2 <- lm(n ~ new_term, data = q4_daily)
q4_daily %>% add_residuals(mod_2) %>% ggplot(aes(date, resid)) + geom_line()
What happens if you fit a day-of-week effect that varies by month (i.e.m n ~ wday*month)? Why is this not very helpful?
# Use this chunk to answer question 5
mod_2 <- lm(n ~ wday * month(date), data = q4_daily)
q4_daily %>% add_residuals(mod_2) %>% ggplot(aes(date, resid)) + geom_line()
# adding the interaction term doesn't help much. This could be driven by the fact that this term does not allow to understand the dynamics.
What would you expect the model n ~ wday + ns(date,5) to look like? Knowing what you know about the data, why would you expect it not to be particularly effective?
# Use this chunk to answer question 6
#The model would fit the data for that year but could not be genralized and applied to different years as the floating holidays would change the position of the residual peaks and, possibly, even overall patterns as floating holidays might end up overlaping other significant dates.
We hypothesized that people leaving on Sundays are more likely to be business travelers who need to be somewhere on Monday. Explore the hypothesis by seeing how if breaks down based on distance and time: if it’s true, you’d expect to see more Sunday evening flights to places that are far away.
# Use this chunk to answer question 7
flights %>% mutate(date = make_date(year, month, day),wday = wday(date, label = TRUE)) %>% ggplot(aes(y = distance, x = wday)) + geom_boxplot() + labs(x = "Day of Week", y = "Average Distance")
#The box plots seem to support this assumption. Specifically, we see that Saturday and Sunday both exhibit drips that are, on average, longer than during weekdays (see mediam in boxplots).