daily <- flights %>%
mutate(date = make_date(year, month, day)) %>%
group_by(date) %>%
summarize(n = n())
ggplot(daily, aes(date, n)) +
geom_line()
daily <- daily %>%
mutate(wday = wday(date, label = TRUE))
ggplot(daily, aes(wday,n)) +
geom_boxplot()
mod = lm(n ~ wday, data = daily)
grid <- daily %>%
data_grid(wday) %>%
add_predictions(mod, "n")
ggplot(daily, aes(wday, n)) +
geom_boxplot() +
geom_point(data = grid, color = "orange", size = 4)
daily <- daily %>%
add_residuals(mod)
daily %>%
ggplot(aes(date, resid)) +
geom_ref_line(h = 0) +
geom_line()
ggplot(daily, aes(date, resid, color = wday)) +
geom_ref_line(h = 0, colour = "red") +
geom_line()
daily %>%
filter(resid < -100)
## # A tibble: 11 x 4
## date n wday resid
## <date> <int> <ord> <dbl>
## 1 2013-01-01 842 Tue -109.
## 2 2013-01-20 786 Sun -105.
## 3 2013-05-26 729 Sun -162.
## 4 2013-07-04 737 Thu -229.
## 5 2013-07-05 822 Fri -145.
## 6 2013-09-01 718 Sun -173.
## 7 2013-11-28 634 Thu -332.
## 8 2013-11-29 661 Fri -306.
## 9 2013-12-24 761 Tue -190.
## 10 2013-12-25 719 Wed -244.
## 11 2013-12-31 776 Tue -175.
daily %>%
ggplot(aes(date, resid)) +
geom_ref_line(h = 0, colour = "red", size = 1) +
geom_line(color = "grey50") +
geom_smooth(se = FALSE, span = 0.20)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
daily %>%
filter(wday == "Sat") %>%
ggplot(aes(date, n)) +
geom_point()+
geom_line() +
scale_x_date(
NULL,
date_breaks = "1 month",
date_labels = "%b"
)
term <- function(date) {
cut(date,
breaks = ymd(20130101, 20130605, 20130825, 20140101),
labels = c("spring", "summer", "fall")
)
}
daily <- daily %>%
mutate(term = term(date))
daily %>%
filter(wday == "Sat") %>%
ggplot(aes(date, n, color = term)) +
geom_point(alpha = 1/3)+
geom_line() +
scale_x_date(
NULL,
date_breaks = "1 month",
date_labels = "%b"
)
daily %>%
ggplot(aes(wday, n, color = term)) +
geom_boxplot()
mod1 <- lm(n ~ wday, data = daily)
mod2 <- lm(n ~ wday * term, data = daily)
daily %>%
gather_residuals(without_term = mod1, with_term = mod2) %>%
ggplot(aes(date, resid, color = model)) +
geom_line(alpha = 0.75)
grid <- daily %>%
data_grid(wday, term) %>%
add_predictions(mod2, "n")
ggplot(daily, aes(wday, n)) +
geom_boxplot() +
geom_point(data = grid, color = "red") +
facet_wrap(~ term)
mod3 <- MASS::rlm(n ~ wday * term, data = daily)
daily %>%
add_residuals(mod3, "resid") %>%
ggplot(aes(date, resid)) +
geom_hline(yintercept = 0, size = 2, color = "red") +
geom_line()
# If you are creating variables it might be a good idea to bundle the creation of the variables up into a function
compute_vars <- function(data) {
data %>%
mutate(term = term(date),
wday = wday(date, label = TRUE)
)
}
# Another option would be to put the transformations directly in the model formula:
wday2 <- function(x) wday(x, label = TRUE)
mod3 <- lm(n ~ wday2(date) * term(date), data = daily)
# We could use a more flexible model to capture the pattern of school term in the data
library(splines)
mod <- MASS::rlm(n ~ wday * ns(date, 5), data = daily)
daily %>%
data_grid(wday, date = seq_range(date, n = 13)) %>%
add_predictions(mod) %>%
ggplot(aes(date, pred, color = wday)) +
geom_line() +
geom_point()
# We see a strong pattern in the numbers of Sat flights. This is reassuring, because we also saw that pattern in the raw data. It's a good sign when you get the same signal from different approaches.
Why are there fewer than expected flights on January 20, May 26 and September 1? (Hint: they all have the same explanation.) How would these days generalize into another year?
Dates <- c("Jan 20", "May 26", "Sept 1")
Year <- 2013:2018
map(Year, ~wday(ymd(paste0(.x,Dates, sep = "")), label = TRUE))
## [[1]]
## [1] Sun Sun Sun
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
##
## [[2]]
## [1] Mon Mon Mon
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
##
## [[3]]
## [1] Tue Tue Tue
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
##
## [[4]]
## [1] Wed Thu Thu
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
##
## [[5]]
## [1] Fri Fri Fri
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
##
## [[6]]
## [1] Sat Sat Sat
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
Because those days are Sundays before the Monday holidays.
What do the three days with high positive residuals represent? How would these days generalize to another year?
daily %>%
top_n(3, resid)
## # A tibble: 3 x 5
## date n wday resid term
## <date> <int> <ord> <dbl> <fct>
## 1 2013-11-30 857 Sat 112. fall
## 2 2013-12-01 987 Sun 95.5 fall
## 3 2013-12-28 814 Sat 69.4 fall
Those three days are 1) Saturday after Thanksgiving; 2) Sunday after Thanksgiving; 3)Saturday after Christamas (12/28/2013).
Create a new variable that splits the “wday” variable into terms, but only for Saturdays, i.e., it should have Thurs, Fri, but Sat-summer, Sat-spring, Sat-fall. How does this model compare with the model with every combination of “wday” and “term”?
library(broom)
##
## Attaching package: 'broom'
## The following object is masked from 'package:modelr':
##
## bootstrap
daily <- daily %>%
mutate(
wday2 =
case_when(
wday == "Sat" & term == "summer" ~ "Sat-summer",
wday == "Sat" & term == "fall" ~ "Sat-fall",
wday == "Sat" & term == "spring" ~ "Sat-spring",
TRUE ~ as.character(wday)
)
)
mod4 <- lm(n ~ wday2, data = daily)
daily %>%
gather_residuals(sat_term = mod4, all_interact = mod2) %>%
ggplot(aes(date, resid, colour = model)) +
geom_line(alpha = 0.75)
daily %>%
spread_residuals(sat_term = mod4, all_interact = mod3) %>%
mutate(resid_diff = sat_term - all_interact) %>%
ggplot(aes(date, resid_diff)) +
geom_line(alpha = 0.75)
glance(mod4) %>% select(r.squared, sigma, AIC, df)
## # A tibble: 1 x 4
## r.squared sigma AIC df
## <dbl> <dbl> <dbl> <int>
## 1 0.736 47.4 3863. 9
glance(mod2) %>% select(r.squared, sigma, AIC, df)
## # A tibble: 1 x 4
## r.squared sigma AIC df
## <dbl> <dbl> <dbl> <int>
## 1 0.757 46.2 3856. 21
we can tell that the model with Saturday and Term performs better than other model.
Create a new “wday” variable that combines the day of week, term(for Saturdays), and public holidays. What do the residuals of the model look like?
daily <- daily %>%
mutate(wday3 =
case_when(
.$date %in% lubridate::ymd(c(20130101,
20130121,
20130218,
20130527,
20130704,
20130902,
20131028,
20131111,
20131128,
20131225)) ~
"holiday",
.$wday == "Sat" & .$term == "summer" ~ "Sat-summer",
.$wday == "Sat" & .$ term == "fall" ~ "Sat-fall",
.$wday == "Sat" & .$term == "spring" ~ "Sat-spring",
TRUE ~ as.character(.$wday)))
mod5 <- lm(n ~ wday3, data = daily)
daily %>%
spread_residuals(mod5) %>%
arrange(desc(abs(resid))) %>%
slice(1:20) %>% select(date, wday, resid)
## # A tibble: 20 x 3
## date wday resid
## <date> <ord> <dbl>
## 1 2013-11-28 Thu -332.
## 2 2013-11-29 Fri -306.
## 3 2013-12-25 Wed -244.
## 4 2013-07-04 Thu -229.
## 5 2013-12-24 Tue -190.
## 6 2013-12-31 Tue -175.
## 7 2013-09-01 Sun -173.
## 8 2013-05-26 Sun -162.
## 9 2013-07-05 Fri -145.
## 10 2013-11-30 Sat 112.
## 11 2013-01-01 Tue -109.
## 12 2013-01-20 Sun -105.
## 13 2013-12-01 Sun 95.5
## 14 2013-02-03 Sun -77.5
## 15 2013-01-19 Sat -70.6
## 16 2013-12-28 Sat 69.4
## 17 2013-10-12 Sat -68.6
## 18 2013-01-27 Sun -68.5
## 19 2013-06-22 Sat 67.4
## 20 2013-06-29 Sat 67.4
What happens if you fit a day-of-week effect that varies by month (i.e.m n ~ wday*month)? Why is this not very helpful?
daily <- mutate(daily, month = factor(lubridate::month(date)))
mod6 <- lm(n ~ wday * month, data = daily)
summary(mod6)
##
## Call:
## lm(formula = n ~ wday * month, data = daily)
##
## Residuals:
## Min 1Q Median 3Q Max
## -269.2 -5.0 1.5 8.8 113.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 867.4000 7.5983 114.157 < 2e-16 ***
## wday.L -64.0744 20.8737 -3.070 0.002353 **
## wday.Q -165.6001 20.1555 -8.216 7.77e-15 ***
## wday.C -68.2591 20.3115 -3.361 0.000885 ***
## wday^4 -92.0814 20.4991 -4.492 1.03e-05 ***
## wday^5 9.7925 19.7334 0.496 0.620111
## wday^6 -20.4376 18.9922 -1.076 0.282802
## month2 23.7071 10.9946 2.156 0.031912 *
## month3 67.8857 10.7456 6.318 1.04e-09 ***
## month4 74.5929 10.8292 6.888 3.70e-11 ***
## month5 56.2786 10.7456 5.237 3.20e-07 ***
## month6 80.3071 10.8292 7.416 1.43e-12 ***
## month7 77.1143 10.7456 7.176 6.39e-12 ***
## month8 81.6357 10.7456 7.597 4.52e-13 ***
## month9 51.3714 10.8292 4.744 3.34e-06 ***
## month10 60.1357 10.7456 5.596 5.20e-08 ***
## month11 46.9143 10.8292 4.332 2.06e-05 ***
## month12 38.7786 10.7456 3.609 0.000364 ***
## wday.L:month2 -3.7230 29.6267 -0.126 0.900089
## wday.Q:month2 -3.8188 29.1251 -0.131 0.895776
## wday.C:month2 0.4899 29.2333 0.017 0.986641
## wday^4:month2 4.5690 29.3639 0.156 0.876460
## wday^5:month2 -4.2552 28.8346 -0.148 0.882784
## wday^6:month2 12.0570 28.3325 0.426 0.670760
## wday.L:month3 -14.5705 28.4302 -0.513 0.608703
## wday.Q:month3 15.4389 28.2073 0.547 0.584581
## wday.C:month3 8.2262 28.4672 0.289 0.772817
## wday^4:month3 22.7202 28.7015 0.792 0.429261
## wday^5:month3 -15.3298 28.5042 -0.538 0.591135
## wday^6:month3 11.3727 28.2682 0.402 0.687759
## wday.L:month4 -16.6682 29.3590 -0.568 0.570667
## wday.Q:month4 10.7254 28.9620 0.370 0.711418
## wday.C:month4 -0.2449 28.7249 -0.009 0.993202
## wday^4:month4 23.2883 28.8711 0.807 0.420561
## wday^5:month4 -17.8720 28.0764 -0.637 0.524935
## wday^6:month4 5.3524 27.8883 0.192 0.847940
## wday.L:month5 3.6663 29.3590 0.125 0.900711
## wday.Q:month5 -20.6652 28.6699 -0.721 0.471632
## wday.C:month5 4.6336 28.7249 0.161 0.871965
## wday^4:month5 5.9994 28.5109 0.210 0.833490
## wday^5:month5 -16.9119 28.0764 -0.602 0.547425
## wday^6:month5 12.7643 27.1935 0.469 0.639158
## wday.L:month6 -4.5261 28.6515 -0.158 0.874593
## wday.Q:month6 23.8130 28.2073 0.844 0.399267
## wday.C:month6 13.7580 28.7249 0.479 0.632342
## wday^4:month6 24.1183 29.1875 0.826 0.409322
## wday^5:month6 -17.6484 28.7981 -0.613 0.540483
## wday^6:month6 10.5256 28.3291 0.372 0.710510
## wday.L:month7 -28.7914 29.3590 -0.981 0.327601
## wday.Q:month7 49.5846 28.6699 1.730 0.084818 .
## wday.C:month7 54.5011 28.7249 1.897 0.058807 .
## wday^4:month7 50.8474 28.5109 1.783 0.075594 .
## wday^5:month7 -33.6983 28.0764 -1.200 0.231058
## wday^6:month7 -13.8943 27.1935 -0.511 0.609793
## wday.L:month8 -20.4479 28.8711 -0.708 0.479378
## wday.Q:month8 6.7648 28.5042 0.237 0.812578
## wday.C:month8 6.0012 28.4672 0.211 0.833186
## wday^4:month8 19.0738 28.7814 0.663 0.508058
## wday^5:month8 -19.3123 28.0576 -0.688 0.491827
## wday^6:month8 9.5074 27.8866 0.341 0.733410
## wday.L:month9 -30.3411 28.9257 -1.049 0.295110
## wday.Q:month9 -42.0342 28.6699 -1.466 0.143726
## wday.C:month9 -20.7186 28.7249 -0.721 0.471338
## wday^4:month9 -20.3752 28.7914 -0.708 0.479728
## wday^5:month9 -18.2376 28.5226 -0.639 0.523079
## wday^6:month9 11.7263 28.2699 0.415 0.678606
## wday.L:month10 -61.0507 29.5199 -2.068 0.039544 *
## wday.Q:month10 -26.2352 28.5042 -0.920 0.358153
## wday.C:month10 -32.4353 28.7249 -1.129 0.259788
## wday^4:month10 -12.2122 28.9901 -0.421 0.673890
## wday^5:month10 -27.6864 27.9072 -0.992 0.322008
## wday^6:month10 0.1234 26.8590 0.005 0.996339
## wday.L:month11 -54.9466 28.9257 -1.900 0.058512 .
## wday.Q:month11 16.0117 28.6699 0.558 0.576957
## wday.C:month11 54.9502 28.7249 1.913 0.056766 .
## wday^4:month11 47.2857 28.7914 1.642 0.101635
## wday^5:month11 -44.7401 28.5226 -1.569 0.117871
## wday^6:month11 -20.6876 28.2699 -0.732 0.464907
## wday.L:month12 -9.5058 28.8711 -0.329 0.742212
## wday.Q:month12 75.2088 28.5042 2.639 0.008791 **
## wday.C:month12 -25.0256 28.4672 -0.879 0.380097
## wday^4:month12 -23.7798 28.7814 -0.826 0.409380
## wday^5:month12 20.4470 28.0576 0.729 0.466761
## wday^6:month12 9.5864 27.8866 0.344 0.731282
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 42.05 on 281 degrees of freedom
## Multiple R-squared: 0.8355, Adjusted R-squared: 0.7869
## F-statistic: 17.2 on 83 and 281 DF, p-value: < 2.2e-16
Because it is really hard to generate a pattern.
What would you expect the model n ~ wday + ns(date,5) to look like? Knowing what you know about the data, why would you expect it not to be particularly effective?
mod7 <- lm(n ~ wday + ns(date, 5), data = daily)
mod8 <- lm(n ~ wday * ns(date, 5), data = daily)
daily %>%
gather_residuals(mod7, mod8) %>%
ggplot(aes(x = date, y = resid, color = model)) +
geom_line(alpha = 0.75)
The residuals of mod7 are larger than mod8. mod7 underestimates weekends during the summer and overestimates weekends during the autumn.
We hypothesized that people leaving on Sundays are more likely to be business travelers who need to be somewhere on Monday. Explore the hypothesis by seeing how if breaks down based on distance and time: if it’s true, you’d expect to see more Sunday evening flights to places that are far away.
flights %>%
mutate(date = make_date(year, month, day),
wday = wday(date, label = TRUE)) %>%
group_by(wday) %>%
summarise(dist_mean = mean(distance),
dist_median = median(distance)) %>%
ggplot(aes(y = dist_mean, x = wday)) +
geom_point()