daily <- flights %>%
mutate(date = make_date(year, month, day)) %>%
group_by(date) %>%
summarize(n = n())
ggplot(daily, aes(date, n)) +
geom_line()
daily <- daily %>%
mutate(wday = wday(date, label = TRUE))
ggplot(daily, aes(wday,n)) +
geom_boxplot()
mod = lm(n ~ wday, data = daily)
grid <- daily %>%
data_grid(wday) %>%
add_predictions(mod, "n")
ggplot(daily, aes(wday, n)) +
geom_boxplot() +
geom_point(data = grid, color = "orange", size = 4)
daily <- daily %>%
add_residuals(mod)
daily %>%
ggplot(aes(date, resid)) +
geom_ref_line(h = 0) +
geom_line()
ggplot(daily, aes(date, resid, color = wday)) +
geom_ref_line(h = 0, colour = "red") +
geom_line()
daily %>%
filter(resid < -100)
## # A tibble: 11 x 4
## date n wday resid
## <date> <int> <ord> <dbl>
## 1 2013-01-01 842 Tue -109.
## 2 2013-01-20 786 Sun -105.
## 3 2013-05-26 729 Sun -162.
## 4 2013-07-04 737 Thu -229.
## 5 2013-07-05 822 Fri -145.
## 6 2013-09-01 718 Sun -173.
## 7 2013-11-28 634 Thu -332.
## 8 2013-11-29 661 Fri -306.
## 9 2013-12-24 761 Tue -190.
## 10 2013-12-25 719 Wed -244.
## 11 2013-12-31 776 Tue -175.
daily %>%
ggplot(aes(date, resid)) +
geom_ref_line(h = 0, colour = "red", size = 1) +
geom_line(color = "grey50") +
geom_smooth(se = FALSE, span = 0.20)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
daily %>%
filter(wday == "Sat") %>%
ggplot(aes(date, n)) +
geom_point()+
geom_line() +
scale_x_date(
NULL,
date_breaks = "1 month",
date_labels = "%b"
)
term <- function(date) {
cut(date,
breaks = ymd(20130101, 20130605, 20130825, 20140101),
labels = c("spring", "summer", "fall")
)
}
daily <- daily %>%
mutate(term = term(date))
daily %>%
filter(wday == "Sat") %>%
ggplot(aes(date, n, color = term)) +
geom_point(alpha = 1/3)+
geom_line() +
scale_x_date(
NULL,
date_breaks = "1 month",
date_labels = "%b"
)
daily %>%
ggplot(aes(wday, n, color = term)) +
geom_boxplot()
mod1 <- lm(n ~ wday, data = daily)
mod2 <- lm(n ~ wday * term, data = daily)
daily %>%
gather_residuals(without_term = mod1, with_term = mod2) %>%
ggplot(aes(date, resid, color = model)) +
geom_line(alpha = 0.75)
grid <- daily %>%
data_grid(wday, term) %>%
add_predictions(mod2, "n")
ggplot(daily, aes(wday, n)) +
geom_boxplot() +
geom_point(data = grid, color = "red") +
facet_wrap(~ term)
mod3 <- MASS::rlm(n ~ wday * term, data = daily)
daily %>%
add_residuals(mod3, "resid") %>%
ggplot(aes(date, resid)) +
geom_hline(yintercept = 0, size = 2, color = "red") +
geom_line()
# If you are creating variables it might be a good idea to bundle the creation of the variables up into a function
compute_vars <- function(data) {
data %>%
mutate(term = term(date),
wday = wday(date, label = TRUE)
)
}
# Another option would be to put the transformations directly in the model formula:
wday2 <- function(x) wday(x, label = TRUE)
mod3 <- lm(n ~ wday2(date) * term(date), data = daily)
# We could use a more flexible model to capture the pattern of school term in the data
library(splines)
mod <- MASS::rlm(n ~ wday * ns(date, 5), data = daily)
daily %>%
data_grid(wday, date = seq_range(date, n = 13)) %>%
add_predictions(mod) %>%
ggplot(aes(date, pred, color = wday)) +
geom_line() +
geom_point()
# We see a strong pattern in the numbers of Sat flights. This is reassuring, because we also saw that pattern in the raw data. It's a good sign when you get the same signal from different approaches.
Why are there fewer than expected flights on January 20, May 26 and September 1? (Hint: they all have the same explanation.) How would these days generalize into another year?
Jan 21 is Martin Luther King Jr. Day, May 26 is Trinity Sunday, and Sep 2 is labot day. Holidays affected number of flights. They might end up with diffierent days in another year so the impact would be uncertain.
What do the three days with high positive residuals represent? How would these days generalize to another year?
daily %>%
top_n(3, resid)
## # A tibble: 3 x 5
## date n wday resid term
## <date> <int> <ord> <dbl> <fct>
## 1 2013-11-30 857 Sat 112. fall
## 2 2013-12-01 987 Sun 95.5 fall
## 3 2013-12-28 814 Sat 69.4 fall
Those are weekends so that the model is likely to underestimate the number of flights. Again they may end up with a weekday in another year so that the impact would be uncertain.
Create a new variable that splits the “wday” variable into terms, but only for Saturdays, i.e., it should have Thurs, Fri, but Sat-summer, Sat-spring, Sat-fall. How does this model compare with the model with every combination of “wday” and “term”?
daily <-
flights %>%
mutate(date = make_date(year, month, day)) %>%
group_by(date) %>%
summarize(n = n()) %>%
mutate(wday = wday(date, label = TRUE))
mod <- lm(n ~ wday, data = daily)
daily <- add_residuals(daily, mod)
term <- function(date) {
cut(date,
breaks = ymd(20130101, 20130501, 20130901, 20140101),
labels = c("spring", "summer", "fall")
)
}
daily <-
daily %>%
mutate(term = term(date))
###
new_daily <-
daily %>%
mutate(wday = as.character(wday),
term_sat = ifelse(wday == "Sat", paste0(wday, "-", term), wday))
mod1 <- MASS::rlm(n ~ term_sat, data = new_daily)
new_daily %>%
add_residuals(mod1) %>%
ggplot(aes(date, resid)) +
geom_line()
Observation indicates that Sat-spring period shows a sign of underestimation. Also, the standard deviation of Sat-fall is quite siginificant.
Create a new “wday” variable that combines the day of week, term(for Saturdays), and public holidays. What do the residuals of the model look like?
daily_holidays <-
new_daily %>%
mutate(holidays = case_when(date %in% ymd(c(20130101, # new year day
20130121, # MLK
20130218, # presidents day
20130527, # memorial day
20130704, # independence day
20130902, # labor day
20131028, # columbus day
20131111, # veterans day
20131128, # thanksgiving
20131225)) ~ "holiday",
TRUE ~ "None")) %>%
unite(new_term, term_sat, holidays)
mod2 <- lm(n ~ new_term, data = daily_holidays)
daily_holidays %>%
add_residuals(mod2) %>%
ggplot(aes(date, resid)) +
geom_line()
Hodidays seems to have some impact on the residual error but not as significant as expected.
What happens if you fit a day-of-week effect that varies by month (i.e.m n ~ wday*month)? Why is this not very helpful?
daily <- flights %>%
mutate(date = make_date(year, month, day)) %>%
group_by(date,month) %>%
summarise(n = n())
daily <- daily %>%
mutate(wday = wday(date, label = TRUE)) %>%
mutate(term = term(date))
mod2 <- lm(n ~ wday * term, data = daily)
mod4 <- lm(n~ wday * month, data = daily)
daily %>%
gather_residuals(mod4,mod2)%>%
arrange(date)%>%
ggplot(aes(date,resid,color = model))+
geom_line(alpha = 0.9)
It is not working because it is reducing the number of observations per bucket and therefore increases the odds of having outliers.
What would you expect the model n ~ wday + ns(date,5) to look like? Knowing what you know about the data, why would you expect it not to be particularly effective?
I expect the model would have significant residual errors because the days are not consistant in different years.
We hypothesized that people leaving on Sundays are more likely to be business travelers who need to be somewhere on Monday. Explore the hypothesis by seeing how if breaks down based on distance and time: if it’s true, you’d expect to see more Sunday evening flights to places that are far away.
week_relevel <- function(x) {
fct_relevel(x, "Sun", after = 7) # after 7 means night flights.
}
daily %>%
mutate(wday = week_relevel(wday)) %>%
ggplot(aes(wday, n)) + ggtitle("Sunday Night Flights") +
geom_boxplot()
The boxplot indicates there’s no such a trend that people tend to do long distance night flights on Sundays.