daily <- flights %>%
mutate(date = make_date(year, month, day)) %>%
group_by(date) %>%
summarize(n = n())
ggplot(daily, aes(date, n)) +
geom_line()
daily <- daily %>%
mutate(wday = wday(date, label = TRUE))
ggplot(daily, aes(wday,n)) +
geom_boxplot()
mod = lm(n ~ wday, data = daily)
grid <- daily %>%
data_grid(wday) %>%
add_predictions(mod, "n")
ggplot(daily, aes(wday, n)) +
geom_boxplot() +
geom_point(data = grid, color = "orange", size = 4)
daily <- daily %>%
add_residuals(mod)
daily %>%
ggplot(aes(date, resid)) +
geom_ref_line(h = 0) +
geom_line()
ggplot(daily, aes(date, resid, color = wday)) +
geom_ref_line(h = 0, colour = "red") +
geom_line()
daily %>%
filter(resid < -100)
## # A tibble: 11 x 4
## date n wday resid
## <date> <int> <ord> <dbl>
## 1 2013-01-01 842 Tue -109.
## 2 2013-01-20 786 Sun -105.
## 3 2013-05-26 729 Sun -162.
## 4 2013-07-04 737 Thu -229.
## 5 2013-07-05 822 Fri -145.
## 6 2013-09-01 718 Sun -173.
## 7 2013-11-28 634 Thu -332.
## 8 2013-11-29 661 Fri -306.
## 9 2013-12-24 761 Tue -190.
## 10 2013-12-25 719 Wed -244.
## 11 2013-12-31 776 Tue -175.
daily %>%
ggplot(aes(date, resid)) +
geom_ref_line(h = 0, colour = "red", size = 1) +
geom_line(color = "grey50") +
geom_smooth(se = FALSE, span = 0.20)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
daily %>%
filter(wday == "Sat") %>%
ggplot(aes(date, n)) +
geom_point()+
geom_line() +
scale_x_date(
NULL,
date_breaks = "1 month",
date_labels = "%b"
)
term <- function(date) {
cut(date,
breaks = ymd(20130101, 20130605, 20130825, 20140101),
labels = c("spring", "summer", "fall")
)
}
daily <- daily %>%
mutate(term = term(date))
daily %>%
filter(wday == "Sat") %>%
ggplot(aes(date, n, color = term)) +
geom_point(alpha = 1/3)+
geom_line() +
scale_x_date(
NULL,
date_breaks = "1 month",
date_labels = "%b"
)
daily %>%
ggplot(aes(wday, n, color = term)) +
geom_boxplot()
mod1 <- lm(n ~ wday, data = daily)
mod2 <- lm(n ~ wday * term, data = daily)
daily %>%
gather_residuals(without_term = mod1, with_term = mod2) %>%
ggplot(aes(date, resid, color = model)) +
geom_line(alpha = 0.75)
grid <- daily %>%
data_grid(wday, term) %>%
add_predictions(mod2, "n")
ggplot(daily, aes(wday, n)) +
geom_boxplot() +
geom_point(data = grid, color = "red") +
facet_wrap(~ term)
mod3 <- MASS::rlm(n ~ wday * term, data = daily)
daily %>%
add_residuals(mod3, "resid") %>%
ggplot(aes(date, resid)) +
geom_hline(yintercept = 0, size = 2, color = "red") +
geom_line()
# If you are creating variables it might be a good idea to bundle the creation of the variables up into a function
compute_vars <- function(data) {
data %>%
mutate(term = term(date),
wday = wday(date, label = TRUE)
)
}
# Another option would be to put the transformations directly in the model formula:
wday2 <- function(x) wday(x, label = TRUE)
mod3 <- lm(n ~ wday2(date) * term(date), data = daily)
# We could use a more flexible model to capture the pattern of school term in the data
library(splines)
mod <- MASS::rlm(n ~ wday * ns(date, 5), data = daily)
daily %>%
data_grid(wday, date = seq_range(date, n = 13)) %>%
add_predictions(mod) %>%
ggplot(aes(date, pred, color = wday)) +
geom_line() +
geom_point()
# We see a strong pattern in the numbers of Sat flights. This is reassuring, because we also saw that pattern in the raw data. It's a good sign when you get the same signal from different approaches.
Why are there fewer than expected flights on January 20, May 26 and September 1? (Hint: they all have the same explanation.) How would these days generalize into another year?
daily %>%
filter(date %in% as.Date(c("2013-01-20","2013-05-26","2013-09-01"))) %>%
mutate(holiday = c("Martin Luther King Jr. Day", "Memorial Day", "Labor Day")) %>%
kable() %>% kable_styling()
| date | n | wday | resid | term | holiday |
|---|---|---|---|---|---|
| 2013-01-20 | 786 | Sun | -105.4808 | spring | Martin Luther King Jr. Day |
| 2013-05-26 | 729 | Sun | -162.4808 | spring | Memorial Day |
| 2013-09-01 | 718 | Sun | -173.4808 | fall | Labor Day |
Ans : Because these dates are the sundays prior to a public holiday. The holidays are Martin Luther King Jr. Day, Memorial Day, and Labor Day. The corresponding Sundays in 2014 will be January 19, May 25 and August 31.
What do the three days with high positive residuals represent? How would these days generalize to another year?
daily %>%
top_n(3, resid) %>%
mutate(holiday = c("Thanksgiving (United States)", "Thanksgiving (United States)", "Hanukkah, Christmas, New Year")) %>%
kable() %>% kable_styling()
| date | n | wday | resid | term | holiday |
|---|---|---|---|---|---|
| 2013-11-30 | 857 | Sat | 112.38462 | fall | Thanksgiving (United States) |
| 2013-12-01 | 987 | Sun | 95.51923 | fall | Thanksgiving (United States) |
| 2013-12-28 | 814 | Sat | 69.38462 | fall | Hanukkah, Christmas, New Year |
Ans : These are the Saturdays prior to public hodlidays, such as Thanksgiving Day in the United States , Christmas Day or Hanukkah, and New Year’s Day. Positive residuals represent higher demand. It makes sense since people tend to travel around popular holidays, especially on the holidays that feature family gathering. Same trend would observed in 2014. ## Question #3
Create a new variable that splits the “wday” variable into terms, but only for Saturdays, i.e., it should have Thurs, Fri, but Sat-summer, Sat-spring, Sat-fall. How does this model compare with the model with every combination of “wday” and “term”?
daily <- daily %>%
mutate(wday2 = case_when(wday == "Sat" & term == "spring" ~ "Sat-spring",
wday == "Sat" & term == "summer" ~ "Sat-summer",
wday == "Sat" & term == "fall" ~ "Sat-fall",
T ~ as.character(wday)))
modQ3 <- lm(n ~wday2, data = daily)
daily %>%
spread_residuals(sat_term = modQ3, all_interact = mod2) %>%
mutate(residdiff = sat_term - all_interact) %>%
ggplot(aes(date, residdiff)) +
geom_ref_line(h = 0, colour = "red") +
geom_line(alpha = 0.75) +
labs(x="Date", y="Residuals difference") + ggtitle("Line chart")
options(scipen=100,digits=4)
rbind(broom::glance(modQ3), broom::glance(mod2)) %>%
mutate(model = c("with Saturday and term", "with all interactions")) %>%
kable() %>% kable_styling()
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | model |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.7357 | 0.7297 | 47.36 | 123.84 | 0 | 9 | -1921 | 3863 | 3902 | 798487 | 356 | with Saturday and term |
| 0.7573 | 0.7432 | 46.17 | 53.67 | 0 | 21 | -1906 | 3856 | 3942 | 733157 | 344 | with all interactions |
Ans: Compare with the model every combination of “wday” and “term”, new model (Line chart) shows lower residuals in Spring and higher residuals in Summer.
Table shows that the model (model with Saturday and term) has lower adjusted R-squared and the higher standard error of the regression, but with slightly higher AIC than the model with all interactions.
Create a new “wday” variable that combines the day of week, term(for Saturdays), and public holidays. What do the residuals of the model look like?
holidays <- ymd(20130101, 20130120, 20130526, 20130704, 20130705, 20130901, 20131128, 20131129, 20131130, 20131201, 20131224, 20131225, 20131231)
# Federal holidays in the United States in 2013 are available at https://www.officeholidays.com/countries/usa/2013.php.
daily <- daily %>%
mutate(wday3 = case_when(date %in% holidays ~ "holidays",
T ~ as.character(wday2)))
modQ4 <- lm(n ~ wday3, data = daily)
daily <- daily %>%
add_residuals(modQ4, "residQ4")
ggplot(daily, aes(date, residQ4)) +
geom_ref_line(h = 0, colour = "red", size = 1) +
geom_line() +
labs(x="Date", y="Residuals") + ggtitle("Line chart 11")
Ans: As indicated in the figures, the line remains roughly the same as before. However, the absolute values of the extreme residuals slightly narrow down. The model with Saturday, term and holidays still predict more flights in January and February, and less flights in July. And the number of flights fluctuates during Thanksgiving and Christmas holidays.
What happens if you fit a day-of-week effect that varies by month (i.e.m n ~ wday*month)? Why is this not very helpful?
dummy <- daily %>%
mutate(month = month(date, label=T))
modQ5 <- lm(n ~ wday * month, data = dummy)
dummy %>%
add_residuals(modQ5, "residQ5") %>%
ggplot(aes(date, residQ5)) +
geom_ref_line(h = 0, colour = "red", size = 1) +
geom_line() +
labs(x="Date", y="Residuals") + ggtitle("Line chart 12")
options(scipen=100,digits=4)
broom::glance(modQ5) %>% kable() %>% kable_styling()
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual |
|---|---|---|---|---|---|---|---|---|---|---|
| 0.8355 | 0.7869 | 42.05 | 17.2 | 0 | 84 | -1835 | 3840 | 4171 | 496839 | 281 |
Ans: Above charts show higher volatility in high travel season - July, November and December, but lower volatility in January.The sample size for each day of the week in a month is either four or five, which may not be enough to build a robust regression. Day-of-week is not a significant explanatory variable to flights.
What would you expect the model n ~ wday + ns(date,5) to look like? Knowing what you know about the data, why would you expect it not to be particularly effective?
modQ6 <- lm(n ~ wday + splines::ns(date, df = 5), data = daily)
daily %>%
add_residuals(modQ6, "residQ6") %>%
ggplot(aes(date, residQ6)) +
geom_ref_line(h = 0, colour = "red", size = 1) +
geom_line() +
labs(x="Date", y="Residuals") + ggtitle("Line chart 13")
options(scipen=100,digits=4)
broom::glance(modQ6) %>% kable() %>% kable_styling()
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual |
|---|---|---|---|---|---|---|---|---|---|---|
| 0.7816 | 0.7748 | 43.23 | 114.9 | 0 | 12 | -1887 | 3799 | 3850 | 659694 | 353 |
Ans: As shown in the second part of EDA, weekdays (through Monday to Friday) have similar flights distribution. In the third part of EDA, it shows that holidays, not a weekday trend across months, is more significant.
We hypothesized that people leaving on Sundays are more likely to be business travelers who need to be somewhere on Monday. Explore the hypothesis by seeing how if breaks down based on distance and time: if it’s true, you’d expect to see more Sunday evening flights to places that are far away.
datQ7 <- flights %>%
mutate(date = make_date(year, month, day),
wday = wday(date, label = T)) %>%
select(date, wday, distance, air_time, hour) %>%
filter(hour != 1) #omit the single one record of 1 AM flight
datQ7 %>%
ggplot(aes(wday, distance)) + geom_boxplot() +
labs(x="Week", y="Traveling distance (miles)") + ggtitle("Boxplot 5")
datQ7 %>%
group_by(wday) %>%
summarise("Mean distance (miles)" = round(mean(distance)),
"Median distance (miles)" = median(distance),
"Mean duration (minutes)" = round(mean(air_time, na.rm = T)),
"Median duration (minutes)" = median(air_time, na.rm = T)) %>%
rename(Week = wday) %>%
kable() %>% kable_styling()
| Week | Mean distance (miles) | Median distance (miles) | Mean duration (minutes) | Median duration (minutes) |
|---|---|---|---|---|
| Sun | 1055 | 937 | 152 | 130 |
| Mon | 1032 | 828 | 151 | 129 |
| Tue | 1027 | 820 | 150 | 128 |
| Wed | 1028 | 820 | 150 | 128 |
| Thu | 1033 | 828 | 150 | 128 |
| Fri | 1033 | 828 | 150 | 127 |
| Sat | 1081 | 944 | 154 | 134 |
datQ7 %>%
group_by(wday, hour) %>%
summarise(distMedian = median(distance)) %>%
reshape2::acast(wday~hour, value.var="distMedian") %>%
kable() %>% kable_styling()
| 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Sun | 1400 | 963 | 1041 | 1005 | 950 | 1017.0 | 820 | 746 | 762 | 762 | 765 | 764 | 1008 | 944 | 944 | 762 | 719 | 266 | 1598 |
| Mon | 1400 | 762 | 1035 | 872 | 944 | 872.0 | 888 | 746 | 762 | 762 | 765 | 762 | 950 | 997 | 937 | 762 | 719 | 266 | 1598 |
| Tue | 1400 | 760 | 1035 | 872 | 944 | 872.0 | 888 | 746 | 762 | 762 | 765 | 762 | 950 | 944 | 944 | 740 | 642 | 266 | 1598 |
| Wed | 1400 | 760 | 1035 | 833 | 937 | 872.0 | 888 | 746 | 762 | 762 | 764 | 762 | 1008 | 937 | 937 | 753 | 642 | 266 | 1598 |
| Thu | 1400 | 760 | 1035 | 872 | 944 | 872.0 | 888 | 746 | 762 | 762 | 765 | 764 | 1008 | 944 | 944 | 762 | 719 | 266 | 1598 |
| Fri | 1400 | 760 | 1035 | 872 | 944 | 809.5 | 888 | 746 | 762 | 762 | 765 | 764 | 950 | 944 | 937 | 762 | 733 | 266 | 1598 |
| Sat | 1400 | 937 | 1035 | 944 | 950 | 1028.0 | 937 | 738 | 937 | 764 | 888 | 944 | 1065 | 1020 | 1005 | 1023 | 937 | 266 | 1598 |
Ans: Above analysis show that traveling distances in each day of the week are similar, though 100 miles more median distances are on Saturday and Sunday. Saturday travel also appears to have longer mean/median distance than Sunday travel. As for the hypothesis that business trips take off on Sunday evenings, median distances are the same at 10 PM and 11 PM whichever day of the week it is. Thus the hypothesis appears to be untrue.