daily <- flights %>%
mutate(date = make_date(year, month, day)) %>%
group_by(date) %>%
summarize(n = n())
ggplot(daily, aes(date, n)) +
geom_line()
daily <- daily %>%
mutate(wday = wday(date, label = TRUE))
ggplot(daily, aes(wday,n)) +
geom_boxplot()
mod = lm(n ~ wday, data = daily)
grid <- daily %>%
data_grid(wday) %>%
add_predictions(mod, "n")
ggplot(daily, aes(wday, n)) +
geom_boxplot() +
geom_point(data = grid, color = "orange", size = 4)
daily <- daily %>%
add_residuals(mod)
daily %>%
ggplot(aes(date, resid)) +
geom_ref_line(h = 0) +
geom_line()
ggplot(daily, aes(date, resid, color = wday)) +
geom_ref_line(h = 0, colour = "red") +
geom_line()
daily %>%
filter(resid < -100)
## # A tibble: 11 x 4
## date n wday resid
## <date> <int> <ord> <dbl>
## 1 2013-01-01 842 Tue -109.
## 2 2013-01-20 786 Sun -105.
## 3 2013-05-26 729 Sun -162.
## 4 2013-07-04 737 Thu -229.
## 5 2013-07-05 822 Fri -145.
## 6 2013-09-01 718 Sun -173.
## 7 2013-11-28 634 Thu -332.
## 8 2013-11-29 661 Fri -306.
## 9 2013-12-24 761 Tue -190.
## 10 2013-12-25 719 Wed -244.
## 11 2013-12-31 776 Tue -175.
daily %>%
ggplot(aes(date, resid)) +
geom_ref_line(h = 0, colour = "red", size = 1) +
geom_line(color = "grey50") +
geom_smooth(se = FALSE, span = 0.20)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
daily %>%
filter(wday == "Sat") %>%
ggplot(aes(date, n)) +
geom_point()+
geom_line() +
scale_x_date(
NULL,
date_breaks = "1 month",
date_labels = "%b"
)
term <- function(date) {
cut(date,
breaks = ymd(20130101, 20130605, 20130825, 20140101),
labels = c("spring", "summer", "fall")
)
}
daily <- daily %>%
mutate(term = term(date))
daily %>%
filter(wday == "Sat") %>%
ggplot(aes(date, n, color = term)) +
geom_point(alpha = 1/3)+
geom_line() +
scale_x_date(
NULL,
date_breaks = "1 month",
date_labels = "%b"
)
daily %>%
ggplot(aes(wday, n, color = term)) +
geom_boxplot()
mod1 <- lm(n ~ wday, data = daily)
mod2 <- lm(n ~ wday * term, data = daily)
daily %>%
gather_residuals(without_term = mod1, with_term = mod2) %>%
ggplot(aes(date, resid, color = model)) +
geom_line(alpha = 0.75)
grid <- daily %>%
data_grid(wday, term) %>%
add_predictions(mod2, "n")
ggplot(daily, aes(wday, n)) +
geom_boxplot() +
geom_point(data = grid, color = "red") +
facet_wrap(~ term)
mod3 <- MASS::rlm(n ~ wday * term, data = daily)
daily %>%
add_residuals(mod3, "resid") %>%
ggplot(aes(date, resid)) +
geom_hline(yintercept = 0, size = 2, color = "red") +
geom_line()
# If you are creating variables it might be a good idea to bundle the creation of the variables up into a function
compute_vars <- function(data) {
data %>%
mutate(term = term(date),
wday = wday(date, label = TRUE)
)
}
# Another option would be to put the transformations directly in the model formula:
wday2 <- function(x) wday(x, label = TRUE)
mod3 <- lm(n ~ wday2(date) * term(date), data = daily)
# We could use a more flexible model to capture the pattern of school term in the data
library(splines)
mod <- MASS::rlm(n ~ wday * ns(date, 5), data = daily)
daily %>%
data_grid(wday, date = seq_range(date, n = 13)) %>%
add_predictions(mod) %>%
ggplot(aes(date, pred, color = wday)) +
geom_line() +
geom_point()
# We see a strong pattern in the numbers of Sat flights. This is reassuring, because we also saw that pattern in the raw data. It's a good sign when you get the same signal from different approaches.
Why are there fewer than expected flights on January 20, May 26 and September 1? (Hint: they all have the same explanation.) How would these days generalize into another year?
##Answer 1: These days in 2013 were all the day before important holidays. E.g. MKT day was 01/21/2013, memorial day was 05/27/2013, and labor day was 09/02/2013. Therefore, these days would be different in other years.
What do the three days with high positive residuals represent? How would these days generalize to another year?
##Answer 2: Positive residual indicates that the flight # is under-predicted. There are other factors that drives the flight number that are not captured by the moDEL. These three days are the days after thanksgiving and Christmas. These days will show similar results to other years.
daily %>%
top_n(3, resid)
## # A tibble: 3 x 5
## date n wday resid term
## <date> <int> <ord> <dbl> <fct>
## 1 2013-11-30 857 Sat 112. fall
## 2 2013-12-01 987 Sun 95.5 fall
## 3 2013-12-28 814 Sat 69.4 fall
Create a new variable that splits the “wday” variable into terms, but only for Saturdays, i.e., it should have Thurs, Fri, but Sat-summer, Sat-spring, Sat-fall. How does this model compare with the model with every combination of “wday” and “term”?
##Answer3: Based on the summary statistic of new model, it performs better fit than the model with every combination of “wday” and “term”.
data2 <-
daily %>%
mutate(wday = as.character(wday),
term = ifelse(wday == "Sat", paste0(wday, "-", term), wday))
new_model <- MASS::rlm(n ~ term, data = data2)
data2 %>%
add_residuals(new_model, "resid") %>%
ggplot(aes(date, resid)) +
geom_hline(yintercept = 0, size = 2, color = "blue") +
geom_line()
summary(mod3)
##
## Call:
## lm(formula = n ~ wday2(date) * term(date), data = daily)
##
## Residuals:
## Min 1Q Median 3Q Max
## -326.33 -8.00 9.82 20.67 141.00
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 912.8684 3.7086 246.151 < 2e-16 ***
## wday2(date).L -71.4674 9.8349 -7.267 2.49e-12 ***
## wday2(date).Q -161.0112 9.8196 -16.397 < 2e-16 ***
## wday2(date).C -65.6215 9.8068 -6.691 8.98e-11 ***
## wday2(date)^4 -82.0997 9.8412 -8.342 1.78e-15 ***
## wday2(date)^5 -1.3425 9.7787 -0.137 0.8909
## wday2(date)^6 -12.4501 9.7903 -1.272 0.2043
## term(date)summer 37.6922 6.3336 5.951 6.56e-09 ***
## term(date)fall 3.7598 5.5033 0.683 0.4949
## wday2(date).L:term(date)summer -6.2215 16.8049 -0.370 0.7114
## wday2(date).Q:term(date)summer 26.4121 16.7501 1.577 0.1158
## wday2(date).C:term(date)summer 26.7451 16.7885 1.593 0.1121
## wday2(date)^4:term(date)summer 23.5933 16.7523 1.408 0.1599
## wday2(date)^5:term(date)summer -13.1343 16.7720 -0.783 0.4341
## wday2(date)^6:term(date)summer -5.0026 16.6744 -0.300 0.7643
## wday2(date).L:term(date)fall -31.9048 14.5607 -2.191 0.0291 *
## wday2(date).Q:term(date)fall -0.6959 14.5707 -0.048 0.9619
## wday2(date).C:term(date)fall -8.8349 14.5417 -0.608 0.5439
## wday2(date)^4:term(date)fall -9.8570 14.5899 -0.676 0.4997
## wday2(date)^5:term(date)fall -3.2107 14.5228 -0.221 0.8252
## wday2(date)^6:term(date)fall -8.2061 14.5769 -0.563 0.5738
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 46.17 on 344 degrees of freedom
## Multiple R-squared: 0.7573, Adjusted R-squared: 0.7432
## F-statistic: 53.67 on 20 and 344 DF, p-value: < 2.2e-16
summary(new_model)
##
## Call: rlm(formula = n ~ term, data = data2)
## Residuals:
## Min 1Q Median 3Q Max
## -345.692 -14.487 2.403 13.840 158.717
##
## Coefficients:
## Value Std. Error t value
## (Intercept) 978.4105 3.2011 305.6436
## termMon 0.1863 4.5271 0.0412
## termSat-fall -280.1270 6.3127 -44.3748
## termSat-spring -233.2026 5.8710 -39.7213
## termSat-summer -177.4939 7.3927 -24.0092
## termSun -75.2507 4.5271 -16.6223
## termThu 1.2817 4.5271 0.2831
## termTue -14.9240 4.5057 -3.3123
## termWed -7.3207 4.5271 -1.6171
##
## Residual standard error: 21.18 on 356 degrees of freedom
anova(new_model)
## Analysis of Variance Table
##
## Response: n
## Df Sum Sq Mean Sq F value Pr(>F)
## term 8 2182237 272780
## Residuals 841150
Create a new “wday” variable that combines the day of week, term(for Saturdays), and public holidays. What do the residuals of the model look like?
##Answer: Based on the residuals of the model, it performs worse than question#3 model.
data3 <-
data2 %>%
mutate(holiday = case_when(date %in% ymd(c(20130101, 20130121, 20130218, 20130527, 20130704, 20130902, 20131028, 20131111, 20131128,20131225)) ~ "holiday",
TRUE ~ "None")) %>%
unite(combination, term, holiday)
new_model2 <- lm(n ~ combination, data = data3)
data3 %>%
add_residuals(new_model2, "resid") %>%
ggplot(aes(date, resid)) +
geom_hline(yintercept = 0, size = 2, color = "green") +
geom_line()
summary(new_model2)
##
## Call:
## lm(formula = n ~ combination, data = data3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -306.462 -10.462 8.529 20.538 141.000
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 967.46154 5.51738 175.348 < 2e-16 ***
## combinationMon_holiday -20.29487 17.15424 -1.183 0.23758
## combinationMon_None 10.95150 8.05318 1.360 0.17473
## combinationSat-fall_None -251.46154 10.88043 -23.111 < 2e-16 ***
## combinationSat-spring_None -230.14336 10.11900 -22.744 < 2e-16 ***
## combinationSat-summer_None -166.54487 12.74185 -13.071 < 2e-16 ***
## combinationSun_None -75.98077 7.80276 -9.738 < 2e-16 ***
## combinationThu_holiday -281.96154 28.66917 -9.835 < 2e-16 ***
## combinationThu_None 9.49846 7.88040 1.205 0.22889
## combinationTue_holiday -125.46154 40.16716 -3.123 0.00194 **
## combinationTue_None -14.00000 7.80276 -1.794 0.07363 .
## combinationWed_holiday -248.46154 40.16716 -6.186 1.72e-09 ***
## combinationWed_None 0.00905 7.84091 0.001 0.99908
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 39.79 on 352 degrees of freedom
## Multiple R-squared: 0.8155, Adjusted R-squared: 0.8093
## F-statistic: 129.7 on 12 and 352 DF, p-value: < 2.2e-16
What happens if you fit a day-of-week effect that varies by month (i.e.m n ~ wday*month)? Why is this not very helpful?
##Answer: The residual in this model is even worse than model in question#3. Becuase this interaction terms does not really have meanings.
model5 <- lm(n ~ wday * month(date), data = data3)
data3 %>%
add_residuals(model5, "resid") %>%
ggplot(aes(date, resid)) +
geom_hline(yintercept = 0, size = 2, color = "yellow") +
geom_line()
summary(model5)
##
## Call:
## lm(formula = n ~ wday * month(date), data = data3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -332.60 -10.93 11.56 25.59 111.56
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 958.7632 14.4978 66.132 < 2e-16 ***
## wdayMon -5.2579 20.5864 -0.255 0.799
## wdaySat -215.3627 20.5404 -10.485 < 2e-16 ***
## wdaySun -97.4345 20.4973 -4.754 2.92e-06 ***
## wdayThu 5.7758 20.3683 0.284 0.777
## wdayTue -17.0984 20.2302 -0.845 0.399
## wdayWed -6.7875 20.3583 -0.333 0.739
## month(date) 1.3343 1.9690 0.678 0.498
## wdayMon:month(date) 1.8859 2.7786 0.679 0.498
## wdaySat:month(date) -1.1485 2.7870 -0.412 0.681
## wdaySun:month(date) 3.2503 2.7715 1.173 0.242
## wdayThu:month(date) -1.1468 2.7734 -0.414 0.679
## wdayTue:month(date) 0.1506 2.7391 0.055 0.956
## wdayWed:month(date) 0.3292 2.7758 0.119 0.906
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 48.59 on 351 degrees of freedom
## Multiple R-squared: 0.7256, Adjusted R-squared: 0.7155
## F-statistic: 71.41 on 13 and 351 DF, p-value: < 2.2e-16
What would you expect the model n ~ wday + ns(date,5) to look like? Knowing what you know about the data, why would you expect it not to be particularly effective?
##Answer 6: Based on the daily-weekly pattern plot, we did not observe an obvious pattern among weekdays.
We hypothesized that people leaving on Sundays are more likely to be business travelers who need to be somewhere on Monday. Explore the hypothesis by seeing how if breaks down based on distance and time: if it’s true, you’d expect to see more Sunday evening flights to places that are far away.
##Answer: The hypothesis is true, since we observe more Sundays flights in the longer distance data than shorter distance data.
head(flights)
## # A tibble: 6 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## # … with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
summary(flights)
## year month day dep_time
## Min. :2013 Min. : 1.000 Min. : 1.00 Min. : 1
## 1st Qu.:2013 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.: 907
## Median :2013 Median : 7.000 Median :16.00 Median :1401
## Mean :2013 Mean : 6.549 Mean :15.71 Mean :1349
## 3rd Qu.:2013 3rd Qu.:10.000 3rd Qu.:23.00 3rd Qu.:1744
## Max. :2013 Max. :12.000 Max. :31.00 Max. :2400
## NA's :8255
## sched_dep_time dep_delay arr_time sched_arr_time
## Min. : 106 Min. : -43.00 Min. : 1 Min. : 1
## 1st Qu.: 906 1st Qu.: -5.00 1st Qu.:1104 1st Qu.:1124
## Median :1359 Median : -2.00 Median :1535 Median :1556
## Mean :1344 Mean : 12.64 Mean :1502 Mean :1536
## 3rd Qu.:1729 3rd Qu.: 11.00 3rd Qu.:1940 3rd Qu.:1945
## Max. :2359 Max. :1301.00 Max. :2400 Max. :2359
## NA's :8255 NA's :8713
## arr_delay carrier flight tailnum
## Min. : -86.000 Length:336776 Min. : 1 Length:336776
## 1st Qu.: -17.000 Class :character 1st Qu.: 553 Class :character
## Median : -5.000 Mode :character Median :1496 Mode :character
## Mean : 6.895 Mean :1972
## 3rd Qu.: 14.000 3rd Qu.:3465
## Max. :1272.000 Max. :8500
## NA's :9430
## origin dest air_time distance
## Length:336776 Length:336776 Min. : 20.0 Min. : 17
## Class :character Class :character 1st Qu.: 82.0 1st Qu.: 502
## Mode :character Mode :character Median :129.0 Median : 872
## Mean :150.7 Mean :1040
## 3rd Qu.:192.0 3rd Qu.:1389
## Max. :695.0 Max. :4983
## NA's :9430
## hour minute time_hour
## Min. : 1.00 Min. : 0.00 Min. :2013-01-01 05:00:00
## 1st Qu.: 9.00 1st Qu.: 8.00 1st Qu.:2013-04-04 13:00:00
## Median :13.00 Median :29.00 Median :2013-07-03 10:00:00
## Mean :13.18 Mean :26.23 Mean :2013-07-03 05:22:54
## 3rd Qu.:17.00 3rd Qu.:44.00 3rd Qu.:2013-10-01 07:00:00
## Max. :23.00 Max. :59.00 Max. :2013-12-31 23:00:00
##
long_ds <- flights[flights$distance >= 1500, ]
short_ds <- flights[flights$distance < 1500, ]
daily_l <- long_ds %>%
mutate(date = make_date(year, month, day)) %>%
group_by(date) %>%
summarize(n = n())
daily_l <- daily_l %>%
mutate(wday = wday(date, label = TRUE))
mod_l = lm(n ~ wday, data = daily_l)
grid <- daily_l %>%
data_grid(wday) %>%
add_predictions(mod_l, "n")
ggplot(daily_l, aes(wday, n)) +
geom_boxplot() +
geom_point(data = grid, color = "orange", size = 4)
daily_s <- short_ds %>%
mutate(date = make_date(year, month, day)) %>%
group_by(date) %>%
summarize(n = n())
daily_s <- daily_s %>%
mutate(wday = wday(date, label = TRUE))
mod_s = lm(n ~ wday, data = daily_s)
grid <- daily_s %>%
data_grid(wday) %>%
add_predictions(mod_s, "n")
ggplot(daily_s, aes(wday, n)) +
geom_boxplot() +
geom_point(data = grid, color = "orange", size = 4)