daily <- flights %>%
mutate(date = make_date(year, month, day)) %>%
group_by(date) %>%
summarize(n = n())
ggplot(daily, aes(date, n)) +
geom_line()
daily <- daily %>%
mutate(wday = wday(date, label = TRUE))
ggplot(daily, aes(wday,n)) +
geom_boxplot()
mod = lm(n ~ wday, data = daily, na.action = na.warn)
grid <- daily %>%
data_grid(wday) %>%
add_predictions(mod, "n")
ggplot(daily, aes(wday, n)) +
geom_boxplot() +
geom_point(data = grid, color = "orange", size = 4)
daily <- daily %>%
add_residuals(mod)
daily %>%
ggplot(aes(date, resid)) +
geom_ref_line(h = 0) +
geom_line()
ggplot(daily, aes(date, resid, color = wday)) +
geom_ref_line(h = 0, colour = "red") +
geom_line()
daily %>%
filter(resid < -100)
## # A tibble: 11 x 4
## date n wday resid
## <date> <int> <ord> <dbl>
## 1 2013-01-01 842 Tue -109.
## 2 2013-01-20 786 Sun -105.
## 3 2013-05-26 729 Sun -162.
## 4 2013-07-04 737 Thu -229.
## 5 2013-07-05 822 Fri -145.
## 6 2013-09-01 718 Sun -173.
## 7 2013-11-28 634 Thu -332.
## 8 2013-11-29 661 Fri -306.
## 9 2013-12-24 761 Tue -190.
## 10 2013-12-25 719 Wed -244.
## 11 2013-12-31 776 Tue -175.
daily %>%
ggplot(aes(date, resid)) +
geom_ref_line(h = 0, colour = "red", size = 1) +
geom_line(color = "grey50") +
geom_smooth(se = FALSE, span = 0.20)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
daily %>%
filter(wday == "Sat") %>%
ggplot(aes(date, n)) +
geom_point()+
geom_line() +
scale_x_date(
NULL,
date_breaks = "1 month",
date_labels = "%b"
)
term <- function(date) {
cut(date,
breaks = ymd(20130101, 20130605, 20130825, 20140101),
labels = c("spring", "summer", "fall")
)
}
daily <- daily %>%
mutate(term = term(date))
daily %>%
filter(wday == "Sat") %>%
ggplot(aes(date, n, color = term)) +
geom_point(alpha = 1/3)+
geom_line() +
scale_x_date(
NULL,
date_breaks = "1 month",
date_labels = "%b"
)
daily %>%
ggplot(aes(wday, n, color = term)) +
geom_boxplot()
mod1 <- lm(n ~ wday, data = daily, na.action = na.warn)
mod2 <- lm(n ~ wday * term, data = daily, na.action = na.warn)
daily %>%
gather_residuals(without_term = mod1, with_term = mod2) %>%
ggplot(aes(date, resid, color = model)) +
geom_line(alpha = 0.75)
grid <- daily %>%
data_grid(wday, term) %>%
add_predictions(mod2, "n")
ggplot(daily, aes(wday, n)) +
geom_boxplot() +
geom_point(data = grid, color = "red") +
facet_wrap(~ term)
mod3 <- MASS::rlm(n ~ wday * term, data = daily, na.action = na.warn)
daily %>%
add_residuals(mod3, "resid") %>%
ggplot(aes(date, resid)) +
geom_hline(yintercept = 0, size = 2, color = "red") +
geom_line()
# If you are creating variables it might be a good idea to bundle the creation of the variables up into a function
compute_vars <- function(data) {
data %>%
mutate(term = term(date),
wday = wday(date, label = TRUE)
)
}
# Another option would be to put the transformations directly in the model formula:
wday2 <- function(x) wday(x, label = TRUE)
mod3 <- lm(n ~ wday2(date) * term(date), data = daily, na.action = na.warn)
# We could use a more flexible model to capture the pattern of school term in the data
library(splines)
mod <- MASS::rlm(n ~ wday * ns(date, 5), data = daily, na.action = na.warn)
daily %>%
data_grid(wday, date = seq_range(date, n = 13)) %>%
add_predictions(mod) %>%
ggplot(aes(date, pred, color = wday)) +
geom_line() +
geom_point()
# We see a strong pattern in the numbers of Sat flights. This is reassuring, because we also saw that pattern in the raw data. It's a good sign when you get the same signal from different approaches.
Why are there fewer than expected flights on January 20, May 26 and September 1? (Hint: they all have the same explanation.) How would these days generalize into another year?
daily$percentile <- ecdf(daily$n)(daily$n)
daily$percentile[daily$date == "2013-01-20" | daily$date == "2013-05-26" |daily$date == "2013-09-01"]
## [1] 0.13424658 0.07123288 0.06027397
## These are days before MLK, Memorial Day, Labor Day, people don't generally fly on a sunday whenever there is a long weekend, hence fewere flights
What do the three days with high positive residuals represent? How would these days generalize to another year?
daily %>%
top_n(3, resid)
## # A tibble: 3 x 6
## date n wday resid term percentile
## <date> <int> <ord> <dbl> <fct> <dbl>
## 1 2013-11-30 857 Sat 112. fall 0.189
## 2 2013-12-01 987 Sun 95.5 fall 0.778
## 3 2013-12-28 814 Sat 69.4 fall 0.167
## These days generally fall during thanksgiving and new year celebration days,which is why there is high flight traffic.
Create a new variable that splits the “wday” variable into terms, but only for Saturdays, i.e., it should have Thurs, Fri, but Sat-summer, Sat-spring, Sat-fall. How does this model compare with the model with every combination of “wday” and “term”?
daily<-daily%>%
mutate(wday_term_Sat=ifelse(wday=="Sat", paste("Sat",term(date), sep = "-"), as.character(wday)))
daily$wday_term_Sat<-as.factor(daily$wday_term_Sat)
model1 <- MASS::rlm(n ~ wday * term, data = daily)
model2 <- MASS::rlm(n ~ wday_term_Sat, data = daily)
summary(model1)
##
## Call: rlm(formula = n ~ wday * term, data = daily)
## Residuals:
## Min 1Q Median 3Q Max
## -347.537 -12.899 3.915 11.500 160.101
##
## Coefficients:
## Value Std. Error t value
## (Intercept) 922.4857 1.5914 579.6839
## wday.L -79.3059 4.2202 -18.7920
## wday.Q -153.7811 4.2136 -36.4961
## wday.C -67.7750 4.2081 -16.1056
## wday^4 -74.9295 4.2229 -17.7437
## wday^5 -6.4803 4.1961 -1.5444
## wday^6 -9.7258 4.2011 -2.3151
## termsummer 32.9797 2.7178 12.1349
## termfall 1.9847 2.3615 0.8404
## wday.L:termsummer 9.5346 7.2110 1.3222
## wday.Q:termsummer 12.3445 7.1875 1.7175
## wday.C:termsummer 17.0750 7.2040 2.3702
## wday^4:termsummer 11.0813 7.1885 1.5415
## wday^5:termsummer -3.5357 7.1969 -0.4913
## wday^6:termsummer 0.3378 7.1550 0.0472
## wday.L:termfall -31.7325 6.2480 -5.0788
## wday.Q:termfall -33.1644 6.2524 -5.3043
## wday.C:termfall -23.8271 6.2399 -3.8185
## wday^4:termfall -22.5288 6.2606 -3.5985
## wday^5:termfall -4.9157 6.2318 -0.7888
## wday^6:termfall -3.3798 6.2550 -0.5403
##
## Residual standard error: 17.58 on 344 degrees of freedom
summary(model2)
##
## Call: rlm(formula = n ~ wday_term_Sat, data = daily)
## Residuals:
## Min 1Q Median 3Q Max
## -345.692 -14.487 2.403 13.840 158.717
##
## Coefficients:
## Value Std. Error t value
## (Intercept) 978.4105 3.2011 305.6436
## wday_term_SatMon 0.1863 4.5271 0.0412
## wday_term_SatSat-fall -280.1270 6.3127 -44.3748
## wday_term_SatSat-spring -233.2026 5.8710 -39.7213
## wday_term_SatSat-summer -177.4939 7.3927 -24.0092
## wday_term_SatSun -75.2507 4.5271 -16.6223
## wday_term_SatThu 1.2817 4.5271 0.2831
## wday_term_SatTue -14.9240 4.5057 -3.3123
## wday_term_SatWed -7.3207 4.5271 -1.6171
##
## Residual standard error: 21.18 on 356 degrees of freedom
p1<-daily %>%
add_residuals(model1, "residual1") %>%
ggplot(aes(date, residual1)) +
geom_hline(yintercept = 0, size = 1, color = "navy") +
geom_line()
p2<-daily %>%
add_residuals(model2, "residual2") %>%
ggplot(aes(date, residual2)) +
geom_hline(yintercept = 0, size = 1, color = "navy") +
geom_line()
grid.arrange(p1, p2)
## Model with wday_term_sat is better at prediction based on comparing mean residual value for each model
Create a new “wday” variable that combines the day of week, term(for Saturdays), and public holidays. What do the residuals of the model look like?
US_holiday<-c(date("2013-01-01"),date("2013-01-21"),date("2013-02-18"),date("2013-05-27"),
date("2013-07-04"),date("2013-09-02"),date("2013-10-11"),date("2013-11-11"),date("2013-11-28"),
date("2013-12-25") )
daily<-daily%>%
mutate(new_wday=ifelse(wday=="Sat", paste("Sat",term(date), sep = "-"),
ifelse(date %in% US_holiday, "holiday" , as.character(wday))))
daily$new_wday<-as.factor(daily$new_wday)
model_new <- MASS::rlm(n ~ new_wday, data = daily)
summary(model_new)
##
## Call: rlm(formula = n ~ new_wday, data = daily)
## Residuals:
## Min 1Q Median 3Q Max
## -317.410 -13.499 1.501 13.501 159.165
##
## Coefficients:
## Value Std. Error t value
## (Intercept) 978.4100 3.1205 313.5452
## new_wdayholiday -64.3951 7.7070 -8.3554
## new_wdayMon 3.0893 4.5059 0.6856
## new_wdaySat-fall -280.5753 6.1095 -45.9241
## new_wdaySat-spring -232.7398 5.6842 -40.9449
## new_wdaySat-summer -177.4933 7.1499 -24.8246
## new_wdaySun -74.9600 4.3918 -17.0684
## new_wdayThu 2.9383 4.4350 0.6625
## new_wdayTue -14.1105 4.3918 -3.2130
## new_wdayWed -6.5395 4.4130 -1.4819
##
## Residual standard error: 20.02 on 355 degrees of freedom
daily %>%
add_residuals(model_new, "residual_new") %>%
ggplot(aes(date, residual_new)) +
geom_hline(yintercept = 0, size = 1, color = "blue") +
geom_line()
## The model with new wday variable that combines the day of week, term and public holidays has lesser value of mean residual
What happens if you fit a day-of-week effect that varies by month (i.e.m n ~ wday*month)? Why is this not very helpful?
daily<-daily%>%
mutate(month= as.factor(month(date)))
model_month <- MASS::rlm(n ~ wday*month, data = daily)
summary(model_month)
##
## Call: rlm(formula = n ~ wday * month, data = daily)
## Residuals:
## Min 1Q Median 3Q Max
## -355.5000 -4.0000 0.4941 4.0000 138.5118
##
## Coefficients:
## Value Std. Error t value
## (Intercept) 868.1936 1.6650 521.4327
## wday.L -73.7893 4.5741 -16.1321
## wday.Q -164.7242 4.4167 -37.2960
## wday.C -70.2867 4.4509 -15.7917
## wday^4 -94.8767 4.4920 -21.1214
## wday^5 4.8125 4.3242 1.1129
## wday^6 -11.8265 4.1618 -2.8417
## month2 22.8336 2.4092 9.4775
## month3 67.3938 2.3547 28.6211
## month4 73.9634 2.3730 31.1686
## month5 62.6409 2.3547 26.6026
## month6 81.7330 2.3730 34.4427
## month7 93.6207 2.3547 39.7593
## month8 84.9062 2.3547 36.0584
## month9 58.2325 2.3730 24.5395
## month10 61.1632 2.3547 25.9751
## month11 63.5060 2.3730 26.7618
## month12 52.2364 2.3547 22.1840
## wday.L:month2 10.1749 6.4921 1.5673
## wday.Q:month2 -9.0842 6.3822 -1.4234
## wday.C:month2 1.3437 6.4059 0.2098
## wday^4:month2 8.7889 6.4345 1.3659
## wday^5:month2 2.4525 6.3185 0.3882
## wday^6:month2 4.9467 6.2085 0.7968
## wday.L:month3 -4.2993 6.2299 -0.6901
## wday.Q:month3 14.7026 6.1811 2.3787
## wday.C:month3 9.3917 6.2380 1.5056
## wday^4:month3 24.5305 6.2894 3.9003
## wday^5:month3 -11.1877 6.2461 -1.7911
## wday^6:month3 2.4036 6.1944 0.3880
## wday.L:month4 -6.8702 6.4334 -1.0679
## wday.Q:month4 9.6813 6.3465 1.5255
## wday.C:month4 1.9424 6.2945 0.3086
## wday^4:month4 26.3942 6.3265 4.1720
## wday^5:month4 -12.7402 6.1524 -2.0708
## wday^6:month4 -3.8213 6.1112 -0.6253
## wday.L:month5 -12.7427 6.4334 -1.9807
## wday.Q:month5 2.8971 6.2824 0.4611
## wday.C:month5 -6.7737 6.2945 -1.0761
## wday^4:month5 12.9616 6.2476 2.0747
## wday^5:month5 -9.9501 6.1524 -1.6173
## wday^6:month5 1.8807 5.9589 0.3156
## wday.L:month6 10.9685 6.2784 1.7470
## wday.Q:month6 26.4551 6.1811 4.2800
## wday.C:month6 19.2234 6.2945 3.0540
## wday^4:month6 28.9493 6.3958 4.5263
## wday^5:month6 -12.7016 6.3105 -2.0128
## wday^6:month6 3.3101 6.2078 0.5332
## wday.L:month7 5.3788 6.4334 0.8361
## wday.Q:month7 24.1848 6.2824 3.8496
## wday.C:month7 18.7778 6.2945 2.9832
## wday^4:month7 33.9526 6.2476 5.4345
## wday^5:month7 -14.5411 6.1524 -2.3635
## wday^6:month7 2.9936 5.9589 0.5024
## wday.L:month8 -5.4223 6.3265 -0.8571
## wday.Q:month8 10.0067 6.2461 1.6021
## wday.C:month8 11.3565 6.2380 1.8205
## wday^4:month8 23.8261 6.3069 3.7778
## wday^5:month8 -15.8985 6.1483 -2.5859
## wday^6:month8 1.0814 6.1108 0.1770
## wday.L:month9 -42.2908 6.3385 -6.6721
## wday.Q:month9 -24.7745 6.2824 -3.9435
## wday.C:month9 -31.9154 6.2945 -5.0704
## wday^4:month9 -17.3602 6.3091 -2.7516
## wday^5:month9 -12.5103 6.2502 -2.0016
## wday^6:month9 3.0670 6.1948 0.4951
## wday.L:month10 -49.1848 6.4687 -7.6035
## wday.Q:month10 -30.9858 6.2461 -4.9608
## wday.C:month10 -35.6117 6.2945 -5.6576
## wday^4:month10 -8.3348 6.3526 -1.3120
## wday^5:month10 -15.9756 6.1153 -2.6124
## wday^6:month10 -2.3547 5.8856 -0.4001
## wday.L:month11 -22.1747 6.3385 -3.4984
## wday.Q:month11 -23.2437 6.2824 -3.6998
## wday.C:month11 -11.7420 6.2945 -1.8654
## wday^4:month11 8.1113 6.3091 1.2857
## wday^5:month11 -18.4355 6.2502 -2.9496
## wday^6:month11 2.3924 6.1948 0.3862
## wday.L:month12 1.8501 6.3265 0.2924
## wday.Q:month12 18.0719 6.2461 2.8933
## wday.C:month12 4.2873 6.2380 0.6873
## wday^4:month12 11.5970 6.3069 1.8388
## wday^5:month12 -5.7612 6.1483 -0.9370
## wday^6:month12 -2.1140 6.1108 -0.3459
##
## Residual standard error: 5.93 on 281 degrees of freedom
daily %>%
gather_residuals(model_month, model_new) %>%
ggplot(aes(date, resid, color=model)) +
geom_hline(yintercept = 0, size = 1, color = "blue") +
geom_line()
##The new model is increasing the odds of having outliers, hence the model is not very helpful
What would you expect the model n ~ wday + ns(date,5) to look like? Knowing what you know about the data, why would you expect it not to be particularly effective?
model_spline <- MASS::rlm(n ~ wday + ns(date, 5), data = daily)
daily %>%
gather_residuals(model_spline, model_new) %>%
ggplot(aes(date, resid, color=model)) +
geom_hline(yintercept = 0, size = 1, color = "blue") +
geom_line()
daily %>%
data_grid(wday, date = seq_range(date, n = 13)) %>%
add_predictions(model_spline) %>%
ggplot(aes(date, pred, color = wday)) +
geom_line() +
geom_point()
## Model is overfit. It will work well only with the data on hand, it will not work well with other years. The cyclicity with weekday is not correct measure to build the model
We hypothesized that people leaving on Sundays are more likely to be business travelers who need to be somewhere on Monday. Explore the hypothesis by seeing how if breaks down based on distance and time: if it’s true, you’d expect to see more Sunday evening flights to places that are far away.
flights_revised<-flights %>%
mutate(date = make_date(year, month, day))%>%
mutate(wday = wday(date, label = TRUE))%>%
mutate(wday_revised=ifelse((wday=="Sun" & dep_time>1600), "Sun-Evening", ifelse(wday=="Sun" & dep_time<=1600, "Sun-DayTime", as.character(wday))))%>%
mutate(wday_revised=as.factor(wday_revised))%>%
group_by(wday_revised) %>%
summarise(average_dist = mean(distance),
median_dist = median(distance))
## Warning: Factor `wday_revised` contains implicit NA, consider using
## `forcats::fct_explicit_na`
flights_revised<-flights_revised[1:8,]
flights_revised$wday_revised <- factor(flights_revised$wday_revised , levels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun-DayTime", "Sun-Evening"))
p5<-ggplot(data= flights_revised, aes(y = average_dist, x = wday_revised)) +
geom_point(color="red")+labs(title = "Flights 'Average Distance' on Over Weekdays")
p6<-ggplot(data= flights_revised, aes(y = median_dist, x = wday_revised)) +
geom_point(color="red")+labs(title = " Flight 'Median Distance' on Over Weekdays")
grid.arrange(p5, p6)
## Sunday evening flights have higher mean and median values than sunday DayTime flgihts, this supports the hypothesis