============================================================================================================
Reference: Wickham, H., & Grolemund, G. (2016). R for data science: import, tidy, transform, visualize, and model data. “O’Reilly Media, Inc.”. 384-395.
Data dictionary: https://cran.r-project.org/web/packages/nycflights13/nycflights13.pdf
daily <- flights %>%
mutate(date = make_date(year, month, day)) %>%
group_by(date) %>%
summarize(n = n()) #aggregate the time series
ggplot(daily, aes(date, n)) + geom_line() + labs(x="Date", y="Number of flights") + ggtitle("Line chart 1")
daily <- daily %>%
mutate(wday = wday(date, label=T))
ggplot(daily, aes(wday,n)) + geom_boxplot() + labs(x="Week", y="Number of flights") + ggtitle("Boxplot 1")
mod <- lm(n ~ wday, data = daily)
grid <- daily %>%
data_grid(wday) %>%
add_predictions(mod, "n")
ggplot(daily, aes(wday, n)) +
geom_boxplot() +
geom_point(data = grid, color = "orange", size = 4) +
labs(x="Week", y="Number of flights") + ggtitle("Boxplot 2")
daily <- daily %>%
add_residuals(mod)
# Beginner notice: use "colour" instead of "color" in "geom_ref_line()"
daily %>%
ggplot(aes(date, resid)) +
geom_ref_line(h = 0, colour = "red", size = 1) +
geom_line() +
labs(x="Date", y="Residuals") + ggtitle("Line chart 2")
ggplot(daily, aes(date, resid, color = wday)) +
geom_ref_line(h = 0, colour = "red", size = 1) +
geom_line() +
labs(x="Week", y="Residuals") + ggtitle("Line chart 3")
# Beginner notice: resid<-100 can cause bug; use spaces properly
daily %>%
filter(resid < -100) %>% kable() %>% kable_styling()
| date | n | wday | resid |
|---|---|---|---|
| 2013-01-01 | 842 | Tue | -109.3585 |
| 2013-01-20 | 786 | Sun | -105.4808 |
| 2013-05-26 | 729 | Sun | -162.4808 |
| 2013-07-04 | 737 | Thu | -228.7500 |
| 2013-07-05 | 822 | Fri | -145.4615 |
| 2013-09-01 | 718 | Sun | -173.4808 |
| 2013-11-28 | 634 | Thu | -331.7500 |
| 2013-11-29 | 661 | Fri | -306.4615 |
| 2013-12-24 | 761 | Tue | -190.3585 |
| 2013-12-25 | 719 | Wed | -243.6923 |
| 2013-12-31 | 776 | Tue | -175.3585 |
daily %>%
ggplot(aes(date, resid)) +
geom_ref_line(h = 0, colour = "red", size = 1) +
geom_line(color = "grey50") +
geom_smooth(method = "loess", se = F, span = 0.20) +
labs(x="Date", y="Residuals") + ggtitle("Line chart 4")
daily %>%
filter(wday == "Sat") %>%
ggplot(aes(date, n)) +
geom_point()+
geom_line() +
scale_x_date(
NULL,
date_breaks = "1 month",
date_labels = "%b"
) + labs(x="Date", y="Number of flights") + ggtitle("Line chart 5")
term <- function(date) {
cut(date,
breaks = ymd(20130101, 20130605, 20130825, 20140101),
labels = c("spring", "summer", "fall")
)
}
daily <- daily %>%
mutate(term = term(date))
daily %>%
filter(wday == "Sat") %>%
ggplot(aes(date, n, color = term)) +
geom_point(alpha = 1/3) +
geom_line() +
scale_x_date(
NULL,
date_breaks = "1 month",
date_labels = "%b"
) + labs(x="Date", y="Number of flights") + ggtitle("Line chart 6")
daily %>%
ggplot(aes(wday, n, color = term)) +
geom_boxplot() + labs(x="Week", y="Number of flights") + ggtitle("Boxplot 3")
mod2 <- lm(n ~ wday * term, data = daily)
daily %>%
gather_residuals(without_term = mod, with_term = mod2) %>%
ggplot(aes(date, resid, color = model)) +
geom_line(alpha = 0.75) +
labs(x="Date", y="Residuals") + ggtitle("Line chart 7")
grid2 <- daily %>%
data_grid(wday, term) %>%
add_predictions(mod2, "n")
ggplot(daily, aes(wday, n)) +
geom_boxplot() +
geom_point(data = grid2, color = "red") +
facet_wrap(~ term) +
labs(x="Week", y="Number of flights") + ggtitle("Boxplot 4")
mod3 <- MASS::rlm(n ~ wday * term, data = daily)
daily %>%
add_residuals(mod3, "resid3") %>%
ggplot(aes(date, resid3)) +
geom_hline(yintercept = 0, size = 1, color = "red") +
geom_line() + labs(x="Date", y="Residuals") + ggtitle("Line chart 8")
# If you are creating variables it might be a good idea to bundle the creation of the variables up into a function
compute_vars <- function(data) {
data %>%
mutate(term = term(date),
wday = wday(date, label=T)
)
}
# Another option would be to put the transformations directly in the model formula:
wday2 <- function(x) {
wday(x, label=T)
}
mod3r <- lm(n ~ wday2(date) * term(date), data = daily)
# We could use a more flexible model to capture the pattern of school term in the data
mod4 <- MASS::rlm(n ~ wday * splines::ns(date, 5), data = daily)
daily %>%
data_grid(wday, date = seq_range(date, n = 13)) %>%
add_predictions(mod4) %>%
ggplot(aes(date, pred, color = wday)) +
geom_line() +
geom_point() +
labs(x="Date", y="Prediction") + ggtitle("Line chart 9")
# We see a strong pattern in the numbers of Saturday flights. This is reassuring, because we also saw that pattern in the raw data. It is a good sign when you get the same signal from different approaches.
Why are there fewer than expected flights on January 20, May 26 and September 1? (Hint: they all have the same explanation.) How would these days generalize into another year?
datQ1 <- subset(daily, date %in% as.Date(c("2013-01-20","2013-05-26","2013-09-01")))
datQ1$holiday <- c("Martin Luther King Jr. Day", "Memorial Day", "Labor Day")
#datQ1 %>% kable() %>% kable_styling()
# Or follow the Lecture's scripting style
daily %>%
filter(date %in% as.Date(c("2013-01-20","2013-05-26","2013-09-01"))) %>%
mutate(holiday = c("Martin Luther King Jr. Day", "Memorial Day", "Labor Day")) %>%
kable() %>% kable_styling()
| date | n | wday | resid | term | holiday |
|---|---|---|---|---|---|
| 2013-01-20 | 786 | Sun | -105.4808 | spring | Martin Luther King Jr. Day |
| 2013-05-26 | 729 | Sun | -162.4808 | spring | Memorial Day |
| 2013-09-01 | 718 | Sun | -173.4808 | fall | Labor Day |
ANSWER: These are the Sundays before Martin Luther King Jr. Day (the third Monday of January since 1986), Memorial Day (the last Monday of May since 1868) and Labor Day (the first Monday of September since 1882). The corresponding Sundays in 2014 will be January 19, May 25 and August 31.
What do the three days with high positive residuals represent? How would these days generalize to another year?
daily %>%
top_n(3, resid) %>%
mutate(holiday = c("Thanksgiving (United States)", "Thanksgiving (United States)", "Hanukkah, Christmas, New Year")) %>%
kable() %>% kable_styling()
| date | n | wday | resid | term | holiday |
|---|---|---|---|---|---|
| 2013-11-30 | 857 | Sat | 112.38462 | fall | Thanksgiving (United States) |
| 2013-12-01 | 987 | Sun | 95.51923 | fall | Thanksgiving (United States) |
| 2013-12-28 | 814 | Sat | 69.38462 | fall | Hanukkah, Christmas, New Year |
ANSWER: These are the Saturdays close to Thanksgiving Day in the United States (the fourth Thursday of November since 1621), Christmas Day (December 25 since 336) or Hanukkah (the eight nights and days starting on the 25th day of Kislev according to the Hebrew calendar since 200 B.C.), and New Year’s Day (January 1 since 45 B.C.). Traveling around popular holidays is in high demand. 2014 would witness the same case.
Create a new variable that splits the “wday” variable into terms, but only for Saturdays, i.e., it should have Thurs, Fri, but Sat-summer, Sat-spring, Sat-fall. How does this model compare with the model with every combination of “wday” and “term”?
daily <- daily %>%
mutate(wday2 = case_when(wday == "Sat" & term == "spring" ~ "Sat-spring",
wday == "Sat" & term == "summer" ~ "Sat-summer",
wday == "Sat" & term == "fall" ~ "Sat-fall",
T ~ as.character(wday)))
modQ3 <- lm(n ~wday2, data = daily)
daily %>%
spread_residuals(sat_term = modQ3, all_interact = mod2) %>%
mutate(residdiff = sat_term - all_interact) %>%
ggplot(aes(date, residdiff)) +
geom_ref_line(h = 0, colour = "red") +
geom_line(alpha = 0.75) +
labs(x="Date", y="Residuals difference") + ggtitle("Line chart 10")
options(scipen=100,digits=4)
rbind(broom::glance(modQ3), broom::glance(mod2)) %>%
mutate(model = c("with Saturday and term", "with all interactions")) %>%
kable() %>% kable_styling()
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | model |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.7357 | 0.7297 | 47.36 | 123.84 | 0 | 9 | -1921 | 3863 | 3902 | 798487 | 356 | with Saturday and term |
| 0.7573 | 0.7432 | 46.17 | 53.67 | 0 | 21 | -1906 | 3856 | 3942 | 733157 | 344 | with all interactions |
ANSWER: Line chart 10 shows that the model with Saturday and term has lower residuals in Spring and higher residuals in Summer than the model with all interactions.
Table shows that the model with Saturday and term has the lower adjusted R-squared and the higher standard error of the regression but the higher AIC than the model with all interactions.
Create a new “wday” variable that combines the day of week, term(for Saturdays), and public holidays. What do the residuals of the model look like?
holidays <- ymd(20130101, 20130120, 20130526, 20130704, 20130705, 20130901, 20131128, 20131129, 20131130, 20131201, 20131224, 20131225, 20131231)
# Federal holidays in the United States in 2013 are available at https://www.officeholidays.com/countries/usa/2013.php.
daily <- daily %>%
mutate(wday3 = case_when(date %in% holidays ~ "holidays",
T ~ as.character(wday2)))
modQ4 <- lm(n ~ wday3, data = daily)
daily <- daily %>%
add_residuals(modQ4, "residQ4")
ggplot(daily, aes(date, residQ4)) +
geom_ref_line(h = 0, colour = "red", size = 1) +
geom_line() +
labs(x="Date", y="Residuals") + ggtitle("Line chart 11")
ANSWER: Compared Line chart 11 to Line chart 2, the absolute values of the extreme residuals slightly narrow down. However, the line shape remains relatively the same. The model with Saturday, term and holidays still predict more flights in January and February, and less flights in July. And the number of flights fluctuates during Thanksgiving and Christmas holidays.
It is noticed that not all federal holidays impact traveling, e.g., Presidents’ Day (the third Monday of February since 1885), Emancipation Day (the weekday closest to April 16 since 1866), Columbus Day (the second Monday of October since 1869) and Veterans Day (November 11 since 1938). The Lunar New Year’s Day in 2013 was on February 10.
What happens if you fit a day-of-week effect that varies by month (i.e. n ~ wday * month)? Why is this not very helpful?
dummy <- daily %>%
mutate(month = month(date, label=T))
modQ5 <- lm(n ~ wday * month, data = dummy)
dummy %>%
add_residuals(modQ5, "residQ5") %>%
ggplot(aes(date, residQ5)) +
geom_ref_line(h = 0, colour = "red", size = 1) +
geom_line() +
labs(x="Date", y="Residuals") + ggtitle("Line chart 12")
options(scipen=100,digits=4)
broom::glance(modQ5) %>% kable() %>% kable_styling()
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual |
|---|---|---|---|---|---|---|---|---|---|---|
| 0.8355 | 0.7869 | 42.05 | 17.2 | 0 | 84 | -1835 | 3840 | 4171 | 496839 | 281 |
ANSWER: The sample size for each day of the week in a month is either four or five, which may not be enough to build a robust regression. Line chart 12 shows severe fluctuation in July, November and December, but smoothness in January.
Table shows that the model with days of the week and month has the higher adjusted R-squared and the lower standard error of the regression but the lower AIC than the model with all interactions.
What would you expect the model n ~ wday + ns(date,5) to look like? Knowing what you know about the data, why would you expect it not to be particularly effective?
modQ6 <- lm(n ~ wday + splines::ns(date, df = 5), data = daily)
daily %>%
add_residuals(modQ6, "residQ6") %>%
ggplot(aes(date, residQ6)) +
geom_ref_line(h = 0, colour = "red", size = 1) +
geom_line() +
labs(x="Date", y="Residuals") + ggtitle("Line chart 13")
options(scipen=100,digits=4)
broom::glance(modQ6) %>% kable() %>% kable_styling()
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual |
|---|---|---|---|---|---|---|---|---|---|---|
| 0.7816 | 0.7748 | 43.23 | 114.9 | 0 | 12 | -1887 | 3799 | 3850 | 659694 | 353 |
ANSWER: In the second part of EDA, boxplot 1 shows that weekdays (through Monday to Friday) have similar flights distribution. In the third part of EDA, line chart 3 shows that not a weekday trend across months but a holidays trend on the dataset particularly exists.
We hypothesized that people leaving on Sundays are more likely to be business travelers who need to be somewhere on Monday. Explore the hypothesis by seeing how if breaks down based on distance and time: if it’s true, you’d expect to see more Sunday evening flights to places that are far away.
# Beginner notice: the original R chunk header is "Question #7", however, if "ggplot" is called, there will be "pandoc" bug as shown below when knitting.
# pandoc: ~/NYCFlightsDataModel_files/figure-html/Question : openBinaryFile: does not exist (No such file or directory)
# Error: pandoc document conversion failed with error 1
# Execution halted
# Solution: delete the hashtag symbol (#) in the R chunk header
datQ7 <- flights %>%
mutate(date = make_date(year, month, day),
wday = wday(date, label = T)) %>%
select(date, wday, distance, air_time, hour) %>%
filter(hour != 1) #omit the single one record of 1 AM flight
datQ7 %>%
ggplot(aes(wday, distance)) + geom_boxplot() +
labs(x="Week", y="Traveling distance (miles)") + ggtitle("Boxplot 5")
datQ7 %>%
group_by(wday) %>%
summarise("Mean distance (miles)" = round(mean(distance)),
"Median distance (miles)" = median(distance),
"Mean duration (minutes)" = round(mean(air_time, na.rm = T)),
"Median duration (minutes)" = median(air_time, na.rm = T)) %>%
rename(Week = wday) %>%
kable() %>% kable_styling()
| Week | Mean distance (miles) | Median distance (miles) | Mean duration (minutes) | Median duration (minutes) |
|---|---|---|---|---|
| Sun | 1055 | 937 | 152 | 130 |
| Mon | 1032 | 828 | 151 | 129 |
| Tue | 1027 | 820 | 150 | 128 |
| Wed | 1028 | 820 | 150 | 128 |
| Thu | 1033 | 828 | 150 | 128 |
| Fri | 1033 | 828 | 150 | 127 |
| Sat | 1081 | 944 | 154 | 134 |
# Visualized by "plotly" for interaction on the chart
htmltools::div(
datQ7 %>%
#show legend starting with Sunday
mutate(wday = factor(wday, levels = rev(levels(wday)))) %>%
group_by(wday, hour) %>%
summarise(distMedian = median(distance)) %>%
plot_ly(y = ~distMedian, x = ~hour, color = ~wday) %>%
add_markers(showlegend = F) %>% add_lines() %>%
layout(xaxis = list(title = "Hour", gridcolor = "#ffffff", zeroline = F),
yaxis = list(title = "Median traveling distance (miles)", gridcolor = "#ffffff", zeroline = F),
title = "Line chart 14", plot_bgcolor = "#ebebeb"),
align = "center")
datQ7 %>%
group_by(wday, hour) %>%
summarise(distMedian = median(distance)) %>%
reshape2::acast(wday~hour, value.var="distMedian") %>%
kable() %>% kable_styling()
| 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Sun | 1400 | 963 | 1041 | 1005 | 950 | 1017.0 | 820 | 746 | 762 | 762 | 765 | 764 | 1008 | 944 | 944 | 762 | 719 | 266 | 1598 |
| Mon | 1400 | 762 | 1035 | 872 | 944 | 872.0 | 888 | 746 | 762 | 762 | 765 | 762 | 950 | 997 | 937 | 762 | 719 | 266 | 1598 |
| Tue | 1400 | 760 | 1035 | 872 | 944 | 872.0 | 888 | 746 | 762 | 762 | 765 | 762 | 950 | 944 | 944 | 740 | 642 | 266 | 1598 |
| Wed | 1400 | 760 | 1035 | 833 | 937 | 872.0 | 888 | 746 | 762 | 762 | 764 | 762 | 1008 | 937 | 937 | 753 | 642 | 266 | 1598 |
| Thu | 1400 | 760 | 1035 | 872 | 944 | 872.0 | 888 | 746 | 762 | 762 | 765 | 764 | 1008 | 944 | 944 | 762 | 719 | 266 | 1598 |
| Fri | 1400 | 760 | 1035 | 872 | 944 | 809.5 | 888 | 746 | 762 | 762 | 765 | 764 | 950 | 944 | 937 | 762 | 733 | 266 | 1598 |
| Sat | 1400 | 937 | 1035 | 944 | 950 | 1028.0 | 937 | 738 | 937 | 764 | 888 | 944 | 1065 | 1020 | 1005 | 1023 | 937 | 266 | 1598 |
ANSWER: Above two tables, one boxplot and one line chart show that traveling distances in each day of the week are similar, though 100 miles more median distances are on Saturday and Sunday. As for the hypothesis that business trips take off on Sunday evenings, median distances are the same at 10 PM and 11 PM whichever day of the week it is.