ANLY 505 - Problem Set #4

============================================================================================================
Reference: Wickham, H., & Grolemund, G. (2016). R for data science: import, tidy, transform, visualize, and model data. “O’Reilly Media, Inc.”. 384-395.
Data dictionary: https://cran.r-project.org/web/packages/nycflights13/nycflights13.pdf

Exploratory data analysis (EDA)

1. Summarize data

daily <- flights %>%
  mutate(date = make_date(year, month, day)) %>%
  group_by(date) %>%
  summarize(n = n()) #aggregate the time series

ggplot(daily, aes(date, n)) + geom_line() + labs(x="Date", y="Number of flights") + ggtitle("Line chart 1")

2. Investigate daily / weekly pattern

daily <- daily %>%
  mutate(wday = wday(date, label=T))
ggplot(daily, aes(wday,n)) + geom_boxplot() + labs(x="Week", y="Number of flights") + ggtitle("Boxplot 1")

mod <- lm(n ~ wday, data = daily)

grid <- daily %>%
  data_grid(wday) %>%
  add_predictions(mod, "n")

ggplot(daily, aes(wday, n)) +
  geom_boxplot() +
  geom_point(data = grid, color = "orange", size = 4) +
  labs(x="Week", y="Number of flights") + ggtitle("Boxplot 2")

3. Investigate residuals

daily <- daily %>%
  add_residuals(mod)

# Beginner notice: use "colour" instead of "color" in "geom_ref_line()"
daily %>%
  ggplot(aes(date, resid)) +
  geom_ref_line(h = 0, colour = "red", size = 1) +
  geom_line() +
  labs(x="Date", y="Residuals") + ggtitle("Line chart 2")

ggplot(daily, aes(date, resid, color = wday)) +
  geom_ref_line(h = 0, colour = "red", size = 1) +
  geom_line() +
  labs(x="Week", y="Residuals") + ggtitle("Line chart 3")

# Beginner notice: resid<-100 can cause bug; use spaces properly
daily %>%
  filter(resid < -100) %>% kable() %>% kable_styling()

date	n	wday	resid
2013-01-01	842	Tue	-109.3585
2013-01-20	786	Sun	-105.4808
2013-05-26	729	Sun	-162.4808
2013-07-04	737	Thu	-228.7500
2013-07-05	822	Fri	-145.4615
2013-09-01	718	Sun	-173.4808
2013-11-28	634	Thu	-331.7500
2013-11-29	661	Fri	-306.4615
2013-12-24	761	Tue	-190.3585
2013-12-25	719	Wed	-243.6923
2013-12-31	776	Tue	-175.3585

daily %>%
  ggplot(aes(date, resid)) +
  geom_ref_line(h = 0, colour = "red", size = 1) +
  geom_line(color = "grey50") +
  geom_smooth(method = "loess", se = F, span = 0.20) +
  labs(x="Date", y="Residuals") + ggtitle("Line chart 4")

4. Seasonal Saturday effect

daily %>%
  filter(wday == "Sat") %>%
  ggplot(aes(date, n)) +
  geom_point()+
  geom_line() +
  scale_x_date(
    NULL,
    date_breaks = "1 month",
    date_labels = "%b"
  ) + labs(x="Date", y="Number of flights") + ggtitle("Line chart 5")

5. Add seasonal variable

term <- function(date) {
  cut(date,
      breaks = ymd(20130101, 20130605, 20130825, 20140101),
      labels = c("spring", "summer", "fall")
      )
}
daily <- daily %>%
  mutate(term = term(date))

daily %>%
  filter(wday == "Sat") %>%
  ggplot(aes(date, n, color = term)) +
  geom_point(alpha = 1/3) +
  geom_line() +
  scale_x_date(
    NULL,
    date_breaks = "1 month",
    date_labels = "%b"
  ) + labs(x="Date", y="Number of flights") + ggtitle("Line chart 6")

daily %>%
  ggplot(aes(wday, n, color = term)) +
  geom_boxplot() + labs(x="Week", y="Number of flights") + ggtitle("Boxplot 3")

mod2 <- lm(n ~ wday * term, data = daily)

daily %>%
  gather_residuals(without_term = mod, with_term = mod2) %>%
  ggplot(aes(date, resid, color = model)) +
  geom_line(alpha = 0.75) +
  labs(x="Date", y="Residuals") + ggtitle("Line chart 7")

grid2 <- daily %>%
  data_grid(wday, term) %>%
  add_predictions(mod2, "n")

ggplot(daily, aes(wday, n)) +
  geom_boxplot() +
  geom_point(data = grid2, color = "red") +
  facet_wrap(~ term) +
  labs(x="Week", y="Number of flights") + ggtitle("Boxplot 4")

6. Better model for outliers (Robust regression)

mod3 <- MASS::rlm(n ~ wday * term, data = daily)

daily %>%
  add_residuals(mod3, "resid3") %>%
  ggplot(aes(date, resid3)) +
  geom_hline(yintercept = 0, size = 1, color = "red") +
  geom_line() + labs(x="Date", y="Residuals") + ggtitle("Line chart 8")

7. Computed variables

# If you are creating variables it might be a good idea to bundle the creation of the variables up into a function
compute_vars <- function(data) {
  data %>%
    mutate(term = term(date),
           wday = wday(date, label=T)
           )
}

# Another option would be to put the transformations directly in the model formula:
wday2 <- function(x) {
  wday(x, label=T)
}
mod3r <- lm(n ~ wday2(date) * term(date), data = daily)

8. Time of year: An alternative approach

# We could use a more flexible model to capture the pattern of school term in the data
mod4 <- MASS::rlm(n ~ wday * splines::ns(date, 5), data = daily)

daily %>% 
  data_grid(wday, date = seq_range(date, n = 13)) %>% 
  add_predictions(mod4) %>% 
  ggplot(aes(date, pred, color = wday)) +
  geom_line() +
  geom_point() +
  labs(x="Date", y="Prediction") + ggtitle("Line chart 9")

# We see a strong pattern in the numbers of Saturday flights. This is reassuring, because we also saw that pattern in the raw data. It is a good sign when you get the same signal from different approaches.

Questions

Question #1

Why are there fewer than expected flights on January 20, May 26 and September 1? (Hint: they all have the same explanation.) How would these days generalize into another year?

datQ1 <- subset(daily, date %in% as.Date(c("2013-01-20","2013-05-26","2013-09-01")))
datQ1$holiday <- c("Martin Luther King Jr. Day", "Memorial Day", "Labor Day")
#datQ1 %>% kable() %>% kable_styling()

# Or follow the Lecture's scripting style
daily %>%
  filter(date %in% as.Date(c("2013-01-20","2013-05-26","2013-09-01"))) %>%
  mutate(holiday = c("Martin Luther King Jr. Day", "Memorial Day", "Labor Day")) %>%
  kable() %>% kable_styling()

date	n	wday	resid	term	holiday
2013-01-20	786	Sun	-105.4808	spring	Martin Luther King Jr. Day
2013-05-26	729	Sun	-162.4808	spring	Memorial Day
2013-09-01	718	Sun	-173.4808	fall	Labor Day

ANSWER: These are the Sundays before Martin Luther King Jr. Day (the third Monday of January since 1986), Memorial Day (the last Monday of May since 1868) and Labor Day (the first Monday of September since 1882). The corresponding Sundays in 2014 will be January 19, May 25 and August 31.

Question #2

What do the three days with high positive residuals represent? How would these days generalize to another year?

daily %>%
  top_n(3, resid) %>%
  mutate(holiday = c("Thanksgiving (United States)", "Thanksgiving (United States)", "Hanukkah, Christmas, New Year")) %>%
  kable() %>% kable_styling()

date	n	wday	resid	term	holiday
2013-11-30	857	Sat	112.38462	fall	Thanksgiving (United States)
2013-12-01	987	Sun	95.51923	fall	Thanksgiving (United States)
2013-12-28	814	Sat	69.38462	fall	Hanukkah, Christmas, New Year

ANSWER: These are the Saturdays close to Thanksgiving Day in the United States (the fourth Thursday of November since 1621), Christmas Day (December 25 since 336) or Hanukkah (the eight nights and days starting on the 25th day of Kislev according to the Hebrew calendar since 200 B.C.), and New Year’s Day (January 1 since 45 B.C.). Traveling around popular holidays is in high demand. 2014 would witness the same case.

Question #3

Create a new variable that splits the “wday” variable into terms, but only for Saturdays, i.e., it should have Thurs, Fri, but Sat-summer, Sat-spring, Sat-fall. How does this model compare with the model with every combination of “wday” and “term”?

daily <- daily %>%
  mutate(wday2 = case_when(wday == "Sat" & term == "spring" ~ "Sat-spring",
                           wday == "Sat" & term == "summer" ~ "Sat-summer",
                           wday == "Sat" & term == "fall" ~ "Sat-fall",
                           T ~ as.character(wday)))
modQ3 <- lm(n ~wday2, data = daily)

daily %>%
  spread_residuals(sat_term = modQ3, all_interact = mod2) %>%
  mutate(residdiff = sat_term - all_interact) %>%
  ggplot(aes(date, residdiff)) +
  geom_ref_line(h = 0, colour = "red") +
  geom_line(alpha = 0.75) +
  labs(x="Date", y="Residuals difference") + ggtitle("Line chart 10")

options(scipen=100,digits=4)
rbind(broom::glance(modQ3), broom::glance(mod2)) %>%
  mutate(model = c("with Saturday and term", "with all interactions")) %>%
  kable() %>% kable_styling()

r.squared	adj.r.squared	sigma	statistic	p.value	df	logLik	AIC	BIC	deviance	df.residual	model
0.7357	0.7297	47.36	123.84	0	9	-1921	3863	3902	798487	356	with Saturday and term
0.7573	0.7432	46.17	53.67	0	21	-1906	3856	3942	733157	344	with all interactions

ANSWER: Line chart 10 shows that the model with Saturday and term has lower residuals in Spring and higher residuals in Summer than the model with all interactions.

Table shows that the model with Saturday and term has the lower adjusted R-squared and the higher standard error of the regression but the higher AIC than the model with all interactions.

Question #4

Create a new “wday” variable that combines the day of week, term(for Saturdays), and public holidays. What do the residuals of the model look like?

holidays <- ymd(20130101, 20130120, 20130526, 20130704, 20130705, 20130901, 20131128, 20131129, 20131130, 20131201, 20131224, 20131225, 20131231)
# Federal holidays in the United States in 2013 are available at https://www.officeholidays.com/countries/usa/2013.php.

daily <- daily %>%
  mutate(wday3 = case_when(date %in% holidays ~ "holidays",
                           T ~ as.character(wday2)))
modQ4 <- lm(n ~ wday3, data = daily)
daily <- daily %>%
  add_residuals(modQ4, "residQ4")

ggplot(daily, aes(date, residQ4)) +
  geom_ref_line(h = 0, colour = "red", size = 1) +
  geom_line() +
  labs(x="Date", y="Residuals") + ggtitle("Line chart 11")

ANSWER: Compared Line chart 11 to Line chart 2, the absolute values of the extreme residuals slightly narrow down. However, the line shape remains relatively the same. The model with Saturday, term and holidays still predict more flights in January and February, and less flights in July. And the number of flights fluctuates during Thanksgiving and Christmas holidays.

It is noticed that not all federal holidays impact traveling, e.g., Presidents’ Day (the third Monday of February since 1885), Emancipation Day (the weekday closest to April 16 since 1866), Columbus Day (the second Monday of October since 1869) and Veterans Day (November 11 since 1938). The Lunar New Year’s Day in 2013 was on February 10.

Question #5

What happens if you fit a day-of-week effect that varies by month (i.e. n ~ wday * month)? Why is this not very helpful?

dummy <- daily %>%
  mutate(month = month(date, label=T))
modQ5 <- lm(n ~ wday * month, data = dummy)
dummy %>%
  add_residuals(modQ5, "residQ5") %>%
  ggplot(aes(date, residQ5)) +
  geom_ref_line(h = 0, colour = "red", size = 1) +
  geom_line() +
  labs(x="Date", y="Residuals") + ggtitle("Line chart 12")

options(scipen=100,digits=4)
broom::glance(modQ5) %>% kable() %>% kable_styling()

r.squared	adj.r.squared	sigma	statistic	p.value	df	logLik	AIC	BIC	deviance	df.residual
0.8355	0.7869	42.05	17.2	0	84	-1835	3840	4171	496839	281

ANSWER: The sample size for each day of the week in a month is either four or five, which may not be enough to build a robust regression. Line chart 12 shows severe fluctuation in July, November and December, but smoothness in January.

Table shows that the model with days of the week and month has the higher adjusted R-squared and the lower standard error of the regression but the lower AIC than the model with all interactions.

Question #6

What would you expect the model n ~ wday + ns(date,5) to look like? Knowing what you know about the data, why would you expect it not to be particularly effective?

modQ6 <- lm(n ~ wday + splines::ns(date, df = 5), data = daily)

daily %>%
  add_residuals(modQ6, "residQ6") %>%
  ggplot(aes(date, residQ6)) +
  geom_ref_line(h = 0, colour = "red", size = 1) +
  geom_line() +
  labs(x="Date", y="Residuals") + ggtitle("Line chart 13")

options(scipen=100,digits=4)
broom::glance(modQ6) %>% kable() %>% kable_styling()

r.squared	adj.r.squared	sigma	statistic	p.value	df	logLik	AIC	BIC	deviance	df.residual
0.7816	0.7748	43.23	114.9	0	12	-1887	3799	3850	659694	353

ANSWER: In the second part of EDA, boxplot 1 shows that weekdays (through Monday to Friday) have similar flights distribution. In the third part of EDA, line chart 3 shows that not a weekday trend across months but a holidays trend on the dataset particularly exists.

Question #7

We hypothesized that people leaving on Sundays are more likely to be business travelers who need to be somewhere on Monday. Explore the hypothesis by seeing how if breaks down based on distance and time: if it’s true, you’d expect to see more Sunday evening flights to places that are far away.

# Beginner notice: the original R chunk header is "Question #7", however, if "ggplot" is called, there will be "pandoc" bug as shown below when knitting.
# pandoc: ~/NYCFlightsDataModel_files/figure-html/Question : openBinaryFile: does not exist (No such file or directory)
# Error: pandoc document conversion failed with error 1
# Execution halted
# Solution: delete the hashtag symbol (#) in the R chunk header

datQ7 <- flights %>%
  mutate(date = make_date(year, month, day),
         wday = wday(date, label = T)) %>%
  select(date, wday, distance, air_time, hour) %>%
  filter(hour != 1) #omit the single one record of 1 AM flight

datQ7 %>%
  ggplot(aes(wday, distance)) + geom_boxplot() +
  labs(x="Week", y="Traveling distance (miles)") + ggtitle("Boxplot 5")

datQ7 %>%
  group_by(wday) %>%
  summarise("Mean distance (miles)" = round(mean(distance)),
            "Median distance (miles)" = median(distance),
            "Mean duration (minutes)" = round(mean(air_time, na.rm = T)),
            "Median duration (minutes)" = median(air_time, na.rm = T)) %>%
  rename(Week = wday) %>%
  kable() %>% kable_styling()

Week	Mean distance (miles)	Median distance (miles)	Mean duration (minutes)	Median duration (minutes)
Sun	1055	937	152	130
Mon	1032	828	151	129
Tue	1027	820	150	128
Wed	1028	820	150	128
Thu	1033	828	150	128
Fri	1033	828	150	127
Sat	1081	944	154	134

# Visualized by "plotly" for interaction on the chart
htmltools::div(
  datQ7 %>%
    #show legend starting with Sunday
    mutate(wday = factor(wday, levels = rev(levels(wday)))) %>%
    group_by(wday, hour) %>%
    summarise(distMedian = median(distance)) %>%
    plot_ly(y = ~distMedian, x = ~hour, color = ~wday) %>%
    add_markers(showlegend = F) %>% add_lines() %>%
    layout(xaxis = list(title = "Hour", gridcolor = "#ffffff", zeroline = F),
           yaxis = list(title = "Median traveling distance (miles)", gridcolor = "#ffffff", zeroline = F),
           title = "Line chart 14", plot_bgcolor = "#ebebeb"), 
  align = "center")

datQ7 %>%
  group_by(wday, hour) %>%
  summarise(distMedian = median(distance)) %>%
  reshape2::acast(wday~hour, value.var="distMedian") %>%
  kable() %>% kable_styling()

	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19	20	21	22	23
Sun	1400	963	1041	1005	950	1017.0	820	746	762	762	765	764	1008	944	944	762	719	266	1598
Mon	1400	762	1035	872	944	872.0	888	746	762	762	765	762	950	997	937	762	719	266	1598
Tue	1400	760	1035	872	944	872.0	888	746	762	762	765	762	950	944	944	740	642	266	1598
Wed	1400	760	1035	833	937	872.0	888	746	762	762	764	762	1008	937	937	753	642	266	1598
Thu	1400	760	1035	872	944	872.0	888	746	762	762	765	764	1008	944	944	762	719	266	1598
Fri	1400	760	1035	872	944	809.5	888	746	762	762	765	764	950	944	937	762	733	266	1598
Sat	1400	937	1035	944	950	1028.0	937	738	937	764	888	944	1065	1020	1005	1023	937	266	1598

ANSWER: Above two tables, one boxplot and one line chart show that traveling distances in each day of the week are similar, though 100 miles more median distances are on Saturday and Sunday. As for the hypothesis that business trips take off on Sunday evenings, median distances are the same at 10 PM and 11 PM whichever day of the week it is.