Summarize Data

daily <- flights %>%
  mutate(date = make_date(year, month, day)) %>%
  group_by(date) %>%
  summarize(n = n())

ggplot(daily, aes(date, n)) +
  geom_line()

Investigate Daily-Weekly Pattern

daily <- daily %>%
  mutate(wday = wday(date, label = TRUE))
ggplot(daily, aes(wday,n)) +
  geom_boxplot()

mod = lm(n ~ wday, data = daily, na.action = na.warn)

grid <- daily %>%
  data_grid(wday) %>%
  add_predictions(mod, "n")

ggplot(daily, aes(wday, n)) +
  geom_boxplot() +
  geom_point(data = grid, color = "orange", size = 4)

Investigate residuals

daily <- daily %>%
  add_residuals(mod)

daily %>%
  ggplot(aes(date, resid)) +
  geom_ref_line(h = 0) +
  geom_line()

ggplot(daily, aes(date, resid, color = wday)) +
  geom_ref_line(h = 0, colour = "red") +
  geom_line()

daily %>%
  filter(resid < -100)
## # A tibble: 11 x 4
##    date           n wday  resid
##    <date>     <int> <ord> <dbl>
##  1 2013-01-01   842 周二  -109.
##  2 2013-01-20   786 周日  -105.
##  3 2013-05-26   729 周日  -162.
##  4 2013-07-04   737 周四  -229.
##  5 2013-07-05   822 周五  -145.
##  6 2013-09-01   718 周日  -173.
##  7 2013-11-28   634 周四  -332.
##  8 2013-11-29   661 周五  -306.
##  9 2013-12-24   761 周二  -190.
## 10 2013-12-25   719 周三  -244.
## 11 2013-12-31   776 周二  -175.
daily %>%
  ggplot(aes(date, resid)) +
  geom_ref_line(h = 0, colour = "red", size = 1) +
  geom_line(color = "grey50") +
  geom_smooth(se = FALSE, span = 0.20)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Seasonal Saturday effect

daily %>%
  filter(wday == "Sat") %>%
  ggplot(aes(date, n)) +
  geom_point()+
  geom_line() +
  scale_x_date(
    NULL,
    date_breaks = "1 month",
    date_labels = "%b"
  )

Add Seasonal Variable

term <- function(date) {
  cut(date,
      breaks = ymd(20130101, 20130605, 20130825, 20140101),
      labels = c("spring", "summer", "fall")
      )
}

daily <- daily %>%
  mutate(term = term(date))

daily %>%
  filter(wday == "Sat") %>%
  ggplot(aes(date, n, color = term)) +
  geom_point(alpha = 1/3)+
  geom_line() +
  scale_x_date(
    NULL,
    date_breaks = "1 month",
    date_labels = "%b"
  )

daily %>%
  ggplot(aes(wday, n, color = term)) +
  geom_boxplot()

mod1 <- lm(n ~ wday, data = daily, na.action = na.warn)
mod2 <- lm(n ~ wday * term, data = daily, na.action = na.warn)

daily %>%
  gather_residuals(without_term = mod1, with_term = mod2) %>%
  ggplot(aes(date, resid, color = model)) +
  geom_line(alpha = 0.75)

grid <- daily %>%
  data_grid(wday, term) %>%
  add_predictions(mod2, "n")

ggplot(daily, aes(wday, n)) +
  geom_boxplot() +
  geom_point(data = grid, color = "red") +
  facet_wrap(~ term)

Better model for outliers (Robust regression)

mod3 <- MASS::rlm(n ~ wday * term, data = daily, na.action = na.warn)

daily %>%
  add_residuals(mod3, "resid") %>%
  ggplot(aes(date, resid)) +
  geom_hline(yintercept = 0, size = 2, color = "red") +
  geom_line()

Computed Variables

# If you are creating variables it might be a good idea to bundle the creation of the variables up into a function
compute_vars <- function(data) {
  data %>%
    mutate(term = term(date),
           wday = wday(date, label = TRUE)
           )
}

# Another option would be to put the transformations directly in the model formula:

wday2 <- function(x) wday(x, label = TRUE)
mod3 <- lm(n ~ wday2(date) * term(date), data = daily, na.action = na.warn)

Time of Year: An Alternative Approach

# We could use a more flexible model to capture the pattern of school term in the data
library(splines)
mod <- MASS::rlm(n ~ wday * ns(date, 5), data = daily, na.action = na.warn)

daily %>% 
  data_grid(wday, date = seq_range(date, n = 13)) %>% 
  add_predictions(mod) %>% 
  ggplot(aes(date, pred, color = wday)) +
  geom_line() +
  geom_point()

# We see a strong pattern in the numbers of Sat flights.  This is reassuring, because we also saw that pattern in the raw data.  It's a good sign when you get the same signal from different approaches.

Question #1

Why are there fewer than expected flights on January 20, May 26 and September 1? (Hint: they all have the same explanation.) How would these days generalize into another year?

# Because those days are holidays. People have less willingness to traval. 

holiday <- c("0121", "0526", "0902")
years <- 2013:2019
map(years, ~ wday(ymd(paste0(.x, holiday, sep = "")), label = TRUE))
## [[1]]
## [1] 周一 周日 周一
## Levels: 周日 < 周一 < 周二 < 周三 < 周四 < 周五 < 周六
## 
## [[2]]
## [1] 周二 周一 周二
## Levels: 周日 < 周一 < 周二 < 周三 < 周四 < 周五 < 周六
## 
## [[3]]
## [1] 周三 周二 周三
## Levels: 周日 < 周一 < 周二 < 周三 < 周四 < 周五 < 周六
## 
## [[4]]
## [1] 周四 周四 周五
## Levels: 周日 < 周一 < 周二 < 周三 < 周四 < 周五 < 周六
## 
## [[5]]
## [1] 周六 周五 周六
## Levels: 周日 < 周一 < 周二 < 周三 < 周四 < 周五 < 周六
## 
## [[6]]
## [1] 周日 周六 周日
## Levels: 周日 < 周一 < 周二 < 周三 < 周四 < 周五 < 周六
## 
## [[7]]
## [1] 周一 周日 周一
## Levels: 周日 < 周一 < 周二 < 周三 < 周四 < 周五 < 周六

The result varis between years

Question #2

What do the three days with high positive residuals represent? How would these days generalize to another year?

# Use this chunk to answer question 2
daily %>%
  top_n(3, resid)
## # A tibble: 3 x 5
##   date           n wday  resid term 
##   <date>     <int> <ord> <dbl> <fct>
## 1 2013-11-30   857 周六  112.  fall 
## 2 2013-12-01   987 周日   95.5 fall 
## 3 2013-12-28   814 周六   69.4 fall

It represents that for Saturdays and Sundays the model underpredicts the number of flights. However, these specific days of week might varies for another year based on the test we did in question 1, so we need make sure that this high imprecision is a weekend effect or a date effect.

Question #3

Create a new variable that splits the “wday” variable into terms, but only for Saturdays, i.e., it should have Thurs, Fri, but Sat-summer, Sat-spring, Sat-fall. How does this model compare with the model with every combination of “wday” and “term”?

daily <-
  flights %>% 
  mutate(date = make_date(year, month, day)) %>%
  group_by(date) %>% 
  summarize(n = n()) %>% 
  mutate(wday = wday(date, label = TRUE))
mod <- lm(n ~ wday, data = daily)
daily <- add_residuals(daily, mod)
term <- function(date) {
  cut(date,
      breaks = ymd(20130101, 20130605, 20130825, 20140101),
      labels = c("spring", "summer", "fall")
      )
}
daily <-
  daily %>% 
  mutate(term = term(date))
###
new_daily <-
  daily %>% 
  mutate(wday = as.character(wday),
         term_sat = ifelse(wday == "Sat", paste0(wday, "-", term), wday))
mod1 <- MASS::rlm(n ~ term_sat, data = new_daily)
new_daily %>% 
  add_residuals(mod1) %>% 
  ggplot(aes(date, resid)) +
  geom_line()

# Use this chunk to answer question 3

Question #4

Create a new “wday” variable that combines the day of week, term(for Saturdays), and public holidays. What do the residuals of the model look like?

# Use this chunk to answer question 4

daily_holidays <-
  new_daily %>% 
  mutate(holidays = case_when(date %in% ymd(c(20130101, # new years
                                              20130121, # mlk
                                              20130218, # presidents
                                              20130527, # memorial
                                              20130704, # independence
                                              20130902, # labor
                                              20131028, # columbus
                                              20131111, # veterans
                                              20131128, # thanksgiving
                                              20131225)) ~ "holiday",
                              TRUE ~ "None")) %>% 
  unite(new_term, term_sat, holidays)
mod2 <- lm(n ~ new_term, data = daily_holidays)
daily_holidays %>% 
  add_residuals(mod2) %>% 
  ggplot(aes(date, resid)) +
  geom_line()

Question #5

What happens if you fit a day-of-week effect that varies by month (i.e.m n ~ wday*month)? Why is this not very helpful?

# Use this chunk to answer question 5

mod2 <- lm(n ~ wday * month(date), data = daily_holidays)
daily_holidays %>% 
  add_residuals(mod2) %>% 
  ggplot(aes(date, resid)) +
  geom_line()

Question #6

What would you expect the model n ~ wday + ns(date,5) to look like? Knowing what you know about the data, why would you expect it not to be particularly effective?

# The expect model should work for the specific year(this year). However, since we know the days various by each year, so we would know that it would not be particularly effective into generalize to another year.

Question #7

We hypothesized that people leaving on Sundays are more likely to be business travelers who need to be somewhere on Monday. Explore the hypothesis by seeing how if breaks down based on distance and time: if it’s true, you’d expect to see more Sunday evening flights to places that are far away.

# Use this chunk to answer question 7

week_relevel <- function(x) {
  fct_relevel(x, "Sun", after = 7)
}
daily %>% 
  mutate(wday = week_relevel(wday)) %>% 
  ggplot(aes(wday, n)) +
  geom_boxplot()
## Warning: Unknown levels in `f`: Sun