Summarize Data

daily <- flights %>%
  mutate(date = make_date(year, month, day)) %>%
  group_by(date) %>%
  summarize(n = n())

ggplot(daily, aes(date, n)) +
  geom_line()

Investigate Daily-Weekly Pattern

daily <- daily %>%
  mutate(wday = wday(date, label = TRUE))
ggplot(daily, aes(wday,n)) +
  geom_boxplot()

mod = lm(n ~ wday, data = daily)

grid <- daily %>%
  data_grid(wday) %>%
  add_predictions(mod, "n")

ggplot(daily, aes(wday, n)) +
  geom_boxplot() +
  geom_point(data = grid, color = "orange", size = 4)

Investigate residuals

daily <- daily %>%
  add_residuals(mod)

daily %>%
  ggplot(aes(date, resid)) +
  geom_ref_line(h = 0) +
  geom_line()

ggplot(daily, aes(date, resid, color = wday)) +
  geom_ref_line(h = 0, colour = "red") +
  geom_line()

daily %>%
  filter(resid < -100)
## # A tibble: 11 x 4
##    date           n wday  resid
##    <date>     <int> <ord> <dbl>
##  1 2013-01-01   842 鍛ㄤ簩  -109.
##  2 2013-01-20   786 鍛ㄦ棩  -105.
##  3 2013-05-26   729 鍛ㄦ棩  -162.
##  4 2013-07-04   737 鍛ㄥ洓  -229.
##  5 2013-07-05   822 鍛ㄤ簲  -145.
##  6 2013-09-01   718 鍛ㄦ棩  -173.
##  7 2013-11-28   634 鍛ㄥ洓  -332.
##  8 2013-11-29   661 鍛ㄤ簲  -306.
##  9 2013-12-24   761 鍛ㄤ簩  -190.
## 10 2013-12-25   719 鍛ㄤ笁  -244.
## 11 2013-12-31   776 鍛ㄤ簩  -175.
daily %>%
  ggplot(aes(date, resid)) +
  geom_ref_line(h = 0, colour = "red", size = 1) +
  geom_line(color = "grey50") +
  geom_smooth(se = FALSE, span = 0.20)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Seasonal Saturday effect

daily %>%
  filter(wday == "Sat") %>%
  ggplot(aes(date, n)) +
  geom_point()+
  geom_line() +
  scale_x_date(
    NULL,
    date_breaks = "1 month",
    date_labels = "%b"
  )

Add Seasonal Variable

term <- function(date) {
  cut(date,
      breaks = ymd(20130101, 20130605, 20130825, 20140101),
      labels = c("spring", "summer", "fall")
      )
}

daily <- daily %>%
  mutate(term = term(date))

daily %>%
  filter(wday == "Sat") %>%
  ggplot(aes(date, n, color = term)) +
  geom_point(alpha = 1/3)+
  geom_line() +
  scale_x_date(
    NULL,
    date_breaks = "1 month",
    date_labels = "%b"
  )

daily %>%
  ggplot(aes(wday, n, color = term)) +
  geom_boxplot()

mod1 <- lm(n ~ wday, data = daily)
mod2 <- lm(n ~ wday * term, data = daily)

daily %>%
  gather_residuals(without_term = mod1, with_term = mod2) %>%
  ggplot(aes(date, resid, color = model)) +
  geom_line(alpha = 0.75)

grid <- daily %>%
  data_grid(wday, term) %>%
  add_predictions(mod2, "n")

ggplot(daily, aes(wday, n)) +
  geom_boxplot() +
  geom_point(data = grid, color = "red") +
  facet_wrap(~ term)

Better model for outliers (Robust regression)

mod3 <- MASS::rlm(n ~ wday * term, data = daily)

daily %>%
  add_residuals(mod3, "resid") %>%
  ggplot(aes(date, resid)) +
  geom_hline(yintercept = 0, size = 2, color = "red") +
  geom_line()

Computed Variables

# If you are creating variables it might be a good idea to bundle the creation of the variables up into a function
compute_vars <- function(data) {
  data %>%
    mutate(term = term(date),
           wday = wday(date, label = TRUE)
           )
}

# Another option would be to put the transformations directly in the model formula:

wday2 <- function(x) wday(x, label = TRUE)
mod3 <- lm(n ~ wday2(date) * term(date), data = daily)

Time of Year: An Alternative Approach

# We could use a more flexible model to capture the pattern of school term in the data
library(splines)
mod <- MASS::rlm(n ~ wday * ns(date, 5), data = daily)

daily %>% 
  data_grid(wday, date = seq_range(date, n = 13)) %>% 
  add_predictions(mod) %>% 
  ggplot(aes(date, pred, color = wday)) +
  geom_line() +
  geom_point()

# We see a strong pattern in the numbers of Sat flights.  This is reassuring, because we also saw that pattern in the raw data.  It's a good sign when you get the same signal from different approaches.

Question #1

Why are there fewer than expected flights on January 20, May 26 and September 1? (Hint: they all have the same explanation.) How would these days generalize into another year?

#These days are Sunday before holidays. Since these holidays are not fixed, they will not generalize into another year (see results below).
holiday <- c("0121", "0526", "0902")
years <- 2013:2019
map(years, ~ wday(ymd(paste0(.x, holiday, sep = "")), label = TRUE))
## [[1]]
## [1] 鍛ㄤ竴 鍛ㄦ棩 鍛ㄤ竴
## Levels: 鍛ㄦ棩 < 鍛ㄤ竴 < 鍛ㄤ簩 < 鍛ㄤ笁 < 鍛ㄥ洓 < 鍛ㄤ簲 < 鍛ㄥ叚
## 
## [[2]]
## [1] 鍛ㄤ簩 鍛ㄤ竴 鍛ㄤ簩
## Levels: 鍛ㄦ棩 < 鍛ㄤ竴 < 鍛ㄤ簩 < 鍛ㄤ笁 < 鍛ㄥ洓 < 鍛ㄤ簲 < 鍛ㄥ叚
## 
## [[3]]
## [1] 鍛ㄤ笁 鍛ㄤ簩 鍛ㄤ笁
## Levels: 鍛ㄦ棩 < 鍛ㄤ竴 < 鍛ㄤ簩 < 鍛ㄤ笁 < 鍛ㄥ洓 < 鍛ㄤ簲 < 鍛ㄥ叚
## 
## [[4]]
## [1] 鍛ㄥ洓 鍛ㄥ洓 鍛ㄤ簲
## Levels: 鍛ㄦ棩 < 鍛ㄤ竴 < 鍛ㄤ簩 < 鍛ㄤ笁 < 鍛ㄥ洓 < 鍛ㄤ簲 < 鍛ㄥ叚
## 
## [[5]]
## [1] 鍛ㄥ叚 鍛ㄤ簲 鍛ㄥ叚
## Levels: 鍛ㄦ棩 < 鍛ㄤ竴 < 鍛ㄤ簩 < 鍛ㄤ笁 < 鍛ㄥ洓 < 鍛ㄤ簲 < 鍛ㄥ叚
## 
## [[6]]
## [1] 鍛ㄦ棩 鍛ㄥ叚 鍛ㄦ棩
## Levels: 鍛ㄦ棩 < 鍛ㄤ竴 < 鍛ㄤ簩 < 鍛ㄤ笁 < 鍛ㄥ洓 < 鍛ㄤ簲 < 鍛ㄥ叚
## 
## [[7]]
## [1] 鍛ㄤ竴 鍛ㄦ棩 鍛ㄤ竴
## Levels: 鍛ㄦ棩 < 鍛ㄤ竴 < 鍛ㄤ簩 < 鍛ㄤ笁 < 鍛ㄥ洓 < 鍛ㄤ簲 < 鍛ㄥ叚

Question #2

What do the three days with high positive residuals represent? How would these days generalize to another year?

daily %>%
  top_n(3, resid)
## # A tibble: 3 x 5
##   date           n wday  resid term 
##   <date>     <int> <ord> <dbl> <fct>
## 1 2013-11-30   857 鍛ㄥ叚  112.  fall 
## 2 2013-12-01   987 鍛ㄦ棩   95.5 fall 
## 3 2013-12-28   814 鍛ㄥ叚   69.4 fall

These three days represent that the model underestimated the number of flights. Since these are days around Thanksgiving and Christmas, which are fixed every year, we can generalize them to another year.

Question #3

Create a new variable that splits the “wday” variable into terms, but only for Saturdays, i.e., it should have Thurs, Fri, but Sat-summer, Sat-spring, Sat-fall. How does this model compare with the model with every combination of “wday” and “term”?

daily=daily %>%
  mutate(
    wday_new=
      case_when(
        wday=="Sat"&term=="summer"~"Sat-summer",
        wday=="Sat"&term=="fall"~"Sat-fall",
        wday=="Sat"&term=="spring"~"Sat-spring",
        TRUE~as.character(wday)
      )
  )

mod4=lm(n~wday_new,data=daily)

daily %>%
  gather_residuals(sat_term=mod4, all_term=mod2) %>%
  ggplot(aes(date,resid,colour=model)) +
  geom_line(alpha=0.8)

Jan-Mar is under predicted. Ourliers exist in summer and winter seasons.

Question #4

Create a new “wday” variable that combines the day of week, term(for Saturdays), and public holidays. What do the residuals of the model look like?

daily <- daily %>%
  mutate(wday3 = 
         case_when(
           .$date %in% lubridate::ymd(c(20130101, # new years
                                        20130121, # mlk
                                        20130218, # presidents
                                        20130527, # memorial
                                        20130704, # independence
                                        20130902, # labor
                                        20131028, # columbus
                                        20131111, # veterans
                                        20131128, # thanksgiving
                                        20131225)) ~
             "holiday",
           .$wday == "Sat" & .$term == "summer" ~ "Sat-summer",
           .$wday == "Sat" & .$ term == "fall" ~ "Sat-fall",
           .$wday == "Sat" & .$term == "spring" ~ "Sat-spring",
           TRUE ~ as.character(.$wday)))
mod5 <- lm(n ~ wday3, data = daily)
daily %>% 
  spread_residuals(mod5) %>%
  arrange(desc(abs(resid))) %>%
  slice(1:20) %>% select(date, wday, resid)
## # A tibble: 20 x 3
##    date       wday   resid
##    <date>     <ord>  <dbl>
##  1 2013-11-28 鍛ㄥ洓  -332. 
##  2 2013-11-29 鍛ㄤ簲  -306. 
##  3 2013-12-25 鍛ㄤ笁  -244. 
##  4 2013-07-04 鍛ㄥ洓  -229. 
##  5 2013-12-24 鍛ㄤ簩  -190. 
##  6 2013-12-31 鍛ㄤ簩  -175. 
##  7 2013-09-01 鍛ㄦ棩  -173. 
##  8 2013-05-26 鍛ㄦ棩  -162. 
##  9 2013-07-05 鍛ㄤ簲  -145. 
## 10 2013-11-30 鍛ㄥ叚   112. 
## 11 2013-01-01 鍛ㄤ簩  -109. 
## 12 2013-01-20 鍛ㄦ棩  -105. 
## 13 2013-12-01 鍛ㄦ棩    95.5
## 14 2013-02-03 鍛ㄦ棩   -77.5
## 15 2013-01-19 鍛ㄥ叚   -70.6
## 16 2013-12-28 鍛ㄥ叚    69.4
## 17 2013-10-12 鍛ㄥ叚   -68.6
## 18 2013-01-27 鍛ㄦ棩   -68.5
## 19 2013-06-22 鍛ㄥ叚    67.4
## 20 2013-06-29 鍛ㄥ叚    67.4

The pattern looks similiar to the model in Q3.

Question #5

What happens if you fit a day-of-week effect that varies by month (i.e.m n ~ wday*month)? Why is this not very helpful?

mod2=lm(n~wday*term,data=daily)

mod_q5=lm(n~wday*month(date),data=daily)
daily %>% 
  gather_residuals(Q4=mod_q5, All_combine=mod2) %>% 
  ggplot(aes(date,resid,colour=model)) +
    geom_line(alpha=0.75)

There will be more outliers in graph. The reason is that the model reduced the number of observations per bucket and made the prediction uncertain.

Question #6

What would you expect the model n ~ wday + ns(date,5) to look like? Knowing what you know about the data, why would you expect it not to be particularly effective?

mod5 <- lm(n ~ wday + splines::ns(date, 5), data = daily)
daily %>%
  gather_residuals(mod2,mod5)%>%
  arrange(date)%>%
  ggplot(aes(date,resid,color = model))+
  geom_line(alpha = 0.75)

The model did well in predicting the trend this year but may not we efective in other years because some factors are seasonal or change over the years.

Question #7

We hypothesized that people leaving on Sundays are more likely to be business travelers who need to be somewhere on Monday. Explore the hypothesis by seeing how if breaks down based on distance and time: if it’s true, you’d expect to see more Sunday evening flights to places that are far away.

week_relevel=function(x) {
  fct_relevel(x,"Sun",after=8)
}
daily %>%
  mutate(wday=week_relevel(wday)) %>%
  ggplot(aes(wday,n))+ggtitle("Sunday Night Flights") +
  geom_boxplot()
## Warning: Unknown levels in `f`: Sun

The result revealed that the hypothesis is not true.