Summarize Data

daily <- flights %>%
  mutate(date = make_date(year, month, day)) %>%
  group_by(date) %>%
  summarize(n = n())

ggplot(daily, aes(date, n)) +
  geom_line()

Investigate Daily-Weekly Pattern

daily <- daily %>%
  mutate(wday = wday(date, label = TRUE))
ggplot(daily, aes(wday,n)) +
  geom_boxplot()

mod = lm(n ~ wday, data = daily)

grid <- daily %>%
  data_grid(wday) %>%
  add_predictions(mod, "n")

ggplot(daily, aes(wday, n)) +
  geom_boxplot() +
  geom_point(data = grid, color = "orange", size = 4)

Investigate residuals

daily <- daily %>%
  add_residuals(mod)

daily %>%
  ggplot(aes(date, resid)) +
  geom_ref_line(h = 0) +
  geom_line()

ggplot(daily, aes(date, resid, color = wday)) +
  geom_ref_line(h = 0, colour = "red") +
  geom_line()

daily %>%
  filter(resid < -100)
## # A tibble: 11 x 4
##    date           n wday  resid
##    <date>     <int> <ord> <dbl>
##  1 2013-01-01   842 鍛ㄤ簩  -109.
##  2 2013-01-20   786 鍛ㄦ棩  -105.
##  3 2013-05-26   729 鍛ㄦ棩  -162.
##  4 2013-07-04   737 鍛ㄥ洓  -229.
##  5 2013-07-05   822 鍛ㄤ簲  -145.
##  6 2013-09-01   718 鍛ㄦ棩  -173.
##  7 2013-11-28   634 鍛ㄥ洓  -332.
##  8 2013-11-29   661 鍛ㄤ簲  -306.
##  9 2013-12-24   761 鍛ㄤ簩  -190.
## 10 2013-12-25   719 鍛ㄤ笁  -244.
## 11 2013-12-31   776 鍛ㄤ簩  -175.
daily %>%
  ggplot(aes(date, resid)) +
  geom_ref_line(h = 0, colour = "red", size = 1) +
  geom_line(color = "grey50") +
  geom_smooth(se = FALSE, span = 0.20)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Seasonal Saturday effect

daily %>%
  filter(wday == "Sat") %>%
  ggplot(aes(date, n)) +
  geom_point()+
  geom_line() +
  scale_x_date(
    NULL,
    date_breaks = "1 month",
    date_labels = "%b"
  )

Add Seasonal Variable

term <- function(date) {
  cut(date,
      breaks = ymd(20130101, 20130605, 20130825, 20140101),
      labels = c("spring", "summer", "fall")
      )
}

daily <- daily %>%
  mutate(term = term(date))

daily %>%
  filter(wday == "Sat") %>%
  ggplot(aes(date, n, color = term)) +
  geom_point(alpha = 1/3)+
  geom_line() +
  scale_x_date(
    NULL,
    date_breaks = "1 month",
    date_labels = "%b"
  )

daily %>%
  ggplot(aes(wday, n, color = term)) +
  geom_boxplot()

mod1 <- lm(n ~ wday, data = daily)
mod2 <- lm(n ~ wday * term, data = daily)

daily %>%
  gather_residuals(without_term = mod1, with_term = mod2) %>%
  ggplot(aes(date, resid, color = model)) +
  geom_line(alpha = 0.75)

grid <- daily %>%
  data_grid(wday, term) %>%
  add_predictions(mod2, "n")

ggplot(daily, aes(wday, n)) +
  geom_boxplot() +
  geom_point(data = grid, color = "red") +
  facet_wrap(~ term)

Better model for outliers (Robust regression)

mod3 <- MASS::rlm(n ~ wday * term, data = daily)

daily %>%
  add_residuals(mod3, "resid") %>%
  ggplot(aes(date, resid)) +
  geom_hline(yintercept = 0, size = 2, color = "red") +
  geom_line()

Computed Variables

# If you are creating variables it might be a good idea to bundle the creation of the variables up into a function
compute_vars <- function(data) {
  data %>%
    mutate(term = term(date),
           wday = wday(date, label = TRUE)
           )
}

# Another option would be to put the transformations directly in the model formula:

wday2 <- function(x) wday(x, label = TRUE)
mod3 <- lm(n ~ wday2(date) * term(date), data = daily)

Time of Year: An Alternative Approach

# We could use a more flexible model to capture the pattern of school term in the data
library(splines)
mod <- MASS::rlm(n ~ wday * ns(date, 5), data = daily)

daily %>% 
  data_grid(wday, date = seq_range(date, n = 13)) %>% 
  add_predictions(mod) %>% 
  ggplot(aes(date, pred, color = wday)) +
  geom_line() +
  geom_point()

# We see a strong pattern in the numbers of Sat flights.  This is reassuring, because we also saw that pattern in the raw data.  It's a good sign when you get the same signal from different approaches.

Question #1

Why are there fewer than expected flights on January 20, May 26 and September 1? (Hint: they all have the same explanation.) How would these days generalize into another year?

daily$percentile <- ecdf(daily$n)(daily$n)
daily$percentile[daily$date == "2013-01-20" | daily$date == "2013-05-26" |daily$date == "2013-09-01"]
## [1] 0.13424658 0.07123288 0.06027397

For all of these three days, the value is low. Since these are the Sundays before a Monday holiday. Martin Luther King Day, Memorial Day and Labor Day. The holiday dates varies in different years. So the pattern may not be the same for every year.

Question #2

What do the three days with high positive residuals represent? How would these days generalize to another year?

daily %>%
  top_n(3, resid)
## # A tibble: 3 x 6
##   date           n wday  resid term  percentile
##   <date>     <int> <ord> <dbl> <fct>      <dbl>
## 1 2013-11-30   857 鍛ㄥ叚  112.  fall       0.189
## 2 2013-12-01   987 鍛ㄦ棩   95.5 fall       0.778
## 3 2013-12-28   814 鍛ㄥ叚   69.4 fall       0.167

These are the Saturday and Sunday after Thanksgiving and Saturday after Christmas. The pattern can be generalized for Christmas since the date for Christmas is fixed but not for Thanksgiving.

Question #3

Create a new variable that splits the “wday” variable into terms, but only for Saturdays, i.e., it should have Thurs, Fri, but Sat-summer, Sat-spring, Sat-fall. How does this model compare with the model with every combination of “wday” and “term”?

daily <-
  flights %>% 
  mutate(date = make_date(year, month, day)) %>%
  group_by(date) %>% 
  summarize(n = n()) %>% 
  mutate(wday = wday(date, label = TRUE))
model <- lm(n ~ wday, data = daily)
daily <- add_residuals(daily, model)
term <- function(date) {
  cut(date,
      breaks = ymd(20130101, 20130605, 20130825, 20140101),
      labels = c("spring", "summer", "fall")
      )
}

daily <-
  daily %>% 
  mutate(term = term(date))
###
new_daily <-
  daily %>% 
  mutate(wday = as.character(wday),
         term_sat = ifelse(wday == "Sat", paste0(wday, "-", term), wday))
model1 <- MASS::rlm(n ~ term_sat, data = new_daily)
new_daily %>% 
  add_residuals(model1) %>% 
  ggplot(aes(date, resid)) +
  geom_line()

It looks almost the same. From Jan to March, it has negative values. Unusual values occur during Summer and Winter time and also September.

Question #4

Create a new “wday” variable that combines the day of week, term(for Saturdays), and public holidays. What do the residuals of the model look like?

daily_holidays <-
  new_daily %>% 
  mutate(holidays = case_when(date %in% ymd(c(20130101, # new years
                                              20130121, # mlk
                                              20130218, # presidents
                                              20130527, # memorial
                                              20130704, # independence
                                              20130902, # labor
                                              20131028, # columbus
                                              20131111, # veterans
                                              20131128, # thanksgiving
                                              20131225)) ~ "holiday",
                              TRUE ~ "None")) %>% 
  unite(new_term, term_sat, holidays)
model2 <- lm(n ~ new_term, data = daily_holidays)
daily_holidays %>% 
  add_residuals(model2) %>% 
  ggplot(aes(date, resid)) +
  geom_line()

It doesn’t look like holidays have any significant effect. But it looks like the average residual value is a little bit less than the model in Q2.

Question #5

What happens if you fit a day-of-week effect that varies by month (i.e.m n ~ wday*month)? Why is this not very helpful?

daily <- flights %>% 
  mutate(date = make_date(year, month, day)) %>% 
  group_by(date,month) %>% 
  summarise(n = n())
daily <- daily %>% 
  mutate(wday = wday(date, label = TRUE)) %>%
  mutate(term = term(date)) 
model2 <- lm(n ~ wday * term, data = daily)
mod4 <- lm(n~ wday * month, data = daily)

daily %>%
  gather_residuals(mod4,mod2)%>%
  arrange(date)%>%
  ggplot(aes(date,resid,color = model))+
  geom_line(alpha = 0.75)

There are more outliers in this model. We have more variables but do not have enough observations, there could be more errors.

Question #6

What would you expect the model n ~ wday + ns(date,5) to look like? Knowing what you know about the data, why would you expect it not to be particularly effective?

mod5 <- lm(n ~ wday + splines::ns(date, 5), data = daily)
daily %>%
  gather_residuals(mod2,mod5)%>%
  arrange(date)%>%
  ggplot(aes(date,resid,color = model))+
  geom_line(alpha = 0.75)

This model is only predictive for this year but cannot generalize other years patterns.

Question #7

We hypothesized that people leaving on Sundays are more likely to be business travelers who need to be somewhere on Monday. Explore the hypothesis by seeing how if breaks down based on distance and time: if it’s true, you’d expect to see more Sunday evening flights to places that are far away.

flights %>% 
  mutate(date = make_date(year, month, day),
         wday = wday(date, label = TRUE)) %>%
  group_by(wday) %>%
  summarise(dist_mean =  mean(distance),
            dist_median = median(distance)) %>%
  ggplot(aes(y = dist_mean, x = wday)) +
  geom_point()

Saturday has the longest distance and Sunday is the second long. It might because people usually travel on Saturday for vacation and Sunday for business.