Summarize Data

daily <- flights %>%
  mutate(date = make_date(year, month, day)) %>%
  group_by(date) %>%
  summarize(n = n())

ggplot(daily, aes(date, n)) +
  geom_line()

Investigate Daily-Weekly Pattern

daily <- daily %>%
  mutate(wday = wday(date, label = TRUE))
ggplot(daily, aes(wday,n)) +
  geom_boxplot()

mod = lm(n ~ wday, data = daily, na.action = na.warn)

grid <- daily %>%
  data_grid(wday) %>%
  add_predictions(mod, "n")

ggplot(daily, aes(wday, n)) +
  geom_boxplot() +
  geom_point(data = grid, color = "orange", size = 4)

Investigate residuals

daily <- daily %>%
  add_residuals(mod)

daily %>%
  ggplot(aes(date, resid)) +
  geom_ref_line(h = 0) +
  geom_line()

ggplot(daily, aes(date, resid, color = wday)) +
  geom_ref_line(h = 0, colour = "red") +
  geom_line()

daily %>%
  filter(resid < -100)
## # A tibble: 11 x 4
##    date           n wday  resid
##    <date>     <int> <ord> <dbl>
##  1 2013-01-01   842 Tue   -109.
##  2 2013-01-20   786 Sun   -105.
##  3 2013-05-26   729 Sun   -162.
##  4 2013-07-04   737 Thu   -229.
##  5 2013-07-05   822 Fri   -145.
##  6 2013-09-01   718 Sun   -173.
##  7 2013-11-28   634 Thu   -332.
##  8 2013-11-29   661 Fri   -306.
##  9 2013-12-24   761 Tue   -190.
## 10 2013-12-25   719 Wed   -244.
## 11 2013-12-31   776 Tue   -175.
daily %>%
  ggplot(aes(date, resid)) +
  geom_ref_line(h = 0, colour = "red", size = 1) +
  geom_line(color = "grey50") +
  geom_smooth(se = FALSE, span = 0.20)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Seasonal Saturday effect

daily %>%
  filter(wday == "Sat") %>%
  ggplot(aes(date, n)) +
  geom_point()+
  geom_line() +
  scale_x_date(
    NULL,
    date_breaks = "1 month",
    date_labels = "%b"
  )

Add Seasonal Variable

term <- function(date) {
  cut(date,
      breaks = ymd(20130101, 20130605, 20130825, 20140101),
      labels = c("spring", "summer", "fall")
      )
}

daily <- daily %>%
  mutate(term = term(date))

daily %>%
  filter(wday == "Sat") %>%
  ggplot(aes(date, n, color = term)) +
  geom_point(alpha = 1/3)+
  geom_line() +
  scale_x_date(
    NULL,
    date_breaks = "1 month",
    date_labels = "%b"
  )

daily %>%
  ggplot(aes(wday, n, color = term)) +
  geom_boxplot()

mod1 <- lm(n ~ wday, data = daily, na.action = na.warn)
mod2 <- lm(n ~ wday * term, data = daily, na.action = na.warn)

daily %>%
  gather_residuals(without_term = mod1, with_term = mod2) %>%
  ggplot(aes(date, resid, color = model)) +
  geom_line(alpha = 0.75)

grid <- daily %>%
  data_grid(wday, term) %>%
  add_predictions(mod2, "n")

ggplot(daily, aes(wday, n)) +
  geom_boxplot() +
  geom_point(data = grid, color = "red") +
  facet_wrap(~ term)

Better model for outliers (Robust regression)

mod3 <- MASS::rlm(n ~ wday * term, data = daily, na.action = na.warn)

daily %>%
  add_residuals(mod3, "resid") %>%
  ggplot(aes(date, resid)) +
  geom_hline(yintercept = 0, size = 2, color = "red") +
  geom_line()

Computed Variables

# If you are creating variables it might be a good idea to bundle the creation of the variables up into a function
compute_vars <- function(data) {
  data %>%
    mutate(term = term(date),
           wday = wday(date, label = TRUE)
           )
}

# Another option would be to put the transformations directly in the model formula:

wday2 <- function(x) wday(x, label = TRUE)
mod3 <- lm(n ~ wday2(date) * term(date), data = daily, na.action = na.warn)

Time of Year: An Alternative Approach

# We could use a more flexible model to capture the pattern of school term in the data
library(splines)
mod <- MASS::rlm(n ~ wday * ns(date, 5), data = daily, na.action = na.warn)

daily %>% 
  data_grid(wday, date = seq_range(date, n = 13)) %>% 
  add_predictions(mod) %>% 
  ggplot(aes(date, pred, color = wday)) +
  geom_line() +
  geom_point()

# We see a strong pattern in the numbers of Sat flights.  This is reassuring, because we also saw that pattern in the raw data.  It's a good sign when you get the same signal from different approaches.

Question #1

Why are there fewer than expected flights on January 20, May 26 and September 1? (Hint: they all have the same explanation.) How would these days generalize into another year?

# Use this chunk to answer question 1
# Because all these 3 days are US holidays, and people would prefer to stay with their families or friends.
holiday <- c("0121", "0526", "0902")
years <- 2013:2019
map(years, ~ wday(ymd(paste0(.x, holiday, sep = "")), label = TRUE))
## [[1]]
## [1] Mon Sun Mon
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
## 
## [[2]]
## [1] Tue Mon Tue
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
## 
## [[3]]
## [1] Wed Tue Wed
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
## 
## [[4]]
## [1] Thu Thu Fri
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
## 
## [[5]]
## [1] Sat Fri Sat
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
## 
## [[6]]
## [1] Sun Sat Sun
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
## 
## [[7]]
## [1] Mon Sun Mon
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
## From this table, we can say the week days are changing with different years.

Question #2

What do the three days with high positive residuals represent? How would these days generalize to another year?

# Use this chunk to answer question 2
daily %>%
  top_n(3, resid)
## # A tibble: 3 x 5
##   date           n wday  resid term 
##   <date>     <int> <ord> <dbl> <fct>
## 1 2013-11-30   857 Sat   112.  fall 
## 2 2013-12-01   987 Sun    95.5 fall 
## 3 2013-12-28   814 Sat    69.4 fall
## These 3 days represents Christmas holiday & Thanks giving holiday. The high positive residuals represent that the flights are under-prediction, but it could change if the date is weekday or weekends based on different years.

Question #3

Create a new variable that splits the “wday” variable into terms, but only for Saturdays, i.e., it should have Thurs, Fri, but Sat-summer, Sat-spring, Sat-fall. How does this model compare with the model with every combination of “wday” and “term”?

# Use this chunk to answer question 3

daily <- daily %>%
  mutate(term = term(date))  %>%
  mutate(term2 = ifelse(wday == 'Sat',paste0(wday,"-",term),as.character(term) ))

mod3<- lm(n~ wday * term2, data = daily)

daily %>%
  gather_residuals(mod3,mod2)%>%
  arrange(date)%>%
  ggplot(aes(date,resid,color = model))+
  geom_line(alpha = 0.75)
## Warning in predict.lm(model, data): prediction from a rank-deficient fit
## may be misleading

## The prediction are most likely stable.

Question #4

Create a new “wday” variable that combines the day of week, term(for Saturdays), and public holidays. What do the residuals of the model look like?

# Use this chunk to answer question 4
daily_holidays <- daily %>% 
  mutate(holidays = case_when(date %in% ymd(c(20130101, 
                                              20130121, 
                                              20130218, 
                                              20130527, 
                                              20130704, 
                                              20130902, 
                                              20131028, 
                                              20131111, 
                                              20131128, 
                                              20131225)) ~ "holiday",
                              TRUE ~ "None")) %>% 
  unite(term, term2, holidays)

mod4 <- lm(n ~ term, data = daily_holidays)

daily_holidays %>% 
  add_residuals(mod4) %>% 
  ggplot(aes(date, resid)) +
  geom_line()

## The residual & the model are generally following a similar trend.

Question #5

What happens if you fit a day-of-week effect that varies by month (i.e.m n ~ wday*month)? Why is this not very helpful?

# Use this chunk to answer question 5
mod5 <- MASS::rlm(n ~ wday*month(date,label = TRUE), data = daily)

daily %>%
  gather_residuals(month_mod = mod5, regular_mod = mod2) %>%
  ggplot(aes(date, resid, color = model)) +
  geom_line()

## The plots are very close & similar to each other. Because on the x axis, each represents the same day, and the difference between the weekdays are fixed value (within 7);

Question #6

What would you expect the model n ~ wday + ns(date,5) to look like? Knowing what you know about the data, why would you expect it not to be particularly effective?

# Use this chunk to answer question 6

## I expect the model would have obvious residual errors since the days in different years are not the same, and people have different plans in different years.

Question #7

We hypothesized that people leaving on Sundays are more likely to be business travelers who need to be somewhere on Monday. Explore the hypothesis by seeing how if breaks down based on distance and time: if it’s true, you’d expect to see more Sunday evening flights to places that are far away.

# Use this chunk to answer question 7
week_relevel <- function(x) {
  fct_relevel(x, "Sun", after = 8)
}
daily %>%
  mutate(wday = week_relevel(wday)) %>%
  ggplot(aes(wday, n)) + ggtitle("Sunday Night Flights") +
  geom_boxplot()

## From this plot, we cannot see the breaks of distance & time on Sundays.