Summarize Data

daily <- flights %>%
  mutate(date = make_date(year, month, day)) %>%
  group_by(date) %>%
  summarize(n = n())

ggplot(daily, aes(date, n)) +
  geom_line()

Investigate Daily-Weekly Pattern

daily <- daily %>%
  mutate(wday = wday(date, label = TRUE))
ggplot(daily, aes(wday,n)) +
  geom_boxplot()

mod = lm(n ~ wday, data = daily)

grid <- daily %>%
  data_grid(wday) %>%
  add_predictions(mod, "n")

ggplot(daily, aes(wday, n)) +
  geom_boxplot() +
  geom_point(data = grid, color = "orange", size = 4)

Investigate residuals

daily <- daily %>%
  add_residuals(mod)

daily %>%
  ggplot(aes(date, resid)) +
  geom_ref_line(h = 0) +
  geom_line()

ggplot(daily, aes(date, resid, color = wday)) +
  geom_ref_line(h = 0, colour = "red") +
  geom_line()

daily %>%
  filter(resid < -100)
## # A tibble: 11 x 4
##    date           n wday  resid
##    <date>     <int> <ord> <dbl>
##  1 2013-01-01   842 Tue   -109.
##  2 2013-01-20   786 Sun   -105.
##  3 2013-05-26   729 Sun   -162.
##  4 2013-07-04   737 Thu   -229.
##  5 2013-07-05   822 Fri   -145.
##  6 2013-09-01   718 Sun   -173.
##  7 2013-11-28   634 Thu   -332.
##  8 2013-11-29   661 Fri   -306.
##  9 2013-12-24   761 Tue   -190.
## 10 2013-12-25   719 Wed   -244.
## 11 2013-12-31   776 Tue   -175.
daily %>%
  ggplot(aes(date, resid)) +
  geom_ref_line(h = 0, colour = "red", size = 1) +
  geom_line(color = "grey50") +
  geom_smooth(se = FALSE, span = 0.20)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Seasonal Saturday effect

daily %>%
  filter(wday == "Sat") %>%
  ggplot(aes(date, n)) +
  geom_point()+
  geom_line() +
  scale_x_date(
    NULL,
    date_breaks = "1 month",
    date_labels = "%b"
  )

Add Seasonal Variable

term <- function(date) {
  cut(date,
      breaks = ymd(20130101, 20130605, 20130825, 20140101),
      labels = c("spring", "summer", "fall")
      )
}

daily <- daily %>%
  mutate(term = term(date))

daily %>%
  filter(wday == "Sat") %>%
  ggplot(aes(date, n, color = term)) +
  geom_point(alpha = 1/3)+
  geom_line() +
  scale_x_date(
    NULL,
    date_breaks = "1 month",
    date_labels = "%b"
  )

daily %>%
  ggplot(aes(wday, n, color = term)) +
  geom_boxplot()

mod1 <- lm(n ~ wday, data = daily)
mod2 <- lm(n ~ wday * term, data = daily)

daily %>%
  gather_residuals(without_term = mod1, with_term = mod2) %>%
  ggplot(aes(date, resid, color = model)) +
  geom_line(alpha = 0.75)

grid <- daily %>%
  data_grid(wday, term) %>%
  add_predictions(mod2, "n")

ggplot(daily, aes(wday, n)) +
  geom_boxplot() +
  geom_point(data = grid, color = "red") +
  facet_wrap(~ term)

Better model for outliers (Robust regression)

mod3 <- MASS::rlm(n ~ wday * term, data = daily)

daily %>%
  add_residuals(mod3, "resid") %>%
  ggplot(aes(date, resid)) +
  geom_hline(yintercept = 0, size = 2, color = "red") +
  geom_line()

Computed Variables

# If you are creating variables it might be a good idea to bundle the creation of the variables up into a function
compute_vars <- function(data) {
  data %>%
    mutate(term = term(date),
           wday = wday(date, label = TRUE)
           )
}

# Another option would be to put the transformations directly in the model formula:

wday2 <- function(x) wday(x, label = TRUE)
mod3 <- lm(n ~ wday2(date) * term(date), data = daily)

Time of Year: An Alternative Approach

# We could use a more flexible model to capture the pattern of school term in the data
library(splines)
mod <- MASS::rlm(n ~ wday * ns(date, 5), data = daily)

daily %>% 
  data_grid(wday, date = seq_range(date, n = 13)) %>% 
  add_predictions(mod) %>% 
  ggplot(aes(date, pred, color = wday)) +
  geom_line() +
  geom_point()

# We see a strong pattern in the numbers of Sat flights.  This is reassuring, because we also saw that pattern in the raw data.  It's a good sign when you get the same signal from different approaches.

Question #1

Why are there fewer than expected flights on January 20, May 26 and September 1? (Hint: they all have the same explanation.) How would these days generalize into another year?

These are long weekends due holidays due observances of holidays, 01-21 Martin Luther King, 05-27 Memorial Day, and Labor day 09-02. These holidays are celebrate in similar but not exact dates (for example Labor day is the first Monday of September) that why the days wouldn’t match from year to year, but we could analyze the picks for those weeks. Or find the way to use the holiday functions to analyze comparable information and not doing it just by date.

The fact that this is not replicate it exactly the same date in the different years, make sense to have less flights on Sunday because people would tent to flight on Monday to enjoy the long weekend.

daysHolidays <- c("0121", "0527", "0902")
years <- 2013:2020
map(years, ~ wday(ymd(paste0(.x, daysHolidays, sep = "")), label = TRUE))
## [[1]]
## [1] Mon Mon Mon
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
## 
## [[2]]
## [1] Tue Tue Tue
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
## 
## [[3]]
## [1] Wed Wed Wed
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
## 
## [[4]]
## [1] Thu Fri Fri
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
## 
## [[5]]
## [1] Sat Sat Sat
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
## 
## [[6]]
## [1] Sun Sun Sun
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
## 
## [[7]]
## [1] Mon Mon Mon
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
## 
## [[8]]
## [1] Tue Wed Wed
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
library(splusTimeDate)
## 
## Attaching package: 'splusTimeDate'
## The following objects are masked from 'package:lubridate':
## 
##     days, hms, hours, mdy, minutes, seconds, years
## The following objects are masked from 'package:base':
## 
##     months, quarters, sort.list, weekdays
holiday.MLK(2013:2019)
## [1] 01/21/2013 01/20/2014 01/19/2015 01/18/2016 01/16/2017 01/15/2018
## [7] 01/21/2019
holiday.Labor(2013:2019)
## [1] 09/02/2013 09/01/2014 09/07/2015 09/05/2016 09/04/2017 09/03/2018
## [7] 09/02/2019
holiday.Memorial(2013:2019)
## [1] 05/27/2013 05/26/2014 05/25/2015 05/30/2016 05/29/2017 05/28/2018
## [7] 05/27/2019

Question #2

What do the three days with high positive residuals represent? How would these days generalize to another year?

These days follow Thanksgiving and Christmas holidays, like in the questions above, they change year by year. We could have high residuals due bad weather (typical of the season) plus the high season that could affect the model. If you think about every year about that time we see in the news multiple cancelled or delayed flights.

daily %>%
  top_n(3, resid)
## # A tibble: 3 x 5
##   date           n wday  resid term 
##   <date>     <int> <ord> <dbl> <fct>
## 1 2013-11-30   857 Sat   112.  fall 
## 2 2013-12-01   987 Sun    95.5 fall 
## 3 2013-12-28   814 Sat    69.4 fall

Question #3

Create a new variable that splits the “wday” variable into terms, but only for Saturdays, i.e., it should have Thurs, Fri, but Sat-summer, Sat-spring, Sat-fall. How does this model compare with the model with every combination of “wday” and “term”?

They behave really similar, but the model with the splits for Saturday has a lower AIC, by the graphic we can see the same model has significant variation for the period between October 2013 and January 2014, this could be related with the multiple holidays for that period, like Chrismas, New Year, and Thanksgiving.

daily_Q3 <- daily %>%
  mutate(wday2 = 
         case_when(.$wday == "Sat" & .$term == "summer" ~ "Sat-summer",
         .$wday == "Sat" & .$term == "fall" ~ "Sat-fall",
         .$wday == "Sat" & .$term == "spring" ~ "Sat-spring",
         TRUE ~ as.character(.$wday)))

mod3 <- lm(n ~ wday * term, data = daily_Q3)

mod_Q3 <- lm(n ~ wday2, data = daily_Q3)

daily_Q3 %>% 
  gather_residuals(Sat = mod_Q3, All_combine = mod3) %>% 
  ggplot(aes(date, resid, colour = model)) +
    geom_line(alpha = 0.50)

AIC(mod_Q3)
## [1] 3862.885
AIC(mod3)
## [1] 3855.73

Question #4

Create a new “wday” variable that combines the day of week, term(for Saturdays), and public holidays. What do the residuals of the model look like?

daily_q4 <- daily %>%
  mutate(wday = 
         case_when(
           .$date %in% lubridate::ymd(c(20130101, 20130121, 20130218, 20130527,20130704,
                                        20130902, 20131028, 20131111, 20131128, 20131225)) ~ "holiday",
           .$wday == "Sat" & .$term == "summer" ~ "Sat-summer",
           .$wday == "Sat" & .$ term == "fall" ~ "Sat-fall",
           .$wday == "Sat" & .$term == "spring" ~ "Sat-spring",
           TRUE ~ as.character(.$wday)))

mod2 <- lm(n ~ wday * term, data = daily_q4)

mod_q4 <- lm(n ~ wday, data = daily_q4)

daily_q4 %>% 
  gather_residuals(Q4 = mod_q4, All_combine = mod2) %>% 
  ggplot(aes(date, resid, colour = model)) +
    geom_line(alpha = 0.75)
## Warning in predict.lm(model, data): prediction from a rank-deficient fit
## may be misleading

Holidays might have some impact on the residual, but not significant. Make sense because not all public Holidays mean long weekends, others are regular week days. the variances could be related with Memorial in May and Labor day in September; but again days like Independence day and veterans day are calendar days, so do not represent long weekends.

Question #5

What happens if you fit a day-of-week effect that varies by month (i.e.m n ~ wday*month)? Why is this not very helpful?

mod2 <- lm(n ~ wday * term, data = daily)

mod_q5 <- lm(n ~ wday * month(date), data = daily)
daily %>% 
  gather_residuals(Q5 = mod_q5, All_combine = mod2) %>% 
  ggplot(aes(date, resid, colour = model)) +
    geom_line(alpha = 0.75)

As we can see in the graphic, variability increases; this is because with a smaller set of data points, outliers become more important.

Question #6

What would you expect the model n ~ wday + ns(date,5) to look like? Knowing what you know about the data, why would you expect it not to be particularly effective?

mod6 <- lm(n ~ wday + splines::ns(date, 5), data = daily)
daily %>%
  gather_residuals(mod2,mod6)%>%
  arrange(date)%>%
  ggplot(aes(date,resid,color = model))+
  geom_line(alpha = 0.75)

We can see that residuals’ behavior change during the years. The model could work for a particular year, but not for all, because variables do not remain constant, we observe the holidays in different dates, weather condition changes, etc.

Question #7

We hypothesized that people leaving on Sundays are more likely to be business travelers who need to be somewhere on Monday. Explore the hypothesis by seeing how if breaks down based on distance and time: if it’s true, you’d expect to see more Sunday evening flights to places that are far away.

It looks like, except for some outliers, business travelers prefer to flight on weekdays, this makes sense because companies would need to pay the extra night in the hotel when workers could flight in the early flights on Monday, for example.

week_relevel <- function(x) {
  fct_relevel(x, "Sun", after = 1800)
}

daily %>%
  mutate(wday = week_relevel(wday)) %>%
  ggplot(aes(wday, n)) + ggtitle("Nights Flights Comparisson") +
  geom_boxplot()

flights %>%
  mutate(
    date = make_date(year, month, day),
    wday = wday(date, label = TRUE)
  ) %>%
  ggplot(aes(y = distance, x = wday)) +
  geom_boxplot(outlier.shape = NA) +
  labs(x = "Day of Week", y = "Average Distance")