Summarize Data

daily <- flights %>%
  mutate(date = make_date(year, month, day)) %>%
  group_by(date) %>%
  summarize(n = n())

ggplot(daily, aes(date, n)) +
  geom_line()

Investigate Daily-Weekly Pattern

daily <- daily %>%
  mutate(wday = wday(date, label = TRUE))
ggplot(daily, aes(wday,n)) +
  geom_boxplot()

mod = lm(n ~ wday, data = daily)

grid <- daily %>%
  data_grid(wday) %>%
  add_predictions(mod, "n")

ggplot(daily, aes(wday, n)) +
  geom_boxplot() +
  geom_point(data = grid, color = "orange", size = 4)

Investigate residuals

daily <- daily %>%
  add_residuals(mod)

daily %>%
  ggplot(aes(date, resid)) +
  geom_ref_line(h = 0) +
  geom_line()

ggplot(daily, aes(date, resid, color = wday)) +
  geom_ref_line(h = 0, colour = "red") +
  geom_line()

daily %>%
  filter(resid < -100)
## # A tibble: 11 x 4
##    date           n wday  resid
##    <date>     <int> <ord> <dbl>
##  1 2013-01-01   842 Tue   -109.
##  2 2013-01-20   786 Sun   -105.
##  3 2013-05-26   729 Sun   -162.
##  4 2013-07-04   737 Thu   -229.
##  5 2013-07-05   822 Fri   -145.
##  6 2013-09-01   718 Sun   -173.
##  7 2013-11-28   634 Thu   -332.
##  8 2013-11-29   661 Fri   -306.
##  9 2013-12-24   761 Tue   -190.
## 10 2013-12-25   719 Wed   -244.
## 11 2013-12-31   776 Tue   -175.
daily %>%
  ggplot(aes(date, resid)) +
  geom_ref_line(h = 0, colour = "red", size = 1) +
  geom_line(color = "grey50") +
  geom_smooth(se = FALSE, span = 0.20)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Seasonal Saturday effect

daily %>%
  filter(wday == "Sat") %>%
  ggplot(aes(date, n)) +
  geom_point()+
  geom_line() +
  scale_x_date(
    NULL,
    date_breaks = "1 month",
    date_labels = "%b"
  )

Add Seasonal Variable

term <- function(date) {
  cut(date,
      breaks = ymd(20130101, 20130605, 20130825, 20140101),
      labels = c("spring", "summer", "fall")
      )
}

daily <- daily %>%
  mutate(term = term(date))

daily %>%
  filter(wday == "Sat") %>%
  ggplot(aes(date, n, color = term)) +
  geom_point(alpha = 1/3)+
  geom_line() +
  scale_x_date(
    NULL,
    date_breaks = "1 month",
    date_labels = "%b"
  )

daily %>%
  ggplot(aes(wday, n, color = term)) +
  geom_boxplot()

mod1 <- lm(n ~ wday, data = daily)
mod2 <- lm(n ~ wday * term, data = daily)

daily %>%
  gather_residuals(without_term = mod1, with_term = mod2) %>%
  ggplot(aes(date, resid, color = model)) +
  geom_line(alpha = 0.75)

grid <- daily %>%
  data_grid(wday, term) %>%
  add_predictions(mod2, "n")

ggplot(daily, aes(wday, n)) +
  geom_boxplot() +
  geom_point(data = grid, color = "red") +
  facet_wrap(~ term)

Better model for outliers (Robust regression)

mod3 <- MASS::rlm(n ~ wday * term, data = daily)

daily %>%
  add_residuals(mod3, "resid") %>%
  ggplot(aes(date, resid)) +
  geom_hline(yintercept = 0, size = 2, color = "red") +
  geom_line()

Computed Variables

# If you are creating variables it might be a good idea to bundle the creation of the variables up into a function
compute_vars <- function(data) {
  data %>%
    mutate(term = term(date),
           wday = wday(date, label = TRUE)
           )
}

# Another option would be to put the transformations directly in the model formula:

wday2 <- function(x) wday(x, label = TRUE)
mod3 <- lm(n ~ wday2(date) * term(date), data = daily)

Time of Year: An Alternative Approach

# We could use a more flexible model to capture the pattern of school term in the data
library(splines)
mod <- MASS::rlm(n ~ wday * ns(date, 5), data = daily)

daily %>% 
  data_grid(wday, date = seq_range(date, n = 13)) %>% 
  add_predictions(mod) %>% 
  ggplot(aes(date, pred, color = wday)) +
  geom_line() +
  geom_point()

# We see a strong pattern in the numbers of Sat flights.  This is reassuring, because we also saw that pattern in the raw data.  It's a good sign when you get the same signal from different approaches.

Question #1

Why are there fewer than expected flights on January 20, May 26 and September 1? (Hint: they all have the same explanation.) How would these days generalize into another year?

We can see that Jan 21 is MLK Jr. Day, May 26 is Trinity Sunday, and Sep 2 is Labor day - all the days mentioned in the question are the days which are a day away from these holidays.FOr the next year, we potentially could see the same paterns, but the days will vary because, in this case, the days were the weekend days before the Holidays.

holiday <- c("0121", "0526", "0902")
years <- 2013:2019
map(years, ~ wday(ymd(paste0(.x, holiday, sep = "")), label = TRUE))
## [[1]]
## [1] Mon Sun Mon
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
## 
## [[2]]
## [1] Tue Mon Tue
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
## 
## [[3]]
## [1] Wed Tue Wed
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
## 
## [[4]]
## [1] Thu Thu Fri
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
## 
## [[5]]
## [1] Sat Fri Sat
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
## 
## [[6]]
## [1] Sun Sat Sun
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
## 
## [[7]]
## [1] Mon Sun Mon
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat

Question #2

What do the three days with high positive residuals represent? How would these days generalize to another year?

The 3 days are Saturday after Thanksgiving, Sunday after Thanksgiving, and Sat after Christmas. Since Thanksgiving and Christmas are fixed year to year being calendar holidays during the weekend, we can generalize them to another year.

daily %>%
  top_n(3, resid)
## # A tibble: 3 x 5
##   date           n wday  resid term 
##   <date>     <int> <ord> <dbl> <fct>
## 1 2013-11-30   857 Sat   112.  fall 
## 2 2013-12-01   987 Sun    95.5 fall 
## 3 2013-12-28   814 Sat    69.4 fall

Question #3

Create a new variable that splits the “wday” variable into terms, but only for Saturdays, i.e., it should have Thurs, Fri, but Sat-summer, Sat-spring, Sat-fall. How does this model compare with the model with every combination of “wday” and “term”?

Because it is a slightly difficult to tell the overlap area in the two charts, let’s try to plot the differences between the 2 models. As seen from the chart, the Saturdays and terms chart has a higher residual during summer but relatively a lower residua lfor the rest of the year. From a comparison perspective, mod 4 has a lower R square and regression standard error with fewer variables. THe model, noteably, has a higher AIC. Hence the model with Saturday and Terms outperforms the other model.

library(broom)
## 
## Attaching package: 'broom'
## The following object is masked from 'package:modelr':
## 
##     bootstrap
daily <- daily %>%
  mutate(
    wday2 =
      case_when(
        wday == "Sat" & term == "summer" ~ "Sat-summer",
        wday == "Sat" & term == "fall" ~ "Sat-fall",
        wday == "Sat" & term == "spring" ~ "Sat-spring",
        TRUE ~ as.character(wday)
      )
  )

mod4 <- lm(n ~ wday2, data = daily)

daily %>%
  gather_residuals(sat_term = mod4, all_interact = mod2) %>%
  ggplot(aes(date, resid, colour = model)) +
  geom_line(alpha = 0.75)

daily %>% 
  spread_residuals(sat_term = mod4, all_interact = mod3) %>%
  mutate(resid_diff = sat_term - all_interact) %>%
  ggplot(aes(date, resid_diff)) +
    geom_line(alpha = 0.75)

glance(mod4) %>% select(r.squared, sigma, AIC, df)
## # A tibble: 1 x 4
##   r.squared sigma   AIC    df
##       <dbl> <dbl> <dbl> <int>
## 1     0.736  47.4 3863.     9
glance(mod2) %>% select(r.squared, sigma, AIC, df)
## # A tibble: 1 x 4
##   r.squared sigma   AIC    df
##       <dbl> <dbl> <dbl> <int>
## 1     0.757  46.2 3856.    21

Question #4

Create a new “wday” variable that combines the day of week, term(for Saturdays), and public holidays. What do the residuals of the model look like?

daily <- daily %>%
  mutate(wday3 = 
         case_when(
           .$date %in% lubridate::ymd(c(20130101, # new years
                                        20130121, # mlk
                                        20130218, # presidents
                                        20130527, # memorial
                                        20130704, # independence
                                        20130902, # labor
                                        20131028, # columbus
                                        20131111, # veterans
                                        20131128, # thanksgiving
                                        20131225)) ~
             "holiday",
           .$wday == "Sat" & .$term == "summer" ~ "Sat-summer",
           .$wday == "Sat" & .$ term == "fall" ~ "Sat-fall",
           .$wday == "Sat" & .$term == "spring" ~ "Sat-spring",
           TRUE ~ as.character(.$wday)))
mod5 <- lm(n ~ wday3, data = daily)
daily %>% 
  spread_residuals(mod5) %>%
  arrange(desc(abs(resid))) %>%
  slice(1:20) %>% select(date, wday, resid)
## # A tibble: 20 x 3
##    date       wday   resid
##    <date>     <ord>  <dbl>
##  1 2013-11-28 Thu   -332. 
##  2 2013-11-29 Fri   -306. 
##  3 2013-12-25 Wed   -244. 
##  4 2013-07-04 Thu   -229. 
##  5 2013-12-24 Tue   -190. 
##  6 2013-12-31 Tue   -175. 
##  7 2013-09-01 Sun   -173. 
##  8 2013-05-26 Sun   -162. 
##  9 2013-07-05 Fri   -145. 
## 10 2013-11-30 Sat    112. 
## 11 2013-01-01 Tue   -109. 
## 12 2013-01-20 Sun   -105. 
## 13 2013-12-01 Sun     95.5
## 14 2013-02-03 Sun    -77.5
## 15 2013-01-19 Sat    -70.6
## 16 2013-12-28 Sat     69.4
## 17 2013-10-12 Sat    -68.6
## 18 2013-01-27 Sun    -68.5
## 19 2013-06-22 Sat     67.4
## 20 2013-06-29 Sat     67.4

Question #5

What happens if you fit a day-of-week effect that varies by month (i.e.m n ~ wday*month)? Why is this not very helpful?

We see that there are a few outliers and the interaction tem with the minor observation in each cell makes the predictions more uncertain. Moreover, it is difficult to generate a pattern because if we spli tthe data by wday times month, we get only 4-5 variables which leave large room for errors.

daily <- mutate(daily, month = factor(lubridate::month(date)))
mod6 <- lm(n ~ wday * month, data = daily)
summary(mod6)
## 
## Call:
## lm(formula = n ~ wday * month, data = daily)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -269.2   -5.0    1.5    8.8  113.2 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     867.4000     7.5983 114.157  < 2e-16 ***
## wday.L          -64.0744    20.8737  -3.070 0.002353 ** 
## wday.Q         -165.6001    20.1555  -8.216 7.77e-15 ***
## wday.C          -68.2591    20.3115  -3.361 0.000885 ***
## wday^4          -92.0814    20.4991  -4.492 1.03e-05 ***
## wday^5            9.7925    19.7334   0.496 0.620111    
## wday^6          -20.4376    18.9922  -1.076 0.282802    
## month2           23.7071    10.9946   2.156 0.031912 *  
## month3           67.8857    10.7456   6.318 1.04e-09 ***
## month4           74.5929    10.8292   6.888 3.70e-11 ***
## month5           56.2786    10.7456   5.237 3.20e-07 ***
## month6           80.3071    10.8292   7.416 1.43e-12 ***
## month7           77.1143    10.7456   7.176 6.39e-12 ***
## month8           81.6357    10.7456   7.597 4.52e-13 ***
## month9           51.3714    10.8292   4.744 3.34e-06 ***
## month10          60.1357    10.7456   5.596 5.20e-08 ***
## month11          46.9143    10.8292   4.332 2.06e-05 ***
## month12          38.7786    10.7456   3.609 0.000364 ***
## wday.L:month2    -3.7230    29.6267  -0.126 0.900089    
## wday.Q:month2    -3.8188    29.1251  -0.131 0.895776    
## wday.C:month2     0.4899    29.2333   0.017 0.986641    
## wday^4:month2     4.5690    29.3639   0.156 0.876460    
## wday^5:month2    -4.2552    28.8346  -0.148 0.882784    
## wday^6:month2    12.0570    28.3325   0.426 0.670760    
## wday.L:month3   -14.5705    28.4302  -0.513 0.608703    
## wday.Q:month3    15.4389    28.2073   0.547 0.584581    
## wday.C:month3     8.2262    28.4672   0.289 0.772817    
## wday^4:month3    22.7202    28.7015   0.792 0.429261    
## wday^5:month3   -15.3298    28.5042  -0.538 0.591135    
## wday^6:month3    11.3727    28.2682   0.402 0.687759    
## wday.L:month4   -16.6682    29.3590  -0.568 0.570667    
## wday.Q:month4    10.7254    28.9620   0.370 0.711418    
## wday.C:month4    -0.2449    28.7249  -0.009 0.993202    
## wday^4:month4    23.2883    28.8711   0.807 0.420561    
## wday^5:month4   -17.8720    28.0764  -0.637 0.524935    
## wday^6:month4     5.3524    27.8883   0.192 0.847940    
## wday.L:month5     3.6663    29.3590   0.125 0.900711    
## wday.Q:month5   -20.6652    28.6699  -0.721 0.471632    
## wday.C:month5     4.6336    28.7249   0.161 0.871965    
## wday^4:month5     5.9994    28.5109   0.210 0.833490    
## wday^5:month5   -16.9119    28.0764  -0.602 0.547425    
## wday^6:month5    12.7643    27.1935   0.469 0.639158    
## wday.L:month6    -4.5261    28.6515  -0.158 0.874593    
## wday.Q:month6    23.8130    28.2073   0.844 0.399267    
## wday.C:month6    13.7580    28.7249   0.479 0.632342    
## wday^4:month6    24.1183    29.1875   0.826 0.409322    
## wday^5:month6   -17.6484    28.7981  -0.613 0.540483    
## wday^6:month6    10.5256    28.3291   0.372 0.710510    
## wday.L:month7   -28.7914    29.3590  -0.981 0.327601    
## wday.Q:month7    49.5846    28.6699   1.730 0.084818 .  
## wday.C:month7    54.5011    28.7249   1.897 0.058807 .  
## wday^4:month7    50.8474    28.5109   1.783 0.075594 .  
## wday^5:month7   -33.6983    28.0764  -1.200 0.231058    
## wday^6:month7   -13.8943    27.1935  -0.511 0.609793    
## wday.L:month8   -20.4479    28.8711  -0.708 0.479378    
## wday.Q:month8     6.7648    28.5042   0.237 0.812578    
## wday.C:month8     6.0012    28.4672   0.211 0.833186    
## wday^4:month8    19.0738    28.7814   0.663 0.508058    
## wday^5:month8   -19.3123    28.0576  -0.688 0.491827    
## wday^6:month8     9.5074    27.8866   0.341 0.733410    
## wday.L:month9   -30.3411    28.9257  -1.049 0.295110    
## wday.Q:month9   -42.0342    28.6699  -1.466 0.143726    
## wday.C:month9   -20.7186    28.7249  -0.721 0.471338    
## wday^4:month9   -20.3752    28.7914  -0.708 0.479728    
## wday^5:month9   -18.2376    28.5226  -0.639 0.523079    
## wday^6:month9    11.7263    28.2699   0.415 0.678606    
## wday.L:month10  -61.0507    29.5199  -2.068 0.039544 *  
## wday.Q:month10  -26.2352    28.5042  -0.920 0.358153    
## wday.C:month10  -32.4353    28.7249  -1.129 0.259788    
## wday^4:month10  -12.2122    28.9901  -0.421 0.673890    
## wday^5:month10  -27.6864    27.9072  -0.992 0.322008    
## wday^6:month10    0.1234    26.8590   0.005 0.996339    
## wday.L:month11  -54.9466    28.9257  -1.900 0.058512 .  
## wday.Q:month11   16.0117    28.6699   0.558 0.576957    
## wday.C:month11   54.9502    28.7249   1.913 0.056766 .  
## wday^4:month11   47.2857    28.7914   1.642 0.101635    
## wday^5:month11  -44.7401    28.5226  -1.569 0.117871    
## wday^6:month11  -20.6876    28.2699  -0.732 0.464907    
## wday.L:month12   -9.5058    28.8711  -0.329 0.742212    
## wday.Q:month12   75.2088    28.5042   2.639 0.008791 ** 
## wday.C:month12  -25.0256    28.4672  -0.879 0.380097    
## wday^4:month12  -23.7798    28.7814  -0.826 0.409380    
## wday^5:month12   20.4470    28.0576   0.729 0.466761    
## wday^6:month12    9.5864    27.8866   0.344 0.731282    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 42.05 on 281 degrees of freedom
## Multiple R-squared:  0.8355, Adjusted R-squared:  0.7869 
## F-statistic:  17.2 on 83 and 281 DF,  p-value: < 2.2e-16

Question #6

What would you expect the model n ~ wday + ns(date,5) to look like? Knowing what you know about the data, why would you expect it not to be particularly effective?

The models developed previously, show the effect of days of the week vary a lot across different times of the year. As we can see, the model n ~ wday + ns(date,5) does not interact wit hthe day of the week effect with the time of the year effects (ns(date, 5)).

And then, the residuals of the model that do not interact with the day of the week wit htime of the year (mod 7) are bigger than those of the model that does (which is model 8). THus the mod 7 underestimates the weekends during the summer and overestimates weekends during the autumn.

THe graph below, substantiates the same.

mod7 <- lm(n ~ wday + ns(date, 5), data = daily)
mod8 <- lm(n ~ wday * ns(date, 5), data = daily)

daily %>%
  gather_residuals(mod7, mod8) %>%
  ggplot(aes(x = date, y = resid, color = model)) +
  geom_line(alpha = 0.75)

Question #7

We hypothesized that people leaving on Sundays are more likely to be business travelers who need to be somewhere on Monday. Explore the hypothesis by seeing how if breaks down based on distance and time: if it’s true, you’d expect to see more Sunday evening flights to places that are far away.

THrough the box plot, we can see that Sunday flights have longer distance than weekdays but not as long as those on Saturdays. So while there might be a few people who are travelling on leisure, but most of the travellers aretravelling on business on Sunday Flights.

week_relevel <- function(x) {
  fct_relevel(x, "Sun", after = 7)
}
daily %>% 
  mutate(wday = week_relevel(wday)) %>% 
  ggplot(aes(wday, n)) +
  geom_boxplot()