NYCFlightsDataModel

Summarize Data

daily <- flights %>%
  mutate(date = make_date(year, month, day)) %>%
  group_by(date) %>%
  summarize(n = n())

ggplot(daily, aes(date, n)) +
  geom_line()

Investigate Daily-Weekly Pattern

daily <- daily %>%
  mutate(wday = wday(date, label = TRUE))
ggplot(daily, aes(wday,n)) +
  geom_boxplot()

mod = lm(n ~ wday, data = daily)

grid <- daily %>%
  data_grid(wday) %>%
  add_predictions(mod, "n")

ggplot(daily, aes(wday, n)) +
  geom_boxplot() +
  geom_point(data = grid, color = "orange", size = 4)

Investigate residuals

daily <- daily %>%
  add_residuals(mod)

daily %>%
  ggplot(aes(date, resid)) +
  geom_ref_line(h = 0) +
  geom_line()

ggplot(daily, aes(date, resid, color = wday)) +
  geom_ref_line(h = 0, colour = "red") +
  geom_line()

daily %>%
  filter(resid < -100)

## # A tibble: 11 x 4
##    date           n wday  resid
##    <date>     <int> <ord> <dbl>
##  1 2013-01-01   842 Tue   -109.
##  2 2013-01-20   786 Sun   -105.
##  3 2013-05-26   729 Sun   -162.
##  4 2013-07-04   737 Thu   -229.
##  5 2013-07-05   822 Fri   -145.
##  6 2013-09-01   718 Sun   -173.
##  7 2013-11-28   634 Thu   -332.
##  8 2013-11-29   661 Fri   -306.
##  9 2013-12-24   761 Tue   -190.
## 10 2013-12-25   719 Wed   -244.
## 11 2013-12-31   776 Tue   -175.

daily %>%
  ggplot(aes(date, resid)) +
  geom_ref_line(h = 0, colour = "red", size = 1) +
  geom_line(color = "grey50") +
  geom_smooth(se = FALSE, span = 0.20)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Seasonal Saturday effect

daily %>%
  filter(wday == "Sat") %>%
  ggplot(aes(date, n)) +
  geom_point()+
  geom_line() +
  scale_x_date(
    NULL,
    date_breaks = "1 month",
    date_labels = "%b"
  )

Add Seasonal Variable

term <- function(date) {
  cut(date,
      breaks = ymd(20130101, 20130605, 20130825, 20140101),
      labels = c("spring", "summer", "fall")
      )
}

daily <- daily %>%
  mutate(term = term(date))

daily %>%
  filter(wday == "Sat") %>%
  ggplot(aes(date, n, color = term)) +
  geom_point(alpha = 1/3)+
  geom_line() +
  scale_x_date(
    NULL,
    date_breaks = "1 month",
    date_labels = "%b"
  )

daily %>%
  ggplot(aes(wday, n, color = term)) +
  geom_boxplot()

mod1 <- lm(n ~ wday, data = daily)
mod2 <- lm(n ~ wday * term, data = daily)

daily %>%
  gather_residuals(without_term = mod1, with_term = mod2) %>%
  ggplot(aes(date, resid, color = model)) +
  geom_line(alpha = 0.75)

grid <- daily %>%
  data_grid(wday, term) %>%
  add_predictions(mod2, "n")

ggplot(daily, aes(wday, n)) +
  geom_boxplot() +
  geom_point(data = grid, color = "red") +
  facet_wrap(~ term)

Better model for outliers (Robust regression)

mod3 <- MASS::rlm(n ~ wday * term, data = daily)

daily %>%
  add_residuals(mod3, "resid") %>%
  ggplot(aes(date, resid)) +
  geom_hline(yintercept = 0, size = 2, color = "red") +
  geom_line()

Computed Variables

# If you are creating variables it might be a good idea to bundle the creation of the variables up into a function
compute_vars <- function(data) {
  data %>%
    mutate(term = term(date),
           wday = wday(date, label = TRUE)
           )
}

# Another option would be to put the transformations directly in the model formula:

wday2 <- function(x) wday(x, label = TRUE)
mod3 <- lm(n ~ wday2(date) * term(date), data = daily)

Time of Year: An Alternative Approach

# We could use a more flexible model to capture the pattern of school term in the data
library(splines)
mod <- MASS::rlm(n ~ wday * ns(date, 5), data = daily)

daily %>% 
  data_grid(wday, date = seq_range(date, n = 13)) %>% 
  add_predictions(mod) %>% 
  ggplot(aes(date, pred, color = wday)) +
  geom_line() +
  geom_point()

# We see a strong pattern in the numbers of Sat flights.  This is reassuring, because we also saw that pattern in the raw data.  It's a good sign when you get the same signal from different approaches.

Question #1

Why are there fewer than expected flights on January 20, May 26 and September 1? (Hint: they all have the same explanation.) How would these days generalize into another year?

Dates <- c("Jan 20", "May 26", "Sept 1")
Year <- 2013:2018
map(Year, ~wday(ymd(paste0(.x,Dates, sep = "")), label = TRUE))

## [[1]]
## [1] Sun Sun Sun
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
## 
## [[2]]
## [1] Mon Mon Mon
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
## 
## [[3]]
## [1] Tue Tue Tue
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
## 
## [[4]]
## [1] Wed Thu Thu
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
## 
## [[5]]
## [1] Fri Fri Fri
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
## 
## [[6]]
## [1] Sat Sat Sat
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat

Because those days are Sundays before the Monday holidays.

Question #2

What do the three days with high positive residuals represent? How would these days generalize to another year?

daily %>%
  top_n(3, resid)

## # A tibble: 3 x 5
##   date           n wday  resid term 
##   <date>     <int> <ord> <dbl> <fct>
## 1 2013-11-30   857 Sat   112.  fall 
## 2 2013-12-01   987 Sun    95.5 fall 
## 3 2013-12-28   814 Sat    69.4 fall

Those three days are 1) Saturday after Thanksgiving; 2) Sunday after Thanksgiving; 3)Saturday after Christamas (12/28/2013).

Question #3

Create a new variable that splits the “wday” variable into terms, but only for Saturdays, i.e., it should have Thurs, Fri, but Sat-summer, Sat-spring, Sat-fall. How does this model compare with the model with every combination of “wday” and “term”?

library(broom)

## 
## Attaching package: 'broom'

## The following object is masked from 'package:modelr':
## 
##     bootstrap

daily <- daily %>%
  mutate(
    wday2 =
      case_when(
        wday == "Sat" & term == "summer" ~ "Sat-summer",
        wday == "Sat" & term == "fall" ~ "Sat-fall",
        wday == "Sat" & term == "spring" ~ "Sat-spring",
        TRUE ~ as.character(wday)
      )
  )

mod4 <- lm(n ~ wday2, data = daily)

daily %>%
  gather_residuals(sat_term = mod4, all_interact = mod2) %>%
  ggplot(aes(date, resid, colour = model)) +
  geom_line(alpha = 0.75)

daily %>% 
  spread_residuals(sat_term = mod4, all_interact = mod3) %>%
  mutate(resid_diff = sat_term - all_interact) %>%
  ggplot(aes(date, resid_diff)) +
    geom_line(alpha = 0.75)

glance(mod4) %>% select(r.squared, sigma, AIC, df)

## # A tibble: 1 x 4
##   r.squared sigma   AIC    df
##       <dbl> <dbl> <dbl> <int>
## 1     0.736  47.4 3863.     9

glance(mod2) %>% select(r.squared, sigma, AIC, df)

## # A tibble: 1 x 4
##   r.squared sigma   AIC    df
##       <dbl> <dbl> <dbl> <int>
## 1     0.757  46.2 3856.    21

we can tell that the model with Saturday and Term performs better than other model.

Question #4

Create a new “wday” variable that combines the day of week, term(for Saturdays), and public holidays. What do the residuals of the model look like?

daily <- daily %>%
  mutate(wday3 = 
         case_when(
           .$date %in% lubridate::ymd(c(20130101, 
                                        20130121, 
                                        20130218, 
                                        20130527, 
                                        20130704, 
                                        20130902, 
                                        20131028, 
                                        20131111, 
                                        20131128, 
                                        20131225)) ~
             "holiday",
           .$wday == "Sat" & .$term == "summer" ~ "Sat-summer",
           .$wday == "Sat" & .$ term == "fall" ~ "Sat-fall",
           .$wday == "Sat" & .$term == "spring" ~ "Sat-spring",
           TRUE ~ as.character(.$wday)))
mod5 <- lm(n ~ wday3, data = daily)
daily %>% 
  spread_residuals(mod5) %>%
  arrange(desc(abs(resid))) %>%
  slice(1:20) %>% select(date, wday, resid)

## # A tibble: 20 x 3
##    date       wday   resid
##    <date>     <ord>  <dbl>
##  1 2013-11-28 Thu   -332. 
##  2 2013-11-29 Fri   -306. 
##  3 2013-12-25 Wed   -244. 
##  4 2013-07-04 Thu   -229. 
##  5 2013-12-24 Tue   -190. 
##  6 2013-12-31 Tue   -175. 
##  7 2013-09-01 Sun   -173. 
##  8 2013-05-26 Sun   -162. 
##  9 2013-07-05 Fri   -145. 
## 10 2013-11-30 Sat    112. 
## 11 2013-01-01 Tue   -109. 
## 12 2013-01-20 Sun   -105. 
## 13 2013-12-01 Sun     95.5
## 14 2013-02-03 Sun    -77.5
## 15 2013-01-19 Sat    -70.6
## 16 2013-12-28 Sat     69.4
## 17 2013-10-12 Sat    -68.6
## 18 2013-01-27 Sun    -68.5
## 19 2013-06-22 Sat     67.4
## 20 2013-06-29 Sat     67.4

Question #5

What happens if you fit a day-of-week effect that varies by month (i.e.m n ~ wday*month)? Why is this not very helpful?

daily <- mutate(daily, month = factor(lubridate::month(date)))
mod6 <- lm(n ~ wday * month, data = daily)
summary(mod6)

## 
## Call:
## lm(formula = n ~ wday * month, data = daily)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -269.2   -5.0    1.5    8.8  113.2 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     867.4000     7.5983 114.157  < 2e-16 ***
## wday.L          -64.0744    20.8737  -3.070 0.002353 ** 
## wday.Q         -165.6001    20.1555  -8.216 7.77e-15 ***
## wday.C          -68.2591    20.3115  -3.361 0.000885 ***
## wday^4          -92.0814    20.4991  -4.492 1.03e-05 ***
## wday^5            9.7925    19.7334   0.496 0.620111    
## wday^6          -20.4376    18.9922  -1.076 0.282802    
## month2           23.7071    10.9946   2.156 0.031912 *  
## month3           67.8857    10.7456   6.318 1.04e-09 ***
## month4           74.5929    10.8292   6.888 3.70e-11 ***
## month5           56.2786    10.7456   5.237 3.20e-07 ***
## month6           80.3071    10.8292   7.416 1.43e-12 ***
## month7           77.1143    10.7456   7.176 6.39e-12 ***
## month8           81.6357    10.7456   7.597 4.52e-13 ***
## month9           51.3714    10.8292   4.744 3.34e-06 ***
## month10          60.1357    10.7456   5.596 5.20e-08 ***
## month11          46.9143    10.8292   4.332 2.06e-05 ***
## month12          38.7786    10.7456   3.609 0.000364 ***
## wday.L:month2    -3.7230    29.6267  -0.126 0.900089    
## wday.Q:month2    -3.8188    29.1251  -0.131 0.895776    
## wday.C:month2     0.4899    29.2333   0.017 0.986641    
## wday^4:month2     4.5690    29.3639   0.156 0.876460    
## wday^5:month2    -4.2552    28.8346  -0.148 0.882784    
## wday^6:month2    12.0570    28.3325   0.426 0.670760    
## wday.L:month3   -14.5705    28.4302  -0.513 0.608703    
## wday.Q:month3    15.4389    28.2073   0.547 0.584581    
## wday.C:month3     8.2262    28.4672   0.289 0.772817    
## wday^4:month3    22.7202    28.7015   0.792 0.429261    
## wday^5:month3   -15.3298    28.5042  -0.538 0.591135    
## wday^6:month3    11.3727    28.2682   0.402 0.687759    
## wday.L:month4   -16.6682    29.3590  -0.568 0.570667    
## wday.Q:month4    10.7254    28.9620   0.370 0.711418    
## wday.C:month4    -0.2449    28.7249  -0.009 0.993202    
## wday^4:month4    23.2883    28.8711   0.807 0.420561    
## wday^5:month4   -17.8720    28.0764  -0.637 0.524935    
## wday^6:month4     5.3524    27.8883   0.192 0.847940    
## wday.L:month5     3.6663    29.3590   0.125 0.900711    
## wday.Q:month5   -20.6652    28.6699  -0.721 0.471632    
## wday.C:month5     4.6336    28.7249   0.161 0.871965    
## wday^4:month5     5.9994    28.5109   0.210 0.833490    
## wday^5:month5   -16.9119    28.0764  -0.602 0.547425    
## wday^6:month5    12.7643    27.1935   0.469 0.639158    
## wday.L:month6    -4.5261    28.6515  -0.158 0.874593    
## wday.Q:month6    23.8130    28.2073   0.844 0.399267    
## wday.C:month6    13.7580    28.7249   0.479 0.632342    
## wday^4:month6    24.1183    29.1875   0.826 0.409322    
## wday^5:month6   -17.6484    28.7981  -0.613 0.540483    
## wday^6:month6    10.5256    28.3291   0.372 0.710510    
## wday.L:month7   -28.7914    29.3590  -0.981 0.327601    
## wday.Q:month7    49.5846    28.6699   1.730 0.084818 .  
## wday.C:month7    54.5011    28.7249   1.897 0.058807 .  
## wday^4:month7    50.8474    28.5109   1.783 0.075594 .  
## wday^5:month7   -33.6983    28.0764  -1.200 0.231058    
## wday^6:month7   -13.8943    27.1935  -0.511 0.609793    
## wday.L:month8   -20.4479    28.8711  -0.708 0.479378    
## wday.Q:month8     6.7648    28.5042   0.237 0.812578    
## wday.C:month8     6.0012    28.4672   0.211 0.833186    
## wday^4:month8    19.0738    28.7814   0.663 0.508058    
## wday^5:month8   -19.3123    28.0576  -0.688 0.491827    
## wday^6:month8     9.5074    27.8866   0.341 0.733410    
## wday.L:month9   -30.3411    28.9257  -1.049 0.295110    
## wday.Q:month9   -42.0342    28.6699  -1.466 0.143726    
## wday.C:month9   -20.7186    28.7249  -0.721 0.471338    
## wday^4:month9   -20.3752    28.7914  -0.708 0.479728    
## wday^5:month9   -18.2376    28.5226  -0.639 0.523079    
## wday^6:month9    11.7263    28.2699   0.415 0.678606    
## wday.L:month10  -61.0507    29.5199  -2.068 0.039544 *  
## wday.Q:month10  -26.2352    28.5042  -0.920 0.358153    
## wday.C:month10  -32.4353    28.7249  -1.129 0.259788    
## wday^4:month10  -12.2122    28.9901  -0.421 0.673890    
## wday^5:month10  -27.6864    27.9072  -0.992 0.322008    
## wday^6:month10    0.1234    26.8590   0.005 0.996339    
## wday.L:month11  -54.9466    28.9257  -1.900 0.058512 .  
## wday.Q:month11   16.0117    28.6699   0.558 0.576957    
## wday.C:month11   54.9502    28.7249   1.913 0.056766 .  
## wday^4:month11   47.2857    28.7914   1.642 0.101635    
## wday^5:month11  -44.7401    28.5226  -1.569 0.117871    
## wday^6:month11  -20.6876    28.2699  -0.732 0.464907    
## wday.L:month12   -9.5058    28.8711  -0.329 0.742212    
## wday.Q:month12   75.2088    28.5042   2.639 0.008791 ** 
## wday.C:month12  -25.0256    28.4672  -0.879 0.380097    
## wday^4:month12  -23.7798    28.7814  -0.826 0.409380    
## wday^5:month12   20.4470    28.0576   0.729 0.466761    
## wday^6:month12    9.5864    27.8866   0.344 0.731282    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 42.05 on 281 degrees of freedom
## Multiple R-squared:  0.8355, Adjusted R-squared:  0.7869 
## F-statistic:  17.2 on 83 and 281 DF,  p-value: < 2.2e-16

Because it is really hard to generate a pattern.

Question #6

What would you expect the model n ~ wday + ns(date,5) to look like? Knowing what you know about the data, why would you expect it not to be particularly effective?

mod7 <- lm(n ~ wday + ns(date, 5), data = daily)
mod8 <- lm(n ~ wday * ns(date, 5), data = daily)

daily %>%
  gather_residuals(mod7, mod8) %>%
  ggplot(aes(x = date, y = resid, color = model)) +
  geom_line(alpha = 0.75)

The residuals of mod7 are larger than mod8. mod7 underestimates weekends during the summer and overestimates weekends during the autumn.

Question #7

We hypothesized that people leaving on Sundays are more likely to be business travelers who need to be somewhere on Monday. Explore the hypothesis by seeing how if breaks down based on distance and time: if it’s true, you’d expect to see more Sunday evening flights to places that are far away.

flights %>% 
  mutate(date = make_date(year, month, day),
         wday = wday(date, label = TRUE)) %>%
  group_by(wday) %>%
  summarise(dist_mean =  mean(distance),
            dist_median = median(distance)) %>%
  ggplot(aes(y = dist_mean, x = wday)) +
  geom_point()