Summarize Data

daily <- flights %>%
  mutate(date = make_date(year, month, day)) %>%
  group_by(date) %>%
  summarize(n = n())
ggplot(daily, aes(date, n)) +
  geom_line()

Investigate Daily-Weekly Pattern

daily <- daily %>%
  mutate(wday = wday(date, label = TRUE))
ggplot(daily, aes(wday,n)) +
  geom_boxplot()

mod = lm(n ~ wday, data = daily)

grid <- daily %>%
  data_grid(wday) %>%
  add_predictions(mod, "n")

ggplot(daily, aes(wday, n)) +
  geom_boxplot() +
  geom_point(data = grid, color = "orange", size = 4)

Investigate residuals

daily <- daily %>%
  add_residuals(mod)

daily %>%
  ggplot(aes(date, resid)) +
  geom_ref_line(h = 0) +
  geom_line()

ggplot(daily, aes(date, resid, color = wday)) +
  geom_ref_line(h = 0, colour = "red") +
  geom_line()

daily %>%
  filter(resid < -100)
## # A tibble: 11 x 4
##    date           n wday  resid
##    <date>     <int> <ord> <dbl>
##  1 2013-01-01   842 Tue   -109.
##  2 2013-01-20   786 Sun   -105.
##  3 2013-05-26   729 Sun   -162.
##  4 2013-07-04   737 Thu   -229.
##  5 2013-07-05   822 Fri   -145.
##  6 2013-09-01   718 Sun   -173.
##  7 2013-11-28   634 Thu   -332.
##  8 2013-11-29   661 Fri   -306.
##  9 2013-12-24   761 Tue   -190.
## 10 2013-12-25   719 Wed   -244.
## 11 2013-12-31   776 Tue   -175.
daily %>%
  ggplot(aes(date, resid)) +
  geom_ref_line(h = 0, colour = "red", size = 1) +
  geom_line(color = "grey50") +
  geom_smooth(se = FALSE, span = 0.20)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Seasonal Saturday effect

daily %>%
  filter(wday == "Sat") %>%
  ggplot(aes(date, n)) +
  geom_point()+
  geom_line() +
  scale_x_date(
    NULL,
    date_breaks = "1 month",
    date_labels = "%b"
  )

Add Seasonal Variable

term <- function(date) {
  cut(date,
      breaks = ymd(20130101, 20130605, 20130825, 20140101),
      labels = c("spring", "summer", "fall")
      )
}

daily <- daily %>%
  mutate(term = term(date))

daily %>%
  filter(wday == "Sat") %>%
  ggplot(aes(date, n, color = term)) +
  geom_point(alpha = 1/3)+
  geom_line() +
  scale_x_date(
    NULL,
    date_breaks = "1 month",
    date_labels = "%b"
  )

daily %>%
  ggplot(aes(wday, n, color = term)) +
  geom_boxplot()

mod1 <- lm(n ~ wday, data = daily)
mod2 <- lm(n ~ wday * term, data = daily)

daily %>%
  gather_residuals(without_term = mod1, with_term = mod2) %>%
  ggplot(aes(date, resid, color = model)) +
  geom_line(alpha = 0.75)

grid <- daily %>%
  data_grid(wday, term) %>%
  add_predictions(mod2, "n")

ggplot(daily, aes(wday, n)) +
  geom_boxplot() +
  geom_point(data = grid, color = "red") +
  facet_wrap(~ term)

Better model for outliers (Robust regression)

mod3 <- MASS::rlm(n ~ wday * term, data = daily)

daily %>%
  add_residuals(mod3, "resid") %>%
  ggplot(aes(date, resid)) +
  geom_hline(yintercept = 0, size = 2, color = "red") +
  geom_line()

Computed Variables

# If you are creating variables it might be a good idea to bundle the creation of the variables up into a function
compute_vars <- function(data) {
  data %>%
    mutate(term = term(date),
           wday = wday(date, label = TRUE)
           )
}

# Another option would be to put the transformations directly in the model formula:

wday2 <- function(x) wday(x, label = TRUE)
mod3 <- lm(n ~ wday2(date) * term(date), data = daily)

Time of Year: An Alternative Approach

# We could use a more flexible model to capture the pattern of school term in the data
library(splines)
mod <- MASS::rlm(n ~ wday * ns(date, 5), data = daily)

daily %>% 
  data_grid(wday, date = seq_range(date, n = 13)) %>% 
  add_predictions(mod) %>% 
  ggplot(aes(date, pred, color = wday)) +
  geom_line() +
  geom_point()

# We see a strong pattern in the numbers of Sat flights.  This is reassuring, because we also saw that pattern in the raw data.  It's a good sign when you get the same signal from different approaches.

Question #1

Why are there fewer than expected flights on January 20, May 26 and September 1? (Hint: they all have the same explanation.) How would these days generalize into another year?

##Answer 1: These days in 2013 were all the day before important holidays. E.g. MKT day was 01/21/2013, memorial day was 05/27/2013, and labor day was 09/02/2013. Therefore, these days would be different in other years.

Question #2

What do the three days with high positive residuals represent? How would these days generalize to another year?

##Answer 2: Positive residual indicates that the flight # is under-predicted. There are other factors that drives the flight number that are not captured by the moDEL. These three days are the days after thanksgiving and Christmas. These days will show similar results to other years.

daily %>%
  top_n(3, resid)
## # A tibble: 3 x 5
##   date           n wday  resid term 
##   <date>     <int> <ord> <dbl> <fct>
## 1 2013-11-30   857 Sat   112.  fall 
## 2 2013-12-01   987 Sun    95.5 fall 
## 3 2013-12-28   814 Sat    69.4 fall

Question #3

Create a new variable that splits the “wday” variable into terms, but only for Saturdays, i.e., it should have Thurs, Fri, but Sat-summer, Sat-spring, Sat-fall. How does this model compare with the model with every combination of “wday” and “term”?

##Answer3: Based on the summary statistic of new model, it performs better fit than the model with every combination of “wday” and “term”.

data2 <-
  daily %>% 
  mutate(wday = as.character(wday),
         term = ifelse(wday == "Sat", paste0(wday, "-", term), wday))
new_model <- MASS::rlm(n ~ term, data = data2)

data2 %>%
  add_residuals(new_model, "resid") %>%
  ggplot(aes(date, resid)) +
  geom_hline(yintercept = 0, size = 2, color = "blue") +
  geom_line()

summary(mod3)
## 
## Call:
## lm(formula = n ~ wday2(date) * term(date), data = daily)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -326.33   -8.00    9.82   20.67  141.00 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     912.8684     3.7086 246.151  < 2e-16 ***
## wday2(date).L                   -71.4674     9.8349  -7.267 2.49e-12 ***
## wday2(date).Q                  -161.0112     9.8196 -16.397  < 2e-16 ***
## wday2(date).C                   -65.6215     9.8068  -6.691 8.98e-11 ***
## wday2(date)^4                   -82.0997     9.8412  -8.342 1.78e-15 ***
## wday2(date)^5                    -1.3425     9.7787  -0.137   0.8909    
## wday2(date)^6                   -12.4501     9.7903  -1.272   0.2043    
## term(date)summer                 37.6922     6.3336   5.951 6.56e-09 ***
## term(date)fall                    3.7598     5.5033   0.683   0.4949    
## wday2(date).L:term(date)summer   -6.2215    16.8049  -0.370   0.7114    
## wday2(date).Q:term(date)summer   26.4121    16.7501   1.577   0.1158    
## wday2(date).C:term(date)summer   26.7451    16.7885   1.593   0.1121    
## wday2(date)^4:term(date)summer   23.5933    16.7523   1.408   0.1599    
## wday2(date)^5:term(date)summer  -13.1343    16.7720  -0.783   0.4341    
## wday2(date)^6:term(date)summer   -5.0026    16.6744  -0.300   0.7643    
## wday2(date).L:term(date)fall    -31.9048    14.5607  -2.191   0.0291 *  
## wday2(date).Q:term(date)fall     -0.6959    14.5707  -0.048   0.9619    
## wday2(date).C:term(date)fall     -8.8349    14.5417  -0.608   0.5439    
## wday2(date)^4:term(date)fall     -9.8570    14.5899  -0.676   0.4997    
## wday2(date)^5:term(date)fall     -3.2107    14.5228  -0.221   0.8252    
## wday2(date)^6:term(date)fall     -8.2061    14.5769  -0.563   0.5738    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 46.17 on 344 degrees of freedom
## Multiple R-squared:  0.7573, Adjusted R-squared:  0.7432 
## F-statistic: 53.67 on 20 and 344 DF,  p-value: < 2.2e-16
summary(new_model)
## 
## Call: rlm(formula = n ~ term, data = data2)
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -345.692  -14.487    2.403   13.840  158.717 
## 
## Coefficients:
##                Value     Std. Error t value  
## (Intercept)     978.4105    3.2011   305.6436
## termMon           0.1863    4.5271     0.0412
## termSat-fall   -280.1270    6.3127   -44.3748
## termSat-spring -233.2026    5.8710   -39.7213
## termSat-summer -177.4939    7.3927   -24.0092
## termSun         -75.2507    4.5271   -16.6223
## termThu           1.2817    4.5271     0.2831
## termTue         -14.9240    4.5057    -3.3123
## termWed          -7.3207    4.5271    -1.6171
## 
## Residual standard error: 21.18 on 356 degrees of freedom
anova(new_model)
## Analysis of Variance Table
## 
## Response: n
##           Df  Sum Sq Mean Sq F value Pr(>F)
## term       8 2182237  272780               
## Residuals     841150

Question #4

Create a new “wday” variable that combines the day of week, term(for Saturdays), and public holidays. What do the residuals of the model look like?

##Answer: Based on the residuals of the model, it performs worse than question#3 model.

data3 <-
  data2 %>% 
  mutate(holiday = case_when(date %in% ymd(c(20130101, 20130121, 20130218, 20130527, 20130704, 20130902, 20131028, 20131111, 20131128,20131225)) ~ "holiday",
                              TRUE ~ "None")) %>% 
unite(combination, term, holiday)
new_model2 <- lm(n ~ combination, data = data3)
data3 %>%
  add_residuals(new_model2, "resid") %>%
  ggplot(aes(date, resid)) +
  geom_hline(yintercept = 0, size = 2, color = "green") +
  geom_line()

summary(new_model2)
## 
## Call:
## lm(formula = n ~ combination, data = data3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -306.462  -10.462    8.529   20.538  141.000 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 967.46154    5.51738 175.348  < 2e-16 ***
## combinationMon_holiday      -20.29487   17.15424  -1.183  0.23758    
## combinationMon_None          10.95150    8.05318   1.360  0.17473    
## combinationSat-fall_None   -251.46154   10.88043 -23.111  < 2e-16 ***
## combinationSat-spring_None -230.14336   10.11900 -22.744  < 2e-16 ***
## combinationSat-summer_None -166.54487   12.74185 -13.071  < 2e-16 ***
## combinationSun_None         -75.98077    7.80276  -9.738  < 2e-16 ***
## combinationThu_holiday     -281.96154   28.66917  -9.835  < 2e-16 ***
## combinationThu_None           9.49846    7.88040   1.205  0.22889    
## combinationTue_holiday     -125.46154   40.16716  -3.123  0.00194 ** 
## combinationTue_None         -14.00000    7.80276  -1.794  0.07363 .  
## combinationWed_holiday     -248.46154   40.16716  -6.186 1.72e-09 ***
## combinationWed_None           0.00905    7.84091   0.001  0.99908    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 39.79 on 352 degrees of freedom
## Multiple R-squared:  0.8155, Adjusted R-squared:  0.8093 
## F-statistic: 129.7 on 12 and 352 DF,  p-value: < 2.2e-16

Question #5

What happens if you fit a day-of-week effect that varies by month (i.e.m n ~ wday*month)? Why is this not very helpful?

##Answer: The residual in this model is even worse than model in question#3. Becuase this interaction terms does not really have meanings.

model5 <- lm(n ~ wday * month(date), data = data3)
data3 %>% 
  add_residuals(model5, "resid") %>%
  ggplot(aes(date, resid)) +
  geom_hline(yintercept = 0, size = 2, color = "yellow") +
  geom_line()

summary(model5)
## 
## Call:
## lm(formula = n ~ wday * month(date), data = data3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -332.60  -10.93   11.56   25.59  111.56 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          958.7632    14.4978  66.132  < 2e-16 ***
## wdayMon               -5.2579    20.5864  -0.255    0.799    
## wdaySat             -215.3627    20.5404 -10.485  < 2e-16 ***
## wdaySun              -97.4345    20.4973  -4.754 2.92e-06 ***
## wdayThu                5.7758    20.3683   0.284    0.777    
## wdayTue              -17.0984    20.2302  -0.845    0.399    
## wdayWed               -6.7875    20.3583  -0.333    0.739    
## month(date)            1.3343     1.9690   0.678    0.498    
## wdayMon:month(date)    1.8859     2.7786   0.679    0.498    
## wdaySat:month(date)   -1.1485     2.7870  -0.412    0.681    
## wdaySun:month(date)    3.2503     2.7715   1.173    0.242    
## wdayThu:month(date)   -1.1468     2.7734  -0.414    0.679    
## wdayTue:month(date)    0.1506     2.7391   0.055    0.956    
## wdayWed:month(date)    0.3292     2.7758   0.119    0.906    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 48.59 on 351 degrees of freedom
## Multiple R-squared:  0.7256, Adjusted R-squared:  0.7155 
## F-statistic: 71.41 on 13 and 351 DF,  p-value: < 2.2e-16

Question #6

What would you expect the model n ~ wday + ns(date,5) to look like? Knowing what you know about the data, why would you expect it not to be particularly effective?

##Answer 6: Based on the daily-weekly pattern plot, we did not observe an obvious pattern among weekdays.

Question #7

We hypothesized that people leaving on Sundays are more likely to be business travelers who need to be somewhere on Monday. Explore the hypothesis by seeing how if breaks down based on distance and time: if it’s true, you’d expect to see more Sunday evening flights to places that are far away.

##Answer: The hypothesis is true, since we observe more Sundays flights in the longer distance data than shorter distance data.

head(flights)
## # A tibble: 6 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1     1      517            515         2      830
## 2  2013     1     1      533            529         4      850
## 3  2013     1     1      542            540         2      923
## 4  2013     1     1      544            545        -1     1004
## 5  2013     1     1      554            600        -6      812
## 6  2013     1     1      554            558        -4      740
## # … with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>
summary(flights)
##       year          month             day           dep_time   
##  Min.   :2013   Min.   : 1.000   Min.   : 1.00   Min.   :   1  
##  1st Qu.:2013   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.: 907  
##  Median :2013   Median : 7.000   Median :16.00   Median :1401  
##  Mean   :2013   Mean   : 6.549   Mean   :15.71   Mean   :1349  
##  3rd Qu.:2013   3rd Qu.:10.000   3rd Qu.:23.00   3rd Qu.:1744  
##  Max.   :2013   Max.   :12.000   Max.   :31.00   Max.   :2400  
##                                                  NA's   :8255  
##  sched_dep_time   dep_delay          arr_time    sched_arr_time
##  Min.   : 106   Min.   : -43.00   Min.   :   1   Min.   :   1  
##  1st Qu.: 906   1st Qu.:  -5.00   1st Qu.:1104   1st Qu.:1124  
##  Median :1359   Median :  -2.00   Median :1535   Median :1556  
##  Mean   :1344   Mean   :  12.64   Mean   :1502   Mean   :1536  
##  3rd Qu.:1729   3rd Qu.:  11.00   3rd Qu.:1940   3rd Qu.:1945  
##  Max.   :2359   Max.   :1301.00   Max.   :2400   Max.   :2359  
##                 NA's   :8255      NA's   :8713                 
##    arr_delay          carrier              flight       tailnum         
##  Min.   : -86.000   Length:336776      Min.   :   1   Length:336776     
##  1st Qu.: -17.000   Class :character   1st Qu.: 553   Class :character  
##  Median :  -5.000   Mode  :character   Median :1496   Mode  :character  
##  Mean   :   6.895                      Mean   :1972                     
##  3rd Qu.:  14.000                      3rd Qu.:3465                     
##  Max.   :1272.000                      Max.   :8500                     
##  NA's   :9430                                                           
##     origin              dest              air_time        distance   
##  Length:336776      Length:336776      Min.   : 20.0   Min.   :  17  
##  Class :character   Class :character   1st Qu.: 82.0   1st Qu.: 502  
##  Mode  :character   Mode  :character   Median :129.0   Median : 872  
##                                        Mean   :150.7   Mean   :1040  
##                                        3rd Qu.:192.0   3rd Qu.:1389  
##                                        Max.   :695.0   Max.   :4983  
##                                        NA's   :9430                  
##       hour           minute        time_hour                  
##  Min.   : 1.00   Min.   : 0.00   Min.   :2013-01-01 05:00:00  
##  1st Qu.: 9.00   1st Qu.: 8.00   1st Qu.:2013-04-04 13:00:00  
##  Median :13.00   Median :29.00   Median :2013-07-03 10:00:00  
##  Mean   :13.18   Mean   :26.23   Mean   :2013-07-03 05:22:54  
##  3rd Qu.:17.00   3rd Qu.:44.00   3rd Qu.:2013-10-01 07:00:00  
##  Max.   :23.00   Max.   :59.00   Max.   :2013-12-31 23:00:00  
## 
long_ds <- flights[flights$distance >= 1500, ]
short_ds <- flights[flights$distance < 1500, ]

daily_l <- long_ds %>%
  mutate(date = make_date(year, month, day)) %>%
  group_by(date) %>%
  summarize(n = n())

daily_l <- daily_l %>%
  mutate(wday = wday(date, label = TRUE))

mod_l = lm(n ~ wday, data = daily_l)

grid <- daily_l %>%
  data_grid(wday) %>%
  add_predictions(mod_l, "n")

ggplot(daily_l, aes(wday, n)) +
  geom_boxplot() +
  geom_point(data = grid, color = "orange", size = 4)

daily_s <- short_ds %>%
  mutate(date = make_date(year, month, day)) %>%
  group_by(date) %>%
  summarize(n = n())

daily_s <- daily_s %>%
  mutate(wday = wday(date, label = TRUE))

mod_s = lm(n ~ wday, data = daily_s)

grid <- daily_s %>%
  data_grid(wday) %>%
  add_predictions(mod_s, "n")

ggplot(daily_s, aes(wday, n)) +
  geom_boxplot() +
  geom_point(data = grid, color = "orange", size = 4)