Summarize Data

daily <- flights %>%
  mutate(date = make_date(year, month, day)) %>%
  group_by(date) %>%
  summarize(n = n())

ggplot(daily, aes(date, n)) +
  geom_line()

Investigate Daily-Weekly Pattern

daily <- daily %>%
  mutate(wday = wday(date, label = TRUE))
ggplot(daily, aes(wday,n)) +
  geom_boxplot()

mod = lm(n ~ wday, data = daily, na.action = na.warn)

grid <- daily %>%
  data_grid(wday) %>%
  add_predictions(mod, "n")

ggplot(daily, aes(wday, n)) +
  geom_boxplot() +
  geom_point(data = grid, color = "orange", size = 4)

Investigate residuals

daily <- daily %>%
  add_residuals(mod)

daily %>%
  ggplot(aes(date, resid)) +
  geom_ref_line(h = 0) +
  geom_line()

ggplot(daily, aes(date, resid, color = wday)) +
  geom_ref_line(h = 0, colour = "red") +
  geom_line()

daily %>%
  filter(resid < -100)
## # A tibble: 11 x 4
##    date           n wday  resid
##    <date>     <int> <ord> <dbl>
##  1 2013-01-01   842 Tue   -109.
##  2 2013-01-20   786 Sun   -105.
##  3 2013-05-26   729 Sun   -162.
##  4 2013-07-04   737 Thu   -229.
##  5 2013-07-05   822 Fri   -145.
##  6 2013-09-01   718 Sun   -173.
##  7 2013-11-28   634 Thu   -332.
##  8 2013-11-29   661 Fri   -306.
##  9 2013-12-24   761 Tue   -190.
## 10 2013-12-25   719 Wed   -244.
## 11 2013-12-31   776 Tue   -175.
daily %>%
  ggplot(aes(date, resid)) +
  geom_ref_line(h = 0, colour = "red", size = 1) +
  geom_line(color = "grey50") +
  geom_smooth(se = FALSE, span = 0.20)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Seasonal Saturday effect

daily %>%
  filter(wday == "Sat") %>%
  ggplot(aes(date, n)) +
  geom_point()+
  geom_line() +
  scale_x_date(
    NULL,
    date_breaks = "1 month",
    date_labels = "%b"
  )

Add Seasonal Variable

term <- function(date) {
  cut(date,
      breaks = ymd(20130101, 20130605, 20130825, 20140101),
      labels = c("spring", "summer", "fall")
      )
}

daily <- daily %>%
  mutate(term = term(date))

daily %>%
  filter(wday == "Sat") %>%
  ggplot(aes(date, n, color = term)) +
  geom_point(alpha = 1/3)+
  geom_line() +
  scale_x_date(
    NULL,
    date_breaks = "1 month",
    date_labels = "%b"
  )

daily %>%
  ggplot(aes(wday, n, color = term)) +
  geom_boxplot()

mod1 <- lm(n ~ wday, data = daily, na.action = na.warn)
mod2 <- lm(n ~ wday * term, data = daily, na.action = na.warn)

daily %>%
  gather_residuals(without_term = mod1, with_term = mod2) %>%
  ggplot(aes(date, resid, color = model)) +
  geom_line(alpha = 0.75)

grid <- daily %>%
  data_grid(wday, term) %>%
  add_predictions(mod2, "n")

ggplot(daily, aes(wday, n)) +
  geom_boxplot() +
  geom_point(data = grid, color = "red") +
  facet_wrap(~ term)

Better model for outliers (Robust regression)

mod3 <- MASS::rlm(n ~ wday * term, data = daily, na.action = na.warn)

daily %>%
  add_residuals(mod3, "resid") %>%
  ggplot(aes(date, resid)) +
  geom_hline(yintercept = 0, size = 2, color = "red") +
  geom_line()

Computed Variables

# If you are creating variables it might be a good idea to bundle the creation of the variables up into a function
compute_vars <- function(data) {
  data %>%
    mutate(term = term(date),
           wday = wday(date, label = TRUE)
           )
}

# Another option would be to put the transformations directly in the model formula:

wday2 <- function(x) wday(x, label = TRUE)
mod3 <- lm(n ~ wday2(date) * term(date), data = daily, na.action = na.warn)

Time of Year: An Alternative Approach

# We could use a more flexible model to capture the pattern of school term in the data
library(splines)
mod <- MASS::rlm(n ~ wday * ns(date, 5), data = daily, na.action = na.warn)

daily %>% 
  data_grid(wday, date = seq_range(date, n = 13)) %>% 
  add_predictions(mod) %>% 
  ggplot(aes(date, pred, color = wday)) +
  geom_line() +
  geom_point()

# We see a strong pattern in the numbers of Sat flights.  This is reassuring, because we also saw that pattern in the raw data.  It's a good sign when you get the same signal from different approaches.

Question #1

Why are there fewer than expected flights on January 20, May 26 and September 1? (Hint: they all have the same explanation.) How would these days generalize into another year?

daily$percentile <- ecdf(daily$n)(daily$n)
daily$percentile[daily$date == "2013-01-20" | daily$date == "2013-05-26" |daily$date == "2013-09-01"] 
## [1] 0.13424658 0.07123288 0.06027397
## These are days before MLK, Memorial Day, Labor Day, people don't generally fly on a sunday whenever there is a long weekend, hence fewere flights

Question #2

What do the three days with high positive residuals represent? How would these days generalize to another year?

daily %>%
  top_n(3, resid)
## # A tibble: 3 x 6
##   date           n wday  resid term  percentile
##   <date>     <int> <ord> <dbl> <fct>      <dbl>
## 1 2013-11-30   857 Sat   112.  fall       0.189
## 2 2013-12-01   987 Sun    95.5 fall       0.778
## 3 2013-12-28   814 Sat    69.4 fall       0.167
## These days generally fall during thanksgiving and new year celebration days,which is why there is high flight traffic.

Question #3

Create a new variable that splits the “wday” variable into terms, but only for Saturdays, i.e., it should have Thurs, Fri, but Sat-summer, Sat-spring, Sat-fall. How does this model compare with the model with every combination of “wday” and “term”?

daily<-daily%>%
  mutate(wday_term_Sat=ifelse(wday=="Sat", paste("Sat",term(date), sep = "-"), as.character(wday)))
daily$wday_term_Sat<-as.factor(daily$wday_term_Sat)
model1 <- MASS::rlm(n ~ wday * term, data = daily)
model2 <- MASS::rlm(n ~ wday_term_Sat, data = daily)
summary(model1)
## 
## Call: rlm(formula = n ~ wday * term, data = daily)
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -347.537  -12.899    3.915   11.500  160.101 
## 
## Coefficients:
##                   Value     Std. Error t value  
## (Intercept)        922.4857    1.5914   579.6839
## wday.L             -79.3059    4.2202   -18.7920
## wday.Q            -153.7811    4.2136   -36.4961
## wday.C             -67.7750    4.2081   -16.1056
## wday^4             -74.9295    4.2229   -17.7437
## wday^5              -6.4803    4.1961    -1.5444
## wday^6              -9.7258    4.2011    -2.3151
## termsummer          32.9797    2.7178    12.1349
## termfall             1.9847    2.3615     0.8404
## wday.L:termsummer    9.5346    7.2110     1.3222
## wday.Q:termsummer   12.3445    7.1875     1.7175
## wday.C:termsummer   17.0750    7.2040     2.3702
## wday^4:termsummer   11.0813    7.1885     1.5415
## wday^5:termsummer   -3.5357    7.1969    -0.4913
## wday^6:termsummer    0.3378    7.1550     0.0472
## wday.L:termfall    -31.7325    6.2480    -5.0788
## wday.Q:termfall    -33.1644    6.2524    -5.3043
## wday.C:termfall    -23.8271    6.2399    -3.8185
## wday^4:termfall    -22.5288    6.2606    -3.5985
## wday^5:termfall     -4.9157    6.2318    -0.7888
## wday^6:termfall     -3.3798    6.2550    -0.5403
## 
## Residual standard error: 17.58 on 344 degrees of freedom
summary(model2)
## 
## Call: rlm(formula = n ~ wday_term_Sat, data = daily)
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -345.692  -14.487    2.403   13.840  158.717 
## 
## Coefficients:
##                         Value     Std. Error t value  
## (Intercept)              978.4105    3.2011   305.6436
## wday_term_SatMon           0.1863    4.5271     0.0412
## wday_term_SatSat-fall   -280.1270    6.3127   -44.3748
## wday_term_SatSat-spring -233.2026    5.8710   -39.7213
## wday_term_SatSat-summer -177.4939    7.3927   -24.0092
## wday_term_SatSun         -75.2507    4.5271   -16.6223
## wday_term_SatThu           1.2817    4.5271     0.2831
## wday_term_SatTue         -14.9240    4.5057    -3.3123
## wday_term_SatWed          -7.3207    4.5271    -1.6171
## 
## Residual standard error: 21.18 on 356 degrees of freedom
p1<-daily %>%
  add_residuals(model1, "residual1") %>%
  ggplot(aes(date, residual1)) +
  geom_hline(yintercept = 0, size = 1, color = "navy") +
  geom_line()
p2<-daily %>%
  add_residuals(model2, "residual2") %>%
  ggplot(aes(date, residual2)) +
  geom_hline(yintercept = 0, size = 1, color = "navy") +
  geom_line()
grid.arrange(p1, p2)

## Model with wday_term_sat is better at prediction based on comparing mean residual value for each model

Question #4

Create a new “wday” variable that combines the day of week, term(for Saturdays), and public holidays. What do the residuals of the model look like?

US_holiday<-c(date("2013-01-01"),date("2013-01-21"),date("2013-02-18"),date("2013-05-27"),
              date("2013-07-04"),date("2013-09-02"),date("2013-10-11"),date("2013-11-11"),date("2013-11-28"),
              date("2013-12-25") )
daily<-daily%>%
  mutate(new_wday=ifelse(wday=="Sat", paste("Sat",term(date), sep = "-"), 
                         ifelse(date %in% US_holiday, "holiday" , as.character(wday))))
daily$new_wday<-as.factor(daily$new_wday)
model_new <- MASS::rlm(n ~ new_wday, data = daily)
summary(model_new)
## 
## Call: rlm(formula = n ~ new_wday, data = daily)
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -317.410  -13.499    1.501   13.501  159.165 
## 
## Coefficients:
##                    Value     Std. Error t value  
## (Intercept)         978.4100    3.1205   313.5452
## new_wdayholiday     -64.3951    7.7070    -8.3554
## new_wdayMon           3.0893    4.5059     0.6856
## new_wdaySat-fall   -280.5753    6.1095   -45.9241
## new_wdaySat-spring -232.7398    5.6842   -40.9449
## new_wdaySat-summer -177.4933    7.1499   -24.8246
## new_wdaySun         -74.9600    4.3918   -17.0684
## new_wdayThu           2.9383    4.4350     0.6625
## new_wdayTue         -14.1105    4.3918    -3.2130
## new_wdayWed          -6.5395    4.4130    -1.4819
## 
## Residual standard error: 20.02 on 355 degrees of freedom
daily %>%
  add_residuals(model_new, "residual_new") %>%
  ggplot(aes(date, residual_new)) +
  geom_hline(yintercept = 0, size = 1, color = "blue") +
  geom_line()

## The model with new wday variable that combines the day of week, term and public holidays has lesser value of mean residual

Question #5

What happens if you fit a day-of-week effect that varies by month (i.e.m n ~ wday*month)? Why is this not very helpful?

daily<-daily%>%
  mutate(month= as.factor(month(date)))
model_month <- MASS::rlm(n ~ wday*month, data = daily)
summary(model_month)
## 
## Call: rlm(formula = n ~ wday * month, data = daily)
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -355.5000   -4.0000    0.4941    4.0000  138.5118 
## 
## Coefficients:
##                Value     Std. Error t value  
## (Intercept)     868.1936    1.6650   521.4327
## wday.L          -73.7893    4.5741   -16.1321
## wday.Q         -164.7242    4.4167   -37.2960
## wday.C          -70.2867    4.4509   -15.7917
## wday^4          -94.8767    4.4920   -21.1214
## wday^5            4.8125    4.3242     1.1129
## wday^6          -11.8265    4.1618    -2.8417
## month2           22.8336    2.4092     9.4775
## month3           67.3938    2.3547    28.6211
## month4           73.9634    2.3730    31.1686
## month5           62.6409    2.3547    26.6026
## month6           81.7330    2.3730    34.4427
## month7           93.6207    2.3547    39.7593
## month8           84.9062    2.3547    36.0584
## month9           58.2325    2.3730    24.5395
## month10          61.1632    2.3547    25.9751
## month11          63.5060    2.3730    26.7618
## month12          52.2364    2.3547    22.1840
## wday.L:month2    10.1749    6.4921     1.5673
## wday.Q:month2    -9.0842    6.3822    -1.4234
## wday.C:month2     1.3437    6.4059     0.2098
## wday^4:month2     8.7889    6.4345     1.3659
## wday^5:month2     2.4525    6.3185     0.3882
## wday^6:month2     4.9467    6.2085     0.7968
## wday.L:month3    -4.2993    6.2299    -0.6901
## wday.Q:month3    14.7026    6.1811     2.3787
## wday.C:month3     9.3917    6.2380     1.5056
## wday^4:month3    24.5305    6.2894     3.9003
## wday^5:month3   -11.1877    6.2461    -1.7911
## wday^6:month3     2.4036    6.1944     0.3880
## wday.L:month4    -6.8702    6.4334    -1.0679
## wday.Q:month4     9.6813    6.3465     1.5255
## wday.C:month4     1.9424    6.2945     0.3086
## wday^4:month4    26.3942    6.3265     4.1720
## wday^5:month4   -12.7402    6.1524    -2.0708
## wday^6:month4    -3.8213    6.1112    -0.6253
## wday.L:month5   -12.7427    6.4334    -1.9807
## wday.Q:month5     2.8971    6.2824     0.4611
## wday.C:month5    -6.7737    6.2945    -1.0761
## wday^4:month5    12.9616    6.2476     2.0747
## wday^5:month5    -9.9501    6.1524    -1.6173
## wday^6:month5     1.8807    5.9589     0.3156
## wday.L:month6    10.9685    6.2784     1.7470
## wday.Q:month6    26.4551    6.1811     4.2800
## wday.C:month6    19.2234    6.2945     3.0540
## wday^4:month6    28.9493    6.3958     4.5263
## wday^5:month6   -12.7016    6.3105    -2.0128
## wday^6:month6     3.3101    6.2078     0.5332
## wday.L:month7     5.3788    6.4334     0.8361
## wday.Q:month7    24.1848    6.2824     3.8496
## wday.C:month7    18.7778    6.2945     2.9832
## wday^4:month7    33.9526    6.2476     5.4345
## wday^5:month7   -14.5411    6.1524    -2.3635
## wday^6:month7     2.9936    5.9589     0.5024
## wday.L:month8    -5.4223    6.3265    -0.8571
## wday.Q:month8    10.0067    6.2461     1.6021
## wday.C:month8    11.3565    6.2380     1.8205
## wday^4:month8    23.8261    6.3069     3.7778
## wday^5:month8   -15.8985    6.1483    -2.5859
## wday^6:month8     1.0814    6.1108     0.1770
## wday.L:month9   -42.2908    6.3385    -6.6721
## wday.Q:month9   -24.7745    6.2824    -3.9435
## wday.C:month9   -31.9154    6.2945    -5.0704
## wday^4:month9   -17.3602    6.3091    -2.7516
## wday^5:month9   -12.5103    6.2502    -2.0016
## wday^6:month9     3.0670    6.1948     0.4951
## wday.L:month10  -49.1848    6.4687    -7.6035
## wday.Q:month10  -30.9858    6.2461    -4.9608
## wday.C:month10  -35.6117    6.2945    -5.6576
## wday^4:month10   -8.3348    6.3526    -1.3120
## wday^5:month10  -15.9756    6.1153    -2.6124
## wday^6:month10   -2.3547    5.8856    -0.4001
## wday.L:month11  -22.1747    6.3385    -3.4984
## wday.Q:month11  -23.2437    6.2824    -3.6998
## wday.C:month11  -11.7420    6.2945    -1.8654
## wday^4:month11    8.1113    6.3091     1.2857
## wday^5:month11  -18.4355    6.2502    -2.9496
## wday^6:month11    2.3924    6.1948     0.3862
## wday.L:month12    1.8501    6.3265     0.2924
## wday.Q:month12   18.0719    6.2461     2.8933
## wday.C:month12    4.2873    6.2380     0.6873
## wday^4:month12   11.5970    6.3069     1.8388
## wday^5:month12   -5.7612    6.1483    -0.9370
## wday^6:month12   -2.1140    6.1108    -0.3459
## 
## Residual standard error: 5.93 on 281 degrees of freedom
daily %>%
  gather_residuals(model_month, model_new) %>%
  ggplot(aes(date, resid, color=model)) +
  geom_hline(yintercept = 0, size = 1, color = "blue") +
  geom_line()

##The new model is increasing the odds of having outliers, hence the model is not very helpful

Question #6

What would you expect the model n ~ wday + ns(date,5) to look like? Knowing what you know about the data, why would you expect it not to be particularly effective?

model_spline <- MASS::rlm(n ~ wday + ns(date, 5), data = daily)
daily %>%
  gather_residuals(model_spline, model_new) %>%
  ggplot(aes(date, resid, color=model)) +
  geom_hline(yintercept = 0, size = 1, color = "blue") +
  geom_line()

daily %>% 
  data_grid(wday, date = seq_range(date, n = 13)) %>% 
  add_predictions(model_spline) %>% 
  ggplot(aes(date, pred, color = wday)) +
  geom_line() +
  geom_point()

## Model is overfit. It will work well only with the data on hand, it will not work well with other years. The cyclicity with weekday is not correct measure to build the model

Question #7

We hypothesized that people leaving on Sundays are more likely to be business travelers who need to be somewhere on Monday. Explore the hypothesis by seeing how if breaks down based on distance and time: if it’s true, you’d expect to see more Sunday evening flights to places that are far away.

flights_revised<-flights %>%
  mutate(date = make_date(year, month, day))%>%
      mutate(wday = wday(date, label = TRUE))%>%
  mutate(wday_revised=ifelse((wday=="Sun" & dep_time>1600), "Sun-Evening", ifelse(wday=="Sun" & dep_time<=1600, "Sun-DayTime", as.character(wday))))%>%
  mutate(wday_revised=as.factor(wday_revised))%>%
  group_by(wday_revised) %>%
  summarise(average_dist =  mean(distance),
            median_dist = median(distance))
## Warning: Factor `wday_revised` contains implicit NA, consider using
## `forcats::fct_explicit_na`
flights_revised<-flights_revised[1:8,]
flights_revised$wday_revised <- factor(flights_revised$wday_revised , levels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun-DayTime", "Sun-Evening"))
p5<-ggplot(data= flights_revised, aes(y = average_dist, x = wday_revised)) +
  geom_point(color="red")+labs(title = "Flights 'Average Distance' on Over Weekdays")
p6<-ggplot(data= flights_revised, aes(y = median_dist, x = wday_revised)) +
  geom_point(color="red")+labs(title = " Flight 'Median Distance' on Over Weekdays")
grid.arrange(p5, p6)

## Sunday evening flights have higher mean and median values than sunday DayTime flgihts, this supports the hypothesis