ANLY 505 - NYCFlightsDataModel

Summarize Data

daily <- flights %>%
  mutate(date = make_date(year, month, day)) %>%
  group_by(date) %>%
  summarize(n = n())

ggplot(daily, aes(date, n)) +
  geom_line()

Investigate Daily-Weekly Pattern

daily <- daily %>%
  mutate(wday = wday(date, label = TRUE))
ggplot(daily, aes(wday,n)) +
  geom_boxplot()

mod = lm(n ~ wday, data = daily, na.action = na.warn)

grid <- daily %>%
  data_grid(wday) %>%
  add_predictions(mod, "n")

ggplot(daily, aes(wday, n)) +
  geom_boxplot() +
  geom_point(data = grid, color = "orange", size = 4)

Investigate residuals

daily <- daily %>%
  add_residuals(mod)

daily %>%
  ggplot(aes(date, resid)) +
  geom_ref_line(h = 0) +
  geom_line()

ggplot(daily, aes(date, resid, color = wday)) +
  geom_ref_line(h = 0, colour = "red") +
  geom_line()

daily %>%
  filter(resid < -100)

## # A tibble: 11 x 4
##    date           n wday  resid
##    <date>     <int> <ord> <dbl>
##  1 2013-01-01   842 Tue   -109.
##  2 2013-01-20   786 Sun   -105.
##  3 2013-05-26   729 Sun   -162.
##  4 2013-07-04   737 Thu   -229.
##  5 2013-07-05   822 Fri   -145.
##  6 2013-09-01   718 Sun   -173.
##  7 2013-11-28   634 Thu   -332.
##  8 2013-11-29   661 Fri   -306.
##  9 2013-12-24   761 Tue   -190.
## 10 2013-12-25   719 Wed   -244.
## 11 2013-12-31   776 Tue   -175.

daily %>%
  ggplot(aes(date, resid)) +
  geom_ref_line(h = 0, colour = "red", size = 1) +
  geom_line(color = "grey50") +
  geom_smooth(se = FALSE, span = 0.20)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Seasonal Saturday effect

daily %>%
  filter(wday == "Sat") %>%
  ggplot(aes(date, n)) +
  geom_point()+
  geom_line() +
  scale_x_date(
    NULL,
    date_breaks = "1 month",
    date_labels = "%b"
  )

Add Seasonal Variable

term <- function(date) {
  cut(date,
      breaks = ymd(20130101, 20130605, 20130825, 20140101),
      labels = c("spring", "summer", "fall")
      )
}

daily <- daily %>%
  mutate(term = term(date))

daily %>%
  filter(wday == "Sat") %>%
  ggplot(aes(date, n, color = term)) +
  geom_point(alpha = 1/3)+
  geom_line() +
  scale_x_date(
    NULL,
    date_breaks = "1 month",
    date_labels = "%b"
  )

daily %>%
  ggplot(aes(wday, n, color = term)) +
  geom_boxplot()

mod1 <- lm(n ~ wday, data = daily, na.action = na.warn)
mod2 <- lm(n ~ wday * term, data = daily, na.action = na.warn)

daily %>%
  gather_residuals(without_term = mod1, with_term = mod2) %>%
  ggplot(aes(date, resid, color = model)) +
  geom_line(alpha = 0.75)

grid <- daily %>%
  data_grid(wday, term) %>%
  add_predictions(mod2, "n")

ggplot(daily, aes(wday, n)) +
  geom_boxplot() +
  geom_point(data = grid, color = "red") +
  facet_wrap(~ term)

Better model for outliers (Robust regression)

mod3 <- MASS::rlm(n ~ wday * term, data = daily, na.action = na.warn)

daily %>%
  add_residuals(mod3, "resid") %>%
  ggplot(aes(date, resid)) +
  geom_hline(yintercept = 0, size = 2, color = "red") +
  geom_line()

Computed Variables

# If you are creating variables it might be a good idea to bundle the creation of the variables up into a function
compute_vars <- function(data) {
  data %>%
    mutate(term = term(date),
           wday = wday(date, label = TRUE)
           )
}

# Another option would be to put the transformations directly in the model formula:

wday2 <- function(x) wday(x, label = TRUE)
mod3 <- lm(n ~ wday2(date) * term(date), data = daily, na.action = na.warn)

Time of Year: An Alternative Approach

# We could use a more flexible model to capture the pattern of school term in the data
library(splines)
mod <- MASS::rlm(n ~ wday * ns(date, 5), data = daily, na.action = na.warn)

daily %>% 
  data_grid(wday, date = seq_range(date, n = 13)) %>% 
  add_predictions(mod) %>% 
  ggplot(aes(date, pred, color = wday)) +
  geom_line() +
  geom_point()

# We see a strong pattern in the numbers of Sat flights.  This is reassuring, because we also saw that pattern in the raw data.  It's a good sign when you get the same signal from different approaches.

Question #1

Why are there fewer than expected flights on January 20, May 26 and September 1? (Hint: they all have the same explanation.) How would these days generalize into another year?

# Use this chunk to answer question 1

These days are the Sundays before Monday holidays on January 21 - Martin Luther King Jr. Day, May 27 - Memorial Day, September 2 - Labor Day. We can generalize these days into another year as the following: the day before the third Monday in January for Martin Luther King Jr, the day before the last Monday in May for Memorial Day, and the day before the first Monday in September for Labor Day.

Question #2

What do the three days with high positive residuals represent? How would these days generalize to another year?

# Use this chunk to answer question 2
daily %>%
  top_n(3, resid)

## # A tibble: 3 x 5
##   date           n wday  resid term 
##   <date>     <int> <ord> <dbl> <fct>
## 1 2013-11-30   857 Sat   112.  fall 
## 2 2013-12-01   987 Sun    95.5 fall 
## 3 2013-12-28   814 Sat    69.4 fall

High positive residuals mean that the model undervalueated the number of flights on these days. These three days with high positive residuals are 11/30(the next Saturday right after Black Friday), 12/1(the next Sunday right after Thanksgiving), and 12/28(the next Saturday right after Christmas). These days are weekend days right after a national holidays. It will not be same date for another year.

Question #3

Create a new variable that splits the “wday” variable into terms, but only for Saturdays, i.e., it should have Thurs, Fri, but Sat-summer, Sat-spring, Sat-fall. How does this model compare with the model with every combination of “wday” and “term”?

# Use this chunk to answer question 3
daily <- daily %>%
  mutate(
    wday_newnames =
      case_when(
        wday == "Sat" & term == "summer" ~ "Sat-summer",
        wday == "Sat" & term == "fall" ~ "Sat-fall",
        wday == "Sat" & term == "spring" ~ "Sat-spring",
        TRUE ~ as.character(wday)
      )
  )

Mod_Sat_term = lm(n ~ wday_newnames, data = daily)

summary(mod2)

## 
## Call:
## lm(formula = n ~ wday * term, data = daily, na.action = na.warn)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -326.33   -8.00    9.82   20.67  141.00 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        912.8684     3.7086 246.151  < 2e-16 ***
## wday.L             -71.4674     9.8349  -7.267 2.49e-12 ***
## wday.Q            -161.0112     9.8196 -16.397  < 2e-16 ***
## wday.C             -65.6215     9.8068  -6.691 8.98e-11 ***
## wday^4             -82.0997     9.8412  -8.342 1.78e-15 ***
## wday^5              -1.3425     9.7787  -0.137   0.8909    
## wday^6             -12.4501     9.7903  -1.272   0.2043    
## termsummer          37.6922     6.3336   5.951 6.56e-09 ***
## termfall             3.7598     5.5033   0.683   0.4949    
## wday.L:termsummer   -6.2215    16.8049  -0.370   0.7114    
## wday.Q:termsummer   26.4121    16.7501   1.577   0.1158    
## wday.C:termsummer   26.7451    16.7885   1.593   0.1121    
## wday^4:termsummer   23.5933    16.7523   1.408   0.1599    
## wday^5:termsummer  -13.1343    16.7720  -0.783   0.4341    
## wday^6:termsummer   -5.0026    16.6744  -0.300   0.7643    
## wday.L:termfall    -31.9048    14.5607  -2.191   0.0291 *  
## wday.Q:termfall     -0.6959    14.5707  -0.048   0.9619    
## wday.C:termfall     -8.8349    14.5417  -0.608   0.5439    
## wday^4:termfall     -9.8570    14.5899  -0.676   0.4997    
## wday^5:termfall     -3.2107    14.5228  -0.221   0.8252    
## wday^6:termfall     -8.2061    14.5769  -0.563   0.5738    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 46.17 on 344 degrees of freedom
## Multiple R-squared:  0.7573, Adjusted R-squared:  0.7432 
## F-statistic: 53.67 on 20 and 344 DF,  p-value: < 2.2e-16

summary(Mod_Sat_term)

## 
## Call:
## lm(formula = n ~ wday_newnames, data = daily)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -331.75   -8.81   11.31   23.64  141.00 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              967.462      6.568 147.308  < 2e-16 ***
## wday_newnamesMon           7.346      9.288   0.791   0.4295    
## wday_newnamesSat-fall   -251.462     12.951 -19.416  < 2e-16 ***
## wday_newnamesSat-spring -230.143     12.045 -19.107  < 2e-16 ***
## wday_newnamesSat-summer -166.545     15.167 -10.981  < 2e-16 ***
## wday_newnamesSun         -75.981      9.288  -8.181 5.06e-15 ***
## wday_newnamesThu          -1.712      9.288  -0.184   0.8539    
## wday_newnamesTue         -16.103      9.244  -1.742   0.0824 .  
## wday_newnamesWed          -4.769      9.288  -0.513   0.6079    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 47.36 on 356 degrees of freedom
## Multiple R-squared:  0.7357, Adjusted R-squared:  0.7297 
## F-statistic: 123.8 on 8 and 356 DF,  p-value: < 2.2e-16

AIC(mod2)

## [1] 3855.73

AIC(Mod_Sat_term)

## [1] 3862.885

daily %>%
  gather_residuals(sat_term = Mod_Sat_term, Mod_alldays_term = mod2) %>%
  ggplot(aes(date, resid, colour = model)) +
  geom_line(alpha = 0.75)

AIC and Adjusted R-square are much close between two models. AIC of new model is a little higher, which means less fitness of the model. In general, the new model doesn’t provide better results than the old one.

Question #4

Create a new “wday” variable that combines the day of week, term(for Saturdays), and public holidays. What do the residuals of the model look like?

# Use this chunk to answer question 4
holidays2013 <-
  tribble(
    ~HolidayName, ~date,
    "New Year's Day", 20130101,
    "MLK Day", 20130121,
    "Washington's Birthday", 20130218,
    "Memorial Day", 20130527,
    "Independence Day", 20130704,
    "Labor Day", 20130902,
    "Columbus Day", 20131028,
    "Veteran's Day", 20131111,
    "Thanksgiving", 20131128,
    "Christmas", 20131225
  )

holidays2013

## # A tibble: 10 x 2
##    HolidayName               date
##    <chr>                    <dbl>
##  1 New Year's Day        20130101
##  2 MLK Day               20130121
##  3 Washington's Birthday 20130218
##  4 Memorial Day          20130527
##  5 Independence Day      20130704
##  6 Labor Day             20130902
##  7 Columbus Day          20131028
##  8 Veteran's Day         20131111
##  9 Thanksgiving          20131128
## 10 Christmas             20131225

daily <- daily %>%
  mutate(
    wday_Satterm_holidays =
      case_when(
        date %in% holidays2013$date ~ "holiday",
        .$wday == "Sat" & .$term == "summer" ~ "Sat-summer",
        .$wday == "Sat" & .$term == "fall" ~ "Sat-fall",
        .$wday == "Sat" & .$term == "spring" ~ "Sat-spring",
        TRUE ~ as.character(.$wday)
      )
  )

Mod_Sat_term_Holidays <- lm(n ~ wday_Satterm_holidays, data = daily)

summary(Mod_Sat_term_Holidays)

## 
## Call:
## lm(formula = n ~ wday_Satterm_holidays, data = daily)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -331.75   -8.81   11.31   23.64  141.00 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      967.462      6.568 147.308  < 2e-16 ***
## wday_Satterm_holidaysMon           7.346      9.288   0.791   0.4295    
## wday_Satterm_holidaysSat-fall   -251.462     12.951 -19.416  < 2e-16 ***
## wday_Satterm_holidaysSat-spring -230.143     12.045 -19.107  < 2e-16 ***
## wday_Satterm_holidaysSat-summer -166.545     15.167 -10.981  < 2e-16 ***
## wday_Satterm_holidaysSun         -75.981      9.288  -8.181 5.06e-15 ***
## wday_Satterm_holidaysThu          -1.712      9.288  -0.184   0.8539    
## wday_Satterm_holidaysTue         -16.103      9.244  -1.742   0.0824 .  
## wday_Satterm_holidaysWed          -4.769      9.288  -0.513   0.6079    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 47.36 on 356 degrees of freedom
## Multiple R-squared:  0.7357, Adjusted R-squared:  0.7297 
## F-statistic: 123.8 on 8 and 356 DF,  p-value: < 2.2e-16

daily %>%
  add_residuals(Mod_Sat_term_Holidays) %>%
  ggplot(aes(date, resid)) +
  geom_hline(yintercept = 0, size = 1, color = "red") +
  geom_line()+
  geom_smooth(se = FALSE, span = 0.20)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

daily %>%
  gather_residuals(sat_term_holidays = Mod_Sat_term_Holidays, Mod_alldays_term = mod2) %>%
  ggplot(aes(date, resid, colour = model)) +
  geom_line(alpha = 0.75)

AIC(Mod_Sat_term_Holidays)

## [1] 3862.885

AIC(mod2)

## [1] 3855.73

The new model with taking holidays into account, still have some same extreme residuals. It does not seem to provide a better performance on residuals. Taking into account the holidays doesn’t neccessarily to contribute a better explanation of observed data.

Question #5

What happens if you fit a day-of-week effect that varies by month (i.e.m n ~ wday*month)? Why is this not very helpful?

# Use this chunk to answer question 5
Mod_Interaction_Day_Month <- lm(n ~ wday * month(date), data = daily)

daily %>%
  add_residuals(Mod_Interaction_Day_Month) %>%
  ggplot(aes(date, resid)) +
  geom_hline(yintercept = 0, size = 1, color = "red") +
  geom_line()+
  geom_smooth(se = FALSE, span = 0.20)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

daily %>%
  gather_residuals(InteractionDayMonth = Mod_Interaction_Day_Month, Mod_alldays_term = mod2) %>%
  ggplot(aes(date, resid, colour = model)) +
  geom_line(alpha = 0.75)

#Looking at the comparison plot, it looks like the extremes...become more extreme compared to our first model (especially during summer and fall). This is of course not a good thing.
#The interaction of day of the week and month reduces the number of observations, and therefore increase the standard errors (higher uncertainty).

As shown on the conparison plot, the new model becomes more extreme compared to our first model. This indicates that new model is not good. This might be caused by the new model reduces the number of observations since the interaction of day of the week and month. Therefore the standard errors are increased.

Question #6

What would you expect the model n ~ wday + ns(date,5) to look like? Knowing what you know about the data, why would you expect it not to be particularly effective?

# Use this chunk to answer question 6
mod3 <- lm(n ~ wday + ns(date, 5), data = daily)
mod4 <- lm(n ~ wday * ns(date, 5), data = daily)
daily %>%
  gather_residuals(mod3, mod4) %>%
  ggplot(aes(x = date, y = resid, color = model)) +
  geom_line(alpha = 0.75)

We have already proved that the day of the week could not explain better about the residuals. Thus we don’t expect it to be particularly effective.

Question #7

We hypothesized that people leaving on Sundays are more likely to be business travelers who need to be somewhere on Monday. Explore the hypothesis by seeing how if breaks down based on distance and time: if it’s true, you’d expect to see more Sunday evening flights to places that are far away.

# Use this chunk to answer question 7

flights %>%
  mutate(
    date = make_date(year, month, day),
    wday = wday(date, label = TRUE)
  ) %>%
  ggplot(aes(y = distance, x = wday)) +
  geom_boxplot() +
  labs(x = "Day", y = "Distance")

flights %>%
  mutate(
    date = make_date(year, month, day),
    wday = wday(date, label = TRUE),
    timeOfDay = factor(case_when(
      hour < 12 ~ "Morning",
      hour < 18 ~ "Afternoon",
      hour >=18 ~ "Evening"
    ),
    levels = c("Morning",
      "Afternoon", "Evening"
    )
    )
  ) %>%
  group_by(timeOfDay) %>%
  ggplot(aes(x = timeOfDay, y = distance))+
  geom_boxplot()+
  labs(x= "TIme of Day", y = "Distance")

flights %>%
  mutate(
    date = make_date(year, month, day),
    wday = wday(date, label = TRUE),
    timeOfDay = factor(case_when(
      hour < 12 ~ "Morning",
      hour < 18 ~ "Afternoon",
      hour >=18 ~ "Evening"
    ),
    levels = c("Morning",
      "Afternoon", "Evening"
    )
    )
  ) %>%
  group_by(wday, timeOfDay) %>%
  ggplot(aes(x = timeOfDay, y = distance))+
  geom_boxplot()+
  labs(x= "Time of Day", y = "Distance")+
  facet_wrap(~wday, nrow=2)

flights %>%
  mutate(
    date = make_date(year, month, day),
    wday = wday(date, label = TRUE),
    timeOfDay = factor(case_when(
      hour < 12 ~ "Morning",
      hour < 18 ~ "Afternoon",
      hour >=18 ~ "Evening"
    ),
    levels = c("Morning",
      "Afternoon", "Evening"
    )
    )
  ) %>%
  group_by(wday, timeOfDay) %>%
  summarize(n = n()) %>%
  ggplot(aes(x = timeOfDay, y = n))+
  geom_boxplot()+
  labs(x = "Time of the day", y = "Number of flights")+
  facet_wrap(~wday, nrow=2)

Weekend travelers have longer distance than weekday travellers in general. However, Saturday travelers have the longest distance, not Sunday travelers. Also, no statistics show that there is a higher number of evening flights on Sunday. We can’t prove our hypotheis based on above plots.