NYCFlightsDataModel - Assignment 3

Summarize Data

daily <- flights %>%
  mutate(date = make_date(year, month, day)) %>%
  group_by(date) %>%
  summarize(n = n())

ggplot(daily, aes(date, n)) +
  geom_line()

Investigate Daily-Weekly Pattern

daily <- daily %>%
  mutate(wday = wday(date, label = TRUE))
ggplot(daily, aes(wday,n)) +
  geom_boxplot()

mod = lm(n ~ wday, data = daily)

grid <- daily %>%
  data_grid(wday) %>%
  add_predictions(mod, "n")

ggplot(daily, aes(wday, n)) +
  geom_boxplot() +
  geom_point(data = grid, color = "orange", size = 4)

Investigate residuals

daily <- daily %>%
  add_residuals(mod)

daily %>%
  ggplot(aes(date, resid)) +
  geom_ref_line(h = 0) +
  geom_line()

ggplot(daily, aes(date, resid, color = wday)) +
  geom_ref_line(h = 0, colour = "red") +
  geom_line()

daily %>%
  filter(resid < -100)

## # A tibble: 11 x 4
##    date           n wday  resid
##    <date>     <int> <ord> <dbl>
##  1 2013-01-01   842 Tue   -109.
##  2 2013-01-20   786 Sun   -105.
##  3 2013-05-26   729 Sun   -162.
##  4 2013-07-04   737 Thu   -229.
##  5 2013-07-05   822 Fri   -145.
##  6 2013-09-01   718 Sun   -173.
##  7 2013-11-28   634 Thu   -332.
##  8 2013-11-29   661 Fri   -306.
##  9 2013-12-24   761 Tue   -190.
## 10 2013-12-25   719 Wed   -244.
## 11 2013-12-31   776 Tue   -175.

daily %>%
  ggplot(aes(date, resid)) +
  geom_ref_line(h = 0, colour = "red", size = 1) +
  geom_line(color = "grey50") +
  geom_smooth(se = FALSE, span = 0.20)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Seasonal Saturday effect

daily %>%
  filter(wday == "Sat") %>%
  ggplot(aes(date, n)) +
  geom_point()+
  geom_line() +
  scale_x_date(
    NULL,
    date_breaks = "1 month",
    date_labels = "%b"
  )

Add Seasonal Variable

term <- function(date) {
  cut(date,
      breaks = ymd(20130101, 20130605, 20130825, 20140101),
      labels = c("spring", "summer", "fall")
      )
}

daily <- daily %>%
  mutate(term = term(date))

daily %>%
  filter(wday == "Sat") %>%
  ggplot(aes(date, n, color = term)) +
  geom_point(alpha = 1/3)+
  geom_line() +
  scale_x_date(
    NULL,
    date_breaks = "1 month",
    date_labels = "%b"
  )

daily %>%
  ggplot(aes(wday, n, color = term)) +
  geom_boxplot()

#Two models
mod1 <- lm(n ~ wday, data = daily)
mod2 <- lm(n ~ wday * term, data = daily)

#Compare the residuals of the two models
daily %>%
  gather_residuals(without_term = mod1, with_term = mod2) %>%
  ggplot(aes(date, resid, color = model)) +
  geom_line(alpha = 0.75)

grid <- daily %>%
  data_grid(wday, term) %>%
  add_predictions(mod2, "n")

ggplot(daily, aes(wday, n)) +
  geom_boxplot() +
  geom_point(data = grid, color = "red") +
  facet_wrap(~ term)

Better model for outliers (Robust regression)

mod3 <- MASS::rlm(n ~ wday * term, data = daily)

daily %>%
  add_residuals(mod3, "resid") %>%
  ggplot(aes(date, resid)) +
  geom_hline(yintercept = 0, size = 2, color = "red") +
  geom_line()

Computed Variables

# If you are creating variables it might be a good idea to bundle the creation of the variables up into a function
compute_vars <- function(data) {
  data %>%
    mutate(term = term(date),
           wday = wday(date, label = TRUE)
           )
}

# Another option would be to put the transformations directly in the model formula:

wday2 <- function(x) wday(x, label = TRUE)
mod3 <- lm(n ~ wday2(date) * term(date), data = daily)

Time of Year: An Alternative Approach

# We could use a more flexible model to capture the pattern of school term in the data
library(splines)
mod <- MASS::rlm(n ~ wday * ns(date, 5), data = daily)

daily %>% 
  data_grid(wday, date = seq_range(date, n = 13)) %>% 
  add_predictions(mod) %>% 
  ggplot(aes(date, pred, color = wday)) +
  geom_line() +
  geom_point()

# We see a strong pattern in the numbers of Sat flights.  This is reassuring, because we also saw that pattern in the raw data.  It's a good sign when you get the same signal from different approaches.

Question #1

Why are there fewer than expected flights on January 20, May 26 and September 1? (Hint: they all have the same explanation.) How would these days generalize into another year?

#We used https://www.timeanddate.com/calendar/monthly.html?year=2013&month=5 to find out what happened on these dates:

#January 20th, 2013, was the Sunday right before Martin Luther King Day (Monday, January 21st, 2013)
#May 26th, 2013, was the Sunday right before Memorial Day (Monday, May 27th, 2013)
#September 1st, 2013, was the Sunday right before Labor Day (Monday, September 2nd, 2013)

#It seems that flight companies have less flights during the day prior to an holiday.

#In 2014, these days become the holiday date (NOT the Sunday prior to it), and in 2015, these days will become the date right after the holiday. We would not exactly observe the same pattern at these dates: we cannot generalize these specific dates. However, we could generalize that "the day before MLK day, Memorial Day and Labor Day, there are fewer flights".

Question #2

What do the three days with high positive residuals represent? How would these days generalize to another year?

daily %>%
  top_n(3, resid)

## # A tibble: 3 x 5
##   date           n wday  resid term 
##   <date>     <int> <ord> <dbl> <fct>
## 1 2013-11-30   857 Sat   112.  fall 
## 2 2013-12-01   987 Sun    95.5 fall 
## 3 2013-12-28   814 Sat    69.4 fall

#Highly Positive residuals indicate that the model underevaluated the number of flights during at these dates. Let's look at what these dates were:

#November 30th, 2013 was the Saturday right after Black Friday (November 29th, 2013) and the first weekend day after Thanksgiving.
#December 1st, 2013 was the Sunday closing the weekend following Thanksgiving.
#December 28th, 2013 was the Saturday between Christmas and New Year's Eve.

#The important aspect of these 3 days is that it is a weekend day right after or in-between holidays that occured during a week day. It would not necessarily be the same exact date in other years.

Question #3

Create a new variable that splits the “wday” variable into terms, but only for Saturdays, i.e., it should have Thurs, Fri, but Sat-summer, Sat-spring, Sat-fall. How does this model compare with the model with every combination of “wday” and “term” (mod2)?

#We create the variable (wday_newnames) by filtering by Day (Saturday, as "Sat") and by term (Summer, Fall and Spring. There is no Winter term). We then rename all these filtered items.
daily<- daily %>%
  mutate(
    wday_newnames =
      case_when(
        wday == "Sat" & term == "summer" ~ "Sat-summer",
        wday == "Sat" & term == "fall" ~ "Sat-fall",
        wday == "Sat" & term == "spring" ~ "Sat-spring",
        TRUE ~ as.character(wday)
      )
  )

#New model:
Mod_Sat_term<- lm(n ~ wday_newnames, data = daily)

#Summary mod2
summary(mod2)

## 
## Call:
## lm(formula = n ~ wday * term, data = daily)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -326.33   -8.00    9.82   20.67  141.00 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        912.8684     3.7086 246.151  < 2e-16 ***
## wday.L             -71.4674     9.8349  -7.267 2.49e-12 ***
## wday.Q            -161.0112     9.8196 -16.397  < 2e-16 ***
## wday.C             -65.6215     9.8068  -6.691 8.98e-11 ***
## wday^4             -82.0997     9.8412  -8.342 1.78e-15 ***
## wday^5              -1.3425     9.7787  -0.137   0.8909    
## wday^6             -12.4501     9.7903  -1.272   0.2043    
## termsummer          37.6922     6.3336   5.951 6.56e-09 ***
## termfall             3.7598     5.5033   0.683   0.4949    
## wday.L:termsummer   -6.2215    16.8049  -0.370   0.7114    
## wday.Q:termsummer   26.4121    16.7501   1.577   0.1158    
## wday.C:termsummer   26.7451    16.7885   1.593   0.1121    
## wday^4:termsummer   23.5933    16.7523   1.408   0.1599    
## wday^5:termsummer  -13.1343    16.7720  -0.783   0.4341    
## wday^6:termsummer   -5.0026    16.6744  -0.300   0.7643    
## wday.L:termfall    -31.9048    14.5607  -2.191   0.0291 *  
## wday.Q:termfall     -0.6959    14.5707  -0.048   0.9619    
## wday.C:termfall     -8.8349    14.5417  -0.608   0.5439    
## wday^4:termfall     -9.8570    14.5899  -0.676   0.4997    
## wday^5:termfall     -3.2107    14.5228  -0.221   0.8252    
## wday^6:termfall     -8.2061    14.5769  -0.563   0.5738    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 46.17 on 344 degrees of freedom
## Multiple R-squared:  0.7573, Adjusted R-squared:  0.7432 
## F-statistic: 53.67 on 20 and 344 DF,  p-value: < 2.2e-16

#Summary Mod_Sat_term
summary(Mod_Sat_term)

## 
## Call:
## lm(formula = n ~ wday_newnames, data = daily)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -331.75   -8.81   11.31   23.64  141.00 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              967.462      6.568 147.308  < 2e-16 ***
## wday_newnamesMon           7.346      9.288   0.791   0.4295    
## wday_newnamesSat-fall   -251.462     12.951 -19.416  < 2e-16 ***
## wday_newnamesSat-spring -230.143     12.045 -19.107  < 2e-16 ***
## wday_newnamesSat-summer -166.545     15.167 -10.981  < 2e-16 ***
## wday_newnamesSun         -75.981      9.288  -8.181 5.06e-15 ***
## wday_newnamesThu          -1.712      9.288  -0.184   0.8539    
## wday_newnamesTue         -16.103      9.244  -1.742   0.0824 .  
## wday_newnamesWed          -4.769      9.288  -0.513   0.6079    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 47.36 on 356 degrees of freedom
## Multiple R-squared:  0.7357, Adjusted R-squared:  0.7297 
## F-statistic: 123.8 on 8 and 356 DF,  p-value: < 2.2e-16

#AIC comparison

AIC(mod2)

## [1] 3855.73

AIC(Mod_Sat_term)

## [1] 3862.885

#Compare the residuals of the two models
daily %>%
  gather_residuals(sat_term = Mod_Sat_term, Mod_alldays_term = mod2) %>%
  ggplot(aes(date, resid, colour = model)) +
  geom_line(alpha = 0.75)

#Adjusted R-square are very close (slightly lower for the new model), AIC (higher AIC, indicating less goodness of fit, for the new model). Residual plot is difficult to interpret. Overall, it seems that the new model doesn't necessarily give better results than the previous one.

Question #4

Create a new “wday” variable that combines the day of week, term(for Saturdays), and public holidays. What do the residuals of the model look like?

#Using google, we identify and attribute the names of the holidays
holidays2013<-
  tribble(
    ~HolidayName, ~date,
    "New Year's Day", 20130101,
    "MLK Day", 20130121,
    "Washington's Birthday", 20130218,
    "Memorial Day", 20130527,
    "Independence Day", 20130704,
    "Labor Day", 20130902,
    "Columbus Day", 20131028,
    "Veteran's Day", 20131111,
    "Thanksgiving", 20131128,
    "Christmas", 20131225
  )

holidays2013

## # A tibble: 10 x 2
##    HolidayName               date
##    <chr>                    <dbl>
##  1 New Year's Day        20130101
##  2 MLK Day               20130121
##  3 Washington's Birthday 20130218
##  4 Memorial Day          20130527
##  5 Independence Day      20130704
##  6 Labor Day             20130902
##  7 Columbus Day          20131028
##  8 Veteran's Day         20131111
##  9 Thanksgiving          20131128
## 10 Christmas             20131225

#Adding the holidays to our dataset
daily <- daily %>%
  mutate(
    wday_Satterm_holidays =
      case_when(
        date %in% holidays2013$date ~ "holiday",
        .$wday == "Sat" & .$term == "summer" ~ "Sat-summer",
        .$wday == "Sat" & .$term == "fall" ~ "Sat-fall",
        .$wday == "Sat" & .$term == "spring" ~ "Sat-spring",
        TRUE ~ as.character(.$wday)
      )
  )

#NewModel

Mod_Sat_term_Holidays <- lm(n ~ wday_Satterm_holidays, data = daily)

#Summary

summary(Mod_Sat_term_Holidays)

## 
## Call:
## lm(formula = n ~ wday_Satterm_holidays, data = daily)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -331.75   -8.81   11.31   23.64  141.00 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      967.462      6.568 147.308  < 2e-16 ***
## wday_Satterm_holidaysMon           7.346      9.288   0.791   0.4295    
## wday_Satterm_holidaysSat-fall   -251.462     12.951 -19.416  < 2e-16 ***
## wday_Satterm_holidaysSat-spring -230.143     12.045 -19.107  < 2e-16 ***
## wday_Satterm_holidaysSat-summer -166.545     15.167 -10.981  < 2e-16 ***
## wday_Satterm_holidaysSun         -75.981      9.288  -8.181 5.06e-15 ***
## wday_Satterm_holidaysThu          -1.712      9.288  -0.184   0.8539    
## wday_Satterm_holidaysTue         -16.103      9.244  -1.742   0.0824 .  
## wday_Satterm_holidaysWed          -4.769      9.288  -0.513   0.6079    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 47.36 on 356 degrees of freedom
## Multiple R-squared:  0.7357, Adjusted R-squared:  0.7297 
## F-statistic: 123.8 on 8 and 356 DF,  p-value: < 2.2e-16

#Show the residuals in a plot

daily %>%
  add_residuals(Mod_Sat_term_Holidays) %>%
  ggplot(aes(date, resid)) +
  geom_hline(yintercept = 0, size = 1, color = "red") +
  geom_line()+
  geom_smooth(se = FALSE, span = 0.20)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#Compare via plot
daily %>%
  gather_residuals(sat_term_holidays = Mod_Sat_term_Holidays, Mod_alldays_term = mod2) %>%
  ggplot(aes(date, resid, colour = model)) +
  geom_line(alpha = 0.75)

#Compare via AIC
AIC(Mod_Sat_term_Holidays)

## [1] 3862.885

AIC(mod2)

## [1] 3855.73

#The new model, taking into account the holidays, does not seem to improve the residuals: we still have the same extreme residuals, meaning that including the holidays does not contribute to a better explanation of our observed data.

Question #5

What happens if you fit a day-of-week effect that varies by month (i.e.m n ~ wday*month)? Why is this not very helpful?

#New model, including the interaction of day and month
Mod_Interaction_Day_Month <- lm(n ~ wday * month(date), data = daily)

#Residuals

daily %>%
  add_residuals(Mod_Interaction_Day_Month) %>%
  ggplot(aes(date, resid)) +
  geom_hline(yintercept = 0, size = 1, color = "red") +
  geom_line()+
  geom_smooth(se = FALSE, span = 0.20)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#Compare via plot
daily %>%
  gather_residuals(InteractionDayMonth = Mod_Interaction_Day_Month, Mod_alldays_term = mod2) %>%
  ggplot(aes(date, resid, colour = model)) +
  geom_line(alpha = 0.75)

#Looking at the comparison plot, it looks like the extremes...become more extreme compared to our first model (especially during summer and fall). This is of course not a good thing.
#The interaction of day of the week and month reduces the number of observations, and therefore increase the standard errors (higher uncertainty).

Question #6

What would you expect the model n ~ wday + ns(date,5) to look like? Knowing what you know about the data, why would you expect it not to be particularly effective?

#We already identified that the day of the week did not explain better the residuals. Therefore, removing this interaction is not expected to provide a better fit.

Question #7

We hypothesized that people leaving on Sundays are more likely to be business travelers who need to be somewhere on Monday.
Explore the hypothesis by seeing how if breaks down based on distance and time: if it’s true, you’d expect to see more Sunday evening flights (after 6pm) to places that are far away.

#Distance, per day

flights %>%
  mutate(
    date = make_date(year, month, day),
    wday = wday(date, label = TRUE)
  ) %>%
  ggplot(aes(y = distance, x = wday)) +
  geom_boxplot() +
  labs(x = "Day", y = "Distance")

#distance, per day

flights %>%
  mutate(
    date = make_date(year, month, day),
    wday = wday(date, label = TRUE),
    timeOfDay = factor(case_when(
      hour < 12 ~ "Morning",
      hour < 18 ~ "Afternoon",
      hour >=18 ~ "Evening"
    ),
    levels = c("Morning",
      "Afternoon", "Evening"
    )
    )
  ) %>%
  group_by(timeOfDay) %>%
  ggplot(aes(x = timeOfDay, y = distance))+
  geom_boxplot()+
  labs(x= "TIme of Day", y = "Distance")

#distance, per hour, per day

flights %>%
  mutate(
    date = make_date(year, month, day),
    wday = wday(date, label = TRUE),
    timeOfDay = factor(case_when(
      hour < 12 ~ "Morning",
      hour < 18 ~ "Afternoon",
      hour >=18 ~ "Evening"
    ),
    levels = c("Morning",
      "Afternoon", "Evening"
    )
    )
  ) %>%
  group_by(wday, timeOfDay) %>%
  ggplot(aes(x = timeOfDay, y = distance))+
  geom_boxplot()+
  labs(x= "TIme of Day", y = "Distance")+
  facet_wrap(~wday, nrow=2)

#Number of flights, per hour, per day

flights %>%
  mutate(
    date = make_date(year, month, day),
    wday = wday(date, label = TRUE),
    timeOfDay = factor(case_when(
      hour < 12 ~ "Morning",
      hour < 18 ~ "Afternoon",
      hour >=18 ~ "Evening"
    ),
    levels = c("Morning",
      "Afternoon", "Evening"
    )
    )
  ) %>%
  group_by(wday, timeOfDay) %>%
  summarize(n = n()) %>%
  ggplot(aes(x = timeOfDay, y = n))+
  geom_boxplot()+
  labs(x = "Time of the day", y = "Number of flights")+
  facet_wrap(~wday, nrow=2)

We are not able to verify our hypothesis: from the plots, it looks like: - There is not necesseraly a higher number of evening flights on Sunday, compared to other days of the week. However, we can tell that there is a significantly lower number of them on Saturday nights, compared to other days of the week. - In terms of distance, there is again no evidence that distances are higher for customers leaving with evening flights. Sundays and Saturdays seems to cover higher distances, on average. Looking at the time of the day, however, we note that the flghts of Sunday morning cover, on average, higher distance. For Saturdays, the evening flights cover, on average, higher distances.