ANLY 505 - NYCFlightsDataModel

Summarize Data

daily <- flights %>%
  mutate(date = make_date(year, month, day)) %>%
  group_by(date) %>%
  summarize(n = n())

ggplot(daily, aes(date, n)) +
  geom_line()

Investigate Daily-Weekly Pattern

daily <- daily %>%
  mutate(wday = wday(date, label = TRUE))
ggplot(daily, aes(wday,n)) +
  geom_boxplot()

mod = lm(n ~ wday, data = daily, na.action = na.warn)

grid <- daily %>%
  data_grid(wday) %>%
  add_predictions(mod, "n")

ggplot(daily, aes(wday, n)) +
  geom_boxplot() +
  geom_point(data = grid, color = "orange", size = 4)

Investigate residuals

daily <- daily %>%
  add_residuals(mod)

daily %>%
  ggplot(aes(date, resid)) +
  geom_ref_line(h = 0) +
  geom_line()

ggplot(daily, aes(date, resid, color = wday)) +
  geom_ref_line(h = 0, colour = "red") +
  geom_line()

daily %>%
  filter(resid < -100)

## # A tibble: 11 x 4
##    date           n wday  resid
##    <date>     <int> <ord> <dbl>
##  1 2013-01-01   842 Tue   -109.
##  2 2013-01-20   786 Sun   -105.
##  3 2013-05-26   729 Sun   -162.
##  4 2013-07-04   737 Thu   -229.
##  5 2013-07-05   822 Fri   -145.
##  6 2013-09-01   718 Sun   -173.
##  7 2013-11-28   634 Thu   -332.
##  8 2013-11-29   661 Fri   -306.
##  9 2013-12-24   761 Tue   -190.
## 10 2013-12-25   719 Wed   -244.
## 11 2013-12-31   776 Tue   -175.

daily %>%
  ggplot(aes(date, resid)) +
  geom_ref_line(h = 0, colour = "red", size = 1) +
  geom_line(color = "grey50") +
  geom_smooth(se = FALSE, span = 0.20)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Seasonal Saturday effect

daily %>%
  filter(wday == "Sat") %>%
  ggplot(aes(date, n)) +
  geom_point()+
  geom_line() +
  scale_x_date(
    NULL,
    date_breaks = "1 month",
    date_labels = "%b"
  )

Add Seasonal Variable

term <- function(date) {
  cut(date,
      breaks = ymd(20130101, 20130605, 20130825, 20140101),
      labels = c("spring", "summer", "fall")
      )
}

daily <- daily %>%
  mutate(term = term(date))

daily %>%
  filter(wday == "Sat") %>%
  ggplot(aes(date, n, color = term)) +
  geom_point(alpha = 1/3)+
  geom_line() +
  scale_x_date(
    NULL,
    date_breaks = "1 month",
    date_labels = "%b"
  )

daily %>%
  ggplot(aes(wday, n, color = term)) +
  geom_boxplot()

mod1 <- lm(n ~ wday, data = daily, na.action = na.warn)
mod2 <- lm(n ~ wday * term, data = daily, na.action = na.warn)

daily %>%
  gather_residuals(without_term = mod1, with_term = mod2) %>%
  ggplot(aes(date, resid, color = model)) +
  geom_line(alpha = 0.75)

grid <- daily %>%
  data_grid(wday, term) %>%
  add_predictions(mod2, "n")

ggplot(daily, aes(wday, n)) +
  geom_boxplot() +
  geom_point(data = grid, color = "red") +
  facet_wrap(~ term)

Better model for outliers (Robust regression)

mod3 <- MASS::rlm(n ~ wday * term, data = daily, na.action = na.warn)

daily %>%
  add_residuals(mod3, "resid") %>%
  ggplot(aes(date, resid)) +
  geom_hline(yintercept = 0, size = 2, color = "red") +
  geom_line()

Computed Variables

# If you are creating variables it might be a good idea to bundle the creation of the variables up into a function
compute_vars <- function(data) {
  data %>%
    mutate(term = term(date),
           wday = wday(date, label = TRUE)
           )
}

# Another option would be to put the transformations directly in the model formula:

wday2 <- function(x) wday(x, label = TRUE)
mod3 <- lm(n ~ wday2(date) * term(date), data = daily, na.action = na.warn)

Time of Year: An Alternative Approach

# We could use a more flexible model to capture the pattern of school term in the data
library(splines)
mod <- MASS::rlm(n ~ wday * ns(date, 5), data = daily, na.action = na.warn)

daily %>% 
  data_grid(wday, date = seq_range(date, n = 13)) %>% 
  add_predictions(mod) %>% 
  ggplot(aes(date, pred, color = wday)) +
  geom_line() +
  geom_point()

# We see a strong pattern in the numbers of Sat flights.  This is reassuring, because we also saw that pattern in the raw data.  It's a good sign when you get the same signal from different approaches.

Question #1

Why are there fewer than expected flights on January 20, May 26 and September 1? (Hint: they all have the same explanation.) How would these days generalize into another year?

# Use this chunk to answer question 1
x <- as.Date("2013-01-20")
y <- as.Date("2013-05-26")
z <- as.Date("2013-09-1")
weekdays(x)

## [1] "Sunday"

weekdays(y)

## [1] "Sunday"

weekdays(z)

## [1] "Sunday"

x1 <- as.Date("2014-01-20")
y1 <- as.Date("2014-05-26")
z1 <- as.Date("2014-09-1")
weekdays(x1)

## [1] "Monday"

weekdays(y1)

## [1] "Monday"

weekdays(z1)

## [1] "Monday"

Becuase all of them are Sunday in 2013. More specifically, they’re holiday weekends. In 2014, they’ll all be Monday with a different number of flights since the whole context changes.

Question #2

What do the three days with high positive residuals represent? How would these days generalize to another year?

# Use this chunk to answer question 2
daily %>%
  top_n(3, resid)

## # A tibble: 3 x 5
##   date           n wday  resid term 
##   <date>     <int> <ord> <dbl> <fct>
## 1 2013-11-30   857 Sat   112.  fall 
## 2 2013-12-01   987 Sun    95.5 fall 
## 3 2013-12-28   814 Sat    69.4 fall

The high residuals indicate that they had more than expected flights on those dates. All of the three days fall on weekends - Saturday or Sunday. If we were looking at another year, the residuals might be different since the same dates may not be weekends in other years.

Question #3

Create a new variable that splits the “wday” variable into terms, but only for Saturdays, i.e., it should have Thurs, Fri, but Sat-summer, Sat-spring, Sat-fall. How does this model compare with the model with every combination of “wday” and “term”?

# Use this chunk to answer question 3

daily <- daily %>%
  mutate(disp = case_when (wday == "Sat" & term == "spring" ~ "Sat-spring",
                           wday == "Sat" & term == "summer" ~ "Sat-summer",
                           wday == "Sat" & term == "fall" ~ "Sat-fall",
                           T ~ as.character(wday)))

model1 <- lm(n ~disp, data = daily)

daily %>%
  spread_residuals (sat_term = model1, all_interact = mod2) %>%
  mutate (residdiff = sat_term - all_interact) %>%
  ggplot (aes(date, residdiff)) +
  geom_ref_line (h = 0, colour = "pink") +
  geom_line (alpha = 0.75, size = 0.5, color = "grey") +
  labs (x = "Date", y="Residuals difference") + ggtitle("Model with weekday and term") +
  theme_dark()

summary(model1)

## 
## Call:
## lm(formula = n ~ disp, data = daily)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -331.75   -8.81   11.31   23.64  141.00 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     967.462      6.568 147.308  < 2e-16 ***
## dispMon           7.346      9.288   0.791   0.4295    
## dispSat-fall   -251.462     12.951 -19.416  < 2e-16 ***
## dispSat-spring -230.143     12.045 -19.107  < 2e-16 ***
## dispSat-summer -166.545     15.167 -10.981  < 2e-16 ***
## dispSun         -75.981      9.288  -8.181 5.06e-15 ***
## dispThu          -1.712      9.288  -0.184   0.8539    
## dispTue         -16.103      9.244  -1.742   0.0824 .  
## dispWed          -4.769      9.288  -0.513   0.6079    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 47.36 on 356 degrees of freedom
## Multiple R-squared:  0.7357, Adjusted R-squared:  0.7297 
## F-statistic: 123.8 on 8 and 356 DF,  p-value: < 2.2e-16

From the charts above and the summary of two models, we can see the model with Sat term has higher residual and higher standard error. Therefore, the model with all the interaction might be a better fit for the data.

Question #4

Create a new “wday” variable that combines the day of week, term(for Saturdays), and public holidays. What do the residuals of the model look like?

# Use this chunk to answer question 4
holidays <- ymd (20130101, 20130120, 20130526, 
                 20130704, 20130705, 20130901, 
                 20131128, 20131129, 20131130, 
                 20131201, 20131224, 20131225, 
                 20131231)

daily <- daily %>%
  mutate(disp2 = case_when(date %in% holidays ~ "holidays", T ~ as.character(disp)))

model2 <- lm(n ~ disp2, data = daily)
daily <- daily %>%
  add_residuals(model2, "resid2")

ggplot (daily, aes(date, resid2)) +
  geom_ref_line(h = 0, colour = "pink", size = 1) +
  geom_line(color = "grey") +
  labs(x = "Period", y = "Residuals") + ggtitle ("Model with term, day of week and holidays") +
  theme_dark()

From the chart that we can tell the holidays do affect the number of flights and they’re special compared to other weekends.The number of flights are lower in Jan to Mar and higher in Nov to Jan, which is right in the holiday season, than the model predict using data by week.In those season without a lot of special holidays, the model matches better to the actual flights.

Question #5

What happens if you fit a day-of-week effect that varies by month (i.e.m n ~ wday*month)? Why is this not very helpful?

# Use this chunk to answer question 5
tdata <- daily %>%
  mutate(month = month(date, label = T))
disp3 <- lm(n ~ wday * month, data = tdata)
tdata %>%
  add_residuals(disp3, "disp3") %>%
  ggplot (aes(date, disp3)) +
  geom_ref_line (h = 0, colour = "pink", size = 1) +
  geom_line (color = "grey", size = 0.5) +
  labs (x = "Period", y = "Residuals") + 
  ggtitle ("Model with day of week varying by month") +
  theme_dark()

If we only apply to month, we can’t see the variety between weekdays, which makes the whole model weaker.

Question #6

What would you expect the model n ~ wday + ns(date,5) to look like? Knowing what you know about the data, why would you expect it not to be particularly effective?

# Use this chunk to answer question 6
disp4 <- lm(n ~ wday + splines::ns(date, df = 5), data = daily)

daily %>%
  add_residuals(disp4, "resid4") %>%
  ggplot (aes(date, resid4)) +
  geom_ref_line(h = 0, colour = "pink", size = 1) +
  geom_line(size = 0.5, color = "grey") +
  labs(x = "Period", y = "Residuals") + 
  ggtitle("model n~wday + ns (date,5)") +
  theme_dark()

A week has 7 days from Monday to Sunday. If we add 5 days into the data, the order of a week would be messed up and we would not find the connection between different weekdays, which makes the whole analytics not effective.

Question #7

We hypothesized that people leaving on Sundays are more likely to be business travelers who need to be somewhere on Monday. Explore the hypothesis by seeing how if breaks down based on distance and time: if it’s true, you’d expect to see more Sunday evening flights to places that are far away.

# Use this chunk to answer question 7
tdata2 <- flights %>%
  mutate (date = make_date (year, month, day),
          wday = wday (date, label = T)) %>%
  select (date, wday, distance, air_time, hour)

tdata2 %>%
  ggplot (aes(wday, distance)) + geom_boxplot() +
  labs (x="Week", y="Traveling distance (miles)") + ggtitle("Sunday departures analysis") + theme_dark()

library(kableExtra)

## 
## Attaching package: 'kableExtra'

## The following object is masked from 'package:dplyr':
## 
##     group_rows

tdata2 %>%
  group_by (wday) %>%
  summarise ("Mean distance" = round(mean(distance)),
            "Median distance" = median(distance),
            "Mean time" = round(mean(air_time, na.rm = T)),
            "Median time" = median(air_time, na.rm = T)) %>%
  rename (Week = wday) %>%
  kable() %>% kable_styling()

Week	Mean distance	Median distance	Mean time	Median time
Sun	1055	937	152	130
Mon	1032	828	151	129
Tue	1027	820	150	128
Wed	1028	820	150	128
Thu	1033	828	150	128
Fri	1033	828	150	127
Sat	1081	944	154	134

As we can see in the chart and the graph, although saturday and sunday have more deviation, those are all very slightly different. Therefore, the data cannot support the assumption.