ANLY505 Homework NYC Flights

Summarize Data

daily <- flights %>%
  mutate(date = make_date(year, month, day)) %>%
  group_by(date) %>%
  summarize(n = n())

ggplot(daily, aes(date, n)) +
  geom_line()

Investigate Daily-Weekly Pattern

daily <- daily %>%
  mutate(wday = wday(date, label = TRUE))
ggplot(daily, aes(wday,n)) +
  geom_boxplot()

mod = lm(n ~ wday, data = daily)

grid <- daily %>%
  data_grid(wday) %>%
  add_predictions(mod, "n")

ggplot(daily, aes(wday, n)) +
  geom_boxplot() +
  geom_point(data = grid, color = "orange", size = 4)

Investigate residuals

daily <- daily %>%
  add_residuals(mod)

daily %>%
  ggplot(aes(date, resid)) +
  geom_ref_line(h = 0) +
  geom_line()

ggplot(daily, aes(date, resid, color = wday)) +
  geom_ref_line(h = 0, colour = "red") +
  geom_line()

daily %>%
  filter(resid < -100)

## # A tibble: 11 x 4
##    date           n wday  resid
##    <date>     <int> <ord> <dbl>
##  1 2013-01-01   842 Tue   -109.
##  2 2013-01-20   786 Sun   -105.
##  3 2013-05-26   729 Sun   -162.
##  4 2013-07-04   737 Thu   -229.
##  5 2013-07-05   822 Fri   -145.
##  6 2013-09-01   718 Sun   -173.
##  7 2013-11-28   634 Thu   -332.
##  8 2013-11-29   661 Fri   -306.
##  9 2013-12-24   761 Tue   -190.
## 10 2013-12-25   719 Wed   -244.
## 11 2013-12-31   776 Tue   -175.

daily %>%
  ggplot(aes(date, resid)) +
  geom_ref_line(h = 0, colour = "red", size = 1) +
  geom_line(color = "grey50") +
  geom_smooth(se = FALSE, span = 0.20)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Seasonal Saturday effect

daily %>%
  filter(wday == "Sat") %>%
  ggplot(aes(date, n)) +
  geom_point()+
  geom_line() +
  scale_x_date(
    NULL,
    date_breaks = "1 month",
    date_labels = "%b"
  )

Add Seasonal Variable

term <- function(date) {
  cut(date,
      breaks = ymd(20130101, 20130605, 20130825, 20140101),
      labels = c("spring", "summer", "fall")
      )
}

daily <- daily %>%
  mutate(term = term(date))

daily %>%
  filter(wday == "Sat") %>%
  ggplot(aes(date, n, color = term)) +
  geom_point(alpha = 1/3)+
  geom_line() +
  scale_x_date(
    NULL,
    date_breaks = "1 month",
    date_labels = "%b"
  )

daily %>%
  ggplot(aes(wday, n, color = term)) +
  geom_boxplot()

mod1 <- lm(n ~ wday, data = daily)
mod2 <- lm(n ~ wday * term, data = daily)

daily %>%
  gather_residuals(without_term = mod1, with_term = mod2) %>%
  ggplot(aes(date, resid, color = model)) +
  geom_line(alpha = 0.75)

grid <- daily %>%
  data_grid(wday, term) %>%
  add_predictions(mod2, "n")

ggplot(daily, aes(wday, n)) +
  geom_boxplot() +
  geom_point(data = grid, color = "red") +
  facet_wrap(~ term)

Better model for outliers (Robust regression)

mod3 <- MASS::rlm(n ~ wday * term, data = daily)

daily %>%
  add_residuals(mod3, "resid") %>%
  ggplot(aes(date, resid)) +
  geom_hline(yintercept = 0, size = 2, color = "red") +
  geom_line()

Computed Variables

# If you are creating variables it might be a good idea to bundle the creation of the variables up into a function
compute_vars <- function(data) {
  data %>%
    mutate(term = term(date),
           wday = wday(date, label = TRUE)
           )
}

# Another option would be to put the transformations directly in the model formula:

wday2 <- function(x) wday(x, label = TRUE)
mod3 <- lm(n ~ wday2(date) * term(date), data = daily)

Time of Year: An Alternative Approach

# We could use a more flexible model to capture the pattern of school term in the data
library(splines)
mod4 <- MASS::rlm(n ~ wday * ns(date, 5), data = daily)

daily %>% 
  data_grid(wday, date = seq_range(date, n = 13)) %>% 
  add_predictions(mod4) %>% 
  ggplot(aes(date, pred, color = wday)) +
  geom_line() +
  geom_point()

# We see a strong pattern in the numbers of Sat flights.  This is reassuring, because we also saw that pattern in the raw data.  It's a good sign when you get the same signal from different approaches.

Question #1

Why are there fewer than expected flights on January 20, May 26 and September 1? (Hint: they all have the same explanation.) How would these days generalize into another year?

# In Year 2013, January 21st is the MLK day, and 20th is the Sunday before that; May 27th is the Memorial Day, and 26th is the Sunday before that; September 2nd is the Labor Day, and 1st is the Sunday before that.

# The similarity among these 3 days is that they are all Sundays before holidays. To generalize these days, (1) the first day should be the Sunday before the third Monday of January; (2) the second day should be the Sunday before the last Monday of May; (3) the third day should be the Sunday before the first Monday of September.

Question #2

What do the three days with high positive residuals represent? How would these days generalize to another year?

daily %>%
  top_n(3, resid)

## # A tibble: 3 x 5
##   date           n wday  resid term 
##   <date>     <int> <ord> <dbl> <fct>
## 1 2013-11-30   857 Sat   112.  fall 
## 2 2013-12-01   987 Sun    95.5 fall 
## 3 2013-12-28   814 Sat    69.4 fall

# Nov.30th and Dec.1st are the Saturday and Sunday after Black Friday, and Dec.28th is the Saturday before New Year Eve.

# To generalze these 3 days: (1) the first 2 days are the fourth Saturday and Sunday in November; (2) the third day is the last Saturday of the Year.

Question #3

Create a new variable that splits the “wday” variable into terms, but only for Saturdays, i.e., it should have Thurs, Fri, but Sat-summer, Sat-spring, Sat-fall. How does this model compare with the model with every combination of “wday” and “term”?

daily$term2<-ifelse(daily$wday=="Sun","Sun",
        ifelse(daily$wday=="Mon","Mon",
        ifelse(daily$wday=="Tue","Tue",
        ifelse(daily$wday=="Wed","Wed",
        ifelse(daily$wday=="Thu","Thu",
        ifelse(daily$wday=="Fri","Fri", paste("Sat -",daily$term)))))))

mod5 <- lm(n ~ wday * term2, data = daily)

daily %>%
  gather_residuals(old = mod2, new = mod5) %>%
  ggplot(aes(date, resid, color = model)) +
  geom_line(alpha = 0.75)

## Warning in predict.lm(model, data): prediction from a rank-deficient fit
## may be misleading

could not see the difference between orginal wday_term and the new wday_term2.

Question #4

Create a new “wday” variable that combines the day of week, term(for Saturdays), and public holidays. What do the residuals of the model look like?

daily$term3<-ifelse(daily$date=="2013-01-01","Holiday",
                 ifelse(daily$date=="2013-01-21","Holiday",
                 ifelse(daily$date=="2013-05-27","Holiday",
                 ifelse(daily$date=="2013-07-04","Holiday",
                 ifelse(daily$date=="2013-09-02","Holiday",
                 ifelse(daily$date=="2013-11-28","Holiday",
                 ifelse(daily$date=="2013-11-29","Holiday",
                 ifelse(daily$date=="2013-12-25","Holiday", daily$term2))))))))

mod6 <- lm(n ~ wday * term3, data = daily)

daily %>%
  gather_residuals(old = mod5, new = mod6) %>%
  ggplot(aes(date, resid, color = model)) +
  geom_line(alpha = 0.75)

## Warning in predict.lm(model, data): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(model, data): prediction from a rank-deficient fit
## may be misleading

the new model with holidays added in renders smaller residuals than the model without holidays.

Question #5

What happens if you fit a day-of-week effect that varies by month (i.e.m n ~ wday*month)? Why is this not very helpful?

daily$month = month(daily$date, label=TRUE)

mod7 <- lm(n ~ wday * month, data = daily)

daily %>%
  gather_residuals(old = mod6, new = mod7) %>%
  ggplot(aes(date, resid, color = model)) +
  geom_line(alpha = 0.75)

## Warning in predict.lm(model, data): prediction from a rank-deficient fit
## may be misleading

from the plot, we can see that the model with wday_month yields larger residuals than the model with wday_saturday_holiday, this might infer that the pattern of number of flights taken is less likely related to month than to holidays.

Question #6

What would you expect the model n ~ wday + ns(date,5) to look like? Knowing what you know about the data, why would you expect it not to be particularly effective?

mod4 <- MASS::rlm(n ~ wday * ns(date, 5), data = daily)

daily %>%
  add_residuals(mod4, "resid") %>%
  ggplot(aes(date, resid)) +
  geom_hline(yintercept = 0, size = 2, color = "red") +
  geom_line()

the model n ~ wday + ns(date,5) yields even larger residuals, probably because school term is not a factor contributing to the number of flights daily.

Question #7

We hypothesized that people leaving on Sundays are more likely to be business travelers who need to be somewhere on Monday. Explore the hypothesis by seeing how if breaks down based on distance and time: if it’s true, you’d expect to see more Sunday evening flights to places that are far away.

new <- flights %>%
  mutate(date = make_date(year, month, day)) %>%
  mutate(wday = wday(date, label=TRUE)) %>%
  mutate(sund = ifelse(wday=="Sun", "Sun", "non-Sun")) %>%
  mutate(time = ifelse(dep_time<1200,"morning","evening")) %>%
  mutate(dist = ifelse(distance<502, "D1",
                       ifelse(distance>=502 & distance<872, "D2",
                       ifelse(distance>=872 & distance<1389, "D3", "D4")))) %>%
  group_by(wday, time, dist)


new2=na.omit(new)
new3=count(new2)

model_10 = lm(n ~ ., data = new3)
summary(model_10)

## 
## Call:
## lm(formula = n ~ ., data = new3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1076.61  -216.75   -17.41   237.71   919.39 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6662.54     130.68  50.982  < 2e-16 ***
## wday.L       -599.26     154.63  -3.876 0.000343 ***
## wday.Q       -965.26     154.63  -6.242 1.36e-07 ***
## wday.C       -318.94     154.63  -2.063 0.044945 *  
## wday^4       -458.82     154.63  -2.967 0.004799 ** 
## wday^5       -110.36     154.63  -0.714 0.479075    
## wday^6        -51.82     154.63  -0.335 0.739077    
## timemorning -2357.21     116.89 -20.167  < 2e-16 ***
## distD2        495.29     165.30   2.996 0.004435 ** 
## distD3        256.50     165.30   1.552 0.127742    
## distD4        694.36     165.30   4.200 0.000124 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 437.4 on 45 degrees of freedom
## Multiple R-squared:  0.9165, Adjusted R-squared:  0.898 
## F-statistic: 49.41 on 10 and 45 DF,  p-value: < 2.2e-16

from the moewl, we can see that flights in the Morning or Evening plays an significant role, and the flights in the evening is significantly more than those in the monring; but the distance of travel is not very significant, which actually makes sense, because the distance traveled is not predictable.