NYCFlightsDataModel

Summarize Data

daily <- flights %>%
  mutate(date = make_date(year, month, day)) %>%
  group_by(date) %>%
  summarize(n = n())

ggplot(daily, aes(date, n)) +
  geom_line()

Investigate Daily-Weekly Pattern

daily <- daily %>%
  mutate(wday = wday(date, label = TRUE))
ggplot(daily, aes(wday,n)) +
  geom_boxplot()

mod = lm(n ~ wday, data = daily)

grid <- daily %>%
  data_grid(wday) %>%
  add_predictions(mod, "n")

ggplot(daily, aes(wday, n)) +
  geom_boxplot() +
  geom_point(data = grid, color = "orange", size = 4)

Investigate residuals

daily <- daily %>%
  add_residuals(mod)

daily %>%
  ggplot(aes(date, resid)) +
  geom_ref_line(h = 0) +
  geom_line()

ggplot(daily, aes(date, resid, color = wday)) +
  geom_ref_line(h = 0, colour = "red") +
  geom_line()

daily %>%
  filter(resid < -100)

## # A tibble: 11 x 4
##    date           n wday  resid
##    <date>     <int> <ord> <dbl>
##  1 2013-01-01   842 Tue   -109.
##  2 2013-01-20   786 Sun   -105.
##  3 2013-05-26   729 Sun   -162.
##  4 2013-07-04   737 Thu   -229.
##  5 2013-07-05   822 Fri   -145.
##  6 2013-09-01   718 Sun   -173.
##  7 2013-11-28   634 Thu   -332.
##  8 2013-11-29   661 Fri   -306.
##  9 2013-12-24   761 Tue   -190.
## 10 2013-12-25   719 Wed   -244.
## 11 2013-12-31   776 Tue   -175.

daily %>%
  ggplot(aes(date, resid)) +
  geom_ref_line(h = 0, colour = "red", size = 1) +
  geom_line(color = "grey50") +
  geom_smooth(se = FALSE, span = 0.20)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Seasonal Saturday effect

daily %>%
  filter(wday == "Sat") %>%
  ggplot(aes(date, n)) +
  geom_point()+
  geom_line() +
  scale_x_date(
    NULL,
    date_breaks = "1 month",
    date_labels = "%b"
  )

Add Seasonal Variable

term <- function(date) {
  cut(date,
      breaks = ymd(20130101, 20130605, 20130825, 20140101),
      labels = c("spring", "summer", "fall")
      )
}

daily <- daily %>%
  mutate(term = term(date))

daily %>%
  filter(wday == "Sat") %>%
  ggplot(aes(date, n, color = term)) +
  geom_point(alpha = 1/3)+
  geom_line() +
  scale_x_date(
    NULL,
    date_breaks = "1 month",
    date_labels = "%b"
  )

daily %>%
  ggplot(aes(wday, n, color = term)) +
  geom_boxplot()

mod1 <- lm(n ~ wday, data = daily)
mod2 <- lm(n ~ wday * term, data = daily)

daily %>%
  gather_residuals(without_term = mod1, with_term = mod2) %>%
  ggplot(aes(date, resid, color = model)) +
  geom_line(alpha = 0.75)

grid <- daily %>%
  data_grid(wday, term) %>%
  add_predictions(mod2, "n")

ggplot(daily, aes(wday, n)) +
  geom_boxplot() +
  geom_point(data = grid, color = "red") +
  facet_wrap(~ term)

Better model for outliers (Robust regression)

mod3 <- MASS::rlm(n ~ wday * term, data = daily)

daily %>%
  add_residuals(mod3, "resid") %>%
  ggplot(aes(date, resid)) +
  geom_hline(yintercept = 0, size = 2, color = "red") +
  geom_line()

Computed Variables

# If you are creating variables it might be a good idea to bundle the creation of the variables up into a function
compute_vars <- function(data) {
  data %>%
    mutate(term = term(date),
           wday = wday(date, label = TRUE)
           )
}

# Another option would be to put the transformations directly in the model formula:

wday2 <- function(x) wday(x, label = TRUE)
mod3 <- lm(n ~ wday2(date) * term(date), data = daily)

Time of Year: An Alternative Approach

# We could use a more flexible model to capture the pattern of school term in the data
library(splines)
mod <- MASS::rlm(n ~ wday * ns(date, 5), data = daily)

daily %>% 
  data_grid(wday, date = seq_range(date, n = 13)) %>% 
  add_predictions(mod) %>% 
  ggplot(aes(date, pred, color = wday)) +
  geom_line() +
  geom_point()

# We see a strong pattern in the numbers of Sat flights.  This is reassuring, because we also saw that pattern in the raw data.  It's a good sign when you get the same signal from different approaches.

Question #1

Why are there fewer than expected flights on January 20, May 26 and September 1? (Hint: they all have the same explanation.) How would these days generalize into another year?

Answer: January 20 could be Dr.MLK Jr. Day, May 26 should be Memorial Day, and Septemeber could be the Labor Day. These days are US national holidays, to generalize, Dr.MLK Jr. day is the third Monday of January, Memorial Day is the last Monday of May, and Labor Day is the first Monday in September.

Question #2

What do the three days with high positive residuals represent? How would these days generalize to another year?

daily %>%
  top_n(3, resid)

## # A tibble: 3 x 5
##   date           n wday  resid term 
##   <date>     <int> <ord> <dbl> <fct>
## 1 2013-11-30   857 Sat   112.  fall 
## 2 2013-12-01   987 Sun    95.5 fall 
## 3 2013-12-28   814 Sat    69.4 fall

Answer: Nov.30th and Dec.1st are Saturday and Sunday after Thanksgiving Day, and Dec.28th is the Saturday after Christmas. Thanksgiving Day is the last Thursday in November, Christmas Day is fixed on Dec.25th. To generalize these days, we need to use the Thanksgiving and Christmas to locate the date.

Question #3

Create a new variable that splits the “wday” variable into terms, but only for Saturdays, i.e., it should have Thurs, Fri, but Sat-summer, Sat-spring, Sat-fall. How does this model compare with the model with every combination of “wday” and “term”?

daily <- daily %>%
  mutate(
    #use wday2 to avoid conflict
    wday2 = case_when(
      wday == "Sat" & term == "summer" ~ "Sat-summer",
      wday == "Sat" & term == "fall" ~ "Sat-fall",
      wday == "Sat" & term == "spring" ~ "Sat-spring",
      TRUE ~ as.character(wday)
    )
  )

#mod3 as linear model wday with Sat by term
mod3 = lm(n ~ wday2, data = daily)


daily %>% 
  gather_residuals(splits_wday = mod3, wday_by_term = mod2) %>% 
  arrange(date)%>%
  ggplot(aes(date, resid, color = model)) +
  geom_line(alpha = 0.75)

Answer: From the plot, residuals for new variable that splits the “wday” variable into terms for Saturday doesn’t make much difference from the model with every combination, espicially national holidays such as Memorial day, Independent day, Labor day, Thanks giving and Christmas, the lines are overlapping.

Question #4

Create a new “wday” variable that combines the day of week, term(for Saturdays), and public holidays. What do the residuals of the model look like?

daily <- daily %>%
  mutate(wday3 = 
         case_when(
           # combo public holidays
           .$date %in% lubridate::ymd(c(20130101, # new year
                                        20130121, # dr MLK Jr
                                        20130218, # president day
                                        20130527, # memorial day
                                        20130704, # independence day
                                        20130902, # labor day
                                        20131028, # columbus day
                                        20131111, # veteran day
                                        20131128, # thanksgiving day
                                        20131225) # xmas
                                      ) ~  "holiday",
           #combo Sat by term
           wday == "Sat" & term == "summer" ~ "Sat-summer",
           wday == "Sat" & term == "fall" ~ "Sat-fall",
           wday == "Sat" & term == "spring" ~ "Sat-spring",
           TRUE ~ as.character(wday)))
#mod4 as linear model for weekday with Sat by term and national holidays
mod4 = lm(n ~ wday3, data = daily)
daily %>% 
  gather_residuals(wday_term_holidays = mod4, splits_wday = mod3, wday_by_term = mod2) %>%
  ggplot(aes(date, resid, color = model)) +
  geom_line(alpha = 0.75)

daily %>%
  spread_residuals(wday_by_term = mod2, wday_term_holidays = mod4) %>%
  mutate(resid_diff = wday_term_holidays - wday_by_term) %>%
  ggplot(aes(date, resid_diff)) +
  geom_line(alpha = 0.75)

Answer: The overlapping plot could not provide a clear idea about the comparision on residuals. By using spread_residuals() function, it can tell the difference between the models. According to the second plot, residuals for the model with combines the day of week, term(for Saturdays), and public holidays is higher in summer term and slightly lower in spring and fall terms.

Question #5

What happens if you fit a day-of-week effect that varies by month (i.e.m n ~ wday*month)? Why is this not very helpful?

daily <- flights %>% 
  mutate(date = make_date(year, month, day)) %>% 
  group_by(date,month) %>% 
  summarise(n = n())

daily <- daily %>% 
  mutate(wday = wday(date, label = TRUE)) %>%
  mutate(term = term(date)) 

mod2 = lm(n ~ wday * term, data = daily)
#as linear model for weekday that varies by month
mod5 = lm(n ~ wday * month, data = daily)


daily %>%
  spread_residuals(wday_by_term = mod2, fit_by_month = mod5) %>%
  mutate(resid_diff = fit_by_month - wday_by_term) %>%
  ggplot(aes(date, resid_diff)) +
  geom_line(alpha = 0.75)

Answer: A linear model fit a day-of-week effect that varies by month here will generate a much higher residuals, which means more outlier, and this is not proper for both modeling and predicting compare to the previous model.

Question #6

What would you expect the model n ~ wday + ns(date,5) to look like? Knowing what you know about the data, why would you expect it not to be particularly effective?

#mod6 as model wtih wday plus natural spline
mod6 = lm(n ~ wday + splines::ns(date, 5), data = daily)
daily %>%
  gather_residuals(wday_by_term = mod2, natural_spline = mod6) %>%
  arrange(date) %>%
  ggplot(aes(date,resid,color = model))+
  geom_line(alpha = 0.75)

Answer: The linear model added natural spline did better and stable in spring, but the estimation for summer is greater than weekday linear model, and it did an overestimation for fall.

Question #7

We hypothesized that people leaving on Sundays are more likely to be business travelers who need to be somewhere on Monday. Explore the hypothesis by seeing how if breaks down based on distance and time: if it’s true, you’d expect to see more Sunday evening flights to places that are far away.

assemble_data = flights %>%
  #mutate(date = make_date(year, month, day)) %>%
  mutate(date = make_date(year, month, day), wday = wday(date, label = TRUE))

# boxplot
assemble_data %>%
  ggplot(aes(y = distance, x = wday)) +
  geom_boxplot() +
  labs(x = "Day of Week", y = "Average Distance")

# focus on Q1~Q3
assemble_data %>%
  ggplot(aes(y = distance, x = wday)) +
  geom_boxplot() +
  labs(x = "Day of Week", y = "Average Distance") +
  scale_y_continuous(limits=c(0,2800))

## Warning: Removed 715 rows containing non-finite values (stat_boxplot).

# focus on median
assemble_data %>%
  ggplot() +
  geom_pointrange(mapping = aes(x = wday, y = distance),
  stat = "summary",
  fun.ymin = function(z) {quantile(z,0.25)},
  fun.ymax = function(z) {quantile(z,0.75)},
  fun.y = median)

# considering time with distance
assemble_data %>%
  filter( distance < 3000, hour >= 5, hour <= 21) %>%
  group_by(wday, hour) %>%
  summarise(distance = mean(distance)) %>%
  ggplot(aes(x = hour, color = wday, y = distance)) +
  geom_line()

# try with median
assemble_data %>%
  filter( distance < 3000, hour >= 5, hour <= 21) %>%
  group_by(wday, hour) %>%
  summarise(distance = median(distance)) %>%
  ggplot(aes(x = hour, color = wday, y = distance)) +
  geom_line()

# cannot support hypothesis

Answer: After comparing the average distance and time of the flights by weekday, the hypothesis that people leaving on Sundays are more likely to be business travelers who need to be somewhere on Monday cannot be proved. Therefore the hypothesis is not true.