============================================================================================================
Reference: Wickham, H., & Grolemund, G. (2016). R for data science: import, tidy, transform, visualize, and model data. “O’Reilly Media, Inc.”. 384-395.
Data dictionary: https://cran.r-project.org/web/packages/nycflights13/nycflights13.pdf

 

Exploratory data analysis (EDA)

1. Summarize data

daily <- flights %>%
  mutate(date = make_date(year, month, day)) %>%
  group_by(date) %>%
  summarize(n = n()) #aggregate the time series

ggplot(daily, aes(date, n)) + geom_line() + labs(x="Date", y="Number of flights") + ggtitle("Line chart 1")

2. Investigate daily / weekly pattern

daily <- daily %>%
  mutate(wday = wday(date, label=T))
ggplot(daily, aes(wday,n)) + geom_boxplot() + labs(x="Week", y="Number of flights") + ggtitle("Boxplot 1")

mod <- lm(n ~ wday, data = daily)

grid <- daily %>%
  data_grid(wday) %>%
  add_predictions(mod, "n")

ggplot(daily, aes(wday, n)) +
  geom_boxplot() +
  geom_point(data = grid, color = "orange", size = 4) +
  labs(x="Week", y="Number of flights") + ggtitle("Boxplot 2")

3. Investigate residuals

daily <- daily %>%
  add_residuals(mod)

# Beginner notice: use "colour" instead of "color" in "geom_ref_line()"
daily %>%
  ggplot(aes(date, resid)) +
  geom_ref_line(h = 0, colour = "red", size = 1) +
  geom_line() +
  labs(x="Date", y="Residuals") + ggtitle("Line chart 2")

ggplot(daily, aes(date, resid, color = wday)) +
  geom_ref_line(h = 0, colour = "red", size = 1) +
  geom_line() +
  labs(x="Week", y="Residuals") + ggtitle("Line chart 3")

# Beginner notice: resid<-100 can cause bug; use spaces properly
daily %>%
  filter(resid < -100) %>% kable() %>% kable_styling()
date n wday resid
2013-01-01 842 Tue -109.3585
2013-01-20 786 Sun -105.4808
2013-05-26 729 Sun -162.4808
2013-07-04 737 Thu -228.7500
2013-07-05 822 Fri -145.4615
2013-09-01 718 Sun -173.4808
2013-11-28 634 Thu -331.7500
2013-11-29 661 Fri -306.4615
2013-12-24 761 Tue -190.3585
2013-12-25 719 Wed -243.6923
2013-12-31 776 Tue -175.3585
daily %>%
  ggplot(aes(date, resid)) +
  geom_ref_line(h = 0, colour = "red", size = 1) +
  geom_line(color = "grey50") +
  geom_smooth(method = "loess", se = F, span = 0.20) +
  labs(x="Date", y="Residuals") + ggtitle("Line chart 4")

4. Seasonal Saturday effect

daily %>%
  filter(wday == "Sat") %>%
  ggplot(aes(date, n)) +
  geom_point()+
  geom_line() +
  scale_x_date(
    NULL,
    date_breaks = "1 month",
    date_labels = "%b"
  ) + labs(x="Date", y="Number of flights") + ggtitle("Line chart 5")

5. Add seasonal variable

term <- function(date) {
  cut(date,
      breaks = ymd(20130101, 20130605, 20130825, 20140101),
      labels = c("spring", "summer", "fall")
      )
}
daily <- daily %>%
  mutate(term = term(date))

daily %>%
  filter(wday == "Sat") %>%
  ggplot(aes(date, n, color = term)) +
  geom_point(alpha = 1/3) +
  geom_line() +
  scale_x_date(
    NULL,
    date_breaks = "1 month",
    date_labels = "%b"
  ) + labs(x="Date", y="Number of flights") + ggtitle("Line chart 6")

daily %>%
  ggplot(aes(wday, n, color = term)) +
  geom_boxplot() + labs(x="Week", y="Number of flights") + ggtitle("Boxplot 3")

mod2 <- lm(n ~ wday * term, data = daily)

daily %>%
  gather_residuals(without_term = mod, with_term = mod2) %>%
  ggplot(aes(date, resid, color = model)) +
  geom_line(alpha = 0.75) +
  labs(x="Date", y="Residuals") + ggtitle("Line chart 7")

grid2 <- daily %>%
  data_grid(wday, term) %>%
  add_predictions(mod2, "n")

ggplot(daily, aes(wday, n)) +
  geom_boxplot() +
  geom_point(data = grid2, color = "red") +
  facet_wrap(~ term) +
  labs(x="Week", y="Number of flights") + ggtitle("Boxplot 4")

6. Better model for outliers (Robust regression)

mod3 <- MASS::rlm(n ~ wday * term, data = daily)

daily %>%
  add_residuals(mod3, "resid3") %>%
  ggplot(aes(date, resid3)) +
  geom_hline(yintercept = 0, size = 1, color = "red") +
  geom_line() + labs(x="Date", y="Residuals") + ggtitle("Line chart 8")

7. Computed variables

# If you are creating variables it might be a good idea to bundle the creation of the variables up into a function
compute_vars <- function(data) {
  data %>%
    mutate(term = term(date),
           wday = wday(date, label=T)
           )
}

# Another option would be to put the transformations directly in the model formula:
wday2 <- function(x) {
  wday(x, label=T)
}
mod3r <- lm(n ~ wday2(date) * term(date), data = daily)

8. Time of year: An alternative approach

# We could use a more flexible model to capture the pattern of school term in the data
mod4 <- MASS::rlm(n ~ wday * splines::ns(date, 5), data = daily)

daily %>% 
  data_grid(wday, date = seq_range(date, n = 13)) %>% 
  add_predictions(mod4) %>% 
  ggplot(aes(date, pred, color = wday)) +
  geom_line() +
  geom_point() +
  labs(x="Date", y="Prediction") + ggtitle("Line chart 9")

# We see a strong pattern in the numbers of Saturday flights. This is reassuring, because we also saw that pattern in the raw data. It is a good sign when you get the same signal from different approaches.

 

Questions

Question #1

Why are there fewer than expected flights on January 20, May 26 and September 1? (Hint: they all have the same explanation.) How would these days generalize into another year?

datQ1 <- subset(daily, date %in% as.Date(c("2013-01-20","2013-05-26","2013-09-01")))
datQ1$holiday <- c("Martin Luther King Jr. Day", "Memorial Day", "Labor Day")
#datQ1 %>% kable() %>% kable_styling()

# Or follow the Lecture's scripting style
daily %>%
  filter(date %in% as.Date(c("2013-01-20","2013-05-26","2013-09-01"))) %>%
  mutate(holiday = c("Martin Luther King Jr. Day", "Memorial Day", "Labor Day")) %>%
  kable() %>% kable_styling()
date n wday resid term holiday
2013-01-20 786 Sun -105.4808 spring Martin Luther King Jr. Day
2013-05-26 729 Sun -162.4808 spring Memorial Day
2013-09-01 718 Sun -173.4808 fall Labor Day

 

ANSWER: These are the Sundays before Martin Luther King Jr. Day (the third Monday of January since 1986), Memorial Day (the last Monday of May since 1868) and Labor Day (the first Monday of September since 1882). The corresponding Sundays in 2014 will be January 19, May 25 and August 31.

 

 

Question #2

What do the three days with high positive residuals represent? How would these days generalize to another year?

daily %>%
  top_n(3, resid) %>%
  mutate(holiday = c("Thanksgiving (United States)", "Thanksgiving (United States)", "Hanukkah, Christmas, New Year")) %>%
  kable() %>% kable_styling()
date n wday resid term holiday
2013-11-30 857 Sat 112.38462 fall Thanksgiving (United States)
2013-12-01 987 Sun 95.51923 fall Thanksgiving (United States)
2013-12-28 814 Sat 69.38462 fall Hanukkah, Christmas, New Year

 

ANSWER: These are the Saturdays close to Thanksgiving Day in the United States (the fourth Thursday of November since 1621), Christmas Day (December 25 since 336) or Hanukkah (the eight nights and days starting on the 25th day of Kislev according to the Hebrew calendar since 200 B.C.), and New Year’s Day (January 1 since 45 B.C.). Traveling around popular holidays is in high demand. 2014 would witness the same case.

 

 

Question #3

Create a new variable that splits the “wday” variable into terms, but only for Saturdays, i.e., it should have Thurs, Fri, but Sat-summer, Sat-spring, Sat-fall. How does this model compare with the model with every combination of “wday” and “term”?

daily <- daily %>%
  mutate(wday2 = case_when(wday == "Sat" & term == "spring" ~ "Sat-spring",
                           wday == "Sat" & term == "summer" ~ "Sat-summer",
                           wday == "Sat" & term == "fall" ~ "Sat-fall",
                           T ~ as.character(wday)))
modQ3 <- lm(n ~wday2, data = daily)

daily %>%
  spread_residuals(sat_term = modQ3, all_interact = mod2) %>%
  mutate(residdiff = sat_term - all_interact) %>%
  ggplot(aes(date, residdiff)) +
  geom_ref_line(h = 0, colour = "red") +
  geom_line(alpha = 0.75) +
  labs(x="Date", y="Residuals difference") + ggtitle("Line chart 10")

options(scipen=100,digits=4)
rbind(broom::glance(modQ3), broom::glance(mod2)) %>%
  mutate(model = c("with Saturday and term", "with all interactions")) %>%
  kable() %>% kable_styling()
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual model
0.7357 0.7297 47.36 123.84 0 9 -1921 3863 3902 798487 356 with Saturday and term
0.7573 0.7432 46.17 53.67 0 21 -1906 3856 3942 733157 344 with all interactions

 

ANSWER: Line chart 10 shows that the model with Saturday and term has lower residuals in Spring and higher residuals in Summer than the model with all interactions.

Table shows that the model with Saturday and term has the lower adjusted R-squared and the higher standard error of the regression but the higher AIC than the model with all interactions.

 

 

Question #4

Create a new “wday” variable that combines the day of week, term(for Saturdays), and public holidays. What do the residuals of the model look like?

holidays <- ymd(20130101, 20130120, 20130526, 20130704, 20130705, 20130901, 20131128, 20131129, 20131130, 20131201, 20131224, 20131225, 20131231)
# Federal holidays in the United States in 2013 are available at https://www.officeholidays.com/countries/usa/2013.php.

daily <- daily %>%
  mutate(wday3 = case_when(date %in% holidays ~ "holidays",
                           T ~ as.character(wday2)))
modQ4 <- lm(n ~ wday3, data = daily)
daily <- daily %>%
  add_residuals(modQ4, "residQ4")

ggplot(daily, aes(date, residQ4)) +
  geom_ref_line(h = 0, colour = "red", size = 1) +
  geom_line() +
  labs(x="Date", y="Residuals") + ggtitle("Line chart 11")

 

ANSWER: Compared Line chart 11 to Line chart 2, the absolute values of the extreme residuals slightly narrow down. However, the line shape remains relatively the same. The model with Saturday, term and holidays still predict more flights in January and February, and less flights in July. And the number of flights fluctuates during Thanksgiving and Christmas holidays.

It is noticed that not all federal holidays impact traveling, e.g., Presidents’ Day (the third Monday of February since 1885), Emancipation Day (the weekday closest to April 16 since 1866), Columbus Day (the second Monday of October since 1869) and Veterans Day (November 11 since 1938). The Lunar New Year’s Day in 2013 was on February 10.

 

 

Question #5

What happens if you fit a day-of-week effect that varies by month (i.e. n ~ wday * month)? Why is this not very helpful?

dummy <- daily %>%
  mutate(month = month(date, label=T))
modQ5 <- lm(n ~ wday * month, data = dummy)
dummy %>%
  add_residuals(modQ5, "residQ5") %>%
  ggplot(aes(date, residQ5)) +
  geom_ref_line(h = 0, colour = "red", size = 1) +
  geom_line() +
  labs(x="Date", y="Residuals") + ggtitle("Line chart 12")

options(scipen=100,digits=4)
broom::glance(modQ5) %>% kable() %>% kable_styling()
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual
0.8355 0.7869 42.05 17.2 0 84 -1835 3840 4171 496839 281

 

ANSWER: The sample size for each day of the week in a month is either four or five, which may not be enough to build a robust regression. Line chart 12 shows severe fluctuation in July, November and December, but smoothness in January.

Table shows that the model with days of the week and month has the higher adjusted R-squared and the lower standard error of the regression but the lower AIC than the model with all interactions.

 

 

Question #6

What would you expect the model n ~ wday + ns(date,5) to look like? Knowing what you know about the data, why would you expect it not to be particularly effective?

modQ6 <- lm(n ~ wday + splines::ns(date, df = 5), data = daily)

daily %>%
  add_residuals(modQ6, "residQ6") %>%
  ggplot(aes(date, residQ6)) +
  geom_ref_line(h = 0, colour = "red", size = 1) +
  geom_line() +
  labs(x="Date", y="Residuals") + ggtitle("Line chart 13")

options(scipen=100,digits=4)
broom::glance(modQ6) %>% kable() %>% kable_styling()
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual
0.7816 0.7748 43.23 114.9 0 12 -1887 3799 3850 659694 353

 

ANSWER: In the second part of EDA, boxplot 1 shows that weekdays (through Monday to Friday) have similar flights distribution. In the third part of EDA, line chart 3 shows that not a weekday trend across months but a holidays trend on the dataset particularly exists.

 

 

Question #7

We hypothesized that people leaving on Sundays are more likely to be business travelers who need to be somewhere on Monday. Explore the hypothesis by seeing how if breaks down based on distance and time: if it’s true, you’d expect to see more Sunday evening flights to places that are far away.

# Beginner notice: the original R chunk header is "Question #7", however, if "ggplot" is called, there will be "pandoc" bug as shown below when knitting.
# pandoc: ~/NYCFlightsDataModel_files/figure-html/Question : openBinaryFile: does not exist (No such file or directory)
# Error: pandoc document conversion failed with error 1
# Execution halted
# Solution: delete the hashtag symbol (#) in the R chunk header

datQ7 <- flights %>%
  mutate(date = make_date(year, month, day),
         wday = wday(date, label = T)) %>%
  select(date, wday, distance, air_time, hour) %>%
  filter(hour != 1) #omit the single one record of 1 AM flight

datQ7 %>%
  ggplot(aes(wday, distance)) + geom_boxplot() +
  labs(x="Week", y="Traveling distance (miles)") + ggtitle("Boxplot 5")

datQ7 %>%
  group_by(wday) %>%
  summarise("Mean distance (miles)" = round(mean(distance)),
            "Median distance (miles)" = median(distance),
            "Mean duration (minutes)" = round(mean(air_time, na.rm = T)),
            "Median duration (minutes)" = median(air_time, na.rm = T)) %>%
  rename(Week = wday) %>%
  kable() %>% kable_styling()
Week Mean distance (miles) Median distance (miles) Mean duration (minutes) Median duration (minutes)
Sun 1055 937 152 130
Mon 1032 828 151 129
Tue 1027 820 150 128
Wed 1028 820 150 128
Thu 1033 828 150 128
Fri 1033 828 150 127
Sat 1081 944 154 134
# Visualized by "plotly" for interaction on the chart
htmltools::div(
  datQ7 %>%
    #show legend starting with Sunday
    mutate(wday = factor(wday, levels = rev(levels(wday)))) %>%
    group_by(wday, hour) %>%
    summarise(distMedian = median(distance)) %>%
    plot_ly(y = ~distMedian, x = ~hour, color = ~wday) %>%
    add_markers(showlegend = F) %>% add_lines() %>%
    layout(xaxis = list(title = "Hour", gridcolor = "#ffffff", zeroline = F),
           yaxis = list(title = "Median traveling distance (miles)", gridcolor = "#ffffff", zeroline = F),
           title = "Line chart 14", plot_bgcolor = "#ebebeb"), 
  align = "center")
datQ7 %>%
  group_by(wday, hour) %>%
  summarise(distMedian = median(distance)) %>%
  reshape2::acast(wday~hour, value.var="distMedian") %>%
  kable() %>% kable_styling()
5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
Sun 1400 963 1041 1005 950 1017.0 820 746 762 762 765 764 1008 944 944 762 719 266 1598
Mon 1400 762 1035 872 944 872.0 888 746 762 762 765 762 950 997 937 762 719 266 1598
Tue 1400 760 1035 872 944 872.0 888 746 762 762 765 762 950 944 944 740 642 266 1598
Wed 1400 760 1035 833 937 872.0 888 746 762 762 764 762 1008 937 937 753 642 266 1598
Thu 1400 760 1035 872 944 872.0 888 746 762 762 765 764 1008 944 944 762 719 266 1598
Fri 1400 760 1035 872 944 809.5 888 746 762 762 765 764 950 944 937 762 733 266 1598
Sat 1400 937 1035 944 950 1028.0 937 738 937 764 888 944 1065 1020 1005 1023 937 266 1598

 

ANSWER: Above two tables, one boxplot and one line chart show that traveling distances in each day of the week are similar, though 100 miles more median distances are on Saturday and Sunday. As for the hypothesis that business trips take off on Sunday evenings, median distances are the same at 10 PM and 11 PM whichever day of the week it is.