Summarize Data

daily <- flights %>%
  mutate(date = make_date(year, month, day)) %>%
  group_by(date) %>%
  summarize(n = n())

ggplot(daily, aes(date, n)) +
  geom_line()

Investigate Daily-Weekly Pattern

daily <- daily %>%
  mutate(wday = wday(date, label = TRUE))
ggplot(daily, aes(wday,n)) +
  geom_boxplot()

mod = lm(n ~ wday, data = daily, na.action = na.warn)

grid <- daily %>%
  data_grid(wday) %>%
  add_predictions(mod, "n")

ggplot(daily, aes(wday, n)) +
  geom_boxplot() +
  geom_point(data = grid, color = "orange", size = 4)

Investigate residuals

daily <- daily %>%
  add_residuals(mod)

daily %>%
  ggplot(aes(date, resid)) +
  geom_ref_line(h = 0) +
  geom_line()

ggplot(daily, aes(date, resid, color = wday)) +
  geom_ref_line(h = 0, colour = "red") +
  geom_line()

daily %>%
  filter(resid < -100)
## # A tibble: 11 x 4
##    date           n wday  resid
##    <date>     <int> <ord> <dbl>
##  1 2013-01-01   842 Tue   -109.
##  2 2013-01-20   786 Sun   -105.
##  3 2013-05-26   729 Sun   -162.
##  4 2013-07-04   737 Thu   -229.
##  5 2013-07-05   822 Fri   -145.
##  6 2013-09-01   718 Sun   -173.
##  7 2013-11-28   634 Thu   -332.
##  8 2013-11-29   661 Fri   -306.
##  9 2013-12-24   761 Tue   -190.
## 10 2013-12-25   719 Wed   -244.
## 11 2013-12-31   776 Tue   -175.
daily %>%
  ggplot(aes(date, resid)) +
  geom_ref_line(h = 0, colour = "red", size = 1) +
  geom_line(color = "grey50") +
  geom_smooth(se = FALSE, span = 0.20)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Seasonal Saturday effect

daily %>%
  filter(wday == "Sat") %>%
  ggplot(aes(date, n)) +
  geom_point()+
  geom_line() +
  scale_x_date(
    NULL,
    date_breaks = "1 month",
    date_labels = "%b"
  )

Add Seasonal Variable

term <- function(date) {
  cut(date,
      breaks = ymd(20130101, 20130605, 20130825, 20140101),
      labels = c("spring", "summer", "fall")
      )
}

daily <- daily %>%
  mutate(term = term(date))

daily %>%
  filter(wday == "Sat") %>%
  ggplot(aes(date, n, color = term)) +
  geom_point(alpha = 1/3)+
  geom_line() +
  scale_x_date(
    NULL,
    date_breaks = "1 month",
    date_labels = "%b"
  )

daily %>%
  ggplot(aes(wday, n, color = term)) +
  geom_boxplot()

mod1 <- lm(n ~ wday, data = daily, na.action = na.warn)
mod2 <- lm(n ~ wday * term, data = daily, na.action = na.warn)

daily %>%
  gather_residuals(without_term = mod1, with_term = mod2) %>%
  ggplot(aes(date, resid, color = model)) +
  geom_line(alpha = 0.75)

grid <- daily %>%
  data_grid(wday, term) %>%
  add_predictions(mod2, "n")

ggplot(daily, aes(wday, n)) +
  geom_boxplot() +
  geom_point(data = grid, color = "red") +
  facet_wrap(~ term)

Better model for outliers (Robust regression)

mod3 <- MASS::rlm(n ~ wday * term, data = daily, na.action = na.warn)

daily %>%
  add_residuals(mod3, "resid") %>%
  ggplot(aes(date, resid)) +
  geom_hline(yintercept = 0, size = 2, color = "red") +
  geom_line()

Computed Variables

# If you are creating variables it might be a good idea to bundle the creation of the variables up into a function
compute_vars <- function(data) {
  data %>%
    mutate(term = term(date),
           wday = wday(date, label = TRUE)
           )
}

# Another option would be to put the transformations directly in the model formula:

wday2 <- function(x) wday(x, label = TRUE)
mod3 <- lm(n ~ wday2(date) * term(date), data = daily, na.action = na.warn)

Time of Year: An Alternative Approach

# We could use a more flexible model to capture the pattern of school term in the data
library(splines)
mod <- MASS::rlm(n ~ wday * ns(date, 5), data = daily, na.action = na.warn)

daily %>% 
  data_grid(wday, date = seq_range(date, n = 13)) %>% 
  add_predictions(mod) %>% 
  ggplot(aes(date, pred, color = wday)) +
  geom_line() +
  geom_point()

# We see a strong pattern in the numbers of Sat flights.  This is reassuring, because we also saw that pattern in the raw data.  It's a good sign when you get the same signal from different approaches.

Question #1

Why are there fewer than expected flights on January 20, May 26 and September 1? (Hint: they all have the same explanation.) How would these days generalize into another year?

# Use this chunk to answer question 1
##January 20, May 26 and September 1 are the MLK, Trinity Sunday and the Labor day weekends. Most of the people will prefer using the holidays to roam a new place so they will either fly before or after the long weekends. Hence, the graph shows fewer than expected flights.

Question #2

What do the three days with high positive residuals represent? How would these days generalize to another year?

# Use this chunk to answer question 2
daily %>%
  top_n(3, resid)
## # A tibble: 3 x 5
##   date           n wday  resid term 
##   <date>     <int> <ord> <dbl> <fct>
## 1 2013-11-30   857 Sat   112.  fall 
## 2 2013-12-01   987 Sun    95.5 fall 
## 3 2013-12-28   814 Sat    69.4 fall
##As we can see after running the code, these 3 days are weekends and holidays weekends like Thanksgiving. There may be extreme number of flights unusually higher than the other weekend and hence there is a positive residual. To answer the next part of the question, every year the thanksgiving or holiday dates will change and hence it will show different patterns.

Question #3

Create a new variable that splits the “wday” variable into terms, but only for Saturdays, i.e., it should have Thurs, Fri, but Sat-summer, Sat-spring, Sat-fall. How does this model compare with the model with every combination of “wday” and “term”?

daily <- daily %>%
  mutate(disp = case_when (wday == "Sat" & term == "spring" ~ "Sat-spring",
                           wday == "Sat" & term == "summer" ~ "Sat-summer",
                           wday == "Sat" & term == "fall" ~ "Sat-fall",
                           T ~ as.character(wday)))

model <- lm(n ~disp, data = daily)

daily %>%
  spread_residuals (sat_term = model, all_interact = mod2) %>%
  mutate (residdiff = sat_term - all_interact) %>%
  ggplot (aes(date, residdiff)) +
  geom_ref_line (h = 0, colour = "green") +
  geom_line (alpha = 0.75, size = 0.5, color = "blue") +
  labs (x = "Date", y="Residuals difference") + ggtitle("Model with wday and term") +
  theme_dark()

##WE can see through model that there is lower residual in spring than in summer than the other model with interactions.

Question #4

Create a new “wday” variable that combines the day of week, term(for Saturdays), and public holidays. What do the residuals of the model look like?

# Use this chunk to answer question 4
ho <- ymd (20130101, 20130120, 20130526, 
                 20130704, 20130705, 20130901, 
                 20131128, 20131129, 20131130, 
                 20131201, 20131224, 20131225, 
                 20131231)

daily <- daily %>%
  mutate(disp2 = case_when(date %in% ho ~ "holidays", T ~ as.character(disp)))

mod2 <- lm(n ~ disp2, data = daily)
daily <- daily %>%
  add_residuals(mod2, "resid2")

ggplot (daily, aes(date, resid2)) +
  geom_ref_line(h = 0, colour = "green", size = 1) +
  geom_line(color = "blue") +
  labs(x = "Period", y = "Residuals") + ggtitle ("Model with term, day of week and holidays") +
  theme_dark()

##From the graph we can observe that tehre is no uniform pattern for the holidays affecting the flights. The highest deviation is in November. The model does a good job predicting the pattern for most of the year till November.

Question #5

What happens if you fit a day-of-week effect that varies by month (i.e.m n ~ wday*month)? Why is this not very helpful?

# Use this chunk to answer question 5
td <- daily %>%
  mutate(month = month(date, label = T))
disp3 <- lm(n ~ wday * month, data = td)
td %>%
  add_residuals(disp3, "disp3") %>%
  ggplot (aes(date, disp3)) +
  geom_ref_line (h = 0, colour = "red", size = 1) +
  geom_line (color = "blue", size = 0.5) +
  labs (x = "Period", y = "Residuals") + 
  ggtitle ("Model with day of week varying by month") +
  theme_dark()

##The model with day of the week and month has lower deviation and higher adjusted R>2 compared to the model with interactions.

Question #6

What would you expect the model n ~ wday + ns(date,5) to look like? Knowing what you know about the data, why would you expect it not to be particularly effective?

# Use this chunk to answer question 6
dp9 <- lm(n ~ wday + splines::ns(date, df = 5), data = daily)

daily %>%
  add_residuals(dp9, "resid4") %>%
  ggplot (aes(date, resid4)) +
  geom_ref_line(h = 0, colour = "green", size = 1) +
  geom_line(size = 0.5, color = "blue") +
  labs(x = "Period", y = "Residuals") + 
  ggtitle("model n~wday + ns (date,5)") +
  theme_dark()

##There is a uniform flight trend observed for weekdays. The residuals are mostly due to the more holidays which show inconsistent trends.

Question #7

We hypothesized that people leaving on Sundays are more likely to be business travelers who need to be somewhere on Monday. Explore the hypothesis by seeing how if breaks down based on distance and time: if it’s true, you’d expect to see more Sunday evening flights to places that are far away.

# Use this chunk to answer question 7
install.packages("kableExtra",repos = "http://cran.us.r-project.org")
## Installing package into 'C:/Users/NehaKatti/Documents/R/win-library/3.6'
## (as 'lib' is unspecified)
## package 'kableExtra' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\NehaKatti\AppData\Local\Temp\Rtmp6rfACc\downloaded_packages
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
td2 <- flights %>%
  mutate (date = make_date (year, month, day),
          wday = wday (date, label = T)) %>%
  select (date, wday, distance, air_time, hour)

td2 %>%
  ggplot (aes(wday, distance)) + geom_boxplot() +
  labs (x="Week", y="Traveling distance (miles)") + ggtitle("Sunday departures analysis") + theme_dark()

td2 %>%
  group_by (wday) %>%
  summarise ("Mean distance" = round(mean(distance)),
            "Median distance" = median(distance),
            "Mean time" = round(mean(air_time, na.rm = T)),
            "Median time" = median(air_time, na.rm = T)) %>%
  rename (Week = wday) %>%
  kable() %>% kable_styling()
Week Mean distance Median distance Mean time Median time
Sun 1055 937 152 130
Mon 1032 828 151 129
Tue 1027 820 150 128
Wed 1028 820 150 128
Thu 1033 828 150 128
Fri 1033 828 150 127
Sat 1081 944 154 134
td2 %>%
  group_by(wday, hour) %>%
  summarise(distMedian = median(distance)) %>%
  reshape2::acast(wday~hour, value.var="distMedian") %>%
  kable() %>% 
  kable_styling()
1 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
Sun NA 1400 963 1041 1005 950 1017.0 820 746 762 762 765 764 1008 944 944 762 719 266 1598
Mon NA 1400 762 1035 872 944 872.0 888 746 762 762 765 762 950 997 937 762 719 266 1598
Tue NA 1400 760 1035 872 944 872.0 888 746 762 762 765 762 950 944 944 740 642 266 1598
Wed NA 1400 760 1035 833 937 872.0 888 746 762 762 764 762 1008 937 937 753 642 266 1598
Thu NA 1400 760 1035 872 944 872.0 888 746 762 762 765 764 1008 944 944 762 719 266 1598
Fri NA 1400 760 1035 872 944 809.5 888 746 762 762 765 764 950 944 937 762 733 266 1598
Sat 17 1400 937 1035 944 950 1028.0 937 738 937 764 888 944 1065 1020 1005 1023 937 266 1598
##The hypothesis is not true since the miles travelled everyday are the same except for the weekends.So this does not prove that people leaving on Sundays are mostly business travellers.