Assignment 5: NYCFlightsDataModel

Summarize Data

daily <- flights %>%
  mutate(date = make_date(year, month, day)) %>%
  group_by(date) %>%
  summarize(n = n())

ggplot(daily, aes(date, n)) +
  geom_line()

Investigate Daily-Weekly Pattern

daily <- daily %>%
  mutate(wday = wday(date, label = TRUE))
ggplot(daily, aes(wday,n)) +
  geom_boxplot()

mod = lm(n ~ wday, data = daily)

grid <- daily %>%
  data_grid(wday) %>%
  add_predictions(mod, "n")

ggplot(daily, aes(wday, n)) +
  geom_boxplot() +
  geom_point(data = grid, color = "orange", size = 4)

Investigate residuals

daily <- daily %>%
  add_residuals(mod)

daily %>%
  ggplot(aes(date, resid)) +
  geom_ref_line(h = 0) +
  geom_line()

ggplot(daily, aes(date, resid, color = wday)) +
  geom_ref_line(h = 0, colour = "red") +
  geom_line()

daily %>%
  filter(resid < -100)

## # A tibble: 11 x 4
##    date           n wday  resid
##    <date>     <int> <ord> <dbl>
##  1 2013-01-01   842 Tue   -109.
##  2 2013-01-20   786 Sun   -105.
##  3 2013-05-26   729 Sun   -162.
##  4 2013-07-04   737 Thu   -229.
##  5 2013-07-05   822 Fri   -145.
##  6 2013-09-01   718 Sun   -173.
##  7 2013-11-28   634 Thu   -332.
##  8 2013-11-29   661 Fri   -306.
##  9 2013-12-24   761 Tue   -190.
## 10 2013-12-25   719 Wed   -244.
## 11 2013-12-31   776 Tue   -175.

daily %>%
  ggplot(aes(date, resid)) +
  geom_ref_line(h = 0, colour = "red", size = 1) +
  geom_line(color = "grey50") +
  geom_smooth(se = FALSE, span = 0.20)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Seasonal Saturday effect

daily %>%
  filter(wday == "Sat") %>%
  ggplot(aes(date, n)) +
  geom_point()+
  geom_line() +
  scale_x_date(
    NULL,
    date_breaks = "1 month",
    date_labels = "%b"
  )

Add Seasonal Variable

term <- function(date) {
  cut(date,
      breaks = ymd(20130101, 20130605, 20130825, 20140101),
      labels = c("spring", "summer", "fall")
      )
}

daily <- daily %>%
  mutate(term = term(date))

daily %>%
  filter(wday == "Sat") %>%
  ggplot(aes(date, n, color = term)) +
  geom_point(alpha = 1/3)+
  geom_line() +
  scale_x_date(
    NULL,
    date_breaks = "1 month",
    date_labels = "%b"
  )

daily %>%
  ggplot(aes(wday, n, color = term)) +
  geom_boxplot()

mod1 <- lm(n ~ wday, data = daily)
mod2 <- lm(n ~ wday * term, data = daily)

daily %>%
  gather_residuals(without_term = mod1, with_term = mod2) %>%
  ggplot(aes(date, resid, color = model)) +
  geom_line(alpha = 0.75)

grid <- daily %>%
  data_grid(wday, term) %>%
  add_predictions(mod2, "n")

ggplot(daily, aes(wday, n)) +
  geom_boxplot() +
  geom_point(data = grid, color = "red") +
  facet_wrap(~ term)

Better model for outliers (Robust regression)

mod3 <- MASS::rlm(n ~ wday * term, data = daily)

daily %>%
  add_residuals(mod3, "resid") %>%
  ggplot(aes(date, resid)) +
  geom_hline(yintercept = 0, size = 2, color = "red") +
  geom_line()

Computed Variables

# If you are creating variables it might be a good idea to bundle the creation of the variables up into a function
compute_vars <- function(data) {
  data %>%
    mutate(term = term(date),
           wday = wday(date, label = TRUE)
           )
}

# Another option would be to put the transformations directly in the model formula:

wday2 <- function(x) wday(x, label = TRUE)
mod3 <- lm(n ~ wday2(date) * term(date), data = daily)

Time of Year: An Alternative Approach

# We could use a more flexible model to capture the pattern of school term in the data
library(splines)
mod <- MASS::rlm(n ~ wday * ns(date, 5), data = daily)

daily %>% 
  data_grid(wday, date = seq_range(date, n = 13)) %>% 
  add_predictions(mod) %>% 
  ggplot(aes(date, pred, color = wday)) +
  geom_line() +
  geom_point()

# We see a strong pattern in the numbers of Sat flights.  This is reassuring, because we also saw that pattern in the raw data.  It's a good sign when you get the same signal from different approaches.

Question #1

Why are there fewer than expected flights on January 20, May 26 and September 1? (Hint: they all have the same explanation.) How would these days generalize into another year?

#adding percentile values for number of flight column "n"
daily$percentile <- ecdf(daily$n)(daily$n)

#calculating percentile for the given dates
daily$percentile[daily$date == "2013-01-20" | daily$date == "2013-05-26" |daily$date == "2013-09-01"]

## [1] 0.13424658 0.07123288 0.06027397

Comemnts:

As we can see that for all the three dates the pecentile values is very low, even less than 1st quartile that shows that the number of flight on these datses are very low. On investigating the reason for such low flight volum, we found that all these dates fall exactly one day before a national holiday and All three of them are a Sunday:

January 21: MLK Day May 27: Memorial Day
September 02: Labor Day

This means the next day of these three sundays is always a holiday. Hence this explain low flight volumne on these Sundays as the next day is holiday so many people either on their long weeknd vacation oo otherwise also do not need to fly back to their worklocation on sundays, hence low traffic.

Question #2

What do the three days with high positive residuals represent? How would these days generalize to another year?

daily %>%
  top_n(3, resid)

## # A tibble: 3 x 6
##   date           n wday  resid term  percentile
##   <date>     <int> <ord> <dbl> <fct>      <dbl>
## 1 2013-11-30   857 Sat   112.  fall       0.189
## 2 2013-12-01   987 Sun    95.5 fall       0.778
## 3 2013-12-28   814 Sat    69.4 fall       0.167

Comments: Thursday 28 Nov, 2013 was “Thanksgiving Day”, which is a very big festival in US and a holiday. People prefer do spend this day with families and friends and therefore a lot of travel happens around this time. Post Thans giving Nov 29 is Black Friday and then next two days Nov 30 and Dec 01 are weekends. So This explains abnormally high flight traffic on Sat (30 Nov) and Sun (01 Dec) as people travel back after the festival to resume Work again from monday.

Dec 28 is a Sat. Jan 01 2014 was a New year day thus holiday. THis makes This period a 4 days long holiday if taken a leav on Dec 31. This clearly explains a hige travel rush on Dec 28, 2013 because people travel a lot to celebrate the new year combined with a long weekend.

Generalization to other years: As explained above the context of these days, to in order to generalize this for other years Thanksgiving is not on a fixed date , it is always the fourth Thursday of November, so for example it will be on 27th Nov in 2014.

However the dates of New year & New year eve is alwyas fixed: New Year - 1 Jan of every year. New year eve - 31st december of every year.

So to generalize it over years we can follow the above rules and identify those dates for every year.

Question #3

Create a new variable that splits the “wday” variable into terms, but only for Saturdays, i.e., it should have Thurs, Fri, but Sat-summer, Sat-spring, Sat-fall. How does this model compare with the model with every combination of “wday” and “term”?

daily<-daily%>%
  mutate(wday_term_Sat=ifelse(wday=="Sat", paste("Sat",term(date), sep = "-"), as.character(wday)))


#  converting wday_term_Sat variable  into factor 
daily$wday_term_Sat<-as.factor(daily$wday_term_Sat)

#model with model with every combination of "wday" and "term"
model1 <- MASS::rlm(n ~ wday * term, data = daily)


#model with revsions suggested in question
model2 <- MASS::rlm(n ~ wday_term_Sat, data = daily)

#comparing test statistics for both the models
summary(model1)

## 
## Call: rlm(formula = n ~ wday * term, data = daily)
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -347.537  -12.899    3.915   11.500  160.101 
## 
## Coefficients:
##                   Value     Std. Error t value  
## (Intercept)        922.4857    1.5914   579.6839
## wday.L             -79.3059    4.2202   -18.7920
## wday.Q            -153.7811    4.2136   -36.4961
## wday.C             -67.7750    4.2081   -16.1056
## wday^4             -74.9295    4.2229   -17.7437
## wday^5              -6.4803    4.1961    -1.5444
## wday^6              -9.7258    4.2011    -2.3151
## termsummer          32.9797    2.7178    12.1349
## termfall             1.9847    2.3615     0.8404
## wday.L:termsummer    9.5346    7.2110     1.3222
## wday.Q:termsummer   12.3445    7.1875     1.7175
## wday.C:termsummer   17.0750    7.2040     2.3702
## wday^4:termsummer   11.0813    7.1885     1.5415
## wday^5:termsummer   -3.5357    7.1969    -0.4913
## wday^6:termsummer    0.3378    7.1550     0.0472
## wday.L:termfall    -31.7325    6.2480    -5.0788
## wday.Q:termfall    -33.1644    6.2524    -5.3043
## wday.C:termfall    -23.8271    6.2399    -3.8185
## wday^4:termfall    -22.5288    6.2606    -3.5985
## wday^5:termfall     -4.9157    6.2318    -0.7888
## wday^6:termfall     -3.3798    6.2550    -0.5403
## 
## Residual standard error: 17.58 on 344 degrees of freedom

summary(model2)

## 
## Call: rlm(formula = n ~ wday_term_Sat, data = daily)
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -345.692  -14.487    2.403   13.840  158.717 
## 
## Coefficients:
##                         Value     Std. Error t value  
## (Intercept)              978.4105    3.2011   305.6436
## wday_term_SatMon           0.1863    4.5271     0.0412
## wday_term_SatSat-fall   -280.1270    6.3127   -44.3748
## wday_term_SatSat-spring -233.2026    5.8710   -39.7213
## wday_term_SatSat-summer -177.4939    7.3927   -24.0092
## wday_term_SatSun         -75.2507    4.5271   -16.6223
## wday_term_SatThu           1.2817    4.5271     0.2831
## wday_term_SatTue         -14.9240    4.5057    -3.3123
## wday_term_SatWed          -7.3207    4.5271    -1.6171
## 
## Residual standard error: 21.18 on 356 degrees of freedom

#visualizing residuals for  both the models

p1<-daily %>%
  add_residuals(model1, "residual1") %>%
  ggplot(aes(date, residual1)) +
  geom_hline(yintercept = 0, size = 1, color = "navy") +
  geom_line()


p2<-daily %>%
  add_residuals(model2, "residual2") %>%
  ggplot(aes(date, residual2)) +
  geom_hline(yintercept = 0, size = 1, color = "navy") +
  geom_line()


grid.arrange(p1, p2)

Comments: By analysing the summary statistics we can see the model1 -model with model with every combination of “wday” and “term”. This model has amean rwsidual value of 3.915. model 2- with suggested changes in the question ha smean residual 2.403, this is lesser than model 1 hence in terms of prediting the values on averagr model 2 does a better job. howver the resdual error for model 2 is higher than model1. On further analyszing if we see the graphs of residuals for both the models we can see the 2nd model has more resduals specially in July and Aug, however slightly lesser residuals for April Month. Also, the region of very high residuals (ourliers) are quite similar for both the models

Question #4

Create a new “wday” variable that combines the day of week, term(for Saturdays), and public holidays. What do the residuals of the model look like?

#creating afactor of all US federal holidays
US_holiday<-c(date("2013-01-01"),date("2013-01-21"),date("2013-02-18"),date("2013-05-27"),
              date("2013-07-04"),date("2013-09-02"),date("2013-10-11"),date("2013-11-11"),date("2013-11-28"),
              date("2013-12-25") )
             


daily<-daily%>%
  mutate(new_wday=ifelse(wday=="Sat", paste("Sat",term(date), sep = "-"), 
                         ifelse(date %in% US_holiday, "holiday" , as.character(wday))))


#  converting "new_wday" variable  into factor 
daily$new_wday<-as.factor(daily$new_wday)

model_new <- MASS::rlm(n ~ new_wday, data = daily)

#comparing test statistics for both the models
summary(model_new)

## 
## Call: rlm(formula = n ~ new_wday, data = daily)
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -317.410  -13.499    1.501   13.501  159.165 
## 
## Coefficients:
##                    Value     Std. Error t value  
## (Intercept)         978.4100    3.1205   313.5452
## new_wdayholiday     -64.3951    7.7070    -8.3554
## new_wdayMon           3.0893    4.5059     0.6856
## new_wdaySat-fall   -280.5753    6.1095   -45.9241
## new_wdaySat-spring -232.7398    5.6842   -40.9449
## new_wdaySat-summer -177.4933    7.1499   -24.8246
## new_wdaySun         -74.9600    4.3918   -17.0684
## new_wdayThu           2.9383    4.4350     0.6625
## new_wdayTue         -14.1105    4.3918    -3.2130
## new_wdayWed          -6.5395    4.4130    -1.4819
## 
## Residual standard error: 20.02 on 355 degrees of freedom

daily %>%
  add_residuals(model_new, "residual_new") %>%
  ggplot(aes(date, residual_new)) +
  geom_hline(yintercept = 0, size = 1, color = "blue") +
  geom_line()

Comments: Please note that for the analysis of this question the public holidays for 2013 were sources from the website: http://www.calendarpedia.com/holidays/federal-holidays-2013.html

THis revised model have lesser value of mean residual

Also, the aversge residual error value is also lesser than the revise dmodel in previous question(model2) The same obsrrvation is validated onn visualizing the residual plot for this model

Question #5

What happens if you fit a day-of-week effect that varies by month (i.e.m n ~ wday*month)? Why is this not very helpful?

daily<-daily%>%
  mutate(month= as.factor(month(date)))

model_month <- MASS::rlm(n ~ wday*month, data = daily)

#comparing test statistics for both the models
summary(model_month)

## 
## Call: rlm(formula = n ~ wday * month, data = daily)
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -355.5000   -4.0000    0.4941    4.0000  138.5118 
## 
## Coefficients:
##                Value     Std. Error t value  
## (Intercept)     868.1936    1.6650   521.4327
## wday.L          -73.7893    4.5741   -16.1321
## wday.Q         -164.7242    4.4167   -37.2960
## wday.C          -70.2867    4.4509   -15.7917
## wday^4          -94.8767    4.4920   -21.1214
## wday^5            4.8125    4.3242     1.1129
## wday^6          -11.8265    4.1618    -2.8417
## month2           22.8336    2.4092     9.4775
## month3           67.3938    2.3547    28.6211
## month4           73.9634    2.3730    31.1686
## month5           62.6409    2.3547    26.6026
## month6           81.7330    2.3730    34.4427
## month7           93.6207    2.3547    39.7593
## month8           84.9062    2.3547    36.0584
## month9           58.2325    2.3730    24.5395
## month10          61.1632    2.3547    25.9751
## month11          63.5060    2.3730    26.7618
## month12          52.2364    2.3547    22.1840
## wday.L:month2    10.1749    6.4921     1.5673
## wday.Q:month2    -9.0842    6.3822    -1.4234
## wday.C:month2     1.3437    6.4059     0.2098
## wday^4:month2     8.7889    6.4345     1.3659
## wday^5:month2     2.4525    6.3185     0.3882
## wday^6:month2     4.9467    6.2085     0.7968
## wday.L:month3    -4.2993    6.2299    -0.6901
## wday.Q:month3    14.7026    6.1811     2.3787
## wday.C:month3     9.3917    6.2380     1.5056
## wday^4:month3    24.5305    6.2894     3.9003
## wday^5:month3   -11.1877    6.2461    -1.7911
## wday^6:month3     2.4036    6.1944     0.3880
## wday.L:month4    -6.8702    6.4334    -1.0679
## wday.Q:month4     9.6813    6.3465     1.5255
## wday.C:month4     1.9424    6.2945     0.3086
## wday^4:month4    26.3942    6.3265     4.1720
## wday^5:month4   -12.7402    6.1524    -2.0708
## wday^6:month4    -3.8213    6.1112    -0.6253
## wday.L:month5   -12.7427    6.4334    -1.9807
## wday.Q:month5     2.8971    6.2824     0.4611
## wday.C:month5    -6.7737    6.2945    -1.0761
## wday^4:month5    12.9616    6.2476     2.0747
## wday^5:month5    -9.9501    6.1524    -1.6173
## wday^6:month5     1.8807    5.9589     0.3156
## wday.L:month6    10.9685    6.2784     1.7470
## wday.Q:month6    26.4551    6.1811     4.2800
## wday.C:month6    19.2234    6.2945     3.0540
## wday^4:month6    28.9493    6.3958     4.5263
## wday^5:month6   -12.7016    6.3105    -2.0128
## wday^6:month6     3.3101    6.2078     0.5332
## wday.L:month7     5.3788    6.4334     0.8361
## wday.Q:month7    24.1848    6.2824     3.8496
## wday.C:month7    18.7778    6.2945     2.9832
## wday^4:month7    33.9526    6.2476     5.4345
## wday^5:month7   -14.5411    6.1524    -2.3635
## wday^6:month7     2.9936    5.9589     0.5024
## wday.L:month8    -5.4223    6.3265    -0.8571
## wday.Q:month8    10.0067    6.2461     1.6021
## wday.C:month8    11.3565    6.2380     1.8205
## wday^4:month8    23.8261    6.3069     3.7778
## wday^5:month8   -15.8985    6.1483    -2.5859
## wday^6:month8     1.0814    6.1108     0.1770
## wday.L:month9   -42.2908    6.3385    -6.6721
## wday.Q:month9   -24.7745    6.2824    -3.9435
## wday.C:month9   -31.9154    6.2945    -5.0704
## wday^4:month9   -17.3602    6.3091    -2.7516
## wday^5:month9   -12.5103    6.2502    -2.0016
## wday^6:month9     3.0670    6.1948     0.4951
## wday.L:month10  -49.1848    6.4687    -7.6035
## wday.Q:month10  -30.9858    6.2461    -4.9608
## wday.C:month10  -35.6117    6.2945    -5.6576
## wday^4:month10   -8.3348    6.3526    -1.3120
## wday^5:month10  -15.9756    6.1153    -2.6124
## wday^6:month10   -2.3547    5.8856    -0.4001
## wday.L:month11  -22.1747    6.3385    -3.4984
## wday.Q:month11  -23.2437    6.2824    -3.6998
## wday.C:month11  -11.7420    6.2945    -1.8654
## wday^4:month11    8.1113    6.3091     1.2857
## wday^5:month11  -18.4355    6.2502    -2.9496
## wday^6:month11    2.3924    6.1948     0.3862
## wday.L:month12    1.8501    6.3265     0.2924
## wday.Q:month12   18.0719    6.2461     2.8933
## wday.C:month12    4.2873    6.2380     0.6873
## wday^4:month12   11.5970    6.3069     1.8388
## wday^5:month12   -5.7612    6.1483    -0.9370
## wday^6:month12   -2.1140    6.1108    -0.3459
## 
## Residual standard error: 5.93 on 281 degrees of freedom

daily %>%
  gather_residuals(model_month, model_new) %>%
  ggplot(aes(date, resid, color=model)) +
  geom_hline(yintercept = 0, size = 1, color = "blue") +
  geom_line()

Comments: As we can see in the graph that there is not much difference in the residual patterns for the revised model with month effect added(model_month). THis might be because when we see all combinations of month(12) and wday(7) there becauomes too many parameters (12*7)=84.

There are only 365 days data we have , so for each combinationthe data will available for approximately four obervations. ANd based non so low observations the prediction happens for that combination. Therefore the predictions are on very good because of very less data that may cause overfitting and may not be robust for predictions.

Question #6

What would you expect the model n ~ wday + ns(date,5) to look like? Knowing what you know about the data, why would you expect it not to be particularly effective?

model_spline <- MASS::rlm(n ~ wday + ns(date, 5), data = daily)


#comparing the residuals from spline model to that of other model developed in previous section
daily %>%
  gather_residuals(model_spline, model_new) %>%
  ggplot(aes(date, resid, color=model)) +
  geom_hline(yintercept = 0, size = 1, color = "blue") +
  geom_line()

daily %>% 
  data_grid(wday, date = seq_range(date, n = 13)) %>% 
  add_predictions(model_spline) %>% 
  ggplot(aes(date, pred, color = wday)) +
  geom_line() +
  geom_point()

Comments:

Spline might not be effecting in this model as they would try to fit the model fit a smooth cyclicity with respect to wday i.e weekday. Here the cyclicity with weekday is nto the correct mesure to build the model, hence splines will not be very effect.

Question #7

We hypothesized that people leaving on Sundays are more likely to be business travelers who need to be somewhere on Monday. Explore the hypothesis by seeing how if breaks down based on distance and time: if it’s true, you’d expect to see more Sunday evening flights to places that are far away.

#lets create the labels for Sunday as "evening Flights" or "Day Time Flights"

flights_revised<-flights %>%
  mutate(date = make_date(year, month, day))%>%
      mutate(wday = wday(date, label = TRUE))%>%
  mutate(wday_revised=ifelse((wday=="Sun" & dep_time>1600), "Sun-Evening", ifelse(wday=="Sun" & dep_time<=1600, "Sun-DayTime", as.character(wday))))%>%
  mutate(wday_revised=as.factor(wday_revised))%>%
  group_by(wday_revised) %>%
  summarise(average_dist =  mean(distance),
            median_dist = median(distance))

flights_revised<-flights_revised[1:8,]
  
#lets reorder the weekdays 

flights_revised$wday_revised <- factor(flights_revised$wday_revised , levels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun-DayTime", "Sun-Evening"))


#ploting it 
p5<-ggplot(data= flights_revised, aes(y = average_dist, x = wday_revised)) +
  geom_point(color="red")+labs(title = "Flights 'Average Distance' on Over Weekdays")

#ploting it 
p6<-ggplot(data= flights_revised, aes(y = median_dist, x = wday_revised)) +
  geom_point(color="red")+labs(title = " Flight 'Median Distance' on Over Weekdays")

grid.arrange(p5, p6)

Comments: As we can See in the graphs for both median and mean that for sunday evening the average and median distance of flight is much higher than any other day of the week except “Saturday”. Also, Sunday evening flights have higher mean and median values than sunday DayTime flgihts, this supports the hypothesis mentioned in the question. Also, for Saturday flights having the hgigest average distance, one reason could be that those are vacation flights or long weeknd flights where people travel to tourist places that could be much further than usual work related travel.