daily <- flights %>%
mutate(date = make_date(year, month, day)) %>%
group_by(date) %>%
summarize(n = n())
ggplot(daily, aes(date, n)) +
geom_line()
daily <- daily %>%
mutate(wday = wday(date, label = TRUE))
ggplot(daily, aes(wday,n)) +
geom_boxplot()
mod = lm(n ~ wday, data = daily)
grid <- daily %>%
data_grid(wday) %>%
add_predictions(mod, "n")
ggplot(daily, aes(wday, n)) +
geom_boxplot() +
geom_point(data = grid, color = "orange", size = 4)
daily <- daily %>%
add_residuals(mod)
daily %>%
ggplot(aes(date, resid)) +
geom_ref_line(h = 0) +
geom_line()
ggplot(daily, aes(date, resid, color = wday)) +
geom_ref_line(h = 0, colour = "red") +
geom_line()
daily %>%
filter(resid < -100)
## # A tibble: 11 x 4
## date n wday resid
## <date> <int> <ord> <dbl>
## 1 2013-01-01 842 Tue -109.
## 2 2013-01-20 786 Sun -105.
## 3 2013-05-26 729 Sun -162.
## 4 2013-07-04 737 Thu -229.
## 5 2013-07-05 822 Fri -145.
## 6 2013-09-01 718 Sun -173.
## 7 2013-11-28 634 Thu -332.
## 8 2013-11-29 661 Fri -306.
## 9 2013-12-24 761 Tue -190.
## 10 2013-12-25 719 Wed -244.
## 11 2013-12-31 776 Tue -175.
daily %>%
ggplot(aes(date, resid)) +
geom_ref_line(h = 0, colour = "red", size = 1) +
geom_line(color = "grey50") +
geom_smooth(se = FALSE, span = 0.20)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
daily %>%
filter(wday == "Sat") %>%
ggplot(aes(date, n)) +
geom_point()+
geom_line() +
scale_x_date(
NULL,
date_breaks = "1 month",
date_labels = "%b"
)
term <- function(date) {
cut(date,
breaks = ymd(20130101, 20130605, 20130825, 20140101),
labels = c("spring", "summer", "fall")
)
}
daily <- daily %>%
mutate(term = term(date))
daily %>%
filter(wday == "Sat") %>%
ggplot(aes(date, n, color = term)) +
geom_point(alpha = 1/3)+
geom_line() +
scale_x_date(
NULL,
date_breaks = "1 month",
date_labels = "%b"
)
daily %>%
ggplot(aes(wday, n, color = term)) +
geom_boxplot()
mod1 <- lm(n ~ wday, data = daily)
mod2 <- lm(n ~ wday * term, data = daily)
daily %>%
gather_residuals(without_term = mod1, with_term = mod2) %>%
ggplot(aes(date, resid, color = model)) +
geom_line(alpha = 0.75)
grid <- daily %>%
data_grid(wday, term) %>%
add_predictions(mod2, "n")
ggplot(daily, aes(wday, n)) +
geom_boxplot() +
geom_point(data = grid, color = "red") +
facet_wrap(~ term)
mod3 <- MASS::rlm(n ~ wday * term, data = daily)
daily %>%
add_residuals(mod3, "resid") %>%
ggplot(aes(date, resid)) +
geom_hline(yintercept = 0, size = 2, color = "red") +
geom_line()
# If you are creating variables it might be a good idea to bundle the creation of the variables up into a function
compute_vars <- function(data) {
data %>%
mutate(term = term(date),
wday = wday(date, label = TRUE)
)
}
# Another option would be to put the transformations directly in the model formula:
wday2 <- function(x) wday(x, label = TRUE)
mod3 <- lm(n ~ wday2(date) * term(date), data = daily)
# We could use a more flexible model to capture the pattern of school term in the data
library(splines)
mod <- MASS::rlm(n ~ wday * ns(date, 5), data = daily)
daily %>%
data_grid(wday, date = seq_range(date, n = 13)) %>%
add_predictions(mod) %>%
ggplot(aes(date, pred, color = wday)) +
geom_line() +
geom_point()
# We see a strong pattern in the numbers of Sat flights. This is reassuring, because we also saw that pattern in the raw data. It's a good sign when you get the same signal from different approaches.
Why are there fewer than expected flights on January 20, May 26 and September 1? (Hint: they all have the same explanation.) How would these days generalize into another year?
library(splusTimeDate)
##
## Attaching package: 'splusTimeDate'
## The following objects are masked from 'package:lubridate':
##
## days, hms, hours, mdy, minutes, seconds, years
## The following objects are masked from 'package:base':
##
## months, quarters, sort.list, weekdays
print(paste('Martin Luther King Jr Day: ', holiday.MLK(2013), ' and the weekday was: ', wday('2013-01-21', label = TRUE, abbr = F)[1]))
## [1] "Martin Luther King Jr Day: 01/21/2013 and the weekday was: Monday"
print(paste('Memorial Day: ', holiday.Memorial(2013), ' and the weekday was: ', wday('2013-05-27', label = TRUE, abbr = F)[1]))
## [1] "Memorial Day: 05/27/2013 and the weekday was: Monday"
print(paste('Labor Day: ', holiday.Labor(2013), ' and the weekday was: ', wday('2013-09-02', label = TRUE, abbr = F)[1]))
## [1] "Labor Day: 09/02/2013 and the weekday was: Monday"
All of January 20, May 26 and September 1 were Sundays before the Monday holidays of Martin Luther King Jr day, Memorial day and Labor day.
January 20 is the Sunday before Martin Luther King Jr day that is celebrated on the 3rd Monday of January. In 2013 January 21 was the third Monday and thus January 20 was the Sunday before that day.
May 26 is the Sunday before Memorial day that is a federal holiday observed on the last Monday of May. In 2013 May 27 was the last monday of May and thus May 26 was the Sunday before that day.
September 1 is the Sunday before Labor day that is celebrated on the first Monday of September. In 2013 September 2 was the first Monday of September and thus September 1 was the Sunday before that day.
In each of these cases it makes perfect sense to have fewer flights as generally people who want to travel during the long weekends are expected to have their flights either on Friday evening or Saturday morning so they can enjoy their long weekends.
What do the three days with high positive residuals represent? How would these days generalize to another year?
daily %>%
top_n(3, resid)
## # A tibble: 3 x 5
## date n wday resid term
## <date> <int> <ord> <dbl> <fct>
## 1 2013-11-30 857 Sat 112. fall
## 2 2013-12-01 987 Sun 95.5 fall
## 3 2013-12-28 814 Sat 69.4 fall
30th november 2013 and 1st December 2013 are the Saturdays and Sundays after Thanksgiving and 28th December 2013 is the Saturday after Christmas.
We cannot generalize these days to every year as these holiday days change from year to year and in some years they could end up on the weekends itself.
for(i in 1:length(holiday.Thanksgiving(2013:2019))){
print(paste(holiday.Thanksgiving(2013:2019)[i], weekdays(holiday.Thanksgiving(2013:2019)[i], abbreviate = F)
))
}
## [1] "11/28/2013 Thursday"
## [1] "11/27/2014 Thursday"
## [1] "11/26/2015 Thursday"
## [1] "11/24/2016 Thursday"
## [1] "11/23/2017 Thursday"
## [1] "11/22/2018 Thursday"
## [1] "11/28/2019 Thursday"
for(i in 1:length(holiday.Christmas(2013:2019))){
print(paste(holiday.Christmas(2013:2019)[i], weekdays(holiday.Christmas(2013:2019)[i], abbreviate = F)
))
}
## [1] "12/25/2013 Wednesday"
## [1] "12/25/2014 Thursday"
## [1] "12/25/2015 Friday"
## [1] "12/25/2016 Sunday"
## [1] "12/25/2017 Monday"
## [1] "12/25/2018 Tuesday"
## [1] "12/25/2019 Wednesday"
As we can see that Thanksgiving occurs on the fourth thursday of every year so we know it will be a thursday but the date is not always going to be the same. In the case of Christmas the date will always be the 25th but the day will not be the same as we can see it could be a weekend as in case of 2016. Thus generalizing dates year by year may not always be possible for all holidays especially those that occur on a constant date and thus different days like Christmas.
Create a new variable that splits the “wday” variable into terms, but only for Saturdays, i.e., it should have Thurs, Fri, but Sat-summer, Sat-spring, Sat-fall. How does this model compare with the model with every combination of “wday” and “term”?
daily <- daily %>%
mutate(
wday2 =
case_when(
wday == "Sat" & term == "summer" ~ "Sat-summer",
wday == "Sat" & term == "fall" ~ "Sat-fall",
wday == "Sat" & term == "spring" ~ "Sat-spring",
TRUE ~ as.character(wday)
)
)
mod3 <- lm(n ~ wday2, data = daily)
daily %>%
gather_residuals(sat_term = mod3, all_interact = mod2) %>%
ggplot(aes(date, resid, color = model)) +
geom_line(alpha = 0.75)
mod2
##
## Call:
## lm(formula = n ~ wday * term, data = daily)
##
## Coefficients:
## (Intercept) wday.L wday.Q
## 912.8684 -71.4674 -161.0112
## wday.C wday^4 wday^5
## -65.6215 -82.0997 -1.3425
## wday^6 termsummer termfall
## -12.4501 37.6922 3.7598
## wday.L:termsummer wday.Q:termsummer wday.C:termsummer
## -6.2215 26.4121 26.7451
## wday^4:termsummer wday^5:termsummer wday^6:termsummer
## 23.5933 -13.1343 -5.0026
## wday.L:termfall wday.Q:termfall wday.C:termfall
## -31.9048 -0.6959 -8.8349
## wday^4:termfall wday^5:termfall wday^6:termfall
## -9.8570 -3.2107 -8.2061
Plotting the residual difference will give a better idea
daily %>%
spread_residuals(sat_term = mod3, all_interact = mod2) %>%
mutate(resid_diff = sat_term - all_interact) %>%
ggplot(aes(date, resid_diff)) +
geom_line(alpha = 0.75)
The Saturday term model residuals are lower than the all interaction model of weekday and term in the Spring Season and higher in the Summer Season whereas its more on the lower side in the fall season.
Comparing the model Statistics
Saturday term model
library(broom)
##
## Attaching package: 'broom'
## The following object is masked from 'package:modelr':
##
## bootstrap
glance(mod3) %>% select(r.squared, sigma, AIC, df)
## # A tibble: 1 x 4
## r.squared sigma AIC df
## <dbl> <dbl> <dbl> <int>
## 1 0.736 47.4 3863. 9
All interactions model
glance(mod2) %>% select(r.squared, sigma, AIC, df)
## # A tibble: 1 x 4
## r.squared sigma AIC df
## <dbl> <dbl> <dbl> <int>
## 1 0.757 46.2 3856. 21
The all interactions model has a higher r-squared that indicates that in can better explain the variance in the number of flights and the residual standard deviation is also smaller for the all interactions model indicating it fits the data slightly better than the Saturday only model. Also the AIC a metric that penalizes the model for having higher parameters is lower for the all interactions model inspite of having higher parameters than the Saturday term model.
Create a new “wday” variable that combines the day of week, term(for Saturdays), and public holidays. What do the residuals of the model look like?
The following are the public holidays for 2013: Date Holiday Tuesday, January 1, New Year’s Day Monday, January 21, Birthday of Martin Luther King, Jr. Monday, February 18, Washington’s Birthday Monday, May 27, Memorial Day Thursday, July 4, Independence Day Monday, September 2, Labor Day Monday, October 14, Columbus Day Monday, November 11, Veterans Day Thursday, November 28, Thanksgiving Day Wednesday, December 25, Christmas Day
The above list of holidays is obtained from: https://www.opm.gov/policy-data-oversight/snow-dismissal-procedures/federal-holidays/#url=2013
Creating tibble of holidays using tribble function
holidays13 <-
tribble(
~holiday, ~date,
"New Year's Day", 20130101,
"Martin Luther King Jr. Day", 20130121,
"Washington's Birthday", 20130218,
"Memorial Day", 20130527,
"Independence Day", 20130704,
"Labor Day", 20130902,
"Columbus Day", 20131028,
"Veteran's Day", 20131111,
"Thanksgiving", 20131128,
"Christmas", 20131225
) %>%
mutate(date = lubridate::ymd(date))
The model should also consider the days after holidays and the days before holidays as the travel would be heavier on the days before and after holidays but lighter on the actual holidays.
q4_daily <- daily %>%
mutate(
wday3 =
case_when(
date %in% (holidays13$date - 1L) ~ "day before holiday",
date %in% (holidays13$date + 1L) ~ "day after holiday",
date %in% holidays13$date ~ "holiday",
.$wday == "Sat" & .$term == "summer" ~ "Sat-summer",
.$wday == "Sat" & .$term == "fall" ~ "Sat-fall",
.$wday == "Sat" & .$term == "spring" ~ "Sat-spring",
TRUE ~ as.character(.$wday)
)
)
mod4 <- lm(n ~ wday3, data = q4_daily)
q4_daily %>%
gather_residuals(holidays_or_around = mod4, sat_term = mod3) %>%
ggplot(aes(date, resid, color = model)) +
geom_line(alpha = 0.75)
Looks the model with holidays hs higher residuals than the models for Saturdays especially around the days of the public holidays
Lookign at the residual difference
q4_daily %>%
spread_residuals(resid_sat_terms = mod3, resid_holidays_or_around = mod4) %>%
mutate(resid_diff = resid_holidays_or_around - resid_sat_terms) %>%
ggplot(aes(date, resid_diff)) +
geom_line(alpha = 0.75)
Comparing twith the all interactions model
q4_daily %>%
gather_residuals(holidays_or_around = mod4, all_interactions = mod2) %>%
ggplot(aes(date, resid, color = model)) +
geom_line(alpha = 0.75)
There does not appear to be much of a difference with the all interactions model and it makes sense as not all holidays will be on weekends.
Looking at the residual difference
q4_daily %>%
spread_residuals(resid_all_interactions = mod2, resid_holidays_or_around = mod4) %>%
mutate(resid_diff = resid_holidays_or_around - resid_all_interactions) %>%
ggplot(aes(date, resid_diff)) +
geom_line(alpha = 0.75)
As in the case of the earlier comparison with the Saturday model the residual of the holidays model is higher than the all interactions model at around the days of the public holidays.
What happens if you fit a day-of-week effect that varies by month (i.e.m n ~ wday*month)? Why is this not very helpful?
q5_daily <- mutate(daily, month = factor(lubridate::month(date)))
mod5 <- lm(n ~ wday * month, data = q5_daily)
print(summary(mod5))
##
## Call:
## lm(formula = n ~ wday * month, data = q5_daily)
##
## Residuals:
## Min 1Q Median 3Q Max
## -269.2 -5.0 1.5 8.8 113.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 867.4000 7.5983 114.157 < 2e-16 ***
## wday.L -64.0744 20.8737 -3.070 0.002353 **
## wday.Q -165.6001 20.1555 -8.216 7.77e-15 ***
## wday.C -68.2591 20.3115 -3.361 0.000885 ***
## wday^4 -92.0814 20.4991 -4.492 1.03e-05 ***
## wday^5 9.7925 19.7334 0.496 0.620111
## wday^6 -20.4376 18.9922 -1.076 0.282802
## month2 23.7071 10.9946 2.156 0.031912 *
## month3 67.8857 10.7456 6.318 1.04e-09 ***
## month4 74.5929 10.8292 6.888 3.70e-11 ***
## month5 56.2786 10.7456 5.237 3.20e-07 ***
## month6 80.3071 10.8292 7.416 1.43e-12 ***
## month7 77.1143 10.7456 7.176 6.39e-12 ***
## month8 81.6357 10.7456 7.597 4.52e-13 ***
## month9 51.3714 10.8292 4.744 3.34e-06 ***
## month10 60.1357 10.7456 5.596 5.20e-08 ***
## month11 46.9143 10.8292 4.332 2.06e-05 ***
## month12 38.7786 10.7456 3.609 0.000364 ***
## wday.L:month2 -3.7230 29.6267 -0.126 0.900089
## wday.Q:month2 -3.8188 29.1251 -0.131 0.895776
## wday.C:month2 0.4899 29.2333 0.017 0.986641
## wday^4:month2 4.5690 29.3639 0.156 0.876460
## wday^5:month2 -4.2552 28.8346 -0.148 0.882784
## wday^6:month2 12.0570 28.3325 0.426 0.670760
## wday.L:month3 -14.5705 28.4302 -0.513 0.608703
## wday.Q:month3 15.4389 28.2073 0.547 0.584581
## wday.C:month3 8.2262 28.4672 0.289 0.772817
## wday^4:month3 22.7202 28.7015 0.792 0.429261
## wday^5:month3 -15.3298 28.5042 -0.538 0.591135
## wday^6:month3 11.3727 28.2682 0.402 0.687759
## wday.L:month4 -16.6682 29.3590 -0.568 0.570667
## wday.Q:month4 10.7254 28.9620 0.370 0.711418
## wday.C:month4 -0.2449 28.7249 -0.009 0.993202
## wday^4:month4 23.2883 28.8711 0.807 0.420561
## wday^5:month4 -17.8720 28.0764 -0.637 0.524935
## wday^6:month4 5.3524 27.8883 0.192 0.847940
## wday.L:month5 3.6663 29.3590 0.125 0.900711
## wday.Q:month5 -20.6652 28.6699 -0.721 0.471632
## wday.C:month5 4.6336 28.7249 0.161 0.871965
## wday^4:month5 5.9994 28.5109 0.210 0.833490
## wday^5:month5 -16.9119 28.0764 -0.602 0.547425
## wday^6:month5 12.7643 27.1935 0.469 0.639158
## wday.L:month6 -4.5261 28.6515 -0.158 0.874593
## wday.Q:month6 23.8130 28.2073 0.844 0.399267
## wday.C:month6 13.7580 28.7249 0.479 0.632342
## wday^4:month6 24.1183 29.1875 0.826 0.409322
## wday^5:month6 -17.6484 28.7981 -0.613 0.540483
## wday^6:month6 10.5256 28.3291 0.372 0.710510
## wday.L:month7 -28.7914 29.3590 -0.981 0.327601
## wday.Q:month7 49.5846 28.6699 1.730 0.084818 .
## wday.C:month7 54.5011 28.7249 1.897 0.058807 .
## wday^4:month7 50.8474 28.5109 1.783 0.075594 .
## wday^5:month7 -33.6983 28.0764 -1.200 0.231058
## wday^6:month7 -13.8943 27.1935 -0.511 0.609793
## wday.L:month8 -20.4479 28.8711 -0.708 0.479378
## wday.Q:month8 6.7648 28.5042 0.237 0.812578
## wday.C:month8 6.0012 28.4672 0.211 0.833186
## wday^4:month8 19.0738 28.7814 0.663 0.508058
## wday^5:month8 -19.3123 28.0576 -0.688 0.491827
## wday^6:month8 9.5074 27.8866 0.341 0.733410
## wday.L:month9 -30.3411 28.9257 -1.049 0.295110
## wday.Q:month9 -42.0342 28.6699 -1.466 0.143726
## wday.C:month9 -20.7186 28.7249 -0.721 0.471338
## wday^4:month9 -20.3752 28.7914 -0.708 0.479728
## wday^5:month9 -18.2376 28.5226 -0.639 0.523079
## wday^6:month9 11.7263 28.2699 0.415 0.678606
## wday.L:month10 -61.0507 29.5199 -2.068 0.039544 *
## wday.Q:month10 -26.2352 28.5042 -0.920 0.358153
## wday.C:month10 -32.4353 28.7249 -1.129 0.259788
## wday^4:month10 -12.2122 28.9901 -0.421 0.673890
## wday^5:month10 -27.6864 27.9072 -0.992 0.322008
## wday^6:month10 0.1234 26.8590 0.005 0.996339
## wday.L:month11 -54.9466 28.9257 -1.900 0.058512 .
## wday.Q:month11 16.0117 28.6699 0.558 0.576957
## wday.C:month11 54.9502 28.7249 1.913 0.056766 .
## wday^4:month11 47.2857 28.7914 1.642 0.101635
## wday^5:month11 -44.7401 28.5226 -1.569 0.117871
## wday^6:month11 -20.6876 28.2699 -0.732 0.464907
## wday.L:month12 -9.5058 28.8711 -0.329 0.742212
## wday.Q:month12 75.2088 28.5042 2.639 0.008791 **
## wday.C:month12 -25.0256 28.4672 -0.879 0.380097
## wday^4:month12 -23.7798 28.7814 -0.826 0.409380
## wday^5:month12 20.4470 28.0576 0.729 0.466761
## wday^6:month12 9.5864 27.8866 0.344 0.731282
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 42.05 on 281 degrees of freedom
## Multiple R-squared: 0.8355, Adjusted R-squared: 0.7869
## F-statistic: 17.2 on 83 and 281 DF, p-value: < 2.2e-16
For two reasons this model is not very useful. First is the fact that this model gives us the no of days in a week 7 times the no of months in a year 12, 84 parameters. Second each month only has 4 to 5 weeks so the average effect is taken across only 4 to 5 observations and thus this kind of a model will fail to generalize beyond this dataset. It will not be very useful for newer unseen datasets. We can clearly see that almost all of the interactions between weekday and month are insignificant.
Looking at the residuals
q5_daily %>%
gather_residuals(wday_month = mod5, all_interactions = mod2) %>%
ggplot(aes(date, resid, color = model)) +
geom_line(alpha = 0.75)
As expected because of fewer data points the model is resulting in a higher variation in residuals as outliers will start to affect the data due to the lower no of observations in each interaction level of weekday and month.
Residual difference
q5_daily %>%
spread_residuals(resid_all_interactions = mod2, resid_wday_month = mod5) %>%
mutate(resid_diff = resid_wday_month - resid_all_interactions) %>%
ggplot(aes(date, resid_diff)) +
geom_line(alpha = 0.75)
The higher variation is extremely evident when you look at residual differences.
Thus this model is not very useful.
What would you expect the model n ~ wday + ns(date,5) to look like? Knowing what you know about the data, why would you expect it not to be particularly effective?
mod6 <- lm(n ~ wday + splines::ns(date, 5), data = daily)
daily %>%
gather_residuals(mod2,mod6)%>%
ggplot(aes(x = date, y = resid, color = model))+
geom_line(alpha = 0.75)
The resdiuals of this model has a very high variation and thus interpolating the number of flights by date might work for this year but would not work for other years as the holidays we observed fall on different days for different years.
We hypothesized that people leaving on Sundays are more likely to be business travelers who need to be somewhere on Monday. Explore the hypothesis by seeing how if breaks down based on distance and time: if it’s true, you’d expect to see more Sunday evening flights to places that are far away.
Comparing the average distances of flights by day of week
flights %>%
mutate(
date = make_date(year, month, day),
wday = wday(date, label = TRUE)
) %>%
ggplot(aes(y = distance, x = wday)) +
geom_boxplot() +
labs(x = "Day of Week", y = "Distance")
On average the flights appear to be longest on Saturdays floowed by Sundays.
Hiding the outliers
flights %>%
mutate(
date = make_date(year, month, day),
wday = wday(date, label = TRUE)
) %>%
ggplot(aes(y = distance, x = wday)) +
geom_boxplot(outlier.shape = NA) +
labs(x = "Day of Week", y = "Distance")
Pointrange with average distances
flights %>%
mutate(
date = make_date(year, month, day),
wday = wday(date, label = TRUE)
) %>%
ggplot(aes(y = distance, x = wday)) +
stat_summary() +
labs(x = "Day of Week", y = "Average Distance")
## No summary function supplied, defaulting to `mean_se()
Summary statistics for distances
flights %>%
mutate(
date = make_date(year, month, day),
wday = wday(date, label = TRUE)
) %>%
group_by(wday) %>%
summarise("Mean distance (miles)" = round(mean(distance)),
"Median distance (miles)" = median(distance),
"Mean duration (minutes)" = round(mean(air_time, na.rm = T)),
"Median duration (minutes)" = median(air_time, na.rm = T))
## # A tibble: 7 x 5
## wday `Mean distance ~ `Median distanc~ `Mean duration ~ `Median duratio~
## <ord> <dbl> <dbl> <dbl> <dbl>
## 1 Sun 1055 937 152 130
## 2 Mon 1032 828 151 129
## 3 Tue 1027 820 150 128
## 4 Wed 1028 820 150 128
## 5 Thu 1033 828 150 128
## 6 Fri 1033 828 150 127
## 7 Sat 1081 944 154 134
Both the Mean and Median distances are higher on Saturday than Sunday. Mean and median duration of flights is also higher for Saturday
Looking at median distances by
flights %>%
mutate(
date = make_date(year, month, day),
wday = wday(date, label = TRUE)
) %>%
group_by(wday, hour) %>%
summarise("Median distance (miles) at each hour" = median(distance)) %>%
reshape2::acast(wday~hour, value.var="Median distance (miles) at each hour")
## 1 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## Sun NA 1400 963 1041 1005 950 1017.0 820 746 762 762 765 764 1008 944
## Mon NA 1400 762 1035 872 944 872.0 888 746 762 762 765 762 950 997
## Tue NA 1400 760 1035 872 944 872.0 888 746 762 762 765 762 950 944
## Wed NA 1400 760 1035 833 937 872.0 888 746 762 762 764 762 1008 937
## Thu NA 1400 760 1035 872 944 872.0 888 746 762 762 765 764 1008 944
## Fri NA 1400 760 1035 872 944 809.5 888 746 762 762 765 764 950 944
## Sat 17 1400 937 1035 944 950 1028.0 937 738 937 764 888 944 1065 1020
## 19 20 21 22 23
## Sun 944 762 719 266 1598
## Mon 937 762 719 266 1598
## Tue 944 740 642 266 1598
## Wed 937 753 642 266 1598
## Thu 944 762 719 266 1598
## Fri 937 762 733 266 1598
## Sat 1005 1023 937 266 1598
We can see from evening 5 to 9 on Saturday have the highest median distances whereas there does not appear to be much difference between the median distacnes for Sunday and most other days.
Finally lets look at the total number of evening fligts
flights %>%
mutate(
date = make_date(year, month, day),
wday = wday(date, label = TRUE)
) %>%
filter(
distance > 1000,
hour >= 14
) %>%
ggplot(aes(x = hour, color = wday, y = ..density..)) +
geom_freqpoly(binwidth = 1)
We can see that for evening flights after 2 the number of flights seems to be the same for almost all the days except Saturday.
Thus both in terms of no of flights and the distance our hypothesis does not appear to be true.