Three data sources are used in this analysis. The first is the MTA Daily Ridership dataset (beginning 2020), which reports daily ridership for the subway, buses, the Long Island Rail Road (LIRR), Metro-North, and Access-A-Ride, along with Bridges and Tunnels traffic; each series also has a companion column comparing the day to a pre-pandemic baseline. The second is weekly retail gasoline prices for New York City, broken out by grade (all grades, regular, midgrade, and premium). The third is the NYC DOT bicycle count data: raw counter readings (Bicycle_Counts.csv) and a metadata table describing each counter (Bicycle_Counters.csv).
mta2020 <- read.csv("https://raw.githubusercontent.com/okhaimova/DATA698/main/MTA_Daily_Ridership_Data__Beginning_2020.csv")
gas <- read.csv("https://raw.githubusercontent.com/okhaimova/DATA698/main/NYC%20Weekly%20Retail%20Gasoline%20and%20Diesel.csv",
skip = 2, header = TRUE)
bicycle_count <- read.csv("Bicycle_Counts.csv")
counters <- read.csv("Bicycle_Counters.csv")
The MTA ridership columns are renamed for easier reference, the dates are parsed, and the rows are sorted chronologically.
mta2020 <- mta2020 %>%
set_colnames(c("Date", "Subway", "Subway_Pre", "Buses", "Buses_Pre", "LIRR", "LIRR_Pre",
"MetroNorth","MetroNorth_Pre", "Access-A-Ride", "Access-A-Ride_Pre",
"Bridges_and_Tunnels", "Bridges_and_Tunnels_Pre")) %>%
mutate(Date = as.Date(Date, format = "%m/%d/%Y")) %>%
arrange(Date)
The gasoline data are trimmed to the New York City price columns, renamed by grade, and the dates are parsed; rows without prices are dropped.
gas <- gas %>%
dplyr::select(1:9) %>%
select(contains(c("Date","City"))) %>%
set_colnames(c("Date", "All_Grades", "Regular", "Midgrade", "Premium")) %>%
mutate(Date = as.Date(Date, format = "%b %d, %Y")) %>%
na.omit()
The bicycle counter readings are timestamped, so the timestamps are reduced to dates and the counts are summed by counter and day. The data are then filtered to the five bridge counters of interest (the comprehensive Brooklyn Bridge and Manhattan Bridge counters, the Pulaski Bridge, the Ed Koch Queensboro Bridge, and the Williamsburg Bridge) and joined to the counter metadata to attach each counter's name. The Brooklyn Bridge series has an apparent outlier on February 8, 2022, with a reading of 68,200, even though February 2022 averaged about 1,348.2 bicycles per day. We chose to remove the outlier and impute it with the February daily average.
# these are the ids that match the five bridge counters
bridge_ids <- c("100010018", "100062893", "100009428", "300020904", "100009427")
counters <- counters %>%
mutate(id = as.character(id))
bicycle_count <- bicycle_count %>%
mutate(date = as.Date(date, "%m/%d/%Y")) %>%
group_by(id, date) %>%
summarise(total = sum(counts)) %>%
filter(id %in% bridge_ids) %>%
mutate(id = as.character(id)) %>%
left_join(counters, by = 'id') %>%
select(c(date, id, name, total)) %>%
filter(date > "2014-12-31")
brooklyn_february_avg <- bicycle_count %>%
  filter(id == "300020904",
         date >= "2022-02-01" & date < "2022-03-01",
         date != "2022-02-08") %>%
  summarise(mean = mean(total)) %>%
  pull(mean)
# replace the Brooklyn Bridge outlier with the February 2022 daily average
bicycle_count <- bicycle_count %>%
  mutate(total = replace(total,
                         id == "300020904" & date == as.Date("2022-02-08"),
                         round(brooklyn_february_avg, 0)))
mta_weekly <- mta2020 %>%
filter(Date > "2020-03-01") %>%
mutate(Date = yearweek(Date)) %>%
group_by(Date) %>%
summarise(Subway = sum(Subway),
Buses = sum(Buses),
LIRR = sum(LIRR),
MetroNorth = sum(MetroNorth),
`Access-A-Ride` = sum(`Access-A-Ride`),
Bridges_and_Tunnels = sum(Bridges_and_Tunnels)) %>%
mutate(MTA = rowSums(across(c(Subway, Buses))))  # MTA = combined subway and bus ridership
gas_weekly <- gas %>%
filter(Date > "2020-03-01") %>%
mutate(Date = yearweek(Date))
bike_weekly <- bicycle_count %>%
filter(date > "2020-03-01") %>%
mutate(date = yearweek(date)) %>%
group_by(date) %>%
summarise(Bikes = sum(total)) %>%
rename(Date = date)
weekly <- plyr::join_all(list(mta_weekly, bike_weekly, gas_weekly),
by = "Date", type = 'left')
The price of gasoline was falling at the beginning of 2020, through late February. Prices began rising around the time the World Health Organization (WHO) declared the novel coronavirus (COVID-19) outbreak a pandemic on March 11, 2020, and continued to climb until peaking in June 2022.
Factors affecting gasoline prices over this period include COVID-19-related supply disruptions, the Russia-Ukraine conflict, and the cancellation of the Keystone XL pipeline.
The price patterns are similar across the different grades of gasoline.
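As a quick check on the timing of that peak, the highest weekly all-grades price can be pulled straight from the cleaned gas table (a small sketch, assuming the column names created above):
gas %>%
  slice_max(All_Grades, n = 1) %>%  # week with the highest all-grades price
  select(Date, All_Grades)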
gas %>%
filter(Date >= "2020-01-01") %>%
ggplot(aes(x = Date, y = All_Grades)) +
geom_line() +
labs(title = "Price of Gasoline for All Grades, NYC",
y = "$USD")
gas %>%
filter(Date >= "2020-01-01") %>%
pivot_longer(-Date) %>%
filter(name != "All_Grades") %>%
ggplot(aes(x = Date, y = value, colour = name)) +
geom_line() +
facet_grid(factor(name, levels =c("Regular", "Midgrade", "Premium")) ~., scales = "free_y") +
labs(title = "Price of Gasoline, NYC",
y = "$USD",
color = "Gasoline Grade")
The charts below show daily MTA ridership by mode: subway and bus ridership first, followed by the LIRR, Metro-North, and Access-A-Ride. Daily subway ridership is then aggregated to weekly totals and decomposed with STL; the same weekly treatment can be repeated for the other modes and for Bridges and Tunnels traffic. The count axes are left in scientific notation here; they can be reformatted with a label function such as scales::comma, as sketched below.
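A minimal sketch of the axis fix (assuming ggplot2 and the scales package are available):
# plot daily subway ridership with comma-formatted counts instead of scientific notation
mta2020 %>%
  ggplot(aes(x = Date, y = Subway)) +
  geom_line() +
  scale_y_continuous(labels = scales::comma) +
  labs(title = "Daily Subway Ridership", y = "Count")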
mta2020 %>%
dplyr::select(Date, Subway, Buses) %>%
pivot_longer(-Date) %>%
ggplot(aes(x = Date, y = value, colour = name)) +
geom_line() +
facet_grid(name ~., scales = "free_y") +
labs(title = "MTA Ridership",
y = "Count",
color = "Mode")
mta2020 %>%
dplyr::select(Date, LIRR, MetroNorth, "Access-A-Ride") %>%
pivot_longer(-Date) %>%
ggplot(aes(x = Date, y = value, colour = name)) +
geom_line() +
facet_grid(name ~., scales = "free_y") +
labs(title = "MTA Ridership",
y = "Count",
color = "Mode")
## Warning: Removed 62 row(s) containing missing values (geom_path).
mta2020 %>%
mutate(Date = yearweek(Date)) %>%
group_by(Date) %>%
summarise(Subway = sum(Subway)) %>%
as_tsibble(index = Date) %>%
autoplot(Subway) +
labs(title = "Weekly Subway Ridership",
y = "Count")
mta2020 %>%
as_tsibble(index = Date) %>%
model(STL(Subway ~ season(window = "periodic"), robust = TRUE)) %>%
components() %>%
autoplot() +
labs(title = "STL Decomposition")
The STL decomposition splits daily subway ridership into trend, seasonal, and remainder components.
The five bridges seem to share a similar seasonal pattern: bicycle usage peaks in the summer and declines in the winter. The Brooklyn Bridge, Manhattan Bridge, and Williamsburg Bridge all connect Brooklyn and Manhattan. The Pulaski Bridge connects Brooklyn and Queens. The Ed Koch Queensboro Bridge connects Queens and Manhattan, and it is the only bridge that shows an apparent increase in usage from 2020 to the present.
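One quick way to check that trend claim is to compare annual totals by bridge (a sketch, assuming the bicycle_count columns created above and using lubridate::year() to extract the year):
# annual bicycle totals by bridge, to compare usage trends across counters
bicycle_count %>%
  ungroup() %>%
  mutate(year = lubridate::year(date)) %>%
  group_by(name, year) %>%
  summarise(total = sum(total), .groups = "drop") %>%
  arrange(name, year)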
bicycle_count %>%
ggplot(aes(x = date, y = total, color = name)) +
geom_line() +
facet_grid(name ~., scales = "free_y")
#dcmp <- bicycle_count %>%
# as_tsibble(index = date, key = c(id, name)) %>%
# fill_gaps() %>%
# model(stl = STL(total))
weekly %>%
as_tsibble(index = Date) %>%
model(STL(Bikes ~ season(window = "periodic"), robust = TRUE)) %>%
components() %>%
autoplot() +
labs(title = "STL Decomposition")
weekly %>%
ggplot(aes(x = Date)) +
geom_line(aes(y = MTA), color = "steelblue") +
geom_line(aes(y = All_Grades * 10000000), color = "darkred") +
scale_y_continuous(name = "MTA Subway & Bus Ridership",
sec.axis = sec_axis(~./10000000, name = "Price of Gasoline")) +
labs(title = "Price of Gasoline for All Grades, NYC")
weekly %>%
ggplot(aes(x = Date)) +
geom_line(aes(y = Buses), color = "steelblue") +
geom_line(aes(y = All_Grades * 5000000), color = "darkred") +
scale_y_continuous(name = "MTA Bus Ridership",
sec.axis = sec_axis(~./5000000, name = "Price of Gasoline")) +
labs(title = "Price of Gasoline for All Grades, NYC")
weekly %>%
ggplot(aes(x = Date)) +
geom_line(aes(y = Subway), color = "steelblue") +
geom_line(aes(y = All_Grades * 5000000), color = "darkred") +
scale_y_continuous(name = "MTA Subway Ridership",
sec.axis = sec_axis(~./5000000, name = "Price of Gasoline")) +
labs(title = "Price of Gasoline for All Grades, NYC")
weekly %>%
ggplot(aes(x = Date)) +
geom_line(aes(y = Bikes), color = "steelblue") +
geom_line(aes(y = All_Grades * 100000), color = "darkred") +
scale_y_continuous(name = "Bike Ridership",
sec.axis = sec_axis(~./100000, name = "Price of Gasoline")) +
labs(title = "Price of Gasoline for All Grades, NYC")
weekly %>%
ggplot(aes(x = Date)) +
geom_line(aes(y = LIRR), color = "steelblue") +
geom_line(aes(y = All_Grades * 500000), color = "darkred") +
scale_y_continuous(name = "LIRR Ridership",
sec.axis = sec_axis(~./500000, name = "Price of Gasoline")) +
labs(title = "Price of Gasoline for All Grades, NYC")
## Warning: Removed 5 row(s) containing missing values (geom_path).
weekly %>%
ggplot(aes(x = Date)) +
geom_line(aes(y = MetroNorth), color = "steelblue") +
geom_line(aes(y = All_Grades * 500000), color = "darkred") +
scale_y_continuous(name = "MetroNorth Ridership",
sec.axis = sec_axis(~./500000, name = "Price of Gasoline")) +
labs(title = "Price of Gasoline for All Grades, NYC")
## Warning: Removed 5 row(s) containing missing values (geom_path).
weekly %>%
ggplot(aes(x = Date)) +
geom_line(aes(y = Bridges_and_Tunnels), color = "steelblue") +
geom_line(aes(y = All_Grades * 1000000), color = "darkred") +
scale_y_continuous(name = "Traffic",
sec.axis = sec_axis(~./1000000, name = "Price of Gasoline")) +
labs(title = "Price of Gasoline for All Grades, NYC")
weekly %>%
ggplot(aes(x = Date)) +
geom_line(aes(y = `Access-A-Ride`), color = "steelblue") +
geom_line(aes(y = All_Grades * 50000), color = "darkred") +
scale_y_continuous(name = "Traffic",
sec.axis = sec_axis(~./50000, name = "Price of Gasoline")) +
labs(title = "Price of Gasoline for All Grades, NYC")
Two events worth noting in these plots are the return to workplaces starting in January 2021 and the Omicron wave at the end of 2021, which shows up as a dip in ridership.
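Those periods can be marked directly on a ridership plot with reference lines (a sketch using the daily mta2020 table; the two dates below are illustrative placeholders, not exact event dates):
# daily subway ridership with dashed markers near the return-to-work and Omicron periods
mta2020 %>%
  ggplot(aes(x = Date, y = Subway)) +
  geom_line(color = "steelblue") +
  geom_vline(xintercept = as.Date(c("2021-01-04", "2021-12-20")),
             linetype = "dashed", color = "darkred") +
  labs(title = "Daily Subway Ridership with Event Markers", y = "Count")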
weekly %>%
as_tsibble(index = Date) %>%
model(classical_decomposition(Subway, type = "additive")) %>%
components() %>%
autoplot() +
labs(title = "Classical Additive Decomposition of Subway Ridership")
## Warning: Removed 26 row(s) containing missing values (geom_path).
lambda <- weekly %>%
as_tsibble(index = Date) %>%
features(Subway, features = guerrero) %>%
pull(lambda_guerrero)
# STL decomposition applied to the Box-Cox transformed subway series
weekly %>%
as_tsibble(index = Date) %>%
model(STL(box_cox(Subway,lambda) ~ season(window = "periodic"), robust = TRUE)) %>%
components() %>%
autoplot() +
ggtitle("STL with Box-Cox")
weekly %>%
as_tsibble(index = Date) %>%
gg_tsdisplay(difference(Subway), plot_type='partial') +
labs(title = "Differenced Weekly Subway Ridership")
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).
model1<- lm(Subway ~ All_Grades, weekly)
summary(model1)
##
## Call:
## lm(formula = Subway ~ All_Grades, data = weekly)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6500633 -1706335 -250356 1491752 21217419
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3005632 1166011 -2.578 0.011 *
## All_Grades 5453226 354684 15.375 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3560000 on 143 degrees of freedom
## Multiple R-squared: 0.6231, Adjusted R-squared: 0.6204
## F-statistic: 236.4 on 1 and 143 DF, p-value: < 2.2e-16
model2 <- lm(Subway ~ log(All_Grades), weekly)
summary(model2)
##
## Call:
## lm(formula = Subway ~ log(All_Grades), data = weekly)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5842698 -1769687 -545468 1809551 21186135
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5960240 1213512 -4.912 2.44e-06 ***
## log(All_Grades) 18076735 1052659 17.172 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3313000 on 143 degrees of freedom
## Multiple R-squared: 0.6734, Adjusted R-squared: 0.6712
## F-statistic: 294.9 on 1 and 143 DF, p-value: < 2.2e-16
model2a<- lm(Subway ~ log(Regular) + log(Midgrade) + log(Premium), weekly)
summary(model2a)
##
## Call:
## lm(formula = Subway ~ log(Regular) + log(Midgrade) + log(Premium),
## data = weekly)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5460247 -2093682 -233210 1387776 22069845
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1324492 4466350 -0.297 0.76725
## log(Regular) 58415600 19912453 2.934 0.00391 **
## log(Midgrade) -129765537 42229867 -3.073 0.00255 **
## log(Premium) 85825572 26970956 3.182 0.00180 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3222000 on 141 degrees of freedom
## Multiple R-squared: 0.6954, Adjusted R-squared: 0.6889
## F-statistic: 107.3 on 3 and 141 DF, p-value: < 2.2e-16
model3 <- lm(MTA ~ log(All_Grades), weekly)
summary(model3)
##
## Call:
## lm(formula = MTA ~ log(All_Grades), data = weekly)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9147120 -2386471 -357666 2245694 27741690
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2658363 1624774 -1.636 0.104
## log(All_Grades) 21601961 1409408 15.327 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4436000 on 143 degrees of freedom
## Multiple R-squared: 0.6216, Adjusted R-squared: 0.619
## F-statistic: 234.9 on 1 and 143 DF, p-value: < 2.2e-16
model3a<- lm(MTA ~ log(Regular) + log(Midgrade) + log(Premium), weekly)
summary(model3a)
##
## Call:
## lm(formula = MTA ~ log(Regular) + log(Midgrade) + log(Premium),
## data = weekly)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7931138 -2764487 -282822 2081546 28740439
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3848972 6036809 0.638 0.52478
## log(Regular) 71764365 26914073 2.666 0.00856 **
## log(Midgrade) -151227525 57078741 -2.649 0.00898 **
## log(Premium) 96459509 36454488 2.646 0.00907 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4356000 on 141 degrees of freedom
## Multiple R-squared: 0.6403, Adjusted R-squared: 0.6327
## F-statistic: 83.68 on 3 and 141 DF, p-value: < 2.2e-16
model4<- lm(Buses ~ All_Grades, weekly)
summary(model4)
##
## Call:
## lm(formula = Buses ~ All_Grades, data = weekly)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3434353 -581950 45629 571264 6560564
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3883422 422815 9.185 4.32e-16 ***
## All_Grades 1061775 128614 8.256 9.14e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1291000 on 143 degrees of freedom
## Multiple R-squared: 0.3228, Adjusted R-squared: 0.318
## F-statistic: 68.15 on 1 and 143 DF, p-value: 9.136e-14
model5 <- lm(log(Buses) ~ log(All_Grades), weekly)
summary(model5)
##
## Call:
## lm(formula = log(Buses) ~ log(All_Grades), data = weekly)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.74008 -0.07283 0.01487 0.11235 0.72958
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.13670 0.07369 205.42 < 2e-16 ***
## log(All_Grades) 0.56439 0.06392 8.83 3.4e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2012 on 143 degrees of freedom
## Multiple R-squared: 0.3528, Adjusted R-squared: 0.3483
## F-statistic: 77.97 on 1 and 143 DF, p-value: 3.402e-15
model6<- lm(LIRR ~ All_Grades, weekly)
summary(model6)
##
## Call:
## lm(formula = LIRR ~ All_Grades, data = weekly)
##
## Residuals:
## Min 1Q Median 3Q Max
## -340309 -102596 -1990 107723 375142
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -300636 48838 -6.156 7.64e-09 ***
## All_Grades 311894 14734 21.168 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 145500 on 138 degrees of freedom
## (5 observations deleted due to missingness)
## Multiple R-squared: 0.7645, Adjusted R-squared: 0.7628
## F-statistic: 448.1 on 1 and 138 DF, p-value: < 2.2e-16
model6a<- lm(LIRR ~ Regular + Midgrade + Premium, weekly)
summary(model6a)
##
## Call:
## lm(formula = LIRR ~ Regular + Midgrade + Premium, data = weekly)
##
## Residuals:
## Min 1Q Median 3Q Max
## -330374 -103784 12247 105803 231467
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -264278 101162 -2.612 0.01000 *
## Regular 653681 346045 1.889 0.06102 .
## Midgrade -2036904 634659 -3.209 0.00166 **
## Premium 1630259 328994 4.955 2.11e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 130800 on 136 degrees of freedom
## (5 observations deleted due to missingness)
## Multiple R-squared: 0.8124, Adjusted R-squared: 0.8083
## F-statistic: 196.3 on 3 and 136 DF, p-value: < 2.2e-16
model7 <- lm(LIRR ~ log(All_Grades), weekly)
summary(model7)
##
## Call:
## lm(formula = LIRR ~ log(All_Grades), data = weekly)
##
## Residuals:
## Min 1Q Median 3Q Max
## -308005 -97987 -12208 104345 342203
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -450624 50553 -8.914 2.62e-15 ***
## log(All_Grades) 1016695 43526 23.358 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 134800 on 138 degrees of freedom
## (5 observations deleted due to missingness)
## Multiple R-squared: 0.7981, Adjusted R-squared: 0.7967
## F-statistic: 545.6 on 1 and 138 DF, p-value: < 2.2e-16
model8<- lm(MetroNorth ~ Regular + Midgrade + Premium, weekly)
summary(model8)
##
## Call:
## lm(formula = MetroNorth ~ Regular + Midgrade + Premium, data = weekly)
##
## Residuals:
## Min 1Q Median 3Q Max
## -308891 -82475 7096 86803 231638
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -395226 87398 -4.522 1.33e-05 ***
## Regular 272938 298882 0.913 0.36277
## Midgrade -1544542 548093 -2.818 0.00556 **
## Premium 1494163 284182 5.258 5.56e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 113000 on 135 degrees of freedom
## (6 observations deleted due to missingness)
## Multiple R-squared: 0.8362, Adjusted R-squared: 0.8326
## F-statistic: 229.7 on 3 and 135 DF, p-value: < 2.2e-16
model9 <- lm(MetroNorth ~ log(All_Grades), weekly)
summary(model9)
##
## Call:
## lm(formula = MetroNorth ~ log(All_Grades), data = weekly)
##
## Residuals:
## Min 1Q Median 3Q Max
## -219177 -83177 -18060 71372 347781
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -465334 46473 -10.01 <2e-16 ***
## log(All_Grades) 937441 40017 23.43 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 123900 on 137 degrees of freedom
## (6 observations deleted due to missingness)
## Multiple R-squared: 0.8002, Adjusted R-squared: 0.7988
## F-statistic: 548.8 on 1 and 137 DF, p-value: < 2.2e-16
model10<- lm(Bikes ~ All_Grades, weekly)
summary(model10)
##
## Call:
## lm(formula = Bikes ~ All_Grades, data = weekly)
##
## Residuals:
## Min 1Q Median 3Q Max
## -98947 -37539 9184 37377 82968
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 101684 15288 6.651 5.72e-10 ***
## All_Grades 11013 4650 2.368 0.0192 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 46670 on 143 degrees of freedom
## Multiple R-squared: 0.03774, Adjusted R-squared: 0.03101
## F-statistic: 5.608 on 1 and 143 DF, p-value: 0.01921
model11a<- lm(Bikes ~ log(Regular) + log(Midgrade) + log(Premium), weekly)
summary(model11a)
##
## Call:
## lm(formula = Bikes ~ log(Regular) + log(Midgrade) + log(Premium),
## data = weekly)
##
## Residuals:
## Min 1Q Median 3Q Max
## -94997 -36014 10448 32755 82895
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -97063 63046 -1.540 0.12591
## log(Regular) -878671 281079 -3.126 0.00215 **
## log(Midgrade) 1317391 596106 2.210 0.02872 *
## log(Premium) -316000 380715 -0.830 0.40793
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 45490 on 141 degrees of freedom
## Multiple R-squared: 0.0987, Adjusted R-squared: 0.07953
## F-statistic: 5.147 on 3 and 141 DF, p-value: 0.002087
“An alternative to using seasonal dummy variables, especially for long seasonal periods, is to use Fourier terms. If m is the seasonal period, then the first few Fourier terms are given by x1,t = sin(2πt/m), x2,t = cos(2πt/m), x3,t = sin(4πt/m), x4,t = cos(4πt/m), and so on. With Fourier terms, we often need fewer predictors than with dummy variables, especially when m is large. This makes them useful for weekly data, for example, where m is 52.” (Hyndman & Athanasopoulos, Forecasting: Principles and Practice)
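To make the terms concrete, the first harmonic pair for weekly data can be computed by hand; the fourier(K = 13) special in the models below generates 13 such sine/cosine pairs (26 regressors). A small sketch, assuming a seasonal period of 52 weeks:
# first Fourier harmonic for weekly data (seasonal period m = 52)
m <- 52
wk <- 1:104   # two years of weeks
fourier_pair <- data.frame(
  week = wk,
  S1_52 = sin(2 * pi * wk / m),   # matches the S1_52 column in the TSLM output below
  C1_52 = cos(2 * pi * wk / m))   # matches the C1_52 column in the TSLM output below
head(fourier_pair)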
weekly_ts <- weekly %>%
as_tsibble(index = Date)
model_subway <- weekly_ts %>%
model(TSLM(Subway ~ log(All_Grades) + trend() + fourier(K = 13)))
model_subway %>%
report()
## Series: Subway
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -6441451 -1083652 -139580 1210269 15878276
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8596036 2186556 -3.931 0.000144 ***
## log(All_Grades) 21970549 3192232 6.883 3.19e-10 ***
## trend() -24419 20431 -1.195 0.234448
## fourier(K = 13)C1_52 2435610 436703 5.577 1.62e-07 ***
## fourier(K = 13)S1_52 371308 333058 1.115 0.267221
## fourier(K = 13)C2_52 873333 320661 2.724 0.007457 **
## fourier(K = 13)S2_52 -888498 314427 -2.826 0.005556 **
## fourier(K = 13)C3_52 988170 324421 3.046 0.002872 **
## fourier(K = 13)S3_52 -146845 318396 -0.461 0.645517
## fourier(K = 13)C4_52 -257150 319913 -0.804 0.423150
## fourier(K = 13)S4_52 412583 318455 1.296 0.197694
## fourier(K = 13)C5_52 -417787 321647 -1.299 0.196554
## fourier(K = 13)S5_52 -567000 317463 -1.786 0.076707 .
## fourier(K = 13)C6_52 191356 316929 0.604 0.547167
## fourier(K = 13)S6_52 147405 319413 0.461 0.645314
## fourier(K = 13)C7_52 -628585 316465 -1.986 0.049361 *
## fourier(K = 13)S7_52 118517 316402 0.375 0.708659
## fourier(K = 13)C8_52 517017 317199 1.630 0.105826
## fourier(K = 13)S8_52 -278258 315925 -0.881 0.380262
## fourier(K = 13)C9_52 372167 315804 1.178 0.241019
## fourier(K = 13)S9_52 438178 317315 1.381 0.169967
## fourier(K = 13)C10_52 -275831 316698 -0.871 0.385576
## fourier(K = 13)S10_52 172021 317296 0.542 0.588757
## fourier(K = 13)C11_52 410817 317312 1.295 0.198003
## fourier(K = 13)S11_52 -59139 314933 -0.188 0.851376
## fourier(K = 13)C12_52 -244216 315173 -0.775 0.439997
## fourier(K = 13)S12_52 317385 315522 1.006 0.316555
## fourier(K = 13)C13_52 -514290 313260 -1.642 0.103353
## fourier(K = 13)S13_52 -287869 315513 -0.912 0.363458
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2659000 on 116 degrees of freedom
## Multiple R-squared: 0.8293, Adjusted R-squared: 0.7881
## F-statistic: 20.13 on 28 and 116 DF, p-value: < 2.22e-16
model_buses <- weekly_ts %>%
model(TSLM(Buses ~ log(All_Grades) + trend() + fourier(K = 13)))
model_buses %>%
report()
## Series: Buses
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -2555289 -611400 23943 628375 4566947
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1814916 904396 2.007 0.047100 *
## log(All_Grades) 5855225 1320359 4.435 2.11e-05 ***
## trend() -16099 8450 -1.905 0.059250 .
## fourier(K = 13)C1_52 725307 180627 4.015 0.000106 ***
## fourier(K = 13)S1_52 -360876 137758 -2.620 0.009979 **
## fourier(K = 13)C2_52 118382 132631 0.893 0.373937
## fourier(K = 13)S2_52 -349839 130052 -2.690 0.008200 **
## fourier(K = 13)C3_52 436003 134186 3.249 0.001514 **
## fourier(K = 13)S3_52 10802 131694 0.082 0.934772
## fourier(K = 13)C4_52 -38218 132321 -0.289 0.773226
## fourier(K = 13)S4_52 197964 131718 1.503 0.135573
## fourier(K = 13)C5_52 -52804 133038 -0.397 0.692167
## fourier(K = 13)S5_52 -238280 131308 -1.815 0.072159 .
## fourier(K = 13)C6_52 49594 131087 0.378 0.705881
## fourier(K = 13)S6_52 66475 132115 0.503 0.615806
## fourier(K = 13)C7_52 -288908 130895 -2.207 0.029271 *
## fourier(K = 13)S7_52 5136 130869 0.039 0.968760
## fourier(K = 13)C8_52 177146 131199 1.350 0.179577
## fourier(K = 13)S8_52 -88336 130672 -0.676 0.500377
## fourier(K = 13)C9_52 124004 130622 0.949 0.344422
## fourier(K = 13)S9_52 161555 131247 1.231 0.220840
## fourier(K = 13)C10_52 -57028 130992 -0.435 0.664111
## fourier(K = 13)S10_52 94051 131239 0.717 0.475039
## fourier(K = 13)C11_52 166945 131246 1.272 0.205917
## fourier(K = 13)S11_52 -5866 130262 -0.045 0.964160
## fourier(K = 13)C12_52 -133921 130361 -1.027 0.306411
## fourier(K = 13)S12_52 53579 130505 0.411 0.682158
## fourier(K = 13)C13_52 -169675 129569 -1.310 0.192942
## fourier(K = 13)S13_52 -163281 130501 -1.251 0.213385
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1100000 on 116 degrees of freedom
## Multiple R-squared: 0.601, Adjusted R-squared: 0.5047
## F-statistic: 6.241 on 28 and 116 DF, p-value: 6.4776e-13
model_LIRR <- weekly_ts %>%
model(TSLM(LIRR ~ log(All_Grades) + trend() + fourier(K = 13)))
model_LIRR %>%
report()
## Series: LIRR
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -262752 -33553 8992 37419 122348
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -205648.06 54901.56 -3.746 0.000287 ***
## log(All_Grades) 590505.35 81606.78 7.236 6.36e-11 ***
## trend() 3005.79 532.59 5.644 1.29e-07 ***
## fourier(K = 13)C1_52 96741.37 10805.16 8.953 9.11e-15 ***
## fourier(K = 13)S1_52 -73107.97 8411.17 -8.692 3.60e-14 ***
## fourier(K = 13)C2_52 38882.02 7898.89 4.922 2.99e-06 ***
## fourier(K = 13)S2_52 -13197.36 8001.25 -1.649 0.101890
## fourier(K = 13)C3_52 25910.97 8045.66 3.220 0.001679 **
## fourier(K = 13)S3_52 5174.01 7968.43 0.649 0.517477
## fourier(K = 13)C4_52 -13264.00 7991.57 -1.660 0.099788 .
## fourier(K = 13)S4_52 3037.37 7927.61 0.383 0.702350
## fourier(K = 13)C5_52 -9942.31 8019.91 -1.240 0.217700
## fourier(K = 13)S5_52 3264.50 7926.41 0.412 0.681243
## fourier(K = 13)C6_52 -7024.73 7953.88 -0.883 0.379046
## fourier(K = 13)S6_52 21.84 7938.63 0.003 0.997810
## fourier(K = 13)C7_52 -3398.20 7961.77 -0.427 0.670342
## fourier(K = 13)S7_52 -798.02 7900.94 -0.101 0.919730
## fourier(K = 13)C8_52 6486.69 7908.91 0.820 0.413874
## fourier(K = 13)S8_52 4946.19 7943.85 0.623 0.534795
## fourier(K = 13)C9_52 -981.28 7882.42 -0.124 0.901152
## fourier(K = 13)S9_52 -6457.10 7958.78 -0.811 0.418919
## fourier(K = 13)C10_52 -4023.83 7922.99 -0.508 0.612554
## fourier(K = 13)S10_52 6511.31 7933.81 0.821 0.413574
## fourier(K = 13)C11_52 1083.13 7934.94 0.137 0.891672
## fourier(K = 13)S11_52 4921.85 7899.55 0.623 0.534527
## fourier(K = 13)C12_52 -3151.81 7911.84 -0.398 0.691126
## fourier(K = 13)S12_52 -10000.04 7897.50 -1.266 0.208082
## fourier(K = 13)C13_52 -1406.23 7863.45 -0.179 0.858396
## fourier(K = 13)S13_52 -815.53 7857.71 -0.104 0.917525
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 65180 on 111 degrees of freedom
## Multiple R-squared: 0.962, Adjusted R-squared: 0.9524
## F-statistic: 100.4 on 28 and 111 DF, p-value: < 2.22e-16
model_metronorth <- weekly_ts %>%
model(TSLM(MetroNorth ~ log(All_Grades) + trend() + fourier(K = 13)))
model_metronorth %>%
report()
## Series: MetroNorth
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -229462 -33600 1265 28719 98358
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -120950.1 44942.6 -2.691 0.00823 **
## log(All_Grades) 349232.5 66846.6 5.224 8.36e-07 ***
## trend() 4115.9 436.2 9.436 7.67e-16 ***
## fourier(K = 13)C1_52 56954.9 8870.9 6.420 3.55e-09 ***
## fourier(K = 13)S1_52 -70309.6 6891.3 -10.203 < 2e-16 ***
## fourier(K = 13)C2_52 57165.4 6458.2 8.852 1.66e-14 ***
## fourier(K = 13)S2_52 -11997.7 6576.6 -1.824 0.07082 .
## fourier(K = 13)C3_52 27537.7 6622.1 4.158 6.37e-05 ***
## fourier(K = 13)S3_52 1502.9 6515.5 0.231 0.81801
## fourier(K = 13)C4_52 -1704.4 6550.1 -0.260 0.79518
## fourier(K = 13)S4_52 -6097.4 6498.5 -0.938 0.35016
## fourier(K = 13)C5_52 -10471.5 6553.2 -1.598 0.11293
## fourier(K = 13)S5_52 -687.8 6519.9 -0.105 0.91618
## fourier(K = 13)C6_52 -6583.2 6533.1 -1.008 0.31582
## fourier(K = 13)S6_52 -2618.9 6494.0 -0.403 0.68752
## fourier(K = 13)C7_52 -2573.9 6537.2 -0.394 0.69454
## fourier(K = 13)S7_52 237.2 6465.2 0.037 0.97080
## fourier(K = 13)C8_52 8729.0 6463.1 1.351 0.17960
## fourier(K = 13)S8_52 4662.0 6535.8 0.713 0.47717
## fourier(K = 13)C9_52 2192.6 6465.6 0.339 0.73517
## fourier(K = 13)S9_52 -2036.5 6522.3 -0.312 0.75545
## fourier(K = 13)C10_52 -2735.5 6512.7 -0.420 0.67529
## fourier(K = 13)S10_52 1689.3 6485.9 0.260 0.79499
## fourier(K = 13)C11_52 7651.7 6489.9 1.179 0.24093
## fourier(K = 13)S11_52 3168.5 6494.2 0.488 0.62660
## fourier(K = 13)C12_52 -7470.8 6479.9 -1.153 0.25144
## fourier(K = 13)S12_52 -9016.5 6481.3 -1.391 0.16698
## fourier(K = 13)C13_52 -6570.7 6472.8 -1.015 0.31227
## fourier(K = 13)S13_52 -4050.9 6420.7 -0.631 0.52941
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 53260 on 110 degrees of freedom
## Multiple R-squared: 0.9703, Adjusted R-squared: 0.9628
## F-statistic: 128.6 on 28 and 110 DF, p-value: < 2.22e-16
Bicycle ridership shows no clear trend but a strong seasonal component, so the bike model drops the trend() term and relies on the Fourier terms to capture seasonality.
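This can be checked numerically with STL-based features, which score trend and seasonal strength between 0 and 1 (a sketch, assuming the feasts package from the fable/fpp3 stack is loaded):
# STL-based strength of trend and seasonality for weekly bike counts
weekly_ts %>%
  features(Bikes, feat_stl)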
model_bikes <- weekly_ts %>%
model(TSLM(Bikes ~ All_Grades + fourier(K = 13)))
model_bikes %>%
report()
## Series: Bikes
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -49204 -13645 1114 14096 38322
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 111508.06 6862.47 16.249 < 2e-16 ***
## All_Grades 6383.19 2095.60 3.046 0.00287 **
## fourier(K = 13)C1_52 7239.23 2383.01 3.038 0.00294 **
## fourier(K = 13)S1_52 -59914.07 2559.95 -23.404 < 2e-16 ***
## fourier(K = 13)C2_52 12048.64 2481.47 4.855 3.75e-06 ***
## fourier(K = 13)S2_52 -3981.63 2445.33 -1.628 0.10616
## fourier(K = 13)C3_52 4480.24 2487.67 1.801 0.07428 .
## fourier(K = 13)S3_52 -5275.39 2447.11 -2.156 0.03315 *
## fourier(K = 13)C4_52 -4101.43 2451.48 -1.673 0.09699 .
## fourier(K = 13)S4_52 276.45 2475.79 0.112 0.91128
## fourier(K = 13)C5_52 2846.03 2460.11 1.157 0.24968
## fourier(K = 13)S5_52 -2104.75 2464.17 -0.854 0.39477
## fourier(K = 13)C6_52 -350.44 2463.83 -0.142 0.88714
## fourier(K = 13)S6_52 3320.21 2459.92 1.350 0.17971
## fourier(K = 13)C7_52 -4113.51 2461.63 -1.671 0.09738 .
## fourier(K = 13)S7_52 3068.13 2460.84 1.247 0.21497
## fourier(K = 13)C8_52 3914.60 2464.93 1.588 0.11496
## fourier(K = 13)S8_52 -1198.94 2456.97 -0.488 0.62648
## fourier(K = 13)C9_52 3520.93 2456.47 1.433 0.15443
## fourier(K = 13)S9_52 1727.70 2465.93 0.701 0.48493
## fourier(K = 13)C10_52 13.56 2456.08 0.006 0.99560
## fourier(K = 13)S10_52 2553.64 2465.11 1.036 0.30238
## fourier(K = 13)C11_52 2548.59 2468.03 1.033 0.30390
## fourier(K = 13)S11_52 3403.84 2448.34 1.390 0.16709
## fourier(K = 13)C12_52 -6508.25 2451.59 -2.655 0.00904 **
## fourier(K = 13)S12_52 -2220.57 2454.26 -0.905 0.36744
## fourier(K = 13)C13_52 -2792.50 2436.67 -1.146 0.25412
## fourier(K = 13)S13_52 310.19 2453.73 0.126 0.89962
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20690 on 117 degrees of freedom
## Multiple R-squared: 0.8453, Adjusted R-squared: 0.8096
## F-statistic: 23.68 on 27 and 117 DF, p-value: < 2.22e-16
weekly %>%
select(-c(All_Grades, Regular, Midgrade, Premium, Date)) %>%
map(~lm(.x ~ All_Grades, data = weekly)) %>%
map(glance) %>%
do.call(rbind.data.frame, .) %>%
rownames_to_column()
## # A tibble: 8 x 13
## rowname r.squared adj.r.squared sigma statistic p.value df logLik AIC
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Subway 0.623 0.620 3.56e6 236. 4.23e-32 1 -2392. 4790.
## 2 Buses 0.323 0.318 1.29e6 68.2 9.14e-14 1 -2245. 4496.
## 3 LIRR 0.765 0.763 1.46e5 448. 3.55e-45 1 -1862. 3730.
## 4 MetroN~ 0.772 0.770 1.32e5 464. 8.28e-46 1 -1835. 3677.
## 5 Access~ 0.376 0.372 2.32e4 85.7 2.96e-16 1 -1651. 3308.
## 6 Bridge~ 0.431 0.427 7.91e5 108. 3.12e-19 1 -2174. 4354.
## 7 MTA 0.575 0.572 4.70e6 193. 2.42e-28 1 -2432. 4871.
## 8 Bikes 0.0377 0.0310 4.67e4 5.61 1.92e- 2 1 -1764. 3533.
## # ... with 4 more variables: BIC <dbl>, deviance <dbl>, df.residual <int>,
## # nobs <int>
weekly %>%
select(-c(All_Grades, Regular, Midgrade, Premium, Date)) %>%
map(~lm(.x ~ All_Grades, data = weekly)) %>%
map(glance) %>%
do.call(rbind.data.frame, .) %>%
rownames_to_column %>%
as_tibble %>%
ggplot(aes(x = reorder(rowname, -r.squared), y = r.squared)) +
geom_col() +
coord_flip() +
scale_y_continuous(labels = scales::percent) +
labs(x = "Mode of Transportation",
y = expression( R^{2}),
title = expression(paste("Comparison of ", R^{2})))
weekly %>%
select(-c(All_Grades, Regular, Midgrade, Premium, Date)) %>%
map(~lm(.x ~ Regular + Midgrade + Premium, data = weekly)) %>%
map(glance) %>%
do.call(rbind.data.frame, .) %>%
rownames_to_column()
## # A tibble: 8 x 13
## rowname r.squared adj.r.squared sigma statistic p.value df logLik AIC
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Subway 0.643 0.635 3.49e6 84.6 2.28e-31 3 -2388. 4786.
## 2 Buses 0.329 0.315 1.29e6 23.0 3.44e-12 3 -2244. 4499.
## 3 LIRR 0.812 0.808 1.31e5 196. 3.18e-49 3 -1846. 3702.
## 4 MetroN~ 0.836 0.833 1.13e5 230. 7.89e-53 3 -1812. 3635.
## 5 Access~ 0.516 0.505 2.06e4 49.7 6.23e-22 3 -1633. 3275.
## 6 Bridge~ 0.478 0.467 7.63e5 43.1 8.08e-20 3 -2168. 4345.
## 7 MTA 0.591 0.582 4.64e6 67.9 3.13e-27 3 -2430. 4869.
## 8 Bikes 0.0908 0.0715 4.57e4 4.69 3.73e- 3 3 -1760. 3529.
## # ... with 4 more variables: BIC <dbl>, deviance <dbl>, df.residual <int>,
## # nobs <int>
weekly %>%
select(-c(All_Grades, Regular, Midgrade, Premium, Date)) %>%
map(~lm(.x ~ Regular + Midgrade + Premium, data = weekly)) %>%
map(glance) %>%
do.call(rbind.data.frame, .) %>%
rownames_to_column %>%
as_tibble %>%
ggplot(aes(x = reorder(rowname, -adj.r.squared), y = adj.r.squared)) +
geom_col() +
coord_flip() +
scale_y_continuous(labels = scales::percent) +
labs(x = "Mode of Transportation",
y = expression(paste("Adjusted ", R^{2})),
title = expression(paste("Comparison of Adjusted ", R^{2})))
weekly %>%
select(-c(All_Grades, Regular, Midgrade, Premium, Date)) %>%
map(~lm(.x ~ log(All_Grades), data = weekly)) %>%
map(glance) %>%
do.call(rbind.data.frame, .) %>%
rownames_to_column()
## # A tibble: 8 x 13
## rowname r.squared adj.r.squared sigma statistic p.value df logLik AIC
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Subway 0.673 0.671 3.31e6 295. 1.43e-36 1 -2382. 4769.
## 2 Buses 0.350 0.345 1.26e6 77.0 4.69e-15 1 -2242. 4490.
## 3 LIRR 0.798 0.797 1.35e5 546. 8.50e-50 1 -1851. 3708.
## 4 MetroN~ 0.800 0.799 1.24e5 549. 9.26e-50 1 -1826. 3659.
## 5 Access~ 0.414 0.409 2.25e4 100. 3.58e-18 1 -1646. 3299.
## 6 Bridge~ 0.466 0.463 7.66e5 125. 2.99e-21 1 -2169. 4345.
## 7 MTA 0.622 0.619 4.44e6 235. 5.59e-32 1 -2424. 4854.
## 8 Bikes 0.0277 0.0209 4.69e4 4.08 4.53e- 2 1 -1764. 3535.
## # ... with 4 more variables: BIC <dbl>, deviance <dbl>, df.residual <int>,
## # nobs <int>
weekly %>%
select(-c(All_Grades, Regular, Midgrade, Premium, Date)) %>%
map(~lm(.x ~ log(All_Grades), data = weekly)) %>%
map(glance) %>%
do.call(rbind.data.frame, .) %>%
rownames_to_column %>%
as_tibble %>%
ggplot(aes(x = reorder(rowname, -adj.r.squared), y = adj.r.squared)) +
geom_col() +
coord_flip() +
scale_y_continuous(labels = scales::percent) +
labs(x = "Mode of Transportation",
y = expression(paste("Adjusted ", R^{2})),
title = expression(paste("Comparison of Adjusted ", R^{2})))
model_aar <- weekly_ts %>% model(TSLM(`Access-A-Ride` ~ All_Grades + fourier(K = 13)))
model_bat <- weekly_ts %>% model(TSLM(Bridges_and_Tunnels ~ log(All_Grades) + fourier(K = 13)))
model_mta <- weekly_ts %>% model(TSLM(MTA ~ log(All_Grades) + trend() + fourier(K = 13)))
names <- c("Subway", "Buses", "LIRR", "MetroNorth", "Bikes", "Access-A-Ride",
"Bridges_and_Tunnels", "MTA")
trend_models <-
rbind(glance(model_subway),
glance(model_buses),
glance(model_LIRR),
glance(model_metronorth),
glance(model_bikes),
glance(model_aar),
glance(model_bat),
glance(model_mta)) %>%
select(.model:adj_r_squared) %>%
cbind(names, .)
trend_models %>% select(.model:adj_r_squared)
## .model r_squared
## 1 TSLM(Subway ~ log(All_Grades) + trend() + fourier(K = 13)) 0.8293352
## 2 TSLM(Buses ~ log(All_Grades) + trend() + fourier(K = 13)) 0.6010397
## 3 TSLM(LIRR ~ log(All_Grades) + trend() + fourier(K = 13)) 0.9620075
## 4 TSLM(MetroNorth ~ log(All_Grades) + trend() + fourier(K = 13)) 0.9703479
## 5 TSLM(Bikes ~ All_Grades + fourier(K = 13)) 0.8452933
## 6 TSLM(`Access-A-Ride` ~ All_Grades + fourier(K = 13)) 0.5976953
## 7 TSLM(Bridges_and_Tunnels ~ log(All_Grades) + fourier(K = 13)) 0.6789031
## 8 TSLM(MTA ~ log(All_Grades) + trend() + fourier(K = 13)) 0.7915920
## adj_r_squared
## 1 0.7881403
## 2 0.5047390
## 3 0.9524238
## 4 0.9628001
## 5 0.8095918
## 6 0.5040555
## 7 0.6048038
## 8 0.7412866
trend_models %>%
as_tibble %>%
ggplot(aes(x = reorder(names, -adj_r_squared), y = adj_r_squared)) +
geom_col() +
coord_flip() +
scale_y_continuous(labels = scales::percent) +
labs(x = "Mode of Transportation",
y = expression(paste("Adjusted ", R^{2})),
title = expression(paste("Comparison of Adjusted ", R^{2})))