This RMarkdown file contains the report of the data analysis done for the project on forecasting daily bike rental demand using time series models in R. It contains analysis such as data exploration, summary statistics and building the time series models. The final report was completed on Sun Sep 14 11:37:34 2025.
Data Description:
This dataset contains the daily count of rental bike transactions
between years 2011 and 2012 in Capital bikeshare system with the
corresponding weather and seasonal information.
Data Source: https://archive.ics.uci.edu/ml/datasets/bike+sharing+dataset
Relevant Paper: Fanaee-T, Hadi, and Gama, Joao (2013).
Event labeling combining ensemble detectors and background
knowledge. Progress in Artificial Intelligence.
pkgs <- c(
"readr","dplyr","ggplot2","lubridate","tsibble",
"fable","feasts","fabletools","tseries","slider",
"timetk","plotly","knitr","tidyr","scales"
)
to_install <- setdiff(pkgs, rownames(installed.packages()))
if (length(to_install)) install.packages(to_install, dependencies = TRUE)
invisible(lapply(pkgs, library, character.only = TRUE))
dir.create("data", showWarnings = FALSE)
csv_path <- "data/day.csv"
if (!file.exists(csv_path)) {
zip_url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip"
tf <- tempfile(fileext = ".zip")
download.file(zip_url, tf, mode = "wb", quiet = TRUE)
unzip(tf, files = "day.csv", exdir = "data", overwrite = TRUE)
unlink(tf)
}
raw <- readr::read_csv(csv_path, show_col_types = FALSE)
daily <- raw %>%
transmute(
date = as.Date(dteday),
rentals = cnt,
temp_cel = temp * 41,
feel_cel = atemp * 50,
humidity = hum,
season = factor(season, levels = 1:4, labels = c("Spring","Summer","Fall","Winter")),
weekday = factor(weekday, levels = 0:6, labels = c("Sun","Mon","Tue","Wed","Thu","Fri","Sat")),
working = as.logical(workingday),
holiday = as.logical(holiday)
) %>%
arrange(date)
# Define tsibble once here for reuse
daily_ts <- daily %>% as_tsibble(index = date)
knitr::kable(
daily %>% summarise(start=min(date), end=max(date), n_days=n(),
min=min(rentals), q25=quantile(rentals,.25), median=median(rentals),
mean=mean(rentals), q75=quantile(rentals,.75), max=max(rentals)),
caption = "Table 1. Coverage & rentals summary"
)
start | end | n_days | min | q25 | median | mean | q75 | max |
---|---|---|---|---|---|---|---|---|
2011-01-01 | 2012-12-31 | 731 | 22 | 3152 | 4548 | 4504.349 | 5956 | 8714 |
ggplot(daily, aes(date, rentals)) +
geom_line() +
scale_x_date(date_breaks = "3 months", date_labels = "%Y-%b", expand = c(0.01, 0.01)) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title="Daily bike rentals", x=NULL, y="Rentals")
Figure 3.1. Daily rentals time series with rotated labels.
daily_month <- daily %>%
mutate(month = floor_date(date, "month")) %>%
group_by(month) %>%
summarise(rentals = mean(rentals), .groups = "drop")
ggplot(daily_month, aes(month, rentals)) +
geom_line() +
scale_x_date(date_breaks = "3 months", date_labels = "%Y-%b", expand = c(0.01, 0.01)) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title="Monthly average rentals", x=NULL, y="Rentals")
Figure 3.2. Monthly average rentals (mean of daily totals by
month).
ggplot(daily, aes(season, rentals, fill = season)) +
geom_boxplot(alpha=.75, outlier.alpha=.3) +
theme(legend.position="none") +
labs(title="Rentals by season", x=NULL, y="Rentals")
Figure 3.3a. Rentals by season (boxplots).
ggplot(daily, aes(temp_cel, rentals, color = season)) +
geom_point(alpha=.5) +
geom_smooth(se = FALSE, method = "loess") +
labs(title="Rentals vs temperature", x="Temperature (°C)", y="Rentals", color="Season")
Figure 3.3b. Rentals vs. temperature with LOESS curves by
season.
daily %>%
timetk::plot_time_series(
.date_var = date, .value = rentals,
.smooth = FALSE, .interactive = TRUE,
.color_var = season,
.title = "Daily bike rentals (interactive)"
)
Figure 3.4. Interactive daily rentals (hover for details).
daily <- daily %>%
mutate(
rentals_ma7 = slider::slide_dbl(rentals, mean, .before = 6, .complete = TRUE),
rentals_ma30 = slider::slide_dbl(rentals, mean, .before = 29, .complete = TRUE)
)
daily %>%
select(date, rentals, rentals_ma7, rentals_ma30) %>%
pivot_longer(-date, names_to = "series", values_to = "value") %>%
ggplot(aes(date, value, color = series)) +
geom_line() +
scale_x_date(date_breaks="3 months", date_labels="%Y-%b", expand = c(0.01, 0.01)) +
theme(axis.text.x = element_text(angle=45, hjust=1)) +
labs(title="Observed vs rolling-mean smoothed rentals", x=NULL, y="Daily rentals", color=NULL)
Figure 3.5. Observed vs. 7-day and 30-day rolling means.
monthly_ts <- daily_ts %>%
index_by(month = yearmonth(date)) %>%
summarise(rentals = mean(rentals), .groups = "drop_last") %>%
as_tsibble(index = month)
autoplot(monthly_ts, rentals) +
labs(title = "Monthly average rentals", x = NULL, y = "Rentals")
Figure 3.6a. Monthly average rentals (tsibble plot).
gg_season(monthly_ts, rentals) +
labs(title = "Seasonality by month (monthly means)", y = "Rentals")
Figure 3.6b. Seasonality by month across years (monthly
means).
gg_subseries(monthly_ts, rentals) +
scale_x_yearmonth(
date_breaks = "1 year", # only 2011, 2012
labels = label_date("%Y")
) +
theme(
axis.text.x = element_text(angle = 90, vjust = 0.5, size = 8),
panel.grid.minor = element_blank()
) +
labs(title = "Monthly subseries (monthly means)", y = "Rentals")
Figure 3.6c. Monthly subseries (12 tidy panels, one per
month).
stl_fit <- daily_ts %>%
model(STL(rentals ~ season(window="periodic"), robust = TRUE))
components(stl_fit) %>%
autoplot() + labs(title="STL decomposition (trend + seasonality)", x=NULL)
Figure 4.1. STL decomposition into trend, seasonal and
remainder.
n <- nrow(daily_ts)
k_adf <- floor(12 * (n/100)^(1/4))
adf_level <- tseries::adf.test(daily_ts$rentals, k = k_adf)$p.value
daily_ts <- daily_ts %>%
mutate(diff1 = difference(rentals),
diff_s7 = difference(rentals, lag = 7))
adf_diff1 <- tseries::adf.test(na.omit(daily_ts$diff1), k = k_adf)$p.value
adf_s7 <- tseries::adf.test(na.omit(daily_ts$diff_s7), k = k_adf)$p.value
tibble::tibble(
Series = c("Level","First difference","Seasonal diff (lag 7)"),
`ADF p-value` = c(adf_level, adf_diff1, adf_s7)
) %>%
knitr::kable(caption = "Table 2. ADF stationarity checks")
Series | ADF p-value |
---|---|
Level | 0.9824264 |
First difference | 0.0100000 |
Seasonal diff (lag 7) | 0.0100000 |
set.seed(123)
cv_tbl <- daily_ts %>%
stretch_tsibble(.init = 365, .step = 30) %>%
model(
NAIVE = NAIVE(rentals),
SNAIVE = SNAIVE(rentals ~ lag("week")),
ARIMA = ARIMA(rentals)
) %>%
accuracy()
cv_tbl %>%
dplyr::filter(.type == "Test") %>%
dplyr::group_by(.model) %>%
dplyr::summarise(n = dplyr::n(), RMSE = mean(RMSE), MAE = mean(MAE), MAPE = mean(MAPE), .groups = "drop") %>%
dplyr::arrange(RMSE) %>%
knitr::kable(digits = 2, caption = "Table 3. Rolling-origin CV (Test only)")
.model | n | RMSE | MAE | MAPE |
---|
final_fit <- daily_ts %>% model(ARIMA = ARIMA(rentals))
fabletools::report(final_fit)
## Series: rentals
## Model: ARIMA(1,1,1)(1,0,2)[7]
##
## Coefficients:
## ar1 ma1 sar1 sma1 sma2
## 0.3612 -0.9005 0.9106 -0.9035 0.0545
## s.e. 0.0427 0.0205 0.0760 0.0865 0.0417
##
## sigma^2 estimated as 844004: log likelihood=-6014.88
## AIC=12041.75 AICc=12041.87 BIC=12069.31
fc_60 <- forecast(final_fit, h = "60 days")
autoplot(fc_60, daily_ts) +
labs(title = "60-day ARIMA forecast", x = NULL, y = "Daily rentals")
Figure 5.2. 60-day ARIMA forecast with 80/95% intervals (decline
into winter).
The forecast projects rentals declining into the winter months, with wider confidence intervals during seasonal transitions.
aug <- augment(final_fit)
feasts::gg_tsdisplay(aug, .innov, plot_type = "partial") +
labs(title = "Residual diagnostics (innovations ACF/PACF)")
Figure 5.3. Residual ACF/PACF (no significant
autocorrelation).
aug %>%
features(.innov, ljung_box, lag = 24, dof = 2) %>%
knitr::kable(caption = "Table 4. Ljung–Box test on residuals (lag 24)")
.model | lb_stat | lb_pvalue |
---|---|---|
ARIMA | 16.03538 | 0.814126 |
daily_ts <- daily_ts %>%
mutate(
temp_cel_s = as.numeric(scale(temp_cel)),
hum_s = as.numeric(scale(humidity))
)
fit_arimax <- daily_ts %>% model(
ARIMAX = ARIMA(rentals ~ temp_cel_s + hum_s + working + holiday)
)
fabletools::report(fit_arimax)
cv_compare <- daily_ts %>%
stretch_tsibble(.init = 365, .step = 30) %>%
model(
SNAIVE = SNAIVE(rentals ~ lag("week")),
ARIMA = ARIMA(rentals),
ARIMAX = ARIMA(rentals ~ temp_cel_s + hum_s + working + holiday)
) %>%
accuracy() %>%
dplyr::filter(.type == "Test") %>%
dplyr::group_by(.model) %>%
dplyr::summarise(RMSE = mean(RMSE), MAE = mean(MAE), MAPE = mean(MAPE), .groups = "drop") %>%
dplyr::arrange(RMSE)
knitr::kable(cv_compare, digits = 2, caption = "Table 5. CV (Test): SNAIVE vs ARIMA vs ARIMAX")
sessionInfo()
## R version 4.2.3 (2023-03-15 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 26100)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] scales_1.3.0 tidyr_1.3.1 knitr_1.49 plotly_4.10.4
## [5] timetk_2.9.1 slider_0.3.2 tseries_0.10-58 feasts_0.3.2
## [9] fable_0.3.4 fabletools_0.4.2 tsibble_1.1.4 lubridate_1.9.4
## [13] ggplot2_3.5.1 dplyr_1.1.4 readr_2.1.5
##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-167 xts_0.14.1 bit64_4.6.0-1
## [4] httr_1.4.7 tools_4.2.3 bslib_0.9.0
## [7] R6_2.6.1 rpart_4.1.24 ggdist_3.3.3
## [10] mgcv_1.9-1 lazyeval_0.2.2 colorspace_2.1-1
## [13] nnet_7.3-20 withr_3.0.2 tidyselect_1.2.1
## [16] bit_4.5.0.1 curl_6.2.1 compiler_4.2.3
## [19] progressr_0.15.1 cli_3.6.4 labeling_0.4.3
## [22] sass_0.4.9 quadprog_1.5-8 digest_0.6.37
## [25] rmarkdown_2.29 pkgconfig_2.0.3 htmltools_0.5.8.1
## [28] parallelly_1.42.0 fastmap_1.2.0 htmlwidgets_1.6.4
## [31] rlang_1.1.5 TTR_0.24.4 rstudioapi_0.17.1
## [34] quantmod_0.4.28 farver_2.1.2 jquerylib_0.1.4
## [37] generics_0.1.3 zoo_1.8-13 jsonlite_1.9.0
## [40] crosstalk_1.2.1 vroom_1.6.5 distributional_0.5.0
## [43] magrittr_2.0.3 Matrix_1.5-3 Rcpp_1.0.14
## [46] munsell_0.5.1 lifecycle_1.0.4 furrr_0.3.1
## [49] yaml_2.3.10 MASS_7.3-58.2 recipes_1.1.1
## [52] grid_4.2.3 parallel_4.2.3 listenv_0.9.1
## [55] forcats_1.0.0 crayon_1.5.3 lattice_0.22-6
## [58] splines_4.2.3 hms_1.1.3 anytime_0.3.12
## [61] pillar_1.10.1 future.apply_1.11.3 codetools_0.2-20
## [64] urca_1.3-4 glue_1.8.0 evaluate_1.0.3
## [67] rsample_1.3.1 data.table_1.17.0 vctrs_0.6.5
## [70] tzdb_0.4.0 gtable_0.3.6 purrr_1.0.4
## [73] future_1.34.0 cachem_1.1.0 xfun_0.51
## [76] gower_1.0.2 prodlim_2024.06.25 class_7.3-23
## [79] survival_3.8-3 viridisLite_0.4.2 timeDate_4041.110
## [82] tibble_3.3.0 hardhat_1.4.1 warp_0.2.1
## [85] lava_1.8.1 timechange_0.3.0 globals_0.16.3
## [88] ellipsis_0.3.2 ipred_0.9-15