About Data Analysis Report

This RMarkdown file contains the report of the data analysis done for the project on forecasting daily bike rental demand using time series models in R. It contains analysis such as data exploration, summary statistics and building the time series models. The final report was completed on Sun Sep 14 11:37:34 2025.

Data Description:
This dataset contains the daily count of rental bike transactions between years 2011 and 2012 in Capital bikeshare system with the corresponding weather and seasonal information.

Data Source: https://archive.ics.uci.edu/ml/datasets/bike+sharing+dataset
Relevant Paper: Fanaee-T, Hadi, and Gama, Joao (2013). Event labeling combining ensemble detectors and background knowledge. Progress in Artificial Intelligence.

1) Packages

pkgs <- c(
  "readr","dplyr","ggplot2","lubridate","tsibble",
  "fable","feasts","fabletools","tseries","slider",
  "timetk","plotly","knitr","tidyr","scales"
)
to_install <- setdiff(pkgs, rownames(installed.packages()))
if (length(to_install)) install.packages(to_install, dependencies = TRUE)
invisible(lapply(pkgs, library, character.only = TRUE))

2) Data (auto-download if missing) + Cleaning

dir.create("data", showWarnings = FALSE)
csv_path <- "data/day.csv"

if (!file.exists(csv_path)) {
  zip_url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip"
  tf <- tempfile(fileext = ".zip")
  download.file(zip_url, tf, mode = "wb", quiet = TRUE)
  unzip(tf, files = "day.csv", exdir = "data", overwrite = TRUE)
  unlink(tf)
}

raw <- readr::read_csv(csv_path, show_col_types = FALSE)

daily <- raw %>%
  transmute(
    date      = as.Date(dteday),
    rentals   = cnt,
    temp_cel  = temp  * 41,
    feel_cel  = atemp * 50,
    humidity  = hum,
    season    = factor(season, levels = 1:4, labels = c("Spring","Summer","Fall","Winter")),
    weekday   = factor(weekday, levels = 0:6, labels = c("Sun","Mon","Tue","Wed","Thu","Fri","Sat")),
    working   = as.logical(workingday),
    holiday   = as.logical(holiday)
  ) %>%
  arrange(date)

# Define tsibble once here for reuse
daily_ts <- daily %>% as_tsibble(index = date)

knitr::kable(
  daily %>% summarise(start=min(date), end=max(date), n_days=n(),
                      min=min(rentals), q25=quantile(rentals,.25), median=median(rentals),
                      mean=mean(rentals), q75=quantile(rentals,.75), max=max(rentals)),
  caption = "Table 1. Coverage & rentals summary"
)
Table 1. Coverage & rentals summary
start end n_days min q25 median mean q75 max
2011-01-01 2012-12-31 731 22 3152 4548 4504.349 5956 8714

3) Exploratory Visuals (Fixed Axes)

3.1 Time plot (daily) — fewer ticks, rotated labels

ggplot(daily, aes(date, rentals)) +
  geom_line() +
  scale_x_date(date_breaks = "3 months", date_labels = "%Y-%b", expand = c(0.01, 0.01)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title="Daily bike rentals", x=NULL, y="Rentals")

Figure 3.1. Daily rentals time series with rotated labels.

3.2 Optional: Monthly aggregation for readability

daily_month <- daily %>%
  mutate(month = floor_date(date, "month")) %>%
  group_by(month) %>%
  summarise(rentals = mean(rentals), .groups = "drop")

ggplot(daily_month, aes(month, rentals)) +
  geom_line() +
  scale_x_date(date_breaks = "3 months", date_labels = "%Y-%b", expand = c(0.01, 0.01)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title="Monthly average rentals", x=NULL, y="Rentals")

Figure 3.2. Monthly average rentals (mean of daily totals by month).

3.3 Seasonality & distribution

ggplot(daily, aes(season, rentals, fill = season)) +
  geom_boxplot(alpha=.75, outlier.alpha=.3) +
  theme(legend.position="none") +
  labs(title="Rentals by season", x=NULL, y="Rentals")

Figure 3.3a. Rentals by season (boxplots).

ggplot(daily, aes(temp_cel, rentals, color = season)) +
  geom_point(alpha=.5) +
  geom_smooth(se = FALSE, method = "loess") +
  labs(title="Rentals vs temperature", x="Temperature (°C)", y="Rentals", color="Season")

Figure 3.3b. Rentals vs. temperature with LOESS curves by season.

3.4 Interactive time-series (Plotly via timetk)

daily %>%
  timetk::plot_time_series(
    .date_var = date, .value = rentals,
    .smooth = FALSE, .interactive = TRUE,
    .color_var = season,
    .title = "Daily bike rentals (interactive)"
  )

Figure 3.4. Interactive daily rentals (hover for details).

3.5 Rolling means (7-day & 30-day)

daily <- daily %>%
  mutate(
    rentals_ma7  = slider::slide_dbl(rentals, mean, .before = 6,  .complete = TRUE),
    rentals_ma30 = slider::slide_dbl(rentals, mean, .before = 29, .complete = TRUE)
  )

daily %>%
  select(date, rentals, rentals_ma7, rentals_ma30) %>%
  pivot_longer(-date, names_to = "series", values_to = "value") %>%
  ggplot(aes(date, value, color = series)) +
  geom_line() +
  scale_x_date(date_breaks="3 months", date_labels="%Y-%b", expand = c(0.01, 0.01)) +
  theme(axis.text.x = element_text(angle=45, hjust=1)) +
  labs(title="Observed vs rolling-mean smoothed rentals", x=NULL, y="Daily rentals", color=NULL)

Figure 3.5. Observed vs. 7-day and 30-day rolling means.

3.6 Seasonality views (monthly aggregation for readability)

monthly_ts <- daily_ts %>%
  index_by(month = yearmonth(date)) %>%
  summarise(rentals = mean(rentals), .groups = "drop_last") %>%
  as_tsibble(index = month)

autoplot(monthly_ts, rentals) +
  labs(title = "Monthly average rentals", x = NULL, y = "Rentals")

Figure 3.6a. Monthly average rentals (tsibble plot).

gg_season(monthly_ts, rentals) +
  labs(title = "Seasonality by month (monthly means)", y = "Rentals")

Figure 3.6b. Seasonality by month across years (monthly means).

gg_subseries(monthly_ts, rentals) +
  scale_x_yearmonth(
    date_breaks = "1 year",      # only 2011, 2012
    labels = label_date("%Y")
  ) +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, size = 8),
    panel.grid.minor = element_blank()
  ) +
  labs(title = "Monthly subseries (monthly means)", y = "Rentals")

Figure 3.6c. Monthly subseries (12 tidy panels, one per month).


4) Decomposition & Stationarity

stl_fit <- daily_ts %>%
  model(STL(rentals ~ season(window="periodic"), robust = TRUE))

components(stl_fit) %>%
  autoplot() + labs(title="STL decomposition (trend + seasonality)", x=NULL)

Figure 4.1. STL decomposition into trend, seasonal and remainder.

n <- nrow(daily_ts)
k_adf <- floor(12 * (n/100)^(1/4))

adf_level  <- tseries::adf.test(daily_ts$rentals, k = k_adf)$p.value

daily_ts <- daily_ts %>%
  mutate(diff1  = difference(rentals),
         diff_s7 = difference(rentals, lag = 7))

adf_diff1 <- tseries::adf.test(na.omit(daily_ts$diff1),  k = k_adf)$p.value
adf_s7    <- tseries::adf.test(na.omit(daily_ts$diff_s7), k = k_adf)$p.value

tibble::tibble(
  Series = c("Level","First difference","Seasonal diff (lag 7)"),
  `ADF p-value` = c(adf_level, adf_diff1, adf_s7)
) %>%
  knitr::kable(caption = "Table 2. ADF stationarity checks")
Table 2. ADF stationarity checks
Series ADF p-value
Level 0.9824264
First difference 0.0100000
Seasonal diff (lag 7) 0.0100000

5) Modeling & Forecasting

5.1 Rolling-origin CV (SNAIVE vs ARIMA)

set.seed(123)
cv_tbl <- daily_ts %>%
  stretch_tsibble(.init = 365, .step = 30) %>%
  model(
    NAIVE  = NAIVE(rentals),
    SNAIVE = SNAIVE(rentals ~ lag("week")),
    ARIMA  = ARIMA(rentals)
  ) %>%
  accuracy()

cv_tbl %>%
  dplyr::filter(.type == "Test") %>%
  dplyr::group_by(.model) %>%
  dplyr::summarise(n = dplyr::n(), RMSE = mean(RMSE), MAE = mean(MAE), MAPE = mean(MAPE), .groups = "drop") %>%
  dplyr::arrange(RMSE) %>%
  knitr::kable(digits = 2, caption = "Table 3. Rolling-origin CV (Test only)")
Table 3. Rolling-origin CV (Test only)
.model n RMSE MAE MAPE

5.2 Final ARIMA fit & 60-day forecast

final_fit <- daily_ts %>% model(ARIMA = ARIMA(rentals))
fabletools::report(final_fit)
## Series: rentals 
## Model: ARIMA(1,1,1)(1,0,2)[7] 
## 
## Coefficients:
##          ar1      ma1    sar1     sma1    sma2
##       0.3612  -0.9005  0.9106  -0.9035  0.0545
## s.e.  0.0427   0.0205  0.0760   0.0865  0.0417
## 
## sigma^2 estimated as 844004:  log likelihood=-6014.88
## AIC=12041.75   AICc=12041.87   BIC=12069.31
fc_60 <- forecast(final_fit, h = "60 days")
autoplot(fc_60, daily_ts) +
  labs(title = "60-day ARIMA forecast", x = NULL, y = "Daily rentals")

Figure 5.2. 60-day ARIMA forecast with 80/95% intervals (decline into winter).

The forecast projects rentals declining into the winter months, with wider confidence intervals during seasonal transitions.

5.3 Diagnostics

aug <- augment(final_fit)

feasts::gg_tsdisplay(aug, .innov, plot_type = "partial") +
  labs(title = "Residual diagnostics (innovations ACF/PACF)")

Figure 5.3. Residual ACF/PACF (no significant autocorrelation).

aug %>%
  features(.innov, ljung_box, lag = 24, dof = 2) %>%
  knitr::kable(caption = "Table 4. Ljung–Box test on residuals (lag 24)")
Table 4. Ljung–Box test on residuals (lag 24)
.model lb_stat lb_pvalue
ARIMA 16.03538 0.814126

5.4 (Optional) ARIMAX — weather & calendar

daily_ts <- daily_ts %>%
  mutate(
    temp_cel_s = as.numeric(scale(temp_cel)),
    hum_s      = as.numeric(scale(humidity))
  )

fit_arimax <- daily_ts %>% model(
  ARIMAX = ARIMA(rentals ~ temp_cel_s + hum_s + working + holiday)
)
fabletools::report(fit_arimax)

cv_compare <- daily_ts %>%
  stretch_tsibble(.init = 365, .step = 30) %>%
  model(
    SNAIVE = SNAIVE(rentals ~ lag("week")),
    ARIMA  = ARIMA(rentals),
    ARIMAX = ARIMA(rentals ~ temp_cel_s + hum_s + working + holiday)
  ) %>%
  accuracy() %>%
  dplyr::filter(.type == "Test") %>%
  dplyr::group_by(.model) %>%
  dplyr::summarise(RMSE = mean(RMSE), MAE = mean(MAE), MAPE = mean(MAPE), .groups = "drop") %>%
  dplyr::arrange(RMSE)

knitr::kable(cv_compare, digits = 2, caption = "Table 5. CV (Test): SNAIVE vs ARIMA vs ARIMAX")

6) Findings and Conclusions

Findings

  • Seasonality: Clear weekly and annual cycles exist. Rentals peak in summer and decline in winter; weekday commuting patterns are strong.
  • Baseline vs. ARIMA: A seasonal naïve model (SNAIVE, weekly) performs reasonably, but ARIMA consistently achieves lower error across cross-validation folds.
  • Model diagnostics: ARIMA residuals appear uncorrelated; Ljung–Box tests confirm no significant autocorrelation. Forecast uncertainty increases near seasonal transitions (e.g., summer → fall).
  • Exogenous covariates: Adding weather (temperature, humidity) and calendar effects (working days, holidays) via ARIMAX yields small but measurable accuracy improvements, though seasonality explains most of the variation.

Recommendations

  • Plan for seasonality: Align staffing, bike supply, and maintenance schedules with strong summer demand and reduced winter demand.
  • Targeted rebalancing: Anticipate weekday commuter surges by ensuring bikes are stocked at downtown and transit-adjacent stations during peak hours.
  • Leverage weather forecasts: Adjust redistribution and staffing plans on days with extreme heat, humidity, or rain when ridership typically falls.
  • Marketing focus: Run membership promotions in late spring and early summer when casual riders are most active, emphasizing cost savings and convenience.
  • Ongoing updates: Refit ARIMA/ARIMAX models quarterly to capture changes in ridership patterns, infrastructure, or policy.

Limitations

  • Data coverage: The dataset spans only two calendar years (2011–2012), limiting long-term insights.
  • Exogenous variables: Only basic weather and calendar fields are included. Key drivers such as precipitation, special events, or pricing changes are absent.
  • Geographic detail: The daily aggregate removes station-level variation, which could provide richer insights into local demand and rebalancing needs.
  • Model scope: ARIMA/ARIMAX provides interpretable forecasts but may underperform compared to modern ML approaches for nonlinear effects.

Next Steps

  • Extend the dataset: Incorporate more recent years to improve robustness.
  • Enhance exogenous inputs: Add precipitation, wind, and event data to improve predictive power.
  • Granular analysis: Apply forecasting methods at station or neighborhood level to support rebalancing logistics.
  • Compare models: Benchmark ARIMA against Prophet, gradient boosting, and neural nets for accuracy and scalability.
  • Automate forecasting: Build a pipeline that updates forecasts weekly/monthly and integrates with dashboards.

Session Info

sessionInfo()
## R version 4.2.3 (2023-03-15 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 26100)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] scales_1.3.0     tidyr_1.3.1      knitr_1.49       plotly_4.10.4   
##  [5] timetk_2.9.1     slider_0.3.2     tseries_0.10-58  feasts_0.3.2    
##  [9] fable_0.3.4      fabletools_0.4.2 tsibble_1.1.4    lubridate_1.9.4 
## [13] ggplot2_3.5.1    dplyr_1.1.4      readr_2.1.5     
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-167         xts_0.14.1           bit64_4.6.0-1       
##  [4] httr_1.4.7           tools_4.2.3          bslib_0.9.0         
##  [7] R6_2.6.1             rpart_4.1.24         ggdist_3.3.3        
## [10] mgcv_1.9-1           lazyeval_0.2.2       colorspace_2.1-1    
## [13] nnet_7.3-20          withr_3.0.2          tidyselect_1.2.1    
## [16] bit_4.5.0.1          curl_6.2.1           compiler_4.2.3      
## [19] progressr_0.15.1     cli_3.6.4            labeling_0.4.3      
## [22] sass_0.4.9           quadprog_1.5-8       digest_0.6.37       
## [25] rmarkdown_2.29       pkgconfig_2.0.3      htmltools_0.5.8.1   
## [28] parallelly_1.42.0    fastmap_1.2.0        htmlwidgets_1.6.4   
## [31] rlang_1.1.5          TTR_0.24.4           rstudioapi_0.17.1   
## [34] quantmod_0.4.28      farver_2.1.2         jquerylib_0.1.4     
## [37] generics_0.1.3       zoo_1.8-13           jsonlite_1.9.0      
## [40] crosstalk_1.2.1      vroom_1.6.5          distributional_0.5.0
## [43] magrittr_2.0.3       Matrix_1.5-3         Rcpp_1.0.14         
## [46] munsell_0.5.1        lifecycle_1.0.4      furrr_0.3.1         
## [49] yaml_2.3.10          MASS_7.3-58.2        recipes_1.1.1       
## [52] grid_4.2.3           parallel_4.2.3       listenv_0.9.1       
## [55] forcats_1.0.0        crayon_1.5.3         lattice_0.22-6      
## [58] splines_4.2.3        hms_1.1.3            anytime_0.3.12      
## [61] pillar_1.10.1        future.apply_1.11.3  codetools_0.2-20    
## [64] urca_1.3-4           glue_1.8.0           evaluate_1.0.3      
## [67] rsample_1.3.1        data.table_1.17.0    vctrs_0.6.5         
## [70] tzdb_0.4.0           gtable_0.3.6         purrr_1.0.4         
## [73] future_1.34.0        cachem_1.1.0         xfun_0.51           
## [76] gower_1.0.2          prodlim_2024.06.25   class_7.3-23        
## [79] survival_3.8-3       viridisLite_0.4.2    timeDate_4041.110   
## [82] tibble_3.3.0         hardhat_1.4.1        warp_0.2.1          
## [85] lava_1.8.1           timechange_0.3.0     globals_0.16.3      
## [88] ellipsis_0.3.2       ipred_0.9-15