This RMarkdown file contains the report of the data analysis done for the project on forecasting daily bike rental demand using time series models in R. It contains analysis such as data exploration, summary statistics and building the time series models. The final report was completed on Thu Nov 6 14:27:39 2025.
Data Description:
This dataset contains the daily count of rental bike transactions between years 2011 and 2012 in Capital bikeshare system with the corresponding weather and seasonal information.
Data Source: https://archive.ics.uci.edu/ml/datasets/bike+sharing+dataset
Relevant Paper:
Fanaee-T, Hadi, and Gama, Joao, ‘Event labeling combining ensemble detectors and background knowledge’, Progress in Artificial Intelligence (2013): pp. 1-15, Springer Berlin Heidelberg
## Import required packages
#install if needed
library(timetk)
library(ggplot2)
library(stats)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ lubridate 1.9.4 ✔ tibble 3.3.0
## ✔ purrr 1.1.0 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tseries)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(dplyr)
library(forecast)
library(lubridate)
#Import data
data("bike_sharing_daily")
#visualize first 10 recordings
head(bike_sharing_daily, 10)
## # A tibble: 10 × 16
## instant dteday season yr mnth holiday weekday workingday weathersit
## <dbl> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 2011-01-01 1 0 1 0 6 0 2
## 2 2 2011-01-02 1 0 1 0 0 0 2
## 3 3 2011-01-03 1 0 1 0 1 1 1
## 4 4 2011-01-04 1 0 1 0 2 1 1
## 5 5 2011-01-05 1 0 1 0 3 1 1
## 6 6 2011-01-06 1 0 1 0 4 1 1
## 7 7 2011-01-07 1 0 1 0 5 1 2
## 8 8 2011-01-08 1 0 1 0 6 0 2
## 9 9 2011-01-09 1 0 1 0 0 0 1
## 10 10 2011-01-10 1 0 1 0 1 1 1
## # ℹ 7 more variables: temp <dbl>, atemp <dbl>, hum <dbl>, windspeed <dbl>,
## # casual <dbl>, registered <dbl>, cnt <dbl>
#check the class
class(bike_sharing_daily)
## [1] "spec_tbl_df" "tbl_df" "tbl" "data.frame"
#view a statistical summary of data
summary(bike_sharing_daily)
## instant dteday season yr
## Min. : 1.0 Min. :2011-01-01 Min. :1.000 Min. :0.0000
## 1st Qu.:183.5 1st Qu.:2011-07-02 1st Qu.:2.000 1st Qu.:0.0000
## Median :366.0 Median :2012-01-01 Median :3.000 Median :1.0000
## Mean :366.0 Mean :2012-01-01 Mean :2.497 Mean :0.5007
## 3rd Qu.:548.5 3rd Qu.:2012-07-01 3rd Qu.:3.000 3rd Qu.:1.0000
## Max. :731.0 Max. :2012-12-31 Max. :4.000 Max. :1.0000
## mnth holiday weekday workingday
## Min. : 1.00 Min. :0.00000 Min. :0.000 Min. :0.000
## 1st Qu.: 4.00 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.000
## Median : 7.00 Median :0.00000 Median :3.000 Median :1.000
## Mean : 6.52 Mean :0.02873 Mean :2.997 Mean :0.684
## 3rd Qu.:10.00 3rd Qu.:0.00000 3rd Qu.:5.000 3rd Qu.:1.000
## Max. :12.00 Max. :1.00000 Max. :6.000 Max. :1.000
## weathersit temp atemp hum
## Min. :1.000 Min. :0.05913 Min. :0.07907 Min. :0.0000
## 1st Qu.:1.000 1st Qu.:0.33708 1st Qu.:0.33784 1st Qu.:0.5200
## Median :1.000 Median :0.49833 Median :0.48673 Median :0.6267
## Mean :1.395 Mean :0.49538 Mean :0.47435 Mean :0.6279
## 3rd Qu.:2.000 3rd Qu.:0.65542 3rd Qu.:0.60860 3rd Qu.:0.7302
## Max. :3.000 Max. :0.86167 Max. :0.84090 Max. :0.9725
## windspeed casual registered cnt
## Min. :0.02239 Min. : 2.0 Min. : 20 Min. : 22
## 1st Qu.:0.13495 1st Qu.: 315.5 1st Qu.:2497 1st Qu.:3152
## Median :0.18097 Median : 713.0 Median :3662 Median :4548
## Mean :0.19049 Mean : 848.2 Mean :3656 Mean :4504
## 3rd Qu.:0.23321 3rd Qu.:1096.0 3rd Qu.:4776 3rd Qu.:5956
## Max. :0.50746 Max. :3410.0 Max. :6946 Max. :8714
#visualization - boxplot just to see how is the temperature related to seasons
boxplot(subset(bike_sharing_daily, season == 1)$temp,
subset(bike_sharing_daily, season == 2)$temp,
subset(bike_sharing_daily, season == 3)$temp,
subset(bike_sharing_daily, season == 4)$temp,
col = c("lightgreen", "pink", "orange", "lightblue"),
xlab = "Spring - Summer -Fall - Winter",
ylab = "Temperature")
#we can see that the temperature in higher in the summer and autumn especially
#next we'll take a look at the mean of number of rented bikes for each season
tapply(bike_sharing_daily$cnt, bike_sharing_daily$season, mean)
## 1 2 3 4
## 2604.133 4992.332 5644.303 4728.163
#again, we can see that the mean of rented bikes is higher for the season with higher temperature
#next, we will check to see if the number of rented bikes is correlated to the temperature across the year
cor(bike_sharing_daily$temp, bike_sharing_daily$cnt)
## [1] 0.627494
#we can see that the correlation between temperature and number of rented bike is strong, positive (0,627494)
#a plot to see the relationship between the number of bikes rented and temperature
ggplot(bike_sharing_daily, aes(x = temp, y = cnt)) +
geom_point(alpha = 0.5, color = "steelblue") + #points
geom_smooth(method = "lm", se = FALSE, color = "red") + #linear regression line
labs(
title = "Correlation between temperature and number of bycicles rented",
x = "Temperature (temp)",
y = "Number of bycicles (cnt)"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
#the plot shows us the same relationship, but we can also see some outliers
#next we'll take a look at the mean of number of rented bikes for each season, starting with spring
tapply(bike_sharing_daily$cnt, bike_sharing_daily$season, mean)
## 1 2 3 4
## 2604.133 4992.332 5644.303 4728.163
#again, we cand see that the mean of rented bikes is higher for the season with higher temperature
#now, we'll take a look at the outliers for each season
ggplot(bike_sharing_daily, aes(x = factor(season), y = cnt)) +
geom_boxplot(fill = "grey", alpha = 0.6, outlier.color = "red") +
labs(
title = "Boxplot -number of rented bycicles for each season, starting with spring",
x = "Season",
y = "Number of bycicles (cnt)"
) +
theme_minimal()
#we can see two clear outliers in the spring and in the winter, 2 days with very high / very low numbers
#so the relationship between season (and temperature respectively) is simpler in summer and autumn
##next step is to analyze the impact of holiday and temperature on the number of rented bycicles,
#and to analyze the outliers in order to decide what kind of smoothing we'll choose
bike_sharing_daily <- bike_sharing_daily %>%
mutate(
holiday_flag = if_else(
dteday %in% as.Date(c("2011-12-25", "2012-12-25", # Christmas
"2011-01-01", "2012-01-01")), 1, 0
),
weekend_flag = if_else(wday(dteday) %in% c(1,7), 1, 0) # 1=Sunday, 7=Saturday
)
ggplot(bike_sharing_daily, aes(x = temp, y = cnt, color = factor(holiday_flag))) +
geom_point(alpha = 0.6) +
geom_smooth(method = "lm") +
labs(color = "Holiday", title = "The Effect of Temperature and Holidays on Rented Bykes")
## `geom_smooth()` using formula = 'y ~ x'
model <- lm(cnt ~ temp + holiday_flag + weekend_flag, data = bike_sharing_daily)
summary(model)
##
## Call:
## lm(formula = cnt ~ temp + holiday_flag + weekend_flag, data = bike_sharing_daily)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4601.7 -1131.4 -92.1 1037.6 3742.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1262.33 166.33 7.589 9.88e-14 ***
## temp 6578.67 304.99 21.570 < 2e-16 ***
## holiday_flag -2092.05 757.73 -2.761 0.00591 **
## weekend_flag -19.15 123.34 -0.155 0.87665
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1503 on 727 degrees of freedom
## Multiple R-squared: 0.4001, Adjusted R-squared: 0.3977
## F-statistic: 161.7 on 3 and 727 DF, p-value: < 2.2e-16
# this is a statistically significant model
# temperature variations explain best the fluctuations in the number of rented bikes
# the number of rented bykes drops on holidays
# next step, convert data to time series
ts_data <- ts(bike_sharing_daily$cnt, start = c(2011,1), frequency = 7)
head(ts_data)
## Time Series:
## Start = c(2011, 1)
## End = c(2011, 6)
## Frequency = 7
## [1] 985 801 1349 1562 1600 1606
#plot time series
## Read about the timetk package
# ?timetk
bike_sharing_daily %>%
plot_time_series(dteday, cnt,
.interactive = TRUE,
.plotly_slider = TRUE)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the timetk package.
## Please report the issue at
## <https://github.com/business-science/timetk/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Ignoring unknown labels:
## • colour : "Legend"
bike_sharing_daily%>%
plot_seasonal_diagnostics(dteday, cnt,
.interactive = FALSE)
# upward trend with yearly seasonality
# we can see from the plot that the number of rented bikes goes up in warm seasons and back down when it gets cold outside
# we can also see that the number of rented bikes was higher in 2012 compared to 2011
# we can see a few outliers
#we'll do that with Holt Winters because TS has trend (upward) and seasonality - it seems the most appropriate choice
#just to be sure we have time series - check
class(ts_data)
## [1] "ts"
# check for missing values
any(is.na(bike_sharing_daily$cnt))
## [1] FALSE
#no missing values, so no tsclean() needed
#because we have outliers (we've seen it a few good times) we'll use tsoutliers(), smooth with Holt Winters and SMA(?!?) and compare the results - choose the best one
#question - what kind of seasonality
#TS doesn't have missing values
tsoutliers(ts_data)
## $index
## [1] 478 500 646 668 669
##
## $replacements
## [1] 4642.478 5439.664 6121.828 4966.355 5480.664
outliers <- tsoutliers(ts_data)
ts_data_clean <- ts_data
ts_data_clean[outliers$index] <- outliers$replacements
par(mfrow = c(2, 1)) # 2 plots in the same window, for comparison
plot(ts_data, main = "Original Time Series", col = "blue")
points(time(ts_data)[outliers$index], ts_data[outliers$index], col = "red", pch = 19)
plot(ts_data_clean, main = " Cleaned Time Series - ouliers replaced", col = "darkgreen")
#1Holt Winters smoothing
holt_winters_model <- HoltWinters(ts_data_clean)
ts_Holt_Winters_smooth <- holt_winters_model$fitted[,1]
plot(ts_data_clean, main = " Holt-Winters Smoothing", col = "gray", lwd = 1.5)
lines(ts_Holt_Winters_smooth, col = "blue", lwd = 2)
legend("topleft",
legend = c("TS without outliers", " Holt-Winters Smoothed"),
col = c("gray", "blue"), lwd = 2)
#2. ETS Model (Exponential Smoothing)
ets_model <- ets(ts_data_clean)
plot(fitted(ets_model), type = "l", col = "darkgreen", lwd = 2,
main = "Model ETS (Exponential Smoothing) pe seria curățată",
ylab = "Valori netezite", xlab = "Timp")
lines(ts_data_clean, col = "gray")
legend("topleft", legend = c("Seria originală", "ETS netezit"),
col = c("gray", "darkgreen"), lwd = 2)
#check which smoothing has better results
accuracy(holt_winters_model$fitted[,1], ts_data_clean)
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set 16.08671 887.308 663.5352 -6.363985 21.50976 0.1620917 0.6851779
accuracy(fitted(ets_model), ts_data_clean)
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set 3.784018 870.3516 645.5342 -7.092351 21.04007 0.1544508 0.6578006
mean((ts_data_clean - holt_winters_model$fitted[,1])^2)
## [1] 787315.5
mean((ts_data_clean - fitted(ets_model))^2)
## [1] 757512
#ets is a little bit better so we keep it
#select / extract smoothed series
ts_ets_smooth <- fitted(ets_model)
#check to see if it is additive or multiplicative
plot(ts_ets_smooth)
plot(log(ts_ets_smooth))
#decompose
model <- decompose(ts_ets_smooth, type = "additive")
plot(model)
#check for stationarity, ADF on smoothed series
adf.test(ts_ets_smooth, alternative = "stationary")
##
## Augmented Dickey-Fuller Test
##
## data: ts_ets_smooth
## Dickey-Fuller = -0.45654, Lag order = 9, p-value = 0.9838
## alternative hypothesis: stationary
#of course, it is not stationary - we saw it in the plot
#differencing the (ETS) smoothed TS
ts_diff <- diff(ts_ets_smooth, differences = 1)
#ADF test for stationarity
adf.test(ts_diff, alternative = "stationary")
## Warning in adf.test(ts_diff, alternative = "stationary"): p-value smaller than
## printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: ts_diff
## Dickey-Fuller = -10.794, Lag order = 8, p-value = 0.01
## alternative hypothesis: stationary
#TS is stationary now, appropriate for ARIMA modeling
#visualization
par(mfrow = c(1, 1)) #one plot
plot(ts_diff, main = "ETS differenced", col = "pink")
fit_arima <- auto.arima(ts_diff)
#Model Summary
summary(fit_arima)
## Series: ts_diff
## ARIMA(2,0,4) with zero mean
##
## Coefficients:
## ar1 ar2 ma1 ma2 ma3 ma4
## 1.2342 -0.9787 -1.1144 0.7474 0.1656 -0.0805
## s.e. 0.0163 0.0169 0.0406 0.0564 0.0554 0.0391
##
## sigma^2 = 69070: log likelihood = -5100.46
## AIC=10214.92 AICc=10215.08 BIC=10247.07
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.8126997 261.7294 192.7137 113.1563 239.6489 0.730611 0.002112228
#Residuals Diagnostics
checkresiduals(fit_arima)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,0,4) with zero mean
## Q* = 30.529, df = 8, p-value = 0.0001703
##
## Model df: 6. Total lags used: 14
#Check to see if we can find a better model
fit_arima_seasonal <- auto.arima(ts_diff, seasonal = TRUE)
#This model's summary
summary(fit_arima_seasonal)
## Series: ts_diff
## ARIMA(2,0,4) with zero mean
##
## Coefficients:
## ar1 ar2 ma1 ma2 ma3 ma4
## 1.2342 -0.9787 -1.1144 0.7474 0.1656 -0.0805
## s.e. 0.0163 0.0169 0.0406 0.0564 0.0554 0.0391
##
## sigma^2 = 69070: log likelihood = -5100.46
## AIC=10214.92 AICc=10215.08 BIC=10247.07
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.8126997 261.7294 192.7137 113.1563 239.6489 0.730611 0.002112228
# ... and residuals
checkresiduals(fit_arima_seasonal)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,0,4) with zero mean
## Q* = 30.529, df = 8, p-value = 0.0001703
##
## Model df: 6. Total lags used: 14
# ... keep looking
fit_arima_seasonal_adjust <- auto.arima(ts_diff, seasonal = TRUE, stepwise = FALSE, approximation = FALSE)
#Summary for the third model
summary(fit_arima_seasonal_adjust)
## Series: ts_diff
## ARIMA(2,0,1)(2,0,0)[7] with zero mean
##
## Coefficients:
## ar1 ar2 ma1 sar1 sar2
## 1.0324 -0.2220 -0.9062 0.1419 0.1732
## s.e. 0.0471 0.0372 0.0343 0.0383 0.0383
##
## sigma^2 = 68954: log likelihood = -5100.21
## AIC=10212.41 AICc=10212.53 BIC=10239.97
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 1.29684 261.691 192.2662 82.75053 215.7618 0.7289143 -0.006216619
#Residuals for the third model
checkresiduals(fit_arima_seasonal_adjust)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,0,1)(2,0,0)[7] with zero mean
## Q* = 13.961, df = 9, p-value = 0.1237
##
## Model df: 5. Total lags used: 14
# this model, the third is the best for forecast
# visualizing forecast
forecast_arima <- forecast(fit_arima_seasonal_adjust, h = 12)
plot(forecast_arima, main = " ARIMA Forecast")
#First look at the data - plot - it has a trend, upward, seasonality - because of different temperatures in different seasons and a few outliers
#Time series has a few outliers, we replaced that
#The smoothing with best numbers for this TS is ets
#Time series is not stationary so it needed differencing.
#Best ARIMA I found is ARIMA(2,0,1)(2,0,0)[7] with 0 mean.