About Data Analysis Report

This RMarkdown file contains the report of the data analysis done for the project on forecasting daily bike rental demand using time series models in R. It contains analysis such as data exploration, summary statistics and building the time series models. The final report was completed on Thu Nov 6 14:27:39 2025.

Data Description:

This dataset contains the daily count of rental bike transactions between years 2011 and 2012 in Capital bikeshare system with the corresponding weather and seasonal information.

Data Source: https://archive.ics.uci.edu/ml/datasets/bike+sharing+dataset

Relevant Paper:

Fanaee-T, Hadi, and Gama, Joao, ‘Event labeling combining ensemble detectors and background knowledge’, Progress in Artificial Intelligence (2013): pp. 1-15, Springer Berlin Heidelberg

Task One: Load and explore the data

Load data and install packages

## Import required packages
#install if needed 

library(timetk)
library(ggplot2)
library(stats)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ lubridate 1.9.4     ✔ tibble    3.3.0
## ✔ purrr     1.1.0     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tseries)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(dplyr)
library(forecast)
library(lubridate)

Describe and explore the data

#Import data 

data("bike_sharing_daily") 

#visualize first 10 recordings 

head(bike_sharing_daily, 10)
## # A tibble: 10 × 16
##    instant dteday     season    yr  mnth holiday weekday workingday weathersit
##      <dbl> <date>      <dbl> <dbl> <dbl>   <dbl>   <dbl>      <dbl>      <dbl>
##  1       1 2011-01-01      1     0     1       0       6          0          2
##  2       2 2011-01-02      1     0     1       0       0          0          2
##  3       3 2011-01-03      1     0     1       0       1          1          1
##  4       4 2011-01-04      1     0     1       0       2          1          1
##  5       5 2011-01-05      1     0     1       0       3          1          1
##  6       6 2011-01-06      1     0     1       0       4          1          1
##  7       7 2011-01-07      1     0     1       0       5          1          2
##  8       8 2011-01-08      1     0     1       0       6          0          2
##  9       9 2011-01-09      1     0     1       0       0          0          1
## 10      10 2011-01-10      1     0     1       0       1          1          1
## # ℹ 7 more variables: temp <dbl>, atemp <dbl>, hum <dbl>, windspeed <dbl>,
## #   casual <dbl>, registered <dbl>, cnt <dbl>
#check the class 

class(bike_sharing_daily)
## [1] "spec_tbl_df" "tbl_df"      "tbl"         "data.frame"
#view a statistical summary of data

summary(bike_sharing_daily)
##     instant          dteday               season            yr        
##  Min.   :  1.0   Min.   :2011-01-01   Min.   :1.000   Min.   :0.0000  
##  1st Qu.:183.5   1st Qu.:2011-07-02   1st Qu.:2.000   1st Qu.:0.0000  
##  Median :366.0   Median :2012-01-01   Median :3.000   Median :1.0000  
##  Mean   :366.0   Mean   :2012-01-01   Mean   :2.497   Mean   :0.5007  
##  3rd Qu.:548.5   3rd Qu.:2012-07-01   3rd Qu.:3.000   3rd Qu.:1.0000  
##  Max.   :731.0   Max.   :2012-12-31   Max.   :4.000   Max.   :1.0000  
##       mnth          holiday           weekday        workingday   
##  Min.   : 1.00   Min.   :0.00000   Min.   :0.000   Min.   :0.000  
##  1st Qu.: 4.00   1st Qu.:0.00000   1st Qu.:1.000   1st Qu.:0.000  
##  Median : 7.00   Median :0.00000   Median :3.000   Median :1.000  
##  Mean   : 6.52   Mean   :0.02873   Mean   :2.997   Mean   :0.684  
##  3rd Qu.:10.00   3rd Qu.:0.00000   3rd Qu.:5.000   3rd Qu.:1.000  
##  Max.   :12.00   Max.   :1.00000   Max.   :6.000   Max.   :1.000  
##    weathersit         temp             atemp              hum        
##  Min.   :1.000   Min.   :0.05913   Min.   :0.07907   Min.   :0.0000  
##  1st Qu.:1.000   1st Qu.:0.33708   1st Qu.:0.33784   1st Qu.:0.5200  
##  Median :1.000   Median :0.49833   Median :0.48673   Median :0.6267  
##  Mean   :1.395   Mean   :0.49538   Mean   :0.47435   Mean   :0.6279  
##  3rd Qu.:2.000   3rd Qu.:0.65542   3rd Qu.:0.60860   3rd Qu.:0.7302  
##  Max.   :3.000   Max.   :0.86167   Max.   :0.84090   Max.   :0.9725  
##    windspeed           casual         registered        cnt      
##  Min.   :0.02239   Min.   :   2.0   Min.   :  20   Min.   :  22  
##  1st Qu.:0.13495   1st Qu.: 315.5   1st Qu.:2497   1st Qu.:3152  
##  Median :0.18097   Median : 713.0   Median :3662   Median :4548  
##  Mean   :0.19049   Mean   : 848.2   Mean   :3656   Mean   :4504  
##  3rd Qu.:0.23321   3rd Qu.:1096.0   3rd Qu.:4776   3rd Qu.:5956  
##  Max.   :0.50746   Max.   :3410.0   Max.   :6946   Max.   :8714
#visualization - boxplot just to see how is the temperature related to seasons 

boxplot(subset(bike_sharing_daily, season == 1)$temp,
        subset(bike_sharing_daily, season == 2)$temp,
        subset(bike_sharing_daily, season == 3)$temp,
        subset(bike_sharing_daily, season == 4)$temp,
        col = c("lightgreen", "pink", "orange", "lightblue"),
        xlab = "Spring - Summer -Fall - Winter",
        ylab = "Temperature")

#we can see that the temperature in higher in the summer and autumn especially 
#next we'll take a look at the mean of number of rented bikes for each season 

tapply(bike_sharing_daily$cnt, bike_sharing_daily$season, mean)
##        1        2        3        4 
## 2604.133 4992.332 5644.303 4728.163
#again, we can see that the mean of rented bikes is higher for the season with higher temperature 
#next, we will check to see if the number of rented bikes is correlated to the temperature across the year

cor(bike_sharing_daily$temp, bike_sharing_daily$cnt)
## [1] 0.627494
#we can see that the correlation between temperature and number of rented bike is strong, positive (0,627494)
#a plot to see the relationship between the number of bikes rented and temperature 

ggplot(bike_sharing_daily, aes(x = temp, y = cnt)) +
  geom_point(alpha = 0.5, color = "steelblue") +            #points
  geom_smooth(method = "lm", se = FALSE, color = "red") +   #linear regression line 
  labs(
    title = "Correlation between temperature and number of bycicles rented",
    x = "Temperature (temp)",
    y = "Number of bycicles (cnt)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

#the plot shows us the same relationship, but we can also see some outliers 
#next we'll take a look at the mean of number of rented bikes for each season, starting with spring  

tapply(bike_sharing_daily$cnt, bike_sharing_daily$season, mean)
##        1        2        3        4 
## 2604.133 4992.332 5644.303 4728.163
#again, we cand see that the mean of rented bikes is higher for the season with higher temperature 
#now, we'll take a look at the outliers for each season

ggplot(bike_sharing_daily, aes(x = factor(season), y = cnt)) +
  geom_boxplot(fill = "grey", alpha = 0.6, outlier.color = "red") +
  labs(
    title = "Boxplot -number of rented bycicles for each season, starting with spring",
    x = "Season",
    y = "Number of bycicles (cnt)"
  ) +
  theme_minimal()

#we can see two clear outliers in the spring and in the winter, 2 days with very high / very low numbers
#so the relationship between season (and temperature respectively) is simpler in summer and autumn  
##next step is to analyze the impact of holiday and temperature on the number of rented bycicles, 
#and to analyze the outliers in order to decide what kind of smoothing we'll choose 

bike_sharing_daily <- bike_sharing_daily %>%
  mutate(
    holiday_flag = if_else(
      dteday %in% as.Date(c("2011-12-25", "2012-12-25",  # Christmas 
                           "2011-01-01", "2012-01-01")), 1, 0
    ),
    weekend_flag = if_else(wday(dteday) %in% c(1,7), 1, 0) # 1=Sunday, 7=Saturday
  )

ggplot(bike_sharing_daily, aes(x = temp, y = cnt, color = factor(holiday_flag))) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm") +
  labs(color = "Holiday", title = "The Effect of Temperature and Holidays on Rented Bykes")
## `geom_smooth()` using formula = 'y ~ x'

model <- lm(cnt ~ temp + holiday_flag + weekend_flag, data = bike_sharing_daily)

summary(model)
## 
## Call:
## lm(formula = cnt ~ temp + holiday_flag + weekend_flag, data = bike_sharing_daily)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4601.7 -1131.4   -92.1  1037.6  3742.9 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1262.33     166.33   7.589 9.88e-14 ***
## temp          6578.67     304.99  21.570  < 2e-16 ***
## holiday_flag -2092.05     757.73  -2.761  0.00591 ** 
## weekend_flag   -19.15     123.34  -0.155  0.87665    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1503 on 727 degrees of freedom
## Multiple R-squared:  0.4001, Adjusted R-squared:  0.3977 
## F-statistic: 161.7 on 3 and 727 DF,  p-value: < 2.2e-16
# this is a statistically significant model 
# temperature variations explain best the fluctuations in the  number of rented bikes 
# the number of rented bykes drops on holidays 
# next step, convert data to time series 

ts_data <- ts(bike_sharing_daily$cnt, start = c(2011,1), frequency = 7)

head(ts_data)
## Time Series:
## Start = c(2011, 1) 
## End = c(2011, 6) 
## Frequency = 7 
## [1]  985  801 1349 1562 1600 1606
#plot time series 

Task Two: Create interactive time series plots

## Read about the timetk package
# ?timetk

bike_sharing_daily %>% 
  plot_time_series(dteday, cnt, 
                   .interactive = TRUE,
                   .plotly_slider = TRUE)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the timetk package.
##   Please report the issue at
##   <https://github.com/business-science/timetk/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Ignoring unknown labels:
## • colour : "Legend"
bike_sharing_daily%>%
plot_seasonal_diagnostics(dteday, cnt, 
                          .interactive = FALSE)

# upward trend with yearly seasonality
# we can see from the plot that the number of rented bikes goes up in warm seasons and back down when it gets cold outside 
# we can also see that the number of rented bikes was higher in 2012 compared to 2011
# we can see a few outliers 

Task Three: Smooth time series data

#we'll do that with Holt Winters because TS has trend (upward) and seasonality - it seems the most appropriate choice 
#just to be sure we have time series - check

class(ts_data)
## [1] "ts"
# check for missing values 

any(is.na(bike_sharing_daily$cnt))
## [1] FALSE
#no missing values, so no tsclean() needed 
#because we have outliers (we've seen it a few good times) we'll use tsoutliers(), smooth with Holt Winters and SMA(?!?) and compare the results - choose the best one 
#question - what kind of seasonality
#TS doesn't have missing values 

tsoutliers(ts_data)
## $index
## [1] 478 500 646 668 669
## 
## $replacements
## [1] 4642.478 5439.664 6121.828 4966.355 5480.664
outliers <- tsoutliers(ts_data)

ts_data_clean <- ts_data

ts_data_clean[outliers$index] <- outliers$replacements

par(mfrow = c(2, 1))  # 2 plots in the same window, for comparison

plot(ts_data, main = "Original Time Series", col = "blue")

points(time(ts_data)[outliers$index], ts_data[outliers$index], col = "red", pch = 19)

plot(ts_data_clean, main = " Cleaned Time Series - ouliers replaced", col = "darkgreen")

#1Holt Winters smoothing

holt_winters_model <- HoltWinters(ts_data_clean)

ts_Holt_Winters_smooth <- holt_winters_model$fitted[,1]

plot(ts_data_clean, main = " Holt-Winters Smoothing", col = "gray", lwd = 1.5)

lines(ts_Holt_Winters_smooth, col = "blue", lwd = 2)

legend("topleft",
       legend = c("TS without outliers", " Holt-Winters Smoothed"),
       col = c("gray", "blue"), lwd = 2)

#2. ETS Model (Exponential Smoothing)

ets_model <- ets(ts_data_clean)

plot(fitted(ets_model), type = "l", col = "darkgreen", lwd = 2,
     main = "Model ETS (Exponential Smoothing) pe seria curățată",
     ylab = "Valori netezite", xlab = "Timp")

lines(ts_data_clean, col = "gray")

legend("topleft", legend = c("Seria originală", "ETS netezit"),
       col = c("gray", "darkgreen"), lwd = 2)

#check which smoothing has better results 

accuracy(holt_winters_model$fitted[,1], ts_data_clean)
##                ME    RMSE      MAE       MPE     MAPE      ACF1 Theil's U
## Test set 16.08671 887.308 663.5352 -6.363985 21.50976 0.1620917 0.6851779
accuracy(fitted(ets_model), ts_data_clean)
##                ME     RMSE      MAE       MPE     MAPE      ACF1 Theil's U
## Test set 3.784018 870.3516 645.5342 -7.092351 21.04007 0.1544508 0.6578006
mean((ts_data_clean - holt_winters_model$fitted[,1])^2)
## [1] 787315.5
mean((ts_data_clean - fitted(ets_model))^2)
## [1] 757512
#ets is a little bit better so we keep it

Task Four: Decompse and access the stationarity of time series data

#select / extract smoothed series
ts_ets_smooth <- fitted(ets_model)

#check to see if it is additive or multiplicative

plot(ts_ets_smooth)

plot(log(ts_ets_smooth))

#decompose

model <- decompose(ts_ets_smooth, type = "additive")

plot(model)

#check for stationarity, ADF on smoothed series

adf.test(ts_ets_smooth, alternative = "stationary")
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_ets_smooth
## Dickey-Fuller = -0.45654, Lag order = 9, p-value = 0.9838
## alternative hypothesis: stationary
#of course, it is not stationary - we saw it in the plot

#differencing the (ETS) smoothed TS

ts_diff <- diff(ts_ets_smooth, differences = 1)

#ADF test for stationarity 

adf.test(ts_diff, alternative = "stationary")
## Warning in adf.test(ts_diff, alternative = "stationary"): p-value smaller than
## printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_diff
## Dickey-Fuller = -10.794, Lag order = 8, p-value = 0.01
## alternative hypothesis: stationary
#TS is stationary now, appropriate for ARIMA modeling
#visualization 

par(mfrow = c(1, 1)) #one plot

plot(ts_diff, main = "ETS differenced", col = "pink")

Task Five: Fit and forecast time series data using ARIMA models

fit_arima <- auto.arima(ts_diff)

#Model Summary

summary(fit_arima)
## Series: ts_diff 
## ARIMA(2,0,4) with zero mean 
## 
## Coefficients:
##          ar1      ar2      ma1     ma2     ma3      ma4
##       1.2342  -0.9787  -1.1144  0.7474  0.1656  -0.0805
## s.e.  0.0163   0.0169   0.0406  0.0564  0.0554   0.0391
## 
## sigma^2 = 69070:  log likelihood = -5100.46
## AIC=10214.92   AICc=10215.08   BIC=10247.07
## 
## Training set error measures:
##                     ME     RMSE      MAE      MPE     MAPE     MASE        ACF1
## Training set 0.8126997 261.7294 192.7137 113.1563 239.6489 0.730611 0.002112228
#Residuals Diagnostics

checkresiduals(fit_arima)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,0,4) with zero mean
## Q* = 30.529, df = 8, p-value = 0.0001703
## 
## Model df: 6.   Total lags used: 14
#Check to see if we can find a better model 

fit_arima_seasonal <- auto.arima(ts_diff, seasonal = TRUE)

#This model's summary

summary(fit_arima_seasonal)
## Series: ts_diff 
## ARIMA(2,0,4) with zero mean 
## 
## Coefficients:
##          ar1      ar2      ma1     ma2     ma3      ma4
##       1.2342  -0.9787  -1.1144  0.7474  0.1656  -0.0805
## s.e.  0.0163   0.0169   0.0406  0.0564  0.0554   0.0391
## 
## sigma^2 = 69070:  log likelihood = -5100.46
## AIC=10214.92   AICc=10215.08   BIC=10247.07
## 
## Training set error measures:
##                     ME     RMSE      MAE      MPE     MAPE     MASE        ACF1
## Training set 0.8126997 261.7294 192.7137 113.1563 239.6489 0.730611 0.002112228
# ... and residuals 

checkresiduals(fit_arima_seasonal)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,0,4) with zero mean
## Q* = 30.529, df = 8, p-value = 0.0001703
## 
## Model df: 6.   Total lags used: 14
# ... keep looking 

fit_arima_seasonal_adjust <- auto.arima(ts_diff, seasonal = TRUE, stepwise = FALSE, approximation = FALSE)

#Summary for the third model

summary(fit_arima_seasonal_adjust)
## Series: ts_diff 
## ARIMA(2,0,1)(2,0,0)[7] with zero mean 
## 
## Coefficients:
##          ar1      ar2      ma1    sar1    sar2
##       1.0324  -0.2220  -0.9062  0.1419  0.1732
## s.e.  0.0471   0.0372   0.0343  0.0383  0.0383
## 
## sigma^2 = 68954:  log likelihood = -5100.21
## AIC=10212.41   AICc=10212.53   BIC=10239.97
## 
## Training set error measures:
##                   ME    RMSE      MAE      MPE     MAPE      MASE         ACF1
## Training set 1.29684 261.691 192.2662 82.75053 215.7618 0.7289143 -0.006216619
#Residuals for the third model

checkresiduals(fit_arima_seasonal_adjust)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,0,1)(2,0,0)[7] with zero mean
## Q* = 13.961, df = 9, p-value = 0.1237
## 
## Model df: 5.   Total lags used: 14
# this model, the third is the best for forecast 
# visualizing forecast 

forecast_arima <- forecast(fit_arima_seasonal_adjust, h = 12)

plot(forecast_arima, main = " ARIMA Forecast")

Task Six: Findings and Conclusions

#First look at the data - plot - it has a trend, upward, seasonality - because of different temperatures in different seasons and a few outliers

#Time series has a few outliers, we replaced that

#The smoothing with best numbers for this TS is ets

#Time series is not stationary so it needed differencing.

#Best ARIMA I found is ARIMA(2,0,1)(2,0,0)[7] with 0 mean.