Time Series Forecasting

Author

Pranish

# Load necessary libraries
library(tidyverse)

Warning: package 'tidyverse' was built under R version 4.5.2

Warning: package 'ggplot2' was built under R version 4.5.2

Warning: package 'tibble' was built under R version 4.5.2

Warning: package 'tidyr' was built under R version 4.5.2

Warning: package 'readr' was built under R version 4.5.2

Warning: package 'purrr' was built under R version 4.5.2

Warning: package 'dplyr' was built under R version 4.5.2

Warning: package 'stringr' was built under R version 4.5.2

Warning: package 'forcats' was built under R version 4.5.2

Warning: package 'lubridate' was built under R version 4.5.2

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.6
✔ forcats   1.0.1     ✔ stringr   1.6.0
✔ ggplot2   4.0.1     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.2
✔ purrr     1.2.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(lubridate)
library(forecast)

Warning: package 'forecast' was built under R version 4.5.2

Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo

library(timetk)

Warning: package 'timetk' was built under R version 4.5.2

library(ggplot2)
library(dplyr)
library(tseries)

Warning: package 'tseries' was built under R version 4.5.2

# Load your dataset (replace 'your_data.csv' with the actual file name)
daily_data <- read.csv("C:/Users/Administrator/Downloads/bike data daily.csv")

hourly_data <- read.csv("C:/Users/Administrator/Downloads/bike data hour.csv")

str(daily_data)

'data.frame':   731 obs. of  16 variables:
 $ instant   : int  1 2 3 4 5 6 7 8 9 10 ...
 $ dteday    : chr  "2011-01-01" "2011-01-02" "2011-01-03" "2011-01-04" ...
 $ season    : int  1 1 1 1 1 1 1 1 1 1 ...
 $ yr        : int  0 0 0 0 0 0 0 0 0 0 ...
 $ mnth      : int  1 1 1 1 1 1 1 1 1 1 ...
 $ holiday   : int  0 0 0 0 0 0 0 0 0 0 ...
 $ weekday   : int  6 0 1 2 3 4 5 6 0 1 ...
 $ workingday: int  0 0 1 1 1 1 1 0 0 1 ...
 $ weathersit: int  2 2 1 1 1 1 2 2 1 1 ...
 $ temp      : num  0.344 0.363 0.196 0.2 0.227 ...
 $ atemp     : num  0.364 0.354 0.189 0.212 0.229 ...
 $ hum       : num  0.806 0.696 0.437 0.59 0.437 ...
 $ windspeed : num  0.16 0.249 0.248 0.16 0.187 ...
 $ casual    : int  331 131 120 108 82 88 148 68 54 41 ...
 $ registered: int  654 670 1229 1454 1518 1518 1362 891 768 1280 ...
 $ cnt       : int  985 801 1349 1562 1600 1606 1510 959 822 1321 ...

head(daily_data)

  instant     dteday season yr mnth holiday weekday workingday weathersit
1       1 2011-01-01      1  0    1       0       6          0          2
2       2 2011-01-02      1  0    1       0       0          0          2
3       3 2011-01-03      1  0    1       0       1          1          1
4       4 2011-01-04      1  0    1       0       2          1          1
5       5 2011-01-05      1  0    1       0       3          1          1
6       6 2011-01-06      1  0    1       0       4          1          1
      temp    atemp      hum windspeed casual registered  cnt
1 0.344167 0.363625 0.805833 0.1604460    331        654  985
2 0.363478 0.353739 0.696087 0.2485390    131        670  801
3 0.196364 0.189405 0.437273 0.2483090    120       1229 1349
4 0.200000 0.212122 0.590435 0.1602960    108       1454 1562
5 0.226957 0.229270 0.436957 0.1869000     82       1518 1600
6 0.204348 0.233209 0.518261 0.0895652     88       1518 1606

view(daily_data)
view(hourly_data)

daily_data[,"dteday"] <- as.Date(daily_data[,"dteday"])
daily_data[,"ncnt"] <- daily_data[,"cnt"] / max(daily_data[,"cnt"])
daily_data[,"nr"] <- daily_data[,"registered"] / max(daily_data[,"registered"])
daily_data[,"rr"] <- daily_data[,"cnt"] / max(daily_data[,"registered"])
summary(daily_data)

    instant          dteday               season            yr        
 Min.   :  1.0   Min.   :2011-01-01   Min.   :1.000   Min.   :0.0000  
 1st Qu.:183.5   1st Qu.:2011-07-02   1st Qu.:2.000   1st Qu.:0.0000  
 Median :366.0   Median :2012-01-01   Median :3.000   Median :1.0000  
 Mean   :366.0   Mean   :2012-01-01   Mean   :2.497   Mean   :0.5007  
 3rd Qu.:548.5   3rd Qu.:2012-07-01   3rd Qu.:3.000   3rd Qu.:1.0000  
 Max.   :731.0   Max.   :2012-12-31   Max.   :4.000   Max.   :1.0000  
      mnth          holiday           weekday        workingday   
 Min.   : 1.00   Min.   :0.00000   Min.   :0.000   Min.   :0.000  
 1st Qu.: 4.00   1st Qu.:0.00000   1st Qu.:1.000   1st Qu.:0.000  
 Median : 7.00   Median :0.00000   Median :3.000   Median :1.000  
 Mean   : 6.52   Mean   :0.02873   Mean   :2.997   Mean   :0.684  
 3rd Qu.:10.00   3rd Qu.:0.00000   3rd Qu.:5.000   3rd Qu.:1.000  
 Max.   :12.00   Max.   :1.00000   Max.   :6.000   Max.   :1.000  
   weathersit         temp             atemp              hum        
 Min.   :1.000   Min.   :0.05913   Min.   :0.07907   Min.   :0.0000  
 1st Qu.:1.000   1st Qu.:0.33708   1st Qu.:0.33784   1st Qu.:0.5200  
 Median :1.000   Median :0.49833   Median :0.48673   Median :0.6267  
 Mean   :1.395   Mean   :0.49538   Mean   :0.47435   Mean   :0.6279  
 3rd Qu.:2.000   3rd Qu.:0.65542   3rd Qu.:0.60860   3rd Qu.:0.7302  
 Max.   :3.000   Max.   :0.86167   Max.   :0.84090   Max.   :0.9725  
   windspeed           casual         registered        cnt      
 Min.   :0.02239   Min.   :   2.0   Min.   :  20   Min.   :  22  
 1st Qu.:0.13495   1st Qu.: 315.5   1st Qu.:2497   1st Qu.:3152  
 Median :0.18097   Median : 713.0   Median :3662   Median :4548  
 Mean   :0.19049   Mean   : 848.2   Mean   :3656   Mean   :4504  
 3rd Qu.:0.23321   3rd Qu.:1096.0   3rd Qu.:4776   3rd Qu.:5956  
 Max.   :0.50746   Max.   :3410.0   Max.   :6946   Max.   :8714  
      ncnt                nr                 rr          
 Min.   :0.002525   Min.   :0.002879   Min.   :0.003167  
 1st Qu.:0.361717   1st Qu.:0.359487   1st Qu.:0.453786  
 Median :0.521919   Median :0.527210   Median :0.654765  
 Mean   :0.516909   Mean   :0.526371   Mean   :0.648481  
 3rd Qu.:0.683498   3rd Qu.:0.687662   3rd Qu.:0.857472  
 Max.   :1.000000   Max.   :1.000000   Max.   :1.254535

#daily_data
daily_data %>% group_by(yr) %>% plot_time_series(dteday, temp, .color_var = season, .x_lab = "Date", .y_lab = "Normalized Temperature", 
                              .title = "Normalized Temperature vs Date for Daily Data", .interactive = TRUE)

Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
ℹ The deprecated feature was likely used in the timetk package.
  Please report the issue at
  <https://github.com/business-science/timetk/issues>.

daily_data %>% group_by(yr) %>% plot_time_series(dteday, hum, .color_var = season, .x_lab = "Date", .y_lab = "Normalized Humidity", 
                              .title = "Normalized Humidity vs Date for Daily Data", .interactive = TRUE)

daily_data %>% group_by(yr) %>% plot_time_series(dteday, windspeed, .color_var = season, .x_lab = "Date", .y_lab = "Normalized Windspeed", 
                              .title = "Windspeed vs Date for Daily Data", .interactive = TRUE)

daily_data %>% group_by(yr) %>% plot_time_series(dteday, ncnt, .color_var = season, .x_lab = "Date", .y_lab = "Normalized Bike Rentals", 
                              .title = "Normalized Bike Rentals vs Date for Daily Data", .interactive = TRUE)

daily_data %>% group_by(yr) %>% plot_time_series(dteday, nr, .color_var = season, .x_lab = "Date", .y_lab = "Normalized Registered Rentals", 
                              .title = "Normalized Registered Rentals vs Date for Daily Data", .interactive = TRUE)

daily_data %>% group_by(yr) %>% plot_time_series(dteday, rr, .color_var = season, .x_lab = "Date", .y_lab = "Ratio", 
                              .title = "Ratio of Rentals to Registration vs Date for Daily Data", .interactive = TRUE)

#since the humdity and windspeed data has so much noise, I will focus on temperature and bike rentals
#data cleaning
daily_data[,"temp"] <- tsclean(daily_data[,"temp"])
daily_data[,"ncnt"] <- tsclean(daily_data[,"ncnt"])
daily_data[,"nr"] <- tsclean(daily_data[,"nr"])
daily_data[,"rr"] <- tsclean(daily_data[,"rr"])
head(daily_data)

  instant     dteday season yr mnth holiday weekday workingday weathersit
1       1 2011-01-01      1  0    1       0       6          0          2
2       2 2011-01-02      1  0    1       0       0          0          2
3       3 2011-01-03      1  0    1       0       1          1          1
4       4 2011-01-04      1  0    1       0       2          1          1
5       5 2011-01-05      1  0    1       0       3          1          1
6       6 2011-01-06      1  0    1       0       4          1          1
      temp    atemp      hum windspeed casual registered  cnt       ncnt
1 0.344167 0.363625 0.805833 0.1604460    331        654  985 0.11303649
2 0.363478 0.353739 0.696087 0.2485390    131        670  801 0.09192105
3 0.196364 0.189405 0.437273 0.2483090    120       1229 1349 0.15480835
4 0.200000 0.212122 0.590435 0.1602960    108       1454 1562 0.17925178
5 0.226957 0.229270 0.436957 0.1869000     82       1518 1600 0.18361258
6 0.204348 0.233209 0.518261 0.0895652     88       1518 1606 0.18430112
          nr        rr
1 0.09415491 0.1418082
2 0.09645839 0.1153182
3 0.17693637 0.1942125
4 0.20932911 0.2248776
5 0.21854305 0.2303484
6 0.21854305 0.2312122

#daily_data
daily_data %>% group_by(yr) %>% plot_time_series(dteday, temp, .color_var = season, .x_lab = "Date", .y_lab = "Normalized Temperature", .title = "Normalized Temperature vs Date for Daily Data", .interactive = TRUE)

daily_data %>% group_by(yr) %>% plot_time_series(dteday, ncnt, .color_var = season, .x_lab = "Date", .y_lab = "Normalized Bike Rentals", .title = "Normalized Bike Rentals vs Date for Daily Data", .interactive = TRUE)

daily_data %>% group_by(yr) %>% plot_time_series(dteday, nr, .color_var = season, .x_lab = "Date", .y_lab = "Normalized Registered Rentals", .title = "Normalized Registered Rentals vs Date for Daily Data", .interactive = TRUE)

daily_data %>% group_by(yr) %>% plot_time_series(dteday, rr, .color_var = season, .x_lab = "Date", .y_lab = "Normalized Registered Rentals", .title = "Ratio of Rentals to Registration vs Date for Daily Data", .interactive = TRUE)

#daily data
daily_data[,"temp"] %>% adf.test()


    Augmented Dickey-Fuller Test

data:  .
Dickey-Fuller = -1.6785, Lag order = 9, p-value = 0.7144
alternative hypothesis: stationary

daily_data[,"ncnt"] %>% adf.test()


    Augmented Dickey-Fuller Test

data:  .
Dickey-Fuller = -1.3084, Lag order = 9, p-value = 0.871
alternative hypothesis: stationary

daily_data[,"nr"] %>% adf.test()


    Augmented Dickey-Fuller Test

data:  .
Dickey-Fuller = -2.418, Lag order = 9, p-value = 0.4014
alternative hypothesis: stationary

daily_data[,"rr"] %>% adf.test()


    Augmented Dickey-Fuller Test

data:  .
Dickey-Fuller = -1.3084, Lag order = 9, p-value = 0.871
alternative hypothesis: stationary

#decomposes the data
freq <- 365

#normalized daily rentals
norm_rentals <- ts(daily_data[, "nr"], frequency = freq)
decomped1 <- stl(norm_rentals, "periodic")
plot(decomped1$time.series[,2], ylab = "Stationary of the Normalized Rental Reservations", 
     xlab = "Daily of the Year")

checkresiduals(decomped1$time.series[, 3])


    Ljung-Box test

data:  Residuals
Q* = 1997.3, df = 146, p-value < 2.2e-16

Model df: 0.   Total lags used: 146

#normalized daily counts
norm_cnt <- ts(daily_data[, "ncnt"], frequency = freq)
decomped2 <- stl(norm_cnt, "periodic")
plot(decomped2$time.series[,2], ylab = "Stationary of the Normalized Rental Counts", 
     xlab = "Daily of the Year")

checkresiduals(decomped2$time.series[, 3])


    Ljung-Box test

data:  Residuals
Q* = 1437.8, df = 146, p-value < 2.2e-16

Model df: 0.   Total lags used: 146

#normalized daily rental rates
norm_rr <- ts(daily_data[, "rr"], frequency = freq)
decomped3 <- stl(norm_rr, "periodic")
plot(decomped3$time.series[,2], ylab = "Stationary of the Normalized Rental Counts to Reservations", 
     xlab = "Daily of the Year")

checkresiduals(decomped3$time.series[, 3])


    Ljung-Box test

data:  Residuals
Q* = 1437.8, df = 146, p-value < 2.2e-16

Model df: 0.   Total lags used: 146

#returns the stats
print("-------Noramlized Rental Reservations")

[1] "-------Noramlized Rental Reservations"

shapiro.test(decomped1$time.series[, 3])


    Shapiro-Wilk normality test

data:  decomped1$time.series[, 3]
W = 0.99554, p-value = 0.03334

print("-------Normalized Count of Rentals")

[1] "-------Normalized Count of Rentals"

shapiro.test(decomped1$time.series[, 3])


    Shapiro-Wilk normality test

data:  decomped1$time.series[, 3]
W = 0.99554, p-value = 0.03334

print("-------Normalized Ratio of Rentals to Reservations")

[1] "-------Normalized Ratio of Rentals to Reservations"

shapiro.test(decomped3$time.series[, 3])


    Shapiro-Wilk normality test

data:  decomped3$time.series[, 3]
W = 0.99702, p-value = 0.1993

#bike count predictions
fit1 <- auto.arima(norm_cnt, seasonal = TRUE)
summary(fit1)

Series: norm_cnt 
ARIMA(1,0,2)(0,1,0)[365] with drift 

Coefficients:
         ar1      ma1      ma2  drift
      0.9638  -0.6601  -0.1504  7e-04
s.e.  0.0246   0.0563   0.0497  1e-04

sigma^2 = 0.01709:  log likelihood = 227
AIC=-443.99   AICc=-443.83   BIC=-424.48

Training set error measures:
                       ME       RMSE        MAE      MPE     MAPE      MASE
Training set 0.0006729107 0.09200729 0.05020995 -2.84962 10.02705 0.1897625
                   ACF1
Training set 0.01095462

hist(fit1$residuals, xlab = "Residual", ylab = "Distribution", main = "Histogram of Model Errors - Bike Count")

shapiro.test(fit1$residuals)


    Shapiro-Wilk normality test

data:  fit1$residuals
W = 0.83122, p-value < 2.2e-16

prediction1 <- forecast(fit1, 25)
plot(prediction1, xlab = "Date", ylab = "Normalized Count of Rentals", main = "Prediction of Bike Rental Counts")

#predictions of number of reservations
fit2 <- auto.arima(norm_rentals, seasonal = TRUE, )
hist(fit2$residuals, xlab = "Residual", ylab = "Distribution", main = "Histogram of Model Errors - Rental Count")

shapiro.test(fit2$residuals)


    Shapiro-Wilk normality test

data:  fit2$residuals
W = 0.85305, p-value < 2.2e-16

prediction2 <- forecast(fit2, 25)
plot(prediction2, xlab = "Date", ylab = "Normalized Rentals", main = "Prediction of Bike Rentals")

#bike count predictions
fit2 <- auto.arima(norm_cnt, seasonal = TRUE, )
hist(fit2$residuals, xlab = "Residual", ylab = "Distribution", main = "Histogram of Model Errors - Count to Rental Ratio")

shapiro.test(fit2$residuals)


    Shapiro-Wilk normality test

data:  fit2$residuals
W = 0.83122, p-value < 2.2e-16

prediction3 <- forecast(fit2, 25)
plot(prediction3, xlab = "Date", ylab = "Normalized Rental Ratio", main = "Prediction of Bike Rentals to Reservations")

#Findings and Conclusions After processing the raw data and using the ARIMA package to model ride share data, I was able to make predictions for the 25 days beyond the current data set. Qualitatively the data shows that was the weather gets warmer the number of bike rentals increase, and over the course of two years the number of rentals increases over the number of rentals from the previous year. As the data terminates at the end of one cycle, I expect the number of rentals to increase to a level higher than it was a year before, which is what the models are predicting. Therefore the results were what I expected - the data appears to oscillate up and down over a 1 year period with the overall data moving towards higher rental numbers.