Warning: package 'tidyverse' was built under R version 4.5.2
Warning: package 'ggplot2' was built under R version 4.5.2
Warning: package 'tibble' was built under R version 4.5.2
Warning: package 'tidyr' was built under R version 4.5.2
Warning: package 'readr' was built under R version 4.5.2
Warning: package 'purrr' was built under R version 4.5.2
Warning: package 'dplyr' was built under R version 4.5.2
Warning: package 'stringr' was built under R version 4.5.2
Warning: package 'forcats' was built under R version 4.5.2
Warning: package 'lubridate' was built under R version 4.5.2
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.6
✔ forcats 1.0.1 ✔ stringr 1.6.0
✔ ggplot2 4.0.1 ✔ tibble 3.3.0
✔ lubridate 1.9.4 ✔ tidyr 1.3.2
✔ purrr 1.2.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)library(forecast)
Warning: package 'forecast' was built under R version 4.5.2
Registered S3 method overwritten by 'quantmod':
method from
as.zoo.data.frame zoo
library(timetk)
Warning: package 'timetk' was built under R version 4.5.2
library(ggplot2)library(dplyr)library(tseries)
Warning: package 'tseries' was built under R version 4.5.2
# Load your dataset (replace 'your_data.csv' with the actual file name)daily_data <-read.csv("C:/Users/Administrator/Downloads/bike data daily.csv")hourly_data <-read.csv("C:/Users/Administrator/Downloads/bike data hour.csv")str(daily_data)
instant dteday season yr
Min. : 1.0 Min. :2011-01-01 Min. :1.000 Min. :0.0000
1st Qu.:183.5 1st Qu.:2011-07-02 1st Qu.:2.000 1st Qu.:0.0000
Median :366.0 Median :2012-01-01 Median :3.000 Median :1.0000
Mean :366.0 Mean :2012-01-01 Mean :2.497 Mean :0.5007
3rd Qu.:548.5 3rd Qu.:2012-07-01 3rd Qu.:3.000 3rd Qu.:1.0000
Max. :731.0 Max. :2012-12-31 Max. :4.000 Max. :1.0000
mnth holiday weekday workingday
Min. : 1.00 Min. :0.00000 Min. :0.000 Min. :0.000
1st Qu.: 4.00 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.000
Median : 7.00 Median :0.00000 Median :3.000 Median :1.000
Mean : 6.52 Mean :0.02873 Mean :2.997 Mean :0.684
3rd Qu.:10.00 3rd Qu.:0.00000 3rd Qu.:5.000 3rd Qu.:1.000
Max. :12.00 Max. :1.00000 Max. :6.000 Max. :1.000
weathersit temp atemp hum
Min. :1.000 Min. :0.05913 Min. :0.07907 Min. :0.0000
1st Qu.:1.000 1st Qu.:0.33708 1st Qu.:0.33784 1st Qu.:0.5200
Median :1.000 Median :0.49833 Median :0.48673 Median :0.6267
Mean :1.395 Mean :0.49538 Mean :0.47435 Mean :0.6279
3rd Qu.:2.000 3rd Qu.:0.65542 3rd Qu.:0.60860 3rd Qu.:0.7302
Max. :3.000 Max. :0.86167 Max. :0.84090 Max. :0.9725
windspeed casual registered cnt
Min. :0.02239 Min. : 2.0 Min. : 20 Min. : 22
1st Qu.:0.13495 1st Qu.: 315.5 1st Qu.:2497 1st Qu.:3152
Median :0.18097 Median : 713.0 Median :3662 Median :4548
Mean :0.19049 Mean : 848.2 Mean :3656 Mean :4504
3rd Qu.:0.23321 3rd Qu.:1096.0 3rd Qu.:4776 3rd Qu.:5956
Max. :0.50746 Max. :3410.0 Max. :6946 Max. :8714
ncnt nr rr
Min. :0.002525 Min. :0.002879 Min. :0.003167
1st Qu.:0.361717 1st Qu.:0.359487 1st Qu.:0.453786
Median :0.521919 Median :0.527210 Median :0.654765
Mean :0.516909 Mean :0.526371 Mean :0.648481
3rd Qu.:0.683498 3rd Qu.:0.687662 3rd Qu.:0.857472
Max. :1.000000 Max. :1.000000 Max. :1.254535
#daily_datadaily_data %>%group_by(yr) %>%plot_time_series(dteday, temp, .color_var = season, .x_lab ="Date", .y_lab ="Normalized Temperature", .title ="Normalized Temperature vs Date for Daily Data", .interactive =TRUE)
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
ℹ The deprecated feature was likely used in the timetk package.
Please report the issue at
<https://github.com/business-science/timetk/issues>.
daily_data %>%group_by(yr) %>%plot_time_series(dteday, hum, .color_var = season, .x_lab ="Date", .y_lab ="Normalized Humidity", .title ="Normalized Humidity vs Date for Daily Data", .interactive =TRUE)
daily_data %>%group_by(yr) %>%plot_time_series(dteday, windspeed, .color_var = season, .x_lab ="Date", .y_lab ="Normalized Windspeed", .title ="Windspeed vs Date for Daily Data", .interactive =TRUE)
daily_data %>%group_by(yr) %>%plot_time_series(dteday, ncnt, .color_var = season, .x_lab ="Date", .y_lab ="Normalized Bike Rentals", .title ="Normalized Bike Rentals vs Date for Daily Data", .interactive =TRUE)
daily_data %>%group_by(yr) %>%plot_time_series(dteday, nr, .color_var = season, .x_lab ="Date", .y_lab ="Normalized Registered Rentals", .title ="Normalized Registered Rentals vs Date for Daily Data", .interactive =TRUE)
daily_data %>%group_by(yr) %>%plot_time_series(dteday, rr, .color_var = season, .x_lab ="Date", .y_lab ="Ratio", .title ="Ratio of Rentals to Registration vs Date for Daily Data", .interactive =TRUE)
#since the humdity and windspeed data has so much noise, I will focus on temperature and bike rentals#data cleaningdaily_data[,"temp"] <-tsclean(daily_data[,"temp"])daily_data[,"ncnt"] <-tsclean(daily_data[,"ncnt"])daily_data[,"nr"] <-tsclean(daily_data[,"nr"])daily_data[,"rr"] <-tsclean(daily_data[,"rr"])head(daily_data)
#daily_datadaily_data %>%group_by(yr) %>%plot_time_series(dteday, temp, .color_var = season, .x_lab ="Date", .y_lab ="Normalized Temperature", .title ="Normalized Temperature vs Date for Daily Data", .interactive =TRUE)
daily_data %>%group_by(yr) %>%plot_time_series(dteday, ncnt, .color_var = season, .x_lab ="Date", .y_lab ="Normalized Bike Rentals", .title ="Normalized Bike Rentals vs Date for Daily Data", .interactive =TRUE)
daily_data %>%group_by(yr) %>%plot_time_series(dteday, nr, .color_var = season, .x_lab ="Date", .y_lab ="Normalized Registered Rentals", .title ="Normalized Registered Rentals vs Date for Daily Data", .interactive =TRUE)
daily_data %>%group_by(yr) %>%plot_time_series(dteday, rr, .color_var = season, .x_lab ="Date", .y_lab ="Normalized Registered Rentals", .title ="Ratio of Rentals to Registration vs Date for Daily Data", .interactive =TRUE)
#daily datadaily_data[,"temp"] %>%adf.test()
Augmented Dickey-Fuller Test
data: .
Dickey-Fuller = -1.6785, Lag order = 9, p-value = 0.7144
alternative hypothesis: stationary
daily_data[,"ncnt"] %>%adf.test()
Augmented Dickey-Fuller Test
data: .
Dickey-Fuller = -1.3084, Lag order = 9, p-value = 0.871
alternative hypothesis: stationary
daily_data[,"nr"] %>%adf.test()
Augmented Dickey-Fuller Test
data: .
Dickey-Fuller = -2.418, Lag order = 9, p-value = 0.4014
alternative hypothesis: stationary
daily_data[,"rr"] %>%adf.test()
Augmented Dickey-Fuller Test
data: .
Dickey-Fuller = -1.3084, Lag order = 9, p-value = 0.871
alternative hypothesis: stationary
#decomposes the datafreq <-365#normalized daily rentalsnorm_rentals <-ts(daily_data[, "nr"], frequency = freq)decomped1 <-stl(norm_rentals, "periodic")plot(decomped1$time.series[,2], ylab ="Stationary of the Normalized Rental Reservations", xlab ="Daily of the Year")
checkresiduals(decomped1$time.series[, 3])
Ljung-Box test
data: Residuals
Q* = 1997.3, df = 146, p-value < 2.2e-16
Model df: 0. Total lags used: 146
#normalized daily countsnorm_cnt <-ts(daily_data[, "ncnt"], frequency = freq)decomped2 <-stl(norm_cnt, "periodic")plot(decomped2$time.series[,2], ylab ="Stationary of the Normalized Rental Counts", xlab ="Daily of the Year")
checkresiduals(decomped2$time.series[, 3])
Ljung-Box test
data: Residuals
Q* = 1437.8, df = 146, p-value < 2.2e-16
Model df: 0. Total lags used: 146
#normalized daily rental ratesnorm_rr <-ts(daily_data[, "rr"], frequency = freq)decomped3 <-stl(norm_rr, "periodic")plot(decomped3$time.series[,2], ylab ="Stationary of the Normalized Rental Counts to Reservations", xlab ="Daily of the Year")
checkresiduals(decomped3$time.series[, 3])
Ljung-Box test
data: Residuals
Q* = 1437.8, df = 146, p-value < 2.2e-16
Model df: 0. Total lags used: 146
#returns the statsprint("-------Noramlized Rental Reservations")
[1] "-------Noramlized Rental Reservations"
shapiro.test(decomped1$time.series[, 3])
Shapiro-Wilk normality test
data: decomped1$time.series[, 3]
W = 0.99554, p-value = 0.03334
print("-------Normalized Count of Rentals")
[1] "-------Normalized Count of Rentals"
shapiro.test(decomped1$time.series[, 3])
Shapiro-Wilk normality test
data: decomped1$time.series[, 3]
W = 0.99554, p-value = 0.03334
print("-------Normalized Ratio of Rentals to Reservations")
[1] "-------Normalized Ratio of Rentals to Reservations"
shapiro.test(decomped3$time.series[, 3])
Shapiro-Wilk normality test
data: decomped3$time.series[, 3]
W = 0.99702, p-value = 0.1993
Series: norm_cnt
ARIMA(1,0,2)(0,1,0)[365] with drift
Coefficients:
ar1 ma1 ma2 drift
0.9638 -0.6601 -0.1504 7e-04
s.e. 0.0246 0.0563 0.0497 1e-04
sigma^2 = 0.01709: log likelihood = 227
AIC=-443.99 AICc=-443.83 BIC=-424.48
Training set error measures:
ME RMSE MAE MPE MAPE MASE
Training set 0.0006729107 0.09200729 0.05020995 -2.84962 10.02705 0.1897625
ACF1
Training set 0.01095462
hist(fit1$residuals, xlab ="Residual", ylab ="Distribution", main ="Histogram of Model Errors - Bike Count")
shapiro.test(fit1$residuals)
Shapiro-Wilk normality test
data: fit1$residuals
W = 0.83122, p-value < 2.2e-16
prediction1 <-forecast(fit1, 25)plot(prediction1, xlab ="Date", ylab ="Normalized Count of Rentals", main ="Prediction of Bike Rental Counts")
#predictions of number of reservationsfit2 <-auto.arima(norm_rentals, seasonal =TRUE, )hist(fit2$residuals, xlab ="Residual", ylab ="Distribution", main ="Histogram of Model Errors - Rental Count")
shapiro.test(fit2$residuals)
Shapiro-Wilk normality test
data: fit2$residuals
W = 0.85305, p-value < 2.2e-16
prediction2 <-forecast(fit2, 25)plot(prediction2, xlab ="Date", ylab ="Normalized Rentals", main ="Prediction of Bike Rentals")
#bike count predictionsfit2 <-auto.arima(norm_cnt, seasonal =TRUE, )hist(fit2$residuals, xlab ="Residual", ylab ="Distribution", main ="Histogram of Model Errors - Count to Rental Ratio")
shapiro.test(fit2$residuals)
Shapiro-Wilk normality test
data: fit2$residuals
W = 0.83122, p-value < 2.2e-16
prediction3 <-forecast(fit2, 25)plot(prediction3, xlab ="Date", ylab ="Normalized Rental Ratio", main ="Prediction of Bike Rentals to Reservations")
#Findings and Conclusions After processing the raw data and using the ARIMA package to model ride share data, I was able to make predictions for the 25 days beyond the current data set. Qualitatively the data shows that was the weather gets warmer the number of bike rentals increase, and over the course of two years the number of rentals increases over the number of rentals from the previous year. As the data terminates at the end of one cycle, I expect the number of rentals to increase to a level higher than it was a year before, which is what the models are predicting. Therefore the results were what I expected - the data appears to oscillate up and down over a 1 year period with the overall data moving towards higher rental numbers.