#Include all libraries here

library(dplyr)
library(forecast)
library(splitstackshape)
library(lubridate)
library(lmtest)
library(kableExtra)
library(moderndive)
library(TTR)
#code that creates the time variables
#split Date variable on the commas
#this will create two new variables

#split Date_1 into two columns: Date_1_1, Date_1_2 (separates month and day from time)
data=cSplit(data, "Date", " ")

#Convert character variable to an R date
data$newdate=lubridate::mdy(data$Date_1)

#create time variable
data$date =  as.Date(data$newdate, format="%m/%d/%Y")
data$month = as.numeric(format(data$date, "%m"))
data$year = as.numeric(format(data$date, "%Y"))

Introduction

The data set is the mileage of cycling collected for 2017-2024 by Professor Pyott. The data includes the number of rides per year in addition to the mileage cycled per month and per year.

Exploratory Data Anaysis

#Month totals for bar chart
month_totals = data %>%
  select(Distance, month) %>%
  group_by(month) %>%
  summarise(total=sum(Distance),
            sd=sd(Distance),
            n=n())

dt= month_totals %>%
  kbl(caption="Monthly Mileage Summary") %>%
  kable_classic(full_width=F, html_font = "Cambria")

dt
Monthly Mileage Summary
month total sd n
1 894 7.1 42
2 952 7.9 47
3 2223 10.4 81
4 2655 10.6 92
5 3504 16.5 102
6 3402 13.1 103
7 3353 12.8 109
8 2932 11.9 93
9 2194 13.5 66
10 1928 16.2 61
11 1176 9.2 50
12 832 9.6 37
#create annual totals
annual_totals = data %>%
  select(Distance, year) %>%
  group_by(year) %>%
  summarise(total=sum(Distance),
            sd=sd(Distance),
            n=n())


dt= annual_totals %>%
  kbl(caption="Annual Mileage Summary") %>%
  kable_classic(full_width=F, html_font = "Cambria")

dt
Annual Mileage Summary
year total sd n
2017 2303 15 70
2018 1813 10 61
2019 3940 14 107
2020 4564 12 153
2021 5059 13 177
2022 3804 12 144
2023 3850 13 144
2024 712 10 27
#Bar plot, box plot, violin plot of miles per month
Month=c("Jan", "Feb", "Mar", "Apr", "May","Jun", 
        "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
barplot(total~month, data=month_totals, names.arg=Month, col="pink")
title(main = "Monthly Average Ride Length", font.main = 4)

#graph totals data set
barplot(total~year, data=annual_totals, col="orange")
title(main = "Annual Total Miles", font.main = 4)

Times Series

#create and plot time series

#create time series
timeseries_totals = data %>%
  select(month, year, Distance) %>%
  group_by(year, month) %>% 
  summarise(sum=sum(Distance), average=mean(Distance)) 

#Which variable to do want to model? Sum, average?
#Sum
timeseries1 = ts(timeseries_totals$sum, frequency=12, start=c(2017,4))

plot(timeseries1, xlab="Time", ylab="Sum", main="Time vs Average", col="green")
abline(reg=lm(timeseries1~time(timeseries1)),col="red")

tscomponents1=decompose(timeseries1)
plot(tscomponents1)

#Average
timeseries2 = ts(timeseries_totals$average, frequency=12, start=c(2017,4))

plot(timeseries2, xlab="Time", ylab="Average", main="Time vs Average", col="purple")
abline(reg=lm(timeseries2~time(timeseries2)),col="blue")

tscomponents2=decompose(timeseries2)
plot(tscomponents2)

Timeseries1 shows a secular trend that the sum is increasing. We see a seasonal trend of mileage increasing in the warmer months then decreasing during the colder months. The decompostion plot for timeseries1 shows that there is a seasonal trend and the sum does go up over time. There is a nonrandom increase for 2020 due to covid allowing more time for cycling Timeseries2 shows a secular trend that average is decreasing. We still see the same seasonal trend of mileage increasing for warmer months and decreasing for cold months. The decomposition shows a seasonal trend and that average decreases over time. We can see the covid disruption allowing for more cycling.

Forecasting

This section will forecast the next five months for the cyclists, and include the smoothed time series plot with the forecasts on the plot.

#smooth TS
#forecast 5 months
#Holt Winters 

hw2 =HoltWinters(timeseries1, optim.start=c(alpha=.1, 
                beta=.1, gamma=.1), 
                seasonal = c("additive"))

plot(hw2)

fc=forecast(hw2, h=5, level=c(95))

dt2= fc %>%
  kbl(caption="Forecast for Next 5 Months") %>%
  kable_classic(full_width=F, html_font = "Cambria")

dt2
Forecast for Next 5 Months
Point Forecast Lo 95 Hi 95
Mar 2024 302 53 550
Apr 2024 285 -32 602
May 2024 326 -47 698
Jun 2024 336 -85 757
Jul 2024 220 -244 685

Regression

#perform OLS on time series 

##linear regression
timeseries_totals=as.data.frame(timeseries_totals)
#Add an index variable to represent time point
timeseries_totals$time= 1:nrow(timeseries_totals)

plot(timeseries_totals$time, timeseries_totals$sum, col="blue", xlab="Time", ylab="Sum", main="Sum vs Time")
abline(lm(timeseries_totals$sum~timeseries_totals$time), col="red")

model =lm(sum~time, data=timeseries_totals)

model_summary=summary(model)
model_summary

Call:
lm(formula = sum ~ time, data = timeseries_totals)

Residuals:
   Min     1Q Median     3Q    Max 
-278.3 -149.6  -21.4  137.4  431.6 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  278.770     38.859    7.17  3.1e-10 ***
time           0.834      0.804    1.04      0.3    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 175 on 81 degrees of freedom
Multiple R-squared:  0.0131,    Adjusted R-squared:  0.000928 
F-statistic: 1.08 on 1 and 81 DF,  p-value: 0.303
r2=model_summary$r.squared

#durbin-watson test
dwt=dwtest(model)

The linear regression model is mileage=278.77 + 0.83 * time. The coefficient of determination is 0.01. The F test is F(1, 81)=1.08`.

The DW test stat is 0.61 with a p-value of 2.13^{-14}.

Model Utility and DW test analysis: Annual cycling mileage is statistically significantly predicted by the model F(1, 81)=1.08`, p = 2.13^{-14}, with adjust R squared 0.01. The value for time, b = 0.83, significantly predicts mileage. Since the DW value is 0.61, linear regression is not appropriate since the residuals are positively correlated. A DW value of less than 2 means positive correlation and a value greater than 2 or close to 4 means a negative correlation for the residuals. A value centered at 2 means that the residuals are not correlated and linear regression will model the data well.