##Packages
suppressWarnings({
suppressMessages({
library(dplyr)
library(ggplot2)
library(lubridate)
library(gridExtra)
library(forecast)
library(modeltime)
library(fabletools)
library(feasts)
library(tsibble)
library(tibble)
library(fable)
library(data.table)
library(knitr)
library(prophet)
library(fable.prophet)
library(tidyr)
options(warn=-1)
})
})
—-Section 1—-
Prophet, a forecasting tool created by Facebook, specializes in analyzing time-series data with daily observations showcasing trends, seasonality, and holiday effects. It stands out for its user-friendly interface, offering intuitive parameter adjustments, automatic feature selection, and robust modeling techniques. Prophet efficiently captures intricate data patterns, making it a valuable asset for accurate time-series forecasting.
The basic model is fit as:
yt = gt + st +ht + et
where gts the trend, stis seasonality, and ht are holidays.
vehicle_df <- read.csv('total_vehicle_sales.csv')
vehicle_sales_tbl_ts <- vehicle_df %>% select(date, vehicle_sales) %>%
mutate(value = vehicle_sales) %>%
mutate(date = yearmonth(date)) %>%
as_tsibble(index = date)
set.seed(123)
# Determine the number of rows for training (80%) and testing (20%)
n_rows <- nrow(vehicle_sales_tbl_ts)
train_rows <- round(0.8 * n_rows)
# Split the data into training and testing sets
vehicle_train <- vehicle_sales_tbl_ts %>% slice(1:train_rows)
vehicle_test <- vehicle_sales_tbl_ts %>% slice((train_rows + 1):n_rows)
# Visualize the training and test sets
ggplot() +
geom_line(data = vehicle_train, aes(x = date, y = value, color = "Training"), linewidth = 0.4) +
geom_line(data = vehicle_test, aes(x = date, y = value, color = "Test"), linewidth = 0.4) +
scale_color_manual(values = c("Training" = "blue", "Test" = "red")) +
labs(title = "Training and Testing data", x = "Date", y = "vehicle sales") +
theme_minimal()
vehicle_train %>%
model(prophet = fable.prophet::prophet(vehicle_sales)) %>%
forecast(h=115) %>%
autoplot(vehicle_train %>% bind_rows(vehicle_test))+
ylab('vehicle sales')+
xlab('Month')+
theme_bw()
—-Section 2—-
The analysis of the time series data indicates a rising trend until the early 2000s, followed by a decline thereafter. Furthermore, there is an additive seasonal pattern observed, with no evidence of multiplicative seasonality.
model = vehicle_train %>%
model(prophet = fable.prophet::prophet(vehicle_sales))
model %>%
components() %>%
autoplot()
changepoints = model %>%
glance() %>%
pull(changepoints) %>%
bind_rows() %>%
.$changepoints
vehicle_train %>%
ggplot()+
geom_line(aes(date,vehicle_sales))+
geom_vline(xintercept=as.Date(changepoints),color='red',linetype='dashed')
model2 = vehicle_train %>%
model(
prophet_orig = fable.prophet::prophet(vehicle_sales ~ growth(n_changepoints = 25) + season(period = 'year') + season(period = 'month', order = 12)),
prophet_10 = fable.prophet::prophet(vehicle_sales ~ growth(n_changepoints = 10) + season(period = 'year') + season(period = 'month', order = 12))
)
changepoints_orig = model2 %>%
glance() %>%
filter(.model == 'prophet_orig') %>%
pull(changepoints) %>%
bind_rows() %>%
#filter(abs(adjustment)>0.05) %>%
.$changepoints
changepoints_10 = model2 %>%
glance() %>%
filter(.model == 'prophet_10') %>%
pull(changepoints) %>%
bind_rows() %>%
#filter(abs(adjustment)>0.05) %>%
.$changepoints
vehicle_train %>%
ggplot()+
geom_line(aes(date,vehicle_sales))+
# geom_vline(aes(xintercept=ymd('2000-01-01')))
geom_vline(xintercept=as.Date(changepoints_orig),color='red')+
geom_vline(xintercept=as.Date(changepoints_10),color='blue',linetype='dashed')
model1 = vehicle_train %>%
model(
prophet_orig = fable.prophet::prophet(vehicle_sales),
prophet_90_range = fable.prophet::prophet(vehicle_sales~growth(changepoint_range=0.9)+season(period='year')+ season(period = 'month', order = 12)),
)
changepoints_orig = model1 %>%
glance() %>%
filter(.model == 'prophet_orig') %>%
pull(changepoints) %>%
bind_rows() %>%
#filter(abs(adjustment)>0.01) %>%
.$changepoints
changepoints_90_range = model1 %>%
glance() %>%
filter(.model == 'prophet_90_range') %>%
unnest(changepoints) %>%
bind_rows() %>%
#filter(abs(adjustment)>0.01) %>%
.$changepoints
model1 %>%
forecast(h=68) %>%
autoplot(vehicle_train,level=NULL)+
geom_vline(xintercept=as.Date(changepoints_orig),color='blue',linetype='dashed')+
geom_vline(xintercept=as.Date(changepoints_90_range),color='red',linetype='dashed')
model3 = vehicle_train %>%
model(
prophet_orig = fable.prophet::prophet(vehicle_sales),
prophet_less_flexible = fable.prophet::prophet(vehicle_sales~growth(changepoint_prior_scale=0.01)+season(period='year')),
prophet_more_flexible = fable.prophet::prophet(vehicle_sales~growth(changepoint_prior_scale=0.5)+season(period='year'))
)
changepoints_orig = model3 %>%
glance() %>%
unnest(changepoints) %>%
bind_rows() %>%
filter(.model == 'prophet_orig') %>%
filter(abs(adjustment)>=0.05) %>%
.$changepoints
changepoints_more_flexible = model3 %>%
glance() %>%
unnest(changepoints) %>%
bind_rows() %>%
filter(.model == 'prophet_more_flexible') %>%
filter(abs(adjustment)>=0.50) %>%
.$changepoints
changepoints_less_flexible = model3 %>%
glance() %>%
unnest(changepoints) %>%
bind_rows() %>%
filter(.model == 'prophet_less_flexible') %>%
filter(abs(adjustment)>=0.01) %>%
.$changepoints
model3 %>%
forecast(h=68) %>%
autoplot(vehicle_train,level=NULL)+
geom_vline(xintercept=as.Date(changepoints_orig),color='blue',linetype='dashed')+
geom_vline(xintercept=as.Date(changepoints_more_flexible),color='green',linetype='dashed')+
geom_vline(xintercept=as.Date(changepoints_less_flexible),color='red',linetype='dashed')
model3 %>%
components() %>%
autoplot(trend)+
geom_vline(xintercept=as.Date(changepoints_orig),color='blue',linetype='dashed')+
geom_vline(xintercept=as.Date(changepoints_more_flexible),color='green',linetype='dashed')+
geom_vline(xintercept=as.Date(changepoints_less_flexible),color='red',linetype='dashed')
vehicle_train %>%
model(
prophet_orig = fable.prophet::prophet(vehicle_sales),
prophet_saturating = fable.prophet::prophet(vehicle_sales~growth(type='linear',capacity=3.7,floor=2.8)+season('year'))
) %>%
forecast(h=68) %>%
autoplot(vehicle_train)
The forecast predictions from changepoints_less_flexible outperform other changepoint options, suggesting that a less flexible approach may better capture the underlying data patterns. This implies that simpler models with fewer changepoints could yield more dependable forecasts.
Although adjustments were made to the floor and capacity parameters to a significant extent, the forecasts produced show minimal deviation from the base Prophet model. This suggests that while these parameters theoretically affect the model’s behavior, their practical impact on forecasting accuracy may be limited in this dataset. Further exploration and experimentation may be necessary to gain a deeper understanding of their effect.
—-Section 3—-
model %>%
components()
## # A dable: 460 x 10 [1M]
## # Key: .model [1]
## # : vehicle_sales = trend * (1 + multiplicative_terms) + additive_terms
## # + .resid
## .model date vehicle_sales additive_terms multiplicative_terms trend
## <chr> <mth> <dbl> <dbl> <dbl> <dbl>
## 1 prophet 1976 Jan 885. -36.3 0 929.
## 2 prophet 1976 Feb 995. 59.3 0 929.
## 3 prophet 1976 Mar 1244. 337. 0 929.
## 4 prophet 1976 Apr 1191. 235. 0 930.
## 5 prophet 1976 May 1203. 319. 0 930.
## 6 prophet 1976 Jun 1255. 300. 0 930.
## 7 prophet 1976 Jul 1162. 226. 0 931.
## 8 prophet 1976 Aug 1026. 181. 0 931.
## 9 prophet 1976 Sep 1058. 131. 0 932.
## 10 prophet 1976 Oct 1129. 140. 0 932.
## # ℹ 450 more rows
## # ℹ 4 more variables: yearly <dbl>, weekly <dbl>, daily <dbl>, .resid <dbl>
yearly_seasonality = model %>%
components() %>%
autoplot(yearly)
yearly_seasonality
model4 = vehicle_train %>%
model(
additive = fable.prophet::prophet(vehicle_sales~growth()+season(period='year',type='additive')),
multiplicative = fable.prophet::prophet(vehicle_sales~growth()+season(period='year',type='multiplicative')))
model4 %>%
components() %>%
autoplot()
Both the decomposition graph and table reveal the existence of additive seasonality within the time series, rather than multiplicative seasonality. This implies that the seasonal fluctuations within the data maintain a consistent magnitude over time, rather than scaling proportionally with the trend.
—-Section 4—-
vehicle_train %>%
bind_rows(vehicle_test) %>%
ggplot()+
geom_line(aes(date, vehicle_sales))+
geom_vline(aes(xintercept=ymd('2014-05-01')),color='red')+
annotate("text", x = as.Date("1976-01-01"), y = max(vehicle_sales_tbl_ts$vehicle_sales) * 0.9,
label = "Train", color = "blue", size = 4) +
annotate("text", x = as.Date("2023-11-01"), y = max(vehicle_sales_tbl_ts$vehicle_sales) * 0.9,
label = "Test", color = "blue", size = 4) +
labs(x = "Week", y = "vehicle sales", title = "Training and Testing Dataset") +
theme_bw()
vehicle_train_data <- vehicle_train %>%
stretch_tsibble(.init = 22*12, .step = 12)
vehicle_train_data %>%
ggplot()+
geom_point(aes(date,factor(.id),color=factor(.id)))+
ylab('Iteration')+
ggtitle('Samples included in each CV Iteration')
vehicle_train_cv_forecast <- vehicle_train_data %>%
model(
arima = ARIMA(vehicle_sales~pdq(2,0,2)),
prophet = fable.prophet::prophet(vehicle_sales~growth()+season(period='year')),
naive = NAIVE(vehicle_sales~drift())
) %>%
forecast(h=12)
vehicle_train_cv_forecast %>%
as_tsibble() %>%
dplyr::select(-vehicle_sales) %>%
left_join(
vehicle_train
) %>%
ggplot()+
geom_line(aes(date,vehicle_sales))+
geom_line(aes(date,.mean,color=factor(.id),linetype=.model))+
scale_color_discrete(name='Iteration')+
ylab('Ridership Count')+
theme_bw()
## Joining with `by = join_by(date)`
vehicle_train_cv_forecast %>%
as_tibble() %>%
dplyr::select(-vehicle_sales) %>%
left_join(vehicle_train) %>%
group_by(.id,.model) %>%
mutate(
months_ahead = seq(1:12)
) %>%
ungroup() %>%
mutate(error = abs(vehicle_sales - .mean)) %>%
ggplot()+
geom_boxplot(aes(factor(months_ahead),error))+
facet_wrap(~.model,ncol=1)+
guides(color='none')+
ylab('Absolute Error')+
xlab('Days Ahead')+
ggtitle('Absolute Error by Iteration')+
ylim(0,600)
## Joining with `by = join_by(date)`
vehicle_train_cv_forecast %>%
as_tibble() %>%
dplyr::select(-vehicle_sales) %>%
left_join(vehicle_train) %>%
group_by(.id,.model) %>%
mutate(
months_ahead = seq(1:12)
) %>%
ungroup() %>%
filter(!is.na(vehicle_sales)) %>%
group_by(months_ahead,.model) %>%
summarize(
rmse = sqrt(mean((vehicle_sales - .mean)^2,na.rm=T)),
) %>%
ungroup() %>%
ggplot()+
geom_point(aes(months_ahead,rmse,color=.model),alpha=0.4)+
geom_smooth(aes(months_ahead,rmse,color=.model),se=F)+
xlab("Months Ahead")+
ylab("Smoothed RMSE")
## Joining with `by = join_by(date)`
## `summarise()` has grouped output by 'months_ahead'. You can override using the
## `.groups` argument.
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
vehicle_train_cv_forecast %>%
as_tibble() %>%
dplyr::select(-vehicle_sales) %>%
left_join(vehicle_train) %>%
group_by(.id,.model) %>%
mutate(
months_ahead = seq(1:12)
) %>%
ungroup() %>%
group_by(months_ahead,.model) %>%
mutate(
mape = mean(abs((vehicle_sales - .mean)/vehicle_sales),na.rm=T)
) %>%
ungroup() %>%
ggplot()+
geom_point(aes(months_ahead,mape,color=.model),alpha=0.4)+
geom_smooth(aes(months_ahead,mape,color=.model),se=F)+
xlab("Months Ahead")+
ylab("Smoothed MAPE")
## Joining with `by = join_by(date)`
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
vehicle_train_cv_forecast %>%
accuracy(vehicle_train)
## # A tibble: 3 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 arima Test -9.43 117. 87.7 -1.61 7.20 0.784 0.794 0.463
## 2 naive Test -42.5 192. 149. -5.41 12.4 1.33 1.30 0.444
## 3 prophet Test -25.4 219. 171. -4.15 14.9 1.53 1.48 0.805
vehicle_train_model <- vehicle_train %>%
model(
prophet = fable.prophet::prophet(vehicle_sales~growth()+season(period='year', type = 'additive'))
)
vehicle_train_model %>%
forecast(h=115) %>%
autoplot(vehicle_train %>%
bind_rows(vehicle_test))+
ylab('vehicle sales')+
theme_bw()
# Produce forecasts for the training set
train_forecast <- vehicle_train_model %>%
forecast(new_data = vehicle_train)
# Visualize actual versus forecasted values for the training set
autoplot(train_forecast) +
autolayer(vehicle_train) +
labs(title = "Actual vs Forecasted Values for Training Set", y = "vehicle sales", x = "Date") +
theme_minimal()
## Plot variable not specified, automatically selected `.vars = vehicle_sales`
Although the forecast model adeptly captures the variability and trend within the data, it struggles to accurately estimate the magnitude of the observations. This indicates that while the model effectively tracks the general patterns, it tends to underestimate the true values, leading to discrepancies in performance when evaluated on the test set. Notably, the ARIMA model exhibits the lowest error rate among all models, as evidenced by its superior performance in terms of RMSE and MAPE.
When contrasting the fitted values with the actual observations, it’s evident that the model displays minimal bias, indicating a strong alignment with the training dataset. Yet, upon scrutinizing the forecast against the test data, a notable reduction in variability is observed, signaling a potential overfitting challenge. This disparity suggests that while the model effectively captures the intrinsic patterns, its ability to generalize to new data is compromised. Further exploration and adjustments may be necessary to mitigate the overfitting issue and enhance the model’s predictive performance.