suppressWarnings({
suppressMessages({
library(dplyr)
library(ggplot2)
library(lubridate)
library(gridExtra)
library(forecast)
library(modeltime)
library(fabletools)
library(feasts)
library(tsibble)
library(tibble)
library(fable)
library(data.table)
library(knitr)
library(prophet)
library(fable.prophet)
library(tidyr)
options(warn=-1)
})
})Untitled
data <- read.csv("//Users/keerthichereddy/Documents/Sem2/Forecasting Methods/Project/total_vehicle_sales.csv", stringsAsFactors = FALSE)
names(data)[1] "date" "vehicle_sales"
data_ts <- data %>% select(date, vehicle_sales) %>%
mutate(value = vehicle_sales) %>%
mutate(date = yearmonth(date)) %>%
as_tsibble(index = date)Section 1
The Prophet model, developed by Facebook, is designed for forecasting time series data. It works well with time series that have strong seasonal effects and several seasons of historical data. Prophet is robust to deal with missing data, outliers, handle daily, weekly, and yearly seasonality and shifts in the trend and typically handles outliers well.
The methodology combines decomposable time series models with robust statistical methods to fit the models, making it highly adaptable and easy to use, even with default settings.
# Set the seed for reproducibility
set.seed(123)
# Determine the number of rows for training (80%) and testing (20%)
n_rows <- nrow(data_ts)
train_rows <- round(0.8 * n_rows)
# Split the data into training and testing sets
data_train <- data_ts %>% slice(1:train_rows)
data_test <- data_ts %>% slice((train_rows + 1):n_rows)
# Visualize the training and test sets
ggplot() +
geom_line(data = data_train, aes(x = date, y = value, color = "Training"), linewidth = 1) +
geom_line(data = data_test, aes(x = date, y = value, color = "Test"), linewidth = 1) +
scale_color_manual(values = c("Training" = "blue", "Test" = "red")) +
labs(title = "Training and Test Sets", x = "Date", y = "vehicle sales") +
theme_minimal()Forecast using prophet model
data_train %>%
model(prophet = fable.prophet::prophet(vehicle_sales)) %>%
forecast(h=115) %>%
autoplot(data_train %>% bind_rows(data_test))+
ylab('vehicle sales')+
xlab('Month')+
theme_bw()Section 2
model = data_train %>%
model(prophet = fable.prophet::prophet(vehicle_sales))
model %>%
components() %>%
autoplot()The decomposition of the time series data shows an upward trend until early 2000, followed by a downward slope. Also, an additive seasonality component can be seen without any indication of multiplicative seasonality.
changepoints = model %>%
glance() %>%
pull(changepoints) %>%
bind_rows() %>%
.$changepoints
data_train %>%
ggplot()+
geom_line(aes(date,vehicle_sales))+
geom_vline(xintercept=as.Date(changepoints),color='red',linetype='dashed')model2 = data_train %>%
model(
prophet_orig = fable.prophet::prophet(vehicle_sales ~ growth(n_changepoints = 25) + season(period = 'year') + season(period = 'month', order = 12)),
prophet_10 = fable.prophet::prophet(vehicle_sales ~ growth(n_changepoints = 10) + season(period = 'year') + season(period = 'month', order = 12))
)
changepoints_orig = model2 %>%
glance() %>%
filter(.model == 'prophet_orig') %>%
pull(changepoints) %>%
bind_rows() %>%
#filter(abs(adjustment)>0.05) %>%
.$changepoints
changepoints_10 = model2 %>%
glance() %>%
filter(.model == 'prophet_10') %>%
pull(changepoints) %>%
bind_rows() %>%
#filter(abs(adjustment)>0.05) %>%
.$changepoints
data_train %>%
ggplot()+
geom_line(aes(date,vehicle_sales))+
# geom_vline(aes(xintercept=ymd('2000-01-01')))
geom_vline(xintercept=as.Date(changepoints_orig),color='red')+
geom_vline(xintercept=as.Date(changepoints_10),color='blue',linetype='dashed')model1 = data_train %>%
model(
prophet_orig = fable.prophet::prophet(vehicle_sales),
prophet_90_range = fable.prophet::prophet(vehicle_sales~growth(changepoint_range=0.9)+season(period='year')+ season(period = 'month', order = 12)),
)
changepoints_orig = model1 %>%
glance() %>%
filter(.model == 'prophet_orig') %>%
pull(changepoints) %>%
bind_rows() %>%
#filter(abs(adjustment)>0.01) %>%
.$changepoints
changepoints_90_range = model1 %>%
glance() %>%
filter(.model == 'prophet_90_range') %>%
unnest(changepoints) %>%
bind_rows() %>%
#filter(abs(adjustment)>0.01) %>%
.$changepoints
model1 %>%
forecast(h=68) %>%
autoplot(data_train,level=NULL)+
geom_vline(xintercept=as.Date(changepoints_orig),color='blue',linetype='dashed')+
geom_vline(xintercept=as.Date(changepoints_90_range),color='red',linetype='dashed')model3 = data_train %>%
model(
prophet_orig = fable.prophet::prophet(vehicle_sales),
prophet_less_flexible = fable.prophet::prophet(vehicle_sales~growth(changepoint_prior_scale=0.01)+season(period='year')),
prophet_more_flexible = fable.prophet::prophet(vehicle_sales~growth(changepoint_prior_scale=0.5)+season(period='year'))
)
changepoints_orig = model3 %>%
glance() %>%
unnest(changepoints) %>%
bind_rows() %>%
filter(.model == 'prophet_orig') %>%
filter(abs(adjustment)>=0.05) %>%
.$changepoints
changepoints_more_flexible = model3 %>%
glance() %>%
unnest(changepoints) %>%
bind_rows() %>%
filter(.model == 'prophet_more_flexible') %>%
filter(abs(adjustment)>=0.50) %>%
.$changepoints
changepoints_less_flexible = model3 %>%
glance() %>%
unnest(changepoints) %>%
bind_rows() %>%
filter(.model == 'prophet_less_flexible') %>%
filter(abs(adjustment)>=0.01) %>%
.$changepoints
model3 %>%
forecast(h=68) %>%
autoplot(data_train,level=NULL)+
geom_vline(xintercept=as.Date(changepoints_orig),color='blue',linetype='dashed')+
geom_vline(xintercept=as.Date(changepoints_more_flexible),color='green',linetype='dashed')+
geom_vline(xintercept=as.Date(changepoints_less_flexible),color='red',linetype='dashed')A less flexible approach may better capture the underlying patterns in the data as changepoints_less_flexible shows superior forecast predictions compared to other changepoint options. In this case, it suggests that simpler models with fewer changepoints may provide more reliable forecasts.
model3 %>%
components() %>%
autoplot(trend)+
geom_vline(xintercept=as.Date(changepoints_orig),color='blue',linetype='dashed')+
geom_vline(xintercept=as.Date(changepoints_more_flexible),color='green',linetype='dashed')+
geom_vline(xintercept=as.Date(changepoints_less_flexible),color='red',linetype='dashed')data_train %>%
model(
prophet_orig = fable.prophet::prophet(vehicle_sales),
prophet_saturating = fable.prophet::prophet(vehicle_sales~growth(type='linear',capacity=3.7,floor=2.8)+season('year'))
) %>%
forecast(h=68) %>%
autoplot(data_train)There is only a minimal deviation from the base model even after adjusting the floor and capacity parameters. It indicates that while these parameters theoretically influence the model’s behavior, their practical impact on forecasting accuracy may be limited in this particular scenario.
Section 3
model %>%
components()# A dable: 460 x 10 [1M]
# Key: .model [1]
# : vehicle_sales = trend * (1 + multiplicative_terms) + additive_terms
# + .resid
.model date vehicle_sales additive_terms multiplicative_terms trend
<chr> <mth> <dbl> <dbl> <dbl> <dbl>
1 prophet 1976 Jan 885. -36.3 0 929.
2 prophet 1976 Feb 995. 59.3 0 929.
3 prophet 1976 Mar 1244. 337. 0 929.
4 prophet 1976 Apr 1191. 235. 0 930.
5 prophet 1976 May 1203. 319. 0 930.
6 prophet 1976 Jun 1255. 300. 0 930.
7 prophet 1976 Jul 1162. 226. 0 931.
8 prophet 1976 Aug 1026. 181. 0 931.
9 prophet 1976 Sep 1058. 131. 0 932.
10 prophet 1976 Oct 1129. 140. 0 932.
# ℹ 450 more rows
# ℹ 4 more variables: yearly <dbl>, weekly <dbl>, daily <dbl>, .resid <dbl>
yearly_seasonality = model %>%
components() %>%
autoplot(yearly)
yearly_seasonality model4 = data_train %>%
model(
additive = fable.prophet::prophet(vehicle_sales~growth()+season(period='year',type='additive')),
multiplicative = fable.prophet::prophet(vehicle_sales~growth()+season(period='year',type='multiplicative')))
model4 %>%
components() %>%
autoplot()Section 4
data_train %>%
bind_rows(data_test) %>%
ggplot()+
geom_line(aes(date, vehicle_sales))+
geom_vline(aes(xintercept=ymd('2014-05-01')),color='red')+
annotate("text", x = as.Date("1976-01-01"), y = max(data_ts$vehicle_sales) * 0.9,
label = "Train", color = "blue", size = 4) +
annotate("text", x = as.Date("2023-11-01"), y = max(data_ts$vehicle_sales) * 0.9,
label = "Test", color = "blue", size = 4) +
labs(x = "Week", y = "vehicle sales", title = "Training and Testing Dataset") +
theme_bw()data_train_data <- data_train %>%
stretch_tsibble(.init = 22*12, .step = 12)
data_train_data %>%
ggplot()+
geom_point(aes(date,factor(.id),color=factor(.id)))+
ylab('Iteration')+
ggtitle('Samples included in each CV Iteration')data_train_cv_forecast <- data_train_data %>%
model(
arima = ARIMA(vehicle_sales~pdq(2,0,2)),
prophet = fable.prophet::prophet(vehicle_sales~growth()+season(period='year')),
naive = NAIVE(vehicle_sales~drift())
) %>%
forecast(h=12)
data_train_cv_forecast %>%
as_tsibble() %>%
dplyr::select(-vehicle_sales) %>%
left_join(
data_train
) %>%
ggplot()+
geom_line(aes(date,vehicle_sales))+
geom_line(aes(date,.mean,color=factor(.id),linetype=.model))+
scale_color_discrete(name='Iteration')+
ylab('Ridership Count')+
theme_bw()Joining with `by = join_by(date)`
data_train_cv_forecast %>%
as_tibble() %>%
dplyr::select(-vehicle_sales) %>%
left_join(data_train) %>%
group_by(.id,.model) %>%
mutate(
months_ahead = seq(1:12)
) %>%
ungroup() %>%
mutate(error = abs(vehicle_sales - .mean)) %>%
ggplot()+
geom_boxplot(aes(factor(months_ahead),error))+
facet_wrap(~.model,ncol=1)+
guides(color='none')+
ylab('Absolute Error')+
xlab('Days Ahead')+
ggtitle('Absolute Error by Iteration')+
ylim(0,600)Joining with `by = join_by(date)`
data_train_cv_forecast %>%
as_tibble() %>%
dplyr::select(-vehicle_sales) %>%
left_join(data_train) %>%
group_by(.id,.model) %>%
mutate(
months_ahead = seq(1:12)
) %>%
ungroup() %>%
filter(!is.na(vehicle_sales)) %>%
group_by(months_ahead,.model) %>%
summarize(
rmse = sqrt(mean((vehicle_sales - .mean)^2,na.rm=T)),
) %>%
ungroup() %>%
ggplot()+
geom_point(aes(months_ahead,rmse,color=.model),alpha=0.4)+
geom_smooth(aes(months_ahead,rmse,color=.model),se=F)+
xlab("Months Ahead")+
ylab("Smoothed RMSE")Joining with `by = join_by(date)`
`summarise()` has grouped output by 'months_ahead'. You can override using the
`.groups` argument.
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
data_train_cv_forecast %>%
as_tibble() %>%
dplyr::select(-vehicle_sales) %>%
left_join(data_train) %>%
group_by(.id,.model) %>%
mutate(
months_ahead = seq(1:12)
) %>%
ungroup() %>%
group_by(months_ahead,.model) %>%
mutate(
mape = mean(abs((vehicle_sales - .mean)/vehicle_sales),na.rm=T)
) %>%
ungroup() %>%
ggplot()+
geom_point(aes(months_ahead,mape,color=.model),alpha=0.4)+
geom_smooth(aes(months_ahead,mape,color=.model),se=F)+
xlab("Months Ahead")+
ylab("Smoothed MAPE")Joining with `by = join_by(date)`
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
data_train_cv_forecast %>%
accuracy(data_train)# A tibble: 3 × 10
.model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 arima Test -9.43 117. 87.7 -1.61 7.20 0.784 0.794 0.463
2 naive Test -42.5 192. 149. -5.41 12.4 1.33 1.30 0.444
3 prophet Test -25.5 219. 171. -4.15 14.9 1.53 1.48 0.804
data_train_model <- data_train %>%
model(
prophet = fable.prophet::prophet(vehicle_sales~growth()+season(period='year', type = 'additive'))
)data_train_model %>%
forecast(h=115) %>%
autoplot(data_train %>%
bind_rows(data_test))+
ylab('vehicle sales')+
theme_bw()# Produce forecasts for the training set
train_forecast <- data_train_model %>%
forecast(new_data = data_train)
# Visualize actual versus forecasted values for the training set
autoplot(train_forecast) +
autolayer(data_train) +
labs(title = "Actual vs Forecasted Values for Training Set", y = "vehicle sales", x = "Date") +
theme_minimal()Plot variable not specified, automatically selected `.vars = vehicle_sales`
Plotting the fitted values alongside the actual values reveals that the model shows low bias, indicating a strong fit to the training data. Yet, when comparing the forecasted values to the test data, it becomes clear that the forecast shows low variance, pointing to a possible issue of overfitting. This difference implies that although the model effectively identifies the underlying trends, it might have difficulties in generalizing to new, unseen data. This situation calls for further exploration and possible modifications to mitigate the overfitting concern.