Assignment 5 Forecasting

##Packages
suppressWarnings({
  suppressMessages({
library(dplyr)
library(ggplot2)
library(lubridate)
library(gridExtra)
library(forecast)
library(modeltime)
library(fabletools)
library(feasts)
library(tsibble)
library(tibble)
library(fable)
library(data.table) 
library(knitr) 
library(prophet)  
library(fable.prophet)
library(tidyr)


options(warn=-1)
})
})

—-Section 1—-

Prophet, a forecasting tool created by Facebook, specializes in analyzing time-series data with daily observations showcasing trends, seasonality, and holiday effects. It stands out for its user-friendly interface, offering intuitive parameter adjustments, automatic feature selection, and robust modeling techniques. Prophet efficiently captures intricate data patterns, making it a valuable asset for accurate time-series forecasting.

The basic model is fit as:

yt = gt + st +ht + et

where gts the trend, stis seasonality, and ht are holidays.

vehicle_df <- read.csv('total_vehicle_sales.csv')

 vehicle_sales_tbl_ts <- vehicle_df %>% select(date, vehicle_sales) %>%
  mutate(value = vehicle_sales) %>%
  mutate(date = yearmonth(date)) %>%
  as_tsibble(index = date)

set.seed(123)

# Determine the number of rows for training (80%) and testing (20%)
n_rows <- nrow(vehicle_sales_tbl_ts)
train_rows <- round(0.8 * n_rows)

# Split the data into training and testing sets
vehicle_train <- vehicle_sales_tbl_ts %>% slice(1:train_rows)
vehicle_test <- vehicle_sales_tbl_ts %>% slice((train_rows + 1):n_rows)

# Visualize the training and test sets
ggplot() +
  geom_line(data = vehicle_train, aes(x = date, y = value, color = "Training"), linewidth = 0.4) +
  geom_line(data = vehicle_test, aes(x = date, y = value, color = "Test"), linewidth = 0.4) +
  scale_color_manual(values = c("Training" = "blue", "Test" = "red")) +
  labs(title = "Training and Testing data", x = "Date", y = "vehicle sales") +
  theme_minimal()

vehicle_train %>%
    model(prophet = fable.prophet::prophet(vehicle_sales)) %>%
    forecast(h=115) %>%
    autoplot(vehicle_train %>% bind_rows(vehicle_test))+
    ylab('vehicle sales')+
    xlab('Month')+
    theme_bw()

—-Section 2—-

The analysis of the time series data indicates a rising trend until the early 2000s, followed by a decline thereafter. Furthermore, there is an additive seasonal pattern observed, with no evidence of multiplicative seasonality.

model = vehicle_train %>%
    model(prophet = fable.prophet::prophet(vehicle_sales))

model %>%
components() %>%
autoplot()

changepoints = model %>%
glance() %>%
pull(changepoints) %>%
bind_rows() %>%
.$changepoints

vehicle_train %>%
ggplot()+
geom_line(aes(date,vehicle_sales))+
geom_vline(xintercept=as.Date(changepoints),color='red',linetype='dashed')

model2 = vehicle_train %>%
    model(
        prophet_orig = fable.prophet::prophet(vehicle_sales ~ growth(n_changepoints = 25) + season(period = 'year') + season(period = 'month', order = 12)),
        prophet_10 = fable.prophet::prophet(vehicle_sales ~ growth(n_changepoints = 10) + season(period = 'year') + season(period = 'month', order = 12))
        )


changepoints_orig = model2 %>%
glance() %>%
filter(.model == 'prophet_orig') %>%
pull(changepoints) %>%
bind_rows() %>%
#filter(abs(adjustment)>0.05) %>%
.$changepoints

changepoints_10 = model2 %>%
glance() %>%
filter(.model == 'prophet_10') %>%
pull(changepoints) %>%
bind_rows() %>%
#filter(abs(adjustment)>0.05) %>%
.$changepoints

vehicle_train %>%
ggplot()+
geom_line(aes(date,vehicle_sales))+
# geom_vline(aes(xintercept=ymd('2000-01-01')))
geom_vline(xintercept=as.Date(changepoints_orig),color='red')+
geom_vline(xintercept=as.Date(changepoints_10),color='blue',linetype='dashed')

model1 = vehicle_train %>%
    model(
        prophet_orig = fable.prophet::prophet(vehicle_sales),
        prophet_90_range = fable.prophet::prophet(vehicle_sales~growth(changepoint_range=0.9)+season(period='year')+ season(period = 'month', order = 12)),

        )

changepoints_orig = model1 %>%
glance() %>%
filter(.model == 'prophet_orig') %>%
pull(changepoints) %>%
bind_rows() %>%
#filter(abs(adjustment)>0.01) %>%
.$changepoints

changepoints_90_range = model1 %>%
glance() %>%
filter(.model == 'prophet_90_range') %>%
unnest(changepoints) %>%
bind_rows() %>%
#filter(abs(adjustment)>0.01) %>%
.$changepoints

model1 %>%
forecast(h=68) %>%
autoplot(vehicle_train,level=NULL)+
geom_vline(xintercept=as.Date(changepoints_orig),color='blue',linetype='dashed')+
geom_vline(xintercept=as.Date(changepoints_90_range),color='red',linetype='dashed')

model3 = vehicle_train %>%
    model(
        prophet_orig = fable.prophet::prophet(vehicle_sales),
        prophet_less_flexible = fable.prophet::prophet(vehicle_sales~growth(changepoint_prior_scale=0.01)+season(period='year')),
        prophet_more_flexible = fable.prophet::prophet(vehicle_sales~growth(changepoint_prior_scale=0.5)+season(period='year'))
        )

changepoints_orig = model3 %>%
glance() %>%
unnest(changepoints) %>%
bind_rows() %>% 
filter(.model == 'prophet_orig') %>%
filter(abs(adjustment)>=0.05) %>%
.$changepoints

changepoints_more_flexible = model3 %>%
glance() %>%
unnest(changepoints) %>%
bind_rows() %>%
filter(.model == 'prophet_more_flexible') %>%
filter(abs(adjustment)>=0.50) %>%
.$changepoints

changepoints_less_flexible = model3 %>%
glance() %>%
unnest(changepoints) %>%
bind_rows() %>%
filter(.model == 'prophet_less_flexible') %>%
filter(abs(adjustment)>=0.01) %>%
.$changepoints

model3 %>%
forecast(h=68) %>%
autoplot(vehicle_train,level=NULL)+
geom_vline(xintercept=as.Date(changepoints_orig),color='blue',linetype='dashed')+
geom_vline(xintercept=as.Date(changepoints_more_flexible),color='green',linetype='dashed')+
geom_vline(xintercept=as.Date(changepoints_less_flexible),color='red',linetype='dashed')

model3 %>%
components() %>%
autoplot(trend)+
geom_vline(xintercept=as.Date(changepoints_orig),color='blue',linetype='dashed')+
geom_vline(xintercept=as.Date(changepoints_more_flexible),color='green',linetype='dashed')+
geom_vline(xintercept=as.Date(changepoints_less_flexible),color='red',linetype='dashed')

vehicle_train %>%
    model(
        prophet_orig = fable.prophet::prophet(vehicle_sales),
        prophet_saturating = fable.prophet::prophet(vehicle_sales~growth(type='linear',capacity=3.7,floor=2.8)+season('year'))
        ) %>%
    forecast(h=68) %>%
    autoplot(vehicle_train)

The forecast predictions from changepoints_less_flexible outperform other changepoint options, suggesting that a less flexible approach may better capture the underlying data patterns. This implies that simpler models with fewer changepoints could yield more dependable forecasts.

Although adjustments were made to the floor and capacity parameters to a significant extent, the forecasts produced show minimal deviation from the base Prophet model. This suggests that while these parameters theoretically affect the model’s behavior, their practical impact on forecasting accuracy may be limited in this dataset. Further exploration and experimentation may be necessary to gain a deeper understanding of their effect.

—-Section 3—-

model %>%
  components()

## # A dable: 460 x 10 [1M]
## # Key:     .model [1]
## # :        vehicle_sales = trend * (1 + multiplicative_terms) + additive_terms
## #   + .resid
##    .model      date vehicle_sales additive_terms multiplicative_terms trend
##    <chr>      <mth>         <dbl>          <dbl>                <dbl> <dbl>
##  1 prophet 1976 Jan          885.          -36.3                    0  929.
##  2 prophet 1976 Feb          995.           59.3                    0  929.
##  3 prophet 1976 Mar         1244.          337.                     0  929.
##  4 prophet 1976 Apr         1191.          235.                     0  930.
##  5 prophet 1976 May         1203.          319.                     0  930.
##  6 prophet 1976 Jun         1255.          300.                     0  930.
##  7 prophet 1976 Jul         1162.          226.                     0  931.
##  8 prophet 1976 Aug         1026.          181.                     0  931.
##  9 prophet 1976 Sep         1058.          131.                     0  932.
## 10 prophet 1976 Oct         1129.          140.                     0  932.
## # ℹ 450 more rows
## # ℹ 4 more variables: yearly <dbl>, weekly <dbl>, daily <dbl>, .resid <dbl>

yearly_seasonality = model %>%
components() %>%
autoplot(yearly)

yearly_seasonality

model4 = vehicle_train %>%
    model(
      additive = fable.prophet::prophet(vehicle_sales~growth()+season(period='year',type='additive')),
      multiplicative = fable.prophet::prophet(vehicle_sales~growth()+season(period='year',type='multiplicative')))

model4 %>%
components() %>%
autoplot()

Both the decomposition graph and table reveal the existence of additive seasonality within the time series, rather than multiplicative seasonality. This implies that the seasonal fluctuations within the data maintain a consistent magnitude over time, rather than scaling proportionally with the trend.

—-Section 4—-

vehicle_train %>%
bind_rows(vehicle_test) %>%
    ggplot()+
    geom_line(aes(date, vehicle_sales))+
    geom_vline(aes(xintercept=ymd('2014-05-01')),color='red')+
  annotate("text", x = as.Date("1976-01-01"), y = max(vehicle_sales_tbl_ts$vehicle_sales) * 0.9, 
           label = "Train", color = "blue", size = 4) +
  annotate("text", x = as.Date("2023-11-01"), y = max(vehicle_sales_tbl_ts$vehicle_sales) * 0.9, 
           label = "Test", color = "blue", size = 4) +
  labs(x = "Week", y = "vehicle sales", title = "Training and Testing Dataset") +
    theme_bw()

vehicle_train_data <- vehicle_train %>%
  stretch_tsibble(.init = 22*12, .step = 12)

vehicle_train_data %>%
    ggplot()+
    geom_point(aes(date,factor(.id),color=factor(.id)))+
    ylab('Iteration')+
    ggtitle('Samples included in each CV Iteration')

vehicle_train_cv_forecast <- vehicle_train_data %>%
  model(
    arima = ARIMA(vehicle_sales~pdq(2,0,2)),
    prophet = fable.prophet::prophet(vehicle_sales~growth()+season(period='year')),
    naive = NAIVE(vehicle_sales~drift())
  ) %>%
  forecast(h=12)


vehicle_train_cv_forecast %>%
  as_tsibble() %>%
  dplyr::select(-vehicle_sales) %>%
  left_join(
      vehicle_train
  ) %>%
    ggplot()+
    geom_line(aes(date,vehicle_sales))+
    geom_line(aes(date,.mean,color=factor(.id),linetype=.model))+
    scale_color_discrete(name='Iteration')+
    ylab('Ridership Count')+
    theme_bw()

## Joining with `by = join_by(date)`

vehicle_train_cv_forecast %>%
  as_tibble() %>%
  dplyr::select(-vehicle_sales) %>%
  left_join(vehicle_train) %>%
  group_by(.id,.model) %>%
  mutate(
    months_ahead = seq(1:12)
  ) %>%
  ungroup() %>%
  mutate(error = abs(vehicle_sales - .mean)) %>%
  ggplot()+
  geom_boxplot(aes(factor(months_ahead),error))+
  facet_wrap(~.model,ncol=1)+
  guides(color='none')+
  ylab('Absolute Error')+
  xlab('Days Ahead')+
  ggtitle('Absolute Error by Iteration')+
  ylim(0,600)

## Joining with `by = join_by(date)`

vehicle_train_cv_forecast %>%
  as_tibble() %>%
  dplyr::select(-vehicle_sales) %>%
  left_join(vehicle_train) %>% 
  group_by(.id,.model) %>%
  mutate(
    months_ahead = seq(1:12)
  ) %>%
  ungroup() %>% 
  filter(!is.na(vehicle_sales)) %>%
  group_by(months_ahead,.model) %>%
  summarize(
    rmse = sqrt(mean((vehicle_sales - .mean)^2,na.rm=T)),
  ) %>%
  ungroup() %>%
  ggplot()+
  geom_point(aes(months_ahead,rmse,color=.model),alpha=0.4)+ 
  geom_smooth(aes(months_ahead,rmse,color=.model),se=F)+
  xlab("Months Ahead")+
  ylab("Smoothed RMSE")

## Joining with `by = join_by(date)`
## `summarise()` has grouped output by 'months_ahead'. You can override using the
## `.groups` argument.
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

vehicle_train_cv_forecast %>%
  as_tibble() %>%
  dplyr::select(-vehicle_sales) %>%
  left_join(vehicle_train) %>%
  group_by(.id,.model) %>%
  mutate(
    months_ahead = seq(1:12)
  ) %>%
  ungroup() %>%
  group_by(months_ahead,.model) %>%
  mutate(
    mape = mean(abs((vehicle_sales - .mean)/vehicle_sales),na.rm=T)
  ) %>%
  ungroup() %>%
  ggplot()+
  geom_point(aes(months_ahead,mape,color=.model),alpha=0.4)+ 
  geom_smooth(aes(months_ahead,mape,color=.model),se=F)+
  xlab("Months Ahead")+
  ylab("Smoothed MAPE")

## Joining with `by = join_by(date)`
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

vehicle_train_cv_forecast %>%
  accuracy(vehicle_train)

## # A tibble: 3 × 10
##   .model  .type     ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>   <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 arima   Test   -9.43  117.  87.7 -1.61  7.20 0.784 0.794 0.463
## 2 naive   Test  -42.5   192. 149.  -5.41 12.4  1.33  1.30  0.444
## 3 prophet Test  -25.4   219. 171.  -4.15 14.9  1.53  1.48  0.805

vehicle_train_model <- vehicle_train %>%
  model(
    prophet = fable.prophet::prophet(vehicle_sales~growth()+season(period='year', type = 'additive'))
  )

vehicle_train_model %>%
    forecast(h=115) %>%
    autoplot(vehicle_train %>%
    bind_rows(vehicle_test))+
    ylab('vehicle sales')+
    theme_bw()

# Produce forecasts for the training set
train_forecast <- vehicle_train_model %>%
  forecast(new_data = vehicle_train)

# Visualize actual versus forecasted values for the training set
autoplot(train_forecast) +
  autolayer(vehicle_train) +
  labs(title = "Actual vs Forecasted Values for Training Set", y = "vehicle sales", x = "Date") +
  theme_minimal()

## Plot variable not specified, automatically selected `.vars = vehicle_sales`

Although the forecast model adeptly captures the variability and trend within the data, it struggles to accurately estimate the magnitude of the observations. This indicates that while the model effectively tracks the general patterns, it tends to underestimate the true values, leading to discrepancies in performance when evaluated on the test set. Notably, the ARIMA model exhibits the lowest error rate among all models, as evidenced by its superior performance in terms of RMSE and MAPE.

When contrasting the fitted values with the actual observations, it’s evident that the model displays minimal bias, indicating a strong alignment with the training dataset. Yet, upon scrutinizing the forecast against the test data, a notable reduction in variability is observed, signaling a potential overfitting challenge. This disparity suggests that while the model effectively captures the intrinsic patterns, its ability to generalize to new data is compromised. Further exploration and adjustments may be necessary to mitigate the overfitting issue and enhance the model’s predictive performance.

Assignment 5 Forecasting

Sai Lohith Roy Valleri

2024-02-19