Untitled

suppressWarnings({
  suppressMessages({
library(dplyr)
library(ggplot2)
library(lubridate)
library(gridExtra)
library(forecast)
library(modeltime)
library(fabletools)
library(feasts)
library(tsibble)
library(tibble)
library(fable)
library(data.table) 
library(knitr) 
library(prophet)  
library(fable.prophet)
library(tidyr)


options(warn=-1)
})
})

data <- read.csv("//Users/keerthichereddy/Documents/Sem2/Forecasting Methods/Project/total_vehicle_sales.csv", stringsAsFactors = FALSE)
names(data)

[1] "date"          "vehicle_sales"

 data_ts <- data %>% select(date, vehicle_sales) %>%
  mutate(value = vehicle_sales) %>%
  mutate(date = yearmonth(date)) %>%
  as_tsibble(index = date)

Section 1

The Prophet model, developed by Facebook, is designed for forecasting time series data. It works well with time series that have strong seasonal effects and several seasons of historical data. Prophet is robust to deal with missing data, outliers, handle daily, weekly, and yearly seasonality and shifts in the trend and typically handles outliers well.

The methodology combines decomposable time series models with robust statistical methods to fit the models, making it highly adaptable and easy to use, even with default settings.

# Set the seed for reproducibility
set.seed(123)

# Determine the number of rows for training (80%) and testing (20%)
n_rows <- nrow(data_ts)
train_rows <- round(0.8 * n_rows)

# Split the data into training and testing sets
data_train <- data_ts %>% slice(1:train_rows)
data_test <- data_ts %>% slice((train_rows + 1):n_rows)

# Visualize the training and test sets
ggplot() +
  geom_line(data = data_train, aes(x = date, y = value, color = "Training"), linewidth = 1) +
  geom_line(data = data_test, aes(x = date, y = value, color = "Test"), linewidth = 1) +
  scale_color_manual(values = c("Training" = "blue", "Test" = "red")) +
  labs(title = "Training and Test Sets", x = "Date", y = "vehicle sales") +
  theme_minimal()

Forecast using prophet model

data_train %>%
    model(prophet = fable.prophet::prophet(vehicle_sales)) %>%
    forecast(h=115) %>%
    autoplot(data_train %>% bind_rows(data_test))+
    ylab('vehicle sales')+
    xlab('Month')+
    theme_bw()

Section 2

model = data_train %>%
    model(prophet = fable.prophet::prophet(vehicle_sales))

model %>%
components() %>%
autoplot()

The decomposition of the time series data shows an upward trend until early 2000, followed by a downward slope. Also, an additive seasonality component can be seen without any indication of multiplicative seasonality.

changepoints = model %>%
glance() %>%
pull(changepoints) %>%
bind_rows() %>%
.$changepoints

data_train %>%
ggplot()+
geom_line(aes(date,vehicle_sales))+
geom_vline(xintercept=as.Date(changepoints),color='red',linetype='dashed')

model2 = data_train %>%
    model(
        prophet_orig = fable.prophet::prophet(vehicle_sales ~ growth(n_changepoints = 25) + season(period = 'year') + season(period = 'month', order = 12)),
        prophet_10 = fable.prophet::prophet(vehicle_sales ~ growth(n_changepoints = 10) + season(period = 'year') + season(period = 'month', order = 12))
        )


changepoints_orig = model2 %>%
glance() %>%
filter(.model == 'prophet_orig') %>%
pull(changepoints) %>%
bind_rows() %>%
#filter(abs(adjustment)>0.05) %>%
.$changepoints

changepoints_10 = model2 %>%
glance() %>%
filter(.model == 'prophet_10') %>%
pull(changepoints) %>%
bind_rows() %>%
#filter(abs(adjustment)>0.05) %>%
.$changepoints

data_train %>%
ggplot()+
geom_line(aes(date,vehicle_sales))+
# geom_vline(aes(xintercept=ymd('2000-01-01')))
geom_vline(xintercept=as.Date(changepoints_orig),color='red')+
geom_vline(xintercept=as.Date(changepoints_10),color='blue',linetype='dashed')

model1 = data_train %>%
    model(
        prophet_orig = fable.prophet::prophet(vehicle_sales),
        prophet_90_range = fable.prophet::prophet(vehicle_sales~growth(changepoint_range=0.9)+season(period='year')+ season(period = 'month', order = 12)),

        )

changepoints_orig = model1 %>%
glance() %>%
filter(.model == 'prophet_orig') %>%
pull(changepoints) %>%
bind_rows() %>%
#filter(abs(adjustment)>0.01) %>%
.$changepoints

changepoints_90_range = model1 %>%
glance() %>%
filter(.model == 'prophet_90_range') %>%
unnest(changepoints) %>%
bind_rows() %>%
#filter(abs(adjustment)>0.01) %>%
.$changepoints

model1 %>%
forecast(h=68) %>%
autoplot(data_train,level=NULL)+
geom_vline(xintercept=as.Date(changepoints_orig),color='blue',linetype='dashed')+
geom_vline(xintercept=as.Date(changepoints_90_range),color='red',linetype='dashed')

model3 = data_train %>%
    model(
        prophet_orig = fable.prophet::prophet(vehicle_sales),
        prophet_less_flexible = fable.prophet::prophet(vehicle_sales~growth(changepoint_prior_scale=0.01)+season(period='year')),
        prophet_more_flexible = fable.prophet::prophet(vehicle_sales~growth(changepoint_prior_scale=0.5)+season(period='year'))
        )

changepoints_orig = model3 %>%
glance() %>%
unnest(changepoints) %>%
bind_rows() %>% 
filter(.model == 'prophet_orig') %>%
filter(abs(adjustment)>=0.05) %>%
.$changepoints

changepoints_more_flexible = model3 %>%
glance() %>%
unnest(changepoints) %>%
bind_rows() %>%
filter(.model == 'prophet_more_flexible') %>%
filter(abs(adjustment)>=0.50) %>%
.$changepoints

changepoints_less_flexible = model3 %>%
glance() %>%
unnest(changepoints) %>%
bind_rows() %>%
filter(.model == 'prophet_less_flexible') %>%
filter(abs(adjustment)>=0.01) %>%
.$changepoints

model3 %>%
forecast(h=68) %>%
autoplot(data_train,level=NULL)+
geom_vline(xintercept=as.Date(changepoints_orig),color='blue',linetype='dashed')+
geom_vline(xintercept=as.Date(changepoints_more_flexible),color='green',linetype='dashed')+
geom_vline(xintercept=as.Date(changepoints_less_flexible),color='red',linetype='dashed')

A less flexible approach may better capture the underlying patterns in the data as changepoints_less_flexible shows superior forecast predictions compared to other changepoint options. In this case, it suggests that simpler models with fewer changepoints may provide more reliable forecasts.

model3 %>%
components() %>%
autoplot(trend)+
geom_vline(xintercept=as.Date(changepoints_orig),color='blue',linetype='dashed')+
geom_vline(xintercept=as.Date(changepoints_more_flexible),color='green',linetype='dashed')+
geom_vline(xintercept=as.Date(changepoints_less_flexible),color='red',linetype='dashed')

data_train %>%
    model(
        prophet_orig = fable.prophet::prophet(vehicle_sales),
        prophet_saturating = fable.prophet::prophet(vehicle_sales~growth(type='linear',capacity=3.7,floor=2.8)+season('year'))
        ) %>%
    forecast(h=68) %>%
    autoplot(data_train)

There is only a minimal deviation from the base model even after adjusting the floor and capacity parameters. It indicates that while these parameters theoretically influence the model’s behavior, their practical impact on forecasting accuracy may be limited in this particular scenario.

Section 3

model %>%
  components()

# A dable: 460 x 10 [1M]
# Key:     .model [1]
# :        vehicle_sales = trend * (1 + multiplicative_terms) + additive_terms
#   + .resid
   .model      date vehicle_sales additive_terms multiplicative_terms trend
   <chr>      <mth>         <dbl>          <dbl>                <dbl> <dbl>
 1 prophet 1976 Jan          885.          -36.3                    0  929.
 2 prophet 1976 Feb          995.           59.3                    0  929.
 3 prophet 1976 Mar         1244.          337.                     0  929.
 4 prophet 1976 Apr         1191.          235.                     0  930.
 5 prophet 1976 May         1203.          319.                     0  930.
 6 prophet 1976 Jun         1255.          300.                     0  930.
 7 prophet 1976 Jul         1162.          226.                     0  931.
 8 prophet 1976 Aug         1026.          181.                     0  931.
 9 prophet 1976 Sep         1058.          131.                     0  932.
10 prophet 1976 Oct         1129.          140.                     0  932.
# ℹ 450 more rows
# ℹ 4 more variables: yearly <dbl>, weekly <dbl>, daily <dbl>, .resid <dbl>

yearly_seasonality = model %>%
components() %>%
autoplot(yearly)

yearly_seasonality

model4 = data_train %>%
    model(
      additive = fable.prophet::prophet(vehicle_sales~growth()+season(period='year',type='additive')),
      multiplicative = fable.prophet::prophet(vehicle_sales~growth()+season(period='year',type='multiplicative')))

model4 %>%
components() %>%
autoplot()

Section 4

data_train %>%
bind_rows(data_test) %>%
    ggplot()+
    geom_line(aes(date, vehicle_sales))+
    geom_vline(aes(xintercept=ymd('2014-05-01')),color='red')+
  annotate("text", x = as.Date("1976-01-01"), y = max(data_ts$vehicle_sales) * 0.9, 
           label = "Train", color = "blue", size = 4) +
  annotate("text", x = as.Date("2023-11-01"), y = max(data_ts$vehicle_sales) * 0.9, 
           label = "Test", color = "blue", size = 4) +
  labs(x = "Week", y = "vehicle sales", title = "Training and Testing Dataset") +
    theme_bw()

data_train_data <- data_train %>%
  stretch_tsibble(.init = 22*12, .step = 12)

data_train_data %>%
    ggplot()+
    geom_point(aes(date,factor(.id),color=factor(.id)))+
    ylab('Iteration')+
    ggtitle('Samples included in each CV Iteration')

data_train_cv_forecast <- data_train_data %>%
  model(
    arima = ARIMA(vehicle_sales~pdq(2,0,2)),
    prophet = fable.prophet::prophet(vehicle_sales~growth()+season(period='year')),
    naive = NAIVE(vehicle_sales~drift())
  ) %>%
  forecast(h=12)


data_train_cv_forecast %>%
  as_tsibble() %>%
  dplyr::select(-vehicle_sales) %>%
  left_join(
      data_train
  ) %>%
    ggplot()+
    geom_line(aes(date,vehicle_sales))+
    geom_line(aes(date,.mean,color=factor(.id),linetype=.model))+
    scale_color_discrete(name='Iteration')+
    ylab('Ridership Count')+
    theme_bw()

Joining with `by = join_by(date)`

data_train_cv_forecast %>%
  as_tibble() %>%
  dplyr::select(-vehicle_sales) %>%
  left_join(data_train) %>%
  group_by(.id,.model) %>%
  mutate(
    months_ahead = seq(1:12)
  ) %>%
  ungroup() %>%
  mutate(error = abs(vehicle_sales - .mean)) %>%
  ggplot()+
  geom_boxplot(aes(factor(months_ahead),error))+
  facet_wrap(~.model,ncol=1)+
  guides(color='none')+
  ylab('Absolute Error')+
  xlab('Days Ahead')+
  ggtitle('Absolute Error by Iteration')+
  ylim(0,600)

Joining with `by = join_by(date)`

data_train_cv_forecast %>%
  as_tibble() %>%
  dplyr::select(-vehicle_sales) %>%
  left_join(data_train) %>% 
  group_by(.id,.model) %>%
  mutate(
    months_ahead = seq(1:12)
  ) %>%
  ungroup() %>% 
  filter(!is.na(vehicle_sales)) %>%
  group_by(months_ahead,.model) %>%
  summarize(
    rmse = sqrt(mean((vehicle_sales - .mean)^2,na.rm=T)),
  ) %>%
  ungroup() %>%
  ggplot()+
  geom_point(aes(months_ahead,rmse,color=.model),alpha=0.4)+ 
  geom_smooth(aes(months_ahead,rmse,color=.model),se=F)+
  xlab("Months Ahead")+
  ylab("Smoothed RMSE")

Joining with `by = join_by(date)`
`summarise()` has grouped output by 'months_ahead'. You can override using the
`.groups` argument.
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

data_train_cv_forecast %>%
  as_tibble() %>%
  dplyr::select(-vehicle_sales) %>%
  left_join(data_train) %>%
  group_by(.id,.model) %>%
  mutate(
    months_ahead = seq(1:12)
  ) %>%
  ungroup() %>%
  group_by(months_ahead,.model) %>%
  mutate(
    mape = mean(abs((vehicle_sales - .mean)/vehicle_sales),na.rm=T)
  ) %>%
  ungroup() %>%
  ggplot()+
  geom_point(aes(months_ahead,mape,color=.model),alpha=0.4)+ 
  geom_smooth(aes(months_ahead,mape,color=.model),se=F)+
  xlab("Months Ahead")+
  ylab("Smoothed MAPE")

Joining with `by = join_by(date)`
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

data_train_cv_forecast %>%
  accuracy(data_train)

# A tibble: 3 × 10
  .model  .type     ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
  <chr>   <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 arima   Test   -9.43  117.  87.7 -1.61  7.20 0.784 0.794 0.463
2 naive   Test  -42.5   192. 149.  -5.41 12.4  1.33  1.30  0.444
3 prophet Test  -25.5   219. 171.  -4.15 14.9  1.53  1.48  0.804

data_train_model <- data_train %>%
  model(
    prophet = fable.prophet::prophet(vehicle_sales~growth()+season(period='year', type = 'additive'))
  )

data_train_model %>%
    forecast(h=115) %>%
    autoplot(data_train %>%
    bind_rows(data_test))+
    ylab('vehicle sales')+
    theme_bw()

# Produce forecasts for the training set
train_forecast <- data_train_model %>%
  forecast(new_data = data_train)

# Visualize actual versus forecasted values for the training set
autoplot(train_forecast) +
  autolayer(data_train) +
  labs(title = "Actual vs Forecasted Values for Training Set", y = "vehicle sales", x = "Date") +
  theme_minimal()

Plot variable not specified, automatically selected `.vars = vehicle_sales`

Plotting the fitted values alongside the actual values reveals that the model shows low bias, indicating a strong fit to the training data. Yet, when comparing the forecasted values to the test data, it becomes clear that the forecast shows low variance, pointing to a possible issue of overfitting. This difference implies that although the model effectively identifies the underlying trends, it might have difficulties in generalizing to new, unseen data. This situation calls for further exploration and possible modifications to mitigate the overfitting concern.