## Packages
suppressWarnings({
  suppressMessages({
    library(dplyr)
    library(ggplot2)
    library(lubridate)
    library(gridExtra)
    library(forecast)
    library(modeltime)
    library(fabletools)
    library(feasts)
    library(tsibble)
    library(tibble)
    library(fable)
    library(data.table)
    library(knitr)
    library(tibbletime)
    # install.packages('tibbletime')
    options(warn = -1)
  })
})

## Section 1 - Train and Test Splits

I split the vehicle sales prior to Nov 2013 (Jan 1976 - Nov 2013) into the training set and the sales after that (Dec 2013 - Nov 2023) into the test set. Based on the trends in the line plot above, the sales after the red line (which marks the split point described above) appear to follow similar patterns, except for the pandemic-driven drop in sales, which was outside anyone's control. Hence, I believe the training set is representative of the test set.

# Load the monthly vehicle sales data and convert it to a tsibble indexed by month
vehicle_df <- read.csv('total_vehicle_sales.csv', header = TRUE)

vehicle_sales_tbl_ts <- vehicle_df %>%
  select(date, vehicle_sales) %>%
  mutate(value = vehicle_sales,
         date = yearmonth(date)) %>%
  as_tsibble(index = date)
set.seed(123)

# Determine the number of rows for training (80%) and testing (20%)
n_rows <- nrow(vehicle_sales_tbl_ts)
train_rows <- round(0.8 * n_rows)

# Split the data into training and testing sets
train_data <- vehicle_sales_tbl_ts %>% slice(1:train_rows)
test_data <- vehicle_sales_tbl_ts %>% slice((train_rows + 1):n_rows)
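
For reference, the Nov 2013 cut-off described above can also be written explicitly in terms of the date rather than a row count. This is a minimal sketch using the vehicle_sales_tbl_ts object built above; depending on the exact span of the CSV it may not line up precisely with the 80% row split used here.

# Hypothetical date-based alternative to the 80% row split:
# training data up to Nov 2013, test data from Dec 2013 onwards
train_data_by_date <- vehicle_sales_tbl_ts %>%
  filter(date <= yearmonth("2013 Nov"))
test_data_by_date <- vehicle_sales_tbl_ts %>%
  filter(date > yearmonth("2013 Nov"))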

# Visualize the training and test sets
ggplot() +
  geom_line(data = train_data, aes(x = date, y = value, color = "Training"), linewidth = 0.3) +
  geom_line(data = test_data, aes(x = date, y = value, color = "Test"), linewidth = 0.3) +
  scale_color_manual(values = c("Training" = "blue", "Test" = "red")) +
  labs(title = "Training and Test Sets", x = "Date", y = "vehicle sales") +
  theme_minimal()

## Section 2 - Cross-Validation Scheme

# Expanding-window cross-validation: the first fold uses the first 22 years
# (264 months) of the training data, and each subsequent fold adds 12 more months.
vehicle_train_cv <- train_data %>%
  stretch_tsibble(.init = 22 * 12, .step = 12)

vehicle_train_cv %>%
  ggplot() +
  geom_point(aes(date, factor(.id), color = factor(.id))) +
  ylab('Iteration') +
  ggtitle('Samples included in each CV Iteration')

## Section 3 - Model Selection and Comparison

The training set contains around 23 years of vehicle sales data, totaling 273 data points. While the naive model occasionally outperforms the ARIMA model on individual folds, the cross-validation results below show that, overall, the ARIMA model performs better than the naive model across the different forecast horizons.
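
As a quick sanity check on these figures, the size and span of the training window and the number of CV folds can be inspected directly. This is a small sketch using the objects defined above; it simply prints the counts rather than asserting any particular values.

# Number of monthly observations in the training set and its start/end months
train_data %>%
  as_tibble() %>%
  summarise(n_obs = n(), start = first(date), end = last(date))

# Number of cross-validation folds generated by stretch_tsibble()
vehicle_train_cv %>%
  as_tibble() %>%
  summarise(n_folds = n_distinct(.id))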

# Fit an ARIMA(2,0,2) model and a naive (drift) benchmark on each CV fold,
# then forecast 6 months ahead
vehicle_train_cv_forecast <- vehicle_train_cv %>%
  model(
    arima = ARIMA(value ~ pdq(2, 0, 2)),
    naive = NAIVE(value ~ drift())
  ) %>%
  forecast(h = 6)

# Plot the 6-month forecasts from each CV fold against the training data
vehicle_train_cv_forecast %>%
  autoplot(vehicle_train_cv) +
  facet_wrap(~.id, nrow = 6) +
  theme_bw() +
  ylab('vehicle sales')

# Overlay the point forecasts from every fold on the actual training series
vehicle_train_cv_forecast %>%
  as_tibble() %>%
  select(-value) %>%
  left_join(train_data, by = "date") %>%
  ggplot() +
  geom_line(aes(date, value)) +
  geom_line(aes(date, .mean, color = factor(.id), linetype = .model)) +
  scale_color_discrete(name = 'Iteration') +
  ylab('vehicle sales') +
  theme_bw()

# Average RMSE across CV folds at each forecast horizon
vehicle_train_cv_forecast %>%
  group_by(.id, .model) %>%
  mutate(h = row_number()) %>%
  ungroup() %>%
  as_fable(response = "value", distribution = value) %>%
  accuracy(train_data, by = c("h", ".model")) %>%
  ggplot(aes(x = h, y = RMSE, color = .model)) +
  geom_point() +
  geom_line() +
  ylab('Average RMSE at Forecasting Intervals') +
  xlab('Months in the Future')

# Same comparison using MAPE at each forecast horizon
vehicle_train_cv_forecast %>%
  group_by(.id, .model) %>%
  mutate(h = row_number()) %>%
  ungroup() %>%
  as_fable(response = "value", distribution = value) %>%
  accuracy(train_data, by = c("h", ".model")) %>%
  mutate(MAPE = MAPE / 100) %>% # rescale to a proportion for percent labels
  ggplot(aes(x = h, y = MAPE, color = .model)) +
  geom_point() +
  geom_line() +
  theme_bw() +
  scale_y_continuous(
    name = 'Average MAPE at Forecasting Intervals', labels = scales::percent)

# Per-fold accuracy metrics for both models
vehicle_train_cv_forecast %>%
  filter(.id < 20) %>%
  group_by(.id) %>%
  accuracy(train_data) %>%
  ungroup() %>%
  arrange(.id) %>%
  data.table()
##     .model .id .type          ME      RMSE       MAE         MPE      MAPE
##  1:  arima   1  Test   24.728520  82.80546  69.21523   1.0658581  4.980571
##  2:  naive   1  Test   67.610203 205.70932 188.57395   2.8156321 13.987953
##  3:  arima   2  Test   63.126974  83.12091  77.99279   3.9025877  5.225522
##  4:  naive   2  Test   79.649515 201.69306 181.63061   3.7089791 12.545517
##  5:  arima   3  Test   62.285522  89.63868  63.38809   3.8654290  3.954396
##  6:  naive   3  Test  112.310163 191.45922 180.05523   6.1266049 11.593002
##  7:  arima   4  Test   29.937974  59.66279  56.36607   1.8430254  3.762277
##  8:  naive   4  Test  202.614939 264.29450 228.01193  12.4770793 14.600041
##  9:  arima   5  Test  -90.205244  96.48966  90.20524  -6.4831447  6.483145
## 10:  naive   5  Test   87.859563 176.28938 160.45807   4.9017310 11.281180
## 11:  arima   6  Test  -79.952076  91.82735  79.95208  -6.0389850  6.038985
## 12:  naive   6  Test  -80.532840 183.76190 130.50616  -7.4440926 10.600814
## 13:  arima   7  Test  -32.081158  58.37355  48.30251  -2.4039657  3.380653
## 14:  naive   7  Test  -37.476475 166.89875 126.80418  -4.0668620  9.575601
## 15:  arima   8  Test  -23.677738  91.89673  82.86632  -2.3841784  5.946845
## 16:  naive   8  Test -124.433077 242.41521 174.65601 -11.0272969 13.980086
## 17:  arima   9  Test  -18.806681  40.12690  26.05585  -1.1662123  1.782553
## 18:  naive   9  Test  -95.337056 173.62656 113.48091  -7.8605830  9.015170
## 19:  arima  10  Test  -30.877697  54.10425  46.52647  -2.4246567  3.412970
## 20:  naive  10  Test  -78.379481 181.03480 144.90097  -7.1448476 11.347083
## 21:  arima  11  Test -126.515038 150.04490 126.51504  -9.9713219  9.971322
## 22:  naive  11  Test -161.569386 199.89262 161.56939 -13.8818328 13.881833
## 23:  arima  12  Test -179.933088 188.84406 179.93309 -21.6957694 21.695769
## 24:  naive  12  Test -101.257141 139.99878 108.55668 -14.1220148 14.899877
## 25:  arima  13  Test   12.162463  35.27311  29.61274   1.0808596  2.876431
## 26:  naive  13  Test  -99.823755 178.23919 133.07643 -13.5262834 16.532692
## 27:  arima  14  Test    3.025458  89.01285  81.86042   0.2030432  7.519574
## 28:  naive  14  Test  -90.831331 163.66096 130.80789 -10.3434549 13.492669
## 29:  arima  15  Test   75.939529  91.46023  78.67768   5.8589420  6.086274
## 30:  naive  15  Test  -32.888336 163.47156 131.49489  -4.6255282 11.687490
## 31:  arima  16  Test   30.459573  32.88544  30.45957   2.2929957  2.292996
## 32:  naive  16  Test  -49.633082 156.42932 128.88986  -5.1680776 10.571414
## 33:  arima  17  Test    9.683441  53.29158  49.34706   0.1320706  3.814136
## 34:  naive  17  Test  -77.572629 209.26154 175.55220  -8.3882284 14.728748
##     .model .id .type          ME      RMSE       MAE         MPE      MAPE
##          MASE     RMSSE          ACF1
##  1: 0.6556739 0.6007557  4.297619e-01
##  2: 1.7863558 1.4924263  4.415678e-01
##  3: 0.7446313 0.6086313 -1.872480e-02
##  4: 1.7341068 1.4768451  2.752264e-01
##  5: 0.6007834 0.6546318  9.918132e-05
##  6: 1.7065381 1.3982279  5.048391e-02
##  7: 0.5384668 0.4396482 -3.851460e-01
##  8: 2.1782050 1.9475557  4.364741e-02
##  9: 0.8574263 0.7073735 -4.629632e-01
## 10: 1.5251992 1.2923917  2.942982e-01
## 11: 0.7509831 0.6658542  1.616236e-01
## 12: 1.2258333 1.3324858  3.906079e-01
## 13: 0.4631514 0.4299624 -3.658617e-01
## 14: 1.2158691 1.2293271  2.454842e-01
## 15: 0.8033601 0.6840014 -2.814111e-01
## 16: 1.6932292 1.8043334  2.746266e-01
## 17: 0.2522286 0.2988781  2.123586e-01
## 18: 1.0985302 1.2932265  2.989677e-01
## 19: 0.4548923 0.4050376 -7.336295e-01
## 20: 1.4167060 1.3552705  7.424795e-02
## 21: 1.2542740 1.1352299  9.879810e-02
## 22: 1.6018039 1.5123744  5.763002e-03
## 23: 1.7064788 1.3546556  4.446194e-02
## 24: 1.0295475 1.0042684  3.622699e-01
## 25: 0.2686017 0.2373347  2.441744e-02
## 26: 1.2070670 1.1992804  2.924169e-01
## 27: 0.7363936 0.5981269  3.469854e-01
## 28: 1.1767115 1.0997291  2.051686e-01
## 29: 0.7075430 0.6170518 -4.587204e-01
## 30: 1.1825246 1.1028885  2.601423e-02
## 31: 0.2716772 0.2213308 -1.083929e-01
## 32: 1.1496038 1.0528255  1.568193e-01
## 33: 0.4413407 0.3606203  2.389616e-01
## 34: 1.5700697 1.4160580  1.701322e-01
##          MASE     RMSSE          ACF1

Which model looks best depends on the metric used to evaluate it: there are cases where one metric suggests the naive model fits a given forecast better while a different metric says otherwise.

If we use RMSE, the ARIMA model comes out as the best model, which is the opposite of my intuition.

In my experience, however, it is not always best practice to rely solely on the metrics; personal experience, domain knowledge, and the external factors at play at that point in time should also be considered when settling on a forecast. This would hopefully minimise the forecast error.
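
To make this metric disagreement concrete, one option is to count how many CV folds the ARIMA model "wins" under RMSE versus MAPE. This is a sketch built on the cross-validated forecasts computed above; the winner columns and counts are illustrative names, not part of the original analysis.

# Per-fold accuracy, then the model with the lower error in each fold under each metric
fold_accuracy <- vehicle_train_cv_forecast %>%
  group_by(.id) %>%
  accuracy(train_data) %>%
  ungroup()

fold_accuracy %>%
  group_by(.id) %>%
  summarise(
    rmse_winner = .model[which.min(RMSE)],
    mape_winner = .model[which.min(MAPE)]
  ) %>%
  summarise(
    folds_won_by_arima_rmse = sum(rmse_winner == "arima"),
    folds_won_by_arima_mape = sum(mape_winner == "arima"),
    total_folds = n()
  )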

# Overall accuracy aggregated across all CV folds and horizons
vehicle_train_cv_forecast %>%
  accuracy(train_data) %>%
  data.table()
##    .model .type        ME      RMSE       MAE       MPE      MAPE      MASE
## 1:  arima  Test -16.43562  91.05244  72.04963 -1.942047  5.877182 0.6443835
## 2:  naive  Test -27.22996 190.11396 152.43048 -4.486380 12.564695 1.3632781
##        RMSSE      ACF1
## 1: 0.6161454 0.5334159
## 2: 1.2864876 0.2775713

## Section 4

The model is decent at predicting the trends in the data; however, there are instances where it failed to predict the dips and highs in sales. Overall, though, the model has captured the seasonality and trend in sales reasonably well. I observe that the model appears to overfit in some time frames and underfit in others, so it is hard to classify it cleanly as one or the other (a residual check is sketched at the end of this section).

# Fit the ARIMA(2,0,2) model on the training data and forecast over the test period
vehicle_model <- train_data %>%
  model(
    ARIMA(vehicle_sales ~ pdq(2, 0, 2))
  )

vehicle_model %>%
  forecast(h = 115) %>%
  autoplot(train_data %>% bind_rows(test_data)) +
  ylab('Vehicle Sales') +
  theme_bw()

# Accuracy of the test-period forecast against the full series
vehicle_model %>%
  forecast(h = 115) %>%
  accuracy(train_data %>% bind_rows(test_data)) %>%
  dplyr::select(.model, RMSE, MAE, MAPE, MASE) %>%
  kable()
| .model                              |     RMSE |      MAE |     MAPE |     MASE |
|:------------------------------------|---------:|---------:|---------:|---------:|
| ARIMA(vehicle_sales ~ pdq(2, 0, 2)) | 168.6234 | 138.7378 | 10.33725 | 1.246869 |
# Fit the ARIMA model to the training data
vehicle_model <- train_data %>%
  model(
    ARIMA(vehicle_sales ~ pdq(2, 0, 2))
  )

# Extract the fitted values from the model
train_fitted <- vehicle_model %>%
  augment()

# Convert the yearmonth index to Date class for plotting
train_fitted$date <- as.Date(train_fitted$date)
train_data$date <- as.Date(train_data$date)

# Plot the fitted values against the actual values
ggplot() +
  geom_line(data = train_data, aes(x = date, y = vehicle_sales), color = "blue", linetype = "solid", linewidth = 0.5, alpha = 0.7) +
  geom_line(data = train_fitted, aes(x = date, y = .fitted), color = "red", linetype = "dashed", linewidth = 0.5) +
  labs(title = "Actual vs Fitted Values for Training Set", y = "Vehicle Sales", x = "Date") +
  theme_minimal()
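
As a follow-up to the over/underfitting question raised above, the residuals of the fitted model can be inspected directly. This is a quick sketch using the vehicle_model mable fitted above and feasts::gg_tsresiduals(); strong autocorrelation remaining in the residuals would point towards underfitting, while residuals resembling white noise suggest the model has captured the main structure.

# Residual diagnostics for the ARIMA(2,0,2) fit: residual series, ACF, and histogram
vehicle_model %>%
  gg_tsresiduals()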