## Packages
suppressWarnings({
  suppressMessages({
    library(dplyr)
    library(ggplot2)
    library(lubridate)
    library(gridExtra)
    library(forecast)
    library(modeltime)
    library(fabletools)
    library(feasts)
    library(tsibble)
    library(tibble)
    library(fable)
    library(data.table)
    library(knitr)
    library(tibbletime)
    # install.packages('tibbletime')
    options(warn = -1)
  })
})
## Section 1 - Train and Test Splits
I split the vehicle sales prior to Nov 2013 (Jan 1976 - Nov 2013) into the training set and the sales after that (Dec 2013 - Nov 2023) into the test set. Based on the trends in the line plot above, the sales after the red line (which marks the split point described above) follow similar patterns, apart from the pandemic-driven drop in sales, which was outside of anyone's control. Hence, I believe the training set is representative of the test set.
# Read the monthly vehicle sales data and convert it to a tsibble indexed by month
vehicle_df <- read.csv('total_vehicle_sales.csv', header = TRUE)
vehicle_sales_tbl_ts <- vehicle_df %>%
  select(date, vehicle_sales) %>%
  mutate(value = vehicle_sales) %>%
  mutate(date = yearmonth(date)) %>%
  as_tsibble(index = date)
set.seed(123)
# Determine the number of rows for training (80%) and testing (20%)
n_rows <- nrow(vehicle_sales_tbl_ts)
train_rows <- round(0.8 * n_rows)
# Split the data into training and testing sets
train_data <- vehicle_sales_tbl_ts %>% slice(1:train_rows)
test_data <- vehicle_sales_tbl_ts %>% slice((train_rows + 1):n_rows)
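Since the split is described by date (Nov 2013) but implemented by row count, an equivalent date-based split can be expressed with tsibble's filter_index(). The following is a minimal sketch under that assumption; the _by_date object names are just illustrative, and the resulting sets should roughly match the 80/20 row split above.
# Sketch: equivalent date-based split at the Nov 2013 cut-off described above
train_data_by_date <- vehicle_sales_tbl_ts %>% filter_index(. ~ "2013 Nov")
test_data_by_date  <- vehicle_sales_tbl_ts %>% filter_index("2013 Dec" ~ .)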
# Visualize the training and test sets
ggplot() +
geom_line(data = train_data, aes(x = date, y = value, color = "Training"), linewidth = 0.3) +
geom_line(data = test_data, aes(x = date, y = value, color = "Test"), linewidth = 0.3) +
scale_color_manual(values = c("Training" = "blue", "Test" = "red")) +
labs(title = "Training and Test Sets", x = "Date", y = "vehicle sales") +
theme_minimal()
## Section 2 - Cross-Validation Scheme

The cross-validation scheme uses an expanding window over the training data: the first fold contains the first 22 years of monthly observations (.init = 22*12 = 264 months), and each subsequent fold extends the window by a further 12 months (.step = 12).
vehicle_train_cv <- train_data %>%
stretch_tsibble(.init = 22*12, .step = 12)
vehicle_train_cv %>%
ggplot()+
geom_point(aes(date,factor(.id),color=factor(.id)))+
ylab('Iteration')+
ggtitle('Samples included in each CV Iteration')
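As a quick sanity check on the scheme, the sketch below counts the observations in each fold; fold sizes should start at 264 and grow by 12. The n_obs column name is just an illustrative choice.
# Sketch: number of observations per CV fold (expanding window check)
vehicle_train_cv %>%
  as_tibble() %>%
  count(.id, name = "n_obs")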
## Section 3 - Model Selection and Comparison
The training set contains roughly 38 years of monthly vehicle sales data (about 460 observations). While the naive (drift) model occasionally outperforms the ARIMA model on an individual fold, the ARIMA model performs better on most folds and, overall, demonstrates superior performance across the different forecast horizons.
vehicle_train_cv_forecast <- vehicle_train_cv %>%
  model(
    arima = ARIMA(value ~ pdq(2, 0, 2)),
    naive = NAIVE(value ~ drift())
  ) %>%
  forecast(h = 6)
vehicle_train_cv_forecast %>%
autoplot(vehicle_train_cv) +
facet_wrap(~.id,nrow=6)+
theme_bw()+
ylab('vehicle sales')
vehicle_train_cv_forecast %>%
  as_tibble() %>%
  select(-value) %>%
  # join the actual sales back on so forecasts and actuals appear in one plot
  left_join(train_data, by = c("date" = "date")) %>%
  ggplot() +
  geom_line(aes(date, value)) +
  geom_line(aes(date, .mean, color = factor(.id), linetype = .model)) +
  scale_color_discrete(name = 'Iteration') +
  ylab('vehicle sales') +
  theme_bw()
vehicle_train_cv_forecast %>%
group_by(.id,.model) %>%
mutate(h = row_number()) %>%
ungroup() %>%
as_fable(response = "value", distribution = value) %>%
accuracy(train_data, by = c("h", ".model")) %>%
ggplot(aes(x = h, y = RMSE,color=.model)) +
geom_point()+
geom_line()+
ylab('Average RMSE at Forecasting Intervals')+
xlab('Months in the Future')
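For a numeric view of the same comparison, the sketch below tabulates average RMSE by forecast horizon with one column per model. It assumes the tidyr package is installed (it is not loaded above), hence the tidyr:: prefix.
# Sketch: average RMSE by forecast horizon, one column per model (assumes tidyr is installed)
vehicle_train_cv_forecast %>%
  group_by(.id, .model) %>%
  mutate(h = row_number()) %>%
  ungroup() %>%
  as_fable(response = "value", distribution = value) %>%
  accuracy(train_data, by = c("h", ".model")) %>%
  select(h, .model, RMSE) %>%
  tidyr::pivot_wider(names_from = .model, values_from = RMSE)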
vehicle_train_cv_forecast %>%
group_by(.id,.model) %>%
mutate(h = row_number()) %>%
ungroup() %>%
as_fable(response = "value", distribution = value) %>%
accuracy(train_data, by = c("h", ".model")) %>%
mutate(MAPE = MAPE/100) %>% # Rescale
ggplot(aes(x = h, y = MAPE,color=.model)) +
geom_point()+
geom_line()+
theme_bw()+
scale_y_continuous(
name = 'Average MAPE at Forecasting Intervals',labels=scales::percent)
vehicle_train_cv_forecast %>%
filter(.id<20) %>%
group_by(.id) %>%
accuracy(train_data) %>%
ungroup() %>%
arrange(.id) %>%
data.table()
## .model .id .type ME RMSE MAE MPE MAPE
## 1: arima 1 Test 24.728520 82.80546 69.21523 1.0658581 4.980571
## 2: naive 1 Test 67.610203 205.70932 188.57395 2.8156321 13.987953
## 3: arima 2 Test 63.126974 83.12091 77.99279 3.9025877 5.225522
## 4: naive 2 Test 79.649515 201.69306 181.63061 3.7089791 12.545517
## 5: arima 3 Test 62.285522 89.63868 63.38809 3.8654290 3.954396
## 6: naive 3 Test 112.310163 191.45922 180.05523 6.1266049 11.593002
## 7: arima 4 Test 29.937974 59.66279 56.36607 1.8430254 3.762277
## 8: naive 4 Test 202.614939 264.29450 228.01193 12.4770793 14.600041
## 9: arima 5 Test -90.205244 96.48966 90.20524 -6.4831447 6.483145
## 10: naive 5 Test 87.859563 176.28938 160.45807 4.9017310 11.281180
## 11: arima 6 Test -79.952076 91.82735 79.95208 -6.0389850 6.038985
## 12: naive 6 Test -80.532840 183.76190 130.50616 -7.4440926 10.600814
## 13: arima 7 Test -32.081158 58.37355 48.30251 -2.4039657 3.380653
## 14: naive 7 Test -37.476475 166.89875 126.80418 -4.0668620 9.575601
## 15: arima 8 Test -23.677738 91.89673 82.86632 -2.3841784 5.946845
## 16: naive 8 Test -124.433077 242.41521 174.65601 -11.0272969 13.980086
## 17: arima 9 Test -18.806681 40.12690 26.05585 -1.1662123 1.782553
## 18: naive 9 Test -95.337056 173.62656 113.48091 -7.8605830 9.015170
## 19: arima 10 Test -30.877697 54.10425 46.52647 -2.4246567 3.412970
## 20: naive 10 Test -78.379481 181.03480 144.90097 -7.1448476 11.347083
## 21: arima 11 Test -126.515038 150.04490 126.51504 -9.9713219 9.971322
## 22: naive 11 Test -161.569386 199.89262 161.56939 -13.8818328 13.881833
## 23: arima 12 Test -179.933088 188.84406 179.93309 -21.6957694 21.695769
## 24: naive 12 Test -101.257141 139.99878 108.55668 -14.1220148 14.899877
## 25: arima 13 Test 12.162463 35.27311 29.61274 1.0808596 2.876431
## 26: naive 13 Test -99.823755 178.23919 133.07643 -13.5262834 16.532692
## 27: arima 14 Test 3.025458 89.01285 81.86042 0.2030432 7.519574
## 28: naive 14 Test -90.831331 163.66096 130.80789 -10.3434549 13.492669
## 29: arima 15 Test 75.939529 91.46023 78.67768 5.8589420 6.086274
## 30: naive 15 Test -32.888336 163.47156 131.49489 -4.6255282 11.687490
## 31: arima 16 Test 30.459573 32.88544 30.45957 2.2929957 2.292996
## 32: naive 16 Test -49.633082 156.42932 128.88986 -5.1680776 10.571414
## 33: arima 17 Test 9.683441 53.29158 49.34706 0.1320706 3.814136
## 34: naive 17 Test -77.572629 209.26154 175.55220 -8.3882284 14.728748
## .model .id .type ME RMSE MAE MPE MAPE
## MASE RMSSE ACF1
## 1: 0.6556739 0.6007557 4.297619e-01
## 2: 1.7863558 1.4924263 4.415678e-01
## 3: 0.7446313 0.6086313 -1.872480e-02
## 4: 1.7341068 1.4768451 2.752264e-01
## 5: 0.6007834 0.6546318 9.918132e-05
## 6: 1.7065381 1.3982279 5.048391e-02
## 7: 0.5384668 0.4396482 -3.851460e-01
## 8: 2.1782050 1.9475557 4.364741e-02
## 9: 0.8574263 0.7073735 -4.629632e-01
## 10: 1.5251992 1.2923917 2.942982e-01
## 11: 0.7509831 0.6658542 1.616236e-01
## 12: 1.2258333 1.3324858 3.906079e-01
## 13: 0.4631514 0.4299624 -3.658617e-01
## 14: 1.2158691 1.2293271 2.454842e-01
## 15: 0.8033601 0.6840014 -2.814111e-01
## 16: 1.6932292 1.8043334 2.746266e-01
## 17: 0.2522286 0.2988781 2.123586e-01
## 18: 1.0985302 1.2932265 2.989677e-01
## 19: 0.4548923 0.4050376 -7.336295e-01
## 20: 1.4167060 1.3552705 7.424795e-02
## 21: 1.2542740 1.1352299 9.879810e-02
## 22: 1.6018039 1.5123744 5.763002e-03
## 23: 1.7064788 1.3546556 4.446194e-02
## 24: 1.0295475 1.0042684 3.622699e-01
## 25: 0.2686017 0.2373347 2.441744e-02
## 26: 1.2070670 1.1992804 2.924169e-01
## 27: 0.7363936 0.5981269 3.469854e-01
## 28: 1.1767115 1.0997291 2.051686e-01
## 29: 0.7075430 0.6170518 -4.587204e-01
## 30: 1.1825246 1.1028885 2.601423e-02
## 31: 0.2716772 0.2213308 -1.083929e-01
## 32: 1.1496038 1.0528255 1.568193e-01
## 33: 0.4413407 0.3606203 2.389616e-01
## 34: 1.5700697 1.4160580 1.701322e-01
## MASE RMSSE ACF1
Which model appears to perform best depends on the metric used for evaluation: there are cases where one metric says the naive model fits a given forecast better while a different metric says otherwise.
If we use RMSE to judge the results, the ARIMA model comes out as the best model, which is the opposite of my intuition.
In my experience, however, it is not always best practice to rely solely on the metrics; personal experience, domain knowledge, and the external factors at play at the time should also inform the forecast. This should help minimise the forecast error.
vehicle_train_cv_forecast %>%
accuracy(train_data) %>%
data.table()
## .model .type ME RMSE MAE MPE MAPE MASE
## 1: arima Test -16.43562 91.05244 72.04963 -1.942047 5.877182 0.6443835
## 2: naive Test -27.22996 190.11396 152.43048 -4.486380 12.564695 1.3632781
## RMSSE ACF1
## 1: 0.6161454 0.5334159
## 2: 1.2864876 0.2775713
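To make the metric disagreement concrete, the sketch below tallies, for each CV fold in the per-fold accuracy table above, which model has the lower RMSE and which has the lower MAPE. The rmse_winner and mape_winner column names are just illustrative.
# Sketch: per-fold "winner" under RMSE vs MAPE, to show the metrics can disagree
vehicle_train_cv_forecast %>%
  group_by(.id) %>%
  accuracy(train_data) %>%
  ungroup() %>%
  group_by(.id) %>%
  summarise(
    rmse_winner = .model[which.min(RMSE)],
    mape_winner = .model[which.min(MAPE)]
  ) %>%
  count(rmse_winner, mape_winner)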
## Section 4
The model is decent at predicting the trends in the data; however, there are instances where it fails to predict the dips and highs in sales. Overall, though, it captures the seasonality and trend in sales reasonably well. I observe that the model both overfits and underfits depending on the time frame we look at, so it is hard to place it firmly in either category (comparing training-set and test-set accuracy, as sketched after the accuracy table below, is one way to gauge this).
vehicle_model <- train_data %>%
  model(
    ARIMA(vehicle_sales ~ pdq(2, 0, 2))
  )
vehicle_model %>%
  forecast(h = 115) %>%
  autoplot(train_data %>% bind_rows(test_data)) +
  ylab('Vehicle Sales') +
  theme_bw()
vehicle_model %>%
forecast(h=115) %>%
accuracy(train_data %>% bind_rows(test_data)) %>%
dplyr::select(.model, RMSE, MAE, MAPE, MASE) %>%
kable()
| .model | RMSE | MAE | MAPE | MASE |
|---|---|---|---|---|
| ARIMA(vehicle_sales ~ pdq(2, 0, 2)) | 168.6234 | 138.7378 | 10.33725 | 1.246869 |
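As referenced above, a rough over/underfitting check is to compare the in-sample (training) accuracy with the out-of-sample (test) accuracy reported in the table above. This is a minimal sketch; the set column is just an illustrative label.
# Sketch: in-sample vs out-of-sample accuracy for the ARIMA(2,0,2) model
bind_rows(
  vehicle_model %>% accuracy() %>% mutate(set = "Training"),
  vehicle_model %>%
    forecast(h = 115) %>%
    accuracy(train_data %>% bind_rows(test_data)) %>%
    mutate(set = "Test")
) %>%
  dplyr::select(set, RMSE, MAE, MAPE, MASE) %>%
  kable()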
# Fit the ARIMA model to the training data
vehicle_model <- train_data %>%
model(
ARIMA(vehicle_sales ~ pdq(2, 0, 2))
)
# Extract the fitted values from the model
train_fitted <- vehicle_model %>%
augment()
# Convert date variable to Date class if it's not already
train_fitted$date <- as.Date(train_fitted$date)
train_data$date <- as.Date(train_data$date)
# Plot the fitted values against the actual values
ggplot() +
  geom_line(data = train_data, aes(x = date, y = vehicle_sales), color = "blue", linetype = "solid", linewidth = 0.5, alpha = 0.7) +
  geom_line(data = train_fitted, aes(x = date, y = .fitted), color = "red", linetype = "dashed", linewidth = 0.5) +
labs(title = "Actual vs Fitted Values for Training Set", y = "Vehicle Sales", x = "Date") +
theme_minimal()
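To further probe whether the ARIMA(2,0,2) specification has captured the seasonality and trend discussed above, one option (not run here) is to inspect the model residuals with feasts' gg_tsresiduals(); ideally the residuals should look like white noise.
# Sketch: residual diagnostics for the fitted ARIMA model (residual series, ACF, histogram)
vehicle_model %>%
  gg_tsresiduals()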