Forecasting_Assignment4

suppressWarnings({
  suppressMessages({
    library(ggplot2)
    library(dplyr)
    library(readxl)
    library(zoo)
    #library(forecast)
    library(lubridate)
    library(slider)
    library(tidyr)
    #library(tsfeatures)
    #library(tseries)
    library(fabletools)
    library(tsibble)
    library(gridExtra)
    library(modeltime)
    library(feasts)
    library(tibble)
    library(fable)
    library(knitr)
    library(tibbletime)
    library(data.table)
    options(warn = -1)
  })
})

Section 1

data <- read.csv("//Users/keerthichereddy/Documents/Sem2/Forecasting Methods/Project/total_vehicle_sales.csv", stringsAsFactors = FALSE)
names(data)
[1] "date"          "vehicle_sales"
#sum(is.na(data$vehicle_sales))

#sum(is.na(data$date))
#sum(duplicated(data$date))
#data$date <- ymd(data$date)

data_ts <- data %>% select(date, vehicle_sales) %>%
  mutate(value = vehicle_sales) %>%
  mutate(date = yearmonth(date)) %>%
  as_tsibble(index = date)


set.seed(123)

# Determine the number of rows for training (80%) and testing (20%)
n_rows <- nrow(data_ts)
train_rows <- round(0.8 * n_rows)

# Split the data into training and testing sets
train_data <- data_ts %>% slice(1:train_rows)
test_data <- data_ts %>% slice((train_rows + 1):n_rows)

# Visualize the training and test sets
ggplot() +
  geom_line(data = train_data, aes(x = date, y = value, color = "Training"), linewidth = 0.3) +
  geom_line(data = test_data, aes(x = date, y = value, color = "Test"), linewidth = 0.3) +
  scale_color_manual(values = c("Training" = "green", "Test" = "orange")) +
  labs(title = "Training and Test Sets", x = "Date", y = "vehicle sales") +
  theme_minimal()

The test set appears representative of the training set: it exhibits the same overall behavior, with similar trends and variability in vehicle sales over time.
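As a quick numerical check on that visual impression, the sketch below (reusing the train_data and test_data objects created above) compares summary statistics of the two splits; broadly similar means and spreads support treating the test set as representative.

bind_rows(
  train_data %>% as_tibble() %>% mutate(set = "Training"),
  test_data %>% as_tibble() %>% mutate(set = "Test")
) %>%
  group_by(set) %>%
  summarise(
    mean_sales = mean(value),  # average monthly sales in each split
    sd_sales = sd(value),      # spread around that average
    min_sales = min(value),
    max_sales = max(value)
  )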

Section 2

data_train_cv <- train_data %>%
  stretch_tsibble(.init = 22*12, .step = 12)

data_train_cv %>%
    ggplot()+
    geom_point(aes(date,factor(.id),color=factor(.id)))+
    ylab('Iteration')+
    ggtitle('Samples included in each CV Iteration')

Section 3

data_train_cv_forecast = data_train_cv %>%
  model(
    arima = ARIMA(value~pdq(2,0,2)),
    naive = NAIVE(value~drift())
  ) %>%
  forecast(h=6)

data_train_cv_forecast %>%
  autoplot(data_train_cv) +
  facet_wrap(~.id,nrow=6)+
  theme_bw()+
  ylab('vehicle sales')

data_train_cv_forecast %>%
  as_tibble() %>%
  select(-value) %>%
  left_join(train_data, by = "date") %>%
  ggplot() +
  geom_line(aes(date, value)) +
  geom_line(aes(date, .mean, color = factor(.id), linetype = .model)) +
  scale_color_discrete(name = 'Iteration') +
  ylab('vehicle sales') +
  theme_bw()

data_train_cv_forecast %>%
  group_by(.id,.model) %>%
  mutate(h = row_number()) %>%
  ungroup() %>%
  as_fable(response = "value", distribution = value) %>%
  accuracy(train_data, by = c("h", ".model")) %>%
  ggplot(aes(x = h, y = RMSE,color=.model)) +
  geom_point()+
  geom_line()+
  ylab('Average RMSE at Forecasting Intervals')+
  xlab('Months in the Future')

data_train_cv_forecast %>%
  group_by(.id,.model) %>%
  mutate(h = row_number()) %>%
  ungroup() %>%
  as_fable(response = "value", distribution = value) %>%
  accuracy(train_data, by = c("h", ".model")) %>%
  mutate(MAPE = MAPE/100) %>% # Rescale
  ggplot(aes(x = h, y = MAPE,color=.model)) +
  geom_point()+
  geom_line()+
  theme_bw()+
  scale_y_continuous(
    name = 'Average MAPE at Forecasting Intervals',labels=scales::percent)

data_train_cv_forecast %>%
    filter(.id<20) %>%
     group_by(.id) %>%
     accuracy(train_data) %>%
     ungroup() %>%
     arrange(.id) %>%
  data.table()
    .model .id .type          ME      RMSE       MAE         MPE      MAPE
 1:  arima   1  Test   24.728520  82.80546  69.21523   1.0658581  4.980571
 2:  naive   1  Test   67.610203 205.70932 188.57395   2.8156321 13.987953
 3:  arima   2  Test   63.126973  83.12091  77.99279   3.9025876  5.225522
 4:  naive   2  Test   79.649515 201.69306 181.63061   3.7089791 12.545517
 5:  arima   3  Test   62.285520  89.63868  63.38809   3.8654289  3.954396
 6:  naive   3  Test  112.310163 191.45922 180.05523   6.1266049 11.593002
 7:  arima   4  Test   29.937975  59.66279  56.36607   1.8430255  3.762277
 8:  naive   4  Test  202.614939 264.29450 228.01193  12.4770793 14.600041
 9:  arima   5  Test  -90.205244  96.48966  90.20524  -6.4831447  6.483145
10:  naive   5  Test   87.859563 176.28938 160.45807   4.9017310 11.281180
11:  arima   6  Test  -79.952076  91.82735  79.95208  -6.0389850  6.038985
12:  naive   6  Test  -80.532840 183.76190 130.50616  -7.4440926 10.600814
13:  arima   7  Test  -32.081158  58.37355  48.30251  -2.4039657  3.380653
14:  naive   7  Test  -37.476475 166.89875 126.80418  -4.0668620  9.575601
15:  arima   8  Test  -23.677728  91.89673  82.86632  -2.3841776  5.946844
16:  naive   8  Test -124.433077 242.41521 174.65601 -11.0272969 13.980086
17:  arima   9  Test  -18.806676  40.12691  26.05586  -1.1662118  1.782554
18:  naive   9  Test  -95.337056 173.62656 113.48091  -7.8605830  9.015170
19:  arima  10  Test  -30.877697  54.10425  46.52647  -2.4246567  3.412970
20:  naive  10  Test  -78.379481 181.03480 144.90097  -7.1448476 11.347083
21:  arima  11  Test -126.514991 150.04489 126.51499  -9.9713177  9.971318
22:  naive  11  Test -161.569386 199.89262 161.56939 -13.8818328 13.881833
23:  arima  12  Test -179.933182 188.84415 179.93318 -21.6957807 21.695781
24:  naive  12  Test -101.257141 139.99878 108.55668 -14.1220148 14.899877
25:  arima  13  Test   12.162660  35.27333  29.61299   1.0808789  2.876456
26:  naive  13  Test  -99.823755 178.23919 133.07643 -13.5262834 16.532692
27:  arima  14  Test    3.025457  89.01285  81.86042   0.2030432  7.519574
28:  naive  14  Test  -90.831331 163.66096 130.80789 -10.3434549 13.492669
29:  arima  15  Test   75.939530  91.46023  78.67768   5.8589420  6.086274
30:  naive  15  Test  -32.888336 163.47156 131.49489  -4.6255282 11.687490
31:  arima  16  Test   30.459573  32.88544  30.45957   2.2929957  2.292996
32:  naive  16  Test  -49.633082 156.42932 128.88986  -5.1680776 10.571414
33:  arima  17  Test    9.683441  53.29158  49.34706   0.1320706  3.814136
34:  naive  17  Test  -77.572629 209.26154 175.55220  -8.3882284 14.728748
    .model .id .type          ME      RMSE       MAE         MPE      MAPE
         MASE     RMSSE          ACF1
 1: 0.6556739 0.6007557  4.297619e-01
 2: 1.7863558 1.4924263  4.415678e-01
 3: 0.7446313 0.6086313 -1.872481e-02
 4: 1.7341068 1.4768451  2.752264e-01
 5: 0.6007834 0.6546318  9.918463e-05
 6: 1.7065381 1.3982279  5.048391e-02
 7: 0.5384668 0.4396483 -3.851460e-01
 8: 2.1782050 1.9475557  4.364741e-02
 9: 0.8574263 0.7073735 -4.629632e-01
10: 1.5251992 1.2923917  2.942982e-01
11: 0.7509831 0.6658542  1.616236e-01
12: 1.2258333 1.3324858  3.906079e-01
13: 0.4631514 0.4299624 -3.658617e-01
14: 1.2158691 1.2293271  2.454842e-01
15: 0.8033601 0.6840014 -2.814112e-01
16: 1.6932292 1.8043334  2.746266e-01
17: 0.2522287 0.2988781  2.123586e-01
18: 1.0985302 1.2932265  2.989677e-01
19: 0.4548923 0.4050376 -7.336295e-01
20: 1.4167060 1.3552705  7.424795e-02
21: 1.2542735 1.1352297  9.879827e-02
22: 1.6018039 1.5123744  5.763002e-03
23: 1.7064797 1.3546563  4.446220e-02
24: 1.0295475 1.0042684  3.622699e-01
25: 0.2686040 0.2373362  2.441225e-02
26: 1.2070670 1.1992804  2.924169e-01
27: 0.7363936 0.5981269  3.469854e-01
28: 1.1767115 1.0997291  2.051686e-01
29: 0.7075430 0.6170519 -4.587204e-01
30: 1.1825246 1.1028885  2.601423e-02
31: 0.2716772 0.2213308 -1.083929e-01
32: 1.1496038 1.0528255  1.568193e-01
33: 0.4413407 0.3606203  2.389616e-01
34: 1.5700697 1.4160580  1.701322e-01
         MASE     RMSSE          ACF1
data_train_cv_forecast %>%
  accuracy(train_data) %>%
  data.table()
   .model .type        ME      RMSE       MAE       MPE      MAPE      MASE
1:  arima  Test -16.43561  91.05246  72.04965 -1.942046  5.877184 0.6443837
2:  naive  Test -27.22996 190.11396 152.43048 -4.486380 12.564695 1.3632781
       RMSSE      ACF1
1: 0.6161455 0.5334158
2: 1.2864876 0.2775713

The ARIMA model consistently tracks the actual values more closely than the naive drift model across the cross-validation iterations, with lower RMSE, MAE, MAPE, and MASE throughout. Its forecasts sit closer to the actual series, indicating that it outperforms the naive benchmark.
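To put a rough number on that gap, the short sketch below reuses the overall cross-validation accuracy computed above and expresses each model's RMSE as a ratio to the ARIMA model's RMSE (column names follow the fabletools accuracy() output).

data_train_cv_forecast %>%
  accuracy(train_data) %>%
  dplyr::select(.model, RMSE, MAPE, MASE) %>%
  mutate(RMSE_ratio = RMSE / RMSE[.model == "arima"])  # ratio of 1 for ARIMA itself

With the values reported above, the naive model's RMSE is roughly twice that of the ARIMA model.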

Section 4

The model is decent at predicting the trends in the data; however, there are instances where it fails to anticipate the dips and peaks in sales. Overall, though, it captures the seasonality and trend in sales reasonably well. Depending on the time frame examined, the model appears to both overfit and underfit at different points, which makes it hard to place it firmly in either category.
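One way to make that judgment more concrete is to compare in-sample (training) accuracy with out-of-sample (test) accuracy for the same specification: a large gap would point toward overfitting, while similarly high errors on both sets would point toward underfitting. The sketch below assumes the train_data and test_data splits from Section 1; the fit_check name is just for illustration.

fit_check <- train_data %>%
  model(arima = ARIMA(vehicle_sales ~ pdq(2, 0, 2)))

bind_rows(
  accuracy(fit_check),                          # training-set (in-sample) errors
  fit_check %>%
    forecast(h = nrow(test_data)) %>%
    accuracy(bind_rows(train_data, test_data))  # test-set (out-of-sample) errors
) %>%
  dplyr::select(.model, .type, RMSE, MAE, MAPE, MASE)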

vehicle_model <- train_data %>%
  model(
    ARIMA(vehicle_sales ~ pdq(2, 0, 2))
  )

vehicle_model %>%
  forecast(h = 115) %>%
  autoplot(train_data %>% bind_rows(test_data)) +
  ylab('Vehicle Sales') +
  theme_bw()

vehicle_model %>%
    forecast(h=115) %>%
    accuracy(train_data %>% bind_rows(test_data)) %>%
   dplyr::select(.model, RMSE, MAE, MAPE, MASE) %>%
  kable()
|.model                              |     RMSE|      MAE|     MAPE|     MASE|
|:-----------------------------------|--------:|--------:|--------:|--------:|
|ARIMA(vehicle_sales ~ pdq(2, 0, 2)) | 168.6234| 138.7378| 10.33725| 1.246869|
# Fit the ARIMA model to the training data
vehicle_model <- train_data %>%
  model(
    ARIMA(vehicle_sales ~ pdq(2, 0, 2))
  )

# Extract the fitted values from the model
train_fitted <- vehicle_model %>%
  augment()

# Convert date variable to Date class if it's not already
train_fitted$date <- as.Date(train_fitted$date)
train_data$date <- as.Date(train_data$date)

# Plot the fitted values against the actual values
ggplot() +
  geom_line(data = train_data, aes(x = date, y = vehicle_sales), color = "blue", linetype = "solid", linewidth = 0.5, alpha = 0.7) +
  geom_line(data = train_fitted, aes(x = date, y = .fitted), color = "red", linetype = "dashed", linewidth = 0.5) +
  labs(title = "Actual vs Fitted Values for Training Set", y = "Vehicle Sales", x = "Date") +
  theme_minimal()

The ARIMA model performs well on the test set, capturing the seasonal and trend variation in sales. It adapts to the data without adhering too closely or too loosely to the training set, which suggests it has learned the underlying patterns.

It does not consistently overfit or underfit (it shows both behaviors at different points in the series), which suggests a balanced fit overall rather than a model that sits neatly in either category. For that reason, domain knowledge and external factors remain important complements to the accuracy metrics when judging the model.
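A final check on that conclusion is a residual diagnostic: if the innovation residuals of the fitted model behave like white noise, the model is capturing most of the systematic signal without chasing noise. The sketch below assumes the vehicle_model mable fitted above; the Ljung-Box lag of 24 is an illustrative choice for monthly data.

vehicle_model %>%
  gg_tsresiduals()  # residual time plot, ACF, and histogram (feasts)

vehicle_model %>%
  augment() %>%
  features(.innov, ljung_box, lag = 24)  # a large p-value is consistent with white-noise residuals

Substantial residual autocorrelation here would be a reason to revisit the fixed pdq(2, 0, 2) order rather than to conclude the model is balanced.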