suppressWarnings({
suppressMessages({
library(ggplot2)
library(dplyr)
library(readxl)
library(zoo)
#library(forecast)
library(lubridate)
library(slider)
library(tidyr)
#library(tsfeatures)
#library(tseries)
library(fabletools)
library(tsibble)
library(gridExtra)
library(modeltime)
library(feasts)
library(tibble)
library(fable)
library(knitr)
library(tibbletime)
library(data.table)
options(warn = -1)
})
})
Forecasting_Assignment4
Section 1
data <- read.csv("//Users/keerthichereddy/Documents/Sem2/Forecasting Methods/Project/total_vehicle_sales.csv", stringsAsFactors = FALSE)
names(data)
[1] "date" "vehicle_sales"
#sum(is.na(data$vehicle_sales))
#sum(is.na(data$date))
#sum(duplicated(data$date))
#data$date <- ymd(data$date)
data_ts <- data %>% select(date, vehicle_sales) %>%
mutate(value = vehicle_sales) %>%
mutate(date = yearmonth(date)) %>%
as_tsibble(index = date)
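As a quick structural check (a sketch assuming the data_ts tsibble built above), we can confirm that the monthly index is regular with no implicit gaps before fitting any time-series models:
# Verify the tsibble has a regular one-month interval and no missing months
interval(data_ts)
has_gaps(data_ts)
scan_gaps(data_ts)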
set.seed(123)
# Determine the number of rows for training (80%) and testing (20%)
n_rows <- nrow(data_ts)
train_rows <- round(0.8 * n_rows)
# Split the data into training and testing sets
train_data <- data_ts %>% slice(1:train_rows)
test_data <- data_ts %>% slice((train_rows + 1):n_rows)
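The 80/20 split above is positional and therefore chronological, so the set.seed() call has no effect on it; as a quick check (a sketch assuming the train_data and test_data objects above), the boundary months of each split can be printed:
# First and last month of the training and test periods
c(first(train_data$date), last(train_data$date))
c(first(test_data$date), last(test_data$date))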
# Visualize the training and test sets
ggplot() +
geom_line(data = train_data, aes(x = date, y = value, color = "Training"), linewidth = 0.3) +
geom_line(data = test_data, aes(x = date, y = value, color = "Test"), linewidth = 0.3) +
scale_color_manual(values = c("Training" = "green", "Test" = "orange")) +
labs(title = "Training and Test Sets", x = "Date", y = "vehicle sales") +
theme_minimal()
The test set appears to be representative of the training set: it shows the same overall behavior and fluctuations, including the general trend and the variability in vehicle sales over time.
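This visual impression can be backed by a quick numerical comparison of the two splits (a sketch assuming the train_data and test_data objects above):
# Compare basic summary statistics of the training and test sets
bind_rows(
  train_data %>% as_tibble() %>% mutate(set = "Training"),
  test_data %>% as_tibble() %>% mutate(set = "Test")
) %>%
  group_by(set) %>%
  summarise(mean = mean(value), sd = sd(value), min = min(value), max = max(value))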
Section 2
data_train_cv <- train_data %>%
stretch_tsibble(.init = 22*12, .step = 12)
data_train_cv %>%
ggplot()+
geom_point(aes(date,factor(.id),color=factor(.id)))+
ylab('Iteration')+
ggtitle('Samples included in each CV Iteration')
Section 3
data_train_cv_forecast = data_train_cv %>%
model(
arima = ARIMA(value~pdq(2,0,2)),
naive = NAIVE(value~drift())
) %>%
forecast(h=6)
data_train_cv_forecast %>%
autoplot(data_train_cv) +
facet_wrap(~.id,nrow=6)+
theme_bw()+
ylab('vehicle sales')
data_train_cv_forecast %>%
as_tibble() %>%
select(-value) %>%
left_join(train_data, by = "date") %>%
ggplot() +
geom_line(aes(date, value)) +
geom_line(aes(date, .mean, color = factor(.id), linetype = .model)) +
scale_color_discrete(name = 'Iteration') +
ylab('vehicle sales') +
theme_bw()
data_train_cv_forecast %>%
group_by(.id,.model) %>%
mutate(h = row_number()) %>%
ungroup() %>%
as_fable(response = "value", distribution = value) %>%
accuracy(train_data, by = c("h", ".model")) %>%
ggplot(aes(x = h, y = RMSE,color=.model)) +
geom_point()+
geom_line()+
ylab('Average RMSE at Forecasting Intervals')+
xlab('Months in the Future')
data_train_cv_forecast %>%
group_by(.id,.model) %>%
mutate(h = row_number()) %>%
ungroup() %>%
as_fable(response = "value", distribution = value) %>%
accuracy(train_data, by = c("h", ".model")) %>%
mutate(MAPE = MAPE/100) %>% # Rescale
ggplot(aes(x = h, y = MAPE,color=.model)) +
geom_point()+
geom_line()+
theme_bw()+
scale_y_continuous(
name = 'Average MAPE at Forecasting Intervals', labels = scales::percent)
data_train_cv_forecast %>%
filter(.id<20) %>%
group_by(.id) %>%
accuracy(train_data) %>%
ungroup() %>%
arrange(.id) %>%
data.table()
.model .id .type ME RMSE MAE MPE MAPE
1: arima 1 Test 24.728520 82.80546 69.21523 1.0658581 4.980571
2: naive 1 Test 67.610203 205.70932 188.57395 2.8156321 13.987953
3: arima 2 Test 63.126973 83.12091 77.99279 3.9025876 5.225522
4: naive 2 Test 79.649515 201.69306 181.63061 3.7089791 12.545517
5: arima 3 Test 62.285520 89.63868 63.38809 3.8654289 3.954396
6: naive 3 Test 112.310163 191.45922 180.05523 6.1266049 11.593002
7: arima 4 Test 29.937975 59.66279 56.36607 1.8430255 3.762277
8: naive 4 Test 202.614939 264.29450 228.01193 12.4770793 14.600041
9: arima 5 Test -90.205244 96.48966 90.20524 -6.4831447 6.483145
10: naive 5 Test 87.859563 176.28938 160.45807 4.9017310 11.281180
11: arima 6 Test -79.952076 91.82735 79.95208 -6.0389850 6.038985
12: naive 6 Test -80.532840 183.76190 130.50616 -7.4440926 10.600814
13: arima 7 Test -32.081158 58.37355 48.30251 -2.4039657 3.380653
14: naive 7 Test -37.476475 166.89875 126.80418 -4.0668620 9.575601
15: arima 8 Test -23.677728 91.89673 82.86632 -2.3841776 5.946844
16: naive 8 Test -124.433077 242.41521 174.65601 -11.0272969 13.980086
17: arima 9 Test -18.806676 40.12691 26.05586 -1.1662118 1.782554
18: naive 9 Test -95.337056 173.62656 113.48091 -7.8605830 9.015170
19: arima 10 Test -30.877697 54.10425 46.52647 -2.4246567 3.412970
20: naive 10 Test -78.379481 181.03480 144.90097 -7.1448476 11.347083
21: arima 11 Test -126.514991 150.04489 126.51499 -9.9713177 9.971318
22: naive 11 Test -161.569386 199.89262 161.56939 -13.8818328 13.881833
23: arima 12 Test -179.933182 188.84415 179.93318 -21.6957807 21.695781
24: naive 12 Test -101.257141 139.99878 108.55668 -14.1220148 14.899877
25: arima 13 Test 12.162660 35.27333 29.61299 1.0808789 2.876456
26: naive 13 Test -99.823755 178.23919 133.07643 -13.5262834 16.532692
27: arima 14 Test 3.025457 89.01285 81.86042 0.2030432 7.519574
28: naive 14 Test -90.831331 163.66096 130.80789 -10.3434549 13.492669
29: arima 15 Test 75.939530 91.46023 78.67768 5.8589420 6.086274
30: naive 15 Test -32.888336 163.47156 131.49489 -4.6255282 11.687490
31: arima 16 Test 30.459573 32.88544 30.45957 2.2929957 2.292996
32: naive 16 Test -49.633082 156.42932 128.88986 -5.1680776 10.571414
33: arima 17 Test 9.683441 53.29158 49.34706 0.1320706 3.814136
34: naive 17 Test -77.572629 209.26154 175.55220 -8.3882284 14.728748
.model .id .type ME RMSE MAE MPE MAPE
MASE RMSSE ACF1
1: 0.6556739 0.6007557 4.297619e-01
2: 1.7863558 1.4924263 4.415678e-01
3: 0.7446313 0.6086313 -1.872481e-02
4: 1.7341068 1.4768451 2.752264e-01
5: 0.6007834 0.6546318 9.918463e-05
6: 1.7065381 1.3982279 5.048391e-02
7: 0.5384668 0.4396483 -3.851460e-01
8: 2.1782050 1.9475557 4.364741e-02
9: 0.8574263 0.7073735 -4.629632e-01
10: 1.5251992 1.2923917 2.942982e-01
11: 0.7509831 0.6658542 1.616236e-01
12: 1.2258333 1.3324858 3.906079e-01
13: 0.4631514 0.4299624 -3.658617e-01
14: 1.2158691 1.2293271 2.454842e-01
15: 0.8033601 0.6840014 -2.814112e-01
16: 1.6932292 1.8043334 2.746266e-01
17: 0.2522287 0.2988781 2.123586e-01
18: 1.0985302 1.2932265 2.989677e-01
19: 0.4548923 0.4050376 -7.336295e-01
20: 1.4167060 1.3552705 7.424795e-02
21: 1.2542735 1.1352297 9.879827e-02
22: 1.6018039 1.5123744 5.763002e-03
23: 1.7064797 1.3546563 4.446220e-02
24: 1.0295475 1.0042684 3.622699e-01
25: 0.2686040 0.2373362 2.441225e-02
26: 1.2070670 1.1992804 2.924169e-01
27: 0.7363936 0.5981269 3.469854e-01
28: 1.1767115 1.0997291 2.051686e-01
29: 0.7075430 0.6170519 -4.587204e-01
30: 1.1825246 1.1028885 2.601423e-02
31: 0.2716772 0.2213308 -1.083929e-01
32: 1.1496038 1.0528255 1.568193e-01
33: 0.4413407 0.3606203 2.389616e-01
34: 1.5700697 1.4160580 1.701322e-01
MASE RMSSE ACF1
data_train_cv_forecast %>%
accuracy(train_data) %>%
data.table()
.model .type ME RMSE MAE MPE MAPE MASE
1: arima Test -16.43561 91.05246 72.04965 -1.942046 5.877184 0.6443837
2: naive Test -27.22996 190.11396 152.43048 -4.486380 12.564695 1.3632781
RMSSE ACF1
1: 0.6161455 0.5334158
2: 1.2864876 0.2775713
The ARIMA model tracks the actual values more closely than the naive model across the cross-validation iterations, with lower RMSE, MAE, MAPE, and MASE in almost every iteration and in the aggregated accuracy above. Its forecasts sit closer to the actual series, indicating that it outperforms the naive benchmark.
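To put numbers behind this, the per-iteration accuracy can be summarised by model, and the iterations in which ARIMA beats the naive benchmark on RMSE can be counted (a sketch following the accuracy pattern used above and assuming the same objects):
# Average each accuracy metric over CV iterations, by model
cv_acc <- data_train_cv_forecast %>%
  group_by(.id) %>%
  accuracy(train_data) %>%
  ungroup()
cv_acc %>%
  group_by(.model) %>%
  summarise(across(c(RMSE, MAE, MAPE, MASE), mean))
# Count the iterations in which the ARIMA model has the lower RMSE
cv_acc %>%
  select(.id, .model, RMSE) %>%
  pivot_wider(names_from = .model, values_from = RMSE) %>%
  summarise(arima_wins = sum(arima < naive), iterations = n())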
Section 4
The model is decent at predicting the trends in the data; however, there are instances where it fails to predict the dips and highs in sales. Overall, though, it captures the seasonal patterns and trends in sales reasonably well. Depending on the time frame examined, the model appears to both overfit and underfit at different points, so it is hard to classify it firmly into either category.
vehicle_model = train_data %>%
model(
ARIMA(vehicle_sales~pdq(2,0,2))
)
vehicle_model %>%
forecast(h=115) %>%
autoplot(train_data %>%
bind_rows(test_data))+
ylab('vehicle sales Value')+
theme_bw()
vehicle_model %>%
forecast(h=115) %>%
accuracy(train_data %>% bind_rows(test_data)) %>%
dplyr::select(.model, RMSE, MAE, MAPE, MASE) %>%
kable()
| .model | RMSE | MAE | MAPE | MASE |
|---|---|---|---|---|
| ARIMA(vehicle_sales ~ pdq(2, 0, 2)) | 168.6234 | 138.7378 | 10.33725 | 1.246869 |
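Beyond holdout accuracy, residual diagnostics give another view of how well the ARIMA(2,0,2) specification fits the training data; a sketch assuming the vehicle_model object fitted above:
# Time plot, ACF, and histogram of the innovation residuals
vehicle_model %>% gg_tsresiduals()
# Ljung-Box test for remaining autocorrelation in the residuals
vehicle_model %>%
  augment() %>%
  features(.innov, ljung_box, lag = 24)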
# Fit the ARIMA model to the training data
vehicle_model <- train_data %>%
model(
ARIMA(vehicle_sales ~ pdq(2, 0, 2))
)
# Extract the fitted values from the model
train_fitted <- vehicle_model %>%
augment()
# Convert the yearmonth index to Date for plotting, working on copies so the
# original tsibbles keep their monthly index
train_fitted_plot <- train_fitted %>% as_tibble() %>% mutate(date = as.Date(date))
train_data_plot <- train_data %>% as_tibble() %>% mutate(date = as.Date(date))
# Plot the fitted values against the actual values
ggplot() +
geom_line(data = train_data_plot, aes(x = date, y = vehicle_sales), color = "blue", linetype = "solid", linewidth = 0.5, alpha = 0.7) +
geom_line(data = train_fitted_plot, aes(x = date, y = .fitted), color = "red", linetype = "dashed", linewidth = 0.5) +
labs(title = "Actual vs Fitted Values for Training Set", y = "Vehicle Sales", x = "Date") +
theme_minimal()
The ARIMA model performs well on the test set, capturing the seasonal and trend variations in sales. It adapts well to the sales data, reflecting a good understanding of the underlying patterns without adhering too closely or too loosely to the training set.
It does not consistently overfit or underfit, showing both behaviors at different times, which suggests a reasonably balanced fit overall rather than a model that falls neatly into either category. This makes it important to bring in domain knowledge and external factors, rather than relying on model metrics alone, when judging the forecasts.
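One simple way to put a number on this balance is to compare in-sample and out-of-sample error for the same model: a much larger test error would point to overfitting, while large errors on both would point to underfitting (a sketch assuming vehicle_model, test_data, and the full data_ts series from above):
# Compare training-set (fitted) accuracy with test-set (forecast) accuracy
bind_rows(
  vehicle_model %>% accuracy(),
  vehicle_model %>% forecast(h = nrow(test_data)) %>% accuracy(data_ts)
) %>%
  select(.model, .type, RMSE, MAE, MAPE, MASE)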