I picked the Gasoline Stations data set, and here’s the link of the data: https://www.census.gov/retail/marts/www/adv44700.txt
# Load necessary libraries
library(forecast)
## Warning: package 'forecast' was built under R version 4.3.3
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# read the data by using link code
url <- "https://www.census.gov/retail/marts/www/adv44700.txt"
download.file(url, destfile = "adv44700.txt")
data <- read.table("adv44700.txt", skip = 1, fill = TRUE, header = FALSE)
# Clean the data
data <- data[complete.cases(data), ]
colnames(data) <- c('Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
# Reshape the data to have a single column of monthly data
data <- data %>%
gather(key = "Month", value = "Sales", -Year) %>%
mutate(Date = as.Date(paste(Year, Month, "01", sep = "-"), format = "%Y-%b-%d")) %>%
arrange(Date) %>%
mutate(Sales = as.numeric(gsub(",", "", Sales))) # Ensure Sales is numeric
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Sales = as.numeric(gsub(",", "", Sales))`.
## Caused by warning:
## ! NAs introduced by coercion
# Filter the last 5 years
data <- data %>%
filter(Date >= as.Date("2015-01-01")) %>%
select(Date, Sales)
# Split the data into training 4 years and testing 1 year
train_data <- data %>%
filter(Date < as.Date("2019-01-01")) %>%
pull(Sales) # Ensure it's a vector
test_data <- data %>%
filter(Date >= as.Date("2019-01-01")) %>%
pull(Sales) # Ensure it's a vector
# Convert to time series objects
train_ts <- ts(train_data, start = c(2015, 1), frequency = 12)
test_ts <- ts(test_data, start = c(2019, 1), frequency = 12)
# ETS model
ets_model <- ets(train_ts)
ets_forecast <- forecast(ets_model, h = 12)
# ARIMA model
arima_model <- auto.arima(train_ts, seasonal = TRUE)
arima_forecast <- forecast(arima_model, h = 12)
# dynamic regression model (using auto.arima with external regressors if available)
# Assuming no external regressors in this example
dynamic_model <- auto.arima(train_ts)
dynamic_forecast <- forecast(dynamic_model, h = 12)
# Combine forecasts
combined_forecast <- (ets_forecast$mean + arima_forecast$mean + dynamic_forecast$mean) / 3
# Evaluate model performance
ets_accuracy <- accuracy(ets_forecast, test_ts)
arima_accuracy <- accuracy(arima_forecast, test_ts)
dynamic_accuracy <- accuracy(dynamic_forecast, test_ts)
combined_accuracy <- accuracy(combined_forecast, test_ts)
# Print accuracy
print(ets_accuracy)
## ME RMSE MAE MPE MAPE MASE
## Training set 1472.1654 4152.298 1868.535 3.111884 10.520703 1.5055365
## Test set 345.0402 1926.395 1221.076 -1.973525 7.497176 0.9838589
## ACF1 Theil's U
## Training set -0.13022272 NA
## Test set -0.05848032 0.0583077
print(arima_accuracy)
## ME RMSE MAE MPE MAPE MASE
## Training set -43.68557 754.651 477.5387 -4330.17 5654.024 0.3847678
## Test set 792.49780 1607.559 973.6169 -6307.85 6311.677 0.7844734
## ACF1 Theil's U
## Training set 0.02338336 NA
## Test set -0.48827133 0.05466454
print(dynamic_accuracy)
## ME RMSE MAE MPE MAPE MASE
## Training set -43.68557 754.651 477.5387 -4330.17 5654.024 0.3847678
## Test set 792.49780 1607.559 973.6169 -6307.85 6311.677 0.7844734
## ACF1 Theil's U
## Training set 0.02338336 NA
## Test set -0.48827133 0.05466454
print(combined_accuracy)
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set 643.3453 1636.433 1056.096 -4205.891 4209.57 -0.2957432 0.05430624
# Plot the model forecast
plot(test_ts, type = "l", col = "black", main = "Sales Forecasts")
lines(ets_forecast$mean, col = "yellow")
lines(arima_forecast$mean, col = "red")
lines(dynamic_forecast$mean, col = "green")
lines(combined_forecast, col = "purple")
legend("topright", legend = c("Actual", "ETS", "ARIMA", "Dynamic", "Combined"),
col = c("black", "yellow", "red", "green", "purple"), lty = 1)
RMSE (Test set): 1926.395
MAE (Test set): 1221.076
MAPE (Test set): 7.497176
RMSE (Test set): 1607.559
MAE (Test set): 973.6169
MAPE (Test set): 6311.677
RMSE (Test set): 1607.559 (same as ARIMA)
MAE (Test set): 973.6169 (same as ARIMA)
MAPE (Test set): 6311.677 (same as ARIMA)
RMSE (Test set): 1636.433
MAE (Test set): 1056.096
MAPE (Test set): 4209.57
The ARIMA model performs the best since it has the lowest values in these important values, based on the lowest RMSE and MAE on the test set below:
Lowest RMSE: 1607.559
Lowest MAE: 973.6169
The ARIMA and Dynamic models have a substantially greater MAPE, indicating that their % error reliability may be lower. However, ARIMA is regarded as the top performer among these models since RMSE and MAE are typically given priority for absolute error assessments.