PA HW 2

I picked the Gasoline Stations data set, and here’s the link of the data: https://www.census.gov/retail/marts/www/adv44700.txt

# Load necessary libraries
library(forecast)

## Warning: package 'forecast' was built under R version 4.3.3

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# read the data by using link code 
url <- "https://www.census.gov/retail/marts/www/adv44700.txt"
download.file(url, destfile = "adv44700.txt")
data <- read.table("adv44700.txt", skip = 1, fill = TRUE, header = FALSE)

# Clean the data
data <- data[complete.cases(data), ]
colnames(data) <- c('Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')

# Reshape the data to have a single column of monthly data
data <- data %>%
  gather(key = "Month", value = "Sales", -Year) %>%
  mutate(Date = as.Date(paste(Year, Month, "01", sep = "-"), format = "%Y-%b-%d")) %>%
  arrange(Date) %>%
  mutate(Sales = as.numeric(gsub(",", "", Sales)))  # Ensure Sales is numeric

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Sales = as.numeric(gsub(",", "", Sales))`.
## Caused by warning:
## ! NAs introduced by coercion

# Filter the last 5 years
data <- data %>%
  filter(Date >= as.Date("2015-01-01")) %>%
  select(Date, Sales)

# Split the data into training 4 years and testing 1 year
train_data <- data %>%
  filter(Date < as.Date("2019-01-01")) %>%
  pull(Sales)  # Ensure it's a vector
test_data <- data %>%
  filter(Date >= as.Date("2019-01-01")) %>%
  pull(Sales)  # Ensure it's a vector

# Convert to time series objects
train_ts <- ts(train_data, start = c(2015, 1), frequency = 12)
test_ts <- ts(test_data, start = c(2019, 1), frequency = 12)

# ETS model
ets_model <- ets(train_ts)
ets_forecast <- forecast(ets_model, h = 12)

# ARIMA model
arima_model <- auto.arima(train_ts, seasonal = TRUE)
arima_forecast <- forecast(arima_model, h = 12)

# dynamic regression model (using auto.arima with external regressors if available)
# Assuming no external regressors in this example
dynamic_model <- auto.arima(train_ts)
dynamic_forecast <- forecast(dynamic_model, h = 12)

# Combine forecasts
combined_forecast <- (ets_forecast$mean + arima_forecast$mean + dynamic_forecast$mean) / 3

# Evaluate model performance
ets_accuracy <- accuracy(ets_forecast, test_ts)
arima_accuracy <- accuracy(arima_forecast, test_ts)
dynamic_accuracy <- accuracy(dynamic_forecast, test_ts)
combined_accuracy <- accuracy(combined_forecast, test_ts)

# Print accuracy 
print(ets_accuracy)

##                     ME     RMSE      MAE       MPE      MAPE      MASE
## Training set 1472.1654 4152.298 1868.535  3.111884 10.520703 1.5055365
## Test set      345.0402 1926.395 1221.076 -1.973525  7.497176 0.9838589
##                     ACF1 Theil's U
## Training set -0.13022272        NA
## Test set     -0.05848032 0.0583077

print(arima_accuracy)

##                     ME     RMSE      MAE      MPE     MAPE      MASE
## Training set -43.68557  754.651 477.5387 -4330.17 5654.024 0.3847678
## Test set     792.49780 1607.559 973.6169 -6307.85 6311.677 0.7844734
##                     ACF1  Theil's U
## Training set  0.02338336         NA
## Test set     -0.48827133 0.05466454

print(dynamic_accuracy)

##                     ME     RMSE      MAE      MPE     MAPE      MASE
## Training set -43.68557  754.651 477.5387 -4330.17 5654.024 0.3847678
## Test set     792.49780 1607.559 973.6169 -6307.85 6311.677 0.7844734
##                     ACF1  Theil's U
## Training set  0.02338336         NA
## Test set     -0.48827133 0.05466454

print(combined_accuracy)

##                ME     RMSE      MAE       MPE    MAPE       ACF1  Theil's U
## Test set 643.3453 1636.433 1056.096 -4205.891 4209.57 -0.2957432 0.05430624

# Plot the model forecast
plot(test_ts, type = "l", col = "black", main = "Sales Forecasts")
lines(ets_forecast$mean, col = "yellow")
lines(arima_forecast$mean, col = "red")
lines(dynamic_forecast$mean, col = "green")
lines(combined_forecast, col = "purple")
legend("topright", legend = c("Actual", "ETS", "ARIMA", "Dynamic", "Combined"),
       col = c("black", "yellow", "red", "green", "purple"), lty = 1)

ETS Model

RMSE (Test set): 1926.395
MAE (Test set): 1221.076
MAPE (Test set): 7.497176

ARIMA Model

RMSE (Test set): 1607.559
MAE (Test set): 973.6169
MAPE (Test set): 6311.677

Dynamic Model

RMSE (Test set): 1607.559 (same as ARIMA)
MAE (Test set): 973.6169 (same as ARIMA)
MAPE (Test set): 6311.677 (same as ARIMA)

Combined Model

RMSE (Test set): 1636.433
MAE (Test set): 1056.096
MAPE (Test set): 4209.57

Best Performing Model

The ARIMA model performs the best since it has the lowest values in these important values, based on the lowest RMSE and MAE on the test set below:

Lowest RMSE: 1607.559
Lowest MAE: 973.6169

The ARIMA and Dynamic models have a substantially greater MAPE, indicating that their % error reliability may be lower. However, ARIMA is regarded as the top performer among these models since RMSE and MAE are typically given priority for absolute error assessments.