library(readr)
library(dplyr)
library(tidyr)
library(lubridate)
library(ggplot2)
library(forecast)
library(zoo)
library(rpart)
library(scales)
set.seed(7406)
theme_set(theme_minimal(base_size = 12))
output_dir <- "r_final_outputs"
dir.create(output_dir, showWarnings = FALSE)Final Project Code: Forecasting U.S. Retail Sales
data_path <- "final_outputs/fred_retail_macro_cache.csv"
dir.create("final_outputs", showWarnings = FALSE)
fetch_fred_series <- function(series_id, value_name) {
fred_url <- paste0("https://fred.stlouisfed.org/graph/fredgraph.csv?id=", series_id)
out <- read_csv(fred_url, show_col_types = FALSE)
names(out) <- c("Date", value_name)
out |>
mutate(
Date = as.Date(Date),
across(all_of(value_name), as.numeric)
)
}
if (!file.exists(data_path)) {
sales <- fetch_fred_series("RSXFS", "Sales")
cpi <- fetch_fred_series("CPIAUCSL", "CPI")
unemployment <- fetch_fred_series("UNRATE", "Unemployment")
retail_cache <- sales |>
full_join(cpi, by = "Date") |>
full_join(unemployment, by = "Date") |>
filter(Date >= as.Date("2010-01-01"), Date <= as.Date("2025-12-31")) |>
arrange(Date)
write_csv(retail_cache, data_path)
}
retail_raw <- read_csv(data_path, show_col_types = FALSE) |>
mutate(Date = as.Date(Date)) |>
arrange(Date) |>
filter(!is.na(Sales)) |>
mutate(
CPI = na.locf(CPI, na.rm = FALSE),
CPI = na.locf(CPI, fromLast = TRUE),
Unemployment = na.locf(Unemployment, na.rm = FALSE),
Unemployment = na.locf(Unemployment, fromLast = TRUE)
)
glimpse(retail_raw)Rows: 192
Columns: 4
$ Date <date> 2010-01-01, 2010-02-01, 2010-03-01, 2010-04-01, 2010-05-…
$ Sales <dbl> 302325, 302310, 309525, 312143, 309158, 308588, 309543, 3…
$ CPI <dbl> 217.488, 217.281, 217.353, 217.403, 217.290, 217.199, 217…
$ Unemployment <dbl> 9.8, 9.8, 9.9, 9.9, 9.6, 9.4, 9.4, 9.5, 9.5, 9.4, 9.8, 9.…
summary(retail_raw) Date Sales CPI Unemployment
Min. :2010-01-01 Min. :302310 Min. :217.2 Min. : 3.400
1st Qu.:2013-12-24 1st Qu.:368841 1st Qu.:234.7 1st Qu.: 3.900
Median :2017-12-16 Median :422078 Median :248.3 Median : 4.850
Mean :2017-12-15 Mean :449513 Mean :258.4 Mean : 5.702
3rd Qu.:2021-12-08 3rd Qu.:553376 3rd Qu.:281.3 3rd Qu.: 7.350
Max. :2025-12-01 Max. :634830 Max. :326.0 Max. :14.800
add_features <- function(data) {
data |>
arrange(Date) |>
mutate(
t = row_number(),
month = month(Date),
quarter = quarter(Date),
month_factor = factor(month),
month_sin = sin(2 * pi * month / 12),
month_cos = cos(2 * pi * month / 12),
is_december = as.integer(month == 12),
holiday_count = case_when(
month %in% c(1, 2, 5, 6, 7, 9, 10, 11) ~ 1,
month == 12 ~ 2,
TRUE ~ 0
),
lag_1 = lag(Sales, 1),
lag_3 = lag(Sales, 3),
lag_12 = lag(Sales, 12),
rolling_3 = rollmean(lag(Sales, 1), k = 3, fill = NA, align = "right"),
rolling_6 = rollmean(lag(Sales, 1), k = 6, fill = NA, align = "right"),
rolling_12 = rollmean(lag(Sales, 1), k = 12, fill = NA, align = "right"),
yoy_growth = Sales / lag_12 - 1
)
}
retail <- add_features(retail_raw)
colSums(is.na(retail)) Date Sales CPI Unemployment t
0 0 0 0 0
month quarter month_factor month_sin month_cos
0 0 0 0 0
is_december holiday_count lag_1 lag_3 lag_12
0 0 1 3 12
rolling_3 rolling_6 rolling_12 yoy_growth
3 6 12 12
split_index <- floor(nrow(retail_raw) * 0.80)
train_raw <- retail_raw[1:split_index, ]
test_raw <- retail_raw[(split_index + 1):nrow(retail_raw), ]
train <- retail[1:split_index, ]
test <- retail[(split_index + 1):nrow(retail), ]
sales_train_ts <- ts(train_raw$Sales, frequency = 12, start = c(year(min(train_raw$Date)), month(min(train_raw$Date))))
sales_test_ts <- ts(test_raw$Sales, frequency = 12, start = c(year(min(test_raw$Date)), month(min(test_raw$Date))))
cat("Training period:", as.character(min(train_raw$Date)), "to", as.character(max(train_raw$Date)), "\n")Training period: 2010-01-01 to 2022-09-01
cat("Testing period:", as.character(min(test_raw$Date)), "to", as.character(max(test_raw$Date)), "\n")Testing period: 2022-10-01 to 2025-12-01
p_split <- ggplot() +
geom_line(data = train_raw, aes(Date, Sales, color = "Train"), linewidth = 0.8) +
geom_line(data = test_raw, aes(Date, Sales, color = "Test"), linewidth = 0.8) +
scale_y_continuous(labels = comma) +
labs(
title = "Monthly U.S. Retail Sales: Train/Test Split",
x = NULL,
y = "Retail Sales (Millions of Dollars)",
color = NULL
)
p_splitggsave(file.path(output_dir, "train_test_split.png"), p_split, width = 10, height = 5, dpi = 200)stl_fit <- stl(sales_train_ts, s.window = "periodic")
autoplot(stl_fit) +
labs(title = "STL Decomposition of Retail Sales")ggsave(file.path(output_dir, "stl_decomposition.png"), width = 10, height = 6, dpi = 200)forecast_horizon <- nrow(test_raw)
forecast_table <- tibble(Date = test_raw$Date, Actual = test_raw$Sales)
add_forecast <- function(table, model_name, values) {
table[[model_name]] <- as.numeric(values)
table
}
safe_mape <- function(actual, predicted) {
mean(abs((actual - predicted) / actual), na.rm = TRUE) * 100
}
evaluate_model <- function(actual, predicted, model_name) {
tibble(
Model = model_name,
RMSE = sqrt(mean((actual - predicted)^2, na.rm = TRUE)),
MAE = mean(abs(actual - predicted), na.rm = TRUE),
`MAPE (%)` = safe_mape(actual, predicted),
ME = mean(actual - predicted, na.rm = TRUE)
)
}# CPI and unemployment are external variables. To avoid look-ahead bias, test-period
# values are forecast from the training period instead of using realized future values.
cpi_train_ts <- ts(train_raw$CPI, frequency = 12, start = c(year(min(train_raw$Date)), month(min(train_raw$Date))))
unemp_train_ts <- ts(train_raw$Unemployment, frequency = 12, start = c(year(min(train_raw$Date)), month(min(train_raw$Date))))
cpi_future <- forecast(auto.arima(cpi_train_ts, seasonal = TRUE), h = forecast_horizon)$mean
unemp_future <- forecast(auto.arima(unemp_train_ts, seasonal = TRUE), h = forecast_horizon)$mean
future_external <- tibble(
Date = test_raw$Date,
CPI = as.numeric(cpi_future),
Unemployment = as.numeric(unemp_future),
month = month(Date),
quarter = quarter(Date),
month_sin = sin(2 * pi * month / 12),
month_cos = cos(2 * pi * month / 12),
is_december = as.integer(month == 12),
holiday_count = case_when(
month %in% c(1, 2, 5, 6, 7, 9, 10, 11) ~ 1,
month == 12 ~ 2,
TRUE ~ 0
)
)
future_external |> head()# A tibble: 6 × 9
Date CPI Unemployment month quarter month_sin month_cos is_december
<date> <dbl> <dbl> <dbl> <int> <dbl> <dbl> <int>
1 2022-10-01 298. 3.5 10 4 -8.66e- 1 5 e- 1 0
2 2022-11-01 299. 3.5 11 4 -5.00e- 1 8.66e- 1 0
3 2022-12-01 301. 3.5 12 4 -2.45e-16 1 e+ 0 1
4 2023-01-01 302. 3.5 1 1 5 e- 1 8.66e- 1 0
5 2023-02-01 304. 3.5 2 1 8.66e- 1 5 e- 1 0
6 2023-03-01 305. 3.5 3 1 1 e+ 0 6.12e-17 0
# ℹ 1 more variable: holiday_count <dbl>
naive_fc <- naive(sales_train_ts, h = forecast_horizon)
snaive_fc <- snaive(sales_train_ts, h = forecast_horizon)
ets_fit <- ets(sales_train_ts)
ets_fc <- forecast(ets_fit, h = forecast_horizon)
arima_fit <- auto.arima(sales_train_ts, seasonal = TRUE, stepwise = FALSE, approximation = FALSE)
arima_fc <- forecast(arima_fit, h = forecast_horizon)
forecast_table <- forecast_table |>
add_forecast("Naive", naive_fc$mean) |>
add_forecast("Seasonal Naive", snaive_fc$mean) |>
add_forecast("ETS", ets_fc$mean) |>
add_forecast("ARIMA", arima_fc$mean)
ets_fitETS(M,A,N)
Call:
ets(y = sales_train_ts)
Smoothing parameters:
alpha = 0.6812
beta = 1e-04
Initial states:
l = 302366.5466
b = 1732.1726
sigma: 0.0216
AIC AICc BIC
3552.773 3553.182 3567.926
arima_fitSeries: sales_train_ts
ARIMA(0,1,2) with drift
Coefficients:
ma1 ma2 drift
-0.1239 -0.2347 1831.6721
s.e. 0.0790 0.0797 471.7953
sigma^2 = 82866770: log likelihood = -1599.92
AIC=3207.84 AICc=3208.12 BIC=3219.94
xreg_cols <- c("CPI", "Unemployment", "holiday_count", "is_december", "month_sin", "month_cos")
train_xreg <- as.matrix(train[, xreg_cols])
test_xreg <- as.matrix(future_external[, xreg_cols])
arimax_fit <- auto.arima(
sales_train_ts,
xreg = train_xreg,
seasonal = TRUE,
stepwise = FALSE,
approximation = FALSE
)
arimax_fc <- forecast(arimax_fit, xreg = test_xreg, h = forecast_horizon)
tslm_fit <- tslm(sales_train_ts ~ trend + season)
tslm_fc <- forecast(tslm_fit, h = forecast_horizon)
forecast_table <- forecast_table |>
add_forecast("ARIMAX", arimax_fc$mean) |>
add_forecast("TSLM", tslm_fc$mean)
arimax_fitSeries: sales_train_ts
Regression with ARIMA(1,0,0)(0,0,2)[12] errors
Coefficients:
ar1 sma1 sma2 intercept CPI Unemployment
0.8741 -0.2112 -0.1724 -372149.47 3313.2619 -4843.7971
s.e. 0.0433 0.0850 0.1017 46019.92 179.6879 695.7806
holiday_count is_december month_sin month_cos
611.5186 -999.4727 -213.8973 -2143.423
s.e. 709.6760 1247.3093 1092.5913 1118.378
sigma^2 = 53600263: log likelihood = -1574.84
AIC=3171.68 AICc=3173.55 BIC=3205.02
summary(tslm_fit)
Call:
tslm(formula = sales_train_ts ~ trend + season)
Residuals:
Min 1Q Median 3Q Max
-107534 -15744 1454 10942 67129
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 293466.6 7728.1 37.974 <2e-16 ***
trend 1499.3 46.0 32.595 <2e-16 ***
season2 -549.4 9843.6 -0.056 0.956
season3 2428.7 9843.9 0.247 0.805
season4 -1556.0 9844.5 -0.158 0.875
season5 2511.7 9845.2 0.255 0.799
season6 4264.5 9846.2 0.433 0.666
season7 2885.8 9847.4 0.293 0.770
season8 2866.5 9848.8 0.291 0.771
season9 3276.6 9850.4 0.333 0.740
season10 -1032.4 10047.4 -0.103 0.918
season11 -779.8 10048.1 -0.078 0.938
season12 -1563.8 10049.1 -0.156 0.877
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 25100 on 140 degrees of freedom
Multiple R-squared: 0.8841, Adjusted R-squared: 0.8742
F-statistic: 89.01 on 12 and 140 DF, p-value: < 2.2e-16
ml_cols <- c(
"t", "CPI", "Unemployment", "holiday_count", "is_december",
"month_sin", "month_cos", "quarter",
"lag_1", "lag_3", "lag_12", "rolling_3", "rolling_6", "rolling_12"
)
ml_train <- train |>
drop_na(all_of(ml_cols), yoy_growth)
X_train <- ml_train[, ml_cols]
y_train <- ml_train$yoy_growth
use_xgboost <- requireNamespace("xgboost", quietly = TRUE)
if (use_xgboost) {
dtrain <- xgboost::xgb.DMatrix(data = as.matrix(X_train), label = y_train)
tree_fit <- xgboost::xgb.train(
data = dtrain,
nrounds = 250,
max_depth = 3,
eta = 0.03,
objective = "reg:squarederror",
verbose = 0
)
tree_model_name <- "XGBoost YoY Growth"
} else {
tree_fit <- rpart(
yoy_growth ~ t + CPI + Unemployment + holiday_count + is_december +
month_sin + month_cos + quarter + lag_1 + lag_3 + lag_12 +
rolling_3 + rolling_6 + rolling_12,
data = ml_train,
method = "anova",
control = rpart.control(cp = 0.001, minbucket = 5)
)
tree_model_name <- "Regression Tree YoY Growth"
}
recursive_tree_forecast <- function(model, history, future_external, use_xgboost) {
preds <- numeric(nrow(future_external))
hist <- history
for (i in seq_len(nrow(future_external))) {
current_date <- future_external$Date[i]
feature_row <- tibble(
t = nrow(hist) + 1,
CPI = future_external$CPI[i],
Unemployment = future_external$Unemployment[i],
holiday_count = future_external$holiday_count[i],
is_december = as.integer(month(current_date) == 12),
month_sin = sin(2 * pi * month(current_date) / 12),
month_cos = cos(2 * pi * month(current_date) / 12),
quarter = quarter(current_date),
lag_1 = tail(hist$Sales, 1),
lag_3 = tail(hist$Sales, 3)[1],
lag_12 = tail(hist$Sales, 12)[1],
rolling_3 = mean(tail(hist$Sales, 3)),
rolling_6 = mean(tail(hist$Sales, 6)),
rolling_12 = mean(tail(hist$Sales, 12))
)
if (use_xgboost) {
growth <- predict(model, xgboost::xgb.DMatrix(as.matrix(feature_row[, ml_cols])))
} else {
growth <- predict(model, newdata = feature_row)
}
growth <- pmin(pmax(as.numeric(growth), -0.08), 0.14)
pred_level <- feature_row$lag_12 * (1 + growth)
preds[i] <- pred_level
hist <- bind_rows(
hist,
tibble(
Date = current_date,
Sales = pred_level,
CPI = future_external$CPI[i],
Unemployment = future_external$Unemployment[i]
)
)
}
preds
}
tree_preds <- recursive_tree_forecast(
tree_fit,
train_raw,
future_external |> select(Date, CPI, Unemployment, holiday_count),
use_xgboost
)
forecast_table <- add_forecast(forecast_table, tree_model_name, tree_preds)
tree_model_name[1] "XGBoost YoY Growth"
nnet_fit <- nnetar(sales_train_ts, repeats = 30)
nnet_fc <- forecast(nnet_fit, h = forecast_horizon, PI = TRUE, npaths = 1000)
forecast_table <- add_forecast(forecast_table, "NNETAR Neural Network", nnet_fc$mean)
nnet_fitSeries: sales_train_ts
Model: NNAR(1,1,2)[12]
Call: nnetar(y = sales_train_ts, repeats = 30)
Average of 30 networks, each of which is
a 2-2-1 network with 9 weights
options were - linear output units
sigma^2 estimated as 69549214
ensemble_members <- c("Naive", "Seasonal Naive", "ETS", tree_model_name)
forecast_table <- forecast_table |>
mutate(Ensemble = rowMeans(across(all_of(ensemble_members)), na.rm = TRUE))
ensemble_members[1] "Naive" "Seasonal Naive" "ETS"
[4] "XGBoost YoY Growth"
model_cols <- setdiff(names(forecast_table), c("Date", "Actual"))
results <- bind_rows(lapply(model_cols, function(col) {
evaluate_model(forecast_table$Actual, forecast_table[[col]], col)
})) |>
arrange(RMSE)
write_csv(results, file.path(output_dir, "model_comparison_results.csv"))
write_csv(forecast_table, file.path(output_dir, "model_forecasts.csv"))
results# A tibble: 9 × 5
Model RMSE MAE `MAPE (%)` ME
<chr> <dbl> <dbl> <dbl> <dbl>
1 Ensemble 8204. 6537. 1.09 -4681.
2 ETS 12218. 11350. 1.88 -11002.
3 ARIMA 14626. 13707. 2.26 -13367.
4 Naive 30796. 25713. 4.18 24993.
5 Seasonal Naive 43536. 37024. 6.04 37024.
6 TSLM 50968. 50690. 8.39 50690.
7 XGBoost YoY Growth 77155. 69738. 11.4 -69738.
8 ARIMAX 96122. 85546. 14.0 -85540.
9 NNETAR Neural Network 117685. 103576. 16.9 103240.
extract_95_interval <- function(fc, model_name) {
if (is.null(fc$lower) || is.null(fc$upper)) {
return(NULL)
}
lower <- as.matrix(fc$lower)
upper <- as.matrix(fc$upper)
level_names <- colnames(lower)
if ("95%" %in% level_names) {
idx <- which(level_names == "95%")[1]
} else {
idx <- ncol(lower)
}
tibble(
Date = test_raw$Date,
Model = model_name,
Lo95 = as.numeric(lower[, idx]),
Hi95 = as.numeric(upper[, idx])
)
}
forecast_intervals <- bind_rows(
extract_95_interval(naive_fc, "Naive"),
extract_95_interval(snaive_fc, "Seasonal Naive"),
extract_95_interval(ets_fc, "ETS"),
extract_95_interval(arima_fc, "ARIMA"),
extract_95_interval(arimax_fc, "ARIMAX"),
extract_95_interval(tslm_fc, "TSLM"),
extract_95_interval(nnet_fc, "NNETAR Neural Network")
)
write_csv(forecast_intervals, file.path(output_dir, "model_forecast_95_intervals.csv"))
forecast_intervals# A tibble: 273 × 4
Date Model Lo95 Hi95
<date> <chr> <dbl> <dbl>
1 2022-10-01 Naive 560897. 597995.
2 2022-11-01 Naive 553213. 605679.
3 2022-12-01 Naive 547317. 611575.
4 2023-01-01 Naive 542347. 616545.
5 2023-02-01 Naive 537968. 620924.
6 2023-03-01 Naive 534009. 624883.
7 2023-04-01 Naive 530369. 628523.
8 2023-05-01 Naive 526980. 631912.
9 2023-06-01 Naive 523798. 635094.
10 2023-07-01 Naive 520787. 638105.
# ℹ 263 more rows
top_models <- head(results$Model, 5)
forecast_plot_data <- forecast_table |>
select(Date, all_of(top_models)) |>
pivot_longer(-Date, names_to = "Series", values_to = "Sales")
top_interval_data <- forecast_intervals |>
filter(Model %in% top_models)
p_forecasts <- ggplot() +
geom_ribbon(
data = top_interval_data,
aes(Date, ymin = Lo95, ymax = Hi95, fill = Model),
alpha = 0.12,
color = NA,
show.legend = FALSE
) +
geom_line(
data = train_raw,
aes(Date, Sales, color = "Training Actual"),
linewidth = 0.65
) +
geom_line(
data = test_raw,
aes(Date, Sales, color = "Test Actual"),
linewidth = 0.8
) +
geom_line(
data = forecast_plot_data,
aes(Date, Sales, color = Series),
linewidth = 0.75
) +
scale_y_continuous(labels = comma) +
scale_color_manual(
values = c(
"Training Actual" = "gray45",
"Test Actual" = "black",
"Ensemble" = "#C49A00",
"ETS" = "#1B9E77",
"ARIMA" = "#D95F02",
"Naive" = "#0072B2",
"Seasonal Naive" = "#CC79A7"
)
) +
scale_fill_manual(
values = c(
"ETS" = "#1B9E77",
"ARIMA" = "#D95F02",
"Naive" = "#0072B2",
"Seasonal Naive" = "#CC79A7"
)
) +
labs(
title = "Historical Retail Sales and Top Model Forecasts with 95% Intervals",
subtitle = "Shaded bands show 95% forecast intervals where available; Ensemble and XGBoost are shown as point forecasts.",
x = NULL,
y = "Retail Sales (Millions of Dollars)",
color = NULL
)
p_forecastsggsave(file.path(output_dir, "top_model_forecasts.png"), p_forecasts, width = 10, height = 5, dpi = 200)
ggsave(file.path(output_dir, "top_model_forecasts_with_95_intervals.png"), p_forecasts, width = 10, height = 5, dpi = 200)all_forecast_data <- forecast_table |>
pivot_longer(
cols = -c(Date, Actual),
names_to = "Model",
values_to = "Forecast"
)
p_model_facets <- ggplot(all_forecast_data, aes(Date)) +
geom_line(aes(y = Actual), color = "black", linewidth = 0.7) +
geom_line(aes(y = Forecast), color = "#2C7FB8", linewidth = 0.7) +
facet_wrap(~Model, scales = "free_y", ncol = 3) +
scale_y_continuous(labels = comma) +
labs(
title = "Actual vs Forecast by Model",
x = NULL,
y = "Retail Sales (Millions of Dollars)"
)
p_model_facetsggsave(file.path(output_dir, "actual_vs_forecast_by_model.png"), p_model_facets, width = 12, height = 8, dpi = 200)all_model_overlay_data <- forecast_table |>
pivot_longer(
cols = -c(Date, Actual),
names_to = "Model",
values_to = "Forecast"
)
p_all_models_overlay <- ggplot() +
geom_ribbon(
data = forecast_intervals,
aes(Date, ymin = Lo95, ymax = Hi95, fill = Model),
alpha = 0.08,
color = NA,
show.legend = FALSE
) +
geom_line(
data = forecast_table,
aes(Date, Actual),
color = "black",
linewidth = 1
) +
geom_line(
data = all_model_overlay_data,
aes(Date, Forecast, color = Model),
linewidth = 0.7,
alpha = 0.85
) +
scale_y_continuous(labels = comma) +
labs(
title = "All Forecast Models vs Actual Retail Sales with 95% Intervals",
subtitle = "Intervals are shown for models with forecast-package prediction intervals; XGBoost and Ensemble are point forecasts.",
x = NULL,
y = "Retail Sales (Millions of Dollars)",
color = "Model"
)
p_all_models_overlayggsave(file.path(output_dir, "all_model_forecasts_overlay.png"), p_all_models_overlay, width = 11, height = 6, dpi = 200)
ggsave(file.path(output_dir, "all_model_forecasts_overlay_with_95_intervals.png"), p_all_models_overlay, width = 11, height = 6, dpi = 200)all_model_ci_plot_data <- all_forecast_data |>
left_join(forecast_intervals, by = c("Date", "Model"))
p_all_model_ci_facets <- ggplot(all_model_ci_plot_data, aes(Date)) +
geom_ribbon(aes(ymin = Lo95, ymax = Hi95), fill = "#9ECAE1", alpha = 0.35, na.rm = TRUE) +
geom_line(aes(y = Actual), color = "black", linewidth = 0.7) +
geom_line(aes(y = Forecast), color = "#2C7FB8", linewidth = 0.7) +
facet_wrap(~Model, scales = "free_y", ncol = 3) +
scale_y_continuous(labels = comma) +
labs(
title = "Actual vs Forecast by Model with 95% Intervals",
subtitle = "Ribbon appears where a 95% forecast interval is available.",
x = NULL,
y = "Retail Sales (Millions of Dollars)"
)
p_all_model_ci_facetsggsave(file.path(output_dir, "actual_vs_forecast_by_model_with_95_intervals.png"), p_all_model_ci_facets, width = 12, height = 8, dpi = 200)focus_models <- intersect(c("ARIMAX", tree_model_name, "NNETAR Neural Network", "TSLM"), model_cols)
focus_plot_data <- forecast_table |>
select(Date, Actual, all_of(focus_models)) |>
pivot_longer(
cols = -c(Date, Actual),
names_to = "Model",
values_to = "Forecast"
)
p_focus_models <- ggplot(focus_plot_data, aes(Date)) +
geom_line(aes(y = Actual), color = "black", linewidth = 0.8) +
geom_line(aes(y = Forecast), color = "#D95F02", linewidth = 0.8) +
facet_wrap(~Model, scales = "free_y", ncol = 2) +
scale_y_continuous(labels = comma) +
labs(
title = "Focused Forecast Plots for ARIMAX, XGBoost, NNETAR, and TSLM",
x = NULL,
y = "Retail Sales (Millions of Dollars)"
)
p_focus_modelsggsave(file.path(output_dir, "focused_forecasts_arimax_xgboost_nnetar_tslm.png"), p_focus_models, width = 11, height = 7, dpi = 200)individual_model_dir <- file.path(output_dir, "individual_model_plots")
dir.create(individual_model_dir, showWarnings = FALSE)
clean_file_name <- function(x) {
out <- gsub("[^A-Za-z0-9]+", "_", x)
out <- gsub("^_|_$", "", out)
tolower(out)
}
for (model_name in model_cols) {
model_plot_data <- forecast_table |>
transmute(Date, Actual, Forecast = .data[[model_name]])
p_one_model <- ggplot(model_plot_data, aes(Date)) +
geom_line(aes(y = Actual, color = "Actual"), linewidth = 0.8) +
geom_line(aes(y = Forecast, color = "Forecast"), linewidth = 0.8) +
scale_y_continuous(labels = comma) +
scale_color_manual(values = c("Actual" = "black", "Forecast" = "#2C7FB8")) +
labs(
title = paste("Actual vs Forecast:", model_name),
x = NULL,
y = "Retail Sales (Millions of Dollars)",
color = NULL
)
ggsave(
file.path(individual_model_dir, paste0(clean_file_name(model_name), "_forecast.png")),
p_one_model,
width = 9,
height = 5,
dpi = 200
)
}
list.files(individual_model_dir, full.names = TRUE)[1] "r_final_outputs/individual_model_plots/arima_forecast.png"
[2] "r_final_outputs/individual_model_plots/arimax_forecast.png"
[3] "r_final_outputs/individual_model_plots/ensemble_forecast.png"
[4] "r_final_outputs/individual_model_plots/ets_forecast.png"
[5] "r_final_outputs/individual_model_plots/naive_forecast.png"
[6] "r_final_outputs/individual_model_plots/nnetar_neural_network_forecast.png"
[7] "r_final_outputs/individual_model_plots/seasonal_naive_forecast.png"
[8] "r_final_outputs/individual_model_plots/tslm_forecast.png"
[9] "r_final_outputs/individual_model_plots/xgboost_yoy_growth_forecast.png"
metric_data <- results |>
select(Model, RMSE, MAE, `MAPE (%)`) |>
pivot_longer(-Model, names_to = "Metric", values_to = "Value")
p_metrics <- ggplot(metric_data, aes(reorder(Model, Value), Value, fill = Metric)) +
geom_col(show.legend = FALSE) +
facet_wrap(~Metric, scales = "free_x") +
coord_flip() +
labs(title = "Forecast Accuracy Metrics by Model", x = NULL, y = NULL)
p_metricsggsave(file.path(output_dir, "accuracy_metrics_by_model.png"), p_metrics, width = 11, height = 6, dpi = 200)if (use_xgboost) {
xgb_importance <- xgboost::xgb.importance(
feature_names = ml_cols,
model = tree_fit
)
write_csv(as_tibble(xgb_importance), file.path(output_dir, "xgboost_feature_importance.csv"))
p_importance <- xgb_importance |>
as_tibble() |>
slice_max(Gain, n = 10) |>
mutate(Feature = reorder(Feature, Gain)) |>
ggplot(aes(Feature, Gain)) +
geom_col(fill = "#2C7FB8") +
coord_flip() +
labs(
title = "XGBoost Feature Importance",
x = NULL,
y = "Gain"
)
p_importance
ggsave(file.path(output_dir, "xgboost_feature_importance.png"), p_importance, width = 8, height = 5, dpi = 200)
} else {
message("xgboost is not installed, so feature importance is skipped.")
}residuals_by_model <- forecast_table |>
pivot_longer(
cols = -c(Date, Actual),
names_to = "Model",
values_to = "Forecast"
) |>
mutate(Residual = Actual - Forecast)
p_resid_facets <- ggplot(residuals_by_model, aes(Date, Residual)) +
geom_hline(yintercept = 0, linetype = "dashed", color = "gray40") +
geom_line(color = "#D95F0E", linewidth = 0.6) +
facet_wrap(~Model, scales = "free_y", ncol = 3) +
labs(
title = "Residuals Over Time by Model",
x = NULL,
y = "Actual - Forecast"
)
p_resid_facetsggsave(file.path(output_dir, "residuals_by_model.png"), p_resid_facets, width = 12, height = 8, dpi = 200)p_scatter <- ggplot(all_forecast_data, aes(Actual, Forecast)) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "gray40") +
geom_point(color = "#1B9E77", alpha = 0.75, size = 1.5) +
facet_wrap(~Model, scales = "free", ncol = 3) +
scale_x_continuous(labels = comma) +
scale_y_continuous(labels = comma) +
labs(
title = "Actual vs Forecast Scatterplots by Model",
x = "Actual Retail Sales",
y = "Forecast Retail Sales"
)
p_scatterggsave(file.path(output_dir, "actual_vs_forecast_scatter_by_model.png"), p_scatter, width = 12, height = 8, dpi = 200)best_model <- results$Model[1]
resid_data <- forecast_table |>
transmute(Date, Residual = Actual - .data[[best_model]])
p_resid <- ggplot(resid_data, aes(Date, Residual)) +
geom_hline(yintercept = 0, linetype = "dashed") +
geom_line(linewidth = 0.8) +
labs(
title = paste("Residuals Over Time:", best_model),
x = NULL,
y = "Actual - Forecast"
)
p_residggsave(file.path(output_dir, "best_model_residuals.png"), p_resid, width = 10, height = 4.5, dpi = 200)list.files(output_dir, full.names = TRUE) [1] "r_final_outputs/accuracy_metrics_by_model.png"
[2] "r_final_outputs/actual_vs_forecast_by_model.png"
[3] "r_final_outputs/actual_vs_forecast_by_model_with_95_intervals.png"
[4] "r_final_outputs/actual_vs_forecast_scatter_by_model.png"
[5] "r_final_outputs/all_model_forecasts_overlay.png"
[6] "r_final_outputs/all_model_forecasts_overlay_with_95_intervals.png"
[7] "r_final_outputs/best_model_residuals.png"
[8] "r_final_outputs/cumulative_forecast_error.png"
[9] "r_final_outputs/focused_forecasts_arimax_xgboost_nnetar_tslm.png"
[10] "r_final_outputs/individual_model_plots"
[11] "r_final_outputs/model_comparison_results.csv"
[12] "r_final_outputs/model_forecast_95_intervals.csv"
[13] "r_final_outputs/model_forecasts.csv"
[14] "r_final_outputs/residuals_by_model.png"
[15] "r_final_outputs/stl_decomposition.png"
[16] "r_final_outputs/top_model_forecasts.png"
[17] "r_final_outputs/top_model_forecasts_with_95_intervals.png"
[18] "r_final_outputs/train_test_split.png"
[19] "r_final_outputs/xgboost_feature_importance.csv"
[20] "r_final_outputs/xgboost_feature_importance.png"