Module 2 — Accuracy Metrics & Decomposition

1. Setup

library(fpp3) # tsibble + fable + feasts
library(fredr) # FRED API
library(dplyr)
library(ggplot2)
library(knitr)
library(kableExtra)
library(readr)
# Get a free FRED API key at https://fred.stlouisfed.org/docs/api/api_key.html
# Store it in your .Renviron as FRED_API_KEY=xxxxxx (recommended)
fredr_set_key(Sys.getenv("FRED_API_KEY"))

2. Data — FRED series RSXFS
I’m using U.S. Advance Retail Sales: Retail Trade and Food Services (FRED ID RSXFS), monthly, not seasonally adjusted. This series has both strong trend and very strong seasonality, which makes the four benchmark models behave visibly differently and gives the additive-vs-multiplicative decomposition something to argue about.
raw <- fredr(
series_id = "RSXFS",
observation_start = as.Date("2000-01-01"),
observation_end = as.Date("2024-12-01")
)
retail <- raw |>
mutate(Month = yearmonth(date)) |>
select(Month, Sales = value) |>
as_tsibble(index = Month)
retail |>
autoplot(Sales) +
labs(title = "U.S. Advance Retail Sales (RSXFS), 2000–2024",
y = "Millions of USD", x = NULL)3. Train / Test split
I hold out the last 24 months as the test set.
train <- retail |> filter(year(Month) <= 2022)
test <- retail |> filter(year(Month) >= 2023)
h <- nrow(test)
h
[1] 24
4. Fit benchmark models
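For reference, the point forecasts these four produce (standard textbook definitions; \(T\) is the number of training observations and \(m = 12\) the seasonal period):
- Mean: \(\hat y_{T+h|T} = \bar y\), the historical average.
- Naive: \(\hat y_{T+h|T} = y_T\), the last observation.
- SNaive: \(\hat y_{T+h|T} = y_{T+h-m(k+1)}\) with \(k = \lfloor (h-1)/m \rfloor\), i.e. the same month one year back.
- Drift: \(\hat y_{T+h|T} = y_T + h \frac{y_T - y_1}{T-1}\), the last observation plus the average historical change.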
fits <- train |>
model(
Mean = MEAN(Sales),
Naive = NAIVE(Sales),
SNaive = SNAIVE(Sales ~ lag("year")),
Drift = RW(Sales ~ drift())
)
fc <- fits |> forecast(h = h)

5. Forecast plot — point estimates and 80% / 95% prediction intervals
fc |>
autoplot(retail |> filter(year(Month) >= 2018), level = c(80, 95)) +
facet_wrap(~ .model, scales = "free_y", ncol = 2) +
labs(title = "Forecasts vs. actuals — RSXFS, 24-month horizon",
y = "Sales (millions $)", x = NULL) +
guides(colour = "none", fill = guide_legend(title = "Level"))

6. Accuracy metrics on the test set
acc <- fc |>
accuracy(retail) |>
select(.model, ME, RMSE, MAE, MPE, MAPE) |>
arrange(RMSE)
acc |>
kbl(digits = 2,
caption = "Test-set accuracy metrics — 2023–2024 holdout",
col.names = c("Model", "ME", "RMSE", "MAE", "MPE (%)", "MAPE (%)")) |>
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE, position = "left") |>
row_spec(0, bold = TRUE, background = "#f5f5f5")

| Model | ME | RMSE | MAE | MPE (%) | MAPE (%) |
|---|---|---|---|---|---|
| Drift | 12712.56 | 13600.89 | 12712.56 | 2.12 | 2.12 |
| SNaive | 20632.29 | 23717.06 | 20632.29 | 3.43 | 3.43 |
| Naive | 27784.29 | 29609.61 | 27784.29 | 4.63 | 4.63 |
| Mean | 239277.45 | 239496.26 | 239277.45 | 40.07 | 40.07 |
How the five metrics differ
Let \(e_t = y_t - \hat y_t\) be the forecast error on observation \(t\), with \(n\) test points.
- ME (Mean Error) \(= \frac{1}{n}\sum e_t\). Scale-dependent. Measures bias only — positive and negative errors cancel, so a model can have ME ≈ 0 and still be terrible. Useful for spotting systematic over- or under-forecasting.
- RMSE (Root Mean Squared Error) \(= \sqrt{\frac{1}{n}\sum e_t^2}\). Scale-dependent. Penalizes large errors quadratically, so it is sensitive to outliers. Optimal point forecast under squared-error loss is the conditional mean.
- MAE (Mean Absolute Error) \(= \frac{1}{n}\sum |e_t|\). Scale-dependent. Linear penalty, robust to outliers. Optimal point forecast under absolute-error loss is the conditional median.
- MPE (Mean Percentage Error) \(= \frac{100}{n}\sum \frac{e_t}{y_t}\). Scale-free. Like ME but in percent — also a bias measure, signs cancel.
- MAPE (Mean Absolute Percentage Error) \(= \frac{100}{n}\sum \left|\frac{e_t}{y_t}\right|\). Scale-free, so you can compare across series with different units. Two known weaknesses: it is undefined when \(y_t = 0\), and for a nonnegative series it is asymmetric — an over-forecast can push the absolute percentage error past 100%, but an under-forecast is capped at 100%, since a nonnegative forecast can be at most 100% below a positive actual.
Pairing the bias metric (ME, MPE) with the matching magnitude metric (MAE, MAPE) is the standard read: if MAE is large but ME is near zero, errors are big but unbiased; if both are large and same sign, the model is systematically off.
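A toy check of that cancellation point, independent of the retail data:

e <- c(10, -10, 10, -10) # alternating errors: "unbiased" by construction
mean(e)                  # ME = 0: no bias detected
mean(abs(e))             # MAE = 10: the errors are actually large
sqrt(mean(e^2))          # RMSE = 10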
7. Excel cross-check
Export the SNaive forecast plus the held-out actuals so we can recompute the five metrics by hand in Excel:
verify <- fc |>
filter(.model == "SNaive") |>
as_tibble() |>
select(Month, Forecast = .mean) |>
left_join(as_tibble(test), by = "Month") |>
rename(Actual = Sales) |>
mutate(Error = Actual - Forecast,
AbsError = abs(Error),
SqError = Error^2,
PctError = 100 * Error / Actual,
AbsPctError = abs(PctError))
write_csv(verify, "snaive_accuracy_check.csv")
verify |>
kbl(digits = 3, caption = "SNaive errors — exported to snaive_accuracy_check.csv") |>
kable_styling(bootstrap_options = c("striped", "condensed"), full_width = FALSE)| Month | Forecast | Actual | Error | AbsError | SqError | PctError | AbsPctError |
|---|---|---|---|---|---|---|---|
| 2023 Jan | 560257 | 591964 | 31707 | 31707 | 1005333849 | 5.356 | 5.356 |
| 2023 Feb | 563777 | 586165 | 22388 | 22388 | 501222544 | 3.819 | 3.819 |
| 2023 Mar | 574485 | 578812 | 4327 | 4327 | 18722929 | 0.748 | 0.748 |
| 2023 Apr | 580943 | 583874 | 2931 | 2931 | 8590761 | 0.502 | 0.502 |
| 2023 May | 579844 | 586286 | 6442 | 6442 | 41499364 | 1.099 | 1.099 |
| 2023 Jun | 585556 | 588670 | 3114 | 3114 | 9696996 | 0.529 | 0.529 |
| 2023 Jul | 579267 | 589257 | 9990 | 9990 | 99800100 | 1.695 | 1.695 |
| 2023 Aug | 581196 | 595191 | 13995 | 13995 | 195860025 | 2.351 | 2.351 |
| 2023 Sep | 579446 | 598956 | 19510 | 19510 | 380640100 | 3.257 | 3.257 |
| 2023 Oct | 585006 | 595517 | 10511 | 10511 | 110481121 | 1.765 | 1.765 |
| 2023 Nov | 576356 | 593567 | 17211 | 17211 | 296218521 | 2.900 | 2.900 |
| 2023 Dec | 569119 | 594256 | 25137 | 25137 | 631868769 | 4.230 | 4.230 |
| 2024 Jan | 560257 | 588944 | 28687 | 28687 | 822943969 | 4.871 | 4.871 |
| 2024 Feb | 563777 | 593206 | 29429 | 29429 | 866066041 | 4.961 | 4.961 |
| 2024 Mar | 574485 | 595969 | 21484 | 21484 | 461562256 | 3.605 | 3.605 |
| 2024 Apr | 580943 | 595676 | 14733 | 14733 | 217061289 | 2.473 | 2.473 |
| 2024 May | 579844 | 600372 | 20528 | 20528 | 421398784 | 3.419 | 3.419 |
| 2024 Jun | 585556 | 599979 | 14423 | 14423 | 208022929 | 2.404 | 2.404 |
| 2024 Jul | 579267 | 605814 | 26547 | 26547 | 704743209 | 4.382 | 4.382 |
| 2024 Aug | 581196 | 603601 | 22405 | 22405 | 501984025 | 3.712 | 3.712 |
| 2024 Sep | 579446 | 608774 | 29328 | 29328 | 860131584 | 4.818 | 4.818 |
| 2024 Oct | 585006 | 612772 | 27766 | 27766 | 770950756 | 4.531 | 4.531 |
| 2024 Nov | 576356 | 616344 | 39988 | 39988 | 1599040144 | 6.488 | 6.488 |
| 2024 Dec | 569119 | 621713 | 52594 | 52594 | 2766128836 | 8.460 | 8.460 |
Open snaive_accuracy_check.csv in Excel. Columns land as: A = Month, B = Forecast, C = Actual, D = Error, E = AbsError, F = SqError, G = PctError, H = AbsPctError. With 24 forecast months you have rows 2:25. Add five summary cells:
| Metric | Excel formula |
|---|---|
| ME | =AVERAGE(D2:D25) |
| RMSE | =SQRT(AVERAGE(F2:F25)) |
| MAE | =AVERAGE(E2:E25) |
| MPE | =AVERAGE(G2:G25) |
| MAPE | =AVERAGE(H2:H25) |
These should match the SNaive row of the R table to within rounding. Take a screenshot of the Excel cells next to the R table and paste both into the discussion post.
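If you want the same five numbers in R before opening Excel, they fall straight out of the columns of verify (a quick sketch, using the tibble built above):

verify |>
  summarise(ME   = mean(Error),
            RMSE = sqrt(mean(SqError)),
            MAE  = mean(AbsError),
            MPE  = mean(PctError),
            MAPE = mean(AbsPctError))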
8. Decomposition — additive vs. multiplicative
This series is monthly (seasonal period 12) with pronounced seasonality, so it decomposes cleanly. Below I fit both an STL decomposition (additive by construction, so it serves as the additive workhorse) and a classical multiplicative decomposition.
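The two structural assumptions, writing trend \(T_t\), seasonal \(S_t\), and remainder \(R_t\):
- Additive: \(y_t = T_t + S_t + R_t\); the seasonal swing is a constant number of dollars.
- Multiplicative: \(y_t = T_t \times S_t \times R_t\); the seasonal swing is a constant fraction of the level (equivalently, additive on \(\log y_t\)).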
8.1 Additive (STL)
add_dcmp <- train |>
model(STL(Sales ~ trend(window = 13) + season(window = "periodic"),
robust = TRUE)) |>
components()
autoplot(add_dcmp) +
labs(title = "Additive STL decomposition of RSXFS")8.2 Multiplicative (classical)
mult_dcmp <- train |>
model(classical_decomposition(Sales, type = "multiplicative")) |>
components()
autoplot(mult_dcmp) +
labs(title = "Classical multiplicative decomposition of RSXFS")8.3 Which worked better, and how do we tell?
The diagnostic is the remainder (residual) panel. After we strip out trend and season, the remainder should look like white noise — no structure, roughly constant variance.
- In the additive decomposition, the remainder’s amplitude grows over time and is much larger near the COVID shock and during the recent high-sales years. That’s the classic signature that the seasonal swings scale with the level — additivity is the wrong assumption.
- In the multiplicative decomposition, the remainder is centred near 1, much more stable across years, and visibly closer to noise.
Numeric check — standard deviation of the remainder, smaller is better. The two remainders live on different scales (additive is in millions of dollars, multiplicative is a unitless ratio centred on 1), so the last column rescales the additive SD by the mean level of the series to make them comparable:
add_rem <- add_dcmp |> pull(remainder)
mult_rem <- mult_dcmp |> pull(random) |> na.omit()
tibble(
decomposition = c("Additive (STL)", "Multiplicative (classical)"),
remainder_sd = c(sd(add_rem), sd(mult_rem)),
rem_over_level = c(sd(add_rem) / mean(train$Sales),
sd(mult_rem)) # mult random is already a ratio
) |>
kbl(digits = 4, caption = "Remainder dispersion — lower is better") |>
kable_styling(full_width = FALSE)

| decomposition | remainder_sd | rem_over_level |
|---|---|---|
| Additive (STL) | 7455.2109 | 0.0208 |
| Multiplicative (classical) | 0.0166 | 0.0166 |
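Beyond the dispersion numbers, the "looks like white noise" claim can be checked with the remainder ACFs; leftover spikes, especially at seasonal lags, would mean the decomposition missed structure. A quick sketch using feasts' ACF() (the filter drops the NA ends that classical decomposition leaves in the random column):

add_dcmp |> ACF(remainder) |> autoplot() +
  labs(title = "ACF of additive (STL) remainder")
mult_dcmp |> filter(!is.na(random)) |> ACF(random) |> autoplot() +
  labs(title = "ACF of multiplicative remainder")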
For this series multiplicative wins — the seasonal index is a constant percentage-of-trend rather than a constant dollar amount, which matches the economic intuition for retail sales.
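One practical corollary: since multiplicative is just additive on the log scale, running STL on log(Sales) gives a multiplicative-style decomposition while keeping STL's robustness and flexible windows. A sketch (components come out on the log scale):

log_dcmp <- train |>
  model(STL(log(Sales) ~ trend(window = 13) + season(window = "periodic"),
            robust = TRUE)) |>
  components()
autoplot(log_dcmp) +
  labs(title = "STL of log(Sales): a multiplicative decomposition in additive form")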
8.4 Using decomposition for forecasting
You can absolutely forecast off a decomposition — that’s what decomposition_model() does. The idea: forecast the seasonally adjusted series with any model (ETS, ARIMA, RW with drift), forecast the seasonal component naively (SNAIVE on the season), and recombine.
dcmp_fit <- train |>
model(stl_ets = decomposition_model(
STL(Sales ~ trend(window = 13) + season(window = "periodic"),
robust = TRUE),
ETS(season_adjust ~ season("N")),
SNAIVE(season_year)
))
dcmp_fc <- dcmp_fit |> forecast(h = h)
bind_rows(fc, dcmp_fc) |>
accuracy(retail) |>
select(.model, ME, RMSE, MAE, MPE, MAPE) |>
arrange(RMSE) |>
kbl(digits = 2,
caption = "Adding a decomposition-based forecast to the comparison") |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| .model | ME | RMSE | MAE | MPE | MAPE |
|---|---|---|---|---|---|
| stl_ets | 7010.17 | 8615.80 | 7062.89 | 1.17 | 1.18 |
| Drift | 12712.56 | 13600.89 | 12712.56 | 2.12 | 2.12 |
| SNaive | 20632.29 | 23717.06 | 20632.29 | 3.43 | 3.43 |
| Naive | 27784.29 | 29609.61 | 27784.29 | 4.63 | 4.63 |
| Mean | 239277.45 | 239496.26 | 239277.45 | 40.07 | 40.07 |
dcmp_fc |>
autoplot(retail |> filter(year(Month) >= 2018), level = c(80, 95)) +
labs(title = "STL + ETS decomposition forecast",
y = "Sales (millions $)", x = NULL)Would I use it for a final forecast? On this series, yes — the seasonal structure is so dominant that any model which doesn’t explicitly handle it (Mean, Naive, Drift) is hopeless. SNaive captures seasonality but has no trend; the STL+ETS pipeline captures both and typically beats SNaive’s RMSE on this series. For monthly seasonal economic data this kind of decomposition forecast is the right benchmark to beat before reaching for something heavier.