Data Preparation
library(fredr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(slider)
set.seed(123)
## 1. Get 5 years of monthly FRED data ----
fredr_set_key("YOUR_FRED_API_KEY") # use your own FRED key; never commit a real key
series_id <- "UNRATE" # U.S. civilian unemployment rate
start_date <- floor_date(Sys.Date() %m-% years(5), unit = "month")
raw_data <- fredr(
series_id = series_id,
observation_start = start_date,
frequency = "m"
)
# Keep only date and value, rename for convenience
data <- raw_data %>%
select(date, value) %>%
arrange(date)
## 2. Create time-based features (for ML) ----
# For monthly FRED data, we use year and month; if you had daily data,
# you’d add day-of-week, holidays, etc.
data <- data %>%
mutate(
year = year(date),
month = month(date),
# Optionally encode month as factor (for one-hot later, if using ML libs)
month_factor = factor(month, levels = 1:12, labels = month.abb)
)
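# Toy illustration (hypothetical input, separate from the pipeline): this is
# what the one-hot expansion will look like later. model.matrix() turns the
# 12-level factor into 11 dummy columns, with Jan absorbed into the intercept.
m_demo <- factor(c(1, 2, 12), levels = 1:12, labels = month.abb)
model.matrix(~ m_demo) # 3 x 12 matrix: intercept + m_demoFeb ... m_demoDec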
## 3. Create lag and rolling features ----
data <- data %>%
mutate(
lag_1 = dplyr::lag(value, 1),
# 3-month rolling mean (aligned to current month)
rolling_mean_3 = slide_dbl(
value,
mean,
.before = 2, # this month + 2 prior
.complete = TRUE
),
# First difference (to capture changes)
diff_1 = value - lag_1
)
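# Toy check of the windowing (hypothetical input, separate from the pipeline):
# with .before = 2 and .complete = TRUE the first two windows are incomplete,
# so slide_dbl() returns NA for them.
slide_dbl(1:5, mean, .before = 2, .complete = TRUE) # NA NA 2 3 4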
## 4. Handle missing values from lags/rolling with conservative imputation ----
# - rolling_mean_3: replace NA with the first available 3-month average
# - lag_1: for the first row, set lag_1 = value (assume no change)
# - diff_1: for the first row, set diff_1 = 0
# Get first non-missing 3-month rolling mean
first_roll_mean <- data %>%
filter(!is.na(rolling_mean_3)) %>%
dplyr::slice(1) %>% # explicitly use dplyr::slice
pull(rolling_mean_3)
data_ml <- data %>%
mutate(
rolling_mean_3 = if_else(
is.na(rolling_mean_3),
first_roll_mean,
rolling_mean_3
),
lag_1 = if_else(
is.na(lag_1),
value, # conservative: assume previous value = current
lag_1
),
diff_1 = if_else(
is.na(diff_1),
0, # conservative: no change when we don't know
diff_1
)
)
## 5. Train-test split: 4 years train, 1 year test ----
# Data is already sorted by date.
# Get unique years in the cleaned ML data
years_in_data <- sort(unique(year(data_ml$date)))
years_in_data
## [1] 2020 2021 2022 2023 2024 2025
# For monthly data: last calendar year = test, previous 4 full years = train
# (any partial first year from the 5-year pull is dropped from both sets)
last_year <- max(years_in_data)
train_years <- (last_year - 4):(last_year - 1)
train_data <- data_ml %>% filter(year(date) %in% train_years)
test_data <- data_ml %>% filter(year(date) == last_year)
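# Optional sanity check: the split is purely chronological, so the training
# window should end strictly before the test window begins (no leakage).
# range(train_data$date)
# range(test_data$date)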
## 6. Scale features (like StandardScaler in Python) ----
# We'll scale only numeric predictors using base::scale.
# Typically: fit scaler on train, apply same transformation to test.
# Choose numeric feature columns
num_cols <- c("value", "lag_1", "rolling_mean_3", "diff_1")
# Fit scaler on training data
train_means <- sapply(train_data[num_cols], mean, na.rm = TRUE)
train_sds <- sapply(train_data[num_cols], sd, na.rm = TRUE)
scale_with_params <- function(x, mean_vec, sd_vec) {
as.data.frame(scale(x, center = mean_vec, scale = sd_vec))
}
train_scaled <- scale_with_params(train_data[num_cols], train_means, train_sds)
test_scaled <- scale_with_params(test_data[num_cols], train_means, train_sds)
# Reattach scaled features to original non-scaled columns (e.g., dates, factors)
train_final <- train_data %>%
select(date, year, month, month_factor) %>%
bind_cols(train_scaled)
test_final <- test_data %>%
select(date, year, month, month_factor) %>%
bind_cols(test_scaled)
# Get mean and sd for *value* only
value_mean <- train_means["value"]
value_sd <- train_sds["value"]
inv_scale <- function(z) z * value_sd + value_mean
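# Round-trip check: scaling the first training value and inverse-scaling it
# should reproduce the original (up to floating point).
z_demo <- (train_data$value[1] - value_mean) / value_sd
all.equal(unname(inv_scale(z_demo)), train_data$value[1]) # TRUE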
RF
# install.packages("ranger") # if needed
library(ranger)
## Warning: package 'ranger' was built under R version 4.5.2
y_test <- inv_scale(test_final$value)
rf_formula <- value ~ lag_1 + rolling_mean_3 + diff_1 + month_factor
set.seed(123)
rf_model <- ranger(
formula = rf_formula,
data = train_final,
num.trees = 500,
mtry = 2, # number of variables tried at each split
min.node.size = 3,
importance = "impurity"
)
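# ranger also reports an out-of-bag error estimate (OOB MSE, on the scaled
# target), a leakage-free training-side check worth glancing at:
# rf_model$prediction.error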
# Predictions on test set
rf_pred <- predict(rf_model, data = test_final)$predictions
rf_pred <- inv_scale(rf_pred)
# Simple accuracy metrics, computed on the original (inverse-scaled) target
rmse <- function(actual, pred) sqrt(mean((actual - pred)^2))
mae <- function(actual, pred) mean(abs(actual - pred))
rf_rmse <- rmse(y_test, rf_pred)
rf_mae <- mae(y_test, rf_pred)
rf_rmse
## [1] 0.0909806
rf_mae
## [1] 0.081675
# Optional: look at variable importance
rf_model$variable.importance
## lag_1 rolling_mean_3 diff_1 month_factor
## 19.9091126 20.6433923 3.4286077 0.9885849
XGB
# install.packages("xgboost") # if needed
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
# Build model matrices (drop date; keep numeric + factor-expanded predictors).
# `~ . - date` removes the date column; model.matrix() expands month_factor into dummies.
X_train <- model.matrix(value ~ . - date, data = train_final)[, -1] # remove intercept
X_test <- model.matrix(value ~ . - date, data = test_final)[, -1]
# Training target stays on the z-scaled scale, as in the RF
y_train_scaled <- train_final$value
# y_test on the *original* scale, for fair comparison to RF/ARIMA/ETS
y_test_orig <- inv_scale(test_final$value)
dtrain <- xgb.DMatrix(data = X_train, label = y_train_scaled)
dtest <- xgb.DMatrix(data = X_test) # no label needed for prediction
# Basic XGBoost parameters for regression
params <- list(
objective = "reg:squarederror",
eta = 0.05, # learning rate
max_depth = 3,
subsample = 0.8,
colsample_bytree = 0.8,
eval_metric = "rmse"
)
set.seed(123)
xgb_model <- xgb.train(
params = params,
data = dtrain,
nrounds = 300,
watchlist = list(train = dtrain),
verbose = 0
)
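# Sketch only (assumes an 8-row validation slice and 25-round patience): if
# you wanted to tune nrounds instead of fixing it at 300, hold out the last
# training rows and let early stopping pick the round count.
# val_idx <- (nrow(X_train) - 7):nrow(X_train)
# dtr  <- xgb.DMatrix(X_train[-val_idx, ], label = y_train_scaled[-val_idx])
# dval <- xgb.DMatrix(X_train[val_idx, ],  label = y_train_scaled[val_idx])
# xgb_es <- xgb.train(params = params, data = dtr, nrounds = 1000,
#                     watchlist = list(val = dval),
#                     early_stopping_rounds = 25, verbose = 0)
# xgb_es$best_iteration # suggested nrounds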
# Predict on test data (still on scaled target scale)
xgb_pred_scaled <- predict(xgb_model, dtest)
# Inverse-transform predictions back to original unemployment units
xgb_pred <- inv_scale(xgb_pred_scaled)
# Evaluate performance on the original scale
rmse <- function(actual, pred) sqrt(mean((actual - pred)^2))
mae <- function(actual, pred) mean(abs(actual - pred))
xgb_rmse <- rmse(y_test_orig, xgb_pred)
xgb_mae <- mae(y_test_orig, xgb_pred)
xgb_rmse
## [1] 0.05727271
xgb_mae
## [1] 0.04636377
# Optional: feature importance
xgb_importance <- xgb.importance(model = xgb_model, feature_names = colnames(X_train))
print(xgb_importance)
## Feature Gain Cover Frequency
## <char> <num> <num> <num>
## 1: lag_1 7.538785e-01 0.314832188 0.255102041
## 2: rolling_mean_3 2.158854e-01 0.205247708 0.163945578
## 3: month 1.573435e-02 0.148382411 0.206802721
## 4: year 7.789155e-03 0.067941773 0.085034014
## 5: diff_1 3.399415e-03 0.122960990 0.110884354
## 6: month_factorAug 1.987261e-03 0.012256756 0.013605442
## 7: month_factorMay 4.217392e-04 0.031685985 0.045578231
## 8: month_factorOct 2.519385e-04 0.029204370 0.023809524
## 9: month_factorDec 2.012276e-04 0.021123990 0.019047619
## 10: month_factorNov 1.158045e-04 0.005568502 0.008163265
## 11: month_factorSep 1.076706e-04 0.010592259 0.013605442
## 12: month_factorApr 7.330397e-05 0.012256756 0.016326531
## 13: month_factorFeb 5.657833e-05 0.009744878 0.017006803
## 14: month_factorJul 4.576438e-05 0.002330297 0.006802721
## 15: month_factorJun 3.439603e-05 0.003692159 0.006802721
## 16: month_factorMar 1.749381e-05 0.002178979 0.007482993
# xgb.plot.importance(xgb_importance) # visual if you like
Accuracy Metrics for RF and XGB
# ---------- Common accuracy functions (original scale) ----------
rmse <- function(actual, pred) sqrt(mean((actual - pred)^2))
mae <- function(actual, pred) mean(abs(actual - pred))
mape <- function(actual, pred) mean(abs((actual - pred) / actual)) * 100
mpe <- function(actual, pred) mean((actual - pred) / actual) * 100
me <- function(actual, pred) mean(actual - pred)
# MASE denominator: naive forecast error (lag-1) on ORIGINAL scale
mase <- function(actual, pred, train_actual) {
naive_error <- mean(abs(diff(train_actual)))
mean(abs(actual - pred)) / naive_error
}
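# Tiny worked example (toy numbers): a training series of 5.0, 5.2, 5.1 gives
# a naive denominator of mean(|0.2|, |-0.1|) = 0.15, so a test MAE of 0.15
# yields MASE = 1, i.e. exactly as good as the one-step naive forecast.
mase(actual = c(5.3, 5.4), pred = c(5.15, 5.25), train_actual = c(5.0, 5.2, 5.1)) # 1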
# Pseudo-AICc and BIC for the ML models, from Gaussian residuals on the *test*
# set (original scale). Classical AIC/BIC use in-sample likelihoods, so treat
# these as rough indicators, not strictly comparable to the fpp3 values below.
aicc <- function(actual, pred, k) {
n <- length(actual)
rss <- sum((actual - pred)^2)
sigma2 <- rss / n
ll <- -n/2 * (log(2 * pi * sigma2) + 1)
aic <- -2 * ll + 2 * k
aic + (2 * k * (k + 1)) / (n - k - 1)
}
bic <- function(actual, pred, k) {
n <- length(actual)
rss <- sum((actual - pred)^2)
sigma2 <- rss / n
ll <- -n/2 * (log(2 * pi * sigma2) + 1)
-2 * ll + log(n) * k
}
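# Worked arithmetic on the small-sample correction: with a 12-month test set,
# even k = 4 adds 2*4*5 / (12 - 4 - 1) = 40/7 to the AIC, and once k > n - 1
# the denominator flips sign (as with XGBoost's 16 feature columns here), so
# read the pseudo-AICc values below with caution.
2 * 4 * (4 + 1) / (12 - 4 - 1) # ~ 5.71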
# ---------- Original-scale targets for metrics ----------
# Training target in *original* units
y_train_orig <- inv_scale(train_final$value)
# Test target in *original* units
y_test_orig <- inv_scale(test_final$value)
# ---------- Parameter counts (number of predictors; a crude proxy for k) ----------
k_rf <- rf_model$num.independent.variables
k_xgb <- length(xgb_model$feature_names)
# ---------- Random Forest metrics (original scale) ----------
rf_metrics <- data.frame(
Model = "Random Forest",
RMSE = rmse(y_test_orig, rf_pred),
MAE = mae(y_test_orig, rf_pred),
MAPE = mape(y_test_orig, rf_pred),
MPE = mpe(y_test_orig, rf_pred),
ME = me(y_test_orig, rf_pred),
MASE = mase(y_test_orig, rf_pred, y_train_orig),
AICc = aicc(y_test_orig, rf_pred, k_rf),
BIC = bic(y_test_orig, rf_pred, k_rf)
)
# ---------- XGBoost metrics (original scale) ----------
xgb_metrics <- data.frame(
Model = "XGBoost",
RMSE = rmse(y_test_orig, xgb_pred),
MAE = mae(y_test_orig, xgb_pred),
MAPE = mape(y_test_orig, xgb_pred),
MPE = mpe(y_test_orig, xgb_pred),
ME = me(y_test_orig, xgb_pred),
MASE = mase(y_test_orig, xgb_pred, y_train_orig),
AICc = aicc(y_test_orig, xgb_pred, k_xgb),
BIC = bic(y_test_orig, xgb_pred, k_xgb)
)
# ---------- Combined table ----------
comparison_full <- rbind(rf_metrics, xgb_metrics)
comparison_full %>%
arrange(RMSE)
## Model RMSE MAE MAPE MPE ME MASE
## 1 XGBoost 0.05727271 0.04636377 1.112409 0.4588596 0.02012719 0.3693385
## 2 Random Forest 0.09098060 0.08167500 1.950769 1.2656417 0.05401250 0.6506314
## AICc BIC
## 1 -51.500325 10.215184
## 2 5.682606 -7.332961
ETS/ARIMA
library(fpp3)
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.1 ──
## ✔ tibble 3.3.0 ✔ tsibbledata 0.4.1
## ✔ tidyr 1.3.1 ✔ feasts 0.4.1
## ✔ ggplot2 4.0.0 ✔ fable 0.4.1
## ✔ tsibble 1.1.6
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ xgboost::slice() masks dplyr::slice()
## ✖ tsibble::union() masks base::union()
library(dplyr)
#----------------------------------------------------------
# 1. Make a MONTHLY tsibble (avoid daily implicit gaps)
#----------------------------------------------------------
fred_ts <- data %>%
mutate(Month = yearmonth(date)) %>% # convert to monthly index
as_tsibble(index = Month) %>%
arrange(Month)
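# Toy illustration: yearmonth() collapses any date within a month to a single
# monthly index value, which is what makes the tsibble gap-free monthly data.
yearmonth(as.Date("2024-03-15")) # 2024 Mar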
#----------------------------------------------------------
# 2. Train / test split: 4 years train, 1 year test
#----------------------------------------------------------
years_in_data <- fred_ts %>%
mutate(yr = year(Month)) %>%
distinct(yr) %>%
arrange(yr) %>%
pull(yr)
last_year <- max(years_in_data)
train_years <- (last_year - 4):(last_year - 1)
train_ts <- fred_ts %>% filter(year(Month) %in% train_years)
test_ts <- fred_ts %>% filter(year(Month) == last_year)
h <- nrow(test_ts) # forecast horizon = size of test set
#----------------------------------------------------------
# 3. Fit ETS, ARIMA, and benchmark models in ONE model() call
#----------------------------------------------------------
fit <- train_ts %>%
model(
ETS = ETS(value),
ARIMA = ARIMA(value),
TSLM = TSLM(value),
MEAN = MEAN(value),
NAIVE = NAIVE(value),
SNAIVE = SNAIVE(value)
)
#----------------------------------------------------------
# 4. Forecast over the test horizon
#----------------------------------------------------------
fc <- fit %>%
forecast(h = h)
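# To see which specifications ETS() and ARIMA() auto-selected, you can pull a
# single model out of the mable and call report() (output not shown):
# fit %>% select(ARIMA) %>% report()
# fit %>% select(ETS) %>% report()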
#----------------------------------------------------------
# 5. Accuracy metrics on the test set
# (RMSE, MAE, MAPE, MPE, ME, MASE)
#----------------------------------------------------------
# NB: MASE comes back NaN below because accuracy() is handed only the test
# tsibble; it needs the training series to compute the naive scaling factor.
acc <- fc %>%
  accuracy(test_ts) %>%
  select(.model, RMSE, MAE, MAPE, MPE, ME, MASE)
#----------------------------------------------------------
# 6. AICc / AIC and BIC from glance()
# (fall back to AIC if AICc is missing)
#----------------------------------------------------------
info_raw <- fit %>%
glance() %>%
as_tibble()
if ("AICc" %in% names(info_raw)) {
info <- info_raw %>%
transmute(.model, AICc = AICc, BIC = BIC)
} else {
# If AICc is not available, use AIC as a proxy
info <- info_raw %>%
transmute(.model, AICc = AIC, BIC = BIC)
}
#----------------------------------------------------------
# 7. Merge accuracy + info into one table
#----------------------------------------------------------
ets_arima_metrics <- acc %>%
left_join(info, by = ".model") %>%
mutate(
Model = case_when(
.model == "ETS" ~ "ETS (fpp3)",
.model == "ARIMA" ~ "ARIMA (fpp3)",
.model == "TSLM" ~ "TSLM (fpp3)",
.model == "MEAN" ~ "MEAN (fpp3)",
.model == "NAIVE" ~ "NAIVE (fpp3)",
.model == "SNAIVE" ~ "SNAIVE(fpp3)",
TRUE ~ .model
)
) %>%
select(Model, RMSE, MAE, MAPE, MPE, ME, MASE, AICc, BIC)
ets_arima_metrics
## # A tibble: 6 × 9
## Model RMSE MAE MAPE MPE ME MASE AICc BIC
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ARIMA (fpp3) 0.101 0.0860 2.06 1.01 0.0441 NaN -43.8 -38.8
## 2 ETS (fpp3) 0.121 0.102 2.44 1.71 0.0732 NaN 0.927 10.1
## 3 MEAN (fpp3) 0.0858 0.0708 1.71 -0.143 -0.00417 NaN NA NA
## 4 NAIVE (fpp3) 0.106 0.0875 2.08 1.46 0.0625 NaN NA NA
## 5 SNAIVE (fpp3) 0.212 0.175 4.22 4.22 0.175 NaN NA NA
## 6 TSLM (fpp3) 0.0858 0.0708 1.71 -0.143 -0.00417 NaN -14.0 -10.5
comparison_full <- bind_rows(
  comparison_full,   # from RF + XGBoost
  ets_arima_metrics  # from the six fpp3 models (ETS, ARIMA, and the benchmarks)
)
comparison_full %>%
arrange(RMSE)
## Model RMSE MAE MAPE MPE ME
## 1 XGBoost 0.05727271 0.04636377 1.112409 0.4588596 0.020127186
## 2 MEAN (fpp3) 0.08579692 0.07083333 1.711760 -0.1429151 -0.004166667
## 3 TSLM (fpp3) 0.08579692 0.07083333 1.711760 -0.1429151 -0.004166667
## 4 Random Forest 0.09098060 0.08167500 1.950769 1.2656417 0.054012500
## 5 ARIMA (fpp3) 0.10117126 0.08598480 2.056553 1.0137550 0.044069463
## 6 NAIVE (fpp3) 0.10606602 0.08750000 2.084372 1.4593715 0.062500000
## 7 ETS (fpp3) 0.12053744 0.10217621 2.435613 1.7113417 0.073186666
## 8 SNAIVE (fpp3) 0.21213203 0.17500000 4.218906 4.2189062 0.175000000
## MASE AICc BIC
## 1 0.3693385 -51.500325 10.215184
## 2 NaN NA NA
## 3 NaN -14.000658 -10.524923
## 4 0.6506314 5.682606 -7.332961
## 5 NaN -43.755009 -38.840513
## 6 NaN NA NA
## 7 NaN 0.926872 10.105298
## 8 NaN NA NA
RNN/LSTM/CNN-LSTM
library(reticulate)
## Warning: package 'reticulate' was built under R version 4.5.2
np <- import("numpy", convert = FALSE)
random <- import("random", convert = FALSE)
# Python / NumPy / Keras seeds
np$random$seed(123L)
## None
random$seed(123L)
## None
keras3::set_random_seed(123)
Sys.setenv("PYTHONHASHSEED" = "123")
library(dplyr)
library(keras3)
## Warning: package 'keras3' was built under R version 4.5.2
##
## Attaching package: 'keras3'
## The following object is masked from 'package:fabletools':
##
## new_model_class
## 1. Build supervised sequences from scaled data ----
# train_final$value and test_final$value are already z-scaled
all_scaled <- bind_rows(train_final, test_final)
make_sequences <- function(x, lag = 12) {
x <- as.numeric(x)
n <- length(x)
n_samples <- n - lag
# X: [samples, timesteps, features]
X <- array(NA_real_, dim = c(n_samples, lag, 1))
y <- numeric(n_samples)
for (i in seq_len(n_samples)) {
X[i, , 1] <- x[i:(i + lag - 1)] # input window
y[i] <- x[i + lag] # next value
}
list(X = X, y = y)
}
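# Toy illustration (hypothetical input): with lag = 3 on the series 1:6, the
# input windows are (1,2,3), (2,3,4), (3,4,5) and the targets are 4, 5, 6.
toy_seq <- make_sequences(1:6, lag = 3) # toy_seq$X is a [3, 3, 1] array; toy_seq$y is c(4, 5, 6)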
lag <- 12
# Training sequences from training portion only (scaled)
train_seq <- make_sequences(train_final$value, lag)
# Sequences over full (train + test) series, then trim last n_test as test horizon
full_seq <- make_sequences(all_scaled$value, lag)
n_test <- nrow(test_final)
X_train <- train_seq$X
y_train_scaled <- train_seq$y # scaled targets for training
X_test <- tail(full_seq$X, n_test)
y_test_scaled <- tail(full_seq$y, n_test) # scaled test targets
# Inverse-scale to original unemployment-rate units
y_train_orig <- inv_scale(y_train_scaled)
y_test_orig <- inv_scale(y_test_scaled)
dim(X_train); dim(X_test) # sanity check
## [1] 36 12 1
## [1] 8 12 1
## 2. RNN model (keras3) ----
rnn_model <- keras_model_sequential() |>
layer_simple_rnn(units = 16, input_shape = c(lag, 1)) |>
layer_dense(units = 1)
rnn_model |>
compile(
loss = "mse",
optimizer = optimizer_adam(learning_rate = 0.01)
)
history_rnn <- rnn_model |>
fit(
X_train, y_train_scaled, # train on scaled targets
epochs = 50,
batch_size = 8,
verbose = 0
)
rnn_pred_scaled <- rnn_model |>
predict(X_test) |>
as.numeric()
## 1/1 - 0s - 63ms/step
rnn_pred <- inv_scale(rnn_pred_scaled) # back to original scale
## 3. LSTM model (keras3) ----
lstm_model <- keras_model_sequential() |>
layer_lstm(units = 32, input_shape = c(lag, 1)) |>
layer_dense(units = 1)
lstm_model |>
compile(
loss = "mse",
optimizer = optimizer_adam(learning_rate = 0.005)
)
history_lstm <- lstm_model |>
fit(
X_train, y_train_scaled,
epochs = 60,
batch_size = 8,
verbose = 0
)
lstm_pred_scaled <- lstm_model |>
predict(X_test) |>
as.numeric()
## 1/1 - 0s - 74ms/step
lstm_pred <- inv_scale(lstm_pred_scaled)
## 4. CNN-LSTM model (keras3) ----
## 1D conv over the sequence, then pooling, then LSTM
cnn_lstm_model <- keras_model_sequential() |>
layer_conv_1d(
filters = 16,
kernel_size = 3,
activation = "relu",
padding = "causal",
input_shape = c(lag, 1)
) |>
layer_max_pooling_1d(pool_size = 2) |>
layer_lstm(units = 32) |>
layer_dense(units = 1)
cnn_lstm_model |>
compile(
loss = "mse",
optimizer = optimizer_adam(learning_rate = 0.003)
)
history_cnn_lstm <- cnn_lstm_model |>
fit(
X_train, y_train_scaled,
epochs = 60,
batch_size = 8,
verbose = 0
)
cnn_lstm_pred_scaled <- cnn_lstm_model |>
predict(X_test) |>
as.numeric()
## 1/1 - 0s - 151ms/step
cnn_lstm_pred <- inv_scale(cnn_lstm_pred_scaled)
## 5. Metrics for RNN, LSTM, and CNN-LSTM (original scale) ----
nn_metrics <- function(model_name, actual, pred, train_actual) {
mae <- mean(abs(actual - pred))
rmse <- sqrt(mean((actual - pred)^2))
mape <- mean(abs((actual - pred) / actual)) * 100
mpe <- mean((actual - pred) / actual) * 100
me <- mean(actual - pred)
mase <- mae / mean(abs(diff(train_actual))) # naive(1) denominator
data.frame(
Model = model_name,
RMSE = rmse,
MAE = mae,
MAPE = mape,
MPE = mpe,
ME = me,
MASE = mase,
AICc = NA_real_, # not meaningful for deep nets here
BIC = NA_real_
)
}
rnn_metrics <- nn_metrics("RNN (keras3)", y_test_orig, rnn_pred, y_train_orig)
lstm_metrics <- nn_metrics("LSTM (keras3)", y_test_orig, lstm_pred, y_train_orig)
cnn_lstm_metrics <- nn_metrics("CNN_LSTM (keras3)", y_test_orig, cnn_lstm_pred, y_train_orig)
nn_results <- bind_rows(rnn_metrics, lstm_metrics, cnn_lstm_metrics)
nn_results
## Model RMSE MAE MAPE MPE ME MASE
## 1 RNN (keras3) 0.1771148 0.16221523 3.871644 3.5426076 0.14905377 1.831462
## 2 LSTM (keras3) 0.1072154 0.09295227 2.226892 1.0655136 0.04631249 1.049461
## 3 CNN_LSTM (keras3) 0.1082585 0.09527231 2.294081 0.4565961 0.02118300 1.075655
## AICc BIC
## 1 NA NA
## 2 NA NA
## 3 NA NA
## 6. Add to your overall comparison table ----
comparison_full <- bind_rows(
  comparison_full, # RF, XGBoost, and the fpp3 models (all on the original scale)
  nn_results       # RNN, LSTM, CNN-LSTM (now also on the original scale)
)
comparison_full %>%
arrange(RMSE)
## Model RMSE MAE MAPE MPE ME
## 1 XGBoost 0.05727271 0.04636377 1.112409 0.4588596 0.020127186
## 2 MEAN (fpp3) 0.08579692 0.07083333 1.711760 -0.1429151 -0.004166667
## 3 TSLM (fpp3) 0.08579692 0.07083333 1.711760 -0.1429151 -0.004166667
## 4 Random Forest 0.09098060 0.08167500 1.950769 1.2656417 0.054012500
## 5 ARIMA (fpp3) 0.10117126 0.08598480 2.056553 1.0137550 0.044069463
## 6 NAIVE (fpp3) 0.10606602 0.08750000 2.084372 1.4593715 0.062500000
## 7 LSTM (keras3) 0.10721545 0.09295227 2.226892 1.0655136 0.046312488
## 8 CNN_LSTM (keras3) 0.10825845 0.09527231 2.294081 0.4565961 0.021183000
## 9 ETS (fpp3) 0.12053744 0.10217621 2.435613 1.7113417 0.073186666
## 10 RNN (keras3) 0.17711480 0.16221523 3.871644 3.5426076 0.149053774
## 11 SNAIVE (fpp3) 0.21213203 0.17500000 4.218906 4.2189062 0.175000000
## MASE AICc BIC
## 1 0.3693385 -51.500325 10.215184
## 2 NaN NA NA
## 3 NaN -14.000658 -10.524923
## 4 0.6506314 5.682606 -7.332961
## 5 NaN -43.755009 -38.840513
## 6 NaN NA NA
## 7 1.0494611 NA NA
## 8 1.0756551 NA NA
## 9 NaN 0.926872 10.105298
## 10 1.8314623 NA NA
## 11 NaN NA NA
Transformer
#-------------------------------------------------------------
# Transformer-style self-attention model (keras3)
#-------------------------------------------------------------
library(keras3)
library(dplyr)
# Input shape matches your sequence data: (timesteps = lag, features = 1)
transformer_input <- keras_input(shape = c(lag, 1))
# Project to a slightly higher dimensional embedding
x <- transformer_input |>
layer_dense(units = 16, activation = "relu", name = "proj_dense")
# Self-attention: query = value = key = x
attn_out <- layer_attention(
list(x, x, x), # list(query, value, key)
use_scale = TRUE,
score_mode = "dot",
name = "self_attention"
)
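# For intuition: with score_mode = "dot" the layer computes scores Q %*% t(K),
# multiplied by a learned scalar because use_scale = TRUE, then softmax() over
# the scores and a weighted sum of V. This is a single-head, attention-only
# sketch; a full Transformer block would add multi-head attention, residual
# connections, feed-forward layers, and layer normalization.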
# Pool and output
x_out <- attn_out |>
layer_flatten(name = "flatten") |>
layer_dense(units = 16, activation = "relu", name = "post_attn_dense") |>
layer_dense(units = 1, name = "output")
transformer_model <- keras_model(
inputs = transformer_input,
outputs = x_out,
name = "transformer_style_model"
)
summary(transformer_model)
## Model: "transformer_style_model"
## ┌───────────────────────┬───────────────────┬─────────────┬────────────────────
## │ Layer (type) │ Output Shape │ Param # │ Connected to
## ├───────────────────────┼───────────────────┼─────────────┼────────────────────
## │ input_layer_3 │ (None, 12, 1) │ 0 │ -
## │ (InputLayer) │ │ │
## ├───────────────────────┼───────────────────┼─────────────┼────────────────────
## │ proj_dense (Dense) │ (None, 12, 16) │ 32 │ input_layer_3[0][…
## ├───────────────────────┼───────────────────┼─────────────┼────────────────────
## │ self_attention │ (None, 12, 16) │ 1 │ proj_dense[0][0],
## │ (Attention) │ │ │ proj_dense[0][0],
## │ │ │ │ proj_dense[0][0]
## ├───────────────────────┼───────────────────┼─────────────┼────────────────────
## │ flatten (Flatten) │ (None, 192) │ 0 │ self_attention[0]…
## ├───────────────────────┼───────────────────┼─────────────┼────────────────────
## │ post_attn_dense │ (None, 16) │ 3,088 │ flatten[0][0]
## │ (Dense) │ │ │
## ├───────────────────────┼───────────────────┼─────────────┼────────────────────
## │ output (Dense) │ (None, 1) │ 17 │ post_attn_dense[0…
## └───────────────────────┴───────────────────┴─────────────┴────────────────────
## Total params: 3,138 (12.26 KB)
## Trainable params: 3,138 (12.26 KB)
## Non-trainable params: 0 (0.00 B)
transformer_model |>
compile(
loss = "mse",
optimizer = optimizer_adam(learning_rate = 0.005)
)
history_transformer <- transformer_model |>
fit(
X_train, y_train_scaled, # train on SCALED targets
epochs = 60,
batch_size = 8,
verbose = 0
)
# Predictions on test (scaled), then inverse-scale
transformer_pred_scaled <- transformer_model |>
predict(X_test) |>
as.numeric()
## 1/1 - 0s - 34ms/step
transformer_pred <- inv_scale(transformer_pred_scaled) # back to original scale
# Metrics on ORIGINAL SCALE (consistent with RF/XGB/ETS/ARIMA/RNN/etc.)
transformer_metrics <- nn_metrics(
model_name = "Transformer (keras3)",
actual = y_test_orig,
pred = transformer_pred,
train_actual = y_train_orig
)
transformer_metrics
## Model RMSE MAE MAPE MPE ME
## 1 Transformer (keras3) 0.09392649 0.07620724 1.837583 -0.0009689506 0.001854453
## MASE AICc BIC
## 1 0.8604043 NA NA
# Append to overall comparison table
comparison_full <- bind_rows(
comparison_full,
transformer_metrics
)
library(knitr) # loaded in case you prefer kable() formatting for the final table
comparison_full %>%
arrange(RMSE)
## Model RMSE MAE MAPE MPE
## 1 XGBoost 0.05727271 0.04636377 1.112409 0.4588596251
## 2 MEAN (fpp3) 0.08579692 0.07083333 1.711760 -0.1429151023
## 3 TSLM (fpp3) 0.08579692 0.07083333 1.711760 -0.1429151023
## 4 Random Forest 0.09098060 0.08167500 1.950769 1.2656416873
## 5 Transformer (keras3) 0.09392649 0.07620724 1.837583 -0.0009689506
## 6 ARIMA (fpp3) 0.10117126 0.08598480 2.056553 1.0137550112
## 7 NAIVE (fpp3) 0.10606602 0.08750000 2.084372 1.4593715393
## 8 LSTM (keras3) 0.10721545 0.09295227 2.226892 1.0655136223
## 9 CNN_LSTM (keras3) 0.10825845 0.09527231 2.294081 0.4565960885
## 10 ETS (fpp3) 0.12053744 0.10217621 2.435613 1.7113416724
## 11 RNN (keras3) 0.17711480 0.16221523 3.871644 3.5426075611
## 12 SNAIVE (fpp3) 0.21213203 0.17500000 4.218906 4.2189061529
## ME MASE AICc BIC
## 1 0.020127186 0.3693385 -51.500325 10.215184
## 2 -0.004166667 NaN NA NA
## 3 -0.004166667 NaN -14.000658 -10.524923
## 4 0.054012500 0.6506314 5.682606 -7.332961
## 5 0.001854453 0.8604043 NA NA
## 6 0.044069463 NaN -43.755009 -38.840513
## 7 0.062500000 NaN NA NA
## 8 0.046312488 1.0494611 NA NA
## 9 0.021183000 1.0756551 NA NA
## 10 0.073186666 NaN 0.926872 10.105298
## 11 0.149053774 1.8314623 NA NA
## 12 0.175000000 NaN NA NA