Forecasting with HD data

We will use the FRED-MD dataset to forecast US Real Personal Income (RPI) growth. The forecast period spans from October 2015 to September 2025. We will consider three forecasting methods: ARMA, Factor models, and LASSO.

# load data
data <- read_excel("C:/Users/leesa/Desktop/FRED-MD.xlsx", col_names = FALSE)

# 1. Define indices (Out-of-Sample period: 120 months)
oos_period <- 120
n <- nrow(data)
train_idx <- 1:(n - oos_period)
test_idx  <- (n - oos_period + 1):n

# 2. Prepare Y (Target) - Convert to numeric vector
y_all <- as.numeric(unlist(data[, 1]))
y_train <- y_all[train_idx]
y_test  <- y_all[test_idx]

# 3. Prepare X (Features) - Convert to matrix and scale
X_all <- data.matrix(data[, -1])
X_scaled <- scale(X_all) 
X_train <- X_scaled[train_idx, ]
X_test  <- X_scaled[test_idx, ]

# Strategy 1: ARMA (Time Series)
arma_fit <- auto.arima(y_train)

# Strategy 2: Factor Model (PCA)
pca_train <- prcomp(X_train)
# Use top 3 Principal Components
factors_train <- pca_train$x[, 1:3]
fm_fit <- lm(y_train ~ factors_train)

# Strategy 3: LASSO (Machine Learning)
cv_lasso <- cv.glmnet(X_train, y_train, alpha = 1)

# Project test data onto the training rotation matrix
# Strategy 1: ARMA Forecast
y_hat_arma <- as.numeric(forecast(arma_fit, h = oos_period)$mean)

# Strategy 2: Factor Model Forecast
factors_test <- X_test %*% pca_train$rotation[, 1:3]
colnames(factors_test) <- colnames(factors_train)
y_hat_fm <- predict(fm_fit, newdata = as.data.frame(factors_test))

# Strategy 3: LASSO Forecast
y_hat_lasso <- as.numeric(predict(cv_lasso, newx = X_test, s = "lambda.min"))

# Out-Of-Sample MSFE
msfe_arma  <- mean((y_test - y_hat_arma)^2)
msfe_fm    <- mean((y_test - y_hat_fm)^2)
msfe_lasso <- mean((y_test - y_hat_lasso)^2)


results <- data.frame(
  Model = c("ARMA", "Factor Model", "LASSO"),
  MSFE = c(msfe_fm, msfe_lasso, msfe_arma)
)
knitr::kable(results, caption = "Out-of-Sample Performance (MSFE)")

Out-of-Sample Performance (MSFE)
Model	MSFE
ARMA	0.0009122
Factor Model	0.0004134
LASSO	0.0007236

# [Step 1] Cleanse Data Structures
y_true_plot  <- as.numeric(y_test)
y_arma_plot  <- as.numeric(y_hat_arma)
y_fm_plot    <- as.numeric(y_hat_fm)
y_lasso_plot <- as.numeric(y_hat_lasso)

# [Step 2] Strict Length Alignment
expected_len <- length(y_true_plot)

actual_len <- min(length(y_true_plot), length(y_arma_plot), 
                  length(y_fm_plot), length(y_lasso_plot))

y_true_plot  <- tail(y_true_plot, actual_len)
y_arma_plot  <- tail(y_arma_plot, actual_len)
y_fm_plot    <- tail(y_fm_plot, actual_len)
y_lasso_plot <- tail(y_lasso_plot, actual_len)

# [Step 3] Timeline Generation
dates <- seq(from = as.Date("2025-09-01"), 
             length.out = actual_len, 
             by = "-1 month") %>% rev()

# [Step 4] Execute Plotting
plot(dates, y_true_plot, type = "l", lwd = 2, col = "black", xaxt = "n",
     main = "Out-of-Sample Forecast Horse Race: 2015.10 - 2025.09",
     xlab = "Timeline (Year.Month)", ylab = "RPI Growth Rate",
     ylim = range(c(y_true_plot, y_arma_plot, y_fm_plot, y_lasso_plot), na.rm = TRUE))

# Add Model Layers
lines(dates, y_arma_plot,  col = "blue", lty = 2, lwd = 2) 
lines(dates, y_fm_plot,    col = "darkgreen", lwd = 2)    
lines(dates, y_lasso_plot, col = "red", lwd = 2)          

# X-axis
tick_indices <- round(seq(1, length(dates), length.out = 15))
axis(1, at = dates[tick_indices], 
     labels = format(dates[tick_indices], "%Y.%m"), 
     las = 2, cex.axis = 0.7)

# Add Legend
legend("topleft", legend = c("Actual", "ARMA", "Factor", "LASSO"),
       col = c("black", "blue", "darkgreen", "red"), lty = c(1, 2, 1, 1), 
       lwd = 2, bty = "n", cex = 0.8)

Forecasting with HD data

Saerom Lee

02/23/2026