With time-based split
library(data.table)
library(xgboost)
library(glmnet)
library(SHAPforxgboost)
library(stargazer)
library(ggplot2)
library(gridExtra)
OUTPUT_DIR <- "/Users/amalianimeskern/Library/CloudStorage/OneDrive-ErasmusUniversityRotterdam/Freddie Mac Data"
# --- Load data ---
train_xgb <- readRDS(file.path(OUTPUT_DIR, "train_xgb.rds"))
train_woe <- readRDS(file.path(OUTPUT_DIR, "train_woe.rds"))
train_raw <- readRDS(file.path(OUTPUT_DIR, "train.rds"))
test_xgb <- readRDS(file.path(OUTPUT_DIR, "test_xgb.rds"))
test_woe <- readRDS(file.path(OUTPUT_DIR, "test_woe.rds"))
best_params <- readRDS(file.path(OUTPUT_DIR, "best_hyperparams.rds"))
xgb_model <- readRDS(file.path(OUTPUT_DIR, "xgb_model.rds"))
best_lambda <- readRDS(file.path(OUTPUT_DIR, "best_lambda.rds"))
# --- Feature names ---
drop_cols <- c("loan_sequence_number", "monthly_reporting_period",
"default_next_12m")
xgb_features <- setdiff(names(train_xgb), drop_cols)
woe_features <- setdiff(names(train_woe), drop_cols)
# --- Map base variable for XGBoost dummy aggregation ---
base_var_map <- sapply(xgb_features, function(f) {
woe_base <- gsub("_woe$", "", woe_features)
for (i in seq_along(woe_base)) {
if (f == woe_base[i] || grepl(paste0("^", woe_base[i], "_"), f)) {
return(woe_base[i])
}
}
return(f)
}, USE.NAMES = TRUE)
# --- Fixed XGBoost parameters ---
fixed_params <- list(
objective = "binary:logistic",
eval_metric = "auc",
eta = best_params$eta,
max_depth = best_params$max_depth,
subsample = best_params$subsample,
colsample_bytree = best_params$colsample_bytree,
nthread = 2
)
best_nrounds <- xgb_model$niter
# --- Test matrices (fixed across all iterations) ---
X_test_xgb <- as.matrix(test_xgb[, ..xgb_features])
X_test_woe <- as.matrix(test_woe[, ..woe_features])
# --- Fixed SHAP subsample ---
set.seed(42)
shap_idx <- sample(nrow(X_test_xgb), min(3000, nrow(X_test_xgb)))
X_shap_xgb <- X_test_xgb[shap_idx, ]
# --- Extract observation year and assign regime ---
train_raw[, obs_year := as.integer(substr(as.character(monthly_reporting_period), 1, 4))]
stopifnot(nrow(train_raw) == nrow(train_xgb))
# Pre-crisis: 2006-2007, Crisis/recovery: 2008-2010 (2011 excluded as observations after Dec 2011 are already dropped )
train_raw[, regime := fifelse(obs_year <= 2007, "pre_crisis", "crisis_recovery")]
print(train_raw[, .N, by = regime])
# --- Build loan-to-row index mapping per regime ---
train_xgb[, .row_id := .I]
train_woe[, .row_id := .I]
train_raw[, .row_id := .I]
# Full matrices (needed for bootstrap row subsetting)
X_train_full_xgb <- as.matrix(train_xgb[, ..xgb_features])
y_train_full_xgb <- train_xgb$default_next_12m
X_train_full_woe <- as.matrix(train_woe[, ..woe_features])
y_train_full_woe <- train_woe$default_next_12m
# Per-regime unique loan IDs and their row indices
build_regime_index <- function(regime_label) {
regime_rows <- train_raw[regime == regime_label, .row_id]
regime_loans <- train_raw[regime == regime_label,
.(rows = list(.row_id)),
by = loan_sequence_number]
list(
loan_ids = regime_loans$loan_sequence_number,
row_lookup = setNames(regime_loans$rows, regime_loans$loan_sequence_number),
n_loans = nrow(regime_loans),
n_obs = length(regime_rows)
)
}
regime_pre <- build_regime_index("pre_crisis")
regime_crisis <- build_regime_index("crisis_recovery")
# Memory Offload
rm(train_xgb, train_woe, train_raw, test_xgb, test_woe, xgb_model)
gc()
# --- Bootstrap function for one regime ---
run_regime_bootstrap <- function(regime_info, regime_name, N_BOOT = 50) {
results_xgb <- vector("list", N_BOOT)
results_log <- vector("list", N_BOOT)
regime_start <- Sys.time()
for (b in 1:N_BOOT) {
iter_start <- Sys.time()
# Resample loan IDs with replacement within this regime
set.seed(1000 * ifelse(regime_name == "Pre-crisis", 1, 2) + b)
boot_loan_ids <- sample(regime_info$loan_ids,
regime_info$n_loans, replace = TRUE)
boot_rows <- unlist(regime_info$row_lookup[boot_loan_ids], use.names = FALSE)
# XGBoost
dtrain_boot <- xgb.DMatrix(
data = X_train_full_xgb[boot_rows, ],
label = y_train_full_xgb[boot_rows]
)
xgb_boot <- xgb.train(
params = fixed_params,
data = dtrain_boot,
nrounds = best_nrounds,
verbose = 0
)
shap_boot <- shap.values(xgb_model = xgb_boot, X_train = X_shap_xgb)
mean_shap_boot <- shap_boot$mean_shap_score
shap_raw <- data.table(
feature = names(mean_shap_boot),
mean_abs_shap = as.numeric(mean_shap_boot)
)
shap_raw[, base_var := base_var_map[feature]]
shap_agg <- shap_raw[, .(mean_abs_shap = sum(mean_abs_shap)), by = base_var]
setnames(shap_agg, "base_var", "feature")
shap_agg <- shap_agg[order(-mean_abs_shap)]
shap_agg[, rank := .I]
results_xgb[[b]] <- shap_agg
# Logistic regression
X_boot_woe <- X_train_full_woe[boot_rows, ]
y_boot <- y_train_full_woe[boot_rows]
log_boot <- glmnet(X_boot_woe, y_boot, family = "binomial",
alpha = 0, lambda = best_lambda)
coefs_boot <- as.numeric(coef(log_boot, s = best_lambda))[-1]
names(coefs_boot) <- woe_features
boot_means <- colMeans(X_boot_woe, na.rm = TRUE)
shap_log_boot <- sweep(X_test_woe, 2, boot_means, "-")
shap_log_boot <- sweep(shap_log_boot, 2, coefs_boot, "*")
mean_shap_log_boot <- sort(colMeans(abs(shap_log_boot)), decreasing = TRUE)
importance_boot_log <- data.table(
feature = names(mean_shap_log_boot),
mean_abs_shap = as.numeric(mean_shap_log_boot)
)[order(-mean_abs_shap)]
importance_boot_log[, rank := .I]
results_log[[b]] <- importance_boot_log
# Clean up
rm(dtrain_boot, xgb_boot, shap_boot, log_boot,
X_boot_woe, shap_log_boot)
gc(verbose = FALSE)
}
list(xgb = results_xgb, log = results_log)
}
# Run
N_BOOT <- 50
total_start <- Sys.time()
res_pre <- run_regime_bootstrap(regime_pre, "Pre-crisis", N_BOOT)
res_crisis <- run_regime_bootstrap(regime_crisis, "Crisis/recovery", N_BOOT)
# --- Analysis---
# Rank matrix to compute pairwise Spearman
compute_spearman <- function(results_list) {
rank_mat <- do.call(cbind, lapply(results_list, function(dt) {
dt[order(feature)]$rank
}))
rownames(rank_mat) <- results_list[[1]][order(feature)]$feature
n <- ncol(rank_mat)
n_pairs <- n * (n - 1) / 2
rhos <- numeric(n_pairs)
k <- 1
for (i in 1:(n - 1)) {
for (j in (i + 1):n) {
rhos[k] <- cor(rank_mat[, i], rank_mat[, j], method = "spearman")
k <- k + 1
}
}
list(rank_matrix = rank_mat, spearman = rhos)
}
# Within-regime pairwise Spearman
pre_xgb <- compute_spearman(res_pre$xgb)
pre_log <- compute_spearman(res_pre$log)
crisis_xgb <- compute_spearman(res_crisis$xgb)
crisis_log <- compute_spearman(res_crisis$log)
# Cross-regime Spearman:
# Compare each pre-crisis iteration to each crisis iteration
cross_regime_spearman <- function(results_pre, results_crisis) {
# Using mean rank per feature in each regime iteration
rhos <- numeric(N_BOOT * N_BOOT)
k <- 1
ranks_pre <- do.call(cbind, lapply(results_pre, function(dt) {
dt[order(feature)]$rank
}))
ranks_crisis <- do.call(cbind, lapply(results_crisis, function(dt) {
dt[order(feature)]$rank
}))
for (i in 1:N_BOOT) {
for (j in 1:N_BOOT) {
rhos[k] <- cor(ranks_pre[, i], ranks_crisis[, j], method = "spearman")
k <- k + 1
}
}
rhos
}
cross_xgb <- cross_regime_spearman(res_pre$xgb, res_crisis$xgb)
cross_log <- cross_regime_spearman(res_pre$log, res_crisis$log)
# Per-feature stability per regime
feature_stability <- function(rank_mat) {
data.table(
feature = rownames(rank_mat),
mean_rank = apply(rank_mat, 1, mean),
median_rank = apply(rank_mat, 1, median),
sd_rank = apply(rank_mat, 1, sd),
min_rank = apply(rank_mat, 1, min),
max_rank = apply(rank_mat, 1, max),
range = apply(rank_mat, 1, max) - apply(rank_mat, 1, min)
)[order(mean_rank)]
}
fs_pre_xgb <- feature_stability(pre_xgb$rank_matrix)
fs_pre_log <- feature_stability(pre_log$rank_matrix)
fs_crisis_xgb <- feature_stability(crisis_xgb$rank_matrix)
fs_crisis_log <- feature_stability(crisis_log$rank_matrix)
# --- Summary table ---
summary_table <- data.table(
Model = rep(c("XGBoost", "Logistic Regression"), 3),
Regime = rep(c("Pre-crisis", "Crisis/recovery", "Cross-regime"), each = 2),
Mean = round(c(mean(pre_xgb$spearman), mean(pre_log$spearman),
mean(crisis_xgb$spearman), mean(crisis_log$spearman),
mean(cross_xgb), mean(cross_log)), 4),
Median = round(c(median(pre_xgb$spearman), median(pre_log$spearman),
median(crisis_xgb$spearman), median(crisis_log$spearman),
median(cross_xgb), median(cross_log)), 4),
SD = round(c(sd(pre_xgb$spearman), sd(pre_log$spearman),
sd(crisis_xgb$spearman), sd(crisis_log$spearman),
sd(cross_xgb), sd(cross_log)), 4),
Min = round(c(min(pre_xgb$spearman), min(pre_log$spearman),
min(crisis_xgb$spearman), min(crisis_log$spearman),
min(cross_xgb), min(cross_log)), 4),
Max = round(c(max(pre_xgb$spearman), max(pre_log$spearman),
max(crisis_xgb$spearman), max(crisis_log$spearman),
max(cross_xgb), max(cross_log)), 4)
)
print(summary_table)
stargazer(as.data.frame(summary_table),
summary = FALSE, rownames = FALSE, type = "html",
title = "Regime-Stratified Pairwise Spearman Rank Correlations",
out = file.path(OUTPUT_DIR, "table_regime_stability_spearman.html"))
# --- Saving Results ---
saveRDS(list(
pre_crisis = list(
xgb_rankings = res_pre$xgb,
log_rankings = res_pre$log,
spearman_xgb = pre_xgb$spearman,
spearman_log = pre_log$spearman,
rank_matrix_xgb = pre_xgb$rank_matrix,
rank_matrix_log = pre_log$rank_matrix,
feature_stability_xgb = fs_pre_xgb,
feature_stability_log = fs_pre_log
),
crisis_recovery = list(
xgb_rankings = res_crisis$xgb,
log_rankings = res_crisis$log,
spearman_xgb = crisis_xgb$spearman,
spearman_log = crisis_log$spearman,
rank_matrix_xgb = crisis_xgb$rank_matrix,
rank_matrix_log = crisis_log$rank_matrix,
feature_stability_xgb = fs_crisis_xgb,
feature_stability_log = fs_crisis_log
),
cross_regime = list(
spearman_xgb = cross_xgb,
spearman_log = cross_log
),
summary_table = summary_table,
n_boot = N_BOOT,
base_var_map = base_var_map
), file.path(OUTPUT_DIR, "regime_stability_results.rds"))
# --- Boxplots for Feature Rankings Stability ---
results <- readRDS(file.path(OUTPUT_DIR, "regime_stability_results.rds"))
N_BOOT <- results$n_boot
make_rank_boxplot <- function(rank_mat, title_text, fill_color, n_boot) {
long <- data.table(
feature = rep(rownames(rank_mat), n_boot),
rank = as.vector(rank_mat)
)
mean_ranks <- long[, .(mean_rank = mean(rank)), by = feature]
long <- merge(long, mean_ranks, by = "feature")
ordered_features <- mean_ranks[order(mean_rank)]$feature
long[, feature := factor(gsub("_woe$", "", gsub("_", " ", feature)),
levels = gsub("_woe$", "", gsub("_", " ", ordered_features)))]
ggplot(long, aes(rank, feature)) +
geom_boxplot(fill = fill_color, alpha = 0.6) +
labs(title = title_text, x = "Rank", y = NULL) +
theme_minimal(base_size = 11) +
theme(panel.grid.major.y = element_blank(),
panel.grid.minor = element_blank())
}
p1 <- make_rank_boxplot(results$pre_crisis$rank_matrix_xgb, "XGBoost – Pre-crisis", "#4A90D9", N_BOOT)
p2 <- make_rank_boxplot(results$crisis_recovery$rank_matrix_xgb, "XGBoost – Crisis/recovery", "#2E5F8A", N_BOOT)
p3 <- make_rank_boxplot(results$pre_crisis$rank_matrix_log, "Logistic – Pre-crisis", "#E8855E", N_BOOT)
p4 <- make_rank_boxplot(results$crisis_recovery$rank_matrix_log, "Logistic – Crisis/recovery", "#B85A3A", N_BOOT)
ggsave(file.path(OUTPUT_DIR, "fig_regime_stability_ranks.png"),
arrangeGrob(p1, p2, p3, p4, ncol = 2, nrow = 2),
width = 14, height = 14, dpi = 300)