Robustness 1 : global stability

Global stability: bootstrap re-estimation of feature importance rankings

50 iterations, fixed hyperparameters, Spearman rank correlation

With time-based split

library(data.table)
library(xgboost)
library(glmnet)
library(SHAPforxgboost)
library(stargazer)
library(ggplot2)
library(gridExtra)

OUTPUT_DIR <- "/Users/amalianimeskern/Library/CloudStorage/OneDrive-ErasmusUniversityRotterdam/Freddie Mac Data"

# --- Load data ---
train_xgb   <- readRDS(file.path(OUTPUT_DIR, "train_xgb.rds"))
train_woe   <- readRDS(file.path(OUTPUT_DIR, "train_woe.rds"))
train_raw   <- readRDS(file.path(OUTPUT_DIR, "train.rds"))
test_xgb    <- readRDS(file.path(OUTPUT_DIR, "test_xgb.rds"))
test_woe    <- readRDS(file.path(OUTPUT_DIR, "test_woe.rds"))
best_params <- readRDS(file.path(OUTPUT_DIR, "best_hyperparams.rds"))
xgb_model   <- readRDS(file.path(OUTPUT_DIR, "xgb_model.rds"))
best_lambda <- readRDS(file.path(OUTPUT_DIR, "best_lambda.rds"))

# --- Feature names ---
drop_cols    <- c("loan_sequence_number", "monthly_reporting_period",
                  "default_next_12m")
xgb_features <- setdiff(names(train_xgb), drop_cols)
woe_features <- setdiff(names(train_woe), drop_cols)

# --- Map base variable for XGBoost dummy aggregation ---
base_var_map <- sapply(xgb_features, function(f) {
  woe_base <- gsub("_woe$", "", woe_features)
  for (i in seq_along(woe_base)) {
    if (f == woe_base[i] || grepl(paste0("^", woe_base[i], "_"), f)) {
      return(woe_base[i])
    }
  }
  return(f)
}, USE.NAMES = TRUE)


# --- Fixed XGBoost parameters ---
fixed_params <- list(
  objective        = "binary:logistic",
  eval_metric      = "auc",
  eta              = best_params$eta,
  max_depth        = best_params$max_depth,
  subsample        = best_params$subsample,
  colsample_bytree = best_params$colsample_bytree,
  nthread          = 2
)
best_nrounds <- xgb_model$niter

# --- Test matrices (fixed across all iterations) ---
X_test_xgb <- as.matrix(test_xgb[, ..xgb_features])
X_test_woe <- as.matrix(test_woe[, ..woe_features])

# --- Fixed SHAP subsample ---
set.seed(42)
shap_idx   <- sample(nrow(X_test_xgb), min(3000, nrow(X_test_xgb)))
X_shap_xgb <- X_test_xgb[shap_idx, ]

# --- Extract observation year and assign regime ---
train_raw[, obs_year := as.integer(substr(as.character(monthly_reporting_period), 1, 4))]
stopifnot(nrow(train_raw) == nrow(train_xgb))

# Pre-crisis: 2006-2007, Crisis/recovery: 2008-2010 (2011 excluded as observations after Dec 2011 are already dropped )
train_raw[, regime := fifelse(obs_year <= 2007, "pre_crisis", "crisis_recovery")]
print(train_raw[, .N, by = regime])

# --- Build loan-to-row index mapping per regime ---
train_xgb[, .row_id := .I]
train_woe[, .row_id := .I]
train_raw[, .row_id := .I]

# Full matrices (needed for bootstrap row subsetting)
X_train_full_xgb <- as.matrix(train_xgb[, ..xgb_features])
y_train_full_xgb <- train_xgb$default_next_12m
X_train_full_woe <- as.matrix(train_woe[, ..woe_features])
y_train_full_woe <- train_woe$default_next_12m

# Per-regime unique loan IDs and their row indices
build_regime_index <- function(regime_label) {
  regime_rows <- train_raw[regime == regime_label, .row_id]
  regime_loans <- train_raw[regime == regime_label,
                            .(rows = list(.row_id)),
                            by = loan_sequence_number]
  list(
    loan_ids   = regime_loans$loan_sequence_number,
    row_lookup = setNames(regime_loans$rows, regime_loans$loan_sequence_number),
    n_loans    = nrow(regime_loans),
    n_obs      = length(regime_rows)
  )
}

regime_pre   <- build_regime_index("pre_crisis")
regime_crisis <- build_regime_index("crisis_recovery")


# Memory Offload
rm(train_xgb, train_woe, train_raw, test_xgb, test_woe, xgb_model)
gc()

# --- Bootstrap function for one regime ---
run_regime_bootstrap <- function(regime_info, regime_name, N_BOOT = 50) {
  
  results_xgb <- vector("list", N_BOOT)
  results_log <- vector("list", N_BOOT)
  
  regime_start <- Sys.time()
  
  for (b in 1:N_BOOT) {
    iter_start <- Sys.time()

    # Resample loan IDs with replacement within this regime
    set.seed(1000 * ifelse(regime_name == "Pre-crisis", 1, 2) + b)
    boot_loan_ids <- sample(regime_info$loan_ids,
                            regime_info$n_loans, replace = TRUE)
    boot_rows <- unlist(regime_info$row_lookup[boot_loan_ids], use.names = FALSE)
    
    # XGBoost
    dtrain_boot <- xgb.DMatrix(
      data  = X_train_full_xgb[boot_rows, ],
      label = y_train_full_xgb[boot_rows]
    )
    
    xgb_boot <- xgb.train(
      params  = fixed_params,
      data    = dtrain_boot,
      nrounds = best_nrounds,
      verbose = 0
    )
    
    shap_boot      <- shap.values(xgb_model = xgb_boot, X_train = X_shap_xgb)
    mean_shap_boot <- shap_boot$mean_shap_score
    
    shap_raw <- data.table(
      feature       = names(mean_shap_boot),
      mean_abs_shap = as.numeric(mean_shap_boot)
    )
    shap_raw[, base_var := base_var_map[feature]]
    shap_agg <- shap_raw[, .(mean_abs_shap = sum(mean_abs_shap)), by = base_var]
    setnames(shap_agg, "base_var", "feature")
    shap_agg <- shap_agg[order(-mean_abs_shap)]
    shap_agg[, rank := .I]
    results_xgb[[b]] <- shap_agg
    
    # Logistic regression
    X_boot_woe <- X_train_full_woe[boot_rows, ]
    y_boot     <- y_train_full_woe[boot_rows]
    
    log_boot <- glmnet(X_boot_woe, y_boot, family = "binomial",
                       alpha = 0, lambda = best_lambda)
    
    coefs_boot <- as.numeric(coef(log_boot, s = best_lambda))[-1]
    names(coefs_boot) <- woe_features
    boot_means <- colMeans(X_boot_woe, na.rm = TRUE)
    
    shap_log_boot <- sweep(X_test_woe, 2, boot_means, "-")
    shap_log_boot <- sweep(shap_log_boot, 2, coefs_boot, "*")
    
    mean_shap_log_boot <- sort(colMeans(abs(shap_log_boot)), decreasing = TRUE)
    
    importance_boot_log <- data.table(
      feature       = names(mean_shap_log_boot),
      mean_abs_shap = as.numeric(mean_shap_log_boot)
    )[order(-mean_abs_shap)]
    importance_boot_log[, rank := .I]
    results_log[[b]] <- importance_boot_log
    
    # Clean up
    rm(dtrain_boot, xgb_boot, shap_boot, log_boot,
       X_boot_woe, shap_log_boot)
    gc(verbose = FALSE)
    
  }
  

  list(xgb = results_xgb, log = results_log)
}

# Run
N_BOOT <- 50
total_start <- Sys.time()

res_pre    <- run_regime_bootstrap(regime_pre,    "Pre-crisis",      N_BOOT)
res_crisis <- run_regime_bootstrap(regime_crisis, "Crisis/recovery", N_BOOT)



# --- Analysis---

# Rank matrix to compute pairwise Spearman
compute_spearman <- function(results_list) {
  rank_mat <- do.call(cbind, lapply(results_list, function(dt) {
    dt[order(feature)]$rank
  }))
  rownames(rank_mat) <- results_list[[1]][order(feature)]$feature
  
  n <- ncol(rank_mat)
  n_pairs <- n * (n - 1) / 2
  rhos <- numeric(n_pairs)
  k <- 1
  for (i in 1:(n - 1)) {
    for (j in (i + 1):n) {
      rhos[k] <- cor(rank_mat[, i], rank_mat[, j], method = "spearman")
      k <- k + 1
    }
  }
  list(rank_matrix = rank_mat, spearman = rhos)
}

# Within-regime pairwise Spearman
pre_xgb    <- compute_spearman(res_pre$xgb)
pre_log    <- compute_spearman(res_pre$log)
crisis_xgb <- compute_spearman(res_crisis$xgb)
crisis_log <- compute_spearman(res_crisis$log)

# Cross-regime Spearman:
# Compare each pre-crisis iteration to each crisis iteration
cross_regime_spearman <- function(results_pre, results_crisis) {
  # Using mean rank per feature in each regime iteration
  rhos <- numeric(N_BOOT * N_BOOT)
  k <- 1
  
  ranks_pre <- do.call(cbind, lapply(results_pre, function(dt) {
    dt[order(feature)]$rank
  }))
  ranks_crisis <- do.call(cbind, lapply(results_crisis, function(dt) {
    dt[order(feature)]$rank
  }))
  
  for (i in 1:N_BOOT) {
    for (j in 1:N_BOOT) {
      rhos[k] <- cor(ranks_pre[, i], ranks_crisis[, j], method = "spearman")
      k <- k + 1
    }
  }
  rhos
}

cross_xgb <- cross_regime_spearman(res_pre$xgb, res_crisis$xgb)
cross_log <- cross_regime_spearman(res_pre$log, res_crisis$log)

# Per-feature stability per regime
feature_stability <- function(rank_mat) {
  data.table(
    feature     = rownames(rank_mat),
    mean_rank   = apply(rank_mat, 1, mean),
    median_rank = apply(rank_mat, 1, median),
    sd_rank     = apply(rank_mat, 1, sd),
    min_rank    = apply(rank_mat, 1, min),
    max_rank    = apply(rank_mat, 1, max),
    range       = apply(rank_mat, 1, max) - apply(rank_mat, 1, min)
  )[order(mean_rank)]
}

fs_pre_xgb    <- feature_stability(pre_xgb$rank_matrix)
fs_pre_log    <- feature_stability(pre_log$rank_matrix)
fs_crisis_xgb <- feature_stability(crisis_xgb$rank_matrix)
fs_crisis_log  <- feature_stability(crisis_log$rank_matrix)

# --- Summary table ---
summary_table <- data.table(
  Model  = rep(c("XGBoost", "Logistic Regression"), 3),
  Regime = rep(c("Pre-crisis", "Crisis/recovery", "Cross-regime"), each = 2),
  Mean   = round(c(mean(pre_xgb$spearman),    mean(pre_log$spearman),
                   mean(crisis_xgb$spearman), mean(crisis_log$spearman),
                   mean(cross_xgb),           mean(cross_log)), 4),
  Median = round(c(median(pre_xgb$spearman),    median(pre_log$spearman),
                   median(crisis_xgb$spearman), median(crisis_log$spearman),
                   median(cross_xgb),           median(cross_log)), 4),
  SD     = round(c(sd(pre_xgb$spearman),    sd(pre_log$spearman),
                   sd(crisis_xgb$spearman), sd(crisis_log$spearman),
                   sd(cross_xgb),           sd(cross_log)), 4),
  Min    = round(c(min(pre_xgb$spearman),    min(pre_log$spearman),
                   min(crisis_xgb$spearman), min(crisis_log$spearman),
                   min(cross_xgb),           min(cross_log)), 4),
  Max    = round(c(max(pre_xgb$spearman),    max(pre_log$spearman),
                   max(crisis_xgb$spearman), max(crisis_log$spearman),
                   max(cross_xgb),           max(cross_log)), 4)
)

print(summary_table)

stargazer(as.data.frame(summary_table),
          summary = FALSE, rownames = FALSE, type = "html",
          title = "Regime-Stratified Pairwise Spearman Rank Correlations",
          out = file.path(OUTPUT_DIR, "table_regime_stability_spearman.html"))

# --- Saving Results ---
saveRDS(list(
  pre_crisis = list(
    xgb_rankings  = res_pre$xgb,
    log_rankings  = res_pre$log,
    spearman_xgb  = pre_xgb$spearman,
    spearman_log  = pre_log$spearman,
    rank_matrix_xgb = pre_xgb$rank_matrix,
    rank_matrix_log = pre_log$rank_matrix,
    feature_stability_xgb = fs_pre_xgb,
    feature_stability_log = fs_pre_log
  ),
  crisis_recovery = list(
    xgb_rankings  = res_crisis$xgb,
    log_rankings  = res_crisis$log,
    spearman_xgb  = crisis_xgb$spearman,
    spearman_log  = crisis_log$spearman,
    rank_matrix_xgb = crisis_xgb$rank_matrix,
    rank_matrix_log = crisis_log$rank_matrix,
    feature_stability_xgb = fs_crisis_xgb,
    feature_stability_log = fs_crisis_log
  ),
  cross_regime = list(
    spearman_xgb = cross_xgb,
    spearman_log = cross_log
  ),
  summary_table  = summary_table,
  n_boot         = N_BOOT,
  base_var_map   = base_var_map
), file.path(OUTPUT_DIR, "regime_stability_results.rds"))

# --- Boxplots for Feature Rankings Stability ---
results <- readRDS(file.path(OUTPUT_DIR, "regime_stability_results.rds"))

N_BOOT <- results$n_boot

make_rank_boxplot <- function(rank_mat, title_text, fill_color, n_boot) {
  long <- data.table(
    feature = rep(rownames(rank_mat), n_boot),
    rank    = as.vector(rank_mat)
  )
  mean_ranks <- long[, .(mean_rank = mean(rank)), by = feature]
  long <- merge(long, mean_ranks, by = "feature")
  ordered_features <- mean_ranks[order(mean_rank)]$feature
  long[, feature := factor(gsub("_woe$", "", gsub("_", " ", feature)),
                           levels = gsub("_woe$", "", gsub("_", " ", ordered_features)))]
  
  ggplot(long, aes(rank, feature)) +
    geom_boxplot(fill = fill_color, alpha = 0.6) +
    labs(title = title_text, x = "Rank", y = NULL) +
    theme_minimal(base_size = 11) +
    theme(panel.grid.major.y = element_blank(),
          panel.grid.minor = element_blank())
}


p1 <- make_rank_boxplot(results$pre_crisis$rank_matrix_xgb,      "XGBoost – Pre-crisis",       "#4A90D9", N_BOOT)
p2 <- make_rank_boxplot(results$crisis_recovery$rank_matrix_xgb, "XGBoost – Crisis/recovery",  "#2E5F8A", N_BOOT)
p3 <- make_rank_boxplot(results$pre_crisis$rank_matrix_log,      "Logistic – Pre-crisis",      "#E8855E", N_BOOT)
p4 <- make_rank_boxplot(results$crisis_recovery$rank_matrix_log, "Logistic – Crisis/recovery", "#B85A3A", N_BOOT)

ggsave(file.path(OUTPUT_DIR, "fig_regime_stability_ranks.png"),
       arrangeGrob(p1, p2, p3, p4, ncol = 2, nrow = 2),
       width = 14, height = 14, dpi = 300)