Both models evaluated, with stratification by predicted PD tier
library(stargazer)
library(data.table)
library(xgboost)
library(glmnet)
library(SHAPforxgboost)
library(FNN)
library(ggplot2)
library(gridExtra)
OUTPUT_DIR <- "/Users/amalianimeskern/Library/CloudStorage/OneDrive-ErasmusUniversityRotterdam/Freddie Mac Data"
# --- Load models and test data ---
xgb_model <- readRDS(file.path(OUTPUT_DIR, "xgb_model.rds"))
logistic_model <- readRDS(file.path(OUTPUT_DIR, "logistic_model.rds"))
best_lambda <- readRDS(file.path(OUTPUT_DIR, "best_lambda.rds"))
test_xgb <- readRDS(file.path(OUTPUT_DIR, "test_xgb.rds"))
test_woe <- readRDS(file.path(OUTPUT_DIR, "test_woe.rds"))
train_woe <- readRDS(file.path(OUTPUT_DIR, "train_woe.rds"))
# --- Feature names ---
xgb_features <- setdiff(names(test_xgb),
c("loan_sequence_number", "monthly_reporting_period",
"default_next_12m"))
woe_features <- setdiff(names(test_woe),
c("loan_sequence_number", "monthly_reporting_period",
"default_next_12m"))
# --- Test matrices ---
X_test_xgb <- as.matrix(test_xgb[, ..xgb_features])
X_test_woe <- as.matrix(test_woe[, ..woe_features])
# --- Sample 1,000 test observations ---
set.seed(42)
cp_idx <- sample(nrow(X_test_xgb), 1000)
# --- Standardize feature space for nearest-neighbor search ---
xgb_means <- colMeans(X_test_xgb, na.rm = TRUE)
xgb_sds <- apply(X_test_xgb, 2, sd, na.rm = TRUE)
xgb_sds[xgb_sds == 0] <- 1
X_test_std <- sweep(X_test_xgb, 2, xgb_means, "-")
X_test_std <- sweep(X_test_std, 2, xgb_sds, "/")
X_test_std[is.na(X_test_std)] <- 0
# --- Find k=5 nearest neighbours for each sampled observation ---
nn_result <- get.knnx(X_test_std, X_test_std[cp_idx, ], k = 6)
nn_indices <- nn_result$nn.index[, 2:6] # remove self-neighbour
# --- Compute XGBoost SHAP for sampled observations + neighbours ---
all_idx <- unique(c(cp_idx, as.vector(nn_indices)))
X_all <- X_test_xgb[all_idx, , drop = FALSE]
shap_all_xgb <- shap.values(xgb_model = xgb_model, X_train = X_all)
shap_mat_xgb <- as.matrix(shap_all_xgb$shap_score)
# --- Lookup: original test row index -> row position in shap_mat_xgb ---
idx_lookup <- match(1:nrow(X_test_xgb), all_idx) # Map full test-set row indices to their position in the SHAP matrix
# --- Compute logistic SHAP for full test set ---
X_train_woe <- as.matrix(train_woe[, ..woe_features])
train_means <- colMeans(X_train_woe, na.rm = TRUE)
coefs <- as.numeric(coef(logistic_model, s = best_lambda))[-1]
names(coefs) <- woe_features
shap_all_log <- sweep(X_test_woe, 2, train_means, "-")
shap_all_log <- sweep(shap_all_log, 2, coefs, "*")
# --- Compute Pearson correlation and MAD for each pair ---
n_sample <- length(cp_idx)
k <- 5
pearson_xgb <- numeric(n_sample * k)
mad_xgb <- numeric(n_sample * k)
pearson_log <- numeric(n_sample * k)
mad_log <- numeric(n_sample * k)
counter <- 1
for (i in seq_len(n_sample)) {
focal_row_xgb <- idx_lookup[cp_idx[i]]
shap_focal_xgb <- shap_mat_xgb[focal_row_xgb, ]
shap_focal_log <- shap_all_log[cp_idx[i], ]
for (j in seq_len(k)) {
neighbor_orig_idx <- nn_indices[i, j]
neighbor_row_xgb <- idx_lookup[neighbor_orig_idx]
shap_neighbor_xgb <- shap_mat_xgb[neighbor_row_xgb, ]
shap_neighbor_log <- shap_all_log[neighbor_orig_idx, ]
pearson_xgb[counter] <- cor(shap_focal_xgb, shap_neighbor_xgb, method = "pearson")
pearson_log[counter] <- cor(shap_focal_log, shap_neighbor_log, method = "pearson")
mad_xgb[counter] <- mean(abs(shap_focal_xgb - shap_neighbor_xgb))
mad_log[counter] <- mean(abs(shap_focal_log - shap_neighbor_log))
counter <- counter + 1
}
}
# --- Results ---
results_dt <- data.table(
pearson_xgb = pearson_xgb,
pearson_log = pearson_log,
mad_xgb = mad_xgb,
mad_log = mad_log
)
summary_pearson <- data.table(
model = rep(c("XGBoost", "Logistic"), each = 6),
metric = rep(c("Mean", "Median", "SD", "Min", "Q25", "Q75"), 2),
value = c(
mean(pearson_xgb, na.rm = TRUE), median(pearson_xgb, na.rm = TRUE),
sd(pearson_xgb, na.rm = TRUE), min(pearson_xgb, na.rm = TRUE),
quantile(pearson_xgb, 0.25, na.rm = TRUE),
quantile(pearson_xgb, 0.75, na.rm = TRUE),
mean(pearson_log, na.rm = TRUE), median(pearson_log, na.rm = TRUE),
sd(pearson_log, na.rm = TRUE), min(pearson_log, na.rm = TRUE),
quantile(pearson_log, 0.25, na.rm = TRUE),
quantile(pearson_log, 0.75, na.rm = TRUE)
)
)
summary_mad <- data.table(
model = rep(c("XGBoost", "Logistic"), each = 6),
metric = rep(c("Mean", "Median", "SD", "Min", "Q25", "Q75"), 2),
value = c(
mean(mad_xgb), median(mad_xgb), sd(mad_xgb), min(mad_xgb),
quantile(mad_xgb, 0.25), quantile(mad_xgb, 0.75),
mean(mad_log), median(mad_log), sd(mad_log), min(mad_log),
quantile(mad_log, 0.25), quantile(mad_log, 0.75)
)
)
print(summary_pearson)
## model metric value
## <char> <char> <num>
## 1: XGBoost Mean 0.97079802
## 2: XGBoost Median 0.99637734
## 3: XGBoost SD 0.08538868
## 4: XGBoost Min 0.16416672
## 5: XGBoost Q25 0.98454173
## 6: XGBoost Q75 0.99938067
## 7: Logistic Mean 0.94971643
## 8: Logistic Median 1.00000000
## 9: Logistic SD 0.11663431
## 10: Logistic Min -0.33358765
## 11: Logistic Q25 0.96774558
## 12: Logistic Q75 1.00000000
print(summary_mad)
## model metric value
## <char> <char> <num>
## 1: XGBoost Mean 1.460311e-02
## 2: XGBoost Median 9.142346e-03
## 3: XGBoost SD 1.800892e-02
## 4: XGBoost Min 9.219526e-05
## 5: XGBoost Q25 3.904766e-03
## 6: XGBoost Q75 1.702605e-02
## 7: Logistic Mean 1.078940e-02
## 8: Logistic Median 0.000000e+00
## 9: Logistic SD 1.691215e-02
## 10: Logistic Min 0.000000e+00
## 11: Logistic Q25 0.000000e+00
## 12: Logistic Q75 1.367226e-02
# Predicted PD for sampled focals
pred_xgb_sample <- predict(xgb_model, X_test_xgb[cp_idx, , drop = FALSE])
# --- Stratify by predicted PD tier ---
# Split sampled borrowers into low, medium, and high predicted-risk groups
pd_tier <- cut(
pred_xgb_sample,
breaks = quantile(pred_xgb_sample, probs = c(0, 1/3, 2/3, 1), na.rm = TRUE),
labels = c("Low PD", "Medium PD", "High PD"),
include.lowest = TRUE
)
results_dt[, pd_tier := rep(pd_tier, each = k)]
summary_by_tier <- results_dt[, .(
`Mean r (XGB)` = mean(pearson_xgb, na.rm = TRUE),
`SD r (XGB)` = sd(pearson_xgb, na.rm = TRUE),
`Min r (XGB)` = min(pearson_xgb, na.rm = TRUE),
`Mean r (Log)` = mean(pearson_log, na.rm = TRUE),
`SD r (Log)` = sd(pearson_log, na.rm = TRUE),
`Min r (Log)` = min(pearson_log, na.rm = TRUE),
`MAD (XGB)` = mean(mad_xgb),
`MAD (Log)` = mean(mad_log),
`N` = .N
), by = .(`PD Tier` = pd_tier)][order(`PD Tier`)]
print(summary_by_tier)
## PD Tier Mean r (XGB) SD r (XGB) Min r (XGB) Mean r (Log) SD r (Log)
## <fctr> <num> <num> <num> <num> <num>
## 1: Low PD 0.9819241 0.05928704 0.4483380 0.9510539 0.09985858
## 2: Medium PD 0.9697162 0.08325366 0.1641667 0.9437852 0.13506016
## 3: High PD 0.9607203 0.10592751 0.1965535 0.9543061 0.11210216
## Min r (Log) MAD (XGB) MAD (Log) N
## <num> <num> <num> <int>
## 1: 0.2438343 0.01676931 0.013382055 1670
## 2: -0.3335876 0.01382369 0.010072884 1665
## 3: 0.1700866 0.01320982 0.008905463 1665
stargazer(as.data.frame(summary_by_tier),
type = "html", summary = FALSE, rownames = FALSE,
title = "Cross-Profile Consistency by Predicted PD Tier",
label = "tab:crossprofile_by_tier",
notes = "Tertiles based on XGBoost predicted PD across 1,000 focal borrowers; k = 5 neighbours each.",
out = file.path(OUTPUT_DIR, "crossprofile_by_tier.html"))
##
## <table style="text-align:center"><caption><strong>Cross-Profile Consistency by Predicted PD Tier</strong></caption>
## <tr><td colspan="10" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">PD Tier</td><td>Mean r (XGB)</td><td>SD r (XGB)</td><td>Min r (XGB)</td><td>Mean r (Log)</td><td>SD r (Log)</td><td>Min r (Log)</td><td>MAD (XGB)</td><td>MAD (Log)</td><td>N</td></tr>
## <tr><td colspan="10" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">Low PD</td><td>0.982</td><td>0.059</td><td>0.448</td><td>0.951</td><td>0.100</td><td>0.244</td><td>0.017</td><td>0.013</td><td>1,670</td></tr>
## <tr><td style="text-align:left">Medium PD</td><td>0.970</td><td>0.083</td><td>0.164</td><td>0.944</td><td>0.135</td><td>-0.334</td><td>0.014</td><td>0.010</td><td>1,665</td></tr>
## <tr><td style="text-align:left">High PD</td><td>0.961</td><td>0.106</td><td>0.197</td><td>0.954</td><td>0.112</td><td>0.170</td><td>0.013</td><td>0.009</td><td>1,665</td></tr>
## <tr><td colspan="10" style="border-bottom: 1px solid black"></td></tr><tr><td colspan="10" style="text-align:left">Tertiles based on XGBoost predicted PD across 1,000 focal borrowers; k = 5 neighbours each.</td></tr>
## </table>
# --- Save results ---
saveRDS(list(
results_dt = results_dt,
summary_pearson = summary_pearson,
summary_mad = summary_mad,
summary_by_tier = summary_by_tier,
n_sample = n_sample,
k = k
), file.path(OUTPUT_DIR, "crossprofile_results.rds"))
# --- Table with focal, 1st and 5th nearest neighbour ---
# based on low-, median-, and high-risk examples from XGBoost predictions
example_positions <- c(
which.min(pred_xgb_sample),
which.min(abs(pred_xgb_sample - median(pred_xgb_sample))),
which.max(pred_xgb_sample)
)
example_labels <- c("Low risk", "Medium risk", "High risk")
# Tables for both models
example_tables <- vector("list", length(example_positions) * 2)
idx <- 1
for (e in seq_along(example_positions)) {
pos <- example_positions[e]
focal_orig <- cp_idx[pos]
nn1_orig <- nn_indices[pos, 1]
nn5_orig <- nn_indices[pos, 5]
# XGBoost
focal_shap_xgb <- shap_mat_xgb[idx_lookup[focal_orig], ]
nn1_shap_xgb <- shap_mat_xgb[idx_lookup[nn1_orig], ]
nn5_shap_xgb <- shap_mat_xgb[idx_lookup[nn5_orig], ]
top5_xgb <- order(-abs(focal_shap_xgb))[1:5]
feat_names_xgb <- if (!is.null(colnames(shap_mat_xgb))) {
colnames(shap_mat_xgb)[top5_xgb]
} else {
xgb_features[top5_xgb]
}
example_tables[[idx]] <- data.table(
model = "XGBoost",
risk_group = example_labels[e],
feature = feat_names_xgb,
shap_focal = round(focal_shap_xgb[top5_xgb], 4),
shap_nn1 = round(nn1_shap_xgb[top5_xgb], 4),
shap_nn5 = round(nn5_shap_xgb[top5_xgb], 4),
pearson_nn1 = round(cor(focal_shap_xgb, nn1_shap_xgb, method = "pearson"), 4),
pearson_nn5 = round(cor(focal_shap_xgb, nn5_shap_xgb, method = "pearson"), 4)
)
idx <- idx + 1
# Logistic
focal_shap_log <- shap_all_log[focal_orig, ]
nn1_shap_log <- shap_all_log[nn1_orig, ]
nn5_shap_log <- shap_all_log[nn5_orig, ]
top5_log <- order(-abs(focal_shap_log))[1:5]
feat_names_log <- woe_features[top5_log]
example_tables[[idx]] <- data.table(
model = "Logistic",
risk_group = example_labels[e],
feature = feat_names_log,
shap_focal = round(focal_shap_log[top5_log], 4),
shap_nn1 = round(nn1_shap_log[top5_log], 4),
shap_nn5 = round(nn5_shap_log[top5_log], 4),
pearson_nn1 = round(cor(focal_shap_log, nn1_shap_log, method = "pearson"), 4),
pearson_nn5 = round(cor(focal_shap_log, nn5_shap_log, method = "pearson"), 4)
)
idx <- idx + 1
}
example_comparison <- rbindlist(example_tables)
print(example_comparison)
## model risk_group feature shap_focal shap_nn1 shap_nn5
## <char> <char> <char> <num> <num> <num>
## 1: XGBoost Low risk orig_interest_rate -1.7299 -1.7269 -1.7451
## 2: XGBoost Low risk credit_score -1.4163 -1.4274 -1.4052
## 3: XGBoost Low risk orig_ltv -0.9970 -0.9958 -1.0123
## 4: XGBoost Low risk orig_dti -0.5115 -0.5115 -0.4886
## 5: XGBoost Low risk orig_loan_term -0.3839 -0.3817 -0.4075
## 6: Logistic Low risk orig_interest_rate_woe -0.9959 -0.9959 -0.9959
## 7: Logistic Low risk credit_score_woe -0.8100 -0.8100 -0.8100
## 8: Logistic Low risk orig_ltv_woe -0.3969 -0.3969 -0.3969
## 9: Logistic Low risk orig_loan_term_woe -0.3698 -0.3698 -0.3698
## 10: Logistic Low risk orig_dti_woe -0.3436 -0.3436 -0.3436
## 11: XGBoost Medium risk orig_ltv 0.4271 0.4282 0.4087
## 12: XGBoost Medium risk channel_C -0.3682 -0.3796 -0.3743
## 13: XGBoost Medium risk num_borrowers -0.3275 -0.3320 -0.3250
## 14: XGBoost Medium risk loan_purpose_P -0.2923 -0.2924 -0.2823
## 15: XGBoost Medium risk orig_dti -0.2044 -0.2078 -0.2158
## 16: Logistic Medium risk orig_ltv_woe 0.4674 0.4674 0.4674
## 17: Logistic Medium risk credit_score_woe 0.3280 0.3280 0.3280
## 18: Logistic Medium risk num_borrowers_woe -0.2475 -0.2475 -0.2475
## 19: Logistic Medium risk current_upb_woe 0.2216 0.2216 0.2216
## 20: Logistic Medium risk channel_woe -0.1922 -0.1922 -0.1922
## 21: XGBoost High risk credit_score 1.6070 1.1216 1.0779
## 22: XGBoost High risk loan_age 1.2523 -0.1913 -0.2152
## 23: XGBoost High risk current_upb 0.5971 0.5530 0.5402
## 24: XGBoost High risk num_borrowers -0.4153 -0.2616 -0.2721
## 25: XGBoost High risk orig_dti 0.2526 0.2120 0.2090
## 26: Logistic High risk credit_score_woe 0.8839 0.8839 0.8839
## 27: Logistic High risk loan_age_woe 0.4633 0.4633 -0.3854
## 28: Logistic High risk orig_dti_woe 0.2722 0.2722 0.2722
## 29: Logistic High risk num_borrowers_woe -0.2475 -0.2475 -0.2475
## 30: Logistic High risk current_upb_woe 0.2216 0.2216 0.2216
## model risk_group feature shap_focal shap_nn1 shap_nn5
## pearson_nn1 pearson_nn5
## <num> <num>
## 1: 1.0000 0.9993
## 2: 1.0000 0.9993
## 3: 1.0000 0.9993
## 4: 1.0000 0.9993
## 5: 1.0000 0.9993
## 6: 1.0000 0.9921
## 7: 1.0000 0.9921
## 8: 1.0000 0.9921
## 9: 1.0000 0.9921
## 10: 1.0000 0.9921
## 11: 0.9996 0.9968
## 12: 0.9996 0.9968
## 13: 0.9996 0.9968
## 14: 0.9996 0.9968
## 15: 0.9996 0.9968
## 16: 1.0000 0.9499
## 17: 1.0000 0.9499
## 18: 1.0000 0.9499
## 19: 1.0000 0.9499
## 20: 1.0000 0.9499
## 21: 0.7065 0.6813
## 22: 0.7065 0.6813
## 23: 0.7065 0.6813
## 24: 0.7065 0.6813
## 25: 0.7065 0.6813
## 26: 1.0000 0.5800
## 27: 1.0000 0.5800
## 28: 1.0000 0.5800
## 29: 1.0000 0.5800
## 30: 1.0000 0.5800
## pearson_nn1 pearson_nn5
# --- Visual ---
for (mod in c("XGBoost", "Logistic")) {
sub_comp <- example_comparison[model == mod]
plot_list <- vector("list", length(example_labels))
names(plot_list) <- example_labels
for (rg in example_labels) {
sub <- sub_comp[risk_group == rg]
sub_long <- melt(
sub,
id.vars = c("model", "risk_group", "feature"),
measure.vars = c("shap_focal", "shap_nn1", "shap_nn5"),
variable.name = "borrower",
value.name = "shap"
)
sub_long[, borrower := factor(
borrower,
levels = c("shap_focal", "shap_nn1", "shap_nn5"),
labels = c("Focal", "Nearest (k=1)", "Farthest (k=5)")
)]
sub_long[, feature := factor(feature, levels = rev(sub$feature))]
pd_label <- paste0(
"r(k=1) = ", sub$pearson_nn1[1],
" | r(k=5) = ", sub$pearson_nn5[1]
)
p <- ggplot(sub_long, aes(x = shap, y = feature, fill = borrower)) +
geom_col(position = position_dodge(width = 0.7), width = 0.6) +
scale_fill_manual(values = c(
"Focal" = "#D32F2F",
"Nearest (k=1)" = "#1976D2",
"Farthest (k=5)" = "#90CAF9"
)) +
labs(
title = rg,
subtitle = pd_label,
x = "SHAP Value",
y = NULL,
fill = NULL
) +
theme_classic() +
theme(
plot.title = element_text(size = 11, face = "bold"),
plot.subtitle = element_text(size = 9, color = "grey40"),
legend.position = "bottom"
)
plot_list[[rg]] <- p
}
p_combined <- grid.arrange(
grobs = plot_list,
ncol = 3,
top = grid::textGrob(
paste0("Cross-Profile Consistency – ", mod),
gp = grid::gpar(fontsize = 13, fontface = "bold")
)
)
filename <- paste0("crossprofile_examples_", tolower(mod), ".pdf")
ggsave(
file.path(OUTPUT_DIR, filename),
p_combined,
width = 18,
height = 6
)
}


# --- Combined summary table ---
appendix_table <- data.frame(
Metric = c("Mean", "Median", "SD", "Min", "Q25", "Q75"),
Pearson_XGBoost = summary_pearson[model == "XGBoost", value],
Pearson_Logistic = summary_pearson[model == "Logistic", value],
MAD_XGBoost = summary_mad[model == "XGBoost", value],
MAD_Logistic = summary_mad[model == "Logistic", value]
)
stargazer(appendix_table,
type = "html", summary = FALSE, rownames = FALSE,
title = "Cross-Profile Consistency: Summary Statistics",
label = "tab:crossprofile_summary",
notes = "Based on 1,000 sampled observations with k = 5 nearest neighbours each.",
out = file.path(OUTPUT_DIR, "crossprofile_summary_table.html"))
##
## <table style="text-align:center"><caption><strong>Cross-Profile Consistency: Summary Statistics</strong></caption>
## <tr><td colspan="5" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">Metric</td><td>Pearson_XGBoost</td><td>Pearson_Logistic</td><td>MAD_XGBoost</td><td>MAD_Logistic</td></tr>
## <tr><td colspan="5" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">Mean</td><td>0.971</td><td>0.950</td><td>0.015</td><td>0.011</td></tr>
## <tr><td style="text-align:left">Median</td><td>0.996</td><td>1</td><td>0.009</td><td>0</td></tr>
## <tr><td style="text-align:left">SD</td><td>0.085</td><td>0.117</td><td>0.018</td><td>0.017</td></tr>
## <tr><td style="text-align:left">Min</td><td>0.164</td><td>-0.334</td><td>0.0001</td><td>0</td></tr>
## <tr><td style="text-align:left">Q25</td><td>0.985</td><td>0.968</td><td>0.004</td><td>0</td></tr>
## <tr><td style="text-align:left">Q75</td><td>0.999</td><td>1</td><td>0.017</td><td>0.014</td></tr>
## <tr><td colspan="5" style="border-bottom: 1px solid black"></td></tr><tr><td colspan="5" style="text-align:left">Based on 1,000 sampled observations with k = 5 nearest neighbours each.</td></tr>
## </table>
# --- Loan age pattern check ---
# Uses the SHAP values already computed above
loan_age_vals <- as.numeric(X_all[, "loan_age"])
loan_age_shaps <- as.numeric(shap_mat_xgb[, "loan_age"])
print(table(loan_age_vals))
## loan_age_vals
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
## 125 150 175 199 207 211 204 207 216 192 192 178 167 157 156 158 154 152 168 171
## 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39
## 141 144 131 118 107 107 110 99 99 97 98 89 71 59 52 57 63 62 50 47
## 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
## 45 41 37 39 36 34 37 43 34 33 23 23 20 21 20 20 14 18 15 18
## 60 61 62 63 64 65 66 67
## 14 11 8 12 11 8 4 3
for (v in c(0, 1, 2, 3, 4)) {
mask <- loan_age_vals == v
if (sum(mask) > 0) {
cat(sprintf(" loan_age = %d (n=%d): mean = %.4f, sd = %.4f\n",
v, sum(mask), mean(loan_age_shaps[mask]), sd(loan_age_shaps[mask])))
}
}
## loan_age = 0 (n=125): mean = 0.7184, sd = 0.2311
## loan_age = 1 (n=150): mean = -0.2856, sd = 0.1216
## loan_age = 2 (n=175): mean = -0.3265, sd = 0.1109
## loan_age = 3 (n=199): mean = -0.3485, sd = 0.1100
## loan_age = 4 (n=207): mean = -0.3625, sd = 0.1010
mask <- loan_age_vals >= 5 & loan_age_vals <= 10
if (sum(mask) > 0) {
cat(sprintf(" loan_age 5-10 (n=%d): mean = %.4f, sd = %.4f\n",
sum(mask), mean(loan_age_shaps[mask]), sd(loan_age_shaps[mask])))
}
## loan_age 5-10 (n=1222): mean = -0.2088, sd = 0.1060