Performance of Models

Test set evaluation: AUC, Gini, confidence intervals, DeLong test

library(data.table)
library(glmnet)
library(xgboost)
library(pROC)
library(knitr)
library(ggplot2)
library(kableExtra)


OUTPUT_DIR <- "/Users/amalianimeskern/Library/CloudStorage/OneDrive-ErasmusUniversityRotterdam/Freddie Mac Data"

# --- load models and test data ---

logistic_model <- readRDS(file.path(OUTPUT_DIR, "logistic_model.rds"))
best_lambda    <- readRDS(file.path(OUTPUT_DIR, "best_lambda.rds"))
xgb_model      <- readRDS(file.path(OUTPUT_DIR, "xgb_model.rds"))

test_woe <- readRDS(file.path(OUTPUT_DIR, "test_woe.rds"))
test_xgb <- readRDS(file.path(OUTPUT_DIR, "test_xgb.rds"))

# --- test matrices ---

woe_features <- setdiff(names(test_woe),
                        c("loan_sequence_number", "monthly_reporting_period",
                          "default_next_12m"))
X_test_woe <- as.matrix(test_woe[, ..woe_features])
y_test     <- test_woe$default_next_12m

xgb_features <- setdiff(names(test_xgb),
                        c("loan_sequence_number", "monthly_reporting_period",
                          "default_next_12m"))
dtest <- xgb.DMatrix(data = as.matrix(test_xgb[, ..xgb_features]),
                     label = test_xgb$default_next_12m)

# --- predictions ---

preds_logistic <- predict(logistic_model, newx = X_test_woe, s = best_lambda,
                          type = "response")[, 1]
preds_xgb      <- predict(xgb_model, dtest)
preds_nn <- readRDS(file.path(OUTPUT_DIR, "nn_test_probs_tuned.rds"))

# --- ROC curves ---

roc_logistic <- roc(y_test, preds_logistic, quiet = TRUE)
roc_xgb      <- roc(y_test, preds_xgb, quiet = TRUE)
roc_nn       <- roc(y_test, preds_nn, quiet = TRUE)

# --- AUC with 95% confidence intervals ---

ci_logistic <- ci.auc(roc_logistic, conf.level = 0.95)
ci_xgb      <- ci.auc(roc_xgb, conf.level = 0.95)
ci_nn       <- ci.auc(roc_nn, conf.level = 0.95)

# --- results table ---

results <- data.table(
  Model      = c("Logistic Regression", "XGBoost", "Neural Network"),
  AUC        = round(c(auc(roc_logistic), auc(roc_xgb), auc(roc_nn)), 4),
  AUC_lower  = round(c(ci_logistic[1], ci_xgb[1], ci_nn[1]), 4),
  AUC_upper  = round(c(ci_logistic[3], ci_xgb[3], ci_nn[3]), 4),
  Gini       = round(c(2 * auc(roc_logistic) - 1,
                       2 * auc(roc_xgb) - 1,
                       2 * auc(roc_nn) - 1), 4)
)
results

##                  Model    AUC AUC_lower AUC_upper   Gini
##                 <char>  <num>     <num>     <num>  <num>
## 1: Logistic Regression 0.8103    0.8042    0.8164 0.6207
## 2:             XGBoost 0.8499    0.8444    0.8554 0.6997
## 3:      Neural Network 0.8270    0.8206    0.8333 0.6540

# --- DeLong tests ---

delong_xgb_lr  <- roc.test(roc_xgb, roc_logistic, method = "delong")
delong_nn_lr   <- roc.test(roc_nn, roc_logistic, method = "delong")
delong_nn_xgb  <- roc.test(roc_nn, roc_xgb, method = "delong")

delong_xgb_lr

## 
##  DeLong's test for two correlated ROC curves
## 
## data:  roc_xgb and roc_logistic
## Z = 20.865, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## 95 percent confidence interval:
##  0.03580600 0.04323045
## sample estimates:
## AUC of roc1 AUC of roc2 
##   0.8498588   0.8103406

delong_nn_lr

## 
##  DeLong's test for two correlated ROC curves
## 
## data:  roc_nn and roc_logistic
## Z = 7.2559, p-value = 3.989e-13
## alternative hypothesis: true difference in AUC is not equal to 0
## 95 percent confidence interval:
##  0.01215355 0.02114929
## sample estimates:
## AUC of roc1 AUC of roc2 
##   0.8269920   0.8103406

delong_nn_xgb

## 
##  DeLong's test for two correlated ROC curves
## 
## data:  roc_nn and roc_xgb
## Z = -13.46, p-value < 2.2e-16
## alternative hypothesis: true difference in AUC is not equal to 0
## 95 percent confidence interval:
##  -0.02619653 -0.01953708
## sample estimates:
## AUC of roc1 AUC of roc2 
##   0.8269920   0.8498588

# --- save ---

saveRDS(results, file.path(OUTPUT_DIR, "performance_results.rds"))
saveRDS(list(xgb_lr = delong_xgb_lr, nn_lr = delong_nn_lr, nn_xgb = delong_nn_xgb),
        file.path(OUTPUT_DIR, "delong_tests.rds"))
saveRDS(list(logistic = preds_logistic, xgb = preds_xgb, nn = preds_nn),
        file.path(OUTPUT_DIR, "test_predictions.rds"))

# --- performance table ---

fmt_p <- function(p) ifelse(p < 0.001, "< 0.001", as.character(round(p, 4)))
fmt_ci <- function(test) {
  bounds <- round(test$conf.int, 3)
  paste0("[", bounds[1], ", ", bounds[2], "]")
}

perf_table <- data.table(
  Metric = c("AUC", "95% CI (AUC)", "Gini",
             "DeLong vs LR (Z)", "p-value",
             "95% CI (AUC diff vs LR)",
             "DeLong vs XGBoost (Z)", "p-value",
             "95% CI (AUC diff vs XGB)"),
  `Logistic Regression` = c(
    as.character(results$AUC[1]),
    paste0("[", results$AUC_lower[1], ", ", results$AUC_upper[1], "]"),
    as.character(results$Gini[1]),
    "", "", "", "", "", ""
  ),
  XGBoost = c(
    as.character(results$AUC[2]),
    paste0("[", results$AUC_lower[2], ", ", results$AUC_upper[2], "]"),
    as.character(results$Gini[2]),
    round(delong_xgb_lr$statistic, 3),
    fmt_p(delong_xgb_lr$p.value),
    fmt_ci(delong_xgb_lr),
    "", "", ""
  ),
  `Neural Network` = c(
    as.character(results$AUC[3]),
    paste0("[", results$AUC_lower[3], ", ", results$AUC_upper[3], "]"),
    as.character(results$Gini[3]),
    round(delong_nn_lr$statistic, 3),
    fmt_p(delong_nn_lr$p.value),
    fmt_ci(delong_nn_lr),
    round(delong_nn_xgb$statistic, 3),
    fmt_p(delong_nn_xgb$p.value),
    fmt_ci(delong_nn_xgb)
  )
)

kbl <- kable(perf_table, format = "html",
             col.names = c("Metric", "Logistic Regression", "XGBoost", "Neural Network"))
writeLines(kbl, file.path(OUTPUT_DIR, "performance_table.html"))

# --- ROC plot ---

roc_dt <- rbind(
  data.table(fpr = 1 - roc_logistic$specificities,
             tpr = roc_logistic$sensitivities,
             model = paste0("Logistic Regression (AUC = ", results$AUC[1], ")")),
  data.table(fpr = 1 - roc_xgb$specificities,
             tpr = roc_xgb$sensitivities,
             model = paste0("XGBoost (AUC = ", results$AUC[2], ")")),
  data.table(fpr = 1 - roc_nn$specificities,
             tpr = roc_nn$sensitivities,
             model = paste0("Neural Network (AUC = ", results$AUC[3], ")"))
)

p_roc <- ggplot(roc_dt, aes(x = fpr, y = tpr, color = model)) +
  geom_line(linewidth = 0.8) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "grey50") +
  scale_color_manual(values = c("#1976D2", "#D32F2F", "#388E3C")) +
  labs(x = "False Positive Rate", y = "True Positive Rate",
       title = "ROC Curves: Logistic Regression vs XGBoost vs Neural Network",
       color = NULL) +
  theme_classic() +
  theme(plot.title = element_text(size = 13, face = "bold"),
        legend.position = c(0.65, 0.25),
        legend.background = element_rect(fill = "white", color = "grey80"))

ggsave(file.path(OUTPUT_DIR, "roc_curves.pdf"), p_roc, width = 8, height = 7)

# --- hyperparameters ---

best_params <- readRDS(file.path(OUTPUT_DIR, "best_hyperparams.rds"))
best_lambda <- readRDS(file.path(OUTPUT_DIR, "best_lambda.rds"))
nn_best_config <- readRDS(file.path(OUTPUT_DIR, "nn_best_config.rds"))

hyperparams_table <- data.table(
  Model = c("XGBoost", "XGBoost", "XGBoost", "XGBoost", "XGBoost",
            "Logistic Regression",
            "Neural Network", "Neural Network", "Neural Network",
            "Neural Network", "Neural Network", "Neural Network"),
  
  Hyperparameter = c("Learning rate (η)", "Max tree depth", "Subsample",
                     "Column subsample", "Number of boosting rounds",
                     "Ridge penalty (λ)",
                     "Hidden layers", "Learning rate",
                     "Dropout rates", "Batch size",
                     "Maximum epochs", "Patience (early stopping)"),
  
  Value = c(format(round(best_params$eta, 4), scientific = FALSE),
            format(best_params$max_depth, scientific = FALSE),
            format(round(best_params$subsample, 4), scientific = FALSE),
            format(round(best_params$colsample_bytree, 4), scientific = FALSE),
            format(xgb_model$niter, scientific = FALSE),
            format(round(best_lambda, 6), scientific = FALSE),
            gsub("-", " → ", nn_best_config$architecture),
            format(nn_best_config$learning_rate, scientific = FALSE),
            paste(nn_best_config$dropout1,
                  nn_best_config$dropout2,
                  nn_best_config$dropout3,
                  sep = " / "),
            format(nn_best_config$batch_size, scientific = FALSE),
            "50",
            "5 epochs")
)

kbl_hp <- kable(
  hyperparams_table,
  format = "html",
  col.names = c("Model", "Hyperparameter", "Value"),
  caption = "Selected Hyperparameters"
)

writeLines(kbl_hp, file.path(OUTPUT_DIR, "hyperparameters_table.html"))

Performance of Models

Amalia Nimeskern

2026-04-27

Performance of models

Test set evaluation: AUC, Gini, confidence intervals, DeLong test