Ridge logistic regression with lambda tuning on validation AUC
library(data.table)
library(glmnet)
library(pROC)
library(stargazer)
OUTPUT_DIR <- "/Users/amalianimeskern/Library/CloudStorage/OneDrive-ErasmusUniversityRotterdam/Freddie Mac Data"
# --- Load WoE-transformed data ---
train_woe <- readRDS(file.path(OUTPUT_DIR, "train_woe.rds"))
valid_woe <- readRDS(file.path(OUTPUT_DIR, "valid_woe.rds"))
# --- Preparation of matrices ---
woe_features <- setdiff(names(train_woe),
c("loan_sequence_number", "monthly_reporting_period",
"default_next_12m"))
X_train <- as.matrix(train_woe[, ..woe_features])
y_train <- train_woe$default_next_12m
X_valid <- as.matrix(valid_woe[, ..woe_features])
y_valid <- valid_woe$default_next_12m
# -- Fit ridge logistic regression (alpha = 0) ---
fit <- glmnet(X_train, y_train, family = "binomial", alpha = 0)
# --- Tune lambda on validation set AUC ---
lambdas <- fit$lambda
valid_aucs <- sapply(lambdas, function(lam) {
preds <- predict(fit, newx = X_valid, s = lam, type = "response")[, 1]
as.numeric(auc(roc(y_valid, preds, quiet = TRUE)))
})
best_idx <- which.max(valid_aucs)
best_lambda <- lambdas[best_idx]
best_auc <- valid_aucs[best_idx]
# --- Results ---
data.table(lambda = best_lambda, validation_auc = round(best_auc, 6))
## lambda validation_auc
## <num> <num>
## 1: 0.0001889164 0.804785
# Lambda tuning path
lambda_results <- data.table(lambda = lambdas, auc = valid_aucs)
lambda_results[order(-auc)][1:10]
## lambda auc
## <num> <num>
## 1: 0.0001889164 0.8047847
## 2: 0.0002073356 0.8047444
## 3: 0.0002275505 0.8046942
## 4: 0.0002497364 0.8046389
## 5: 0.0002740854 0.8045762
## 6: 0.0003008084 0.8045070
## 7: 0.0003301368 0.8044326
## 8: 0.0003623248 0.8043506
## 9: 0.0003976510 0.8042577
## 10: 0.0004364215 0.8041588
# --- Coefficients at best lambda ---
coefs <- coef(fit, s = best_lambda)
coef_dt <- data.table(feature = rownames(coefs), coefficient = as.numeric(coefs))
coef_dt[order(-abs(coefficient))]
## feature coefficient
## <char> <num>
## 1: (Intercept) -5.9887189
## 2: first_time_homebuyer_woe -1.6636504
## 3: current_upb_woe 0.9206802
## 4: hpi_qoq_qlag1_woe 0.9037092
## 5: num_borrowers_woe 0.8581153
## 6: loan_age_woe 0.7624119
## 7: credit_score_woe 0.6824889
## 8: occupancy_status_woe 0.6584118
## 9: orig_ltv_woe 0.5794001
## 10: unemployment_rate_lag4_woe 0.4670239
## 11: orig_interest_rate_woe 0.4395245
## 12: orig_dti_woe 0.4229598
## 13: property_type_woe 0.3872274
## 14: loan_purpose_woe 0.3787428
## 15: orig_loan_term_woe 0.2890103
## 16: channel_woe 0.2530759
## 17: mi_pct_woe 0.1306451
## 18: current_delinquency_status_woe 0.0000000
## 19: delta_interest_rate_woe 0.0000000
## 20: mod_flag_12m_woe 0.0000000
## 21: current_deferred_upb_woe 0.0000000
## feature coefficient
# --- Save ---
saveRDS(fit, file.path(OUTPUT_DIR, "logistic_model.rds"))
saveRDS(best_lambda, file.path(OUTPUT_DIR, "best_lambda.rds"))
saveRDS(lambda_results, file.path(OUTPUT_DIR, "lambda_tuning_results.rds"))
# --- Scorecard table ---
bins <- readRDS(file.path(OUTPUT_DIR, "woe_bins.rds"))
bin_table <- rbindlist(bins, idcol = "feature")
# Get coefficients from fitted model
coefs <- coef(fit, s = best_lambda)
coef_dt <- data.table(feature = rownames(coefs), coefficient = as.numeric(coefs))
coef_clean <- coef_dt[feature != "(Intercept)"]
coef_clean[, feature_base := gsub("_woe$", "", feature)]
scorecard <- merge(bin_table, coef_clean[, .(feature_base, coefficient)],
by.x = "feature", by.y = "feature_base", all.x = TRUE)
# Show coefficient only on first row per feature
scorecard[, row_num := seq_len(.N), by = feature]
scorecard[row_num > 1, coefficient := NA]
scorecard[, row_num := NULL]
# Clean feature names
feature_labels <- c(
"credit_score" = "Credit Score",
"orig_ltv" = "Original LTV",
"orig_dti" = "Debt-to-Income",
"orig_interest_rate" = "Origination Interest Rate",
"orig_loan_term" = "Loan Term",
"num_borrowers" = "Number of Borrowers",
"mi_pct" = "Mortgage Insurance %",
"current_delinquency_status" = "Current Delinquency Status",
"loan_age" = "Loan Age",
"current_upb" = "Current UPB",
"delta_interest_rate" = "Delta Interest Rate",
"mod_flag_12m" = "Modification Flag (12m)",
"current_deferred_upb" = "Current Deferred UPB",
"unemployment_rate_lag4" = "Unemployment Rate (Lag 4)",
"hpi_qoq_qlag1" = "HPI Growth QoQ (Lag 1)",
"occupancy_status" = "Occupancy Status",
"loan_purpose" = "Loan Purpose",
"channel" = "Channel",
"property_type" = "Property Type",
"first_time_homebuyer" = "First-Time Homebuyer"
)
scorecard[, feature_clean := feature_labels[feature]]
# Clean bin labels
scorecard[, bin_clean := gsub("%,%", ", ", bin)]
scorecard[, bin_clean := gsub("\\[-Inf, Inf\\)", "All", bin_clean)]
scorecard[, bin_clean := gsub("\\[-Inf,", "< ", bin_clean)]
scorecard[, bin_clean := gsub(", Inf\\)", "+", bin_clean)]
scorecard[, bin_clean := gsub("(\\d+\\.?\\d*),\\s*(\\d+\\.?\\d*)", "\\1–\\2", bin_clean)]
# Format
scorecard[, default_rate := paste0(round(posprob * 100, 1), "%")]
scorecard[, pct_obs := paste0(round(count_distr * 100, 1), "%")]
scorecard[, coefficient := round(coefficient, 4)]
scorecard[, woe := round(woe, 4)]
display_table <- scorecard[, .(
`Risk Driver` = fifelse(!is.na(coefficient), feature_clean, ""),
Bucket = bin_clean,
Coefficient = fifelse(!is.na(coefficient), as.character(coefficient), ""),
`Default Rate` = default_rate,
`# Obs` = format(count, big.mark = ","),
`% Obs` = pct_obs,
WoE = woe
)]
fwrite(display_table, file.path(OUTPUT_DIR, "table_logistic_scorecard.csv"))
# --- Coefficient table ---
library(knitr)
coef_display <- coef_dt[order(-abs(coefficient))]
coef_display[, feature := gsub("_woe$", "", feature)]
coef_display[, odds_ratio := round(exp(coefficient), 4)]
coef_display[, coefficient := round(coefficient, 4)]
# Clean feature names
feature_labels <- c(
"(Intercept)" = "Constant",
"credit_score" = "Credit Score",
"orig_ltv" = "Original LTV",
"orig_dti" = "Debt-to-Income",
"orig_interest_rate" = "Origination Interest Rate",
"orig_loan_term" = "Loan Term",
"num_borrowers" = "Number of Borrowers",
"mi_pct" = "Mortgage Insurance %",
"current_delinquency_status" = "Current Delinquency Status",
"loan_age" = "Loan Age",
"current_upb" = "Current UPB",
"delta_interest_rate" = "Delta Interest Rate",
"mod_flag_12m" = "Modification Flag (12m)",
"current_deferred_upb" = "Current Deferred UPB",
"unemployment_rate_lag4" = "Unemployment Rate (Lag 4)",
"hpi_qoq_qlag1" = "HPI Growth QoQ (Lag 1)",
"occupancy_status" = "Occupancy Status",
"loan_purpose" = "Loan Purpose",
"channel" = "Channel",
"property_type" = "Property Type",
"first_time_homebuyer" = "First-Time Homebuyer"
)
coef_display[, feature := feature_labels[feature]]
setnames(coef_display, c("Feature", "Coefficient", "Odds Ratio"))
writeLines(
kable(coef_display, format = "html", row.names = FALSE,
caption = "Ridge Logistic Regression Coefficients"),
file.path(OUTPUT_DIR, "table_logistic_coefficients.html")
)
# Specification Robustness: Additional Fixed Effects
# --- Load raw train/valid to get property_state ---
train_raw <- readRDS(file.path(OUTPUT_DIR, "train.rds"))
valid_raw <- readRDS(file.path(OUTPUT_DIR, "valid.rds"))
# Verify row alignment with WoE data
stopifnot(nrow(train_raw) == nrow(train_woe))
stopifnot(nrow(valid_raw) == nrow(valid_woe))
# --- State dummies ---
train_state <- model.matrix(~ property_state - 1, data = train_raw)
valid_state <- model.matrix(~ property_state - 1, data = valid_raw)
# Alignment of columns
missing_cols <- setdiff(colnames(train_state), colnames(valid_state))
if (length(missing_cols) > 0) {
pad <- matrix(0, nrow = nrow(valid_state), ncol = length(missing_cols))
colnames(pad) <- missing_cols
valid_state <- cbind(valid_state, pad)
}
valid_state <- valid_state[, colnames(train_state), drop = FALSE]
# Drop one state column for reference category
train_state <- train_state[, -1, drop = FALSE]
valid_state <- valid_state[, -1, drop = FALSE]
# --- Augmented feature matrices ---
X_train_fe <- cbind(X_train, train_state)
X_valid_fe <- cbind(X_valid, valid_state)
# --- Fit augmented ridge model ---
fit_fe <- glmnet(X_train_fe, y_train, family = "binomial", alpha = 0)
lambdas_fe <- fit_fe$lambda
valid_aucs_fe <- sapply(lambdas_fe, function(lam) {
preds <- predict(fit_fe, newx = X_valid_fe, s = lam, type = "response")[, 1]
as.numeric(auc(roc(y_valid, preds, quiet = TRUE)))
})
best_idx_fe <- which.max(valid_aucs_fe)
best_lambda_fe <- lambdas_fe[best_idx_fe]
best_auc_fe <- valid_aucs_fe[best_idx_fe]
# --- Compare AUCs ---
data.table(
Specification = c("Baseline", "State Fixed Effects"),
AUC = round(c(best_auc, best_auc_fe), 6)
)
## Specification AUC
## <char> <num>
## 1: Baseline 0.804785
## 2: State Fixed Effects 0.811895
# --- Compare coefficients on original WoE features ---
coefs_fe <- coef(fit_fe, s = best_lambda_fe)
coef_fe_dt <- data.table(feature = rownames(coefs_fe),
coefficient_fe = as.numeric(coefs_fe))
coef_baseline <- coef_dt[feature %in% woe_features]
coef_augmented <- coef_fe_dt[feature %in% woe_features]
coef_compare <- merge(coef_baseline, coef_augmented, by = "feature")
# Spearman rank correlation of coefficient magnitudes
spearman_coef <- cor(abs(coef_compare$coefficient),
abs(coef_compare$coefficient_fe),
method = "spearman")
print(coef_compare[order(-abs(coefficient))])
## feature coefficient coefficient_fe
## <char> <num> <num>
## 1: first_time_homebuyer_woe -1.6636504 -1.6766858
## 2: current_upb_woe 0.9206802 0.7392084
## 3: hpi_qoq_qlag1_woe 0.9037092 0.5059718
## 4: num_borrowers_woe 0.8581153 0.8404940
## 5: loan_age_woe 0.7624119 0.7870038
## 6: credit_score_woe 0.6824889 0.6836481
## 7: occupancy_status_woe 0.6584118 0.8117888
## 8: orig_ltv_woe 0.5794001 0.6431439
## 9: unemployment_rate_lag4_woe 0.4670239 0.5852921
## 10: orig_interest_rate_woe 0.4395245 0.4250734
## 11: orig_dti_woe 0.4229598 0.4159406
## 12: property_type_woe 0.3872274 0.1290927
## 13: loan_purpose_woe 0.3787428 0.2354684
## 14: orig_loan_term_woe 0.2890103 0.2720313
## 15: channel_woe 0.2530759 0.2519539
## 16: mi_pct_woe 0.1306451 0.1556489
## 17: current_deferred_upb_woe 0.0000000 0.0000000
## 18: current_delinquency_status_woe 0.0000000 0.0000000
## 19: delta_interest_rate_woe 0.0000000 0.0000000
## 20: mod_flag_12m_woe 0.0000000 0.0000000
## feature coefficient coefficient_fe
print(paste("Spearman rank correlation:", round(spearman_coef, 4)))
## [1] "Spearman rank correlation: 0.9288"
# --- Save robustness results ---
fe_results <- list(
baseline_auc = best_auc,
fe_auc = best_auc_fe,
auc_difference = best_auc_fe - best_auc,
spearman_coef = spearman_coef,
coef_comparison = coef_compare,
fe_model = fit_fe,
fe_best_lambda = best_lambda_fe
)
saveRDS(fe_results, file.path(OUTPUT_DIR, "fe_robustness_results.rds"))