Logistic Model

Ridge logistic regression with lambda tuning on validation AUC

library(data.table)
library(glmnet)
library(pROC)
library(stargazer)


OUTPUT_DIR <- "/Users/amalianimeskern/Library/CloudStorage/OneDrive-ErasmusUniversityRotterdam/Freddie Mac Data"

# --- Load WoE-transformed data ---
train_woe <- readRDS(file.path(OUTPUT_DIR, "train_woe.rds"))
valid_woe <- readRDS(file.path(OUTPUT_DIR, "valid_woe.rds"))

# --- Preparation of matrices ---
woe_features <- setdiff(names(train_woe),
                        c("loan_sequence_number", "monthly_reporting_period",
                          "default_next_12m"))

X_train <- as.matrix(train_woe[, ..woe_features])
y_train <- train_woe$default_next_12m

X_valid <- as.matrix(valid_woe[, ..woe_features])
y_valid <- valid_woe$default_next_12m

# -- Fit ridge logistic regression (alpha = 0) ---
fit <- glmnet(X_train, y_train, family = "binomial", alpha = 0)

# --- Tune lambda on validation set AUC ---
lambdas <- fit$lambda
valid_aucs <- sapply(lambdas, function(lam) {
  preds <- predict(fit, newx = X_valid, s = lam, type = "response")[, 1]
  as.numeric(auc(roc(y_valid, preds, quiet = TRUE)))
})

best_idx    <- which.max(valid_aucs)
best_lambda <- lambdas[best_idx]
best_auc    <- valid_aucs[best_idx]

# --- Results ---
data.table(lambda = best_lambda, validation_auc = round(best_auc, 6))
##          lambda validation_auc
##           <num>          <num>
## 1: 0.0001889164       0.804785
# Lambda tuning path
lambda_results <- data.table(lambda = lambdas, auc = valid_aucs)
lambda_results[order(-auc)][1:10]
##           lambda       auc
##            <num>     <num>
##  1: 0.0001889164 0.8047847
##  2: 0.0002073356 0.8047444
##  3: 0.0002275505 0.8046942
##  4: 0.0002497364 0.8046389
##  5: 0.0002740854 0.8045762
##  6: 0.0003008084 0.8045070
##  7: 0.0003301368 0.8044326
##  8: 0.0003623248 0.8043506
##  9: 0.0003976510 0.8042577
## 10: 0.0004364215 0.8041588
# --- Coefficients at best lambda ---
coefs <- coef(fit, s = best_lambda)
coef_dt <- data.table(feature = rownames(coefs), coefficient = as.numeric(coefs))
coef_dt[order(-abs(coefficient))]
##                            feature coefficient
##                             <char>       <num>
##  1:                    (Intercept)  -5.9887189
##  2:       first_time_homebuyer_woe  -1.6636504
##  3:                current_upb_woe   0.9206802
##  4:              hpi_qoq_qlag1_woe   0.9037092
##  5:              num_borrowers_woe   0.8581153
##  6:                   loan_age_woe   0.7624119
##  7:               credit_score_woe   0.6824889
##  8:           occupancy_status_woe   0.6584118
##  9:                   orig_ltv_woe   0.5794001
## 10:     unemployment_rate_lag4_woe   0.4670239
## 11:         orig_interest_rate_woe   0.4395245
## 12:                   orig_dti_woe   0.4229598
## 13:              property_type_woe   0.3872274
## 14:               loan_purpose_woe   0.3787428
## 15:             orig_loan_term_woe   0.2890103
## 16:                    channel_woe   0.2530759
## 17:                     mi_pct_woe   0.1306451
## 18: current_delinquency_status_woe   0.0000000
## 19:        delta_interest_rate_woe   0.0000000
## 20:               mod_flag_12m_woe   0.0000000
## 21:       current_deferred_upb_woe   0.0000000
##                            feature coefficient
# --- Save ---
saveRDS(fit,            file.path(OUTPUT_DIR, "logistic_model.rds"))
saveRDS(best_lambda,    file.path(OUTPUT_DIR, "best_lambda.rds"))
saveRDS(lambda_results, file.path(OUTPUT_DIR, "lambda_tuning_results.rds"))

# --- Scorecard table ---
bins <- readRDS(file.path(OUTPUT_DIR, "woe_bins.rds"))
bin_table <- rbindlist(bins, idcol = "feature")

# Get coefficients from fitted model
coefs <- coef(fit, s = best_lambda)
coef_dt <- data.table(feature = rownames(coefs), coefficient = as.numeric(coefs))
coef_clean <- coef_dt[feature != "(Intercept)"]
coef_clean[, feature_base := gsub("_woe$", "", feature)]

scorecard <- merge(bin_table, coef_clean[, .(feature_base, coefficient)],
                   by.x = "feature", by.y = "feature_base", all.x = TRUE)

# Show coefficient only on first row per feature
scorecard[, row_num := seq_len(.N), by = feature]
scorecard[row_num > 1, coefficient := NA]
scorecard[, row_num := NULL]

# Clean feature names
feature_labels <- c(
  "credit_score" = "Credit Score",
  "orig_ltv" = "Original LTV",
  "orig_dti" = "Debt-to-Income",
  "orig_interest_rate" = "Origination Interest Rate",
  "orig_loan_term" = "Loan Term",
  "num_borrowers" = "Number of Borrowers",
  "mi_pct" = "Mortgage Insurance %",
  "current_delinquency_status" = "Current Delinquency Status",
  "loan_age" = "Loan Age",
  "current_upb" = "Current UPB",
  "delta_interest_rate" = "Delta Interest Rate",
  "mod_flag_12m" = "Modification Flag (12m)",
  "current_deferred_upb" = "Current Deferred UPB",
  "unemployment_rate_lag4" = "Unemployment Rate (Lag 4)",
  "hpi_qoq_qlag1" = "HPI Growth QoQ (Lag 1)",
  "occupancy_status" = "Occupancy Status",
  "loan_purpose" = "Loan Purpose",
  "channel" = "Channel",
  "property_type" = "Property Type",
  "first_time_homebuyer" = "First-Time Homebuyer"
)
scorecard[, feature_clean := feature_labels[feature]]

# Clean bin labels
scorecard[, bin_clean := gsub("%,%", ", ", bin)]
scorecard[, bin_clean := gsub("\\[-Inf, Inf\\)", "All", bin_clean)]
scorecard[, bin_clean := gsub("\\[-Inf,", "< ", bin_clean)]
scorecard[, bin_clean := gsub(", Inf\\)", "+", bin_clean)]
scorecard[, bin_clean := gsub("(\\d+\\.?\\d*),\\s*(\\d+\\.?\\d*)", "\\1–\\2", bin_clean)]

# Format
scorecard[, default_rate := paste0(round(posprob * 100, 1), "%")]
scorecard[, pct_obs := paste0(round(count_distr * 100, 1), "%")]
scorecard[, coefficient := round(coefficient, 4)]
scorecard[, woe := round(woe, 4)]

display_table <- scorecard[, .(
  `Risk Driver` = fifelse(!is.na(coefficient), feature_clean, ""),
  Bucket = bin_clean,
  Coefficient = fifelse(!is.na(coefficient), as.character(coefficient), ""),
  `Default Rate` = default_rate,
  `# Obs` = format(count, big.mark = ","),
  `% Obs` = pct_obs,
  WoE = woe
)]

fwrite(display_table, file.path(OUTPUT_DIR, "table_logistic_scorecard.csv"))


# --- Coefficient table ---

library(knitr)

coef_display <- coef_dt[order(-abs(coefficient))]
coef_display[, feature := gsub("_woe$", "", feature)]
coef_display[, odds_ratio := round(exp(coefficient), 4)]
coef_display[, coefficient := round(coefficient, 4)]

# Clean feature names
feature_labels <- c(
  "(Intercept)" = "Constant",
  "credit_score" = "Credit Score",
  "orig_ltv" = "Original LTV",
  "orig_dti" = "Debt-to-Income",
  "orig_interest_rate" = "Origination Interest Rate",
  "orig_loan_term" = "Loan Term",
  "num_borrowers" = "Number of Borrowers",
  "mi_pct" = "Mortgage Insurance %",
  "current_delinquency_status" = "Current Delinquency Status",
  "loan_age" = "Loan Age",
  "current_upb" = "Current UPB",
  "delta_interest_rate" = "Delta Interest Rate",
  "mod_flag_12m" = "Modification Flag (12m)",
  "current_deferred_upb" = "Current Deferred UPB",
  "unemployment_rate_lag4" = "Unemployment Rate (Lag 4)",
  "hpi_qoq_qlag1" = "HPI Growth QoQ (Lag 1)",
  "occupancy_status" = "Occupancy Status",
  "loan_purpose" = "Loan Purpose",
  "channel" = "Channel",
  "property_type" = "Property Type",
  "first_time_homebuyer" = "First-Time Homebuyer"
)
coef_display[, feature := feature_labels[feature]]

setnames(coef_display, c("Feature", "Coefficient", "Odds Ratio"))

writeLines(
  kable(coef_display, format = "html", row.names = FALSE,
        caption = "Ridge Logistic Regression Coefficients"),
  file.path(OUTPUT_DIR, "table_logistic_coefficients.html")
)




# Specification Robustness: Additional Fixed Effects

# --- Load raw train/valid to get property_state ---
train_raw <- readRDS(file.path(OUTPUT_DIR, "train.rds"))
valid_raw <- readRDS(file.path(OUTPUT_DIR, "valid.rds"))

# Verify row alignment with WoE data
stopifnot(nrow(train_raw) == nrow(train_woe))
stopifnot(nrow(valid_raw) == nrow(valid_woe))

# --- State dummies ---
train_state <- model.matrix(~ property_state - 1, data = train_raw)
valid_state <- model.matrix(~ property_state - 1, data = valid_raw)

# Alignment of columns
missing_cols <- setdiff(colnames(train_state), colnames(valid_state))
if (length(missing_cols) > 0) {
  pad <- matrix(0, nrow = nrow(valid_state), ncol = length(missing_cols))
  colnames(pad) <- missing_cols
  valid_state <- cbind(valid_state, pad)
}
valid_state <- valid_state[, colnames(train_state), drop = FALSE]

# Drop one state column for reference category
train_state <- train_state[, -1, drop = FALSE]
valid_state <- valid_state[, -1, drop = FALSE]

# --- Augmented feature matrices ---
X_train_fe <- cbind(X_train, train_state)
X_valid_fe <- cbind(X_valid, valid_state)

# --- Fit augmented ridge model ---
fit_fe <- glmnet(X_train_fe, y_train, family = "binomial", alpha = 0)

lambdas_fe <- fit_fe$lambda
valid_aucs_fe <- sapply(lambdas_fe, function(lam) {
  preds <- predict(fit_fe, newx = X_valid_fe, s = lam, type = "response")[, 1]
  as.numeric(auc(roc(y_valid, preds, quiet = TRUE)))
})

best_idx_fe    <- which.max(valid_aucs_fe)
best_lambda_fe <- lambdas_fe[best_idx_fe]
best_auc_fe    <- valid_aucs_fe[best_idx_fe]

# --- Compare AUCs ---
data.table(
  Specification = c("Baseline", "State Fixed Effects"),
  AUC = round(c(best_auc, best_auc_fe), 6)
)
##          Specification      AUC
##                 <char>    <num>
## 1:            Baseline 0.804785
## 2: State Fixed Effects 0.811895
# --- Compare coefficients on original WoE features ---
coefs_fe <- coef(fit_fe, s = best_lambda_fe)
coef_fe_dt <- data.table(feature = rownames(coefs_fe),
                         coefficient_fe = as.numeric(coefs_fe))

coef_baseline <- coef_dt[feature %in% woe_features]
coef_augmented <- coef_fe_dt[feature %in% woe_features]

coef_compare <- merge(coef_baseline, coef_augmented, by = "feature")

# Spearman rank correlation of coefficient magnitudes
spearman_coef <- cor(abs(coef_compare$coefficient),
                     abs(coef_compare$coefficient_fe),
                     method = "spearman")

print(coef_compare[order(-abs(coefficient))])
##                            feature coefficient coefficient_fe
##                             <char>       <num>          <num>
##  1:       first_time_homebuyer_woe  -1.6636504     -1.6766858
##  2:                current_upb_woe   0.9206802      0.7392084
##  3:              hpi_qoq_qlag1_woe   0.9037092      0.5059718
##  4:              num_borrowers_woe   0.8581153      0.8404940
##  5:                   loan_age_woe   0.7624119      0.7870038
##  6:               credit_score_woe   0.6824889      0.6836481
##  7:           occupancy_status_woe   0.6584118      0.8117888
##  8:                   orig_ltv_woe   0.5794001      0.6431439
##  9:     unemployment_rate_lag4_woe   0.4670239      0.5852921
## 10:         orig_interest_rate_woe   0.4395245      0.4250734
## 11:                   orig_dti_woe   0.4229598      0.4159406
## 12:              property_type_woe   0.3872274      0.1290927
## 13:               loan_purpose_woe   0.3787428      0.2354684
## 14:             orig_loan_term_woe   0.2890103      0.2720313
## 15:                    channel_woe   0.2530759      0.2519539
## 16:                     mi_pct_woe   0.1306451      0.1556489
## 17:       current_deferred_upb_woe   0.0000000      0.0000000
## 18: current_delinquency_status_woe   0.0000000      0.0000000
## 19:        delta_interest_rate_woe   0.0000000      0.0000000
## 20:               mod_flag_12m_woe   0.0000000      0.0000000
##                            feature coefficient coefficient_fe
print(paste("Spearman rank correlation:", round(spearman_coef, 4)))
## [1] "Spearman rank correlation: 0.9288"
# --- Save robustness results ---
fe_results <- list(
  baseline_auc    = best_auc,
  fe_auc          = best_auc_fe,
  auc_difference  = best_auc_fe - best_auc,
  spearman_coef   = spearman_coef,
  coef_comparison = coef_compare,
  fe_model        = fit_fe,
  fe_best_lambda  = best_lambda_fe
)

saveRDS(fe_results, file.path(OUTPUT_DIR, "fe_robustness_results.rds"))