Feature engineering for modelling
# Feature engineering: WoE for logistic regression, one-hot for XGBoost
library(data.table)
library(scorecard)
OUTPUT_DIR <- "/Users/amalianimeskern/Library/CloudStorage/OneDrive-ErasmusUniversityRotterdam/Freddie Mac Data"
# --- Load splits ---
train <- readRDS(file.path(OUTPUT_DIR, "train.rds"))
valid <- readRDS(file.path(OUTPUT_DIR, "valid.rds"))
test <- readRDS(file.path(OUTPUT_DIR, "test.rds"))
# --- Define feature set ---
# Drop: amortization_type, io_indicator (zero variation), orig_cltv (ρ=0.93 with orig_ltv), orig_upb (ρ=0.96 with current_upb), unemployment_rate_lag1-3 (ρ=0.98-1.00, lag4 retained per AIC), hpi_qoq_qlag2-4 (up to ρ=0.54, lag1 retained per AIC), property_state (macro controls capture geography)
continuous_features <- c("credit_score", "orig_ltv", "orig_dti",
"orig_interest_rate", "orig_loan_term",
"num_borrowers", "mi_pct",
"current_delinquency_status", "loan_age",
"current_upb", "delta_interest_rate",
"mod_flag_12m", "current_deferred_upb",
"unemployment_rate_lag4",
"hpi_qoq_qlag1")
categorical_features <- c("occupancy_status", "loan_purpose", "channel",
"property_type", "first_time_homebuyer")
all_features <- c(continuous_features, categorical_features)
target <- "default_next_12m"
# --- Subset to relevant columns ---
keep_cols <- c("loan_sequence_number", "monthly_reporting_period",
all_features, target)
train <- train[, ..keep_cols]
valid <- valid[, ..keep_cols]
test <- test[, ..keep_cols]
# # WoE transformation for logistic regression
# --- WoE binning on training set only ---
train_woe_input <- copy(train[, c(all_features, target), with = FALSE])
# Ensure categoricals are character
for (v in categorical_features) {
train_woe_input[[v]] <- as.character(train_woe_input[[v]])
}
bins <- woebin(train_woe_input, y = target, positive = "1")
## ✔ Binning on 4386259 rows and 21 columns in 00:00:16
# --- Apply WoE transformation ---
train_woe <- woebin_ply(train, bins)
## ✔ Woe transformating on 4386259 rows and 20 columns in 00:00:20
valid_woe <- woebin_ply(valid, bins)
## ✔ Woe transformating on 1465449 rows and 20 columns in 00:00:07
test_woe <- woebin_ply(test, bins)
## ✔ Woe transformating on 1461257 rows and 20 columns in 00:00:06
# --- Save WoE outputs ---
saveRDS(bins, file.path(OUTPUT_DIR, "woe_bins.rds"))
saveRDS(train_woe, file.path(OUTPUT_DIR, "train_woe.rds"))
saveRDS(valid_woe, file.path(OUTPUT_DIR, "valid_woe.rds"))
saveRDS(test_woe, file.path(OUTPUT_DIR, "test_woe.rds"))
# Xgboost: One-hot encoding
# --- One-hot encode categoricals ---
encode_onehot <- function(dt, cat_vars, ref_levels) {
dt_out <- copy(dt)
for (v in cat_vars) {
dt_out[[v]] <- as.character(dt_out[[v]])
levels_to_encode <- setdiff(ref_levels[[v]], ref_levels[[v]][1])
for (lev in levels_to_encode) {
col_name <- paste0(v, "_", lev)
dt_out[, (col_name) := as.integer(get(v) == lev)]
}
dt_out[, (v) := NULL]
}
dt_out
}
# Get reference levels from training set
ref_levels <- lapply(categorical_features, function(v) {
sort(unique(as.character(train[[v]])))
})
names(ref_levels) <- categorical_features
train_xgb <- encode_onehot(train, categorical_features, ref_levels)
valid_xgb <- encode_onehot(valid, categorical_features, ref_levels)
test_xgb <- encode_onehot(test, categorical_features, ref_levels)
# --- Save XGBoost outputs ---
saveRDS(ref_levels, file.path(OUTPUT_DIR, "onehot_ref_levels.rds"))
saveRDS(train_xgb, file.path(OUTPUT_DIR, "train_xgb.rds"))
saveRDS(valid_xgb, file.path(OUTPUT_DIR, "valid_xgb.rds"))
saveRDS(test_xgb, file.path(OUTPUT_DIR, "test_xgb.rds"))
# --- WoE bin table for appendix ---
bin_table <- rbindlist(bins, idcol = "feature")
bin_table <- bin_table[, .(feature, bin, count, count_distr,
neg, pos, posprob, woe, bin_iv, total_iv)]
bin_table[, count_distr := round(count_distr * 100, 2)]
bin_table[, posprob := round(posprob * 100, 4)]
bin_table[, woe := round(woe, 4)]
bin_table[, bin_iv := round(bin_iv, 4)]
bin_table[, total_iv := round(total_iv, 4)]
# Clean bin labels
bin_table[, bin := gsub("%,%", ", ", bin)]
bin_table[, bin := gsub("\\[-Inf,", "< ", bin)]
bin_table[, bin := gsub(", Inf\\)", "+", bin)]
bin_table[, bin := gsub("(\\d+\\.?\\d*),\\s*(\\d+\\.?\\d*)", "\\1–\\2", bin)]
# Clean feature names
feature_labels <- c(
"credit_score" = "Credit Score",
"orig_ltv" = "Original LTV",
"orig_dti" = "Debt-to-Income",
"orig_interest_rate" = "Origination Interest Rate",
"orig_loan_term" = "Loan Term",
"num_borrowers" = "Number of Borrowers",
"mi_pct" = "Mortgage Insurance %",
"current_delinquency_status" = "Current Delinquency Status",
"loan_age" = "Loan Age",
"current_upb" = "Current UPB",
"delta_interest_rate" = "Delta Interest Rate",
"mod_flag_12m" = "Modification Flag (12m)",
"current_deferred_upb" = "Current Deferred UPB",
"unemployment_rate_lag4" = "Unemployment Rate (Lag 4)",
"hpi_qoq_qlag1" = "HPI Growth QoQ (Lag 1)",
"occupancy_status" = "Occupancy Status",
"loan_purpose" = "Loan Purpose",
"channel" = "Channel",
"property_type" = "Property Type",
"first_time_homebuyer" = "First-Time Homebuyer"
)
bin_table[, feature := feature_labels[feature]]
# Rename columns
setnames(bin_table,
c("feature", "bin", "count", "count_distr", "neg", "pos", "posprob", "woe", "bin_iv", "total_iv"),
c("Feature", "Bin", "Count", "% Obs", "Non-Defaults", "Defaults", "Default Rate", "WoE", "Bin IV", "Total IV"))
fwrite(bin_table, file.path(OUTPUT_DIR, "woe_bin_table.csv"))
# --- WoE stability analysis ---
train_stability <- copy(train[, c("monthly_reporting_period", all_features, target), with = FALSE])
train_stability[, year := as.integer(substr(monthly_reporting_period, 1, 4))]
for (v in categorical_features) {
train_stability[[v]] <- as.character(train_stability[[v]])
}
breaks_list <- lapply(bins, function(b) {
if (!is.null(b$breaks)) b$breaks else NULL
})
woe_by_year <- list()
for (yr in sort(unique(train_stability$year))) {
subset <- train_stability[year == yr]
bins_yr <- woebin(subset[, c(all_features, target), with = FALSE],
y = target, positive = "1",
breaks_list = breaks_list)
tbl_yr <- rbindlist(bins_yr, idcol = "feature")
tbl_yr[, year := yr]
missing_features <- setdiff(all_features, unique(tbl_yr$feature))
if (length(missing_features) > 0) {
missing_dt <- data.table(feature = missing_features, year = yr, total_iv = 0)
tbl_yr <- rbindlist(list(tbl_yr, missing_dt), fill = TRUE)
}
woe_by_year[[as.character(yr)]] <- tbl_yr
}
## ✔ Binning on 147294 rows and 19 columns in 00:00:10
## ✔ Binning on 471458 rows and 19 columns in 00:00:01
## ✔ Binning on 753603 rows and 19 columns in 00:00:02
## ✔ Binning on 931542 rows and 21 columns in 00:00:03
## ✔ Binning on 1080105 rows and 21 columns in 00:00:03
## ✔ Binning on 1002257 rows and 21 columns in 00:00:03
woe_stability <- rbindlist(woe_by_year)
iv_stability <- woe_stability[, .(iv = unique(total_iv)), by = .(feature, year)]
iv_stability[is.na(iv), iv := 0]
iv_wide <- dcast(iv_stability, feature ~ year, value.var = "iv")
year_cols <- setdiff(names(iv_wide), "feature")
iv_wide[, iv_mean := apply(.SD, 1, mean), .SDcols = year_cols]
iv_wide[, iv_sd := apply(.SD, 1, sd), .SDcols = year_cols]
iv_wide[, iv_cv := fifelse(iv_mean > 0, iv_sd / iv_mean, NA_real_)]
iv_wide[order(-iv_sd)]
## feature 2006 2007 2008
## <char> <num> <num> <num>
## 1: orig_interest_rate 0.10210654 0.1525377357 1.425915e-01
## 2: credit_score 1.10761950 0.9411233771 6.690609e-01
## 3: loan_age 0.19541777 0.2082185614 8.082948e-02
## 4: channel 0.13866432 0.0139410811 7.599166e-02
## 5: hpi_qoq_qlag1 0.01489677 0.2020746713 2.159442e-01
## 6: mi_pct 0.20329844 0.2563061835 1.077798e-01
## 7: num_borrowers 0.22378909 0.1834264881 1.191778e-01
## 8: orig_ltv 0.30454264 0.4024183019 2.766737e-01
## 9: current_upb 0.00247344 0.0515638908 1.448263e-01
## 10: orig_dti 0.18809861 0.1325746740 2.079771e-01
## 11: orig_loan_term 0.15305357 0.1012901550 1.235901e-01
## 12: unemployment_rate_lag4 0.01883253 0.0014230794 4.246141e-02
## 13: occupancy_status 0.04021366 0.0197694963 2.148648e-03
## 14: loan_purpose 0.01898846 0.0117580917 3.222316e-02
## 15: property_type 0.01050460 0.0007900923 9.316634e-04
## 16: first_time_homebuyer 0.01332807 0.0021798333 8.815525e-05
## 17: current_deferred_upb 0.00000000 0.0000000000 0.000000e+00
## 18: current_delinquency_status 0.00000000 0.0000000000 0.000000e+00
## 19: delta_interest_rate 0.00000000 0.0000000000 0.000000e+00
## 20: mod_flag_12m 0.00000000 0.0000000000 0.000000e+00
## feature 2006 2007 2008
## 2009 2010 2011 iv_mean iv_sd iv_cv
## <num> <num> <num> <num> <num> <num>
## 1: 4.621222e-01 8.422856e-01 0.859942148 0.426930943 0.352961204 0.8267407
## 2: 5.924409e-01 6.321927e-01 0.606230899 0.758111384 0.214452423 0.2828772
## 3: 1.854255e-01 5.333155e-01 0.256360488 0.243261209 0.153329116 0.6303065
## 4: 1.144506e-01 2.665679e-01 0.277692127 0.147884622 0.105117110 0.7108049
## 5: 6.419409e-02 6.151078e-02 0.009750168 0.094728440 0.091484016 0.9657503
## 6: 8.105297e-02 9.506581e-02 0.085192071 0.138115887 0.073550531 0.5325277
## 7: 7.888725e-02 6.263339e-02 0.077973060 0.124314519 0.065469886 0.5266471
## 8: 2.578620e-01 2.929885e-01 0.330281389 0.310794426 0.051179831 0.1646742
## 9: 8.151583e-02 4.631261e-02 0.030182379 0.059479067 0.049221346 0.8275407
## 10: 2.741262e-01 2.166643e-01 0.215367283 0.205801355 0.045952273 0.2232846
## 11: 1.073894e-01 1.639877e-01 0.190175633 0.139914414 0.034912878 0.2495302
## 12: 4.552508e-03 5.592710e-02 0.045623786 0.028136736 0.022976951 0.8166175
## 13: 4.331547e-03 3.525758e-03 0.004702935 0.012448674 0.015073701 1.2108679
## 14: 3.182288e-02 4.339132e-02 0.031673294 0.028309534 0.011200605 0.3956478
## 15: 4.142624e-03 4.419696e-03 0.026461944 0.007875103 0.009763861 1.2398391
## 16: 5.842527e-05 9.080121e-05 0.009307987 0.004175546 0.005734739 1.3734106
## 17: 0.000000e+00 0.000000e+00 0.000000000 0.000000000 0.000000000 NA
## 18: 0.000000e+00 0.000000e+00 0.000000000 0.000000000 0.000000000 NA
## 19: 0.000000e+00 0.000000e+00 0.000000000 0.000000000 0.000000000 NA
## 20: 0.000000e+00 0.000000e+00 0.000000000 0.000000000 0.000000000 NA
## 2009 2010 2011 iv_mean iv_sd iv_cv
# Save
iv_wide_rounded <- copy(iv_wide)
cols <- setdiff(names(iv_wide_rounded), "feature")
iv_wide_rounded[, (cols) := lapply(.SD, round, 4), .SDcols = cols]
fwrite(iv_wide_rounded, file.path(OUTPUT_DIR, "iv_stability_by_year.csv"))
fwrite(woe_stability, file.path(OUTPUT_DIR, "woe_bin_stability_by_year.csv"))