Feature engineering for modelling

# Feature engineering: WoE for logistic regression, one-hot for XGBoost

library(data.table)
library(scorecard)

OUTPUT_DIR <- "/Users/amalianimeskern/Library/CloudStorage/OneDrive-ErasmusUniversityRotterdam/Freddie Mac Data"

# --- Load splits ---
train <- readRDS(file.path(OUTPUT_DIR, "train.rds"))
valid <- readRDS(file.path(OUTPUT_DIR, "valid.rds"))
test  <- readRDS(file.path(OUTPUT_DIR, "test.rds"))

# --- Define feature set ---
# Drop: amortization_type, io_indicator (zero variation), orig_cltv (ρ=0.93 with orig_ltv), orig_upb (ρ=0.96 with current_upb), unemployment_rate_lag1-3 (ρ=0.98-1.00, lag4 retained per AIC), hpi_qoq_qlag2-4 (up to ρ=0.54, lag1 retained per AIC), property_state (macro controls capture geography)
continuous_features <- c("credit_score", "orig_ltv", "orig_dti",
                         "orig_interest_rate", "orig_loan_term",
                         "num_borrowers", "mi_pct",
                         "current_delinquency_status", "loan_age",
                         "current_upb", "delta_interest_rate",
                         "mod_flag_12m", "current_deferred_upb",
                         "unemployment_rate_lag4",
                         "hpi_qoq_qlag1")

categorical_features <- c("occupancy_status", "loan_purpose", "channel",
                          "property_type", "first_time_homebuyer")

all_features <- c(continuous_features, categorical_features)
target <- "default_next_12m"

# --- Subset to relevant columns ---
keep_cols <- c("loan_sequence_number", "monthly_reporting_period",
               all_features, target)

train <- train[, ..keep_cols]
valid <- valid[, ..keep_cols]
test  <- test[, ..keep_cols]

# # WoE transformation for logistic regression

# --- WoE binning on training set only ---
train_woe_input <- copy(train[, c(all_features, target), with = FALSE])

# Ensure categoricals are character
for (v in categorical_features) {
  train_woe_input[[v]] <- as.character(train_woe_input[[v]])
}

bins <- woebin(train_woe_input, y = target, positive = "1")
## ✔ Binning on 4386259 rows and 21 columns in 00:00:16
# --- Apply WoE transformation ---
train_woe <- woebin_ply(train, bins)
## ✔ Woe transformating on 4386259 rows and 20 columns in 00:00:20
valid_woe <- woebin_ply(valid, bins)
## ✔ Woe transformating on 1465449 rows and 20 columns in 00:00:07
test_woe  <- woebin_ply(test, bins)
## ✔ Woe transformating on 1461257 rows and 20 columns in 00:00:06
# --- Save WoE outputs ---
saveRDS(bins,      file.path(OUTPUT_DIR, "woe_bins.rds"))
saveRDS(train_woe, file.path(OUTPUT_DIR, "train_woe.rds"))
saveRDS(valid_woe, file.path(OUTPUT_DIR, "valid_woe.rds"))
saveRDS(test_woe,  file.path(OUTPUT_DIR, "test_woe.rds"))

# Xgboost: One-hot encoding

# --- One-hot encode categoricals ---
encode_onehot <- function(dt, cat_vars, ref_levels) {
  dt_out <- copy(dt)
  for (v in cat_vars) {
    dt_out[[v]] <- as.character(dt_out[[v]])
    levels_to_encode <- setdiff(ref_levels[[v]], ref_levels[[v]][1])
    for (lev in levels_to_encode) {
      col_name <- paste0(v, "_", lev)
      dt_out[, (col_name) := as.integer(get(v) == lev)]
    }
    dt_out[, (v) := NULL]
  }
  dt_out
}

# Get reference levels from training set
ref_levels <- lapply(categorical_features, function(v) {
  sort(unique(as.character(train[[v]])))
})
names(ref_levels) <- categorical_features

train_xgb <- encode_onehot(train, categorical_features, ref_levels)
valid_xgb <- encode_onehot(valid, categorical_features, ref_levels)
test_xgb  <- encode_onehot(test,  categorical_features, ref_levels)

# --- Save XGBoost outputs ---
saveRDS(ref_levels, file.path(OUTPUT_DIR, "onehot_ref_levels.rds"))
saveRDS(train_xgb,  file.path(OUTPUT_DIR, "train_xgb.rds"))
saveRDS(valid_xgb,  file.path(OUTPUT_DIR, "valid_xgb.rds"))
saveRDS(test_xgb,   file.path(OUTPUT_DIR, "test_xgb.rds"))

# --- WoE bin table for appendix ---
bin_table <- rbindlist(bins, idcol = "feature")
bin_table <- bin_table[, .(feature, bin, count, count_distr,
                           neg, pos, posprob, woe, bin_iv, total_iv)]
bin_table[, count_distr := round(count_distr * 100, 2)]
bin_table[, posprob     := round(posprob * 100, 4)]
bin_table[, woe         := round(woe, 4)]
bin_table[, bin_iv      := round(bin_iv, 4)]
bin_table[, total_iv    := round(total_iv, 4)]

# Clean bin labels
bin_table[, bin := gsub("%,%", ", ", bin)]
bin_table[, bin := gsub("\\[-Inf,", "< ", bin)]
bin_table[, bin := gsub(", Inf\\)", "+", bin)]
bin_table[, bin := gsub("(\\d+\\.?\\d*),\\s*(\\d+\\.?\\d*)", "\\1–\\2", bin)]

# Clean feature names
feature_labels <- c(
  "credit_score" = "Credit Score",
  "orig_ltv" = "Original LTV",
  "orig_dti" = "Debt-to-Income",
  "orig_interest_rate" = "Origination Interest Rate",
  "orig_loan_term" = "Loan Term",
  "num_borrowers" = "Number of Borrowers",
  "mi_pct" = "Mortgage Insurance %",
  "current_delinquency_status" = "Current Delinquency Status",
  "loan_age" = "Loan Age",
  "current_upb" = "Current UPB",
  "delta_interest_rate" = "Delta Interest Rate",
  "mod_flag_12m" = "Modification Flag (12m)",
  "current_deferred_upb" = "Current Deferred UPB",
  "unemployment_rate_lag4" = "Unemployment Rate (Lag 4)",
  "hpi_qoq_qlag1" = "HPI Growth QoQ (Lag 1)",
  "occupancy_status" = "Occupancy Status",
  "loan_purpose" = "Loan Purpose",
  "channel" = "Channel",
  "property_type" = "Property Type",
  "first_time_homebuyer" = "First-Time Homebuyer"
)

bin_table[, feature := feature_labels[feature]]

# Rename columns
setnames(bin_table,
         c("feature", "bin", "count", "count_distr", "neg", "pos", "posprob", "woe", "bin_iv", "total_iv"),
         c("Feature", "Bin", "Count", "% Obs", "Non-Defaults", "Defaults", "Default Rate", "WoE", "Bin IV", "Total IV"))

fwrite(bin_table, file.path(OUTPUT_DIR, "woe_bin_table.csv"))

# --- WoE stability analysis ---
train_stability <- copy(train[, c("monthly_reporting_period", all_features, target), with = FALSE])
train_stability[, year := as.integer(substr(monthly_reporting_period, 1, 4))]

for (v in categorical_features) {
  train_stability[[v]] <- as.character(train_stability[[v]])
}

breaks_list <- lapply(bins, function(b) {
  if (!is.null(b$breaks)) b$breaks else NULL
})

woe_by_year <- list()

for (yr in sort(unique(train_stability$year))) {
  subset <- train_stability[year == yr]
  
  bins_yr <- woebin(subset[, c(all_features, target), with = FALSE],
                    y = target, positive = "1",
                    breaks_list = breaks_list)
  
  tbl_yr <- rbindlist(bins_yr, idcol = "feature")
  tbl_yr[, year := yr]
  
  missing_features <- setdiff(all_features, unique(tbl_yr$feature))
  
  if (length(missing_features) > 0) {
    missing_dt <- data.table(feature = missing_features, year = yr, total_iv = 0)
    tbl_yr <- rbindlist(list(tbl_yr, missing_dt), fill = TRUE)
  }
  
  woe_by_year[[as.character(yr)]] <- tbl_yr
}
## ✔ Binning on 147294 rows and 19 columns in 00:00:10
## ✔ Binning on 471458 rows and 19 columns in 00:00:01
## ✔ Binning on 753603 rows and 19 columns in 00:00:02
## ✔ Binning on 931542 rows and 21 columns in 00:00:03
## ✔ Binning on 1080105 rows and 21 columns in 00:00:03
## ✔ Binning on 1002257 rows and 21 columns in 00:00:03
woe_stability <- rbindlist(woe_by_year)

iv_stability <- woe_stability[, .(iv = unique(total_iv)), by = .(feature, year)]

iv_stability[is.na(iv), iv := 0]

iv_wide <- dcast(iv_stability, feature ~ year, value.var = "iv")

year_cols <- setdiff(names(iv_wide), "feature")

iv_wide[, iv_mean := apply(.SD, 1, mean), .SDcols = year_cols]
iv_wide[, iv_sd := apply(.SD, 1, sd), .SDcols = year_cols]
iv_wide[, iv_cv := fifelse(iv_mean > 0, iv_sd / iv_mean, NA_real_)]

iv_wide[order(-iv_sd)]
##                        feature       2006         2007         2008
##                         <char>      <num>        <num>        <num>
##  1:         orig_interest_rate 0.10210654 0.1525377357 1.425915e-01
##  2:               credit_score 1.10761950 0.9411233771 6.690609e-01
##  3:                   loan_age 0.19541777 0.2082185614 8.082948e-02
##  4:                    channel 0.13866432 0.0139410811 7.599166e-02
##  5:              hpi_qoq_qlag1 0.01489677 0.2020746713 2.159442e-01
##  6:                     mi_pct 0.20329844 0.2563061835 1.077798e-01
##  7:              num_borrowers 0.22378909 0.1834264881 1.191778e-01
##  8:                   orig_ltv 0.30454264 0.4024183019 2.766737e-01
##  9:                current_upb 0.00247344 0.0515638908 1.448263e-01
## 10:                   orig_dti 0.18809861 0.1325746740 2.079771e-01
## 11:             orig_loan_term 0.15305357 0.1012901550 1.235901e-01
## 12:     unemployment_rate_lag4 0.01883253 0.0014230794 4.246141e-02
## 13:           occupancy_status 0.04021366 0.0197694963 2.148648e-03
## 14:               loan_purpose 0.01898846 0.0117580917 3.222316e-02
## 15:              property_type 0.01050460 0.0007900923 9.316634e-04
## 16:       first_time_homebuyer 0.01332807 0.0021798333 8.815525e-05
## 17:       current_deferred_upb 0.00000000 0.0000000000 0.000000e+00
## 18: current_delinquency_status 0.00000000 0.0000000000 0.000000e+00
## 19:        delta_interest_rate 0.00000000 0.0000000000 0.000000e+00
## 20:               mod_flag_12m 0.00000000 0.0000000000 0.000000e+00
##                        feature       2006         2007         2008
##             2009         2010        2011     iv_mean       iv_sd     iv_cv
##            <num>        <num>       <num>       <num>       <num>     <num>
##  1: 4.621222e-01 8.422856e-01 0.859942148 0.426930943 0.352961204 0.8267407
##  2: 5.924409e-01 6.321927e-01 0.606230899 0.758111384 0.214452423 0.2828772
##  3: 1.854255e-01 5.333155e-01 0.256360488 0.243261209 0.153329116 0.6303065
##  4: 1.144506e-01 2.665679e-01 0.277692127 0.147884622 0.105117110 0.7108049
##  5: 6.419409e-02 6.151078e-02 0.009750168 0.094728440 0.091484016 0.9657503
##  6: 8.105297e-02 9.506581e-02 0.085192071 0.138115887 0.073550531 0.5325277
##  7: 7.888725e-02 6.263339e-02 0.077973060 0.124314519 0.065469886 0.5266471
##  8: 2.578620e-01 2.929885e-01 0.330281389 0.310794426 0.051179831 0.1646742
##  9: 8.151583e-02 4.631261e-02 0.030182379 0.059479067 0.049221346 0.8275407
## 10: 2.741262e-01 2.166643e-01 0.215367283 0.205801355 0.045952273 0.2232846
## 11: 1.073894e-01 1.639877e-01 0.190175633 0.139914414 0.034912878 0.2495302
## 12: 4.552508e-03 5.592710e-02 0.045623786 0.028136736 0.022976951 0.8166175
## 13: 4.331547e-03 3.525758e-03 0.004702935 0.012448674 0.015073701 1.2108679
## 14: 3.182288e-02 4.339132e-02 0.031673294 0.028309534 0.011200605 0.3956478
## 15: 4.142624e-03 4.419696e-03 0.026461944 0.007875103 0.009763861 1.2398391
## 16: 5.842527e-05 9.080121e-05 0.009307987 0.004175546 0.005734739 1.3734106
## 17: 0.000000e+00 0.000000e+00 0.000000000 0.000000000 0.000000000        NA
## 18: 0.000000e+00 0.000000e+00 0.000000000 0.000000000 0.000000000        NA
## 19: 0.000000e+00 0.000000e+00 0.000000000 0.000000000 0.000000000        NA
## 20: 0.000000e+00 0.000000e+00 0.000000000 0.000000000 0.000000000        NA
##             2009         2010        2011     iv_mean       iv_sd     iv_cv
# Save 
iv_wide_rounded <- copy(iv_wide)
cols <- setdiff(names(iv_wide_rounded), "feature")

iv_wide_rounded[, (cols) := lapply(.SD, round, 4), .SDcols = cols]

fwrite(iv_wide_rounded, file.path(OUTPUT_DIR, "iv_stability_by_year.csv"))
fwrite(woe_stability, file.path(OUTPUT_DIR, "woe_bin_stability_by_year.csv"))