# Stratified 60/20/20 loan-level split preserving default rates
library(data.table)
# Folder that holds the input panel and later receives the split files.
# NOTE(review): absolute, machine-specific path; the script only runs where
# this folder exists.
OUTPUT_DIR <- "/Users/amalianimeskern/Library/CloudStorage/OneDrive-ErasmusUniversityRotterdam/Freddie Mac Data"

# --- Load panel ---
panel <- readRDS(file = file.path(OUTPUT_DIR, "freddie_mac_panel.rds"))

# One row per loan: the flag is the maximum of default_next_12m over all of
# the loan's observations. max() is called without na.rm, so a loan with any
# NA in default_next_12m gets an NA flag.
loan_ids <- panel[,
  list(ever_default = max(default_next_12m)),
  by = "loan_sequence_number"
]

# --- Stratified split ---
# Fixed seed so the random assignment below is reproducible.
set.seed(123)

# Two strata, split separately further down. A loan whose flag is NA matches
# neither filter and ends up in neither stratum.
non_defaults <- loan_ids[ever_default == 0]
defaults <- loan_ids[ever_default == 1]
#' Randomly assign each row of a loan-level table to train / valid / test.
#'
#' Rows are shuffled once and labelled in shuffled order: the first
#' `round(train_frac * n)` rows become "train", the next
#' `round(valid_frac * n)` rows "valid", and the remainder "test". The label is
#' written to a `split` column that is added to `dt` by reference.
#'
#' @param dt A data.table with one row per unit to assign. Modified in place.
#' @param train_frac Share of rows labelled "train" (default 0.6).
#' @param valid_frac Share of rows labelled "valid" (default 0.2).
#' @return `dt`, with the character column `split` added.
split_loans <- function(dt, train_frac = 0.6, valid_frac = 0.2) {
  stopifnot(
    is.numeric(train_frac), length(train_frac) == 1L,
    is.numeric(valid_frac), length(valid_frac) == 1L,
    train_frac >= 0, valid_frac >= 0,
    train_frac + valid_frac <= 1 + 1e-8
  )

  n <- nrow(dt)

  # sample.int() draws the same permutation as sample(n) for n >= 1 and, unlike
  # sample(0) (which returns 0), yields an empty vector when there are no rows.
  idx <- sample.int(n)

  n_train <- min(round(train_frac * n), n)
  # Cap so that rounding can never push train + valid past n.
  n_valid <- min(round(valid_frac * n), n - n_train)
  n_test <- n - n_train - n_valid

  # rep() tolerates zero counts. Ranges such as 1:0 or (n + 1):n would run
  # backwards and overwrite labels that were already assigned.
  labels <- rep(
    c("train", "valid", "test"),
    times = c(n_train, n_valid, n_test)
  )

  # Position k of the shuffle receives label k; store the result in row order.
  assignment <- character(n)
  assignment[idx] <- labels

  # set() adds the column by reference without evaluating names inside dt, so
  # a column that happens to be called `assignment` cannot shadow the vector.
  set(dt, j = "split", value = assignment)
  dt
}
# Split each stratum separately so that every subset keeps roughly the same
# share of defaulted loans. Defaults are processed first; this order determines
# how the seeded random stream is consumed.
defaults <- split_loans(defaults)
non_defaults <- split_loans(non_defaults)
loan_splits <- rbindlist(list(defaults, non_defaults))

# --- Map split labels to panel ---
# merge() performs an inner join here: observations of loans that have no row
# in loan_splits (e.g. loans whose ever_default flag is NA and therefore sit in
# neither stratum) are dropped. Report that instead of losing rows silently.
n_obs_before <- nrow(panel)
panel <- merge(panel, loan_splits[, .(loan_sequence_number, split)],
               by = "loan_sequence_number")
if (nrow(panel) != n_obs_before) {
  warning(
    "Mapping split labels changed the panel from ", n_obs_before, " to ",
    nrow(panel), " observations; some loans have no (or no unique) split label.",
    call. = FALSE
  )
}
# --- Verification ---
# Loan-level: number of loans, defaulted loans and default rate (%) per split
loan_splits[,
  list(
    n_loans = .N,
    n_default = sum(ever_default),
    default_rate = round(mean(ever_default) * 100, 2)
  ),
  by = "split"
]
## split n_loans n_default default_rate
## <char> <int> <int> <num>
## 1: train 144339 10661 7.39
## 2: valid 48113 3554 7.39
## 3: test 48113 3553 7.38
# Observation-level: same summary computed over panel rows (default_next_12m)
panel[,
  list(
    n_obs = .N,
    n_default = sum(default_next_12m),
    default_rate = round(mean(default_next_12m) * 100, 4)
  ),
  by = "split"
]
## split n_obs n_default default_rate
## <char> <int> <int> <num>
## 1: test 1461257 3553 0.2431
## 2: valid 1465449 3554 0.2425
## 3: train 4386259 10661 0.2431
# --- Save ---
# One dataset per split label. Each row subset is a new table, so dropping the
# helper `split` column from it leaves `panel` itself untouched.
train <- panel[split == "train"][, split := NULL]
valid <- panel[split == "valid"][, split := NULL]
test <- panel[split == "test"][, split := NULL]

# Write the three datasets into the same folder the panel was read from.
saveRDS(train, file = file.path(OUTPUT_DIR, "train.rds"))
saveRDS(valid, file = file.path(OUTPUT_DIR, "valid.rds"))
saveRDS(test, file = file.path(OUTPUT_DIR, "test.rds"))