# Panel Data Splitting

# Stratified 60/20/20 (train/validation/test) split at the loan level,
# preserving the default rate in each subset.

library(data.table)

# Directory that holds the input panel and receives the train/valid/test files.
# Set the FREDDIE_MAC_DIR environment variable to run the script on another
# machine; when it is unset or empty, the original location is used unchanged.
OUTPUT_DIR <- Sys.getenv("FREDDIE_MAC_DIR", unset = NA_character_)
if (is.na(OUTPUT_DIR) || !nzchar(OUTPUT_DIR)) {
  OUTPUT_DIR <- "/Users/amalianimeskern/Library/CloudStorage/OneDrive-ErasmusUniversityRotterdam/Freddie Mac Data"
}

# --- Load panel ---
panel <- readRDS(file.path(OUTPUT_DIR, "freddie_mac_panel.rds"))

# Normalise to a data.table in place: the file could hold a plain data.frame,
# and data.table recommends setDT() on tables restored with readRDS() so that
# later by-reference operations have their column over-allocation back.
setDT(panel)
stopifnot(all(c("loan_sequence_number", "default_next_12m") %in% names(panel)))

# Loan-level ever_default flag: the maximum of default_next_12m over all
# observations of a loan (1 if any observation is flagged, for a 0/1 column).
loan_ids <- panel[, .(ever_default = max(default_next_12m)), by = loan_sequence_number]

# max() returns NA as soon as one observation of a loan is NA. Such a loan
# matches neither `ever_default == 1` nor `ever_default == 0` further down, so
# it receives no split label and the merge removes all of its observations.
# Behaviour is unchanged here; the warning only makes the exclusion visible.
if (anyNA(loan_ids$ever_default)) {
  warning(
    sum(is.na(loan_ids$ever_default)),
    " loan(s) have a missing ever_default flag and will be excluded from all splits",
    call. = FALSE
  )
}

# --- Stratified split ---
# Separate the loan-level table into the two strata that are split
# independently: loans that never default and loans that default at least once.
non_defaults <- loan_ids[ever_default == 0]
defaults     <- loan_ids[ever_default == 1]

# Fix the RNG state so the random assignment below is reproducible.
set.seed(123)

#' Randomly assign every row of a loan-level table to train / valid / test.
#'
#' Adds (or overwrites) a character column `split` on `dt` by reference and
#' returns `dt`. Segment sizes are `round(p_train * n)` for train,
#' `round(p_valid * n)` for valid (capped so the total never exceeds `n`), and
#' the remainder for test.
#'
#' For a given RNG seed the assignment is the same as taking the first
#' `n_train` entries of a random permutation as train, the next `n_valid` as
#' valid and the rest as test. Unlike `a:b` index ranges, it stays correct when
#' a segment is empty (e.g. `n` of 0, 2 or 3), where a descending range would
#' relabel rows that were already assigned.
#'
#' @param dt A data.table with one row per loan. Modified by reference.
#' @param p_train Share of rows labelled "train" (default 0.6).
#' @param p_valid Share of rows labelled "valid" (default 0.2).
#' @return `dt`, with the `split` column set to "train", "valid" or "test".
split_loans <- function(dt, p_train = 0.6, p_valid = 0.2) {
  stopifnot(
    data.table::is.data.table(dt),
    is.numeric(p_train), length(p_train) == 1L, p_train >= 0,
    is.numeric(p_valid), length(p_valid) == 1L, p_valid >= 0,
    p_train + p_valid <= 1 + 1e-8
  )

  n <- nrow(dt)
  n_train <- round(p_train * n)
  n_valid <- min(round(p_valid * n), n - n_train)
  n_test <- n - n_train - n_valid

  # sample.int() rather than sample(): sample(0) returns 0 instead of an empty
  # permutation. For n >= 1 both draw the identical permutation.
  idx <- sample.int(n)

  # Position k of the permutation receives the k-th label; rep() copes with
  # zero-length segments, so no row is ever labelled twice or left unlabelled.
  labels <- character(n)
  labels[idx] <- rep(
    c("train", "valid", "test"),
    times = c(n_train, n_valid, n_test)
  )

  # set() takes the column name as a string, so a column in `dt` that happens
  # to share a name with a local variable cannot shadow it.
  data.table::set(dt, j = "split", value = labels)
  dt
}

# Label each stratum separately. Defaults are drawn first, then non-defaults;
# the order is kept because both draws consume the same RNG stream.
defaults     <- split_loans(dt = defaults)
non_defaults <- split_loans(dt = non_defaults)

# Stack the two labelled strata back into a single loan-level table.
loan_splits <- rbindlist(l = list(defaults, non_defaults))

# --- Map split labels to panel ---
# Join on the loan identifier so every observation inherits its loan's label;
# only the identifier and the label are carried over from the loan table.
panel <- merge(
  x = panel,
  y = loan_splits[, c("loan_sequence_number", "split"), with = FALSE],
  by = "loan_sequence_number"
)

# --- Verification ---
# Loan-level: number of loans, number of ever-defaulting loans and the
# default rate (in percent) within each split.
loan_splits[
  ,
  .(
    n_loans      = .N,
    n_default    = sum(ever_default),
    default_rate = round(mean(ever_default) * 100, 2)
  ),
  by = "split"
]
##     split n_loans n_default default_rate
##    <char>   <int>     <int>        <num>
## 1:  train  144339     10661         7.39
## 2:  valid   48113      3554         7.39
## 3:   test   48113      3553         7.38

# Observation-level: number of loan-month rows, number of rows flagged
# default_next_12m and the corresponding rate (in percent) within each split.
panel[
  ,
  .(
    n_obs        = .N,
    n_default    = sum(default_next_12m),
    default_rate = round(mean(default_next_12m) * 100, 4)
  ),
  by = "split"
]
##     split   n_obs n_default default_rate
##    <char>   <int>     <int>        <num>
## 1:   test 1461257      3553       0.2431
## 2:  valid 1465449      3554       0.2425
## 3:  train 4386259     10661       0.2431
# --- Save ---
# For each split: take its observations from the panel, drop the helper
# `split` column (no longer needed once the rows are separated), and write
# the result next to the input panel.

# Training set
train <- panel[split == "train"]
set(train, j = "split", value = NULL)
saveRDS(train, file = file.path(OUTPUT_DIR, "train.rds"))

# Validation set
valid <- panel[split == "valid"]
set(valid, j = "split", value = NULL)
saveRDS(valid, file = file.path(OUTPUT_DIR, "valid.rds"))

# Test set
test <- panel[split == "test"]
set(test, j = "split", value = NULL)
saveRDS(test, file = file.path(OUTPUT_DIR, "test.rds"))