Overview

This document preprocesses the Freddie Mac Single-Family Loan-Level Dataset to construct a panel dataset with a 12-month forward default indicator.

Setup

library(data.table)
library(fredr)

fredr_set_key("0ebdc36a1f51b8f30be241cffec62719")

Settings

DATA_DIR   <- "/Users/amalianimeskern/Library/CloudStorage/OneDrive-ErasmusUniversityRotterdam/Freddie Mac Data"
OUTPUT_DIR <- "/Users/amalianimeskern/Library/CloudStorage/OneDrive-ErasmusUniversityRotterdam/Freddie Mac Data"

years <- 2006:2010

Origination Data

Column Definitions

orig_positions <- c(1, 2, 3, 6, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 20, 21, 22, 23, 31)
orig_names <- c(
  "credit_score", "first_payment_date", "first_time_homebuyer",
  "mi_pct", "occupancy_status", "orig_cltv", "orig_dti", "orig_upb",
  "orig_ltv", "orig_interest_rate", "channel", "amortization_type",
  "property_state", "property_type", "loan_sequence_number",
  "loan_purpose", "orig_loan_term", "num_borrowers", "io_indicator"
)

Load Data

orig_list <- lapply(years, function(yr) {
  folder <- file.path(DATA_DIR, paste0("sample_", yr))
  fname  <- file.path(folder, paste0("sample_orig_", yr, ".txt"))
  
  if (!file.exists(fname)) {
    zipfile <- file.path(folder, paste0("sample_", yr, ".zip"))
    if (file.exists(zipfile)) unzip(zipfile, exdir = folder)
  }
  
  fread(fname, sep = "|", header = FALSE,
        select     = orig_positions,
        col.names  = orig_names,
        colClasses = "character",
        na.strings = c("", "NA"))
})

orig <- rbindlist(orig_list, fill = TRUE)

Cleaning

numeric_cols <- c("credit_score", "mi_pct", "orig_cltv", "orig_dti", 
                  "orig_upb", "orig_ltv", "orig_interest_rate", 
                  "orig_loan_term", "num_borrowers")
orig[, (numeric_cols) := lapply(.SD, as.numeric), .SDcols = numeric_cols]

orig[credit_score == 9999, credit_score := NA]
orig[orig_dti     == 999,  orig_dti     := NA]
orig[orig_ltv     == 999,  orig_ltv     := NA]
orig[orig_cltv    == 999,  orig_cltv    := NA]
orig[mi_pct       == 999,  mi_pct       := NA]
orig[num_borrowers == 99, num_borrowers := NA]
orig[orig_loan_term > 360, orig_loan_term := NA]

Time Features

orig[, first_payment_date := as.character(first_payment_date)]
orig[, orig_year    := as.integer(substr(first_payment_date, 1, 4))]
orig[, orig_month   := as.integer(substr(first_payment_date, 5, 6))]
orig[, orig_quarter := paste0(orig_year, "Q", ceiling(orig_month / 3))]
orig[, orig_quarter := factor(orig_quarter, levels = sort(unique(orig_quarter)))]

orig <- orig[orig_year >= 2006 & orig_year <= 2010]

Performance Data

Column Definitions

perf_positions <- c(1, 2, 3, 4, 5, 8, 9, 11, 12, 24)
perf_names     <- c(
  "loan_sequence_number", "monthly_reporting_period", "current_upb",
  "current_delinquency_status", "loan_age", "modification_flag",
  "zero_balance_code", "current_interest_rate", "current_deferred_upb",
  "step_modification_flag"
)

Load Data

perf_list <- lapply(years, function(yr) {
  folder <- file.path(DATA_DIR, paste0("sample_", yr))
  fname  <- file.path(folder, paste0("sample_svcg_", yr, ".txt"))
  
  if (!file.exists(fname)) {
    zipfile <- file.path(folder, paste0("sample_", yr, ".zip"))
    if (file.exists(zipfile)) unzip(zipfile, exdir = folder)
  }
  
  fread(fname, sep = "|", header = FALSE,
        select     = perf_positions,
        col.names  = perf_names,
        colClasses = "character",
        na.strings = c("", "NA"))
})

perf <- rbindlist(perf_list, fill = TRUE)

Cleaning

perf[, monthly_reporting_period   := as.integer(monthly_reporting_period)]
perf[, current_delinquency_status := suppressWarnings(as.integer(current_delinquency_status))]
perf[, loan_age                   := as.integer(loan_age)]
perf[, current_upb                := as.numeric(current_upb)]
perf[, current_interest_rate      := as.numeric(current_interest_rate)]
perf[, current_deferred_upb       := as.numeric(current_deferred_upb)]
perf[, zero_balance_code          := as.character(zero_balance_code)]

perf <- perf[monthly_reporting_period >= 200601 & monthly_reporting_period <= 201212]
perf <- perf[loan_sequence_number %in% orig$loan_sequence_number]

Feature Engineering

setorder(perf, loan_sequence_number, monthly_reporting_period)

perf <- merge(perf,
              orig[, .(loan_sequence_number, orig_interest_rate)],
              by = "loan_sequence_number", all.x = TRUE)

perf[, delta_interest_rate := current_interest_rate - orig_interest_rate]

perf[, mod_flag_bin := as.integer(modification_flag == "Y")]
perf[, mod_flag_12m := frollapply(mod_flag_bin, n = 12, FUN = max,
                                  fill = NA, align = "right"),
     by = loan_sequence_number]
perf[is.na(mod_flag_12m), mod_flag_12m := 0L]

perf[, c("mod_flag_bin", "modification_flag", "step_modification_flag",
         "current_interest_rate", "orig_interest_rate") := NULL]

Default Construction

perf[, is_default_event := as.integer(
  (!is.na(current_delinquency_status) & current_delinquency_status >= 3) |
    (zero_balance_code %in% c("03", "09"))
)]

first_default <- perf[is_default_event == 1,
                      .(first_default_month = min(monthly_reporting_period)),
                      by = loan_sequence_number]

perf <- merge(perf, first_default, by = "loan_sequence_number", all.x = TRUE)

perf <- perf[is.na(first_default_month) | monthly_reporting_period < first_default_month]

12-Month Forward Default

perf[, month_index := 12 * (monthly_reporting_period %/% 100) + (monthly_reporting_period %% 100)]
perf[, first_default_index := 12 * (first_default_month %/% 100) + (first_default_month %% 100)]

perf[, default_next_12m := as.integer(
  !is.na(first_default_index) &
    first_default_index > month_index &
    first_default_index <= month_index + 12
)]

Merge Panel

panel <- merge(perf, orig, by = "loan_sequence_number", all.x = FALSE)
panel[, c("is_default_event", "first_default_month") := NULL]

Macro Controls (FRED)

state_abbrevs <- c("AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA",
                   "HI","ID","IL","IN","IA","KS","KY","LA","ME","MD",
                   "MA","MI","MN","MS","MO","MT","NE","NV","NH","NJ",
                   "NM","NY","NC","ND","OH","OK","OR","PA","RI","SC",
                   "SD","TN","TX","UT","VT","VA","WA","WV","WI","WY","DC")

(Remaining macro merge code can stay identical — already chunk-safe)

Save Output

fwrite(panel, file.path(OUTPUT_DIR, "freddie_mac_panel.csv"))
saveRDS(panel, file.path(OUTPUT_DIR, "freddie_mac_panel.rds"))

set.seed(123)
sample_loans <- sample(panel[, unique(loan_sequence_number)], 500)
panel_sample <- panel[loan_sequence_number %in% sample_loans]
fwrite(panel_sample, file.path(OUTPUT_DIR, "freddie_mac_panel_sample.csv"))