Freddie Mac Single-Family Loan-Level Dataset - Preprocessing

Panel dataset with 12-month forward default indicator.
Origination vintages: 2006–2010, observation window: 2006–2012.

# Freddie Mac Single-Family Loan-Level Dataset - Preprocessing
# Panel dataset with 12-month forward default indicator
# Origination vintages: 2006-2010, observation window: 2006-2012

library(data.table)
library(fredr)

fredr_set_key("0ebdc36a1f51b8f30be241cffec62719")

# --- Settings ---

DATA_DIR   <- "/Users/amalianimeskern/Library/CloudStorage/OneDrive-ErasmusUniversityRotterdam/Freddie Mac Data"
OUTPUT_DIR <- "/Users/amalianimeskern/Library/CloudStorage/OneDrive-ErasmusUniversityRotterdam/Freddie Mac Data"

years <- 2006:2010

# --- Origination columns ---

orig_positions <- c(1, 2, 3, 6, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 20, 21, 22, 23, 31)
orig_names <- c(
  "credit_score", "first_payment_date", "first_time_homebuyer",
  "mi_pct", "occupancy_status", "orig_cltv", "orig_dti", "orig_upb",
  "orig_ltv", "orig_interest_rate", "channel", "amortization_type",
  "property_state", "property_type", "loan_sequence_number",
  "loan_purpose", "orig_loan_term", "num_borrowers", "io_indicator"
)

# --- Load origination files ---

orig_list <- lapply(years, function(yr) {
  folder <- file.path(DATA_DIR, paste0("sample_", yr))
  fname  <- file.path(folder, paste0("sample_orig_", yr, ".txt"))
  
  if (!file.exists(fname)) {
    zipfile <- file.path(folder, paste0("sample_", yr, ".zip"))
    if (file.exists(zipfile)) unzip(zipfile, exdir = folder)
  }
  
  fread(fname, sep = "|", header = FALSE,
        select     = orig_positions,
        col.names  = orig_names,
        colClasses = "character",
        na.strings = c("", "NA"))
})

orig <- rbindlist(orig_list, fill = TRUE)

# --- Clean origination data ---

numeric_cols <- c("credit_score", "mi_pct", "orig_cltv", "orig_dti", 
                  "orig_upb", "orig_ltv", "orig_interest_rate", 
                  "orig_loan_term", "num_borrowers")
orig[, (numeric_cols) := lapply(.SD, as.numeric), .SDcols = numeric_cols]

orig[credit_score == 9999, credit_score := NA]
orig[orig_dti     == 999,  orig_dti     := NA]
orig[orig_ltv     == 999,  orig_ltv     := NA]
orig[orig_cltv    == 999,  orig_cltv    := NA]
orig[mi_pct       == 999,  mi_pct       := NA]
orig[num_borrowers == 99, num_borrowers := NA]
orig[orig_loan_term > 360, orig_loan_term := NA]
orig[orig_cltv > 200, orig_cltv := NA]
orig[first_time_homebuyer == "9", first_time_homebuyer := NA]

orig[, first_payment_date := as.character(first_payment_date)]
orig[, orig_year    := as.integer(substr(first_payment_date, 1, 4))]
orig[, orig_month   := as.integer(substr(first_payment_date, 5, 6))]
orig[, orig_quarter := paste0(orig_year, "Q", ceiling(orig_month / 3))]
orig[, orig_quarter := factor(orig_quarter, levels = sort(unique(orig_quarter)))]

orig <- orig[orig_year >= 2006 & orig_year <= 2010]

# --- Performance columns ---

perf_positions <- c(1, 2, 3, 4, 5, 8, 9, 11, 12, 24)
perf_names     <- c(
  "loan_sequence_number", "monthly_reporting_period", "current_upb",
  "current_delinquency_status", "loan_age", "modification_flag",
  "zero_balance_code", "current_interest_rate", "current_deferred_upb",
  "step_modification_flag"
)

# --- Load performance files ---

perf_list <- lapply(years, function(yr) {
  folder <- file.path(DATA_DIR, paste0("sample_", yr))
  fname  <- file.path(folder, paste0("sample_svcg_", yr, ".txt"))
  
  if (!file.exists(fname)) {
    zipfile <- file.path(folder, paste0("sample_", yr, ".zip"))
    if (file.exists(zipfile)) unzip(zipfile, exdir = folder)
  }
  
  fread(fname, sep = "|", header = FALSE,
        select     = perf_positions,
        col.names  = perf_names,
        colClasses = "character",
        na.strings = c("", "NA"))
})

perf <- rbindlist(perf_list, fill = TRUE)

# --- Clean performance data ---

perf[, monthly_reporting_period   := as.integer(monthly_reporting_period)]
perf[, current_delinquency_status := suppressWarnings(as.integer(current_delinquency_status))]
perf[, loan_age                   := as.integer(loan_age)]
perf[, current_upb                := as.numeric(current_upb)]
perf[, current_interest_rate      := as.numeric(current_interest_rate)]
perf[, current_deferred_upb       := as.numeric(current_deferred_upb)]
perf[, zero_balance_code          := as.character(zero_balance_code)]

perf <- perf[monthly_reporting_period >= 200601 & monthly_reporting_period <= 201212]
perf <- perf[loan_sequence_number %in% orig$loan_sequence_number]

# --- Feature engineering ---

setorder(perf, loan_sequence_number, monthly_reporting_period)

perf <- merge(perf,
              orig[, .(loan_sequence_number, orig_interest_rate)],
              by = "loan_sequence_number", all.x = TRUE)

perf[, delta_interest_rate := current_interest_rate - orig_interest_rate]

perf[, mod_flag_bin := as.integer(modification_flag == "Y")]
perf[, mod_flag_12m := frollapply(mod_flag_bin, n = 12, FUN = max,
                                  fill = NA, align = "right"),
     by = loan_sequence_number]
perf[is.na(mod_flag_12m), mod_flag_12m := 0L]

perf[, c("mod_flag_bin", "modification_flag", "step_modification_flag",
         "current_interest_rate", "orig_interest_rate") := NULL]

# --- Construct default event and truncate panel ---

perf[, is_default_event := as.integer(
  (!is.na(current_delinquency_status) & current_delinquency_status >= 3) |
    (zero_balance_code %in% c("03", "09"))
)]

first_default <- perf[is_default_event == 1,
                      .(first_default_month = min(monthly_reporting_period)),
                      by = loan_sequence_number]

perf <- merge(perf, first_default, by = "loan_sequence_number", all.x = TRUE)

perf <- perf[is.na(first_default_month) | monthly_reporting_period < first_default_month]

# --- Construct 12-month forward default outcome ---

perf[, month_index := 12 * (monthly_reporting_period %/% 100) + (monthly_reporting_period %% 100)]
perf[, first_default_index := 12 * (first_default_month %/% 100) + (first_default_month %% 100)]

perf[, default_next_12m := as.integer(
  !is.na(first_default_index) &
    first_default_index > month_index &
    first_default_index <= month_index + 12
)]

rows_1 <- perf[default_next_12m == 1,
               .(first_flag_month = min(monthly_reporting_period)),
               by = loan_sequence_number]

rows_0 <- perf[default_next_12m == 0]
rows_0 <- merge(rows_0, rows_1, by = "loan_sequence_number", all.x = TRUE)
rows_0 <- rows_0[is.na(first_flag_month) | monthly_reporting_period < first_flag_month]
rows_0[, first_flag_month := NULL]

rows_1_full <- merge(perf[default_next_12m == 1], rows_1, by = "loan_sequence_number")
rows_1_full <- rows_1_full[monthly_reporting_period == first_flag_month]
rows_1_full[, first_flag_month := NULL]

perf <- rbindlist(list(rows_0, rows_1_full), fill = TRUE)
setorder(perf, loan_sequence_number, monthly_reporting_period)

perf <- perf[monthly_reporting_period <= 201112]

# --- Merge origination into the panel ---

panel <- merge(perf, orig, by = "loan_sequence_number", all.x = FALSE)
panel[, c("is_default_event", "first_default_month") := NULL]

# --- Macro controls from FRED ---

state_abbrevs <- c("AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA",
                   "HI","ID","IL","IN","IA","KS","KY","LA","ME","MD",
                   "MA","MI","MN","MS","MO","MT","NE","NV","NH","NJ",
                   "NM","NY","NC","ND","OH","OK","OR","PA","RI","SC",
                   "SD","TN","TX","UT","VT","VA","WA","WV","WI","WY","DC")

unemp_list <- lapply(state_abbrevs, function(st) {
  series_id <- paste0(st, "UR")
  dt <- tryCatch({
    fredr(series_id          = series_id,
          observation_start  = as.Date("2005-01-01"),
          observation_end    = as.Date("2011-12-01"),
          frequency          = "m")
  }, error = function(e) NULL)
  if (is.null(dt)) return(NULL)
  data.table(state_abbrev             = st,
             monthly_reporting_period  = as.integer(format(dt$date, "%Y%m")),
             unemployment_rate        = dt$value)
})

unemp <- rbindlist(unemp_list)

setorder(unemp, state_abbrev, monthly_reporting_period)
for (k in 1:4) {
  col <- paste0("unemployment_rate_lag", k)
  unemp[, (col) := shift(unemployment_rate, k), by = state_abbrev]
}

hpi_list <- lapply(state_abbrevs, function(st) {
  series_id <- paste0(st, "STHPI")
  dt <- tryCatch({
    fredr(series_id          = series_id,
          observation_start  = as.Date("2004-01-01"),
          observation_end    = as.Date("2012-12-01"),
          frequency          = "q")
  }, error = function(e) NULL)
  if (is.null(dt)) return(NULL)
  data.table(state_abbrev = st, date = dt$date, hpi = dt$value)
})

hpi <- rbindlist(hpi_list)

setorder(hpi, state_abbrev, date)
hpi[, hpi_qoq := (hpi / shift(hpi, n = 1) - 1) * 100, by = state_abbrev]

for (k in 1:4) {
  col <- paste0("hpi_qoq_qlag", k)
  hpi[, (col) := shift(hpi_qoq, k), by = state_abbrev]
}

hpi[, year    := as.integer(format(date, "%Y"))]
hpi[, quarter := as.integer(ceiling(as.integer(format(date, "%m")) / 3))]
hpi[, months  := lapply(quarter, function(q) ((q - 1) * 3 + 1):((q - 1) * 3 + 3))]

hpi_monthly <- hpi[, .(month = unlist(months)),
                   by = .(state_abbrev, year,
                          hpi_qoq_qlag1, hpi_qoq_qlag2,
                          hpi_qoq_qlag3, hpi_qoq_qlag4)]
hpi_monthly[, monthly_reporting_period := as.integer(year * 100 + month)]
hpi_monthly <- hpi_monthly[monthly_reporting_period >= 200601 &
                             monthly_reporting_period <= 201112]

unemp_cols <- c("state_abbrev", "monthly_reporting_period",
                paste0("unemployment_rate_lag", 1:4))
panel <- merge(panel, unemp[, ..unemp_cols],
               by.x = c("property_state", "monthly_reporting_period"),
               by.y = c("state_abbrev", "monthly_reporting_period"),
               all.x = TRUE)

hpi_cols <- c("state_abbrev", "monthly_reporting_period",
              paste0("hpi_qoq_qlag", 1:4))
panel <- merge(panel, hpi_monthly[, ..hpi_cols],
               by.x = c("property_state", "monthly_reporting_period"),
               by.y = c("state_abbrev", "monthly_reporting_period"),
               all.x = TRUE)

panel <- panel[!(property_state %in% c("PR", "GU", "VI"))]

# --- Save output ---

fwrite(panel, file.path(OUTPUT_DIR, "freddie_mac_panel.csv"))
saveRDS(panel, file.path(OUTPUT_DIR, "freddie_mac_panel.rds"))

set.seed(123)
sample_loans <- sample(panel[, unique(loan_sequence_number)], 500)
panel_sample <- panel[loan_sequence_number %in% sample_loans]
fwrite(panel_sample, file.path(OUTPUT_DIR, "freddie_mac_panel_sample.csv"))