This document preprocesses the Freddie Mac Single-Family Loan-Level Dataset to construct a panel dataset with a 12-month forward default indicator.
library(data.table)
library(fredr)
fredr_set_key("0ebdc36a1f51b8f30be241cffec62719")
DATA_DIR <- "/Users/amalianimeskern/Library/CloudStorage/OneDrive-ErasmusUniversityRotterdam/Freddie Mac Data"
OUTPUT_DIR <- "/Users/amalianimeskern/Library/CloudStorage/OneDrive-ErasmusUniversityRotterdam/Freddie Mac Data"
years <- 2006:2010
orig_positions <- c(1, 2, 3, 6, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 20, 21, 22, 23, 31)
orig_names <- c(
"credit_score", "first_payment_date", "first_time_homebuyer",
"mi_pct", "occupancy_status", "orig_cltv", "orig_dti", "orig_upb",
"orig_ltv", "orig_interest_rate", "channel", "amortization_type",
"property_state", "property_type", "loan_sequence_number",
"loan_purpose", "orig_loan_term", "num_borrowers", "io_indicator"
)
orig_list <- lapply(years, function(yr) {
folder <- file.path(DATA_DIR, paste0("sample_", yr))
fname <- file.path(folder, paste0("sample_orig_", yr, ".txt"))
if (!file.exists(fname)) {
zipfile <- file.path(folder, paste0("sample_", yr, ".zip"))
if (file.exists(zipfile)) unzip(zipfile, exdir = folder)
}
fread(fname, sep = "|", header = FALSE,
select = orig_positions,
col.names = orig_names,
colClasses = "character",
na.strings = c("", "NA"))
})
orig <- rbindlist(orig_list, fill = TRUE)
numeric_cols <- c("credit_score", "mi_pct", "orig_cltv", "orig_dti",
"orig_upb", "orig_ltv", "orig_interest_rate",
"orig_loan_term", "num_borrowers")
orig[, (numeric_cols) := lapply(.SD, as.numeric), .SDcols = numeric_cols]
orig[credit_score == 9999, credit_score := NA]
orig[orig_dti == 999, orig_dti := NA]
orig[orig_ltv == 999, orig_ltv := NA]
orig[orig_cltv == 999, orig_cltv := NA]
orig[mi_pct == 999, mi_pct := NA]
orig[num_borrowers == 99, num_borrowers := NA]
orig[orig_loan_term > 360, orig_loan_term := NA]
orig[, first_payment_date := as.character(first_payment_date)]
orig[, orig_year := as.integer(substr(first_payment_date, 1, 4))]
orig[, orig_month := as.integer(substr(first_payment_date, 5, 6))]
orig[, orig_quarter := paste0(orig_year, "Q", ceiling(orig_month / 3))]
orig[, orig_quarter := factor(orig_quarter, levels = sort(unique(orig_quarter)))]
orig <- orig[orig_year >= 2006 & orig_year <= 2010]
perf_positions <- c(1, 2, 3, 4, 5, 8, 9, 11, 12, 24)
perf_names <- c(
"loan_sequence_number", "monthly_reporting_period", "current_upb",
"current_delinquency_status", "loan_age", "modification_flag",
"zero_balance_code", "current_interest_rate", "current_deferred_upb",
"step_modification_flag"
)
perf_list <- lapply(years, function(yr) {
folder <- file.path(DATA_DIR, paste0("sample_", yr))
fname <- file.path(folder, paste0("sample_svcg_", yr, ".txt"))
if (!file.exists(fname)) {
zipfile <- file.path(folder, paste0("sample_", yr, ".zip"))
if (file.exists(zipfile)) unzip(zipfile, exdir = folder)
}
fread(fname, sep = "|", header = FALSE,
select = perf_positions,
col.names = perf_names,
colClasses = "character",
na.strings = c("", "NA"))
})
perf <- rbindlist(perf_list, fill = TRUE)
perf[, monthly_reporting_period := as.integer(monthly_reporting_period)]
perf[, current_delinquency_status := suppressWarnings(as.integer(current_delinquency_status))]
perf[, loan_age := as.integer(loan_age)]
perf[, current_upb := as.numeric(current_upb)]
perf[, current_interest_rate := as.numeric(current_interest_rate)]
perf[, current_deferred_upb := as.numeric(current_deferred_upb)]
perf[, zero_balance_code := as.character(zero_balance_code)]
perf <- perf[monthly_reporting_period >= 200601 & monthly_reporting_period <= 201212]
perf <- perf[loan_sequence_number %in% orig$loan_sequence_number]
setorder(perf, loan_sequence_number, monthly_reporting_period)
perf <- merge(perf,
orig[, .(loan_sequence_number, orig_interest_rate)],
by = "loan_sequence_number", all.x = TRUE)
perf[, delta_interest_rate := current_interest_rate - orig_interest_rate]
perf[, mod_flag_bin := as.integer(modification_flag == "Y")]
perf[, mod_flag_12m := frollapply(mod_flag_bin, n = 12, FUN = max,
fill = NA, align = "right"),
by = loan_sequence_number]
perf[is.na(mod_flag_12m), mod_flag_12m := 0L]
perf[, c("mod_flag_bin", "modification_flag", "step_modification_flag",
"current_interest_rate", "orig_interest_rate") := NULL]
perf[, is_default_event := as.integer(
(!is.na(current_delinquency_status) & current_delinquency_status >= 3) |
(zero_balance_code %in% c("03", "09"))
)]
first_default <- perf[is_default_event == 1,
.(first_default_month = min(monthly_reporting_period)),
by = loan_sequence_number]
perf <- merge(perf, first_default, by = "loan_sequence_number", all.x = TRUE)
perf <- perf[is.na(first_default_month) | monthly_reporting_period < first_default_month]
perf[, month_index := 12 * (monthly_reporting_period %/% 100) + (monthly_reporting_period %% 100)]
perf[, first_default_index := 12 * (first_default_month %/% 100) + (first_default_month %% 100)]
perf[, default_next_12m := as.integer(
!is.na(first_default_index) &
first_default_index > month_index &
first_default_index <= month_index + 12
)]
panel <- merge(perf, orig, by = "loan_sequence_number", all.x = FALSE)
panel[, c("is_default_event", "first_default_month") := NULL]
state_abbrevs <- c("AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA",
"HI","ID","IL","IN","IA","KS","KY","LA","ME","MD",
"MA","MI","MN","MS","MO","MT","NE","NV","NH","NJ",
"NM","NY","NC","ND","OH","OK","OR","PA","RI","SC",
"SD","TN","TX","UT","VT","VA","WA","WV","WI","WY","DC")
(Remaining macro merge code can stay identical — already chunk-safe)
fwrite(panel, file.path(OUTPUT_DIR, "freddie_mac_panel.csv"))
saveRDS(panel, file.path(OUTPUT_DIR, "freddie_mac_panel.rds"))
set.seed(123)
sample_loans <- sample(panel[, unique(loan_sequence_number)], 500)
panel_sample <- panel[loan_sequence_number %in% sample_loans]
fwrite(panel_sample, file.path(OUTPUT_DIR, "freddie_mac_panel_sample.csv"))