Exploratory Data Analysis
library(data.table)
library(ggplot2)
library(stargazer)
library(patchwork)
# --- Load data ---
OUTPUT_DIR <- "/Users/amalianimeskern/Library/CloudStorage/OneDrive-ErasmusUniversityRotterdam/Freddie Mac Data"
panel <- readRDS(file.path(OUTPUT_DIR, "freddie_mac_panel.rds"))
# Note: Panel dataset is restricted to properties in the 50 U.S. states and DC.
# Loans in U.S. territories (PR, GU, VI) were excluded during preprocessing.
# Number of unique loans
panel[, uniqueN(loan_sequence_number)]
## [1] 240565
# --- Sanity check: geography ---
setdiff(unique(panel$property_state),
c("AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA",
"HI","ID","IL","IN","IA","KS","KY","LA","ME","MD",
"MA","MI","MN","MS","MO","MT","NE","NV","NH","NJ",
"NM","NY","NC","ND","OH","OK","OR","PA","RI","SC",
"SD","TN","TX","UT","VT","VA","WA","WV","WI","WY","DC"))
## character(0)
# --- Sanity check: Missingness ---
missingness <- data.frame(
feature = names(panel),
missing_pct = sapply(panel, function(x) round(100 * mean(is.na(x)), 2))
)
missingness <- missingness[order(-missingness$missing_pct), ]
print(missingness)
## feature missing_pct
## zero_balance_code zero_balance_code 98.64
## first_default_index first_default_index 94.65
## orig_dti orig_dti 6.19
## orig_loan_term orig_loan_term 0.09
## credit_score credit_score 0.07
## num_borrowers num_borrowers 0.03
## first_time_homebuyer first_time_homebuyer 0.02
## orig_cltv orig_cltv 0.01
## orig_ltv orig_ltv 0.01
## property_state property_state 0.00
## monthly_reporting_period monthly_reporting_period 0.00
## loan_sequence_number loan_sequence_number 0.00
## current_upb current_upb 0.00
## current_delinquency_status current_delinquency_status 0.00
## loan_age loan_age 0.00
## current_deferred_upb current_deferred_upb 0.00
## delta_interest_rate delta_interest_rate 0.00
## mod_flag_12m mod_flag_12m 0.00
## month_index month_index 0.00
## default_next_12m default_next_12m 0.00
## first_payment_date first_payment_date 0.00
## mi_pct mi_pct 0.00
## occupancy_status occupancy_status 0.00
## orig_upb orig_upb 0.00
## orig_interest_rate orig_interest_rate 0.00
## channel channel 0.00
## amortization_type amortization_type 0.00
## property_type property_type 0.00
## loan_purpose loan_purpose 0.00
## io_indicator io_indicator 0.00
## orig_year orig_year 0.00
## orig_month orig_month 0.00
## orig_quarter orig_quarter 0.00
## unemployment_rate_lag1 unemployment_rate_lag1 0.00
## unemployment_rate_lag2 unemployment_rate_lag2 0.00
## unemployment_rate_lag3 unemployment_rate_lag3 0.00
## unemployment_rate_lag4 unemployment_rate_lag4 0.00
## hpi_qoq_qlag1 hpi_qoq_qlag1 0.00
## hpi_qoq_qlag2 hpi_qoq_qlag2 0.00
## hpi_qoq_qlag3 hpi_qoq_qlag3 0.00
## hpi_qoq_qlag4 hpi_qoq_qlag4 0.00
key_vars <- c("credit_score", "orig_ltv", "orig_cltv", "orig_dti",
"orig_interest_rate", "current_upb", "current_deferred_upb",
"delta_interest_rate", "mod_flag_12m",
paste0("unemployment_rate_lag", 1:4),
paste0("hpi_qoq_qlag", 1:4))
miss_key <- missingness[missingness$feature %in% key_vars, ]
print(miss_key)
## feature missing_pct
## orig_dti orig_dti 6.19
## credit_score credit_score 0.07
## orig_cltv orig_cltv 0.01
## orig_ltv orig_ltv 0.01
## current_upb current_upb 0.00
## current_deferred_upb current_deferred_upb 0.00
## delta_interest_rate delta_interest_rate 0.00
## mod_flag_12m mod_flag_12m 0.00
## orig_interest_rate orig_interest_rate 0.00
## unemployment_rate_lag1 unemployment_rate_lag1 0.00
## unemployment_rate_lag2 unemployment_rate_lag2 0.00
## unemployment_rate_lag3 unemployment_rate_lag3 0.00
## unemployment_rate_lag4 unemployment_rate_lag4 0.00
## hpi_qoq_qlag1 hpi_qoq_qlag1 0.00
## hpi_qoq_qlag2 hpi_qoq_qlag2 0.00
## hpi_qoq_qlag3 hpi_qoq_qlag3 0.00
## hpi_qoq_qlag4 hpi_qoq_qlag4 0.00
# --- Class imbalance ---
# Observation level
table(panel$default_next_12m)
##
## 0 1
## 7295197 17768
prop.table(table(panel$default_next_12m)) * 100
##
## 0 1
## 99.7570343 0.2429657
# Loan level
panel[, .(ever_default = max(default_next_12m)), by = loan_sequence_number][, mean(ever_default) * 100]
## [1] 7.385946
# Average months
panel[, .N, by = loan_sequence_number][, summary(N)]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 18.0 28.0 30.4 41.0 72.0
# --- Default rate by origination year (loan level) ---
setorder(panel, loan_sequence_number, monthly_reporting_period)
cols_to_keep <- c("orig_year", "credit_score", "orig_ltv", "orig_cltv",
"orig_dti", "orig_interest_rate", "orig_upb", "orig_loan_term",
"num_borrowers", "mi_pct", "occupancy_status", "amortization_type",
"loan_purpose", "channel", "property_type",
"first_time_homebuyer", "io_indicator")
loan_level <- panel[, .(ever_default_12m = max(default_next_12m, na.rm = TRUE)),
by = loan_sequence_number]
first_obs <- panel[panel[, .I[1], by = loan_sequence_number]$V1,
c("loan_sequence_number", cols_to_keep), with = FALSE]
loan_level <- merge(loan_level, first_obs, by = "loan_sequence_number")
default_by_year <- loan_level[, .(
n_loans = .N,
default_rate = round(100 * mean(ever_default_12m), 2)
), by = orig_year][order(orig_year)]
print(default_by_year)
## orig_year n_loans default_rate
## <int> <int> <num>
## 1: 2006 41258 12.12
## 2: 2007 49688 13.74
## 3: 2008 49999 9.02
## 4: 2009 49638 1.82
## 5: 2010 49982 1.05
ggplot(default_by_year, aes(x = orig_year, y = default_rate)) +
geom_line() +
geom_point() +
labs(title = "Default Rate by Origination Year",
x = "Origination Year",
y = "Default Rate (%)") +
theme_classic()

# --- Feature distributions by default status ---
# Origination variables
continuous_orig <- c("credit_score", "orig_ltv", "orig_dti", "orig_interest_rate")
feature_stats_orig <- rbindlist(lapply(continuous_orig, function(var) {
loan_level[, .(
variable = var,
mean_default = round(mean(get(var)[ever_default_12m == 1], na.rm = TRUE), 2),
mean_no_default = round(mean(get(var)[ever_default_12m == 0], na.rm = TRUE), 2),
sd_default = round(sd(get(var)[ever_default_12m == 1], na.rm = TRUE), 2),
sd_no_default = round(sd(get(var)[ever_default_12m == 0], na.rm = TRUE), 2)
)]
}))
print(feature_stats_orig)
## variable mean_default mean_no_default sd_default sd_no_default
## <char> <num> <num> <num> <num>
## 1: credit_score 690.72 745.61 55.62 51.54
## 2: orig_ltv 77.99 68.54 13.42 18.08
## 3: orig_dti 40.96 34.37 11.40 12.14
## 4: orig_interest_rate 6.41 5.70 0.55 0.81
# Behavioural + macro variables — observation level
continuous_behav <- c("current_delinquency_status", "current_upb",
"delta_interest_rate", "mod_flag_12m",
"current_deferred_upb",
"unemployment_rate_lag1", "hpi_qoq_qlag1")
feature_stats_behav <- rbindlist(lapply(continuous_behav, function(var) {
panel[, .(
variable = var,
mean_default = round(mean(get(var)[default_next_12m == 1], na.rm = TRUE), 2),
mean_no_default = round(mean(get(var)[default_next_12m == 0], na.rm = TRUE), 2),
sd_default = round(sd(get(var)[default_next_12m == 1], na.rm = TRUE), 2),
sd_no_default = round(sd(get(var)[default_next_12m == 0], na.rm = TRUE), 2)
)]
}))
print(feature_stats_behav)
## variable mean_default mean_no_default sd_default
## <char> <num> <num> <num>
## 1: current_delinquency_status 0.12 0.01 0.37
## 2: current_upb 198189.42 176828.25 98538.34
## 3: delta_interest_rate -0.01 0.00 0.15
## 4: mod_flag_12m 0.00 0.00 0.02
## 5: current_deferred_upb 4.57 3.91 537.48
## 6: unemployment_rate_lag1 7.77 7.96 2.53
## 7: hpi_qoq_qlag1 -1.76 -0.98 2.40
## sd_no_default
## <num>
## 1: 0.12
## 2: 102407.95
## 3: 0.09
## 4: 0.01
## 5: 653.89
## 6: 2.52
## 7: 1.87
# --- DTI distribution ---
ggplot(panel[!is.na(orig_dti)],
aes(x = orig_dti)) +
geom_histogram(binwidth = 2, fill = "orange", color = "white") +
labs(title = "DTI Distribution", x = "DTI", y = "Count") +
theme_classic()

# --- Delinquency status distribution ---
ggplot(panel[!is.na(current_delinquency_status)],
aes(x = factor(current_delinquency_status),
fill = factor(default_next_12m))) +
geom_bar(position = "fill") +
scale_fill_manual(values = c("steelblue", "orange"),
labels = c("No Default", "Default")) +
labs(title = "12-Month Default Rate by Current Delinquency Status",
x = "Current Delinquency Status (months past due)",
y = "Proportion",
fill = "") +
theme_classic()

# --- Macro controls over time ---
# Average lagged unemployment and HPI by month across all loans
macro_time <- panel[, .(
avg_unemp = mean(unemployment_rate_lag1, na.rm = TRUE),
avg_hpi = mean(hpi_qoq_qlag1, na.rm = TRUE)
), by = monthly_reporting_period][order(monthly_reporting_period)]
macro_time[, date := as.Date(paste0(
substr(as.character(monthly_reporting_period), 1, 4), "-",
substr(as.character(monthly_reporting_period), 5, 6), "-01"
))]
macro_time <- panel[, .(
unemp_lag1 = mean(unemployment_rate_lag1, na.rm = TRUE),
unemp_lag2 = mean(unemployment_rate_lag2, na.rm = TRUE),
unemp_lag3 = mean(unemployment_rate_lag3, na.rm = TRUE),
unemp_lag4 = mean(unemployment_rate_lag4, na.rm = TRUE),
hpi_lag1 = mean(hpi_qoq_qlag1, na.rm = TRUE),
hpi_lag2 = mean(hpi_qoq_qlag2, na.rm = TRUE),
hpi_lag3 = mean(hpi_qoq_qlag3, na.rm = TRUE),
hpi_lag4 = mean(hpi_qoq_qlag4, na.rm = TRUE)
), by = monthly_reporting_period][order(monthly_reporting_period)]
macro_time[, date := as.Date(paste0(
substr(as.character(monthly_reporting_period), 1, 4), "-",
substr(as.character(monthly_reporting_period), 5, 6), "-01"
))]
# Reshape to long for faceting
unemp_long <- melt(macro_time, id.vars = "date",
measure.vars = paste0("unemp_lag", 1:4),
variable.name = "lag", value.name = "unemployment_rate")
unemp_long[, lag := factor(lag,
levels = paste0("unemp_lag", 1:4),
labels = paste0("Lag ", 1:4))]
hpi_long <- melt(macro_time, id.vars = "date",
measure.vars = paste0("hpi_lag", 1:4),
variable.name = "lag", value.name = "hpi_qoq")
hpi_long[, lag := factor(lag,
levels = paste0("hpi_lag", 1:4),
labels = paste0("Lag ", 1:4))]
p1 <- ggplot(unemp_long, aes(x = date, y = unemployment_rate)) +
geom_line(color = "steelblue") +
facet_wrap(~lag, ncol = 2) +
labs(title = "Average State Unemployment Rate Over Time",
x = "", y = "Unemployment Rate (%)") +
theme_classic()
p2 <- ggplot(hpi_long, aes(x = date, y = hpi_qoq)) +
geom_line(color = "darkorange") +
geom_hline(yintercept = 0, linetype = "dashed", color = "grey50") +
facet_wrap(~lag, ncol = 2) +
labs(title = "Average State HPI QoQ Change Over Time",
x = "", y = "HPI QoQ Change (%)") +
theme_classic()
p1 / p2

# --- Variable dictionary ---
variable_dict <- data.frame(
variable = c(
"loan_sequence_number",
"monthly_reporting_period",
"month_index",
"first_default_index",
"first_payment_date",
"orig_year",
"orig_month",
"orig_quarter",
"credit_score", "orig_ltv", "orig_cltv", "orig_dti",
"orig_interest_rate", "orig_upb", "orig_loan_term",
"num_borrowers", "mi_pct", "occupancy_status",
"amortization_type", "loan_purpose", "channel",
"property_type", "property_state", "first_time_homebuyer",
"io_indicator",
"current_delinquency_status", "loan_age", "mod_flag_12m",
"delta_interest_rate", "current_upb", "current_deferred_upb",
"zero_balance_code",
paste0("unemployment_rate_lag", 1:4),
paste0("hpi_qoq_qlag", 1:4),
"default_next_12m"
),
type = c(
rep("Identifier", 2),
rep("Temporal", 6),
rep("Origination", 17),
rep("Behavioural", 7),
rep("Macro", 8),
"Outcome"
),
description = c(
"Unique loan identifier used to link origination and performance records",
"Year-month of the performance observation (YYYYMM integer format)",
"Linear month index derived from monthly_reporting_period: 12 * (YYYYMM %/% 100) + (YYYYMM %% 100). Used to compute forward default horizon",
"Linear month index of the first default event (90+ DPD or zero balance code 03/09) for the loan; NA if no default observed in the data",
"Date of first scheduled payment (YYYYMM string); source for orig_year, orig_month, and orig_quarter",
"Calendar year of loan origination, extracted from first_payment_date (characters 1-4)",
"Calendar month of loan origination, extracted from first_payment_date (characters 5-6)",
"Origination quarter as ordered factor (e.g. '2007Q3'), derived from orig_year and orig_month",
"Borrower FICO credit score at origination. Recoded from 9999 to NA",
"Original loan-to-value ratio at origination (%). Recoded from 999 to NA",
"Original combined loan-to-value ratio including all liens (%). Recoded from 999 to NA",
"Original debt-to-income ratio at origination (%). Recoded from 999 to NA",
"Note rate at origination (%)",
"Original unpaid principal balance (USD)",
"Original loan term in months (e.g. 360 = 30-year). Values > 360 recoded to NA",
"Number of borrowers on the loan. Recoded from 99 to NA",
"Mortgage insurance coverage percentage. Recoded from 999 to NA",
"Occupancy status: P = primary residence, S = second home, I = investment property",
"Amortization type: FRM = fixed-rate mortgage, ARM = adjustable-rate mortgage",
"Loan purpose: P = purchase, R = rate-and-term refinance, C = cash-out refinance",
"Origination channel: R = retail, B = broker, C = correspondent, T = unspecified",
"Property type: SF = single family, CO = condo, MH = manufactured housing",
"Two-letter US state code of the mortgaged property (50 states + DC; territories PR/GU/VI excluded)",
"First-time homebuyer flag (Y/N)",
"Interest-only indicator (Y/N)",
"Number of months past due at observation month t (0 = current, 1 = 30 DPD, 2 = 60 DPD). Observations at or after first default month are excluded from the panel",
"Number of months since loan origination at observation month t",
"Binary indicator: loan was modified at least once in the 12 months up to and including observation month t. For the first 11 months of each loan's panel history, assumes no prior modifications before the observation window",
"Difference between current interest rate and original note rate at observation month t (%). Derived as current_interest_rate minus orig_interest_rate",
"Outstanding unpaid principal balance at observation month t (USD)",
"Deferred principal balance at observation month t (USD)",
"Loan termination code: 01 = prepaid/matured, 03 = foreclosure, 09 = short sale, 15 = note sale, 16 = repurchase. NA for active loans",
paste0("State-level monthly unemployment rate (%) lagged ", 1:4, " month(s). Source: BLS LAUS via FRED (series: [STATE]UR), matched on property_state and monthly_reporting_period"),
paste0("State-level HPI quarter-over-quarter % change lagged ", 1:4, " quarter(s). Source: FHFA via FRED (series: [STATE]STHPI)."),
"Binary outcome: equals 1 if the loan first reaches 90+ days past due or experiences a foreclosure/short sale within 12 months of observation month t, and 0 otherwise. Each defaulting loan receives exactly one positive flag at the earliest month where default enters the 12-month horizon after which the loan exits the risk set. Non-defaulting loans contribute 0s for every month they remain active. Observations after December 2011 are excluded to ensure a complete 12-month outcome window through December 2012"
),
source = c(
"Origination and Performance files",
"Performance file (time-varying)",
"Derived from monthly_reporting_period",
"Derived from performance file",
"Origination file",
"Derived from origination file",
"Derived from origination file",
"Derived from origination file",
rep("Origination file", 17),
rep("Performance file (time-varying)", 7),
rep("FRED (BLS LAUS)", 4),
rep("FRED (FHFA)", 4),
"Derived from performance file"
)
)
write.csv(variable_dict,
file.path(OUTPUT_DIR, "variable_dictionary.csv"),
row.names = FALSE)
# --- Summary statistics and categorical variables distribution ---
orig_vars <- c("credit_score", "orig_ltv", "orig_cltv", "orig_dti",
"orig_interest_rate", "orig_upb", "orig_loan_term",
"num_borrowers", "mi_pct")
stargazer(as.data.frame(loan_level[, ..orig_vars]),
type = "html",
title = "Panel A: Origination Variables (loan level)",
digits = 2,
summary.stat = c("n", "mean", "sd", "min", "p25", "median", "p75", "max"),
out = file.path(OUTPUT_DIR, "summary_origination.html"))
##
## <table style="text-align:center"><caption><strong>Panel A: Origination Variables (loan level)</strong></caption>
## <tr><td colspan="9" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">Statistic</td><td>N</td><td>Mean</td><td>St. Dev.</td><td>Min</td><td>Pctl(25)</td><td>Median</td><td>Pctl(75)</td><td>Max</td></tr>
## <tr><td colspan="9" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">credit_score</td><td>240,443</td><td>741.56</td><td>53.80</td><td>300</td><td>706</td><td>754</td><td>785</td><td>850</td></tr>
## <tr><td style="text-align:left">orig_ltv</td><td>240,552</td><td>69.24</td><td>17.95</td><td>6</td><td>59</td><td>75</td><td>80</td><td>125</td></tr>
## <tr><td style="text-align:left">orig_cltv</td><td>240,546</td><td>71.34</td><td>18.80</td><td>6</td><td>60</td><td>75</td><td>80</td><td>193</td></tr>
## <tr><td style="text-align:left">orig_dti</td><td>220,093</td><td>34.87</td><td>12.21</td><td>1</td><td>26</td><td>35</td><td>44</td><td>65</td></tr>
## <tr><td style="text-align:left">orig_interest_rate</td><td>240,565</td><td>5.75</td><td>0.82</td><td>3.25</td><td>5.00</td><td>5.88</td><td>6.38</td><td>9.79</td></tr>
## <tr><td style="text-align:left">orig_upb</td><td>240,565</td><td>196,477.30</td><td>107,205.80</td><td>9,000</td><td>114,000</td><td>173,000</td><td>260,000</td><td>1,095,000</td></tr>
## <tr><td style="text-align:left">orig_loan_term</td><td>240,357</td><td>326.92</td><td>68.19</td><td>60</td><td>360</td><td>360</td><td>360</td><td>360</td></tr>
## <tr><td style="text-align:left">num_borrowers</td><td>240,491</td><td>1.56</td><td>0.50</td><td>1</td><td>1</td><td>2</td><td>2</td><td>2</td></tr>
## <tr><td style="text-align:left">mi_pct</td><td>240,562</td><td>3.17</td><td>8.59</td><td>0</td><td>0</td><td>0</td><td>0</td><td>40</td></tr>
## <tr><td colspan="9" style="border-bottom: 1px solid black"></td></tr></table>
behav_vars <- c("current_delinquency_status", "loan_age", "current_upb",
"delta_interest_rate", "mod_flag_12m", "current_deferred_upb")
stargazer(as.data.frame(panel[, ..behav_vars]),
type = "html",
title = "Panel B: Behavioural Variables (observation level)",
digits = 2,
summary.stat = c("n", "mean", "sd", "min", "p25", "median", "p75", "max"),
out = file.path(OUTPUT_DIR, "summary_behavioural.html"))
##
## <table style="text-align:center"><caption><strong>Panel B: Behavioural Variables (observation level)</strong></caption>
## <tr><td colspan="9" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">Statistic</td><td>N</td><td>Mean</td><td>St. Dev.</td><td>Min</td><td>Pctl(25)</td><td>Median</td><td>Pctl(75)</td><td>Max</td></tr>
## <tr><td colspan="9" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">current_delinquency_status</td><td>7,312,948</td><td>0.01</td><td>0.12</td><td>0</td><td>0</td><td>0</td><td>0</td><td>2</td></tr>
## <tr><td style="text-align:left">loan_age</td><td>7,312,965</td><td>19.41</td><td>14.83</td><td>0</td><td>8</td><td>16</td><td>28</td><td>71</td></tr>
## <tr><td style="text-align:left">current_upb</td><td>7,312,965</td><td>176,880.10</td><td>102,404.10</td><td>0.00</td><td>100,522.90</td><td>154,118.70</td><td>234,667.40</td><td>1,094,000.00</td></tr>
## <tr><td style="text-align:left">delta_interest_rate</td><td>7,312,965</td><td>-0.002</td><td>0.09</td><td>-5.75</td><td>0.00</td><td>0.00</td><td>0.00</td><td>2.25</td></tr>
## <tr><td style="text-align:left">mod_flag_12m</td><td>7,312,965</td><td>0.0000</td><td>0.01</td><td>0</td><td>0</td><td>0</td><td>0</td><td>1</td></tr>
## <tr><td style="text-align:left">current_deferred_upb</td><td>7,312,965</td><td>3.91</td><td>653.63</td><td>0.00</td><td>0.00</td><td>0.00</td><td>0.00</td><td>251,900.00</td></tr>
## <tr><td colspan="9" style="border-bottom: 1px solid black"></td></tr></table>
macro_vars <- c(paste0("unemployment_rate_lag", 1:4),
paste0("hpi_qoq_qlag", 1:4))
stargazer(as.data.frame(panel[, ..macro_vars]),
type = "html",
title = "Panel C: Macro Controls (observation level, lagged)",
digits = 2,
summary.stat = c("n", "mean", "sd", "min", "p25", "median", "p75", "max"),
out = file.path(OUTPUT_DIR, "summary_macro.html"))
##
## <table style="text-align:center"><caption><strong>Panel C: Macro Controls (observation level, lagged)</strong></caption>
## <tr><td colspan="9" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">Statistic</td><td>N</td><td>Mean</td><td>St. Dev.</td><td>Min</td><td>Pctl(25)</td><td>Median</td><td>Pctl(75)</td><td>Max</td></tr>
## <tr><td colspan="9" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">unemployment_rate_lag1</td><td>7,312,965</td><td>7.95</td><td>2.52</td><td>2.30</td><td>5.70</td><td>8.20</td><td>10.00</td><td>14.30</td></tr>
## <tr><td style="text-align:left">unemployment_rate_lag2</td><td>7,312,965</td><td>7.89</td><td>2.55</td><td>2.30</td><td>5.60</td><td>8.10</td><td>9.90</td><td>14.30</td></tr>
## <tr><td style="text-align:left">unemployment_rate_lag3</td><td>7,312,965</td><td>7.83</td><td>2.57</td><td>2.30</td><td>5.50</td><td>8.00</td><td>9.90</td><td>14.30</td></tr>
## <tr><td style="text-align:left">unemployment_rate_lag4</td><td>7,312,965</td><td>7.76</td><td>2.59</td><td>2.30</td><td>5.40</td><td>7.90</td><td>9.90</td><td>14.30</td></tr>
## <tr><td style="text-align:left">hpi_qoq_qlag1</td><td>7,312,965</td><td>-0.98</td><td>1.87</td><td>-10.02</td><td>-1.87</td><td>-0.67</td><td>0.35</td><td>6.94</td></tr>
## <tr><td style="text-align:left">hpi_qoq_qlag2</td><td>7,312,965</td><td>-1.01</td><td>1.91</td><td>-10.02</td><td>-1.92</td><td>-0.74</td><td>0.31</td><td>8.48</td></tr>
## <tr><td style="text-align:left">hpi_qoq_qlag3</td><td>7,312,965</td><td>-0.88</td><td>1.99</td><td>-10.02</td><td>-1.79</td><td>-0.62</td><td>0.42</td><td>10.32</td></tr>
## <tr><td style="text-align:left">hpi_qoq_qlag4</td><td>7,312,965</td><td>-0.69</td><td>2.04</td><td>-10.02</td><td>-1.64</td><td>-0.47</td><td>0.55</td><td>10.32</td></tr>
## <tr><td colspan="9" style="border-bottom: 1px solid black"></td></tr></table>
cat_vars <- c("occupancy_status", "amortization_type", "loan_purpose",
"channel", "property_type", "first_time_homebuyer", "io_indicator")
cat_table <- rbindlist(lapply(cat_vars, function(v) {
dt <- loan_level[, .(N = .N,
Default_Rate = round(100 * mean(ever_default_12m), 2)),
by = v]
setnames(dt, v, "Category")
dt[, Variable := v]
dt[, Pct := round(100 * N / nrow(loan_level), 2)]
dt[, .(Variable, Category, N, Pct, Default_Rate)]
}))[order(Variable, -N)]
stargazer(cat_table,
type = "html",
title = "Panel D: Categorical Variables (loan level)",
summary = FALSE,
rownames = FALSE,
digits = 2,
out = file.path(OUTPUT_DIR, "summary_categorical.html"))
##
## <table style="text-align:center"><caption><strong>Panel D: Categorical Variables (loan level)</strong></caption>
## <tr><td colspan="5" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">Variable</td><td>Category</td><td>N</td><td>Pct</td><td>Default_Rate</td></tr>
## <tr><td colspan="5" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">amortization_type</td><td>FRM</td><td>240,565</td><td>100</td><td>7.39</td></tr>
## <tr><td style="text-align:left">channel</td><td>R</td><td>122,719</td><td>51.01</td><td>5.48</td></tr>
## <tr><td style="text-align:left">channel</td><td>T</td><td>65,749</td><td>27.33</td><td>14.29</td></tr>
## <tr><td style="text-align:left">channel</td><td>C</td><td>34,499</td><td>14.34</td><td>2.47</td></tr>
## <tr><td style="text-align:left">channel</td><td>B</td><td>17,598</td><td>7.32</td><td>4.54</td></tr>
## <tr><td style="text-align:left">first_time_homebuyer</td><td>N</td><td>217,299</td><td>90.33</td><td>7.26</td></tr>
## <tr><td style="text-align:left">first_time_homebuyer</td><td>Y</td><td>23,223</td><td>9.65</td><td>8.59</td></tr>
## <tr><td style="text-align:left">first_time_homebuyer</td><td></td><td>43</td><td>0.02</td><td>0</td></tr>
## <tr><td style="text-align:left">io_indicator</td><td>N</td><td>240,565</td><td>100</td><td>7.39</td></tr>
## <tr><td style="text-align:left">loan_purpose</td><td>P</td><td>86,133</td><td>35.80</td><td>7.23</td></tr>
## <tr><td style="text-align:left">loan_purpose</td><td>N</td><td>79,366</td><td>32.99</td><td>5.33</td></tr>
## <tr><td style="text-align:left">loan_purpose</td><td>C</td><td>75,066</td><td>31.20</td><td>9.74</td></tr>
## <tr><td style="text-align:left">occupancy_status</td><td>P</td><td>215,465</td><td>89.57</td><td>7.36</td></tr>
## <tr><td style="text-align:left">occupancy_status</td><td>I</td><td>13,288</td><td>5.52</td><td>9.20</td></tr>
## <tr><td style="text-align:left">occupancy_status</td><td>S</td><td>11,812</td><td>4.91</td><td>5.85</td></tr>
## <tr><td style="text-align:left">property_type</td><td>SF</td><td>179,667</td><td>74.69</td><td>7.40</td></tr>
## <tr><td style="text-align:left">property_type</td><td>PU</td><td>40,845</td><td>16.98</td><td>6.39</td></tr>
## <tr><td style="text-align:left">property_type</td><td>CO</td><td>17,495</td><td>7.27</td><td>9.08</td></tr>
## <tr><td style="text-align:left">property_type</td><td>MH</td><td>1,805</td><td>0.75</td><td>14.57</td></tr>
## <tr><td style="text-align:left">property_type</td><td>CP</td><td>753</td><td>0.31</td><td>2.52</td></tr>
## <tr><td colspan="5" style="border-bottom: 1px solid black"></td></tr></table>
# --- Distribution of default events by unemployment level ------
panel[default_next_12m == 1,
.(n_defaults = .N),
by = cut(unemployment_rate_lag4,
breaks = c(0, 5, 7.4, 9.8, 11.8, Inf),
right = FALSE)][order(cut)]
## cut n_defaults
## <fctr> <int>
## 1: [0,5) 3896
## 2: [5,7.4) 5738
## 3: [7.4,9.8) 3849
## 4: [9.8,11.8) 3358
## 5: [11.8,Inf) 927
# --- Correlation ---
panel_cor <- panel[!is.na(default_next_12m)]
# Sample for speed, as Spearman is slow on large data
set.seed(42)
sample_idx <- sample(nrow(panel_cor), min(100000, nrow(panel_cor)))
panel_sample <- panel_cor[sample_idx]
all_vars <- c("current_delinquency_status", "loan_age", "current_upb",
"delta_interest_rate", "mod_flag_12m", "current_deferred_upb",
paste0("unemployment_rate_lag", 1:4),
paste0("hpi_qoq_qlag", 1:4),
"credit_score", "orig_ltv", "orig_cltv", "orig_dti",
"orig_interest_rate", "orig_upb", "orig_loan_term",
"num_borrowers", "mi_pct", "default_next_12m")
cor_matrix <- cor(panel_sample[, ..all_vars],
use = "complete.obs",
method = "spearman")
cor_matrix <- round(cor_matrix, 2)
write.table(cor_matrix,
file.path(OUTPUT_DIR, "correlation_matrix.txt"),
sep = "\t",
row.names = TRUE,
quote = FALSE)
# Predictive power
print(sort(cor_matrix[, "default_next_12m"], decreasing = TRUE))
## default_next_12m current_delinquency_status
## 1.00 0.06
## orig_dti orig_interest_rate
## 0.03 0.03
## orig_ltv orig_cltv
## 0.02 0.02
## orig_loan_term mi_pct
## 0.02 0.02
## loan_age current_upb
## 0.01 0.01
## orig_upb delta_interest_rate
## 0.01 0.00
## mod_flag_12m current_deferred_upb
## 0.00 0.00
## unemployment_rate_lag1 unemployment_rate_lag2
## 0.00 0.00
## unemployment_rate_lag3 unemployment_rate_lag4
## 0.00 0.00
## hpi_qoq_qlag1 hpi_qoq_qlag2
## -0.01 -0.01
## hpi_qoq_qlag3 hpi_qoq_qlag4
## -0.01 -0.01
## num_borrowers credit_score
## -0.01 -0.04
# Multicollinearity
pred_vars <- setdiff(all_vars, "default_next_12m")
cor_predictors <- cor(panel_sample[, ..pred_vars],
use = "complete.obs",
method = "spearman")
round(cor_predictors, 2)
## current_delinquency_status loan_age current_upb
## current_delinquency_status 1.00 0.06 -0.01
## loan_age 0.06 1.00 -0.14
## current_upb -0.01 -0.14 1.00
## delta_interest_rate -0.02 0.01 -0.02
## mod_flag_12m 0.00 0.00 0.00
## current_deferred_upb 0.00 0.00 0.00
## unemployment_rate_lag1 0.02 0.35 0.02
## unemployment_rate_lag2 0.02 0.36 0.02
## unemployment_rate_lag3 0.02 0.36 0.02
## unemployment_rate_lag4 0.02 0.37 0.01
## hpi_qoq_qlag1 -0.01 -0.12 -0.09
## hpi_qoq_qlag2 -0.01 -0.22 -0.08
## hpi_qoq_qlag3 -0.01 -0.21 -0.08
## hpi_qoq_qlag4 -0.01 -0.20 -0.08
## credit_score -0.09 -0.10 0.04
## orig_ltv 0.02 0.05 0.09
## orig_cltv 0.01 0.06 0.12
## orig_dti 0.03 0.03 0.13
## orig_interest_rate 0.05 0.22 -0.22
## orig_upb -0.01 -0.09 0.96
## orig_loan_term 0.02 0.04 0.22
## num_borrowers -0.03 -0.03 0.16
## mi_pct 0.02 0.03 -0.02
## delta_interest_rate mod_flag_12m
## current_delinquency_status -0.02 0.00
## loan_age 0.01 0.00
## current_upb -0.02 0.00
## delta_interest_rate 1.00 -0.13
## mod_flag_12m -0.13 1.00
## current_deferred_upb -0.29 0.15
## unemployment_rate_lag1 -0.02 0.00
## unemployment_rate_lag2 -0.02 0.00
## unemployment_rate_lag3 -0.02 0.00
## unemployment_rate_lag4 -0.02 0.00
## hpi_qoq_qlag1 0.00 -0.01
## hpi_qoq_qlag2 0.01 0.00
## hpi_qoq_qlag3 0.01 0.00
## hpi_qoq_qlag4 0.01 0.00
## credit_score 0.02 -0.01
## orig_ltv -0.01 0.00
## orig_cltv -0.01 0.00
## orig_dti -0.02 0.01
## orig_interest_rate -0.01 0.01
## orig_upb -0.02 0.00
## orig_loan_term -0.01 0.00
## num_borrowers 0.01 0.00
## mi_pct 0.00 0.00
## current_deferred_upb unemployment_rate_lag1
## current_delinquency_status 0.00 0.02
## loan_age 0.00 0.35
## current_upb 0.00 0.02
## delta_interest_rate -0.29 -0.02
## mod_flag_12m 0.15 0.00
## current_deferred_upb 1.00 0.01
## unemployment_rate_lag1 0.01 1.00
## unemployment_rate_lag2 0.01 1.00
## unemployment_rate_lag3 0.01 0.99
## unemployment_rate_lag4 0.01 0.98
## hpi_qoq_qlag1 0.00 -0.29
## hpi_qoq_qlag2 0.00 -0.40
## hpi_qoq_qlag3 0.00 -0.48
## hpi_qoq_qlag4 -0.01 -0.53
## credit_score 0.00 0.12
## orig_ltv 0.00 -0.08
## orig_cltv 0.00 -0.10
## orig_dti 0.00 -0.05
## orig_interest_rate 0.00 -0.28
## orig_upb 0.00 0.05
## orig_loan_term 0.00 -0.03
## num_borrowers -0.01 0.00
## mi_pct 0.00 -0.04
## unemployment_rate_lag2 unemployment_rate_lag3
## current_delinquency_status 0.02 0.02
## loan_age 0.36 0.36
## current_upb 0.02 0.02
## delta_interest_rate -0.02 -0.02
## mod_flag_12m 0.00 0.00
## current_deferred_upb 0.01 0.01
## unemployment_rate_lag1 1.00 0.99
## unemployment_rate_lag2 1.00 1.00
## unemployment_rate_lag3 1.00 1.00
## unemployment_rate_lag4 0.99 1.00
## hpi_qoq_qlag1 -0.28 -0.27
## hpi_qoq_qlag2 -0.38 -0.37
## hpi_qoq_qlag3 -0.47 -0.45
## hpi_qoq_qlag4 -0.53 -0.52
## credit_score 0.12 0.12
## orig_ltv -0.08 -0.08
## orig_cltv -0.10 -0.10
## orig_dti -0.05 -0.05
## orig_interest_rate -0.29 -0.30
## orig_upb 0.04 0.04
## orig_loan_term -0.03 -0.04
## num_borrowers 0.00 0.00
## mi_pct -0.04 -0.04
## unemployment_rate_lag4 hpi_qoq_qlag1 hpi_qoq_qlag2
## current_delinquency_status 0.02 -0.01 -0.01
## loan_age 0.37 -0.12 -0.22
## current_upb 0.01 -0.09 -0.08
## delta_interest_rate -0.02 0.00 0.01
## mod_flag_12m 0.00 -0.01 0.00
## current_deferred_upb 0.01 0.00 0.00
## unemployment_rate_lag1 0.98 -0.29 -0.40
## unemployment_rate_lag2 0.99 -0.28 -0.38
## unemployment_rate_lag3 1.00 -0.27 -0.37
## unemployment_rate_lag4 1.00 -0.26 -0.36
## hpi_qoq_qlag1 -0.26 1.00 0.42
## hpi_qoq_qlag2 -0.36 0.42 1.00
## hpi_qoq_qlag3 -0.44 0.09 0.52
## hpi_qoq_qlag4 -0.52 0.36 0.23
## credit_score 0.12 -0.03 -0.05
## orig_ltv -0.08 0.09 0.09
## orig_cltv -0.10 0.09 0.10
## orig_dti -0.05 -0.01 0.00
## orig_interest_rate -0.31 0.05 0.11
## orig_upb 0.04 -0.09 -0.10
## orig_loan_term -0.04 -0.02 -0.01
## num_borrowers 0.00 0.00 0.00
## mi_pct -0.04 0.05 0.05
## hpi_qoq_qlag3 hpi_qoq_qlag4 credit_score orig_ltv
## current_delinquency_status -0.01 -0.01 -0.09 0.02
## loan_age -0.21 -0.20 -0.10 0.05
## current_upb -0.08 -0.08 0.04 0.09
## delta_interest_rate 0.01 0.01 0.02 -0.01
## mod_flag_12m 0.00 0.00 -0.01 0.00
## current_deferred_upb 0.00 -0.01 0.00 0.00
## unemployment_rate_lag1 -0.48 -0.53 0.12 -0.08
## unemployment_rate_lag2 -0.47 -0.53 0.12 -0.08
## unemployment_rate_lag3 -0.45 -0.52 0.12 -0.08
## unemployment_rate_lag4 -0.44 -0.52 0.12 -0.08
## hpi_qoq_qlag1 0.09 0.36 -0.03 0.09
## hpi_qoq_qlag2 0.52 0.23 -0.05 0.09
## hpi_qoq_qlag3 1.00 0.54 -0.07 0.10
## hpi_qoq_qlag4 0.54 1.00 -0.07 0.10
## credit_score -0.07 -0.07 1.00 -0.18
## orig_ltv 0.10 0.10 -0.18 1.00
## orig_cltv 0.11 0.11 -0.18 0.93
## orig_dti 0.00 0.01 -0.21 0.15
## orig_interest_rate 0.14 0.16 -0.33 0.23
## orig_upb -0.10 -0.10 0.05 0.08
## orig_loan_term -0.01 0.00 -0.08 0.25
## num_borrowers 0.00 0.00 0.01 -0.06
## mi_pct 0.05 0.05 -0.14 0.61
## orig_cltv orig_dti orig_interest_rate orig_upb
## current_delinquency_status 0.01 0.03 0.05 -0.01
## loan_age 0.06 0.03 0.22 -0.09
## current_upb 0.12 0.13 -0.22 0.96
## delta_interest_rate -0.01 -0.02 -0.01 -0.02
## mod_flag_12m 0.00 0.01 0.01 0.00
## current_deferred_upb 0.00 0.00 0.00 0.00
## unemployment_rate_lag1 -0.10 -0.05 -0.28 0.05
## unemployment_rate_lag2 -0.10 -0.05 -0.29 0.04
## unemployment_rate_lag3 -0.10 -0.05 -0.30 0.04
## unemployment_rate_lag4 -0.10 -0.05 -0.31 0.04
## hpi_qoq_qlag1 0.09 -0.01 0.05 -0.09
## hpi_qoq_qlag2 0.10 0.00 0.11 -0.10
## hpi_qoq_qlag3 0.11 0.00 0.14 -0.10
## hpi_qoq_qlag4 0.11 0.01 0.16 -0.10
## credit_score -0.18 -0.21 -0.33 0.05
## orig_ltv 0.93 0.15 0.23 0.08
## orig_cltv 1.00 0.16 0.24 0.11
## orig_dti 0.16 1.00 0.16 0.13
## orig_interest_rate 0.24 0.16 1.00 -0.23
## orig_upb 0.11 0.13 -0.23 1.00
## orig_loan_term 0.26 0.14 0.25 0.19
## num_borrowers -0.04 -0.11 -0.10 0.17
## mi_pct 0.53 0.09 0.19 -0.03
## orig_loan_term num_borrowers mi_pct
## current_delinquency_status 0.02 -0.03 0.02
## loan_age 0.04 -0.03 0.03
## current_upb 0.22 0.16 -0.02
## delta_interest_rate -0.01 0.01 0.00
## mod_flag_12m 0.00 0.00 0.00
## current_deferred_upb 0.00 -0.01 0.00
## unemployment_rate_lag1 -0.03 0.00 -0.04
## unemployment_rate_lag2 -0.03 0.00 -0.04
## unemployment_rate_lag3 -0.04 0.00 -0.04
## unemployment_rate_lag4 -0.04 0.00 -0.04
## hpi_qoq_qlag1 -0.02 0.00 0.05
## hpi_qoq_qlag2 -0.01 0.00 0.05
## hpi_qoq_qlag3 -0.01 0.00 0.05
## hpi_qoq_qlag4 0.00 0.00 0.05
## credit_score -0.08 0.01 -0.14
## orig_ltv 0.25 -0.06 0.61
## orig_cltv 0.26 -0.04 0.53
## orig_dti 0.14 -0.11 0.09
## orig_interest_rate 0.25 -0.10 0.19
## orig_upb 0.19 0.17 -0.03
## orig_loan_term 1.00 -0.06 0.12
## num_borrowers -0.06 1.00 -0.04
## mi_pct 0.12 -0.04 1.00
# Univariate correlation (Macroeconomic lags)
results_unemp <- rbindlist(lapply(1:4, function(i) {
var <- paste0("unemployment_rate_lag", i)
fit <- glm(default_next_12m ~ get(var), data = panel_cor, family = binomial)
data.table(
Variable = paste("Unemployment (Lag", i, ")"),
AIC = round(AIC(fit), 2),
LogLik = round(as.numeric(logLik(fit)), 2),
Coef = round(coef(fit)[2], 4),
P_value = round(summary(fit)$coefficients[2, 4], 4)
)
}))
results_hpi <- rbindlist(lapply(1:4, function(i) {
var <- paste0("hpi_qoq_qlag", i)
fit <- glm(default_next_12m ~ get(var), data = panel_cor, family = binomial)
data.table(
Variable = paste("HPI QoQ (Lag", i, ")"),
AIC = round(AIC(fit), 2),
LogLik = round(as.numeric(logLik(fit)), 2),
Coef = round(coef(fit)[2], 4),
P_value = round(summary(fit)$coefficients[2, 4], 4)
)
}))
results_macro <- rbindlist(list(results_unemp, results_hpi))
print(results_macro)
## Variable AIC LogLik Coef P_value
## <char> <num> <num> <num> <num>
## 1: Unemployment (Lag 1 ) 249326.1 -124661.1 -0.0294 0
## 2: Unemployment (Lag 2 ) 249248.1 -124622.0 -0.0391 0
## 3: Unemployment (Lag 3 ) 249157.2 -124576.6 -0.0478 0
## 4: Unemployment (Lag 4 ) 249062.2 -124529.1 -0.0554 0
## 5: HPI QoQ (Lag 1 ) 246703.0 -123349.5 -0.1839 0
## 6: HPI QoQ (Lag 2 ) 247201.2 -123598.6 -0.1660 0
## 7: HPI QoQ (Lag 3 ) 248257.5 -124126.7 -0.1197 0
## 8: HPI QoQ (Lag 4 ) 248609.2 -124302.6 -0.0992 0