Exploratory Data Analysis
library(data.table)
library(ggplot2)
library(stargazer)
library(patchwork)
# --- Load data ---
OUTPUT_DIR <- "/Users/amalianimeskern/Library/CloudStorage/OneDrive-ErasmusUniversityRotterdam/Freddie Mac Data"
panel <- readRDS(file.path(OUTPUT_DIR, "freddie_mac_panel.rds"))
# Number of unique loans
panel[, uniqueN(loan_sequence_number)]
## [1] 240565
# --- Sanity check: geography ---
setdiff(unique(panel$property_state),
c("AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA",
"HI","ID","IL","IN","IA","KS","KY","LA","ME","MD",
"MA","MI","MN","MS","MO","MT","NE","NV","NH","NJ",
"NM","NY","NC","ND","OH","OK","OR","PA","RI","SC",
"SD","TN","TX","UT","VT","VA","WA","WV","WI","WY","DC"))
## character(0)
# --- Missingness ---
missingness <- data.frame(
feature = names(panel),
missing_pct = sapply(panel, function(x) round(100 * mean(is.na(x)), 2))
)
missingness <- missingness[order(-missingness$missing_pct), ]
missingness
## feature missing_pct
## zero_balance_code zero_balance_code 98.64
## first_default_index first_default_index 94.65
## orig_dti orig_dti 6.19
## orig_loan_term orig_loan_term 0.09
## credit_score credit_score 0.07
## num_borrowers num_borrowers 0.03
## first_time_homebuyer first_time_homebuyer 0.02
## orig_cltv orig_cltv 0.01
## orig_ltv orig_ltv 0.01
## property_state property_state 0.00
## monthly_reporting_period monthly_reporting_period 0.00
## loan_sequence_number loan_sequence_number 0.00
## current_upb current_upb 0.00
## current_delinquency_status current_delinquency_status 0.00
## loan_age loan_age 0.00
## current_deferred_upb current_deferred_upb 0.00
## delta_interest_rate delta_interest_rate 0.00
## mod_flag_12m mod_flag_12m 0.00
## month_index month_index 0.00
## default_next_12m default_next_12m 0.00
## first_payment_date first_payment_date 0.00
## mi_pct mi_pct 0.00
## occupancy_status occupancy_status 0.00
## orig_upb orig_upb 0.00
## orig_interest_rate orig_interest_rate 0.00
## channel channel 0.00
## amortization_type amortization_type 0.00
## property_type property_type 0.00
## loan_purpose loan_purpose 0.00
## io_indicator io_indicator 0.00
## orig_year orig_year 0.00
## orig_month orig_month 0.00
## orig_quarter orig_quarter 0.00
## unemployment_rate_lag1 unemployment_rate_lag1 0.00
## unemployment_rate_lag2 unemployment_rate_lag2 0.00
## unemployment_rate_lag3 unemployment_rate_lag3 0.00
## unemployment_rate_lag4 unemployment_rate_lag4 0.00
## hpi_qoq_qlag1 hpi_qoq_qlag1 0.00
## hpi_qoq_qlag2 hpi_qoq_qlag2 0.00
## hpi_qoq_qlag3 hpi_qoq_qlag3 0.00
## hpi_qoq_qlag4 hpi_qoq_qlag4 0.00
key_vars <- c("credit_score", "orig_ltv", "orig_cltv", "orig_dti",
"orig_interest_rate", "current_upb", "current_deferred_upb",
"delta_interest_rate", "mod_flag_12m",
paste0("unemployment_rate_lag", 1:4),
paste0("hpi_qoq_qlag", 1:4))
miss_key <- missingness[missingness$feature %in% key_vars, ]
miss_key
## feature missing_pct
## orig_dti orig_dti 6.19
## credit_score credit_score 0.07
## orig_cltv orig_cltv 0.01
## orig_ltv orig_ltv 0.01
## current_upb current_upb 0.00
## current_deferred_upb current_deferred_upb 0.00
## delta_interest_rate delta_interest_rate 0.00
## mod_flag_12m mod_flag_12m 0.00
## orig_interest_rate orig_interest_rate 0.00
## unemployment_rate_lag1 unemployment_rate_lag1 0.00
## unemployment_rate_lag2 unemployment_rate_lag2 0.00
## unemployment_rate_lag3 unemployment_rate_lag3 0.00
## unemployment_rate_lag4 unemployment_rate_lag4 0.00
## hpi_qoq_qlag1 hpi_qoq_qlag1 0.00
## hpi_qoq_qlag2 hpi_qoq_qlag2 0.00
## hpi_qoq_qlag3 hpi_qoq_qlag3 0.00
## hpi_qoq_qlag4 hpi_qoq_qlag4 0.00
# --- Class imbalance ---
table(panel$default_next_12m)
##
## 0 1
## 7295197 17768
prop.table(table(panel$default_next_12m)) * 100
##
## 0 1
## 99.7570343 0.2429657
panel[, .(ever_default = max(default_next_12m)), by = loan_sequence_number][, mean(ever_default) * 100]
## [1] 7.385946
panel[, .N, by = loan_sequence_number][, summary(N)]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 18.0 28.0 30.4 41.0 72.0
# --- Default rate by origination year ---
setorder(panel, loan_sequence_number, monthly_reporting_period)
cols_to_keep <- c("orig_year", "credit_score", "orig_ltv", "orig_cltv",
"orig_dti", "orig_interest_rate", "orig_upb", "orig_loan_term",
"num_borrowers", "mi_pct", "occupancy_status", "amortization_type",
"loan_purpose", "channel", "property_type",
"first_time_homebuyer", "io_indicator")
loan_level <- panel[, .(ever_default_12m = max(default_next_12m, na.rm = TRUE)),
by = loan_sequence_number]
first_obs <- panel[panel[, .I[1], by = loan_sequence_number]$V1,
c("loan_sequence_number", cols_to_keep), with = FALSE]
loan_level <- merge(loan_level, first_obs, by = "loan_sequence_number")
default_by_year <- loan_level[, .(
n_loans = .N,
default_rate = round(100 * mean(ever_default_12m), 2)
), by = orig_year][order(orig_year)]
default_by_year
## orig_year n_loans default_rate
## <int> <int> <num>
## 1: 2006 41258 12.12
## 2: 2007 49688 13.74
## 3: 2008 49999 9.02
## 4: 2009 49638 1.82
## 5: 2010 49982 1.05
ggplot(default_by_year, aes(x = orig_year, y = default_rate)) +
geom_line() +
geom_point() +
labs(title = "Default Rate by Origination Year",
x = "Origination Year",
y = "Default Rate (%)") +
theme_classic()

# --- Feature statistics ---
continuous_orig <- c("credit_score", "orig_ltv", "orig_dti", "orig_interest_rate")
feature_stats_orig <- rbindlist(lapply(continuous_orig, function(var) {
loan_level[, .(
variable = var,
mean_default = round(mean(get(var)[ever_default_12m == 1], na.rm = TRUE), 2),
mean_no_default = round(mean(get(var)[ever_default_12m == 0], na.rm = TRUE), 2),
sd_default = round(sd(get(var)[ever_default_12m == 1], na.rm = TRUE), 2),
sd_no_default = round(sd(get(var)[ever_default_12m == 0], na.rm = TRUE), 2)
)]
}))
feature_stats_orig
## variable mean_default mean_no_default sd_default sd_no_default
## <char> <num> <num> <num> <num>
## 1: credit_score 690.72 745.61 55.62 51.54
## 2: orig_ltv 77.99 68.54 13.42 18.08
## 3: orig_dti 40.96 34.37 11.40 12.14
## 4: orig_interest_rate 6.41 5.70 0.55 0.81
continuous_behav <- c("current_delinquency_status", "current_upb",
"delta_interest_rate", "mod_flag_12m",
"current_deferred_upb",
"unemployment_rate_lag1", "hpi_qoq_qlag1")
feature_stats_behav <- rbindlist(lapply(continuous_behav, function(var) {
panel[, .(
variable = var,
mean_default = round(mean(get(var)[default_next_12m == 1], na.rm = TRUE), 2),
mean_no_default = round(mean(get(var)[default_next_12m == 0], na.rm = TRUE), 2),
sd_default = round(sd(get(var)[default_next_12m == 1], na.rm = TRUE), 2),
sd_no_default = round(sd(get(var)[default_next_12m == 0], na.rm = TRUE), 2)
)]
}))
feature_stats_behav
## variable mean_default mean_no_default sd_default
## <char> <num> <num> <num>
## 1: current_delinquency_status 0.12 0.01 0.37
## 2: current_upb 198189.42 176828.25 98538.34
## 3: delta_interest_rate -0.01 0.00 0.15
## 4: mod_flag_12m 0.00 0.00 0.02
## 5: current_deferred_upb 4.57 3.91 537.48
## 6: unemployment_rate_lag1 7.77 7.96 2.53
## 7: hpi_qoq_qlag1 -1.76 -0.98 2.40
## sd_no_default
## <num>
## 1: 0.12
## 2: 102407.95
## 3: 0.09
## 4: 0.01
## 5: 653.89
## 6: 2.52
## 7: 1.87
# --- Plots ---
ggplot(panel[!is.na(orig_dti)], aes(x = orig_dti)) +
geom_histogram(binwidth = 2, fill = "orange", color = "white") +
labs(title = "DTI Distribution", x = "DTI", y = "Count") +
theme_classic()

ggplot(panel[!is.na(current_delinquency_status)],
aes(x = factor(current_delinquency_status),
fill = factor(default_next_12m))) +
geom_bar(position = "fill") +
labs(title = "Default Rate by Delinquency Status",
x = "Delinquency Status",
y = "Proportion") +
theme_classic()

# --- Correlation ---
panel_cor <- panel[!is.na(default_next_12m)]
set.seed(42)
sample_idx <- sample(nrow(panel_cor), min(100000, nrow(panel_cor)))
panel_sample <- panel_cor[sample_idx]
all_vars <- c("current_delinquency_status", "loan_age", "current_upb",
"delta_interest_rate", "mod_flag_12m", "current_deferred_upb",
paste0("unemployment_rate_lag", 1:4),
paste0("hpi_qoq_qlag", 1:4),
"credit_score", "orig_ltv", "orig_cltv", "orig_dti",
"orig_interest_rate", "orig_upb", "orig_loan_term",
"num_borrowers", "mi_pct", "default_next_12m")
cor_matrix <- cor(panel_sample[, ..all_vars],
use = "complete.obs",
method = "spearman")
cor_matrix <- round(cor_matrix, 2)
sort(cor_matrix[, "default_next_12m"], decreasing = TRUE)
## default_next_12m current_delinquency_status
## 1.00 0.06
## orig_dti orig_interest_rate
## 0.03 0.03
## orig_ltv orig_cltv
## 0.02 0.02
## orig_loan_term mi_pct
## 0.02 0.02
## loan_age current_upb
## 0.01 0.01
## orig_upb delta_interest_rate
## 0.01 0.00
## mod_flag_12m current_deferred_upb
## 0.00 0.00
## unemployment_rate_lag1 unemployment_rate_lag2
## 0.00 0.00
## unemployment_rate_lag3 unemployment_rate_lag4
## 0.00 0.00
## hpi_qoq_qlag1 hpi_qoq_qlag2
## -0.01 -0.01
## hpi_qoq_qlag3 hpi_qoq_qlag4
## -0.01 -0.01
## num_borrowers credit_score
## -0.01 -0.04
# --- Univariate macro models ---
results_unemp <- rbindlist(lapply(1:4, function(i) {
var <- paste0("unemployment_rate_lag", i)
fit <- glm(default_next_12m ~ get(var), data = panel_cor, family = binomial)
data.table(
Variable = paste("Unemployment (Lag", i, ")"),
AIC = round(AIC(fit), 2),
LogLik = round(as.numeric(logLik(fit)), 2),
Coef = round(coef(fit)[2], 4),
P_value = round(summary(fit)$coefficients[2, 4], 4)
)
}))
results_hpi <- rbindlist(lapply(1:4, function(i) {
var <- paste0("hpi_qoq_qlag", i)
fit <- glm(default_next_12m ~ get(var), data = panel_cor, family = binomial)
data.table(
Variable = paste("HPI QoQ (Lag", i, ")"),
AIC = round(AIC(fit), 2),
LogLik = round(as.numeric(logLik(fit)), 2),
Coef = round(coef(fit)[2], 4),
P_value = round(summary(fit)$coefficients[2, 4], 4)
)
}))
results_macro <- rbindlist(list(results_unemp, results_hpi))
results_macro
## Variable AIC LogLik Coef P_value
## <char> <num> <num> <num> <num>
## 1: Unemployment (Lag 1 ) 249326.1 -124661.1 -0.0294 0
## 2: Unemployment (Lag 2 ) 249248.1 -124622.0 -0.0391 0
## 3: Unemployment (Lag 3 ) 249157.2 -124576.6 -0.0478 0
## 4: Unemployment (Lag 4 ) 249062.2 -124529.1 -0.0554 0
## 5: HPI QoQ (Lag 1 ) 246703.0 -123349.5 -0.1839 0
## 6: HPI QoQ (Lag 2 ) 247201.2 -123598.6 -0.1660 0
## 7: HPI QoQ (Lag 3 ) 248257.5 -124126.7 -0.1197 0
## 8: HPI QoQ (Lag 4 ) 248609.2 -124302.6 -0.0992 0