Exploratory Data Analysis

library(data.table)
library(ggplot2)
library(stargazer)
library(patchwork)

# --- Load data ---

OUTPUT_DIR <- "/Users/amalianimeskern/Library/CloudStorage/OneDrive-ErasmusUniversityRotterdam/Freddie Mac Data"

panel <- readRDS(file.path(OUTPUT_DIR, "freddie_mac_panel.rds"))

# Number of unique loans
panel[, uniqueN(loan_sequence_number)]
## [1] 240565
# --- Sanity check: geography ---

setdiff(unique(panel$property_state),
        c("AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA",
          "HI","ID","IL","IN","IA","KS","KY","LA","ME","MD",
          "MA","MI","MN","MS","MO","MT","NE","NV","NH","NJ", 
          "NM","NY","NC","ND","OH","OK","OR","PA","RI","SC",
          "SD","TN","TX","UT","VT","VA","WA","WV","WI","WY","DC"))
## character(0)
# --- Missingness ---

missingness <- data.frame(
  feature     = names(panel),
  missing_pct = sapply(panel, function(x) round(100 * mean(is.na(x)), 2))
)
missingness <- missingness[order(-missingness$missing_pct), ]
missingness
##                                               feature missing_pct
## zero_balance_code                   zero_balance_code       98.64
## first_default_index               first_default_index       94.65
## orig_dti                                     orig_dti        6.19
## orig_loan_term                         orig_loan_term        0.09
## credit_score                             credit_score        0.07
## num_borrowers                           num_borrowers        0.03
## first_time_homebuyer             first_time_homebuyer        0.02
## orig_cltv                                   orig_cltv        0.01
## orig_ltv                                     orig_ltv        0.01
## property_state                         property_state        0.00
## monthly_reporting_period     monthly_reporting_period        0.00
## loan_sequence_number             loan_sequence_number        0.00
## current_upb                               current_upb        0.00
## current_delinquency_status current_delinquency_status        0.00
## loan_age                                     loan_age        0.00
## current_deferred_upb             current_deferred_upb        0.00
## delta_interest_rate               delta_interest_rate        0.00
## mod_flag_12m                             mod_flag_12m        0.00
## month_index                               month_index        0.00
## default_next_12m                     default_next_12m        0.00
## first_payment_date                 first_payment_date        0.00
## mi_pct                                         mi_pct        0.00
## occupancy_status                     occupancy_status        0.00
## orig_upb                                     orig_upb        0.00
## orig_interest_rate                 orig_interest_rate        0.00
## channel                                       channel        0.00
## amortization_type                   amortization_type        0.00
## property_type                           property_type        0.00
## loan_purpose                             loan_purpose        0.00
## io_indicator                             io_indicator        0.00
## orig_year                                   orig_year        0.00
## orig_month                                 orig_month        0.00
## orig_quarter                             orig_quarter        0.00
## unemployment_rate_lag1         unemployment_rate_lag1        0.00
## unemployment_rate_lag2         unemployment_rate_lag2        0.00
## unemployment_rate_lag3         unemployment_rate_lag3        0.00
## unemployment_rate_lag4         unemployment_rate_lag4        0.00
## hpi_qoq_qlag1                           hpi_qoq_qlag1        0.00
## hpi_qoq_qlag2                           hpi_qoq_qlag2        0.00
## hpi_qoq_qlag3                           hpi_qoq_qlag3        0.00
## hpi_qoq_qlag4                           hpi_qoq_qlag4        0.00
key_vars <- c("credit_score", "orig_ltv", "orig_cltv", "orig_dti",
              "orig_interest_rate", "current_upb", "current_deferred_upb",
              "delta_interest_rate", "mod_flag_12m",
              paste0("unemployment_rate_lag", 1:4),
              paste0("hpi_qoq_qlag", 1:4))
miss_key <- missingness[missingness$feature %in% key_vars, ]
miss_key
##                                       feature missing_pct
## orig_dti                             orig_dti        6.19
## credit_score                     credit_score        0.07
## orig_cltv                           orig_cltv        0.01
## orig_ltv                             orig_ltv        0.01
## current_upb                       current_upb        0.00
## current_deferred_upb     current_deferred_upb        0.00
## delta_interest_rate       delta_interest_rate        0.00
## mod_flag_12m                     mod_flag_12m        0.00
## orig_interest_rate         orig_interest_rate        0.00
## unemployment_rate_lag1 unemployment_rate_lag1        0.00
## unemployment_rate_lag2 unemployment_rate_lag2        0.00
## unemployment_rate_lag3 unemployment_rate_lag3        0.00
## unemployment_rate_lag4 unemployment_rate_lag4        0.00
## hpi_qoq_qlag1                   hpi_qoq_qlag1        0.00
## hpi_qoq_qlag2                   hpi_qoq_qlag2        0.00
## hpi_qoq_qlag3                   hpi_qoq_qlag3        0.00
## hpi_qoq_qlag4                   hpi_qoq_qlag4        0.00
# --- Class imbalance ---

table(panel$default_next_12m)
## 
##       0       1 
## 7295197   17768
prop.table(table(panel$default_next_12m)) * 100
## 
##          0          1 
## 99.7570343  0.2429657
panel[, .(ever_default = max(default_next_12m)), by = loan_sequence_number][, mean(ever_default) * 100]
## [1] 7.385946
panel[, .N, by = loan_sequence_number][, summary(N)]
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0    18.0    28.0    30.4    41.0    72.0
# --- Default rate by origination year ---

setorder(panel, loan_sequence_number, monthly_reporting_period)

cols_to_keep <- c("orig_year", "credit_score", "orig_ltv", "orig_cltv",
                  "orig_dti", "orig_interest_rate", "orig_upb", "orig_loan_term",
                  "num_borrowers", "mi_pct", "occupancy_status", "amortization_type",
                  "loan_purpose", "channel", "property_type",
                  "first_time_homebuyer", "io_indicator")

loan_level <- panel[, .(ever_default_12m = max(default_next_12m, na.rm = TRUE)),
                    by = loan_sequence_number]

first_obs <- panel[panel[, .I[1], by = loan_sequence_number]$V1,
                   c("loan_sequence_number", cols_to_keep), with = FALSE]

loan_level <- merge(loan_level, first_obs, by = "loan_sequence_number")

default_by_year <- loan_level[, .(
  n_loans      = .N,
  default_rate = round(100 * mean(ever_default_12m), 2)
), by = orig_year][order(orig_year)]

default_by_year
##    orig_year n_loans default_rate
##        <int>   <int>        <num>
## 1:      2006   41258        12.12
## 2:      2007   49688        13.74
## 3:      2008   49999         9.02
## 4:      2009   49638         1.82
## 5:      2010   49982         1.05
ggplot(default_by_year, aes(x = orig_year, y = default_rate)) +
  geom_line() +
  geom_point() +
  labs(title = "Default Rate by Origination Year",
       x     = "Origination Year",
       y     = "Default Rate (%)") +
  theme_classic()

# --- Feature statistics ---

continuous_orig <- c("credit_score", "orig_ltv", "orig_dti", "orig_interest_rate")

feature_stats_orig <- rbindlist(lapply(continuous_orig, function(var) {
  loan_level[, .(
    variable        = var,
    mean_default    = round(mean(get(var)[ever_default_12m == 1], na.rm = TRUE), 2),
    mean_no_default = round(mean(get(var)[ever_default_12m == 0], na.rm = TRUE), 2),
    sd_default      = round(sd(get(var)[ever_default_12m == 1],   na.rm = TRUE), 2),
    sd_no_default   = round(sd(get(var)[ever_default_12m == 0],   na.rm = TRUE), 2)
  )]
}))

feature_stats_orig
##              variable mean_default mean_no_default sd_default sd_no_default
##                <char>        <num>           <num>      <num>         <num>
## 1:       credit_score       690.72          745.61      55.62         51.54
## 2:           orig_ltv        77.99           68.54      13.42         18.08
## 3:           orig_dti        40.96           34.37      11.40         12.14
## 4: orig_interest_rate         6.41            5.70       0.55          0.81
continuous_behav <- c("current_delinquency_status", "current_upb",
                      "delta_interest_rate", "mod_flag_12m",
                      "current_deferred_upb",
                      "unemployment_rate_lag1", "hpi_qoq_qlag1")

feature_stats_behav <- rbindlist(lapply(continuous_behav, function(var) {
  panel[, .(
    variable        = var,
    mean_default    = round(mean(get(var)[default_next_12m == 1], na.rm = TRUE), 2),
    mean_no_default = round(mean(get(var)[default_next_12m == 0], na.rm = TRUE), 2),
    sd_default      = round(sd(get(var)[default_next_12m == 1],   na.rm = TRUE), 2),
    sd_no_default   = round(sd(get(var)[default_next_12m == 0],   na.rm = TRUE), 2)
  )]
}))

feature_stats_behav
##                      variable mean_default mean_no_default sd_default
##                        <char>        <num>           <num>      <num>
## 1: current_delinquency_status         0.12            0.01       0.37
## 2:                current_upb    198189.42       176828.25   98538.34
## 3:        delta_interest_rate        -0.01            0.00       0.15
## 4:               mod_flag_12m         0.00            0.00       0.02
## 5:       current_deferred_upb         4.57            3.91     537.48
## 6:     unemployment_rate_lag1         7.77            7.96       2.53
## 7:              hpi_qoq_qlag1        -1.76           -0.98       2.40
##    sd_no_default
##            <num>
## 1:          0.12
## 2:     102407.95
## 3:          0.09
## 4:          0.01
## 5:        653.89
## 6:          2.52
## 7:          1.87
# --- Plots ---

ggplot(panel[!is.na(orig_dti)], aes(x = orig_dti)) +
  geom_histogram(binwidth = 2, fill = "orange", color = "white") +
  labs(title = "DTI Distribution", x = "DTI", y = "Count") +
  theme_classic()

ggplot(panel[!is.na(current_delinquency_status)],
       aes(x = factor(current_delinquency_status),
           fill = factor(default_next_12m))) +
  geom_bar(position = "fill") +
  labs(title = "Default Rate by Delinquency Status",
       x = "Delinquency Status",
       y = "Proportion") +
  theme_classic()

# --- Correlation ---

panel_cor <- panel[!is.na(default_next_12m)]

set.seed(42)
sample_idx <- sample(nrow(panel_cor), min(100000, nrow(panel_cor)))
panel_sample <- panel_cor[sample_idx]

all_vars <- c("current_delinquency_status", "loan_age", "current_upb",
              "delta_interest_rate", "mod_flag_12m", "current_deferred_upb",
              paste0("unemployment_rate_lag", 1:4),
              paste0("hpi_qoq_qlag", 1:4),
              "credit_score", "orig_ltv", "orig_cltv", "orig_dti",
              "orig_interest_rate", "orig_upb", "orig_loan_term",
              "num_borrowers", "mi_pct", "default_next_12m")

cor_matrix <- cor(panel_sample[, ..all_vars],
                  use    = "complete.obs",
                  method = "spearman")

cor_matrix <- round(cor_matrix, 2)

sort(cor_matrix[, "default_next_12m"], decreasing = TRUE)
##           default_next_12m current_delinquency_status 
##                       1.00                       0.06 
##                   orig_dti         orig_interest_rate 
##                       0.03                       0.03 
##                   orig_ltv                  orig_cltv 
##                       0.02                       0.02 
##             orig_loan_term                     mi_pct 
##                       0.02                       0.02 
##                   loan_age                current_upb 
##                       0.01                       0.01 
##                   orig_upb        delta_interest_rate 
##                       0.01                       0.00 
##               mod_flag_12m       current_deferred_upb 
##                       0.00                       0.00 
##     unemployment_rate_lag1     unemployment_rate_lag2 
##                       0.00                       0.00 
##     unemployment_rate_lag3     unemployment_rate_lag4 
##                       0.00                       0.00 
##              hpi_qoq_qlag1              hpi_qoq_qlag2 
##                      -0.01                      -0.01 
##              hpi_qoq_qlag3              hpi_qoq_qlag4 
##                      -0.01                      -0.01 
##              num_borrowers               credit_score 
##                      -0.01                      -0.04
# --- Univariate macro models ---

results_unemp <- rbindlist(lapply(1:4, function(i) {
  var <- paste0("unemployment_rate_lag", i)
  fit <- glm(default_next_12m ~ get(var), data = panel_cor, family = binomial)
  
  data.table(
    Variable = paste("Unemployment (Lag", i, ")"),
    AIC      = round(AIC(fit), 2),
    LogLik   = round(as.numeric(logLik(fit)), 2),
    Coef     = round(coef(fit)[2], 4),
    P_value  = round(summary(fit)$coefficients[2, 4], 4)
  )
}))

results_hpi <- rbindlist(lapply(1:4, function(i) {
  var <- paste0("hpi_qoq_qlag", i)
  fit <- glm(default_next_12m ~ get(var), data = panel_cor, family = binomial)
  
  data.table(
    Variable = paste("HPI QoQ (Lag", i, ")"),
    AIC      = round(AIC(fit), 2),
    LogLik   = round(as.numeric(logLik(fit)), 2),
    Coef     = round(coef(fit)[2], 4),
    P_value  = round(summary(fit)$coefficients[2, 4], 4)
  )
}))

results_macro <- rbindlist(list(results_unemp, results_hpi))

results_macro
##                 Variable      AIC    LogLik    Coef P_value
##                   <char>    <num>     <num>   <num>   <num>
## 1: Unemployment (Lag 1 ) 249326.1 -124661.1 -0.0294       0
## 2: Unemployment (Lag 2 ) 249248.1 -124622.0 -0.0391       0
## 3: Unemployment (Lag 3 ) 249157.2 -124576.6 -0.0478       0
## 4: Unemployment (Lag 4 ) 249062.2 -124529.1 -0.0554       0
## 5:      HPI QoQ (Lag 1 ) 246703.0 -123349.5 -0.1839       0
## 6:      HPI QoQ (Lag 2 ) 247201.2 -123598.6 -0.1660       0
## 7:      HPI QoQ (Lag 3 ) 248257.5 -124126.7 -0.1197       0
## 8:      HPI QoQ (Lag 4 ) 248609.2 -124302.6 -0.0992       0