Exploratory Data Analysis

library(data.table)
library(ggplot2)
library(stargazer)
library(patchwork)


# --- Load data ---

OUTPUT_DIR <- "/Users/amalianimeskern/Library/CloudStorage/OneDrive-ErasmusUniversityRotterdam/Freddie Mac Data"

panel <- readRDS(file.path(OUTPUT_DIR, "freddie_mac_panel.rds"))

# Note: Panel dataset is restricted to properties in the 50 U.S. states and DC.
# Loans in U.S. territories (PR, GU, VI) were excluded during preprocessing.

# Number of unique loans
panel[, uniqueN(loan_sequence_number)]
## [1] 240565
# --- Sanity check: geography ---

setdiff(unique(panel$property_state),
        c("AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA",
          "HI","ID","IL","IN","IA","KS","KY","LA","ME","MD",
          "MA","MI","MN","MS","MO","MT","NE","NV","NH","NJ", 
          "NM","NY","NC","ND","OH","OK","OR","PA","RI","SC",
          "SD","TN","TX","UT","VT","VA","WA","WV","WI","WY","DC"))
## character(0)
# --- Sanity check: Missingness ---

missingness <- data.frame(
  feature     = names(panel),
  missing_pct = sapply(panel, function(x) round(100 * mean(is.na(x)), 2))
)
missingness <- missingness[order(-missingness$missing_pct), ]
print(missingness)
##                                               feature missing_pct
## zero_balance_code                   zero_balance_code       98.64
## first_default_index               first_default_index       94.65
## orig_dti                                     orig_dti        6.19
## orig_loan_term                         orig_loan_term        0.09
## credit_score                             credit_score        0.07
## num_borrowers                           num_borrowers        0.03
## first_time_homebuyer             first_time_homebuyer        0.02
## orig_cltv                                   orig_cltv        0.01
## orig_ltv                                     orig_ltv        0.01
## property_state                         property_state        0.00
## monthly_reporting_period     monthly_reporting_period        0.00
## loan_sequence_number             loan_sequence_number        0.00
## current_upb                               current_upb        0.00
## current_delinquency_status current_delinquency_status        0.00
## loan_age                                     loan_age        0.00
## current_deferred_upb             current_deferred_upb        0.00
## delta_interest_rate               delta_interest_rate        0.00
## mod_flag_12m                             mod_flag_12m        0.00
## month_index                               month_index        0.00
## default_next_12m                     default_next_12m        0.00
## first_payment_date                 first_payment_date        0.00
## mi_pct                                         mi_pct        0.00
## occupancy_status                     occupancy_status        0.00
## orig_upb                                     orig_upb        0.00
## orig_interest_rate                 orig_interest_rate        0.00
## channel                                       channel        0.00
## amortization_type                   amortization_type        0.00
## property_type                           property_type        0.00
## loan_purpose                             loan_purpose        0.00
## io_indicator                             io_indicator        0.00
## orig_year                                   orig_year        0.00
## orig_month                                 orig_month        0.00
## orig_quarter                             orig_quarter        0.00
## unemployment_rate_lag1         unemployment_rate_lag1        0.00
## unemployment_rate_lag2         unemployment_rate_lag2        0.00
## unemployment_rate_lag3         unemployment_rate_lag3        0.00
## unemployment_rate_lag4         unemployment_rate_lag4        0.00
## hpi_qoq_qlag1                           hpi_qoq_qlag1        0.00
## hpi_qoq_qlag2                           hpi_qoq_qlag2        0.00
## hpi_qoq_qlag3                           hpi_qoq_qlag3        0.00
## hpi_qoq_qlag4                           hpi_qoq_qlag4        0.00
key_vars <- c("credit_score", "orig_ltv", "orig_cltv", "orig_dti",
              "orig_interest_rate", "current_upb", "current_deferred_upb",
              "delta_interest_rate", "mod_flag_12m",
              paste0("unemployment_rate_lag", 1:4),
              paste0("hpi_qoq_qlag", 1:4))
miss_key <- missingness[missingness$feature %in% key_vars, ]
print(miss_key)
##                                       feature missing_pct
## orig_dti                             orig_dti        6.19
## credit_score                     credit_score        0.07
## orig_cltv                           orig_cltv        0.01
## orig_ltv                             orig_ltv        0.01
## current_upb                       current_upb        0.00
## current_deferred_upb     current_deferred_upb        0.00
## delta_interest_rate       delta_interest_rate        0.00
## mod_flag_12m                     mod_flag_12m        0.00
## orig_interest_rate         orig_interest_rate        0.00
## unemployment_rate_lag1 unemployment_rate_lag1        0.00
## unemployment_rate_lag2 unemployment_rate_lag2        0.00
## unemployment_rate_lag3 unemployment_rate_lag3        0.00
## unemployment_rate_lag4 unemployment_rate_lag4        0.00
## hpi_qoq_qlag1                   hpi_qoq_qlag1        0.00
## hpi_qoq_qlag2                   hpi_qoq_qlag2        0.00
## hpi_qoq_qlag3                   hpi_qoq_qlag3        0.00
## hpi_qoq_qlag4                   hpi_qoq_qlag4        0.00
# --- Class imbalance ---

# Observation level
table(panel$default_next_12m)
## 
##       0       1 
## 7295197   17768
prop.table(table(panel$default_next_12m)) * 100
## 
##          0          1 
## 99.7570343  0.2429657
# Loan level
panel[, .(ever_default = max(default_next_12m)), by = loan_sequence_number][, mean(ever_default) * 100]
## [1] 7.385946
# Average months
panel[, .N, by = loan_sequence_number][, summary(N)]
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0    18.0    28.0    30.4    41.0    72.0
# --- Default rate by origination year (loan level) ---
setorder(panel, loan_sequence_number, monthly_reporting_period)
cols_to_keep <- c("orig_year", "credit_score", "orig_ltv", "orig_cltv",
                  "orig_dti", "orig_interest_rate", "orig_upb", "orig_loan_term",
                  "num_borrowers", "mi_pct", "occupancy_status", "amortization_type",
                  "loan_purpose", "channel", "property_type",
                  "first_time_homebuyer", "io_indicator")
loan_level <- panel[, .(ever_default_12m = max(default_next_12m, na.rm = TRUE)),
                    by = loan_sequence_number]
first_obs <- panel[panel[, .I[1], by = loan_sequence_number]$V1,
                   c("loan_sequence_number", cols_to_keep), with = FALSE]
loan_level <- merge(loan_level, first_obs, by = "loan_sequence_number")

default_by_year <- loan_level[, .(
  n_loans      = .N,
  default_rate = round(100 * mean(ever_default_12m), 2)
), by = orig_year][order(orig_year)]
print(default_by_year)
##    orig_year n_loans default_rate
##        <int>   <int>        <num>
## 1:      2006   41258        12.12
## 2:      2007   49688        13.74
## 3:      2008   49999         9.02
## 4:      2009   49638         1.82
## 5:      2010   49982         1.05
ggplot(default_by_year, aes(x = orig_year, y = default_rate)) +
  geom_line() +
  geom_point() +
  labs(title = "Default Rate by Origination Year",
       x     = "Origination Year",
       y     = "Default Rate (%)") +
  theme_classic()

# --- Feature distributions by default status ---
# Origination variables 
continuous_orig <- c("credit_score", "orig_ltv", "orig_dti", "orig_interest_rate")
feature_stats_orig <- rbindlist(lapply(continuous_orig, function(var) {
  loan_level[, .(
    variable        = var,
    mean_default    = round(mean(get(var)[ever_default_12m == 1], na.rm = TRUE), 2),
    mean_no_default = round(mean(get(var)[ever_default_12m == 0], na.rm = TRUE), 2),
    sd_default      = round(sd(get(var)[ever_default_12m == 1],   na.rm = TRUE), 2),
    sd_no_default   = round(sd(get(var)[ever_default_12m == 0],   na.rm = TRUE), 2)
  )]
}))
print(feature_stats_orig)
##              variable mean_default mean_no_default sd_default sd_no_default
##                <char>        <num>           <num>      <num>         <num>
## 1:       credit_score       690.72          745.61      55.62         51.54
## 2:           orig_ltv        77.99           68.54      13.42         18.08
## 3:           orig_dti        40.96           34.37      11.40         12.14
## 4: orig_interest_rate         6.41            5.70       0.55          0.81
# Behavioural + macro variables — observation level
continuous_behav <- c("current_delinquency_status", "current_upb",
                      "delta_interest_rate", "mod_flag_12m",
                      "current_deferred_upb",
                      "unemployment_rate_lag1", "hpi_qoq_qlag1")
feature_stats_behav <- rbindlist(lapply(continuous_behav, function(var) {
  panel[, .(
    variable        = var,
    mean_default    = round(mean(get(var)[default_next_12m == 1], na.rm = TRUE), 2),
    mean_no_default = round(mean(get(var)[default_next_12m == 0], na.rm = TRUE), 2),
    sd_default      = round(sd(get(var)[default_next_12m == 1],   na.rm = TRUE), 2),
    sd_no_default   = round(sd(get(var)[default_next_12m == 0],   na.rm = TRUE), 2)
  )]
}))
print(feature_stats_behav)
##                      variable mean_default mean_no_default sd_default
##                        <char>        <num>           <num>      <num>
## 1: current_delinquency_status         0.12            0.01       0.37
## 2:                current_upb    198189.42       176828.25   98538.34
## 3:        delta_interest_rate        -0.01            0.00       0.15
## 4:               mod_flag_12m         0.00            0.00       0.02
## 5:       current_deferred_upb         4.57            3.91     537.48
## 6:     unemployment_rate_lag1         7.77            7.96       2.53
## 7:              hpi_qoq_qlag1        -1.76           -0.98       2.40
##    sd_no_default
##            <num>
## 1:          0.12
## 2:     102407.95
## 3:          0.09
## 4:          0.01
## 5:        653.89
## 6:          2.52
## 7:          1.87
# --- DTI distribution ---

ggplot(panel[!is.na(orig_dti)],
       aes(x = orig_dti)) +
  geom_histogram(binwidth = 2, fill = "orange", color = "white") +
  labs(title = "DTI Distribution", x = "DTI", y = "Count") +
  theme_classic()

# --- Delinquency status distribution ---

ggplot(panel[!is.na(current_delinquency_status)],
       aes(x = factor(current_delinquency_status),
           fill = factor(default_next_12m))) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = c("steelblue", "orange"),
                    labels = c("No Default", "Default")) +
  labs(title = "12-Month Default Rate by Current Delinquency Status",
       x     = "Current Delinquency Status (months past due)",
       y     = "Proportion",
       fill  = "") +
  theme_classic()

# --- Macro controls over time ---

# Average lagged unemployment and HPI by month across all loans
macro_time <- panel[, .(
  avg_unemp = mean(unemployment_rate_lag1, na.rm = TRUE),
  avg_hpi   = mean(hpi_qoq_qlag1, na.rm = TRUE)
), by = monthly_reporting_period][order(monthly_reporting_period)]

macro_time[, date := as.Date(paste0(
  substr(as.character(monthly_reporting_period), 1, 4), "-",
  substr(as.character(monthly_reporting_period), 5, 6), "-01"
))]


macro_time <- panel[, .(
  unemp_lag1 = mean(unemployment_rate_lag1, na.rm = TRUE),
  unemp_lag2 = mean(unemployment_rate_lag2, na.rm = TRUE),
  unemp_lag3 = mean(unemployment_rate_lag3, na.rm = TRUE),
  unemp_lag4 = mean(unemployment_rate_lag4, na.rm = TRUE),
  hpi_lag1   = mean(hpi_qoq_qlag1, na.rm = TRUE),
  hpi_lag2   = mean(hpi_qoq_qlag2, na.rm = TRUE),
  hpi_lag3   = mean(hpi_qoq_qlag3, na.rm = TRUE),
  hpi_lag4   = mean(hpi_qoq_qlag4, na.rm = TRUE)
), by = monthly_reporting_period][order(monthly_reporting_period)]

macro_time[, date := as.Date(paste0(
  substr(as.character(monthly_reporting_period), 1, 4), "-",
  substr(as.character(monthly_reporting_period), 5, 6), "-01"
))]

# Reshape to long for faceting
unemp_long <- melt(macro_time, id.vars = "date",
                   measure.vars = paste0("unemp_lag", 1:4),
                   variable.name = "lag", value.name = "unemployment_rate")
unemp_long[, lag := factor(lag,
                           levels = paste0("unemp_lag", 1:4),
                           labels = paste0("Lag ", 1:4))]

hpi_long <- melt(macro_time, id.vars = "date",
                 measure.vars = paste0("hpi_lag", 1:4),
                 variable.name = "lag", value.name = "hpi_qoq")
hpi_long[, lag := factor(lag,
                         levels = paste0("hpi_lag", 1:4),
                         labels = paste0("Lag ", 1:4))]

p1 <- ggplot(unemp_long, aes(x = date, y = unemployment_rate)) +
  geom_line(color = "steelblue") +
  facet_wrap(~lag, ncol = 2) +
  labs(title = "Average State Unemployment Rate Over Time",
       x = "", y = "Unemployment Rate (%)") +
  theme_classic()

p2 <- ggplot(hpi_long, aes(x = date, y = hpi_qoq)) +
  geom_line(color = "darkorange") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "grey50") +
  facet_wrap(~lag, ncol = 2) +
  labs(title = "Average State HPI QoQ Change Over Time",
       x = "", y = "HPI QoQ Change (%)") +
  theme_classic()

p1 / p2

# --- Variable dictionary ---
variable_dict <- data.frame(
  variable = c(
    "loan_sequence_number",
    "monthly_reporting_period",
    "month_index",
    "first_default_index",
    "first_payment_date",
    "orig_year",
    "orig_month",
    "orig_quarter",
    "credit_score", "orig_ltv", "orig_cltv", "orig_dti",
    "orig_interest_rate", "orig_upb", "orig_loan_term",
    "num_borrowers", "mi_pct", "occupancy_status",
    "amortization_type", "loan_purpose", "channel",
    "property_type", "property_state", "first_time_homebuyer",
    "io_indicator",
    "current_delinquency_status", "loan_age", "mod_flag_12m",
    "delta_interest_rate", "current_upb", "current_deferred_upb",
    "zero_balance_code",
    paste0("unemployment_rate_lag", 1:4),
    paste0("hpi_qoq_qlag", 1:4),
    "default_next_12m"
  ),
  type = c(
    rep("Identifier", 2),
    rep("Temporal", 6),
    rep("Origination", 17),
    rep("Behavioural", 7),
    rep("Macro", 8),
    "Outcome"
  ),
  description = c(
    "Unique loan identifier used to link origination and performance records",
    "Year-month of the performance observation (YYYYMM integer format)",
    "Linear month index derived from monthly_reporting_period: 12 * (YYYYMM %/% 100) + (YYYYMM %% 100). Used to compute forward default horizon",
    "Linear month index of the first default event (90+ DPD or zero balance code 03/09) for the loan; NA if no default observed in the data",
    "Date of first scheduled payment (YYYYMM string); source for orig_year, orig_month, and orig_quarter",
    "Calendar year of loan origination, extracted from first_payment_date (characters 1-4)",
    "Calendar month of loan origination, extracted from first_payment_date (characters 5-6)",
    "Origination quarter as ordered factor (e.g. '2007Q3'), derived from orig_year and orig_month",
    "Borrower FICO credit score at origination. Recoded from 9999 to NA",
    "Original loan-to-value ratio at origination (%). Recoded from 999 to NA",
    "Original combined loan-to-value ratio including all liens (%). Recoded from 999 to NA",
    "Original debt-to-income ratio at origination (%). Recoded from 999 to NA",
    "Note rate at origination (%)",
    "Original unpaid principal balance (USD)",
    "Original loan term in months (e.g. 360 = 30-year). Values > 360 recoded to NA",
    "Number of borrowers on the loan. Recoded from 99 to NA",
    "Mortgage insurance coverage percentage. Recoded from 999 to NA",
    "Occupancy status: P = primary residence, S = second home, I = investment property",
    "Amortization type: FRM = fixed-rate mortgage, ARM = adjustable-rate mortgage",
    "Loan purpose: P = purchase, R = rate-and-term refinance, C = cash-out refinance",
    "Origination channel: R = retail, B = broker, C = correspondent, T = unspecified",
    "Property type: SF = single family, CO = condo, MH = manufactured housing",
    "Two-letter US state code of the mortgaged property (50 states + DC; territories PR/GU/VI excluded)",
    "First-time homebuyer flag (Y/N)",
    "Interest-only indicator (Y/N)",
    "Number of months past due at observation month t (0 = current, 1 = 30 DPD, 2 = 60 DPD). Observations at or after first default month are excluded from the panel",
    "Number of months since loan origination at observation month t",
    "Binary indicator: loan was modified at least once in the 12 months up to and including observation month t. For the first 11 months of each loan's panel history, assumes no prior modifications before the observation window",
    "Difference between current interest rate and original note rate at observation month t (%). Derived as current_interest_rate minus orig_interest_rate",
    "Outstanding unpaid principal balance at observation month t (USD)",
    "Deferred principal balance at observation month t (USD)",
    "Loan termination code: 01 = prepaid/matured, 03 = foreclosure, 09 = short sale, 15 = note sale, 16 = repurchase. NA for active loans",
    paste0("State-level monthly unemployment rate (%) lagged ", 1:4, " month(s). Source: BLS LAUS via FRED (series: [STATE]UR), matched on property_state and monthly_reporting_period"),
    paste0("State-level HPI quarter-over-quarter % change lagged ", 1:4, " quarter(s). Source: FHFA via FRED (series: [STATE]STHPI)."),
    "Binary outcome: equals 1 if the loan first reaches 90+ days past due or experiences a foreclosure/short sale within 12 months of observation month t, and 0 otherwise. Each defaulting loan receives exactly one positive flag at the earliest month where default enters the 12-month horizon after which the loan exits the risk set. Non-defaulting loans contribute 0s for every month they remain active. Observations after December 2011 are excluded to ensure a complete 12-month outcome window through December 2012"
  ),
  source = c(
    "Origination and Performance files",
    "Performance file (time-varying)",
    "Derived from monthly_reporting_period",
    "Derived from performance file",
    "Origination file",
    "Derived from origination file",
    "Derived from origination file",
    "Derived from origination file",
    rep("Origination file", 17),
    rep("Performance file (time-varying)", 7),
    rep("FRED (BLS LAUS)", 4),
    rep("FRED (FHFA)", 4),
    "Derived from performance file"
  )
)

write.csv(variable_dict,
          file.path(OUTPUT_DIR, "variable_dictionary.csv"),
          row.names = FALSE)

# --- Summary statistics and categorical variables distribution ---

orig_vars <- c("credit_score", "orig_ltv", "orig_cltv", "orig_dti",
               "orig_interest_rate", "orig_upb", "orig_loan_term",
               "num_borrowers", "mi_pct")

stargazer(as.data.frame(loan_level[, ..orig_vars]),
          type         = "html",
          title        = "Panel A: Origination Variables (loan level)",
          digits       = 2,
          summary.stat = c("n", "mean", "sd", "min", "p25", "median", "p75", "max"),
          out          = file.path(OUTPUT_DIR, "summary_origination.html"))
## 
## <table style="text-align:center"><caption><strong>Panel A: Origination Variables (loan level)</strong></caption>
## <tr><td colspan="9" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">Statistic</td><td>N</td><td>Mean</td><td>St. Dev.</td><td>Min</td><td>Pctl(25)</td><td>Median</td><td>Pctl(75)</td><td>Max</td></tr>
## <tr><td colspan="9" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">credit_score</td><td>240,443</td><td>741.56</td><td>53.80</td><td>300</td><td>706</td><td>754</td><td>785</td><td>850</td></tr>
## <tr><td style="text-align:left">orig_ltv</td><td>240,552</td><td>69.24</td><td>17.95</td><td>6</td><td>59</td><td>75</td><td>80</td><td>125</td></tr>
## <tr><td style="text-align:left">orig_cltv</td><td>240,546</td><td>71.34</td><td>18.80</td><td>6</td><td>60</td><td>75</td><td>80</td><td>193</td></tr>
## <tr><td style="text-align:left">orig_dti</td><td>220,093</td><td>34.87</td><td>12.21</td><td>1</td><td>26</td><td>35</td><td>44</td><td>65</td></tr>
## <tr><td style="text-align:left">orig_interest_rate</td><td>240,565</td><td>5.75</td><td>0.82</td><td>3.25</td><td>5.00</td><td>5.88</td><td>6.38</td><td>9.79</td></tr>
## <tr><td style="text-align:left">orig_upb</td><td>240,565</td><td>196,477.30</td><td>107,205.80</td><td>9,000</td><td>114,000</td><td>173,000</td><td>260,000</td><td>1,095,000</td></tr>
## <tr><td style="text-align:left">orig_loan_term</td><td>240,357</td><td>326.92</td><td>68.19</td><td>60</td><td>360</td><td>360</td><td>360</td><td>360</td></tr>
## <tr><td style="text-align:left">num_borrowers</td><td>240,491</td><td>1.56</td><td>0.50</td><td>1</td><td>1</td><td>2</td><td>2</td><td>2</td></tr>
## <tr><td style="text-align:left">mi_pct</td><td>240,562</td><td>3.17</td><td>8.59</td><td>0</td><td>0</td><td>0</td><td>0</td><td>40</td></tr>
## <tr><td colspan="9" style="border-bottom: 1px solid black"></td></tr></table>
behav_vars <- c("current_delinquency_status", "loan_age", "current_upb",
                "delta_interest_rate", "mod_flag_12m", "current_deferred_upb")

stargazer(as.data.frame(panel[, ..behav_vars]),
          type         = "html",
          title        = "Panel B: Behavioural Variables (observation level)",
          digits       = 2,
          summary.stat = c("n", "mean", "sd", "min", "p25", "median", "p75", "max"),
          out          = file.path(OUTPUT_DIR, "summary_behavioural.html"))
## 
## <table style="text-align:center"><caption><strong>Panel B: Behavioural Variables (observation level)</strong></caption>
## <tr><td colspan="9" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">Statistic</td><td>N</td><td>Mean</td><td>St. Dev.</td><td>Min</td><td>Pctl(25)</td><td>Median</td><td>Pctl(75)</td><td>Max</td></tr>
## <tr><td colspan="9" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">current_delinquency_status</td><td>7,312,948</td><td>0.01</td><td>0.12</td><td>0</td><td>0</td><td>0</td><td>0</td><td>2</td></tr>
## <tr><td style="text-align:left">loan_age</td><td>7,312,965</td><td>19.41</td><td>14.83</td><td>0</td><td>8</td><td>16</td><td>28</td><td>71</td></tr>
## <tr><td style="text-align:left">current_upb</td><td>7,312,965</td><td>176,880.10</td><td>102,404.10</td><td>0.00</td><td>100,522.90</td><td>154,118.70</td><td>234,667.40</td><td>1,094,000.00</td></tr>
## <tr><td style="text-align:left">delta_interest_rate</td><td>7,312,965</td><td>-0.002</td><td>0.09</td><td>-5.75</td><td>0.00</td><td>0.00</td><td>0.00</td><td>2.25</td></tr>
## <tr><td style="text-align:left">mod_flag_12m</td><td>7,312,965</td><td>0.0000</td><td>0.01</td><td>0</td><td>0</td><td>0</td><td>0</td><td>1</td></tr>
## <tr><td style="text-align:left">current_deferred_upb</td><td>7,312,965</td><td>3.91</td><td>653.63</td><td>0.00</td><td>0.00</td><td>0.00</td><td>0.00</td><td>251,900.00</td></tr>
## <tr><td colspan="9" style="border-bottom: 1px solid black"></td></tr></table>
macro_vars <- c(paste0("unemployment_rate_lag", 1:4),
                paste0("hpi_qoq_qlag", 1:4))

stargazer(as.data.frame(panel[, ..macro_vars]),
          type         = "html",
          title        = "Panel C: Macro Controls (observation level, lagged)",
          digits       = 2,
          summary.stat = c("n", "mean", "sd", "min", "p25", "median", "p75", "max"),
          out          = file.path(OUTPUT_DIR, "summary_macro.html"))
## 
## <table style="text-align:center"><caption><strong>Panel C: Macro Controls (observation level, lagged)</strong></caption>
## <tr><td colspan="9" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">Statistic</td><td>N</td><td>Mean</td><td>St. Dev.</td><td>Min</td><td>Pctl(25)</td><td>Median</td><td>Pctl(75)</td><td>Max</td></tr>
## <tr><td colspan="9" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">unemployment_rate_lag1</td><td>7,312,965</td><td>7.95</td><td>2.52</td><td>2.30</td><td>5.70</td><td>8.20</td><td>10.00</td><td>14.30</td></tr>
## <tr><td style="text-align:left">unemployment_rate_lag2</td><td>7,312,965</td><td>7.89</td><td>2.55</td><td>2.30</td><td>5.60</td><td>8.10</td><td>9.90</td><td>14.30</td></tr>
## <tr><td style="text-align:left">unemployment_rate_lag3</td><td>7,312,965</td><td>7.83</td><td>2.57</td><td>2.30</td><td>5.50</td><td>8.00</td><td>9.90</td><td>14.30</td></tr>
## <tr><td style="text-align:left">unemployment_rate_lag4</td><td>7,312,965</td><td>7.76</td><td>2.59</td><td>2.30</td><td>5.40</td><td>7.90</td><td>9.90</td><td>14.30</td></tr>
## <tr><td style="text-align:left">hpi_qoq_qlag1</td><td>7,312,965</td><td>-0.98</td><td>1.87</td><td>-10.02</td><td>-1.87</td><td>-0.67</td><td>0.35</td><td>6.94</td></tr>
## <tr><td style="text-align:left">hpi_qoq_qlag2</td><td>7,312,965</td><td>-1.01</td><td>1.91</td><td>-10.02</td><td>-1.92</td><td>-0.74</td><td>0.31</td><td>8.48</td></tr>
## <tr><td style="text-align:left">hpi_qoq_qlag3</td><td>7,312,965</td><td>-0.88</td><td>1.99</td><td>-10.02</td><td>-1.79</td><td>-0.62</td><td>0.42</td><td>10.32</td></tr>
## <tr><td style="text-align:left">hpi_qoq_qlag4</td><td>7,312,965</td><td>-0.69</td><td>2.04</td><td>-10.02</td><td>-1.64</td><td>-0.47</td><td>0.55</td><td>10.32</td></tr>
## <tr><td colspan="9" style="border-bottom: 1px solid black"></td></tr></table>
cat_vars <- c("occupancy_status", "amortization_type", "loan_purpose",
              "channel", "property_type", "first_time_homebuyer", "io_indicator")

cat_table <- rbindlist(lapply(cat_vars, function(v) {
  dt <- loan_level[, .(N = .N,
                       Default_Rate = round(100 * mean(ever_default_12m), 2)),
                   by = v]
  setnames(dt, v, "Category")
  dt[, Variable := v]
  dt[, Pct := round(100 * N / nrow(loan_level), 2)]
  dt[, .(Variable, Category, N, Pct, Default_Rate)]
}))[order(Variable, -N)]

stargazer(cat_table,
          type    = "html",
          title   = "Panel D: Categorical Variables (loan level)",
          summary = FALSE,
          rownames = FALSE,
          digits  = 2,
          out     = file.path(OUTPUT_DIR, "summary_categorical.html"))
## 
## <table style="text-align:center"><caption><strong>Panel D: Categorical Variables (loan level)</strong></caption>
## <tr><td colspan="5" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">Variable</td><td>Category</td><td>N</td><td>Pct</td><td>Default_Rate</td></tr>
## <tr><td colspan="5" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">amortization_type</td><td>FRM</td><td>240,565</td><td>100</td><td>7.39</td></tr>
## <tr><td style="text-align:left">channel</td><td>R</td><td>122,719</td><td>51.01</td><td>5.48</td></tr>
## <tr><td style="text-align:left">channel</td><td>T</td><td>65,749</td><td>27.33</td><td>14.29</td></tr>
## <tr><td style="text-align:left">channel</td><td>C</td><td>34,499</td><td>14.34</td><td>2.47</td></tr>
## <tr><td style="text-align:left">channel</td><td>B</td><td>17,598</td><td>7.32</td><td>4.54</td></tr>
## <tr><td style="text-align:left">first_time_homebuyer</td><td>N</td><td>217,299</td><td>90.33</td><td>7.26</td></tr>
## <tr><td style="text-align:left">first_time_homebuyer</td><td>Y</td><td>23,223</td><td>9.65</td><td>8.59</td></tr>
## <tr><td style="text-align:left">first_time_homebuyer</td><td></td><td>43</td><td>0.02</td><td>0</td></tr>
## <tr><td style="text-align:left">io_indicator</td><td>N</td><td>240,565</td><td>100</td><td>7.39</td></tr>
## <tr><td style="text-align:left">loan_purpose</td><td>P</td><td>86,133</td><td>35.80</td><td>7.23</td></tr>
## <tr><td style="text-align:left">loan_purpose</td><td>N</td><td>79,366</td><td>32.99</td><td>5.33</td></tr>
## <tr><td style="text-align:left">loan_purpose</td><td>C</td><td>75,066</td><td>31.20</td><td>9.74</td></tr>
## <tr><td style="text-align:left">occupancy_status</td><td>P</td><td>215,465</td><td>89.57</td><td>7.36</td></tr>
## <tr><td style="text-align:left">occupancy_status</td><td>I</td><td>13,288</td><td>5.52</td><td>9.20</td></tr>
## <tr><td style="text-align:left">occupancy_status</td><td>S</td><td>11,812</td><td>4.91</td><td>5.85</td></tr>
## <tr><td style="text-align:left">property_type</td><td>SF</td><td>179,667</td><td>74.69</td><td>7.40</td></tr>
## <tr><td style="text-align:left">property_type</td><td>PU</td><td>40,845</td><td>16.98</td><td>6.39</td></tr>
## <tr><td style="text-align:left">property_type</td><td>CO</td><td>17,495</td><td>7.27</td><td>9.08</td></tr>
## <tr><td style="text-align:left">property_type</td><td>MH</td><td>1,805</td><td>0.75</td><td>14.57</td></tr>
## <tr><td style="text-align:left">property_type</td><td>CP</td><td>753</td><td>0.31</td><td>2.52</td></tr>
## <tr><td colspan="5" style="border-bottom: 1px solid black"></td></tr></table>
# --- Distribution of default events by unemployment level ------
panel[default_next_12m == 1, 
      .(n_defaults = .N), 
      by = cut(unemployment_rate_lag4, 
               breaks = c(0, 5, 7.4, 9.8, 11.8, Inf),
               right = FALSE)][order(cut)]
##           cut n_defaults
##        <fctr>      <int>
## 1:      [0,5)       3896
## 2:    [5,7.4)       5738
## 3:  [7.4,9.8)       3849
## 4: [9.8,11.8)       3358
## 5: [11.8,Inf)        927
# --- Correlation ---

panel_cor <- panel[!is.na(default_next_12m)]

# Sample for speed, as Spearman is slow on large data
set.seed(42)
sample_idx <- sample(nrow(panel_cor), min(100000, nrow(panel_cor)))
panel_sample <- panel_cor[sample_idx]

all_vars <- c("current_delinquency_status", "loan_age", "current_upb",
              "delta_interest_rate", "mod_flag_12m", "current_deferred_upb",
              paste0("unemployment_rate_lag", 1:4),
              paste0("hpi_qoq_qlag", 1:4),
              "credit_score", "orig_ltv", "orig_cltv", "orig_dti",
              "orig_interest_rate", "orig_upb", "orig_loan_term",
              "num_borrowers", "mi_pct", "default_next_12m")

cor_matrix <- cor(panel_sample[, ..all_vars],
                  use    = "complete.obs",
                  method = "spearman")
cor_matrix <- round(cor_matrix, 2)

write.table(cor_matrix,
            file.path(OUTPUT_DIR, "correlation_matrix.txt"),
            sep       = "\t",
            row.names = TRUE,
            quote     = FALSE)
 
# Predictive power
print(sort(cor_matrix[, "default_next_12m"], decreasing = TRUE))
##           default_next_12m current_delinquency_status 
##                       1.00                       0.06 
##                   orig_dti         orig_interest_rate 
##                       0.03                       0.03 
##                   orig_ltv                  orig_cltv 
##                       0.02                       0.02 
##             orig_loan_term                     mi_pct 
##                       0.02                       0.02 
##                   loan_age                current_upb 
##                       0.01                       0.01 
##                   orig_upb        delta_interest_rate 
##                       0.01                       0.00 
##               mod_flag_12m       current_deferred_upb 
##                       0.00                       0.00 
##     unemployment_rate_lag1     unemployment_rate_lag2 
##                       0.00                       0.00 
##     unemployment_rate_lag3     unemployment_rate_lag4 
##                       0.00                       0.00 
##              hpi_qoq_qlag1              hpi_qoq_qlag2 
##                      -0.01                      -0.01 
##              hpi_qoq_qlag3              hpi_qoq_qlag4 
##                      -0.01                      -0.01 
##              num_borrowers               credit_score 
##                      -0.01                      -0.04
# Multicollinearity
pred_vars <- setdiff(all_vars, "default_next_12m")
cor_predictors <- cor(panel_sample[, ..pred_vars],
                      use    = "complete.obs",
                      method = "spearman")
round(cor_predictors, 2)
##                            current_delinquency_status loan_age current_upb
## current_delinquency_status                       1.00     0.06       -0.01
## loan_age                                         0.06     1.00       -0.14
## current_upb                                     -0.01    -0.14        1.00
## delta_interest_rate                             -0.02     0.01       -0.02
## mod_flag_12m                                     0.00     0.00        0.00
## current_deferred_upb                             0.00     0.00        0.00
## unemployment_rate_lag1                           0.02     0.35        0.02
## unemployment_rate_lag2                           0.02     0.36        0.02
## unemployment_rate_lag3                           0.02     0.36        0.02
## unemployment_rate_lag4                           0.02     0.37        0.01
## hpi_qoq_qlag1                                   -0.01    -0.12       -0.09
## hpi_qoq_qlag2                                   -0.01    -0.22       -0.08
## hpi_qoq_qlag3                                   -0.01    -0.21       -0.08
## hpi_qoq_qlag4                                   -0.01    -0.20       -0.08
## credit_score                                    -0.09    -0.10        0.04
## orig_ltv                                         0.02     0.05        0.09
## orig_cltv                                        0.01     0.06        0.12
## orig_dti                                         0.03     0.03        0.13
## orig_interest_rate                               0.05     0.22       -0.22
## orig_upb                                        -0.01    -0.09        0.96
## orig_loan_term                                   0.02     0.04        0.22
## num_borrowers                                   -0.03    -0.03        0.16
## mi_pct                                           0.02     0.03       -0.02
##                            delta_interest_rate mod_flag_12m
## current_delinquency_status               -0.02         0.00
## loan_age                                  0.01         0.00
## current_upb                              -0.02         0.00
## delta_interest_rate                       1.00        -0.13
## mod_flag_12m                             -0.13         1.00
## current_deferred_upb                     -0.29         0.15
## unemployment_rate_lag1                   -0.02         0.00
## unemployment_rate_lag2                   -0.02         0.00
## unemployment_rate_lag3                   -0.02         0.00
## unemployment_rate_lag4                   -0.02         0.00
## hpi_qoq_qlag1                             0.00        -0.01
## hpi_qoq_qlag2                             0.01         0.00
## hpi_qoq_qlag3                             0.01         0.00
## hpi_qoq_qlag4                             0.01         0.00
## credit_score                              0.02        -0.01
## orig_ltv                                 -0.01         0.00
## orig_cltv                                -0.01         0.00
## orig_dti                                 -0.02         0.01
## orig_interest_rate                       -0.01         0.01
## orig_upb                                 -0.02         0.00
## orig_loan_term                           -0.01         0.00
## num_borrowers                             0.01         0.00
## mi_pct                                    0.00         0.00
##                            current_deferred_upb unemployment_rate_lag1
## current_delinquency_status                 0.00                   0.02
## loan_age                                   0.00                   0.35
## current_upb                                0.00                   0.02
## delta_interest_rate                       -0.29                  -0.02
## mod_flag_12m                               0.15                   0.00
## current_deferred_upb                       1.00                   0.01
## unemployment_rate_lag1                     0.01                   1.00
## unemployment_rate_lag2                     0.01                   1.00
## unemployment_rate_lag3                     0.01                   0.99
## unemployment_rate_lag4                     0.01                   0.98
## hpi_qoq_qlag1                              0.00                  -0.29
## hpi_qoq_qlag2                              0.00                  -0.40
## hpi_qoq_qlag3                              0.00                  -0.48
## hpi_qoq_qlag4                             -0.01                  -0.53
## credit_score                               0.00                   0.12
## orig_ltv                                   0.00                  -0.08
## orig_cltv                                  0.00                  -0.10
## orig_dti                                   0.00                  -0.05
## orig_interest_rate                         0.00                  -0.28
## orig_upb                                   0.00                   0.05
## orig_loan_term                             0.00                  -0.03
## num_borrowers                             -0.01                   0.00
## mi_pct                                     0.00                  -0.04
##                            unemployment_rate_lag2 unemployment_rate_lag3
## current_delinquency_status                   0.02                   0.02
## loan_age                                     0.36                   0.36
## current_upb                                  0.02                   0.02
## delta_interest_rate                         -0.02                  -0.02
## mod_flag_12m                                 0.00                   0.00
## current_deferred_upb                         0.01                   0.01
## unemployment_rate_lag1                       1.00                   0.99
## unemployment_rate_lag2                       1.00                   1.00
## unemployment_rate_lag3                       1.00                   1.00
## unemployment_rate_lag4                       0.99                   1.00
## hpi_qoq_qlag1                               -0.28                  -0.27
## hpi_qoq_qlag2                               -0.38                  -0.37
## hpi_qoq_qlag3                               -0.47                  -0.45
## hpi_qoq_qlag4                               -0.53                  -0.52
## credit_score                                 0.12                   0.12
## orig_ltv                                    -0.08                  -0.08
## orig_cltv                                   -0.10                  -0.10
## orig_dti                                    -0.05                  -0.05
## orig_interest_rate                          -0.29                  -0.30
## orig_upb                                     0.04                   0.04
## orig_loan_term                              -0.03                  -0.04
## num_borrowers                                0.00                   0.00
## mi_pct                                      -0.04                  -0.04
##                            unemployment_rate_lag4 hpi_qoq_qlag1 hpi_qoq_qlag2
## current_delinquency_status                   0.02         -0.01         -0.01
## loan_age                                     0.37         -0.12         -0.22
## current_upb                                  0.01         -0.09         -0.08
## delta_interest_rate                         -0.02          0.00          0.01
## mod_flag_12m                                 0.00         -0.01          0.00
## current_deferred_upb                         0.01          0.00          0.00
## unemployment_rate_lag1                       0.98         -0.29         -0.40
## unemployment_rate_lag2                       0.99         -0.28         -0.38
## unemployment_rate_lag3                       1.00         -0.27         -0.37
## unemployment_rate_lag4                       1.00         -0.26         -0.36
## hpi_qoq_qlag1                               -0.26          1.00          0.42
## hpi_qoq_qlag2                               -0.36          0.42          1.00
## hpi_qoq_qlag3                               -0.44          0.09          0.52
## hpi_qoq_qlag4                               -0.52          0.36          0.23
## credit_score                                 0.12         -0.03         -0.05
## orig_ltv                                    -0.08          0.09          0.09
## orig_cltv                                   -0.10          0.09          0.10
## orig_dti                                    -0.05         -0.01          0.00
## orig_interest_rate                          -0.31          0.05          0.11
## orig_upb                                     0.04         -0.09         -0.10
## orig_loan_term                              -0.04         -0.02         -0.01
## num_borrowers                                0.00          0.00          0.00
## mi_pct                                      -0.04          0.05          0.05
##                            hpi_qoq_qlag3 hpi_qoq_qlag4 credit_score orig_ltv
## current_delinquency_status         -0.01         -0.01        -0.09     0.02
## loan_age                           -0.21         -0.20        -0.10     0.05
## current_upb                        -0.08         -0.08         0.04     0.09
## delta_interest_rate                 0.01          0.01         0.02    -0.01
## mod_flag_12m                        0.00          0.00        -0.01     0.00
## current_deferred_upb                0.00         -0.01         0.00     0.00
## unemployment_rate_lag1             -0.48         -0.53         0.12    -0.08
## unemployment_rate_lag2             -0.47         -0.53         0.12    -0.08
## unemployment_rate_lag3             -0.45         -0.52         0.12    -0.08
## unemployment_rate_lag4             -0.44         -0.52         0.12    -0.08
## hpi_qoq_qlag1                       0.09          0.36        -0.03     0.09
## hpi_qoq_qlag2                       0.52          0.23        -0.05     0.09
## hpi_qoq_qlag3                       1.00          0.54        -0.07     0.10
## hpi_qoq_qlag4                       0.54          1.00        -0.07     0.10
## credit_score                       -0.07         -0.07         1.00    -0.18
## orig_ltv                            0.10          0.10        -0.18     1.00
## orig_cltv                           0.11          0.11        -0.18     0.93
## orig_dti                            0.00          0.01        -0.21     0.15
## orig_interest_rate                  0.14          0.16        -0.33     0.23
## orig_upb                           -0.10         -0.10         0.05     0.08
## orig_loan_term                     -0.01          0.00        -0.08     0.25
## num_borrowers                       0.00          0.00         0.01    -0.06
## mi_pct                              0.05          0.05        -0.14     0.61
##                            orig_cltv orig_dti orig_interest_rate orig_upb
## current_delinquency_status      0.01     0.03               0.05    -0.01
## loan_age                        0.06     0.03               0.22    -0.09
## current_upb                     0.12     0.13              -0.22     0.96
## delta_interest_rate            -0.01    -0.02              -0.01    -0.02
## mod_flag_12m                    0.00     0.01               0.01     0.00
## current_deferred_upb            0.00     0.00               0.00     0.00
## unemployment_rate_lag1         -0.10    -0.05              -0.28     0.05
## unemployment_rate_lag2         -0.10    -0.05              -0.29     0.04
## unemployment_rate_lag3         -0.10    -0.05              -0.30     0.04
## unemployment_rate_lag4         -0.10    -0.05              -0.31     0.04
## hpi_qoq_qlag1                   0.09    -0.01               0.05    -0.09
## hpi_qoq_qlag2                   0.10     0.00               0.11    -0.10
## hpi_qoq_qlag3                   0.11     0.00               0.14    -0.10
## hpi_qoq_qlag4                   0.11     0.01               0.16    -0.10
## credit_score                   -0.18    -0.21              -0.33     0.05
## orig_ltv                        0.93     0.15               0.23     0.08
## orig_cltv                       1.00     0.16               0.24     0.11
## orig_dti                        0.16     1.00               0.16     0.13
## orig_interest_rate              0.24     0.16               1.00    -0.23
## orig_upb                        0.11     0.13              -0.23     1.00
## orig_loan_term                  0.26     0.14               0.25     0.19
## num_borrowers                  -0.04    -0.11              -0.10     0.17
## mi_pct                          0.53     0.09               0.19    -0.03
##                            orig_loan_term num_borrowers mi_pct
## current_delinquency_status           0.02         -0.03   0.02
## loan_age                             0.04         -0.03   0.03
## current_upb                          0.22          0.16  -0.02
## delta_interest_rate                 -0.01          0.01   0.00
## mod_flag_12m                         0.00          0.00   0.00
## current_deferred_upb                 0.00         -0.01   0.00
## unemployment_rate_lag1              -0.03          0.00  -0.04
## unemployment_rate_lag2              -0.03          0.00  -0.04
## unemployment_rate_lag3              -0.04          0.00  -0.04
## unemployment_rate_lag4              -0.04          0.00  -0.04
## hpi_qoq_qlag1                       -0.02          0.00   0.05
## hpi_qoq_qlag2                       -0.01          0.00   0.05
## hpi_qoq_qlag3                       -0.01          0.00   0.05
## hpi_qoq_qlag4                        0.00          0.00   0.05
## credit_score                        -0.08          0.01  -0.14
## orig_ltv                             0.25         -0.06   0.61
## orig_cltv                            0.26         -0.04   0.53
## orig_dti                             0.14         -0.11   0.09
## orig_interest_rate                   0.25         -0.10   0.19
## orig_upb                             0.19          0.17  -0.03
## orig_loan_term                       1.00         -0.06   0.12
## num_borrowers                       -0.06          1.00  -0.04
## mi_pct                               0.12         -0.04   1.00
# Univariate correlation (Macroeconomic lags)
results_unemp <- rbindlist(lapply(1:4, function(i) {
  var <- paste0("unemployment_rate_lag", i)
  fit <- glm(default_next_12m ~ get(var), data = panel_cor, family = binomial)
  
  data.table(
    Variable = paste("Unemployment (Lag", i, ")"),
    AIC      = round(AIC(fit), 2),
    LogLik   = round(as.numeric(logLik(fit)), 2),
    Coef     = round(coef(fit)[2], 4),
    P_value  = round(summary(fit)$coefficients[2, 4], 4)
  )
}))

results_hpi <- rbindlist(lapply(1:4, function(i) {
  var <- paste0("hpi_qoq_qlag", i)
  fit <- glm(default_next_12m ~ get(var), data = panel_cor, family = binomial)
  
  data.table(
    Variable = paste("HPI QoQ (Lag", i, ")"),
    AIC      = round(AIC(fit), 2),
    LogLik   = round(as.numeric(logLik(fit)), 2),
    Coef     = round(coef(fit)[2], 4),
    P_value  = round(summary(fit)$coefficients[2, 4], 4)
  )
}))

results_macro <- rbindlist(list(results_unemp, results_hpi))

print(results_macro)
##                 Variable      AIC    LogLik    Coef P_value
##                   <char>    <num>     <num>   <num>   <num>
## 1: Unemployment (Lag 1 ) 249326.1 -124661.1 -0.0294       0
## 2: Unemployment (Lag 2 ) 249248.1 -124622.0 -0.0391       0
## 3: Unemployment (Lag 3 ) 249157.2 -124576.6 -0.0478       0
## 4: Unemployment (Lag 4 ) 249062.2 -124529.1 -0.0554       0
## 5:      HPI QoQ (Lag 1 ) 246703.0 -123349.5 -0.1839       0
## 6:      HPI QoQ (Lag 2 ) 247201.2 -123598.6 -0.1660       0
## 7:      HPI QoQ (Lag 3 ) 248257.5 -124126.7 -0.1197       0
## 8:      HPI QoQ (Lag 4 ) 248609.2 -124302.6 -0.0992       0