library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
setwd("~/Desktop/Statistical Model/assignment_4")
orig <- read_delim("sample_orig_2007.txt", delim = "|", col_names = FALSE, na = c("", "NA"), trim_ws = TRUE)
## Rows: 50000 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "|"
## chr (15): X3, X6, X8, X14, X15, X16, X17, X18, X19, X20, X21, X23, X24, X25,...
## dbl (14): X1, X2, X4, X5, X7, X9, X10, X11, X12, X13, X22, X28, X30, X32
## lgl (3): X26, X27, X29
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names <- c("credit_score", "first_payment_date", "first_time_homebuyer_flag", "maturity_date", "msa", "mi_pct", "num_units", "occupancy_status", "orig_cltv", "orig_dti", "orig_upb", "orig_ltv", "orig_interest_rate", "channel", "ppm_flag", "amortization_type", "property_state", "property_type", "postal_code",
"loan_id", "loan_purpose", "orig_loan_term", "num_borrowers", "seller_name", "servicer_name", "super_conforming_flag", "pre_relief_loan_seq_num", "program_indicator", "relief_refinance_indicator", "property_valuation_method", "io_indicator", "mi_cancel_indicator" )
names(orig)[1:32] <- names
orig <- orig %>% mutate(loan_id = trimws(as.character(loan_id)))
head(orig)
## # A tibble: 6 × 32
## credit_score first_payment_date first_time_homebuyer_flag maturity_date msa
## <dbl> <dbl> <chr> <dbl> <dbl>
## 1 769 200702 N 203701 24220
## 2 598 200703 N 203702 NA
## 3 777 200703 N 203702 20260
## 4 628 200703 N 203702 35644
## 5 651 200703 N 203702 NA
## 6 652 200702 N 203701 NA
## # ℹ 27 more variables: mi_pct <chr>, num_units <dbl>, occupancy_status <chr>,
## # orig_cltv <dbl>, orig_dti <dbl>, orig_upb <dbl>, orig_ltv <dbl>,
## # orig_interest_rate <dbl>, channel <chr>, ppm_flag <chr>,
## # amortization_type <chr>, property_state <chr>, property_type <chr>,
## # postal_code <chr>, loan_id <chr>, loan_purpose <chr>, orig_loan_term <dbl>,
## # num_borrowers <chr>, seller_name <chr>, servicer_name <chr>,
## # super_conforming_flag <lgl>, pre_relief_loan_seq_num <lgl>, …
svcg <- read_delim("sample_svcg_2007.txt", delim = "|", col_names = FALSE, col_select = c(1, 4, 5, 7, 22), col_types = cols(X1 = col_character(), X4 = col_character(), X5 = col_double(), X7 = col_character(), X22 = col_double()), na = c("", "NA"), trim_ws = TRUE)
names(svcg) <- c("loan_id", "status", "age", "defect_flag_raw", "loss")
svcg <- svcg %>% mutate(loan_id = trimws(as.character(loan_id)))
head(svcg)
## # A tibble: 6 × 5
## loan_id status age defect_flag_raw loss
## <chr> <chr> <dbl> <chr> <dbl>
## 1 F07Q10000023 0 1 <NA> NA
## 2 F07Q10000023 0 2 <NA> NA
## 3 F07Q10000023 0 3 <NA> NA
## 4 F07Q10000023 0 4 <NA> NA
## 5 F07Q10000023 0 5 <NA> NA
## 6 F07Q10000023 0 6 <NA> NA
B. Filter the servicing history to only examine loans in the first 36 months after origination (using column 5, loan age). This should remove 26 unique loan ids.
n_loans_before <- n_distinct(svcg$loan_id)
svcg_36 <- svcg %>% filter(!is.na(age), age <= 36)
n_loans_after <- n_distinct(svcg_36$loan_id)
n_removed_loans <- n_loans_before - n_loans_after
n_removed_loans
## [1] 26
C.Create a flag for loan default. A loan receives a default flag of ‘1’ if it was ever 90+ days delinquent or repossessed during its first 36 months (using column 4, status), or if took a loss with a negative dollar amount during its first 36 months (using column 22, loss). By my count 4870 loans defaulted (after filtering to first 36 months).
svcg_36 <- svcg_36 %>% mutate(status_num = suppressWarnings(as.numeric(status)))
loan_default <- svcg_36 %>%
group_by(loan_id) %>%
summarise(
default_flag = as.integer(
any(
(!is.na(status_num) & status_num >= 3) |
status == "RA" |
(!is.na(loss) & loss < 0) )),
.groups = "drop")
sum(loan_default$default_flag)
## [1] 4870
D. Create a flag for loan defects (using column 7, settlement date). A defect is a case where the loan should not have been extended to a risky borrower, and then the mortgage was fradulently sold to Freddie Mac under the false guarantee that the loans were all offered to safe borrowers. A loan receives a defect flag of ‘1’ if there was a defect settlement during its first 36 months. By my count 472 loans had defects.
svcg_36 <- svcg_36 %>%
mutate(
defect_row = !is.na(defect_flag_raw) & defect_flag_raw != 0)
loan_defect <- svcg_36 %>%
group_by(loan_id) %>%
summarise(
defect_flag = as.integer(any(defect_row, na.rm = TRUE)),
.groups = "drop")
num_defect <- sum(loan_defect$defect_flag)
num_defect
## [1] 472
E. Merge these two flags on to the origination data, removing the 26 loans with no observations in the first 36 months. F. Your finished dataset should be the 50,000 row origination data, minus 26 rows, plus two new columns for the default and defect flags.
loan_flags <- loan_default %>%inner_join(loan_defect, by = "loan_id")
orig <- orig %>% mutate(loan_id = as.character(loan_id))
df <- orig %>% inner_join(loan_flags, by = "loan_id")
nrow(orig)
## [1] 50000
nrow(df)
## [1] 49974
head(df)
## # A tibble: 6 × 34
## credit_score first_payment_date first_time_homebuyer_flag maturity_date msa
## <dbl> <dbl> <chr> <dbl> <dbl>
## 1 769 200702 N 203701 24220
## 2 598 200703 N 203702 NA
## 3 777 200703 N 203702 20260
## 4 628 200703 N 203702 35644
## 5 651 200703 N 203702 NA
## 6 652 200702 N 203701 NA
## # ℹ 29 more variables: mi_pct <chr>, num_units <dbl>, occupancy_status <chr>,
## # orig_cltv <dbl>, orig_dti <dbl>, orig_upb <dbl>, orig_ltv <dbl>,
## # orig_interest_rate <dbl>, channel <chr>, ppm_flag <chr>,
## # amortization_type <chr>, property_state <chr>, property_type <chr>,
## # postal_code <chr>, loan_id <chr>, loan_purpose <chr>, orig_loan_term <dbl>,
## # num_borrowers <chr>, seller_name <chr>, servicer_name <chr>,
## # super_conforming_flag <lgl>, pre_relief_loan_seq_num <lgl>, …
## part 2: Preliminary Logistic Regression
### 2.1 Cleaning and preparing variables
model_df <- df %>%
mutate(
orig_dti = ifelse(orig_dti %in% c(999, 0), NA, orig_dti),
orig_cltv = ifelse(orig_cltv %in% c(999, 0), NA, orig_cltv)
) %>%
select(default_flag, credit_score, orig_cltv, orig_dti, orig_upb, orig_interest_rate) %>%
filter(
!is.na(default_flag),
!is.na(credit_score),
!is.na(orig_cltv),
!is.na(orig_dti),
!is.na(orig_upb),
!is.na(orig_interest_rate)
)
nrow(model_df)
## [1] 48769
table(model_df$default_flag)
##
## 0 1
## 44043 4726
### 2.2 logistic regression
logit_mod <- glm(
default_flag ~ credit_score + orig_cltv + orig_dti + orig_upb + orig_interest_rate,
data = model_df,
family = binomial(link="logit")
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logit_mod)
##
## Call:
## glm(formula = default_flag ~ credit_score + orig_cltv + orig_dti +
## orig_upb + orig_interest_rate, family = binomial(link = "logit"),
## data = model_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.004e+00 3.484e-01 -2.883 0.00394 **
## credit_score -1.273e-02 2.942e-04 -43.255 < 2e-16 ***
## orig_cltv 2.072e-02 1.082e-03 19.147 < 2e-16 ***
## orig_dti 2.383e-02 1.413e-03 16.858 < 2e-16 ***
## orig_upb 2.682e-06 1.644e-07 16.310 < 2e-16 ***
## orig_interest_rate 7.284e-01 3.774e-02 19.297 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 31040 on 48768 degrees of freedom
## Residual deviance: 26498 on 48763 degrees of freedom
## AIC: 26510
##
## Number of Fisher Scoring iterations: 7
### 2.3 Odds ratios and effects
exp(coef(logit_mod))
## (Intercept) credit_score orig_cltv orig_dti
## 0.3662451 0.9873529 1.0209333 1.0241135
## orig_upb orig_interest_rate
## 1.0000027 2.0716928
### 2.4 Deviance comparison with null model
logit_mod$null.deviance
## [1] 31039.58
logit_mod$deviance
## [1] 26498.17
anova(logit_mod, test="Chisq")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: default_flag
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 48768 31040
## credit_score 1 2921.74 48767 28118 < 2.2e-16 ***
## orig_cltv 1 706.71 48766 27411 < 2.2e-16 ***
## orig_dti 1 397.57 48765 27014 < 2.2e-16 ***
## orig_upb 1 162.10 48764 26852 < 2.2e-16 ***
## orig_interest_rate 1 353.30 48763 26498 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
### 2.5 Effect size calculation
b <- coef(logit_mod)
exp(b["credit_score"]*50) # +50 fico
## credit_score
## 0.5291999
exp(b["orig_cltv"]*10) # +10 cltv
## orig_cltv
## 1.230195
exp(b["orig_dti"]*5) # +5 dti
## orig_dti
## 1.126524
exp(b["orig_interest_rate"]*1) # +1% interest
## orig_interest_rate
## 2.071693
### Interpretation: All predictors are statistically significant (p < 0.001). The sign and magnitude of the coefficients follow economic intuition. Higher FICO scores reduce the probability of default, while higher CLTV, DTI, UPB, and interest rates increase the probability of default.
### The model provides strong explanatory power. The null deviance is 31,039.6, while the residual deviance falls to 26,498.2, indicating a significantly better model fit than a flat default rate.
### In terms of effect sizes, a 50-point increase in credit score lowers default odds by approximately 47% (OR = 0.53). A 10-point increase in CLTV increases odds by roughly 23%. A 5-point increase in DTI increases odds by roughly 12.6%. A 1-percentage-point increase in the interest rate doubles the odds of default (OR = 2.07).
# Part 3: alternative probit model
### 3.1 Probit model with same predictors as the logit
probit_mod <- glm(
default_flag ~ credit_score + orig_cltv + orig_dti + orig_upb + orig_interest_rate,
data = model_df,
family = binomial(link = "probit")
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(probit_mod)
##
## Call:
## glm(formula = default_flag ~ credit_score + orig_cltv + orig_dti +
## orig_upb + orig_interest_rate, family = binomial(link = "probit"),
## data = model_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.203e-01 1.845e-01 -2.278 0.0227 *
## credit_score -6.953e-03 1.543e-04 -45.074 <2e-16 ***
## orig_cltv 1.047e-02 5.488e-04 19.072 <2e-16 ***
## orig_dti 1.233e-02 7.365e-04 16.738 <2e-16 ***
## orig_upb 1.411e-06 8.759e-08 16.109 <2e-16 ***
## orig_interest_rate 3.857e-01 2.017e-02 19.124 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 31040 on 48768 degrees of freedom
## Residual deviance: 26438 on 48763 degrees of freedom
## AIC: 26450
##
## Number of Fisher Scoring iterations: 8
### 3.2 AIC comparison: logit vs probit
AIC(logit_mod, probit_mod)
## df AIC
## logit_mod 6 26510.17
## probit_mod 6 26450.18
# Smaller AIC => better fit
### 3.3 Confusion matrices and model metrics
# Choose a classification cutoff
# Option 1: use 0.5
# cutoff <- 0.5
# Option 2 (often better here): use overall default rate as cutoff
cutoff <- mean(model_df$default_flag)
# Predicted probabilities
logit_prob <- predict(logit_mod, type = "response")
probit_prob <- predict(probit_mod, type = "response")
# Class predictions: 1 = predicted default, 0 = predicted non-default
logit_pred <- ifelse(logit_prob >= cutoff, 1, 0)
probit_pred <- ifelse(probit_prob >= cutoff, 1, 0)
# Confusion matrices
logit_cm <- table(
Truth = model_df$default_flag,
Predicted = logit_pred
)
probit_cm <- table(
Truth = model_df$default_flag,
Predicted = probit_pred
)
logit_cm
## Predicted
## Truth 0 1
## 0 30279 13764
## 1 1261 3465
probit_cm
## Predicted
## Truth 0 1
## 0 29673 14370
## 1 1195 3531
# Helper to compute metrics from a 2x2 confusion matrix (rows: Truth, cols: Predicted)
get_metrics <- function(cm) {
# Ensure names are treated as characters "0" and "1"
tn <- as.numeric(cm["0", "0"])
fp <- as.numeric(cm["0", "1"])
fn <- as.numeric(cm["1", "0"])
tp <- as.numeric(cm["1", "1"])
accuracy <- (tp + tn) / (tp + tn + fp + fn)
precision <- tp / (tp + fp)
recall <- tp / (tp + fn) # sensitivity / true positive rate
spec <- tn / (tn + fp) # specificity / true negative rate
bal_acc <- (recall + spec) / 2
data.frame(
Accuracy = accuracy,
Precision = precision,
Recall = recall,
BalancedAccuracy = bal_acc
)
}
logit_metrics <- get_metrics(logit_cm)
probit_metrics <- get_metrics(probit_cm)
# Small comparison table for the write-up
model_comparison <- rbind(
Logit = logit_metrics,
Probit = probit_metrics
)
model_comparison
## Accuracy Precision Recall BalancedAccuracy
## Logit 0.6919149 0.2011144 0.7331782 0.7103327
## Probit 0.6808423 0.1972516 0.7471435 0.7104357
# Optional: relabel 0/1 to something more readable
dimnames(logit_cm) <- list(Truth = c("No default", "Default"),
Predicted = c("No default", "Default"))
dimnames(probit_cm) <- dimnames(logit_cm)
library(ggplot2)
plot_cm <- function(cm, title = "Confusion matrix") {
df <- as.data.frame(cm)
names(df) <- c("Truth", "Predicted", "Count")
ggplot(df, aes(x = Predicted, y = Truth, fill = Count)) +
geom_tile(color = "white") +
geom_text(aes(label = Count), size = 5) +
scale_fill_gradient(low = "white", high = "steelblue") +
labs(title = title, x = "Predicted class", y = "True class") +
theme_minimal(base_size = 12)
}
# In your Rmd chunks:
plot_cm(logit_cm, "Logit model – confusion matrix")
plot_cm(probit_cm, "Probit model – confusion matrix")