library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
setwd("~/Desktop/Statistical Model/assignment_4")
1. Build the regression model data, starting with the origination dataset, and then removing rows and adding columns as suggested by the monthly performance dataset.

A. Examine the servicing data carefully, making sure that you have coded numbers and strings as intended and that you are aware of the ways in which missing or invalid data are coded for each variable. We will only need columns 1, 4, 5, 7, and 22 (loan ID, status, age, defect settlement flag, and loss), but you may come back to this dataset for more variables.
orig <- read_delim("sample_orig_2007.txt", delim = "|", col_names = FALSE, na = c("", "NA"), trim_ws = TRUE)
## Rows: 50000 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "|"
## chr (15): X3, X6, X8, X14, X15, X16, X17, X18, X19, X20, X21, X23, X24, X25,...
## dbl (14): X1, X2, X4, X5, X7, X9, X10, X11, X12, X13, X22, X28, X30, X32
## lgl  (3): X26, X27, X29
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names <- c("credit_score", "first_payment_date", "first_time_homebuyer_flag", "maturity_date", "msa", "mi_pct", "num_units", "occupancy_status", "orig_cltv", "orig_dti", "orig_upb", "orig_ltv", "orig_interest_rate", "channel", "ppm_flag", "amortization_type", "property_state", "property_type",  "postal_code",
"loan_id", "loan_purpose", "orig_loan_term", "num_borrowers", "seller_name", "servicer_name", "super_conforming_flag", "pre_relief_loan_seq_num", "program_indicator", "relief_refinance_indicator", "property_valuation_method", "io_indicator", "mi_cancel_indicator" )

names(orig)[1:32] <- names
orig <- orig %>% mutate(loan_id = trimws(as.character(loan_id)))
head(orig)
## # A tibble: 6 × 32
##   credit_score first_payment_date first_time_homebuyer_flag maturity_date   msa
##          <dbl>              <dbl> <chr>                             <dbl> <dbl>
## 1          769             200702 N                                203701 24220
## 2          598             200703 N                                203702    NA
## 3          777             200703 N                                203702 20260
## 4          628             200703 N                                203702 35644
## 5          651             200703 N                                203702    NA
## 6          652             200702 N                                203701    NA
## # ℹ 27 more variables: mi_pct <chr>, num_units <dbl>, occupancy_status <chr>,
## #   orig_cltv <dbl>, orig_dti <dbl>, orig_upb <dbl>, orig_ltv <dbl>,
## #   orig_interest_rate <dbl>, channel <chr>, ppm_flag <chr>,
## #   amortization_type <chr>, property_state <chr>, property_type <chr>,
## #   postal_code <chr>, loan_id <chr>, loan_purpose <chr>, orig_loan_term <dbl>,
## #   num_borrowers <chr>, seller_name <chr>, servicer_name <chr>,
## #   super_conforming_flag <lgl>, pre_relief_loan_seq_num <lgl>, …
# Read only the servicing columns we need: loan id (1), status (4), age (5),
# defect settlement date (7), and loss (22)
svcg <- read_delim(
  "sample_svcg_2007.txt",
  delim = "|",
  col_names = FALSE,
  col_select = c(1, 4, 5, 7, 22),
  col_types = cols(
    X1  = col_character(),
    X4  = col_character(),
    X5  = col_double(),
    X7  = col_character(),
    X22 = col_double()
  ),
  na = c("", "NA"),
  trim_ws = TRUE
)

names(svcg) <- c("loan_id", "status", "age", "defect_flag_raw", "loss")

svcg <- svcg %>% mutate(loan_id = trimws(as.character(loan_id)))
head(svcg)
## # A tibble: 6 × 5
##   loan_id      status   age defect_flag_raw  loss
##   <chr>        <chr>  <dbl> <chr>           <dbl>
## 1 F07Q10000023 0          1 <NA>               NA
## 2 F07Q10000023 0          2 <NA>               NA
## 3 F07Q10000023 0          3 <NA>               NA
## 4 F07Q10000023 0          4 <NA>               NA
## 5 F07Q10000023 0          5 <NA>               NA
## 6 F07Q10000023 0          6 <NA>               NA
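Before filtering, it is worth confirming how the status codes and the missing values are represented. A quick tabulation along these lines (a sketch; counts not shown here) makes the coding explicit:

# How is delinquency status coded, and how common is each value?
svcg %>% count(status, sort = TRUE)

# How often are the defect settlement date and loss columns populated?
svcg %>% summarise(
  n_defect_dates = sum(!is.na(defect_flag_raw)),
  n_losses       = sum(!is.na(loss))
)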

B. Filter the servicing history to examine only the first 36 months after origination for each loan (using column 5, loan age). This should remove 26 unique loan IDs.

n_loans_before <- n_distinct(svcg$loan_id)
svcg_36 <- svcg %>% filter(!is.na(age), age <= 36)
n_loans_after <- n_distinct(svcg_36$loan_id)
n_removed_loans <- n_loans_before - n_loans_after

n_removed_loans 
## [1] 26

C. Create a flag for loan default. A loan receives a default flag of '1' if it was ever 90+ days delinquent or repossessed during its first 36 months (using column 4, status), or if it took a loss with a negative dollar amount during its first 36 months (using column 22, loss). By my count, 4,870 loans defaulted (after filtering to the first 36 months).

# Status is stored as a string ("0", "1", "2", ..., "RA"); converting the numeric
# codes lets us test for 90+ days delinquent (3 or more months) directly
svcg_36 <- svcg_36 %>% mutate(status_num = suppressWarnings(as.numeric(status)))

loan_default <- svcg_36 %>%
  group_by(loan_id) %>%
  summarise(
    default_flag = as.integer(
      any(
        (!is.na(status_num) & status_num >= 3) |    # ever 90+ days delinquent
        (!is.na(status) & status == "RA") |         # ever repossessed ("RA" status)
        (!is.na(loss) & loss < 0)                   # ever a negative-dollar loss
      )
    ),
    .groups = "drop")

sum(loan_default$default_flag) 
## [1] 4870
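To see which condition drives the flag (serious delinquency/repossession versus a negative loss), the same logic can be tabulated separately (a sketch; counts omitted here):

# Breakdown of what triggers the default flag
svcg_36 %>%
  group_by(loan_id) %>%
  summarise(
    ever_90plus_or_repo = any((!is.na(status_num) & status_num >= 3) |
                              (!is.na(status) & status == "RA")),
    ever_negative_loss  = any(!is.na(loss) & loss < 0),
    .groups = "drop"
  ) %>%
  count(ever_90plus_or_repo, ever_negative_loss)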

D. Create a flag for loan defects (using column 7, settlement date). A defect is a case where the loan should not have been extended to a risky borrower, and the mortgage was then fraudulently sold to Freddie Mac under the false guarantee that the loans were all offered to safe borrowers. A loan receives a defect flag of '1' if there was a defect settlement during its first 36 months. By my count, 472 loans had defects.

# Column 7 holds the defect settlement date, so any non-missing, non-zero value
# marks a defect settlement on that monthly record
svcg_36 <- svcg_36 %>%
  mutate(
    defect_row = !is.na(defect_flag_raw) & defect_flag_raw != 0)

loan_defect <- svcg_36 %>%
  group_by(loan_id) %>%
  summarise(
    defect_flag = as.integer(any(defect_row, na.rm = TRUE)),
    .groups = "drop")

num_defect <- sum(loan_defect$defect_flag)
num_defect
## [1] 472

E. Merge these two flags onto the origination data, removing the 26 loans with no observations in the first 36 months.

F. Your finished dataset should be the 50,000-row origination data, minus 26 rows, plus two new columns for the default and defect flags.

# Combine the two per-loan flags into a single table keyed by loan_id
loan_flags <- loan_default %>% inner_join(loan_defect, by = "loan_id")

# The inner join keeps only loans that appear in the 36-month servicing window,
# which drops the 26 loans with no observations there
df <- orig %>% inner_join(loan_flags, by = "loan_id")
nrow(orig)
## [1] 50000
nrow(df)
## [1] 49974
head(df)
## # A tibble: 6 × 34
##   credit_score first_payment_date first_time_homebuyer_flag maturity_date   msa
##          <dbl>              <dbl> <chr>                             <dbl> <dbl>
## 1          769             200702 N                                203701 24220
## 2          598             200703 N                                203702    NA
## 3          777             200703 N                                203702 20260
## 4          628             200703 N                                203702 35644
## 5          651             200703 N                                203702    NA
## 6          652             200702 N                                203701    NA
## # ℹ 29 more variables: mi_pct <chr>, num_units <dbl>, occupancy_status <chr>,
## #   orig_cltv <dbl>, orig_dti <dbl>, orig_upb <dbl>, orig_ltv <dbl>,
## #   orig_interest_rate <dbl>, channel <chr>, ppm_flag <chr>,
## #   amortization_type <chr>, property_state <chr>, property_type <chr>,
## #   postal_code <chr>, loan_id <chr>, loan_purpose <chr>, orig_loan_term <dbl>,
## #   num_borrowers <chr>, seller_name <chr>, servicer_name <chr>,
## #   super_conforming_flag <lgl>, pre_relief_loan_seq_num <lgl>, …
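As a consistency check, the row accounting can be verified directly (a minimal sketch using the objects built above):

# 50,000 originations minus the 26 loans with no records in the first 36 months
stopifnot(nrow(df) == nrow(orig) - n_removed_loans)
# Each loan should appear exactly once in the merged data
stopifnot(!any(duplicated(df$loan_id)))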
# Part 2: Preliminary logistic regression
### 2.1 Cleaning and preparing variables
model_df <- df %>%
  mutate(
    # Treat the 999 missing-value code and zero values as NA for DTI and CLTV
    orig_dti = ifelse(orig_dti %in% c(999, 0), NA, orig_dti),
    orig_cltv = ifelse(orig_cltv %in% c(999, 0), NA, orig_cltv)
  ) %>%
  # Keep the outcome and the five origination predictors, complete cases only
  select(default_flag, credit_score, orig_cltv, orig_dti, orig_upb, orig_interest_rate) %>%
  filter(
    !is.na(default_flag),
    !is.na(credit_score),
    !is.na(orig_cltv),
    !is.na(orig_dti),
    !is.na(orig_upb),
    !is.na(orig_interest_rate)
  )

nrow(model_df)
## [1] 48769
table(model_df$default_flag)
## 
##     0     1 
## 44043  4726
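The raw default rate in the modeling sample is about 9.7% (4,726 of 48,769 loans), which is worth keeping in mind when choosing a classification cutoff in Part 3. It can be computed directly:

# Share of defaulting loans in the modeling sample (roughly 0.097)
mean(model_df$default_flag)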
### 2.2 Logistic regression
logit_mod <- glm(
  default_flag ~ credit_score + orig_cltv + orig_dti + orig_upb + orig_interest_rate,
  data = model_df,
  family = binomial(link="logit")
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logit_mod)
## 
## Call:
## glm(formula = default_flag ~ credit_score + orig_cltv + orig_dti + 
##     orig_upb + orig_interest_rate, family = binomial(link = "logit"), 
##     data = model_df)
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -1.004e+00  3.484e-01  -2.883  0.00394 ** 
## credit_score       -1.273e-02  2.942e-04 -43.255  < 2e-16 ***
## orig_cltv           2.072e-02  1.082e-03  19.147  < 2e-16 ***
## orig_dti            2.383e-02  1.413e-03  16.858  < 2e-16 ***
## orig_upb            2.682e-06  1.644e-07  16.310  < 2e-16 ***
## orig_interest_rate  7.284e-01  3.774e-02  19.297  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 31040  on 48768  degrees of freedom
## Residual deviance: 26498  on 48763  degrees of freedom
## AIC: 26510
## 
## Number of Fisher Scoring iterations: 7
### 2.3 Odds ratios and effects
exp(coef(logit_mod))
##        (Intercept)       credit_score          orig_cltv           orig_dti 
##          0.3662451          0.9873529          1.0209333          1.0241135 
##           orig_upb orig_interest_rate 
##          1.0000027          2.0716928
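For reporting, Wald-style 95% confidence intervals can be put on the odds-ratio scale as well. A sketch (values not shown here):

# Odds ratios with 95% Wald confidence intervals
exp(cbind(OR = coef(logit_mod), confint.default(logit_mod)))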
### 2.4 Deviance comparison with null model
logit_mod$null.deviance
## [1] 31039.58
logit_mod$deviance
## [1] 26498.17
anova(logit_mod, test="Chisq")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: default_flag
## 
## Terms added sequentially (first to last)
## 
## 
##                    Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                               48768      31040              
## credit_score        1  2921.74     48767      28118 < 2.2e-16 ***
## orig_cltv           1   706.71     48766      27411 < 2.2e-16 ***
## orig_dti            1   397.57     48765      27014 < 2.2e-16 ***
## orig_upb            1   162.10     48764      26852 < 2.2e-16 ***
## orig_interest_rate  1   353.30     48763      26498 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
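The table above adds terms sequentially; the overall improvement over the intercept-only model can also be summarised with a single likelihood-ratio statistic (a sketch):

# Likelihood-ratio test of the full model against the null model
lr_stat <- logit_mod$null.deviance - logit_mod$deviance   # about 4,541 on 5 df
pchisq(lr_stat, df = 5, lower.tail = FALSE)               # effectively zero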
### 2.5 Effect size calculation
b <- coef(logit_mod)

exp(b["credit_score"]*50)        # +50 fico
## credit_score 
##    0.5291999
exp(b["orig_cltv"]*10)           # +10 cltv
## orig_cltv 
##  1.230195
exp(b["orig_dti"]*5)             # +5 dti
## orig_dti 
## 1.126524
exp(b["orig_interest_rate"]*1)   # +1% interest
## orig_interest_rate 
##           2.071693
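These multipliers follow directly from the log-odds form of the model: a change of $\Delta x$ in a single predictor multiplies the odds of default by

$$\exp(\beta \, \Delta x), \qquad \text{e.g. } \exp(-0.01273 \times 50) \approx 0.53,$$

which reproduces the 50-point credit-score effect computed above.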
### Interpretation

All predictors are statistically significant (p < 0.001), and the signs and magnitudes of the coefficients follow economic intuition: higher FICO scores reduce the probability of default, while higher CLTV, DTI, UPB, and interest rates increase it.

The model has substantial explanatory power: the null deviance of 31,039.6 falls to a residual deviance of 26,498.2, a significantly better fit than a flat default rate.

In terms of effect sizes, a 50-point increase in credit score lowers the odds of default by approximately 47% (OR = 0.53), a 10-point increase in CLTV raises the odds by roughly 23%, a 5-point increase in DTI raises them by roughly 12.6%, and a 1-percentage-point increase in the interest rate roughly doubles the odds of default (OR = 2.07).
# Part 3: Alternative probit model
### 3.1 Probit model with same predictors as the logit

probit_mod <- glm(
  default_flag ~ credit_score + orig_cltv + orig_dti + orig_upb + orig_interest_rate,
  data   = model_df,
  family = binomial(link = "probit")
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(probit_mod)
## 
## Call:
## glm(formula = default_flag ~ credit_score + orig_cltv + orig_dti + 
##     orig_upb + orig_interest_rate, family = binomial(link = "probit"), 
##     data = model_df)
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -4.203e-01  1.845e-01  -2.278   0.0227 *  
## credit_score       -6.953e-03  1.543e-04 -45.074   <2e-16 ***
## orig_cltv           1.047e-02  5.488e-04  19.072   <2e-16 ***
## orig_dti            1.233e-02  7.365e-04  16.738   <2e-16 ***
## orig_upb            1.411e-06  8.759e-08  16.109   <2e-16 ***
## orig_interest_rate  3.857e-01  2.017e-02  19.124   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 31040  on 48768  degrees of freedom
## Residual deviance: 26438  on 48763  degrees of freedom
## AIC: 26450
## 
## Number of Fisher Scoring iterations: 8
### 3.2 AIC comparison: logit vs probit
AIC(logit_mod, probit_mod)
##            df      AIC
## logit_mod   6 26510.17
## probit_mod  6 26450.18
# Smaller AIC indicates a better fit; the probit model has the lower AIC here
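Because both models use the same five predictors, the AIC gap comes entirely from the deviance term. A quick sketch reconstructing the reported AIC by hand (for a binary-response glm, AIC equals the residual deviance plus twice the number of estimated coefficients):

# Reconstruct AIC = residual deviance + 2 * (number of coefficients)
logit_mod$deviance  + 2 * length(coef(logit_mod))    # should match AIC(logit_mod)
probit_mod$deviance + 2 * length(coef(probit_mod))   # should match AIC(probit_mod)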
### 3.3 Confusion matrices and model metrics

# Choose a classification cutoff
# Option 1: use 0.5
# cutoff <- 0.5

# Option 2 (often better here): use overall default rate as cutoff
cutoff <- mean(model_df$default_flag)

# Predicted probabilities
logit_prob  <- predict(logit_mod,  type = "response")
probit_prob <- predict(probit_mod, type = "response")

# Class predictions: 1 = predicted default, 0 = predicted non-default
logit_pred  <- ifelse(logit_prob  >= cutoff, 1, 0)
probit_pred <- ifelse(probit_prob >= cutoff, 1, 0)

# Confusion matrices
logit_cm <- table(
  Truth     = model_df$default_flag,
  Predicted = logit_pred
)

probit_cm <- table(
  Truth     = model_df$default_flag,
  Predicted = probit_pred
)

logit_cm
##      Predicted
## Truth     0     1
##     0 30279 13764
##     1  1261  3465
probit_cm
##      Predicted
## Truth     0     1
##     0 29673 14370
##     1  1195  3531
# Helper to compute metrics from a 2x2 confusion matrix (rows: Truth, cols: Predicted)
get_metrics <- function(cm) {
  # Ensure names are treated as characters "0" and "1"
  tn <- as.numeric(cm["0", "0"])
  fp <- as.numeric(cm["0", "1"])
  fn <- as.numeric(cm["1", "0"])
  tp <- as.numeric(cm["1", "1"])
  
  accuracy  <- (tp + tn) / (tp + tn + fp + fn)
  precision <- tp / (tp + fp)
  recall    <- tp / (tp + fn)   # sensitivity / true positive rate
  spec      <- tn / (tn + fp)   # specificity / true negative rate
  bal_acc   <- (recall + spec) / 2
  
  data.frame(
    Accuracy         = accuracy,
    Precision        = precision,
    Recall           = recall,
    BalancedAccuracy = bal_acc
  )
}

logit_metrics  <- get_metrics(logit_cm)
probit_metrics <- get_metrics(probit_cm)

# Small comparison table for the write-up
model_comparison <- rbind(
  Logit  = logit_metrics,
  Probit = probit_metrics
)

model_comparison
##         Accuracy Precision    Recall BalancedAccuracy
## Logit  0.6919149 0.2011144 0.7331782        0.7103327
## Probit 0.6808423 0.1972516 0.7471435        0.7104357
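The same helper can be reused at the conventional 0.5 cutoff noted in Option 1 above; with a default rate near 10%, that cutoff is likely to flag far fewer loans as defaults, trading recall for precision. A sketch (results not shown here):

# Re-run the comparison at a 0.5 cutoff (Option 1 above)
logit_cm_50  <- table(Truth = model_df$default_flag, Predicted = ifelse(logit_prob  >= 0.5, 1, 0))
probit_cm_50 <- table(Truth = model_df$default_flag, Predicted = ifelse(probit_prob >= 0.5, 1, 0))
# Note: get_metrics assumes both predicted classes appear; a cutoff that never
# predicts default would need a guard for the missing column
rbind(Logit = get_metrics(logit_cm_50), Probit = get_metrics(probit_cm_50))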
# Optional: relabel 0/1 to something more readable
dimnames(logit_cm)  <- list(Truth     = c("No default", "Default"),
                            Predicted = c("No default", "Default"))
dimnames(probit_cm) <- dimnames(logit_cm)

library(ggplot2)

plot_cm <- function(cm, title = "Confusion matrix") {
  df <- as.data.frame(cm)
  names(df) <- c("Truth", "Predicted", "Count")
  
  ggplot(df, aes(x = Predicted, y = Truth, fill = Count)) +
    geom_tile(color = "white") +
    geom_text(aes(label = Count), size = 5) +
    scale_fill_gradient(low = "white", high = "steelblue") +
    labs(title = title, x = "Predicted class", y = "True class") +
    theme_minimal(base_size = 12)
}

# Plot both confusion matrices
plot_cm(logit_cm,  "Logit model – confusion matrix")

plot_cm(probit_cm, "Probit model – confusion matrix")