libraries:

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(caret)

## Loading required package: ggplot2

## Loading required package: lattice

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ✔ readr     2.1.5

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::lift()   masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(corrplot)

## corrplot 0.95 loaded

library(car)

## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:purrr':
## 
##     some
## 
## The following object is masked from 'package:dplyr':
## 
##     recode

library(ROCR)
library(ggplot2)
library(patchwork)
library(MASS)

## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:patchwork':
## 
##     area
## 
## The following object is masked from 'package:dplyr':
## 
##     select

library(e1071)
library(randomForest)

## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
## 
## The following object is masked from 'package:dplyr':
## 
##     combine

library(class)
library(pROC)

## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

library(kknn)

## 
## Attaching package: 'kknn'
## 
## The following object is masked from 'package:caret':
## 
##     contr.dummy

Loading in the data:

data <- read.csv("hmeq.csv")
dim(data)

## [1] 5960   13

Data Exploration:

str(data)

## 'data.frame':    5960 obs. of  13 variables:
##  $ BAD    : int  1 1 1 1 0 1 1 1 1 1 ...
##  $ LOAN   : int  1100 1300 1500 1500 1700 1700 1800 1800 2000 2000 ...
##  $ MORTDUE: num  25860 70053 13500 NA 97800 ...
##  $ VALUE  : num  39025 68400 16700 NA 112000 ...
##  $ REASON : chr  "HomeImp" "HomeImp" "HomeImp" "" ...
##  $ JOB    : chr  "Other" "Other" "Other" "" ...
##  $ YOJ    : num  10.5 7 4 NA 3 9 5 11 3 16 ...
##  $ DEROG  : int  0 0 0 NA 0 0 3 0 0 0 ...
##  $ DELINQ : int  0 2 0 NA 0 0 2 0 2 0 ...
##  $ CLAGE  : num  94.4 121.8 149.5 NA 93.3 ...
##  $ NINQ   : int  1 0 1 NA 0 1 1 0 1 0 ...
##  $ CLNO   : int  9 14 10 NA 14 8 17 8 12 13 ...
##  $ DEBTINC: num  NA NA NA NA NA ...

lapply(data[, c("LOAN", "MORTDUE", "VALUE", "YOJ", "CLAGE", "DEROG", "DELINQ", "NINQ","CLNO", "DEBTINC" )], summary)

## $LOAN
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1100   11100   16300   18608   23300   89900 
## 
## $MORTDUE
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    2063   46276   65019   73761   91488  399550     518 
## 
## $VALUE
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    8000   66076   89236  101776  119824  855909     112 
## 
## $YOJ
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   3.000   7.000   8.922  13.000  41.000     515 
## 
## $CLAGE
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     0.0   115.1   173.5   179.8   231.6  1168.2     308 
## 
## $DEROG
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.0000  0.0000  0.2546  0.0000 10.0000     708 
## 
## $DELINQ
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.0000  0.0000  0.4494  0.0000 15.0000     580 
## 
## $NINQ
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   0.000   1.000   1.186   2.000  17.000     510 
## 
## $CLNO
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     0.0    15.0    20.0    21.3    26.0    71.0     222 
## 
## $DEBTINC
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##   0.5245  29.1400  34.8183  33.7799  39.0031 203.3121     1267

lapply(data[, c("BAD", "REASON", "JOB")], unique)

## $BAD
## [1] 1 0
## 
## $REASON
## [1] "HomeImp" ""        "DebtCon"
## 
## $JOB
## [1] "Other"   ""        "Office"  "Sales"   "Mgr"     "ProfExe" "Self"

The Home Equity dataset (HMEQ) contains information on home equity loan performance for 5,960 home equity loans with 13 variables including a target variable. This data comes from kaggle.com and covers observations from recent applicants granted credit. The variables describing the data include the amount of the loan requested, the amount due on their mortgage, the value of their current property, the reason for the loan, job type, years at present job, number of major derogatory reports, number of delinquent credit lines, and age of oldest trade line in months, number of recent credit inquiries, number of existing credit lines, and debt-to-income ratio.

Remove unknown and missing:

data[data == " "] <- NA
data[data == ""] <- NA
data <- na.omit(data)
data <- data |> mutate_if(is.character, as.factor)
data$BAD <- as.factor(data$BAD)

lapply(data[, c("LOAN", "MORTDUE", "VALUE", "YOJ", "CLAGE", "DEROG", "DELINQ", "NINQ","CLNO", "DEBTINC" )], summary)

## $LOAN
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1700   12000   17000   19154   23825   89900 
## 
## $MORTDUE
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5076   49351   67279   76250   92987  399412 
## 
## $VALUE
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   21144   71235   94454  107501  122339  512650 
## 
## $YOJ
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    3.00    7.00    9.11   13.00   41.00 
## 
## $CLAGE
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##    0.4867  118.6879  176.7420  180.9937  230.4022 1168.2336 
## 
## $DEROG
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.1468  0.0000 10.0000 
## 
## $DELINQ
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.2788  0.0000 10.0000 
## 
## $NINQ
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.000   1.037   2.000  13.000 
## 
## $CLNO
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   16.00   21.00   22.11   27.00   64.00 
## 
## $DEBTINC
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##   0.8381  29.3626  35.1295  34.1354  39.0876 144.1890

lapply(data[, c("BAD", "REASON", "JOB")], unique)

## $BAD
## [1] 1 0
## Levels: 0 1
## 
## $REASON
## [1] HomeImp DebtCon
## Levels: DebtCon HomeImp
## 
## $JOB
## [1] Other   Office  Mgr     ProfExe Sales   Self   
## Levels: Mgr Office Other ProfExe Sales Self

After cleaning and processing, this leaves us with 3,364 observations.

Performing Correlation Matrix:

data_num <- dplyr::select_if(data, is.numeric)
M = cor(data_num)
corrplot(M, order = 'AOE', col = COL2('RdBu', 10), tl.col = "black",
        addCoef.col = "grey", number.cex = 0.5)

The correlation plot shows the overview of correlated numeric variables. The dark blue indicates high positive correlation. The plot shows a strong positive correlation between mortdue and value (0.87). The rest of the variables are weakly correlated, therefore, they won’t be considered significant.

table(data$BAD)

## 
##    0    1 
## 3064  300

There are 3,064 people who did not default on their loan and 300 people who did. 91% of the time, the clients did not default. This dataset is unbalanced, so lets balance it:

data_0 <- data[data$BAD == "0", ]  # "0" is the majority class
data_1 <- data[data$BAD == "1", ] # "1" is the minority class

set.seed(1)
sample_0 <- data_0[sample(nrow(data_0), nrow(data_1), replace = FALSE), ]

data_bal <- rbind(data_1, sample_0)
plot(data_bal$BAD)

reason_plot <- ggplot(aes(x = REASON), data = data_bal)+
  labs(title = "Count of Reasons why Clients Got a HomeEq Loan")+
  geom_bar()
reason_plot

reason_plot <- ggplot(aes(x = JOB), data = data_bal)+
  labs(title = "Count of Jobs of People who get HomeEq Loans")+
  geom_bar()
reason_plot

Most of the sampled data are for DebtCon (debt consolidation) and most of the people are Other jobs followed by ProfExe and Office.

Splitting the data

set.seed(1)
sample<- sample(nrow(data_bal), 0.7 * nrow(data_bal), replace = FALSE)
train <- data_bal[sample, ]
test <- data_bal[-sample, ]

Analysis Question 1: Is there a relationship between the amount of the loan requested and whether the client defaulted?

Because I am doing descriptive and explanatory models, we are going to be using the original unbalanced data to understand the true relationship between the variables.

shapiro.test(data$LOAN)

## 
##  Shapiro-Wilk normality test
## 
## data:  data$LOAN
## W = 0.84008, p-value < 2.2e-16

qqnorm(data$LOAN, main = "Q-Q Plot of LOAN")
qqline(data$LOAN, col = "red")

Because my data is not normally distributed, I will be using the Wilcoxon rank-sum test.

Hypothesis:

H0: There is no difference in loan amounts between defaulters and non-defaulters.

H1: There is a difference in loan amounts between defaulters and non-defaulters

wilcox <- wilcox.test(LOAN ~ BAD, data = data)
w <- wilcox$statistic
p <- wilcox$p.value

annotation <- paste0("Wilcoxon W = ", round(w,2), "\np-value = ", signif(p,3))

Because p < 0.05, I will reject the null hypothesis, which means there is a statistically significant difference in loan amounts between those who defaulted and those who did not.

ggplot(data, aes(BAD, LOAN)) +
  geom_boxplot() + 
  labs(title="Loan Amount by Default Status", x= "Default Status (0 = No, 1 = Yes)", y = "Amount Due")+
  annotate("text", x = 1.5, y = max(data$LOAN), label = annotation, hjust = 0.5, vjust = 1.5, size = 4.5)

The mean loan amount for default 0 is a little bit higher than default 1. There are a lot more high outliers for default 0, than default 1. This suggests that clients who do not default tend to have bigger loans.

Let’s see if the model agrees:

log_model_analysis_1 <- glm(BAD ~ LOAN, family = "binomial", data = data)
summary(log_model_analysis_1)

## 
## Call:
## glm(formula = BAD ~ LOAN, family = "binomial", data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.4714  -0.4466  -0.4337  -0.4142   2.4618  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.115e+00  1.274e-01  -16.60   <2e-16 ***
## LOAN        -1.118e-05  6.177e-06   -1.81   0.0703 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2022.7  on 3363  degrees of freedom
## Residual deviance: 2019.2  on 3362  degrees of freedom
## AIC: 2023.2
## 
## Number of Fisher Scoring iterations: 5

Based on this p-value > 0.05, Loan amount is not a significant predictor in default. However, at the 10% significance level, loan amount is marginally statistically significant. The coefficient being negative, suggests that higher loans are associated with lower chance of default, which is strange. Clients requesting larger loans are slightly less likely to default — but the effect is very weak and not strongly statistically significant.

outliers <- quantile(data$LOAN, 0.99, na.rm = TRUE)

data_no_outliers <- data[data$LOAN <= outliers, ]

log_model_analysis_1_no_outliers <- glm(BAD ~ LOAN, family = "binomial", data = data_no_outliers)

summary(log_model_analysis_1_no_outliers)

## 
## Call:
## glm(formula = BAD ~ LOAN, family = "binomial", data = data_no_outliers)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.4732  -0.4470  -0.4337  -0.4137   2.3662  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.106e+00  1.372e-01 -15.349   <2e-16 ***
## LOAN        -1.177e-05  6.916e-06  -1.702   0.0888 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2007  on 3329  degrees of freedom
## Residual deviance: 2004  on 3328  degrees of freedom
## AIC: 2008
## 
## Number of Fisher Scoring iterations: 5

shapiro.test(data_no_outliers$LOAN)

## 
##  Shapiro-Wilk normality test
## 
## data:  data_no_outliers$LOAN
## W = 0.91831, p-value < 2.2e-16

qqnorm(data_no_outliers$LOAN, main = "Q-Q Plot of LOAN")
qqline(data_no_outliers$LOAN, col = "red")

wilcox <- wilcox.test(LOAN ~ BAD, data = data_no_outliers)
w <- wilcox$statistic
p <- wilcox$p.value

annotation <- paste0("Wilcoxon W = ", round(w,2), "\np-value = ", signif(p,3))

ggplot(data_no_outliers, aes(BAD, LOAN)) +
  geom_boxplot() + 
  labs(title="Loan Amount by Default Status with 99th Percentile Cutoff", x= "Default Status (0 = No, 1 = Yes)", y = "Amount Due")+
  annotate("text", x = 1.5, y = max(data$LOAN), label = annotation, hjust = 0.5, vjust = 1.5, size = 4.5)

Because p < 0.05, I will reject the null hypothesis, which means there is a statistically significant difference in loan amounts between those who defaulted and those who did not.

Conclusion for Analysis 1: Clients who take bigger loan amounts generally do not default as much as those who take smaller loan amounts. However, there is a lot of very large loan outliers in those who do not default. After filtering the data to remove the 99th percentile outliers, the model’s AIC slightly increased. However, both of my logistic regressions confirms that although there is only a marginally significant relationship between loan amount and probability of default, the clients who take larger loans, tend to not default as much. This may do to wealthier, better-qualified clients tending to qualify for larger loans and are less likely to default whereas, small loans may be taken by more risky clients. This is not what I originally expected in my proposal. I had hypothesized that clients who take bigger loan amounts may be unable to repay these loans without defaulting. However, I did not take into consideration that the bank may not approve large loan amounts for those who are more at risk to default.

Analysis Question 2: How does the loan amount requested, the amount due on existing mortgage, and the debt-to-income ratio interact to predict whether a client will default on their HELOC?

pairs(data[, c("LOAN", "MORTDUE", "DEBTINC", "BAD")],
      col = ifelse(data$BAD == 1, "red", "green"),
      pch = 1,
      main = "Pairs Plot of LOAN, MORTDUE, DEBTINC, and BAD")

interaction_model_analysis_2 <- glm(BAD ~ LOAN * MORTDUE * DEBTINC, family = "binomial", data = data)
summary(interaction_model_analysis_2)

## 
## Call:
## glm(formula = BAD ~ LOAN * MORTDUE * DEBTINC, family = "binomial", 
##     data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1255  -0.4586  -0.3631  -0.2662   3.6781  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          -4.663e+00  1.408e+00  -3.312 0.000927 ***
## LOAN                 -2.826e-05  7.335e-05  -0.385 0.700071    
## MORTDUE              -1.981e-05  1.552e-05  -1.276 0.201812    
## DEBTINC               1.215e-01  3.756e-02   3.235 0.001217 ** 
## LOAN:MORTDUE          1.541e-10  5.865e-10   0.263 0.792745    
## LOAN:DEBTINC         -1.190e-06  1.878e-06  -0.633 0.526545    
## MORTDUE:DEBTINC      -9.090e-08  4.074e-07  -0.223 0.823422    
## LOAN:MORTDUE:DEBTINC  1.334e-11  1.500e-11   0.889 0.373859    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2022.7  on 3363  degrees of freedom
## Residual deviance: 1818.8  on 3356  degrees of freedom
## AIC: 1834.8
## 
## Number of Fisher Scoring iterations: 6

Conclusion for Analysis 2: This model suggests that the interaction terms between the loan amount requested, the amount due on existing mortgage, and the debt-to-income ratio do not interact meaningfully to whether the client will default on their loan. This suggests that these variables should be interpreted independent of each other and there is not complex relationships between them that affects whether the client defaults or not. With that being said, the debt-to-income ratio (DEBTINC) is statistically significant in this model which we should pay attention to as we proceed.

Analysis Question 3: How do the number of major derogatory reports, the number of delinquent credit lines, and the age of the oldest trade line interact to influence the probability of defaulting on a HELOC?

pairs(data[, c("DEROG", "DELINQ", "CLAGE", "BAD")],
      col = ifelse(data$BAD == 1, "red", "green"),
      pch = 1, 
      main = "Pairs Plot of DEROG, DELINQ, CLAGE, and BAD")

log_model_analysis_3 <- glm(BAD ~ DEROG * DELINQ * CLAGE, family = "binomial", data = data)
summary(log_model_analysis_3)

## 
## Call:
## glm(formula = BAD ~ DEROG * DELINQ * CLAGE, family = "binomial", 
##     data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.3970  -0.4258  -0.3295  -0.2514   4.5963  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -1.568e+00  1.942e-01  -8.073 6.83e-16 ***
## DEROG               7.920e-01  2.537e-01   3.121   0.0018 ** 
## DELINQ              2.939e-01  1.872e-01   1.570   0.1164    
## CLAGE              -7.700e-03  1.207e-03  -6.377 1.81e-10 ***
## DEROG:DELINQ       -2.388e-02  2.344e-01  -0.102   0.9189    
## DEROG:CLAGE        -1.584e-05  1.629e-03  -0.010   0.9922    
## DELINQ:CLAGE        2.102e-03  1.031e-03   2.040   0.0414 *  
## DEROG:DELINQ:CLAGE -1.311e-04  1.324e-03  -0.099   0.9211    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2022.7  on 3363  degrees of freedom
## Residual deviance: 1717.3  on 3356  degrees of freedom
## AIC: 1733.3
## 
## Number of Fisher Scoring iterations: 6

Conclusion for Analysis 3: In this model, the presence of derogatory reports and the age of a client’s oldest credit line are statistically significant in the probability of defaulting on a HELOC. The age of the credit line seems to reduce the risk of default especially for clients with few delinquent credit lines. This proves my proposed hypothesis that a longer credit history may indicate a better financial record even if the client has several derogatory reports or delinquent credit lines. Clients with longer credit histories are less likely default which may help offset some of the risks that come with delinquent credit lines.

Now I will do some prediction models and determine which model (Logistic regression, SVM, K-Nearest Neighbor, or Random Forest) will be the best model on predicting.

Logistic regression:

log_model <- glm(BAD ~ ., data = train, family = "binomial")
summary(log_model)

## 
## Call:
## glm(formula = BAD ~ ., family = "binomial", data = train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.20264  -0.89598   0.01348   0.88814   2.51697  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -1.296e+00  6.985e-01  -1.856  0.06349 .  
## LOAN          -1.879e-06  1.377e-05  -0.136  0.89145    
## MORTDUE        7.990e-06  8.636e-06   0.925  0.35484    
## VALUE         -5.783e-06  7.357e-06  -0.786  0.43182    
## REASONHomeImp  1.819e-02  2.722e-01   0.067  0.94671    
## JOBOffice     -8.629e-01  4.680e-01  -1.844  0.06523 .  
## JOBOther      -3.204e-01  3.699e-01  -0.866  0.38638    
## JOBProfExe    -5.579e-01  3.966e-01  -1.407  0.15949    
## JOBSales       5.356e-01  8.467e-01   0.633  0.52701    
## JOBSelf        4.971e-01  7.286e-01   0.682  0.49506    
## YOJ           -4.583e-02  1.858e-02  -2.467  0.01361 *  
## DEROG          6.443e-01  1.961e-01   3.285  0.00102 ** 
## DELINQ         7.630e-01  1.455e-01   5.245 1.56e-07 ***
## CLAGE         -3.578e-03  1.717e-03  -2.084  0.03713 *  
## NINQ           1.742e-01  7.597e-02   2.293  0.02183 *  
## CLNO          -3.479e-02  1.291e-02  -2.695  0.00704 ** 
## DEBTINC        7.431e-02  1.530e-02   4.856 1.20e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 582.23  on 419  degrees of freedom
## Residual deviance: 441.19  on 403  degrees of freedom
## AIC: 475.19
## 
## Number of Fisher Scoring iterations: 5

vif(log_model)

##              GVIF Df GVIF^(1/(2*Df))
## LOAN     1.845748  1        1.358583
## MORTDUE 14.423088  1        3.797774
## VALUE   16.238469  1        4.029698
## REASON   1.105756  1        1.051549
## JOB      1.412264  5        1.035122
## YOJ      1.103780  1        1.050609
## DEROG    1.081470  1        1.039937
## DELINQ   1.158447  1        1.076312
## CLAGE    1.250601  1        1.118303
## NINQ     1.074652  1        1.036654
## CLNO     1.493746  1        1.222189
## DEBTINC  1.152535  1        1.073562

As high positive correlations were detected among variables, the VIF were also checked to detect for multicollinearity. The VALUE variable has the highest VIF. To prevent multicollinearity, we will remove Value from the models.

log_model2 <- glm(BAD ~ . -VALUE, data = train, family = "binomial")
summary(log_model2)

## 
## Call:
## glm(formula = BAD ~ . - VALUE, family = "binomial", data = train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.19938  -0.92114   0.01421   0.90004   2.61583  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -1.273e+00  6.964e-01  -1.829  0.06747 .  
## LOAN          -7.382e-06  1.225e-05  -0.602  0.54687    
## MORTDUE        1.558e-06  2.758e-06   0.565  0.57204    
## REASONHomeImp  1.055e-02  2.719e-01   0.039  0.96904    
## JOBOffice     -8.703e-01  4.674e-01  -1.862  0.06263 .  
## JOBOther      -3.224e-01  3.687e-01  -0.874  0.38187    
## JOBProfExe    -5.565e-01  3.952e-01  -1.408  0.15901    
## JOBSales       4.885e-01  8.406e-01   0.581  0.56114    
## JOBSelf        5.388e-01  7.283e-01   0.740  0.45947    
## YOJ           -4.604e-02  1.855e-02  -2.482  0.01307 *  
## DEROG          6.419e-01  1.952e-01   3.289  0.00101 ** 
## DELINQ         7.570e-01  1.459e-01   5.187 2.14e-07 ***
## CLAGE         -3.971e-03  1.672e-03  -2.376  0.01751 *  
## NINQ           1.706e-01  7.556e-02   2.257  0.02399 *  
## CLNO          -3.304e-02  1.272e-02  -2.599  0.00936 ** 
## DEBTINC        7.406e-02  1.528e-02   4.847 1.25e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 582.23  on 419  degrees of freedom
## Residual deviance: 441.81  on 404  degrees of freedom
## AIC: 473.81
## 
## Number of Fisher Scoring iterations: 5

vif(log_model2)

##             GVIF Df GVIF^(1/(2*Df))
## LOAN    1.406565  1        1.185987
## MORTDUE 1.475993  1        1.214905
## REASON  1.106564  1        1.051934
## JOB     1.392718  5        1.033681
## YOJ     1.104469  1        1.050937
## DEROG   1.079256  1        1.038872
## DELINQ  1.152149  1        1.073382
## CLAGE   1.156862  1        1.075575
## NINQ    1.073591  1        1.036143
## CLNO    1.447319  1        1.203046
## DEBTINC 1.154403  1        1.074432

log_model_backward <- stepAIC(log_model2, direction = "backward")

## Start:  AIC=473.81
## BAD ~ (LOAN + MORTDUE + VALUE + REASON + JOB + YOJ + DEROG + 
##     DELINQ + CLAGE + NINQ + CLNO + DEBTINC) - VALUE
## 
##           Df Deviance    AIC
## - JOB      5   449.55 471.55
## - REASON   1   441.82 471.82
## - MORTDUE  1   442.13 472.13
## - LOAN     1   442.19 472.19
## <none>         441.81 473.81
## - NINQ     1   447.18 477.18
## - YOJ      1   448.23 478.23
## - CLNO     1   448.69 478.69
## - CLAGE    1   449.04 479.04
## - DEROG    1   457.49 487.49
## - DEBTINC  1   472.60 502.60
## - DELINQ   1   487.59 517.59
## 
## Step:  AIC=471.55
## BAD ~ LOAN + MORTDUE + REASON + YOJ + DEROG + DELINQ + CLAGE + 
##     NINQ + CLNO + DEBTINC
## 
##           Df Deviance    AIC
## - REASON   1   449.59 469.59
## - LOAN     1   449.94 469.94
## - MORTDUE  1   449.97 469.97
## <none>         449.55 471.55
## - NINQ     1   455.15 475.15
## - CLNO     1   456.46 476.46
## - CLAGE    1   456.99 476.99
## - YOJ      1   457.41 477.41
## - DEROG    1   465.40 485.40
## - DEBTINC  1   481.58 501.58
## - DELINQ   1   493.08 513.08
## 
## Step:  AIC=469.59
## BAD ~ LOAN + MORTDUE + YOJ + DEROG + DELINQ + CLAGE + NINQ + 
##     CLNO + DEBTINC
## 
##           Df Deviance    AIC
## - MORTDUE  1   450.02 468.02
## - LOAN     1   450.03 468.03
## <none>         449.59 469.59
## - NINQ     1   455.15 473.15
## - CLNO     1   456.58 474.58
## - CLAGE    1   457.04 475.04
## - YOJ      1   457.41 475.41
## - DEROG    1   465.42 483.42
## - DEBTINC  1   481.68 499.68
## - DELINQ   1   493.32 511.32
## 
## Step:  AIC=468.02
## BAD ~ LOAN + YOJ + DEROG + DELINQ + CLAGE + NINQ + CLNO + DEBTINC
## 
##           Df Deviance    AIC
## - LOAN     1   450.23 466.23
## <none>         450.02 468.02
## - NINQ     1   455.41 471.41
## - CLNO     1   456.58 472.58
## - CLAGE    1   457.35 473.35
## - YOJ      1   457.81 473.81
## - DEROG    1   465.50 481.50
## - DEBTINC  1   482.68 498.68
## - DELINQ   1   493.39 509.39
## 
## Step:  AIC=466.23
## BAD ~ YOJ + DEROG + DELINQ + CLAGE + NINQ + CLNO + DEBTINC
## 
##           Df Deviance    AIC
## <none>         450.23 466.23
## - NINQ     1   455.45 469.45
## - CLNO     1   457.00 471.00
## - CLAGE    1   457.92 471.92
## - YOJ      1   458.99 472.99
## - DEROG    1   465.72 479.72
## - DEBTINC  1   482.73 496.73
## - DELINQ   1   494.48 508.48

summary(log_model_backward)

## 
## Call:
## glm(formula = BAD ~ YOJ + DEROG + DELINQ + CLAGE + NINQ + CLNO + 
##     DEBTINC, family = "binomial", data = train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.27396  -0.93497   0.01912   0.96430   2.65279  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.598788   0.594218  -2.691  0.00713 ** 
## YOJ         -0.050922   0.017646  -2.886  0.00391 ** 
## DEROG        0.629764   0.194128   3.244  0.00118 ** 
## DELINQ       0.732827   0.142191   5.154 2.55e-07 ***
## CLAGE       -0.004004   0.001625  -2.465  0.01372 *  
## NINQ         0.162374   0.072688   2.234  0.02549 *  
## CLNO        -0.030705   0.011876  -2.586  0.00972 ** 
## DEBTINC      0.073149   0.014807   4.940 7.81e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 582.23  on 419  degrees of freedom
## Residual deviance: 450.23  on 412  degrees of freedom
## AIC: 466.23
## 
## Number of Fisher Scoring iterations: 5

After removing the variable with the highest VIF, we ran backwards selection and found that the significant predictors of default are clients with more derogatory reports (DEROG), more delinquent credit lines (DELINQ), higher debt-to-income ratios (DEBTINC), and fewer years on the job (YOJ). On the other hand, having a longer credit history (CLAGE) and more credit lines (CLNO) may reduce the likelihood of default.

logistic_predprob <- predict(log_model_backward, newdata = test, type = "response")
logistic_predclass = ifelse(logistic_predprob >= 0.5, "1", "0")
table(logistic_predclass)

## logistic_predclass
##  0  1 
## 96 84

caret::confusionMatrix(as.factor(logistic_predclass), test$BAD, positive = "1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 65 31
##          1 26 58
##                                         
##                Accuracy : 0.6833        
##                  95% CI : (0.61, 0.7505)
##     No Information Rate : 0.5056        
##     P-Value [Acc > NIR] : 1.016e-06     
##                                         
##                   Kappa : 0.3662        
##                                         
##  Mcnemar's Test P-Value : 0.5962        
##                                         
##             Sensitivity : 0.6517        
##             Specificity : 0.7143        
##          Pos Pred Value : 0.6905        
##          Neg Pred Value : 0.6771        
##              Prevalence : 0.4944        
##          Detection Rate : 0.3222        
##    Detection Prevalence : 0.4667        
##       Balanced Accuracy : 0.6830        
##                                         
##        'Positive' Class : 1             
##

The logistic model has an accuracy of 68.33%, sensitivity of 65.17%, and specificity of 71.43%.

prediction_obj <- prediction(logistic_predprob, test$BAD)
roc_obj <- performance(prediction_obj, "tpr", "fpr")
plot(roc_obj, main = "ROC Curve of Logistic Model", col = "blue", lwd = 2)
abline(0, 1, lty = 2, col = "red")
auc_score <- performance(prediction_obj, "auc")
auc_value <- as.numeric(auc_score@y.values)
rect(0.7, 0.05, 1, 0.15, col = "white", border = "black") 
text(
  x = 0.85, y = 0.1, 
  labels = paste("AUC =", round(auc_value, 4)), 
  col = "black", 
  cex = 1.2  
)

The model correctly distinguishes between the classes about 75% of the time. This suggests that the model is robust enough to make accurate predictions for a significant proportion of cases.

SVM

set.seed(1)
svm_model <- svm(factor(BAD) ~ ., data = train, kernel = "linear", class.weights = c("0"=1, "1"=2), probability = TRUE)

svm_pred <- predict(svm_model, test, probability = TRUE)
svm_predprob <- attr(svm_pred, "probabilities")[, "1"]

caret::confusionMatrix(svm_pred, factor(test$BAD), positive = "1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 67 27
##          1 24 62
##                                           
##                Accuracy : 0.7167          
##                  95% CI : (0.6448, 0.7812)
##     No Information Rate : 0.5056          
##     P-Value [Acc > NIR] : 6.599e-09       
##                                           
##                   Kappa : 0.4331          
##                                           
##  Mcnemar's Test P-Value : 0.7794          
##                                           
##             Sensitivity : 0.6966          
##             Specificity : 0.7363          
##          Pos Pred Value : 0.7209          
##          Neg Pred Value : 0.7128          
##              Prevalence : 0.4944          
##          Detection Rate : 0.3444          
##    Detection Prevalence : 0.4778          
##       Balanced Accuracy : 0.7164          
##                                           
##        'Positive' Class : 1               
##

X <- model.matrix(factor(BAD) ~ ., data = train)[, -1]

svm_model <- svm(x = X, y = factor(train$BAD), kernel = "linear", class.weights = c("0"=1, "1"=2), probability = TRUE)

w <- t(svm_model$coefs) %*% svm_model$SV

importance <- as.vector(w)
names(importance) <- colnames(X)

sorted_importance <- sort(abs(importance), decreasing = TRUE)
sorted_importance

##        DELINQ         DEROG       DEBTINC       MORTDUE         VALUE 
##    0.84477977    0.75061130    0.74916394    0.64527583    0.53171528 
##          CLNO           YOJ    JOBProfExe     JOBOffice         CLAGE 
##    0.44114960    0.38753543    0.32059351    0.29526358    0.25546812 
##          NINQ      JOBOther       JOBSelf REASONHomeImp          LOAN 
##    0.17648035    0.10380790    0.06578830    0.04739239    0.04650422 
##      JOBSales 
##    0.01307225

set.seed(1)
svm_model_2 <- svm(factor(BAD) ~ ., data = train, kernel = "radial", class.weights = c("0"=1, "1"=2), probability = TRUE)

svm_pred2 <- predict(svm_model_2, test, probability = TRUE)
svm_predprob2 <- attr(svm_pred2, "probabilities")[, "1"]

caret::confusionMatrix(svm_pred2, factor(test$BAD), positive = "1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 63 26
##          1 28 63
##                                           
##                Accuracy : 0.7             
##                  95% CI : (0.6274, 0.7659)
##     No Information Rate : 0.5056          
##     P-Value [Acc > NIR] : 9.19e-08        
##                                           
##                   Kappa : 0.4001          
##                                           
##  Mcnemar's Test P-Value : 0.8918          
##                                           
##             Sensitivity : 0.7079          
##             Specificity : 0.6923          
##          Pos Pred Value : 0.6923          
##          Neg Pred Value : 0.7079          
##              Prevalence : 0.4944          
##          Detection Rate : 0.3500          
##    Detection Prevalence : 0.5056          
##       Balanced Accuracy : 0.7001          
##                                           
##        'Positive' Class : 1               
##

set.seed(1)
svm_model3 <- svm(factor(BAD) ~ ., data = train, kernel = "polynomial", class.weights = c("0"=1, "1"=2), probability = TRUE)

svm_pred3 <- predict(svm_model3, test, probability = TRUE)
svm_predprob3 <- attr(svm_pred3, "probabilities")[, "1"]

caret::confusionMatrix(svm_pred3, factor(test$BAD), positive = "1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 49 27
##          1 42 62
##                                          
##                Accuracy : 0.6167         
##                  95% CI : (0.5414, 0.688)
##     No Information Rate : 0.5056         
##     P-Value [Acc > NIR] : 0.001754       
##                                          
##                   Kappa : 0.2347         
##                                          
##  Mcnemar's Test P-Value : 0.091911       
##                                          
##             Sensitivity : 0.6966         
##             Specificity : 0.5385         
##          Pos Pred Value : 0.5962         
##          Neg Pred Value : 0.6447         
##              Prevalence : 0.4944         
##          Detection Rate : 0.3444         
##    Detection Prevalence : 0.5778         
##       Balanced Accuracy : 0.6175         
##                                          
##        'Positive' Class : 1              
##

set.seed(1)
svm_model4 <- svm(factor(BAD) ~ ., data = train, kernel = "sigmoid", class.weights = c("0"=1, "1"=2), probability = TRUE)

svm_pred4 <- predict(svm_model4, test, probability = TRUE)
svm_predprob4 <- attr(svm_pred4, "probabilities")[, "1"]

caret::confusionMatrix(svm_pred4, factor(test$BAD), positive = "1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 56 31
##          1 35 58
##                                           
##                Accuracy : 0.6333          
##                  95% CI : (0.5584, 0.7038)
##     No Information Rate : 0.5056          
##     P-Value [Acc > NIR] : 0.0003711       
##                                           
##                   Kappa : 0.2669          
##                                           
##  Mcnemar's Test P-Value : 0.7119232       
##                                           
##             Sensitivity : 0.6517          
##             Specificity : 0.6154          
##          Pos Pred Value : 0.6237          
##          Neg Pred Value : 0.6437          
##              Prevalence : 0.4944          
##          Detection Rate : 0.3222          
##    Detection Prevalence : 0.5167          
##       Balanced Accuracy : 0.6335          
##                                           
##        'Positive' Class : 1               
##

The linear SVM model has an accuracy of 71.67%, sensitivity of 69.66%, and specificity of 73.63%. The other kernels, radial, polynomial, and sigmoid, did not perform as well, meaning I stuck with linear in order to find which variables are the best predictors. When finding how much variables contribute to the decision barriers, I found that the more delinquencies, derogatory credit reports, and higher ratio of debt-to-income are the most important features. This aligns with the logistic regression.

prediction_obj_bal <- prediction(svm_predprob, test$BAD)
roc_obj_bal <- performance(prediction_obj_bal, "tpr", "fpr")
plot(roc_obj_bal, main = "ROC Curve using Balanced SVM Model", col = "blue", lwd = 2)
abline(0, 1, lty = 2, col = "red")
auc_score_bal <- performance(prediction_obj_bal, "auc")
auc_value_bal <- as.numeric(auc_score_bal@y.values)
rect(0.7, 0.05, 1, 0.15, col = "white", border = "black") 
text(
  x = 0.85, y = 0.1, 
  labels = paste("AUC =", round(auc_value_bal, 4)), 
  col = "black", 
  cex = 1.2  
)

The AUC of the SVM model is about 75%. Which is about the same as the logistic model!

K-nearest Neighbor

normalize <- function(x) {(x - min(x)) / (max(x) - min(x))}

train_knn <- as.data.frame(lapply(train[, sapply(train, is.numeric)], normalize))
test_knn <- as.data.frame(lapply(test[, sapply(test, is.numeric)], normalize))

train_labels <- train$BAD
test_labels <- test$BAD

k_values <- seq(1, 25, by = 2)

accuracy <- numeric(length(k_values))

for (i in seq_along(k_values)) {
  knn_pred <- knn(train = train_knn, test = test_knn, cl = train_labels, k = k_values[i])
  cm <- confusionMatrix(knn_pred, factor(test_labels))
  accuracy[i] <- cm$overall["Accuracy"]
}

plot(k_values, accuracy, type = "b", pch = 19, col = "blue",
     xlab = "K", ylab = "Accuracy", main = "Choosing Best K in KNN")

knn_pred <- knn(train = train_knn, test = test_knn, cl = train_labels, k = 3)
confusionMatrix(knn_pred, factor(test_labels))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 84 49
##          1  7 40
##                                           
##                Accuracy : 0.6889          
##                  95% CI : (0.6158, 0.7557)
##     No Information Rate : 0.5056          
##     P-Value [Acc > NIR] : 4.677e-07       
##                                           
##                   Kappa : 0.3745          
##                                           
##  Mcnemar's Test P-Value : 4.281e-08       
##                                           
##             Sensitivity : 0.9231          
##             Specificity : 0.4494          
##          Pos Pred Value : 0.6316          
##          Neg Pred Value : 0.8511          
##              Prevalence : 0.5056          
##          Detection Rate : 0.4667          
##    Detection Prevalence : 0.7389          
##       Balanced Accuracy : 0.6863          
##                                           
##        'Positive' Class : 0               
##

The KNN = 3 model has an accuracy of 68.89%, sensitivity of 62.31%, and specificity of 44.94%.

prediction_obj <- prediction(as.numeric(knn_pred), test_labels) 
roc_obj <- performance(prediction_obj, "tpr", "fpr")
plot(roc_obj, main = "ROC Curve for KNN Model", col = "blue", lwd = 2)
abline(0, 1, lty = 2, col = "red")
auc_score <- performance(prediction_obj, "auc")
auc_value <- as.numeric(auc_score@y.values)
rect(0.7, 0.05, 1, 0.15, col = "white", border = "black") 
text(x = 0.85, y = 0.1, labels = paste("AUC =", round(auc_value, 4)), col = "black", cex = 1.2)

The AUC for KNN = 5 is 68%

Random Forest model

rf_model_basic <- randomForest(BAD ~., data = train, ntree = 100, mtry = 3)
rf_preds <- predict(rf_model_basic, test)
varImpPlot(rf_model_basic)

confusionMatrix(rf_preds, test$BAD)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 69 24
##          1 22 65
##                                           
##                Accuracy : 0.7444          
##                  95% CI : (0.6742, 0.8064)
##     No Information Rate : 0.5056          
##     P-Value [Acc > NIR] : 4.786e-11       
##                                           
##                   Kappa : 0.4887          
##                                           
##  Mcnemar's Test P-Value : 0.8828          
##                                           
##             Sensitivity : 0.7582          
##             Specificity : 0.7303          
##          Pos Pred Value : 0.7419          
##          Neg Pred Value : 0.7471          
##              Prevalence : 0.5056          
##          Detection Rate : 0.3833          
##    Detection Prevalence : 0.5167          
##       Balanced Accuracy : 0.7443          
##                                           
##        'Positive' Class : 0               
##

mtry_values <- c(2, 4, 6, 7)      
ntree_values <- c(100, 200, 300, 400, 500, 600, 700, 800, 900, 1000)


results <- data.frame(mtry = integer(), ntree = integer(), Accuracy = numeric())
models <- list()  

for (m in mtry_values) {
  for (n in ntree_values) {
    rf_model <- randomForest(factor(BAD) ~ DEBTINC + CLAGE +  MORTDUE + LOAN + DELINQ + YOJ + CLNO, data = train, mtry = m, ntree = n)
    
    rf_pred <- predict(rf_model, test)
    
    cm <- confusionMatrix(rf_pred, factor(test$BAD))
    acc <- cm$overall['Accuracy']
    
    results <- rbind(results, data.frame(mtry = m, ntree = n, Accuracy = acc))
    models[[paste0("mtry_", m, "_ntree_", n)]] <- rf_model
  }
}

best_index <- which.max(results$Accuracy)
best_mtry <- results$mtry[best_index]
best_ntree <- results$ntree[best_index]


best_model_name <- paste0("mtry_", best_mtry, "_ntree_", best_ntree)
best_model <- models[[best_model_name]]

best_pred <- predict(best_model, test)

conf_matrix_best <- confusionMatrix(best_pred, factor(test$BAD))
print(conf_matrix_best)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 73 23
##          1 18 66
##                                           
##                Accuracy : 0.7722          
##                  95% CI : (0.7039, 0.8313)
##     No Information Rate : 0.5056          
##     P-Value [Acc > NIR] : 1.7e-13         
##                                           
##                   Kappa : 0.5441          
##                                           
##  Mcnemar's Test P-Value : 0.5322          
##                                           
##             Sensitivity : 0.8022          
##             Specificity : 0.7416          
##          Pos Pred Value : 0.7604          
##          Neg Pred Value : 0.7857          
##              Prevalence : 0.5056          
##          Detection Rate : 0.4056          
##    Detection Prevalence : 0.5333          
##       Balanced Accuracy : 0.7719          
##                                           
##        'Positive' Class : 0               
##

best_pred_prob <- predict(best_model, test, type = "prob")[, "1"]

prediction_obj <- prediction(best_pred_prob, test$BAD)

roc_obj <- performance(prediction_obj, "tpr", "fpr")

plot(roc_obj, 
     main = "ROC Curve for Best Random Forest Model", 
     col = "blue", lwd = 2)
abline(a = 0, b = 1, lty = 2, col = "red")

auc_obj <- performance(prediction_obj, "auc")
auc_value <- as.numeric(auc_obj@y.values)

text(x = 0.6, y = 0.2, labels = paste("AUC =", round(auc_value, 4)), cex = 1.2)

My original random forest model identified the most important features for prediction were DEBTINC, CLAGE, MORTDUE, LOAN, YOJ, CLNO, and DELINQ. It also chose VALUE which I chose to drop due to its correlation with MORTDUE. By dropping it, I got an increased accuracy, sensitivity, and specificity. I systematically searched for optimal hyperparameters for mtry and ntree using a for loop. This led me to get an accuracy of 77.22%, sensitivity of 80.22%, and specificity of 74.16%.

Results

model_performance <- data.frame(
  Model = c("Logistic Regression", "SVM", "K-Nearest Neighbors", "Random Forest"),
  Accuracy = c(0.6833, 0.7167, 0.6889, conf_matrix_best$overall["Accuracy"]),
  Sensitivity = c(0.6517, 0.6966, 0.6231, conf_matrix_best$byClass["Sensitivity"]),
  Specificity = c(0.7143, 0.7363, 0.4494, conf_matrix_best$byClass["Specificity"]),
  AUC = c(0.7545, 0.7522, 0.68, {
    best_pred_prob <- predict(best_model, test, type = "prob")[, "1"]
    rf_pred_obj <- ROCR::prediction(best_pred_prob, test$BAD)
    rf_auc <- ROCR::performance(rf_pred_obj, "auc")
    as.numeric(rf_auc@y.values)
  })
)

model_performance <- model_performance %>% 
  mutate(across(where(is.numeric), ~ round(., 4)))

knitr::kable(model_performance, caption = "Comparison of Model Performances")

Comparison of Model Performances
Model	Accuracy	Sensitivity	Specificity	AUC
Logistic Regression	0.6833	0.6517	0.7143	0.7545
SVM	0.7167	0.6966	0.7363	0.7522
K-Nearest Neighbors	0.6889	0.6231	0.4494	0.6800
Random Forest	0.7722	0.8022	0.7416	0.8105

This proves that the Random Forest is the best model for predicting whether a client will default on their HELOC loan.

DA 6813 Final Project

Abigail Davis mjv135

2025-05-05