libraries:
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(corrplot)
## corrplot 0.95 loaded
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:purrr':
##
## some
##
## The following object is masked from 'package:dplyr':
##
## recode
library(ROCR)
library(ggplot2)
library(patchwork)
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:patchwork':
##
## area
##
## The following object is masked from 'package:dplyr':
##
## select
library(e1071)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:ggplot2':
##
## margin
##
## The following object is masked from 'package:dplyr':
##
## combine
library(class)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(kknn)
##
## Attaching package: 'kknn'
##
## The following object is masked from 'package:caret':
##
## contr.dummy
Loading in the data:
data <- read.csv("hmeq.csv")
dim(data)
## [1] 5960 13
Data Exploration:
str(data)
## 'data.frame': 5960 obs. of 13 variables:
## $ BAD : int 1 1 1 1 0 1 1 1 1 1 ...
## $ LOAN : int 1100 1300 1500 1500 1700 1700 1800 1800 2000 2000 ...
## $ MORTDUE: num 25860 70053 13500 NA 97800 ...
## $ VALUE : num 39025 68400 16700 NA 112000 ...
## $ REASON : chr "HomeImp" "HomeImp" "HomeImp" "" ...
## $ JOB : chr "Other" "Other" "Other" "" ...
## $ YOJ : num 10.5 7 4 NA 3 9 5 11 3 16 ...
## $ DEROG : int 0 0 0 NA 0 0 3 0 0 0 ...
## $ DELINQ : int 0 2 0 NA 0 0 2 0 2 0 ...
## $ CLAGE : num 94.4 121.8 149.5 NA 93.3 ...
## $ NINQ : int 1 0 1 NA 0 1 1 0 1 0 ...
## $ CLNO : int 9 14 10 NA 14 8 17 8 12 13 ...
## $ DEBTINC: num NA NA NA NA NA ...
lapply(data[, c("LOAN", "MORTDUE", "VALUE", "YOJ", "CLAGE", "DEROG", "DELINQ", "NINQ","CLNO", "DEBTINC" )], summary)
## $LOAN
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1100 11100 16300 18608 23300 89900
##
## $MORTDUE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 2063 46276 65019 73761 91488 399550 518
##
## $VALUE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 8000 66076 89236 101776 119824 855909 112
##
## $YOJ
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 3.000 7.000 8.922 13.000 41.000 515
##
## $CLAGE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0 115.1 173.5 179.8 231.6 1168.2 308
##
## $DEROG
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0000 0.0000 0.0000 0.2546 0.0000 10.0000 708
##
## $DELINQ
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0000 0.0000 0.0000 0.4494 0.0000 15.0000 580
##
## $NINQ
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 0.000 1.000 1.186 2.000 17.000 510
##
## $CLNO
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0 15.0 20.0 21.3 26.0 71.0 222
##
## $DEBTINC
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.5245 29.1400 34.8183 33.7799 39.0031 203.3121 1267
lapply(data[, c("BAD", "REASON", "JOB")], unique)
## $BAD
## [1] 1 0
##
## $REASON
## [1] "HomeImp" "" "DebtCon"
##
## $JOB
## [1] "Other" "" "Office" "Sales" "Mgr" "ProfExe" "Self"
The Home Equity dataset (HMEQ) contains information on home equity loan performance for 5,960 home equity loans with 13 variables including a target variable. This data comes from kaggle.com and covers observations from recent applicants granted credit. The variables describing the data include the amount of the loan requested, the amount due on their mortgage, the value of their current property, the reason for the loan, job type, years at present job, number of major derogatory reports, number of delinquent credit lines, and age of oldest trade line in months, number of recent credit inquiries, number of existing credit lines, and debt-to-income ratio.
Remove unknown and missing:
data[data == " "] <- NA
data[data == ""] <- NA
data <- na.omit(data)
data <- data |> mutate_if(is.character, as.factor)
data$BAD <- as.factor(data$BAD)
lapply(data[, c("LOAN", "MORTDUE", "VALUE", "YOJ", "CLAGE", "DEROG", "DELINQ", "NINQ","CLNO", "DEBTINC" )], summary)
## $LOAN
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1700 12000 17000 19154 23825 89900
##
## $MORTDUE
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5076 49351 67279 76250 92987 399412
##
## $VALUE
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 21144 71235 94454 107501 122339 512650
##
## $YOJ
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 3.00 7.00 9.11 13.00 41.00
##
## $CLAGE
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.4867 118.6879 176.7420 180.9937 230.4022 1168.2336
##
## $DEROG
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1468 0.0000 10.0000
##
## $DELINQ
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2788 0.0000 10.0000
##
## $NINQ
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.000 1.037 2.000 13.000
##
## $CLNO
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 16.00 21.00 22.11 27.00 64.00
##
## $DEBTINC
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.8381 29.3626 35.1295 34.1354 39.0876 144.1890
lapply(data[, c("BAD", "REASON", "JOB")], unique)
## $BAD
## [1] 1 0
## Levels: 0 1
##
## $REASON
## [1] HomeImp DebtCon
## Levels: DebtCon HomeImp
##
## $JOB
## [1] Other Office Mgr ProfExe Sales Self
## Levels: Mgr Office Other ProfExe Sales Self
After cleaning and processing, this leaves us with 3,364 observations.
Performing Correlation Matrix:
data_num <- dplyr::select_if(data, is.numeric)
M = cor(data_num)
corrplot(M, order = 'AOE', col = COL2('RdBu', 10), tl.col = "black",
addCoef.col = "grey", number.cex = 0.5)
The correlation plot shows the overview of correlated numeric
variables. The dark blue indicates high positive correlation. The plot
shows a strong positive correlation between mortdue
and
value
(0.87). The rest of the variables are weakly
correlated, therefore, they won’t be considered significant.
table(data$BAD)
##
## 0 1
## 3064 300
There are 3,064 people who did not default on their loan and 300 people who did. 91% of the time, the clients did not default. This dataset is unbalanced, so lets balance it:
data_0 <- data[data$BAD == "0", ] # "0" is the majority class
data_1 <- data[data$BAD == "1", ] # "1" is the minority class
set.seed(1)
sample_0 <- data_0[sample(nrow(data_0), nrow(data_1), replace = FALSE), ]
data_bal <- rbind(data_1, sample_0)
plot(data_bal$BAD)
reason_plot <- ggplot(aes(x = REASON), data = data_bal)+
labs(title = "Count of Reasons why Clients Got a HomeEq Loan")+
geom_bar()
reason_plot
reason_plot <- ggplot(aes(x = JOB), data = data_bal)+
labs(title = "Count of Jobs of People who get HomeEq Loans")+
geom_bar()
reason_plot
Most of the sampled data are for DebtCon
(debt
consolidation) and most of the people are Other
jobs
followed by ProfExe
and Office
.
set.seed(1)
sample<- sample(nrow(data_bal), 0.7 * nrow(data_bal), replace = FALSE)
train <- data_bal[sample, ]
test <- data_bal[-sample, ]
Because I am doing descriptive and explanatory models, we are going to be using the original unbalanced data to understand the true relationship between the variables.
shapiro.test(data$LOAN)
##
## Shapiro-Wilk normality test
##
## data: data$LOAN
## W = 0.84008, p-value < 2.2e-16
qqnorm(data$LOAN, main = "Q-Q Plot of LOAN")
qqline(data$LOAN, col = "red")
Because my data is not normally distributed, I will be using the Wilcoxon rank-sum test.
Hypothesis:
H0: There is no difference in loan amounts between defaulters and non-defaulters.
H1: There is a difference in loan amounts between defaulters and non-defaulters
wilcox <- wilcox.test(LOAN ~ BAD, data = data)
w <- wilcox$statistic
p <- wilcox$p.value
annotation <- paste0("Wilcoxon W = ", round(w,2), "\np-value = ", signif(p,3))
Because p < 0.05, I will reject the null hypothesis, which means there is a statistically significant difference in loan amounts between those who defaulted and those who did not.
ggplot(data, aes(BAD, LOAN)) +
geom_boxplot() +
labs(title="Loan Amount by Default Status", x= "Default Status (0 = No, 1 = Yes)", y = "Amount Due")+
annotate("text", x = 1.5, y = max(data$LOAN), label = annotation, hjust = 0.5, vjust = 1.5, size = 4.5)
The mean loan amount for default 0 is a little bit higher than default 1. There are a lot more high outliers for default 0, than default 1. This suggests that clients who do not default tend to have bigger loans.
Let’s see if the model agrees:
log_model_analysis_1 <- glm(BAD ~ LOAN, family = "binomial", data = data)
summary(log_model_analysis_1)
##
## Call:
## glm(formula = BAD ~ LOAN, family = "binomial", data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.4714 -0.4466 -0.4337 -0.4142 2.4618
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.115e+00 1.274e-01 -16.60 <2e-16 ***
## LOAN -1.118e-05 6.177e-06 -1.81 0.0703 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2022.7 on 3363 degrees of freedom
## Residual deviance: 2019.2 on 3362 degrees of freedom
## AIC: 2023.2
##
## Number of Fisher Scoring iterations: 5
Based on this p-value > 0.05, Loan amount is not a significant predictor in default. However, at the 10% significance level, loan amount is marginally statistically significant. The coefficient being negative, suggests that higher loans are associated with lower chance of default, which is strange. Clients requesting larger loans are slightly less likely to default — but the effect is very weak and not strongly statistically significant.
outliers <- quantile(data$LOAN, 0.99, na.rm = TRUE)
data_no_outliers <- data[data$LOAN <= outliers, ]
log_model_analysis_1_no_outliers <- glm(BAD ~ LOAN, family = "binomial", data = data_no_outliers)
summary(log_model_analysis_1_no_outliers)
##
## Call:
## glm(formula = BAD ~ LOAN, family = "binomial", data = data_no_outliers)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.4732 -0.4470 -0.4337 -0.4137 2.3662
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.106e+00 1.372e-01 -15.349 <2e-16 ***
## LOAN -1.177e-05 6.916e-06 -1.702 0.0888 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2007 on 3329 degrees of freedom
## Residual deviance: 2004 on 3328 degrees of freedom
## AIC: 2008
##
## Number of Fisher Scoring iterations: 5
shapiro.test(data_no_outliers$LOAN)
##
## Shapiro-Wilk normality test
##
## data: data_no_outliers$LOAN
## W = 0.91831, p-value < 2.2e-16
qqnorm(data_no_outliers$LOAN, main = "Q-Q Plot of LOAN")
qqline(data_no_outliers$LOAN, col = "red")
wilcox <- wilcox.test(LOAN ~ BAD, data = data_no_outliers)
w <- wilcox$statistic
p <- wilcox$p.value
annotation <- paste0("Wilcoxon W = ", round(w,2), "\np-value = ", signif(p,3))
ggplot(data_no_outliers, aes(BAD, LOAN)) +
geom_boxplot() +
labs(title="Loan Amount by Default Status with 99th Percentile Cutoff", x= "Default Status (0 = No, 1 = Yes)", y = "Amount Due")+
annotate("text", x = 1.5, y = max(data$LOAN), label = annotation, hjust = 0.5, vjust = 1.5, size = 4.5)
Because p < 0.05, I will reject the null hypothesis, which means
there is a statistically significant difference in loan amounts between
those who defaulted and those who did not.
pairs(data[, c("LOAN", "MORTDUE", "DEBTINC", "BAD")],
col = ifelse(data$BAD == 1, "red", "green"),
pch = 1,
main = "Pairs Plot of LOAN, MORTDUE, DEBTINC, and BAD")
interaction_model_analysis_2 <- glm(BAD ~ LOAN * MORTDUE * DEBTINC, family = "binomial", data = data)
summary(interaction_model_analysis_2)
##
## Call:
## glm(formula = BAD ~ LOAN * MORTDUE * DEBTINC, family = "binomial",
## data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1255 -0.4586 -0.3631 -0.2662 3.6781
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.663e+00 1.408e+00 -3.312 0.000927 ***
## LOAN -2.826e-05 7.335e-05 -0.385 0.700071
## MORTDUE -1.981e-05 1.552e-05 -1.276 0.201812
## DEBTINC 1.215e-01 3.756e-02 3.235 0.001217 **
## LOAN:MORTDUE 1.541e-10 5.865e-10 0.263 0.792745
## LOAN:DEBTINC -1.190e-06 1.878e-06 -0.633 0.526545
## MORTDUE:DEBTINC -9.090e-08 4.074e-07 -0.223 0.823422
## LOAN:MORTDUE:DEBTINC 1.334e-11 1.500e-11 0.889 0.373859
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2022.7 on 3363 degrees of freedom
## Residual deviance: 1818.8 on 3356 degrees of freedom
## AIC: 1834.8
##
## Number of Fisher Scoring iterations: 6
pairs(data[, c("DEROG", "DELINQ", "CLAGE", "BAD")],
col = ifelse(data$BAD == 1, "red", "green"),
pch = 1,
main = "Pairs Plot of DEROG, DELINQ, CLAGE, and BAD")
log_model_analysis_3 <- glm(BAD ~ DEROG * DELINQ * CLAGE, family = "binomial", data = data)
summary(log_model_analysis_3)
##
## Call:
## glm(formula = BAD ~ DEROG * DELINQ * CLAGE, family = "binomial",
## data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3970 -0.4258 -0.3295 -0.2514 4.5963
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.568e+00 1.942e-01 -8.073 6.83e-16 ***
## DEROG 7.920e-01 2.537e-01 3.121 0.0018 **
## DELINQ 2.939e-01 1.872e-01 1.570 0.1164
## CLAGE -7.700e-03 1.207e-03 -6.377 1.81e-10 ***
## DEROG:DELINQ -2.388e-02 2.344e-01 -0.102 0.9189
## DEROG:CLAGE -1.584e-05 1.629e-03 -0.010 0.9922
## DELINQ:CLAGE 2.102e-03 1.031e-03 2.040 0.0414 *
## DEROG:DELINQ:CLAGE -1.311e-04 1.324e-03 -0.099 0.9211
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2022.7 on 3363 degrees of freedom
## Residual deviance: 1717.3 on 3356 degrees of freedom
## AIC: 1733.3
##
## Number of Fisher Scoring iterations: 6
log_model <- glm(BAD ~ ., data = train, family = "binomial")
summary(log_model)
##
## Call:
## glm(formula = BAD ~ ., family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.20264 -0.89598 0.01348 0.88814 2.51697
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.296e+00 6.985e-01 -1.856 0.06349 .
## LOAN -1.879e-06 1.377e-05 -0.136 0.89145
## MORTDUE 7.990e-06 8.636e-06 0.925 0.35484
## VALUE -5.783e-06 7.357e-06 -0.786 0.43182
## REASONHomeImp 1.819e-02 2.722e-01 0.067 0.94671
## JOBOffice -8.629e-01 4.680e-01 -1.844 0.06523 .
## JOBOther -3.204e-01 3.699e-01 -0.866 0.38638
## JOBProfExe -5.579e-01 3.966e-01 -1.407 0.15949
## JOBSales 5.356e-01 8.467e-01 0.633 0.52701
## JOBSelf 4.971e-01 7.286e-01 0.682 0.49506
## YOJ -4.583e-02 1.858e-02 -2.467 0.01361 *
## DEROG 6.443e-01 1.961e-01 3.285 0.00102 **
## DELINQ 7.630e-01 1.455e-01 5.245 1.56e-07 ***
## CLAGE -3.578e-03 1.717e-03 -2.084 0.03713 *
## NINQ 1.742e-01 7.597e-02 2.293 0.02183 *
## CLNO -3.479e-02 1.291e-02 -2.695 0.00704 **
## DEBTINC 7.431e-02 1.530e-02 4.856 1.20e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 582.23 on 419 degrees of freedom
## Residual deviance: 441.19 on 403 degrees of freedom
## AIC: 475.19
##
## Number of Fisher Scoring iterations: 5
vif(log_model)
## GVIF Df GVIF^(1/(2*Df))
## LOAN 1.845748 1 1.358583
## MORTDUE 14.423088 1 3.797774
## VALUE 16.238469 1 4.029698
## REASON 1.105756 1 1.051549
## JOB 1.412264 5 1.035122
## YOJ 1.103780 1 1.050609
## DEROG 1.081470 1 1.039937
## DELINQ 1.158447 1 1.076312
## CLAGE 1.250601 1 1.118303
## NINQ 1.074652 1 1.036654
## CLNO 1.493746 1 1.222189
## DEBTINC 1.152535 1 1.073562
As high positive correlations were detected among variables, the VIF
were also checked to detect for multicollinearity. The VALUE variable
has the highest VIF. To prevent multicollinearity, we will remove
Value
from the models.
log_model2 <- glm(BAD ~ . -VALUE, data = train, family = "binomial")
summary(log_model2)
##
## Call:
## glm(formula = BAD ~ . - VALUE, family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.19938 -0.92114 0.01421 0.90004 2.61583
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.273e+00 6.964e-01 -1.829 0.06747 .
## LOAN -7.382e-06 1.225e-05 -0.602 0.54687
## MORTDUE 1.558e-06 2.758e-06 0.565 0.57204
## REASONHomeImp 1.055e-02 2.719e-01 0.039 0.96904
## JOBOffice -8.703e-01 4.674e-01 -1.862 0.06263 .
## JOBOther -3.224e-01 3.687e-01 -0.874 0.38187
## JOBProfExe -5.565e-01 3.952e-01 -1.408 0.15901
## JOBSales 4.885e-01 8.406e-01 0.581 0.56114
## JOBSelf 5.388e-01 7.283e-01 0.740 0.45947
## YOJ -4.604e-02 1.855e-02 -2.482 0.01307 *
## DEROG 6.419e-01 1.952e-01 3.289 0.00101 **
## DELINQ 7.570e-01 1.459e-01 5.187 2.14e-07 ***
## CLAGE -3.971e-03 1.672e-03 -2.376 0.01751 *
## NINQ 1.706e-01 7.556e-02 2.257 0.02399 *
## CLNO -3.304e-02 1.272e-02 -2.599 0.00936 **
## DEBTINC 7.406e-02 1.528e-02 4.847 1.25e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 582.23 on 419 degrees of freedom
## Residual deviance: 441.81 on 404 degrees of freedom
## AIC: 473.81
##
## Number of Fisher Scoring iterations: 5
vif(log_model2)
## GVIF Df GVIF^(1/(2*Df))
## LOAN 1.406565 1 1.185987
## MORTDUE 1.475993 1 1.214905
## REASON 1.106564 1 1.051934
## JOB 1.392718 5 1.033681
## YOJ 1.104469 1 1.050937
## DEROG 1.079256 1 1.038872
## DELINQ 1.152149 1 1.073382
## CLAGE 1.156862 1 1.075575
## NINQ 1.073591 1 1.036143
## CLNO 1.447319 1 1.203046
## DEBTINC 1.154403 1 1.074432
log_model_backward <- stepAIC(log_model2, direction = "backward")
## Start: AIC=473.81
## BAD ~ (LOAN + MORTDUE + VALUE + REASON + JOB + YOJ + DEROG +
## DELINQ + CLAGE + NINQ + CLNO + DEBTINC) - VALUE
##
## Df Deviance AIC
## - JOB 5 449.55 471.55
## - REASON 1 441.82 471.82
## - MORTDUE 1 442.13 472.13
## - LOAN 1 442.19 472.19
## <none> 441.81 473.81
## - NINQ 1 447.18 477.18
## - YOJ 1 448.23 478.23
## - CLNO 1 448.69 478.69
## - CLAGE 1 449.04 479.04
## - DEROG 1 457.49 487.49
## - DEBTINC 1 472.60 502.60
## - DELINQ 1 487.59 517.59
##
## Step: AIC=471.55
## BAD ~ LOAN + MORTDUE + REASON + YOJ + DEROG + DELINQ + CLAGE +
## NINQ + CLNO + DEBTINC
##
## Df Deviance AIC
## - REASON 1 449.59 469.59
## - LOAN 1 449.94 469.94
## - MORTDUE 1 449.97 469.97
## <none> 449.55 471.55
## - NINQ 1 455.15 475.15
## - CLNO 1 456.46 476.46
## - CLAGE 1 456.99 476.99
## - YOJ 1 457.41 477.41
## - DEROG 1 465.40 485.40
## - DEBTINC 1 481.58 501.58
## - DELINQ 1 493.08 513.08
##
## Step: AIC=469.59
## BAD ~ LOAN + MORTDUE + YOJ + DEROG + DELINQ + CLAGE + NINQ +
## CLNO + DEBTINC
##
## Df Deviance AIC
## - MORTDUE 1 450.02 468.02
## - LOAN 1 450.03 468.03
## <none> 449.59 469.59
## - NINQ 1 455.15 473.15
## - CLNO 1 456.58 474.58
## - CLAGE 1 457.04 475.04
## - YOJ 1 457.41 475.41
## - DEROG 1 465.42 483.42
## - DEBTINC 1 481.68 499.68
## - DELINQ 1 493.32 511.32
##
## Step: AIC=468.02
## BAD ~ LOAN + YOJ + DEROG + DELINQ + CLAGE + NINQ + CLNO + DEBTINC
##
## Df Deviance AIC
## - LOAN 1 450.23 466.23
## <none> 450.02 468.02
## - NINQ 1 455.41 471.41
## - CLNO 1 456.58 472.58
## - CLAGE 1 457.35 473.35
## - YOJ 1 457.81 473.81
## - DEROG 1 465.50 481.50
## - DEBTINC 1 482.68 498.68
## - DELINQ 1 493.39 509.39
##
## Step: AIC=466.23
## BAD ~ YOJ + DEROG + DELINQ + CLAGE + NINQ + CLNO + DEBTINC
##
## Df Deviance AIC
## <none> 450.23 466.23
## - NINQ 1 455.45 469.45
## - CLNO 1 457.00 471.00
## - CLAGE 1 457.92 471.92
## - YOJ 1 458.99 472.99
## - DEROG 1 465.72 479.72
## - DEBTINC 1 482.73 496.73
## - DELINQ 1 494.48 508.48
summary(log_model_backward)
##
## Call:
## glm(formula = BAD ~ YOJ + DEROG + DELINQ + CLAGE + NINQ + CLNO +
## DEBTINC, family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.27396 -0.93497 0.01912 0.96430 2.65279
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.598788 0.594218 -2.691 0.00713 **
## YOJ -0.050922 0.017646 -2.886 0.00391 **
## DEROG 0.629764 0.194128 3.244 0.00118 **
## DELINQ 0.732827 0.142191 5.154 2.55e-07 ***
## CLAGE -0.004004 0.001625 -2.465 0.01372 *
## NINQ 0.162374 0.072688 2.234 0.02549 *
## CLNO -0.030705 0.011876 -2.586 0.00972 **
## DEBTINC 0.073149 0.014807 4.940 7.81e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 582.23 on 419 degrees of freedom
## Residual deviance: 450.23 on 412 degrees of freedom
## AIC: 466.23
##
## Number of Fisher Scoring iterations: 5
After removing the variable with the highest VIF, we ran backwards selection and found that the significant predictors of default are clients with more derogatory reports (DEROG), more delinquent credit lines (DELINQ), higher debt-to-income ratios (DEBTINC), and fewer years on the job (YOJ). On the other hand, having a longer credit history (CLAGE) and more credit lines (CLNO) may reduce the likelihood of default.
logistic_predprob <- predict(log_model_backward, newdata = test, type = "response")
logistic_predclass = ifelse(logistic_predprob >= 0.5, "1", "0")
table(logistic_predclass)
## logistic_predclass
## 0 1
## 96 84
caret::confusionMatrix(as.factor(logistic_predclass), test$BAD, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 65 31
## 1 26 58
##
## Accuracy : 0.6833
## 95% CI : (0.61, 0.7505)
## No Information Rate : 0.5056
## P-Value [Acc > NIR] : 1.016e-06
##
## Kappa : 0.3662
##
## Mcnemar's Test P-Value : 0.5962
##
## Sensitivity : 0.6517
## Specificity : 0.7143
## Pos Pred Value : 0.6905
## Neg Pred Value : 0.6771
## Prevalence : 0.4944
## Detection Rate : 0.3222
## Detection Prevalence : 0.4667
## Balanced Accuracy : 0.6830
##
## 'Positive' Class : 1
##
The logistic model has an accuracy of 68.33%, sensitivity of 65.17%, and specificity of 71.43%.
prediction_obj <- prediction(logistic_predprob, test$BAD)
roc_obj <- performance(prediction_obj, "tpr", "fpr")
plot(roc_obj, main = "ROC Curve of Logistic Model", col = "blue", lwd = 2)
abline(0, 1, lty = 2, col = "red")
auc_score <- performance(prediction_obj, "auc")
auc_value <- as.numeric(auc_score@y.values)
rect(0.7, 0.05, 1, 0.15, col = "white", border = "black")
text(
x = 0.85, y = 0.1,
labels = paste("AUC =", round(auc_value, 4)),
col = "black",
cex = 1.2
)
The model correctly distinguishes between the classes about 75% of the time. This suggests that the model is robust enough to make accurate predictions for a significant proportion of cases.
set.seed(1)
svm_model <- svm(factor(BAD) ~ ., data = train, kernel = "linear", class.weights = c("0"=1, "1"=2), probability = TRUE)
svm_pred <- predict(svm_model, test, probability = TRUE)
svm_predprob <- attr(svm_pred, "probabilities")[, "1"]
caret::confusionMatrix(svm_pred, factor(test$BAD), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 67 27
## 1 24 62
##
## Accuracy : 0.7167
## 95% CI : (0.6448, 0.7812)
## No Information Rate : 0.5056
## P-Value [Acc > NIR] : 6.599e-09
##
## Kappa : 0.4331
##
## Mcnemar's Test P-Value : 0.7794
##
## Sensitivity : 0.6966
## Specificity : 0.7363
## Pos Pred Value : 0.7209
## Neg Pred Value : 0.7128
## Prevalence : 0.4944
## Detection Rate : 0.3444
## Detection Prevalence : 0.4778
## Balanced Accuracy : 0.7164
##
## 'Positive' Class : 1
##
X <- model.matrix(factor(BAD) ~ ., data = train)[, -1]
svm_model <- svm(x = X, y = factor(train$BAD), kernel = "linear", class.weights = c("0"=1, "1"=2), probability = TRUE)
w <- t(svm_model$coefs) %*% svm_model$SV
importance <- as.vector(w)
names(importance) <- colnames(X)
sorted_importance <- sort(abs(importance), decreasing = TRUE)
sorted_importance
## DELINQ DEROG DEBTINC MORTDUE VALUE
## 0.84477977 0.75061130 0.74916394 0.64527583 0.53171528
## CLNO YOJ JOBProfExe JOBOffice CLAGE
## 0.44114960 0.38753543 0.32059351 0.29526358 0.25546812
## NINQ JOBOther JOBSelf REASONHomeImp LOAN
## 0.17648035 0.10380790 0.06578830 0.04739239 0.04650422
## JOBSales
## 0.01307225
set.seed(1)
svm_model_2 <- svm(factor(BAD) ~ ., data = train, kernel = "radial", class.weights = c("0"=1, "1"=2), probability = TRUE)
svm_pred2 <- predict(svm_model_2, test, probability = TRUE)
svm_predprob2 <- attr(svm_pred2, "probabilities")[, "1"]
caret::confusionMatrix(svm_pred2, factor(test$BAD), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 63 26
## 1 28 63
##
## Accuracy : 0.7
## 95% CI : (0.6274, 0.7659)
## No Information Rate : 0.5056
## P-Value [Acc > NIR] : 9.19e-08
##
## Kappa : 0.4001
##
## Mcnemar's Test P-Value : 0.8918
##
## Sensitivity : 0.7079
## Specificity : 0.6923
## Pos Pred Value : 0.6923
## Neg Pred Value : 0.7079
## Prevalence : 0.4944
## Detection Rate : 0.3500
## Detection Prevalence : 0.5056
## Balanced Accuracy : 0.7001
##
## 'Positive' Class : 1
##
set.seed(1)
svm_model3 <- svm(factor(BAD) ~ ., data = train, kernel = "polynomial", class.weights = c("0"=1, "1"=2), probability = TRUE)
svm_pred3 <- predict(svm_model3, test, probability = TRUE)
svm_predprob3 <- attr(svm_pred3, "probabilities")[, "1"]
caret::confusionMatrix(svm_pred3, factor(test$BAD), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 49 27
## 1 42 62
##
## Accuracy : 0.6167
## 95% CI : (0.5414, 0.688)
## No Information Rate : 0.5056
## P-Value [Acc > NIR] : 0.001754
##
## Kappa : 0.2347
##
## Mcnemar's Test P-Value : 0.091911
##
## Sensitivity : 0.6966
## Specificity : 0.5385
## Pos Pred Value : 0.5962
## Neg Pred Value : 0.6447
## Prevalence : 0.4944
## Detection Rate : 0.3444
## Detection Prevalence : 0.5778
## Balanced Accuracy : 0.6175
##
## 'Positive' Class : 1
##
set.seed(1)
svm_model4 <- svm(factor(BAD) ~ ., data = train, kernel = "sigmoid", class.weights = c("0"=1, "1"=2), probability = TRUE)
svm_pred4 <- predict(svm_model4, test, probability = TRUE)
svm_predprob4 <- attr(svm_pred4, "probabilities")[, "1"]
caret::confusionMatrix(svm_pred4, factor(test$BAD), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 56 31
## 1 35 58
##
## Accuracy : 0.6333
## 95% CI : (0.5584, 0.7038)
## No Information Rate : 0.5056
## P-Value [Acc > NIR] : 0.0003711
##
## Kappa : 0.2669
##
## Mcnemar's Test P-Value : 0.7119232
##
## Sensitivity : 0.6517
## Specificity : 0.6154
## Pos Pred Value : 0.6237
## Neg Pred Value : 0.6437
## Prevalence : 0.4944
## Detection Rate : 0.3222
## Detection Prevalence : 0.5167
## Balanced Accuracy : 0.6335
##
## 'Positive' Class : 1
##
The linear SVM model has an accuracy of 71.67%, sensitivity of 69.66%, and specificity of 73.63%. The other kernels, radial, polynomial, and sigmoid, did not perform as well, meaning I stuck with linear in order to find which variables are the best predictors. When finding how much variables contribute to the decision barriers, I found that the more delinquencies, derogatory credit reports, and higher ratio of debt-to-income are the most important features. This aligns with the logistic regression.
prediction_obj_bal <- prediction(svm_predprob, test$BAD)
roc_obj_bal <- performance(prediction_obj_bal, "tpr", "fpr")
plot(roc_obj_bal, main = "ROC Curve using Balanced SVM Model", col = "blue", lwd = 2)
abline(0, 1, lty = 2, col = "red")
auc_score_bal <- performance(prediction_obj_bal, "auc")
auc_value_bal <- as.numeric(auc_score_bal@y.values)
rect(0.7, 0.05, 1, 0.15, col = "white", border = "black")
text(
x = 0.85, y = 0.1,
labels = paste("AUC =", round(auc_value_bal, 4)),
col = "black",
cex = 1.2
)
The AUC of the SVM model is about 75%. Which is about the same as the logistic model!
normalize <- function(x) {(x - min(x)) / (max(x) - min(x))}
train_knn <- as.data.frame(lapply(train[, sapply(train, is.numeric)], normalize))
test_knn <- as.data.frame(lapply(test[, sapply(test, is.numeric)], normalize))
train_labels <- train$BAD
test_labels <- test$BAD
k_values <- seq(1, 25, by = 2)
accuracy <- numeric(length(k_values))
for (i in seq_along(k_values)) {
knn_pred <- knn(train = train_knn, test = test_knn, cl = train_labels, k = k_values[i])
cm <- confusionMatrix(knn_pred, factor(test_labels))
accuracy[i] <- cm$overall["Accuracy"]
}
plot(k_values, accuracy, type = "b", pch = 19, col = "blue",
xlab = "K", ylab = "Accuracy", main = "Choosing Best K in KNN")
knn_pred <- knn(train = train_knn, test = test_knn, cl = train_labels, k = 3)
confusionMatrix(knn_pred, factor(test_labels))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 84 49
## 1 7 40
##
## Accuracy : 0.6889
## 95% CI : (0.6158, 0.7557)
## No Information Rate : 0.5056
## P-Value [Acc > NIR] : 4.677e-07
##
## Kappa : 0.3745
##
## Mcnemar's Test P-Value : 4.281e-08
##
## Sensitivity : 0.9231
## Specificity : 0.4494
## Pos Pred Value : 0.6316
## Neg Pred Value : 0.8511
## Prevalence : 0.5056
## Detection Rate : 0.4667
## Detection Prevalence : 0.7389
## Balanced Accuracy : 0.6863
##
## 'Positive' Class : 0
##
The KNN = 3 model has an accuracy of 68.89%, sensitivity of 62.31%, and specificity of 44.94%.
prediction_obj <- prediction(as.numeric(knn_pred), test_labels)
roc_obj <- performance(prediction_obj, "tpr", "fpr")
plot(roc_obj, main = "ROC Curve for KNN Model", col = "blue", lwd = 2)
abline(0, 1, lty = 2, col = "red")
auc_score <- performance(prediction_obj, "auc")
auc_value <- as.numeric(auc_score@y.values)
rect(0.7, 0.05, 1, 0.15, col = "white", border = "black")
text(x = 0.85, y = 0.1, labels = paste("AUC =", round(auc_value, 4)), col = "black", cex = 1.2)
The AUC for KNN = 5 is 68%
rf_model_basic <- randomForest(BAD ~., data = train, ntree = 100, mtry = 3)
rf_preds <- predict(rf_model_basic, test)
varImpPlot(rf_model_basic)
confusionMatrix(rf_preds, test$BAD)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 69 24
## 1 22 65
##
## Accuracy : 0.7444
## 95% CI : (0.6742, 0.8064)
## No Information Rate : 0.5056
## P-Value [Acc > NIR] : 4.786e-11
##
## Kappa : 0.4887
##
## Mcnemar's Test P-Value : 0.8828
##
## Sensitivity : 0.7582
## Specificity : 0.7303
## Pos Pred Value : 0.7419
## Neg Pred Value : 0.7471
## Prevalence : 0.5056
## Detection Rate : 0.3833
## Detection Prevalence : 0.5167
## Balanced Accuracy : 0.7443
##
## 'Positive' Class : 0
##
mtry_values <- c(2, 4, 6, 7)
ntree_values <- c(100, 200, 300, 400, 500, 600, 700, 800, 900, 1000)
results <- data.frame(mtry = integer(), ntree = integer(), Accuracy = numeric())
models <- list()
for (m in mtry_values) {
for (n in ntree_values) {
rf_model <- randomForest(factor(BAD) ~ DEBTINC + CLAGE + MORTDUE + LOAN + DELINQ + YOJ + CLNO, data = train, mtry = m, ntree = n)
rf_pred <- predict(rf_model, test)
cm <- confusionMatrix(rf_pred, factor(test$BAD))
acc <- cm$overall['Accuracy']
results <- rbind(results, data.frame(mtry = m, ntree = n, Accuracy = acc))
models[[paste0("mtry_", m, "_ntree_", n)]] <- rf_model
}
}
best_index <- which.max(results$Accuracy)
best_mtry <- results$mtry[best_index]
best_ntree <- results$ntree[best_index]
best_model_name <- paste0("mtry_", best_mtry, "_ntree_", best_ntree)
best_model <- models[[best_model_name]]
best_pred <- predict(best_model, test)
conf_matrix_best <- confusionMatrix(best_pred, factor(test$BAD))
print(conf_matrix_best)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 73 23
## 1 18 66
##
## Accuracy : 0.7722
## 95% CI : (0.7039, 0.8313)
## No Information Rate : 0.5056
## P-Value [Acc > NIR] : 1.7e-13
##
## Kappa : 0.5441
##
## Mcnemar's Test P-Value : 0.5322
##
## Sensitivity : 0.8022
## Specificity : 0.7416
## Pos Pred Value : 0.7604
## Neg Pred Value : 0.7857
## Prevalence : 0.5056
## Detection Rate : 0.4056
## Detection Prevalence : 0.5333
## Balanced Accuracy : 0.7719
##
## 'Positive' Class : 0
##
best_pred_prob <- predict(best_model, test, type = "prob")[, "1"]
prediction_obj <- prediction(best_pred_prob, test$BAD)
roc_obj <- performance(prediction_obj, "tpr", "fpr")
plot(roc_obj,
main = "ROC Curve for Best Random Forest Model",
col = "blue", lwd = 2)
abline(a = 0, b = 1, lty = 2, col = "red")
auc_obj <- performance(prediction_obj, "auc")
auc_value <- as.numeric(auc_obj@y.values)
text(x = 0.6, y = 0.2, labels = paste("AUC =", round(auc_value, 4)), cex = 1.2)
My original random forest model identified the most important features for prediction were DEBTINC, CLAGE, MORTDUE, LOAN, YOJ, CLNO, and DELINQ. It also chose VALUE which I chose to drop due to its correlation with MORTDUE. By dropping it, I got an increased accuracy, sensitivity, and specificity. I systematically searched for optimal hyperparameters for mtry and ntree using a for loop. This led me to get an accuracy of 77.22%, sensitivity of 80.22%, and specificity of 74.16%.
model_performance <- data.frame(
Model = c("Logistic Regression", "SVM", "K-Nearest Neighbors", "Random Forest"),
Accuracy = c(0.6833, 0.7167, 0.6889, conf_matrix_best$overall["Accuracy"]),
Sensitivity = c(0.6517, 0.6966, 0.6231, conf_matrix_best$byClass["Sensitivity"]),
Specificity = c(0.7143, 0.7363, 0.4494, conf_matrix_best$byClass["Specificity"]),
AUC = c(0.7545, 0.7522, 0.68, {
best_pred_prob <- predict(best_model, test, type = "prob")[, "1"]
rf_pred_obj <- ROCR::prediction(best_pred_prob, test$BAD)
rf_auc <- ROCR::performance(rf_pred_obj, "auc")
as.numeric(rf_auc@y.values)
})
)
model_performance <- model_performance %>%
mutate(across(where(is.numeric), ~ round(., 4)))
knitr::kable(model_performance, caption = "Comparison of Model Performances")
Model | Accuracy | Sensitivity | Specificity | AUC |
---|---|---|---|---|
Logistic Regression | 0.6833 | 0.6517 | 0.7143 | 0.7545 |
SVM | 0.7167 | 0.6966 | 0.7363 | 0.7522 |
K-Nearest Neighbors | 0.6889 | 0.6231 | 0.4494 | 0.6800 |
Random Forest | 0.7722 | 0.8022 | 0.7416 | 0.8105 |
This proves that the Random Forest is the best model for predicting whether a client will default on their HELOC loan.