1. Context and motivation
When it comes to financing large expenses, a home equity loan - or second mortgage - is a cost-effective way for many households to tap into their home equity. It is a convenient and accessible way to borrow a lump sum with fixed monthly payments and much lower interest rates than personal loans or credit cards.
However, this type of financing carries considerable risks, and they cut both ways. For the borrower, taking on a home equity loan means pledging their own home as collateral, so an inability to make payments can end in foreclosure - oftentimes a costly and time-consuming process for the bank as well. For the lender, a 'second' mortgage means the borrower's mortgage expenditure roughly doubles and their disposable income falls; customers who must spend more and can save less leave fewer deposits - a crucial source of funding for banks.
Given these impacts, home equity defaults can clearly be as financially destructive as credit defaults. To address the bank's need to forecast and mitigate equity risk, we proceed to build a machine learning model that predicts the default probability of recorded customers.
2. Dataset: HMEQ
This dataset contains 5960 observations and 13 variables on the characteristics and delinquency information of home equity loans.
hmeq <- read.csv("~/Documents/Data Science /R/projects/risk calculator app/credit_risk_calculator/hmeq.csv")
names(hmeq)
## [1] "BAD" "LOAN" "MORTDUE" "VALUE" "REASON" "JOB" "YOJ"
## [8] "DEROG" "DELINQ" "CLAGE" "NINQ" "CLNO" "DEBTINC"
dim(hmeq)
## [1] 5960 13
str(hmeq)
## 'data.frame': 5960 obs. of 13 variables:
## $ BAD : int 1 1 1 1 0 1 1 1 1 1 ...
## $ LOAN : int 1100 1300 1500 1500 1700 1700 1800 1800 2000 2000 ...
## $ MORTDUE: num 25860 70053 13500 NA 97800 ...
## $ VALUE : num 39025 68400 16700 NA 112000 ...
## $ REASON : chr "HomeImp" "HomeImp" "HomeImp" "" ...
## $ JOB : chr "Other" "Other" "Other" "" ...
## $ YOJ : num 10.5 7 4 NA 3 9 5 11 3 16 ...
## $ DEROG : int 0 0 0 NA 0 0 3 0 0 0 ...
## $ DELINQ : int 0 2 0 NA 0 0 2 0 2 0 ...
## $ CLAGE : num 94.4 121.8 149.5 NA 93.3 ...
## $ NINQ : int 1 0 1 NA 0 1 1 0 1 0 ...
## $ CLNO : int 9 14 10 NA 14 8 17 8 12 13 ...
## $ DEBTINC: num NA NA NA NA NA ...
From the above report, JOB and REASON are the only two factor variables, stored here as character strings. The other 11 variables, including BAD (default status), are numeric. We will keep these formats for now; later in the analysis, BAD will have to be converted to a factor.
1. Clean & Prepare Dataset
#Number of missing values
sum(is.na(hmeq))
## [1] 4740
#Visualize
library(Amelia)
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.0, built: 2021-05-26)
## ## Copyright (C) 2005-2022 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(mlbench)
missmap(hmeq, col = c("steelblue", "brown"), legend = TRUE)
Missing values account for roughly 6% of all data cells (4,740 of the 5,960 × 13 = 77,480 entries). Thus, we cannot simply delete the rows with missing values, as we would likely be left with a biased dataset.
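The share of missing cells (and of rows affected) can be checked directly in base R; a minimal sketch:
#Proportion of missing cells, and of rows with at least one NA
mean(is.na(hmeq))
mean(!complete.cases(hmeq))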
#Numerical variables
hmeq$DEBTINC[which(is.na(hmeq$DEBTINC))] <- mean(hmeq$DEBTINC, na.rm = TRUE)
hmeq$DEROG[which(is.na(hmeq$DEROG))] <- round(mean(hmeq$DEROG, na.rm = TRUE), 0)
hmeq$DELINQ[which(is.na(hmeq$DELINQ))] <- round(mean(hmeq$DELINQ, na.rm = TRUE), 0)
hmeq$MORTDUE[which(is.na(hmeq$MORTDUE))] <- round(mean(hmeq$MORTDUE, na.rm = TRUE), 0)
hmeq$YOJ[which(is.na(hmeq$YOJ))] <- round(mean(hmeq$YOJ, na.rm = TRUE), 1)
hmeq$NINQ[which(is.na(hmeq$NINQ))] <- round(mean(hmeq$NINQ, na.rm = TRUE), 0)
hmeq$CLAGE[which(is.na(hmeq$CLAGE))] <- mean(hmeq$CLAGE, na.rm = TRUE)
hmeq$CLNO[which(is.na(hmeq$CLNO))] <- round(mean(hmeq$CLNO, na.rm = TRUE), 0)
hmeq$VALUE[which(is.na(hmeq$VALUE))] <- round(mean(hmeq$VALUE, na.rm = TRUE), 0)
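The nine assignments above could equally be written as a single loop over the numeric columns; a minimal sketch (ignoring the per-column rounding applied above):
#Mean-impute every numeric column in one pass (sketch)
num_cols <- c("MORTDUE", "VALUE", "YOJ", "DEROG", "DELINQ", "CLAGE", "NINQ", "CLNO", "DEBTINC")
for (col in num_cols) {
  hmeq[[col]][is.na(hmeq[[col]])] <- mean(hmeq[[col]], na.rm = TRUE)
}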
#Factor variables
hmeq$JOB[hmeq$JOB == ""] <- "N/A"
hmeq$REASON[hmeq$REASON == ""] <- "N/A"
#Observe dataset
sum(is.na(hmeq))
## [1] 0
missmap(hmeq, col = c("steelblue", "brown"), legend = TRUE)
Now that our dataset is cleaned of NAs, we can move on to plotting and visualizing data.
2. Data Visualization
#Load required packages
library(ggplot2)
library(cowplot)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
#Factorize BAD into 2 levels: 0: Paid, 1: Defaulted
plot_dt <- hmeq
plot_dt$BAD <- factor(plot_dt$BAD, labels = c("Paid", "Defaulted"))
head(plot_dt$BAD)
## [1] Defaulted Defaulted Defaulted Defaulted Paid Defaulted
## Levels: Paid Defaulted
#Disable exponential notation in graphs
options(scipen = 999)
#BAD vs DELINQ
bar1 <- ggplot(plot_dt, aes(BAD, DELINQ, fill = BAD)) +
geom_bar(stat = "identity", show.legend = F, width = 0.4) +
labs(title = "Delinquent Credit Lines", x = "", y = "Delinquent Credit Lines", fill = "Status") +
theme_bw() +
theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 10), text = element_text(family = "serif"),
aspect.ratio = 2/1, panel.grid = element_blank()) +
scale_fill_manual(values = c("steelblue", "brown"))
#BAD vs DEBTINC
bar2 <- ggplot(plot_dt, aes(BAD, DEBTINC, fill = BAD)) +
geom_bar(stat = "identity", show.legend = F, width = 0.4) +
labs(title = "Debt-Income Ratio", x = "", y = "Debt:Income", fill = "Status") +
theme_bw() +
theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 10), text = element_text(family = "serif"),
aspect.ratio = 2/1, panel.grid = element_blank()) +
scale_fill_manual(values = c("steelblue", "brown"))
#BAD vs DEROG
bar3 <- ggplot(plot_dt, aes(BAD, DEROG, fill = BAD)) +
geom_bar(stat = "identity", show.legend = F, width = 0.4) +
labs(title = "Derogatory Reports", x = "", y = "Derogatory Reports", fill = "Status") +
theme_bw() +
theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 10), text = element_text(family = "serif"),
aspect.ratio = 2/1, panel.grid = element_blank()) +
scale_fill_manual(values = c("steelblue", "brown"))
#BAD vs CLAGE
bar4 <- ggplot(plot_dt, aes(BAD, CLAGE, fill = BAD)) +
geom_bar(stat = "identity", show.legend = F, width = 0.4) +
labs(title = "Oldest Credit Line", x = "", y = "Age", fill = "Status") +
theme_bw() +
theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 10), text = element_text(family = "serif"),
aspect.ratio = 2/1, panel.grid = element_blank()) +
scale_fill_manual(values = c("steelblue", "brown"))
#BAD vs LOAN
bar5 <- ggplot(plot_dt, aes(BAD, LOAN, fill = BAD)) +
geom_bar(stat = "identity", show.legend = F, width = 0.4) +
labs(title = "Loan Request Amount", x = "", y = "Loan", color = "Status") +
theme_bw() +
theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 10), text = element_text(family = "serif"),
aspect.ratio = 2/1, panel.grid = element_blank()) +
scale_fill_manual(values = c("steelblue", "brown"))
#BAD vs NINQ
bar6 <- ggplot(plot_dt, aes(BAD, NINQ, fill = BAD)) +
geom_bar(stat = "identity", show.legend = F, width = 0.4) +
labs(title = "Recent Credit Inquiries", x = "", y = "Recent Inquiries", fill = "Status") +
theme_bw() +
theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 10), text = element_text(family = "serif"),
aspect.ratio = 2/1, panel.grid = element_blank()) +
scale_fill_manual(values = c("steelblue", "brown"))
#BAD vs CLNO
bar7 <- ggplot(plot_dt, aes(BAD, CLNO, fill = BAD)) +
geom_bar(stat = "identity", show.legend = F, width = 0.4) +
labs(title = "Total Credit Lines", x = "", y = "Credit Lines", fill = "Status") +
theme_bw() +
theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 10), text = element_text(family = "serif"),
aspect.ratio = 2/1, panel.grid = element_blank()) +
scale_fill_manual(values = c("steelblue", "brown"))
The bar charts defined above are rendered below in thematic groups (with stat = "identity", each bar shows the total of that variable within each status group):
plot_grid(bar1, bar3)
plot_grid(bar4, bar6, bar7, ncol = 3, nrow = 1)
plot_grid(bar2, bar5)
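Since the seven bar charts share the same structure, a small helper function could remove the repetition; a minimal sketch (the bar_by_status name is ours):
#Helper that builds one BAD-vs-variable bar chart (sketch)
bar_by_status <- function(var, title, ylab) {
  ggplot(plot_dt, aes(BAD, .data[[var]], fill = BAD)) +
    geom_bar(stat = "identity", show.legend = FALSE, width = 0.4) +
    labs(title = title, x = "", y = ylab) +
    theme_bw() +
    theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 10),
          text = element_text(family = "serif"),
          aspect.ratio = 2/1, panel.grid = element_blank()) +
    scale_fill_manual(values = c("steelblue", "brown"))
}
#e.g. bar1 is equivalent to: bar_by_status("DELINQ", "Delinquent Credit Lines", "Delinquent Credit Lines")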
Overview of variable correlation:
#exclude categorical variables in string format
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
num_var <- hmeq %>%
select(BAD, LOAN, MORTDUE, VALUE, YOJ, DEROG, DELINQ, CLAGE, NINQ, CLNO, DEBTINC)
names(num_var)
## [1] "BAD" "LOAN" "MORTDUE" "VALUE" "YOJ" "DEROG" "DELINQ"
## [8] "CLAGE" "NINQ" "CLNO" "DEBTINC"
#correlation plot with calculated corr
library(corrplot)
## corrplot 0.92 loaded
corrplot.mixed(cor(num_var), order = "AOE", tl.cex = 0.66, outline = TRUE)
Collinearity Assessment:
Multicollinearity - the co-dependence of predictor variables - can significantly reduce the precision of the estimated beta coefficients and thereby weaken the statistical significance of the regression model. This is a phenomenon we wish to avoid in our final model, so we now assess the relationships between pairs of seemingly related variables.
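Before plotting, the pairwise correlations of the two pairs examined below can also be checked numerically; a minimal sketch:
#Pairwise correlations of the "seemingly" related pairs (sketch)
cor(hmeq$VALUE, hmeq$MORTDUE)
cor(hmeq$NINQ, hmeq$CLNO)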
#VALUE vs MORTDUE
fit <- lm(plot_dt$MORTDUE~plot_dt$VALUE)
scatter1 <- ggplot(plot_dt, aes(VALUE, MORTDUE, color = BAD)) +
geom_count(position = "jitter", alpha = 0.6, size = 1.6, show.legend = F) +
geom_line(aes(y = predict(fit)), col = "brown", size = 0.9) +
ylim(c(0, 225000)) +
scale_x_log10() +
labs(title = "Property value vs Outstanding mortgage", x = "Property value", y = "Outstanding mortgage", size = "Size", color = "Status") +
theme_bw() +
theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 11), text = element_text(family = "serif"),
panel.grid = element_blank()) +
scale_color_manual(values = c("steelblue", "brown"))
#NINQ vs CLNO
fit2 <- lm(plot_dt$CLNO~plot_dt$NINQ)
scatter2 <- ggplot(plot_dt, aes(NINQ, CLNO, color = BAD)) +
geom_count(position = "jitter", alpha = 0.6, size = 1.6) +
labs(title = "Recent credit inquiries vs Total credit lines", x = "Recent inquiries", y = "Total credit lines", size = "Size", color = "Status") +
theme_bw() +
theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 11), text = element_text(family = "serif"),
panel.grid = element_blank()) +
scale_color_manual(values = c("steelblue", "brown"))
#Aggregate plots
plot_grid(scatter1, scatter2)
## Warning: Removed 69 rows containing non-finite values (stat_sum).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 18 row(s) containing missing values (geom_path).
1. Feature Selection
First, we will expand JOB and REASON (categorical variables with more than two unrelated levels) into separate dummy variables using one-hot encoding:
#One-hot encoding
library(caret)
## Loading required package: lattice
dummy <- dummyVars(" ~ .", data=hmeq)
hmeq_new <- data.frame(predict(dummy, newdata = hmeq))
names(hmeq_new)
## [1] "BAD" "LOAN" "MORTDUE" "VALUE"
## [5] "REASONDebtCon" "REASONHomeImp" "REASONN.A" "JOBMgr"
## [9] "JOBN.A" "JOBOffice" "JOBOther" "JOBProfExe"
## [13] "JOBSales" "JOBSelf" "YOJ" "DEROG"
## [17] "DELINQ" "CLAGE" "NINQ" "CLNO"
## [21] "DEBTINC"
#Factorize BAD into two levels
hmeq_new$BAD <- as.factor(hmeq_new$BAD)
#rpart method to train model
set.seed(26)
model <- train(BAD ~ ., data=hmeq_new, method = "rpart")
rankVars <- varImp(model)
print(rankVars)
## rpart variable importance
##
## Overall
## DEBTINC 100.000
## DELINQ 58.993
## DEROG 58.816
## CLAGE 31.626
## LOAN 20.934
## NINQ 18.271
## CLNO 5.423
## MORTDUE 0.000
## REASONN.A 0.000
## JOBOther 0.000
## JOBSelf 0.000
## JOBN.A 0.000
## JOBMgr 0.000
## JOBProfExe 0.000
## JOBSales 0.000
## JOBOffice 0.000
## REASONDebtCon 0.000
## YOJ 0.000
## REASONHomeImp 0.000
## VALUE 0.000
#Rank Features By Importance
plot(rankVars, top = 18, main = 'Variable Importance')
The top variables ranked by importance are DEBTINC, DELINQ, DEROG, CLAGE, LOAN, NINQ and CLNO. We will use these variables as predictors in model 1.
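The same shortlist can also be pulled programmatically from the importance object instead of being read off the table; a minimal sketch:
#Names of predictors with non-zero rpart importance (sketch)
imp <- rankVars$importance
rownames(imp)[imp$Overall > 0]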
library(randomForest)
## randomForest 4.7-1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
rf_model <- randomForest(BAD~., data = hmeq_new)
importance(rf_model)
## MeanDecreaseGini
## LOAN 164.709380
## MORTDUE 147.005001
## VALUE 157.198782
## REASONDebtCon 18.527666
## REASONHomeImp 17.744987
## REASONN.A 8.592420
## JOBMgr 14.608233
## JOBN.A 9.421134
## JOBOffice 18.049832
## JOBOther 23.011395
## JOBProfExe 16.583304
## JOBSales 10.952526
## JOBSelf 9.080337
## YOJ 114.256936
## DEROG 101.863745
## DELINQ 185.855493
## CLAGE 184.902111
## NINQ 84.405206
## CLNO 147.414662
## DEBTINC 405.798108
imp_sorted <- sort(importance(rf_model), decreasing = TRUE)
plot(imp_sorted, main = 'Variable Importance', ylab = "", xlab = "Variable Index")
lines(imp_sorted)
The top variables ranked by importance are DEBTINC, CLAGE, DELINQ, LOAN, VALUE, CLNO and MORTDUE. We will use these variables as predictors in model 2.
2. Logistic Regression
set.seed(269)
# Sample row indices for the training set (80% of observations): Z
Z <- sample(1:nrow(hmeq_new), 0.8 * nrow(hmeq_new))
# Create training set: train
train <- hmeq_new[Z, ]
# Create test set: test
test <- hmeq_new[-Z, ]
#Fit logistic regression model on training set
train_model <- glm(BAD ~ DEBTINC + DELINQ + DEROG + CLAGE + LOAN + NINQ + CLNO, data = train, family = "binomial")
summary(train_model)
##
## Call:
## glm(formula = BAD ~ DEBTINC + DELINQ + DEROG + CLAGE + LOAN +
## NINQ + CLNO, family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1849 -0.5983 -0.4506 -0.2942 3.6954
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.374570893 0.254288845 -9.338 < 0.0000000000000002 ***
## DEBTINC 0.052277227 0.006589645 7.933 0.00000000000000214 ***
## DELINQ 0.755135891 0.042064787 17.952 < 0.0000000000000002 ***
## DEROG 0.636595966 0.059585861 10.684 < 0.0000000000000002 ***
## CLAGE -0.005099449 0.000599859 -8.501 < 0.0000000000000002 ***
## LOAN -0.000019650 0.000004258 -4.615 0.00000392338301608 ***
## NINQ 0.167474596 0.022295055 7.512 0.00000000000005835 ***
## CLNO -0.017947870 0.004617156 -3.887 0.000101 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4795.0 on 4767 degrees of freedom
## Residual deviance: 3797.6 on 4760 degrees of freedom
## AIC: 3813.6
##
## Number of Fisher Scoring iterations: 5
#ROC curve
library(Epi)
ROC(form = BAD ~ DEBTINC + DELINQ + DEROG + CLAGE + LOAN + NINQ + CLNO, data = train, plot = "ROC", MX = TRUE, PV = TRUE)
The optimal cutoff point for model 1, as shown in the ROC graph, is 0.236. The area under the curve is 0.79, indicating a reasonably good fit.
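The same optimal cutoff (the point maximizing sensitivity + specificity) can also be obtained with the pROC package, if it is available; a minimal sketch:
#Alternative cutoff search with pROC (sketch; assumes pROC is installed)
library(pROC)
roc_obj <- roc(train$BAD, predict(train_model, type = "response"))
coords(roc_obj, "best", best.method = "youden")
auc(roc_obj)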
# Predict default probabilities on the test set
test$Prob = predict(train_model, newdata = test, type = "response")
test$Prediction = 1*(test$Prob > 0.236)
#Confusion matrix on test data
table(test$Prediction, test$BAD)
##
## 0 1
## 0 801 94
## 1 164 133
#test error rate
mean(test$Prediction != test$BAD)
## [1] 0.216443
-> Model 1's test error rate: 0.216443.
#Cross-validate model 1
fitControl <- trainControl(method = "cv", number = 10, savePredictions = T)
modCV <- train(BAD ~ DEBTINC + DELINQ + DEROG + CLAGE + LOAN + NINQ + CLNO, data = hmeq_new, method = "glm", family = "binomial", trControl = fitControl)
summary(modCV)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1403 -0.5985 -0.4466 -0.2933 3.8467
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.285292425 0.225996000 -10.112 < 0.0000000000000002 ***
## DEBTINC 0.049980611 0.005891481 8.484 < 0.0000000000000002 ***
## DELINQ 0.738766913 0.037250039 19.833 < 0.0000000000000002 ***
## DEROG 0.595326659 0.048293071 12.327 < 0.0000000000000002 ***
## CLAGE -0.005602036 0.000541113 -10.353 < 0.0000000000000002 ***
## LOAN -0.000019800 0.000003775 -5.244 0.000000156782268345 ***
## NINQ 0.162261533 0.020011518 8.108 0.000000000000000513 ***
## CLNO -0.014469349 0.004079183 -3.547 0.000389 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5956.5 on 5959 degrees of freedom
## Residual deviance: 4735.6 on 5952 degrees of freedom
## AIC: 4751.6
##
## Number of Fisher Scoring iterations: 5
confusionMatrix(table((modCV$pred)$pred, (modCV$pred)$obs))
## Confusion Matrix and Statistics
##
##
## 0 1
## 0 4628 835
## 1 143 354
##
## Accuracy : 0.8359
## 95% CI : (0.8263, 0.8452)
## No Information Rate : 0.8005
## P-Value [Acc > NIR] : 0.000000000001424
##
## Kappa : 0.3426
##
## Mcnemar's Test P-Value : < 0.00000000000000022
##
## Sensitivity : 0.9700
## Specificity : 0.2977
## Pos Pred Value : 0.8472
## Neg Pred Value : 0.7123
## Prevalence : 0.8005
## Detection Rate : 0.7765
## Detection Prevalence : 0.9166
## Balanced Accuracy : 0.6339
##
## 'Positive' Class : 0
##
Model 2: BAD ~ DEBTINC + CLAGE + DELINQ + LOAN + VALUE + CLNO + MORTDUE
#Fit logistic regression model on training set
train_model2 <- glm(BAD ~ DEBTINC + CLAGE + DELINQ + LOAN + VALUE + CLNO + MORTDUE, data = train, family = "binomial")
summary(train_model2)
##
## Call:
## glm(formula = BAD ~ DEBTINC + CLAGE + DELINQ + LOAN + VALUE +
## CLNO + MORTDUE, family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2442 -0.6266 -0.4865 -0.3210 3.9280
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.985785147 0.246918628 -8.042 0.000000000000000882 ***
## DEBTINC 0.053251087 0.006437224 8.272 < 0.0000000000000002 ***
## CLAGE -0.006247398 0.000589412 -10.599 < 0.0000000000000002 ***
## DELINQ 0.810777840 0.041184313 19.687 < 0.0000000000000002 ***
## LOAN -0.000015492 0.000004186 -3.701 0.000215 ***
## VALUE 0.000001865 0.000001196 1.559 0.118901
## CLNO -0.010511032 0.004653638 -2.259 0.023904 *
## MORTDUE -0.000003933 0.000001613 -2.438 0.014758 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4795.0 on 4767 degrees of freedom
## Residual deviance: 4007.4 on 4760 degrees of freedom
## AIC: 4023.4
##
## Number of Fisher Scoring iterations: 5
#ROC curve
ROC(form = BAD ~ DEBTINC + CLAGE + DELINQ + LOAN + VALUE + CLNO + MORTDUE, data = train, plot = "ROC", MX = TRUE, PV = TRUE)
The optimal cutoff point for model 2, as shown in the ROC graph, is 0.182. The area under the curve is 0.766.
# Predict default probabilities on the test set
test$Prob = predict(train_model2, newdata = test, type = "response")
test$Prediction = 1*(test$Prob > 0.182)
#Confusion matrix on test data
table(test$Prediction, test$BAD)
##
## 0 1
## 0 667 63
## 1 298 164
#test error rate
mean(test$Prediction != test$BAD)
## [1] 0.3028523
-> Model 2's test error rate: 0.3028523.
#Cross-validate model 2
fitControl <- trainControl(method = "cv", number = 10, savePredictions = T)
modCV <- train(BAD ~ DEBTINC + CLAGE + DELINQ + LOAN + VALUE + CLNO + MORTDUE, data = hmeq_new, method = "glm", family = "binomial", trControl = fitControl)
summary(modCV)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1921 -0.6256 -0.4831 -0.3226 4.0662
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.9302892298 0.2187053635 -8.826 < 0.0000000000000002 ***
## DEBTINC 0.0499505756 0.0057337526 8.712 < 0.0000000000000002 ***
## CLAGE -0.0068228280 0.0005323580 -12.816 < 0.0000000000000002 ***
## DELINQ 0.7903255027 0.0365893442 21.600 < 0.0000000000000002 ***
## LOAN -0.0000177699 0.0000037587 -4.728 0.00000227 ***
## VALUE 0.0000033555 0.0000009616 3.489 0.000484 ***
## CLNO -0.0076308262 0.0041010607 -1.861 0.062787 .
## MORTDUE -0.0000042272 0.0000013559 -3.118 0.001822 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5956.5 on 5959 degrees of freedom
## Residual deviance: 4999.1 on 5952 degrees of freedom
## AIC: 5015.1
##
## Number of Fisher Scoring iterations: 5
confusionMatrix(table((modCV$pred)$pred, (modCV$pred)$obs))
## Confusion Matrix and Statistics
##
##
## 0 1
## 0 4654 919
## 1 117 270
##
## Accuracy : 0.8262
## 95% CI : (0.8163, 0.8357)
## No Information Rate : 0.8005
## P-Value [Acc > NIR] : 0.0000002527
##
## Kappa : 0.2712
##
## Mcnemar's Test P-Value : < 0.00000000000000022
##
## Sensitivity : 0.9755
## Specificity : 0.2271
## Pos Pred Value : 0.8351
## Neg Pred Value : 0.6977
## Prevalence : 0.8005
## Detection Rate : 0.7809
## Detection Prevalence : 0.9351
## Balanced Accuracy : 0.6013
##
## 'Positive' Class : 0
##
3. Summary
test_error <- c(0.216443, 0.3028523)
auc <- c(0.79, 0.76)
accuracy <- c(0.8359, 0.8262)
dts <- data.frame(test_error, auc, accuracy)
colnames(dts) <- c("Test Error Rate", "Area Under Curve", "CV Accuracy")
row.names(dts) <- c("Model 1", "Model 2")
dts
## Test Error Rate Area Under Curve CV Accuracy
## Model 1 0.2164430 0.79 0.8359
## Model 2 0.3028523 0.76 0.8262
After comparing the two models, we choose model 1 as our final model, given its lower test error rate and higher AUC. Written out, the final model is:
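P(BAD = 1) = exp(z) / (1 + exp(z)), where
z = -2.3746 + 0.0523*DEBTINC + 0.7551*DELINQ + 0.6366*DEROG - 0.0051*CLAGE - 0.0000197*LOAN + 0.1675*NINQ - 0.0179*CLNO
(coefficients rounded from the training fit above; this is the same linear predictor used in the app's default_prob() function below).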
This web app functions like a calculator: it takes a new customer's information (the variables in the final model: DEBTINC, DELINQ, DEROG, CLAGE, LOAN, NINQ, CLNO) as input and outputs that customer's default probability.
Based on the percentile in which this probability falls, together with the bank's (adjustable) loan acceptance rate, the app then outputs the bank's decision on whether to lend to this new customer. If the customer's default probability falls in a higher percentile than the acceptance rate, they do not qualify for the loan; if it lies within the acceptable range, they do.
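The core of that decision rule can be written as a small standalone function; a minimal sketch (the lend_decision name is ours; the mean and standard deviation are the values hard-coded in the app below):
#Sketch of the app's accept/refuse rule
lend_decision <- function(default_prob, acc_rate, mean_p = 0.2079319, sd_p = 0.2003866) {
  percentile <- pnorm(default_prob, mean = mean_p, sd = sd_p)
  if (percentile > acc_rate / 100) "Refuse" else "Accept"
}
lend_decision(0.15, 80)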
library(shiny)
library(reshape2)
library(scales)
# Define UI
ui <- fluidPage(
# Application title
titlePanel(h3("Equity Risk Calculator", align = "center")),
h6(tags$a(href = "https://github.com/lchipham", "Source code: Linh-Chi Pham"), align = "center"),
# Sidebar with a slider input
sidebarLayout(
sidebarPanel(
h4("CUSTOMER INFO"),
sliderInput("debtinc",
"Debt-Income Ratio",
min = round(0.5244992,2),
max = round(203.3121487,2),
value = 16),
sliderInput("delinq",
"Delinquent Credit Lines",
min = 0,
max = 15,
value = 1),
sliderInput("derog",
"Major Derogatory Reports",
min = 0,
max = 10,
value = 1),
sliderInput("clage",
"Oldest Credit Line (month)",
min = 0,
max = round(1168.234,2),
value = 179.77),
sliderInput("loan",
"Loan Request Amount",
min = 1100,
max = 89900,
value = 18608),
sliderInput("ninq",
"Recent Credit Inquiries",
min = 0,
max = 17,
value = 2),
sliderInput("clno",
"Total Credit Lines",
min = 0,
max = 71,
value = 21)
),
# Show text output + result
mainPanel(
numericInput("acc_rate", "Loan Acceptance Rate (%):",
min = 0,
max = 100,
value = 80, width = "200px"),
uiOutput("text"),
br(),
plotOutput("percentileZ"),
uiOutput("decision")
)
)
)
# Define server logic
server <- function(input, output) {
default_prob <- function(X1 = 16, X2 = 1, X3 = 1, X4 = 179.77, X5 = 18608, X6 = 2, X7 = 21, pct = TRUE, plotDT = TRUE) {
lm_mod <- -2.37457089 + 0.05227723*X1 + 0.75513589*X2 + 0.63659597*X3 - 0.00509945*X4 - 0.00001965*X5 + 0.167474596*X6 - 0.01794787*X7
prob <- exp(lm_mod) / (1 + exp(lm_mod)) # logistic transformation of the linear predictor
Def_Prob <<- prob
if (pct == TRUE) {
zscore <- (Def_Prob - 0.2079319) / 0.2003866
Zscore <<- zscore
pct_score <- pnorm(Def_Prob, mean = 0.2079319, sd = 0.2003866)
PCT_cus <<- pct_score
}
if (plotDT == TRUE) {
dt <- data.frame(scores = readRDS("testProb.RData"))
dens <- density(dt$scores)
df <- data.frame(x=dens$x, y=dens$y)
quantiles <- quantile(dt$scores, prob=(input$acc_rate/100))
df$quantile <- factor(findInterval(df$x, quantiles), labels = c("Accept", "Refuse"))
ggplot(df, aes(x, y)) +
geom_line() +
geom_ribbon(aes(ymin=0, ymax=y, fill=quantile)) +
scale_x_continuous(breaks=quantiles) +
geom_vline(xintercept = quantiles, color = "black", linetype = "dashed", size=0.69) +
geom_text(aes(x = quantiles - 0.026, label = "cutoff line", y=2.5), colour="black", angle=90, family = "serif") +
geom_vline(xintercept = Def_Prob, col = "brown", linetype = "dashed", size=0.69) +
geom_text(aes(x=Def_Prob - 0.026, label="customer percentile", y=2.5), colour="brown", angle=90, family = "serif") +
labs(title = "Distribution of Default Probability", fill = "Decision", x = "Percentile", y = "") +
theme_bw() +
theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 14), text = element_text(family = "serif"),
panel.grid = element_blank()) +
scale_fill_brewer(palette = "OrRd")
}
}
output$text <- renderUI({
default_prob(X1 = input$debtinc, X2 = input$delinq, X3 = input$derog, X4 = input$clage, X5 = input$loan, X6 = input$ninq, X7 = input$clno, pct = TRUE, plotDT = FALSE)
HTML(paste0(
"<b>", "Default Probability: ", format(round(Def_Prob, digits = 2)), "</b>",
"<br>",
"This customer's default probability is in the ", format(round(PCT_cus*100, digits = 2)), "th percentile.",
"<br>"
))
})
output$percentileZ <- renderPlot({
default_prob(X1 = input$debtinc, X2 = input$delinq, X3 = input$derog, X4 = input$clage, X5 = input$loan, X6 = input$ninq, X7 = input$clno, pct = TRUE, plotDT = TRUE)
})
output$decision <- renderUI({
default_prob(X1 = input$debtinc, X2 = input$delinq, X3 = input$derog, X4 = input$clage, X5 = input$loan, X6 = input$ninq, X7 = input$clno, pct = TRUE, plotDT = FALSE)
if (PCT_cus > (input$acc_rate/100)) {
HTML(paste0(
"<b>", "Bank's Decision: Refuse","</b>",
"<br>",
"This customer is not qualified for loan as ", format(round(PCT_cus*100, digits = 2)),"% > ", input$acc_rate, "% acceptance rate.",
"<br>"))
} else if (PCT_cus <= (input$acc_rate/100)){
HTML(paste0(
"<b>", "Bank's Decision: Accept","</b>",
"<br>",
"This customer is qualified for loan as ", format(round(PCT_cus*100, digits = 2)),"% <= ", input$acc_rate, "% acceptance rate.",
"<br>"))
}
})
}
# Run application
shinyApp(ui = ui, server = server)