# This code chunk specifies whether the R code, warnings, and output
# will be included in the output files.

if (!require("knitr")) {
   install.packages("knitr")
   library(knitr)
}
## Loading required package: knitr
if (!require("MASS")) {
   install.packages("MASS")
   library(MASS)
}
## Loading required package: MASS
if (!require("leaflet")) {
   install.packages("leaflet")
   library(leaflet)
}
## Loading required package: leaflet
if (!require("factoextra")) {
   install.packages("factoextra")
   library(factoextra)
}
## Loading required package: factoextra
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
if (!require("TSstudio")) {
   install.packages("TSstudio")
   library(TSstudio)
}
## Loading required package: TSstudio
if (!require("plotrix")) {
   install.packages("plotrix")
   library(plotrix)
}
## Loading required package: plotrix
if (!require("ggridges")) {
   install.packages("ggridges")
   library(ggridges)
}
## Loading required package: ggridges
if (!require("tidyverse")) {
   install.packages("tidyverse")
   library(tidyverse)
}
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
if (!require("GGally")) {
   install.packages("GGally")
   library(GGally)
}
## Loading required package: GGally
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
if (!require("corrplot")) {
   install.packages("corrplot")
   library(corrplot)
}
## Loading required package: corrplot
## corrplot 0.92 loaded
if (!require("cowplot")) {
   install.packages("cowplot")
   library(cowplot)
}
## Loading required package: cowplot
## 
## Attaching package: 'cowplot'
## 
## The following object is masked from 'package:lubridate':
## 
##     stamp
## 
## The following object is masked from 'package:TSstudio':
## 
##     plot_grid
if (!require("shiny")) {
   install.packages("shiny")
   library(shiny)
}
## Loading required package: shiny
if (!require("ggplot2")) {
   install.packages("ggplot2")
   library(ggplot2)
}

if (!require("pscl")) {
   install.packages("pscl")
   library(pscl)
}
## Loading required package: pscl
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
if (!require("caret")) {
   install.packages("caret")
   library(caret)
}
## Loading required package: caret
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
if (!require("pROC")) {
   install.packages("pROC")
   library(pROC)
}
## Loading required package: pROC
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
if (!require("glmnet")) {
   install.packages("glmnet")
   library(glmnet)
}
## Loading required package: glmnet
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loaded glmnet 4.1-7
if (!require("ggplot2")) {
   install.packages("ggplot2")
   library(ggplot2)
}

if (!require("knitr")) {
   install.packages("knitr")
   library(knitr)
}

if (!require("neuralnet")) {
   install.packages("neuralnet")
   library(neuralnet)
}
## Loading required package: neuralnet
## 
## Attaching package: 'neuralnet'
## 
## The following object is masked from 'package:dplyr':
## 
##     compute
if (!require("neuralnet")) {
   install.packages("neuralnet")
   library(neuralnet)
}
if (!require("caret")) {
   install.packages("caret")
   library(caret)
}
if (!require("nnet")) {
   install.packages("nnet")
   library(nnet)
}
## Loading required package: nnet
if (!require("devtools")) {
   install.packages("devtools")
   library(devtools)
}
## Loading required package: devtools
## Loading required package: usethis
if (!require("pander")) {
   install.packages("pander")
   library(pander)
}
## Loading required package: pander
## 
## Attaching package: 'pander'
## 
## The following object is masked from 'package:shiny':
## 
##     p
## 
## The following object is masked from 'package:GGally':
## 
##     wrap
if (!require("knitr")) {
   install.packages("knitr")
   library(knitr)
}
if (!require("MASS")) {
   install.packages("MASS")
   library(MASS)
}
if (!require("leaflet")) {
   install.packages("leaflet")
   library(leaflet)
}
if (!require("factoextra")) {
   install.packages("factoextra")
   library(factoextra)
}
if (!require("TSstudio")) {
   install.packages("TSstudio")
   library(TSstudio)
}
if (!require("plotrix")) {
   install.packages("plotrix")
library(plotrix)
}
if (!require("ggridges")) {
   install.packages("ggridges")
library(ggridges)
}
if (!require("tidyverse")) {
   install.packages("tidyverse")
library(tidyverse)
}
if (!require("GGally")) {
   install.packages("GGally")
library(GGally)
}
if (!require("corrplot")) {
   install.packages("corrplot")
library(corrplot)
}

if (!require("cowplot")) {
   install.packages("cowplot")
library(cowplot)
}

if (!require("shiny")) {
   install.packages("shiny")
library(shiny)
}

if (!require("ggplot2")) {
   install.packages("ggplot2")
library(ggplot2)
}

if (!require("pscl")) {
   install.packages("pscl")
library(pscl)
}

if (!require("caret")) {
   install.packages("caret")
library(caret)
}

if (!require("pROC")) {
   install.packages("pROC")
library(pROC)
}

if (!require("glmnet")) {
   install.packages("glmnet")
library(glmnet)
}

if (!require("ggplot2")) {
   install.packages("ggplot2")
   library(ggplot2)
}

if (!require("knitr")) {
   install.packages("knitr")
   library(knitr)
}

if (!require("neuralnet")) {
   install.packages("neuralnet")
   library(neuralnet)
}
if (!require("neuralnet")) {
   install.packages("neuralnet")
   library(neuralnet)
}
if (!require("caret")) {
   install.packages("caret")
   library(caret)
}
if (!require("nnet")) {
   install.packages("nnet")
   library(nnet)
}
if (!require("devtools")) {
   install.packages("devtools")
   library(devtools)
}
if (!require("pander")) {
   install.packages("pander")
   library(pander)
}

if (!require("rpart")) {
   install.packages("rpart")
   library(rpart)
}
## Loading required package: rpart
if (!require("rpart")) {
   install.packages("rpart")
   library(rpart)
}

if (!require("rpart.plot")) {
   install.packages("rpart.plot")
   library(rpart.plot)
}
## Loading required package: rpart.plot
if (!require("flextable")) {
   install.packages("flextable")
   library(flextable)
}
## Loading required package: flextable
## 
## Attaching package: 'flextable'
## 
## The following object is masked from 'package:purrr':
## 
##     compose
if (!require("plyr")) {
   install.packages("plyr")
   library(plyr)
}
## Loading required package: plyr
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## 
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## 
## The following object is masked from 'package:purrr':
## 
##     compact
knitr::opts_chunk$set(echo = TRUE,      # include the code chunks in the output file
                      warning = FALSE,  # suppress any warning messages the code
                                        # may produce in the output file
                      results = TRUE,   # include the code output in the output file
                      message = FALSE   # suppress package start-up and other messages
                      )

Table of Contents

  1. Introduction
  2. Exploratory Data Analysis
  3. The Candidate Models
  4. Performance of Candidate Models
  5. Discussion

1. Introduction

Credit card companies use models to predict the likelihood that an applicant will default on a bank loan. With that in mind, the purpose of this project is to build a model that predicts whether or not an applicant will default on their bank loan. A data set with 1,000 observations and 16 variables from Applied Analytics through Case Studies Using SAS and R by Deepti Gupta is used for the project. The data set includes 8 categorical variables, one of which is the outcome variable of interest, “Default,” and 8 numerical variables.

default <- read.csv("https://pengdsci.github.io/datasets/LoanData2/BankLoanDefaultDataset.csv")

2. Exploratory Data Analysis

After reviewing the summary statistics, it is apparent that the variables Default, Car_loan, Personal_loan, Home_loan, and Education_loan are stored as numeric when they are in fact categorical. They are converted to factors, along with Emp_status, Marital_status, and Gender, for ease of analysis. The summary statistics are displayed below, followed by the conversion code.

# function to display summary with a title
print_summary_with_title <- function(x, title) {
  cat("", title, "\n")
  cat("--------------------------------\n")
  print(summary(x))
}


# Call the custom function with the desired title
print_summary_with_title(default, "Figure 1. Summary Statistics of Variables")
 Figure 1. Summary Statistics of Variables 
--------------------------------
    Default    Checking_amount       Term        Credit_score   
 Min.   :0.0   Min.   :-665.0   Min.   : 9.00   Min.   : 376.0  
 1st Qu.:0.0   1st Qu.: 164.8   1st Qu.:16.00   1st Qu.: 725.8  
 Median :0.0   Median : 351.5   Median :18.00   Median : 770.5  
 Mean   :0.3   Mean   : 362.4   Mean   :17.82   Mean   : 760.5  
 3rd Qu.:1.0   3rd Qu.: 553.5   3rd Qu.:20.00   3rd Qu.: 812.0  
 Max.   :1.0   Max.   :1319.0   Max.   :27.00   Max.   :1029.0  
    Gender          Marital_status        Car_loan     Personal_loan  
 Length:1000        Length:1000        Min.   :0.000   Min.   :0.000  
 Class :character   Class :character   1st Qu.:0.000   1st Qu.:0.000  
 Mode  :character   Mode  :character   Median :0.000   Median :0.000  
                                       Mean   :0.353   Mean   :0.474  
                                       3rd Qu.:1.000   3rd Qu.:1.000  
                                       Max.   :1.000   Max.   :1.000  
   Home_loan     Education_loan   Emp_status            Amount    
 Min.   :0.000   Min.   :0.000   Length:1000        Min.   : 244  
 1st Qu.:0.000   1st Qu.:0.000   Class :character   1st Qu.:1016  
 Median :0.000   Median :0.000   Mode  :character   Median :1226  
 Mean   :0.056   Mean   :0.112                      Mean   :1219  
 3rd Qu.:0.000   3rd Qu.:0.000                      3rd Qu.:1420  
 Max.   :1.000   Max.   :1.000                      Max.   :2362  
 Saving_amount   Emp_duration         Age        No_of_credit_acc
 Min.   :2082   Min.   :  0.00   Min.   :18.00   Min.   :1.000   
 1st Qu.:2951   1st Qu.: 15.00   1st Qu.:29.00   1st Qu.:1.000   
 Median :3203   Median : 41.00   Median :32.00   Median :2.000   
 Mean   :3179   Mean   : 49.39   Mean   :31.21   Mean   :2.546   
 3rd Qu.:3402   3rd Qu.: 85.00   3rd Qu.:34.00   3rd Qu.:3.000   
 Max.   :4108   Max.   :120.00   Max.   :42.00   Max.   :9.000   
# Specify the variables to convert to factors
variables_to_convert <- c("Default", "Gender", "Marital_status", "Car_loan", "Personal_loan", "Home_loan", "Education_loan", "Emp_status")

# Convert specific variables to factor using lapply
default[variables_to_convert] <- lapply(default[variables_to_convert], as.factor)
# Change Scientific Notation
options(scipen = 999)

Looking at No_of_credit_acc, the frequency of observations drops sharply after five credit accounts; however, for the time being, No_of_credit_acc will remain numeric and uncollapsed. It may prove advantageous in later stages to convert No_of_credit_acc to a factor and collapse the 6-9 counts into a single 6+ category, as sketched after the table below.

#NUMBER OF CREDIT CARDS
n.cred.act <- table(default$No_of_credit_acc)
knitr::kable(n.cred.act , caption = "Fig 2. N Credit Accounts per person",col.names = c("Credit Accounts", "Freq"))
Fig 2. N Credit Accounts per person

Credit Accounts   Freq
1                  308
2                  325
3                  119
4                  105
5                  109
6                    6
7                    8
8                    6
9                   14
# Perhaps this should be left alone, or the 6-9 counts collapsed into a single 6+ category.
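If the 6+ grouping is pursued later, a minimal sketch (hypothetical; the grouped variable is not used elsewhere in this report) could look like:

# Hypothetical recoding: collapse the sparse 6-9 counts into a single "6+" level
default$Credit_acc_grp <- cut(default$No_of_credit_acc,
                              breaks = c(0, 1, 2, 3, 4, 5, Inf),
                              labels = c("1", "2", "3", "4", "5", "6+"))
table(default$Credit_acc_grp)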

Univariate chi-square tests of each categorical variable against the Default variable were performed to discover any significant associations.

# List of variables for Chi-square tests
variables <- c("Car_loan", "Gender", "Marital_status", "Personal_loan", "Home_loan", "Education_loan", "Emp_status")

# Function to perform Chi-square test and return relevant statistics
chi_square_test <- function(variable) {
  ct <- table(default[, variable], default$Default)
  test_result <- chisq.test(ct)
  
  return(data.frame(
    Variable = variable,
    Chi_Square = test_result$statistic,
    DF = test_result$parameter,
    P_Value = test_result$p.value
  ))
}

# Perform Chi-square tests for each variable and combine the results
chi_square_results <- ldply(variables, chi_square_test)

# Print the combined results
kable(chi_square_results, caption = "Fig 3. Univariate Chi-Square Tests of Categorical Variables by Defaulting")
Fig 3. Univariate Chi-Square Tests of Categorical Variables by Defaulting

Variable          Chi_Square   DF     P_Value
Car_loan            5.074005    1   0.0242872
Gender              5.348516    1   0.0207399
Marital_status      6.159816    1   0.0130685
Personal_loan      45.297532    1   0.0000000
Home_loan           7.790881    1   0.0052511
Education_loan     80.093327    1   0.0000000
Emp_status          0.080655    1   0.7764118

All categorical variables, with the exception of Emp_status, yielded a statistically significant association with Default. The most significant association occurred between Default and Education_loan, where those without student loans did not default. The results are shown in the bar chart below.

table_data <- table(default$Default , default$Education_loan)
barplot(table_data, beside = TRUE, legend.text = TRUE, col = c("blue", "red"), 
        main = "Bar Chart of Education Loan by Default",
        xlab = "Education Loan", ylab = "Count")
Fig 4. Bar Chart of Education by Default

A correlation matrix was used to assess correlations among the numeric variables. From the matrix, it can be seen that the strongest positive correlation is between Saving_amount and Age: as age increases, savings amount increases. The strongest negative correlation is between Age and Term: as age increases, term decreases. Squares marked with an X indicate no significant correlation between the two variables at the .05 significance level; all squares without an X show a significant correlation at that level.

# Select the desired columns
selected_columns <- c(2:4, 12:16)
selected_data <- default[, selected_columns]

# Compute the correlation matrix
cor_matrix <- cor(selected_data)


# Compute the p-values for the correlation matrix
testRes = cor.mtest(selected_data, conf.level = 0.95)

# Create the visual correlation matrix
corrplot(cor_matrix, method = "color", p.mat = testRes$p, addCoef.col = 'black',
         number.cex = 0.7, tl.cex = 0.7, sig.level = 0.05, insig = "pch",
         diag = FALSE, type = "lower")
Fig 5. Correlation Matrix of Numeric variables

The density and scatter plots are color-coded: orange reflects not defaulting and aqua represents defaulting.

Reviewing the density plots of the numerical variables crossed by Default, several things stand out visually: those who default have a lower checking amount, longer terms, lower credit scores, a greater loan amount, a lower savings amount, and a younger age.

ggpairs(default, columns = c(2:4, 12:16), aes(color = Default, alpha = 0.5),
        lower = list(continuous = "smooth")) +
  ggtitle("Fig 6. Correlations, Density Plots, and Scatter Plots of Numeric Variables")

Scatter plots were prepared for the numerical variables, with the variable Default defining two levels. These plots indicate visually that there may be some potentially collinear relationships between the variables. For example, there appear to be collinear relationships between Amount and Checking_amount, Amount and Term, and Emp_duration and Saving_amount. The scatter plots also visualize the relationships shown in the density plots, but in a different way.

3. The Candidate Models

The original data set was split into a training data set used to train the candidate models and a test data set used to assess their performance. From the original data set of 1,000 observations, a 70/30 split was used, with 700 observations in the training set and 300 observations in the testing set.

#make this example reproducible
set.seed(100)

#Use roughly 70% of the dataset as training set and the remaining 30% as testing set
#(weights of 0.72/0.28 with this seed happen to yield exactly 700/300 observations)
sample <- sample(c(TRUE, FALSE), nrow(default), replace=TRUE, prob=c(0.72,0.28))
train <- default[sample, ]
test <- default[!sample, ] 
train$Default <- as.factor(train$Default)
summary(train)
##  Default Checking_amount       Term        Credit_score       Gender   
##  0:483   Min.   :-665.0   Min.   : 9.00   Min.   : 376.0   Female:226  
##  1:217   1st Qu.: 167.0   1st Qu.:16.00   1st Qu.: 724.8   Male  :474  
##          Median : 343.0   Median :18.00   Median : 770.0               
##          Mean   : 361.9   Mean   :17.87   Mean   : 761.7               
##          3rd Qu.: 555.8   3rd Qu.:20.00   3rd Qu.: 811.0               
##          Max.   :1319.0   Max.   :27.00   Max.   :1029.0               
##  Marital_status Car_loan Personal_loan Home_loan Education_loan
##  Married:379    0:456    0:370         0:659     0:617         
##  Single :321    1:244    1:330         1: 41     1: 83         
##                                                                
##                                                                
##                                                                
##                                                                
##       Emp_status      Amount     Saving_amount   Emp_duration   
##  employed  :223   Min.   : 244   Min.   :2082   Min.   :  0.00  
##  unemployed:477   1st Qu.:1011   1st Qu.:2929   1st Qu.: 14.00  
##                   Median :1218   Median :3195   Median : 40.50  
##                   Mean   :1213   Mean   :3167   Mean   : 48.36  
##                   3rd Qu.:1411   3rd Qu.:3384   3rd Qu.: 82.25  
##                   Max.   :2362   Max.   :4108   Max.   :120.00  
##       Age        No_of_credit_acc
##  Min.   :18.00   Min.   :1.000   
##  1st Qu.:28.00   1st Qu.:1.000   
##  Median :32.00   Median :2.000   
##  Mean   :31.24   Mean   :2.504   
##  3rd Qu.:34.00   3rd Qu.:3.000   
##  Max.   :42.00   Max.   :9.000

3.A Logistic Regression Model Building

To investigate whether or not someone will default on their loan, three logistic regression models were fit to the data to determine which performs best at predicting default. The logistic regression models also indicate which variables have the most significant association with defaulting on a loan.

A full logistic regression model was fit to the training data set with Default as the response variable and the other fifteen variables as predictors.

options(scipen = 999)
full_model <- glm(Default ~ . , family = binomial , data = train)
full_model_summary <- summary(full_model)$coef

kable(full_model_summary, caption = "Fig 7. Coefficients of Full Logistic Regression Model")
Fig 7. Coefficients of Full Logistic Regression Model

                         Estimate   Std. Error     z value    Pr(>|z|)
(Intercept)            39.9025787    5.9240627   6.7356780   0.0000000
Checking_amount        -0.0049317    0.0008038  -6.1354975   0.0000000
Term                    0.2342254    0.0684103   3.4238316   0.0006174
Credit_score           -0.0105484    0.0024878  -4.2401139   0.0000223
GenderMale             -0.0468614    0.6260959  -0.0748471   0.9403364
Marital_statusSingle    0.3845520    0.6244803   0.6157953   0.5380297
Car_loan1              -1.3779309    3.5405050  -0.3891905   0.6971352
Personal_loan1         -2.4773232    3.5407698  -0.6996567   0.4841417
Home_loan1             -4.4749376    3.6213878  -1.2356969   0.2165712
Education_loan1         0.4208259    3.5738732   0.1177506   0.9062652
Emp_statusunemployed    0.2463111    0.4065920   0.6057943   0.5446513
Amount                  0.0005281    0.0006678   0.7907602   0.4290840
Saving_amount          -0.0048328    0.0007323  -6.5990257   0.0000000
Emp_duration            0.0056039    0.0056208   0.9969792   0.3187746
Age                    -0.6596863    0.0788019  -8.3714480   0.0000000
No_of_credit_acc       -0.0799863    0.1207618  -0.6623480   0.5077482

From the table above, it can be observed that the variables Checking_amount, Term, Credit_score, Saving_amount, and Age have highly significant associations with whether or not someone will default. These five variables are also the only ones statistically significant at the .05 significance level.

To construct a reduced model, a stepwise selection algorithm was used to select features. Backward and forward-backward (both) selection yielded the same set of features, whereas forward selection retained all but one feature; a sketch of the three search directions follows. Based on this, the features chosen by backward and both-direction selection were incorporated into the reduced, step AIC model.
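A sketch of how the three search directions could be compared (an assumed workflow; only the both-direction fit below is carried forward):

# Sketch: compare forward, backward, and both-direction stepwise selection.
# Forward selection needs an explicit scope from the null model up to the full model.
null_model <- glm(Default ~ 1, family = binomial, data = train)
full_scope <- formula(terms(full_model))   # "Default ~ ." with the terms expanded
forward_fit  <- stepAIC(null_model, scope = list(lower = ~ 1, upper = full_scope),
                        direction = "forward", trace = 0)
backward_fit <- stepAIC(full_model, direction = "backward", trace = 0)
both_fit     <- stepAIC(full_model, direction = "both", trace = 0)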

step_model <- stepAIC(full_model, direction = "both", trace = 0)
step.model.summary <- summary(step_model)$coefficients


kable(step.model.summary, caption = "Fig 8. Coefficients of Stepwise Logistic Regression Model")
Fig 8. Coefficients of Stepwise Logistic Regression Model

                    Estimate   Std. Error    z value    Pr(>|z|)
(Intercept)       40.8381151    4.6231540   8.833388   0.0000000
Checking_amount   -0.0050678    0.0008035  -6.307110   0.0000000
Term               0.2370735    0.0677178   3.500905   0.0004637
Credit_score      -0.0101541    0.0024044  -4.223202   0.0000241
Car_loan1         -1.8782406    0.7065856  -2.658193   0.0078561
Personal_loan1    -2.9301812    0.7097100  -4.128702   0.0000365
Home_loan1        -4.9573357    1.0644451  -4.657202   0.0000032
Saving_amount     -0.0047697    0.0007217  -6.608820   0.0000000
Age               -0.6575394    0.0765847  -8.585783   0.0000000

After feature selection was performed using the step AIC method in the prior section, the selected features in the reduced model were tested for pairwise interactions one at a time; a sketch of this workflow follows. The analysis determined that the following terms have a significant interaction at the .05 significance level: Term:Credit_score (p = 0.00696), Credit_score:Saving_amount (p = 0.028013), Car_loan:Saving_amount (p = 0.01270), and Personal_loan:Saving_amount (p = 0.02896). These interaction terms were then included in a new stepwise AIC run using backward selection. The results are found below.
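A sketch of the assumed one-at-a-time testing workflow (the p-value extraction takes the last coefficient row, which is the added interaction term for these binary factors):

# Sketch: add each pairwise interaction to the reduced model in turn and
# record the Wald p-value of the added term.
red.vars <- c("Checking_amount", "Term", "Credit_score", "Car_loan",
              "Personal_loan", "Home_loan", "Saving_amount", "Age")
int.pairs <- combn(red.vars, 2, simplify = FALSE)
int.pvals <- sapply(int.pairs, function(p) {
  f <- reformulate(c(red.vars, paste(p, collapse = ":")), response = "Default")
  cf <- coef(summary(glm(f, family = binomial, data = train)))
  cf[nrow(cf), 4]   # p-value of the interaction term
})
names(int.pvals) <- sapply(int.pairs, paste, collapse = ":")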

interaction.model.glm <- glm(Default ~ Checking_amount + Term + Credit_score +
    Car_loan + Personal_loan + Home_loan + Saving_amount + Age +
    Term:Credit_score + Credit_score:Saving_amount + Car_loan:Saving_amount +
    Personal_loan:Saving_amount, data = train, family = binomial)

interaction.model <- stepAIC(interaction.model.glm, direction = "backward", trace = 0)
interaction.model.summary <- summary(interaction.model)$coefficients

kable(interaction.model.summary, caption = "Fig 9. Coefficients of Stepwise Logistic Regression Model with Interaction Terms")
Fig 9. Coefficients of Stepwise Logistic Regression Model with Interaction Terms

                                Estimate   Std. Error    z value    Pr(>|z|)
(Intercept)                  -38.3831883   26.6421948  -1.440692   0.1496718
Checking_amount               -0.0054646    0.0008927  -6.121128   0.0000000
Term                           2.1847681    0.8051805   2.713389   0.0066599
Credit_score                   0.0914907    0.0356429   2.566874   0.0102620
Car_loan1                      9.9227909    4.9684127   1.997175   0.0458061
Personal_loan1                -3.3529294    0.7408941  -4.525518   0.0000060
Home_loan1                    -4.9800002    1.0525285  -4.731464   0.0000022
Saving_amount                  0.0102921    0.0079472   1.295060   0.1952997
Age                           -0.6925115    0.0811738  -8.531225   0.0000000
Term:Credit_score             -0.0025649    0.0010494  -2.444056   0.0145232
Credit_score:Saving_amount    -0.0000184    0.0000105  -1.743303   0.0812807
Car_loan1:Saving_amount       -0.0039166    0.0015985  -2.450170   0.0142789

3.B Creating a Neural Network for Prediction of Default

normalize <- function(x) {
  return ((x - min(x)) / (max(x) - min(x)))
}

columns_to_normalize <- c("Checking_amount", "Term", "Credit_score", "Amount", "Saving_amount", "Emp_duration", "Age", "No_of_credit_acc")

# Apply min-max normalization to the selected columns using mutate_at
# (note: each data set is scaled by its own minimum and maximum)
train.neural <- train %>% mutate_at(vars(all_of(columns_to_normalize)), normalize)
test.neural <- test %>% mutate_at(vars(all_of(columns_to_normalize)), normalize)

The initial step in developing a neural network is constructing a model matrix so that the names of all feature variables, including implicitly defined dummy variables, are defined and extracted. This was performed on both the training and test data sets. Feature engineering in the form of min-max normalization was also performed on all numerical variables to prepare them for building the neural network.

## create model matrix for the training data
train.matrix = model.matrix(~  ., data = train.neural)
class(train.matrix)
## [1] "matrix" "array"
## create model matrix for the test data
test.matrix = model.matrix(~  ., data = test.neural)
class(test.matrix)
## [1] "matrix" "array"

There are some naming issues in the above dummy feature variables for network modeling (although they are good for regular linear and generalized linear regression models). These feature variables were renamed by excluding special characters in order to build the neural network model.
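A programmatic alternative (a sketch only; note it yields names like Checkingamount rather than the camelCase names used below, so later steps would need to pick up whatever names result):

# Sketch: drop every non-alphanumeric character from the generated column names
# colnames(train.matrix) <- gsub("[^[:alnum:]]", "", colnames(train.matrix))
# colnames(test.matrix)  <- gsub("[^[:alnum:]]", "", colnames(test.matrix))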

colnames(train.matrix)[3] <- "checkingAmount"
colnames(train.matrix)[5] <- "creditScore"
colnames(train.matrix)[7] <- "maritalStatusSingle"
colnames(train.matrix)[8] <- "carLoan1"
colnames(train.matrix)[9] <- "personalLoan1"
colnames(train.matrix)[10] <- "homeLoan1"
colnames(train.matrix)[11] <- "educationLoan1"
colnames(train.matrix)[12] <- "empStatusUnemployed"
colnames(train.matrix)[14] <- "savingAmount"
colnames(train.matrix)[15] <- "empDuration"
colnames(train.matrix)[17] <- "noOfCreditAcc"

## TESTING

colnames(test.matrix)[3] <- "checkingAmount"
colnames(test.matrix)[5] <- "creditScore"
colnames(test.matrix)[7] <- "maritalStatusSingle"
colnames(test.matrix)[8] <- "carLoan1"
colnames(test.matrix)[9] <- "personalLoan1"
colnames(test.matrix)[10] <- "homeLoan1"
colnames(test.matrix)[11] <- "educationLoan1"
colnames(test.matrix)[12] <- "empStatusUnemployed"
colnames(test.matrix)[14] <- "savingAmount"
colnames(test.matrix)[15] <- "empDuration"
colnames(test.matrix)[17] <- "noOfCreditAcc"
columnNames = colnames(train.matrix)
columnList = paste(columnNames[-c(1 , 2)], collapse = "+")
columnList = paste(c(columnNames[2],"~",columnList), collapse="")
modelFormula = formula(columnList)
modelFormula
## Default1 ~ checkingAmount + Term + creditScore + GenderMale + 
##     maritalStatusSingle + carLoan1 + personalLoan1 + homeLoan1 + 
##     educationLoan1 + empStatusUnemployed + Amount + savingAmount + 
##     empDuration + Age + noOfCreditAcc

Next, we build the neural network model using the neuralnet function. A single-layer neural network (one hidden layer with a single neuron) was developed.

NetworkModel = neuralnet(modelFormula,
                         data = train.matrix,
                         hidden = 1,               # single layer NN
                         rep = 1,                  # number of replicates in training NN
                         threshold = 0.01,         # threshold for the partial derivatives as stopping criteria.
                         learningrate = 0.1,       # user selected rate
                         algorithm = "rprop+"
                         )
kable(NetworkModel$result.matrix , caption = "Fig 10. Coefficients of Neural Network Model")
Fig 10. Coefficients of Neural Network Model

error                                 14.9974768
reached.threshold                      0.0094634
steps                               3617.0000000
Intercept.to.1layhid1                 16.6141178
checkingAmount.to.1layhid1            -8.1722715
Term.to.1layhid1                       4.2904723
creditScore.to.1layhid1               -6.5354810
GenderMale.to.1layhid1                 0.3248997
maritalStatusSingle.to.1layhid1        0.7001326
carLoan1.to.1layhid1                  -0.4712334
personalLoan1.to.1layhid1             -1.7527155
homeLoan1.to.1layhid1                 -3.3455897
educationLoan1.to.1layhid1             0.8317540
empStatusUnemployed.to.1layhid1        0.3219977
Amount.to.1layhid1                    -0.1122575
savingAmount.to.1layhid1              -9.1276846
empDuration.to.1layhid1                0.7902562
Age.to.1layhid1                      -14.1673891
noOfCreditAcc.to.1layhid1             -0.7896480
Intercept.to.Default1                 -0.0046067
1layhid1.to.Default1                   1.0169859

A diagram of the neural network can be viewed below.

plot(NetworkModel, rep="best")
Fig 11. Single-layer backpropagation Neural network model for Default

3.C Decision Tree Models for the Prediction of Default Status.

# arguments to pass into rpart():
# 1. data set (training/testing)
# 2. penalty coefficients (fp, fn)
# 3. impurity measure ("gini" or "information")

tree.builder = function(in.data, fp, fn, purity){
   tree = rpart(Default ~ .,                 # including all features
                data = in.data,
                na.action = na.rpart,        # by default, rows are dropped if the outcome is missing,
                                             # kept if only predictors are missing
                method = "class",            # classification on a factor outcome
                model  = FALSE,
                x = FALSE,
                y = TRUE,
                parms = list(
                      # loss matrix: penalize false positives or false negatives more heavily
                      loss = matrix(c(0, fp, fn, 0), ncol = 2, byrow = TRUE),
                      split = purity),       # "gini" or "information"
                ## rpart algorithm options (these are the defaults)
                control = rpart.control(
                      minsplit = 10,   # minimum number of observations required before a split
                      minbucket = 10,  # minimum number of observations in any terminal node
                      cp = 0.01,       # complexity parameter for the stopping rule
                      xval = 10        # number of cross-validations
                      )
                )
   tree                                      # return the fitted tree
}

Six decision tree models were fit to the data set to see which perform best at predicting whether someone will default on their loan. The models are:

  1. Tree with Gini Index: Non-Penalization
  2. Tree with Entropy: Non-Penalization
  3. Tree with Gini Index: Penalization False Negatives
  4. Tree with Entropy: Penalization False Negatives
  5. Tree with Gini Index: Penalization False Positives
  6. Tree with Entropy: Penalization False Positives

The trees themselves can be viewed below.

## Call the tree model wrapper.

gini.tree.1.1 = tree.builder(in.data = train, fp = 1, fn = 1, purity = "gini")
info.tree.1.1 = tree.builder(in.data = train, fp = 1, fn = 1, purity = "information")
gini.tree.1.10 = tree.builder(in.data = train, fp = 1, fn = 10, purity = "gini")
info.tree.1.10 = tree.builder(in.data = train, fp = 1, fn = 10, purity = "information")
gini.tree.10.1 = tree.builder(in.data = train, fp = 10, fn = 1, purity = "gini")
info.tree.10.1 = tree.builder(in.data = train, fp = 10, fn = 1, purity = "information")
## tree plots
par(mfrow=c(1,2))
rpart.plot(gini.tree.1.1, main = "Tree with Gini index: non-penalization")
## Warning: Cannot retrieve the data used to build the model (so cannot determine roundint and is.binary for the variables).
## To silence this warning:
##     Call rpart.plot with roundint=FALSE,
##     or rebuild the rpart model with model=TRUE.
rpart.plot(info.tree.1.1, main = "Tree with entropy: non-penalization")
## Warning: Cannot retrieve the data used to build the model (so cannot determine roundint and is.binary for the variables).
## To silence this warning:
##     Call rpart.plot with roundint=FALSE,
##     or rebuild the rpart model with model=TRUE.
Fig 12. Non-penalized decision tree models using Gini index (left) and entropy (right).

par(mfrow=c(1,2))
rpart.plot(gini.tree.1.10, main = "Tree with Gini index: penalization False Negatives")
## Warning: Cannot retrieve the data used to build the model (so cannot determine roundint and is.binary for the variables).
## To silence this warning:
##     Call rpart.plot with roundint=FALSE,
##     or rebuild the rpart model with model=TRUE.
rpart.plot(info.tree.1.10, main = "Tree with entropy: penalization False Negatives")
## Warning: Cannot retrieve the data used to build the model (so cannot determine roundint and is.binary for the variables).
## To silence this warning:
##     Call rpart.plot with roundint=FALSE,
##     or rebuild the rpart model with model=TRUE.
Fig 13. Decision tree models penalizing false negatives, using Gini index (left) and entropy (right).

par(mfrow=c(1,2))
rpart.plot(gini.tree.10.1, main = "Tree with Gini index: penalization False Positives")
## Warning: Cannot retrieve the data used to build the model (so cannot determine roundint and is.binary for the variables).
## To silence this warning:
##     Call rpart.plot with roundint=FALSE,
##     or rebuild the rpart model with model=TRUE.
rpart.plot(info.tree.10.1, main = "Tree with entropy: penalization False Positives")
## Warning: Cannot retrieve the data used to build the model (so cannot determine roundint and is.binary for the variables).
## To silence this warning:
##     Call rpart.plot with roundint=FALSE,
##     or rebuild the rpart model with model=TRUE.
Fig 14. Decision tree models penalizing false positives, using Gini index (left) and entropy (right).

All trees used Age as the root split, with a cut-off between 30 and 32 years old. From there, Checking_amount, Credit_score, Saving_amount, Amount, and Term were incorporated into branches within each model.

4. Performance of Candidate Models

The full model's VIF values are all below five for the numeric variables, indicating no multicollinearity issues among them. Car_loan, Personal_loan, Home_loan, and Education_loan have VIF values above five, but they are categorical variables, and VIF values are applicable to continuous numerical variables.

# displaying VIF with a title
print_vif_with_title <- function(vif_values, title) {
  cat("Fig 15. Variance Inflation Factor (VIF) -", title, "\n")
  cat("--------------------------------\n")
  print(vif_values)
}

# Calculate VIF
vif.full.model <- car::vif(full_model)

# Call the custom print method with the desired title
print_vif_with_title(vif.full.model, "Full Model")
Fig 15. Variance Inflation Factor (VIF) - Full Model 
--------------------------------
 Checking_amount             Term     Credit_score           Gender 
        1.178541         1.118359         1.144263         2.712260 
  Marital_status         Car_loan    Personal_loan        Home_loan 
        2.931119        92.365892        92.220960        21.413613 
  Education_loan       Emp_status           Amount    Saving_amount 
       31.926009         1.105981         1.075671         1.217359 
    Emp_duration              Age No_of_credit_acc 
        1.255676         1.236782         1.096510 

VIF values for the reduced model are all below five, indicating multicollinearity is not an issue with this model.

# displaying VIF with a title
print_vif_with_title <- function(vif_values, title) {
  cat("Fig 16. Variance Inflation Factor (VIF) -", title, "\n")
  cat("--------------------------------\n")
  print(vif_values)
}

# Calculate VIF
vif.step.model <- car::vif(step_model)

# Call the custom print method with the desired title
print_vif_with_title(vif.step.model, "Step AIC Model")
Fig 16. Variance Inflation Factor (VIF) - Step AIC Model 
--------------------------------
Checking_amount            Term    Credit_score        Car_loan   Personal_loan 
       1.183323        1.085144        1.105634        3.756111        3.803263 
      Home_loan   Saving_amount             Age 
       1.831949        1.204715        1.190951 

Because this model contains interaction terms, several VIF values are well above the 5-10 range, which indicates significant multicollinearity issues. The variables with multicollinearity issues are Term, Credit_score, and Saving_amount.

# displaying VIF with a title
print_vif_with_title <- function(vif_values, title) {
  cat("Fig 17. Variance Inflation Factor (VIF) -", title, "\n")
  cat("--------------------------------\n")
  print(vif_values)
}

# Calculate VIF
vif.int.model <- car::vif(interaction.model)

# Call the custom print method with the desired title
print_vif_with_title(vif.int.model, "Interaction Terms Model")
Fig 17. Variance Inflation Factor (VIF) - Interaction Terms Model 
--------------------------------
           Checking_amount                       Term 
                  1.262488                 135.626848 
              Credit_score                   Car_loan 
                190.751796                 161.014114 
             Personal_loan                  Home_loan 
                  3.814218                   1.964400 
             Saving_amount                        Age 
                143.601513                   1.365835 
         Term:Credit_score Credit_score:Saving_amount 
                181.089179                 267.646853 
    Car_loan:Saving_amount 
                163.384862 

Outliers for each logistic regression model can be observed in the Pearson residual plots below.

Significant outliers were discovered at observations 766, 391, and 225. None of these outliers appear to be the result of any type of error, so they remained in the model. The reduced model produced the same outliers as the full model, and they were again retained. The interaction model yielded the same outliers as the other two models, and they were likewise retained.

par(mfrow = c(2,2))
plot(full_model, which = 1, main = "Residuals vs Fitted: Full Model")
plot(step_model, which = 1, main = "Residuals vs Fitted: Step AIC Model")
plot(interaction.model, which = 1, main = "Residuals vs Fitted: Interaction Model")
Fig 18. Pearson Residual Plots of Logistic Regression Models

In terms of general prediction, a cut-off probability first needed to be established. Five-fold cross-validation was performed on the training data set to determine the optimal cut-off probability, which was 0.33 for the full model. This means that any individual with a predicted probability greater than 0.33 was predicted to default on their loan. In addition, using the training data set, the full model correctly predicted whether someone would default on their loan 94.14% of the time.

For the step AIC model, cross-validation yielded an optimal cut-off of 0.29 and an accuracy of 93.86% on the training data.

Cross-validation performed on the interaction terms model yielded an optimal cut-off probability of 0.52, with an accuracy of 94.86% on the training data.

Five-fold cross-validation performed on the neural network yielded an optimal cut-off point of 0.48, with an accuracy of 94% on the training data.

# 5-fold CV Full logistic model

n0 <- dim(train)[1] / 5
cut.0ff.prob <- seq(0, 1, length = 22)[-c(1, 22)]  # candidate cut off prob
pred.accuracy <- matrix(0, ncol = 20, nrow = 5, byrow = TRUE)  # matrix for storing prediction accuracy

# 5-fold CV
for (i in 1:5) {
  valid.id <- ((i - 1) * n0 + 1):(i * n0)
  valid.data <- train[valid.id, ]
  train.data <- train[-valid.id, ]
  
  # Fit logistic regression with all possible variables on the four training folds
  train.model <- glm(Default ~ ., family = binomial(link = "logit"), data = train.data)
  newdata <- data.frame(valid.data[, -which(names(valid.data) == "Default")])
  pred.prob <- predict.glm(train.model, newdata = newdata, type = "response")
  
  # Define confusion matrix and accuracy
  for (j in 1:20) {
    pred.status <- as.numeric(pred.prob > cut.0ff.prob[j])
    a11 <- sum(pred.status == valid.data$Default)
    pred.accuracy[i, j] <- a11 / length(pred.status)
  }
}

avg.accuracy.fm <- apply(pred.accuracy, 2, mean)
max.id.fm <- which(avg.accuracy.fm == max(avg.accuracy.fm))


## Five fold cv Step AIC

n0 <- dim(train)[1] / 5
cut.0ff.prob <- seq(0, 1, length = 22)[-c(1, 22)]  # candidate cut off prob
pred.accuracy <- matrix(0, ncol = 20, nrow = 5, byrow = TRUE)  # matrix for storing prediction accuracy

# 5-fold CV
for (i in 1:5) {
  valid.id <- ((i - 1) * n0 + 1):(i * n0)
  valid.data <- train[valid.id, ]
  train.data <- train[-valid.id, ]
  
  train.model = glm(Default ~ Checking_amount + Term + Credit_score + Car_loan +
                    Personal_loan + Home_loan + Saving_amount + Age ,
                    family = binomial(link = logit), data = train.data)
  newdata = valid.data[, c("Checking_amount", "Term", "Credit_score", "Car_loan",
                           "Personal_loan", "Home_loan", "Saving_amount", "Age")]
  pred.prob = predict.glm(train.model, newdata, type = "response")

  
  # Define confusion matrix and accuracy
  for (j in 1:20) {
    pred.status <- as.numeric(pred.prob > cut.0ff.prob[j])
    a11 <- sum(pred.status == valid.data$Default)
    pred.accuracy[i, j] <- a11 / length(pred.status)
  }
}

avg.accuracy.s <- apply(pred.accuracy, 2, mean)
max.id.s <- which(avg.accuracy.s == max(avg.accuracy.s))




# Interaction Terms Included CV

n0 <- dim(train)[1] / 5
cut.0ff.prob <- seq(0, 1, length = 22)[-c(1, 22)]  # candidate cut off prob
pred.accuracy <- matrix(0, ncol = 20, nrow = 5, byrow = TRUE)  # matrix for storing prediction accuracy

# 5-fold CV
for (i in 1:5) {
  valid.id <- ((i - 1) * n0 + 1):(i * n0)
  valid.data <- train[valid.id, ]
  train.data <- train[-valid.id, ]
  
  train.model = glm(Default ~ Checking_amount + Term + Credit_score + 
    Car_loan + Personal_loan + Home_loan + Saving_amount + Age + 
    Term:Credit_score + Credit_score:Saving_amount + Car_loan:Saving_amount ,
                    family = binomial(link = logit), data = train.data)
  newdata = valid.data
  pred.prob = predict.glm(train.model, newdata, type = "response")

  
  # Define confusion matrix and accuracy
  for (j in 1:20) {
    pred.status <- as.numeric(pred.prob > cut.0ff.prob[j])
    a11 <- sum(pred.status == valid.data$Default)
    pred.accuracy[i, j] <- a11 / length(pred.status)
  }
}

avg.accuracy.i <- apply(pred.accuracy, 2, mean)
max.id.i <- which(avg.accuracy.i == max(avg.accuracy.i))






# Five fold Neural Network

n0 = dim(train.matrix)[1]/5
cut.off.score = seq(0,1, length = 22)[-c(1,22)]        # candidate cut off prob
pred.accuracy = matrix(0,ncol=20, nrow=5, byrow = T)  # matrix for storing prediction accuracy
##
for (i in 1:5){
  valid.id = ((i-1)*n0 + 1):(i*n0)
  valid.data = train.matrix[valid.id,]
  train.data = train.matrix[-valid.id,]
  ####
  train.model = neuralnet(modelFormula,
                         data = train.data,        # fit on the four training folds
                         hidden = 1,               # single layer NN
                         rep = 1,                  # number of replicates in training NN
                         threshold = 0.01,         # threshold for the partial derivatives as stopping criteria
                         learningrate = 0.1,       # user selected rate
                         algorithm = "rprop+"
                         )
    pred.nn.score = predict(train.model, valid.data)
    for(j in 1:20){
    #pred.status = rep(0,length(pred.nn.score))
    pred.status = as.numeric(pred.nn.score > cut.off.score[j])
    a11 = sum(pred.status == valid.data[,2])
    pred.accuracy[i,j] = a11/length(pred.nn.score)
  }
}
###

avg.accuracy.nn = apply(pred.accuracy, 2, mean)
max.id.nn = which(avg.accuracy.nn ==max(avg.accuracy.nn))
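# For reference, the optimal cut-offs quoted in the text can be read off the
# candidate grids (a hypothetical read-out; ties would return several values):
# cut.0ff.prob[max.id.fm]   # full model, reported as 0.33
# cut.0ff.prob[max.id.s]    # step AIC model, reported as 0.29
# cut.0ff.prob[max.id.i]    # interaction terms model, reported as 0.52
# cut.off.score[max.id.nn]  # neural network, reported as 0.48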
par(mfrow=c(2,2))
# Visual representation FULL MODEL
tick.label <- as.character(round(cut.0ff.prob, 2))
plot(1:20, avg.accuracy.fm, type = "b",
     xlim = c(1, 20),
     ylim = c(0.5, 1),
     axes = FALSE,
     xlab = "Cut-off Probability",
     ylab = "Accuracy",
     main = "5-fold CV performance Full Model"
)
axis(1, at = 1:20, label = tick.label, las = 2)
axis(2)
segments(max.id.fm, 0.5, max.id.fm, avg.accuracy.fm[max.id.fm], col = "red")
text(max.id.fm, avg.accuracy.fm[max.id.fm] + 0.03, as.character(round(avg.accuracy.fm[max.id.fm], 4)), col = "red", cex = 0.8)

# Visual representation STEPAIC
tick.label <- as.character(round(cut.0ff.prob, 2))
plot(1:20, avg.accuracy.s, type = "b",
     xlim = c(1, 20),
     ylim = c(0.5, 1),
     axes = FALSE,
     xlab = "Cut-off Probability",
     ylab = "Accuracy",
     main = "5-fold CV Performance Reduced Stepwise Model"
)
axis(1, at = 1:20, label = tick.label, las = 2)
axis(2)
segments(max.id.s, 0.5, max.id.s, avg.accuracy.s[max.id.s], col = "red")
text(max.id.s, avg.accuracy.s[max.id.s] + 0.03, as.character(round(avg.accuracy.s[max.id.s], 4)), col = "red", cex = 0.8)


# Visual representation Interaction
tick.label <- as.character(round(cut.0ff.prob, 2))
plot(1:20, avg.accuracy.i, type = "b",
     xlim = c(1, 20),
     ylim = c(0.5, 1),
     axes = FALSE,
     xlab = "Cut-off Probability",
     ylab = "Accuracy",
     main = "5-fold CV performance Interaction Terms Model"
)
axis(1, at = 1:20, label = tick.label, las = 2)
axis(2)
segments(max.id.i, 0.5, max.id.i, avg.accuracy.i[max.id.i], col = "red")
text(max.id.i, avg.accuracy.i[max.id.i] + 0.03, as.character(round(avg.accuracy.i[max.id.i], 4)), col = "red", cex = 0.8)

### visual representation Neural Network
tick.label = as.character(round(cut.off.score,2))
plot(1:20, avg.accuracy.nn, type = "b",
     xlim=c(1,20), 
     ylim=c(0.5,1), 
     axes = FALSE,
     xlab = "Cut-off Score",
     ylab = "Accuracy",
     main = "5-fold CV performance Neural Network"
     )
axis(1, at=1:20, label = tick.label, las = 2)
axis(2)
segments(max.id.nn, 0.5, max.id.nn, avg.accuracy.nn[max.id.nn], col = "red")
text(max.id.nn, avg.accuracy.nn[max.id.nn]+0.03, as.character(round(avg.accuracy.nn[max.id.nn],4)), col = "red", cex = 0.8)
Fig 19: Plot of optimal cut-off determination Logistic and Neural Network Models

Optm.cutoff = function(in.data, fp, fn, purity){
  n0 = dim(in.data)[1]/5
  cutoff = seq(0,1, length = 20)               # candidate cut off prob
  ## accuracy for each candidate cut-off
  accuracy.mtx = matrix(0, ncol=20, nrow=5)    # 5 folds by 20 candidate cut-offs
  ##
  for (k in 1:5){
     valid.id = ((k-1)*n0 + 1):(k*n0)
     valid.dat = in.data[valid.id,]
     train.dat = in.data[-valid.id,] 
     ## tree model fit on the four training folds
     tree.model = tree.builder(train.dat, fp, fn, purity)
     ## prediction 
     pred = predict(tree.model, newdata = valid.dat, type = "prob")[,2]
     ## for-loop
     for (i in 1:20){
        ## predicted probabilities
        pc.1 = ifelse(pred > cutoff[i], "1", "0")
        ## accuracy
        a1 = mean(pc.1 == valid.dat$Default)
        accuracy.mtx[k,i] = a1
       }
      }
   avg.acc = apply(accuracy.mtx, 2, mean)
   ## plots
   n = length(avg.acc)
   idx = which(avg.acc == max(avg.acc))
   tick.label = as.character(round(cutoff,2))
   ##
   plot(1:n, avg.acc, xlab="cut-off score", ylab="average accuracy", 
        ylim=c(min(avg.acc), 1), 
        axes = FALSE,
        main=paste("5-fold CV optimal cut-off \n ",purity,"(fp, fn) = (", fp, ",", fn,")" , collapse = ""),
        cex.main = 0.9,
        col.main = "navy")
        axis(1, at=1:20, label = tick.label, las = 2)
        axis(2)
        points(idx, avg.acc[idx], pch=19, col = "red")
        segments(idx , min(avg.acc), idx , avg.acc[idx ], col = "red")
       text(idx, avg.acc[idx]+0.03, as.character(round(avg.acc[idx],4)), col = "red", cex = 0.8) 
   }

Five-fold cross-validation was performed to determine optimal cut-off points for each decision tree; the results can be seen below. Each tree yields several optimal cut-off points, so the average of those points was taken as the final cut-off for each tree (see the sketch after this list). The final cut-off points and corresponding cross-validation accuracies are as follows:

  - Tree with Gini Index: Non-Penalization: 0.5, 91.71%
  - Tree with Entropy: Non-Penalization: 0.473, 91.57%
  - Tree with Gini Index: Penalization False Negatives: 0.42, 90.29%
  - Tree with Entropy: Penalization False Negatives: 0.605, 93.29%
  - Tree with Gini Index: Penalization False Positives: 0.632, 88.43%
  - Tree with Entropy: Penalization False Positives: 0.4483, 92%
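A sketch of the assumed averaging step, using the names defined inside Optm.cutoff (hypothetical; the function as written only plots the optima):

# Sketch: when several candidate cut-offs tie for the best CV accuracy,
# average them to obtain a single final cut-off.
# final.cutoff = mean(cutoff[avg.acc == max(avg.acc)])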

par(mfrow=c(3,2))
Optm.cutoff(in.data = train, fp=1, fn=1, purity="gini")
Optm.cutoff(in.data = train, fp=1, fn=1, purity="information")
Optm.cutoff(in.data = train, fp=1, fn=10, purity="gini")
Optm.cutoff(in.data = train, fp=1, fn=10, purity="information")
Optm.cutoff(in.data = train, fp=10, fn=1, purity="gini")
Optm.cutoff(in.data = train, fp=10, fn=1, purity="information")
Fig 20: Plot of optimal cut-off determination Decision Tree Models

#Full Model Prediction

#Calculates how well the model predicts on the test data set
#predicted correctly 94 percent of the time
glm.probs.test <- predict(full_model , newdata = test , type = "response")
contrasts(test$Default)
##   1
## 0 0
## 1 1
glm.pred.test <- rep(0 , 300)
glm.pred.test[glm.probs.test > .33] = 1
table(glm.pred.test , test$Default)
##              
## glm.pred.test   0   1
##             0 206   7
##             1  11  76
fm.test.prediction <- mean(glm.pred.test == test$Default)

fm.test.prediction<- fm.test.prediction*100



#Step AIC Prediction

#Calculates how well the model predicts on the test data set
#predicted correctly 93.67 percent of the time
glm.probs.test <- predict(step_model , newdata = test , type = "response")
contrasts(test$Default)
##   1
## 0 0
## 1 1
glm.pred.test <- rep(0 , 300)
glm.pred.test[glm.probs.test > .29] = 1
table(glm.pred.test , test$Default)
##              
## glm.pred.test   0   1
##             0 205   7
##             1  12  76
sm.test.prediction <- mean(glm.pred.test == test$Default)



sm.test.prediction<- sm.test.prediction*100



#Interaction Model Prediction

#Calculates how well the model predicts on the test data set
#predicted correctly 93.67 percent of the time
glm.probs.test <- predict(interaction.model , newdata = test , type = "response")
contrasts(test$Default)
##   1
## 0 0
## 1 1
glm.pred.test <- rep(0 , 300)
glm.pred.test[glm.probs.test > .52] = 1
table(glm.pred.test , test$Default)
##              
## glm.pred.test   0   1
##             0 209  11
##             1   8  72
int.test.prediction <- mean(glm.pred.test == test$Default)

int.test.prediction<- int.test.prediction*100




# Neural Network


#Test the resulting output
nn.results <- predict(NetworkModel, test.matrix)
results <- data.frame(actual = test.matrix[,2], prediction = nn.results > .48)
confMatrix = table(results$prediction, results$actual)               # confusion matrix
accuracy.nn=(sum(results$actual == results$prediction)/length(results$prediction))*100
list(confusion.matrix = confMatrix, accuracy = accuracy.nn)
## $confusion.matrix
##        
##           0   1
##   FALSE 198   6
##   TRUE   19  77
## 
## $accuracy
## [1] 91.66667
# Decision Trees

# Get each tree's predicted class-1 probabilities and apply its optimal cut-off
predicted_probs.gt11 <- predict(gini.tree.1.1, newdata = test, type = "prob")

predicted_probs_class1 <- predicted_probs.gt11[, 2]

glm.pred.test <- rep(0 , 300)
glm.pred.test[predicted_probs_class1 > .5] = 1
table(glm.pred.test , test$Default)
##              
## glm.pred.test   0   1
##             0 204  11
##             1  13  72
gt11 <- mean(glm.pred.test == test$Default)

gt11.pred<- gt11*100

##


predicted_probs.it11 <- predict(info.tree.1.1 , newdata = test, type = "prob")

predicted_probs_class1 <- predicted_probs.it11[, 2]

glm.pred.test <- rep(0 , 300)
glm.pred.test[predicted_probs_class1 > .473] = 1
table(glm.pred.test , test$Default)
##              
## glm.pred.test   0   1
##             0 206  10
##             1  11  73
it11 <- mean(glm.pred.test == test$Default)

it11.pred<- it11*100

###




predicted_probs.gt110 <- predict(gini.tree.1.10, newdata = test, type = "prob")

predicted_probs_class1 <- predicted_probs.gt110[, 2]

glm.pred.test <- rep(0 , 300)
glm.pred.test[predicted_probs_class1 > .42] = 1
table(glm.pred.test , test$Default)
##              
## glm.pred.test   0   1
##             0 192  11
##             1  25  72
gt110 <- mean(glm.pred.test == test$Default)

gt110.pred<- gt110*100

####


predicted_probs.it110 <- predict(info.tree.1.10, newdata = test, type = "prob")

predicted_probs_class1 <- predicted_probs.it110[, 2]

glm.pred.test <- rep(0 , 300)
glm.pred.test[predicted_probs_class1 > .605] = 1
table(glm.pred.test , test$Default)
##              
## glm.pred.test   0   1
##             0 201   9
##             1  16  74
it110 <- mean(glm.pred.test == test$Default)

it110.pred<- it110*100


#####


predicted_probs.gt101 <- predict(gini.tree.10.1, newdata = test, type = "prob")

predicted_probs_class1 <- predicted_probs.gt101[, 2]

glm.pred.test <- rep(0 , 300)
glm.pred.test[predicted_probs_class1 > .632] = 1
table(glm.pred.test , test$Default)
##              
## glm.pred.test   0   1
##             0 211  29
##             1   6  54
gt101 <- mean(glm.pred.test == test$Default)

gt101.pred<- gt101*100


######
 

predicted_probs.it101 <- predict(info.tree.10.1, newdata = test, type = "prob")

predicted_probs_class1 <- predicted_probs.it101[, 2]

glm.pred.test <- rep(0 , 300)
glm.pred.test[predicted_probs_class1 > .4483] = 1
table(glm.pred.test , test$Default)
##              
## glm.pred.test   0   1
##             0 202  17
##             1  15  66
it101 <- mean(glm.pred.test == test$Default)

it101.pred<- it101*100
prediction.models <- data.frame(
  Model = c("Full Model" , "StepAIC Model" , "Interaction Terms Model" , "Neural Network Model" , "Tree with Gini Index: Non-Penalization" , "Tree with Entropy Non-Penalization" , 
"Tree with Gini Index: Penalization False Negatives" , "Tree with Entropy: Penalization False Negatives" , "Tree with Gini Index: Penalization False Positives" , "Tree with Entropy: Penalization False Positives"),
  "Percent Correct" = c(fm.test.prediction, sm.test.prediction , int.test.prediction , accuracy.nn, gt11.pred , it11.pred , gt110.pred, it110.pred , gt101.pred , it101.pred))

The table below shows the accuracy of each candidate model on the test data set, using the optimal cut-off points from cross-validation. This approach indicates how each candidate model performs on new data in terms of predicting whether someone will default on a loan. The full model is the most accurate at 94%, meaning it correctly predicted whether someone would default or not default on their loan 94% of the time. Behind the full model, the step AIC and interaction terms models were each accurate 93.67% of the time. These results indicate that adding interaction terms to the step AIC model does not improve accuracy.

kable(prediction.models, caption = "Fig 21. Proportion of Correct Predictions from Candidate Models")
Fig 21. Proportion of Correct Predictions from Candidate Models

Model                                                 Percent Correct
Full Model                                                   94.00000
StepAIC Model                                                93.66667
Interaction Terms Model                                      93.66667
Neural Network Model                                         91.66667
Tree with Gini Index: Non-Penalization                       92.00000
Tree with Entropy Non-Penalization                           93.00000
Tree with Gini Index: Penalization False Negatives           88.00000
Tree with Entropy: Penalization False Negatives              91.66667
Tree with Gini Index: Penalization False Positives           88.33333
Tree with Entropy: Penalization False Positives              89.33333
# Train the models and get the predicted probabilities
test.model_full <- glm(Default ~ ., family = binomial(link = logit), data = train)
pred.prob_full <- predict.glm(test.model_full, newdata = test, type = "response")

test.model_aic <- glm(Default ~ Checking_amount + Term + Credit_score + Car_loan +
                      Personal_loan + Home_loan + Saving_amount + Age,
                      family = binomial(link = logit), data = train)
pred.prob_aic <- predict.glm(test.model_aic, newdata = test, type = "response")

test.model_interaction <- glm(Default ~ Checking_amount + Term + Credit_score +
                               Car_loan + Personal_loan + Home_loan + Saving_amount + Age + 
                               Term:Credit_score + Credit_score:Saving_amount + Car_loan:Saving_amount,
                               family = binomial(link = logit), data = train)
pred.prob_interaction <- predict.glm(test.model_interaction, newdata = test, type = "response")

# Create ROC curve objects for each model
roc_full <- roc(test$Default, pred.prob_full)
roc_aic <- roc(test$Default, pred.prob_aic)
roc_interaction <- roc(test$Default, pred.prob_interaction)

# Neural network ROC; pass a numeric vector (not a matrix) to roc()
# to avoid its deprecation warning about matrix predictors
nn.results <- predict(NetworkModel, test.matrix)
roc_neural <- roc(test$Default, as.numeric(nn.results))

The receiver operating characteristic (ROC) curve and the area under it (AUC) measure how well a model discriminates between borrowers who default and those who do not; a higher AUC indicates better discrimination. As with accuracy, the full logistic regression model yielded the highest AUC on the test data set, indicating the best discrimination among the candidate models.

# Create an empty plot with specified xlim and ylim, then overlay the ROC curves
plot(0, 0, type = "n", xlim = c(1, 0), ylim = c(0, 1), xlab = "Specificity",
     ylab = "Sensitivity", main = "ROC Curves of Logistic & NN Models")
plot(roc_full, col = "blue", lwd = 2, add = TRUE, type = "l")
plot(roc_aic, col = "red", lwd = 2, add = TRUE)
plot(roc_interaction, col = "green", lwd = 2, add = TRUE)
plot(roc_neural, col = "sienna", lwd = 2, add = TRUE, type = "l")
# Add a legend to the plot
legend_text <- c(paste("Full Model (AUC =", round(auc(roc_full), 4), ")"),
                 paste("Step AIC Model (AUC =", round(auc(roc_aic), 4), ")"),
                 paste("Interaction Only Model (AUC =", round(auc(roc_interaction), 4), ")"),
                 paste("Neural Network (AUC =", round(auc(roc_neural), 4), ")")
                 )
legend("bottom", legend = legend_text,
       col = c("blue", "red", "green" , "sienna"), lwd = 2, cex = 0.8)
Fig 22. ROC curves of the logistic regression and neural network models

The Neural Network Model had the second-best discrimination with an AUC of 0.9776, and the Step AIC Model followed closely with an AUC of 0.9771. Both models are considered to have excellent discrimination in determining whether someone will default on their loan. Among the decision trees, the highest AUC belonged to the Tree with Entropy: Penalization False Negatives.
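
Whether a gap that small is meaningful can be checked formally. The sketch below is an added illustration: pROC's roc.test() compares two correlated ROC curves, here the roc objects built above on the same test set, using DeLong's test.

# DeLong's test for a difference in AUC between two correlated ROC curves;
# a large p-value would favor the simpler Step AIC model
roc.test(roc_full, roc_aic, method = "delong")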

# Function returning a sensitivity and specificity matrix for a decision tree
SenSpe = function(in.data, fp, fn, purity){
  cutoff = seq(0, 1, length = 20)   # 20 cut-offs including 0 and 1
  model = tree.builder(in.data, fp, fn, purity)
  ## Caution: the decision tree returns both "success" and "failure" probabilities.
  ## Only the "success" probability is needed to define sensitivity and specificity!
  pred = predict(model, newdata = in.data, type = "prob") # two-column matrix
  senspe.mtx = matrix(0, ncol = length(cutoff), nrow = 2, byrow = FALSE)
  for (i in 1:length(cutoff)){
    # CAUTION: "1" and "0" are the values of Default in this data set!
    # The following line uses only the "1" (default) probability: pred[, "1"]
    pred.out = ifelse(pred[, "1"] >= cutoff[i], "1", "0")
    TP = sum(pred.out == "1" & in.data$Default == "1")
    TN = sum(pred.out == "0" & in.data$Default == "0")
    FP = sum(pred.out == "1" & in.data$Default == "0")
    FN = sum(pred.out == "0" & in.data$Default == "1")
    senspe.mtx[1,i] = TP/(TP + FN)   # sensitivity (TPR)
    senspe.mtx[2,i] = TN/(TN + FP)   # specificity (TNR)
  }
  ## A better approximation of the ROC/AUC, using library {pROC}
  prediction = pred[, "1"]
  category = in.data$Default == "1"
  ROCobj <- roc(category, prediction)
  AUC = auc(ROCobj)
  ##
  list(senspe.mtx = senspe.mtx, AUC = round(AUC, 3))
}
giniROC11 = SenSpe(in.data = test, fp=1, fn=1, purity="gini")
infoROC11 = SenSpe(in.data = test, fp=1, fn=1, purity="information")
giniROC110 = SenSpe(in.data = test, fp=1, fn=10, purity="gini")
infoROC110 = SenSpe(in.data = test, fp=1, fn=10, purity="information")
giniROC101 = SenSpe(in.data = test, fp=10, fn=1, purity="gini")
infoROC101 = SenSpe(in.data = test, fp=10, fn=1, purity="information")
par(pty="s")      # set up square plot through graphic parameter
plot(1-giniROC11$senspe.mtx[2,], giniROC11$senspe.mtx[1,], type = "l", xlim=c(0,1), ylim=c(0,1), 
     xlab="1 - specificity: FPR", ylab="Sensitivity: TPR", col = "blue", lwd = 2,
     main="ROC Curves of Decision Trees", cex.main = 0.9, col.main = "navy")
abline(0,1, lty = 2, col = "orchid4", lwd = 2)
lines(1-infoROC11$senspe.mtx[2,], infoROC11$senspe.mtx[1,], col = "firebrick2", lwd = 2, lty=2)
lines(1-giniROC110$senspe.mtx[2,], giniROC110$senspe.mtx[1,], col = "olivedrab", lwd = 2)
lines(1-infoROC110$senspe.mtx[2,], infoROC110$senspe.mtx[1,], col = "skyblue", lwd = 2)
lines(1-giniROC101$senspe.mtx[2,], giniROC101$senspe.mtx[1,], col = "red", lwd = 2, lty = 4)
lines(1-infoROC101$senspe.mtx[2,], infoROC101$senspe.mtx[1,], col = "sienna3", lwd = 2)
legend("bottomright", c(paste("gini.1.1,  AUC =", giniROC11$AUC), 
                        paste("info.1.1,   AUC =",infoROC11$AUC), 
                        paste("gini.1.10, AUC =",giniROC110$AUC), 
                        paste("info.1.10, AUC =",infoROC110$AUC),
                        paste("gini.10.1, AUC =",giniROC101$AUC), 
                        paste("info.10.1, AUC =",infoROC101$AUC)),
                        col=c("blue","firebrick2","olivedrab","skyblue","red","sienna3"), 
                        lty=rep(1,6), lwd=rep(2,6), cex = 0.5, bty = "n")
Fig 23. Comparison of ROC curves of the decision tree models

5. Discussion

After reviewing each candidate model and its performance on a new testing dataset, the Step AIC Model is the most desirable for predicting whether someone will default on their loan. It performed like the full model in both accuracy and discrimination, as shown through cross-validation and the area under the receiver operating characteristic curve, and little performance was lost in paring the model down from fifteen variables to eight. It may be tempting to implement the full model; however, the principle of parsimony should be adhered to in this instance. The simplicity of the Step AIC Model makes it more efficient and reduces the risk of overfitting, something that may arise if the full logistic regression model is implemented. Although the Reduced Stepwise Model with Interaction Terms matched the Step AIC Model in accuracy, it did not perform as well in discrimination; adding interaction terms did not improve the Reduced Stepwise Model's performance. A likely explanation is that the interaction terms overfit the model to the training data.

The Neural Network Model had better discrimination than the Step AIC Model; however, the difference was a negligible 0.0005 in AUC. In accuracy, the neural network underperformed the Step AIC Model by 2 percentage points, a much larger gap.

The decision tree models provide intuitive visual representations of the factors that may lead to a loan default; however, they did not predict default as well as the Step AIC logistic regression model.

The Step AIC logistic regression model can be used as a tool to predict whether someone will default on their loan, applied at the point when an applicant applies. The applicant would supply their checking account balance, age, savings account balance, and requested loan term; information such as credit score, car loans, existing personal loans, and home loans would be pulled from credit reporting agencies. These values would then be entered into the logistic regression model, which predicts whether the applicant will default. If the model predicts a default, the applicant should be denied the loan.
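
As an illustration of that workflow, a single applicant could be scored as below. Every value is invented for the example, the loan indicators are assumed to be coded numerically as 0/1 as in the training data, and 0.5 stands in for the cross-validated cutoff.

# Hypothetical applicant -- all values are invented for illustration
applicant <- data.frame(
  Checking_amount = 850,
  Term = 18,
  Credit_score = 720,
  Car_loan = 0,        # assumed 0/1 coding, as in the training data
  Personal_loan = 1,
  Home_loan = 0,
  Saving_amount = 2400,
  Age = 35
)
# Predicted probability of default from the Step AIC model fit above
p.default <- predict(test.model_aic, newdata = applicant, type = "response")
# Deny the loan if the predicted probability exceeds the chosen cutoff
ifelse(p.default > 0.5, "deny", "approve")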

Another potential back-end step is that, once the applicant's information is entered into the logistic regression model, an acceptable loan term could be searched for that yields a prediction of no default, as sketched below. If no acceptable term is found, the applicant would be denied the loan.
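
Continuing the illustration above (same hypothetical applicant, same stand-in cutoff of 0.5):

# Search candidate loan terms for the hypothetical applicant
candidate.terms <- seq(6, 36, by = 6)
term.grid <- applicant[rep(1, length(candidate.terms)), ]
term.grid$Term <- candidate.terms
term.grid$p.default <- predict(test.model_aic, newdata = term.grid,
                               type = "response")
# Keep the terms predicted not to end in default
acceptable <- term.grid$Term[term.grid$p.default <= 0.5]
# Offer the longest acceptable term, or deny the loan if none qualifies
if (length(acceptable) > 0) max(acceptable) else "deny"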

When incorporating a logistic regression model-building approach to determine who will and will not default on their loan, a reduced model incorporating the variables Checking_amount, Term, Credit_score, Car_loan, Personal_loan, Home_loan, Saving_amount, and Age proves to be the most parsimonious and effective model in terms of both prediction and association.