Analysis Objective

This analysis is done to demonstrate the use of any of the 3 classification algorithms we’ve learned in Classification in Machine Learning Module to predict the risk status of a bank loan, show an understanding of holding out a test / cross-validation set for an estimate of the model’s performance on unseen data, sufficiently explain the model’s performance (accuracy, recall/sensitivity, and specificity), and demonstrate extra effort to improve the the accuracy obtained from the initial model.

Libraries and Setup

knitr::opts_chunk$set(cache=TRUE)
options(scipen = 9999)
rm(list=ls())
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(caret)
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
library(partykit)
## Loading required package: grid
## Loading required package: libcoin
## Loading required package: mvtnorm
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine

Import the loan.csv data

loans <- read.csv("data_input/loan.csv")
str(loans)
## 'data.frame':    1000 obs. of  17 variables:
##  $ checking_balance    : Factor w/ 4 levels "< 0 DM","> 200 DM",..: 1 3 4 1 1 4 4 3 4 3 ...
##  $ months_loan_duration: int  6 48 12 42 24 36 24 36 12 30 ...
##  $ credit_history      : Factor w/ 5 levels "critical","good",..: 1 2 1 2 4 2 2 2 2 1 ...
##  $ purpose             : Factor w/ 6 levels "business","car",..: 5 5 4 5 2 4 5 2 5 2 ...
##  $ amount              : int  1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
##  $ savings_balance     : Factor w/ 5 levels "< 100 DM","> 1000 DM",..: 5 1 1 1 1 5 4 1 2 1 ...
##  $ employment_duration : Factor w/ 5 levels "< 1 year","> 7 years",..: 2 3 4 4 3 3 2 3 4 5 ...
##  $ percent_of_income   : int  4 2 2 2 3 2 3 2 2 4 ...
##  $ years_at_residence  : int  4 2 3 4 4 4 4 2 4 2 ...
##  $ age                 : int  67 22 49 45 53 35 53 35 61 28 ...
##  $ other_credit        : Factor w/ 3 levels "bank","none",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ housing             : Factor w/ 3 levels "other","own",..: 2 2 2 1 1 1 2 3 2 2 ...
##  $ existing_loans_count: int  2 1 1 1 2 1 1 1 1 2 ...
##  $ job                 : Factor w/ 4 levels "management","skilled",..: 2 2 4 2 2 4 2 1 4 1 ...
##  $ dependents          : int  1 1 2 2 2 2 1 1 1 1 ...
##  $ phone               : Factor w/ 2 levels "no","yes": 2 1 1 1 1 2 1 2 1 1 ...
##  $ default             : Factor w/ 2 levels "no","yes": 1 2 1 1 2 1 1 1 1 2 ...
head(loans)
##   checking_balance months_loan_duration credit_history
## 1           < 0 DM                    6       critical
## 2       1 - 200 DM                   48           good
## 3          unknown                   12       critical
## 4           < 0 DM                   42           good
## 5           < 0 DM                   24           poor
## 6          unknown                   36           good
##                purpose amount savings_balance employment_duration
## 1 furniture/appliances   1169         unknown           > 7 years
## 2 furniture/appliances   5951        < 100 DM         1 - 4 years
## 3            education   2096        < 100 DM         4 - 7 years
## 4 furniture/appliances   7882        < 100 DM         4 - 7 years
## 5                  car   4870        < 100 DM         1 - 4 years
## 6            education   9055         unknown         1 - 4 years
##   percent_of_income years_at_residence age other_credit housing
## 1                 4                  4  67         none     own
## 2                 2                  2  22         none     own
## 3                 2                  3  49         none     own
## 4                 2                  4  45         none   other
## 5                 3                  4  53         none   other
## 6                 2                  4  35         none   other
##   existing_loans_count       job dependents phone default
## 1                    2   skilled          1   yes      no
## 2                    1   skilled          1    no     yes
## 3                    1 unskilled          2    no      no
## 4                    1   skilled          2    no      no
## 5                    2   skilled          2    no     yes
## 6                    1 unskilled          2   yes      no

Exploratory Data Analysis

Let’s investigate the relationships and discover rough structures of the data.

We can look our overall data from describe() function from Hmisc package :

describe(loans)
## loans 
## 
##  17  Variables      1000  Observations
## ---------------------------------------------------------------------------
## checking_balance 
##        n  missing distinct 
##     1000        0        4 
##                                                       
## Value          < 0 DM   > 200 DM 1 - 200 DM    unknown
## Frequency         274         63        269        394
## Proportion      0.274      0.063      0.269      0.394
## ---------------------------------------------------------------------------
## months_loan_duration 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1000        0       33    0.985     20.9    12.98        6        9 
##      .25      .50      .75      .90      .95 
##       12       18       24       36       48 
## 
## lowest :  4  5  6  7  8, highest: 47 48 54 60 72
## ---------------------------------------------------------------------------
## credit_history 
##        n  missing distinct 
##     1000        0        5 
##                                                             
## Value       critical      good   perfect      poor very good
## Frequency        293       530        40        88        49
## Proportion     0.293     0.530     0.040     0.088     0.049
## ---------------------------------------------------------------------------
## purpose 
##        n  missing distinct 
##     1000        0        6 
##                                                                          
## Value                  business                  car                 car0
## Frequency                    97                  337                   12
## Proportion                0.097                0.337                0.012
##                                                                          
## Value                 education furniture/appliances          renovations
## Frequency                    59                  473                   22
## Proportion                0.059                0.473                0.022
## ---------------------------------------------------------------------------
## amount 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1000        0      921        1     3271     2773      709      932 
##      .25      .50      .75      .90      .95 
##     1366     2320     3972     7179     9163 
## 
## lowest :   250   276   338   339   343, highest: 15653 15672 15857 15945 18424
## ---------------------------------------------------------------------------
## savings_balance 
##        n  missing distinct 
##     1000        0        5 
##                                                                   
## Value           < 100 DM     > 1000 DM  100 - 500 DM 500 - 1000 DM
## Frequency            603            48           103            63
## Proportion         0.603         0.048         0.103         0.063
##                         
## Value            unknown
## Frequency            183
## Proportion         0.183
## ---------------------------------------------------------------------------
## employment_duration 
##        n  missing distinct 
##     1000        0        5 
##                                                                       
## Value         < 1 year   > 7 years 1 - 4 years 4 - 7 years  unemployed
## Frequency          172         253         339         174          62
## Proportion       0.172       0.253       0.339       0.174       0.062
## ---------------------------------------------------------------------------
## percent_of_income 
##        n  missing distinct     Info     Mean      Gmd 
##     1000        0        4    0.873    2.973      1.2 
##                                   
## Value          1     2     3     4
## Frequency    136   231   157   476
## Proportion 0.136 0.231 0.157 0.476
## ---------------------------------------------------------------------------
## years_at_residence 
##        n  missing distinct     Info     Mean      Gmd 
##     1000        0        4    0.895    2.845    1.205 
##                                   
## Value          1     2     3     4
## Frequency    130   308   149   413
## Proportion 0.130 0.308 0.149 0.413
## ---------------------------------------------------------------------------
## age 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1000        0       53    0.999    35.55    12.41       22       23 
##      .25      .50      .75      .90      .95 
##       27       33       42       52       60 
## 
## lowest : 19 20 21 22 23, highest: 67 68 70 74 75
## ---------------------------------------------------------------------------
## other_credit 
##        n  missing distinct 
##     1000        0        3 
##                             
## Value       bank  none store
## Frequency    139   814    47
## Proportion 0.139 0.814 0.047
## ---------------------------------------------------------------------------
## housing 
##        n  missing distinct 
##     1000        0        3 
##                             
## Value      other   own  rent
## Frequency    108   713   179
## Proportion 0.108 0.713 0.179
## ---------------------------------------------------------------------------
## existing_loans_count 
##        n  missing distinct     Info     Mean      Gmd 
##     1000        0        4    0.709    1.407   0.5428 
##                                   
## Value          1     2     3     4
## Frequency    633   333    28     6
## Proportion 0.633 0.333 0.028 0.006
## ---------------------------------------------------------------------------
## job 
##        n  missing distinct 
##     1000        0        4 
##                                                       
## Value      management    skilled unemployed  unskilled
## Frequency         148        630         22        200
## Proportion      0.148      0.630      0.022      0.200
## ---------------------------------------------------------------------------
## dependents 
##        n  missing distinct     Info     Mean      Gmd 
##     1000        0        2    0.393    1.155   0.2622 
##                       
## Value          1     2
## Frequency    845   155
## Proportion 0.845 0.155
## ---------------------------------------------------------------------------
## phone 
##        n  missing distinct 
##     1000        0        2 
##                       
## Value         no   yes
## Frequency    596   404
## Proportion 0.596 0.404
## ---------------------------------------------------------------------------
## default 
##        n  missing distinct 
##     1000        0        2 
##                   
## Value       no yes
## Frequency  700 300
## Proportion 0.7 0.3
## ---------------------------------------------------------------------------

From describe(), we know that : 1. No missing data (no NA) for every variables 2. Frequency and proportion of the observations from checking_balance, credit_history, purpose, savings_balance, employment_duration, percent_of_income, years_at_residence, other_credit, housing, existing_loans_count, job, dependents, phone, and default 3. Average value from months_loan_duration, amount, percent_of_income, years_at_residence, age, existing_loans_count, and dependents.

The variables are presented below : checking_balance and savings_balance: Status of existing checking / savings account
credit_history: Between critical, good, perfect, poor and very good
purpose: Between business, car(new), car(used), education, furniture and renovations
employment_duration: Present employment since
percent_of_income: Installment rate in percentage of disposable income years_at_residence: Present residence since
other_credit: Other installment plans (bank / store)
housing: Between rent, own, or for free
job: Between management, skilled, unskilled and unemployed
dependents: Number of people being liable to provide maintenance for
phone: Between none and yes (registered under customer name)

The variable of interest is the default variable. A loan is considered yes when it is Defaulted, Charged Off, or past due date (Grace Period).

Defaulted loans according to purpose

summary(loans[loans$default == "yes", "purpose"])
##             business                  car                 car0 
##                   34                  106                    5 
##            education furniture/appliances          renovations 
##                   23                  124                    8

Defaulted loans according to checking_balance

# Change the order/level of checking_balance variable
loans$checking_balance <- factor(loans$checking_balance,
                                levels = c("< 0 DM",
                                           "1 - 200 DM",
                                           "> 200 DM",
                                           "unknown"))
summary(loans[loans$default == "yes", "checking_balance"])
##     < 0 DM 1 - 200 DM   > 200 DM    unknown 
##        135        105         14         46

Defaulted loans according to savings_balance

# Change the order/level of saving_balance variable
loans$savings_balance <- factor(loans$savings_balance,
                                levels = c("< 100 DM",
                                           "100 - 500 DM",
                                           "500 - 1000 DM",
                                           "> 1000 DM",
                                           "unknown"))
summary(loans[loans$default == "yes", "savings_balance"])
##      < 100 DM  100 - 500 DM 500 - 1000 DM     > 1000 DM       unknown 
##           217            34            11             6            32

Summary of amount of loans

summary(loans$amount)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     250    1366    2320    3271    3972   18424

Defaulted loans summary according to amount of loan

summary(loans[loans$default == "yes", "amount"])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     433    1352    2574    3938    5142   18424

Defaulted loans according to credit_history

# Change the order/level of credit_history variable
loans$credit_history <- factor(loans$credit_history,
                                levels = c("critical",
                                           "poor",
                                           "good",
                                           "very good",
                                           "perfect"))
summary(loans[loans$default == "yes", "credit_history"])
##  critical      poor      good very good   perfect 
##        50        28       169        28        25

Defaulted loans according to other_credit

# Change the order/level of other_credit variable
loans$other_credit <- factor(loans$other_credit,
                                levels = c("none",
                                           "store",
                                           "bank"))
summary(loans[loans$default == "yes", "other_credit"])
##  none store  bank 
##   224    19    57

Using Decision Tree

loans_model_dt <- ctree(default ~ ., loans_train)

Plotting our decision tree of our credit risk analysis

plot(loans_model_dt)

plot(loans_model_dt, type = "simple")

loans_model_dt
## 
## Model formula:
## default ~ checking_balance + months_loan_duration + credit_history + 
##     purpose + amount + savings_balance + employment_duration + 
##     percent_of_income + years_at_residence + age + other_credit + 
##     housing + existing_loans_count + job + dependents + phone
## 
## Fitted party:
## [1] root
## |   [2] checking_balance < 0 DM, 1 - 200 DM
## |   |   [3] months_loan_duration <= 30
## |   |   |   [4] credit_history in critical, poor, good: no (n = 286, err = 36.0%)
## |   |   |   [5] credit_history in very good, perfect: yes (n = 40, err = 30.0%)
## |   |   [6] months_loan_duration > 30: yes (n = 76, err = 34.2%)
## |   [7] checking_balance > 200 DM, unknown: no (n = 348, err = 12.4%)
## 
## Number of inner nodes:    3
## Number of terminal nodes: 4

Use the model to make prediction

loans_pred_dt <- predict(loans_model_dt, loans_test)

Accuracy, Precision, and Recall

Confusion table for the prediction and test set

(dt_conft <- table("prediction" = loans_pred_dt,
                   "actual" = loans_test$default
                   ))
##           actual
## prediction  no yes
##        no  158  50
##        yes  16  26
accu_dt <- round((dt_conft[1]+dt_conft[4])/sum(dt_conft[1:4]),4)
prec_dt <- round(dt_conft[4]/(dt_conft[2]+dt_conft[4]), 4)
reca_dt <- round(dt_conft[4]/(dt_conft[4]+dt_conft[3]), 4)
spec_dt <- round(dt_conft[1]/(dt_conft[1]+dt_conft[2]), 4)
paste("Accuracy:", accu_dt*100,"%")
## [1] "Accuracy: 73.6 %"
paste("Precision:", prec_dt*100,"%")
## [1] "Precision: 61.9 %"
paste("Recall:", reca_dt*100,"%")
## [1] "Recall: 34.21 %"
paste("Specitifity:", spec_dt*100,"%")
## [1] "Specitifity: 90.8 %"

Or we can use confusionMatrix() to find out the Accuracy, Recall, Presicion, Specitifity, etc of our model.

confusionMatrix(loans_pred_dt, loans_test$default, positive = "yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  158  50
##        yes  16  26
##                                           
##                Accuracy : 0.736           
##                  95% CI : (0.6768, 0.7895)
##     No Information Rate : 0.696           
##     P-Value [Acc > NIR] : 0.09455         
##                                           
##                   Kappa : 0.2862          
##  Mcnemar's Test P-Value : 0.00004865      
##                                           
##             Sensitivity : 0.3421          
##             Specificity : 0.9080          
##          Pos Pred Value : 0.6190          
##          Neg Pred Value : 0.7596          
##              Prevalence : 0.3040          
##          Detection Rate : 0.1040          
##    Detection Prevalence : 0.1680          
##       Balanced Accuracy : 0.6251          
##                                           
##        'Positive' Class : yes             
## 

In this case, Sensitivity (Recall) is 0.3421 and Pos Pred Value (Precision) is 0.6190.

From this decision tree model, we get Recall rate of 34.21%. Remember that Recall is more important for this analysis because the bank’s credit department would rather sacrifice some level of specificity or precision in favor of higher recall (or sensitivity). We have to make our model more sensitive to defaulted loans as it will inflict financial loss to the bank/company.

We can say in this case, that greater recall increases the chances of the bank rejected the client’s loan application because they were predicted to give defaulted loans in the future, but it will affect the sales (sales will drop).

We can manually change our decision tree model by setting : - mincriterion: Act as a “regulator” for the depth of the tree, smaller values result in larger trees; When mincriterion is 0.8, p-value must be smaller than 0.2 in order for a node to split
- minsplit : number of split - minbucket: number of terminal nodes - mtry : number of variables used in the model

Notes : alpha = 0.05 (the significance level for variable selection) mincriterion = 1 - alpha (the value of the test statistic or 1 - p-value that must be exceeded in order to implement a split)

Let’s try to set mincriterion to 0.7 to increase our Recall rate from the previous model.

loans_model_dt2 <- ctree(default ~ ., loans_train, control = ctree_control(mincriterion = 0.7))
plot(loans_model_dt2)

loans_model_dt2
## 
## Model formula:
## default ~ checking_balance + months_loan_duration + credit_history + 
##     purpose + amount + savings_balance + employment_duration + 
##     percent_of_income + years_at_residence + age + other_credit + 
##     housing + existing_loans_count + job + dependents + phone
## 
## Fitted party:
## [1] root
## |   [2] checking_balance < 0 DM, 1 - 200 DM
## |   |   [3] months_loan_duration <= 30
## |   |   |   [4] credit_history in critical, poor, good
## |   |   |   |   [5] credit_history in critical, poor: no (n = 98, err = 23.5%)
## |   |   |   |   [6] credit_history in good
## |   |   |   |   |   [7] purpose in business, furniture/appliances, renovations: no (n = 119, err = 32.8%)
## |   |   |   |   |   [8] purpose in car, car0, education: yes (n = 69, err = 40.6%)
## |   |   |   [9] credit_history in very good, perfect
## |   |   |   |   [10] housing in other, rent: yes (n = 17, err = 5.9%)
## |   |   |   |   [11] housing in own: yes (n = 23, err = 47.8%)
## |   |   [12] months_loan_duration > 30
## |   |   |   [13] employment_duration < 1 year, > 7 years, 1 - 4 years
## |   |   |   |   [14] years_at_residence <= 2: yes (n = 22, err = 45.5%)
## |   |   |   |   [15] years_at_residence > 2: yes (n = 29, err = 6.9%)
## |   |   |   [16] employment_duration in 4 - 7 years, unemployed
## |   |   |   |   [17] age <= 29: yes (n = 10, err = 20.0%)
## |   |   |   |   [18] age > 29: no (n = 15, err = 20.0%)
## |   [19] checking_balance > 200 DM, unknown
## |   |   [20] other_credit in none, store
## |   |   |   [21] checking_balance > 200 DM: no (n = 42, err = 21.4%)
## |   |   |   [22] checking_balance in unknown: no (n = 268, err = 9.0%)
## |   |   [23] other_credit in bank: no (n = 38, err = 26.3%)
## 
## Number of inner nodes:    11
## Number of terminal nodes: 12

From this model, we have 11 inner nodes and 12 terminal nodes. (More than our previous model!)

Use the model to make prediction

loans_pred_dt2 <- predict(loans_model_dt2, loans_test)

Accuracy, Precision, and Recall

confusionMatrix(loans_pred_dt2, loans_test$default, positive = "yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  148  47
##        yes  26  29
##                                           
##                Accuracy : 0.708           
##                  95% CI : (0.6474, 0.7636)
##     No Information Rate : 0.696           
##     P-Value [Acc > NIR] : 0.36856         
##                                           
##                   Kappa : 0.2517          
##  Mcnemar's Test P-Value : 0.01924         
##                                           
##             Sensitivity : 0.3816          
##             Specificity : 0.8506          
##          Pos Pred Value : 0.5273          
##          Neg Pred Value : 0.7590          
##              Prevalence : 0.3040          
##          Detection Rate : 0.1160          
##    Detection Prevalence : 0.2200          
##       Balanced Accuracy : 0.6161          
##                                           
##        'Positive' Class : yes             
## 

Fro this model, we have Sensitivity (Recall) of 0.3816 and Pos Pred Value of 0.5273.

We manage to increase our Recall rate to 38.16% from 34.21%, but the implication is our Precision rate drops to only 52.73%.

We can see that we can improve our model by adding the mincriterion and minbucket/minsplit/mtry argument in ctree_control function when we create our model from ctree().

We’d like to improve our model more by trying using Random Forest Algorithm.

Random Forest

When using random forest algorithm, we are not required to split our dataset into train, cross-validation and test sets. In practice, the random forest already have out-of-bag estimates (OOB) that can be used as a reliable estimate of its true accuracy on unseen examples.

set.seed(300)
ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 3, allowParallel = TRUE)
# number : number of fold
loans_rf <- train(default ~ ., data = loans, method = "rf", trControl = ctrl)
loans_rf$finalModel
## 
## Call:
##  randomForest(x = x, y = y, mtry = param$mtry) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 18
## 
##         OOB estimate of  error rate: 24.4%
## Confusion matrix:
##      no yes class.error
## no  624  76   0.1085714
## yes 168 132   0.5600000
paste("Accuracy:", round((624+132)/(1000)*100, 2),"%")
## [1] "Accuracy: 75.6 %"
paste("Precision:", round((132)/(132+168)*100, 2),"%")
## [1] "Precision: 44 %"
paste("Recall:", round((132)/(132+76)*100, 2),"%")
## [1] "Recall: 63.46 %"
paste("Specitifity:", round((624)/(624+168)*100, 2), "%")
## [1] "Specitifity: 78.79 %"

These are the variables that give high significance to our random forest loan model.

varImp(loans_rf)
## rf variable importance
## 
##   only 20 most important variables shown (out of 35)
## 
##                                Overall
## amount                         100.000
## age                             65.522
## months_loan_duration            59.378
## checking_balanceunknown         51.363
## years_at_residence              23.263
## percent_of_income               22.803
## savings_balanceunknown          11.486
## other_creditbank                11.091
## existing_loans_count            10.095
## phoneyes                         9.966
## credit_historyvery good          8.896
## purposefurniture/appliances      8.566
## purposecar                       8.532
## housingown                       8.316
## employment_duration1 - 4 years   8.166
## checking_balance1 - 200 DM       8.023
## credit_historyperfect            7.942
## jobskilled                       7.664
## credit_historygood               7.082
## employment_duration> 7 years     6.828
plot(loans_rf$finalModel)
legend("topright", colnames(loans_rf$finalModel$err.rate),col = 1:6, cex = 0.8, fill = 1:6)

plot(loans_rf)

From here, we know that we can get the highest Accuracy with only 18 variables, but Recall rate is the parameter that we have to improve in our model.

From random forest model, we get Recall rate 63.46% but our Precision drops to 44%.