Objective: predict whether a customer poses a credit risk or not.
We compare the performance of logistic regression and decision trees for this prediction task.
library(ROCR)
library(ggplot2)
library(tibble)
library(class)
library(dplyr)
library(readxl)
library(boot)
library(glmnet)
library(MASS)
library(rpart)
library(rpart.plot)
The data has been taken from the following link:
http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data
german_credit <- read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data")
german_credit <- as_tibble(german_credit)
colnames(german_credit)=c("chk_acct","duration","credit_his","purpose","amount",
"saving_acct","present_emp","installment_rate","sex",
"other_debtor","present_resid","property","age",
"other_install","housing","n_credits","job","n_people",
"telephone","foreign","response")
# original response coding: 1 = good, 2 = bad
# we need 0 = good, 1 = bad
german_credit$response <- german_credit$response - 1
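# Quick sanity check (illustrative): the German credit data set contains 700 good and 300 bad cases
table(german_credit$response)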
glimpse(german_credit)
## Observations: 1,000
## Variables: 21
## $ chk_acct <fct> A11, A12, A14, A11, A11, A14, A14, A12, A14, ...
## $ duration <int> 6, 48, 12, 42, 24, 36, 24, 36, 12, 30, 12, 48...
## $ credit_his <fct> A34, A32, A34, A32, A33, A32, A32, A32, A32, ...
## $ purpose <fct> A43, A43, A46, A42, A40, A46, A42, A41, A43, ...
## $ amount <int> 1169, 5951, 2096, 7882, 4870, 9055, 2835, 694...
## $ saving_acct <fct> A65, A61, A61, A61, A61, A65, A63, A61, A64, ...
## $ present_emp <fct> A75, A73, A74, A74, A73, A73, A75, A73, A74, ...
## $ installment_rate <int> 4, 2, 2, 2, 3, 2, 3, 2, 2, 4, 3, 3, 1, 4, 2, ...
## $ sex <fct> A93, A92, A93, A93, A93, A93, A93, A93, A91, ...
## $ other_debtor <fct> A101, A101, A101, A103, A101, A101, A101, A10...
## $ present_resid <int> 4, 2, 3, 4, 4, 4, 4, 2, 4, 2, 1, 4, 1, 4, 4, ...
## $ property <fct> A121, A121, A121, A122, A124, A124, A122, A12...
## $ age <int> 67, 22, 49, 45, 53, 35, 53, 35, 61, 28, 25, 2...
## $ other_install <fct> A143, A143, A143, A143, A143, A143, A143, A14...
## $ housing <fct> A152, A152, A152, A153, A153, A153, A152, A15...
## $ n_credits <int> 2, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, ...
## $ job <fct> A173, A173, A172, A173, A173, A172, A173, A17...
## $ n_people <int> 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ telephone <fct> A192, A191, A191, A191, A191, A192, A191, A19...
## $ foreign <fct> A201, A201, A201, A201, A201, A201, A201, A20...
## $ response <dbl> 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, ...
Checking the response variable. The classes are imbalanced (700 good vs 300 bad), which motivates the asymmetric cost function used later.
# response variable : 1=bad 0=good
german_credit %>% ggplot(aes(x = as.factor(response), fill = as.factor(response))) + geom_bar() + theme_classic() +
geom_text(stat = "count", aes(label = ..count.., y = ..count..), position = position_dodge(width = 1),
vjust = -0.5, size = 6)
Duration
german_credit %>% ggplot(aes(x=duration)) +geom_freqpoly(aes(col=as.factor(response), y=..density..), size=1) +theme_classic()
boxplot(duration~response, data= german_credit ,
xlab="Response", ylab="Duration (months)")
Amount
# Amount
german_credit %>% ggplot(aes(x=amount)) +geom_freqpoly(aes( col=as.factor(response), y=..density..), size=1) +theme_classic()
boxplot(amount~response, data= german_credit ,
xlab="Response", ylab="Amount")
Age
# Age
boxplot(age~response, data= german_credit ,
xlab="Response", ylab="Age")
Number of existing credits at this bank
# n_credits
ggplot(german_credit, aes(x=as.factor(n_credits))) +
geom_bar(aes(fill = as.factor(response)), position = "fill") + xlab("Number of existing credits at this bank")
set.seed(12910001)
index <- sample(nrow(german_credit),nrow(german_credit)*0.80)
Gcredit.train = german_credit[index,]
Gcredit.test = german_credit[-index,]
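# Sanity check (a quick sketch, not part of the original split code): the class
# balance should be similar in the train and test splits
prop.table(table(Gcredit.train$response))
prop.table(table(Gcredit.test$response))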
Variable Selection
# Variable selection with a stepwise backward selection approach (AIC)
Gcredit.glm0<- glm(response~., family=binomial, data=Gcredit.train)
Gcredit.glm.back <- step(Gcredit.glm0, direction = "backward", trace=0)
summary(Gcredit.glm.back)
##
## Call:
## glm(formula = response ~ chk_acct + duration + credit_his + purpose +
## amount + saving_acct + installment_rate + sex + other_debtor +
## other_install + housing + telephone + foreign, family = binomial,
## data = Gcredit.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0431 -0.7226 -0.3721 0.7406 2.9342
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.9111069 0.7395852 1.232 0.217980
## chk_acctA12 -0.3822145 0.2378010 -1.607 0.107991
## chk_acctA13 -1.4639321 0.4364354 -3.354 0.000796 ***
## chk_acctA14 -1.8722838 0.2611382 -7.170 7.52e-13 ***
## duration 0.0341851 0.0100998 3.385 0.000713 ***
## credit_hisA31 0.1042617 0.5875220 0.177 0.859147
## credit_hisA32 -0.3082018 0.4467272 -0.690 0.490250
## credit_hisA33 -0.6228628 0.5130890 -1.214 0.224768
## credit_hisA34 -1.0864219 0.4713805 -2.305 0.021180 *
## purposeA41 -1.4276389 0.3978571 -3.588 0.000333 ***
## purposeA410 -1.4633820 0.8380412 -1.746 0.080777 .
## purposeA42 -0.6606897 0.2887698 -2.288 0.022141 *
## purposeA43 -0.8613000 0.2735366 -3.149 0.001640 **
## purposeA44 -0.1887449 0.7956242 -0.237 0.812479
## purposeA45 -0.9636915 0.7801460 -1.235 0.216730
## purposeA46 0.0869488 0.4347735 0.200 0.841491
## purposeA48 -1.2292891 1.2954250 -0.949 0.342648
## purposeA49 -0.7225533 0.3779706 -1.912 0.055919 .
## amount 0.0001214 0.0000475 2.556 0.010591 *
## saving_acctA62 -0.5178019 0.3206694 -1.615 0.106364
## saving_acctA63 -0.5238058 0.4175903 -1.254 0.209714
## saving_acctA64 -1.7922084 0.6117351 -2.930 0.003393 **
## saving_acctA65 -1.0241717 0.2890986 -3.543 0.000396 ***
## installment_rate 0.3956193 0.0970572 4.076 4.58e-05 ***
## sexA92 -0.2341140 0.4349979 -0.538 0.590442
## sexA93 -0.7829497 0.4253509 -1.841 0.065663 .
## sexA94 -0.2720061 0.5088238 -0.535 0.592942
## other_debtorA102 0.3031589 0.4516951 0.671 0.502120
## other_debtorA103 -1.0204477 0.4483066 -2.276 0.022832 *
## other_installA142 -0.0435035 0.4493194 -0.097 0.922869
## other_installA143 -0.9172839 0.2642300 -3.472 0.000517 ***
## housingA152 -0.5646517 0.2479668 -2.277 0.022779 *
## housingA153 -0.5467443 0.3688281 -1.482 0.138239
## telephoneA192 -0.4111375 0.2084277 -1.973 0.048545 *
## foreignA202 -1.3174234 0.6976330 -1.888 0.058970 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 985.71 on 799 degrees of freedom
## Residual deviance: 719.32 on 765 degrees of freedom
## AIC: 789.32
##
## Number of Fisher Scoring iterations: 5
Model
# Final model based on backward selection
Gcredit.glm.final.train <- glm(response ~ chk_acct + duration + credit_his + purpose +
amount + saving_acct + installment_rate + sex + other_debtor + other_install +
housing + telephone + foreign, family = binomial, data = Gcredit.train)
pred.glm0.train<- predict(Gcredit.glm.final.train, type="response")
Calculating a cutoff value of probability for the cost function. This is needed because the data is imbalanced and a false negative (classifying a bad credit as good) is costlier than a false positive.
# Calculating cutoff for probability using a cost function
costfunc = function(obs, pred.p, pcut){
weight1 = 5 # define the weight for "true=1 but pred=0" (FN)
weight0 = 1 # define the weight for "true=0 but pred=1" (FP)
c1 = (obs==1)&(pred.p<pcut) # count for "true=1 but pred=0" (FN)
c0 = (obs==0)&(pred.p>=pcut) # count for "true=0 but pred=1" (FP)
cost = mean(weight1*c1 + weight0*c0) # misclassification with weight
return(cost) # return the weighted misclassification cost
}
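# Toy example (hypothetical values) showing the 5:1 asymmetry: with pcut = 0.5,
# one missed bad credit (obs = 1, p = 0.10) out of five cases gives cost 5/5 = 1
toy_obs    <- c(1, 1, 0, 0, 0)
toy_pred.p <- c(0.10, 0.90, 0.40, 0.05, 0.20)
costfunc(obs = toy_obs, pred.p = toy_pred.p, pcut = 0.5)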
p.seq = seq(0.01, 1, 0.01)
cost = rep(0, length(p.seq))
for(i in 1:length(p.seq)){
cost[i] = costfunc(obs = Gcredit.train$response, pred.p = pred.glm0.train, pcut = p.seq[i])
}
plot(p.seq, cost)
optimal.pcut.glm0 = p.seq[which(cost==min(cost))]
# cutoff value(s) minimizing the cost; two cutoffs tie at the minimum here, so we use the first one below
optimal.pcut.glm0
## [1] 0.13 0.17
In Sample Prediction
# Checking the performance of the model on train data
pred_cutoff_train <- pred.glm0.train > optimal.pcut.glm0[1]
table(Gcredit.train$response, as.numeric(pred_cutoff_train),dnn = c("Actual", "Predicted"))
## Predicted
## Actual 0 1
## 0 284 271
## 1 24 221
# Symmetric Misclassification Rate (without using cost function)
MR <- mean(as.numeric(Gcredit.train$response)!=as.numeric(pred_cutoff_train))
MR
## [1] 0.36875
# Asymmetric Misclassification Rate (using cost function)
costfunc(obs = Gcredit.train$response, pred.p = as.numeric(pred_cutoff_train), pcut = 0.1667)
## [1] 0.48875
# We compute the in-sample cost using pcut = 0.1667, a cutoff suggested for this data set based on industry knowledge; the same cutoff is used for the out-of-sample evaluation below.
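# For reference, a quick comparison (illustrative) of the in-sample cost at the
# industry cutoff versus the cost-minimizing cutoff found above:
costfunc(obs = Gcredit.train$response, pred.p = pred.glm0.train, pcut = 0.1667)
costfunc(obs = Gcredit.train$response, pred.p = pred.glm0.train, pcut = optimal.pcut.glm0[1])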
Out of Sample Prediction
# Testing Model performance on test data
pred.glm0.test <- predict(Gcredit.glm.final.train, Gcredit.test, type = "response")
pred_cutoff_test <- pred.glm0.test > 0.1667
pred_cutoff_test <- as.numeric(pred_cutoff_test)
table(Gcredit.test$response, pred_cutoff_test,dnn = c("Actual", "Predicted"))
## Predicted
## Actual 0 1
## 0 78 67
## 1 10 45
# Symmetric Misclassification Rate (without using cost function)
test_data_MR_lor <- mean(as.numeric(Gcredit.test$response)!=as.numeric(pred_cutoff_test))
test_data_MR_lor
## [1] 0.385
# AMR
test_data_AMR_lor <- costfunc(Gcredit.test$response,pred_cutoff_test, optimal.pcut.glm0[1])
test_data_AMR_lor
## [1] 0.585
The confusion matrix for the test data looks reasonable and the asymmetric cost is acceptable. The out-of-sample MR is 0.385 and the AMR is 0.585.
ROC and AUC for Out of Sample Data
# ROC curve and area under the curve
pred_test <- prediction(pred.glm0.test, Gcredit.test$response)
perf_test <- performance(pred_test, "tpr", "fpr")
plot(perf_test, colorize=TRUE)
#Get the AUC
AUC_test_lor <- unlist(slot(performance(pred_test, "auc"), "y.values"))
AUC_test_lor
## [1] 0.7616301
The area under the curve (AUC) for the test data is 0.7616301, which is decent.
10 Fold Cross Validation for Full Model
# Performing cross-validation using the asymmetric cost function
# cost function using the pcutoff calculated earlier; cv.glm requires the signature function(obs, pred.p), so pcut is taken from the enclosing environment
costfunc = function(obs, pred.p){
weight1 = 5 # define the weight for "true=1 but pred=0" (FN)
weight0 = 1 # define the weight for "true=0 but pred=1" (FP)
c1 = (obs==1)&(pred.p<pcut) # count for "true=1 but pred=0" (FN)
c0 = (obs==0)&(pred.p>=pcut) # count for "true=0 but pred=1" (FP)
cost = mean(weight1*c1 + weight0*c0) # misclassification with weight
return(cost) # return the weighted misclassification cost
}
pcut <- optimal.pcut.glm0[1] # first of the tied optimal cutoffs
# applying final model on full data
# note: here you use all data and the final variables selected
Gcredit.glm.final.full <- glm(response ~ chk_acct + duration + credit_his + purpose +
amount + saving_acct + installment_rate + sex + other_debtor + other_install +
housing + telephone + foreign, family = binomial, data = german_credit)
# Performing 10-fold CV
cv.result = cv.glm(data=german_credit, glmfit=Gcredit.glm.final.full, cost=costfunc, K=10)
# here, costfunc automatically picks up the response variable from the full data
# and the fitted probabilities from Gcredit.glm.final.full.
# It then calculates the cost for each fold and averages those values:
# the asymmetric misclassification rate
cv.result$delta[2]
## [1] 0.5106
The 10-fold cross-validated asymmetric misclassification rate is 0.5106.
cv.result averages the output of the cost function across folds. We take cv.result$delta[2]: the first component of delta is the raw cross-validation estimate of prediction error, and the second component is the bias-adjusted estimate.
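To inspect both components side by side (a one-line sketch):
cv.result$delta  # c(raw estimate, bias-adjusted estimate)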
ROC and AUC of Full Model
fitted_prob_full_model<- predict(Gcredit.glm.final.full, type="response")
prediction_full <- prediction(fitted_prob_full_model, german_credit$response)
perf_full <- performance(prediction_full, "tpr", "fpr")
plot(perf_full, colorize=TRUE)
#Get the AUC
unlist(slot(performance(prediction_full, "auc"), "y.values"))
## [1] 0.8272429
The AUC is 0.8272429, which is quite good. Now let us see how a decision tree performs on this prediction task.
Growing a large tree by keeping cp very low
# The weights are specified in the loss parameter so that the comparison between the decision tree and the logistic regression model is fair.
Gcredit.rpart <- rpart(formula = response ~ ., data = Gcredit.train, method = "class",
parms = list(loss = matrix(c(0, 5, 1, 0), nrow = 2)), cp = 0.001)
prp(Gcredit.rpart, extra = 1)
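# For reference, the loss matrix passed to rpart above (rows = observed class,
# columns = predicted class); it mirrors the 5:1 FN:FP weighting of the logistic
# regression cost function:
matrix(c(0, 5, 1, 0), nrow = 2, dimnames = list(observed = c("0", "1"), predicted = c("0", "1")))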
Pruning the model
# pruning
plotcp(Gcredit.rpart)
# choosing the leftmost value of cp below the horizontal line
Gcredit.rpart1 <- prune(Gcredit.rpart, cp = 0.018)
prp(Gcredit.rpart1)
Cost Function
cost <- function(r, pi){
weight1 = 5
weight0 = 1
c1 = (r==1)&(pi==0) #logical vector - true if actual 1 but predict 0
c0 = (r==0)&(pi==1) #logical vector - true if actual 0 but predict 1
return(mean(weight1*c1+weight0*c0))
}
In Sample Performance
# For a binary classification problem, as with logistic regression, there are two types of predictions: the predicted class of the response (0 or 1) and the predicted probability of the response being 1. The argument type = "class" or type = "prob" selects between them.
Gcredit.train.pred.tree<- predict(Gcredit.rpart1, Gcredit.train, type="class")
table(Gcredit.train$response, Gcredit.train.pred.tree, dnn=c("Truth","Predicted"))
## Predicted
## Truth 0 1
## 0 275 280
## 1 14 231
# MR_train
MR_dt_train <- mean(as.numeric(Gcredit.train$response)!=Gcredit.train.pred.tree)
MR_dt_train
## [1] 0.3675
# AMR
AMR_dt_train <- cost(Gcredit.train$response,Gcredit.train.pred.tree)
AMR_dt_train
## [1] 0.4375
The in-sample MR is 0.3675 and the in-sample AMR is 0.4375.
Out of Sample Performance
Gcredit.test_prob <- predict(Gcredit.rpart1, Gcredit.test, type = "prob")
# Gcredit.test_prob has 2 columns: the first is P(Y = 0) and the second is P(Y = 1). Since the two columns sum to 1 for binary classification, we only need the second.
Gcredit.test_prob_perf_pcutoff <- as.numeric(Gcredit.test_prob[,2] > 0.1667)
table(Gcredit.test$response, Gcredit.test_prob_perf_pcutoff, dnn=c("Truth","Predicted"))
## Predicted
## Truth 0 1
## 0 64 81
## 1 11 44
# MR - out of sample
MR_dt_test <- mean(as.numeric(Gcredit.test$response)!=Gcredit.test_prob_perf_pcutoff)
MR_dt_test
## [1] 0.46
# AMR - out of sample
AMR_dt_test <- cost(Gcredit.test$response,Gcredit.test_prob_perf_pcutoff)
AMR_dt_test
## [1] 0.68
The test-sample MR is 0.46 and the test AMR is 0.68.
Out of Sample ROC and AUC for decision trees
pred_tree <- prediction(Gcredit.test_prob_perf_pcutoff, Gcredit.test$response)
perf_tree <- performance(pred_tree, "tpr", "fpr")
plot(perf_tree, colorize = TRUE)
AUC_test_dt <- slot(performance(pred_tree, "auc"), "y.values")[[1]]
AUC_test_dt
## [1] 0.6206897
The out-of-sample AUC value is 0.6206897.
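Note that this ROC is built from the hard 0/1 predictions, which yields a two-point curve. A sketch using the predicted probabilities instead (this generally gives a smoother curve and a different AUC):
pred_tree_prob <- prediction(Gcredit.test_prob[, 2], Gcredit.test$response)
perf_tree_prob <- performance(pred_tree_prob, "tpr", "fpr")
plot(perf_tree_prob, colorize = TRUE)
unlist(slot(performance(pred_tree_prob, "auc"), "y.values"))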
Clearly, logistic regression outperforms the decision tree on all three metrics (MR, AMR, and AUC).