Objective: predict whether a customer poses a credit risk or not.
We compare the performance of logistic regression and decision trees for this prediction task.
library(ROCR)
library(ggplot2)
library(tibble)
library(class)
library(dplyr)
library(readxl)
library(boot)
library(glmnet)
library(MASS)
library(rpart)
library(rpart.plot)
The data has been taken from the following link:
http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data
german_credit <- read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data")
german_credit <- as_tibble(german_credit)
colnames(german_credit)=c("chk_acct","duration","credit_his","purpose","amount",
"saving_acct","present_emp","installment_rate","sex",
"other_debtor","present_resid","property","age",
"other_install","housing","n_credits","job","n_people",
"telephone","foreign","response")
# original response coding: 1 = good, 2 = bad
# we need 0 = good, 1 = bad
german_credit$response <- german_credit$response - 1
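# Quick sanity check (illustrative): the German credit data set contains 700 good and 300 bad cases
table(german_credit$response)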
glimpse(german_credit)
## Observations: 1,000
## Variables: 21
## $ chk_acct <fct> A11, A12, A14, A11, A11, A14, A14, A12, A14, ...
## $ duration <int> 6, 48, 12, 42, 24, 36, 24, 36, 12, 30, 12, 48...
## $ credit_his <fct> A34, A32, A34, A32, A33, A32, A32, A32, A32, ...
## $ purpose <fct> A43, A43, A46, A42, A40, A46, A42, A41, A43, ...
## $ amount <int> 1169, 5951, 2096, 7882, 4870, 9055, 2835, 694...
## $ saving_acct <fct> A65, A61, A61, A61, A61, A65, A63, A61, A64, ...
## $ present_emp <fct> A75, A73, A74, A74, A73, A73, A75, A73, A74, ...
## $ installment_rate <int> 4, 2, 2, 2, 3, 2, 3, 2, 2, 4, 3, 3, 1, 4, 2, ...
## $ sex <fct> A93, A92, A93, A93, A93, A93, A93, A93, A91, ...
## $ other_debtor <fct> A101, A101, A101, A103, A101, A101, A101, A10...
## $ present_resid <int> 4, 2, 3, 4, 4, 4, 4, 2, 4, 2, 1, 4, 1, 4, 4, ...
## $ property <fct> A121, A121, A121, A122, A124, A124, A122, A12...
## $ age <int> 67, 22, 49, 45, 53, 35, 53, 35, 61, 28, 25, 2...
## $ other_install <fct> A143, A143, A143, A143, A143, A143, A143, A14...
## $ housing <fct> A152, A152, A152, A153, A153, A153, A152, A15...
## $ n_credits <int> 2, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, ...
## $ job <fct> A173, A173, A172, A173, A173, A172, A173, A17...
## $ n_people <int> 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ telephone <fct> A192, A191, A191, A191, A191, A192, A191, A19...
## $ foreign <fct> A201, A201, A201, A201, A201, A201, A201, A20...
## $ response <dbl> 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, ...
Checking the response variable. The classes are imbalanced (700 good vs 300 bad), which motivates the asymmetric cost function used later.
# response variable : 1=bad 0=good
german_credit %>% ggplot(aes(x = as.factor(response), fill = as.factor(response))) + geom_bar() + theme_classic() +
geom_text(stat = "count", aes(label = ..count.., y = ..count..), position = position_dodge(width = 1),
vjust = -0.5, size = 6)
Duration
german_credit %>% ggplot(aes(x=duration)) +geom_freqpoly(aes(col=as.factor(response), y=..density..), size=1) +theme_classic()
boxplot(duration~response, data= german_credit ,
xlab="Response", ylab="Duration (months)")
Amount
# Amount
german_credit %>% ggplot(aes(x=amount)) +geom_freqpoly(aes( col=as.factor(response), y=..density..), size=1) +theme_classic()
boxplot(amount~response, data= german_credit ,
xlab="Response", ylab="Amount")
Age
# Age
boxplot(age~response, data= german_credit ,
xlab="Response", ylab="Age")
Number of existing credits at this bank
# n_credits
ggplot(german_credit, aes(x=as.factor(n_credits))) +
geom_bar(aes(fill = as.factor(response)), position = "fill") + xlab("Number of existing credits at this bank")
set.seed(12910001)
index <- sample(nrow(german_credit),nrow(german_credit)*0.80)
Gcredit.train = german_credit[index,]
Gcredit.test = german_credit[-index,]
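# Sanity check (a quick sketch, not part of the original split code): the class
# balance should be similar in the train and test splits
prop.table(table(Gcredit.train$response))
prop.table(table(Gcredit.test$response))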
Variable Selection
# Variable selection with a stepwise backward selection approach (AIC)
Gcredit.glm0<- glm(response~., family=binomial, data=Gcredit.train)
Gcredit.glm.back <- step(Gcredit.glm0, direction = "backward", trace=0)
summary(Gcredit.glm.back)
##
## Call:
## glm(formula = response ~ chk_acct + duration + credit_his + purpose +
## amount + saving_acct + installment_rate + sex + other_debtor +
## other_install + housing + telephone + foreign, family = binomial,
## data = Gcredit.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0431 -0.7226 -0.3721 0.7406 2.9342
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.9111069 0.7395852 1.232 0.217980
## chk_acctA12 -0.3822145 0.2378010 -1.607 0.107991
## chk_acctA13 -1.4639321 0.4364354 -3.354 0.000796 ***
## chk_acctA14 -1.8722838 0.2611382 -7.170 7.52e-13 ***
## duration 0.0341851 0.0100998 3.385 0.000713 ***
## credit_hisA31 0.1042617 0.5875220 0.177 0.859147
## credit_hisA32 -0.3082018 0.4467272 -0.690 0.490250
## credit_hisA33 -0.6228628 0.5130890 -1.214 0.224768
## credit_hisA34 -1.0864219 0.4713805 -2.305 0.021180 *
## purposeA41 -1.4276389 0.3978571 -3.588 0.000333 ***
## purposeA410 -1.4633820 0.8380412 -1.746 0.080777 .
## purposeA42 -0.6606897 0.2887698 -2.288 0.022141 *
## purposeA43 -0.8613000 0.2735366 -3.149 0.001640 **
## purposeA44 -0.1887449 0.7956242 -0.237 0.812479
## purposeA45 -0.9636915 0.7801460 -1.235 0.216730
## purposeA46 0.0869488 0.4347735 0.200 0.841491
## purposeA48 -1.2292891 1.2954250 -0.949 0.342648
## purposeA49 -0.7225533 0.3779706 -1.912 0.055919 .
## amount 0.0001214 0.0000475 2.556 0.010591 *
## saving_acctA62 -0.5178019 0.3206694 -1.615 0.106364
## saving_acctA63 -0.5238058 0.4175903 -1.254 0.209714
## saving_acctA64 -1.7922084 0.6117351 -2.930 0.003393 **
## saving_acctA65 -1.0241717 0.2890986 -3.543 0.000396 ***
## installment_rate 0.3956193 0.0970572 4.076 4.58e-05 ***
## sexA92 -0.2341140 0.4349979 -0.538 0.590442
## sexA93 -0.7829497 0.4253509 -1.841 0.065663 .
## sexA94 -0.2720061 0.5088238 -0.535 0.592942
## other_debtorA102 0.3031589 0.4516951 0.671 0.502120
## other_debtorA103 -1.0204477 0.4483066 -2.276 0.022832 *
## other_installA142 -0.0435035 0.4493194 -0.097 0.922869
## other_installA143 -0.9172839 0.2642300 -3.472 0.000517 ***
## housingA152 -0.5646517 0.2479668 -2.277 0.022779 *
## housingA153 -0.5467443 0.3688281 -1.482 0.138239
## telephoneA192 -0.4111375 0.2084277 -1.973 0.048545 *
## foreignA202 -1.3174234 0.6976330 -1.888 0.058970 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 985.71 on 799 degrees of freedom
## Residual deviance: 719.32 on 765 degrees of freedom
## AIC: 789.32
##
## Number of Fisher Scoring iterations: 5
Model
# Final model based on backward selection
Gcredit.glm.final.train <- glm(response ~ chk_acct + duration + credit_his + purpose +
amount + saving_acct + installment_rate + sex + other_debtor + other_install +
housing + telephone + foreign, family = binomial, data = Gcredit.train)
pred.glm0.train<- predict(Gcredit.glm.final.train, type="response")
Calculating a cutoff value of probability for the cost function. This is needed because the data is imbalanced and a false negative (classifying a bad credit as good) is costlier than a false positive.
# Calculating cutoff for probability using a cost function
costfunc = function(obs, pred.p, pcut){
weight1 = 5 # define the weight for "true=1 but pred=0" (FN)
weight0 = 1 # define the weight for "true=0 but pred=1" (FP)
c1 = (obs==1)&(pred.p<pcut) # count for "true=1 but pred=0" (FN)
c0 = (obs==0)&(pred.p>=pcut) # count for "true=0 but pred=1" (FP)
cost = mean(weight1*c1 + weight0*c0) # misclassification with weight
return(cost) # return the weighted misclassification cost
}
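# Toy example (hypothetical values) showing the 5:1 asymmetry: with pcut = 0.5,
# one missed bad credit (obs = 1, p = 0.10) out of five cases gives cost 5/5 = 1
toy_obs    <- c(1, 1, 0, 0, 0)
toy_pred.p <- c(0.10, 0.90, 0.40, 0.05, 0.20)
costfunc(obs = toy_obs, pred.p = toy_pred.p, pcut = 0.5)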
p.seq = seq(0.01, 1, 0.01)
cost = rep(0, length(p.seq))
for(i in 1:length(p.seq)){
cost[i] = costfunc(obs = Gcredit.train$response, pred.p = pred.glm0.train, pcut = p.seq[i])
}
plot(p.seq, cost)
optimal.pcut.glm0 = p.seq[which(cost==min(cost))]
# cutoff value(s) minimizing the cost; two cutoffs tie at the minimum here, so we use the first one below
optimal.pcut.glm0
## [1] 0.13 0.17
In Sample Prediction
# Checking the performance of the model on train data
pred_cutoff_train <- pred.glm0.train > optimal.pcut.glm0[1]
table(Gcredit.train$response, as.numeric(pred_cutoff_train),dnn = c("Actual", "Predicted"))
## Predicted
## Actual 0 1
## 0 284 271
## 1 24 221
# Symmetric Misclassification Rate (without using cost function)
MR <- mean(as.numeric(Gcredit.train$response)!=as.numeric(pred_cutoff_train))
MR
## [1] 0.36875
# Asymmetric Misclassification Rate (using cost function)
costfunc(obs = Gcredit.train$response, pred.p = as.numeric(pred_cutoff_train), pcut = 0.1667)
## [1] 0.48875
# We compute the in-sample cost using pcut = 0.1667, a cutoff suggested for this data set based on industry knowledge; the same cutoff is used for the out-of-sample evaluation below.
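# For reference, a quick comparison (illustrative) of the in-sample cost at the
# industry cutoff versus the cost-minimizing cutoff found above:
costfunc(obs = Gcredit.train$response, pred.p = pred.glm0.train, pcut = 0.1667)
costfunc(obs = Gcredit.train$response, pred.p = pred.glm0.train, pcut = optimal.pcut.glm0[1])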
Out of Sample Prediction
# Testing Model performance on test data
pred.glm0.test <- predict(Gcredit.glm.final.train, Gcredit.test, type = "response")
pred_cutoff_test <- pred.glm0.test > 0.1667
pred_cutoff_test <- as.numeric(pred_cutoff_test)
table(Gcredit.test$response, pred_cutoff_test,dnn = c("Actual", "Predicted"))
## Predicted
## Actual 0 1
## 0 78 67
## 1 10 45
# Symmetric Misclassification Rate (without using cost function)
test_data_MR_lor <- mean(as.numeric(Gcredit.test$response)!=as.numeric(pred_cutoff_test))
test_data_MR_lor
## [1] 0.385
# AMR
test_data_AMR_lor <- costfunc(Gcredit.test$response,pred_cutoff_test, optimal.pcut.glm0[1])
test_data_AMR_lor
## [1] 0.585
The confusion matrix for the test data looks reasonable and the asymmetric cost is acceptable. The out-of-sample MR is 0.385 and the AMR is 0.585.
ROC and AUC for Out of Sample Data
# ROC curve and area under the curve
pred_test <- prediction(pred.glm0.test, Gcredit.test$response)
perf_test <- performance(pred_test, "tpr", "fpr")
plot(perf_test, colorize=TRUE)
#Get the AUC
AUC_test_lor <- unlist(slot(performance(pred_test, "auc"), "y.values"))
AUC_test_lor
## [1] 0.7616301
The area under the curve (AUC) for the test data is 0.7616301, which is decent.
10 Fold Cross Validation for Full Model
# Performing cross-validation using the asymmetric cost function
# cost function using the pcutoff calculated earlier; cv.glm requires the signature function(obs, pred.p), so pcut is taken from the enclosing environment
costfunc = function(obs, pred.p){
weight1 = 5 # define the weight for "true=1 but pred=0" (FN)
weight0 = 1 # define the weight for "true=0 but pred=1" (FP)
c1 = (obs==1)&(pred.p<pcut) # count for "true=1 but pred=0" (FN)
c0 = (obs==0)&(pred.p>=pcut) # count for "true=0 but pred=1" (FP)
cost = mean(weight1*c1 + weight0*c0) # misclassification with weight
return(cost) # return the weighted misclassification cost
}
pcut <- optimal.pcut.glm0[1] # first of the tied optimal cutoffs
# applying final model on full data
# note: here you use all data and the final variables selected
Gcredit.glm.final.full <- glm(response ~ chk_acct + duration + credit_his + purpose +
amount + saving_acct + installment_rate + sex + other_debtor + other_install +
housing + telephone + foreign, family = binomial, data = german_credit)
# Performing 10-fold CV
cv.result = cv.glm(data=german_credit, glmfit=Gcredit.glm.final.full, cost=costfunc, K=10)
# here, costfunc automatically picks up the response variable from the full data
# and the fitted probabilities from Gcredit.glm.final.full.
# It then calculates the cost for each fold and averages those values:
# the asymmetric misclassification rate
cv.result$delta[2]
## [1] 0.5106
The 10-fold cross-validated asymmetric misclassification rate is 0.5106.
cv.result averages the output of the cost function across folds. We take cv.result$delta[2]: the first component of delta is the raw cross-validation estimate of prediction error, and the second component is the bias-adjusted estimate.
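To inspect both components side by side (a one-line sketch):
cv.result$delta  # c(raw estimate, bias-adjusted estimate)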
ROC and AUC of Full Model
fitted_prob_full_model<- predict(Gcredit.glm.final.full, type="response")
prediction_full <- prediction(fitted_prob_full_model, german_credit$response)
perf_full <- performance(prediction_full, "tpr", "fpr")
plot(perf_full, colorize=TRUE)
#Get the AUC
unlist(slot(performance(prediction_full, "auc"), "y.values"))
## [1] 0.8272429
The AUC is 0.8272429, which is quite good. Now let us see how a decision tree performs on this prediction task.
Growing a large tree by keeping cp very low
# The weights are specified in the loss parameter so that the comparison between the decision tree and the logistic regression model is fair.
Gcredit.rpart <- rpart(formula = response ~ ., data = Gcredit.train, method = "class",
parms = list(loss = matrix(c(0, 5, 1, 0), nrow = 2)), cp = 0.001)
prp(Gcredit.rpart, extra = 1)
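# For reference, the loss matrix passed to rpart above (rows = observed class,
# columns = predicted class); it mirrors the 5:1 FN:FP weighting of the logistic
# regression cost function:
matrix(c(0, 5, 1, 0), nrow = 2, dimnames = list(observed = c("0", "1"), predicted = c("0", "1")))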
Pruning the model
# pruning
plotcp(Gcredit.rpart)
# choosing the leftmost value of cp below the horizontal line
Gcredit.rpart1 <- prune(Gcredit.rpart, cp = 0.018)
prp(Gcredit.rpart1)
Cost Function
cost <- function(r, pi){
weight1 = 5
weight0 = 1
c1 = (r==1)&(pi==0) #logical vector - true if actual 1 but predict 0
c0 = (r==0)&(pi==1) #logical vector - true if actual 0 but predict 1
return(mean(weight1*c1+weight0*c0))
}
In Sample Performance
# For a binary classification problem, as with logistic regression, there are two types of predictions: the predicted class of the response (0 or 1) and the predicted probability of the response being 1. The argument type = "class" or type = "prob" selects between them.
Gcredit.train.pred.tree<- predict(Gcredit.rpart1, Gcredit.train, type="class")
table(Gcredit.train$response, Gcredit.train.pred.tree, dnn=c("Truth","Predicted"))
## Predicted
## Truth 0 1
## 0 275 280
## 1 14 231
# MR_train
MR_dt_train <- mean(as.numeric(Gcredit.train$response)!=Gcredit.train.pred.tree)
MR_dt_train
## [1] 0.3675
# AMR
AMR_dt_train <- cost(Gcredit.train$response,Gcredit.train.pred.tree)
AMR_dt_train
## [1] 0.4375
The in-sample MR is 0.3675 and the in-sample AMR is 0.4375.
Out of Sample Performance
Gcredit.test_prob <- predict(Gcredit.rpart1, Gcredit.test, type = "prob")
# Gcredit.test_prob has 2 columns: the first is P(Y = 0) and the second is P(Y = 1). Since the two columns sum to 1 for binary classification, we only need the second.
Gcredit.test_prob_perf_pcutoff <- as.numeric(Gcredit.test_prob[,2] > 0.1667)
table(Gcredit.test$response, Gcredit.test_prob_perf_pcutoff, dnn=c("Truth","Predicted"))
## Predicted
## Truth 0 1
## 0 64 81
## 1 11 44
# MR - out of sample
MR_dt_test <- mean(as.numeric(Gcredit.test$response)!=Gcredit.test_prob_perf_pcutoff)
MR_dt_test
## [1] 0.46
# AMR - out of sample
AMR_dt_test <- cost(Gcredit.test$response,Gcredit.test_prob_perf_pcutoff)
AMR_dt_test
## [1] 0.68
The test-sample MR is 0.46 and the test AMR is 0.68.
Out of Sample ROC and AUC for decision trees
pred_tree <- prediction(Gcredit.test_prob_perf_pcutoff, Gcredit.test$response)
perf_tree <- performance(pred_tree, "tpr", "fpr")
plot(perf_tree, colorize = TRUE)
AUC_test_dt <- slot(performance(pred_tree, "auc"), "y.values")[[1]]
AUC_test_dt
## [1] 0.6206897
The out-of-sample AUC value is 0.6206897.
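Note that this ROC is built from the hard 0/1 predictions, which yields a two-point curve. A sketch using the predicted probabilities instead (this generally gives a smoother curve and a different AUC):
pred_tree_prob <- prediction(Gcredit.test_prob[, 2], Gcredit.test$response)
perf_tree_prob <- performance(pred_tree_prob, "tpr", "fpr")
plot(perf_tree_prob, colorize = TRUE)
unlist(slot(performance(pred_tree_prob, "auc"), "y.values"))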
Clearly, logistic regression outperforms the decision tree on all three metrics (MR, AMR, and AUC).