Requirements

library(caret)
library(lattice)
library(ROCR)
library(e1071)
library(party)

Data

data("GermanCredit")
datGerCred <- GermanCredit
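
caret's GermanCredit contains 1,000 loans with a 700/300 Good/Bad class split; a quick optional check confirms what the models below have to work with:

dim(datGerCred)          # 1000 observations
table(datGerCred$Class)  # 700 Good / 300 Bad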

Variable binning

datGerCred$amt.fac <- as.factor(ifelse(datGerCred$Amount<=2500, "0-2500",
                                ifelse(datGerCred$Amount<=5000, "2501-5000",
                                       "5000+")))

datGerCred$age.fac <- as.factor(ifelse(datGerCred$Age<=30, "0-30",
                                ifelse(datGerCred$Age<=40, "31-40",
                                       "41+")))
# Duration is overwritten in place with its binned version, so the numeric
# original does not need to be dropped separately below
datGerCred$Duration <- as.factor(ifelse(datGerCred$Duration<=15, "0-15",
                                 ifelse(datGerCred$Duration<=30, "16-30",
                                 ifelse(datGerCred$Duration<=50, "31-50",
                                        "51-72"))))
datGerCred$default <- as.factor(ifelse(datGerCred$Class=="Bad", 
                                       "1", "0"))

Throughout, a default (Class == "Bad") is coded as 1 and good credit as 0, so prediction errors below are stated with respect to that coding.

Plots describing the data

# default rate by age band
mosaicplot(default~age.fac, col=TRUE, data=datGerCred)

# Good/Bad class share by age band
spineplot(Class~age.fac, data=datGerCred)

# loan amount against age, pooled and then split by default status
xyplot(Amount~Age, data=datGerCred)

xyplot(Amount~Age | default, data=datGerCred)
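
Since lattice is already loaded, a conditioned histogram is another quick way to compare the amount distribution of defaulters and non-defaulters (an optional extra, not part of the original set of plots):

histogram(~Amount | default, data=datGerCred, layout=c(2,1))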

# drop the raw Amount and Age columns (now represented by the binned factors)
# and the original Class label (now represented by default)
datBin <- datGerCred[,-c(2,5,10)]
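
Dropping by position works, but it breaks silently if the column order ever changes; an equivalent name-based version of the same step (a sketch, assuming the intent is to remove Amount, Age and Class) would be:

datBin <- datGerCred[, !(names(datGerCred) %in% c("Amount", "Age", "Class"))]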

Train/Test split

# unseeded 70/30 split: the exact rows selected (and the figures reported
# below) will vary from run to run
datSamp <- sort(sample(nrow(datBin), nrow(datBin)*0.7))
train <- datBin[datSamp,]
test <- datBin[-datSamp,]
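
Because sample() is not seeded, the split above is not reproducible. A seeded, class-stratified alternative using caret's createDataPartition (a sketch only; the seed value is arbitrary and this is not what produced the numbers reported below):

set.seed(42)                                                         # arbitrary seed
trainIdx <- createDataPartition(datBin$default, p=0.7, list=FALSE)
train.strat <- datBin[trainIdx,]                                     # stratified 70% training set
test.strat  <- datBin[-trainIdx,]                                    # remaining 30% hold-out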

Full logit for feature selection

m.log <- glm(default~., data=train, family=binomial)  # full logit on every remaining predictor
test$score <- predict(m.log, test, type="response")   # predicted P(default) for the hold-out set
pred.log <- prediction(test$score, test$default)
perf.log <- performance(pred.log, "tpr", "fpr")
plot(perf.log, col="light green")                     # ROC curve for the full logit
abline(0,1, lty=8, col="red")                         # 45 degree reference line

auc1 <- performance(pred.log, "auc")  # area under the ROC curve
plot(performance(pred.log, "acc"))    # accuracy as a function of the cutoff
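
The reduced model below keeps only terms that came out significant in this full logit; one way to list them (a sketch of that selection step, using a 0.05 threshold on the Wald p-values) is:

coefs <- summary(m.log)$coefficients         # estimate, std. error, z value, p-value
rownames(coefs)[coefs[, "Pr(>|z|)"] < 0.05]  # candidate terms for the reduced model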

# keep only the variables that were significant in the full logit above
m.log_redux <- glm(default~Duration+  
                   InstallmentRatePercentage+
                   ForeignWorker+ 
                   CheckingAccountStatus.0.to.200+ 
                   CheckingAccountStatus.gt.200+ 
                   CheckingAccountStatus.lt.0+ 
                   CreditHistory.NoCredit.AllPaid+ 
                   CreditHistory.ThisBank.AllPaid+ 
                   CreditHistory.Delay+SavingsAccountBonds.lt.100+
                   SavingsAccountBonds.100.to.500+ 
                   OtherDebtorsGuarantors.None+ 
                   OtherDebtorsGuarantors.CoApplicant, 
                   data=train, family=binomial)
test$score <- predict(m.log_redux, test, type="response")
pred.log2 <-prediction(test$score,test$default)
perf.log2 <- performance(pred.log2, "tpr", "fpr")
plot(perf.log2, col="pink", add=T)
abline(0, 1, lty=8, col="red")

auc2 <- performance(pred.log2, "auc") 
plot(performance(pred.log2, "acc")) 

## Area under the curve is 0.7687500
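
The AUC quoted above is stored inside the ROCR performance object rather than printed automatically; a short sketch to pull it out, together with the accuracy-maximising cutoff from the acc plot:

unlist(auc2@y.values)                        # AUC of the reduced logit (~0.77 here)
acc.log2 <- performance(pred.log2, "acc")
best <- which.max(unlist(acc.log2@y.values))
unlist(acc.log2@x.values)[best]              # cutoff that maximises accuracy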

Confusion matrix and stats

# 0/1 predictions at (roughly) a 0.5 cutoff via round(), cross-tabulated
# against the true labels; confusionMatrix() reads rows as predictions and
# columns as the reference
cmat.logit <- xtabs(~round(test$score)+test$default)
cmat.logit <- confusionMatrix(cmat.logit)
print(cmat.logit)
## Confusion Matrix and Statistics
## 
##                  test$default
## round(test$score)   0   1
##                 0 186  69
##                 1  14  31
##                                          
##                Accuracy : 0.7233         
##                  95% CI : (0.669, 0.7732)
##     No Information Rate : 0.6667         
##     P-Value [Acc > NIR] : 0.02045        
##                                          
##                   Kappa : 0.2783         
##  Mcnemar's Test P-Value : 3.08e-09       
##                                          
##             Sensitivity : 0.9300         
##             Specificity : 0.3100         
##          Pos Pred Value : 0.7294         
##          Neg Pred Value : 0.6889         
##              Prevalence : 0.6667         
##          Detection Rate : 0.6200         
##    Detection Prevalence : 0.8500         
##       Balanced Accuracy : 0.6200         
##                                          
##        'Positive' Class : 0              
## 
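
Note that caret treats the first factor level ("0", non-default) as the positive class, so Sensitivity above measures how well good borrowers are identified. To report the same statistics from the defaulter's point of view, the table can be rebuilt with positive = "1" (a variant check, not part of the original analysis):

confusionMatrix(xtabs(~round(test$score)+test$default), positive="1")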

Decision tree model

m.tree <-ctree(default~ Duration+InstallmentRatePercentage
               +ForeignWorker+CheckingAccountStatus.0.to.200
               +CheckingAccountStatus.gt.200
               +CheckingAccountStatus.lt.0
               +CreditHistory.NoCredit.AllPaid
               +CreditHistory.ThisBank.AllPaid+CreditHistory.Delay
               +SavingsAccountBonds.lt.100
               +SavingsAccountBonds.100.to.500
               +OtherDebtorsGuarantors.None
               +OtherDebtorsGuarantors.CoApplicant, data=train)
plot(m.tree)
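
The tree above is grown with party's default settings; if it comes out too large or too shallow, ctree_control() exposes the main complexity knobs (a sketch with arbitrary values, not tuned here):

m.tree.small <- ctree(default~., data=train,
                      controls=ctree_control(mincriterion=0.99,  # stricter split criterion
                                             minbucket=50))      # larger terminal nodes
plot(m.tree.small)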

# treeresponse() returns one vector of class probabilities per observation;
# stack them and keep column 2 = P(default = "1")
result.dat <- as.data.frame(do.call("rbind",
                                    treeresponse(m.tree, newdata=test)))
test$tscore <- result.dat[,2]
pred.tree <- prediction(test$tscore, test$default)
perf.tree <- performance(pred.tree, "tpr", "fpr")

Combined plot of logit model performance and tree model performance

plot(perf.log2, col='red', main='Logit vs Ctree')
plot(perf.tree, col='dark green', add=TRUE)
abline(0, 1, lty=8, col="grey")

auc3 <- performance(pred.tree, "auc")  # area under the ROC curve (tree)
plot(performance(pred.tree, "acc"))    # accuracy as a function of the cutoff

# Area under the curve is 0.6860000

Confusion matrix and stats

cmat.tree <- xtabs(~round(test$tscore)+test$default)
cmat.tree <- confusionMatrix(cmat.tree)
print(cmat.tree)
## Confusion Matrix and Statistics
## 
##                   test$default
## round(test$tscore)   0   1
##                  0 184  85
##                  1  16  15
##                                           
##                Accuracy : 0.6633          
##                  95% CI : (0.6068, 0.7166)
##     No Information Rate : 0.6667          
##     P-Value [Acc > NIR] : 0.5754          
##                                           
##                   Kappa : 0.0846          
##  Mcnemar's Test P-Value : 1.322e-11       
##                                           
##             Sensitivity : 0.9200          
##             Specificity : 0.1500          
##          Pos Pred Value : 0.6840          
##          Neg Pred Value : 0.4839          
##              Prevalence : 0.6667          
##          Detection Rate : 0.6133          
##    Detection Prevalence : 0.8967          
##       Balanced Accuracy : 0.5350          
##                                           
##        'Positive' Class : 0               
## 
# side-by-side fourfold displays of the two confusion matrices
par(mfrow=c(1,2))
fourfoldplot(cmat.tree$table, main="Decision Tree Model")
fourfoldplot(cmat.logit$table, main="Logit Model")
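
To accompany the plots with a numeric summary, the two AUCs can be pulled straight from the ROCR objects computed earlier:

data.frame(model=c("reduced logit", "ctree"),
           auc=c(unlist(auc2@y.values), unlist(auc3@y.values)))
par(mfrow=c(1,1))  # reset the plotting layout after the fourfold plots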