Requirements
library(caret)
library(lattice)
library(ROCR)
library(e1071)
library(party)
Data
data("GermanCredit")
datGerCred <- GermanCredit
Variable binning
datGerCred$amt.fac <- as.factor(ifelse(datGerCred$Amount <= 2500, "0-2500",
                                ifelse(datGerCred$Amount <= 5000, "2501-5000",
                                       "5000+")))
datGerCred$age.fac <- as.factor(ifelse(datGerCred$Age <= 30, "0-30",
                                ifelse(datGerCred$Age <= 40, "31-40",
                                       "40+")))
datGerCred$Duration <- as.factor(ifelse(datGerCred$Duration <= 15, "0-15",
                                 ifelse(datGerCred$Duration <= 30, "16-30",
                                 ifelse(datGerCred$Duration <= 50, "31-50",
                                        "51-72"))))
datGerCred$default <- as.factor(ifelse(datGerCred$Class == "Bad", "1", "0"))
Prediction error conditions when a default (bad credit) is coded as 1:
false positive: incorrectly predicting that an applicant has bad credit (a good applicant is flagged as a likely default)
false negative: incorrectly predicting that an applicant has good credit (an actual default slips through)
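As a toy illustration only (the pred and actual vectors below are made up, not drawn from the data), the two error counts can be tallied directly:
pred   <- c(1, 0, 1, 0, 1)          # hypothetical predicted classes, 1 = default
actual <- c(0, 0, 1, 1, 1)          # hypothetical true classes
fp <- sum(pred == 1 & actual == 0)  # flagged as a default, actually good credit
fn <- sum(pred == 0 & actual == 1)  # passed as good credit, actually a default
c(false.positives = fp, false.negatives = fn)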
Plots describing the data
mosaicplot(default~age.fac, col=T, data=datGerCred)
spineplot(Class~age.fac, data=datGerCred)
xyplot(Amount~Age, data=datGerCred)
xyplot(Amount~Age|default, data=datGerCred)
# drop the continuous Amount and Age (now binned as amt.fac/age.fac) and the original Class label (recoded as default)
datBin <- datGerCred[,-c(2,5,10)]
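The positional drop above assumes Amount, Age and Class sit in columns 2, 5 and 10 of caret's GermanCredit; a name-based drop achieves the same result without depending on column order:
datBin <- datGerCred[, !(names(datGerCred) %in% c("Amount", "Age", "Class"))]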
Train/Test split
datSamp <- sort(sample(nrow(datBin), nrow(datBin)*0.7))
train <- datBin[datSamp,]
test <- datBin[-datSamp,]
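This split is random and unstratified, so results will differ between runs. A sketch of a reproducible, class-stratified alternative using caret's createDataPartition (caret is already loaded; the seed value is arbitrary):
set.seed(42)                                                   # any fixed seed makes the split reproducible
inTrain <- createDataPartition(datBin$default, p=0.7, list=FALSE)
train <- datBin[inTrain,]
test  <- datBin[-inTrain,]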
Full logit model for feature selection
m.log <- glm(default ~ ., data=train, family=binomial)
test$score <- predict(m.log, test, type="response")  # predicted probability of default
pred.log <- prediction(test$score, test$default)
perf.log <- performance(pred.log, "tpr", "fpr")       # ROC curve
plot(perf.log, col="lightgreen")
abline(0, 1, lty=2, col="red")                        # 45-degree reference line
auc1 <- performance(pred.log, "auc")                  # area under the ROC curve
plot(performance(pred.log, "acc"))                    # accuracy vs. cutoff
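The reduced model below keeps only the predictors flagged as significant in this full logit. That screening step is not shown, but it would typically look something like this:
pvals <- coef(summary(m.log))[, "Pr(>|z|)"]  # Wald-test p-values from the full model
names(pvals)[pvals < 0.05]                   # candidate predictors (the intercept may appear here too)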
# include only the variables that were significant in the full logit
m.log_redux <- glm(default ~ Duration +
                     InstallmentRatePercentage +
                     ForeignWorker +
                     CheckingAccountStatus.0.to.200 +
                     CheckingAccountStatus.gt.200 +
                     CheckingAccountStatus.lt.0 +
                     CreditHistory.NoCredit.AllPaid +
                     CreditHistory.ThisBank.AllPaid +
                     CreditHistory.Delay +
                     SavingsAccountBonds.lt.100 +
                     SavingsAccountBonds.100.to.500 +
                     OtherDebtorsGuarantors.None +
                     OtherDebtorsGuarantors.CoApplicant,
                   data=train, family=binomial)
test$score <- predict(m.log_redux, test, type="response")
pred.log2 <- prediction(test$score, test$default)
perf.log2 <- performance(pred.log2, "tpr", "fpr")
plot(perf.log2, col="pink", add=TRUE)  # overlays on the currently open plot; re-plot perf.log first if needed
abline(0, 1, lty=2, col="red")
auc2 <- performance(pred.log2, "auc")
plot(performance(pred.log2, "acc"))    # accuracy vs. cutoff
## Area under the curve is 0.7687500
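The AUC quoted above is stored inside the ROCR performance object; it can be extracted, along with the accuracy-maximising cutoff, like this:
auc2@y.values[[1]]                        # numeric AUC for the reduced logit
acc.log2 <- performance(pred.log2, "acc")
best <- which.max(acc.log2@y.values[[1]])
acc.log2@x.values[[1]][best]              # cutoff that maximises accuracy
acc.log2@y.values[[1]][best]              # accuracy at that cutoff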
Confusion matrix and stats
cmat.logit <- xtabs(~round(test$score)+test$default)  # rows: predicted class at a 0.5 cutoff, columns: actual default
cmat.logit <- confusionMatrix(cmat.logit)
print(cmat.logit)
## Confusion Matrix and Statistics
##
##                  test$default
## round(test$score)   0   1
##                 0 186  69
##                 1  14  31
##
## Accuracy : 0.7233
## 95% CI : (0.669, 0.7732)
## No Information Rate : 0.6667
## P-Value [Acc > NIR] : 0.02045
##
## Kappa : 0.2783
## Mcnemar's Test P-Value : 3.08e-09
##
## Sensitivity : 0.9300
## Specificity : 0.3100
## Pos Pred Value : 0.7294
## Neg Pred Value : 0.6889
## Prevalence : 0.6667
## Detection Rate : 0.6200
## Detection Prevalence : 0.8500
## Balanced Accuracy : 0.6200
##
## 'Positive' Class : 0
##
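Note that confusionMatrix() takes the first factor level ("0", i.e. good credit) as the positive class, so the sensitivity and specificity above describe detection of good credit. If the statistics should refer to defaults instead, the positive class can be set explicitly:
confusionMatrix(cmat.logit$table, positive="1")  # recompute the statistics with default (1) as the positive class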
Decision tree model
m.tree <- ctree(default ~ Duration + InstallmentRatePercentage +
                  ForeignWorker + CheckingAccountStatus.0.to.200 +
                  CheckingAccountStatus.gt.200 +
                  CheckingAccountStatus.lt.0 +
                  CreditHistory.NoCredit.AllPaid +
                  CreditHistory.ThisBank.AllPaid + CreditHistory.Delay +
                  SavingsAccountBonds.lt.100 +
                  SavingsAccountBonds.100.to.500 +
                  OtherDebtorsGuarantors.None +
                  OtherDebtorsGuarantors.CoApplicant, data=train)
plot(m.tree)
# treeresponse() returns one class-probability vector per observation;
# column 2 of the row-bound result is P(default = 1)
result.dat <- as.data.frame(do.call("rbind", treeresponse(m.tree, newdata=test)))
test$tscore <- result.dat[,2]
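An equivalent way to obtain the same class-1 probabilities from a party ctree is predict() with type="prob", which some may find easier to read:
test$tscore <- sapply(predict(m.tree, newdata=test, type="prob"), "[", 2)  # second element is P(default = 1)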
pred.tree<-prediction(test$tscore,test$default)
perf.tree <- performance(pred.tree, "tpr", "fpr")
Combined ROC plot of the logit and decision tree models
plot(perf.log2, col="red", main="Logit vs Ctree")
plot(perf.tree, col="darkgreen", add=TRUE)
abline(0, 1, lty=2, col="grey")
auc3 <- performance(pred.tree, "auc")
plot(performance(pred.tree, "acc"))  # accuracy vs. cutoff
# Area under the curve is 0.6860000
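Rather than reading the two AUCs off the plots, they can be compared directly from the objects already created:
data.frame(model = c("logit (reduced)", "ctree"),
           auc   = c(auc2@y.values[[1]], auc3@y.values[[1]]))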
Confusion matrix and stats
cmat.tree <- xtabs(~round(test$tscore)+test$default)
cmat.tree <- confusionMatrix(cmat.tree)
print(cmat.tree)
## Confusion Matrix and Statistics
##
##                   test$default
## round(test$tscore)   0   1
##                  0 184  85
##                  1  16  15
##
## Accuracy : 0.6633
## 95% CI : (0.6068, 0.7166)
## No Information Rate : 0.6667
## P-Value [Acc > NIR] : 0.5754
##
## Kappa : 0.0846
## Mcnemar's Test P-Value : 1.322e-11
##
## Sensitivity : 0.9200
## Specificity : 0.1500
## Pos Pred Value : 0.6840
## Neg Pred Value : 0.4839
## Prevalence : 0.6667
## Detection Rate : 0.6133
## Detection Prevalence : 0.8967
## Balanced Accuracy : 0.5350
##
## 'Positive' Class : 0
##
par(mfrow=c(1,2))
fourfoldplot(cmat.tree$table, main="Decision Tree Model")
fourfoldplot(cmat.logit$table, main="Logit Model")
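Resetting the layout afterwards keeps the 1x2 split from leaking into any later plots:
par(mfrow=c(1,1))  # restore the default single-panel layout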