set.seed(3333)
# fitting decision tree classification model
DTModel <- train(Default ~ CreditLimit + Gender + Education + MaritalStatus +
                   Age + BillOutstanding + LastPayment,
                 data = trainData.dt,
                 method = "rpart",
                 metric = "ROC",
                 parms = list(split = "gini"),
                 trControl = trctrl)
# model summary
DTModel
## CART
##
## 23681 samples
## 7 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (4 fold)
## Summary of sample sizes: 17761, 17761, 17761, 17760
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.002081756 0.7782188 0.05488130
## 0.002933384 0.7776698 0.05526216
## 0.002964926 0.7776698 0.05526216
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.002081756.
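# note: the summary above shows that Accuracy, not ROC, was used to select cp.
# caret only honours metric = "ROC" when the control object computes class
# probabilities and a ROC-based summary. A hedged sketch of such a control
# object (trctrl.roc is a hypothetical name, not the trctrl used above):
trctrl.roc <- trainControl(method = "cv", number = 4,
                           classProbs = TRUE,
                           summaryFunction = twoClassSummary)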
# visualization of the fitted tree
library(rpart.plot)
prp(DTModel$finalModel, box.palette = "Reds", tweak = 1.2, varlen = 20)
# predicting class probabilities on the test data
PredDTModel <- predict(DTModel, testData.dt, type = "prob")
# plot of probabilities
plot(PredDTModel$Yes,
     main = "Scatterplot of Probabilities of default (test data)",
     xlab = "Customer ID",
     ylab = "Predicted Probability of default")
# taking the cut-off probability 50%
pred.DT <- ifelse(PredDTModel$Yes > 0.50, "Yes", "No")
# saving predicted vector as factor
Pred <- as.factor(pred.DT)
# ordering the vectors
Predicted <- ordered(Pred, levels = c("Yes", "No"))
Actual <- ordered(testData.dt$Default,levels = c("Yes", "No"))
# making confusion matrix
cm <- confusionMatrix(table(Predicted, Actual))
cm
## Confusion Matrix and Statistics
##
## Actual
## Predicted Yes No
## Yes 53 40
## No 1268 4559
##
## Accuracy : 0.7791
## 95% CI : (0.7683, 0.7896)
## No Information Rate : 0.7769
## P-Value [Acc > NIR] : 0.3491
##
## Kappa : 0.047
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.040121
## Specificity : 0.991302
## Pos Pred Value : 0.569892
## Neg Pred Value : 0.782392
## Prevalence : 0.223142
## Detection Rate : 0.008953
## Detection Prevalence : 0.015709
## Balanced Accuracy : 0.515712
##
## 'Positive' Class : Yes
##
# taking the cut-off probability 55%
pred.DT <- ifelse(PredDTModel$Yes > 0.55, "Yes", "No")
# saving predicted vector as factor
Pred <- as.factor(pred.DT)
# ordering the vectors
Predicted <- ordered(Pred, levels = c("Yes", "No"))
Actual <- ordered(testData.dt$Default,levels = c("Yes", "No"))
# making confusion matrix
cm <- confusionMatrix(table(Predicted, Actual))
cm
## Confusion Matrix and Statistics
##
## Actual
## Predicted Yes No
## Yes 53 40
## No 1268 4559
##
## Accuracy : 0.7791
## 95% CI : (0.7683, 0.7896)
## No Information Rate : 0.7769
## P-Value [Acc > NIR] : 0.3491
##
## Kappa : 0.047
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.040121
## Specificity : 0.991302
## Pos Pred Value : 0.569892
## Neg Pred Value : 0.782392
## Prevalence : 0.223142
## Detection Rate : 0.008953
## Detection Prevalence : 0.015709
## Balanced Accuracy : 0.515712
##
## 'Positive' Class : Yes
##
# taking the cut-off probability 45%
pred.DT <- ifelse(PredDTModel$Yes > 0.45, "Yes", "No")
# saving predicted vector as factor
Pred <- as.factor(pred.DT)
# ordering the vectors
Predicted <- ordered(Pred, levels = c("Yes", "No"))
Actual <- ordered(testData.dt$Default,levels = c("Yes", "No"))
# making confusion matrix
cm <- confusionMatrix(table(Predicted, Actual))
cm
## Confusion Matrix and Statistics
##
## Actual
## Predicted Yes No
## Yes 145 143
## No 1176 4456
##
## Accuracy : 0.7772
## 95% CI : (0.7664, 0.7877)
## No Information Rate : 0.7769
## P-Value [Acc > NIR] : 0.4825
##
## Kappa : 0.1091
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.10977
## Specificity : 0.96891
## Pos Pred Value : 0.50347
## Neg Pred Value : 0.79119
## Prevalence : 0.22314
## Detection Rate : 0.02449
## Detection Prevalence : 0.04865
## Balanced Accuracy : 0.53934
##
## 'Positive' Class : Yes
##
library(dplyr)
# function that returns the key metrics for different cut-off levels of probability
CmFn <- function(cutoff) {
  # predicting the test set results
  PredDTModel <- predict(DTModel, testData.dt, type = "prob")
  C1 <- ifelse(PredDTModel$Yes > cutoff, "Yes", "No")
  C2 <- testData.dt$Default
  predY <- as.factor(C1)
  actualY <- as.factor(C2)
  Predicted <- ordered(predY, levels = c("Yes", "No"))
  Actual <- ordered(actualY, levels = c("Yes", "No"))
  # use confusionMatrix() from the caret package
  cm1 <- confusionMatrix(data = Predicted, reference = Actual, positive = "Yes")
  # extracting accuracy
  Accuracy <- cm1$overall[1]
  # extracting sensitivity
  Sensitivity <- cm1$byClass[1]
  # extracting specificity
  Specificity <- cm1$byClass[2]
  # extracting the value of kappa
  Kappa <- cm1$overall[2]
  # combining the metrics into one row
  tab <- cbind(Accuracy, Sensitivity, Specificity, Kappa)
  return(tab)
}
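# illustrative sanity check (not in the original run): the helper can be called
# for a single threshold before sweeping over the whole sequence below
CmFn(0.5)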
# sequence of cut-off points
cutoff1 <- seq(0, 1, by = 0.05)
# applying the function to every cut-off with "lapply"
tab2 <- lapply(cutoff1, CmFn)
# pre-allocating a matrix to hold the metrics
numrows <- length(cutoff1)
pm <- matrix(NA, nrow = numrows, ncol = 4)
# filling the matrix row by row
for (i in 1:numrows) {
  pm[i, ] <- tab2[[i]]
}
pm <- as.data.frame(pm)
pm <- cbind(cutoff1, pm)
pm <- rename(pm, cutoff = cutoff1, Accuracy = V1,
             Sensitivity = V2, Specificity = V3, kappa = V4)
# printing the table
print(pm)
## cutoff Accuracy Sensitivity Specificity kappa
## 1 0.00 0.2231419 1.00000000 0.0000000 0.00000000
## 2 0.05 0.2231419 1.00000000 0.0000000 0.00000000
## 3 0.10 0.2231419 1.00000000 0.0000000 0.00000000
## 4 0.15 0.2231419 1.00000000 0.0000000 0.00000000
## 5 0.20 0.7250000 0.28538986 0.8512720 0.14697089
## 6 0.25 0.7250000 0.28538986 0.8512720 0.14697089
## 7 0.30 0.7643581 0.16654050 0.9360731 0.13117140
## 8 0.35 0.7771959 0.14004542 0.9602087 0.13495012
## 9 0.40 0.7771959 0.10976533 0.9689063 0.10906688
## 10 0.45 0.7771959 0.10976533 0.9689063 0.10906688
## 11 0.50 0.7790541 0.04012112 0.9913025 0.04699149
## 12 0.55 0.7790541 0.04012112 0.9913025 0.04699149
## 13 0.60 0.7778716 0.02271007 0.9947815 0.02657536
## 14 0.65 0.7783784 0.00984103 0.9991302 0.01384019
## 15 0.70 0.7783784 0.00984103 0.9991302 0.01384019
## 16 0.75 0.7768581 0.00000000 1.0000000 0.00000000
## 17 0.80 0.7768581 0.00000000 1.0000000 0.00000000
## 18 0.85 0.7768581 0.00000000 1.0000000 0.00000000
## 19 0.90 0.7768581 0.00000000 1.0000000 0.00000000
## 20 0.95 0.7768581 0.00000000 1.0000000 0.00000000
## 21 1.00 0.7768581 0.00000000 1.0000000 0.00000000
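# illustrative extension (not part of the original output): one common way to pick
# a cut-off from the table above is Youden's J = Sensitivity + Specificity - 1
pm$YoudenJ <- pm$Sensitivity + pm$Specificity - 1
pm[which.max(pm$YoudenJ), ]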
plot(pm$cutoff, pm$Sensitivity, pch = 16,
     xlab = "Cut-off Probabilities", ylab = "ML Metrics",
     ylim = c(0, 1), xlim = c(0, 1), type = "l", lwd = 2, col = "red")
lines(pm$cutoff, pm$Specificity, col = "blue", lwd = 2)
lines(pm$cutoff, pm$Accuracy, col = "black")
legend(0.8, 0.3, legend = c("Sensitivity", "Specificity", "Accuracy"),
       col = c("red", "blue", "black"), cex = 0.8, lty = 1)
# False Positive Rate
FPR <- 1 - pm$Specificity
# True Positive Rate
TPR <- pm$Sensitivity
# plotting ROC curve
plot(FPR, TPR, main = "ROC Curve", col = 2, lwd = 2, type = "l",
     xlab = "False Positive Rate", ylab = "True Positive Rate")
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")
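# rough cross-check (a sketch, not part of the original output): approximating the
# area under this hand-built 21-point curve with the trapezoidal rule; it will be
# close to, but coarser than, the ROCR value computed below
ord <- order(FPR)
sum(diff(FPR[ord]) * (head(TPR[ord], -1) + tail(TPR[ord], -1)) / 2)
# loading the package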
library(ROCR)
DTPrediction <- predict(DTModel, testData.dt, type = "prob")
Prediction <- prediction(DTPrediction[2], testData.dt$Default)
performance <- performance(Prediction, "tpr", "fpr")
# plotting ROC curve
plot(performance, main = "ROC Curve", col = 2, lwd = 2)
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")
# area under curve
DTPrediction <- prediction(PredDTModel[2], testData.dt$Default)
aucDT <- performance(DTPrediction, measure = "auc")
aucDT <- aucDT@y.values[[1]]
aucDT
## [1] 0.5729193
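# hedged alternative (assumes the pROC package is installed, which the original
# analysis does not use): the same area under the curve via pROC; direction = "<"
# treats higher predicted probabilities as evidence for the 'Yes' class
library(pROC)
rocDT <- roc(response = testData.dt$Default, predictor = PredDTModel$Yes,
             levels = c("No", "Yes"), direction = "<")
auc(rocDT)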