library(caret)
# data partition
set.seed(2341)
trainIndex <- createDataPartition(df$default, p = 0.80, list = FALSE)
# 80% training data
train.df <- df[trainIndex, ]
# 20% testing data
test.df <- df[-trainIndex, ]
dim(train.df)
## [1] 8001 4
dim(test.df)
## [1] 1999 4
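Because createDataPartition performs a stratified split on the outcome, the share of defaulters should be nearly identical in the two sets. A quick sanity check (a minimal sketch, assuming the outcome column is `default` as above):
# class proportions in the training and test sets
prop.table(table(train.df$default))
prop.table(table(test.df$default))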
set.seed(2345)
# cross-validation control: 10-fold CV, as reported in the model summary below
# (without class probabilities/twoClassSummary, caret selects the model on Accuracy)
trctrl <- trainControl(method = "cv", number = 10)
# fitting decision tree classification model
DTModel <- train(default ~ .,
data = train.df,
method = "rpart",
metric = "ROC",
parms = list(split = "gini"),
trControl = trctrl)
# model summary
DTModel
## CART
##
## 8001 samples
## 3 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 7200, 7200, 7201, 7200, 7200, 7201, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.005617978 0.9693775 0.3755861
## 0.037453184 0.9698765 0.3736105
## 0.084269663 0.9673772 0.2717968
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.03745318.
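The chosen complexity parameter and the relative importance of each predictor can also be read straight off the fitted caret object; a minimal sketch:
# complexity parameter selected by cross-validation
DTModel$bestTune
# relative importance of the predictors in the final tree
varImp(DTModel)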
# visualization
library(rpart.plot)
prp(DTModel$finalModel, box.palette = "Reds", tweak = 1.2, varlen = 20)
# predicting class probabilities on the test data
PredDTModel <- predict(DTModel, test.df, type = "prob")
# plot of probabilities
plot(PredDTModel$Yes,
     main = "Scatterplot of Probabilities of default (test data)",
     xlab = "Customer ID",
     ylab = "Predicted Probability of default")
# taking the cut-off probability 50%
pred.DT <- ifelse(PredDTModel$Yes > 0.50, "Yes", "No")
# saving predicted vector as factor
Pred <- as.factor(pred.DT)
# ordering the vectors
Predicted <- ordered(Pred, levels = c("Yes", "No"))
Actual <- ordered(test.df$default, levels = c("Yes", "No"))
# making confusion matrix
cm <- confusionMatrix(table(Predicted, Actual))
cm
## Confusion Matrix and Statistics
##
## Actual
## Predicted Yes No
## Yes 20 3
## No 46 1930
##
## Accuracy : 0.9755
## 95% CI : (0.9677, 0.9818)
## No Information Rate : 0.967
## P-Value [Acc > NIR] : 0.01625
##
## Kappa : 0.4399
##
## Mcnemar's Test P-Value : 1.973e-09
##
## Sensitivity : 0.30303
## Specificity : 0.99845
## Pos Pred Value : 0.86957
## Neg Pred Value : 0.97672
## Prevalence : 0.03302
## Detection Rate : 0.01001
## Detection Prevalence : 0.01151
## Balanced Accuracy : 0.65074
##
## 'Positive' Class : Yes
##
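All of the statistics printed above are stored in the returned object, so they can be pulled out programmatically; a minimal sketch (the same extraction pattern the helper function below relies on):
# individual statistics from the confusionMatrix object
cm$overall["Accuracy"]
cm$overall["Kappa"]
cm$byClass["Sensitivity"]
cm$byClass["Specificity"]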
library(dplyr)
# function that returns accuracy, sensitivity, specificity and kappa for a given cut-off probability
CmFn <- function(cutoff) {
# predicting the test set results
PredDTModel <- predict(DTModel, test.df,type = "prob")
C1 <- ifelse(PredDTModel$Yes > cutoff, "Yes", "No")
C2 <- test.df$default
predY <- as.factor(C1)
actualY <- as.factor(C2)
Predicted <- ordered(predY, levels = c("Yes", "No"))
Actual <- ordered(actualY, levels = c("Yes", "No"))
# use the confusionMatrix from the caret package
cm1 <- confusionMatrix(data = Predicted, reference = Actual, positive = "Yes")
# extracting accuracy
Accuracy <- cm1$overall[1]
# extracting sensitivity
Sensitivity <- cm1$byClass[1]
# extracting specificity
Specificity <- cm1$byClass[2]
# extracting value of kappa
Kappa <- cm1$overall[2]
# combined table
tab <- cbind(Accuracy, Sensitivity, Specificity, Kappa)
return(tab)
}
# sequence of cut-off points
cutoff1 <- seq( 0, 1, by = .05 )
# loop using "lapply"
tab2 <- lapply(cutoff1, CmFn)
# creating matrix of different metrics
numrows <- length(cutoff1)
# empty matrix to hold the metrics, filled row by row in the loop below
pm <- matrix(0, nrow = numrows, ncol = 4)
# applying for loop
for (i in 1:numrows) {
  pm[i, ] <- tab2[[i]]
}
pm <- as.data.frame(pm)
pm <- cbind(cutoff1, pm)
pm <- rename(pm, cutoff = cutoff1, Accuracy = V1,
             Sensitivity = V2, Specificity = V3, kappa = V4)
# printing the table
print(pm)
## cutoff Accuracy Sensitivity Specificity kappa
## 1 0.00 0.03301651 1.0000000 0.0000000 0.0000000
## 2 0.05 0.97448724 0.5606061 0.9886187 0.5788745
## 3 0.10 0.97448724 0.5606061 0.9886187 0.5788745
## 4 0.15 0.97448724 0.5606061 0.9886187 0.5788745
## 5 0.20 0.97448724 0.5606061 0.9886187 0.5788745
## 6 0.25 0.97448724 0.5606061 0.9886187 0.5788745
## 7 0.30 0.97448724 0.5606061 0.9886187 0.5788745
## 8 0.35 0.97448724 0.5606061 0.9886187 0.5788745
## 9 0.40 0.97448724 0.5606061 0.9886187 0.5788745
## 10 0.45 0.97548774 0.3030303 0.9984480 0.4398799
## 11 0.50 0.97548774 0.3030303 0.9984480 0.4398799
## 12 0.55 0.97548774 0.3030303 0.9984480 0.4398799
## 13 0.60 0.97548774 0.3030303 0.9984480 0.4398799
## 14 0.65 0.97548774 0.3030303 0.9984480 0.4398799
## 15 0.70 0.97548774 0.3030303 0.9984480 0.4398799
## 16 0.75 0.96698349 0.0000000 1.0000000 0.0000000
## 17 0.80 0.96698349 0.0000000 1.0000000 0.0000000
## 18 0.85 0.96698349 0.0000000 1.0000000 0.0000000
## 19 0.90 0.96698349 0.0000000 1.0000000 0.0000000
## 20 0.95 0.96698349 0.0000000 1.0000000 0.0000000
## 21 1.00 0.96698349 0.0000000 1.0000000 0.0000000
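The table makes it easy to pick a cut-off by an explicit criterion rather than defaulting to 0.50; as one illustration (a minimal sketch over the `pm` data frame built above), the row with the highest kappa and the row that maximizes sensitivity + specificity - 1 (Youden's J):
# cut-off with the highest kappa
pm[which.max(pm$kappa), ]
# cut-off maximizing Youden's J (sensitivity + specificity - 1)
pm[which.max(pm$Sensitivity + pm$Specificity - 1), ]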
# ROC curve using the ROCR package
library(ROCR)
DTPrediction <- predict(DTModel, test.df, type = "prob")
Prediction <- prediction(DTPrediction$Yes, test.df$default)
perf <- performance(Prediction, "tpr", "fpr")
# plotting ROC curve
plot(perf, main = "ROC Curve", col = 2, lwd = 2)
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")
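A single-number summary of the curve is the area under it; a minimal sketch using ROCR's "auc" measure on the same prediction object:
# area under the ROC curve
auc <- performance(Prediction, "auc")
as.numeric(auc@y.values)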