library(caret)
# data partition
set.seed(2341)
trainIndex <- createDataPartition(df$default, p = 0.80, list = FALSE)
# 80% training data
train.df <- df[trainIndex, ]
# 20% testing data
test.df <- df[-trainIndex, ]
dim(train.df)
## [1] 8001    4
dim(test.df)
## [1] 1999    4
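# an optional sanity check: createDataPartition stratifies on the outcome,
# so the class proportions should be similar in both sets
prop.table(table(train.df$default))
prop.table(table(test.df$default))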
set.seed(2345)
# 10-fold cross-validation, as reported in the resampling summary below
trctrl <- trainControl(method = "cv", number = 10)
# fitting the decision tree classification model
# (note: without classProbs/twoClassSummary in trainControl, caret falls
# back to Accuracy even though metric = "ROC" is requested - see summary)
DTModel <- train(default ~ .,
                 data = train.df,
                 method = "rpart",
                 metric = "ROC",
                 parms = list(split = "information"),
                 trControl = trctrl)
# model summary
DTModel
## CART
##
## 8001 samples
## 3 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 7200, 7200, 7201, 7200, 7200, 7201, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.005617978 0.9703756 0.3888087
## 0.037453184 0.9711270 0.4398514
## 0.084269663 0.9680032 0.2068109
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.03745318.
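# caret keeps the resampling results in the train object; one way to inspect
# the tuning is to plot accuracy against cp and look at the selected value
plot(DTModel)
DTModel$bestTune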
# visualization of the fitted tree
library(rpart.plot)
prp(DTModel$finalModel, box.palette = "Reds", tweak = 1.2, varlen = 20)
# predicting class probabilities on the test set
PredDTModel <- predict(DTModel, test.df, type = "prob")
# plot of the predicted probabilities
plot(PredDTModel$Yes,
     main = "Scatterplot of Probabilities of Default (test data)",
     xlab = "Customer ID",
     ylab = "Predicted Probability of Default")
# taking the cut-off probability 50%
pred.DT <- ifelse(PredDTModel$Yes > 0.50, "Yes", "No")
# saving predicted vector as factor
Pred <- as.factor(pred.DT)
# ordering the factor levels so that "Yes" is treated as the positive class
Predicted <- ordered(Pred, levels = c("Yes", "No"))
Actual <- ordered(test.df$default, levels = c("Yes", "No"))
# making the confusion matrix
cm <- confusionMatrix(table(Predicted, Actual))
cm
## Confusion Matrix and Statistics
##
##          Actual
## Predicted  Yes   No
##       Yes   30   15
##       No    36 1918
##
## Accuracy : 0.9745
## 95% CI : (0.9666, 0.9809)
## No Information Rate : 0.967
## P-Value [Acc > NIR] : 0.031048
##
## Kappa : 0.5279
##
## Mcnemar's Test P-Value : 0.005101
##
## Sensitivity : 0.45455
## Specificity : 0.99224
## Pos Pred Value : 0.66667
## Neg Pred Value : 0.98158
## Prevalence : 0.03302
## Detection Rate : 0.01501
## Detection Prevalence : 0.02251
## Balanced Accuracy : 0.72339
##
## 'Positive' Class : Yes
##
library(dplyr)
# function to compute accuracy, sensitivity, specificity and kappa for different cut-off probabilities
CmFn <- function(cutoff) {
  # predicting class probabilities on the test set
  PredDTModel <- predict(DTModel, test.df, type = "prob")
  C1 <- ifelse(PredDTModel$Yes > cutoff, "Yes", "No")
  C2 <- test.df$default
  predY <- as.factor(C1)
  actualY <- as.factor(C2)
  Predicted <- ordered(predY, levels = c("Yes", "No"))
  Actual <- ordered(actualY, levels = c("Yes", "No"))
  # confusion matrix from the caret package
  cm1 <- confusionMatrix(data = Predicted, reference = Actual, positive = "Yes")
  # extracting accuracy
  Accuracy <- cm1$overall[1]
  # extracting sensitivity
  Sensitivity <- cm1$byClass[1]
  # extracting specificity
  Specificity <- cm1$byClass[2]
  # extracting kappa
  Kappa <- cm1$overall[2]
  # combining the metrics into a single row
  tab <- cbind(Accuracy, Sensitivity, Specificity, Kappa)
  return(tab)
}
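# a single call at the 50% cut-off should reproduce the metrics of the
# confusion matrix computed above
CmFn(0.50)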
# sequence of cut-off points
cutoff1 <- seq(0, 1, by = 0.05)
# applying the function to every cut-off with lapply
tab2 <- lapply(cutoff1, CmFn)
# creating a matrix to hold the metrics for every cut-off
numrows <- length(cutoff1)
pm <- matrix(NA_real_, nrow = numrows, ncol = 4)
# filling the matrix row by row
for (i in 1:numrows) {
  pm[i, ] <- tab2[[i]]
}
pm <- as.data.frame(pm)
pm <- cbind(cutoff1, pm)
pm <- rename(pm, cutoff = cutoff1, Accuracy = V1,
             Sensitivity = V2, Specificity = V3, Kappa = V4)
# printing the table
print(pm)
## cutoff Accuracy Sensitivity Specificity Kappa
## 1 0.00 0.03301651 1.0000000 0.0000000 0.0000000
## 2 0.05 0.92046023 0.8333333 0.9234351 0.3779228
## 3 0.10 0.92046023 0.8333333 0.9234351 0.3779228
## 4 0.15 0.92046023 0.8333333 0.9234351 0.3779228
## 5 0.20 0.97448724 0.4545455 0.9922400 0.5279024
## 6 0.25 0.97448724 0.4545455 0.9922400 0.5279024
## 7 0.30 0.97448724 0.4545455 0.9922400 0.5279024
## 8 0.35 0.97448724 0.4545455 0.9922400 0.5279024
## 9 0.40 0.97448724 0.4545455 0.9922400 0.5279024
## 10 0.45 0.97448724 0.4545455 0.9922400 0.5279024
## 11 0.50 0.97448724 0.4545455 0.9922400 0.5279024
## 12 0.55 0.97448724 0.4545455 0.9922400 0.5279024
## 13 0.60 0.97448724 0.4545455 0.9922400 0.5279024
## 14 0.65 0.96698349 0.0000000 1.0000000 0.0000000
## 15 0.70 0.96698349 0.0000000 1.0000000 0.0000000
## 16 0.75 0.96698349 0.0000000 1.0000000 0.0000000
## 17 0.80 0.96698349 0.0000000 1.0000000 0.0000000
## 18 0.85 0.96698349 0.0000000 1.0000000 0.0000000
## 19 0.90 0.96698349 0.0000000 1.0000000 0.0000000
## 20 0.95 0.96698349 0.0000000 1.0000000 0.0000000
## 21 1.00 0.96698349 0.0000000 1.0000000 0.0000000
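# one way to use this table is to pick the cut-off that maximises the
# criterion of interest, e.g. the first cut-off with the highest kappa
pm[which.max(pm$Kappa), ]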
# ROC curve using the ROCR package
library(ROCR)
DTPrediction <- predict(DTModel, test.df, type = "prob")
Prediction <- prediction(DTPrediction$Yes, test.df$default)
perf <- performance(Prediction, "tpr", "fpr")
# plotting the ROC curve with a diagonal reference line
plot(perf, main = "ROC Curve", col = 2, lwd = 2)
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")
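# the area under the ROC curve can also be extracted from the same
# prediction object via ROCR's performance()
auc <- performance(Prediction, measure = "auc")
auc@y.values[[1]]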