library(caret)
# data partition
set.seed(2341)
trainIndex <- createDataPartition(df$default, p = 0.80, list = FALSE)
# 80% training data
train.df <- df[trainIndex, ]
# 20% testing data
test.df <- df[-trainIndex, ]
dim(train.df)## [1] 8001 4
## [1] 1999 4
set.seed(123)
# fitting random forest classification model
RFModel <- train(default ~ .,
data = train.df,
method = "rf",
nbagg = 50,
trControl = trctrl)
# model summary
RFModel## Random Forest
##
## 8001 samples
## 3 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: None
# plot of probabilities
plot(PredRFModel$Yes,
main = "Scatterplot of Probabilities of default (test data)",
xlab = "Customer ID",
ylab = "Predicted Probability of default")# taking the cut-off probability 50%
pred.PredRFModel <- ifelse(PredRFModel$Yes > 0.50, "Yes", "No")
# saving predicted vector as factor
Pred <- as.factor(pred.PredRFModel)
# ordering the vectors
Predicted <- ordered(Pred, levels = c("Yes", "No"))
Actual <- ordered(test.df$default,levels = c("Yes", "No"))
# making confusion matrix
cm <-confusionMatrix(data = Predicted,reference = Actual, positive = "Yes")
cm## Confusion Matrix and Statistics
##
## Reference
## Prediction Yes No
## Yes 19 5
## No 47 1928
##
## Accuracy : 0.974
## 95% CI : (0.966, 0.9805)
## No Information Rate : 0.967
## P-Value [Acc > NIR] : 0.04176
##
## Kappa : 0.4119
##
## Mcnemar's Test P-Value : 1.303e-08
##
## Sensitivity : 0.287879
## Specificity : 0.997413
## Pos Pred Value : 0.791667
## Neg Pred Value : 0.976203
## Prevalence : 0.033017
## Detection Rate : 0.009505
## Detection Prevalence : 0.012006
## Balanced Accuracy : 0.642646
##
## 'Positive' Class : Yes
##
library(dplyr)
# function to print confusion matrices for diffrent cut-off levels of probability
CmFn <- function(cutoff) {
# predicting the test set results
PredRFModel <- predict(RFModel, test.df,type = "prob")
C1 <- ifelse(PredRFModel$Yes > cutoff, "Yes", "No")
C2 <- test.df$default
predY <- as.factor(C1)
actualY <- as.factor(C2)
Predicted <- ordered(predY, levels = c("Yes", "No"))
Actual <- ordered(actualY,levels = c("Yes", "No"))
# use the confusionMatrix from the caret package
cm1 <-confusionMatrix(data = Predicted,reference = Actual, positive = "Yes")
# extracting accuracy
Accuracy <- cm1$overall[1]
# extracting sensitivity
Sensitivity <- cm1$byClass[1]
# extracting specificity
Specificity <- cm1$byClass[2]
# extracting value of kappa
Kappa <- cm1$overall[2]
# combined table
tab <- cbind(Accuracy,Sensitivity,Specificity,Kappa)
return(tab)}
# sequence of cut-off points
cutoff1 <- seq( 0, 1, by = .05 )
# loop using "lapply"
tab2 <- lapply(cutoff1, CmFn)
# creating matrix of different metrics
numrows = length(cutoff1)
pm <- matrix(1:numrows*4, nrow = numrows, ncol = 4)
# applying for loop
for (i in 1:numrows){
pm[i,] = tab2[[i]]}
pm <- as.data.frame(pm)
pm <- cbind(cutoff1, pm)
pm <- rename(pm, cutoff = cutoff1, Accuracy = V1,
Senstivity = V2 ,Specificity = V3, kappa = V4)
# printing the table
print(pm)## cutoff Accuracy Senstivity Specificity kappa
## 1 0.00 0.8229115 0.86363636 0.8215210 0.19810577
## 2 0.05 0.9684842 0.59090909 0.9813761 0.53692652
## 3 0.10 0.9739870 0.53030303 0.9891361 0.56044755
## 4 0.15 0.9759880 0.53030303 0.9912054 0.58102857
## 5 0.20 0.9754877 0.50000000 0.9917227 0.56157768
## 6 0.25 0.9754877 0.50000000 0.9917227 0.56157768
## 7 0.30 0.9764882 0.46969697 0.9937920 0.55727446
## 8 0.35 0.9769885 0.43939394 0.9953440 0.54675670
## 9 0.40 0.9774887 0.42424242 0.9963787 0.54402141
## 10 0.45 0.9759880 0.36363636 0.9968960 0.48946495
## 11 0.50 0.9739870 0.28787879 0.9974133 0.41186588
## 12 0.55 0.9729865 0.22727273 0.9984480 0.34791591
## 13 0.60 0.9719860 0.16666667 0.9994827 0.27468284
## 14 0.65 0.9679840 0.04545455 0.9994827 0.08225133
## 15 0.70 0.9669835 0.00000000 1.0000000 0.00000000
## 16 0.75 0.9669835 0.00000000 1.0000000 0.00000000
## 17 0.80 0.9669835 0.00000000 1.0000000 0.00000000
## 18 0.85 0.9669835 0.00000000 1.0000000 0.00000000
## 19 0.90 0.9669835 0.00000000 1.0000000 0.00000000
## 20 0.95 0.9669835 0.00000000 1.0000000 0.00000000
## 21 1.00 0.9669835 0.00000000 1.0000000 0.00000000
## Warning: package 'ROCR' was built under R version 4.0.4
RFPrediction <- predict(RFModel, test.df,type = "prob")
Prediction <- prediction(RFPrediction[2],test.df$default)
performance <- performance(Prediction, "tpr","fpr")
# plotting ROC curve
plot(performance,main = "ROC Curve",col = 2,lwd = 2)
abline(a = 0,b = 1,lwd = 2,lty = 3,col = "black")library(ROCR)
# area under curve
Prediction <- prediction(RFPrediction[2],test.df$default)
aucRF <- performance(Prediction, measure = "auc")
aucRF <- aucRF@y.values[[1]]
aucRF## [1] 0.9013819