Random Forest

TESTING THE RANDOM FOREST MODEL

Predicting Model on Test Data Set

# predicting the model on test data set
PredRFModel <- predict(RFModel, 
                    newdata = test.df,type = "prob")

Plotting the Predicted Probabilities

# plot of probabilities
plot(PredRFModel$Yes, 
     main = "Scatterplot of Probabilities of default (test data)", 
     xlab = "Customer ID", 
     ylab = "Predicted Probability of default")

Confusion Matrix at 50% Cut-Off Probability

# taking the cut-off probability 50%
pred.PredRFModel <- ifelse(PredRFModel$Yes > 0.50, "Yes", "No")

# saving predicted vector as factor 
Pred <- as.factor(pred.PredRFModel)

# ordering the vectors
Predicted <- ordered(Pred, levels = c("Yes", "No"))
Actual <- ordered(test.df$default,levels = c("Yes", "No"))

# making confusion matrix
cm <-confusionMatrix(data = Predicted,reference = Actual, positive = "Yes")
cm

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  Yes   No
##        Yes   19    5
##        No    47 1928
##                                          
##                Accuracy : 0.974          
##                  95% CI : (0.966, 0.9805)
##     No Information Rate : 0.967          
##     P-Value [Acc > NIR] : 0.04176        
##                                          
##                   Kappa : 0.4119         
##                                          
##  Mcnemar's Test P-Value : 1.303e-08      
##                                          
##             Sensitivity : 0.287879       
##             Specificity : 0.997413       
##          Pos Pred Value : 0.791667       
##          Neg Pred Value : 0.976203       
##              Prevalence : 0.033017       
##          Detection Rate : 0.009505       
##    Detection Prevalence : 0.012006       
##       Balanced Accuracy : 0.642646       
##                                          
##        'Positive' Class : Yes            
##

Performance Metrics at different Cut-Off Probabilities

library(dplyr)

# function to print confusion matrices for diffrent cut-off levels of probability
CmFn <- function(cutoff) {
      
       # predicting the test set results
         PredRFModel <- predict(RFModel, test.df,type = "prob")
         C1 <- ifelse(PredRFModel$Yes > cutoff, "Yes", "No")
         C2 <- test.df$default
         predY   <- as.factor(C1)
           actualY <- as.factor(C2)
           Predicted <- ordered(predY, levels = c("Yes", "No"))
         Actual <- ordered(actualY,levels = c("Yes", "No")) 
     
        # use the confusionMatrix from the caret package
        cm1 <-confusionMatrix(data = Predicted,reference = Actual, positive = "Yes")
        # extracting accuracy
        Accuracy <- cm1$overall[1]
        # extracting sensitivity
          Sensitivity <- cm1$byClass[1]
        # extracting specificity
          Specificity <- cm1$byClass[2]
      # extracting value of kappa
          Kappa <- cm1$overall[2]
          
        # combined table
          tab <- cbind(Accuracy,Sensitivity,Specificity,Kappa)
        return(tab)}
     # sequence of cut-off points       
        cutoff1 <- seq( 0, 1, by = .05 )
     # loop using "lapply"
        tab2    <- lapply(cutoff1, CmFn)
     # creating matrix of different metrics
        numrows = length(cutoff1)
        pm <- matrix(1:numrows*4, nrow = numrows, ncol = 4)
     #  applying for loop
        for (i in 1:numrows){
          pm[i,] = tab2[[i]]}
        pm <- as.data.frame(pm)
        pm <- cbind(cutoff1, pm)
        pm <- rename(pm, cutoff =  cutoff1, Accuracy = V1, 
                       Senstivity = V2 ,Specificity = V3, kappa = V4)
    # printing the table
        print(pm)

##    cutoff  Accuracy Senstivity Specificity      kappa
## 1    0.00 0.8229115 0.86363636   0.8215210 0.19810577
## 2    0.05 0.9684842 0.59090909   0.9813761 0.53692652
## 3    0.10 0.9739870 0.53030303   0.9891361 0.56044755
## 4    0.15 0.9759880 0.53030303   0.9912054 0.58102857
## 5    0.20 0.9754877 0.50000000   0.9917227 0.56157768
## 6    0.25 0.9754877 0.50000000   0.9917227 0.56157768
## 7    0.30 0.9764882 0.46969697   0.9937920 0.55727446
## 8    0.35 0.9769885 0.43939394   0.9953440 0.54675670
## 9    0.40 0.9774887 0.42424242   0.9963787 0.54402141
## 10   0.45 0.9759880 0.36363636   0.9968960 0.48946495
## 11   0.50 0.9739870 0.28787879   0.9974133 0.41186588
## 12   0.55 0.9729865 0.22727273   0.9984480 0.34791591
## 13   0.60 0.9719860 0.16666667   0.9994827 0.27468284
## 14   0.65 0.9679840 0.04545455   0.9994827 0.08225133
## 15   0.70 0.9669835 0.00000000   1.0000000 0.00000000
## 16   0.75 0.9669835 0.00000000   1.0000000 0.00000000
## 17   0.80 0.9669835 0.00000000   1.0000000 0.00000000
## 18   0.85 0.9669835 0.00000000   1.0000000 0.00000000
## 19   0.90 0.9669835 0.00000000   1.0000000 0.00000000
## 20   0.95 0.9669835 0.00000000   1.0000000 0.00000000
## 21   1.00 0.9669835 0.00000000   1.0000000 0.00000000

ROC Curve using Package ROCR

# loading the package
library(ROCR)

## Warning: package 'ROCR' was built under R version 4.0.4

RFPrediction <- predict(RFModel, test.df,type = "prob")
Prediction <- prediction(RFPrediction[2],test.df$default)
performance <- performance(Prediction, "tpr","fpr")
# plotting ROC curve
plot(performance,main = "ROC Curve",col = 2,lwd = 2)
abline(a = 0,b = 1,lwd = 2,lty = 3,col = "black")

AUC (Area Under the Curve)

library(ROCR)
# area under curve
Prediction <- prediction(RFPrediction[2],test.df$default)
aucRF <- performance(Prediction, measure = "auc")
aucRF <- aucRF@y.values[[1]]
aucRF

## [1] 0.9013819

Random Forest

Sameer Mathur

LOADING DATA INTO R ENVIRONMENT

Reading the Data

Splitting data into training and test set

TRAINING THE RANDOM FOREST MODEL

Setting Control parameters

Running the Training Model

Variable Importance in Random Forest Model

TESTING THE RANDOM FOREST MODEL

Predicting Model on Test Data Set

Plotting the Predicted Probabilities

Confusion Matrix at 50% Cut-Off Probability

Performance Metrics at different Cut-Off Probabilities

ROC Curve using Package ROCR

AUC (Area Under the Curve)