Logistic Regression (Using Caret Package)

MODEL BUILDING

# model building using caret package
set.seed(766)
caretLogitModel <- train(trainingSet[, 2:4],
                      trainingSet[, 1],
                      method = 'glm',
                      trControl = objControl,
                      metric = "ROC")

summary(caretLogitModel)

## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4563  -0.1434  -0.0560  -0.0203   3.6795  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.088e+01  5.560e-01 -19.576   <2e-16 ***
## studentYes  -5.865e-01  2.638e-01  -2.223   0.0262 *  
## balance      5.708e-03  2.583e-04  22.095   <2e-16 ***
## income       4.017e-06  9.319e-06   0.431   0.6664    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2333.8  on 7999  degrees of freedom
## Residual deviance: 1263.7  on 7996  degrees of freedom
## AIC: 1271.7
## 
## Number of Fisher Scoring iterations: 8

TESTING THE LOGISTIC REGRESSION MODEL

Predicting Test Set Results

# predicting the test set observations
logitModelPred <- predict(caretLogitModel, testSet, type = "prob")
logitModelPred <- logitModelPred[,2]

Plotting the Predicted Plobalities

# plot of probabilities
plot(logitModelPred, 
     main = "Scatterplot of Probabilities of Default (test data)", 
     xlab = "Customer ID", ylab = "Predicted Probability of Default")

Confusion Matrix at 50% Cut-Off Probability

# setting the cut-off probablity
classify50 <- ifelse(logitModelPred > 0.5,"Yes","No")

# ordering the levels
classify50 <- ordered(classify50, levels = c("Yes", "No"))
testSet$default <- ordered(testSet$default, levels = c("Yes", "No"))

# confusion matrix
cm <- table(Predicted = classify50, Actual = testSet$default)
cm

##          Actual
## Predicted  Yes   No
##       Yes   24    4
##       No    43 1929

Machine Learning Metrics using Caret Package

confusionMatrix(cm)

## Confusion Matrix and Statistics
## 
##          Actual
## Predicted  Yes   No
##       Yes   24    4
##       No    43 1929
##                                           
##                Accuracy : 0.9765          
##                  95% CI : (0.9689, 0.9827)
##     No Information Rate : 0.9665          
##     P-Value [Acc > NIR] : 0.005658        
##                                           
##                   Kappa : 0.4953          
##                                           
##  Mcnemar's Test P-Value : 2.976e-08       
##                                           
##             Sensitivity : 0.3582          
##             Specificity : 0.9979          
##          Pos Pred Value : 0.8571          
##          Neg Pred Value : 0.9782          
##              Prevalence : 0.0335          
##          Detection Rate : 0.0120          
##    Detection Prevalence : 0.0140          
##       Balanced Accuracy : 0.6781          
##                                           
##        'Positive' Class : Yes             
##

Measuring Machine Learning Metrics at different Cut-off Probabilities

library(caret)
library(dplyr)
# function to print confusion matrices for diffrent cut-off levels of probability
CmFn <- function(cutoff) {

       # predicting the test set results
         logitModelPred <- predict(caretLogitModel, testSet, type = "prob")
         C1 <- ifelse(logitModelPred[,2] > cutoff, "Yes", "No")
         C2 <- testSet$default
         predY   <- as.factor(C1)
         actualY <- as.factor(C2)
         
      predY <- ordered(predY, levels = c("Yes", "No"))
      actualY <- ordered(actualY, levels = c("Yes", "No"))

        # use the confusionMatrix from the caret package
        cm1 <-confusionMatrix(table(predY,actualY))
        # extracting accuracy
        Accuracy <- cm1$overall[1]
        # extracting sensitivity
          Sensitivity <- cm1$byClass[1]
        # extracting specificity
          Specificity <- cm1$byClass[2]
      # extracting value of kappa
          Kappa <- cm1$overall[2]

        # combined table
          tab <- cbind(Accuracy,Sensitivity,Specificity,Kappa)
        return(tab)}
     # making sequence of cut-off probabilities       
        cutoff1 <- seq( .1, .9, by = .05 )
     # loop using "lapply"
        tab2    <- lapply(cutoff1, CmFn)
     # extra coding for saving table as desired format
        tab3 <- rbind(tab2[[1]],tab2[[2]],tab2[[3]],tab2[[4]],tab2[[5]],tab2[[6]],tab2[[7]],
                 tab2[[8]],tab2[[9]],tab2[[10]],tab2[[11]],tab2[[12]],tab2[[13]],tab2[[14]],
                 tab2[[15]],tab2[[16]],tab2[[17]])
        # printing the table
        tab4 <- as.data.frame(tab3)
        tab5 <- cbind(cutoff1,tab4$Accuracy,tab4$Sensitivity,tab4$Specificity,tab4$Kappa)
        tab6 <- as.data.frame(tab5)
        tab7 <- rename(tab6,cutoff =  cutoff1, Accuracy = V2 , 
                       Senstivity = V3 ,Specificity =  V4 ,kappa = V5)
        tab7

##    cutoff Accuracy Senstivity Specificity      kappa
## 1    0.10   0.9340 0.73134328   0.9410243 0.39747762
## 2    0.15   0.9510 0.64179104   0.9617175 0.44369071
## 3    0.20   0.9605 0.58208955   0.9736161 0.47671723
## 4    0.25   0.9685 0.53731343   0.9834454 0.51703413
## 5    0.30   0.9705 0.52238806   0.9860321 0.52741778
## 6    0.35   0.9745 0.50746269   0.9906880 0.55850272
## 7    0.40   0.9755 0.46268657   0.9932747 0.54651464
## 8    0.45   0.9765 0.38805970   0.9968960 0.51474354
## 9    0.50   0.9765 0.35820896   0.9979307 0.49529659
## 10   0.55   0.9770 0.34328358   0.9989653 0.49072793
## 11   0.60   0.9755 0.29850746   0.9989653 0.44016635
## 12   0.65   0.9735 0.23880597   0.9989653 0.36749648
## 13   0.70   0.9725 0.19402985   0.9994827 0.31303240
## 14   0.75   0.9700 0.11940299   0.9994827 0.20421237
## 15   0.80   0.9695 0.10447761   0.9994827 0.18081220
## 16   0.85   0.9680 0.05970149   0.9994827 0.10695598
## 17   0.90   0.9680 0.04477612   1.0000000 0.08308142

ROC Curve

# loading the package
library(ROCR)

## Warning: package 'ROCR' was built under R version 4.0.4

PredLR <- predict(caretLogitModel, testSet,type = "prob")
lgPredObj <- prediction((1-PredLR[,2]),testSet$default)
lgPerfObj <- performance(lgPredObj, "tpr","fpr")
# plotting ROC curve
plot(lgPerfObj,main = "ROC Curve",col = 2,lwd = 2)
abline(a = 0,b = 1,lwd = 2,lty = 3,col = "black")

AUC (Area Under the Curve)

library(ROCR)
# area under curve
aucLR <- performance(lgPredObj, measure = "auc")
aucLR <- aucLR@y.values[[1]]
aucLR

## [1] 0.9485604

Logistic Regression (Using Caret Package)

Sameer Mathur

LOADING DATA INTO R ENVIRONMENT

Loading the Data

Splitting the Data into training set and test set

TRAINING THE LOGISTIC REGRESSION MODEL USING `caret` PACKAGE

Setting Control parameters

MODEL BUILDING

TESTING THE LOGISTIC REGRESSION MODEL

Predicting Test Set Results

Plotting the Predicted Plobalities

Confusion Matrix at 50% Cut-Off Probability

Machine Learning Metrics using Caret Package

Measuring Machine Learning Metrics at different Cut-off Probabilities

ROC Curve

AUC (Area Under the Curve)

Logistic Regression (Using Caret Package)

Sameer Mathur

LOADING DATA INTO R ENVIRONMENT

Loading the Data

Splitting the Data into training set and test set

TRAINING THE LOGISTIC REGRESSION MODEL USING caret PACKAGE

Setting Control parameters

MODEL BUILDING

TESTING THE LOGISTIC REGRESSION MODEL

Predicting Test Set Results

Plotting the Predicted Plobalities

Confusion Matrix at 50% Cut-Off Probability

Machine Learning Metrics using Caret Package

Measuring Machine Learning Metrics at different Cut-off Probabilities

ROC Curve

AUC (Area Under the Curve)

TRAINING THE LOGISTIC REGRESSION MODEL USING `caret` PACKAGE