Machine Learning Using Logistic Regression

Sameer Mathur

Credit Card Default Data

Logistic Regression

Model Building Using Logistic Regression

# changing the levels of target variable in training data set
trainingSet$default <- ifelse(trainingSet$default == "Yes",1,0)

# fit logistic regression model 
 logitModel <- glm(default ~ 
                          balance 
                        + income 
                        + student, 
                        data = trainingSet, 
                        family = binomial())
# summary of the logistic regression model 
summary(logitModel)

Model Summary


Call:
glm(formula = default ~ balance + income + student, family = binomial(), 
    data = trainingSet)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.4563  -0.1434  -0.0560  -0.0203   3.6795  

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept) -1.088e+01  5.560e-01 -19.576   <2e-16 ***
balance      5.708e-03  2.583e-04  22.095   <2e-16 ***
income       4.017e-06  9.319e-06   0.431   0.6664    
studentYes  -5.865e-01  2.638e-01  -2.223   0.0262 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2333.8  on 7999  degrees of freedom
Residual deviance: 1263.7  on 7996  degrees of freedom
AIC: 1271.7

Number of Fisher Scoring iterations: 8

Testing the Model

# predicting the test set observations
logitModelPred <- predict(logitModel, testSet, type = "response")

Plotting the Predicted Probabilities

# plot of probabilities
plot(logitModelPred, 
     main = "Scatterplot of Probabilities of Default (test data)", 
     xlab = "Customer ID", ylab = "Predicted Probability of Default")

plot of chunk test.png

Confusion Matrix

# setting the cut-off probablity
classify50 <- ifelse(logitModelPred > 0.5,"Yes","No")

# ordering the levels
classify50 <- ordered(classify50, levels = c("Yes", "No"))
testSet$default <- ordered(testSet$default, levels = c("Yes", "No"))

# confusion matrix
cm <- table(Predicted = classify50, Actual = testSet$default)
cm

         Actual
Predicted  Yes   No
      Yes   24    4
      No    43 1929

Machine Learning Metrics using Caret Package

library(caret)
confusionMatrix(cm)

Confusion Matrix and Statistics

         Actual
Predicted  Yes   No
      Yes   24    4
      No    43 1929

               Accuracy : 0.9765          
                 95% CI : (0.9689, 0.9827)
    No Information Rate : 0.9665          
    P-Value [Acc > NIR] : 0.005658        

                  Kappa : 0.4953          

 Mcnemar's Test P-Value : 2.976e-08       

            Sensitivity : 0.3582          
            Specificity : 0.9979          
         Pos Pred Value : 0.8571          
         Neg Pred Value : 0.9782          
             Prevalence : 0.0335          
         Detection Rate : 0.0120          
   Detection Prevalence : 0.0140          
      Balanced Accuracy : 0.6781          

       'Positive' Class : Yes

Measuring Machine Learning Metrics at different Cut-off Probabilities

library(caret)
# function to print confusion matrices for diffrent cut-off levels of probability
CmFn <- function(cutoff) {

       # predicting the test set results
         logitModelPred <- predict(logitModel, testSet, type = "response")
         C1 <- ifelse(logitModelPred > cutoff, "Yes", "No")
         C2 <- testSet$default
         predY   <- as.factor(C1)
         actualY <- as.factor(C2)

      predY <- ordered(predY, levels = c("Yes", "No"))
      actualY <- ordered(actualY, levels = c("Yes", "No"))

        # use the confusionMatrix from the caret package
        cm1 <-confusionMatrix(table(predY,actualY))
        # extracting accuracy
        Accuracy <- cm1$overall[1]
        # extracting sensitivity
          Sensitivity <- cm1$byClass[1]
        # extracting specificity
          Specificity <- cm1$byClass[2]
      # extracting value of kappa
          Kappa <- cm1$overall[2]

        # combined table
          tab <- cbind(Accuracy,Sensitivity,Specificity,Kappa)
        return(tab)}
     # making sequence of cut-off probabilities       
        cutoff1 <- seq( .1, .9, by = .05 )
     # loop using "lapply"
        tab2    <- lapply(cutoff1, CmFn)
     # extra coding for saving table as desired format
        tab3 <- rbind(tab2[[1]],tab2[[2]],tab2[[3]],tab2[[4]],tab2[[5]],tab2[[6]],tab2[[7]],
                 tab2[[8]],tab2[[9]],tab2[[10]],tab2[[11]],tab2[[12]],tab2[[13]],tab2[[14]],
                 tab2[[15]],tab2[[16]],tab2[[17]])

   cutoff Accuracy Senstivity Specificity      kappa
1    0.10   0.9340 0.73134328   0.9410243 0.39747762
2    0.15   0.9510 0.64179104   0.9617175 0.44369071
3    0.20   0.9605 0.58208955   0.9736161 0.47671723
4    0.25   0.9685 0.53731343   0.9834454 0.51703413
5    0.30   0.9705 0.52238806   0.9860321 0.52741778
6    0.35   0.9745 0.50746269   0.9906880 0.55850272
7    0.40   0.9755 0.46268657   0.9932747 0.54651464
8    0.45   0.9765 0.38805970   0.9968960 0.51474354
9    0.50   0.9765 0.35820896   0.9979307 0.49529659
10   0.55   0.9770 0.34328358   0.9989653 0.49072793
11   0.60   0.9755 0.29850746   0.9989653 0.44016635
12   0.65   0.9735 0.23880597   0.9989653 0.36749648
13   0.70   0.9725 0.19402985   0.9994827 0.31303240
14   0.75   0.9700 0.11940299   0.9994827 0.20421237
15   0.80   0.9695 0.10447761   0.9994827 0.18081220
16   0.85   0.9680 0.05970149   0.9994827 0.10695598
17   0.90   0.9680 0.04477612   1.0000000 0.08308142

It can be seen that accuracy is maximum at cut-off probability = 0.55

cut-off Probability = 0.10

Confusion Matrix and Statistics

         Actual
Predicted  Yes   No
      Yes   49  114
      No    18 1819

               Accuracy : 0.934           
                 95% CI : (0.9222, 0.9445)
    No Information Rate : 0.9665          
    P-Value [Acc > NIR] : 1               

                  Kappa : 0.3975          

 Mcnemar's Test P-Value : <2e-16          

            Sensitivity : 0.7313          
            Specificity : 0.9410          
         Pos Pred Value : 0.3006          
         Neg Pred Value : 0.9902          
             Prevalence : 0.0335          
         Detection Rate : 0.0245          
   Detection Prevalence : 0.0815          
      Balanced Accuracy : 0.8362          

       'Positive' Class : Yes

cut-off Probability = 0.50

Confusion Matrix and Statistics

         Actual
Predicted  Yes   No
      Yes   24    4
      No    43 1929

               Accuracy : 0.9765          
                 95% CI : (0.9689, 0.9827)
    No Information Rate : 0.9665          
    P-Value [Acc > NIR] : 0.005658        

                  Kappa : 0.4953          

 Mcnemar's Test P-Value : 2.976e-08       

            Sensitivity : 0.3582          
            Specificity : 0.9979          
         Pos Pred Value : 0.8571          
         Neg Pred Value : 0.9782          
             Prevalence : 0.0335          
         Detection Rate : 0.0120          
   Detection Prevalence : 0.0140          
      Balanced Accuracy : 0.6781          

       'Positive' Class : Yes

ROC Curve

# loading the package
library(ROCR)
PredLR <- predict(logitModel, testSet,type = "response")
lgPredObj <- prediction(PredLR[2],testSet$default)
lgPerfObj <- performance(lgPredObj, "tpr","fpr")
# plotting ROC curve
plot(lgPerfObj,main = "ROC Curve",col = 2,lwd = 2)
abline(a = 0,b = 1,lwd = 2,lty = 3,col = "black")

plot of chunk png

AUC (Area Under The Curve)

# area under curve
aucLR <- performance(lgPredObj, measure = "auc")
aucLR <- aucLR@y.values[[1]]
aucLR

[1] 0.9485604