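For context, this section assumes a train/test split carried out earlier: the degrees of freedom in the model summary imply 8,000 training rows, and the confusion matrix implies 2,000 test rows, consistent with an 80/20 split of the Default data from the ISLR package. A minimal sketch of such a split with caTools::sample.split (the seed and object names are assumptions, not shown in the original):
# assumed setup: stratified 80/20 split of ISLR's Default data
library(ISLR)
library(caTools)
set.seed(123)   # arbitrary seed, for reproducibility
split <- sample.split(Default$default, SplitRatio = 0.8)
trainingSet <- subset(Default, split == TRUE)
testSet     <- subset(Default, split == FALSE)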
# recode the target variable in the training set: 1 = default, 0 = no default
trainingSet$default <- ifelse(trainingSet$default == "Yes", 1, 0)
# fit logistic regression model
logitModel <- glm(default ~ balance + income + student,
                  data = trainingSet,
                  family = binomial())
# summary of the logistic regression model
summary(logitModel)
##
## Call:
## glm(formula = default ~ balance + income + student, family = binomial(),
## data = trainingSet)
##
## Deviance Residuals:
##     Min       1Q   Median       3Q      Max
## -2.4563  -0.1434  -0.0560  -0.0203   3.6795
##
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.088e+01  5.560e-01 -19.576   <2e-16 ***
## balance      5.708e-03  2.583e-04  22.095   <2e-16 ***
## income       4.017e-06  9.319e-06   0.431   0.6664
## studentYes  -5.865e-01  2.638e-01  -2.223   0.0262 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2333.8 on 7999 degrees of freedom
## Residual deviance: 1263.7 on 7996 degrees of freedom
## AIC: 1271.7
##
## Number of Fisher Scoring iterations: 8
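Because these coefficients are on the log-odds scale, exponentiating them turns them into odds ratios, which are easier to interpret. A quick sketch using the fitted model above:
# odds ratios: multiplicative change in the odds of default per unit increase
exp(coef(logitModel))
# e.g. exp(5.708e-03) is roughly 1.0057, so each extra dollar of balance
# multiplies the odds of default by about 1.0057, holding the other predictors fixed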
# predict probabilities of default on the test set
logitModelPred <- predict(logitModel, testSet, type = "response")
# plot of probabilities
plot(logitModelPred,
     main = "Scatterplot of Probabilities of Default (test data)",
     xlab = "Customer ID", ylab = "Predicted Probability of Default")
# setting the cut-off probability
classify50 <- ifelse(logitModelPred > 0.5, "Yes", "No")
# order the levels so that "Yes" (default) is treated as the positive class
classify50 <- ordered(classify50, levels = c("Yes", "No"))
testSet$default <- ordered(testSet$default, levels = c("Yes", "No"))
# confusion matrix
cm <- table(Predicted = classify50, Actual = testSet$default)
cm
##          Actual
## Predicted  Yes   No
##       Yes   24    4
##       No    43 1929
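As a sanity check, the headline statistics reported by caret below can be recomputed by hand from this table:
# recompute the headline metrics directly from the confusion matrix
accuracy    <- (cm["Yes", "Yes"] + cm["No", "No"]) / sum(cm)  # (24 + 1929) / 2000 = 0.9765
sensitivity <- cm["Yes", "Yes"] / sum(cm[, "Yes"])            # 24 / 67 = 0.3582
specificity <- cm["No", "No"] / sum(cm[, "No"])               # 1929 / 1933 = 0.9979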
# detailed statistics with confusionMatrix() from the caret package
library(caret)
confusionMatrix(cm)
## Confusion Matrix and Statistics
##
##          Actual
## Predicted  Yes   No
##       Yes   24    4
##       No    43 1929
##
## Accuracy : 0.9765
## 95% CI : (0.9689, 0.9827)
## No Information Rate : 0.9665
## P-Value [Acc > NIR] : 0.005658
##
## Kappa : 0.4953
##
## Mcnemar's Test P-Value : 2.976e-08
##
## Sensitivity : 0.3582
## Specificity : 0.9979
## Pos Pred Value : 0.8571
## Neg Pred Value : 0.9782
## Prevalence : 0.0335
## Detection Rate : 0.0120
## Detection Prevalence : 0.0140
## Balanced Accuracy : 0.6781
##
## 'Positive' Class : Yes
##
# load dplyr for the rename() used below
library(dplyr)
# function returning confusion-matrix statistics for a given cut-off probability
CmFn <- function(cutoff) {
  # predict probabilities of default on the test set
  logitModelPred <- predict(logitModel, testSet, type = "response")
  C1 <- ifelse(logitModelPred > cutoff, "Yes", "No")
  C2 <- testSet$default
  predY   <- ordered(as.factor(C1), levels = c("Yes", "No"))
  actualY <- ordered(as.factor(C2), levels = c("Yes", "No"))
  # use confusionMatrix() from the caret package
  cm1 <- confusionMatrix(table(predY, actualY))
  # extract accuracy, sensitivity, specificity, and kappa
  Accuracy    <- cm1$overall[1]
  Sensitivity <- cm1$byClass[1]
  Specificity <- cm1$byClass[2]
  Kappa       <- cm1$overall[2]
  # combine into a one-row table
  tab <- cbind(Accuracy, Sensitivity, Specificity, Kappa)
  return(tab)
}
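As a quick check, calling the function at the 0.5 cut-off should reproduce the statistics from the confusion matrix above (Accuracy 0.9765, Sensitivity 0.3582, Specificity 0.9979):
CmFn(0.5)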
# sequence of cut-off probabilities
cutoff1 <- seq(0.1, 0.9, by = 0.05)
# apply the function to each cut-off with lapply
tab2 <- lapply(cutoff1, CmFn)
# combine the list of one-row tables into a single matrix
tab3 <- do.call(rbind, tab2)
# assemble the final table and rename the columns
tab4 <- as.data.frame(tab3)
tab5 <- cbind(cutoff1, tab4$Accuracy, tab4$Sensitivity, tab4$Specificity, tab4$Kappa)
tab6 <- as.data.frame(tab5)
tab7 <- rename(tab6, cutoff = cutoff1, Accuracy = V2,
               Sensitivity = V3, Specificity = V4, kappa = V5)
tab7
## cutoff Accuracy Sensitivity Specificity kappa
## 1 0.10 0.9340 0.73134328 0.9410243 0.39747762
## 2 0.15 0.9510 0.64179104 0.9617175 0.44369071
## 3 0.20 0.9605 0.58208955 0.9736161 0.47671723
## 4 0.25 0.9685 0.53731343 0.9834454 0.51703413
## 5 0.30 0.9705 0.52238806 0.9860321 0.52741778
## 6 0.35 0.9745 0.50746269 0.9906880 0.55850272
## 7 0.40 0.9755 0.46268657 0.9932747 0.54651464
## 8 0.45 0.9765 0.38805970 0.9968960 0.51474354
## 9 0.50 0.9765 0.35820896 0.9979307 0.49529659
## 10 0.55 0.9770 0.34328358 0.9989653 0.49072793
## 11 0.60 0.9755 0.29850746 0.9989653 0.44016635
## 12 0.65 0.9735 0.23880597 0.9989653 0.36749648
## 13 0.70 0.9725 0.19402985 0.9994827 0.31303240
## 14 0.75 0.9700 0.11940299 0.9994827 0.20421237
## 15 0.80 0.9695 0.10447761 0.9994827 0.18081220
## 16 0.85 0.9680 0.05970149 0.9994827 0.10695598
## 17 0.90 0.9680 0.04477612 1.0000000 0.08308142
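The trade-off is easier to see in a picture than in the raw table. A minimal base-graphics sketch, using the column names of tab7 above:
# plot accuracy, sensitivity, and specificity against the cut-off
matplot(tab7$cutoff, tab7[, c("Accuracy", "Sensitivity", "Specificity")],
        type = "l", lty = 1, lwd = 2, col = c("black", "red", "blue"),
        xlab = "Cut-off probability", ylab = "Metric value")
legend("right", legend = c("Accuracy", "Sensitivity", "Specificity"),
       lty = 1, lwd = 2, col = c("black", "red", "blue"))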
# load ROCR for the ROC curve and AUC
library(ROCR)
PredLR <- predict(logitModel, testSet, type = "response")
# testSet$default is ordered with levels c("Yes", "No"), so ROCR treats the
# higher level ("No") as positive; passing 1 - PredLR (the probability of "No")
# keeps the curve oriented correctly
lgPredObj <- prediction((1 - PredLR), testSet$default)
lgPerfObj <- performance(lgPredObj, "tpr", "fpr")
# plotting ROC curve with the 45-degree reference line
plot(lgPerfObj, main = "ROC Curve", col = 2, lwd = 2)
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")
# area under the curve
aucLR <- performance(lgPredObj, measure = "auc")
aucLR <- aucLR@y.values[[1]]
aucLR
## [1] 0.9485604
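A common follow-up is to read a cut-off straight off the ROC curve. A sketch using Youden's J statistic (TPR - FPR), not part of the original analysis:
# find the cut-off maximizing Youden's J along the ROC curve
tpr  <- lgPerfObj@y.values[[1]]
fpr  <- lgPerfObj@x.values[[1]]
cuts <- lgPerfObj@alpha.values[[1]]
best <- which.max(tpr - fpr)
# cuts is on the (1 - PredLR) scale, so convert back to P(default = "Yes")
1 - cuts[best]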