Logistic Regression (HR Analytics)

LOADING DATA INTO R ENVIRONMENT
- Loading the Data
TRAINING THE LOGISTIC REGRESSION MODEL
TESTING THE LOGISTIC REGRESSION MODEL

LOADING DATA INTO R ENVIRONMENT

Loading the Data

# loading train data into R
load("TestHR.rda")
# loading test data into R
load("TrainHR.rda")

TRAINING THE LOGISTIC REGRESSION MODEL

Setting Control parameters

library(caret)
# control parameters
objControl <- trainControl(method = "boot", 
                           number = 2, 
                           returnResamp = 'none', 
                           summaryFunction = twoClassSummary, 
                           classProbs = TRUE,
                           savePredictions = TRUE)

Running the Training Model

set.seed(766)
# model building using caret package
 LRModel <- train(Status ~ ., 
                 data = trainHR.df,
                 method = 'glmStepAIC',
                 trControl = objControl,
                 metric = "ROC")
 
# summary of the model
summary(LRModel)

## 
## Call:
## NULL
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.46589  -0.70600  -0.50971  -0.00012   2.63371  
## 
## Coefficients:
##                                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                         8.431e-01  3.607e-01   2.338 0.019407 *  
## DOJExtendedYes                     -1.892e-01  6.766e-02  -2.797 0.005159 ** 
## NoticePeriod                        2.063e-02  1.494e-03  13.811  < 2e-16 ***
## OfferedBandE1                      -1.296e+00  2.151e-01  -6.024 1.70e-09 ***
## OfferedBandE2                      -1.191e+00  2.375e-01  -5.014 5.33e-07 ***
## OfferedBandE3                      -1.278e+00  3.085e-01  -4.143 3.43e-05 ***
## PercentDifferenceCTC               -3.701e-03  1.940e-03  -1.907 0.056458 .  
## JoiningBonusYes                     2.820e-01  1.592e-01   1.772 0.076461 .  
## CandidateRelocateActualYes         -1.728e+01  1.929e+02  -0.090 0.928604    
## GenderMale                          1.390e-01  8.857e-02   1.570 0.116514    
## CandidateSourceDirect              -3.699e-01  7.435e-02  -4.976 6.51e-07 ***
## `CandidateSourceEmployee Referral` -7.322e-01  1.091e-01  -6.710 1.94e-11 ***
## RexInYrs                            3.258e-02  2.271e-02   1.434 0.151474    
## LOBBFSI                            -5.039e-01  1.326e-01  -3.801 0.000144 ***
## LOBCSMP                            -3.545e-01  1.612e-01  -2.200 0.027810 *  
## LOBERS                             -3.746e-01  1.210e-01  -3.095 0.001968 ** 
## LOBETS                             -5.497e-01  1.551e-01  -3.544 0.000394 ***
## LOBHealthcare                      -4.087e-01  2.742e-01  -1.490 0.136121    
## LOBINFRA                           -9.337e-01  1.374e-01  -6.795 1.08e-11 ***
## LOBMMS                             -1.797e+01  1.900e+03  -0.009 0.992455    
## LocationBangalore                  -1.257e-01  8.242e-02  -1.525 0.127319    
## LocationHyderabad                  -2.850e-01  1.739e-01  -1.639 0.101142    
## LocationKolkata                    -5.256e-01  2.923e-01  -1.798 0.072135 .  
## LocationMumbai                     -3.761e-01  2.655e-01  -1.416 0.156637    
## LocationNoida                      -3.708e-01  9.112e-02  -4.070 4.71e-05 ***
## LocationOthers                     -1.635e+01  1.758e+03  -0.009 0.992579    
## Age                                -3.235e-02  1.035e-02  -3.126 0.001769 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6936.1  on 7196  degrees of freedom
## Residual deviance: 5987.9  on 7170  degrees of freedom
## AIC: 6041.9
## 
## Number of Fisher Scoring iterations: 17

Variable Importance in Logistic Regression Model

# plotting variable importance
plot(varImp(LRModel))

TESTING THE LOGISTIC REGRESSION MODEL

Predicting Test Set Results

# predicting the model on test data set
PredLR <- predict(LRModel, testHR.df,type = "prob")

Plotting the Predicted Plobalities

# plot of probabilities
plot(PredLR$NotJoined, 
     main = "Scatterplot of Probabilities of Renege (Yes / No) (test data)", 
     xlab = "Customer ID", 
     ylab = "Predicted Probability of Renege")

Confusion Matrix at 50% Cut-Off Probability

# choosing cut-off probability
pred.LR <- ifelse(PredLR$NotJoined > 0.50, "NotJoined", "Joined")
Predicted <- ordered(pred.LR, levels = c("NotJoined", "Joined"))
# actual and predicted data columns
Predicted <- as.factor(Predicted)
Actual <- ordered(testHR.df$Status,levels = c("NotJoined", "Joined"))
# making confusion matrix
cm <-confusionMatrix(data =Predicted,reference = Actual,
                      positive = "NotJoined")
cm

## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  NotJoined Joined
##   NotJoined        22     22
##   Joined          314   1440
##                                           
##                Accuracy : 0.8131          
##                  95% CI : (0.7943, 0.8309)
##     No Information Rate : 0.8131          
##     P-Value [Acc > NIR] : 0.5146          
##                                           
##                   Kappa : 0.0758          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.06548         
##             Specificity : 0.98495         
##          Pos Pred Value : 0.50000         
##          Neg Pred Value : 0.82098         
##              Prevalence : 0.18687         
##          Detection Rate : 0.01224         
##    Detection Prevalence : 0.02447         
##       Balanced Accuracy : 0.52521         
##                                           
##        'Positive' Class : NotJoined       
##

library(dplyr)
library(data.table)
# function to print confusion matrices for diffrent cut-off levels of probability
CmFn <- function(cutoff) {
       # predicting the test set results
         PredLR <- predict(LRModel, testHR.df, type = "prob")
         C1 <- ifelse(PredLR$NotJoined > cutoff, "NotJoined", "Joined")
         C2 <- testHR.df$Status
         predY   <- as.factor(C1)
           actualY <- as.factor(C2)
           Predicted <- ordered(predY, levels = c("NotJoined", "Joined"))
         Actual <- ordered(actualY,levels = c("NotJoined", "Joined"))
     
        # use the confusionMatrix from the caret package
        cm1 <-confusionMatrix(data = Predicted, reference = Actual, positive = "NotJoined")
        # extracting accuracy
        Accuracy <- cm1$overall[1]
        # extracting sensitivity
          Sensitivity <- cm1$byClass[1]
        # extracting specificity
          Specificity <- cm1$byClass[2]
      # extracting value of kappa
          Kappa <- cm1$overall[2]
        # combined table
          tab <- cbind(Accuracy,Sensitivity,Specificity,Kappa)
        return(tab)}
     # making sequence of cut-off probabilities       
        cutoff1 <- seq( 0, 1, by = .05 )
     # loop using "lapply"
        tab2    <- lapply(cutoff1, CmFn)
     # creating matrix of different metrics
        numrows = length(cutoff1)
        pm <- matrix(1:numrows*4, nrow = numrows, ncol=4)
     #  applying for loop
        for (i in 1:numrows){
          pm[i,] = tab2[[i]]}
        pm <- as.data.frame(pm)
        pm <- cbind(cutoff1, pm)
        pm <- setnames(pm, "cutoff1", "cutoff")
        pm <- setnames(pm, "V1", "Accuracy")
        pm <- setnames(pm, "V2", "Senstivity")
        pm <- setnames(pm, "V3", "Specificity")
        pm <- setnames(pm, "V4", "kappa")
    # printing the table
        print(pm)

##    cutoff  Accuracy  Senstivity Specificity      kappa
## 1    0.00 0.1868743 1.000000000   0.0000000 0.00000000
## 2    0.05 0.3398220 0.994047619   0.1894665 0.07789773
## 3    0.10 0.4238042 0.946428571   0.3036936 0.11654740
## 4    0.15 0.5361513 0.857142857   0.4623803 0.17310632
## 5    0.20 0.6501669 0.758928571   0.6251710 0.25019061
## 6    0.25 0.7246941 0.556547619   0.7633379 0.26096459
## 7    0.30 0.7686318 0.351190476   0.8645691 0.22082121
## 8    0.35 0.7992214 0.255952381   0.9240766 0.21414371
## 9    0.40 0.8097887 0.160714286   0.9589603 0.16051506
## 10   0.45 0.8081201 0.092261905   0.9726402 0.09321214
## 11   0.50 0.8131257 0.065476190   0.9849521 0.07579336
## 12   0.55 0.8136819 0.032738095   0.9931601 0.04053051
## 13   0.60 0.8142380 0.008928571   0.9993160 0.01330853
## 14   0.65 0.8136819 0.002976190   1.0000000 0.00483103
## 15   0.70 0.8131257 0.000000000   1.0000000 0.00000000
## 16   0.75 0.8131257 0.000000000   1.0000000 0.00000000
## 17   0.80 0.8131257 0.000000000   1.0000000 0.00000000
## 18   0.85 0.8131257 0.000000000   1.0000000 0.00000000
## 19   0.90 0.8131257 0.000000000   1.0000000 0.00000000
## 20   0.95 0.8131257 0.000000000   1.0000000 0.00000000
## 21   1.00 0.8131257 0.000000000   1.0000000 0.00000000

ROC Curve

# False Positive Rate
FPR <- 1-pm$Specificity 
# True positive Rate
TPR <- pm$Senstivity 
# plotting ROC curve
plot(FPR,TPR,main = "ROC Curve",col = 2,lwd = 2,type = "l",xlab = "False Positive Rate", ylab = "True positive Rate")
abline(a = 0,b = 1,lwd = 2,lty = 3,col = "black")

ROC Curve using Package ROCR

# loading the package
library(ROCR)

## Warning: package 'ROCR' was built under R version 4.0.4

Prediction <- prediction(PredLR$NotJoined,testHR.df$Status)
performance <- performance(Prediction, "tpr","fpr")
# plotting ROC curve
plot(performance,main = "ROC Curve",col = 2,lwd = 2)
abline(a = 0,b = 1,lwd = 2,lty = 3,col = "black")

AUC (Area Under the Curve)

library(ROCR)
# area under curve
Prediction <- prediction(PredLR$NotJoined,testHR.df$Status)
aucLR <- performance(Prediction, measure = "auc")
aucLR <- aucLR@y.values[[1]]
aucLR

## [1] 0.7425137

Logistic Regression (HR Analytics)

Sameer Mathur

LOADING DATA INTO R ENVIRONMENT

Loading the Data

TRAINING THE LOGISTIC REGRESSION MODEL

Setting Control parameters

Running the Training Model

Variable Importance in Logistic Regression Model

TESTING THE LOGISTIC REGRESSION MODEL

Predicting Test Set Results

Plotting the Predicted Plobalities

Confusion Matrix at 50% Cut-Off Probability

ROC Curve

ROC Curve using Package ROCR

AUC (Area Under the Curve)