Sameer Mathur
Comparing Classifiers (Binomial Logistic Classifier and Decision Tree) Using the caret Package
---
# read data (stringsAsFactors = TRUE keeps character columns as factors on R >= 4.0)
renegeHR.df <- read.csv("CleanHRDataV2.csv", stringsAsFactors = TRUE)
# attach data columns (optional here, since later code references columns with $)
attach(renegeHR.df)
# dimension of the dataset
dim(renegeHR.df)
[1] 8995 16
# structure of the data table
str(renegeHR.df)
'data.frame': 8995 obs. of 16 variables:
$ DOJExtend : Factor w/ 2 levels "No","Yes": 2 1 1 1 2 2 2 2 1 1 ...
$ DurToAcptOffer: int 14 18 3 26 1 17 37 16 1 6 ...
$ NoticePeriod : int 30 30 45 30 120 30 30 0 30 30 ...
$ Band : Factor w/ 4 levels "E0","E1","E2",..: 3 3 3 3 3 2 3 2 2 2 ...
$ CTCHikeExp : num -20.8 50 42.8 42.8 42.6 ...
$ CTCHikeOffered: num 13.2 320 42.8 42.8 42.6 ...
$ CTCDiff : num 42.9 180 0 0 0 ...
$ JoiningBonus : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
$ Relocate : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 1 1 1 1 ...
$ Gender : Factor w/ 2 levels "Female","Male": 1 2 2 2 2 2 2 1 1 2 ...
$ CandidateSrc : Factor w/ 3 levels "Agency","Direct",..: 1 3 1 3 3 3 3 2 3 3 ...
$ RexInYrs : int 7 8 4 4 6 2 7 8 3 3 ...
$ LOB : Factor w/ 9 levels "AXON","BFSI",..: 5 8 8 8 8 8 8 7 2 3 ...
$ Location : Factor w/ 11 levels "Ahmedabad","Bangalore",..: 9 3 9 9 9 9 9 9 5 3 ...
$ Age : int 34 34 27 34 34 34 32 34 26 34 ...
$ Status : Factor w/ 2 levels "Joined","NotJoined": 1 1 1 1 1 1 1 1 1 1 ...
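Before modeling, it is worth confirming that the cleaned file really contains no missing values, since glm() silently drops incomplete rows. A quick check (not part of the original output):
# count missing values per column; all zeros are expected for the cleaned file
colSums(is.na(renegeHR.df))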
# descriptive statistics of the data frame (first five columns of describe output)
library(psych)
describe(renegeHR.df)[, c(1:5)]
vars n mean sd median
DOJExtend* 1 8995 1.47 0.50 1
DurToAcptOffer 2 8995 21.43 25.81 10
NoticePeriod 3 8995 39.29 22.22 30
Band* 4 8995 2.39 0.63 2
CTCHikeExp 5 8995 43.86 29.79 40
CTCHikeOffered 6 8995 40.66 36.06 36
CTCDiff 7 8995 -1.57 19.61 0
JoiningBonus* 8 8995 1.05 0.21 1
Relocate* 9 8995 1.14 0.35 1
Gender* 10 8995 1.83 0.38 2
CandidateSrc* 11 8995 1.89 0.67 2
RexInYrs 12 8995 4.24 2.55 4
LOB* 13 8995 5.18 2.38 5
Location* 14 8995 4.94 3.00 3
Age 15 8995 29.91 4.10 29
Status* 16 8995 1.19 0.39 1
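The describe() output hints that the outcome is imbalanced: a mean of 1.19 for Status on a 1/2 coding implies roughly 19% NotJoined. A direct check of the class balance:
# class balance of the outcome: roughly 81% Joined vs 19% NotJoined
prop.table(table(renegeHR.df$Status))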
library(caret)
# data partition
set.seed(2341)
trainIndex <- createDataPartition(renegeHR.df$Status, p = 0.80, list = FALSE)
# 80% training data
trainHRData.df <- renegeHR.df[trainIndex, ]
# 20% testing data
testHRData.df <- renegeHR.df[-trainIndex, ]
# class distribution in the training data
table(trainHRData.df$Status)
Joined NotJoined
5851 1346
# class distribution in the testing data
table(testHRData.df$Status)
Joined NotJoined
1462 336
# resampling control parameters
objControl <- trainControl(method = "boot",
number = 2,
returnResamp = 'none',
summaryFunction = twoClassSummary,
classProbs = TRUE,
savePredictions = TRUE)
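Two bootstrap resamples keep the run fast but give a noisy estimate of ROC. If runtime permits, repeated k-fold cross-validation, optionally combined with caret's built-in up-sampling given the class imbalance noted earlier, is a common alternative. A sketch only; it is not used in the results below:
# a more robust (but slower) alternative control, shown for reference only:
# 10-fold CV repeated 3 times, with up-sampling of the minority class
altControl <- trainControl(method = "repeatedcv",
                           number = 10,
                           repeats = 3,
                           summaryFunction = twoClassSummary,
                           classProbs = TRUE,
                           sampling = "up")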
set.seed(766)
# stepwise binomial logistic regression (glmStepAIC) via the caret package
caretLogitModel <- train(trainHRData.df[,1:15],
trainHRData.df$Status,
method = 'glmStepAIC',
trControl = objControl,
metric = "ROC",
verbose = FALSE)
# summary of the model
summary(caretLogitModel)
Call:
NULL
Deviance Residuals:
Min 1Q Median 3Q Max
-1.41194 -0.70671 -0.50918 -0.00012 2.63602
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.134e+00 1.292e+00 1.651 0.09864 .
DOJExtendYes -1.920e-01 6.811e-02 -2.819 0.00481 **
NoticePeriod 2.066e-02 1.495e-03 13.826 < 2e-16 ***
BandE1 -1.289e+00 2.152e-01 -5.990 2.10e-09 ***
BandE2 -1.189e+00 2.376e-01 -5.004 5.61e-07 ***
BandE3 -1.278e+00 3.086e-01 -4.143 3.43e-05 ***
CTCDiff -3.785e-03 1.945e-03 -1.946 0.05170 .
JoiningBonusYes 2.939e-01 1.597e-01 1.840 0.06578 .
RelocateYes -1.728e+01 1.928e+02 -0.090 0.92860
GenderMale 1.393e-01 8.864e-02 1.572 0.11599
CandidateSrcDirect -3.714e-01 7.447e-02 -4.987 6.14e-07 ***
CandidateSrcEmployee Referral -7.367e-01 1.093e-01 -6.739 1.60e-11 ***
RexInYrs 3.293e-02 2.273e-02 1.449 0.14729
LOBBFSI -4.482e-01 1.569e-01 -2.857 0.00428 **
LOBCSMP -2.995e-01 1.811e-01 -1.654 0.09819 .
LOBEAS 1.240e-01 1.976e-01 0.627 0.53041
LOBERS -3.221e-01 1.489e-01 -2.163 0.03053 *
LOBETS -4.974e-01 1.765e-01 -2.818 0.00484 **
LOBHealthcare -3.592e-01 2.870e-01 -1.252 0.21065
LOBINFRA -8.736e-01 1.617e-01 -5.402 6.61e-08 ***
LOBMMS -1.792e+01 1.900e+03 -0.009 0.99247
LocationBangalore -1.475e+00 1.232e+00 -1.197 0.23136
LocationChennai -1.346e+00 1.231e+00 -1.093 0.27419
LocationCochin -1.794e+01 2.463e+03 -0.007 0.99419
LocationGurgaon -1.490e+00 1.256e+00 -1.186 0.23549
LocationHyderabad -1.644e+00 1.241e+00 -1.325 0.18532
LocationKolkata -1.843e+00 1.264e+00 -1.458 0.14473
LocationMumbai -1.730e+00 1.254e+00 -1.379 0.16796
LocationNoida -1.720e+00 1.230e+00 -1.398 0.16200
LocationOthers -1.771e+01 1.758e+03 -0.010 0.99197
LocationPune -1.501e+00 1.295e+00 -1.159 0.24639
Age -3.238e-02 1.035e-02 -3.127 0.00177 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 6936.1 on 7196 degrees of freedom
Residual deviance: 5984.3 on 7165 degrees of freedom
AIC: 6048.3
Number of Fisher Scoring iterations: 17
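One detail worth flagging in this summary: RelocateYes, LOBMMS, LocationCochin, and LocationOthers have coefficients near -18 with enormous standard errors, the classic signature of quasi-complete separation (a cross-tab cell with zero NotJoined cases). A quick check on the training data:
# cross-tabs to spot separation: a zero in the NotJoined column explains
# the huge, unstable coefficients above
table(trainHRData.df$Relocate, trainHRData.df$Status)
table(trainHRData.df$LOB == "MMS", trainHRData.df$Status)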
set.seed(2345)
dTreeInfoGain <- train(Status ~ .,
data = trainHRData.df,
method = "rpart",
parms = list(split = "information"),
trControl = trainControl(method = "cv"))
dTreeInfoGain
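The fitted tree can also be inspected visually. A minimal sketch, assuming the rpart.plot package is installed (it is not loaded anywhere in this script):
# plot the final rpart tree selected by cross-validation
library(rpart.plot)
rpart.plot(dTreeInfoGain$finalModel)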
# predicted probabilities from the logistic model
predProbTest <- predict(caretLogitModel, testHRData.df, type = "prob")
# plot of probabilities
plot(predProbTest[, 2],
     main = "Scatterplot of Probabilities of Renege (Logit, test data)",
     xlab = "Candidate Index",
     ylab = "Predicted Probability of Renege (NotJoined)")
# predicted probabilities from the decision tree
predProbTestInfoGain <- predict(dTreeInfoGain, testHRData.df, type = "prob")
# plot of probabilities
plot(predProbTestInfoGain[, 2],
     main = "Scatterplot of Probabilities of Renege (Tree, test data)",
     xlab = "Candidate Index",
     ylab = "Predicted Probability of Renege (NotJoined)")
# class predictions on the test data (logistic model)
predClassTest <- predict(caretLogitModel,
                         testHRData.df[, 1:15],
                         type = 'raw')
# confusion matrix for the logistic model
confusionMatrix(predClassTest, testHRData.df$Status,
                positive = "NotJoined")
Confusion Matrix and Statistics
Reference
Prediction Joined NotJoined
Joined 1442 314
NotJoined 20 22
Accuracy : 0.8142
95% CI : (0.7955, 0.832)
No Information Rate : 0.8131
P-Value [Acc > NIR] : 0.4663
Kappa : 0.0781
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.06548
Specificity : 0.98632
Pos Pred Value : 0.52381
Neg Pred Value : 0.82118
Prevalence : 0.18687
Detection Rate : 0.01224
Detection Prevalence : 0.02336
Balanced Accuracy : 0.52590
'Positive' Class : NotJoined
# class predictions on the test data (decision tree)
predClassTestInfoGain <- predict(dTreeInfoGain,
                                 testHRData.df[, 1:15],
                                 type = 'raw')
# confusion matrix for the decision tree
confusionMatrix(predClassTestInfoGain, testHRData.df$Status,
                positive = "NotJoined")
Confusion Matrix and Statistics
Reference
Prediction Joined NotJoined
Joined 1428 269
NotJoined 34 67
Accuracy : 0.8315
95% CI : (0.8134, 0.8485)
No Information Rate : 0.8131
P-Value [Acc > NIR] : 0.02355
Kappa : 0.2411
Mcnemar's Test P-Value : < 2e-16
Sensitivity : 0.19940
Specificity : 0.97674
Pos Pred Value : 0.66337
Neg Pred Value : 0.84148
Prevalence : 0.18687
Detection Rate : 0.03726
Detection Prevalence : 0.05617
Balanced Accuracy : 0.58807
'Positive' Class : NotJoined
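Both models classify at the default 0.5 cutoff, which is why sensitivity for NotJoined is low (0.065 for the logit, 0.199 for the tree) despite high overall accuracy. Lowering the cutoff trades specificity for sensitivity. A sketch with an illustrative 0.25 threshold; the value is arbitrary and should properly be tuned on the training data:
# re-threshold the logit probabilities at 0.25 instead of the default 0.5
cutoff <- 0.25
predClassCut <- factor(ifelse(predProbTest[, "NotJoined"] > cutoff,
                              "NotJoined", "Joined"),
                       levels = levels(testHRData.df$Status))
confusionMatrix(predClassCut, testHRData.df$Status, positive = "NotJoined")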
library(ROCR)
# ROC curve for the logistic model
lgPredObjLogit <- prediction(predProbTest[, 2], testHRData.df$Status)
lgPerfObjLogit <- performance(lgPredObjLogit, "tpr", "fpr")
plot(lgPerfObjLogit, main = "ROC Curve (Logit)", col = "red", lwd = 2)
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")
# ROC curve for the decision tree
lgPredObjTree <- prediction(predProbTestInfoGain[, 2], testHRData.df$Status)
lgPerfObjTree <- performance(lgPredObjTree, "tpr", "fpr")
plot(lgPerfObjTree, main = "ROC Curve (Tree)", col = "blue", lwd = 2)
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")
# overlaid ROC curves (logit and tree)
plot(lgPerfObjLogit, col = "red", lwd = 2,
     main = "ROC Curves for Renege \n (Logit and Tree)")
plot(lgPerfObjTree, add = TRUE, col = "blue", lwd = 2)
legend(x = "bottomright",
       legend = c("Logit", "Tree"),
       fill = c("red", "blue"))
# AUC for the logistic model
aucLogit <- performance(lgPredObjLogit, measure = "auc")
aucLogit <- aucLogit@y.values[[1]]
aucLogit
[1] 0.7410836
# AUC for the decision tree
aucLogitTree <- performance(lgPredObjTree, measure = "auc")
aucLogitTree <- aucLogitTree@y.values[[1]]
aucLogitTree
[1] 0.7247207
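By AUC, the logistic model (0.741) narrowly beats the decision tree (0.725), even though the tree has the higher accuracy and kappa at the default cutoff. Whether that AUC gap is statistically meaningful can be checked with a DeLong test; a sketch assuming the pROC package is installed:
# DeLong test for the difference between the two AUCs (pROC assumed installed)
library(pROC)
rocLogit <- roc(testHRData.df$Status, predProbTest[, 2])
rocTree <- roc(testHRData.df$Status, predProbTestInfoGain[, 2])
roc.test(rocLogit, rocTree, method = "delong")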