Behavioral Modelling to Predict Renege

Sameer Mathur

Decision Tree using caret Package

---

IMPORTING DATA

(READING AND SUMMARIZING DATA)

Number of Rows and Columns

# Read the cleaned HR data set. `stringsAsFactors = TRUE` is spelled out because
# R >= 4.0.0 no longer converts strings to factors by default, whereas the
# str() output documented below lists the categorical columns as factors and
# caret's confusionMatrix() further down requires factor inputs.
renegeHR.df <- read.csv("CleanHRDataV2.csv", stringsAsFactors = TRUE)
# attach data columns
# NOTE(review): nothing visible in this document refers to the attached column
# names directly; consider dropping attach() once that is confirmed.
attach(renegeHR.df)
# number of rows and columns in the data set
dim(renegeHR.df)
[1] 8995   16

Data Structure

# compact overview of every column: its type and first few values
str(object = renegeHR.df)
'data.frame':   8995 obs. of  16 variables:
 $ DOJExtend     : Factor w/ 2 levels "No","Yes": 2 1 1 1 2 2 2 2 1 1 ...
 $ DurToAcptOffer: int  14 18 3 26 1 17 37 16 1 6 ...
 $ NoticePeriod  : int  30 30 45 30 120 30 30 0 30 30 ...
 $ Band          : Factor w/ 4 levels "E0","E1","E2",..: 3 3 3 3 3 2 3 2 2 2 ...
 $ CTCHikeExp    : num  -20.8 50 42.8 42.8 42.6 ...
 $ CTCHikeOffered: num  13.2 320 42.8 42.8 42.6 ...
 $ CTCDiff       : num  42.9 180 0 0 0 ...
 $ JoiningBonus  : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
 $ Relocate      : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 1 1 1 1 ...
 $ Gender        : Factor w/ 2 levels "Female","Male": 1 2 2 2 2 2 2 1 1 2 ...
 $ CandidateSrc  : Factor w/ 3 levels "Agency","Direct",..: 1 3 1 3 3 3 3 2 3 3 ...
 $ RexInYrs      : int  7 8 4 4 6 2 7 8 3 3 ...
 $ LOB           : Factor w/ 9 levels "AXON","BFSI",..: 5 8 8 8 8 8 8 7 2 3 ...
 $ Location      : Factor w/ 11 levels "Ahmedabad","Bangalore",..: 9 3 9 9 9 9 9 9 5 3 ...
 $ Age           : int  34 34 27 34 34 34 32 34 26 34 ...
 $ Status        : Factor w/ 2 levels "Joined","NotJoined": 1 1 1 1 1 1 1 1 1 1 ...

Descriptive Statistics

# per-column summary statistics; only the first five columns of the
# psych::describe() result are displayed
library(psych)
describe(renegeHR.df)[, 1:5]
               vars    n  mean    sd median
DOJExtend*        1 8995  1.47  0.50      1
DurToAcptOffer    2 8995 21.43 25.81     10
NoticePeriod      3 8995 39.29 22.22     30
Band*             4 8995  2.39  0.63      2
CTCHikeExp        5 8995 43.86 29.79     40
CTCHikeOffered    6 8995 40.66 36.06     36
CTCDiff           7 8995 -1.57 19.61      0
JoiningBonus*     8 8995  1.05  0.21      1
Relocate*         9 8995  1.14  0.35      1
Gender*          10 8995  1.83  0.38      2
CandidateSrc*    11 8995  1.89  0.67      2
RexInYrs         12 8995  4.24  2.55      4
LOB*             13 8995  5.18  2.38      5
Location*        14 8995  4.94  3.00      3
Age              15 8995 29.91  4.10     29
Status*          16 8995  1.19  0.39      1

PREPARING DATA

(DATA TRAINING AND TESTING)

Training (80%) and Testing (20%) Data

library(caret)
# split the rows on the outcome column; the fixed seed makes the split repeatable
set.seed(2341)
trainIndex <- createDataPartition(
  y = renegeHR.df$Status,
  p = 0.8,
  list = FALSE
)
# rows selected by the partition form the training set (80%)
trainHRData.df <- renegeHR.df[trainIndex, ]
# all remaining rows form the testing set (20%)
testHRData.df <- renegeHR.df[-trainIndex, ]

Class Distribution of Status in Training and Testing Data

# class counts of the outcome (Joined vs. NotJoined) in the training data
table(trainHRData.df[["Status"]])

   Joined NotJoined 
     5851      1346 
# class counts of the outcome (Joined vs. NotJoined) in the testing data
table(testHRData.df[["Status"]])

   Joined NotJoined 
     1462       336 

DECISION TREE

(INFORMATION GAIN, GINI INDEX)

Fitting Decision Tree Classifier on training dataset with criterion as Information Gain

# Fit an rpart classification tree through caret, using information gain as
# the split criterion and cross-validation (caret's default settings) to tune.
dTreeInfoGain <- train(
  Status ~ .,
  data = trainHRData.df,
  method = "rpart",
  trControl = trainControl(method = "cv"),
  parms = list(split = "information")
)
# print the fitted caret object
dTreeInfoGain

Visualizing Tree using package `rpart.plot`

# visualization of the final fitted tree
library(rpart.plot)
prp(
  dTreeInfoGain$finalModel,
  varlen = 20,
  tweak = 1.2,
  box.palette = "Reds"
)

plot of chunk unnamed-chunk-11

Predicted Probabilities of Renege (Joined / NotJoined), based on Test Data

# class probabilities for every row of the test data; predict(type = "prob")
# returns one column per level of Status
predProbTestInfoGain <- predict(dTreeInfoGain, testHRData.df, type = "prob")
# plot the probability of the "NotJoined" (renege) class against row order.
# The column is selected by name so the plot cannot silently switch class, and
# the labels describe candidates (the previous "Customer ID" label was wrong).
plot(predProbTestInfoGain[, "NotJoined"],
     main = "Scatterplot of Predicted Probabilities of Renege (test data)",
     xlab = "Candidate (row index in test data)",
     ylab = "Predicted Probability of Renege (NotJoined)")

plot of chunk unnamed-chunk-13

Plot of Important Variables

# variable importance of the fitted tree, scaled by varImp().
# `main` belongs to the plot method, not to varImp(): passed to varImp() it is
# swallowed by `...` and the title never appears, so it is given to plot().
plot(varImp(dTreeInfoGain, scale = TRUE),
     main = "Important Variables")

plot of chunk unnamed-chunk-15

Confusion Matrix on Test Data

# predicted class labels for the test data; the outcome column is dropped by
# name rather than by the hard-coded positions 1:15, so the call keeps working
# if columns are added or reordered
predClassTestInfoGain <- predict(
  dTreeInfoGain,
  testHRData.df[, names(testHRData.df) != "Status", drop = FALSE],
  type = "raw"
)
# confusion matrix with "NotJoined" (renege) treated as the positive class
confusionMatrix(predClassTestInfoGain, testHRData.df$Status,
                positive = "NotJoined")
Confusion Matrix and Statistics

           Reference
Prediction  Joined NotJoined
  Joined      1428       269
  NotJoined     34        67

               Accuracy : 0.8315          
                 95% CI : (0.8134, 0.8485)
    No Information Rate : 0.8131          
    P-Value [Acc > NIR] : 0.02355         

                  Kappa : 0.2411          
 Mcnemar's Test P-Value : < 2e-16         

            Sensitivity : 0.19940         
            Specificity : 0.97674         
         Pos Pred Value : 0.66337         
         Neg Pred Value : 0.84148         
             Prevalence : 0.18687         
         Detection Rate : 0.03726         
   Detection Prevalence : 0.05617         
      Balanced Accuracy : 0.58807         

       'Positive' Class : NotJoined       

ROC Plot on the Test data

library(ROCR)
# pair the predicted probabilities in the second class column with the
# observed outcome of the test data
lgPredObj <- prediction(predProbTestInfoGain[, 2], testHRData.df$Status)
# true-positive rate against false-positive rate, i.e. the ROC curve
lgPerfObj <- performance(lgPredObj, measure = "tpr", x.measure = "fpr")
plot(lgPerfObj, main = "ROC Curve", col = 2, lwd = 2)
# dotted diagonal as the reference line
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")

plot of chunk unnamed-chunk-19

Area Under the Curve (Tree)

# area under the ROC curve for the decision tree, extracted from the ROCR
# performance object (the variable keeps its existing name, aucLogit)
aucLogit <- performance(lgPredObj, measure = "auc")@y.values[[1]]
aucLogit
[1] 0.7247207