Sameer Mathur
Comparing Classifiers (Binomial Logistic Classifier and Decision Tree) Using the caret Package
---
# read data (stringsAsFactors = TRUE keeps character columns as factors on R >= 4.0)
renegeHR.df <- read.csv("CleanHRDataV2.csv", stringsAsFactors = TRUE)
# attach data columns (optional here, since later code references columns with $)
attach(renegeHR.df)
# dimension of the dataset
dim(renegeHR.df)
[1] 8995 16
# structure of the data table
str(renegeHR.df)
'data.frame': 8995 obs. of 16 variables:
$ DOJExtend : Factor w/ 2 levels "No","Yes": 2 1 1 1 2 2 2 2 1 1 ...
$ DurToAcptOffer: int 14 18 3 26 1 17 37 16 1 6 ...
$ NoticePeriod : int 30 30 45 30 120 30 30 0 30 30 ...
$ Band : Factor w/ 4 levels "E0","E1","E2",..: 3 3 3 3 3 2 3 2 2 2 ...
$ CTCHikeExp : num -20.8 50 42.8 42.8 42.6 ...
$ CTCHikeOffered: num 13.2 320 42.8 42.8 42.6 ...
$ CTCDiff : num 42.9 180 0 0 0 ...
$ JoiningBonus : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
$ Relocate : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 1 1 1 1 ...
$ Gender : Factor w/ 2 levels "Female","Male": 1 2 2 2 2 2 2 1 1 2 ...
$ CandidateSrc : Factor w/ 3 levels "Agency","Direct",..: 1 3 1 3 3 3 3 2 3 3 ...
$ RexInYrs : int 7 8 4 4 6 2 7 8 3 3 ...
$ LOB : Factor w/ 9 levels "AXON","BFSI",..: 5 8 8 8 8 8 8 7 2 3 ...
$ Location : Factor w/ 11 levels "Ahmedabad","Bangalore",..: 9 3 9 9 9 9 9 9 5 3 ...
$ Age : int 34 34 27 34 34 34 32 34 26 34 ...
$ Status : Factor w/ 2 levels "Joined","NotJoined": 1 1 1 1 1 1 1 1 1 1 ...
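Before modeling, it is worth confirming that the cleaned file really contains no missing values, since glm() silently drops incomplete rows. A quick check (not part of the original output):
# count missing values per column; all zeros are expected for the cleaned file
colSums(is.na(renegeHR.df))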
# descriptive statistics of the data frame (first five columns of describe output)
library(psych)
describe(renegeHR.df)[, c(1:5)]
vars n mean sd median
DOJExtend* 1 8995 1.47 0.50 1
DurToAcptOffer 2 8995 21.43 25.81 10
NoticePeriod 3 8995 39.29 22.22 30
Band* 4 8995 2.39 0.63 2
CTCHikeExp 5 8995 43.86 29.79 40
CTCHikeOffered 6 8995 40.66 36.06 36
CTCDiff 7 8995 -1.57 19.61 0
JoiningBonus* 8 8995 1.05 0.21 1
Relocate* 9 8995 1.14 0.35 1
Gender* 10 8995 1.83 0.38 2
CandidateSrc* 11 8995 1.89 0.67 2
RexInYrs 12 8995 4.24 2.55 4
LOB* 13 8995 5.18 2.38 5
Location* 14 8995 4.94 3.00 3
Age 15 8995 29.91 4.10 29
Status* 16 8995 1.19 0.39 1
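The describe() output hints that the outcome is imbalanced: a mean of 1.19 for Status on a 1/2 coding implies roughly 19% NotJoined. A direct check of the class balance:
# class balance of the outcome: roughly 81% Joined vs 19% NotJoined
prop.table(table(renegeHR.df$Status))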
library(caret)
# data partition
set.seed(2341)
trainIndex <- createDataPartition(renegeHR.df$Status, p = 0.80, list = FALSE)
# 80% training data
trainHRData.df <- renegeHR.df[trainIndex, ]
# 20% testing data
testHRData.df <- renegeHR.df[-trainIndex, ]
# class distribution in the training data
table(trainHRData.df$Status)
Joined NotJoined
5851 1346
# class distribution in the testing data
table(testHRData.df$Status)
Joined NotJoined
1462 336
# resampling control parameters
objControl <- trainControl(method = "boot",
number = 2,
returnResamp = 'none',
summaryFunction = twoClassSummary,
classProbs = TRUE,
savePredictions = TRUE)
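Two bootstrap resamples keep the run fast but give a noisy estimate of ROC. If runtime permits, repeated k-fold cross-validation, optionally combined with caret's built-in up-sampling given the class imbalance noted earlier, is a common alternative. A sketch only; it is not used in the results below:
# a more robust (but slower) alternative control, shown for reference only:
# 10-fold CV repeated 3 times, with up-sampling of the minority class
altControl <- trainControl(method = "repeatedcv",
                           number = 10,
                           repeats = 3,
                           summaryFunction = twoClassSummary,
                           classProbs = TRUE,
                           sampling = "up")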
set.seed(766)
# stepwise binomial logistic regression (glmStepAIC) via the caret package
caretLogitModel <- train(trainHRData.df[,1:15],
trainHRData.df$Status,
method = 'glmStepAIC',
trControl = objControl,
metric = "ROC",
verbose = FALSE)
# summary of the model
summary(caretLogitModel)
Call:
NULL
Deviance Residuals:
Min 1Q Median 3Q Max
-1.41194 -0.70671 -0.50918 -0.00012 2.63602
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.134e+00 1.292e+00 1.651 0.09864 .
DOJExtendYes -1.920e-01 6.811e-02 -2.819 0.00481 **
NoticePeriod 2.066e-02 1.495e-03 13.826 < 2e-16 ***
BandE1 -1.289e+00 2.152e-01 -5.990 2.10e-09 ***
BandE2 -1.189e+00 2.376e-01 -5.004 5.61e-07 ***
BandE3 -1.278e+00 3.086e-01 -4.143 3.43e-05 ***
CTCDiff -3.785e-03 1.945e-03 -1.946 0.05170 .
JoiningBonusYes 2.939e-01 1.597e-01 1.840 0.06578 .
RelocateYes -1.728e+01 1.928e+02 -0.090 0.92860
GenderMale 1.393e-01 8.864e-02 1.572 0.11599
CandidateSrcDirect -3.714e-01 7.447e-02 -4.987 6.14e-07 ***
CandidateSrcEmployee Referral -7.367e-01 1.093e-01 -6.739 1.60e-11 ***
RexInYrs 3.293e-02 2.273e-02 1.449 0.14729
LOBBFSI -4.482e-01 1.569e-01 -2.857 0.00428 **
LOBCSMP -2.995e-01 1.811e-01 -1.654 0.09819 .
LOBEAS 1.240e-01 1.976e-01 0.627 0.53041
LOBERS -3.221e-01 1.489e-01 -2.163 0.03053 *
LOBETS -4.974e-01 1.765e-01 -2.818 0.00484 **
LOBHealthcare -3.592e-01 2.870e-01 -1.252 0.21065
LOBINFRA -8.736e-01 1.617e-01 -5.402 6.61e-08 ***
LOBMMS -1.792e+01 1.900e+03 -0.009 0.99247
LocationBangalore -1.475e+00 1.232e+00 -1.197 0.23136
LocationChennai -1.346e+00 1.231e+00 -1.093 0.27419
LocationCochin -1.794e+01 2.463e+03 -0.007 0.99419
LocationGurgaon -1.490e+00 1.256e+00 -1.186 0.23549
LocationHyderabad -1.644e+00 1.241e+00 -1.325 0.18532
LocationKolkata -1.843e+00 1.264e+00 -1.458 0.14473
LocationMumbai -1.730e+00 1.254e+00 -1.379 0.16796
LocationNoida -1.720e+00 1.230e+00 -1.398 0.16200
LocationOthers -1.771e+01 1.758e+03 -0.010 0.99197
LocationPune -1.501e+00 1.295e+00 -1.159 0.24639
Age -3.238e-02 1.035e-02 -3.127 0.00177 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 6936.1 on 7196 degrees of freedom
Residual deviance: 5984.3 on 7165 degrees of freedom
AIC: 6048.3
Number of Fisher Scoring iterations: 17
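One detail worth flagging in this summary: RelocateYes, LOBMMS, LocationCochin, and LocationOthers have coefficients near -18 with enormous standard errors, the classic signature of quasi-complete separation (a cross-tab cell with zero NotJoined cases). A quick check on the training data:
# cross-tabs to spot separation: a zero in the NotJoined column explains
# the huge, unstable coefficients above
table(trainHRData.df$Relocate, trainHRData.df$Status)
table(trainHRData.df$LOB == "MMS", trainHRData.df$Status)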
set.seed(2345)
dTreeInfoGain <- train(Status ~ .,
data = trainHRData.df,
method = "rpart",
parms = list(split = "information"),
trControl = trainControl(method = "cv"))
dTreeInfoGain
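The fitted tree can also be inspected visually. A minimal sketch, assuming the rpart.plot package is installed (it is not loaded anywhere in this script):
# plot the final rpart tree selected by cross-validation
library(rpart.plot)
rpart.plot(dTreeInfoGain$finalModel)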
# predicted probabilities from the logistic model
predProbTest <- predict(caretLogitModel, testHRData.df, type = "prob")
# plot of probabilities
plot(predProbTest[, 2],
     main = "Scatterplot of Probabilities of Renege (Logit, test data)",
     xlab = "Candidate Index",
     ylab = "Predicted Probability of Renege (NotJoined)")
# predicted probabilities from the decision tree
predProbTestInfoGain <- predict(dTreeInfoGain, testHRData.df, type = "prob")
# plot of probabilities
plot(predProbTestInfoGain[, 2],
     main = "Scatterplot of Probabilities of Renege (Tree, test data)",
     xlab = "Candidate Index",
     ylab = "Predicted Probability of Renege (NotJoined)")
# class predictions on the test data (logistic model)
predClassTest <- predict(caretLogitModel,
                         testHRData.df[, 1:15],
                         type = 'raw')
# confusion matrix for the logistic model
confusionMatrix(predClassTest, testHRData.df$Status,
                positive = "NotJoined")
Confusion Matrix and Statistics
Reference
Prediction Joined NotJoined
Joined 1442 314
NotJoined 20 22
Accuracy : 0.8142
95% CI : (0.7955, 0.832)
No Information Rate : 0.8131
P-Value [Acc > NIR] : 0.4663
Kappa : 0.0781
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.06548
Specificity : 0.98632
Pos Pred Value : 0.52381
Neg Pred Value : 0.82118
Prevalence : 0.18687
Detection Rate : 0.01224
Detection Prevalence : 0.02336
Balanced Accuracy : 0.52590
'Positive' Class : NotJoined
# class predictions on the test data (decision tree)
predClassTestInfoGain <- predict(dTreeInfoGain,
                                 testHRData.df[, 1:15],
                                 type = 'raw')
# confusion matrix for the decision tree
confusionMatrix(predClassTestInfoGain, testHRData.df$Status,
                positive = "NotJoined")
Confusion Matrix and Statistics
Reference
Prediction Joined NotJoined
Joined 1428 269
NotJoined 34 67
Accuracy : 0.8315
95% CI : (0.8134, 0.8485)
No Information Rate : 0.8131
P-Value [Acc > NIR] : 0.02355
Kappa : 0.2411
Mcnemar's Test P-Value : < 2e-16
Sensitivity : 0.19940
Specificity : 0.97674
Pos Pred Value : 0.66337
Neg Pred Value : 0.84148
Prevalence : 0.18687
Detection Rate : 0.03726
Detection Prevalence : 0.05617
Balanced Accuracy : 0.58807
'Positive' Class : NotJoined
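Both models classify at the default 0.5 cutoff, which is why sensitivity for NotJoined is low (0.065 for the logit, 0.199 for the tree) despite high overall accuracy. Lowering the cutoff trades specificity for sensitivity. A sketch with an illustrative 0.25 threshold; the value is arbitrary and should properly be tuned on the training data:
# re-threshold the logit probabilities at 0.25 instead of the default 0.5
cutoff <- 0.25
predClassCut <- factor(ifelse(predProbTest[, "NotJoined"] > cutoff,
                              "NotJoined", "Joined"),
                       levels = levels(testHRData.df$Status))
confusionMatrix(predClassCut, testHRData.df$Status, positive = "NotJoined")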
library(ROCR)
# ROC curve for the logistic model
lgPredObjLogit <- prediction(predProbTest[, 2], testHRData.df$Status)
lgPerfObjLogit <- performance(lgPredObjLogit, "tpr", "fpr")
plot(lgPerfObjLogit, main = "ROC Curve (Logit)", col = "red", lwd = 2)
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")
# ROC curve for the decision tree
lgPredObjTree <- prediction(predProbTestInfoGain[, 2], testHRData.df$Status)
lgPerfObjTree <- performance(lgPredObjTree, "tpr", "fpr")
plot(lgPerfObjTree, main = "ROC Curve (Tree)", col = "blue", lwd = 2)
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")
# overlaid ROC curves (logit and tree)
plot(lgPerfObjLogit, col = "red", lwd = 2,
     main = "ROC Curves for Renege \n (Logit and Tree)")
plot(lgPerfObjTree, add = TRUE, col = "blue", lwd = 2)
legend(x = "bottomright",
       legend = c("Logit", "Tree"),
       fill = c("red", "blue"))
# AUC for the logistic model
aucLogit <- performance(lgPredObjLogit, measure = "auc")
aucLogit <- aucLogit@y.values[[1]]
aucLogit
[1] 0.7410836
# AUC for the decision tree
aucLogitTree <- performance(lgPredObjTree, measure = "auc")
aucLogitTree <- aucLogitTree@y.values[[1]]
aucLogitTree
[1] 0.7247207
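By AUC, the logistic model (0.741) narrowly beats the decision tree (0.725), even though the tree has the higher accuracy and kappa at the default cutoff. Whether that AUC gap is statistically meaningful can be checked with a DeLong test; a sketch assuming the pROC package is installed:
# DeLong test for the difference between the two AUCs (pROC assumed installed)
library(pROC)
rocLogit <- roc(testHRData.df$Status, predProbTest[, 2])
rocTree <- roc(testHRData.df$Status, predProbTestInfoGain[, 2])
roc.test(rocLogit, rocTree, method = "delong")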