set.seed(766)
# model building using caret package
LRModel <- train(Status ~ .,
data = trainHR.df,
method = 'glmStepAIC',
trControl = objControl,
metric = "ROC")
# summary of the model
summary(LRModel)##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.46589 -0.70600 -0.50971 -0.00012 2.63371
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 8.431e-01 3.607e-01 2.338 0.019407 *
## DOJExtendedYes -1.892e-01 6.766e-02 -2.797 0.005159 **
## NoticePeriod 2.063e-02 1.494e-03 13.811 < 2e-16 ***
## OfferedBandE1 -1.296e+00 2.151e-01 -6.024 1.70e-09 ***
## OfferedBandE2 -1.191e+00 2.375e-01 -5.014 5.33e-07 ***
## OfferedBandE3 -1.278e+00 3.085e-01 -4.143 3.43e-05 ***
## PercentDifferenceCTC -3.701e-03 1.940e-03 -1.907 0.056458 .
## JoiningBonusYes 2.820e-01 1.592e-01 1.772 0.076461 .
## CandidateRelocateActualYes -1.728e+01 1.929e+02 -0.090 0.928604
## GenderMale 1.390e-01 8.857e-02 1.570 0.116514
## CandidateSourceDirect -3.699e-01 7.435e-02 -4.976 6.51e-07 ***
## `CandidateSourceEmployee Referral` -7.322e-01 1.091e-01 -6.710 1.94e-11 ***
## RexInYrs 3.258e-02 2.271e-02 1.434 0.151474
## LOBBFSI -5.039e-01 1.326e-01 -3.801 0.000144 ***
## LOBCSMP -3.545e-01 1.612e-01 -2.200 0.027810 *
## LOBERS -3.746e-01 1.210e-01 -3.095 0.001968 **
## LOBETS -5.497e-01 1.551e-01 -3.544 0.000394 ***
## LOBHealthcare -4.087e-01 2.742e-01 -1.490 0.136121
## LOBINFRA -9.337e-01 1.374e-01 -6.795 1.08e-11 ***
## LOBMMS -1.797e+01 1.900e+03 -0.009 0.992455
## LocationBangalore -1.257e-01 8.242e-02 -1.525 0.127319
## LocationHyderabad -2.850e-01 1.739e-01 -1.639 0.101142
## LocationKolkata -5.256e-01 2.923e-01 -1.798 0.072135 .
## LocationMumbai -3.761e-01 2.655e-01 -1.416 0.156637
## LocationNoida -3.708e-01 9.112e-02 -4.070 4.71e-05 ***
## LocationOthers -1.635e+01 1.758e+03 -0.009 0.992579
## Age -3.235e-02 1.035e-02 -3.126 0.001769 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6936.1 on 7196 degrees of freedom
## Residual deviance: 5987.9 on 7170 degrees of freedom
## AIC: 6041.9
##
## Number of Fisher Scoring iterations: 17
# plot of probabilities
plot(PredLR$NotJoined,
main = "Scatterplot of Probabilities of Renege (Yes / No) (test data)",
xlab = "Customer ID",
ylab = "Predicted Probability of Renege")# choosing cut-off probability
pred.LR <- ifelse(PredLR$NotJoined > 0.50, "NotJoined", "Joined")
Predicted <- ordered(pred.LR, levels = c("NotJoined", "Joined"))
# actual and predicted data columns
Predicted <- as.factor(Predicted)
Actual <- ordered(testHR.df$Status,levels = c("NotJoined", "Joined"))
# making confusion matrix
cm <-confusionMatrix(data =Predicted,reference = Actual,
positive = "NotJoined")
cm## Confusion Matrix and Statistics
##
## Reference
## Prediction NotJoined Joined
## NotJoined 22 22
## Joined 314 1440
##
## Accuracy : 0.8131
## 95% CI : (0.7943, 0.8309)
## No Information Rate : 0.8131
## P-Value [Acc > NIR] : 0.5146
##
## Kappa : 0.0758
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.06548
## Specificity : 0.98495
## Pos Pred Value : 0.50000
## Neg Pred Value : 0.82098
## Prevalence : 0.18687
## Detection Rate : 0.01224
## Detection Prevalence : 0.02447
## Balanced Accuracy : 0.52521
##
## 'Positive' Class : NotJoined
##
library(dplyr)
library(data.table)
# function to print confusion matrices for diffrent cut-off levels of probability
CmFn <- function(cutoff) {
# predicting the test set results
PredLR <- predict(LRModel, testHR.df, type = "prob")
C1 <- ifelse(PredLR$NotJoined > cutoff, "NotJoined", "Joined")
C2 <- testHR.df$Status
predY <- as.factor(C1)
actualY <- as.factor(C2)
Predicted <- ordered(predY, levels = c("NotJoined", "Joined"))
Actual <- ordered(actualY,levels = c("NotJoined", "Joined"))
# use the confusionMatrix from the caret package
cm1 <-confusionMatrix(data = Predicted, reference = Actual, positive = "NotJoined")
# extracting accuracy
Accuracy <- cm1$overall[1]
# extracting sensitivity
Sensitivity <- cm1$byClass[1]
# extracting specificity
Specificity <- cm1$byClass[2]
# extracting value of kappa
Kappa <- cm1$overall[2]
# combined table
tab <- cbind(Accuracy,Sensitivity,Specificity,Kappa)
return(tab)}
# making sequence of cut-off probabilities
cutoff1 <- seq( 0, 1, by = .05 )
# loop using "lapply"
tab2 <- lapply(cutoff1, CmFn)
# creating matrix of different metrics
numrows = length(cutoff1)
pm <- matrix(1:numrows*4, nrow = numrows, ncol=4)
# applying for loop
for (i in 1:numrows){
pm[i,] = tab2[[i]]}
pm <- as.data.frame(pm)
pm <- cbind(cutoff1, pm)
pm <- setnames(pm, "cutoff1", "cutoff")
pm <- setnames(pm, "V1", "Accuracy")
pm <- setnames(pm, "V2", "Senstivity")
pm <- setnames(pm, "V3", "Specificity")
pm <- setnames(pm, "V4", "kappa")
# printing the table
print(pm)## cutoff Accuracy Senstivity Specificity kappa
## 1 0.00 0.1868743 1.000000000 0.0000000 0.00000000
## 2 0.05 0.3398220 0.994047619 0.1894665 0.07789773
## 3 0.10 0.4238042 0.946428571 0.3036936 0.11654740
## 4 0.15 0.5361513 0.857142857 0.4623803 0.17310632
## 5 0.20 0.6501669 0.758928571 0.6251710 0.25019061
## 6 0.25 0.7246941 0.556547619 0.7633379 0.26096459
## 7 0.30 0.7686318 0.351190476 0.8645691 0.22082121
## 8 0.35 0.7992214 0.255952381 0.9240766 0.21414371
## 9 0.40 0.8097887 0.160714286 0.9589603 0.16051506
## 10 0.45 0.8081201 0.092261905 0.9726402 0.09321214
## 11 0.50 0.8131257 0.065476190 0.9849521 0.07579336
## 12 0.55 0.8136819 0.032738095 0.9931601 0.04053051
## 13 0.60 0.8142380 0.008928571 0.9993160 0.01330853
## 14 0.65 0.8136819 0.002976190 1.0000000 0.00483103
## 15 0.70 0.8131257 0.000000000 1.0000000 0.00000000
## 16 0.75 0.8131257 0.000000000 1.0000000 0.00000000
## 17 0.80 0.8131257 0.000000000 1.0000000 0.00000000
## 18 0.85 0.8131257 0.000000000 1.0000000 0.00000000
## 19 0.90 0.8131257 0.000000000 1.0000000 0.00000000
## 20 0.95 0.8131257 0.000000000 1.0000000 0.00000000
## 21 1.00 0.8131257 0.000000000 1.0000000 0.00000000
# False Positive Rate
FPR <- 1-pm$Specificity
# True positive Rate
TPR <- pm$Senstivity
# plotting ROC curve
plot(FPR,TPR,main = "ROC Curve",col = 2,lwd = 2,type = "l",xlab = "False Positive Rate", ylab = "True positive Rate")
abline(a = 0,b = 1,lwd = 2,lty = 3,col = "black")## Warning: package 'ROCR' was built under R version 4.0.4
Prediction <- prediction(PredLR$NotJoined,testHR.df$Status)
performance <- performance(Prediction, "tpr","fpr")
# plotting ROC curve
plot(performance,main = "ROC Curve",col = 2,lwd = 2)
abline(a = 0,b = 1,lwd = 2,lty = 3,col = "black")library(ROCR)
# area under curve
Prediction <- prediction(PredLR$NotJoined,testHR.df$Status)
aucLR <- performance(Prediction, measure = "auc")
aucLR <- aucLR@y.values[[1]]
aucLR## [1] 0.7425137