Mehul Garg
---
default.df <- read.csv("C:/Users/lenovo/Desktop/MLM/CleanHRData.csv")
# dimension of the data frame
dim(default.df)
[1] 8995 17
# data types of the data columns
str(default.df)
'data.frame': 8995 obs. of 17 variables:
$ CandidateRef : int 2110407 2112635 2112838 2115021 2115125 2117167 2119124 2127572 2138169 2143362 ...
$ DOJExtended : Factor w/ 2 levels "No","Yes": 2 1 1 1 2 2 2 2 1 1 ...
$ DurationToAcceptOffer : int 14 18 3 26 1 17 37 16 1 6 ...
$ NoticePeriod : int 30 30 45 30 120 30 30 0 30 30 ...
$ OfferedBand : Factor w/ 4 levels "E0","E1","E2",..: 3 3 3 3 3 2 3 2 2 2 ...
$ PercentHikeExpectedInCTC: num -20.8 50 42.8 42.8 42.6 ...
$ PercentHikeOfferedInCTC : num 13.2 320 42.8 42.8 42.6 ...
$ PercentDifferenceCTC : num 42.9 180 0 0 0 ...
$ JoiningBonus : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
$ CandidateRelocateActual : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 1 1 1 1 ...
$ Gender : Factor w/ 2 levels "Female","Male": 1 2 2 2 2 2 2 1 1 2 ...
$ CandidateSource : Factor w/ 3 levels "Agency","Direct",..: 1 3 1 3 3 3 3 2 3 3 ...
$ RexInYrs : int 7 8 4 4 6 2 7 8 3 3 ...
$ LOB : Factor w/ 9 levels "AXON","BFSI",..: 5 8 8 8 8 8 8 7 2 3 ...
$ Location : Factor w/ 11 levels "Ahmedabad","Bangalore",..: 9 3 9 9 9 9 9 9 5 3 ...
$ Age : int 34 34 27 34 34 34 32 34 26 34 ...
$ Status : Factor w/ 2 levels "Joined","NotJoined": 1 1 1 1 1 1 1 1 1 1 ...
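# (optional check, not in the original run) inspect how balanced the target variable
# is; this context matters later when judging accuracy and sensitivity
table(default.df$Status)
prop.table(table(default.df$Status))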
library(ggplot2)
# bar chart of the target variable Status
ggplot(data = default.df, aes(x = Status)) + geom_bar(stat = "count")
dev.off()
null device
1
library(caTools)
# set a seed so the random split is reproducible
set.seed(123)
# creating the training set with 75% of the data
split <- sample.split(default.df$Status, SplitRatio = 0.75)
trainData <- subset(default.df, split == TRUE)
# dimensions of training data
dim(trainData)
[1] 6747 17
# creating the test set with the remaining 25% of the data
testData <- subset(default.df, split == FALSE)
# dimensions of testing data
dim(testData)
[1] 2248 17
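# (optional check, not in the original run) sample.split() stratifies on Status, so the
# Joined/NotJoined proportions should be roughly equal in the two partitions
prop.table(table(trainData$Status))
prop.table(table(testData$Status))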
# fit logistic classifier 1 with glm()
logitClassifier1 <- glm(Status ~ DurationToAcceptOffer + NoticePeriod + OfferedBand + PercentHikeExpectedInCTC + PercentHikeOfferedInCTC + PercentDifferenceCTC + JoiningBonus + CandidateRelocateActual + Gender + CandidateSource + RexInYrs + LOB + Location + Age,
data = trainData,
family = binomial())
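# (note, not in the original run) with family = binomial(), glm() models the probability
# of the second level of Status, i.e. P(Status = "NotJoined")
levels(trainData$Status)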
# summary of the classifier 1
summary(logitClassifier1)
Call:
glm(formula = Status ~ DurationToAcceptOffer + NoticePeriod +
OfferedBand + PercentHikeExpectedInCTC + PercentHikeOfferedInCTC +
PercentDifferenceCTC + JoiningBonus + CandidateRelocateActual +
Gender + CandidateSource + RexInYrs + LOB + Location + Age,
family = binomial(), data = trainData)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.57995 -0.69821 -0.51236 -0.00012 2.74317
Coefficients:
                                   Estimate Std. Error z value Pr(>|z|)
(Intercept)                      -1.525e+01  2.545e+03  -0.006  0.99522
DurationToAcceptOffer            -1.633e-03  1.302e-03  -1.254  0.20984
NoticePeriod                      2.169e-02  1.628e-03  13.326  < 2e-16 ***
OfferedBandE1                    -1.420e+00  2.215e-01  -6.408 1.47e-10 ***
OfferedBandE2                    -1.354e+00  2.422e-01  -5.589 2.28e-08 ***
OfferedBandE3                    -1.752e+00  3.184e-01  -5.502 3.76e-08 ***
PercentHikeExpectedInCTC          4.316e-03  4.270e-03   1.011  0.31209
PercentHikeOfferedInCTC          -6.027e-03  4.358e-03  -1.383  0.16672
PercentDifferenceCTC              4.408e-03  5.809e-03   0.759  0.44798
JoiningBonusYes                   2.365e-01  1.705e-01   1.387  0.16532
CandidateRelocateActualYes       -1.729e+01  1.994e+02  -0.087  0.93090
GenderMale                        2.041e-01  9.087e-02   2.247  0.02467 *
CandidateSourceDirect            -3.240e-01  7.747e-02  -4.182 2.89e-05 ***
CandidateSourceEmployee Referral -7.328e-01  1.136e-01  -6.448 1.13e-10 ***
RexInYrs                          4.404e-02  2.309e-02   1.908  0.05642 .
LOBBFSI                          -4.750e-01  1.645e-01  -2.888  0.00388 **
LOBCSMP                          -3.763e-01  1.892e-01  -1.989  0.04668 *
LOBEAS                            1.340e-01  2.054e-01   0.653  0.51405
LOBERS                           -3.163e-01  1.566e-01  -2.021  0.04333 *
LOBETS                           -6.312e-01  1.858e-01  -3.398  0.00068 ***
LOBHealthcare                    -5.148e-01  3.132e-01  -1.644  0.10024
LOBINFRA                         -7.903e-01  1.681e-01  -4.702 2.58e-06 ***
LOBMMS                           -1.779e+01  2.105e+03  -0.008  0.99326
LocationBangalore                 1.582e+01  2.545e+03   0.006  0.99504
LocationChennai                   1.600e+01  2.545e+03   0.006  0.99499
LocationCochin                   -7.576e-01  3.683e+03   0.000  0.99984
LocationGurgaon                   1.598e+01  2.545e+03   0.006  0.99499
LocationHyderabad                 1.563e+01  2.545e+03   0.006  0.99510
LocationKolkata                   1.584e+01  2.545e+03   0.006  0.99503
LocationMumbai                    1.571e+01  2.545e+03   0.006  0.99507
LocationNoida                     1.561e+01  2.545e+03   0.006  0.99511
LocationOthers                   -7.071e-01  3.315e+03   0.000  0.99983
LocationPune                      1.621e+01  2.545e+03   0.006  0.99492
Age                              -3.040e-02  1.081e-02  -2.811  0.00493 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 6502.9 on 6746 degrees of freedom
Residual deviance: 5609.0 on 6713 degrees of freedom
AIC: 5677
Number of Fisher Scoring iterations: 17
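# (optional, not in the original run) McFadden's pseudo R-squared from the deviances
# reported above, 1 - 5609.0/6502.9, is about 0.137
pseudoR2 <- 1 - (logitClassifier1$deviance / logitClassifier1$null.deviance)
pseudoR2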
# prediction using classifier 1
predProbClass1 <- predict(logitClassifier1, type = 'response', newdata = testData[-1])
yPred1 <- ifelse(predProbClass1 > 0.5, "NotJoined", "Joined")
table(yPred1)
yPred1
Joined NotJoined
2176 72
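# (optional, not in the original run) the 0.5 cutoff labels very few candidates as
# "NotJoined"; a lower, purely illustrative cutoff such as 0.3 trades specificity for sensitivity
yPredLow1 <- ifelse(predProbClass1 > 0.3, "NotJoined", "Joined")
table(yPredLow1)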
# AIC of the classifier (the lower the AIC, the better the model)
AIC(logitClassifier1)
[1] 5677.021
# confusion matrix using classifier 1
confMatrix1 <- table(yActual = testData[, 17], yPred1)
confMatrix1
yPred1
yActual Joined NotJoined
Joined 1789 39
NotJoined 387 33
# confusion matrix for classifier 1 using MLmetrics
library(MLmetrics)
ConfusionMatrix(y_pred = yPred1, y_true = testData[, 17])
y_pred
y_true Joined NotJoined
Joined 1789 39
NotJoined 387 33
# accuracy using classifier 1
Accuracy(y_pred = yPred1, y_true = testData$Status)
[1] 0.8104982
# sensitivity using classifier 1
Sensitivity(y_true = testData$Status, y_pred = yPred1, positive = "NotJoined")
[1] 0.07857143
# specificity using classifier 1
Specificity(y_true = testData$Status, y_pred = yPred1, positive = "NotJoined")
[1] 0.9786652
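# (cross-check, not in the original run) the same metrics follow directly from confMatrix1:
# accuracy = (1789 + 33)/2248, sensitivity = 33/(387 + 33), specificity = 1789/(1789 + 39);
# since 1828 of the 2248 test cases are "Joined", always predicting "Joined" would already
# give about 81% accuracy, so the low sensitivity is the more informative number here
sum(diag(confMatrix1)) / sum(confMatrix1)                                # accuracy
confMatrix1["NotJoined", "NotJoined"] / sum(confMatrix1["NotJoined", ])  # sensitivity
confMatrix1["Joined", "Joined"] / sum(confMatrix1["Joined", ])           # specificity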
library(ROCR)
# Every classifier evaluation with ROCR starts by creating a prediction object,
# which transforms the input data into a standardized format.
PredictObject1 <- prediction(predProbClass1, testData$Status)
# All kinds of predictor evaluations are performed using the performance function
PerformObject1 <- performance(PredictObject1, "tpr","fpr")
# Plot the ROC curve for the HR joining classifier
plot(PerformObject1, main = "ROC Curve for HR Data", col = "black", lwd = 2)
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")
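# (optional, not in the original run) the area under the ROC curve summarises the plot
# in a single number; values near 0.5 indicate weak discrimination
aucObject1 <- performance(PredictObject1, "auc")
aucObject1@y.values[[1]]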