Solution Assignment 4

Mehul Garg

---

Importing Data

# Load the cleaned HR data set.
# stringsAsFactors = TRUE is set explicitly: since R 4.0.0 read.csv() reads
# text columns as character by default, whereas the rest of this analysis
# works with factor columns (see the str() output: Status, OfferedBand, ...).
default.df <- read.csv("C:/Users/lenovo/Desktop/MLM/CleanHRData.csv",
                       stringsAsFactors = TRUE)
# dimension of the data frame (rows, columns)
dim(default.df)
[1] 8995   17

Data Types of the Data Columns

# data types of the data columns:
# str() lists each column with its type and its first few values
str(default.df)
'data.frame':   8995 obs. of  17 variables:
 $ CandidateRef            : int  2110407 2112635 2112838 2115021 2115125 2117167 2119124 2127572 2138169 2143362 ...
 $ DOJExtended             : Factor w/ 2 levels "No","Yes": 2 1 1 1 2 2 2 2 1 1 ...
 $ DurationToAcceptOffer   : int  14 18 3 26 1 17 37 16 1 6 ...
 $ NoticePeriod            : int  30 30 45 30 120 30 30 0 30 30 ...
 $ OfferedBand             : Factor w/ 4 levels "E0","E1","E2",..: 3 3 3 3 3 2 3 2 2 2 ...
 $ PercentHikeExpectedInCTC: num  -20.8 50 42.8 42.8 42.6 ...
 $ PercentHikeOfferedInCTC : num  13.2 320 42.8 42.8 42.6 ...
 $ PercentDifferenceCTC    : num  42.9 180 0 0 0 ...
 $ JoiningBonus            : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
 $ CandidateRelocateActual : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 1 1 1 1 ...
 $ Gender                  : Factor w/ 2 levels "Female","Male": 1 2 2 2 2 2 2 1 1 2 ...
 $ CandidateSource         : Factor w/ 3 levels "Agency","Direct",..: 1 3 1 3 3 3 3 2 3 3 ...
 $ RexInYrs                : int  7 8 4 4 6 2 7 8 3 3 ...
 $ LOB                     : Factor w/ 9 levels "AXON","BFSI",..: 5 8 8 8 8 8 8 7 2 3 ...
 $ Location                : Factor w/ 11 levels "Ahmedabad","Bangalore",..: 9 3 9 9 9 9 9 9 5 3 ...
 $ Age                     : int  34 34 27 34 34 34 32 34 26 34 ...
 $ Status                  : Factor w/ 2 levels "Joined","NotJoined": 1 1 1 1 1 1 1 1 1 1 ...

DATA VISUALIZATION

Plots

library(ggplot2)
# Bar chart: number of candidates in each Status class.
# The column is referenced by name inside aes(); ggplot looks it up in `data`,
# so default.df$Status is not needed (and is discouraged inside aes()).
ggplot(data = default.df, aes(x = Status)) + geom_bar(stat = "count")

plot of chunk unnamed-chunk-3

# close the current graphics device
dev.off()
null device 
          1 

```

DATA PREPARATION

Creating Training and Testing Data

library(caTools)
# fix the random number seed so the split below is reproducible
set.seed(123)
# logical flag per row: TRUE for the 75% of rows that go to training
split <- sample.split(default.df$Status, SplitRatio = 0.75)
# keep the flagged rows as the training set
trainData <- default.df[split, ]
# dimensions of training data
dim(trainData)
[1] 6747   17
# the remaining 25% of rows (not flagged for training) form the test set
testData <- default.df[!split, ]
# dimensions of testing data
dim(testData)
[1] 2248   17

CLASSIFICATION USING BINOMIAL LOGISTIC MODEL

(Classification using Logistic Regression with glm())

Classifier 1

# fit logistic classifier 1: Status modelled on 14 predictors
# (CandidateRef and DOJExtended are not part of the formula)
logitClassifier1 <- glm(
  Status ~ DurationToAcceptOffer + NoticePeriod + OfferedBand +
    PercentHikeExpectedInCTC + PercentHikeOfferedInCTC +
    PercentDifferenceCTC + JoiningBonus + CandidateRelocateActual +
    Gender + CandidateSource + RexInYrs + LOB + Location + Age,
  family = binomial(),
  data = trainData
)
# coefficient table and fit statistics of classifier 1
summary(logitClassifier1)

Call:
glm(formula = Status ~ DurationToAcceptOffer + NoticePeriod + 
    OfferedBand + PercentHikeExpectedInCTC + PercentHikeOfferedInCTC + 
    PercentDifferenceCTC + JoiningBonus + CandidateRelocateActual + 
    Gender + CandidateSource + RexInYrs + LOB + Location + Age, 
    family = binomial(), data = trainData)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.57995  -0.69821  -0.51236  -0.00012   2.74317  

Coefficients:
                                   Estimate Std. Error z value Pr(>|z|)
(Intercept)                      -1.525e+01  2.545e+03  -0.006  0.99522
DurationToAcceptOffer            -1.633e-03  1.302e-03  -1.254  0.20984
NoticePeriod                      2.169e-02  1.628e-03  13.326  < 2e-16
OfferedBandE1                    -1.420e+00  2.215e-01  -6.408 1.47e-10
OfferedBandE2                    -1.354e+00  2.422e-01  -5.589 2.28e-08
OfferedBandE3                    -1.752e+00  3.184e-01  -5.502 3.76e-08
PercentHikeExpectedInCTC          4.316e-03  4.270e-03   1.011  0.31209
PercentHikeOfferedInCTC          -6.027e-03  4.358e-03  -1.383  0.16672
PercentDifferenceCTC              4.408e-03  5.809e-03   0.759  0.44798
JoiningBonusYes                   2.365e-01  1.705e-01   1.387  0.16532
CandidateRelocateActualYes       -1.729e+01  1.994e+02  -0.087  0.93090
GenderMale                        2.041e-01  9.087e-02   2.247  0.02467
CandidateSourceDirect            -3.240e-01  7.747e-02  -4.182 2.89e-05
CandidateSourceEmployee Referral -7.328e-01  1.136e-01  -6.448 1.13e-10
RexInYrs                          4.404e-02  2.309e-02   1.908  0.05642
LOBBFSI                          -4.750e-01  1.645e-01  -2.888  0.00388
LOBCSMP                          -3.763e-01  1.892e-01  -1.989  0.04668
LOBEAS                            1.340e-01  2.054e-01   0.653  0.51405
LOBERS                           -3.163e-01  1.566e-01  -2.021  0.04333
LOBETS                           -6.312e-01  1.858e-01  -3.398  0.00068
LOBHealthcare                    -5.148e-01  3.132e-01  -1.644  0.10024
LOBINFRA                         -7.903e-01  1.681e-01  -4.702 2.58e-06
LOBMMS                           -1.779e+01  2.105e+03  -0.008  0.99326
LocationBangalore                 1.582e+01  2.545e+03   0.006  0.99504
LocationChennai                   1.600e+01  2.545e+03   0.006  0.99499
LocationCochin                   -7.576e-01  3.683e+03   0.000  0.99984
LocationGurgaon                   1.598e+01  2.545e+03   0.006  0.99499
LocationHyderabad                 1.563e+01  2.545e+03   0.006  0.99510
LocationKolkata                   1.584e+01  2.545e+03   0.006  0.99503
LocationMumbai                    1.571e+01  2.545e+03   0.006  0.99507
LocationNoida                     1.561e+01  2.545e+03   0.006  0.99511
LocationOthers                   -7.071e-01  3.315e+03   0.000  0.99983
LocationPune                      1.621e+01  2.545e+03   0.006  0.99492
Age                              -3.040e-02  1.081e-02  -2.811  0.00493

(Intercept)                         
DurationToAcceptOffer               
NoticePeriod                     ***
OfferedBandE1                    ***
OfferedBandE2                    ***
OfferedBandE3                    ***
PercentHikeExpectedInCTC            
PercentHikeOfferedInCTC             
PercentDifferenceCTC                
JoiningBonusYes                     
CandidateRelocateActualYes          
GenderMale                       *  
CandidateSourceDirect            ***
CandidateSourceEmployee Referral ***
RexInYrs                         .  
LOBBFSI                          ** 
LOBCSMP                          *  
LOBEAS                              
LOBERS                           *  
LOBETS                           ***
LOBHealthcare                       
LOBINFRA                         ***
LOBMMS                              
LocationBangalore                   
LocationChennai                     
LocationCochin                      
LocationGurgaon                     
LocationHyderabad                   
LocationKolkata                     
LocationMumbai                      
LocationNoida                       
LocationOthers                      
LocationPune                        
Age                              ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 6502.9  on 6746  degrees of freedom
Residual deviance: 5609.0  on 6713  degrees of freedom
AIC: 5677

Number of Fisher Scoring iterations: 17

PREDICTION

Predicting Test Data results using Classifier 1

# predicted probabilities from classifier 1 on the test rows
# (testData[-1] drops the first column, CandidateRef)
predProbClass1 <- predict(
  logitClassifier1,
  newdata = testData[-1],
  type = "response"
)
# probabilities above 0.5 are labelled "NotJoined", all others "Joined"
yPred1 <- ifelse(predProbClass1 > 0.5, "NotJoined", "Joined")
# number of test rows per predicted class
table(yPred1)
yPred1
   Joined NotJoined 
     2176        72 

Akaike Information Criterion (AIC)

  • Measures how good the model is
  • The smaller the AIC, the better the model
# AIC of classifier 1 (same value as reported at the end of summary() above)
AIC(logitClassifier1)
[1] 5677.021

CONFUSION MATRIX

Confusion Matrix using Classifier 1

# confusion matrix using classifier 1: actual Status (rows) against the
# predicted class (columns). The response is selected by name rather than by
# position (column 17) so the code keeps working if the column order changes.
confMatrix1 <- table(yActual = testData$Status, yPred1)
confMatrix1
           yPred1
yActual     Joined NotJoined
  Joined      1789        39
  NotJoined    387        33

METRICS USING MLmetrics

(ACCURACY, SENSITIVITY, SPECIFICITY)

Confusion Matrix using Classifier 1

# confusion matrix using classifier 1, this time via MLmetrics.
# The true labels are selected by name (testData$Status) rather than by
# position (column 17), matching the other MLmetrics calls in this document.
library(MLmetrics)
ConfusionMatrix(y_pred = yPred1, y_true = testData$Status)
           y_pred
y_true      Joined NotJoined
  Joined      1789        39
  NotJoined    387        33

```

Accuracy using Classifier 1

# accuracy of classifier 1 on the test data:
# share of test rows whose predicted class equals the actual Status
library(MLmetrics)
Accuracy(y_true = testData$Status, y_pred = yPred1)
[1] 0.8104982

Sensitivity using Classifier 1

# sensitivity of classifier 1 on the test data,
# with "NotJoined" treated as the positive class
library(MLmetrics)
Sensitivity(
  y_true = testData$Status,
  y_pred = yPred1,
  positive = "NotJoined"
)
[1] 0.07857143

Specificity using Classifier 1

# specificity of classifier 1 on the test data,
# with "NotJoined" treated as the positive class
library(MLmetrics)
Specificity(
  y_true = testData$Status,
  y_pred = yPred1,
  positive = "NotJoined"
)
[1] 0.9786652

ROC PLOT

ROC Plot using Classifier 1

library(ROCR)
# Every classifier evaluation using ROCR starts with creating a prediction
# object. This function is used to transform the input data (predicted
# probabilities and true labels) into a standardized format.
PredictObject1 <- prediction(predProbClass1, testData$Status)

# All kinds of predictor evaluations are performed using the performance
# function; here the true positive rate is measured against the false
# positive rate.
PerformObject1 <- performance(PredictObject1, "tpr", "fpr")

# Plot the ROC curve for the HR data (the title previously referred to a
# credit card default data set), plus a dotted diagonal reference line.
plot(PerformObject1, main = "ROC Curve for HR Data", col = "black", lwd = 2)
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")

plot of chunk unnamed-chunk-15

ROC Curves of Three Classifiers

# ROC curve of classifier 1, drawn as a black line of width 2
plot(
  PerformObject1,
  main = "ROC Curve for HR Data",
  col = "black",
  lwd = 2
)