Adarsh Adwait
---
Consider a simple dataset of “Case Study: HR Data (Analysis and Visualization)”
# printing Dimensions
dim(hrdata)
[1] 8995 17
# structure of the dataset
str(hrdata)
'data.frame': 8995 obs. of 17 variables:
$ CandidateRef : int 2110407 2112635 2112838 2115021 2115125 2117167 2119124 2127572 2138169 2143362 ...
$ DOJExtended : Factor w/ 2 levels "No","Yes": 2 1 1 1 2 2 2 2 1 1 ...
$ DurationToAcceptOffer : int 14 18 3 26 1 17 37 16 1 6 ...
$ NoticePeriod : int 30 30 45 30 120 30 30 0 30 30 ...
$ OfferedBand : Factor w/ 4 levels "E0","E1","E2",..: 3 3 3 3 3 2 3 2 2 2 ...
$ PercentHikeExpectedInCTC: num -20.8 50 42.8 42.8 42.6 ...
$ PercentHikeOfferedInCTC : num 13.2 320 42.8 42.8 42.6 ...
$ PercentDifferenceCTC : num 42.9 180 0 0 0 ...
$ JoiningBonus : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
$ CandidateRelocateActual : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 1 1 1 1 ...
$ Gender : Factor w/ 2 levels "Female","Male": 1 2 2 2 2 2 2 1 1 2 ...
$ CandidateSource : Factor w/ 3 levels "Agency","Direct",..: 1 3 1 3 3 3 3 2 3 3 ...
$ RexInYrs : int 7 8 4 4 6 2 7 8 3 3 ...
$ LOB : Factor w/ 9 levels "AXON","BFSI",..: 5 8 8 8 8 8 8 7 2 3 ...
$ Location : Factor w/ 11 levels "Ahmedabad","Bangalore",..: 9 3 9 9 9 9 9 9 5 3 ...
$ Age : int 34 34 27 34 34 34 32 34 26 34 ...
$ Status : Factor w/ 2 levels "Joined","Not Joined": 1 1 1 1 1 1 1 1 1 1 ...
# structure of the dataset
psych::describe(hrdata)
vars n mean sd median trimmed
CandidateRef 1 8995 2843647.38 486344.77 2807482 2831309.11
DOJExtended* 2 8995 1.47 0.50 1 1.46
DurationToAcceptOffer 3 8995 21.43 25.81 10 16.64
NoticePeriod 4 8995 39.29 22.22 30 37.91
OfferedBand* 5 8995 2.39 0.63 2 2.32
PercentHikeExpectedInCTC 6 8995 43.86 29.79 40 40.92
PercentHikeOfferedInCTC 7 8995 40.66 36.06 36 36.70
PercentDifferenceCTC 8 8995 -1.57 19.61 0 -3.08
JoiningBonus* 9 8995 1.05 0.21 1 1.00
CandidateRelocateActual* 10 8995 1.14 0.35 1 1.05
Gender* 11 8995 1.83 0.38 2 1.91
CandidateSource* 12 8995 1.89 0.67 2 1.86
RexInYrs 13 8995 4.24 2.55 4 4.01
LOB* 14 8995 5.18 2.38 5 5.30
Location* 15 8995 4.94 3.00 3 4.78
Age 16 8995 29.91 4.10 29 29.86
Status* 17 8995 1.19 0.39 1 1.11
mad min max range skew
CandidateRef 668050.66 2109586.00 3836076.00 1726490.00 0.16
DOJExtended* 0.00 1.00 2.00 1.00 0.13
DurationToAcceptOffer 13.34 0.00 224.00 224.00 1.61
NoticePeriod 0.00 0.00 120.00 120.00 0.74
OfferedBand* 0.00 1.00 4.00 3.00 0.81
PercentHikeExpectedInCTC 19.53 -68.83 359.77 428.60 2.29
PercentHikeOfferedInCTC 20.76 -60.53 471.43 531.96 2.75
PercentDifferenceCTC 6.85 -67.27 300.00 367.27 4.80
JoiningBonus* 0.00 1.00 2.00 1.00 4.31
CandidateRelocateActual* 0.00 1.00 2.00 1.00 2.03
Gender* 0.00 1.00 2.00 1.00 -1.73
CandidateSource* 0.00 1.00 3.00 2.00 0.13
RexInYrs 1.48 0.00 24.00 24.00 1.29
LOB* 4.45 1.00 9.00 8.00 -0.22
Location* 1.48 1.00 11.00 10.00 0.50
Age 4.45 20.00 60.00 40.00 0.42
Status* 0.00 1.00 2.00 1.00 1.61
kurtosis se
CandidateRef -1.46 5127.95
DOJExtended* -1.98 0.01
DurationToAcceptOffer 2.65 0.27
NoticePeriod 0.79 0.23
OfferedBand* 0.30 0.01
PercentHikeExpectedInCTC 12.44 0.31
PercentHikeOfferedInCTC 15.92 0.38
PercentDifferenceCTC 45.55 0.21
JoiningBonus* 16.61 0.00
CandidateRelocateActual* 2.14 0.00
Gender* 1.01 0.00
CandidateSource* -0.81 0.01
RexInYrs 3.20 0.03
LOB* -1.23 0.03
Location* -1.56 0.03
Age 0.34 0.04
Status* 0.58 0.00
(ONE-WAY, TWO-WAY AND THREE-WAY CONTINGENCY TABLES)
round(prop.table(with(dataset, table(Status)))*100,2)
Status
Joined Not Joined
81.3 18.7
mytable <- xtabs(~ DOJExtended+Status, data=dataset)
addmargins(round(prop.table(mytable,1)*100,2),1)
Status
DOJExtended Joined Not Joined
No 81.08 18.92
Yes 81.55 18.45
Sum 162.63 37.37
mytable <- xtabs(~ NoticePeriod+Status, data=dataset)
addmargins(mytable)
Status
NoticePeriod Joined Not Joined Sum
0 726 51 777
30 4393 765 5158
45 397 129 526
60 1285 470 1755
75 75 35 110
90 415 212 627
120 22 20 42
Sum 7313 1682 8995
mytable <- xtabs(~ NoticePeriod+Status, data=dataset)
round(addmargins(prop.table(mytable,1)*100,1),2)
Status
NoticePeriod Joined Not Joined
0 93.44 6.56
30 85.17 14.83
45 75.48 24.52
60 73.22 26.78
75 68.18 31.82
90 66.19 33.81
120 52.38 47.62
Sum 514.05 185.95
mytable <- xtabs(~ JoiningBonus+Status, data=dataset)
round(addmargins(prop.table(mytable,1)*100,1),2)
Status
JoiningBonus Joined Not Joined
No 81.34 18.66
Yes 80.58 19.42
Sum 161.91 38.09
mytable <- xtabs(~ Gender+Status, data=dataset)
round(addmargins(prop.table(mytable,1)*100,1),2)
Status
Gender Joined Not Joined
Female 82.40 17.60
Male 81.07 18.93
Sum 163.47 36.53
mytable <- xtabs(~ CandidateSource+Status, data=dataset)
round(addmargins(prop.table(mytable,1)*100,1),2)
Status
CandidateSource Joined Not Joined
Agency 75.82 24.18
Direct 82.00 18.00
Employee Referral 88.00 12.00
Sum 245.83 54.17
mytable <- xtabs(~ OfferedBand+Status, data=dataset)
round(addmargins(prop.table(mytable,1)*100,1),2)
Status
OfferedBand Joined Not Joined
E0 76.30 23.70
E1 81.30 18.70
E2 80.97 19.03
E3 85.15 14.85
Sum 323.72 76.28
mytable <- xtabs(~ LOB+Status, data=dataset)
round(addmargins(prop.table(mytable,1)*100,1),2)
Status
LOB Joined Not Joined
AXON 77.46 22.54
BFSI 75.86 24.14
CSMP 81.52 18.48
EAS 73.41 26.59
ERS 78.11 21.89
ETS 83.07 16.93
Healthcare 82.26 17.74
INFRA 87.79 12.21
MMS 100.00 0.00
Sum 739.48 160.52
mytable <- xtabs(~ Location+Status, data=dataset)
addmargins(mytable)
Status
Location Joined Not Joined Sum
Ahmedabad 5 1 6
Bangalore 1742 488 2230
Chennai 2486 664 3150
Cochin 7 1 8
Gurgaon 118 28 146
Hyderabad 266 75 341
Kolkata 100 29 129
Mumbai 176 21 197
Noida 2362 365 2727
Others 13 0 13
Pune 38 10 48
Sum 7313 1682 8995
mytable <- xtabs(~ Location+Status, data=dataset)
round(addmargins(prop.table(mytable,1)*100,1),2)
Status
Location Joined Not Joined
Ahmedabad 83.33 16.67
Bangalore 78.12 21.88
Chennai 78.92 21.08
Cochin 87.50 12.50
Gurgaon 80.82 19.18
Hyderabad 78.01 21.99
Kolkata 77.52 22.48
Mumbai 89.34 10.66
Noida 86.62 13.38
Others 100.00 0.00
Pune 79.17 20.83
Sum 919.34 180.66
result<-aggregate(dataset$Age, by=list(Status=dataset$Status), FUN=mean)
names(result)[2]<-"AverageAgeofCandidates"
result$AverageAgeofCandidates<-round(result$AverageAgeofCandidates, 2)
result
Status AverageAgeofCandidates
1 Joined 30.00
2 Not Joined 29.52
result<-aggregate(dataset[c("Age","NoticePeriod")], by=list(Status=dataset$Status), FUN=mean)
names(result)[2:3]<-c("AverageAgeofCandidates","AverageNoticePeriod")
result[2:3]<-round(result[2:3],2)
result
Status AverageAgeofCandidates AverageNoticePeriod
1 Joined 30.00 37.24
2 Not Joined 29.52 48.19
result<-aggregate(dataset$Age, by=list(Status=dataset$Status,Gender=dataset$Gender), FUN=mean)
names(result)[3]<-"AverageAgeofCandidates"
result$AverageAgeofCandidates<-round(result$AverageAgeofCandidates, 2)
result
Status Gender AverageAgeofCandidates
1 Joined Female 29.09
2 Not Joined Female 28.02
3 Joined Male 30.20
4 Not Joined Male 29.81
result<-aggregate(dataset[c("Age","RexInYrs")], by=list(Status=dataset$Status,Gender=dataset$Gender), FUN=mean)
names(result)[3:4]<-c("AverageAgeofCandidates","YearsOfExperience")
result[3:4]<-round(result[3:4],2)
result
Status Gender AverageAgeofCandidates YearsOfExperience
1 Joined Female 29.09 3.45
2 Not Joined Female 28.02 3.61
3 Joined Male 30.20 4.35
4 Not Joined Male 29.81 4.60
result<-aggregate(dataset[c("Age","RexInYrs","DurationToAcceptOffer")], by=list(Status=dataset$Status,Gender=dataset$Gender), FUN=mean)
names(result)[3:4]<-c("AverageAgeofCandidates","YearsOfExperience")
result[3:5]<-round(result[3:5],2)
result
Status Gender AverageAgeofCandidates YearsOfExperience
1 Joined Female 29.09 3.45
2 Not Joined Female 28.02 3.61
3 Joined Male 30.20 4.35
4 Not Joined Male 29.81 4.60
DurationToAcceptOffer
1 19.43
2 24.10
3 20.88
4 25.12
result<-aggregate(dataset[c("Age","NoticePeriod")], by=list(Status=dataset$Status,Gender=dataset$Gender), FUN=mean)
names(result)[3]<-"AverageAgeofCandidates"
result[3:4]<-round(result[3:4],2)
result
Status Gender AverageAgeofCandidates NoticePeriod
1 Joined Female 29.09 35.48
2 Not Joined Female 28.02 46.65
3 Joined Male 30.20 37.62
4 Not Joined Male 29.81 48.49
# Pie Chart from data frame with Appended Sample Sizes
mytable <- table(Status)
pie(mytable,
main="Pie Chart of Percentage of Candidates who Joined / Not Joined")
Gender
Status Female Male
Joined 1278 6035
Not Joined 273 1409
OfferedBand
Status E0 E1 E2 E3
Joined 161 4527 2195 430
Not Joined 50 1041 516 75
JoiningBonus
Status No Yes
Joined 6977 336
Not Joined 1601 81
CandidateSource
Status Agency Direct Employee Referral
Joined 1960 3937 1416
Not Joined 625 864 193
Location
Status Ahmedabad Bangalore Chennai Cochin Gurgaon Hyderabad Kolkata
Joined 5 1742 2486 7 118 266 100
Not Joined 1 488 664 1 28 75 29
Location
Status Mumbai Noida Others Pune
Joined 176 2362 13 38
Not Joined 21 365 0 10
LOB
Status AXON BFSI CSMP EAS ERS ETS Healthcare INFRA MMS
Joined 440 1059 472 254 1895 574 102 2502 15
Not Joined 128 337 107 92 531 117 22 348 0
library(caTools)
library(caret)
# use set.seed to use the same random number sequence
set.seed(123)
# craeting 75% data for training
split <- sample.split(Status, SplitRatio = 0.75)
trainData <- subset(hrdata, split == TRUE)
# dimensions of training data
dim(trainData)
[1] 6747 17
# creating 25% data for testing
testData <- subset(hrdata, split == FALSE)
# dimensions of testing data
dim(testData)
[1] 2248 17
glm())
Call:
glm(formula = Status ~ PercentDifferenceCTC + DOJExtended + OfferedBand +
NoticePeriod + RexInYrs + LOB + CandidateRelocateActual,
family = binomial(), data = trainData)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.50722 -0.69675 -0.52765 -0.00012 2.50626
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -5.281e-01 2.512e-01 -2.102 0.035526 *
PercentDifferenceCTC -2.731e-03 1.975e-03 -1.383 0.166753
DOJExtendedYes -2.085e-01 6.970e-02 -2.992 0.002773 **
OfferedBandE1 -1.288e+00 2.121e-01 -6.075 1.24e-09 ***
OfferedBandE2 -1.257e+00 2.330e-01 -5.397 6.79e-08 ***
OfferedBandE3 -1.662e+00 3.082e-01 -5.392 6.98e-08 ***
NoticePeriod 2.169e-02 1.525e-03 14.229 < 2e-16 ***
RexInYrs 2.069e-02 2.057e-02 1.006 0.314503
LOBBFSI -3.553e-01 1.515e-01 -2.346 0.018974 *
LOBCSMP -4.261e-01 1.842e-01 -2.314 0.020685 *
LOBEAS 2.296e-01 1.982e-01 1.159 0.246606
LOBERS -9.486e-02 1.434e-01 -0.662 0.508159
LOBETS -5.960e-01 1.801e-01 -3.310 0.000934 ***
LOBHealthcare -4.420e-01 3.074e-01 -1.438 0.150429
LOBINFRA -8.630e-01 1.572e-01 -5.490 4.03e-08 ***
LOBMMS -1.771e+01 2.131e+03 -0.008 0.993368
CandidateRelocateActualYes -1.729e+01 2.008e+02 -0.086 0.931360
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 6502.9 on 6746 degrees of freedom
Residual deviance: 5693.1 on 6730 degrees of freedom
AIC: 5727.1
Number of Fisher Scoring iterations: 17
# prediction usinf classifier 1
predProbClass1 <- predict(logitClassifier1, type = 'response', newdata = testData[-17])
yPred1 <- ifelse(predProbClass1 > 0.5, "Joined", "Not Joined")
table(yPred1)
yPred1
Joined Not Joined
54 2194
AIC(logitClassifier1)
[1] 5727.065
library(MLmetrics)
ConfusionMatrix(yPred1, testData[, 17])
y_pred
y_true Joined Not Joined
Joined 31 1797
Not Joined 23 397
Accuracy(yPred1, testData$Status)
[1] 0.1903915
Sensitivity(yPred1, testData$Status, positive = "Joined")
[1] 0.5740741
Specificity(yPred1, testData$Status, positive = "Joined")
[1] 0.180948