rm(list = ls())
empdata_orig=fread("E:\\AbhinavB\\Kaggle\\IBM HR Analytics Employee Attrition\\ibm-hr-analytics-employee-attrition-performance\\WA_Fn-UseC_-HR-Employee-Attrition.csv",
data.table = FALSE,
colClasses =c("integer","factor","factor","integer","factor","integer","factor","factor","integer","integer",
"factor","factor","integer","factor","factor","factor","factor","factor","integer","integer",
"integer","factor","factor","integer","factor","factor","integer","factor","integer","integer",
"factor","integer","integer","integer","integer" ))
empdata=empdata_orig
glimpse(empdata)
## Observations: 1,470
## Variables: 35
## $ Age <int> 41, 49, 37, 33, 27, 32, 59, 30, 38, 3...
## $ Attrition <fct> Yes, No, Yes, No, No, No, No, No, No,...
## $ BusinessTravel <fct> Travel_Rarely, Travel_Frequently, Tra...
## $ DailyRate <int> 1102, 279, 1373, 1392, 591, 1005, 132...
## $ Department <fct> Sales, Research & Development, Resear...
## $ DistanceFromHome <int> 1, 8, 2, 3, 2, 2, 3, 24, 23, 27, 16, ...
## $ Education <fct> 2, 1, 2, 4, 1, 2, 3, 1, 3, 3, 3, 2, 1...
## $ EducationField <fct> Life Sciences, Life Sciences, Other, ...
## $ EmployeeCount <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ EmployeeNumber <int> 1, 2, 4, 5, 7, 8, 10, 11, 12, 13, 14,...
## $ EnvironmentSatisfaction <fct> 2, 3, 4, 4, 1, 4, 3, 4, 4, 3, 1, 4, 1...
## $ Gender <fct> Female, Male, Male, Female, Male, Mal...
## $ HourlyRate <int> 94, 61, 92, 56, 40, 79, 81, 67, 44, 9...
## $ JobInvolvement <fct> 3, 2, 2, 3, 3, 3, 4, 3, 2, 3, 4, 2, 3...
## $ JobLevel <fct> 2, 2, 1, 1, 1, 1, 1, 1, 3, 2, 1, 2, 1...
## $ JobRole <fct> Sales Executive, Research Scientist, ...
## $ JobSatisfaction <fct> 4, 2, 3, 3, 2, 4, 1, 3, 3, 3, 2, 3, 3...
## $ MaritalStatus <fct> Single, Married, Single, Married, Mar...
## $ MonthlyIncome <int> 5993, 5130, 2090, 2909, 3468, 3068, 2...
## $ MonthlyRate <int> 19479, 24907, 2396, 23159, 16632, 118...
## $ NumCompaniesWorked <int> 8, 1, 6, 1, 9, 0, 4, 1, 0, 6, 0, 0, 1...
## $ Over18 <fct> Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y...
## $ OverTime <fct> Yes, No, Yes, Yes, No, No, Yes, No, N...
## $ PercentSalaryHike <int> 11, 23, 15, 11, 12, 13, 20, 22, 21, 1...
## $ PerformanceRating <fct> 3, 4, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 3...
## $ RelationshipSatisfaction <fct> 1, 4, 2, 3, 4, 3, 1, 2, 2, 2, 3, 4, 4...
## $ StandardHours <int> 80, 80, 80, 80, 80, 80, 80, 80, 80, 8...
## $ StockOptionLevel <fct> 0, 1, 0, 0, 1, 0, 3, 1, 0, 2, 1, 0, 1...
## $ TotalWorkingYears <int> 8, 10, 7, 8, 6, 8, 12, 1, 10, 17, 6, ...
## $ TrainingTimesLastYear <int> 0, 3, 3, 3, 3, 2, 3, 2, 2, 3, 5, 3, 1...
## $ WorkLifeBalance <fct> 1, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 2...
## $ YearsAtCompany <int> 6, 10, 0, 8, 2, 7, 1, 1, 9, 7, 5, 9, ...
## $ YearsInCurrentRole <int> 4, 7, 0, 7, 2, 7, 0, 0, 7, 7, 4, 5, 2...
## $ YearsSinceLastPromotion <int> 0, 1, 0, 3, 2, 3, 0, 0, 1, 7, 0, 0, 4...
## $ YearsWithCurrManager <int> 5, 7, 0, 0, 2, 6, 0, 0, 8, 7, 3, 8, 3...
t(sapply(empdata,class))
## Age Attrition BusinessTravel DailyRate Department
## [1,] "integer" "factor" "factor" "integer" "factor"
## DistanceFromHome Education EducationField EmployeeCount
## [1,] "integer" "factor" "factor" "integer"
## EmployeeNumber EnvironmentSatisfaction Gender HourlyRate
## [1,] "integer" "factor" "factor" "integer"
## JobInvolvement JobLevel JobRole JobSatisfaction MaritalStatus
## [1,] "factor" "factor" "factor" "factor" "factor"
## MonthlyIncome MonthlyRate NumCompaniesWorked Over18 OverTime
## [1,] "integer" "integer" "integer" "factor" "factor"
## PercentSalaryHike PerformanceRating RelationshipSatisfaction
## [1,] "integer" "factor" "factor"
## StandardHours StockOptionLevel TotalWorkingYears
## [1,] "integer" "factor" "integer"
## TrainingTimesLastYear WorkLifeBalance YearsAtCompany
## [1,] "integer" "factor" "integer"
## YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
## [1,] "integer" "integer" "integer"
empdata$AgeGroups=as.factor(ifelse(empdata$Age<=24,"Young",ifelse((empdata$Age>24 & empdata$Age<=54),"Middle Aged","Senior Citizen")))
table(empdata$AgeGroups)
##
## Middle Aged Senior Citizen Young
## 1304 69 97
empdata %>% ggplot(aes(x=empdata$AgeGroups))+geom_histogram(stat = "count")
#### Age group wise count with Attrition and non attrition employees-figure
empdata %>% ggplot(aes(x=empdata$AgeGroups))+geom_histogram(aes(color=empdata$Attrition),stat = "count",fill="white")
#We can also analyse this with side-by-side bar chart-figure
empdata %>% ggplot(aes(x=empdata$AgeGroups))+geom_bar(aes(fill=empdata$Attrition),position = position_dodge(),color="black")
##Business travel wise count with attrition and non attrition employees
table(empdata$BusinessTravel)
##
## Non-Travel Travel_Frequently Travel_Rarely
## 150 277 1043
e2=empdata %>% group_by(empdata$BusinessTravel,empdata$Attrition) %>% summarise(count=n())
e2
## # A tibble: 6 x 3
## # Groups: empdata$BusinessTravel [3]
## `empdata$BusinessTravel` `empdata$Attrition` count
## <fct> <fct> <int>
## 1 Non-Travel No 138
## 2 Non-Travel Yes 12
## 3 Travel_Frequently No 208
## 4 Travel_Frequently Yes 69
## 5 Travel_Rarely No 887
## 6 Travel_Rarely Yes 156
empdata %>% ggplot(aes(x=empdata$BusinessTravel))+geom_bar(aes(fill=empdata$Attrition),position=position_dodge(),color="black")
We see that “TravelFrequently” has a propensity towards attrition
## Education level wise breakup
unique(empdata$Education)
## [1] 2 1 4 3 5
## Levels: 1 2 3 4 5
empdata %>% ggplot(aes(x=empdata$EducationField))+geom_bar(aes(fill=empdata$Attrition),position = position_dodge(),color="black")+coord_flip()
table(empdata$Department)
##
## Human Resources Research & Development Sales
## 63 961 446
rm(e3)
e3=empdata %>% group_by(empdata$Department,empdata$Attrition) %>% summarise(count=n()) %>% mutate(grp_pct=count/sum(count)*100)
e3
## # A tibble: 6 x 4
## # Groups: empdata$Department [3]
## `empdata$Department` `empdata$Attrition` count grp_pct
## <fct> <fct> <int> <dbl>
## 1 Human Resources No 51 81.0
## 2 Human Resources Yes 12 19.0
## 3 Research & Development No 828 86.2
## 4 Research & Development Yes 133 13.8
## 5 Sales No 354 79.4
## 6 Sales Yes 92 20.6
empdata %>% ggplot(aes(x=empdata$Department))+geom_bar(aes(fill=empdata$Attrition),position = position_dodge(),color="blue")+coord_flip()
empdata %>% ggplot(aes(x=empdata$OverTime))+geom_bar(aes(fill=empdata$Attrition),position = position_dodge(),color="grey")+facet_grid(empdata$Gender~.)
#### Check for missing values in the dataset
colnames(empdata)[colSums(is.na(empdata)>0)]
## character(0)
No missing values in the dataset ,so we don’t have to worry about imputing
k=lapply(empdata,function(x) {length(unique(x))})
w=which(!k>1)
names(w)
## [1] "EmployeeCount" "Over18" "StandardHours"
So, will remove these columns from our analysis
empdata=empdata[,-which(names(empdata) %in% names(w))]
dim(empdata)
## [1] 1470 33
Adding a few features
empdata$TotalSatisfaction=as.numeric(empdata$EnvironmentSatisfaction)+as.numeric(empdata$JobInvolvement)+as.numeric(empdata$JobSatisfaction)+as.numeric(empdata$RelationshipSatisfaction)+as.numeric(empdata$WorkLifeBalance)
summary(empdata$TotalSatisfaction)
##Adding Low or High Income indicator if Monthly income less or greater than mean
empdata$IncomeInd=as.factor(ifelse(empdata$MonthlyIncome<mean(empdata$MonthlyIncome),"Low","High"))
empdata$IncomeInd=as.numeric(factor(empdata$IncomeInd),levels=levels(empdata$IncomeInd))-1
##dropping unique column as its not useful for modelling
empdata=empdata[,-which(names(empdata) %in% c("EmployeeNumber"))]
glimpse(empdata)
Since many ML algorithms donot work well with factor/categorical variables,hence converting to one hot encoded
dmy=dummyVars("Attrition ~ . ",data=empdata)
tnsf=data.frame(predict(dmy,newdata=empdata))
empdata$Attrition=as.numeric(factor(empdata$Attrition),levels=levels(empdata$Attrition))-1
tnsf$Attrition=empdata$Attrition
glimpse(tnsf)
empdata=tnsf
XGBoost train and test set preparation
Fitting an xgboost model to training dataset
set.seed(111)
xgb_mod1=xgb.train(params = param,
data=dtrain,
watchlist = watch,
nrounds = 600,
verbose = 1)
Now making prediction on unseen data and plotting AUC
##making predictions on unseen data
finalpred=predict(xgb_mod1,newdata=dtest,type="prob")
length(finalpred)
## [1] 588
res.rocxg=roc(actual_labels,finalpred)
plot.roc(res.rocxg,print.auc = TRUE,print.thres = "best")
pROC::auc(res.rocxg)
## Area under the curve: 0.6175
As we can see we the AUC to be approximately 0.62 with xgboost
Fitting a random forest model and making prediction on the test set
rf_mod1=train(Attrition ~.,
data=rftrain,
method="rf",
metric="ROC",
trControl=trcontrolobj,
verbose=T)
pred_rf=predict(rf_mod1,newdata=rftest,type="raw")
length(pred_rf)
mydf=data.frame(predicted=pred_rf,actuals=actual_labels)
table(mydf$predicted,mydf$actuals)
pred_rfRoc=ifelse(pred_rf=="X1",1,0)
res.rocrf=roc(actual_labels,pred_rfRoc)
plotting the auc obtained from the randomforest model
plot.roc(res.rocrf,print.auc = TRUE,print.thres = "best")
pROC::auc(res.rocrf)
## Area under the curve: 0.5211
So here are the results,an auc of 0.52 with random forest model.
Let’s checkout the important variables identified by random forest
a=varImp(rf_mod1)
plot(a,top=20)
As we can see monthly income,overtime,Age,Distance from home are identified as important factors in when an employee decides to leave the company
First we will use logistic regression model to do Feature Selection and then make a refined LR model and check accuracy and AUC.
Initial LR model
set.seed(111)
dim(empdata)
logtrain=Train_set
prop.table(table(logtrain$Attrition))
logi_mod1=glm(Attrition ~ .,data=logtrain,family = binomial)
summary(logi_mod1)
Lets check the important variables indicated by this model
imp=varImp(logi_mod1)
print(imp)
## Overall
## Age 3.075042535
## BusinessTravel.Non.Travel 1.366757358
## BusinessTravel.Travel_Frequently 3.669000705
## DailyRate 2.206585978
## Department.Human.Resources 0.026478388
## Department.Research...Development 0.510530530
## DistanceFromHome 4.331830626
## Education.1 0.280561514
## Education.2 0.644389540
## Education.3 0.319117822
## Education.4 0.865774325
## EducationField.Human.Resources 0.644755512
## EducationField.Life.Sciences 2.293777829
## EducationField.Marketing 0.332544983
## EducationField.Medical 2.201660148
## EducationField.Other 1.316022543
## EnvironmentSatisfaction.1 3.608379793
## EnvironmentSatisfaction.2 1.363727346
## EnvironmentSatisfaction.3 1.237723894
## Gender.Female 2.245680628
## HourlyRate 0.296585478
## JobInvolvement.1 3.618969939
## JobInvolvement.2 1.698665855
## JobInvolvement.3 0.938863138
## JobLevel.1 1.392205514
## JobLevel.2 2.222550668
## JobLevel.3 2.190508988
## JobLevel.4 2.870336360
## JobRole.Healthcare.Representative 0.014342691
## JobRole.Human.Resources 0.024336742
## JobRole.Laboratory.Technician 0.531067925
## JobRole.Manager 0.501956630
## JobRole.Manufacturing.Director 0.558616941
## JobRole.Research.Director 0.007145477
## JobRole.Research.Scientist 0.370560606
## JobRole.Sales.Executive 0.942971925
## JobSatisfaction.1 4.183237159
## JobSatisfaction.2 1.841315540
## JobSatisfaction.3 3.031223822
## MaritalStatus.Divorced 0.894278244
## MaritalStatus.Married 0.928142766
## MonthlyIncome 1.992292711
## MonthlyRate 1.291500236
## NumCompaniesWorked 3.508113705
## OverTime.No 8.103934029
## PercentSalaryHike 0.845534390
## PerformanceRating.3 0.773107745
## RelationshipSatisfaction.1 2.744503044
## RelationshipSatisfaction.2 0.027356462
## RelationshipSatisfaction.3 0.183301971
## StockOptionLevel.0 0.809075496
## StockOptionLevel.1 1.227529418
## StockOptionLevel.2 1.354599318
## TotalWorkingYears 1.397849858
## TrainingTimesLastYear 0.486644938
## WorkLifeBalance.1 2.150215616
## WorkLifeBalance.2 0.416885872
## WorkLifeBalance.3 1.315839660
## YearsAtCompany 2.130209315
## YearsInCurrentRole 3.620476540
## YearsSinceLastPromotion 3.231134912
## YearsWithCurrManager 1.414472224
## AgeGroups.Middle.Aged 0.449113286
## AgeGroups.Senior.Citizen 2.207772049
## IncomeInd 2.022004184
Now, we will create a refined trainset,using just the important variables
logtrain_refined=logtrain[,which(names(logtrain) %in% c(rownames(imp),"Attrition"))]
dim(logtrain_refined)
dim(lrtest)
lrtest_refined=lrtest[,which(names(lrtest) %in% c(rownames(imp)))]
logi_mod2=glm(Attrition ~.,data=logtrain_refined,family = binomial)
summary(logi_mod2)
pred_lr=predict(logi_mod2,newdata = lrtest_refined,type="response")
length(pred_lr)
res.roclr=roc(actual_labels,pred_lr)
t2=coords(res.rocxg,"best","threshold",transpose=FALSE)
thresh=t2[1,1]
finalpred_classes=ifelse(finalpred>thresh,1,0)
Checking the results of this refined variable set model
pROC::auc(res.roclr)
## Area under the curve: 0.8393
plot.roc(res.roclr,print.auc = TRUE,print.thres = "best")
confusionMatrix(as.factor(finalpred_classes),as.factor(actual_labels))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 377 52
## 1 116 43
##
## Accuracy : 0.7143
## 95% CI : (0.6759, 0.7505)
## No Information Rate : 0.8384
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1709
## Mcnemar's Test P-Value : 1.171e-06
##
## Sensitivity : 0.7647
## Specificity : 0.4526
## Pos Pred Value : 0.8788
## Neg Pred Value : 0.2704
## Prevalence : 0.8384
## Detection Rate : 0.6412
## Detection Prevalence : 0.7296
## Balanced Accuracy : 0.6087
##
## 'Positive' Class : 0
##
Lets do a model comparison now…coming soon..