R Markdown report for the IBM HR Employee Attrition dataset

Load the employee data set

rm(list = ls())
## Packages used throughout this report
library(data.table)  # fread()
library(dplyr)       # glimpse(), %>%, group_by()/summarise()
library(ggplot2)     # plots
library(caret)       # dummyVars(), train(), varImp(), confusionMatrix()
library(xgboost)     # xgb.train(), xgb.DMatrix()
library(pROC)        # roc(), plot.roc(), auc()
empdata_orig = fread("E:\\AbhinavB\\Kaggle\\IBM HR Analytics Employee Attrition\\ibm-hr-analytics-employee-attrition-performance\\WA_Fn-UseC_-HR-Employee-Attrition.csv",
                     data.table = FALSE,
                     colClasses = c("integer","factor","factor","integer","factor","integer","factor","factor","integer","integer",
                                    "factor","factor","integer","factor","factor","factor","factor","factor","integer","integer",
                                    "integer","factor","factor","integer","factor","factor","integer","factor","integer","integer",
                                    "factor","integer","integer","integer","integer"))



empdata=empdata_orig

Let's check out the data and the class of each column

glimpse(empdata)
## Observations: 1,470
## Variables: 35
## $ Age                      <int> 41, 49, 37, 33, 27, 32, 59, 30, 38, 3...
## $ Attrition                <fct> Yes, No, Yes, No, No, No, No, No, No,...
## $ BusinessTravel           <fct> Travel_Rarely, Travel_Frequently, Tra...
## $ DailyRate                <int> 1102, 279, 1373, 1392, 591, 1005, 132...
## $ Department               <fct> Sales, Research & Development, Resear...
## $ DistanceFromHome         <int> 1, 8, 2, 3, 2, 2, 3, 24, 23, 27, 16, ...
## $ Education                <fct> 2, 1, 2, 4, 1, 2, 3, 1, 3, 3, 3, 2, 1...
## $ EducationField           <fct> Life Sciences, Life Sciences, Other, ...
## $ EmployeeCount            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ EmployeeNumber           <int> 1, 2, 4, 5, 7, 8, 10, 11, 12, 13, 14,...
## $ EnvironmentSatisfaction  <fct> 2, 3, 4, 4, 1, 4, 3, 4, 4, 3, 1, 4, 1...
## $ Gender                   <fct> Female, Male, Male, Female, Male, Mal...
## $ HourlyRate               <int> 94, 61, 92, 56, 40, 79, 81, 67, 44, 9...
## $ JobInvolvement           <fct> 3, 2, 2, 3, 3, 3, 4, 3, 2, 3, 4, 2, 3...
## $ JobLevel                 <fct> 2, 2, 1, 1, 1, 1, 1, 1, 3, 2, 1, 2, 1...
## $ JobRole                  <fct> Sales Executive, Research Scientist, ...
## $ JobSatisfaction          <fct> 4, 2, 3, 3, 2, 4, 1, 3, 3, 3, 2, 3, 3...
## $ MaritalStatus            <fct> Single, Married, Single, Married, Mar...
## $ MonthlyIncome            <int> 5993, 5130, 2090, 2909, 3468, 3068, 2...
## $ MonthlyRate              <int> 19479, 24907, 2396, 23159, 16632, 118...
## $ NumCompaniesWorked       <int> 8, 1, 6, 1, 9, 0, 4, 1, 0, 6, 0, 0, 1...
## $ Over18                   <fct> Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y...
## $ OverTime                 <fct> Yes, No, Yes, Yes, No, No, Yes, No, N...
## $ PercentSalaryHike        <int> 11, 23, 15, 11, 12, 13, 20, 22, 21, 1...
## $ PerformanceRating        <fct> 3, 4, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 3...
## $ RelationshipSatisfaction <fct> 1, 4, 2, 3, 4, 3, 1, 2, 2, 2, 3, 4, 4...
## $ StandardHours            <int> 80, 80, 80, 80, 80, 80, 80, 80, 80, 8...
## $ StockOptionLevel         <fct> 0, 1, 0, 0, 1, 0, 3, 1, 0, 2, 1, 0, 1...
## $ TotalWorkingYears        <int> 8, 10, 7, 8, 6, 8, 12, 1, 10, 17, 6, ...
## $ TrainingTimesLastYear    <int> 0, 3, 3, 3, 3, 2, 3, 2, 2, 3, 5, 3, 1...
## $ WorkLifeBalance          <fct> 1, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 2...
## $ YearsAtCompany           <int> 6, 10, 0, 8, 2, 7, 1, 1, 9, 7, 5, 9, ...
## $ YearsInCurrentRole       <int> 4, 7, 0, 7, 2, 7, 0, 0, 7, 7, 4, 5, 2...
## $ YearsSinceLastPromotion  <int> 0, 1, 0, 3, 2, 3, 0, 0, 1, 7, 0, 0, 4...
## $ YearsWithCurrManager     <int> 5, 7, 0, 0, 2, 6, 0, 0, 8, 7, 3, 8, 3...
t(sapply(empdata,class))
##      Age       Attrition BusinessTravel DailyRate Department
## [1,] "integer" "factor"  "factor"       "integer" "factor"  
##      DistanceFromHome Education EducationField EmployeeCount
## [1,] "integer"        "factor"  "factor"       "integer"    
##      EmployeeNumber EnvironmentSatisfaction Gender   HourlyRate
## [1,] "integer"      "factor"                "factor" "integer" 
##      JobInvolvement JobLevel JobRole  JobSatisfaction MaritalStatus
## [1,] "factor"       "factor" "factor" "factor"        "factor"     
##      MonthlyIncome MonthlyRate NumCompaniesWorked Over18   OverTime
## [1,] "integer"     "integer"   "integer"          "factor" "factor"
##      PercentSalaryHike PerformanceRating RelationshipSatisfaction
## [1,] "integer"         "factor"          "factor"                
##      StandardHours StockOptionLevel TotalWorkingYears
## [1,] "integer"     "factor"         "integer"        
##      TrainingTimesLastYear WorkLifeBalance YearsAtCompany
## [1,] "integer"             "factor"        "integer"     
##      YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
## [1,] "integer"          "integer"               "integer"

Create an age group feature from the Age field

empdata$AgeGroups = as.factor(ifelse(empdata$Age <= 24, "Young",
                                     ifelse(empdata$Age > 24 & empdata$Age <= 54, "Middle Aged",
                                            "Senior Citizen")))
table(empdata$AgeGroups)
## 
##    Middle Aged Senior Citizen          Young 
##           1304             69             97
Graph showing the total count by age group:
empdata %>% ggplot(aes(x = AgeGroups)) + geom_bar()

#### Age group wise count of attrition and non-attrition employees

empdata %>% ggplot(aes(x = AgeGroups)) + geom_bar(aes(color = Attrition), fill = "white")

We can also analyse this with a side-by-side bar chart:

empdata %>% ggplot(aes(x = AgeGroups)) + geom_bar(aes(fill = Attrition), position = position_dodge(), color = "black")

## Business travel wise count of attrition and non-attrition employees

table(empdata$BusinessTravel)
## 
##        Non-Travel Travel_Frequently     Travel_Rarely 
##               150               277              1043
e2=empdata %>% group_by(empdata$BusinessTravel,empdata$Attrition) %>% summarise(count=n())
e2
## # A tibble: 6 x 3
## # Groups:   empdata$BusinessTravel [3]
##   `empdata$BusinessTravel` `empdata$Attrition` count
##   <fct>                    <fct>               <int>
## 1 Non-Travel               No                    138
## 2 Non-Travel               Yes                    12
## 3 Travel_Frequently        No                    208
## 4 Travel_Frequently        Yes                    69
## 5 Travel_Rarely            No                    887
## 6 Travel_Rarely            Yes                   156
empdata %>% ggplot(aes(x = BusinessTravel)) + geom_bar(aes(fill = Attrition), position = position_dodge(), color = "black")

We see that the “Travel_Frequently” group has a higher propensity towards attrition.
## Education level and education field wise breakup

unique(empdata$Education)
## [1] 2 1 4 3 5
## Levels: 1 2 3 4 5
empdata %>% ggplot(aes(x = EducationField)) + geom_bar(aes(fill = Attrition), position = position_dodge(), color = "black") + coord_flip()

Department wise breakup of employee attrition

table(empdata$Department)
## 
##        Human Resources Research & Development                  Sales 
##                     63                    961                    446
e3=empdata %>% group_by(empdata$Department,empdata$Attrition) %>% summarise(count=n()) %>% mutate(grp_pct=count/sum(count)*100)
e3
## # A tibble: 6 x 4
## # Groups:   empdata$Department [3]
##   `empdata$Department`   `empdata$Attrition` count grp_pct
##   <fct>                  <fct>               <int>   <dbl>
## 1 Human Resources        No                     51    81.0
## 2 Human Resources        Yes                    12    19.0
## 3 Research & Development No                    828    86.2
## 4 Research & Development Yes                   133    13.8
## 5 Sales                  No                    354    79.4
## 6 Sales                  Yes                    92    20.6
empdata %>% ggplot(aes(x = Department)) + geom_bar(aes(fill = Attrition), position = position_dodge(), color = "blue") + coord_flip()

Overtime and attrition relationship, faceted by gender (two panels)

empdata %>% ggplot(aes(x = OverTime)) + geom_bar(aes(fill = Attrition), position = position_dodge(), color = "grey") + facet_grid(Gender ~ .)

#### Check for missing values in the dataset

colnames(empdata)[colSums(is.na(empdata)) > 0]
## character(0)

There are no missing values in the dataset, so we don't have to worry about imputation.

Check for zero-variance (constant) columns

k = sapply(empdata, function(x) length(unique(x)))
w = which(k <= 1)
names(w)
## [1] "EmployeeCount" "Over18"        "StandardHours"

So we will remove these columns from our analysis

empdata=empdata[,-which(names(empdata) %in% names(w))]
dim(empdata)
## [1] 1470   33

Adding a few features

empdata$TotalSatisfaction = as.numeric(empdata$EnvironmentSatisfaction) + as.numeric(empdata$JobInvolvement) +
  as.numeric(empdata$JobSatisfaction) + as.numeric(empdata$RelationshipSatisfaction) +
  as.numeric(empdata$WorkLifeBalance)

summary(empdata$TotalSatisfaction)
## Add a Low/High income indicator: Low if MonthlyIncome is below the mean, High otherwise
empdata$IncomeInd=as.factor(ifelse(empdata$MonthlyIncome<mean(empdata$MonthlyIncome),"Low","High"))
## Convert to a 0/1 numeric indicator (factor levels are alphabetical, so High = 0, Low = 1)
empdata$IncomeInd = as.integer(empdata$IncomeInd) - 1
## Drop EmployeeNumber: a unique identifier, not useful for modelling
empdata=empdata[,-which(names(empdata) %in% c("EmployeeNumber"))]
glimpse(empdata)

Since many ML algorithms do not work well with factor/categorical variables, we convert them to one-hot encoded (dummy) variables

dmy=dummyVars("Attrition ~ . ",data=empdata)
tnsf=data.frame(predict(dmy,newdata=empdata))
empdata$Attrition = as.integer(empdata$Attrition) - 1  # recode factor to numeric: No = 0, Yes = 1
tnsf$Attrition=empdata$Attrition
glimpse(tnsf)
empdata=tnsf

XGBoost train and test set preparation
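
The chunk below uses `param`, `dtrain`, `dtest`, `watch` and (later) `actual_labels`, which are not defined in the report. Here is a minimal sketch of how they could be prepared, assuming a 60/40 train/test split of the one-hot encoded data (consistent with the 588 test-set predictions reported further down); the object names `train_set`/`test_set` and the parameter values are illustrative assumptions, not the original settings.

## Assumed preparation (not shown in the report): 60/40 split of the
## one-hot encoded data, giving 588 test rows as reported below
set.seed(111)
train_idx = sample(seq_len(nrow(empdata)), size = 0.6 * nrow(empdata))
train_set = empdata[train_idx, ]
test_set  = empdata[-train_idx, ]
actual_labels = test_set$Attrition

## xgboost works on numeric matrices, so predictors and labels are separated
dtrain = xgb.DMatrix(data  = as.matrix(train_set[, names(train_set) != "Attrition"]),
                     label = train_set$Attrition)
dtest  = xgb.DMatrix(data  = as.matrix(test_set[, names(test_set) != "Attrition"]),
                     label = test_set$Attrition)
watch = list(train = dtrain, test = dtest)

## Illustrative parameters only; the values actually used are not shown in the report
param = list(objective   = "binary:logistic",
             eval_metric = "auc",
             eta         = 0.01,
             max_depth   = 6)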

Fitting an XGBoost model to the training dataset

set.seed(111)
xgb_mod1=xgb.train(params = param,
                   data=dtrain,
                   watchlist = watch,
                   nrounds = 600,
                   verbose = 1)

Now we make predictions on the unseen test data and plot the ROC curve with its AUC

## Making predictions on unseen data; with objective = "binary:logistic",
## predict() returns class probabilities directly
finalpred = predict(xgb_mod1, newdata = dtest)
length(finalpred)
## [1] 588
res.rocxg=roc(actual_labels,finalpred)
plot.roc(res.rocxg,print.auc = TRUE,print.thres = "best")

pROC::auc(res.rocxg)
## Area under the curve: 0.6175

As we can see, the AUC is approximately 0.62 with XGBoost.

Random Forest

Fitting a random forest model and making predictions on the test set
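
The objects `rftrain`, `rftest` and `trcontrolobj` used below are not defined in the report. A minimal sketch, reusing the split assumed in the XGBoost section; the fold count and the `X0`/`X1` level labels are assumptions (caret requires a factor outcome with syntactically valid level names when class probabilities are computed):

## Assumed setup, reusing train_set/test_set from the XGBoost sketch above
rftrain = train_set
rftest  = test_set[, names(test_set) != "Attrition"]

## caret needs a factor outcome with valid R level names when classProbs = TRUE
rftrain$Attrition = factor(rftrain$Attrition, levels = c(0, 1), labels = c("X0", "X1"))

## Illustrative resampling control: 5-fold CV with ROC as the summary metric
trcontrolobj = trainControl(method = "cv",
                            number = 5,
                            classProbs = TRUE,
                            summaryFunction = twoClassSummary)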

rf_mod1=train(Attrition ~.,
              data=rftrain,
              method="rf",
              metric="ROC",
              trControl=trcontrolobj,
              verbose=T)
pred_rf=predict(rf_mod1,newdata=rftest,type="raw")
length(pred_rf)
mydf=data.frame(predicted=pred_rf,actuals=actual_labels)
table(mydf$predicted,mydf$actuals)
pred_rfRoc=ifelse(pred_rf=="X1",1,0)
res.rocrf=roc(actual_labels,pred_rfRoc)

Plotting the ROC curve and AUC obtained from the random forest model

plot.roc(res.rocrf,print.auc = TRUE,print.thres = "best")

pROC::auc(res.rocrf)
## Area under the curve: 0.5211

So here are the results: an AUC of 0.52 with the random forest model. Note that this ROC is computed from the hard class predictions rather than class probabilities, which understates the model's discriminative ability.
Let's check out the important variables identified by the random forest

a=varImp(rf_mod1)
plot(a,top=20)

As we can see, monthly income, overtime, age, and distance from home are identified as important factors in whether an employee decides to leave the company

Logistic regression

First we will use a logistic regression model for feature selection, then fit a refined LR model and check its accuracy and AUC.
Initial LR model
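
`Train_set` and the later `lrtest` are not defined in the report either; a plausible mapping, assuming they are the one-hot encoded train and test splits from the XGBoost sketch (the names below are hypothetical):

## Hypothetical mapping to the earlier split; Attrition stays 0/1 for glm(family = binomial)
Train_set = train_set
lrtest    = test_set[, names(test_set) != "Attrition"]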

set.seed(111)
dim(empdata)
logtrain=Train_set
prop.table(table(logtrain$Attrition))
logi_mod1=glm(Attrition ~ .,data=logtrain,family = binomial)
summary(logi_mod1)

Let's check the important variables indicated by this model

imp=varImp(logi_mod1)
print(imp)
##                                       Overall
## Age                               3.075042535
## BusinessTravel.Non.Travel         1.366757358
## BusinessTravel.Travel_Frequently  3.669000705
## DailyRate                         2.206585978
## Department.Human.Resources        0.026478388
## Department.Research...Development 0.510530530
## DistanceFromHome                  4.331830626
## Education.1                       0.280561514
## Education.2                       0.644389540
## Education.3                       0.319117822
## Education.4                       0.865774325
## EducationField.Human.Resources    0.644755512
## EducationField.Life.Sciences      2.293777829
## EducationField.Marketing          0.332544983
## EducationField.Medical            2.201660148
## EducationField.Other              1.316022543
## EnvironmentSatisfaction.1         3.608379793
## EnvironmentSatisfaction.2         1.363727346
## EnvironmentSatisfaction.3         1.237723894
## Gender.Female                     2.245680628
## HourlyRate                        0.296585478
## JobInvolvement.1                  3.618969939
## JobInvolvement.2                  1.698665855
## JobInvolvement.3                  0.938863138
## JobLevel.1                        1.392205514
## JobLevel.2                        2.222550668
## JobLevel.3                        2.190508988
## JobLevel.4                        2.870336360
## JobRole.Healthcare.Representative 0.014342691
## JobRole.Human.Resources           0.024336742
## JobRole.Laboratory.Technician     0.531067925
## JobRole.Manager                   0.501956630
## JobRole.Manufacturing.Director    0.558616941
## JobRole.Research.Director         0.007145477
## JobRole.Research.Scientist        0.370560606
## JobRole.Sales.Executive           0.942971925
## JobSatisfaction.1                 4.183237159
## JobSatisfaction.2                 1.841315540
## JobSatisfaction.3                 3.031223822
## MaritalStatus.Divorced            0.894278244
## MaritalStatus.Married             0.928142766
## MonthlyIncome                     1.992292711
## MonthlyRate                       1.291500236
## NumCompaniesWorked                3.508113705
## OverTime.No                       8.103934029
## PercentSalaryHike                 0.845534390
## PerformanceRating.3               0.773107745
## RelationshipSatisfaction.1        2.744503044
## RelationshipSatisfaction.2        0.027356462
## RelationshipSatisfaction.3        0.183301971
## StockOptionLevel.0                0.809075496
## StockOptionLevel.1                1.227529418
## StockOptionLevel.2                1.354599318
## TotalWorkingYears                 1.397849858
## TrainingTimesLastYear             0.486644938
## WorkLifeBalance.1                 2.150215616
## WorkLifeBalance.2                 0.416885872
## WorkLifeBalance.3                 1.315839660
## YearsAtCompany                    2.130209315
## YearsInCurrentRole                3.620476540
## YearsSinceLastPromotion           3.231134912
## YearsWithCurrManager              1.414472224
## AgeGroups.Middle.Aged             0.449113286
## AgeGroups.Senior.Citizen          2.207772049
## IncomeInd                         2.022004184

Now we will create a refined train set using just the important variables

logtrain_refined=logtrain[,which(names(logtrain) %in% c(rownames(imp),"Attrition"))]
dim(logtrain_refined)
dim(lrtest)
lrtest_refined=lrtest[,which(names(lrtest) %in% c(rownames(imp)))]
logi_mod2=glm(Attrition ~.,data=logtrain_refined,family = binomial)
summary(logi_mod2)
pred_lr=predict(logi_mod2,newdata = lrtest_refined,type="response")
length(pred_lr)
res.roclr=roc(actual_labels,pred_lr)
## Note: this threshold is taken from the XGBoost ROC (res.rocxg) and applied to the
## XGBoost test-set predictions (finalpred), so the confusion matrix below evaluates
## the XGBoost classes rather than pred_lr
t2 = coords(res.rocxg, "best", "threshold", transpose = FALSE)
thresh = t2[1, 1]
finalpred_classes = ifelse(finalpred > thresh, 1, 0)

Checking the results: the AUC and ROC curve of the refined logistic regression model, followed by the confusion matrix for the XGBoost predictions classified at their best threshold

pROC::auc(res.roclr)
## Area under the curve: 0.8393
plot.roc(res.roclr,print.auc = TRUE,print.thres = "best")

confusionMatrix(as.factor(finalpred_classes),as.factor(actual_labels))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 377  52
##          1 116  43
##                                           
##                Accuracy : 0.7143          
##                  95% CI : (0.6759, 0.7505)
##     No Information Rate : 0.8384          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1709          
##  Mcnemar's Test P-Value : 1.171e-06       
##                                           
##             Sensitivity : 0.7647          
##             Specificity : 0.4526          
##          Pos Pred Value : 0.8788          
##          Neg Pred Value : 0.2704          
##              Prevalence : 0.8384          
##          Detection Rate : 0.6412          
##    Detection Prevalence : 0.7296          
##       Balanced Accuracy : 0.6087          
##                                           
##        'Positive' Class : 0               
## 

Let's do a model comparison next… coming soon.