Execute by Neha Raut
A mid-sized IT company with over 10,000 employees needed to plan its hiring and reduce risk of projects getting delayed due to employees leaving. It also wanted to understand why its attrition rate was high and how it could be reduced.
There are best and most experienced employees leaving prematurely. Use this database and try to predict which valuable employees will leave next
Evaluation Criterion :auc score for test data should come out to be more than 0.831
We have given you two datasets , hr_train.csv and hr_test.csv . You need to use data hr_train to build predictive model for response variable ‘left’. hr_test data contains all other factors except “left”, you need to predict that using the model that you developed and submit your predicted values in a csv files.
Employee satisfaction level
Last evaluation
Number of projects
Average monthly hours
Time spent at the company
Whether they have had a work accident
Whether they have had a promotion in the last 5 years
Department
Salary
Whether the employee has left
Combining both train n test datasets prior to data preparation.
loading library dplyr
library(corrplot)
library(dplyr)
library(car)
library(randomForest)
library(tree)
library(pROC)
Read train and test datasets:
hr_train=read.csv("hr_train.csv")
hr_test=read.csv("hr_test.csv")
sum(is.na(hr_train)) # For entire dataset
## [1] 0
colSums(is.na(hr_train))
## satisfaction_level last_evaluation number_project
## 0 0 0
## average_montly_hours time_spend_company Work_accident
## 0 0 0
## left promotion_last_5years sales
## 0 0 0
## salary
## 0
You will need same set of vars on both train and test,its easier to manage that if you combine train and test in the beginning and then separate them once you are done with data preparation
We’ll fill test’s response column with NA
hr_test$left= NA
hr_train$data = 'train'
hr_test$data = 'test'
all= rbind(hr_train,hr_test)
glimpse(all)
## Observations: 14,999
## Variables: 11
## $ satisfaction_level <dbl> 0.42, 0.66, 0.55, 0.22, 0.20, 0.83, 0.87...
## $ last_evaluation <dbl> 0.46, 0.77, 0.49, 0.88, 0.72, 0.84, 0.49...
## $ number_project <int> 2, 2, 5, 4, 6, 4, 2, 3, 5, 2, 4, 3, 5, 5...
## $ average_montly_hours <int> 150, 171, 240, 213, 224, 206, 251, 208, ...
## $ time_spend_company <int> 3, 2, 3, 3, 4, 2, 3, 2, 5, 3, 5, 3, 3, 3...
## $ Work_accident <int> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0...
## $ left <int> 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1...
## $ promotion_last_5years <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ sales <fct> sales, technical, technical, technical, ...
## $ salary <fct> medium, medium, high, medium, medium, me...
## $ data <chr> "train", "train", "train", "train", "tra...
#Unique value for every column
apply(all,2,function(x) length(unique(x)))
## satisfaction_level last_evaluation number_project
## 92 65 6
## average_montly_hours time_spend_company Work_accident
## 215 8 2
## left promotion_last_5years sales
## 3 2 10
## salary data
## 3 2
glimpse(all)
## Observations: 14,999
## Variables: 11
## $ satisfaction_level <dbl> 0.42, 0.66, 0.55, 0.22, 0.20, 0.83, 0.87...
## $ last_evaluation <dbl> 0.46, 0.77, 0.49, 0.88, 0.72, 0.84, 0.49...
## $ number_project <int> 2, 2, 5, 4, 6, 4, 2, 3, 5, 2, 4, 3, 5, 5...
## $ average_montly_hours <int> 150, 171, 240, 213, 224, 206, 251, 208, ...
## $ time_spend_company <int> 3, 2, 3, 3, 4, 2, 3, 2, 5, 3, 5, 3, 3, 3...
## $ Work_accident <int> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0...
## $ left <int> 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1...
## $ promotion_last_5years <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ sales <fct> sales, technical, technical, technical, ...
## $ salary <fct> medium, medium, high, medium, medium, me...
## $ data <chr> "train", "train", "train", "train", "tra...
Next we’ll create dummy variables for remaining categorical variables
CreateDummies=function(data,var,freq_cutoff=100){
t=table(data[,var])
t=t[t>freq_cutoff]
t=sort(t)
categories=names(t)[-1]
for( cat in categories){
name=paste(var,cat,sep="_")
name=gsub(" ","",name)
name=gsub("-","_",name)
name=gsub("\\?","Q",name)
name=gsub("<","LT_",name)
name=gsub("\\+","",name)
name=gsub(">","GT_",name)
name=gsub("=","EQ_",name)
name=gsub(",","",name)
name=gsub("/","_",name)
data[,name]=as.numeric(data[,var]==cat)
}
data[,var]=NULL
return(data)
}
glimpse(all)
## Observations: 14,999
## Variables: 11
## $ satisfaction_level <dbl> 0.42, 0.66, 0.55, 0.22, 0.20, 0.83, 0.87...
## $ last_evaluation <dbl> 0.46, 0.77, 0.49, 0.88, 0.72, 0.84, 0.49...
## $ number_project <int> 2, 2, 5, 4, 6, 4, 2, 3, 5, 2, 4, 3, 5, 5...
## $ average_montly_hours <int> 150, 171, 240, 213, 224, 206, 251, 208, ...
## $ time_spend_company <int> 3, 2, 3, 3, 4, 2, 3, 2, 5, 3, 5, 3, 3, 3...
## $ Work_accident <int> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0...
## $ left <int> 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1...
## $ promotion_last_5years <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ sales <fct> sales, technical, technical, technical, ...
## $ salary <fct> medium, medium, high, medium, medium, me...
## $ data <chr> "train", "train", "train", "train", "tra...
Convert below 2 in factor, i.e, Work_accident and promotion_last_5years
table(all$promotion_last_5years)
##
## 0 1
## 14680 319
all$promotion_last_5years = as.factor(all$promotion_last_5years)
all$Work_accident = as.factor(all$Work_accident)
Remove the extra varaiables added before
hr_train = all %>% filter(data == 'train') %>% select(-data)
#View(hr_train)
hr_test= all %>% filter(data == 'test') %>% select(-left, -data)
#View(hr_test)
set.seed(2)
s=sample(1:nrow(hr_train),0.75*nrow(hr_train))
train_75=hr_train[s,] #5652
test_25=hr_train[-s,] #1884
check performance using decision Tree
tree.model=tree(left ~.,data=train_75)
#Check performance
test.score=predict(tree.model,newdata = test_25,type='vector')
pROC::roc(test_25$left,test.score)
##
## Call:
## roc.default(response = test_25$left, predictor = test.score)
##
## Data: test.score in 1871 controls (test_25$left 0) < 754 cases (test_25$left 1).
## Area under the curve: 0.8252
check performance using random Forest
rf.model=randomForest(left ~.,data=train_75)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
#Check performance
test.score=predict(rf.model,newdata = test_25)
pROC::roc(test_25$left,test.score) #AUC comes 0.8384
##
## Call:
## roc.default(response = test_25$left, predictor = test.score)
##
## Data: test.score in 1871 controls (test_25$left 0) < 754 cases (test_25$left 1).
## Area under the curve: 0.8384
So Use Random Forest model on entire train data to predict the values
fit_hr= randomForest(as.factor(left)~.,data=hr_train)
fit_hr
##
## Call:
## randomForest(formula = as.factor(left) ~ ., data = hr_train)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 12.64%
## Confusion matrix:
## 0 1 class.error
## 0 7042 382 0.05145474
## 1 945 2130 0.30731707
score=predict(fit_hr,newdata= hr_test, type="prob")[,2]
write.csv(score,'Neha_Raut_P4_part2.csv',row.names = F)
importance(fit_hr)
## MeanDecreaseGini
## satisfaction_level 1132.42693
## last_evaluation 611.00150
## number_project 558.13433
## average_montly_hours 752.36447
## time_spend_company 528.10783
## Work_accident 46.66222
## promotion_last_5years 13.86046
## sales 271.78240
## salary 101.09643
varImpPlot(fit_hr)
1. Correlation
1. Salary V.S. Turnove
vis_1<-table(all$salary,all$left)
vis_1
##
## 0 1
## high 759 121
## low 3352 1752
## medium 3313 1202
d_vis_1<-as.data.frame(vis_1)
d_vis_1
## Var1 Var2 Freq
## 1 high 0 759
## 2 low 0 3352
## 3 medium 0 3313
## 4 high 1 121
## 5 low 1 1752
## 6 medium 1 1202
d_vis_1$leftString[d_vis_1$Var2 == 1] = 'Left'
d_vis_1$leftString[d_vis_1$Var2 == 0] = 'Not Left'
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
p<-ggplot(d_vis_1, aes(x=Var1,y=Freq,fill=leftString)) +
xlab("Salary") + ylab("Frequency")+
geom_bar(position="dodge",stat='identity') + coord_flip()+ labs(title="Salary V.S. Turnover")
p
#Majority of employees who left either had low or medium salary
#Barely any employees left with high salary
2. Department V.S. Turnover
vis_2<-table(all$sales,all$left)
vis_2
##
## 0 1
## accounting 346 181
## hr 356 176
## IT 598 227
## management 349 102
## marketing 419 195
## product_mng 461 178
## RandD 396 134
## sales 2080 855
## support 1103 453
## technical 1316 574
d_vis_2<-as.data.frame(vis_2)
d_vis_2<-subset(d_vis_2,Var2==1)
d_vis_2
## Var1 Var2 Freq
## 11 accounting 1 181
## 12 hr 1 176
## 13 IT 1 227
## 14 management 1 102
## 15 marketing 1 195
## 16 product_mng 1 178
## 17 RandD 1 134
## 18 sales 1 855
## 19 support 1 453
## 20 technical 1 574
library(ggplot2)
d_vis_2$Var1 <- factor(d_vis_2$Var1, levels = d_vis_2$Var1[order(-d_vis_2$Freq)])
p<-ggplot(d_vis_2, aes(x=Var1,y=Freq,fill=Var1)) +
xlab("Department") + ylab("Frequency")+
geom_bar(stat='identity') +theme(axis.text.x = element_text(angle = 90, hjust = 1))
p
#The sales, technical, and support department were the top 3 departments to have employee turnover
#The management department had the smallest amount of turnover```
3.Turnover V.S. ProjectCount
vis_3<-table(all$number_project,all$left)
vis_3
##
## 0 1
## 2 624 1037
## 3 2483 331
## 4 2528 539
## 5 1426 542
## 6 352 457
## 7 11 169
d_vis_3<-as.data.frame(vis_3)
d_vis_3
## Var1 Var2 Freq
## 1 2 0 624
## 2 3 0 2483
## 3 4 0 2528
## 4 5 0 1426
## 5 6 0 352
## 6 7 0 11
## 7 2 1 1037
## 8 3 1 331
## 9 4 1 539
## 10 5 1 542
## 11 6 1 457
## 12 7 1 169
d_vis_3$leftString[d_vis_3$Var2 == 1] = 'Left'
d_vis_3$leftString[d_vis_3$Var2 == 0] = 'Not Left'
library(ggplot2)
p<-ggplot(d_vis_3, aes(x=Var1,y=Freq,fill=leftString)) +
xlab("Number of project") + ylab("Frequency")+
geom_bar(position="dodge",stat='identity') + coord_flip()
p
#More than half of the employees with 2,6, and 7 projects left the company
#Majority of the employees who did not leave the company had 3,4, and 5 projects
#All of the employees with 7 projects left the company
#There is an increase in employee turnover rate as project count increases
4. Turnover V.S. Evaluation
left_data<-subset(all,left==1)
stay_data<-subset(all,left==0)
ggplot() + geom_density(aes(x=last_evaluation), colour="red", data=left_data) +
geom_density(aes(x=last_evaluation), colour="blue", data=stay_data)
#There is a biomodal distribution for those that had a turnover.
#Employees with low performance tend to leave the company more
#Employees with high performance tend to leave the company more
#The sweet spot for employees that stayed is within 0.6-0.8 evaluation
5.Turnover V.S. AverageMonthlyHours
ggplot() + geom_density(aes(x=average_montly_hours), colour="red", data=left_data) +
geom_density(aes(x=average_montly_hours), colour="blue", data=stay_data)
#Employees who had less hours of work (~150hours or less) left the company more
#Employees who had too many hours of work (~250 or more) left the company
#Employees who left generally were underworked or overworked.
6. #Turnover V.S. Satisfaction
ggplot() + geom_density(aes(x=satisfaction_level), colour="red", data=left_data) +
geom_density(aes(x=satisfaction_level), colour="blue", data=stay_data)
#There is a tri-modal distribution for employees that turnovered
#Employees who had really low satisfaction levels (0.2 or less) left the company more
#Employees who had low satisfaction levels (0.3~0.5) left the company more
#Employees who had really high satisfaction levels (0.7 or more) left the company more
With all of this information, this is what Bob should know about his company and why his employees probably left:
1. Employees generally left when they are underworked (less than 150hr/month or 6hr/day)
2. Employees generally left when they are overworked (more than 250hr/month or 10hr/day)
3. Employees with either really high or low evaluations should be taken into consideration for high turnover rate
4. Employees with low to medium salaries are the bulk of employee turnover
5. Employees that had 2,6, or 7 project count was at risk of leaving the company
6. Employee satisfaction is the highest indicator for employee turnover
7. Employee that had 4 and 5 yearsAtCompany should be taken into consideration for high turnover rate