#Packages Used
require(ggplot2)
require(plotly)
require(rpart)
require(rpart.plot)
require(caret)
require(e1071)
require(markdown)
require(corrplot)
require(pROC)
require(Boruta)
require(dplyr)
#set working directory
setwd("~/Desktop/HR Analytics")
#Read File
hra <- read.csv("hra.csv")
attach(hra)
colSums(is.na(hra))
## satisfaction_level last_evaluation number_project
## 0 0 0
## average_montly_hours time_spend_company Work_accident
## 0 0 0
## left promotion_last_5years sales
## 0 0 0
## salary
## 0
# No Null Values
#Data type Conversion
hra$left <- as.factor(hra$left)
hra$salary <- ordered(hra$salary,levels=c("low","medium","high"))
prop.table(table(hra$left))
##
## 0 1
## 0.7619175 0.2380825
#The attrition rate is about 24%
#The average satisfaction level is around 62% and the average performance score is around 71%
#On average, people work on 3 to 4 projects and about 200 hours per month
summary(hra)
## satisfaction_level last_evaluation number_project average_montly_hours
## Min. :0.0900 Min. :0.3600 Min. :2.000 Min. : 96.0
## 1st Qu.:0.4400 1st Qu.:0.5600 1st Qu.:3.000 1st Qu.:156.0
## Median :0.6400 Median :0.7200 Median :4.000 Median :200.0
## Mean :0.6128 Mean :0.7161 Mean :3.803 Mean :201.1
## 3rd Qu.:0.8200 3rd Qu.:0.8700 3rd Qu.:5.000 3rd Qu.:245.0
## Max. :1.0000 Max. :1.0000 Max. :7.000 Max. :310.0
##
## time_spend_company Work_accident left promotion_last_5years
## Min. : 2.000 Min. :0.0000 0:11428 Min. :0.00000
## 1st Qu.: 3.000 1st Qu.:0.0000 1: 3571 1st Qu.:0.00000
## Median : 3.000 Median :0.0000 Median :0.00000
## Mean : 3.498 Mean :0.1446 Mean :0.02127
## 3rd Qu.: 4.000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :10.000 Max. :1.0000 Max. :1.00000
##
## sales salary
## sales :4140 low :7316
## technical :2720 medium:6446
## support :2229 high :1237
## IT :1227
## product_mng: 902
## marketing : 858
## (Other) :2923
#--EDA - Understanding the reasons for leaving
##-- High-salary employees are few in number; medium- and low-salary employees are in a similar range
ggplot(hra,aes(x = salary )) + geom_bar(stat = "count",fill="#3090C7",colour= "grey") + labs(title = "Salary Distribution",x="Salary Level",y="No.Of.Observation")

##-- Most employees belong to the Sales, Technical & Support departments; Sales is the largest
ggplot(hra,aes(x = sales)) + geom_bar(stat = "count",fill="#3090C7",colour= "grey") + labs(title = "Work Department Distribution",x="Work Department",y="No.of.Observation") + coord_flip()

##-- Most employees in Sales, Technical & Support have low salaries; most employees in Management have high salaries
ggplot(hra,aes(x= as.factor(sales),fill=salary)) + geom_bar(position = "dodge")+ labs(title = "Work Department Salary Distribution",x="Work Department",y="No.of.Observation") + coord_flip()

##-- Only about 2% of employees were promoted in the last 5 years, which could be a main reason experienced employees leave
ggplot(hra,aes(x = promotion_last_5years )) + geom_bar(stat = "count",fill="#3090C7",colour= "grey") + labs(title = "Promotion in last 5 Years",x="Promotion Status",y="No.of.Observation")
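# A quick numeric check of the ~2% figure above (a minimal sketch using prop.table on the raw flag):
prop.table(table(hra$promotion_last_5years))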

#Bi-variate analysis
#People with a satisfaction level below 0.5 tend to leave in large numbers; we could classify them into satisfaction categories: Poor < 0.25, Unsatisfied < 0.5, Highly satisfied > 0.6
ggplotly(ggplot(hra,aes(x= satisfaction_level, fill=left)) + geom_histogram() + facet_wrap( ~ left, ncol = 2) + labs(title = "Satisfaction Level Distribution",x="Satisfaction Level",y="No.Of.Observation"), tooltip = "all")
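# The categories above are only described, not coded. A minimal sketch using cut(); the breaks follow
# the comment, and treating the 0.5-0.6 band as "Neutral" is an assumption:
sat_cat <- cut(hra$satisfaction_level,
               breaks = c(0, 0.25, 0.5, 0.6, 1),
               labels = c("Poor", "Unsatisfied", "Neutral", "Highly satisfied"),
               include.lowest = TRUE)
table(sat_cat, hra$left)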
#Many of those who leave have poor evaluation scores (below 0.7); those scoring above 0.7, who are considered good at their job, could be leaving due to lack of promotions
ggplotly(ggplot(hra,aes(x= last_evaluation, fill=left)) + geom_histogram() + facet_wrap( ~ left, ncol = 2)+ labs(title = "Last Evaluation Distribution",x="Last Evaluation Score",y="No.Of.Observation"), tooltip = "all")
#Employees who work on only 2 projects or on more than 5 projects have left the job in large numbers
ggplotly(ggplot(hra,aes(x = as.factor(number_project),fill = left)) + geom_bar() + facet_wrap( ~ left, ncol = 2) + labs(title = "Number of Projects",x="No.of Projects",y="No.of.Observation"),tooltip = "all")
#Employees who are underworked (less than 160 hrs/month) or overworked (more than 240 hrs/month) have left in large numbers
ggplotly(ggplot(hra,aes(x = average_montly_hours,fill = left)) + geom_histogram() + facet_wrap( ~ left, ncol = 2) + labs(title = "Average Monthly Working Hours",x="Average work hrs/month",y="No.of.Observation"),tooltip = "all")
# A majority of employees with 3-5 years of experience have left the company, mainly due to lack of promotion and for better salary
ggplotly(ggplot(hra,aes(x = as.factor(time_spend_company),fill = left)) + geom_bar() + facet_wrap( ~ left, ncol = 2) + labs(title = "Work Experience in the Company",x="Work Experience in years",y="No.of.Observation"),tooltip = "all")
#99% of employees who left the company didn't get a promotion in the last 5 years
ggplotly(ggplot(hra,aes(x = as.factor(promotion_last_5years),fill = left)) + geom_bar() + facet_wrap( ~ left, ncol = 2) + labs(title = "Promotion in last 5 Years",x="Promotion Status",y="No.of.Observation"),tooltip = "all")
#Only about 4% of employees who left had work accidents
ggplotly(ggplot(hra,aes(x = as.factor(Work_accident),fill = left)) + geom_bar() + facet_wrap( ~ left, ncol = 2)+ labs(title = "Work Accidents",x="Work Accident Status",y="No.of.Observations"),tooltip = "all")
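# A minimal sanity check on the two figures above, restricted to employees who left:
prop.table(table(hra$promotion_last_5years[hra$left == 1]))  # share of leavers promoted / not promoted
prop.table(table(hra$Work_accident[hra$left == 1]))          # share of leavers with / without a work accident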
#To understand the relationship between the attributes
hra_c <- hra
hra_c$left <- as.integer(hra$left)
corrplot(cor(hra_c[,1:8]), method="circle")

#There is a positive(+) correlation between number_project, average_montly_hours and last_evaluation.
#This suggests that employees who spent more hours and worked on more projects were evaluated more highly.
#There is a strong negative(-) correlation between attrition (left) and satisfaction_level.
#This indicates that employees are more likely to leave the company when they are less satisfied with the job.
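# To put numbers on the claims above, a minimal sketch using the same numeric copy of the data:
cor(hra_c$satisfaction_level, hra_c$left)
cor(hra_c[, c("number_project", "average_montly_hours", "last_evaluation")])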
#Summary
##On average, people who leave have a low satisfaction level, work more hours, and didn't get promoted within the past five years.
#Who is leaving?
#Create data table only with employees who have left the job
#The aim is to identify and define valuable employees, and the dominant factors driving them to leave
hr_hist <- hra %>% filter(left==1)
# 3571 employees have left the job
nrow(hr_hist)
## [1] 3571
#The valuable employees are :
par(mfrow=c(1,3))
#The company doesn't want to retain all the employees. Performance evaluation is the key criterion in deciding which employees are valuable. Employees who score above 0.7 are considered good at their job.
hist(hr_hist$last_evaluation,col="#3090C7", main = "Last evaluation",xlab = "Performance Score", ylab = "No. of observation")
#An employee with at least 4 years of experience in the company is considered valuable
plot(as.factor(hr_hist$time_spend_company),col="#3090C7", main ="Work Experience",xlab = "Work experience in years", ylab = "No. of observation")
#Employees with more than 5 projects are considered valuable
plot(as.factor(hr_hist$number_project),col="#3090C7", main ="Number of Projects",xlab = "No. of Projects", ylab = "No. of observation")

dev.off()
## null device
## 1
# 2014 out of 3571 employees who left the job are valuable employees
hr_valuable <- hr_hist %>% filter(last_evaluation >= 0.70 | time_spend_company >= 4 | number_project > 5)
nrow(hr_valuable)
## [1] 2014
#-Why did they leave?
#--Correlation
hr_val <- hra_c %>% filter(last_evaluation >= 0.70 | time_spend_company >= 4 | number_project > 5)
hr_val_select <- hr_val %>% select(satisfaction_level, number_project: promotion_last_5years)
corrplot(cor(hr_val_select), method="circle")
#Summary - From the correlation plot we can infer that, on average, valuable employees who leave are not satisfied, work on many projects, spend many hours in the company each month, and aren't promoted.
#Create data table with only valuable employee
hr_model <- hra %>% filter(last_evaluation >= 0.70 | time_spend_company >= 4 | number_project > 5)
summary(hr_model)
## satisfaction_level last_evaluation number_project average_montly_hours
## Min. :0.090 Min. :0.3600 Min. :2.000 Min. : 96.0
## 1st Qu.:0.490 1st Qu.:0.7300 1st Qu.:3.000 1st Qu.:171.0
## Median :0.680 Median :0.8300 Median :4.000 Median :218.0
## Mean :0.617 Mean :0.8015 Mean :4.159 Mean :211.8
## 3rd Qu.:0.830 3rd Qu.:0.9100 3rd Qu.:5.000 3rd Qu.:253.0
## Max. :1.000 Max. :1.0000 Max. :7.000 Max. :310.0
##
## time_spend_company Work_accident left promotion_last_5years
## Min. : 2.000 Min. :0.0000 0:7758 Min. :0.00000
## 1st Qu.: 3.000 1st Qu.:0.0000 1:2014 1st Qu.:0.00000
## Median : 4.000 Median :0.0000 Median :0.00000
## Mean : 3.916 Mean :0.1521 Mean :0.02384
## 3rd Qu.: 5.000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :10.000 Max. :1.0000 Max. :1.00000
##
## sales salary
## sales :2628 low :4671
## technical :1786 medium:4267
## support :1466 high : 834
## IT : 808
## product_mng: 582
## marketing : 561
## (Other) :1941
#--Important factors Identification
# Set the target variable as a factor
hra$left <- as.factor(hra$left)
hr_model$left <- as.factor(hr_model$left)
#Modeling -- Cross-Validation
set.seed(111)
train_control<- trainControl(method="cv", number=5, repeats=3)
head(train_control)
## $method
## [1] "cv"
##
## $number
## [1] 5
##
## $repeats
## [1] 3
##
## $search
## [1] "grid"
##
## $p
## [1] 0.75
##
## $initialWindow
## NULL
# Using decision tree to identify important variables
c5model <- train(left~.,data = hr_model,trControl=train_control, method="C5.0Tree")
#Top 4 Attributes are - Satisfaction Level, Time spent in the company, average monthly hours & Performance Evaluation
plot(varImp(c5model))
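# Printing the importance scores behind the plot (a minimal sketch):
varImp(c5model)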
# Model with only the top important variables - boosted logistic regression (LogitBoost)
gmlmodel_imp <- train(left ~ satisfaction_level + time_spend_company + average_montly_hours, data = hr_model, trControl=train_control, method="LogitBoost" )
# predictions
predictions<- predict(gmlmodel_imp,hr_model)
gmlmodel_imp_binded <- cbind(hr_model,predictions)
# results
confusionMatrix<- confusionMatrix(gmlmodel_imp_binded$predictions,gmlmodel_imp_binded$left)
#- Accuracy : 0.9449 & Kappa : 0.8203
confusionMatrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 7663 443
## 1 95 1571
##
## Accuracy : 0.9449
## 95% CI : (0.9402, 0.9494)
## No Information Rate : 0.7939
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8203
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9878
## Specificity : 0.7800
## Pos Pred Value : 0.9453
## Neg Pred Value : 0.9430
## Prevalence : 0.7939
## Detection Rate : 0.7842
## Detection Prevalence : 0.8295
## Balanced Accuracy : 0.8839
##
## 'Positive' Class : 0
##
# Boruta Features
bor <- Boruta(left ~ ., hr_model)
bor # All are important !!
## Boruta performed 10 iterations in 40.67442 secs.
## 9 attributes confirmed important: Work_accident,
## average_montly_hours, last_evaluation, number_project,
## promotion_last_5years and 4 more;
## No attributes deemed unimportant.
#So all attributes are considered, for a better accuracy rate
#Boosted logistic regression model (LogitBoost)
gmlmodel <- train(left~., data=hr_model, trControl=train_control, method="LogitBoost")
# predictions
predictions<- predict(gmlmodel,hr_model)
gmlmodelbinded <- cbind(hr_model,predictions)
# results
confusionMatrix<- confusionMatrix(gmlmodelbinded$predictions,gmlmodelbinded$left)
confusionMatrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 7671 319
## 1 87 1695
##
## Accuracy : 0.9585
## 95% CI : (0.9543, 0.9623)
## No Information Rate : 0.7939
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8674
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9888
## Specificity : 0.8416
## Pos Pred Value : 0.9601
## Neg Pred Value : 0.9512
## Prevalence : 0.7939
## Detection Rate : 0.7850
## Detection Prevalence : 0.8176
## Balanced Accuracy : 0.9152
##
## 'Positive' Class : 0
##
# SVM model (radial-kernel SVM via caret; requires the kernlab package)
svmmodel <- train(left~., data=hr_model, trControl=train_control, method="svmRadial")
# predictions
predictions<- predict(svmmodel,hr_model)
svmmodelbinded <- cbind(hr_model,predictions)
# results
confusionMatrix<- confusionMatrix(svmmodelbinded$predictions,svmmodelbinded$left)
confusionMatrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 7676 335
## 1 82 1679
##
## Accuracy : 0.9573
## 95% CI : (0.9531, 0.9612)
## No Information Rate : 0.7939
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8632
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9894
## Specificity : 0.8337
## Pos Pred Value : 0.9582
## Neg Pred Value : 0.9534
## Prevalence : 0.7939
## Detection Rate : 0.7855
## Detection Prevalence : 0.8198
## Balanced Accuracy : 0.9115
##
## 'Positive' Class : 0
##
#Both models yield similar results
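# The confusion matrices above are computed on the training data. A minimal sketch comparing the two
# models on their cross-validation resamples instead (assumes both were fit with the same 5-fold setup):
cv_results <- resamples(list(LogitBoost = gmlmodel, SVM = svmmodel))
summary(cv_results)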
#Final Outcome
set.seed(123)
# Split the data
inTraining <- createDataPartition(hr_model$left, p = .75, list = FALSE)
train <- hr_model[ inTraining,]
test <- hr_model[-inTraining,]
# Logistic Regression model to achieve probability value
logmodel = glm(left ~ ., family=binomial(logit), data=train)
# predictions
prob_leave=predict(logmodel,newdata=test,type="response")
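# A quick check of the final model on the held-out test set (the 0.5 cutoff is an illustrative assumption):
pred_class <- factor(ifelse(prob_leave > 0.5, "1", "0"), levels = levels(test$left))
confusionMatrix(pred_class, test$left)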
# Create a data table with prediction & performance
pred_attrition = data.frame(prob_leave)
pred_attrition$performance=test$last_evaluation
#Plot Probability to Leave Vs Performance Score
ggplotly(ggplot(pred_attrition,aes(x=prob_leave,y= performance)) + geom_point(color = "blue", alpha= .5,shape = 3) +labs(title = "Probability to Leave Vs Performance Score",x="Probability to Leave",y="Performance Score"), tooltip = "all")
# We need to concentrate on and prioritize employees in the top-right corner (high probability to leave and high performance)
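# A minimal sketch of that prioritisation (the 0.5 and 0.8 cutoffs are illustrative assumptions, not derived from the model):
high_priority <- pred_attrition %>% filter(prob_leave > 0.5, performance > 0.8)
nrow(high_priority)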