#Packages Used
require(ggplot2)
require(plotly)
require(rpart)
require(rpart.plot)
require(caret)
require(e1071)
require(markdown)
require(corrplot)
require(pROC)
require(Boruta)
require(dplyr)
require(corrplot)
require(caret)

#set working directory
setwd("~/Desktop/HR Analytics")

#Read File
hra <- read.csv("hra.csv")
attach(hra)

colSums(is.na(hra)) 
##    satisfaction_level       last_evaluation        number_project 
##                     0                     0                     0 
##  average_montly_hours    time_spend_company         Work_accident 
##                     0                     0                     0 
##                  left promotion_last_5years                 sales 
##                     0                     0                     0 
##                salary 
##                     0
# No Null Values

#Data type Conversion
hra$left <- as.factor(hra$left)
hra$salary <- ordered(hra$salary,levels=c("low","medium","high"))

prop.table(table(hra$left))
## 
##         0         1 
## 0.7619175 0.2380825
#Attrition rate is equal to 24%
#The satisfaction level is around 62% and the performance average is around 71%. 
#On average people work on 3 to 4 projects a year and about 200 hours per months
summary(hra) 
##  satisfaction_level last_evaluation  number_project  average_montly_hours
##  Min.   :0.0900     Min.   :0.3600   Min.   :2.000   Min.   : 96.0       
##  1st Qu.:0.4400     1st Qu.:0.5600   1st Qu.:3.000   1st Qu.:156.0       
##  Median :0.6400     Median :0.7200   Median :4.000   Median :200.0       
##  Mean   :0.6128     Mean   :0.7161   Mean   :3.803   Mean   :201.1       
##  3rd Qu.:0.8200     3rd Qu.:0.8700   3rd Qu.:5.000   3rd Qu.:245.0       
##  Max.   :1.0000     Max.   :1.0000   Max.   :7.000   Max.   :310.0       
##                                                                          
##  time_spend_company Work_accident    left      promotion_last_5years
##  Min.   : 2.000     Min.   :0.0000   0:11428   Min.   :0.00000      
##  1st Qu.: 3.000     1st Qu.:0.0000   1: 3571   1st Qu.:0.00000      
##  Median : 3.000     Median :0.0000             Median :0.00000      
##  Mean   : 3.498     Mean   :0.1446             Mean   :0.02127      
##  3rd Qu.: 4.000     3rd Qu.:0.0000             3rd Qu.:0.00000      
##  Max.   :10.000     Max.   :1.0000             Max.   :1.00000      
##                                                                     
##          sales         salary    
##  sales      :4140   low   :7316  
##  technical  :2720   medium:6446  
##  support    :2229   high  :1237  
##  IT         :1227                
##  product_mng: 902                
##  marketing  : 858                
##  (Other)    :2923
#--EDA - Understanding the reasons for leaving

##--High Salary People are Low in Number ,Medium & Low salary people are in same range.

ggplot(hra,aes(x = salary )) + geom_bar(stat = "count",fill="#3090C7",colour= "grey") + labs(title = "Salary Distribution",x="Salary Level",y="No.Of.Observation") 

##-- Most of the people belongs to Sales,technical & support Despartment. Sales is the largest

ggplot(hra,aes(x = sales)) + geom_bar(stat = "count",fill="#3090C7",colour= "grey") + labs(title = "Work Department Distribution",x="Work Department",y="No.of.Observation") + coord_flip()

##-- Most people in Sales,tech,support have low salary. Most employee in Management have high Salary

ggplot(hra,aes(x= as.factor(sales),fill=salary)) + geom_bar(outlier.colour = NA,position = "dodge")+ labs(title = "Work Department Salary Distribution",x="Work Department",y="No.of.Observation") +  coord_flip()

##-- Not so many promotions just 2% which could be the main reason for expereinced employee leaving 

ggplot(hra,aes(x = promotion_last_5years )) + geom_bar(stat = "count",fill="#3090C7",colour= "grey") + labs(title = "Promotion in last 5 Years",x="Promotion Status",y="No.of.Observation")

#Bi- variate analysis 

#People who has satisfaction level less than 0.5 tends to leaving out in large numbers, we could classifiy them into satisfaction categories : Poor < .25 ,Unsatisfied < .5 , Higly satisfied >.6 

ggplotly(ggplot(hra,aes(x= satisfaction_level, fill=left)) + geom_histogram() +  facet_wrap( ~ left, ncol = 2) + labs(title = "Satisfaction Level Distribution",x="Satisfaction Level",y="No.Of.Observation"), tooltip = "all")  
#Most of them leave the job because of the poor performance (below 0.7) & (Above 0.7) Considered good at their job could be leaving due to lack of promotions

ggplotly(ggplot(hra,aes(x= last_evaluation, fill=left)) + geom_histogram() + facet_wrap( ~ left, ncol = 2)+ labs(title = "Last Evaluation Distribution",x="Last Evaluation Score",y="No.Of.Observation"), tooltip = "all")
#employee who work on 2 projects & above 5 projects have left the job in large numbers

ggplotly(ggplot(hra,aes(x = as.factor(number_project),fill = left)) + geom_bar() +  facet_wrap( ~ left, ncol = 2) + labs(title = "Number of Projects",x="No.of Projects",y="No.of.Observation"),tooltip = "all")
#employee who underworked (less than 160 hrs/month) & overworked (more than 240 hrs/month) have left in large numbers

ggplotly(ggplot(hra,aes(x = average_montly_hours,fill = left)) +  geom_histogram() +  facet_wrap( ~ left, ncol = 2) + labs(title = "Average Monlthy Working Hours",x="Average work hrs/month",y="No.of.Observation"),tooltip = "all")
# majority of employees with 3-5 years of work experience have left the company mainly due to lack of promotion & for better salary

ggplotly(ggplot(hra,aes(x = as.factor(time_spend_company),fill = left)) + geom_bar() +  facet_wrap( ~ left, ncol = 2) + labs(title = "Work Experience in the Company",x="Work Experience in years",y="No.of.Observation"),tooltip = "all")
#99 % of employee who left the company didn't get promotion in last 5 years

ggplotly(ggplot(hra,aes(x = as.factor(promotion_last_5years),fill = left)) + geom_bar() +  facet_wrap( ~ left, ncol = 2) + labs(title = "Promotion in last 5 Years",x="Promotion Status",y="No.of.Observation"),tooltip = "all")
#only 4 % of employee who left the job had Work accidents

ggplotly(ggplot(hra,aes(x = as.factor(Work_accident),fill = left)) + geom_bar() + facet_wrap( ~ left, ncol = 2)+ labs(title = "Work Accidents",x="Work Accident Status",y="No.of.Observations"),tooltip = "all")
#To understand the relationship between the attributes
hra_c <- hra
hra_c$left <- as.integer(hra$left)
corrplot(cor(hra_c[,1:8]), method="circle")

#There exists a positive(+) correlation between projectCount, averageMonthlyHours, and evaluation. 
#Which states that those employees who spent more hours and did more projects were evaluated highly.

#The negative(-) relationships between employee attrition(left) and satisfaction are highly correlated. 
#This states that employees leave the company more when they are less satisfied with the job.

#Summary 
##On average people who leave have a low satisfaction level, they work more and didn???t get promoted within the past five years.

#Who is leaving? 
#Create data table only with employees who have left the job
#Aim is to identify & define valuable employees and also the dominant factors affecting them to leave
hr_hist <- hra %>% filter(left==1)
# 3571 employees have left the job
nrow(hr_hist)
## [1] 3571
#The valuable employees are : 
par(mfrow=c(1,3))

#The company doesn't want to retain all the employees. Performance Evaluation is the key criteria in deciding the valuable employees. Employees who have score above 0.7 are considered good at thier job. 

hist(hr_hist$last_evaluation,col="#3090C7", main = "Last evaluation",xlab = "Performance Score", ylab = "No. of observation")

#A employee who is works in the company with atleast 4 years of experience is considered to be valuable

plot(as.factor(hr_hist$time_spend_company),col="#3090C7", main ="Work Experience",xlab = "Work experience in years", ylab = "No. of observation")

#Employees who with more than 5 project are considered valuable

plot(as.factor(hr_hist$number_project),col="#3090C7", main ="Number of Projects",xlab = "No. of Projects", ylab = "No. of observation")

dev.off()
## null device 
##           1
# 2014 out of 3571 employees who left the job are valuable employees
hr_vaulable <- hr_hist %>% filter(last_evaluation >= 0.70 | time_spend_company >= 4 | number_project > 5)
nrow(hr_vaulable)
## [1] 2014
#-Why did they Leave ?
#--Correlation
hr_val <- hra_c %>% filter(last_evaluation >= 0.70 | time_spend_company >= 4 | number_project > 5)
hr_val_select <- hr_val %>% select(satisfaction_level, number_project: promotion_last_5years)
corrplot(cor(hr_val_select), method="circle")
#Summary - From correlation plot we can infer that, On average valuable employees that leave are not satisfayed, work on many projects, spend many hours in the company each month and aren???t promoted.

#Create data table with only valuable employee
hr_model <- hra  %>% filter(last_evaluation >= 0.70 | time_spend_company >= 4 | number_project > 5)
summary(hr_model)
##  satisfaction_level last_evaluation  number_project  average_montly_hours
##  Min.   :0.090      Min.   :0.3600   Min.   :2.000   Min.   : 96.0       
##  1st Qu.:0.490      1st Qu.:0.7300   1st Qu.:3.000   1st Qu.:171.0       
##  Median :0.680      Median :0.8300   Median :4.000   Median :218.0       
##  Mean   :0.617      Mean   :0.8015   Mean   :4.159   Mean   :211.8       
##  3rd Qu.:0.830      3rd Qu.:0.9100   3rd Qu.:5.000   3rd Qu.:253.0       
##  Max.   :1.000      Max.   :1.0000   Max.   :7.000   Max.   :310.0       
##                                                                          
##  time_spend_company Work_accident    left     promotion_last_5years
##  Min.   : 2.000     Min.   :0.0000   0:7758   Min.   :0.00000      
##  1st Qu.: 3.000     1st Qu.:0.0000   1:2014   1st Qu.:0.00000      
##  Median : 4.000     Median :0.0000            Median :0.00000      
##  Mean   : 3.916     Mean   :0.1521            Mean   :0.02384      
##  3rd Qu.: 5.000     3rd Qu.:0.0000            3rd Qu.:0.00000      
##  Max.   :10.000     Max.   :1.0000            Max.   :1.00000      
##                                                                    
##          sales         salary    
##  sales      :2628   low   :4671  
##  technical  :1786   medium:4267  
##  support    :1466   high  : 834  
##  IT         : 808                
##  product_mng: 582                
##  marketing  : 561                
##  (Other)    :1941
#--Important factors Identification 

# Set the target variable as a factor
hra$left <- as.factor(hra$left)
hr_model$left <- as.factor(hr_model$left)

#Modeling -- Cross-Validation
set.seed(111)
train_control<- trainControl(method="cv", number=5, repeats=3)
head(train_control)
## $method
## [1] "cv"
## 
## $number
## [1] 5
## 
## $repeats
## [1] 3
## 
## $search
## [1] "grid"
## 
## $p
## [1] 0.75
## 
## $initialWindow
## NULL
# Using decision tree to identify important variables
c5model <- train(left~.,data = hr_model,trControl=train_control, method="C5.0Tree")

#Top 4 Attributes are - Satisfaction Level, Time spent in the company, average monthly hours & Performance Evaluation
plot(varImp(c5model))


# model with only important variable - Logistic regresion
gmlmodel_imp <- train(left~ + satisfaction_level + time_spend_company + average_montly_hours, data = hr_model, trControl=train_control, method="LogitBoost" )

# predictions
predictions<- predict(gmlmodel_imp,hr_model)
gmlmodel_imp_binded <- cbind(hr_model,predictions)

# results
confusionMatrix<- confusionMatrix(gmlmodel_imp_binded$predictions,gmlmodel_imp_binded$left)

#-  Accuracy : 0.9449 & Kappa : 0.8203
confusionMatrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 7663  443
##          1   95 1571
##                                           
##                Accuracy : 0.9449          
##                  95% CI : (0.9402, 0.9494)
##     No Information Rate : 0.7939          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8203          
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9878          
##             Specificity : 0.7800          
##          Pos Pred Value : 0.9453          
##          Neg Pred Value : 0.9430          
##              Prevalence : 0.7939          
##          Detection Rate : 0.7842          
##    Detection Prevalence : 0.8295          
##       Balanced Accuracy : 0.8839          
##                                           
##        'Positive' Class : 0               
## 
# Boruta Features 
bor <- Boruta(left ~ ., hr_model)
bor # All are important !!
## Boruta performed 10 iterations in 40.67442 secs.
##  9 attributes confirmed important: Work_accident,
## average_montly_hours, last_evaluation, number_project,
## promotion_last_5years and 4 more;
##  No attributes deemed unimportant.
#So considering all the attributes for better accuracy rate

#Logistic regression model
gmlmodel <- train(left~., data=hr_model, trControl=train_control, method="LogitBoost")

# predictions
predictions<- predict(gmlmodel,hr_model)
gmlmodelbinded <- cbind(hr_model,predictions)

# results
confusionMatrix<- confusionMatrix(gmlmodelbinded$predictions,gmlmodelbinded$left)
confusionMatrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 7671  319
##          1   87 1695
##                                           
##                Accuracy : 0.9585          
##                  95% CI : (0.9543, 0.9623)
##     No Information Rate : 0.7939          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8674          
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9888          
##             Specificity : 0.8416          
##          Pos Pred Value : 0.9601          
##          Neg Pred Value : 0.9512          
##              Prevalence : 0.7939          
##          Detection Rate : 0.7850          
##    Detection Prevalence : 0.8176          
##       Balanced Accuracy : 0.9152          
##                                           
##        'Positive' Class : 0               
## 
# SVM model
svmmodel <- train(left~., data=hr_model, trControl=train_control, method="LogitBoost")

# predictions
predictions<- predict(svmmodel,hr_model)
svmmodelbinded <- cbind(hr_model,predictions)

# results
confusionMatrix<- confusionMatrix(svmmodelbinded$predictions,svmmodelbinded$left)
confusionMatrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 7676  335
##          1   82 1679
##                                           
##                Accuracy : 0.9573          
##                  95% CI : (0.9531, 0.9612)
##     No Information Rate : 0.7939          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8632          
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9894          
##             Specificity : 0.8337          
##          Pos Pred Value : 0.9582          
##          Neg Pred Value : 0.9534          
##              Prevalence : 0.7939          
##          Detection Rate : 0.7855          
##    Detection Prevalence : 0.8198          
##       Balanced Accuracy : 0.9115          
##                                           
##        'Positive' Class : 0               
## 
#Both model yield similar results 

#Final Outcome 

set.seed(123)
# Split the data
inTraining <- createDataPartition(hr_model$left, p = .75, list = FALSE)
train <- hr_model[ inTraining,]
test  <- hr_model[-inTraining,]

# Logistic Regression model to achieve probability value
logmodel = glm(left ~ ., family=binomial(logit), data=train)

# predictions 
prob_leave=predict(logmodel,newdata=test,type="response")

# Create a data table with prediction & performance
pred_attrition = data.frame(prob_leave)
pred_attrition$performance=test$last_evaluation

#Plot Probability to Leave Vs Performance Score
ggplotly(ggplot(pred_attrition,aes(x=prob_leave,y= performance)) + geom_point(color = "blue", alpha= .5,shape = 3) +labs(title = "Probability to Leave Vs Performance Score",x="Probability to Leave",y="Performance Score"), tooltip = "all") 
# need to concentrate & prioritize employee who are at the top right corner