Intro

Motivation: Why are our best and most experienced employees leaving prematurely?

Using the HR Analytics dataset from Kaggle, we try to predict which valuable employees will leave next.

The dataset is simulated and was downloaded from the Kaggle website.

Fields in the dataset (renamed below to short column names) include:

  - sl: satisfaction level
  - le: last evaluation score
  - nop: number of projects
  - amh: average monthly hours
  - tsac: time spent at the company (years)
  - wa: whether the employee had a work accident
  - left: whether the employee left (the outcome to predict)
  - promo: whether the employee was promoted in the last five years
  - sales: department
  - salary: salary band (low, medium, high)

Ensure your working directory is set to the directory where the data resides.
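For example (a sketch; the path below is a placeholder to replace with your own):

# Placeholder path -- substitute the folder that holds HR_comma_sep.csv
setwd("~/projects/hr-analytics")
stopifnot(file.exists("HR_comma_sep.csv"))  # fail fast if the file is missing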

Load libraries

suppressMessages(library(plyr))
suppressMessages(library(dplyr))
suppressMessages(library(ggplot2))
suppressMessages(library(caret))
suppressMessages(library(stats))
suppressMessages(library(mlbench))
suppressMessages(library(AppliedPredictiveModeling))
suppressMessages(library(gbm))
suppressMessages(library(rpart))
suppressMessages(library(ggfortify))

Load and clean data

hr<-read.csv("HR_comma_sep.csv", header = TRUE)
# Shorten the original Kaggle column names (see the field list above)
colnames(hr)<-c("sl", "le", "nop", "amh", "tsac", "wa", "left", "promo", "sales", "salary")
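
A quick sanity check (an addition to the original script) confirms the expected shape and that nothing is missing:

dim(hr)         # expect 14999 rows and 10 columns
str(hr)         # sales and salary are text (factor or character, depending on R version)
sum(is.na(hr))  # should be 0; the Kaggle file has no missing values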

Exploratory analysis

ggplot(data = hr, aes(x=sl))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = hr, aes(x=le))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

table(hr$left)
## 
##     0     1 
## 11428  3571
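
The classes are imbalanced: 3571 of 14999 employees (roughly 24%) left. A proportion table (an added check) makes the base rate explicit, which matters when reading the accuracy figures later:

prop.table(table(hr$left))  # ~0.76 stayed, ~0.24 left
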
# How many employees work in each department?
job_type<-data.frame(table(hr$sales))
colnames(job_type)<-c("Job", "Freq")
ggplot(data = job_type, aes(x=Freq, y=Job))+geom_point()

ggplot(data = hr, aes(x=amh, y=sl, color=as.factor(left)))+geom_point()

ggplot(data = hr, aes(x=sales, y=amh, color=as.factor(left)))+geom_point()

ggplot(data = hr, aes(x=salary, y=sl, color=as.factor(left)))+geom_point()

ggplot(data = hr, aes(x=salary, y=sales, color=as.factor(left)))+geom_point()

ggplot(data = hr, aes(x=amh, y=sales, color=as.factor(left)))+geom_point()

ggplot(data = hr, aes(x=amh, y=tsac, color=as.factor(left)))+geom_point()

ggplot(data = hr, aes(x=promo, y=tsac, color=as.factor(left)))+geom_point()
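
One additional view worth including (an addition to the original set of plots): satisfaction level split by attrition status, which isolates the low-satisfaction cluster among leavers more directly than the scatter plots above.

ggplot(data = hr, aes(x=as.factor(left), y=sl))+geom_boxplot()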

Data splitting

set.seed(123)
inTrain<-createDataPartition(y=hr$left, p=0.75, list = FALSE)  
train<-hr[inTrain,]
test<-hr[-inTrain,]
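
Since left is numeric, createDataPartition groups it into percentile-based bins before sampling, which here effectively stratifies on the 0/1 outcome, so both splits should preserve the ~24% attrition rate. A quick verification (added here):

prop.table(table(train$left))  # should be close to the full-data proportions
prop.table(table(test$left))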

Feature selection (not used in the final models)

# train_num<-train[,1:7]
# cor_matrix<-cor(train_num, use = "complete.obs", method = "kendall")

Train models

set.seed(234)
# Gradient Boosting Model
fitControl<-trainControl(method = "repeatedcv", number = 10, repeats = 3)
fit_1<-train(as.factor(left)~., data = train, method="gbm", trControl=fitControl, verbose=FALSE)
# Logistic Regression Model
fit_2<-train(as.factor(left)~., data=train, method="glm", family="binomial")
# Decision Tree Model (reusing the repeated CV control defined above)
fit_3<-train(as.factor(left)~., data = train, method = "rpart", parms = list(split = "information"), trControl=fitControl, tuneLength = 10)
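
Before predicting, it is worth seeing which features the boosted model leans on. caret's varImp generic works on any train object (an added inspection step, not in the original script):

varImp(fit_1)        # relative influence of each predictor in the GBM
plot(varImp(fit_1))  # dot plot of the same ranking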

Predict

# Predict with Gradient Boosting Model
predict_1<-predict(fit_1, newdata=test)
# Predict with Logistic Regression Model
predict_2<-predict(fit_2, newdata=test)
# Predict with Decision Tree Model
predict_3<-predict(fit_3, newdata=test)
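
The class predictions answer "will this employee leave?", but the motivating question of who leaves next is better served by ranking employees by their predicted probability of leaving. A sketch of that idea (an addition; caret can be picky about class levels such as 0/1 that are not valid R variable names once probabilities are involved, so the outcome is relabeled first):

# Relabel the outcome so probability columns get valid names ("stay", "leave")
train2<-train; test2<-test
train2$left<-factor(train2$left, levels = c(0, 1), labels = c("stay", "leave"))
fit_prob<-train(left~., data = train2, method = "gbm", trControl = fitControl, verbose = FALSE)
# Rank test-set employees by predicted probability of leaving, highest risk first
risk<-predict(fit_prob, newdata = test2, type = "prob")$leave
head(test2[order(risk, decreasing = TRUE), ], 10)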

Evaluate predicted results

  1. Gradient Boosting Model
confusionMatrix(predict_1, as.factor(test$left))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2825   63
##          1   31  830
##                                           
##                Accuracy : 0.9749          
##                  95% CI : (0.9694, 0.9797)
##     No Information Rate : 0.7618          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9301          
##  Mcnemar's Test P-Value : 0.001387        
##                                           
##             Sensitivity : 0.9891          
##             Specificity : 0.9295          
##          Pos Pred Value : 0.9782          
##          Neg Pred Value : 0.9640          
##              Prevalence : 0.7618          
##          Detection Rate : 0.7535          
##    Detection Prevalence : 0.7703          
##       Balanced Accuracy : 0.9593          
##                                           
##        'Positive' Class : 0               
## 
  2. Logistic Regression Model
confusionMatrix(predict_2, as.factor(test$left))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2665  574
##          1  191  319
##                                           
##                Accuracy : 0.7959          
##                  95% CI : (0.7827, 0.8087)
##     No Information Rate : 0.7618          
##     P-Value [Acc > NIR] : 3.328e-07       
##                                           
##                   Kappa : 0.3405          
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9331          
##             Specificity : 0.3572          
##          Pos Pred Value : 0.8228          
##          Neg Pred Value : 0.6255          
##              Prevalence : 0.7618          
##          Detection Rate : 0.7109          
##    Detection Prevalence : 0.8640          
##       Balanced Accuracy : 0.6452          
##                                           
##        'Positive' Class : 0               
## 
  3. Decision Tree Model
confusionMatrix(predict_3, as.factor(test$left))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2831   69
##          1   25  824
##                                           
##                Accuracy : 0.9749          
##                  95% CI : (0.9694, 0.9797)
##     No Information Rate : 0.7618          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9297          
##  Mcnemar's Test P-Value : 9.202e-06       
##                                           
##             Sensitivity : 0.9912          
##             Specificity : 0.9227          
##          Pos Pred Value : 0.9762          
##          Neg Pred Value : 0.9706          
##              Prevalence : 0.7618          
##          Detection Rate : 0.7551          
##    Detection Prevalence : 0.7735          
##       Balanced Accuracy : 0.9570          
##                                           
##        'Positive' Class : 0               
##
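
A compact side-by-side view of the three models on the held-out test set (an added convenience that only reuses objects computed above):

# Held-out accuracy and kappa for each model, in one table
preds<-list(gbm = predict_1, glm = predict_2, rpart = predict_3)
round(sapply(preds, function(p) confusionMatrix(p, as.factor(test$left))$overall[c("Accuracy", "Kappa")]), 4)

On these numbers the boosted trees and the tuned decision tree are effectively tied at about 97.5% accuracy, while logistic regression trails at about 79.6%, consistent with the strongly nonlinear clusters visible in the exploratory plots.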