Intro

Motivation: Why are our best and most experienced employees leaving prematurely?

Using the HR Analytics dataset from Kaggle, we try to predict which valuable employees will leave next.

The dataset is simulated and was downloaded from the Kaggle website.

Fields in the dataset (renamed below to short column names) include:

  - sl: satisfaction level
  - le: last evaluation score
  - nop: number of projects
  - amh: average monthly hours
  - tsac: time spent at the company (years)
  - wa: whether the employee had a work accident
  - left: whether the employee left (the outcome to predict)
  - promo: whether the employee was promoted in the last five years
  - sales: department
  - salary: salary band (low, medium, high)

Ensure your working directory is set to the directory where the data resides.
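For example (a sketch; the path below is a placeholder to replace with your own):

# Placeholder path -- substitute the folder that holds HR_comma_sep.csv
setwd("~/projects/hr-analytics")
stopifnot(file.exists("HR_comma_sep.csv"))  # fail fast if the file is missing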

Load libraries

suppressMessages(library(plyr))
suppressMessages(library(dplyr))
suppressMessages(library(ggplot2))
suppressMessages(library(caret))
suppressMessages(library(stats))
suppressMessages(library(mlbench))
suppressMessages(library(AppliedPredictiveModeling))
suppressMessages(library(gbm))
suppressMessages(library(rpart))
suppressMessages(library(ggfortify))

Load and clean data

hr<-read.csv("HR_comma_sep.csv", header = TRUE)
# Shorten the original Kaggle column names (see the field list above)
colnames(hr)<-c("sl", "le", "nop", "amh", "tsac", "wa", "left", "promo", "sales", "salary")
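
A quick sanity check (an addition to the original script) confirms the expected shape and that nothing is missing:

dim(hr)         # expect 14999 rows and 10 columns
str(hr)         # sales and salary are text (factor or character, depending on R version)
sum(is.na(hr))  # should be 0; the Kaggle file has no missing values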

Exploratory analysis

ggplot(data = hr, aes(x=sl))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = hr, aes(x=le))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

table(hr$left)
## 
##     0     1 
## 11428  3571
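
The classes are imbalanced: 3571 of 14999 employees (roughly 24%) left. A proportion table (an added check) makes the base rate explicit, which matters when reading the accuracy figures later:

prop.table(table(hr$left))  # ~0.76 stayed, ~0.24 left
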
# How many employees work in each department?
job_type<-data.frame(table(hr$sales))
colnames(job_type)<-c("Job", "Freq")
ggplot(data = job_type, aes(x=Freq, y=Job))+geom_point()

ggplot(data = hr, aes(x=amh, y=sl, color=as.factor(left)))+geom_point()

ggplot(data = hr, aes(x=sales, y=amh, color=as.factor(left)))+geom_point()

ggplot(data = hr, aes(x=salary, y=sl, color=as.factor(left)))+geom_point()

ggplot(data = hr, aes(x=salary, y=sales, color=as.factor(left)))+geom_point()

ggplot(data = hr, aes(x=amh, y=sales, color=as.factor(left)))+geom_point()

ggplot(data = hr, aes(x=amh, y=tsac, color=as.factor(left)))+geom_point()

ggplot(data = hr, aes(x=promo, y=tsac, color=as.factor(left)))+geom_point()
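
One additional view worth including (an addition to the original set of plots): satisfaction level split by attrition status, which isolates the low-satisfaction cluster among leavers more directly than the scatter plots above.

ggplot(data = hr, aes(x=as.factor(left), y=sl))+geom_boxplot()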

Data splitting

set.seed(123)
inTrain<-createDataPartition(y=hr$left, p=0.75, list = FALSE)  
train<-hr[inTrain,]
test<-hr[-inTrain,]
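
Since left is numeric, createDataPartition groups it into percentile-based bins before sampling, which here effectively stratifies on the 0/1 outcome, so both splits should preserve the ~24% attrition rate. A quick verification (added here):

prop.table(table(train$left))  # should be close to the full-data proportions
prop.table(table(test$left))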

Feature selection (not used in the final models)

# train_num<-train[,1:7]
# cor_matrix<-cor(train_num, use = "complete.obs", method = "kendall")

Train models

set.seed(234)
# Gradient Boosting Model
fitControl<-trainControl(method = "repeatedcv", number = 10, repeats = 3)
fit_1<-train(as.factor(left)~., data = train, method="gbm", trControl=fitControl, verbose=FALSE)
# Logistic Regression Model
fit_2<-train(as.factor(left)~., data=train, method="glm", family="binomial")
# Decision Tree Model (reusing the repeated CV control defined above)
fit_3<-train(as.factor(left)~., data = train, method = "rpart", parms = list(split = "information"), trControl=fitControl, tuneLength = 10)
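
Before predicting, it is worth seeing which features the boosted model leans on. caret's varImp generic works on any train object (an added inspection step, not in the original script):

varImp(fit_1)        # relative influence of each predictor in the GBM
plot(varImp(fit_1))  # dot plot of the same ranking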

Predict

# Predict with Gradient Boosting Model
predict_1<-predict(fit_1, newdata=test)
# Predict with Logistic Regression Model
predict_2<-predict(fit_2, newdata=test)
# Predict with Decision Tree Model
predict_3<-predict(fit_3, newdata=test)
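
The class predictions answer "will this employee leave?", but the motivating question of who leaves next is better served by ranking employees by their predicted probability of leaving. A sketch of that idea (an addition; caret can be picky about class levels such as 0/1 that are not valid R variable names once probabilities are involved, so the outcome is relabeled first):

# Relabel the outcome so probability columns get valid names ("stay", "leave")
train2<-train; test2<-test
train2$left<-factor(train2$left, levels = c(0, 1), labels = c("stay", "leave"))
fit_prob<-train(left~., data = train2, method = "gbm", trControl = fitControl, verbose = FALSE)
# Rank test-set employees by predicted probability of leaving, highest risk first
risk<-predict(fit_prob, newdata = test2, type = "prob")$leave
head(test2[order(risk, decreasing = TRUE), ], 10)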

Evaluate predicted results

  1. Gradient Boosting Model
confusionMatrix(predict_1, as.factor(test$left))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2825   63
##          1   31  830
##                                           
##                Accuracy : 0.9749          
##                  95% CI : (0.9694, 0.9797)
##     No Information Rate : 0.7618          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9301          
##  Mcnemar's Test P-Value : 0.001387        
##                                           
##             Sensitivity : 0.9891          
##             Specificity : 0.9295          
##          Pos Pred Value : 0.9782          
##          Neg Pred Value : 0.9640          
##              Prevalence : 0.7618          
##          Detection Rate : 0.7535          
##    Detection Prevalence : 0.7703          
##       Balanced Accuracy : 0.9593          
##                                           
##        'Positive' Class : 0               
## 
  2. Logistic Regression Model
confusionMatrix(predict_2, as.factor(test$left))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2665  574
##          1  191  319
##                                           
##                Accuracy : 0.7959          
##                  95% CI : (0.7827, 0.8087)
##     No Information Rate : 0.7618          
##     P-Value [Acc > NIR] : 3.328e-07       
##                                           
##                   Kappa : 0.3405          
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9331          
##             Specificity : 0.3572          
##          Pos Pred Value : 0.8228          
##          Neg Pred Value : 0.6255          
##              Prevalence : 0.7618          
##          Detection Rate : 0.7109          
##    Detection Prevalence : 0.8640          
##       Balanced Accuracy : 0.6452          
##                                           
##        'Positive' Class : 0               
## 
  3. Decision Tree Model
confusionMatrix(predict_3, as.factor(test$left))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2831   69
##          1   25  824
##                                           
##                Accuracy : 0.9749          
##                  95% CI : (0.9694, 0.9797)
##     No Information Rate : 0.7618          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9297          
##  Mcnemar's Test P-Value : 9.202e-06       
##                                           
##             Sensitivity : 0.9912          
##             Specificity : 0.9227          
##          Pos Pred Value : 0.9762          
##          Neg Pred Value : 0.9706          
##              Prevalence : 0.7618          
##          Detection Rate : 0.7551          
##    Detection Prevalence : 0.7735          
##       Balanced Accuracy : 0.9570          
##                                           
##        'Positive' Class : 0               
##
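
A compact side-by-side view of the three models on the held-out test set (an added convenience that only reuses objects computed above):

# Held-out accuracy and kappa for each model, in one table
preds<-list(gbm = predict_1, glm = predict_2, rpart = predict_3)
round(sapply(preds, function(p) confusionMatrix(p, as.factor(test$left))$overall[c("Accuracy", "Kappa")]), 4)

On these numbers the boosted trees and the tuned decision tree are effectively tied at about 97.5% accuracy, while logistic regression trails at about 79.6%, consistent with the strongly nonlinear clusters visible in the exploratory plots.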