Motivation: Why are our best and most experienced employees leaving prematurely?
Using the following dataset from Kaggle, try to predict which valuable employees will leave next.
The dataset is simulated and was downloaded from the Kaggle website.
Fields in the dataset include satisfaction level, last evaluation score, number of projects, average monthly hours, time spent at the company, a work-accident flag, whether the employee left, promotion in the last five years, department, and salary level.
Ensure your working directory is set to the directory where the data resides.
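A quick sanity check before loading the data avoids a confusing read.csv error later. The path below is only a hypothetical placeholder for wherever the CSV actually lives.
# setwd("~/hr_analytics")   # hypothetical example path -- point this at the folder holding the CSV
if (!file.exists("HR_comma_sep.csv")) stop("HR_comma_sep.csv not found in the working directory")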
suppressMessages(library(plyr))
suppressMessages(library(dplyr))
suppressMessages(library(ggplot2))
suppressMessages(library(caret))
suppressMessages(library(stats))
suppressMessages(library(mlbench))
suppressMessages(library(AppliedPredictiveModeling))
suppressMessages(library(gbm))
suppressMessages(library(rpart))
suppressMessages(library(ggfortify))
hr<-read.csv("HR_comma_sep.csv", header = T)
# Short column labels: sl = satisfaction level, le = last evaluation, nop = number of projects,
# amh = average monthly hours, tsac = time at company, wa = work accident, promo = promotion in last 5 years
colnames(hr)<-c("sl", "le", "nop", "amh", "tsac", "wa", "left", "promo", "sales", "salary")
ggplot(data = hr, aes(x=sl))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = hr, aes(x=le))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
table(hr$left)
##
## 0 1
## 11428 3571
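So roughly a quarter of the employees in the data have already left. The class balance can be confirmed directly with base R:
prop.table(table(hr$left))   # about 76% stayed (0) and 24% left (1): 3571 of 14999 employees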
# Number of employees per department (the "sales" column holds the department name)
job_type<-data.frame(table(hr$sales))
colnames(job_type)<-c("Job", "Freq")
ggplot(data = job_type, aes(x=Freq, y=Job))+geom_point()
# Exploratory scatter plots; colour marks whether the employee left (1) or stayed (0)
ggplot(data = hr, aes(x=amh, y=sl, color=as.factor(left)))+geom_point()
ggplot(data = hr, aes(x=sales, y=amh, color=as.factor(left)))+geom_point()
ggplot(data = hr, aes(x=salary, y=sl, color=as.factor(left)))+geom_point()
ggplot(data = hr, aes(x=salary, y=sales, color=as.factor(left)))+geom_point()
ggplot(data = hr, aes(x=amh, y=sales, color=as.factor(left)))+geom_point()
ggplot(data = hr, aes(x=amh, y=tsac, color=as.factor(left)))+geom_point()
ggplot(data = hr, aes(x=promo, y=tsac, color=as.factor(left)))+geom_point()
set.seed(123)
# 75/25 split into training and test sets
inTrain<-createDataPartition(y=hr$left, p=0.75, list = FALSE)
train<-hr[inTrain,]
test<-hr[-inTrain,]
# train_num<-train[,1:7]
# cor_matrix<-cor(train_num, use = "complete.obs", method = "kendall")
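The two commented lines above sketch a Kendall correlation check on the numeric fields. A runnable version, assuming the first seven renamed columns are the numeric and indicator variables, might look like the following (Kendall's tau is slow on roughly 11,000 training rows, so Spearman is a quicker alternative).
train_num<-train[, c("sl", "le", "nop", "amh", "tsac", "wa", "left")]
cor_matrix<-cor(train_num, use = "complete.obs", method = "kendall")   # or method = "spearman" for speed
round(cor_matrix, 2)   # pairwise associations, including each field's association with the left flag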
set.seed(234)
# Gradient Boosting Model
fitControl<-trainControl(method = "repeatedcv", number = 10, repeats = 3)
fit_1<-train(as.factor(left)~., data = train, method="gbm", trControl=fitControl, verbose=FALSE)
# Logistic Regression Model
fit_2<-train(as.factor(left)~., data=train, method="glm", family="binomial")
# Decision Tree Model
fitControl<-trainControl(method = "repeatedcv", number = 10, repeats = 3)
fit_3<-train(as.factor(left)~., data = train, method = "rpart", parms = list(split = "information"), trControl=fitControl, tuneLength = 10)
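Before scoring the test set it can help to see which fields drive the boosted model; caret's varImp() works directly on a fitted train object. This is an optional check, not part of the original pipeline.
varImp(fit_1)           # relative influence of each predictor in the GBM
# plot(varImp(fit_1))   # optional visual ranking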
# Predict with Gradient Boosting Model
predict_1<-predict(fit_1, newdata=test)
# Predict with Logistic Regression Model
predict_2<-predict(fit_2, newdata=test)
# Predict with Decision Tree Model
predict_3<-predict(fit_3, newdata=test)
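A compact way to put the three models side by side is caret's postResample(), which reports accuracy and kappa for each prediction vector; the detailed confusion matrices follow below.
ref<-as.factor(test$left)
rbind(gbm = postResample(predict_1, ref),
      logistic = postResample(predict_2, ref),
      rpart = postResample(predict_3, ref))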
confusionMatrix(predict_1, as.factor(test$left))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2825 63
## 1 31 830
##
## Accuracy : 0.9749
## 95% CI : (0.9694, 0.9797)
## No Information Rate : 0.7618
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9301
## Mcnemar's Test P-Value : 0.001387
##
## Sensitivity : 0.9891
## Specificity : 0.9295
## Pos Pred Value : 0.9782
## Neg Pred Value : 0.9640
## Prevalence : 0.7618
## Detection Rate : 0.7535
## Detection Prevalence : 0.7703
## Balanced Accuracy : 0.9593
##
## 'Positive' Class : 0
##
confusionMatrix(predict_2, as.factor(test$left))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2665 574
## 1 191 319
##
## Accuracy : 0.7959
## 95% CI : (0.7827, 0.8087)
## No Information Rate : 0.7618
## P-Value [Acc > NIR] : 3.328e-07
##
## Kappa : 0.3405
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9331
## Specificity : 0.3572
## Pos Pred Value : 0.8228
## Neg Pred Value : 0.6255
## Prevalence : 0.7618
## Detection Rate : 0.7109
## Detection Prevalence : 0.8640
## Balanced Accuracy : 0.6452
##
## 'Positive' Class : 0
##
confusionMatrix(predict_3, as.factor(test$left))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2831 69
## 1 25 824
##
## Accuracy : 0.9749
## 95% CI : (0.9694, 0.9797)
## No Information Rate : 0.7618
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9297
## Mcnemar's Test P-Value : 9.202e-06
##
## Sensitivity : 0.9912
## Specificity : 0.9227
## Pos Pred Value : 0.9762
## Neg Pred Value : 0.9706
## Prevalence : 0.7618
## Detection Rate : 0.7551
## Detection Prevalence : 0.7735
## Balanced Accuracy : 0.9570
##
## 'Positive' Class : 0
##
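Finally, returning to the motivating question of which valuable employees will leave next: the fitted GBM can score the employees who are still with the company and rank them by predicted probability of leaving. This is only a sketch; filtering on a high last-evaluation score (le > 0.8) is one plausible definition of "valuable", not something fixed by the data, and the probability column is named after the factor level "1".
current<-subset(hr, left == 0)                                   # employees still with the company
current$leave_prob<-predict(fit_1, newdata = current, type = "prob")[, "1"]
watch_list<-subset(current, le > 0.8)                            # hypothetical "valuable" filter
head(watch_list[order(-watch_list$leave_prob), ], 10)            # ten highest-risk strong performers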