Employee Resign Prediction
Intro
This dataset contains information about employees who worked in a company.
This dataset contains columns:
Satisfactory Level
Number of Project
Average Monthly Hours
Time Spend Company
Promotion Last 5Years
Department
Salary
Data Preparation
Download Data
hrd <- read.csv("HR_comma_sep.csv", stringsAsFactors = T)
head(hrd )Calling Library Needed
library(tidyverse)
library(caret)
library(tidymodels)
library(car)
library(rsample)
library(dplyr)
library(class)glimpse(hrd)#> Rows: 14,999
#> Columns: 10
#> $ satisfaction_level <dbl> 0.38, 0.80, 0.11, 0.72, 0.37, 0.41, 0.10, 0.92, …
#> $ last_evaluation <dbl> 0.53, 0.86, 0.88, 0.87, 0.52, 0.50, 0.77, 0.85, …
#> $ number_project <int> 2, 5, 7, 5, 2, 2, 6, 5, 5, 2, 2, 6, 4, 2, 2, 2, …
#> $ average_montly_hours <int> 157, 262, 272, 223, 159, 153, 247, 259, 224, 142…
#> $ time_spend_company <int> 3, 6, 4, 5, 3, 3, 4, 5, 5, 3, 3, 4, 5, 3, 3, 3, …
#> $ Work_accident <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
#> $ left <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
#> $ promotion_last_5years <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
#> $ Department <fct> sales, sales, sales, sales, sales, sales, sales,…
#> $ salary <fct> low, medium, medium, low, low, low, low, low, lo…
Convert Data Type
hrd <-
hrd %>%
mutate(left = as.factor(left))
head(hrd)Modeling
Train - Test Split
set.seed(303)
index<- sample(nrow(hrd), nrow(hrd)*0.7)
p_train <- hrd[index,]
p_test <- hrd[-index,]checking data train target proportion
prop.table(table(p_train$left))#>
#> 0 1
#> 0.7590247 0.2409753
Balancing data train using down sample
set.seed(100)
p_train<-downSample(x= p_train %>% select(-left),
y = p_train$left,
yname = "left")Data train now has balancing proportion for target
prop.table(table(p_train$left))#>
#> 0 1
#> 0.5 0.5
Naive Bayes Method
Making model with Naive Bayes
library(e1071)
model_NB_1 <- naiveBayes(formula = left~.,
data = p_train,
laplace = 1)#type class
p_test$pred_label <- predict(object = model_NB_1,
newdata = p_test,
type = "class")Confusion Matrix
nb <- confusionMatrix(data = p_test$pred_label,
reference = p_test$left,
)
nb#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction 0 1
#> 0 1970 95
#> 1 1489 946
#>
#> Accuracy : 0.648
#> 95% CI : (0.6338, 0.662)
#> No Information Rate : 0.7687
#> P-Value [Acc > NIR] : 1
#>
#> Kappa : 0.3258
#>
#> Mcnemar's Test P-Value : <0.0000000000000002
#>
#> Sensitivity : 0.5695
#> Specificity : 0.9087
#> Pos Pred Value : 0.9540
#> Neg Pred Value : 0.3885
#> Prevalence : 0.7687
#> Detection Rate : 0.4378
#> Detection Prevalence : 0.4589
#> Balanced Accuracy : 0.7391
#>
#> 'Positive' Class : 0
#>
Incorrect predictions Employee predicted left but
stay amounted 1489 and accuracy 64,8%, i assume the
accuracy at least reach 80%
ROC dand AUC
Calculating ROC
library(ROCR)
p_test$pred <- predict(object = model_NB_1,
newdata = p_test,
type = "raw")
# objek prediction
roc_pred <- prediction(predictions = p_test$pred[,1],
labels = p_test$left )
plot(performance(prediction.obj = roc_pred,
measure = "tpr",
x.measure = "fpr"))
abline(0,1, lty=2)
area under ROC
# nilai AUC
auc_pred <- performance(prediction.obj = roc_pred,
measure = "auc")
auc_pred@y.values#> [[1]]
#> [1] 0.1426399
Area under is 0.14, means our model has bad performance in predict unseen data.
Decision Tree
Making Model with Decision Tree
library(partykit)
left_tree <- ctree(formula = left~.,
data = p_train)plot(left_tree, type = 'simple')Confusion Matrix
pred_left2 <- predict(left_tree,
p_test,
type = "response")
dc <- confusionMatrix(pred_left2, p_test$left, positive = "1")
dc#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction 0 1
#> 0 3333 89
#> 1 126 952
#>
#> Accuracy : 0.9522
#> 95% CI : (0.9456, 0.9583)
#> No Information Rate : 0.7687
#> P-Value [Acc > NIR] : < 0.0000000000000002
#>
#> Kappa : 0.8673
#>
#> Mcnemar's Test P-Value : 0.01408
#>
#> Sensitivity : 0.9145
#> Specificity : 0.9636
#> Pos Pred Value : 0.8831
#> Neg Pred Value : 0.9740
#> Prevalence : 0.2313
#> Detection Rate : 0.2116
#> Detection Prevalence : 0.2396
#> Balanced Accuracy : 0.9390
#>
#> 'Positive' Class : 1
#>
tr_pred <- predict(left_tree,
p_train,
type = "response")
# confusion matrix data train
dc_t <- confusionMatrix(tr_pred, p_train$left, positive = "1")
dc_t#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction 0 1
#> 0 2442 203
#> 1 88 2327
#>
#> Accuracy : 0.9425
#> 95% CI : (0.9357, 0.9487)
#> No Information Rate : 0.5
#> P-Value [Acc > NIR] : < 0.00000000000000022
#>
#> Kappa : 0.885
#>
#> Mcnemar's Test P-Value : 0.00000000002344
#>
#> Sensitivity : 0.9198
#> Specificity : 0.9652
#> Pos Pred Value : 0.9636
#> Neg Pred Value : 0.9233
#> Prevalence : 0.5000
#> Detection Rate : 0.4599
#> Detection Prevalence : 0.4773
#> Balanced Accuracy : 0.9425
#>
#> 'Positive' Class : 1
#>
Accuracy for data train using decision tree model = 94% and data test = 95% # Evaluation Naive Bayes Method and Decision Tree
eval_nb <- data_frame(Accuracy =nb$overall[1],
Recall = nb$byClass[1],
Specificity = nb$byClass[2],
Precision = nb$byClass[3])
eval_dc <- data_frame(Accuracy = dc$overall[1],
Recall = dc$byClass[1],
Specificity = dc$byClass[2],
Precision = dc$byClass[3])Naive Bayes Model
eval_nb Decision Tree
eval_dcConclusion
Model Decision Tree more accurate in predict unseen data as we can see above the accuracy for Decision Tree = 95% compare to Naive Bayes = 64%