library(dplyr)
library(caret)
library(class)
library(ggplot2)
library(gtools)
library(tidyverse)
library(GGally)
library(partykit)Exploratory Data Analysis
This is the dataset given about the turnover in a company which have some variabels in the dataset.
I want to see the biggest probability of employee resign based on division and average monthly hours
div <- turnover %>%
select(average_montly_hours, division, left) %>%
filter(left == 1) %>%
group_by(division) %>%
summarise(avg_monthly_hours = mean(average_montly_hours), sum_left = sum(left)) %>%
arrange(avg_monthly_hours)
divThe result shows that Technical division have the highest average monthly work hours
ggplot(div, aes(reorder(division, avg_monthly_hours), avg_monthly_hours)) +
geom_col(aes(fill = avg_monthly_hours)) +
coord_flip() +
labs(title = "Average Monthly Working Hour For Each Division",
x = "Division",
y = "Average Monthly Working Hours") +
theme_minimal() +
theme(legend.position = "none")Proportion of left employee based on dataset
#>
#> 0 1
#> 0.5 0.5
turnover$left <- as.factor(turnover$left)
turnover$promotion_last_5years <- as.factor(turnover$promotion_last_5years)Checking multicollinearity
#> satisfaction_level last_evaluation number_project average_montly_hours
#> Min. :0.0900 Min. :0.3600 Min. :2.000 Min. : 97.0
#> 1st Qu.:0.3800 1st Qu.:0.5400 1st Qu.:2.000 1st Qu.:151.0
#> Median :0.5600 Median :0.7300 Median :4.000 Median :205.0
#> Mean :0.5526 Mean :0.7158 Mean :3.819 Mean :203.4
#> 3rd Qu.:0.7800 3rd Qu.:0.8800 3rd Qu.:5.000 3rd Qu.:252.0
#> Max. :1.0000 Max. :1.0000 Max. :7.000 Max. :310.0
#>
#> time_spend_company Work_accident promotion_last_5years
#> Min. : 2.000 Min. :0.0000 0:7022
#> 1st Qu.: 3.000 1st Qu.:0.0000 1: 120
#> Median : 3.000 Median :0.0000
#> Mean : 3.616 Mean :0.1088
#> 3rd Qu.: 4.000 3rd Qu.:0.0000
#> Max. :10.000 Max. :1.0000
#>
#> division salary left
#> sales :1997 high : 462 0:3571
#> technical :1306 low :3773 1:3571
#> support :1080 medium:2907
#> IT : 560
#> product_mng: 430
#> marketing : 403
#> (Other) :1366
set.seed(100)
index <- sample(nrow(turnover), nrow(turnover)*0.8)
turnover.train <- turnover[index, ]
turnover.test <- turnover[-index, ]Checking if the distribution of train data and test data already balance
#>
#> 0 1
#> 0.5016629 0.4983371
#>
#> 0 1
#> 0.493352 0.506648
Classification Modelling
Logistic Regression Model Fitting
model_logistic_none <- glm(formula = left ~ 1, data = turnover.train, family = "binomial")
summary(model_logistic_none)#>
#> Call:
#> glm(formula = left ~ 1, family = "binomial", data = turnover.train)
#>
#> Deviance Residuals:
#> Min 1Q Median 3Q Max
#> -1.175 -1.175 -1.175 1.180 1.180
#>
#> Coefficients:
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) -0.006652 0.026461 -0.251 0.802
#>
#> (Dispersion parameter for binomial family taken to be 1)
#>
#> Null deviance: 7919.8 on 5712 degrees of freedom
#> Residual deviance: 7919.8 on 5712 degrees of freedom
#> AIC: 7921.8
#>
#> Number of Fisher Scoring iterations: 3
model_logistic <- glm(formula = left ~ ., data = turnover.train, family = "binomial")
summary(model_logistic)#>
#> Call:
#> glm(formula = left ~ ., family = "binomial", data = turnover.train)
#>
#> Deviance Residuals:
#> Min 1Q Median 3Q Max
#> -3.3293 -0.7793 -0.0844 0.8384 2.7448
#>
#> Coefficients:
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) -1.3972485 0.2660175 -5.252 1.50e-07 ***
#> satisfaction_level -4.6480212 0.1556055 -29.871 < 2e-16 ***
#> last_evaluation 1.4509215 0.2421707 5.991 2.08e-09 ***
#> number_project -0.4818970 0.0342090 -14.087 < 2e-16 ***
#> average_montly_hours 0.0051448 0.0008361 6.153 7.59e-10 ***
#> time_spend_company 0.5631751 0.0296456 18.997 < 2e-16 ***
#> Work_accident -1.4437353 0.1197889 -12.052 < 2e-16 ***
#> promotion_last_5years1 -1.5089587 0.3320227 -4.545 5.50e-06 ***
#> divisionhr 0.1944633 0.1950226 0.997 0.31870
#> divisionIT -0.0183414 0.1838374 -0.100 0.92053
#> divisionmanagement -0.6037547 0.2293184 -2.633 0.00847 **
#> divisionmarketing 0.0594074 0.1961159 0.303 0.76195
#> divisionproduct_mng -0.0566828 0.1920876 -0.295 0.76793
#> divisionRandD -0.2990479 0.2074116 -1.442 0.14936
#> divisionsales -0.0057461 0.1539261 -0.037 0.97022
#> divisionsupport 0.0264632 0.1634755 0.162 0.87140
#> divisiontechnical 0.2451741 0.1603301 1.529 0.12622
#> salarylow 2.1051462 0.1798300 11.706 < 2e-16 ***
#> salarymedium 1.6313735 0.1811915 9.004 < 2e-16 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for binomial family taken to be 1)
#>
#> Null deviance: 7919.8 on 5712 degrees of freedom
#> Residual deviance: 5811.9 on 5694 degrees of freedom
#> AIC: 5849.9
#>
#> Number of Fisher Scoring iterations: 5
#> (Intercept) satisfaction_level last_evaluation
#> 0.247276421 0.009580541 4.267044977
#> number_project average_montly_hours time_spend_company
#> 0.617610662 1.005158048 1.756239921
#> Work_accident promotion_last_5years1 divisionhr
#> 0.236044407 0.221140137 1.214658927
#> divisionIT divisionmanagement divisionmarketing
#> 0.981825771 0.546754863 1.061207507
#> divisionproduct_mng divisionRandD divisionsales
#> 0.944893787 0.741523892 0.994270383
#> divisionsupport divisiontechnical salarylow
#> 1.026816469 1.277843733 8.208303163
#> salarymedium
#> 5.110889906
#> (Intercept) satisfaction_level last_evaluation
#> 0.198253103 0.009489625 0.810140220
#> number_project average_montly_hours time_spend_company
#> 0.381804273 0.501286195 0.637186882
#> Work_accident promotion_last_5years1 divisionhr
#> 0.190967578 0.181093169 0.548463202
#> divisionIT divisionmanagement divisionmarketing
#> 0.495414776 0.353485143 0.514847488
#> divisionproduct_mng divisionRandD divisionsales
#> 0.485833105 0.425790249 0.498563480
#> divisionsupport divisiontechnical salarylow
#> 0.506615416 0.560988322 0.891402359
#> salarymedium
#> 0.836357713
k-NN Model Fitting
Scaling data train and data test
test_knn <- turnover.test %>%
select_if(is.numeric) %>%
scale(center = attr(train_knn, "scaled:center"),
scale = attr(train_knn, "scaled:scale"))#> [1] 5713
Acquire number of k by square root number of rows in train_knn
#> [1] 75.58439
Predict
in logistic model, I give the threshold 0.45, so if the probability is above 0.45, the result is left / resign
pred_value <- predict(model_logistic, turnover.test, type = "response")
pred_label <- as.factor(ifelse(pred_value < 0.45, 0, 1))#> pred_label
#> 0 1
#> 592 837
in decision tree model
Model Evaluation
Logistic Confusion matrix
turnover.test$left <- as.factor(turnover.test$left)
confusionMatrix(pred_label, turnover.test$left, positive = "1")#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction 0 1
#> 0 488 104
#> 1 217 620
#>
#> Accuracy : 0.7754
#> 95% CI : (0.7528, 0.7968)
#> No Information Rate : 0.5066
#> P-Value [Acc > NIR] : < 2.2e-16
#>
#> Kappa : 0.5497
#>
#> Mcnemar's Test P-Value : 4.072e-10
#>
#> Sensitivity : 0.8564
#> Specificity : 0.6922
#> Pos Pred Value : 0.7407
#> Neg Pred Value : 0.8243
#> Prevalence : 0.5066
#> Detection Rate : 0.4339
#> Detection Prevalence : 0.5857
#> Balanced Accuracy : 0.7743
#>
#> 'Positive' Class : 1
#>
k-NN Confusion Matrix
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction 0 1
#> 0 645 63
#> 1 60 661
#>
#> Accuracy : 0.9139
#> 95% CI : (0.8982, 0.928)
#> No Information Rate : 0.5066
#> P-Value [Acc > NIR] : <2e-16
#>
#> Kappa : 0.8278
#>
#> Mcnemar's Test P-Value : 0.8569
#>
#> Sensitivity : 0.9130
#> Specificity : 0.9149
#> Pos Pred Value : 0.9168
#> Neg Pred Value : 0.9110
#> Prevalence : 0.5066
#> Detection Rate : 0.4626
#> Detection Prevalence : 0.5045
#> Balanced Accuracy : 0.9139
#>
#> 'Positive' Class : 1
#>
Decision Tree
Confusion matrix from data train
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction 0 1
#> 0 2782 193
#> 1 84 2654
#>
#> Accuracy : 0.9515
#> 95% CI : (0.9456, 0.9569)
#> No Information Rate : 0.5017
#> P-Value [Acc > NIR] : < 2.2e-16
#>
#> Kappa : 0.903
#>
#> Mcnemar's Test P-Value : 8.636e-11
#>
#> Sensitivity : 0.9322
#> Specificity : 0.9707
#> Pos Pred Value : 0.9693
#> Neg Pred Value : 0.9351
#> Prevalence : 0.4983
#> Detection Rate : 0.4646
#> Detection Prevalence : 0.4793
#> Balanced Accuracy : 0.9515
#>
#> 'Positive' Class : 1
#>
Confusion matrix from data test
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction 0 1
#> 0 686 53
#> 1 19 671
#>
#> Accuracy : 0.9496
#> 95% CI : (0.937, 0.9604)
#> No Information Rate : 0.5066
#> P-Value [Acc > NIR] : < 2.2e-16
#>
#> Kappa : 0.8993
#>
#> Mcnemar's Test P-Value : 0.0001006
#>
#> Sensitivity : 0.9268
#> Specificity : 0.9730
#> Pos Pred Value : 0.9725
#> Neg Pred Value : 0.9283
#> Prevalence : 0.5066
#> Detection Rate : 0.4696
#> Detection Prevalence : 0.4829
#> Balanced Accuracy : 0.9499
#>
#> 'Positive' Class : 1
#>
Conclusion
From the confusion matrix of 3 models, logistic regression, k-NN, and decision tree, the confusion matrix for decision tree have better accuracy, sensitivity, and precision (Pos Pred Value). Although logistic regression in this case is lower than others (k-NN and decision tree), but it is more interpretable than k-NN because it show how strong for the each variable affect the prediction. Because decision tree easy to get overfit, so I show two confusion matrix from the data test and train to see if it is ofervit or not, and it show only small difference between data train and data test predict. Beside accuracy, I also focus on Sensitivity to see my prediction of turnover employee compared to real data of turnover employee. Sensitivity from the confusion matrix using data test show that from 724 (671 + 53) employees that resign in the data test, 671 from them are predicted resign correctly.