HR Analystics Job Prediction
Intro
This dataset contains information about employees who worked in a company.
This dataset contains columns:
Satisfactory Level
Number of Project
Average Monthly Hours
Time Spend Company
Promotion Last 5Years
Department
Salary
Data Preparation
Download Data
hrd <- read.csv("hr_comma_sep.csv", stringsAsFactors = T)
head(hrd)rmarkdown::paged_table(hrd)Calling Library Needed
library(readr)
library(tidyverse)
library(caret)
library(data.table)
library(tidymodels)
library(car)
library(dplyr)
library(class)
library(gtools)
library(lmtest)Checking Missing Values
anyNA(hrd)#> [1] FALSE
colSums(is.na(hrd))#> satisfaction_level last_evaluation number_project
#> 0 0 0
#> average_montly_hours time_spend_company Work_accident
#> 0 0 0
#> left promotion_last_5years Department
#> 0 0 0
#> salary
#> 0
Convert Data Type
Above,all string columns had changed into factor, but i better to peek into dataset again, to see if any column need to convert
glimpse(hrd)#> Rows: 14,999
#> Columns: 10
#> $ satisfaction_level <dbl> 0.38, 0.80, 0.11, 0.72, 0.37, 0.41, 0.10, 0.92, …
#> $ last_evaluation <dbl> 0.53, 0.86, 0.88, 0.87, 0.52, 0.50, 0.77, 0.85, …
#> $ number_project <int> 2, 5, 7, 5, 2, 2, 6, 5, 5, 2, 2, 6, 4, 2, 2, 2, …
#> $ average_montly_hours <int> 157, 262, 272, 223, 159, 153, 247, 259, 224, 142…
#> $ time_spend_company <int> 3, 6, 4, 5, 3, 3, 4, 5, 5, 3, 3, 4, 5, 3, 3, 3, …
#> $ Work_accident <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
#> $ left <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
#> $ promotion_last_5years <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
#> $ Department <fct> sales, sales, sales, sales, sales, sales, sales,…
#> $ salary <fct> low, medium, medium, low, low, low, low, low, lo…
Since the target is class, i change left column into
factor,
In this dataset, number 1 = left , and 0 = stay
I put 1 as positive result
hrd <-
hrd %>%
mutate(left = as.factor(left))
head(hrd)Modeling
Train - Test Split
Before making the model, i split the data into data train & and data test.
set.seed(303)
index<- sample(nrow(hrd), nrow(hrd)*0.7)
p_train <- hrd[index,]
p_test <- hrd[-index,]Then check the proportion of target in data train.
prop.table(table(p_train$left))#>
#> 0 1
#> 0.7590247 0.2409753
there is large gap between data left = 1, and stay = 0, i need to balancing the data using downSample
set.seed(100)
p_train<-downSample(x= p_train %>% select(-left),
y = p_train$left,
yname = "left")
#check the proportion of target
prop.table(table(p_train$left))#>
#> 0 1
#> 0.5 0.5
Logistic Regression
In my first model, I’m using logistic regression with
glm
model_x <- glm(
formula = left ~ .,family = "binomial" ,data = p_train )
summary(model_x)#>
#> Call:
#> glm(formula = left ~ ., family = "binomial", data = p_train)
#>
#> Deviance Residuals:
#> Min 1Q Median 3Q Max
#> -3.1660 -0.7849 0.1107 0.8247 2.8549
#>
#> Coefficients:
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) -1.1414785 0.2734808 -4.174 0.0000299444 ***
#> satisfaction_level -4.6487226 0.1625235 -28.603 < 0.0000000000000002 ***
#> last_evaluation 1.3368819 0.2579381 5.183 0.0000002184 ***
#> number_project -0.4529048 0.0360765 -12.554 < 0.0000000000000002 ***
#> average_montly_hours 0.0050391 0.0008823 5.711 0.0000000112 ***
#> time_spend_company 0.5020634 0.0303966 16.517 < 0.0000000000000002 ***
#> Work_accident -1.5464834 0.1248631 -12.385 < 0.0000000000000002 ***
#> promotion_last_5years -1.2918794 0.3380707 -3.821 0.000133 ***
#> Departmenthr 0.1500763 0.2006583 0.748 0.454509
#> DepartmentIT -0.0560436 0.1901319 -0.295 0.768176
#> Departmentmanagement -0.6300797 0.2379996 -2.647 0.008111 **
#> Departmentmarketing -0.0811253 0.2003706 -0.405 0.685568
#> Departmentproduct_mng 0.2111888 0.2007651 1.052 0.292836
#> DepartmentRandD -0.4609012 0.2151535 -2.142 0.032178 *
#> Departmentsales 0.1131679 0.1574747 0.719 0.472362
#> Departmentsupport 0.0849673 0.1680474 0.506 0.613127
#> Departmenttechnical 0.1829923 0.1637994 1.117 0.263920
#> salarylow 2.0692591 0.1852549 11.170 < 0.0000000000000002 ***
#> salarymedium 1.5334468 0.1860803 8.241 < 0.0000000000000002 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for binomial family taken to be 1)
#>
#> Null deviance: 7014.6 on 5059 degrees of freedom
#> Residual deviance: 5142.3 on 5041 degrees of freedom
#> AIC: 5180.3
#>
#> Number of Fisher Scoring iterations: 5
Some predictors have not significant value. To choose better model,
let’s use stepwise method.
library(MASS)
model_step <- stepAIC(object = model_x, direction = "backward", trace = F)
summary(model_x)#>
#> Call:
#> glm(formula = left ~ ., family = "binomial", data = p_train)
#>
#> Deviance Residuals:
#> Min 1Q Median 3Q Max
#> -3.1660 -0.7849 0.1107 0.8247 2.8549
#>
#> Coefficients:
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) -1.1414785 0.2734808 -4.174 0.0000299444 ***
#> satisfaction_level -4.6487226 0.1625235 -28.603 < 0.0000000000000002 ***
#> last_evaluation 1.3368819 0.2579381 5.183 0.0000002184 ***
#> number_project -0.4529048 0.0360765 -12.554 < 0.0000000000000002 ***
#> average_montly_hours 0.0050391 0.0008823 5.711 0.0000000112 ***
#> time_spend_company 0.5020634 0.0303966 16.517 < 0.0000000000000002 ***
#> Work_accident -1.5464834 0.1248631 -12.385 < 0.0000000000000002 ***
#> promotion_last_5years -1.2918794 0.3380707 -3.821 0.000133 ***
#> Departmenthr 0.1500763 0.2006583 0.748 0.454509
#> DepartmentIT -0.0560436 0.1901319 -0.295 0.768176
#> Departmentmanagement -0.6300797 0.2379996 -2.647 0.008111 **
#> Departmentmarketing -0.0811253 0.2003706 -0.405 0.685568
#> Departmentproduct_mng 0.2111888 0.2007651 1.052 0.292836
#> DepartmentRandD -0.4609012 0.2151535 -2.142 0.032178 *
#> Departmentsales 0.1131679 0.1574747 0.719 0.472362
#> Departmentsupport 0.0849673 0.1680474 0.506 0.613127
#> Departmenttechnical 0.1829923 0.1637994 1.117 0.263920
#> salarylow 2.0692591 0.1852549 11.170 < 0.0000000000000002 ***
#> salarymedium 1.5334468 0.1860803 8.241 < 0.0000000000000002 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for binomial family taken to be 1)
#>
#> Null deviance: 7014.6 on 5059 degrees of freedom
#> Residual deviance: 5142.3 on 5041 degrees of freedom
#> AIC: 5180.3
#>
#> Number of Fisher Scoring iterations: 5
Model step also has the same result as 1st model, which is this model prefer to using all the predictors in data set
p_test$prob_left <- predict(model_step,type = "response", newdata = p_test)Using plot, we can see distribution of probability prediction
ggplot(p_test , aes(x=prob_left )) +
geom_density(lwd=0.5) +
labs(title = "Distribution of Probability Prediction Data") +
theme_minimal()Above figure explain that, prediction touch < 1 means that employees stay > than leaving.
p_test$prob_left <- factor(ifelse(p_test$prob_left > 0.5,"1","0"))
p_test[c("prob_left","left")]To evaluate my model, next let’s use the confusion matrix
con <- confusionMatrix(p_test$prob_left,p_test$left, positive = "1")
con#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction 0 1
#> 0 2588 204
#> 1 871 837
#>
#> Accuracy : 0.7611
#> 95% CI : (0.7484, 0.7735)
#> No Information Rate : 0.7687
#> P-Value [Acc > NIR] : 0.8884
#>
#> Kappa : 0.4512
#>
#> Mcnemar's Test P-Value : <0.0000000000000002
#>
#> Sensitivity : 0.8040
#> Specificity : 0.7482
#> Pos Pred Value : 0.4900
#> Neg Pred Value : 0.9269
#> Prevalence : 0.2313
#> Detection Rate : 0.1860
#> Detection Prevalence : 0.3796
#> Balanced Accuracy : 0.7761
#>
#> 'Positive' Class : 1
#>
- Accuracy = 76% -> is calculated as the number of all correct predictions divided by the total number of the dataset
- Sensitivity = 80% -> is calculated as the number of correct positive predictions divided by the total number of positives
- Specificity = 75% -> is calculated as the number of correct negative predictions divided by the total number of negatives.
- Precision = 49% - is calculated as the number of correct positive predictions divided by the total number of positive predictions.
K Nearest Neighbour
p_train_x <- p_train %>% select_if(is.numeric)
p_test_x <- p_test %>% select_if(is.numeric)
# target
p_train_y <- p_train[,"left"]
p_test_y <- p_test[,"left"]p_train_xs <- scale(p_train_x)
summary(p_train_xs)#> satisfaction_level last_evaluation number_project average_montly_hours
#> Min. :-1.72121 Min. :-2.0082 Min. :-1.2555 Min. :-1.971125
#> 1st Qu.:-0.64365 1st Qu.:-0.9537 1st Qu.:-1.2555 1st Qu.:-0.942374
#> Median : 0.02517 Median : 0.1008 Median : 0.1065 Median :-0.005475
#> Mean : 0.00000 Mean : 0.0000 Mean : 0.0000 Mean : 0.000000
#> 3rd Qu.: 0.87979 3rd Qu.: 0.8778 3rd Qu.: 0.7874 3rd Qu.: 0.913053
#> Max. : 1.66008 Max. : 1.5438 Max. : 2.1494 Max. : 1.941805
#> time_spend_company Work_accident promotion_last_5years
#> Min. :-1.2490 Min. :-0.3591 Min. :-0.1315
#> 1st Qu.:-0.4872 1st Qu.:-0.3591 1st Qu.:-0.1315
#> Median :-0.4872 Median :-0.3591 Median :-0.1315
#> Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
#> 3rd Qu.: 0.2746 3rd Qu.:-0.3591 3rd Qu.:-0.1315
#> Max. : 4.8456 Max. : 2.7844 Max. : 7.6043
p_test_xs <- scale(p_test_x,
center = attr(p_train_xs,"scaled:center"),
scale =attr(p_train_xs,"scaled:scale"))sqrt(nrow(p_train_xs))#> [1] 71.13368
p_pred <- knn(train=p_train_xs,
test=p_test_xs,
cl=p_train_y,k=71)confusionMatrix(data = as.factor(p_pred),
reference = p_test_y,
positive = "1")#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction 0 1
#> 0 3168 110
#> 1 291 931
#>
#> Accuracy : 0.9109
#> 95% CI : (0.9022, 0.9191)
#> No Information Rate : 0.7687
#> P-Value [Acc > NIR] : < 0.00000000000000022
#>
#> Kappa : 0.7638
#>
#> Mcnemar's Test P-Value : < 0.00000000000000022
#>
#> Sensitivity : 0.8943
#> Specificity : 0.9159
#> Pos Pred Value : 0.7619
#> Neg Pred Value : 0.9664
#> Prevalence : 0.2313
#> Detection Rate : 0.2069
#> Detection Prevalence : 0.2716
#> Balanced Accuracy : 0.9051
#>
#> 'Positive' Class : 1
#>
Conclusion
From two models above,K-NN has better model to predict unseen data