# Load the raw employee attrition records from the CSV in the working directory.
data <- read_csv(file = "employee_attrition.csv")
## Rows: 14900 Columns: 24
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (16): Gender, Job Role, Work-Life Balance, Job Satisfaction, Performance...
## dbl (8): Employee ID, Age, Years at Company, Monthly Income, Number of Prom...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Standardise the column names to snake_case. The raw headers contain spaces
# and hyphens (e.g. "Job Role", "Work-Life Balance"), which would otherwise
# need backtick quoting in every formula and dplyr call that follows.
data <- data %>%
  clean_names()
# Print the cleaned names to confirm what the modelling code can refer to.
colnames(data)
## [1] "employee_id" "age"
## [3] "gender" "years_at_company"
## [5] "job_role" "monthly_income"
## [7] "work_life_balance" "job_satisfaction"
## [9] "performance_rating" "number_of_promotions"
## [11] "overtime" "distance_from_home"
## [13] "education_level" "marital_status"
## [15] "number_of_dependents" "job_level"
## [17] "company_size" "company_tenure"
## [19] "remote_work" "leadership_opportunities"
## [21] "innovation_opportunities" "company_reputation"
## [23] "employee_recognition" "attrition"
# Store the outcome as a factor: the binomial glm() fits and the
# confusionMatrix() calls in this analysis operate on its levels.
data$attrition <- as.factor(data$attrition)
# Reproducible 70/30 train/test split.
# Fix the RNG seed so the same rows are sampled on every run.
set.seed(123)
# Row indices of the training set, drawn by caret from the outcome column;
# `list = FALSE` returns them as a matrix instead of a list.
train_index <- createDataPartition(
  y = data[["attrition"]],
  p = 0.7,
  list = FALSE
)
# The sampled rows form the training set; all remaining rows form the test set.
train <- data[train_index, ]
test <- data[-train_index, ]
# Model 1: baseline logistic regression of attrition on monthly income only.
model1 <- glm(
  formula = attrition ~ monthly_income,
  family = binomial,
  data = train
)
# Show the coefficient table and deviance statistics.
summary(model1)
##
## Call:
## glm(formula = attrition ~ monthly_income, family = binomial,
## data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.167e-01 6.950e-02 1.679 0.0932 .
## monthly_income -5.999e-07 9.131e-06 -0.066 0.9476
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 14428 on 10430 degrees of freedom
## Residual deviance: 14428 on 10429 degrees of freedom
## AIC: 14432
##
## Number of Fisher Scoring iterations: 3
# Model 2: monthly income plus the overtime indicator. The overtime column is
# looked up under both spellings ("over_time" and "overtime"); if neither is
# present, the model falls back to income only and reports that via message().
model2 <- if ("over_time" %in% names(train)) {
  glm(
    formula = attrition ~ monthly_income + over_time,
    family = binomial,
    data = train
  )
} else if ("overtime" %in% names(train)) {
  glm(
    formula = attrition ~ monthly_income + overtime,
    family = binomial,
    data = train
  )
} else {
  message("⚠️ No overtime variable found — fallback to income only")
  glm(
    formula = attrition ~ monthly_income,
    family = binomial,
    data = train
  )
}
# Show the coefficient table and deviance statistics.
summary(model2)
##
## Call:
## glm(formula = attrition ~ monthly_income + overtime, family = binomial,
## data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.977e-01 7.101e-02 2.784 0.00536 **
## monthly_income -7.202e-07 9.146e-06 -0.079 0.93723
## overtimeYes -2.425e-01 4.174e-02 -5.811 6.23e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 14428 on 10430 degrees of freedom
## Residual deviance: 14394 on 10428 degrees of freedom
## AIC: 14400
##
## Number of Fisher Scoring iterations: 3
# Model 3: logistic regression on every available predictor.
# `employee_id` is a record identifier rather than an employee attribute, so it
# is removed from the `.` expansion: any association it shows with attrition
# is an artefact of how IDs were assigned and cannot generalise to new
# employees.
model3 <- glm(
  formula = attrition ~ . - employee_id,
  family = binomial,
  data = train
)
# Show the coefficient table and deviance statistics.
summary(model3)
##
## Call:
## glm(formula = attrition ~ ., family = binomial, data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.412e-01 2.126e-01 -0.664 0.50668
## employee_id 1.088e-06 1.159e-06 0.938 0.34821
## age 7.509e-03 2.417e-03 3.107 0.00189 **
## genderMale 5.696e-01 5.020e-02 11.347 < 2e-16 ***
## years_at_company 1.287e-02 2.859e-03 4.503 6.70e-06 ***
## job_roleFinance 1.971e-01 1.188e-01 1.659 0.09705 .
## job_roleHealthcare 1.758e-01 1.032e-01 1.703 0.08849 .
## job_roleMedia 2.609e-01 8.686e-02 3.004 0.00266 **
## job_roleTechnology 2.973e-01 1.194e-01 2.490 0.01277 *
## monthly_income -3.879e-05 2.029e-05 -1.912 0.05590 .
## work_life_balanceFair -1.365e+00 7.590e-02 -17.984 < 2e-16 ***
## work_life_balanceGood -3.233e-01 7.146e-02 -4.525 6.05e-06 ***
## work_life_balancePoor -1.616e+00 9.069e-02 -17.816 < 2e-16 ***
## job_satisfactionLow -5.752e-01 8.416e-02 -6.834 8.23e-12 ***
## job_satisfactionMedium -1.169e-01 6.655e-02 -1.757 0.07900 .
## job_satisfactionVery High -6.094e-01 6.564e-02 -9.284 < 2e-16 ***
## performance_ratingBelow Average -4.228e-01 7.231e-02 -5.848 4.97e-09 ***
## performance_ratingHigh -1.418e-01 6.356e-02 -2.230 0.02572 *
## performance_ratingLow -4.868e-01 1.181e-01 -4.123 3.74e-05 ***
## number_of_promotions 2.875e-01 2.536e-02 11.337 < 2e-16 ***
## overtimeYes -3.698e-01 5.291e-02 -6.990 2.75e-12 ***
## distance_from_home -9.571e-03 8.741e-04 -10.950 < 2e-16 ***
## education_levelBachelor’s Degree 1.056e-01 6.663e-02 1.585 0.11305
## education_levelHigh School 1.298e-01 7.439e-02 1.744 0.08108 .
## education_levelMaster’s Degree 2.016e-01 7.360e-02 2.739 0.00617 **
## education_levelPhD 1.878e+00 1.388e-01 13.532 < 2e-16 ***
## marital_statusMarried 2.975e-01 7.069e-02 4.209 2.57e-05 ***
## marital_statusSingle -1.642e+00 7.769e-02 -21.133 < 2e-16 ***
## number_of_dependents 1.118e-01 1.617e-02 6.915 4.66e-12 ***
## job_levelMid 1.063e+00 5.525e-02 19.237 < 2e-16 ***
## job_levelSenior 2.754e+00 7.910e-02 34.815 < 2e-16 ***
## company_sizeMedium 5.505e-02 6.572e-02 0.838 0.40218
## company_sizeSmall -1.182e-01 7.180e-02 -1.646 0.09971 .
## company_tenure -4.425e-04 1.093e-03 -0.405 0.68553
## remote_workYes 1.834e+00 7.111e-02 25.799 < 2e-16 ***
## leadership_opportunitiesYes 2.457e-01 1.159e-01 2.120 0.03404 *
## innovation_opportunitiesYes 1.052e-01 6.817e-02 1.543 0.12283
## company_reputationFair -5.511e-01 9.735e-02 -5.661 1.50e-08 ***
## company_reputationGood 4.166e-03 8.703e-02 0.048 0.96182
## company_reputationPoor -6.616e-01 9.696e-02 -6.823 8.91e-12 ***
## employee_recognitionLow -3.421e-02 6.344e-02 -0.539 0.58974
## employee_recognitionMedium -8.725e-03 6.700e-02 -0.130 0.89639
## employee_recognitionVery High 6.296e-02 1.253e-01 0.502 0.61544
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 14427.6 on 10430 degrees of freedom
## Residual deviance: 9914.3 on 10388 degrees of freedom
## AIC: 10000
##
## Number of Fisher Scoring iterations: 5
# Predicted probabilities on the held-out test set, one vector per model.
# With a factor response, a binomial glm() treats the first factor level as
# "failure", so these are probabilities of the second level (lev[2] below).
prob1 <- predict(model1, newdata = test, type = "response")
prob2 <- predict(model2, newdata = test, type = "response")
prob3 <- predict(model3, newdata = test, type = "response")
# Turn the probabilities into class labels at a 0.5 cut-off. Each prediction
# vector is built as a factor carrying the full level set of test$attrition,
# so that confusionMatrix() sees matching levels even when a model never
# predicts one of the classes.
lev <- levels(test[["attrition"]])
pred1 <- factor(ifelse(prob1 <= 0.5, lev[1], lev[2]), levels = lev)
pred2 <- factor(ifelse(prob2 <= 0.5, lev[1], lev[2]), levels = lev)
pred3 <- factor(ifelse(prob3 <= 0.5, lev[1], lev[2]), levels = lev)
# Confusion matrix and class-wise metrics for the income-only model (test set).
confusionMatrix(data = pred1, reference = test$attrition)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Left Stayed
## Left 0 0
## Stayed 2109 2360
##
## Accuracy : 0.5281
## 95% CI : (0.5133, 0.5428)
## No Information Rate : 0.5281
## P-Value [Acc > NIR] : 0.5061
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.0000
## Specificity : 1.0000
## Pos Pred Value : NaN
## Neg Pred Value : 0.5281
## Prevalence : 0.4719
## Detection Rate : 0.0000
## Detection Prevalence : 0.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : Left
##
# Confusion matrix and class-wise metrics for the income + overtime model
# (test set).
confusionMatrix(data = pred2, reference = test$attrition)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Left Stayed
## Left 748 707
## Stayed 1361 1653
##
## Accuracy : 0.5373
## 95% CI : (0.5225, 0.552)
## No Information Rate : 0.5281
## P-Value [Acc > NIR] : 0.1124
##
## Kappa : 0.056
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.3547
## Specificity : 0.7004
## Pos Pred Value : 0.5141
## Neg Pred Value : 0.5484
## Prevalence : 0.4719
## Detection Rate : 0.1674
## Detection Prevalence : 0.3256
## Balanced Accuracy : 0.5275
##
## 'Positive' Class : Left
##
# Confusion matrix and class-wise metrics for the all-variables model
# (test set).
confusionMatrix(data = pred3, reference = test$attrition)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Left Stayed
## Left 1565 558
## Stayed 544 1802
##
## Accuracy : 0.7534
## 95% CI : (0.7405, 0.766)
## No Information Rate : 0.5281
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.5054
##
## Mcnemar's Test P-Value : 0.6953
##
## Sensitivity : 0.7421
## Specificity : 0.7636
## Pos Pred Value : 0.7372
## Neg Pred Value : 0.7681
## Prevalence : 0.4719
## Detection Rate : 0.3502
## Detection Prevalence : 0.4751
## Balanced Accuracy : 0.7528
##
## 'Positive' Class : Left
##
# Side-by-side test-set accuracy (share of correct predictions) for the three
# models, in the order they were fitted.
tibble(
  Model = c("Income Only", "Income + Overtime", "All Variables"),
  Accuracy = vapply(
    list(pred1, pred2, pred3),
    function(pred) mean(pred == test$attrition),
    numeric(1)
  )
)
## # A tibble: 3 × 2
## Model Accuracy
## <chr> <dbl>
## 1 Income Only 0.528
## 2 Income + Overtime 0.537
## 3 All Variables 0.753
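Income alone is a weak predictor of attrition: the income-only model assigns every test employee to the majority class ("Stayed"), so its accuracy (0.528) merely equals the no-information rate. Adding a behavioral variable such as overtime improves classification, but only slightly (accuracy 0.537).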
The logistic regression models show that employee attrition depends on multiple factors rather than on income alone. Models with more variables generally perform better (the all-variables model reaches an accuracy of roughly 0.75), but they may be harder to interpret. The confusion matrices reveal how well each model distinguishes employees who leave from those who stay.