The dataset is from Kaggle: Employee
Attrition Dataset.
Place train.csv in the same folder as this
.Rmd before knitting.
## Dimensions: 59598 rows x 24 columns
## Column names:
## [1] "Employee.ID" "Age"
## [3] "Gender" "Years.at.Company"
## [5] "Job.Role" "Monthly.Income"
## [7] "Work.Life.Balance" "Job.Satisfaction"
## [9] "Performance.Rating" "Number.of.Promotions"
## [11] "Overtime" "Distance.from.Home"
## [13] "Education.Level" "Marital.Status"
## [15] "Number.of.Dependents" "Job.Level"
## [17] "Company.Size" "Company.Tenure"
## [19] "Remote.Work" "Leadership.Opportunities"
## [21] "Innovation.Opportunities" "Company.Reputation"
## [23] "Employee.Recognition" "Attrition"
##
## Attrition values:
## [1] "Stayed" "Left"
# Actual column names from CSV:
# Monthly.Income, Overtime, Attrition (values: "Left" / "Stayed")
df <- df %>%
mutate(
Attrition = factor(Attrition, levels = c("Left", "Stayed")),
Overtime = factor(Overtime),
Monthly.Income = as.numeric(Monthly.Income)
) %>%
mutate(across(where(is.character), as.factor)) %>%
select(-Employee.ID) # remove ID column — not useful for modelling
cat("Attrition distribution:\n")## Attrition distribution:
##
## Left Stayed
## 28338 31260
Following the HOML reference, we use a 70 / 30 stratified split.
set.seed(2024)
train_idx <- createDataPartition(df$Attrition, p = 0.70, list = FALSE)
train_df <- df[ train_idx, ]
test_df <- df[-train_idx, ]
cat("Training set:", nrow(train_df), "rows\n")## Training set: 41719 rows
## Test set : 17879 rows
We estimate three logistic regression models using glm()
with family = binomial.
##
## Call:
## glm(formula = Attrition ~ Monthly.Income, family = binomial,
## data = train_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.735e-02 3.468e-02 0.500 0.6169
## Monthly.Income 1.106e-05 4.554e-06 2.427 0.0152 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 57735 on 41718 degrees of freedom
## Residual deviance: 57729 on 41717 degrees of freedom
## AIC: 57733
##
## Number of Fisher Scoring iterations: 3
##
## Call:
## glm(formula = Attrition ~ Monthly.Income + Overtime, family = binomial,
## data = train_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.022e-01 3.542e-02 2.884 0.00392 **
## Monthly.Income 1.105e-05 4.562e-06 2.421 0.01548 *
## OvertimeYes -2.572e-01 2.089e-02 -12.311 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 57735 on 41718 degrees of freedom
## Residual deviance: 57577 on 41716 degrees of freedom
## AIC: 57583
##
## Number of Fisher Scoring iterations: 3
##
## Call:
## glm(formula = Attrition ~ ., family = binomial, data = train_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.983e-01 1.025e-01 -2.911 0.00361 **
## Age 6.244e-03 1.199e-03 5.209 1.90e-07 ***
## GenderMale 6.282e-01 2.476e-02 25.373 < 2e-16 ***
## Years.at.Company 1.229e-02 1.400e-03 8.784 < 2e-16 ***
## Job.RoleFinance 7.385e-02 5.767e-02 1.281 0.20031
## Job.RoleHealthcare 1.019e-01 5.057e-02 2.014 0.04396 *
## Job.RoleMedia 1.295e-01 4.316e-02 3.000 0.00270 **
## Job.RoleTechnology 9.033e-02 5.787e-02 1.561 0.11856
## Monthly.Income 6.588e-06 9.835e-06 0.670 0.50294
## Work.Life.BalanceFair -1.338e+00 3.728e-02 -35.895 < 2e-16 ***
## Work.Life.BalanceGood -3.154e-01 3.521e-02 -8.957 < 2e-16 ***
## Work.Life.BalancePoor -1.554e+00 4.482e-02 -34.673 < 2e-16 ***
## Job.SatisfactionLow -4.775e-01 4.259e-02 -11.212 < 2e-16 ***
## Job.SatisfactionMedium 3.104e-03 3.244e-02 0.096 0.92377
## Job.SatisfactionVery High -4.883e-01 3.226e-02 -15.136 < 2e-16 ***
## Performance.RatingBelow Average -3.331e-01 3.504e-02 -9.508 < 2e-16 ***
## Performance.RatingHigh 2.252e-02 3.163e-02 0.712 0.47645
## Performance.RatingLow -5.984e-01 5.792e-02 -10.331 < 2e-16 ***
## Number.of.Promotions 2.513e-01 1.241e-02 20.248 < 2e-16 ***
## OvertimeYes -3.667e-01 2.604e-02 -14.084 < 2e-16 ***
## Distance.from.Home -9.923e-03 4.341e-04 -22.856 < 2e-16 ***
## Education.LevelBachelor’s Degree -3.722e-02 3.296e-02 -1.129 0.25887
## Education.LevelHigh School -2.158e-02 3.666e-02 -0.589 0.55609
## Education.LevelMaster’s Degree -4.315e-02 3.634e-02 -1.187 0.23509
## Education.LevelPhD 1.521e+00 6.534e-02 23.282 < 2e-16 ***
## Marital.StatusMarried 2.757e-01 3.533e-02 7.804 6.02e-15 ***
## Marital.StatusSingle -1.517e+00 3.839e-02 -39.526 < 2e-16 ***
## Number.of.Dependents 1.611e-01 7.967e-03 20.224 < 2e-16 ***
## Job.LevelMid 9.803e-01 2.699e-02 36.317 < 2e-16 ***
## Job.LevelSenior 2.594e+00 3.883e-02 66.795 < 2e-16 ***
## Company.SizeMedium -1.061e-02 3.233e-02 -0.328 0.74276
## Company.SizeSmall -2.152e-01 3.531e-02 -6.096 1.08e-09 ***
## Company.Tenure 4.434e-04 5.368e-04 0.826 0.40880
## Remote.WorkYes 1.744e+00 3.463e-02 50.360 < 2e-16 ***
## Leadership.OpportunitiesYes 1.693e-01 5.659e-02 2.991 0.00278 **
## Innovation.OpportunitiesYes 1.684e-01 3.309e-02 5.090 3.59e-07 ***
## Company.ReputationFair -4.367e-01 4.738e-02 -9.217 < 2e-16 ***
## Company.ReputationGood 8.509e-02 4.228e-02 2.013 0.04415 *
## Company.ReputationPoor -7.210e-01 4.746e-02 -15.192 < 2e-16 ***
## Employee.RecognitionLow -3.523e-02 3.127e-02 -1.127 0.25987
## Employee.RecognitionMedium -5.222e-02 3.301e-02 -1.582 0.11361
## Employee.RecognitionVery High 3.398e-02 6.002e-02 0.566 0.57135
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 57735 on 41718 degrees of freedom
## Residual deviance: 40644 on 41677 degrees of freedom
## AIC: 40728
##
## Number of Fisher Scoring iterations: 5
Predicted probabilities are converted to class labels using the default 0.5 threshold.
predict_class <- function(model, newdata, threshold = 0.5) {
probs <- predict(model, newdata = newdata, type = "response")
factor(ifelse(probs >= threshold, "Left", "Stayed"),
levels = c("Left", "Stayed"))
}
pred1 <- predict_class(m1, test_df)
pred2 <- predict_class(m2, test_df)
pred3 <- predict_class(m3, test_df)plot_cm <- function(pred, truth, title) {
cm <- confusionMatrix(pred, truth, positive = "Left")
as.data.frame(cm$table) %>%
dplyr::rename(Predicted = Prediction, Actual = Reference, Count = Freq) %>%
group_by(Predicted) %>%
mutate(Pct = Count / sum(Count) * 100) %>%
ungroup() %>%
ggplot(aes(x = Actual, y = Predicted, fill = Count)) +
geom_tile(color = "white", linewidth = 1.2) +
geom_text(aes(label = paste0(Count, "\n(", round(Pct, 1), "%)")),
size = 5, fontface = "bold", color = "white") +
scale_fill_gradient(low = "#4575b4", high = "#d73027") +
labs(
title = title,
subtitle = paste0(
"Accuracy: ", round(cm$overall["Accuracy"] * 100, 2), "% | ",
"Sensitivity: ", round(cm$byClass["Sensitivity"] * 100, 2), "% | ",
"Specificity: ", round(cm$byClass["Specificity"] * 100, 2), "%"
),
x = "Actual", y = "Predicted"
) +
theme_minimal(base_size = 14) +
theme(
legend.position = "none",
plot.title = element_text(face = "bold", hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5, color = "grey40"),
panel.grid = element_blank()
)
}## Confusion Matrix and Statistics
##
## Reference
## Prediction Left Stayed
## Left 8501 9378
## Stayed 0 0
##
## Accuracy : 0.4755
## 95% CI : (0.4681, 0.4828)
## No Information Rate : 0.5245
## P-Value [Acc > NIR] : 1
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.4755
## Neg Pred Value : NaN
## Prevalence : 0.4755
## Detection Rate : 0.4755
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : Left
##
## Confusion Matrix and Statistics
##
## Reference
## Prediction Left Stayed
## Left 5586 6552
## Stayed 2915 2826
##
## Accuracy : 0.4705
## 95% CI : (0.4632, 0.4778)
## No Information Rate : 0.5245
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.0407
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.6571
## Specificity : 0.3013
## Pos Pred Value : 0.4602
## Neg Pred Value : 0.4922
## Prevalence : 0.4755
## Detection Rate : 0.3124
## Detection Prevalence : 0.6789
## Balanced Accuracy : 0.4792
##
## 'Positive' Class : Left
##
## Confusion Matrix and Statistics
##
## Reference
## Prediction Left Stayed
## Left 2195 7196
## Stayed 6306 2182
##
## Accuracy : 0.2448
## 95% CI : (0.2385, 0.2512)
## No Information Rate : 0.5245
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.5066
##
## Mcnemar's Test P-Value : 1.998e-14
##
## Sensitivity : 0.2582
## Specificity : 0.2327
## Pos Pred Value : 0.2337
## Neg Pred Value : 0.2571
## Prevalence : 0.4755
## Detection Rate : 0.1228
## Detection Prevalence : 0.5253
## Balanced Accuracy : 0.2454
##
## 'Positive' Class : Left
##
extract_metrics <- function(cm, model_name) {
tibble(
Model = model_name,
Accuracy = round(cm$overall["Accuracy"] * 100, 2),
Sensitivity = round(cm$byClass["Sensitivity"] * 100, 2),
Specificity = round(cm$byClass["Specificity"] * 100, 2),
`Pos Pred Value` = round(cm$byClass["Pos Pred Value"] * 100, 2),
`Neg Pred Value` = round(cm$byClass["Neg Pred Value"] * 100, 2),
Kappa = round(cm$overall["Kappa"], 4)
)
}
bind_rows(
extract_metrics(cm1, "M1: Monthly.Income"),
extract_metrics(cm2, "M2: + Overtime"),
extract_metrics(cm3, "M3: All Variables")
) %>%
kbl(caption = "Performance Metrics on Test Set (30% hold-out)") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE) %>%
row_spec(3, bold = TRUE, color = "white", background = "#2c7bb6")| Model | Accuracy | Sensitivity | Specificity | Pos Pred Value | Neg Pred Value | Kappa |
|---|---|---|---|---|---|---|
| M1: Monthly.Income | 47.55 | 100.00 | 0.00 | 47.55 | NaN | 0.0000 |
| M2: + Overtime | 47.05 | 65.71 | 30.13 | 46.02 | 49.22 | -0.0407 |
| M3: All Variables | 24.48 | 25.82 | 23.27 | 23.37 | 25.71 | -0.5066 |
roc1 <- roc(test_df$Attrition, predict(m1, test_df, type = "response"),
levels = c("Stayed", "Left"), direction = "<")
roc2 <- roc(test_df$Attrition, predict(m2, test_df, type = "response"),
levels = c("Stayed", "Left"), direction = "<")
roc3 <- roc(test_df$Attrition, predict(m3, test_df, type = "response"),
levels = c("Stayed", "Left"), direction = "<")
ggroc(list(
"M1: Monthly.Income" = roc1,
"M2: + Overtime" = roc2,
"M3: All Variables" = roc3
), linewidth = 1.2) +
geom_abline(slope = 1, intercept = 1, linetype = "dashed", color = "grey60") +
scale_color_manual(
name = "Model",
values = c("#e41a1c", "#377eb8", "#4daf4a"),
labels = c(
paste0("M1: Monthly.Income (AUC = ", round(auc(roc1), 3), ")"),
paste0("M2: + Overtime (AUC = ", round(auc(roc2), 3), ")"),
paste0("M3: All Variables (AUC = ", round(auc(roc3), 3), ")")
)
) +
labs(
title = "ROC Curves - Three Logistic Regression Models",
subtitle = "Evaluated on test set (30% hold-out)",
x = "1 - Specificity (False Positive Rate)",
y = "Sensitivity (True Positive Rate)"
) +
theme_minimal(base_size = 13) +
theme(
legend.position = "bottom",
plot.title = element_text(face = "bold", hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5, color = "grey50")
)