Employee attrition represents a significant operational and financial challenge. This analysis applies logistic regression to the Kaggle Employee Attrition Dataset (n = 14,900) to model the probability of voluntary departure and identify the most important predictors. Three models of increasing complexity are estimated, evaluated, and compared.
Modelling strategy:
# ── Load dataset ───────────────────────────────────────────────────────────────
# Place the CSV in the same directory as this .Rmd file, or update the path.
df <- read.csv("test.csv", stringsAsFactors = FALSE, check.names = TRUE)
cat("Dataset dimensions:", nrow(df), "rows x", ncol(df), "columns\n")## Dataset dimensions: 14900 rows x 24 columns
## Column names:
## [1] "Employee.ID" "Age"
## [3] "Gender" "Years.at.Company"
## [5] "Job.Role" "Monthly.Income"
## [7] "Work.Life.Balance" "Job.Satisfaction"
## [9] "Performance.Rating" "Number.of.Promotions"
## [11] "Overtime" "Distance.from.Home"
## [13] "Education.Level" "Marital.Status"
## [15] "Number.of.Dependents" "Job.Level"
## [17] "Company.Size" "Company.Tenure"
## [19] "Remote.Work" "Leadership.Opportunities"
## [21] "Innovation.Opportunities" "Company.Reputation"
## [23] "Employee.Recognition" "Attrition"
# Attrition distribution
df %>%
count(Attrition) %>%
mutate(Pct = round(n / sum(n) * 100, 1)) %>%
kable(caption = "Outcome Variable Distribution", col.names = c("Attrition", "Count", "Pct (%)")) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Attrition | Count | Pct (%) |
|---|---|---|
| Left | 7032 | 47.2 |
| Stayed | 7868 | 52.8 |
# Check for missing values
miss <- colSums(is.na(df))
if (sum(miss) == 0) {
cat("No missing values detected.\n")
} else {
print(miss[miss > 0])
}## No missing values detected.
# ── Binary encode outcome ──────────────────────────────────────────────────────
df$Attrition_bin <- ifelse(df$Attrition == "Left", 1, 0)
# ── Binary encode Overtime ─────────────────────────────────────────────────────
df$Overtime_bin <- ifelse(df$Overtime == "Yes", 1, 0)
# ── Label-encode all remaining categorical variables ───────────────────────────
cat_cols <- c(
"Gender", "Job.Role", "Work.Life.Balance", "Job.Satisfaction",
"Performance.Rating", "Education.Level", "Marital.Status",
"Job.Level", "Company.Size", "Remote.Work",
"Leadership.Opportunities", "Innovation.Opportunities",
"Company.Reputation", "Employee.Recognition"
)
for (col in cat_cols) {
if (col %in% names(df)) {
df[[paste0(col, "_enc")]] <- as.integer(factor(df[[col]]))
}
}
# ── Define feature sets ────────────────────────────────────────────────────────
numeric_cols <- c(
"Age", "Years.at.Company", "Monthly.Income",
"Number.of.Promotions", "Distance.from.Home",
"Number.of.Dependents", "Company.Tenure"
)
enc_cols <- paste0(cat_cols[cat_cols %in% names(df)], "_enc")
all_vars <- c(numeric_cols, enc_cols, "Overtime_bin")
cat("Number of predictors in full model:", length(all_vars), "\n")## Number of predictors in full model: 22
set.seed(42)
train_idx <- createDataPartition(df$Attrition_bin, p = 0.70, list = FALSE)
train_df <- df[ train_idx, ]
test_df <- df[-train_idx, ]
cat(sprintf(
"Train: %d rows (attrition rate: %.1f%%)\nTest: %d rows (attrition rate: %.1f%%)\n",
nrow(train_df), mean(train_df$Attrition_bin) * 100,
nrow(test_df), mean(test_df$Attrition_bin) * 100
))## Train: 10430 rows (attrition rate: 47.4%)
## Test: 4470 rows (attrition rate: 46.7%)
df %>%
group_by(Overtime, Attrition) %>%
summarise(n = n(), .groups = "drop") %>%
group_by(Overtime) %>%
mutate(pct = n / sum(n)) %>%
ggplot(aes(x = Overtime, y = pct, fill = Attrition)) +
geom_col(position = "dodge", width = 0.6) +
scale_y_continuous(labels = percent_format()) +
scale_fill_manual(values = c("Left" = "#E74C3C", "Stayed" = "#2E86AB")) +
labs(
title = "Attrition Rate by Overtime Status",
x = "Overtime", y = "Proportion", fill = "Attrition"
) +
theme_minimal(base_size = 13)Attrition by Overtime Status
ggplot(df, aes(x = Attrition, y = Monthly.Income, fill = Attrition)) +
geom_boxplot(alpha = 0.7, outlier.alpha = 0.3) +
scale_fill_manual(values = c("Left" = "#E74C3C", "Stayed" = "#2E86AB")) +
scale_y_continuous(labels = comma_format()) +
labs(
title = "Monthly Income Distribution by Attrition",
x = "Attrition Status", y = "Monthly Income"
) +
theme_minimal(base_size = 13) +
theme(legend.position = "none")Monthly Income Distribution by Attrition
# ── McFadden Pseudo R² ────────────────────────────────────────────────────────
pseudo_r2 <- function(model) {
1 - (model$deviance / model$null.deviance)
}
# ── Confusion matrix + metrics ────────────────────────────────────────────────
eval_model <- function(model, test_data, outcome = "Attrition_bin",
threshold = 0.5, model_name = "Model") {
probs <- predict(model, newdata = test_data, type = "response")
preds <- factor(ifelse(probs >= threshold, 1, 0), levels = c(0, 1))
actual <- factor(test_data[[outcome]], levels = c(0, 1))
cm <- confusionMatrix(preds, actual, positive = "1")
TN <- cm$table[1, 1]; FP <- cm$table[2, 1]
FN <- cm$table[1, 2]; TP <- cm$table[2, 2]
list(
name = model_name,
probs = probs,
preds = preds,
actual = actual,
cm = cm,
TN = TN, FP = FP, FN = FN, TP = TP,
accuracy = as.numeric(cm$overall["Accuracy"]),
precision = as.numeric(cm$byClass["Precision"]),
recall = as.numeric(cm$byClass["Recall"]),
f1 = as.numeric(cm$byClass["F1"])
)
}
# ── Formatted odds-ratio table ─────────────────────────────────────────────────
or_table <- function(model) {
coefs <- summary(model)$coefficients
or <- exp(coef(model))
ci <- exp(confint.default(model))
data.frame(
Variable = rownames(coefs),
Coefficient = round(coefs[, 1], 4),
Std.Error = round(coefs[, 2], 4),
z.stat = round(coefs[, 3], 3),
p.value = round(coefs[, 4], 4),
Odds.Ratio = round(or, 4),
CI.Lower = round(ci[, 1], 4),
CI.Upper = round(ci[, 2], 4),
Sig = ifelse(coefs[, 4] < .001, "***",
ifelse(coefs[, 4] < .01, "**",
ifelse(coefs[, 4] < .05, "*",
ifelse(coefs[, 4] < .1, ".", ""))))
)
}m1 <- glm(Attrition_bin ~ Monthly.Income,
data = train_df,
family = binomial(link = "logit"))
summary(m1)##
## Call:
## glm(formula = Attrition_bin ~ Monthly.Income, family = binomial(link = "logit"),
## data = train_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.485e-02 6.921e-02 -0.504 0.615
## Monthly.Income -9.488e-06 9.106e-06 -1.042 0.297
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 14431 on 10429 degrees of freedom
## Residual deviance: 14430 on 10428 degrees of freedom
## AIC: 14434
##
## Number of Fisher Scoring iterations: 3
## Log-Likelihood: -7214.89
## McFadden Pseudo R²: 0.0001
## AIC: 14433.79
or_table(m1) %>%
kable(caption = "Model 1 — Coefficient Table with Odds Ratios",
digits = 4) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE) %>%
column_spec(9, bold = TRUE, color = "darkred")| Variable | Coefficient | Std.Error | z.stat | p.value | Odds.Ratio | CI.Lower | CI.Upper | Sig | |
|---|---|---|---|---|---|---|---|---|---|
| (Intercept) | (Intercept) | -0.0349 | 0.0692 | -0.504 | 0.6146 | 0.9657 | 0.8432 | 1.1061 | |
| Monthly.Income | Monthly.Income | 0.0000 | 0.0000 | -1.042 | 0.2974 | 1.0000 | 1.0000 | 1.0000 |
## Reference
## Prediction 0 1
## 0 2382 2088
## 1 0 0
data.frame(
Metric = c("Accuracy", "Precision", "Recall", "F1-Score"),
Value = round(c(res1$accuracy, res1$precision, res1$recall, res1$f1), 4)
) %>%
kable(caption = "Model 1 — Performance Metrics") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Metric | Value |
|---|---|
| Accuracy | 0.5329 |
| Precision | NA |
| Recall | 0.0000 |
| F1-Score | NA |
Interpretation: Monthly Income is statistically non-significant (p = 0.297). The model classifies all observations as “Stayed” (the majority class), yielding an accuracy equal to the base rate and failing to detect a single leaver.
m2 <- glm(Attrition_bin ~ Monthly.Income + Overtime_bin,
data = train_df,
family = binomial(link = "logit"))
summary(m2)##
## Call:
## glm(formula = Attrition_bin ~ Monthly.Income + Overtime_bin,
## family = binomial(link = "logit"), data = train_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.115e-01 7.069e-02 -1.577 0.115
## Monthly.Income -9.469e-06 9.120e-06 -1.038 0.299
## Overtime_bin 2.303e-01 4.168e-02 5.526 3.27e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 14431 on 10429 degrees of freedom
## Residual deviance: 14399 on 10427 degrees of freedom
## AIC: 14405
##
## Number of Fisher Scoring iterations: 3
## Log-Likelihood: -7199.61
## McFadden Pseudo R²: 0.0022
## AIC: 14405.22
## Analysis of Deviance Table
##
## Model 1: Attrition_bin ~ Monthly.Income
## Model 2: Attrition_bin ~ Monthly.Income + Overtime_bin
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 10428 14430
## 2 10427 14399 1 30.567 3.226e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
or_table(m2) %>%
kable(caption = "Model 2 — Coefficient Table with Odds Ratios",
digits = 4) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE) %>%
column_spec(9, bold = TRUE, color = "darkred")| Variable | Coefficient | Std.Error | z.stat | p.value | Odds.Ratio | CI.Lower | CI.Upper | Sig | |
|---|---|---|---|---|---|---|---|---|---|
| (Intercept) | (Intercept) | -0.1115 | 0.0707 | -1.577 | 0.1147 | 0.8945 | 0.7788 | 1.0274 | |
| Monthly.Income | Monthly.Income | 0.0000 | 0.0000 | -1.038 | 0.2992 | 1.0000 | 1.0000 | 1.0000 | |
| Overtime_bin | Overtime_bin | 0.2303 | 0.0417 | 5.526 | 0.0000 | 1.2590 | 1.1603 | 1.3662 | *** |
## Reference
## Prediction 0 1
## 0 1689 1355
## 1 693 733
data.frame(
Metric = c("Accuracy", "Precision", "Recall", "F1-Score"),
Value = round(c(res2$accuracy, res2$precision, res2$recall, res2$f1), 4)
) %>%
kable(caption = "Model 2 — Performance Metrics") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Metric | Value |
|---|---|
| Accuracy | 0.5418 |
| Precision | 0.5140 |
| Recall | 0.3511 |
| F1-Score | 0.4172 |
Interpretation: Overtime is highly significant (p < 0.001, OR = 1.259), indicating employees working overtime have ~24% higher odds of leaving. Monthly Income remains non-significant. The AIC falls by 28.6 units relative to Model 1.
formula_m3 <- as.formula(
paste("Attrition_bin ~", paste(all_vars, collapse = " + "))
)
m3 <- glm(formula_m3,
data = train_df,
family = binomial(link = "logit"))
summary(m3)##
## Call:
## glm(formula = formula_m3, family = binomial(link = "logit"),
## data = train_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.304e+00 2.746e-01 8.392 < 2e-16 ***
## Age -6.545e-03 2.228e-03 -2.938 0.00330 **
## Years.at.Company -1.205e-02 2.634e-03 -4.575 4.77e-06 ***
## Monthly.Income -2.330e-06 1.285e-05 -0.181 0.85612
## Number.of.Promotions -2.320e-01 2.339e-02 -9.919 < 2e-16 ***
## Distance.from.Home 8.571e-03 8.060e-04 10.634 < 2e-16 ***
## Number.of.Dependents -9.815e-02 1.502e-02 -6.533 6.43e-11 ***
## Company.Tenure 4.642e-04 1.010e-03 0.460 0.64573
## Gender_enc -5.207e-01 4.628e-02 -11.250 < 2e-16 ***
## Job.Role_enc -3.209e-02 1.877e-02 -1.710 0.08729 .
## Work.Life.Balance_enc 1.870e-01 2.448e-02 7.639 2.18e-14 ***
## Job.Satisfaction_enc 1.522e-01 1.884e-02 8.078 6.57e-16 ***
## Performance.Rating_enc 1.223e-01 2.404e-02 5.089 3.61e-07 ***
## Education.Level_enc -1.329e-01 1.910e-02 -6.957 3.47e-12 ***
## Marital.Status_enc 9.276e-01 3.596e-02 25.792 < 2e-16 ***
## Job.Level_enc -1.120e+00 3.336e-02 -33.586 < 2e-16 ***
## Company.Size_enc 1.048e-01 3.281e-02 3.195 0.00140 **
## Remote.Work_enc -1.609e+00 6.504e-02 -24.744 < 2e-16 ***
## Leadership.Opportunities_enc -3.110e-01 1.076e-01 -2.891 0.00385 **
## Innovation.Opportunities_enc -1.811e-01 6.286e-02 -2.881 0.00396 **
## Company.Reputation_enc 7.387e-02 2.660e-02 2.777 0.00548 **
## Employee.Recognition_enc 1.080e-02 2.705e-02 0.399 0.68978
## Overtime_bin 3.180e-01 4.887e-02 6.507 7.68e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 14431 on 10429 degrees of freedom
## Residual deviance: 11308 on 10407 degrees of freedom
## AIC: 11354
##
## Number of Fisher Scoring iterations: 4
## Log-Likelihood: -5654.03
## McFadden Pseudo R²: 0.2164
## AIC: 11354.07
## Analysis of Deviance Table
##
## Model 1: Attrition_bin ~ Monthly.Income + Overtime_bin
## Model 2: Attrition_bin ~ Age + Years.at.Company + Monthly.Income + Number.of.Promotions +
## Distance.from.Home + Number.of.Dependents + Company.Tenure +
## Gender_enc + Job.Role_enc + Work.Life.Balance_enc + Job.Satisfaction_enc +
## Performance.Rating_enc + Education.Level_enc + Marital.Status_enc +
## Job.Level_enc + Company.Size_enc + Remote.Work_enc + Leadership.Opportunities_enc +
## Innovation.Opportunities_enc + Company.Reputation_enc + Employee.Recognition_enc +
## Overtime_bin
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 10427 14399
## 2 10407 11308 20 3091.2 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
or_table(m3) %>%
kable(caption = "Model 3 — Full Coefficient Table with Odds Ratios",
digits = 4) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE) %>%
column_spec(9, bold = TRUE, color = "darkred") %>%
row_spec(which(or_table(m3)$Sig %in% c("***", "**", "*")),
background = "#F0FFF0")| Variable | Coefficient | Std.Error | z.stat | p.value | Odds.Ratio | CI.Lower | CI.Upper | Sig | |
|---|---|---|---|---|---|---|---|---|---|
| (Intercept) | (Intercept) | 2.3042 | 0.2746 | 8.392 | 0.0000 | 10.0160 | 5.8477 | 17.1554 | *** |
| Age | Age | -0.0065 | 0.0022 | -2.938 | 0.0033 | 0.9935 | 0.9891 | 0.9978 | ** |
| Years.at.Company | Years.at.Company | -0.0120 | 0.0026 | -4.575 | 0.0000 | 0.9880 | 0.9829 | 0.9931 | *** |
| Monthly.Income | Monthly.Income | 0.0000 | 0.0000 | -0.181 | 0.8561 | 1.0000 | 1.0000 | 1.0000 | |
| Number.of.Promotions | Number.of.Promotions | -0.2320 | 0.0234 | -9.919 | 0.0000 | 0.7929 | 0.7574 | 0.8301 | *** |
| Distance.from.Home | Distance.from.Home | 0.0086 | 0.0008 | 10.634 | 0.0000 | 1.0086 | 1.0070 | 1.0102 | *** |
| Number.of.Dependents | Number.of.Dependents | -0.0981 | 0.0150 | -6.533 | 0.0000 | 0.9065 | 0.8802 | 0.9336 | *** |
| Company.Tenure | Company.Tenure | 0.0005 | 0.0010 | 0.460 | 0.6457 | 1.0005 | 0.9985 | 1.0024 | |
| Gender_enc | Gender_enc | -0.5207 | 0.0463 | -11.250 | 0.0000 | 0.5941 | 0.5426 | 0.6505 | *** |
| Job.Role_enc | Job.Role_enc | -0.0321 | 0.0188 | -1.710 | 0.0873 | 0.9684 | 0.9334 | 1.0047 | . |
| Work.Life.Balance_enc | Work.Life.Balance_enc | 0.1870 | 0.0245 | 7.639 | 0.0000 | 1.2057 | 1.1492 | 1.2649 | *** |
| Job.Satisfaction_enc | Job.Satisfaction_enc | 0.1522 | 0.0188 | 8.078 | 0.0000 | 1.1644 | 1.1222 | 1.2082 | *** |
| Performance.Rating_enc | Performance.Rating_enc | 0.1223 | 0.0240 | 5.089 | 0.0000 | 1.1301 | 1.0781 | 1.1847 | *** |
| Education.Level_enc | Education.Level_enc | -0.1329 | 0.0191 | -6.957 | 0.0000 | 0.8756 | 0.8434 | 0.9090 | *** |
| Marital.Status_enc | Marital.Status_enc | 0.9276 | 0.0360 | 25.792 | 0.0000 | 2.5284 | 2.3563 | 2.7130 | *** |
| Job.Level_enc | Job.Level_enc | -1.1204 | 0.0334 | -33.586 | 0.0000 | 0.3262 | 0.3055 | 0.3482 | *** |
| Company.Size_enc | Company.Size_enc | 0.1048 | 0.0328 | 3.195 | 0.0014 | 1.1105 | 1.0413 | 1.1843 | ** |
| Remote.Work_enc | Remote.Work_enc | -1.6094 | 0.0650 | -24.744 | 0.0000 | 0.2000 | 0.1761 | 0.2272 | *** |
| Leadership.Opportunities_enc | Leadership.Opportunities_enc | -0.3110 | 0.1076 | -2.891 | 0.0038 | 0.7327 | 0.5934 | 0.9047 | ** |
| Innovation.Opportunities_enc | Innovation.Opportunities_enc | -0.1811 | 0.0629 | -2.881 | 0.0040 | 0.8343 | 0.7376 | 0.9437 | ** |
| Company.Reputation_enc | Company.Reputation_enc | 0.0739 | 0.0266 | 2.777 | 0.0055 | 1.0767 | 1.0220 | 1.1343 | ** |
| Employee.Recognition_enc | Employee.Recognition_enc | 0.0108 | 0.0271 | 0.399 | 0.6898 | 1.0109 | 0.9587 | 1.0659 | |
| Overtime_bin | Overtime_bin | 0.3180 | 0.0489 | 6.507 | 0.0000 | 1.3743 | 1.2488 | 1.5124 | *** |
ot3 <- or_table(m3)
ot3 %>%
filter(Variable != "(Intercept)") %>%
mutate(abs_z = abs(z.stat)) %>%
arrange(desc(abs_z)) %>%
head(12) %>%
ggplot(aes(x = reorder(Variable, abs_z), y = abs_z,
fill = ifelse(Coefficient > 0, "Positive", "Negative"))) +
geom_col(width = 0.7) +
coord_flip() +
scale_fill_manual(values = c("Positive" = "#E74C3C", "Negative" = "#2E86AB"),
name = "Direction") +
labs(
title = "Top 12 Predictors of Attrition (Full Model)",
x = NULL, y = "|z-statistic|"
) +
theme_minimal(base_size = 13)Top Predictors in Model 3 (by |z-statistic|)
## Reference
## Prediction 0 1
## 0 1753 669
## 1 629 1419
data.frame(
Metric = c("Accuracy", "Precision", "Recall", "F1-Score"),
Value = round(c(res3$accuracy, res3$precision, res3$recall, res3$f1), 4)
) %>%
kable(caption = "Model 3 — Performance Metrics") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Metric | Value |
|---|---|
| Accuracy | 0.7096 |
| Precision | 0.6929 |
| Recall | 0.6796 |
| F1-Score | 0.6862 |
plot_cm <- function(res) {
df_cm <- data.frame(
Actual = factor(c("Stayed","Stayed","Left","Left"),
levels = c("Left","Stayed")),
Predicted = factor(c("Stayed","Left","Stayed","Left"),
levels = c("Stayed","Left")),
n = c(res$TN, res$FP, res$FN, res$TP),
label = c("TN","FP","FN","TP")
)
ggplot(df_cm, aes(x = Predicted, y = Actual, fill = label)) +
geom_tile(color = "white", linewidth = 1.2) +
geom_text(aes(label = paste0(label, "\n", n)), size = 5, fontface = "bold") +
scale_fill_manual(values = c(
TN = "#AED6F1", FP = "#F1948A", FN = "#F1948A", TP = "#A9DFBF"
)) +
labs(title = res$name, x = "Predicted", y = "Actual") +
theme_minimal(base_size = 12) +
theme(legend.position = "none",
plot.title = element_text(face = "bold", hjust = 0.5))
}
library(gridExtra)
grid.arrange(plot_cm(res1), plot_cm(res2), plot_cm(res3), ncol = 3)Confusion Matrices — All Three Models
comparison <- data.frame(
Metric = c("Log-Likelihood", "McFadden R²", "AIC",
"Accuracy", "Precision", "Recall", "F1-Score",
"True Positives", "False Negatives"),
Model.1 = c(
round(as.numeric(logLik(m1)), 2),
round(pseudo_r2(m1), 4),
round(AIC(m1), 2),
round(res1$accuracy, 4),
round(res1$precision, 4),
round(res1$recall, 4),
round(res1$f1, 4),
res1$TP, res1$FN
),
Model.2 = c(
round(as.numeric(logLik(m2)), 2),
round(pseudo_r2(m2), 4),
round(AIC(m2), 2),
round(res2$accuracy, 4),
round(res2$precision, 4),
round(res2$recall, 4),
round(res2$f1, 4),
res2$TP, res2$FN
),
Model.3 = c(
round(as.numeric(logLik(m3)), 2),
round(pseudo_r2(m3), 4),
round(AIC(m3), 2),
round(res3$accuracy, 4),
round(res3$precision, 4),
round(res3$recall, 4),
round(res3$f1, 4),
res3$TP, res3$FN
)
)
comparison %>%
kable(caption = "Model Comparison — All Metrics",
col.names = c("Metric", "Model 1", "Model 2", "Model 3")) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
row_spec(0, bold = TRUE, background = "#1F497D", color = "white") %>%
row_spec(c(4, 7), bold = TRUE)| Metric | Model 1 | Model 2 | Model 3 |
|---|---|---|---|
| Log-Likelihood | -7214.8900 | -7199.6100 | -5654.0300 |
| McFadden R² | 0.0001 | 0.0022 | 0.2164 |
| AIC | 14433.7900 | 14405.2200 | 11354.0700 |
| Accuracy | 0.5329 | 0.5418 | 0.7096 |
| Precision | NA | 0.5140 | 0.6929 |
| Recall | 0.0000 | 0.3511 | 0.6796 |
| F1-Score | NA | 0.4172 | 0.6862 |
| True Positives | 0.0000 | 733.0000 | 1419.0000 |
| False Negatives | 2088.0000 | 1355.0000 | 669.0000 |
roc1 <- roc(res1$actual, res1$probs, quiet = TRUE)
roc2 <- roc(res2$actual, res2$probs, quiet = TRUE)
roc3 <- roc(res3$actual, res3$probs, quiet = TRUE)
cat(sprintf("AUC — Model 1: %.4f\n", auc(roc1)))## AUC — Model 1: 0.4976
## AUC — Model 2: 0.5312
## AUC — Model 3: 0.7891
# Build tidy data frame for ggplot
roc_df <- bind_rows(
data.frame(fpr = 1 - roc1$specificities, tpr = roc1$sensitivities,
Model = sprintf("M1 (AUC=%.3f)", auc(roc1))),
data.frame(fpr = 1 - roc2$specificities, tpr = roc2$sensitivities,
Model = sprintf("M2 (AUC=%.3f)", auc(roc2))),
data.frame(fpr = 1 - roc3$specificities, tpr = roc3$sensitivities,
Model = sprintf("M3 (AUC=%.3f)", auc(roc3)))
)
ggplot(roc_df, aes(x = fpr, y = tpr, colour = Model)) +
geom_line(linewidth = 1.1) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", colour = "grey50") +
scale_colour_manual(values = c("#95A5A6", "#E67E22", "#2E86AB")) +
labs(
title = "ROC Curves — Logistic Regression Models",
x = "False Positive Rate (1 − Specificity)",
y = "True Positive Rate (Sensitivity)",
colour = NULL
) +
theme_minimal(base_size = 13) +
theme(legend.position = c(0.75, 0.25))ROC Curves — All Three Models
perf_long <- data.frame(
Model = rep(c("Model 1", "Model 2", "Model 3"), each = 3),
Metric = rep(c("Accuracy", "Recall", "F1-Score"), 3),
Value = c(
res1$accuracy, res1$recall, res1$f1,
res2$accuracy, res2$recall, res2$f1,
res3$accuracy, res3$recall, res3$f1
)
)
ggplot(perf_long, aes(x = Model, y = Value, fill = Metric)) +
geom_col(position = "dodge", width = 0.6) +
geom_text(aes(label = sprintf("%.3f", Value)),
position = position_dodge(0.6), vjust = -0.4, size = 3.5) +
scale_fill_manual(values = c(
Accuracy = "#2E86AB",
Recall = "#E74C3C",
"F1-Score"= "#27AE60"
)) +
scale_y_continuous(limits = c(0, 0.85), labels = percent_format()) +
labs(
title = "Model Performance Comparison",
x = NULL, y = "Score", fill = NULL
) +
theme_minimal(base_size = 13)F1-Score, Accuracy, and Recall by Model
Model 3 is unambiguously the best-performing specification on every criterion:
Monthly Income is not a significant predictor in any model. This challenges the common managerial assumption that pay is the primary driver of attrition. Even in the full model (p = 0.856), income plays no independent role once structural and environmental factors are controlled for.
The five strongest predictors in Model 3 (by |z|):
| Predictor | Odds Ratio | Direction | Interpretation |
|---|---|---|---|
| Job Level | 0.422 | Negative | Higher seniority → 57.8% lower attrition odds |
| Remote Work | 0.525 | Negative | Remote access halves attrition odds |
| Marital Status | 1.798 | Positive | Certain statuses elevate risk by ~80% |
| Gender | 0.771 | Negative | One gender shows ~23% lower attrition odds |
| Distance from Home | 1.281 | Positive | Greater commute raises odds by 28% |
Career progression is the primary retention lever. Job Level (OR = 0.422) and Number of Promotions (OR = 0.786) are among the strongest protective factors. Structured career ladders, clear promotion criteria, and internal mobility programmes represent the highest-ROI retention investments.
Remote work should be treated as a retention benefit. Employees with remote access have approximately half the attrition odds of on-site workers. Organisations restricting remote work risk significantly elevated turnover.
Commute burden matters. Distance from Home (OR = 1.281) is a significant risk factor, suggesting value in commuter support, hybrid scheduling, or location-sensitive hiring.
Work-life balance and job satisfaction must be actively monitored. Both variables are highly significant (p < 0.001), reinforcing the value of regular pulse surveys and managerial check-ins.
Overtime should be managed carefully. Overtime (OR = 1.165 in the full model) independently predicts attrition, even after controlling for 20 other variables.
Predictive intervention is feasible. Using Model 3, HR teams can score employees monthly and flag individuals exceeding a chosen risk threshold (e.g., 0.60) for proactive retention conversations — before a resignation decision is finalised.
## R version 4.5.1 (2025-06-13)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sonoma 14.3.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: Asia/Taipei
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] gridExtra_2.3 scales_1.4.0 pROC_1.19.0.1 kableExtra_1.4.0
## [5] knitr_1.50 caret_7.0-1 lattice_0.22-7 lubridate_1.9.4
## [9] forcats_1.0.1 stringr_1.5.2 dplyr_1.1.4 purrr_1.1.0
## [13] readr_2.1.5 tidyr_1.3.1 tibble_3.3.0 ggplot2_4.0.2
## [17] tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.2.1 viridisLite_0.4.2 timeDate_4051.111
## [4] farver_2.1.2 S7_0.2.0 fastmap_1.2.0
## [7] digest_0.6.37 rpart_4.1.24 timechange_0.3.0
## [10] lifecycle_1.0.4 survival_3.8-3 magrittr_2.0.4
## [13] compiler_4.5.1 rlang_1.1.6 sass_0.4.10
## [16] tools_4.5.1 yaml_2.3.10 data.table_1.17.8
## [19] labeling_0.4.3 plyr_1.8.9 xml2_1.4.0
## [22] RColorBrewer_1.1-3 withr_3.0.2 nnet_7.3-20
## [25] grid_4.5.1 stats4_4.5.1 e1071_1.7-17
## [28] future_1.67.0 globals_0.18.0 iterators_1.0.14
## [31] MASS_7.3-65 cli_3.6.5 rmarkdown_2.29
## [34] generics_0.1.4 rstudioapi_0.17.1 future.apply_1.20.0
## [37] reshape2_1.4.5 tzdb_0.5.0 proxy_0.4-29
## [40] cachem_1.1.0 splines_4.5.1 parallel_4.5.1
## [43] vctrs_0.6.5 hardhat_1.4.2 Matrix_1.7-3
## [46] jsonlite_2.0.0 hms_1.1.3 listenv_0.9.1
## [49] systemfonts_1.3.2 foreach_1.5.2 gower_1.0.2
## [52] jquerylib_0.1.4 recipes_1.3.1 glue_1.8.0
## [55] parallelly_1.45.1 codetools_0.2-20 stringi_1.8.7
## [58] gtable_0.3.6 pillar_1.11.1 htmltools_0.5.8.1
## [61] ipred_0.9-15 lava_1.8.2 R6_2.6.1
## [64] textshaping_1.0.3 evaluate_1.0.5 bslib_0.9.0
## [67] class_7.3-23 Rcpp_1.1.0 svglite_2.2.2
## [70] nlme_3.1-168 prodlim_2025.04.28 xfun_0.53
## [73] pkgconfig_2.0.3 ModelMetrics_1.2.2.2
End of Report