This report estimates three logistic regression models to predict employee attrition using the Employee Attrition Dataset from Kaggle (IBM HR Analytics structure, n = 1,470).
The modelling workflow follows Boehmke & Greenwell — Hands-On Machine Learning with R, Chapter 5: Logistic Regression.
Research question: Which employee characteristics predict the likelihood of voluntary attrition?
Three models are compared:
| Model | Formula | Purpose |
|---|---|---|
| M1 | Attrition ~ MonthlyIncome |
Baseline — single numeric predictor |
| M2 | Attrition ~ MonthlyIncome + OverTime |
Add a key categorical driver |
| M3 | Attrition ~ . |
Full model — all available predictors |
# Uncomment to install:
# install.packages(c("tidyverse","caret","pROC","knitr","kableExtra","patchwork","scales","ggcorrplot"))
library(tidyverse) # data wrangling & ggplot2
library(caret) # confusionMatrix, createDataPartition
library(pROC) # ROC / AUC
library(knitr) # kable tables
library(kableExtra) # styled tables
library(patchwork) # combine ggplots
library(scales) # axis formatting
library(ggcorrplot) # correlation heatmap
set.seed(42)Download train.csv from the Kaggle
dataset page and place it in your working directory, then use
Option A below. Option B provides a synthetic fallback
with the same column structure for reproducibility.
# ── Option A: Kaggle download (recommended) ─────────────────────────────
# df_raw <- read_csv("train.csv", show_col_types = FALSE)
# ── Option B: Synthetic dataset (same IBM HR Analytics structure) ────────
simulate_data <- function(n = 1470) {
set.seed(42)
att <- sample(c("Yes","No"), n, replace = TRUE, prob = c(0.16, 0.84))
tibble(
Age = sample(18:60, n, replace = TRUE),
Attrition = att,
BusinessTravel = sample(c("Travel_Rarely","Travel_Frequently","Non-Travel"),
n, replace = TRUE, prob = c(0.70, 0.20, 0.10)),
DailyRate = sample(100:1500, n, replace = TRUE),
Department = sample(c("Sales","Research & Development","Human Resources"),
n, replace = TRUE, prob = c(0.30, 0.60, 0.10)),
DistanceFromHome = sample(1:29, n, replace = TRUE),
Education = sample(1:5, n, replace = TRUE),
EducationField = sample(c("Life Sciences","Medical","Marketing",
"Technical Degree","Human Resources","Other"),
n, replace = TRUE),
EnvironmentSatisfaction = sample(1:4, n, replace = TRUE),
Gender = sample(c("Male","Female"), n, replace = TRUE, prob = c(0.60, 0.40)),
HourlyRate = sample(30:100, n, replace = TRUE),
JobInvolvement = sample(1:4, n, replace = TRUE),
JobLevel = sample(1:5, n, replace = TRUE),
JobRole = sample(c("Sales Executive","Research Scientist",
"Laboratory Technician","Manufacturing Director",
"Healthcare Representative","Manager",
"Sales Representative","Research Director",
"Human Resources"), n, replace = TRUE),
JobSatisfaction = sample(1:4, n, replace = TRUE),
MaritalStatus = sample(c("Single","Married","Divorced"),
n, replace = TRUE, prob = c(0.32, 0.46, 0.22)),
MonthlyIncome = as.integer(ifelse(att == "Yes",
pmax(1009, rnorm(n, 4500, 1500)),
pmax(1009, rnorm(n, 6500, 2000)))),
MonthlyRate = sample(2000:27000, n, replace = TRUE),
NumCompaniesWorked = sample(0:9, n, replace = TRUE),
OverTime = ifelse(att == "Yes",
sample(c("Yes","No"), n, replace = TRUE, prob = c(0.55, 0.45)),
sample(c("Yes","No"), n, replace = TRUE, prob = c(0.25, 0.75))),
PercentSalaryHike = sample(11:25, n, replace = TRUE),
PerformanceRating = sample(c(3L, 4L), n, replace = TRUE, prob = c(0.85, 0.15)),
RelationshipSatisfaction = sample(1:4, n, replace = TRUE),
StockOptionLevel = sample(0:3, n, replace = TRUE),
TotalWorkingYears = sample(0:40, n, replace = TRUE),
TrainingTimesLastYear = sample(0:6, n, replace = TRUE),
WorkLifeBalance = sample(1:4, n, replace = TRUE),
YearsAtCompany = sample(0:40, n, replace = TRUE),
YearsInCurrentRole = sample(0:18, n, replace = TRUE),
YearsSinceLastPromotion = sample(0:15, n, replace = TRUE),
YearsWithCurrManager = sample(0:17, n, replace = TRUE)
)
}
df_raw <- simulate_data()
cat("Rows:", nrow(df_raw), " Columns:", ncol(df_raw))## Rows: 1470 Columns: 31
## Rows: 1,470
## Columns: 31
## $ Age <int> 34, 42, 26, 57, 45, 36, 30, 50, 29, 18, 55, 3…
## $ Attrition <chr> "Yes", "Yes", "No", "No", "No", "No", "No", "…
## $ BusinessTravel <chr> "Non-Travel", "Non-Travel", "Travel_Rarely", …
## $ DailyRate <int> 760, 1409, 287, 167, 1312, 992, 1152, 1299, 1…
## $ Department <chr> "Research & Development", "Human Resources", …
## $ DistanceFromHome <int> 9, 9, 5, 28, 6, 25, 7, 14, 17, 2, 27, 12, 29,…
## $ Education <int> 4, 3, 3, 1, 3, 1, 4, 5, 4, 5, 1, 4, 5, 4, 4, …
## $ EducationField <chr> "Other", "Life Sciences", "Medical", "Life Sc…
## $ EnvironmentSatisfaction <int> 4, 4, 3, 2, 1, 1, 1, 1, 4, 1, 1, 2, 4, 4, 1, …
## $ Gender <chr> "Male", "Female", "Female", "Male", "Female",…
## $ HourlyRate <int> 90, 56, 91, 85, 62, 73, 33, 90, 84, 58, 80, 6…
## $ JobInvolvement <int> 2, 3, 4, 4, 2, 1, 3, 1, 3, 1, 1, 1, 4, 1, 3, …
## $ JobLevel <int> 5, 5, 1, 5, 3, 5, 3, 3, 1, 1, 4, 4, 3, 3, 2, …
## $ JobRole <chr> "Laboratory Technician", "Sales Representativ…
## $ JobSatisfaction <int> 2, 3, 2, 1, 3, 2, 1, 4, 2, 3, 4, 1, 3, 3, 2, …
## $ MaritalStatus <chr> "Married", "Married", "Single", "Married", "M…
## $ MonthlyIncome <int> 2733, 5179, 4619, 5726, 7957, 8187, 4969, 725…
## $ MonthlyRate <int> 8354, 19513, 5728, 3671, 15755, 15566, 21836,…
## $ NumCompaniesWorked <int> 5, 3, 7, 2, 2, 9, 8, 8, 9, 9, 2, 6, 5, 3, 9, …
## $ OverTime <chr> "No", "Yes", "No", "No", "No", "Yes", "Yes", …
## $ PercentSalaryHike <int> 16, 24, 11, 12, 23, 25, 20, 19, 15, 11, 14, 2…
## $ PerformanceRating <int> 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, …
## $ RelationshipSatisfaction <int> 2, 2, 3, 1, 2, 4, 2, 1, 4, 2, 4, 2, 2, 3, 4, …
## $ StockOptionLevel <int> 3, 1, 0, 2, 3, 0, 3, 0, 1, 0, 1, 2, 1, 2, 0, …
## $ TotalWorkingYears <int> 32, 23, 14, 18, 10, 37, 40, 21, 34, 8, 32, 21…
## $ TrainingTimesLastYear <int> 0, 6, 3, 6, 5, 0, 0, 1, 0, 5, 1, 4, 4, 0, 5, …
## $ WorkLifeBalance <int> 3, 2, 4, 4, 4, 1, 1, 2, 1, 2, 4, 4, 1, 1, 2, …
## $ YearsAtCompany <int> 27, 8, 4, 28, 23, 7, 35, 1, 15, 5, 22, 20, 33…
## $ YearsInCurrentRole <int> 3, 15, 8, 6, 6, 16, 12, 2, 1, 18, 8, 6, 16, 9…
## $ YearsSinceLastPromotion <int> 7, 10, 14, 12, 11, 6, 12, 15, 9, 1, 7, 4, 4, …
## $ YearsWithCurrManager <int> 11, 15, 11, 7, 5, 16, 3, 7, 5, 11, 4, 3, 16, …
## Age DailyRate DistanceFromHome Education
## Min. :18.00 Min. : 100.0 Min. : 1.00 Min. :1.000
## 1st Qu.:28.00 1st Qu.: 451.2 1st Qu.: 8.00 1st Qu.:2.000
## Median :38.00 Median : 825.0 Median :16.00 Median :3.000
## Mean :38.39 Mean : 815.3 Mean :15.17 Mean :2.967
## 3rd Qu.:49.00 3rd Qu.:1178.0 3rd Qu.:22.00 3rd Qu.:4.000
## Max. :60.00 Max. :1499.0 Max. :29.00 Max. :5.000
## EnvironmentSatisfaction HourlyRate JobInvolvement JobLevel
## Min. :1.000 Min. : 30.00 Min. :1.000 Min. :1.000
## 1st Qu.:1.250 1st Qu.: 48.00 1st Qu.:1.000 1st Qu.:2.000
## Median :2.000 Median : 65.00 Median :3.000 Median :3.000
## Mean :2.496 Mean : 65.34 Mean :2.514 Mean :2.971
## 3rd Qu.:4.000 3rd Qu.: 83.00 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :4.000 Max. :100.00 Max. :4.000 Max. :5.000
## JobSatisfaction MonthlyIncome MonthlyRate NumCompaniesWorked
## Min. :1.000 Min. : 1009 Min. : 2012 Min. :0.000
## 1st Qu.:1.000 1st Qu.: 4748 1st Qu.: 7938 1st Qu.:2.000
## Median :2.000 Median : 6148 Median :14522 Median :4.000
## Mean :2.487 Mean : 6164 Mean :14363 Mean :4.484
## 3rd Qu.:3.000 3rd Qu.: 7497 3rd Qu.:20589 3rd Qu.:7.000
## Max. :4.000 Max. :12421 Max. :26980 Max. :9.000
## PercentSalaryHike PerformanceRating RelationshipSatisfaction StockOptionLevel
## Min. :11.00 Min. :3.000 Min. :1.000 Min. :0.000
## 1st Qu.:14.00 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:1.000
## Median :18.00 Median :3.000 Median :2.000 Median :1.000
## Mean :17.99 Mean :3.161 Mean :2.499 Mean :1.494
## 3rd Qu.:22.00 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:2.750
## Max. :25.00 Max. :4.000 Max. :4.000 Max. :3.000
## TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany
## Min. : 0.0 Min. :0.000 Min. :1.000 Min. : 0.00
## 1st Qu.:10.0 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:10.00
## Median :20.0 Median :3.000 Median :2.000 Median :21.00
## Mean :19.7 Mean :2.996 Mean :2.459 Mean :20.57
## 3rd Qu.:30.0 3rd Qu.:5.000 3rd Qu.:3.000 3rd Qu.:31.00
## Max. :40.0 Max. :6.000 Max. :4.000 Max. :40.00
## YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
## Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 4.000 1st Qu.: 4.000 1st Qu.: 4.000
## Median : 9.000 Median : 8.000 Median : 9.000
## Mean : 9.156 Mean : 7.594 Mean : 8.712
## 3rd Qu.:14.000 3rd Qu.:12.000 3rd Qu.:13.000
## Max. :18.000 Max. :15.000 Max. :17.000
att_counts <- df_raw %>%
count(Attrition) %>%
mutate(pct = n / sum(n),
label = paste0(n, "\n(", percent(pct, accuracy = 0.1), ")"))
ggplot(att_counts, aes(x = Attrition, y = n, fill = Attrition)) +
geom_col(width = 0.5, show.legend = FALSE) +
geom_text(aes(label = label), vjust = -0.3, fontface = "bold", size = 4.5) +
scale_fill_manual(values = c("No" = "#64B5F6", "Yes" = "#EF5350")) +
scale_y_continuous(expand = expansion(mult = c(0, 0.15))) +
labs(title = "Target Variable: Employee Attrition",
subtitle = "Class imbalance — ~84% No vs ~16% Yes",
x = "Attrition", y = "Count") +
theme_minimal(base_size = 13) +
theme(plot.title = element_text(face = "bold"),
plot.subtitle = element_text(color = "grey50"))Note on class imbalance: The dataset is imbalanced (~84 % No, ~16 % Yes). Overall accuracy is therefore misleading — Recall, F1, and AUC are more informative evaluation metrics.
mean_income <- df_raw %>%
group_by(Attrition) %>%
summarise(m = mean(MonthlyIncome))
ggplot(df_raw, aes(x = MonthlyIncome, fill = Attrition)) +
geom_density(alpha = 0.55, color = NA) +
geom_vline(data = mean_income, aes(xintercept = m, color = Attrition),
linetype = "dashed", linewidth = 1) +
scale_fill_manual(values = c("No" = "#64B5F6", "Yes" = "#EF5350")) +
scale_color_manual(values = c("No" = "#1565C0", "Yes" = "#B71C1C")) +
scale_x_continuous(labels = dollar_format()) +
labs(title = "Monthly Income Distribution by Attrition Status",
subtitle = "Dashed lines = group means",
x = "Monthly Income (USD)", y = "Density",
fill = "Attrition", color = "Mean") +
theme_minimal(base_size = 13) +
theme(plot.title = element_text(face = "bold"))df_raw %>%
count(OverTime, Attrition) %>%
group_by(OverTime) %>%
mutate(pct = n / sum(n)) %>%
filter(Attrition == "Yes") %>%
ggplot(aes(x = OverTime, y = pct, fill = OverTime)) +
geom_col(width = 0.45, show.legend = FALSE) +
geom_text(aes(label = percent(pct, accuracy = 0.1)),
vjust = -0.5, fontface = "bold", size = 5) +
scale_fill_manual(values = c("No" = "#81C784", "Yes" = "#FF8A65")) +
scale_y_continuous(labels = percent_format(),
expand = expansion(mult = c(0, 0.15))) +
labs(title = "Attrition Rate by OverTime Status",
subtitle = "% of employees who left within each OverTime group",
x = "OverTime", y = "Attrition Rate") +
theme_minimal(base_size = 13) +
theme(plot.title = element_text(face = "bold"))cor_mat <- df_raw %>%
select(where(is.numeric)) %>%
cor(use = "pairwise.complete.obs")
ggcorrplot(cor_mat,
method = "square",
type = "lower",
lab = TRUE,
lab_size = 2.5,
tl.cex = 9,
colors = c("#EF5350", "white", "#42A5F5"),
title = "Correlation Matrix — Numeric Predictors",
ggtheme = theme_minimal(base_size = 11))df <- df_raw %>%
# Encode target as ordered factor (No = reference level)
mutate(Attrition = factor(Attrition, levels = c("No", "Yes"))) %>%
# Drop zero-variance / identifier columns
select(-any_of(c("EmployeeCount", "Over18", "StandardHours", "EmployeeNumber"))) %>%
# Encode remaining character columns as factors
mutate(across(where(is.character), as.factor))
cat("Final dimensions:", nrow(df), "x", ncol(df), "\n")## Final dimensions: 1470 x 31
## Column types:
## .
## factor integer
## 8 23
train_idx <- createDataPartition(df$Attrition, p = 0.70, list = FALSE)
train_df <- df[ train_idx, ]
test_df <- df[-train_idx, ]
cat(sprintf("Training set : %d rows (Attrition Yes: %.1f%%)\n",
nrow(train_df), mean(train_df$Attrition == "Yes") * 100))## Training set : 1030 rows (Attrition Yes: 15.3%)
cat(sprintf("Test set : %d rows (Attrition Yes: %.1f%%)\n",
nrow(test_df), mean(test_df$Attrition == "Yes") * 100))## Test set : 440 rows (Attrition Yes: 15.2%)
All three models use glm() with
family = binomial(link = "logit"). The logistic regression
equation is:
\[\log\left(\frac{p}{1-p}\right) = \beta_0 + \beta_1 X_1 + \cdots + \beta_k X_k\]
where \(p = P(\text{Attrition} = \text{Yes})\).
Attrition ~ MonthlyIncomem1 <- glm(Attrition ~ MonthlyIncome,
data = train_df,
family = binomial(link = "logit"))
summary(m1)##
## Call:
## glm(formula = Attrition ~ MonthlyIncome, family = binomial(link = "logit"),
## data = train_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.440e+00 2.850e-01 5.053 4.35e-07 ***
## MonthlyIncome -5.787e-04 5.527e-05 -10.471 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 882.83 on 1029 degrees of freedom
## Residual deviance: 739.21 on 1028 degrees of freedom
## AIC: 743.21
##
## Number of Fisher Scoring iterations: 5
## Coefficient: -0.000579
## Odds Ratio (per $1 increase): 0.999421
## Odds Ratio (per $1,000 increase): 0.5606
## Interpretation: A $1,000 rise in monthly income multiplies the
cat(sprintf(
"odds of attrition by %.4f — i.e., reduces them by %.1f%%.\n",
exp(b * 1000), (1 - exp(b * 1000)) * 100
))## odds of attrition by 0.5606 — i.e., reduces them by 43.9%.
Attrition ~ MonthlyIncome + OverTimem2 <- glm(Attrition ~ MonthlyIncome + OverTime,
data = train_df,
family = binomial(link = "logit"))
summary(m2)##
## Call:
## glm(formula = Attrition ~ MonthlyIncome + OverTime, family = binomial(link = "logit"),
## data = train_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.020e+00 2.981e-01 3.422 0.000622 ***
## MonthlyIncome -6.038e-04 5.781e-05 -10.444 < 2e-16 ***
## OverTimeYes 1.377e+00 1.979e-01 6.960 3.4e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 882.83 on 1029 degrees of freedom
## Residual deviance: 689.53 on 1027 degrees of freedom
## AIC: 695.53
##
## Number of Fisher Scoring iterations: 6
b2 <- coef(m2)
cat(sprintf("MonthlyIncome — OR per $1,000: %.4f (%.1f%% change)\n",
exp(b2["MonthlyIncome"] * 1000),
(exp(b2["MonthlyIncome"] * 1000) - 1) * 100))## MonthlyIncome — OR per $1,000: 0.5467 (-45.3% change)
## OverTime (Yes) — OR: 3.9632
cat(sprintf(
"Employees who work overtime have %.1fx higher odds of leaving,\nholding MonthlyIncome constant.\n",
exp(b2["OverTimeYes"])
))## Employees who work overtime have 4.0x higher odds of leaving,
## holding MonthlyIncome constant.
Attrition ~ . (All Variables)##
## Call:
## glm(formula = Attrition ~ ., family = binomial(link = "logit"),
## data = train_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.303e+00 1.447e+00 0.900 0.36800
## Age 1.678e-02 8.464e-03 1.983 0.04738 *
## BusinessTravelTravel_Frequently 4.027e-02 3.662e-01 0.110 0.91243
## BusinessTravelTravel_Rarely -2.058e-02 3.148e-01 -0.065 0.94788
## DailyRate 1.908e-04 2.562e-04 0.745 0.45635
## DepartmentResearch & Development -2.130e-01 3.382e-01 -0.630 0.52874
## DepartmentSales -4.484e-01 3.616e-01 -1.240 0.21496
## DistanceFromHome -2.603e-02 1.265e-02 -2.057 0.03971 *
## Education -4.729e-02 7.215e-02 -0.655 0.51225
## EducationFieldLife Sciences 8.467e-02 3.550e-01 0.238 0.81151
## EducationFieldMarketing -5.688e-01 3.842e-01 -1.480 0.13877
## EducationFieldMedical -1.865e-01 3.381e-01 -0.552 0.58124
## EducationFieldOther 7.965e-02 3.494e-01 0.228 0.81966
## EducationFieldTechnical Degree -4.201e-01 3.789e-01 -1.109 0.26759
## EnvironmentSatisfaction 6.438e-02 9.416e-02 0.684 0.49418
## GenderMale 1.487e-01 2.131e-01 0.698 0.48525
## HourlyRate -3.401e-03 5.034e-03 -0.676 0.49933
## JobInvolvement 1.218e-01 9.325e-02 1.306 0.19146
## JobLevel -1.862e-02 7.324e-02 -0.254 0.79932
## JobRoleHuman Resources -3.352e-01 4.620e-01 -0.726 0.46810
## JobRoleLaboratory Technician -2.233e-01 4.243e-01 -0.526 0.59871
## JobRoleManager 8.125e-02 4.053e-01 0.200 0.84111
## JobRoleManufacturing Director -8.796e-02 4.466e-01 -0.197 0.84386
## JobRoleResearch Director -2.786e-01 4.389e-01 -0.635 0.52557
## JobRoleResearch Scientist -3.627e-01 4.506e-01 -0.805 0.42092
## JobRoleSales Executive -4.563e-01 4.171e-01 -1.094 0.27401
## JobRoleSales Representative 2.741e-01 3.947e-01 0.695 0.48737
## JobSatisfaction -4.368e-02 9.481e-02 -0.461 0.64498
## MaritalStatusMarried -3.444e-01 2.773e-01 -1.242 0.21435
## MaritalStatusSingle 5.448e-02 2.953e-01 0.185 0.85361
## MonthlyIncome -6.619e-04 6.522e-05 -10.149 < 2e-16 ***
## MonthlyRate 2.573e-06 1.454e-05 0.177 0.85952
## NumCompaniesWorked 1.033e-02 3.746e-02 0.276 0.78271
## OverTimeYes 1.479e+00 2.176e-01 6.797 1.07e-11 ***
## PercentSalaryHike 1.123e-02 2.405e-02 0.467 0.64042
## PerformanceRating 1.709e-01 2.806e-01 0.609 0.54250
## RelationshipSatisfaction 6.377e-02 9.525e-02 0.670 0.50316
## StockOptionLevel -1.240e-01 9.440e-02 -1.313 0.18910
## TotalWorkingYears -2.479e-02 9.096e-03 -2.725 0.00643 **
## TrainingTimesLastYear -3.052e-02 5.167e-02 -0.591 0.55472
## WorkLifeBalance 1.309e-01 9.330e-02 1.403 0.16065
## YearsAtCompany -1.692e-02 8.695e-03 -1.946 0.05167 .
## YearsInCurrentRole -2.733e-02 1.970e-02 -1.387 0.16532
## YearsSinceLastPromotion 2.921e-03 2.244e-02 0.130 0.89641
## YearsWithCurrManager 1.868e-02 2.108e-02 0.886 0.37538
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 882.83 on 1029 degrees of freedom
## Residual deviance: 641.26 on 985 degrees of freedom
## AIC: 731.26
##
## Number of Fisher Scoring iterations: 6
AIC penalises model complexity; lower AIC = better fit-complexity trade-off.
aic_tbl <- tibble(
Model = c("M1: MonthlyIncome",
"M2: MonthlyIncome + OverTime",
"M3: All Variables"),
AIC = round(c(AIC(m1), AIC(m2), AIC(m3)), 2),
BIC = round(c(BIC(m1), BIC(m2), BIC(m3)), 2),
LogLik = round(c(as.numeric(logLik(m1)),
as.numeric(logLik(m2)),
as.numeric(logLik(m3))), 2),
Params = c(m1$df.null - m1$df.residual + 1,
m2$df.null - m2$df.residual + 1,
m3$df.null - m3$df.residual + 1)
)
aic_tbl %>%
kable(caption = "Model Fit Statistics (★ = lowest AIC, highlighted)",
align = "lcccc") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"),
full_width = FALSE) %>%
row_spec(which.min(aic_tbl$AIC), background = "#FFF9C4", bold = TRUE) %>%
column_spec(1, bold = TRUE)| Model | AIC | BIC | LogLik | Params |
|---|---|---|---|---|
| M1: MonthlyIncome | 743.21 | 753.08 | -369.60 | 2 |
| M2: MonthlyIncome + OverTime | 695.53 | 710.35 | -344.77 | 3 |
| M3: All Variables | 731.26 | 953.43 | -320.63 | 45 |
lrt_12 <- anova(m1, m2, test = "Chisq")
lrt_23 <- anova(m2, m3, test = "Chisq")
cat("─── LRT: M1 vs M2 ───\n"); print(lrt_12)## ─── LRT: M1 vs M2 ───
## Analysis of Deviance Table
##
## Model 1: Attrition ~ MonthlyIncome
## Model 2: Attrition ~ MonthlyIncome + OverTime
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 1028 739.21
## 2 1027 689.53 1 49.673 1.816e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## ─── LRT: M2 vs M3 ───
## Analysis of Deviance Table
##
## Model 1: Attrition ~ MonthlyIncome + OverTime
## Model 2: Attrition ~ Age + BusinessTravel + DailyRate + Department + DistanceFromHome +
## Education + EducationField + EnvironmentSatisfaction + Gender +
## HourlyRate + JobInvolvement + JobLevel + JobRole + JobSatisfaction +
## MaritalStatus + MonthlyIncome + MonthlyRate + NumCompaniesWorked +
## OverTime + PercentSalaryHike + PerformanceRating + RelationshipSatisfaction +
## StockOptionLevel + TotalWorkingYears + TrainingTimesLastYear +
## WorkLifeBalance + YearsAtCompany + YearsInCurrentRole + YearsSinceLastPromotion +
## YearsWithCurrManager
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 1027 689.53
## 2 985 641.26 42 48.279 0.234
A significant p-value (< 0.05) means the more complex model fits significantly better than the simpler one.
or_m2 <- tibble(
Term = names(coef(m2))[-1],
OR = exp(coef(m2))[-1],
lo = exp(confint.default(m2))[-1, 1],
hi = exp(confint.default(m2))[-1, 2]
)
ggplot(or_m2, aes(x = OR, y = reorder(Term, OR))) +
geom_vline(xintercept = 1, linetype = "dashed", color = "grey50") +
geom_errorbarh(aes(xmin = lo, xmax = hi),
height = 0.25, color = "#546E7A", linewidth = 0.9) +
geom_point(size = 5, color = "#8B5CF6") +
geom_text(aes(label = sprintf("OR = %.3f", OR)),
hjust = -0.25, size = 3.8, color = "#374151") +
scale_x_log10(limits = c(NA, max(or_m2$hi) * 2)) +
labs(title = "Odds Ratios — Model 2 (95% CI)",
subtitle = "OR > 1 increases attrition risk | OR < 1 decreases it",
x = "Odds Ratio (log scale)", y = "") +
theme_minimal(base_size = 13) +
theme(plot.title = element_text(face = "bold"))or_m3 <- tibble(
Term = names(coef(m3))[-1],
OR = exp(coef(m3))[-1],
lo = exp(confint.default(m3))[-1, 1],
hi = exp(confint.default(m3))[-1, 2]
) %>%
filter(is.finite(OR), is.finite(lo), is.finite(hi)) %>%
arrange(desc(abs(log(OR)))) %>%
slice_head(n = 20)
ggplot(or_m3, aes(x = OR, y = reorder(Term, OR), color = OR > 1)) +
geom_vline(xintercept = 1, linetype = "dashed", color = "grey50") +
geom_errorbarh(aes(xmin = lo, xmax = hi),
height = 0.3, linewidth = 0.8) +
geom_point(size = 3.5) +
scale_color_manual(values = c("TRUE" = "#EF5350",
"FALSE" = "#42A5F5"),
labels = c("TRUE" = "Increases risk",
"FALSE" = "Decreases risk")) +
scale_x_log10() +
labs(title = "Top 20 Odds Ratios — Model 3 (95% CI)",
subtitle = "Sorted by absolute log-OR (effect magnitude)",
x = "Odds Ratio (log scale)", y = "", color = "") +
theme_minimal(base_size = 12) +
theme(plot.title = element_text(face = "bold"),
legend.position = "bottom")Predictions use a 0.5 probability threshold.
Positive class = "Yes" (attrition).
predict_class <- function(model, newdata, threshold = 0.5) {
probs <- predict(model, newdata = newdata, type = "response")
factor(ifelse(probs >= threshold, "Yes", "No"), levels = c("No", "Yes"))
}
pred1 <- predict_class(m1, test_df)
pred2 <- predict_class(m2, test_df)
pred3 <- predict_class(m3, test_df)## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 365 62
## Yes 8 5
##
## Accuracy : 0.8409
## 95% CI : (0.8033, 0.8738)
## No Information Rate : 0.8477
## P-Value [Acc > NIR] : 0.6831
##
## Kappa : 0.0794
##
## Mcnemar's Test P-Value : 2.378e-10
##
## Sensitivity : 0.07463
## Specificity : 0.97855
## Pos Pred Value : 0.38462
## Neg Pred Value : 0.85480
## Prevalence : 0.15227
## Detection Rate : 0.01136
## Detection Prevalence : 0.02955
## Balanced Accuracy : 0.52659
##
## 'Positive' Class : Yes
##
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 361 53
## Yes 12 14
##
## Accuracy : 0.8523
## 95% CI : (0.8156, 0.8841)
## No Information Rate : 0.8477
## P-Value [Acc > NIR] : 0.4269
##
## Kappa : 0.236
##
## Mcnemar's Test P-Value : 6.999e-07
##
## Sensitivity : 0.20896
## Specificity : 0.96783
## Pos Pred Value : 0.53846
## Neg Pred Value : 0.87198
## Prevalence : 0.15227
## Detection Rate : 0.03182
## Detection Prevalence : 0.05909
## Balanced Accuracy : 0.58839
##
## 'Positive' Class : Yes
##
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 358 50
## Yes 15 17
##
## Accuracy : 0.8523
## 95% CI : (0.8156, 0.8841)
## No Information Rate : 0.8477
## P-Value [Acc > NIR] : 0.4269
##
## Kappa : 0.2717
##
## Mcnemar's Test P-Value : 2.474e-05
##
## Sensitivity : 0.25373
## Specificity : 0.95979
## Pos Pred Value : 0.53125
## Neg Pred Value : 0.87745
## Prevalence : 0.15227
## Detection Rate : 0.03864
## Detection Prevalence : 0.07273
## Balanced Accuracy : 0.60676
##
## 'Positive' Class : Yes
##
plot_cm <- function(cm_obj, title, fill_col) {
as.data.frame(cm_obj$table) %>%
rename(Predicted = Prediction, Actual = Reference) %>%
group_by(Actual) %>%
mutate(pct = Freq / sum(Freq)) %>%
ungroup() %>%
ggplot(aes(x = Predicted, y = Actual, fill = pct)) +
geom_tile(color = "white", linewidth = 1.5) +
geom_text(aes(label = paste0(Freq, "\n(", percent(pct, accuracy = 0.1), ")")),
size = 4.5, fontface = "bold", color = "white") +
scale_fill_gradient(low = "#CFD8DC", high = fill_col, limits = c(0, 1)) +
labs(title = title, x = "Predicted", y = "Actual") +
theme_minimal(base_size = 12) +
theme(legend.position = "none",
plot.title = element_text(face = "bold", hjust = 0.5, size = 11))
}
p1 <- plot_cm(cm1, "M1: MonthlyIncome", "#3B82F6")
p2 <- plot_cm(cm2, "M2: MonthlyIncome + OverTime", "#8B5CF6")
p3 <- plot_cm(cm3, "M3: All Variables", "#10B981")
(p1 | p2 | p3) +
plot_annotation(
title = "Confusion Matrices — Test Set (Row-normalised %)",
theme = theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5))
)roc1 <- roc(test_df$Attrition, predict(m1, test_df, type = "response"), quiet = TRUE)
roc2 <- roc(test_df$Attrition, predict(m2, test_df, type = "response"), quiet = TRUE)
roc3 <- roc(test_df$Attrition, predict(m3, test_df, type = "response"), quiet = TRUE)
roc_list <- list(
`M1` = roc1,
`M2` = roc2,
`M3` = roc3
)
ggroc(roc_list, linewidth = 1.3) +
geom_abline(slope = 1, intercept = 1,
linetype = "dashed", color = "grey60", linewidth = 0.8) +
annotate("text", x = 0.35, y = 0.08,
label = "Random classifier (AUC = 0.5)",
color = "grey55", size = 3.5, angle = -35) +
scale_color_manual(
values = c("M1" = "#3B82F6", "M2" = "#8B5CF6", "M3" = "#10B981"),
labels = c(
sprintf("M1 AUC = %.3f", auc(roc1)),
sprintf("M2 AUC = %.3f", auc(roc2)),
sprintf("M3 AUC = %.3f", auc(roc3))
)
) +
labs(title = "ROC Curves — Test Set",
subtitle = "Higher AUC = better discrimination between leavers and stayers",
x = "Specificity (1 − FPR)",
y = "Sensitivity (TPR)",
color = "") +
theme_minimal(base_size = 13) +
theme(plot.title = element_text(face = "bold"),
plot.subtitle = element_text(color = "grey50"),
legend.position = "bottom",
legend.text = element_text(family = "mono"))extract_metrics <- function(cm_obj, roc_obj, label) {
s <- cm_obj$byClass
tibble(
Model = label,
Accuracy = round(cm_obj$overall["Accuracy"], 4),
Precision = round(s["Precision"], 4),
Recall = round(s["Recall"], 4),
F1 = round(s["F1"], 4),
Specificity = round(s["Specificity"], 4),
AUC = round(auc(roc_obj), 4)
)
}
perf <- bind_rows(
extract_metrics(cm1, roc1, "M1: MonthlyIncome"),
extract_metrics(cm2, roc2, "M2: MonthlyIncome + OverTime"),
extract_metrics(cm3, roc3, "M3: All Variables")
)
perf %>%
kable(caption = "Model Performance on Test Set | Green = Best F1",
align = "lcccccc") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"),
full_width = FALSE) %>%
column_spec(1, bold = TRUE) %>%
row_spec(which.max(perf$F1), background = "#D1FAE5", bold = TRUE) %>%
add_header_above(c(" " = 1,
"Threshold-based (cutoff = 0.5)" = 5,
"Rank-based" = 1))| Model | Accuracy | Precision | Recall | F1 | Specificity | AUC |
|---|---|---|---|---|---|---|
| M1: MonthlyIncome | 0.8409 | 0.3846 | 0.0746 | 0.1250 | 0.9786 | 0.7503 |
| M2: MonthlyIncome + OverTime | 0.8523 | 0.5385 | 0.2090 | 0.3011 | 0.9678 | 0.8183 |
| M3: All Variables | 0.8523 | 0.5312 | 0.2537 | 0.3434 | 0.9598 | 0.7850 |
perf %>%
pivot_longer(-Model, names_to = "Metric", values_to = "Value") %>%
mutate(Metric = factor(Metric,
levels = c("Accuracy","Precision","Recall",
"F1","Specificity","AUC"))) %>%
ggplot(aes(x = Metric, y = Value, fill = Model)) +
geom_col(position = position_dodge(0.75), width = 0.65,
alpha = 0.88, color = "white") +
geom_text(aes(label = sprintf("%.3f", Value)),
position = position_dodge(0.75),
vjust = -0.4, size = 2.9, fontface = "bold") +
scale_fill_manual(values = c("#3B82F6","#8B5CF6","#10B981")) +
scale_y_continuous(limits = c(0, 1.12), labels = percent_format()) +
geom_hline(yintercept = 0.5, linetype = "dashed",
color = "grey60", linewidth = 0.6) +
labs(title = "Model Performance Comparison — Test Set",
subtitle = "Dashed line at 0.50 for reference",
x = NULL, y = "Score", fill = "Model") +
theme_minimal(base_size = 13) +
theme(plot.title = element_text(face = "bold"),
legend.position = "bottom",
panel.grid.major.x = element_blank())1. Class imbalance makes accuracy misleading. With ~84 % of employees not leaving, a trivial “always predict No” classifier already achieves 84 % accuracy. Recall (sensitivity to actual leavers) and AUC are the appropriate primary metrics.
2. OverTime is the single most powerful predictor (M2 >
M1). Adding one binary variable — whether an employee works
overtime — substantially improves both Recall and F1. The odds ratio for
OverTimeYes in M2 shows that employees working overtime
have markedly higher odds of leaving, holding income constant.
3. Monthly Income has a negative relationship with
attrition. Higher-paid employees are less likely to leave: each
$1,000 increase in MonthlyIncome decreases the odds of
attrition, consistent with standard labour economics theory.
4. More predictors ≠ always better at threshold-based
classification. The full model (M3) achieves the lowest AIC and
highest AUC, but does not always dominate M2 on precision/recall
metrics. Multicollinearity among correlated HR variables (see
correlation heatmap) and the relatively small sample size mean a plain
glm() without regularisation can hurt precision-recall
balance.
5. Practical recommendation for HR. For ranking employees by attrition risk (e.g., to prioritise retention conversations), use Model 3 — it has the best AUC. For a simple, interpretable rule, use Model 2 — two predictors, clear odds ratios, and a meaningful improvement over the baseline.
glmnet could improve it.## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: Asia/Ulaanbaatar
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggcorrplot_0.1.4.1 scales_1.4.0 patchwork_1.3.2 kableExtra_1.4.0
## [5] knitr_1.50 pROC_1.19.0.1 caret_7.0-1 lattice_0.22-7
## [9] lubridate_1.9.4 forcats_1.0.1 stringr_1.5.2 dplyr_1.2.0
## [13] purrr_1.1.0 readr_2.1.5 tidyr_1.3.1 tibble_3.3.1
## [17] ggplot2_4.0.2 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.2.1 viridisLite_0.4.2 timeDate_4051.111
## [4] farver_2.1.2 S7_0.2.0 fastmap_1.2.0
## [7] digest_0.6.37 rpart_4.1.24 timechange_0.3.0
## [10] lifecycle_1.0.5 survival_3.8-3 magrittr_2.0.3
## [13] compiler_4.5.1 rlang_1.1.7 sass_0.4.10
## [16] tools_4.5.1 yaml_2.3.10 data.table_1.17.8
## [19] labeling_0.4.3 plyr_1.8.9 xml2_1.4.0
## [22] RColorBrewer_1.1-3 withr_3.0.2 nnet_7.3-20
## [25] grid_4.5.1 stats4_4.5.1 e1071_1.7-17
## [28] future_1.67.0 globals_0.18.0 iterators_1.0.14
## [31] MASS_7.3-65 cli_3.6.5 rmarkdown_2.29
## [34] generics_0.1.4 rstudioapi_0.17.1 future.apply_1.20.0
## [37] reshape2_1.4.5 tzdb_0.5.0 proxy_0.4-29
## [40] cachem_1.1.0 splines_4.5.1 parallel_4.5.1
## [43] vctrs_0.7.1 hardhat_1.4.2 Matrix_1.7-3
## [46] jsonlite_2.0.0 hms_1.1.3 listenv_0.9.1
## [49] systemfonts_1.3.2 foreach_1.5.2 gower_1.0.2
## [52] jquerylib_0.1.4 recipes_1.3.1 glue_1.8.0
## [55] parallelly_1.45.1 codetools_0.2-20 stringi_1.8.7
## [58] gtable_0.3.6 pillar_1.11.0 htmltools_0.5.8.1
## [61] ipred_0.9-15 lava_1.8.2 R6_2.6.1
## [64] textshaping_1.0.3 evaluate_1.0.5 bslib_0.9.0
## [67] class_7.3-23 Rcpp_1.1.0 svglite_2.2.2
## [70] nlme_3.1-168 prodlim_2025.04.28 xfun_0.53
## [73] pkgconfig_2.0.3 ModelMetrics_1.2.2.2