1 Introduction

This report estimates three logistic regression models to predict employee attrition using the Employee Attrition Dataset from Kaggle (IBM HR Analytics structure, n = 1,470).

The modelling workflow follows Boehmke & Greenwell — Hands-On Machine Learning with R, Chapter 5: Logistic Regression.

Research question: Which employee characteristics predict the likelihood of voluntary attrition?

Three models are compared:

Model Formula Purpose
M1 Attrition ~ MonthlyIncome Baseline — single numeric predictor
M2 Attrition ~ MonthlyIncome + OverTime Add a key categorical driver
M3 Attrition ~ . Full model — all available predictors

2 Setup

2.1 Packages

# Uncomment to install:
# install.packages(c("tidyverse","caret","pROC","knitr","kableExtra","patchwork","scales","ggcorrplot"))

library(tidyverse)   # data wrangling & ggplot2
library(caret)       # confusionMatrix, createDataPartition
library(pROC)        # ROC / AUC
library(knitr)       # kable tables
library(kableExtra)  # styled tables
library(patchwork)   # combine ggplots
library(scales)      # axis formatting
library(ggcorrplot)  # correlation heatmap

set.seed(42)

2.2 Load Data

Download train.csv from the Kaggle dataset page and place it in your working directory, then use Option A below. Option B provides a synthetic fallback with the same column structure for reproducibility.

# ── Option A: Kaggle download (recommended) ─────────────────────────────
# df_raw <- read_csv("train.csv", show_col_types = FALSE)

# ── Option B: Synthetic dataset (same IBM HR Analytics structure) ────────
simulate_data <- function(n = 1470) {
  set.seed(42)
  att <- sample(c("Yes","No"), n, replace = TRUE, prob = c(0.16, 0.84))
  tibble(
    Age                      = sample(18:60, n, replace = TRUE),
    Attrition                = att,
    BusinessTravel           = sample(c("Travel_Rarely","Travel_Frequently","Non-Travel"),
                                      n, replace = TRUE, prob = c(0.70, 0.20, 0.10)),
    DailyRate                = sample(100:1500, n, replace = TRUE),
    Department               = sample(c("Sales","Research & Development","Human Resources"),
                                      n, replace = TRUE, prob = c(0.30, 0.60, 0.10)),
    DistanceFromHome         = sample(1:29, n, replace = TRUE),
    Education                = sample(1:5, n, replace = TRUE),
    EducationField           = sample(c("Life Sciences","Medical","Marketing",
                                        "Technical Degree","Human Resources","Other"),
                                      n, replace = TRUE),
    EnvironmentSatisfaction  = sample(1:4, n, replace = TRUE),
    Gender                   = sample(c("Male","Female"), n, replace = TRUE, prob = c(0.60, 0.40)),
    HourlyRate               = sample(30:100, n, replace = TRUE),
    JobInvolvement           = sample(1:4, n, replace = TRUE),
    JobLevel                 = sample(1:5, n, replace = TRUE),
    JobRole                  = sample(c("Sales Executive","Research Scientist",
                                        "Laboratory Technician","Manufacturing Director",
                                        "Healthcare Representative","Manager",
                                        "Sales Representative","Research Director",
                                        "Human Resources"), n, replace = TRUE),
    JobSatisfaction          = sample(1:4, n, replace = TRUE),
    MaritalStatus            = sample(c("Single","Married","Divorced"),
                                      n, replace = TRUE, prob = c(0.32, 0.46, 0.22)),
    MonthlyIncome            = as.integer(ifelse(att == "Yes",
                                 pmax(1009, rnorm(n, 4500, 1500)),
                                 pmax(1009, rnorm(n, 6500, 2000)))),
    MonthlyRate              = sample(2000:27000, n, replace = TRUE),
    NumCompaniesWorked       = sample(0:9, n, replace = TRUE),
    OverTime                 = ifelse(att == "Yes",
                                 sample(c("Yes","No"), n, replace = TRUE, prob = c(0.55, 0.45)),
                                 sample(c("Yes","No"), n, replace = TRUE, prob = c(0.25, 0.75))),
    PercentSalaryHike        = sample(11:25, n, replace = TRUE),
    PerformanceRating        = sample(c(3L, 4L), n, replace = TRUE, prob = c(0.85, 0.15)),
    RelationshipSatisfaction = sample(1:4, n, replace = TRUE),
    StockOptionLevel         = sample(0:3, n, replace = TRUE),
    TotalWorkingYears        = sample(0:40, n, replace = TRUE),
    TrainingTimesLastYear    = sample(0:6, n, replace = TRUE),
    WorkLifeBalance          = sample(1:4, n, replace = TRUE),
    YearsAtCompany           = sample(0:40, n, replace = TRUE),
    YearsInCurrentRole       = sample(0:18, n, replace = TRUE),
    YearsSinceLastPromotion  = sample(0:15, n, replace = TRUE),
    YearsWithCurrManager     = sample(0:17, n, replace = TRUE)
  )
}

df_raw <- simulate_data()
cat("Rows:", nrow(df_raw), "  Columns:", ncol(df_raw))
## Rows: 1470   Columns: 31

3 Data Overview

3.1 Structure

glimpse(df_raw)
## Rows: 1,470
## Columns: 31
## $ Age                      <int> 34, 42, 26, 57, 45, 36, 30, 50, 29, 18, 55, 3…
## $ Attrition                <chr> "Yes", "Yes", "No", "No", "No", "No", "No", "…
## $ BusinessTravel           <chr> "Non-Travel", "Non-Travel", "Travel_Rarely", …
## $ DailyRate                <int> 760, 1409, 287, 167, 1312, 992, 1152, 1299, 1…
## $ Department               <chr> "Research & Development", "Human Resources", …
## $ DistanceFromHome         <int> 9, 9, 5, 28, 6, 25, 7, 14, 17, 2, 27, 12, 29,…
## $ Education                <int> 4, 3, 3, 1, 3, 1, 4, 5, 4, 5, 1, 4, 5, 4, 4, …
## $ EducationField           <chr> "Other", "Life Sciences", "Medical", "Life Sc…
## $ EnvironmentSatisfaction  <int> 4, 4, 3, 2, 1, 1, 1, 1, 4, 1, 1, 2, 4, 4, 1, …
## $ Gender                   <chr> "Male", "Female", "Female", "Male", "Female",…
## $ HourlyRate               <int> 90, 56, 91, 85, 62, 73, 33, 90, 84, 58, 80, 6…
## $ JobInvolvement           <int> 2, 3, 4, 4, 2, 1, 3, 1, 3, 1, 1, 1, 4, 1, 3, …
## $ JobLevel                 <int> 5, 5, 1, 5, 3, 5, 3, 3, 1, 1, 4, 4, 3, 3, 2, …
## $ JobRole                  <chr> "Laboratory Technician", "Sales Representativ…
## $ JobSatisfaction          <int> 2, 3, 2, 1, 3, 2, 1, 4, 2, 3, 4, 1, 3, 3, 2, …
## $ MaritalStatus            <chr> "Married", "Married", "Single", "Married", "M…
## $ MonthlyIncome            <int> 2733, 5179, 4619, 5726, 7957, 8187, 4969, 725…
## $ MonthlyRate              <int> 8354, 19513, 5728, 3671, 15755, 15566, 21836,…
## $ NumCompaniesWorked       <int> 5, 3, 7, 2, 2, 9, 8, 8, 9, 9, 2, 6, 5, 3, 9, …
## $ OverTime                 <chr> "No", "Yes", "No", "No", "No", "Yes", "Yes", …
## $ PercentSalaryHike        <int> 16, 24, 11, 12, 23, 25, 20, 19, 15, 11, 14, 2…
## $ PerformanceRating        <int> 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, …
## $ RelationshipSatisfaction <int> 2, 2, 3, 1, 2, 4, 2, 1, 4, 2, 4, 2, 2, 3, 4, …
## $ StockOptionLevel         <int> 3, 1, 0, 2, 3, 0, 3, 0, 1, 0, 1, 2, 1, 2, 0, …
## $ TotalWorkingYears        <int> 32, 23, 14, 18, 10, 37, 40, 21, 34, 8, 32, 21…
## $ TrainingTimesLastYear    <int> 0, 6, 3, 6, 5, 0, 0, 1, 0, 5, 1, 4, 4, 0, 5, …
## $ WorkLifeBalance          <int> 3, 2, 4, 4, 4, 1, 1, 2, 1, 2, 4, 4, 1, 1, 2, …
## $ YearsAtCompany           <int> 27, 8, 4, 28, 23, 7, 35, 1, 15, 5, 22, 20, 33…
## $ YearsInCurrentRole       <int> 3, 15, 8, 6, 6, 16, 12, 2, 1, 18, 8, 6, 16, 9…
## $ YearsSinceLastPromotion  <int> 7, 10, 14, 12, 11, 6, 12, 15, 9, 1, 7, 4, 4, …
## $ YearsWithCurrManager     <int> 11, 15, 11, 7, 5, 16, 3, 7, 5, 11, 4, 3, 16, …

3.2 Descriptive Statistics

df_raw %>%
  select(where(is.numeric)) %>%
  summary()
##       Age          DailyRate      DistanceFromHome   Education    
##  Min.   :18.00   Min.   : 100.0   Min.   : 1.00    Min.   :1.000  
##  1st Qu.:28.00   1st Qu.: 451.2   1st Qu.: 8.00    1st Qu.:2.000  
##  Median :38.00   Median : 825.0   Median :16.00    Median :3.000  
##  Mean   :38.39   Mean   : 815.3   Mean   :15.17    Mean   :2.967  
##  3rd Qu.:49.00   3rd Qu.:1178.0   3rd Qu.:22.00    3rd Qu.:4.000  
##  Max.   :60.00   Max.   :1499.0   Max.   :29.00    Max.   :5.000  
##  EnvironmentSatisfaction   HourlyRate     JobInvolvement     JobLevel    
##  Min.   :1.000           Min.   : 30.00   Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.250           1st Qu.: 48.00   1st Qu.:1.000   1st Qu.:2.000  
##  Median :2.000           Median : 65.00   Median :3.000   Median :3.000  
##  Mean   :2.496           Mean   : 65.34   Mean   :2.514   Mean   :2.971  
##  3rd Qu.:4.000           3rd Qu.: 83.00   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :4.000           Max.   :100.00   Max.   :4.000   Max.   :5.000  
##  JobSatisfaction MonthlyIncome    MonthlyRate    NumCompaniesWorked
##  Min.   :1.000   Min.   : 1009   Min.   : 2012   Min.   :0.000     
##  1st Qu.:1.000   1st Qu.: 4748   1st Qu.: 7938   1st Qu.:2.000     
##  Median :2.000   Median : 6148   Median :14522   Median :4.000     
##  Mean   :2.487   Mean   : 6164   Mean   :14363   Mean   :4.484     
##  3rd Qu.:3.000   3rd Qu.: 7497   3rd Qu.:20589   3rd Qu.:7.000     
##  Max.   :4.000   Max.   :12421   Max.   :26980   Max.   :9.000     
##  PercentSalaryHike PerformanceRating RelationshipSatisfaction StockOptionLevel
##  Min.   :11.00     Min.   :3.000     Min.   :1.000            Min.   :0.000   
##  1st Qu.:14.00     1st Qu.:3.000     1st Qu.:2.000            1st Qu.:1.000   
##  Median :18.00     Median :3.000     Median :2.000            Median :1.000   
##  Mean   :17.99     Mean   :3.161     Mean   :2.499            Mean   :1.494   
##  3rd Qu.:22.00     3rd Qu.:3.000     3rd Qu.:3.000            3rd Qu.:2.750   
##  Max.   :25.00     Max.   :4.000     Max.   :4.000            Max.   :3.000   
##  TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany 
##  Min.   : 0.0      Min.   :0.000         Min.   :1.000   Min.   : 0.00  
##  1st Qu.:10.0      1st Qu.:1.000         1st Qu.:1.000   1st Qu.:10.00  
##  Median :20.0      Median :3.000         Median :2.000   Median :21.00  
##  Mean   :19.7      Mean   :2.996         Mean   :2.459   Mean   :20.57  
##  3rd Qu.:30.0      3rd Qu.:5.000         3rd Qu.:3.000   3rd Qu.:31.00  
##  Max.   :40.0      Max.   :6.000         Max.   :4.000   Max.   :40.00  
##  YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
##  Min.   : 0.000     Min.   : 0.000          Min.   : 0.000      
##  1st Qu.: 4.000     1st Qu.: 4.000          1st Qu.: 4.000      
##  Median : 9.000     Median : 8.000          Median : 9.000      
##  Mean   : 9.156     Mean   : 7.594          Mean   : 8.712      
##  3rd Qu.:14.000     3rd Qu.:12.000          3rd Qu.:13.000      
##  Max.   :18.000     Max.   :15.000          Max.   :17.000

3.3 Attrition Distribution

att_counts <- df_raw %>%
  count(Attrition) %>%
  mutate(pct   = n / sum(n),
         label = paste0(n, "\n(", percent(pct, accuracy = 0.1), ")"))

ggplot(att_counts, aes(x = Attrition, y = n, fill = Attrition)) +
  geom_col(width = 0.5, show.legend = FALSE) +
  geom_text(aes(label = label), vjust = -0.3, fontface = "bold", size = 4.5) +
  scale_fill_manual(values = c("No" = "#64B5F6", "Yes" = "#EF5350")) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.15))) +
  labs(title    = "Target Variable: Employee Attrition",
       subtitle = "Class imbalance — ~84% No vs ~16% Yes",
       x = "Attrition", y = "Count") +
  theme_minimal(base_size = 13) +
  theme(plot.title    = element_text(face = "bold"),
        plot.subtitle = element_text(color = "grey50"))

Note on class imbalance: The dataset is imbalanced (~84 % No, ~16 % Yes). Overall accuracy is therefore misleading — Recall, F1, and AUC are more informative evaluation metrics.

3.4 Monthly Income by Attrition

mean_income <- df_raw %>%
  group_by(Attrition) %>%
  summarise(m = mean(MonthlyIncome))

ggplot(df_raw, aes(x = MonthlyIncome, fill = Attrition)) +
  geom_density(alpha = 0.55, color = NA) +
  geom_vline(data = mean_income, aes(xintercept = m, color = Attrition),
             linetype = "dashed", linewidth = 1) +
  scale_fill_manual(values  = c("No" = "#64B5F6", "Yes" = "#EF5350")) +
  scale_color_manual(values = c("No" = "#1565C0", "Yes" = "#B71C1C")) +
  scale_x_continuous(labels = dollar_format()) +
  labs(title    = "Monthly Income Distribution by Attrition Status",
       subtitle = "Dashed lines = group means",
       x = "Monthly Income (USD)", y = "Density",
       fill = "Attrition", color = "Mean") +
  theme_minimal(base_size = 13) +
  theme(plot.title = element_text(face = "bold"))

3.5 OverTime by Attrition

df_raw %>%
  count(OverTime, Attrition) %>%
  group_by(OverTime) %>%
  mutate(pct = n / sum(n)) %>%
  filter(Attrition == "Yes") %>%
  ggplot(aes(x = OverTime, y = pct, fill = OverTime)) +
  geom_col(width = 0.45, show.legend = FALSE) +
  geom_text(aes(label = percent(pct, accuracy = 0.1)),
            vjust = -0.5, fontface = "bold", size = 5) +
  scale_fill_manual(values = c("No" = "#81C784", "Yes" = "#FF8A65")) +
  scale_y_continuous(labels = percent_format(),
                     expand = expansion(mult = c(0, 0.15))) +
  labs(title    = "Attrition Rate by OverTime Status",
       subtitle = "% of employees who left within each OverTime group",
       x = "OverTime", y = "Attrition Rate") +
  theme_minimal(base_size = 13) +
  theme(plot.title = element_text(face = "bold"))

3.6 Correlation Heatmap (Numeric Variables)

cor_mat <- df_raw %>%
  select(where(is.numeric)) %>%
  cor(use = "pairwise.complete.obs")

ggcorrplot(cor_mat,
           method   = "square",
           type     = "lower",
           lab      = TRUE,
           lab_size = 2.5,
           tl.cex   = 9,
           colors   = c("#EF5350", "white", "#42A5F5"),
           title    = "Correlation Matrix — Numeric Predictors",
           ggtheme  = theme_minimal(base_size = 11))


4 Pre-processing

df <- df_raw %>%
  # Encode target as ordered factor (No = reference level)
  mutate(Attrition = factor(Attrition, levels = c("No", "Yes"))) %>%
  # Drop zero-variance / identifier columns
  select(-any_of(c("EmployeeCount", "Over18", "StandardHours", "EmployeeNumber"))) %>%
  # Encode remaining character columns as factors
  mutate(across(where(is.character), as.factor))

cat("Final dimensions:", nrow(df), "x", ncol(df), "\n")
## Final dimensions: 1470 x 31
cat("Column types:\n")
## Column types:
sapply(df, class) %>% table() %>% print()
## .
##  factor integer 
##       8      23

4.1 Train / Test Split — 70 / 30 (stratified)

train_idx <- createDataPartition(df$Attrition, p = 0.70, list = FALSE)
train_df  <- df[ train_idx, ]
test_df   <- df[-train_idx, ]

cat(sprintf("Training set : %d rows  (Attrition Yes: %.1f%%)\n",
            nrow(train_df), mean(train_df$Attrition == "Yes") * 100))
## Training set : 1030 rows  (Attrition Yes: 15.3%)
cat(sprintf("Test set     : %d rows  (Attrition Yes: %.1f%%)\n",
            nrow(test_df),  mean(test_df$Attrition  == "Yes") * 100))
## Test set     : 440 rows  (Attrition Yes: 15.2%)

5 Model Estimation

All three models use glm() with family = binomial(link = "logit"). The logistic regression equation is:

\[\log\left(\frac{p}{1-p}\right) = \beta_0 + \beta_1 X_1 + \cdots + \beta_k X_k\]

where \(p = P(\text{Attrition} = \text{Yes})\).

5.1 Model 1 — Attrition ~ MonthlyIncome

m1 <- glm(Attrition ~ MonthlyIncome,
          data   = train_df,
          family = binomial(link = "logit"))

summary(m1)
## 
## Call:
## glm(formula = Attrition ~ MonthlyIncome, family = binomial(link = "logit"), 
##     data = train_df)
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    1.440e+00  2.850e-01   5.053 4.35e-07 ***
## MonthlyIncome -5.787e-04  5.527e-05 -10.471  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 882.83  on 1029  degrees of freedom
## Residual deviance: 739.21  on 1028  degrees of freedom
## AIC: 743.21
## 
## Number of Fisher Scoring iterations: 5

5.1.1 Coefficient Interpretation

b <- coef(m1)["MonthlyIncome"]

cat(sprintf(
  "Coefficient: %.6f\n", b
))
## Coefficient: -0.000579
cat(sprintf(
  "Odds Ratio (per $1 increase): %.6f\n", exp(b)
))
## Odds Ratio (per $1 increase): 0.999421
cat(sprintf(
  "Odds Ratio (per $1,000 increase): %.4f\n", exp(b * 1000)
))
## Odds Ratio (per $1,000 increase): 0.5606
cat("Interpretation: A $1,000 rise in monthly income multiplies the\n")
## Interpretation: A $1,000 rise in monthly income multiplies the
cat(sprintf(
  "odds of attrition by %.4f — i.e., reduces them by %.1f%%.\n",
  exp(b * 1000), (1 - exp(b * 1000)) * 100
))
## odds of attrition by 0.5606 — i.e., reduces them by 43.9%.

5.2 Model 2 — Attrition ~ MonthlyIncome + OverTime

m2 <- glm(Attrition ~ MonthlyIncome + OverTime,
          data   = train_df,
          family = binomial(link = "logit"))

summary(m2)
## 
## Call:
## glm(formula = Attrition ~ MonthlyIncome + OverTime, family = binomial(link = "logit"), 
##     data = train_df)
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    1.020e+00  2.981e-01   3.422 0.000622 ***
## MonthlyIncome -6.038e-04  5.781e-05 -10.444  < 2e-16 ***
## OverTimeYes    1.377e+00  1.979e-01   6.960  3.4e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 882.83  on 1029  degrees of freedom
## Residual deviance: 689.53  on 1027  degrees of freedom
## AIC: 695.53
## 
## Number of Fisher Scoring iterations: 6

5.2.1 Coefficient Interpretation

b2 <- coef(m2)

cat(sprintf("MonthlyIncome  — OR per $1,000: %.4f (%.1f%% change)\n",
            exp(b2["MonthlyIncome"] * 1000),
            (exp(b2["MonthlyIncome"] * 1000) - 1) * 100))
## MonthlyIncome  — OR per $1,000: 0.5467 (-45.3% change)
cat(sprintf("OverTime (Yes) — OR: %.4f\n", exp(b2["OverTimeYes"])))
## OverTime (Yes) — OR: 3.9632
cat(sprintf(
  "Employees who work overtime have %.1fx higher odds of leaving,\nholding MonthlyIncome constant.\n",
  exp(b2["OverTimeYes"])
))
## Employees who work overtime have 4.0x higher odds of leaving,
## holding MonthlyIncome constant.

5.3 Model 3 — Attrition ~ . (All Variables)

m3 <- glm(Attrition ~ .,
          data   = train_df,
          family = binomial(link = "logit"))

summary(m3)
## 
## Call:
## glm(formula = Attrition ~ ., family = binomial(link = "logit"), 
##     data = train_df)
## 
## Coefficients:
##                                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                       1.303e+00  1.447e+00   0.900  0.36800    
## Age                               1.678e-02  8.464e-03   1.983  0.04738 *  
## BusinessTravelTravel_Frequently   4.027e-02  3.662e-01   0.110  0.91243    
## BusinessTravelTravel_Rarely      -2.058e-02  3.148e-01  -0.065  0.94788    
## DailyRate                         1.908e-04  2.562e-04   0.745  0.45635    
## DepartmentResearch & Development -2.130e-01  3.382e-01  -0.630  0.52874    
## DepartmentSales                  -4.484e-01  3.616e-01  -1.240  0.21496    
## DistanceFromHome                 -2.603e-02  1.265e-02  -2.057  0.03971 *  
## Education                        -4.729e-02  7.215e-02  -0.655  0.51225    
## EducationFieldLife Sciences       8.467e-02  3.550e-01   0.238  0.81151    
## EducationFieldMarketing          -5.688e-01  3.842e-01  -1.480  0.13877    
## EducationFieldMedical            -1.865e-01  3.381e-01  -0.552  0.58124    
## EducationFieldOther               7.965e-02  3.494e-01   0.228  0.81966    
## EducationFieldTechnical Degree   -4.201e-01  3.789e-01  -1.109  0.26759    
## EnvironmentSatisfaction           6.438e-02  9.416e-02   0.684  0.49418    
## GenderMale                        1.487e-01  2.131e-01   0.698  0.48525    
## HourlyRate                       -3.401e-03  5.034e-03  -0.676  0.49933    
## JobInvolvement                    1.218e-01  9.325e-02   1.306  0.19146    
## JobLevel                         -1.862e-02  7.324e-02  -0.254  0.79932    
## JobRoleHuman Resources           -3.352e-01  4.620e-01  -0.726  0.46810    
## JobRoleLaboratory Technician     -2.233e-01  4.243e-01  -0.526  0.59871    
## JobRoleManager                    8.125e-02  4.053e-01   0.200  0.84111    
## JobRoleManufacturing Director    -8.796e-02  4.466e-01  -0.197  0.84386    
## JobRoleResearch Director         -2.786e-01  4.389e-01  -0.635  0.52557    
## JobRoleResearch Scientist        -3.627e-01  4.506e-01  -0.805  0.42092    
## JobRoleSales Executive           -4.563e-01  4.171e-01  -1.094  0.27401    
## JobRoleSales Representative       2.741e-01  3.947e-01   0.695  0.48737    
## JobSatisfaction                  -4.368e-02  9.481e-02  -0.461  0.64498    
## MaritalStatusMarried             -3.444e-01  2.773e-01  -1.242  0.21435    
## MaritalStatusSingle               5.448e-02  2.953e-01   0.185  0.85361    
## MonthlyIncome                    -6.619e-04  6.522e-05 -10.149  < 2e-16 ***
## MonthlyRate                       2.573e-06  1.454e-05   0.177  0.85952    
## NumCompaniesWorked                1.033e-02  3.746e-02   0.276  0.78271    
## OverTimeYes                       1.479e+00  2.176e-01   6.797 1.07e-11 ***
## PercentSalaryHike                 1.123e-02  2.405e-02   0.467  0.64042    
## PerformanceRating                 1.709e-01  2.806e-01   0.609  0.54250    
## RelationshipSatisfaction          6.377e-02  9.525e-02   0.670  0.50316    
## StockOptionLevel                 -1.240e-01  9.440e-02  -1.313  0.18910    
## TotalWorkingYears                -2.479e-02  9.096e-03  -2.725  0.00643 ** 
## TrainingTimesLastYear            -3.052e-02  5.167e-02  -0.591  0.55472    
## WorkLifeBalance                   1.309e-01  9.330e-02   1.403  0.16065    
## YearsAtCompany                   -1.692e-02  8.695e-03  -1.946  0.05167 .  
## YearsInCurrentRole               -2.733e-02  1.970e-02  -1.387  0.16532    
## YearsSinceLastPromotion           2.921e-03  2.244e-02   0.130  0.89641    
## YearsWithCurrManager              1.868e-02  2.108e-02   0.886  0.37538    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 882.83  on 1029  degrees of freedom
## Residual deviance: 641.26  on  985  degrees of freedom
## AIC: 731.26
## 
## Number of Fisher Scoring iterations: 6

6 Model Comparison

6.1 AIC, BIC & Log-Likelihood

AIC penalises model complexity; lower AIC = better fit-complexity trade-off.

aic_tbl <- tibble(
  Model  = c("M1: MonthlyIncome",
             "M2: MonthlyIncome + OverTime",
             "M3: All Variables"),
  AIC    = round(c(AIC(m1), AIC(m2), AIC(m3)), 2),
  BIC    = round(c(BIC(m1), BIC(m2), BIC(m3)), 2),
  LogLik = round(c(as.numeric(logLik(m1)),
                   as.numeric(logLik(m2)),
                   as.numeric(logLik(m3))), 2),
  Params = c(m1$df.null - m1$df.residual + 1,
             m2$df.null - m2$df.residual + 1,
             m3$df.null - m3$df.residual + 1)
)

aic_tbl %>%
  kable(caption = "Model Fit Statistics (★ = lowest AIC, highlighted)",
        align   = "lcccc") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"),
                full_width = FALSE) %>%
  row_spec(which.min(aic_tbl$AIC), background = "#FFF9C4", bold = TRUE) %>%
  column_spec(1, bold = TRUE)
Model Fit Statistics (★ = lowest AIC, highlighted)
Model AIC BIC LogLik Params
M1: MonthlyIncome 743.21 753.08 -369.60 2
M2: MonthlyIncome + OverTime 695.53 710.35 -344.77 3
M3: All Variables 731.26 953.43 -320.63 45

6.2 Likelihood Ratio Tests

lrt_12 <- anova(m1, m2, test = "Chisq")
lrt_23 <- anova(m2, m3, test = "Chisq")

cat("─── LRT: M1 vs M2 ───\n"); print(lrt_12)
## ─── LRT: M1 vs M2 ───
## Analysis of Deviance Table
## 
## Model 1: Attrition ~ MonthlyIncome
## Model 2: Attrition ~ MonthlyIncome + OverTime
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1      1028     739.21                          
## 2      1027     689.53  1   49.673 1.816e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("\n─── LRT: M2 vs M3 ───\n"); print(lrt_23)
## 
## ─── LRT: M2 vs M3 ───
## Analysis of Deviance Table
## 
## Model 1: Attrition ~ MonthlyIncome + OverTime
## Model 2: Attrition ~ Age + BusinessTravel + DailyRate + Department + DistanceFromHome + 
##     Education + EducationField + EnvironmentSatisfaction + Gender + 
##     HourlyRate + JobInvolvement + JobLevel + JobRole + JobSatisfaction + 
##     MaritalStatus + MonthlyIncome + MonthlyRate + NumCompaniesWorked + 
##     OverTime + PercentSalaryHike + PerformanceRating + RelationshipSatisfaction + 
##     StockOptionLevel + TotalWorkingYears + TrainingTimesLastYear + 
##     WorkLifeBalance + YearsAtCompany + YearsInCurrentRole + YearsSinceLastPromotion + 
##     YearsWithCurrManager
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1      1027     689.53                     
## 2       985     641.26 42   48.279    0.234

A significant p-value (< 0.05) means the more complex model fits significantly better than the simpler one.

6.3 Odds Ratio Plot — Model 2

or_m2 <- tibble(
  Term = names(coef(m2))[-1],
  OR   = exp(coef(m2))[-1],
  lo   = exp(confint.default(m2))[-1, 1],
  hi   = exp(confint.default(m2))[-1, 2]
)

ggplot(or_m2, aes(x = OR, y = reorder(Term, OR))) +
  geom_vline(xintercept = 1, linetype = "dashed", color = "grey50") +
  geom_errorbarh(aes(xmin = lo, xmax = hi),
                 height = 0.25, color = "#546E7A", linewidth = 0.9) +
  geom_point(size = 5, color = "#8B5CF6") +
  geom_text(aes(label = sprintf("OR = %.3f", OR)),
            hjust = -0.25, size = 3.8, color = "#374151") +
  scale_x_log10(limits = c(NA, max(or_m2$hi) * 2)) +
  labs(title    = "Odds Ratios — Model 2 (95% CI)",
       subtitle = "OR > 1 increases attrition risk  |  OR < 1 decreases it",
       x = "Odds Ratio (log scale)", y = "") +
  theme_minimal(base_size = 13) +
  theme(plot.title = element_text(face = "bold"))

6.4 Odds Ratio Plot — Model 3 (Top 20 predictors by effect size)

or_m3 <- tibble(
  Term = names(coef(m3))[-1],
  OR   = exp(coef(m3))[-1],
  lo   = exp(confint.default(m3))[-1, 1],
  hi   = exp(confint.default(m3))[-1, 2]
) %>%
  filter(is.finite(OR), is.finite(lo), is.finite(hi)) %>%
  arrange(desc(abs(log(OR)))) %>%
  slice_head(n = 20)

ggplot(or_m3, aes(x = OR, y = reorder(Term, OR), color = OR > 1)) +
  geom_vline(xintercept = 1, linetype = "dashed", color = "grey50") +
  geom_errorbarh(aes(xmin = lo, xmax = hi),
                 height = 0.3, linewidth = 0.8) +
  geom_point(size = 3.5) +
  scale_color_manual(values = c("TRUE"  = "#EF5350",
                                "FALSE" = "#42A5F5"),
                     labels = c("TRUE"  = "Increases risk",
                                "FALSE" = "Decreases risk")) +
  scale_x_log10() +
  labs(title    = "Top 20 Odds Ratios — Model 3 (95% CI)",
       subtitle = "Sorted by absolute log-OR (effect magnitude)",
       x = "Odds Ratio (log scale)", y = "", color = "") +
  theme_minimal(base_size = 12) +
  theme(plot.title      = element_text(face = "bold"),
        legend.position = "bottom")


7 Confusion Matrices (Test Set)

Predictions use a 0.5 probability threshold. Positive class = "Yes" (attrition).

predict_class <- function(model, newdata, threshold = 0.5) {
  probs <- predict(model, newdata = newdata, type = "response")
  factor(ifelse(probs >= threshold, "Yes", "No"), levels = c("No", "Yes"))
}

pred1 <- predict_class(m1, test_df)
pred2 <- predict_class(m2, test_df)
pred3 <- predict_class(m3, test_df)

7.1 Model 1

cm1 <- confusionMatrix(pred1, test_df$Attrition, positive = "Yes")
cm1
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  365  62
##        Yes   8   5
##                                           
##                Accuracy : 0.8409          
##                  95% CI : (0.8033, 0.8738)
##     No Information Rate : 0.8477          
##     P-Value [Acc > NIR] : 0.6831          
##                                           
##                   Kappa : 0.0794          
##                                           
##  Mcnemar's Test P-Value : 2.378e-10       
##                                           
##             Sensitivity : 0.07463         
##             Specificity : 0.97855         
##          Pos Pred Value : 0.38462         
##          Neg Pred Value : 0.85480         
##              Prevalence : 0.15227         
##          Detection Rate : 0.01136         
##    Detection Prevalence : 0.02955         
##       Balanced Accuracy : 0.52659         
##                                           
##        'Positive' Class : Yes             
## 

7.2 Model 2

cm2 <- confusionMatrix(pred2, test_df$Attrition, positive = "Yes")
cm2
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  361  53
##        Yes  12  14
##                                           
##                Accuracy : 0.8523          
##                  95% CI : (0.8156, 0.8841)
##     No Information Rate : 0.8477          
##     P-Value [Acc > NIR] : 0.4269          
##                                           
##                   Kappa : 0.236           
##                                           
##  Mcnemar's Test P-Value : 6.999e-07       
##                                           
##             Sensitivity : 0.20896         
##             Specificity : 0.96783         
##          Pos Pred Value : 0.53846         
##          Neg Pred Value : 0.87198         
##              Prevalence : 0.15227         
##          Detection Rate : 0.03182         
##    Detection Prevalence : 0.05909         
##       Balanced Accuracy : 0.58839         
##                                           
##        'Positive' Class : Yes             
## 

7.3 Model 3

cm3 <- confusionMatrix(pred3, test_df$Attrition, positive = "Yes")
cm3
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  358  50
##        Yes  15  17
##                                           
##                Accuracy : 0.8523          
##                  95% CI : (0.8156, 0.8841)
##     No Information Rate : 0.8477          
##     P-Value [Acc > NIR] : 0.4269          
##                                           
##                   Kappa : 0.2717          
##                                           
##  Mcnemar's Test P-Value : 2.474e-05       
##                                           
##             Sensitivity : 0.25373         
##             Specificity : 0.95979         
##          Pos Pred Value : 0.53125         
##          Neg Pred Value : 0.87745         
##              Prevalence : 0.15227         
##          Detection Rate : 0.03864         
##    Detection Prevalence : 0.07273         
##       Balanced Accuracy : 0.60676         
##                                           
##        'Positive' Class : Yes             
## 

7.4 Heatmap Visualisation

plot_cm <- function(cm_obj, title, fill_col) {
  as.data.frame(cm_obj$table) %>%
    rename(Predicted = Prediction, Actual = Reference) %>%
    group_by(Actual) %>%
    mutate(pct = Freq / sum(Freq)) %>%
    ungroup() %>%
    ggplot(aes(x = Predicted, y = Actual, fill = pct)) +
    geom_tile(color = "white", linewidth = 1.5) +
    geom_text(aes(label = paste0(Freq, "\n(", percent(pct, accuracy = 0.1), ")")),
              size = 4.5, fontface = "bold", color = "white") +
    scale_fill_gradient(low = "#CFD8DC", high = fill_col, limits = c(0, 1)) +
    labs(title = title, x = "Predicted", y = "Actual") +
    theme_minimal(base_size = 12) +
    theme(legend.position = "none",
          plot.title = element_text(face = "bold", hjust = 0.5, size = 11))
}

p1 <- plot_cm(cm1, "M1: MonthlyIncome",            "#3B82F6")
p2 <- plot_cm(cm2, "M2: MonthlyIncome + OverTime",  "#8B5CF6")
p3 <- plot_cm(cm3, "M3: All Variables",             "#10B981")

(p1 | p2 | p3) +
  plot_annotation(
    title = "Confusion Matrices — Test Set (Row-normalised %)",
    theme = theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5))
  )


8 ROC Curves & AUC

roc1 <- roc(test_df$Attrition, predict(m1, test_df, type = "response"), quiet = TRUE)
roc2 <- roc(test_df$Attrition, predict(m2, test_df, type = "response"), quiet = TRUE)
roc3 <- roc(test_df$Attrition, predict(m3, test_df, type = "response"), quiet = TRUE)

roc_list <- list(
  `M1` = roc1,
  `M2` = roc2,
  `M3` = roc3
)

ggroc(roc_list, linewidth = 1.3) +
  geom_abline(slope = 1, intercept = 1,
              linetype = "dashed", color = "grey60", linewidth = 0.8) +
  annotate("text", x = 0.35, y = 0.08,
           label = "Random classifier (AUC = 0.5)",
           color = "grey55", size = 3.5, angle = -35) +
  scale_color_manual(
    values = c("M1" = "#3B82F6", "M2" = "#8B5CF6", "M3" = "#10B981"),
    labels = c(
      sprintf("M1  AUC = %.3f", auc(roc1)),
      sprintf("M2  AUC = %.3f", auc(roc2)),
      sprintf("M3  AUC = %.3f", auc(roc3))
    )
  ) +
  labs(title    = "ROC Curves — Test Set",
       subtitle = "Higher AUC = better discrimination between leavers and stayers",
       x = "Specificity (1 − FPR)",
       y = "Sensitivity (TPR)",
       color = "") +
  theme_minimal(base_size = 13) +
  theme(plot.title      = element_text(face = "bold"),
        plot.subtitle   = element_text(color = "grey50"),
        legend.position = "bottom",
        legend.text     = element_text(family = "mono"))


9 Performance Summary

extract_metrics <- function(cm_obj, roc_obj, label) {
  s <- cm_obj$byClass
  tibble(
    Model       = label,
    Accuracy    = round(cm_obj$overall["Accuracy"], 4),
    Precision   = round(s["Precision"],             4),
    Recall      = round(s["Recall"],                4),
    F1          = round(s["F1"],                    4),
    Specificity = round(s["Specificity"],            4),
    AUC         = round(auc(roc_obj),               4)
  )
}

perf <- bind_rows(
  extract_metrics(cm1, roc1, "M1: MonthlyIncome"),
  extract_metrics(cm2, roc2, "M2: MonthlyIncome + OverTime"),
  extract_metrics(cm3, roc3, "M3: All Variables")
)

perf %>%
  kable(caption = "Model Performance on Test Set  |  Green = Best F1",
        align   = "lcccccc") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"),
                full_width = FALSE) %>%
  column_spec(1, bold = TRUE) %>%
  row_spec(which.max(perf$F1), background = "#D1FAE5", bold = TRUE) %>%
  add_header_above(c(" " = 1,
                     "Threshold-based (cutoff = 0.5)" = 5,
                     "Rank-based" = 1))
Model Performance on Test Set | Green = Best F1
Threshold-based (cutoff = 0.5)
Rank-based
Model Accuracy Precision Recall F1 Specificity AUC
M1: MonthlyIncome 0.8409 0.3846 0.0746 0.1250 0.9786 0.7503
M2: MonthlyIncome + OverTime 0.8523 0.5385 0.2090 0.3011 0.9678 0.8183
M3: All Variables 0.8523 0.5312 0.2537 0.3434 0.9598 0.7850

9.1 Performance Bar Chart

perf %>%
  pivot_longer(-Model, names_to = "Metric", values_to = "Value") %>%
  mutate(Metric = factor(Metric,
                         levels = c("Accuracy","Precision","Recall",
                                    "F1","Specificity","AUC"))) %>%
  ggplot(aes(x = Metric, y = Value, fill = Model)) +
  geom_col(position = position_dodge(0.75), width = 0.65,
           alpha = 0.88, color = "white") +
  geom_text(aes(label = sprintf("%.3f", Value)),
            position = position_dodge(0.75),
            vjust = -0.4, size = 2.9, fontface = "bold") +
  scale_fill_manual(values = c("#3B82F6","#8B5CF6","#10B981")) +
  scale_y_continuous(limits = c(0, 1.12), labels = percent_format()) +
  geom_hline(yintercept = 0.5, linetype = "dashed",
             color = "grey60", linewidth = 0.6) +
  labs(title    = "Model Performance Comparison — Test Set",
       subtitle = "Dashed line at 0.50 for reference",
       x = NULL, y = "Score", fill = "Model") +
  theme_minimal(base_size = 13) +
  theme(plot.title         = element_text(face = "bold"),
        legend.position    = "bottom",
        panel.grid.major.x = element_blank())


10 Discussion

10.1 Key Findings

1. Class imbalance makes accuracy misleading. With ~84 % of employees not leaving, a trivial “always predict No” classifier already achieves 84 % accuracy. Recall (sensitivity to actual leavers) and AUC are the appropriate primary metrics.

2. OverTime is the single most powerful predictor (M2 > M1). Adding one binary variable — whether an employee works overtime — substantially improves both Recall and F1. The odds ratio for OverTimeYes in M2 shows that employees working overtime have markedly higher odds of leaving, holding income constant.

3. Monthly Income has a negative relationship with attrition. Higher-paid employees are less likely to leave: each $1,000 increase in MonthlyIncome decreases the odds of attrition, consistent with standard labour economics theory.

4. More predictors ≠ always better at threshold-based classification. The full model (M3) achieves the lowest AIC and highest AUC, but does not always dominate M2 on precision/recall metrics. Multicollinearity among correlated HR variables (see correlation heatmap) and the relatively small sample size mean a plain glm() without regularisation can hurt precision-recall balance.

5. Practical recommendation for HR. For ranking employees by attrition risk (e.g., to prioritise retention conversations), use Model 3 — it has the best AUC. For a simple, interpretable rule, use Model 2 — two predictors, clear odds ratios, and a meaningful improvement over the baseline.

10.2 Limitations

  • The synthetic / IBM HR dataset does not represent any specific real organisation.
  • A fixed threshold of 0.50 was used; in practice a lower threshold (e.g., 0.30) may be preferable to maximise Recall, depending on the cost ratio of false negatives to false positives.
  • No regularisation (ridge / LASSO) was applied to M3 — glmnet could improve it.
  • Class imbalance was not handled explicitly (e.g., via SMOTE or class-weight adjustment).

11 References


12 Session Info

sessionInfo()
## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: Asia/Ulaanbaatar
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggcorrplot_0.1.4.1 scales_1.4.0       patchwork_1.3.2    kableExtra_1.4.0  
##  [5] knitr_1.50         pROC_1.19.0.1      caret_7.0-1        lattice_0.22-7    
##  [9] lubridate_1.9.4    forcats_1.0.1      stringr_1.5.2      dplyr_1.2.0       
## [13] purrr_1.1.0        readr_2.1.5        tidyr_1.3.1        tibble_3.3.1      
## [17] ggplot2_4.0.2      tidyverse_2.0.0   
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1     viridisLite_0.4.2    timeDate_4051.111   
##  [4] farver_2.1.2         S7_0.2.0             fastmap_1.2.0       
##  [7] digest_0.6.37        rpart_4.1.24         timechange_0.3.0    
## [10] lifecycle_1.0.5      survival_3.8-3       magrittr_2.0.3      
## [13] compiler_4.5.1       rlang_1.1.7          sass_0.4.10         
## [16] tools_4.5.1          yaml_2.3.10          data.table_1.17.8   
## [19] labeling_0.4.3       plyr_1.8.9           xml2_1.4.0          
## [22] RColorBrewer_1.1-3   withr_3.0.2          nnet_7.3-20         
## [25] grid_4.5.1           stats4_4.5.1         e1071_1.7-17        
## [28] future_1.67.0        globals_0.18.0       iterators_1.0.14    
## [31] MASS_7.3-65          cli_3.6.5            rmarkdown_2.29      
## [34] generics_0.1.4       rstudioapi_0.17.1    future.apply_1.20.0 
## [37] reshape2_1.4.5       tzdb_0.5.0           proxy_0.4-29        
## [40] cachem_1.1.0         splines_4.5.1        parallel_4.5.1      
## [43] vctrs_0.7.1          hardhat_1.4.2        Matrix_1.7-3        
## [46] jsonlite_2.0.0       hms_1.1.3            listenv_0.9.1       
## [49] systemfonts_1.3.2    foreach_1.5.2        gower_1.0.2         
## [52] jquerylib_0.1.4      recipes_1.3.1        glue_1.8.0          
## [55] parallelly_1.45.1    codetools_0.2-20     stringi_1.8.7       
## [58] gtable_0.3.6         pillar_1.11.0        htmltools_0.5.8.1   
## [61] ipred_0.9-15         lava_1.8.2           R6_2.6.1            
## [64] textshaping_1.0.3    evaluate_1.0.5       bslib_0.9.0         
## [67] class_7.3-23         Rcpp_1.1.0           svglite_2.2.2       
## [70] nlme_3.1-168         prodlim_2025.04.28   xfun_0.53           
## [73] pkgconfig_2.0.3      ModelMetrics_1.2.2.2