1. Introduction

This report estimates three logistic regression models to predict employee attrition (whether an employee left or stayed), following the modelling approach outlined in Boehmke & Greenwell – HOML: Logistic Regression.

The three models are:

Model Formula
M1 Attrition ~ Monthly Income
M2 Attrition ~ Monthly Income + Overtime
M3 Attrition ~ . (all variables)

Data source: The dataset contains 74,498 synthetic employee records split into a pre-defined training set (59,598 rows) and test set (14,900 rows), with 23 features covering demographics, job attributes, satisfaction scores, and work environment factors.


2. Setup

2.1 Load Libraries

library(tidyverse)    # data wrangling + ggplot2
library(caret)        # confusionMatrix()
library(pROC)         # ROC / AUC
library(knitr)        # kable tables
library(kableExtra)   # styled tables
library(gridExtra)    # arrange plots
library(scales)       # axis formatting

2.2 Load Data

The CSV files are loaded from your local archive folder. If you move the files, update DATA_DIR in the chunk below.

# ── Data directory ────────────────────────────────────────────────────────────
DATA_DIR <- "D:/1.DOWNLOADs/hicel/archive"

train <- read_csv(file.path(DATA_DIR, "train.csv"), show_col_types = FALSE)
test  <- read_csv(file.path(DATA_DIR, "test.csv"),  show_col_types = FALSE)

cat("Train:", nrow(train), "rows ×", ncol(train), "cols\n")
## Train: 59598 rows × 24 cols
cat("Test: ", nrow(test),  "rows ×", ncol(test),  "cols\n")
## Test:  14900 rows × 24 cols

2.3 Quick Peek

glimpse(train)
## Rows: 59,598
## Columns: 24
## $ `Employee ID`              <dbl> 8410, 64756, 30257, 65791, 65026, 24368, 64…
## $ Age                        <dbl> 31, 59, 24, 36, 56, 38, 47, 48, 57, 24, 30,…
## $ Gender                     <chr> "Male", "Female", "Female", "Female", "Male…
## $ `Years at Company`         <dbl> 19, 4, 10, 7, 41, 3, 23, 16, 44, 1, 12, 6, …
## $ `Job Role`                 <chr> "Education", "Media", "Healthcare", "Educat…
## $ `Monthly Income`           <dbl> 5390, 5534, 8159, 3989, 4821, 9977, 3681, 1…
## $ `Work-Life Balance`        <chr> "Excellent", "Poor", "Good", "Good", "Fair"…
## $ `Job Satisfaction`         <chr> "Medium", "High", "High", "High", "Very Hig…
## $ `Performance Rating`       <chr> "Average", "Low", "Low", "High", "Average",…
## $ `Number of Promotions`     <dbl> 2, 3, 0, 1, 0, 3, 1, 2, 1, 1, 1, 2, 1, 4, 0…
## $ Overtime                   <chr> "No", "No", "No", "No", "Yes", "No", "Yes",…
## $ `Distance from Home`       <dbl> 22, 21, 11, 27, 71, 37, 75, 5, 39, 57, 51, …
## $ `Education Level`          <chr> "Associate Degree", "Master’s Degree", "Bac…
## $ `Marital Status`           <chr> "Married", "Divorced", "Married", "Single",…
## $ `Number of Dependents`     <dbl> 0, 3, 3, 2, 0, 0, 3, 4, 4, 4, 1, 0, 0, 2, 0…
## $ `Job Level`                <chr> "Mid", "Mid", "Mid", "Mid", "Senior", "Mid"…
## $ `Company Size`             <chr> "Medium", "Medium", "Medium", "Small", "Med…
## $ `Company Tenure`           <dbl> 89, 21, 74, 50, 68, 47, 93, 88, 75, 45, 17,…
## $ `Remote Work`              <chr> "No", "No", "No", "Yes", "No", "No", "No", …
## $ `Leadership Opportunities` <chr> "No", "No", "No", "No", "No", "No", "No", "…
## $ `Innovation Opportunities` <chr> "No", "No", "No", "No", "No", "Yes", "No", …
## $ `Company Reputation`       <chr> "Excellent", "Fair", "Poor", "Good", "Fair"…
## $ `Employee Recognition`     <chr> "Medium", "Low", "Low", "Medium", "Medium",…
## $ Attrition                  <chr> "Stayed", "Stayed", "Stayed", "Stayed", "St…
train %>%
  count(Attrition) %>%
  mutate(Pct = scales::percent(n / sum(n))) %>%
  kable(caption = "Attrition Distribution (Training Set)") %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)
Attrition Distribution (Training Set)
Attrition n Pct
Left 28338 47.5%
Stayed 31260 52.5%

3. Pre-processing

3.1 Encode Variables

# ── Ordinal encoding ─────────────────────────────────────────────────────────
encode_ordinal <- function(df) {
  df %>%
    mutate(
      `Work-Life Balance` = recode(`Work-Life Balance`,
        "Poor" = 1L, "Fair" = 2L, "Good" = 3L, "Excellent" = 4L),

      `Job Satisfaction` = recode(`Job Satisfaction`,
        "Very Low" = 1L, "Low" = 2L, "Medium" = 3L,
        "High" = 4L, "Very High" = 5L),

      `Performance Rating` = recode(`Performance Rating`,
        "Low" = 1L, "Below Average" = 2L, "Average" = 3L, "High" = 4L),

      `Education Level` = recode(`Education Level`,
        "High School" = 1L, "Associate's Degree" = 2L,
        "Bachelor's Degree" = 3L, "Master's Degree" = 4L, "PhD" = 5L),

      `Job Level` = recode(`Job Level`,
        "Entry" = 1L, "Mid" = 2L, "Senior" = 3L),

      `Company Size` = recode(`Company Size`,
        "Small" = 1L, "Medium" = 2L, "Large" = 3L),

      `Company Reputation` = recode(`Company Reputation`,
        "Very Poor" = 1L, "Poor" = 2L, "Fair" = 3L,
        "Good" = 4L, "Excellent" = 5L),

      `Employee Recognition` = recode(`Employee Recognition`,
        "Very Low" = 1L, "Low" = 2L, "Medium" = 3L, "High" = 4L)
    )
}

# ── Binary yes/no columns ────────────────────────────────────────────────────
encode_binary <- function(df) {
  df %>%
    mutate(across(
      c(Overtime, `Remote Work`,
        `Leadership Opportunities`, `Innovation Opportunities`),
      ~ if_else(. == "Yes", 1L, 0L)
    ))
}

# ── Nominal categoricals → factor (glm handles internally) ──────────────────
encode_nominal <- function(df) {
  df %>%
    mutate(
      Gender        = factor(Gender),
      `Job Role`    = factor(`Job Role`),
      `Marital Status` = factor(`Marital Status`)
    )
}

# ── Target: binary 0/1 ───────────────────────────────────────────────────────
encode_target <- function(df) {
  df %>%
    mutate(Attrition_bin = if_else(Attrition == "Left", 1L, 0L))
}

# Apply to both sets
preprocess <- function(df) {
  df %>%
    encode_ordinal() %>%
    encode_binary()  %>%
    encode_nominal() %>%
    encode_target()  %>%
    select(-`Employee ID`, -Attrition)   # drop ID & original target
}

train_p <- preprocess(train)
test_p  <- preprocess(test)

cat("Processed train:", nrow(train_p), "×", ncol(train_p), "\n")
## Processed train: 59598 × 23
cat("Processed test: ", nrow(test_p),  "×", ncol(test_p),  "\n")
## Processed test:  14900 × 23

3.2 Exploratory Distributions

p1 <- ggplot(train, aes(x = `Monthly Income`, fill = Attrition)) +
  geom_density(alpha = 0.5) +
  scale_fill_manual(values = c("Stayed" = "#3B82F6", "Left" = "#F59E0B")) +
  labs(title = "Monthly Income by Attrition",
       x = "Monthly Income ($)", y = "Density") +
  theme_minimal(base_size = 11)

p2 <- train %>%
  count(Overtime, Attrition) %>%
  group_by(Overtime) %>%
  mutate(pct = n / sum(n)) %>%
  filter(Attrition == "Left") %>%
  ggplot(aes(x = Overtime, y = pct, fill = Overtime)) +
  geom_col(width = 0.5, show.legend = FALSE) +
  scale_y_continuous(labels = percent_format()) +
  scale_fill_manual(values = c("No" = "#3B82F6", "Yes" = "#F59E0B")) +
  labs(title = "Attrition Rate by Overtime",
       x = "Overtime", y = "% Left") +
  theme_minimal(base_size = 11)

grid.arrange(p1, p2, ncol = 2)


4. Logistic Regression Models

4.1 Model 1 – Attrition ~ Monthly Income

m1 <- glm(Attrition_bin ~ `Monthly Income`,
          data   = train_p,
          family = binomial(link = "logit"))

summary(m1)
## 
## Call:
## glm(formula = Attrition_bin ~ `Monthly Income`, family = binomial(link = "logit"), 
##     data = train_p)
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)   
## (Intercept)      -2.081e-02  2.902e-02  -0.717  0.47334   
## `Monthly Income` -1.059e-05  3.813e-06  -2.777  0.00548 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 82477  on 59597  degrees of freedom
## Residual deviance: 82469  on 59596  degrees of freedom
## AIC: 82473
## 
## Number of Fisher Scoring iterations: 3
broom::tidy(m1, conf.int = TRUE, exponentiate = TRUE) %>%
  kable(digits = 4,
        caption = "M1 – Odds Ratios (exponentiated coefficients)") %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)
M1 – Odds Ratios (exponentiated coefficients)
term estimate std.error statistic p.value conf.low conf.high
(Intercept) 0.9794 0.029 -0.7171 0.4733 0.9253 1.0367
Monthly Income 1.0000 0.000 -2.7773 0.0055 1.0000 1.0000

4.2 Model 2 – Attrition ~ Monthly Income + Overtime

m2 <- glm(Attrition_bin ~ `Monthly Income` + Overtime,
          data   = train_p,
          family = binomial(link = "logit"))

summary(m2)
## 
## Call:
## glm(formula = Attrition_bin ~ `Monthly Income` + Overtime, family = binomial(link = "logit"), 
##     data = train_p)
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -1.001e-01  2.965e-02  -3.375 0.000737 ***
## `Monthly Income` -1.038e-05  3.819e-06  -2.717 0.006578 ** 
## Overtime          2.374e-01  1.750e-02  13.566  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 82477  on 59597  degrees of freedom
## Residual deviance: 82285  on 59595  degrees of freedom
## AIC: 82291
## 
## Number of Fisher Scoring iterations: 3
broom::tidy(m2, conf.int = TRUE, exponentiate = TRUE) %>%
  kable(digits = 4,
        caption = "M2 – Odds Ratios (exponentiated coefficients)") %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)
M2 – Odds Ratios (exponentiated coefficients)
term estimate std.error statistic p.value conf.low conf.high
(Intercept) 0.9048 0.0296 -3.3753 0.0007 0.8537 0.9589
Monthly Income 1.0000 0.0000 -2.7175 0.0066 1.0000 1.0000
Overtime 1.2680 0.0175 13.5661 0.0000 1.2252 1.3122

4.3 Model 3 – Attrition ~ . (All Variables)

m3 <- glm(Attrition_bin ~ .,
          data   = train_p,
          family = binomial(link = "logit"))

summary(m3)
## 
## Call:
## glm(formula = Attrition_bin ~ ., family = binomial(link = "logit"), 
##     data = train_p)
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 6.869e+00  2.551e-01  26.924  < 2e-16 ***
## Age                        -6.706e-03  2.078e-03  -3.227 0.001250 ** 
## GenderMale                 -6.415e-01  4.316e-02 -14.865  < 2e-16 ***
## `Years at Company`         -1.637e-02  2.450e-03  -6.681 2.38e-11 ***
## `Job Role`Finance          -1.064e-01  1.006e-01  -1.058 0.289939    
## `Job Role`Healthcare       -1.103e-01  8.787e-02  -1.255 0.209525    
## `Job Role`Media            -1.555e-01  7.498e-02  -2.074 0.038113 *  
## `Job Role`Technology       -9.254e-02  1.004e-01  -0.921 0.356860    
## `Monthly Income`           -3.825e-06  1.699e-05  -0.225 0.821905    
## `Work-Life Balance`        -6.110e-01  2.383e-02 -25.638  < 2e-16 ***
## `Job Satisfaction`          2.422e-02  2.448e-02   0.989 0.322490    
## `Performance Rating`       -1.936e-01  2.901e-02  -6.673 2.51e-11 ***
## `Number of Promotions`     -2.663e-01  2.204e-02 -12.084  < 2e-16 ***
## Overtime                    3.338e-01  4.539e-02   7.354 1.93e-13 ***
## `Distance from Home`        8.874e-03  7.585e-04  11.700  < 2e-16 ***
## `Education Level`          -3.922e-01  1.458e-02 -26.897  < 2e-16 ***
## `Marital Status`Married    -2.599e-01  6.244e-02  -4.163 3.14e-05 ***
## `Marital Status`Single      1.569e+00  6.695e-02  23.438  < 2e-16 ***
## `Number of Dependents`     -1.602e-01  1.379e-02 -11.621  < 2e-16 ***
## `Job Level`                -1.233e+00  3.166e-02 -38.940  < 2e-16 ***
## `Company Size`             -1.086e-01  3.034e-02  -3.580 0.000343 ***
## `Company Tenure`            1.755e-03  9.380e-04   1.871 0.061400 .  
## `Remote Work`              -1.675e+00  6.030e-02 -27.769  < 2e-16 ***
## `Leadership Opportunities` -2.128e-01  9.832e-02  -2.164 0.030433 *  
## `Innovation Opportunities` -1.033e-01  5.731e-02  -1.802 0.071558 .  
## `Company Reputation`       -3.465e-01  2.335e-02 -14.841  < 2e-16 ***
## `Employee Recognition`     -3.607e-02  2.631e-02  -1.371 0.170403    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 19291  on 14091  degrees of freedom
## Residual deviance: 13412  on 14065  degrees of freedom
##   (45506 observations deleted due to missingness)
## AIC: 13466
## 
## Number of Fisher Scoring iterations: 5
broom::tidy(m3, conf.int = TRUE, exponentiate = TRUE) %>%
  arrange(p.value) %>%
  kable(digits = 4,
        caption = "M3 – Odds Ratios sorted by p-value") %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)
M3 – Odds Ratios sorted by p-value
term estimate std.error statistic p.value conf.low conf.high
Job Level 0.2914 0.0317 -38.9405 0.0000 0.2738 0.3100
Remote Work 0.1874 0.0603 -27.7688 0.0000 0.1664 0.2107
(Intercept) 961.7154 0.2551 26.9244 0.0000 584.5529 1589.1295
Education Level 0.6756 0.0146 -26.8969 0.0000 0.6564 0.6950
Work-Life Balance 0.5428 0.0238 -25.6377 0.0000 0.5180 0.5687
Marital StatusSingle 4.8029 0.0670 23.4379 0.0000 4.2142 5.4791
GenderMale 0.5265 0.0432 -14.8647 0.0000 0.4837 0.5729
Company Reputation 0.7072 0.0233 -14.8409 0.0000 0.6755 0.7402
Number of Promotions 0.7662 0.0220 -12.0835 0.0000 0.7337 0.7999
Distance from Home 1.0089 0.0008 11.6999 0.0000 1.0074 1.0104
Number of Dependents 0.8519 0.0138 -11.6215 0.0000 0.8292 0.8752
Overtime 1.3962 0.0454 7.3537 0.0000 1.2774 1.5262
Years at Company 0.9838 0.0025 -6.6808 0.0000 0.9790 0.9885
Performance Rating 0.8240 0.0290 -6.6728 0.0000 0.7784 0.8722
Marital StatusMarried 0.7711 0.0624 -4.1630 0.0000 0.6824 0.8716
Company Size 0.8971 0.0303 -3.5802 0.0003 0.8452 0.9520
Age 0.9933 0.0021 -3.2273 0.0012 0.9893 0.9974
Leadership Opportunities 0.8083 0.0983 -2.1644 0.0304 0.6662 0.9796
Job RoleMedia 0.8560 0.0750 -2.0736 0.0381 0.7390 0.9915
Company Tenure 1.0018 0.0009 1.8706 0.0614 0.9999 1.0036
Innovation Opportunities 0.9019 0.0573 -1.8019 0.0716 0.8059 1.0090
Employee Recognition 0.9646 0.0263 -1.3709 0.1704 0.9161 1.0156
Job RoleHealthcare 0.8956 0.0879 -1.2549 0.2095 0.7539 1.0639
Job RoleFinance 0.8991 0.1006 -1.0583 0.2899 0.7381 1.0948
Job Satisfaction 1.0245 0.0245 0.9894 0.3225 0.9765 1.0749
Job RoleTechnology 0.9116 0.1004 -0.9214 0.3569 0.7487 1.1099
Monthly Income 1.0000 0.0000 -0.2251 0.8219 1.0000 1.0000

5. Confusion Matrices (Test Set)

# Helper: predict class at threshold 0.5 and return factor with same levels
predict_class <- function(model, newdata, threshold = 0.5) {
  probs <- predict(model, newdata = newdata, type = "response")
  preds <- if_else(probs >= threshold, "Left", "Stayed")
  factor(preds, levels = c("Stayed", "Left"))
}

# Actual labels as factor
actual <- factor(if_else(test_p$Attrition_bin == 1, "Left", "Stayed"),
                 levels = c("Stayed", "Left"))

5.1 M1 Confusion Matrix

pred1 <- predict_class(m1, test_p)
cm1   <- confusionMatrix(pred1, actual, positive = "Left")
cm1
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Stayed Left
##     Stayed   7868 7032
##     Left        0    0
##                                         
##                Accuracy : 0.5281        
##                  95% CI : (0.52, 0.5361)
##     No Information Rate : 0.5281        
##     P-Value [Acc > NIR] : 0.5033        
##                                         
##                   Kappa : 0             
##                                         
##  Mcnemar's Test P-Value : <2e-16        
##                                         
##             Sensitivity : 0.0000        
##             Specificity : 1.0000        
##          Pos Pred Value :    NaN        
##          Neg Pred Value : 0.5281        
##              Prevalence : 0.4719        
##          Detection Rate : 0.0000        
##    Detection Prevalence : 0.0000        
##       Balanced Accuracy : 0.5000        
##                                         
##        'Positive' Class : Left          
## 

5.2 M2 Confusion Matrix

pred2 <- predict_class(m2, test_p)
cm2   <- confusionMatrix(pred2, actual, positive = "Left")
cm2
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Stayed Left
##     Stayed   5487 4530
##     Left     2381 2502
##                                           
##                Accuracy : 0.5362          
##                  95% CI : (0.5281, 0.5442)
##     No Information Rate : 0.5281          
##     P-Value [Acc > NIR] : 0.02397         
##                                           
##                   Kappa : 0.0541          
##                                           
##  Mcnemar's Test P-Value : < 2e-16         
##                                           
##             Sensitivity : 0.3558          
##             Specificity : 0.6974          
##          Pos Pred Value : 0.5124          
##          Neg Pred Value : 0.5478          
##              Prevalence : 0.4719          
##          Detection Rate : 0.1679          
##    Detection Prevalence : 0.3277          
##       Balanced Accuracy : 0.5266          
##                                           
##        'Positive' Class : Left            
## 

5.3 M3 Confusion Matrix

pred3 <- predict_class(m3, test_p)
cm3   <- confusionMatrix(pred3, actual, positive = "Left")
cm3
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Stayed Left
##     Stayed   1577  436
##     Left      404 1096
##                                           
##                Accuracy : 0.7609          
##                  95% CI : (0.7464, 0.7749)
##     No Information Rate : 0.5639          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.5127          
##                                           
##  Mcnemar's Test P-Value : 0.2848          
##                                           
##             Sensitivity : 0.7154          
##             Specificity : 0.7961          
##          Pos Pred Value : 0.7307          
##          Neg Pred Value : 0.7834          
##              Prevalence : 0.4361          
##          Detection Rate : 0.3120          
##    Detection Prevalence : 0.4270          
##       Balanced Accuracy : 0.7557          
##                                           
##        'Positive' Class : Left            
## 

6. ROC Curves & AUC

prob1 <- predict(m1, newdata = test_p, type = "response")
prob2 <- predict(m2, newdata = test_p, type = "response")
prob3 <- predict(m3, newdata = test_p, type = "response")

roc1 <- roc(test_p$Attrition_bin, prob1, quiet = TRUE)
roc2 <- roc(test_p$Attrition_bin, prob2, quiet = TRUE)
roc3 <- roc(test_p$Attrition_bin, prob3, quiet = TRUE)

# Base plot
plot(roc1, col = "#3B82F6", lwd = 2,
     main = "ROC Curves – Three Logistic Regression Models",
     xlab = "False Positive Rate (1 – Specificity)",
     ylab = "True Positive Rate (Sensitivity)")
plot(roc2, col = "#10B981", lwd = 2, add = TRUE)
plot(roc3, col = "#F59E0B", lwd = 2.5, add = TRUE)
abline(a = 0, b = 1, lty = 2, col = "grey60")

legend("bottomright",
       legend = c(
         sprintf("M1: MonthlyIncome        (AUC = %.3f)", auc(roc1)),
         sprintf("M2: + Overtime           (AUC = %.3f)", auc(roc2)),
         sprintf("M3: All Variables        (AUC = %.3f)", auc(roc3))
       ),
       col    = c("#3B82F6","#10B981","#F59E0B"),
       lwd    = 2,
       bty    = "n",
       cex    = 0.9)


7. Model Comparison Summary

extract_metrics <- function(cm, roc_obj, label) {
  tibble(
    Model        = label,
    Accuracy     = cm$overall["Accuracy"]    %>% round(4),
    `ROC-AUC`    = auc(roc_obj)              %>% round(4),
    Sensitivity  = cm$byClass["Sensitivity"] %>% round(4),
    Specificity  = cm$byClass["Specificity"] %>% round(4),
    Precision    = cm$byClass["Pos Pred Value"] %>% round(4),
    F1           = cm$byClass["F1"]          %>% round(4),
    TN = cm$table[1,1],
    FP = cm$table[1,2],
    FN = cm$table[2,1],
    TP = cm$table[2,2]
  )
}

summary_tbl <- bind_rows(
  extract_metrics(cm1, roc1, "M1: Attrition ~ MonthlyIncome"),
  extract_metrics(cm2, roc2, "M2: Attrition ~ MonthlyIncome + Overtime"),
  extract_metrics(cm3, roc3, "M3: Attrition ~ . (All Variables)")
)

summary_tbl %>%
  kable(caption = "Model Performance on Test Set (n = 14,900)") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"),
                full_width = TRUE) %>%
  row_spec(3, bold = TRUE, background = "#FEF9C3")   # highlight best model
Model Performance on Test Set (n = 14,900)
Model Accuracy ROC-AUC Sensitivity Specificity Precision F1 TN FP FN TP
M1: Attrition ~ MonthlyIncome 0.5281 0.5049 0.0000 1.0000 NaN NA 7868 7032 0 0
M2: Attrition ~ MonthlyIncome + Overtime 0.5362 0.5303 0.3558 0.6974 0.5124 0.420 5487 4530 2381 2502
M3: Attrition ~ . (All Variables) 0.7609 0.8473 0.7154 0.7961 0.7307 0.723 1577 436 404 1096

8. Top Predictors – M3 Coefficient Plot

broom::tidy(m3, conf.int = TRUE) %>%
  filter(term != "(Intercept)") %>%
  mutate(term = str_remove(term, "`"),
         Direction = if_else(estimate > 0, "Increases Risk", "Decreases Risk")) %>%
  slice_max(abs(estimate), n = 15) %>%
  ggplot(aes(x = estimate,
             y = reorder(term, estimate),
             color = Direction)) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "grey50") +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high),
                 height = 0.25, alpha = 0.6) +
  geom_point(size = 3) +
  scale_color_manual(values = c("Increases Risk"  = "#F59E0B",
                                "Decreases Risk"  = "#3B82F6")) +
  labs(
    title    = "M3 – Top 15 Predictors of Attrition",
    subtitle = "Log-odds coefficients with 95% confidence intervals",
    x        = "Coefficient (Log-Odds)",
    y        = NULL,
    color    = NULL
  ) +
  theme_minimal(base_size = 11) +
  theme(legend.position = "top",
        plot.title      = element_text(face = "bold"))


9. Key Findings

Performance Comparison

summary_tbl %>%
  select(Model, Accuracy, `ROC-AUC`, F1) %>%
  pivot_longer(-Model, names_to = "Metric", values_to = "Value") %>%
  mutate(Model = str_extract(Model, "M[123]")) %>%
  ggplot(aes(x = Model, y = Value, fill = Model)) +
  geom_col(width = 0.55, alpha = 0.9) +
  geom_text(aes(label = round(Value, 3)),
            vjust = -0.4, fontface = "bold", size = 3.5) +
  facet_wrap(~ Metric, scales = "free_y") +
  scale_fill_manual(values = c("M1" = "#3B82F6",
                               "M2" = "#10B981",
                               "M3" = "#F59E0B")) +
  scale_y_continuous(limits = c(0, 1.05)) +
  labs(title = "Model Comparison – Accuracy, AUC, F1 on Test Set",
       x = NULL, y = "Score") +
  theme_minimal(base_size = 11) +
  theme(legend.position = "none",
        strip.text      = element_text(face = "bold"))

Interpretation

  • M1 (MonthlyIncome only): AUC ≈ 0.50, barely better than random chance. Monthly income alone provides negligible signal for attrition in this dataset.

  • M2 (+ Overtime): Adding Overtime yields a small but meaningful improvement (~3 pp accuracy, AUC 0.530). Employees working overtime are more likely to leave.

  • M3 (All variables): AUC jumps to 0.819 with accuracy of 73.5%. The richest model captures the multifactorial nature of attrition — satisfaction, recognition, work-life balance, and career growth opportunities all contribute significantly alongside income.

Actionable Insights from M3

The largest positive coefficients (increased attrition risk) are associated with:

  • Low Job Satisfaction and Employee Recognition
  • Poor Work-Life Balance and Company Reputation
  • Fewer Number of Promotions

The largest negative coefficients (retention factors) include:

  • Higher Monthly Income
  • More Years at Company (tenure breeds loyalty)
  • Remote Work availability

10. Session Info

sessionInfo()
## R version 4.5.2 (2025-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: Asia/Taipei
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] scales_1.4.0     gridExtra_2.3    kableExtra_1.4.0 knitr_1.51      
##  [5] pROC_1.19.0.1    caret_7.0-1      lattice_0.22-7   lubridate_1.9.5 
##  [9] forcats_1.0.1    stringr_1.6.0    dplyr_1.2.0      purrr_1.2.1     
## [13] readr_2.2.0      tidyr_1.3.2      tibble_3.3.1     ggplot2_4.0.2   
## [17] tidyverse_2.0.0 
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1     viridisLite_0.4.3    timeDate_4052.112   
##  [4] farver_2.1.2         S7_0.2.1             fastmap_1.2.0       
##  [7] digest_0.6.39        rpart_4.1.24         timechange_0.4.0    
## [10] lifecycle_1.0.5      survival_3.8-3       magrittr_2.0.4      
## [13] compiler_4.5.2       rlang_1.1.7          sass_0.4.10         
## [16] tools_4.5.2          yaml_2.3.12          data.table_1.18.2.1 
## [19] labeling_0.4.3       bit_4.6.0            plyr_1.8.9          
## [22] xml2_1.5.2           RColorBrewer_1.1-3   withr_3.0.2         
## [25] nnet_7.3-20          grid_4.5.2           stats4_4.5.2        
## [28] e1071_1.7-17         future_1.69.0        globals_0.19.0      
## [31] iterators_1.0.14     MASS_7.3-65          cli_3.6.5           
## [34] crayon_1.5.3         rmarkdown_2.30       generics_0.1.4      
## [37] otel_0.2.0           rstudioapi_0.18.0    future.apply_1.20.2 
## [40] reshape2_1.4.5       tzdb_0.5.0           proxy_0.4-29        
## [43] cachem_1.1.0         splines_4.5.2        parallel_4.5.2      
## [46] vctrs_0.7.1          hardhat_1.4.2        Matrix_1.7-4        
## [49] jsonlite_2.0.0       hms_1.1.4            bit64_4.6.0-1       
## [52] listenv_0.10.0       systemfonts_1.3.1    foreach_1.5.2       
## [55] gower_1.0.2          jquerylib_0.1.4      recipes_1.3.1       
## [58] glue_1.8.0           parallelly_1.46.1    codetools_0.2-20    
## [61] stringi_1.8.7        gtable_0.3.6         pillar_1.11.1       
## [64] htmltools_0.5.9      ipred_0.9-15         lava_1.8.2          
## [67] R6_2.6.1             textshaping_1.0.4    vroom_1.7.0         
## [70] evaluate_1.0.5       backports_1.5.0      broom_1.0.12        
## [73] bslib_0.10.0         class_7.3-23         Rcpp_1.1.1          
## [76] svglite_2.2.2        nlme_3.1-168         prodlim_2025.04.28  
## [79] xfun_0.56            pkgconfig_2.0.3      ModelMetrics_1.2.2.2