1 Introduction

Employee attrition represents a significant operational and financial challenge. This analysis applies logistic regression to the Kaggle Employee Attrition Dataset (n = 14,900) to model the probability of voluntary departure and identify the most important predictors. Three models of increasing complexity are estimated, evaluated, and compared.

Modelling strategy:

Model 1 (Baseline): Attrition ~ Monthly Income
Model 2 (Extended): Attrition ~ Monthly Income + Overtime
Model 3 (Full): Attrition ~ All available explanatory variables

2 Setup and Data Loading

2.1 Libraries

library(tidyverse)      # data manipulation and plotting
library(caret)          # confusion matrix and performance metrics
library(knitr)          # table formatting
library(kableExtra)     # enhanced tables
library(pROC)           # ROC curves
library(ggplot2)        # visualisations
library(scales)         # axis formatting

2.2 Load Data

# ── Load dataset ───────────────────────────────────────────────────────────────
# Place the CSV in the same directory as this .Rmd file, or update the path.
df <- read.csv("test.csv", stringsAsFactors = FALSE, check.names = TRUE)

cat("Dataset dimensions:", nrow(df), "rows x", ncol(df), "columns\n")

## Dataset dimensions: 14900 rows x 24 columns

cat("Column names:\n")

## Column names:

print(names(df))

##  [1] "Employee.ID"              "Age"                     
##  [3] "Gender"                   "Years.at.Company"        
##  [5] "Job.Role"                 "Monthly.Income"          
##  [7] "Work.Life.Balance"        "Job.Satisfaction"        
##  [9] "Performance.Rating"       "Number.of.Promotions"    
## [11] "Overtime"                 "Distance.from.Home"      
## [13] "Education.Level"          "Marital.Status"          
## [15] "Number.of.Dependents"     "Job.Level"               
## [17] "Company.Size"             "Company.Tenure"          
## [19] "Remote.Work"              "Leadership.Opportunities"
## [21] "Innovation.Opportunities" "Company.Reputation"      
## [23] "Employee.Recognition"     "Attrition"

2.3 Data Overview

# Attrition distribution
df %>%
  count(Attrition) %>%
  mutate(Pct = round(n / sum(n) * 100, 1)) %>%
  kable(caption = "Outcome Variable Distribution", col.names = c("Attrition", "Count", "Pct (%)")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Outcome Variable Distribution
Attrition	Count	Pct (%)
Left	7032	47.2
Stayed	7868	52.8

# Check for missing values
miss <- colSums(is.na(df))
if (sum(miss) == 0) {
  cat("No missing values detected.\n")
} else {
  print(miss[miss > 0])
}

## No missing values detected.

3 Data Preparation

# ── Binary encode outcome ──────────────────────────────────────────────────────
df$Attrition_bin <- ifelse(df$Attrition == "Left", 1, 0)

# ── Binary encode Overtime ─────────────────────────────────────────────────────
df$Overtime_bin <- ifelse(df$Overtime == "Yes", 1, 0)

# ── Label-encode all remaining categorical variables ───────────────────────────
cat_cols <- c(
  "Gender", "Job.Role", "Work.Life.Balance", "Job.Satisfaction",
  "Performance.Rating", "Education.Level", "Marital.Status",
  "Job.Level", "Company.Size", "Remote.Work",
  "Leadership.Opportunities", "Innovation.Opportunities",
  "Company.Reputation", "Employee.Recognition"
)

for (col in cat_cols) {
  if (col %in% names(df)) {
    df[[paste0(col, "_enc")]] <- as.integer(factor(df[[col]]))
  }
}

# ── Define feature sets ────────────────────────────────────────────────────────
numeric_cols <- c(
  "Age", "Years.at.Company", "Monthly.Income",
  "Number.of.Promotions", "Distance.from.Home",
  "Number.of.Dependents", "Company.Tenure"
)

enc_cols <- paste0(cat_cols[cat_cols %in% names(df)], "_enc")
all_vars <- c(numeric_cols, enc_cols, "Overtime_bin")

cat("Number of predictors in full model:", length(all_vars), "\n")

## Number of predictors in full model: 22

3.1 Train / Test Split

set.seed(42)

train_idx <- createDataPartition(df$Attrition_bin, p = 0.70, list = FALSE)
train_df  <- df[ train_idx, ]
test_df   <- df[-train_idx, ]

cat(sprintf(
  "Train: %d rows (attrition rate: %.1f%%)\nTest:  %d rows (attrition rate: %.1f%%)\n",
  nrow(train_df), mean(train_df$Attrition_bin) * 100,
  nrow(test_df),  mean(test_df$Attrition_bin)  * 100
))

## Train: 10430 rows (attrition rate: 47.4%)
## Test:  4470 rows (attrition rate: 46.7%)

4 Exploratory Data Analysis

df %>%
  group_by(Overtime, Attrition) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(Overtime) %>%
  mutate(pct = n / sum(n)) %>%
  ggplot(aes(x = Overtime, y = pct, fill = Attrition)) +
  geom_col(position = "dodge", width = 0.6) +
  scale_y_continuous(labels = percent_format()) +
  scale_fill_manual(values = c("Left" = "#E74C3C", "Stayed" = "#2E86AB")) +
  labs(
    title = "Attrition Rate by Overtime Status",
    x = "Overtime", y = "Proportion", fill = "Attrition"
  ) +
  theme_minimal(base_size = 13)

Attrition by Overtime Status

ggplot(df, aes(x = Attrition, y = Monthly.Income, fill = Attrition)) +
  geom_boxplot(alpha = 0.7, outlier.alpha = 0.3) +
  scale_fill_manual(values = c("Left" = "#E74C3C", "Stayed" = "#2E86AB")) +
  scale_y_continuous(labels = comma_format()) +
  labs(
    title = "Monthly Income Distribution by Attrition",
    x = "Attrition Status", y = "Monthly Income"
  ) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "none")

Monthly Income Distribution by Attrition

5 Logistic Regression Models

5.1 Helper Functions

# ── McFadden Pseudo R² ────────────────────────────────────────────────────────
pseudo_r2 <- function(model) {
  1 - (model$deviance / model$null.deviance)
}

# ── Confusion matrix + metrics ────────────────────────────────────────────────
eval_model <- function(model, test_data, outcome = "Attrition_bin",
                       threshold = 0.5, model_name = "Model") {

  probs  <- predict(model, newdata = test_data, type = "response")
  preds  <- factor(ifelse(probs >= threshold, 1, 0), levels = c(0, 1))
  actual <- factor(test_data[[outcome]], levels = c(0, 1))

  cm  <- confusionMatrix(preds, actual, positive = "1")
  TN  <- cm$table[1, 1]; FP <- cm$table[2, 1]
  FN  <- cm$table[1, 2]; TP <- cm$table[2, 2]

  list(
    name      = model_name,
    probs     = probs,
    preds     = preds,
    actual    = actual,
    cm        = cm,
    TN = TN, FP = FP, FN = FN, TP = TP,
    accuracy  = as.numeric(cm$overall["Accuracy"]),
    precision = as.numeric(cm$byClass["Precision"]),
    recall    = as.numeric(cm$byClass["Recall"]),
    f1        = as.numeric(cm$byClass["F1"])
  )
}

# ── Formatted odds-ratio table ─────────────────────────────────────────────────
or_table <- function(model) {
  coefs <- summary(model)$coefficients
  or    <- exp(coef(model))
  ci    <- exp(confint.default(model))
  data.frame(
    Variable   = rownames(coefs),
    Coefficient = round(coefs[, 1], 4),
    Std.Error  = round(coefs[, 2], 4),
    z.stat     = round(coefs[, 3], 3),
    p.value    = round(coefs[, 4], 4),
    Odds.Ratio = round(or, 4),
    CI.Lower   = round(ci[, 1], 4),
    CI.Upper   = round(ci[, 2], 4),
    Sig        = ifelse(coefs[, 4] < .001, "***",
                 ifelse(coefs[, 4] < .01,  "**",
                 ifelse(coefs[, 4] < .05,  "*",
                 ifelse(coefs[, 4] < .1,   ".", ""))))
  )
}

5.2 Model 1: Monthly Income Only

m1 <- glm(Attrition_bin ~ Monthly.Income,
          data   = train_df,
          family = binomial(link = "logit"))

summary(m1)

## 
## Call:
## glm(formula = Attrition_bin ~ Monthly.Income, family = binomial(link = "logit"), 
##     data = train_df)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)
## (Intercept)    -3.485e-02  6.921e-02  -0.504    0.615
## Monthly.Income -9.488e-06  9.106e-06  -1.042    0.297
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 14431  on 10429  degrees of freedom
## Residual deviance: 14430  on 10428  degrees of freedom
## AIC: 14434
## 
## Number of Fisher Scoring iterations: 3

cat(sprintf("Log-Likelihood:    %.2f\n", logLik(m1)))

## Log-Likelihood:    -7214.89

cat(sprintf("McFadden Pseudo R²: %.4f\n", pseudo_r2(m1)))

## McFadden Pseudo R²: 0.0001

cat(sprintf("AIC:               %.2f\n",  AIC(m1)))

## AIC:               14433.79

or_table(m1) %>%
  kable(caption = "Model 1 — Coefficient Table with Odds Ratios",
        digits   = 4) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE) %>%
  column_spec(9, bold = TRUE, color = "darkred")

Model 1 — Coefficient Table with Odds Ratios
	Variable	Coefficient	Std.Error	z.stat	p.value	Odds.Ratio	CI.Lower	CI.Upper	Sig
(Intercept)	(Intercept)	-0.0349	0.0692	-0.504	0.6146	0.9657	0.8432	1.1061
Monthly.Income	Monthly.Income	0.0000	0.0000	-1.042	0.2974	1.0000	1.0000	1.0000

5.2.1 Model 1 — Test Set Performance

res1 <- eval_model(m1, test_df, model_name = "Model 1")
print(res1$cm$table)

##           Reference
## Prediction    0    1
##          0 2382 2088
##          1    0    0

data.frame(
  Metric    = c("Accuracy", "Precision", "Recall", "F1-Score"),
  Value     = round(c(res1$accuracy, res1$precision, res1$recall, res1$f1), 4)
) %>%
  kable(caption = "Model 1 — Performance Metrics") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Model 1 — Performance Metrics
Metric	Value
Accuracy	0.5329
Precision	NA
Recall	0.0000
F1-Score	NA

Interpretation: Monthly Income is statistically non-significant (p = 0.297). The model classifies all observations as “Stayed” (the majority class), yielding an accuracy equal to the base rate and failing to detect a single leaver.

5.3 Model 2: Monthly Income + Overtime

m2 <- glm(Attrition_bin ~ Monthly.Income + Overtime_bin,
          data   = train_df,
          family = binomial(link = "logit"))

summary(m2)

## 
## Call:
## glm(formula = Attrition_bin ~ Monthly.Income + Overtime_bin, 
##     family = binomial(link = "logit"), data = train_df)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -1.115e-01  7.069e-02  -1.577    0.115    
## Monthly.Income -9.469e-06  9.120e-06  -1.038    0.299    
## Overtime_bin    2.303e-01  4.168e-02   5.526 3.27e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 14431  on 10429  degrees of freedom
## Residual deviance: 14399  on 10427  degrees of freedom
## AIC: 14405
## 
## Number of Fisher Scoring iterations: 3

cat(sprintf("Log-Likelihood:    %.2f\n", logLik(m2)))

## Log-Likelihood:    -7199.61

cat(sprintf("McFadden Pseudo R²: %.4f\n", pseudo_r2(m2)))

## McFadden Pseudo R²: 0.0022

cat(sprintf("AIC:               %.2f\n",  AIC(m2)))

## AIC:               14405.22

# Likelihood ratio test vs Model 1
lrt_12 <- anova(m1, m2, test = "Chisq")
print(lrt_12)

## Analysis of Deviance Table
## 
## Model 1: Attrition_bin ~ Monthly.Income
## Model 2: Attrition_bin ~ Monthly.Income + Overtime_bin
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1     10428      14430                          
## 2     10427      14399  1   30.567 3.226e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

or_table(m2) %>%
  kable(caption = "Model 2 — Coefficient Table with Odds Ratios",
        digits   = 4) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE) %>%
  column_spec(9, bold = TRUE, color = "darkred")

Model 2 — Coefficient Table with Odds Ratios
	Variable	Coefficient	Std.Error	z.stat	p.value	Odds.Ratio	CI.Lower	CI.Upper	Sig
(Intercept)	(Intercept)	-0.1115	0.0707	-1.577	0.1147	0.8945	0.7788	1.0274
Monthly.Income	Monthly.Income	0.0000	0.0000	-1.038	0.2992	1.0000	1.0000	1.0000
Overtime_bin	Overtime_bin	0.2303	0.0417	5.526	0.0000	1.2590	1.1603	1.3662	***

5.3.1 Model 2 — Test Set Performance

res2 <- eval_model(m2, test_df, model_name = "Model 2")
print(res2$cm$table)

##           Reference
## Prediction    0    1
##          0 1689 1355
##          1  693  733

data.frame(
  Metric = c("Accuracy", "Precision", "Recall", "F1-Score"),
  Value  = round(c(res2$accuracy, res2$precision, res2$recall, res2$f1), 4)
) %>%
  kable(caption = "Model 2 — Performance Metrics") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Model 2 — Performance Metrics
Metric	Value
Accuracy	0.5418
Precision	0.5140
Recall	0.3511
F1-Score	0.4172

Interpretation: Overtime is highly significant (p < 0.001, OR = 1.259), indicating employees working overtime have ~24% higher odds of leaving. Monthly Income remains non-significant. The AIC falls by 28.6 units relative to Model 1.

5.4 Model 3: Full Model (All Variables)

formula_m3 <- as.formula(
  paste("Attrition_bin ~", paste(all_vars, collapse = " + "))
)

m3 <- glm(formula_m3,
          data   = train_df,
          family = binomial(link = "logit"))

summary(m3)

## 
## Call:
## glm(formula = formula_m3, family = binomial(link = "logit"), 
##     data = train_df)
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   2.304e+00  2.746e-01   8.392  < 2e-16 ***
## Age                          -6.545e-03  2.228e-03  -2.938  0.00330 ** 
## Years.at.Company             -1.205e-02  2.634e-03  -4.575 4.77e-06 ***
## Monthly.Income               -2.330e-06  1.285e-05  -0.181  0.85612    
## Number.of.Promotions         -2.320e-01  2.339e-02  -9.919  < 2e-16 ***
## Distance.from.Home            8.571e-03  8.060e-04  10.634  < 2e-16 ***
## Number.of.Dependents         -9.815e-02  1.502e-02  -6.533 6.43e-11 ***
## Company.Tenure                4.642e-04  1.010e-03   0.460  0.64573    
## Gender_enc                   -5.207e-01  4.628e-02 -11.250  < 2e-16 ***
## Job.Role_enc                 -3.209e-02  1.877e-02  -1.710  0.08729 .  
## Work.Life.Balance_enc         1.870e-01  2.448e-02   7.639 2.18e-14 ***
## Job.Satisfaction_enc          1.522e-01  1.884e-02   8.078 6.57e-16 ***
## Performance.Rating_enc        1.223e-01  2.404e-02   5.089 3.61e-07 ***
## Education.Level_enc          -1.329e-01  1.910e-02  -6.957 3.47e-12 ***
## Marital.Status_enc            9.276e-01  3.596e-02  25.792  < 2e-16 ***
## Job.Level_enc                -1.120e+00  3.336e-02 -33.586  < 2e-16 ***
## Company.Size_enc              1.048e-01  3.281e-02   3.195  0.00140 ** 
## Remote.Work_enc              -1.609e+00  6.504e-02 -24.744  < 2e-16 ***
## Leadership.Opportunities_enc -3.110e-01  1.076e-01  -2.891  0.00385 ** 
## Innovation.Opportunities_enc -1.811e-01  6.286e-02  -2.881  0.00396 ** 
## Company.Reputation_enc        7.387e-02  2.660e-02   2.777  0.00548 ** 
## Employee.Recognition_enc      1.080e-02  2.705e-02   0.399  0.68978    
## Overtime_bin                  3.180e-01  4.887e-02   6.507 7.68e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 14431  on 10429  degrees of freedom
## Residual deviance: 11308  on 10407  degrees of freedom
## AIC: 11354
## 
## Number of Fisher Scoring iterations: 4

cat(sprintf("Log-Likelihood:    %.2f\n", logLik(m3)))

## Log-Likelihood:    -5654.03

cat(sprintf("McFadden Pseudo R²: %.4f\n", pseudo_r2(m3)))

## McFadden Pseudo R²: 0.2164

cat(sprintf("AIC:               %.2f\n",  AIC(m3)))

## AIC:               11354.07

# Likelihood ratio test vs Model 2
lrt_23 <- anova(m2, m3, test = "Chisq")
print(lrt_23)

## Analysis of Deviance Table
## 
## Model 1: Attrition_bin ~ Monthly.Income + Overtime_bin
## Model 2: Attrition_bin ~ Age + Years.at.Company + Monthly.Income + Number.of.Promotions + 
##     Distance.from.Home + Number.of.Dependents + Company.Tenure + 
##     Gender_enc + Job.Role_enc + Work.Life.Balance_enc + Job.Satisfaction_enc + 
##     Performance.Rating_enc + Education.Level_enc + Marital.Status_enc + 
##     Job.Level_enc + Company.Size_enc + Remote.Work_enc + Leadership.Opportunities_enc + 
##     Innovation.Opportunities_enc + Company.Reputation_enc + Employee.Recognition_enc + 
##     Overtime_bin
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1     10427      14399                          
## 2     10407      11308 20   3091.2 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

or_table(m3) %>%
  kable(caption = "Model 3 — Full Coefficient Table with Odds Ratios",
        digits   = 4) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE) %>%
  column_spec(9, bold = TRUE, color = "darkred") %>%
  row_spec(which(or_table(m3)$Sig %in% c("***", "**", "*")),
           background = "#F0FFF0")

Model 3 — Full Coefficient Table with Odds Ratios
	Variable	Coefficient	Std.Error	z.stat	p.value	Odds.Ratio	CI.Lower	CI.Upper	Sig
(Intercept)	(Intercept)	2.3042	0.2746	8.392	0.0000	10.0160	5.8477	17.1554	***
Age	Age	-0.0065	0.0022	-2.938	0.0033	0.9935	0.9891	0.9978	**
Years.at.Company	Years.at.Company	-0.0120	0.0026	-4.575	0.0000	0.9880	0.9829	0.9931	***
Monthly.Income	Monthly.Income	0.0000	0.0000	-0.181	0.8561	1.0000	1.0000	1.0000
Number.of.Promotions	Number.of.Promotions	-0.2320	0.0234	-9.919	0.0000	0.7929	0.7574	0.8301	***
Distance.from.Home	Distance.from.Home	0.0086	0.0008	10.634	0.0000	1.0086	1.0070	1.0102	***
Number.of.Dependents	Number.of.Dependents	-0.0981	0.0150	-6.533	0.0000	0.9065	0.8802	0.9336	***
Company.Tenure	Company.Tenure	0.0005	0.0010	0.460	0.6457	1.0005	0.9985	1.0024
Gender_enc	Gender_enc	-0.5207	0.0463	-11.250	0.0000	0.5941	0.5426	0.6505	***
Job.Role_enc	Job.Role_enc	-0.0321	0.0188	-1.710	0.0873	0.9684	0.9334	1.0047	.
Work.Life.Balance_enc	Work.Life.Balance_enc	0.1870	0.0245	7.639	0.0000	1.2057	1.1492	1.2649	***
Job.Satisfaction_enc	Job.Satisfaction_enc	0.1522	0.0188	8.078	0.0000	1.1644	1.1222	1.2082	***
Performance.Rating_enc	Performance.Rating_enc	0.1223	0.0240	5.089	0.0000	1.1301	1.0781	1.1847	***
Education.Level_enc	Education.Level_enc	-0.1329	0.0191	-6.957	0.0000	0.8756	0.8434	0.9090	***
Marital.Status_enc	Marital.Status_enc	0.9276	0.0360	25.792	0.0000	2.5284	2.3563	2.7130	***
Job.Level_enc	Job.Level_enc	-1.1204	0.0334	-33.586	0.0000	0.3262	0.3055	0.3482	***
Company.Size_enc	Company.Size_enc	0.1048	0.0328	3.195	0.0014	1.1105	1.0413	1.1843	**
Remote.Work_enc	Remote.Work_enc	-1.6094	0.0650	-24.744	0.0000	0.2000	0.1761	0.2272	***
Leadership.Opportunities_enc	Leadership.Opportunities_enc	-0.3110	0.1076	-2.891	0.0038	0.7327	0.5934	0.9047	**
Innovation.Opportunities_enc	Innovation.Opportunities_enc	-0.1811	0.0629	-2.881	0.0040	0.8343	0.7376	0.9437	**
Company.Reputation_enc	Company.Reputation_enc	0.0739	0.0266	2.777	0.0055	1.0767	1.0220	1.1343	**
Employee.Recognition_enc	Employee.Recognition_enc	0.0108	0.0271	0.399	0.6898	1.0109	0.9587	1.0659
Overtime_bin	Overtime_bin	0.3180	0.0489	6.507	0.0000	1.3743	1.2488	1.5124	***

5.4.1 Model 3 — Top Predictors

ot3 <- or_table(m3)
ot3 %>%
  filter(Variable != "(Intercept)") %>%
  mutate(abs_z = abs(z.stat)) %>%
  arrange(desc(abs_z)) %>%
  head(12) %>%
  ggplot(aes(x = reorder(Variable, abs_z), y = abs_z,
             fill = ifelse(Coefficient > 0, "Positive", "Negative"))) +
  geom_col(width = 0.7) +
  coord_flip() +
  scale_fill_manual(values = c("Positive" = "#E74C3C", "Negative" = "#2E86AB"),
                    name = "Direction") +
  labs(
    title = "Top 12 Predictors of Attrition (Full Model)",
    x = NULL, y = "|z-statistic|"
  ) +
  theme_minimal(base_size = 13)

Top Predictors in Model 3 (by |z-statistic|)

5.4.2 Model 3 — Test Set Performance

res3 <- eval_model(m3, test_df, model_name = "Model 3")
print(res3$cm$table)

##           Reference
## Prediction    0    1
##          0 1753  669
##          1  629 1419

data.frame(
  Metric = c("Accuracy", "Precision", "Recall", "F1-Score"),
  Value  = round(c(res3$accuracy, res3$precision, res3$recall, res3$f1), 4)
) %>%
  kable(caption = "Model 3 — Performance Metrics") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Model 3 — Performance Metrics
Metric	Value
Accuracy	0.7096
Precision	0.6929
Recall	0.6796
F1-Score	0.6862

6 Confusion Matrices (Visualised)

plot_cm <- function(res) {
  df_cm <- data.frame(
    Actual    = factor(c("Stayed","Stayed","Left","Left"),
                       levels = c("Left","Stayed")),
    Predicted = factor(c("Stayed","Left","Stayed","Left"),
                       levels = c("Stayed","Left")),
    n         = c(res$TN, res$FP, res$FN, res$TP),
    label     = c("TN","FP","FN","TP")
  )
  ggplot(df_cm, aes(x = Predicted, y = Actual, fill = label)) +
    geom_tile(color = "white", linewidth = 1.2) +
    geom_text(aes(label = paste0(label, "\n", n)), size = 5, fontface = "bold") +
    scale_fill_manual(values = c(
      TN = "#AED6F1", FP = "#F1948A", FN = "#F1948A", TP = "#A9DFBF"
    )) +
    labs(title = res$name, x = "Predicted", y = "Actual") +
    theme_minimal(base_size = 12) +
    theme(legend.position = "none",
          plot.title = element_text(face = "bold", hjust = 0.5))
}

library(gridExtra)
grid.arrange(plot_cm(res1), plot_cm(res2), plot_cm(res3), ncol = 3)

Confusion Matrices — All Three Models

7 Model Comparison

7.1 Summary Table

comparison <- data.frame(
  Metric        = c("Log-Likelihood", "McFadden R²", "AIC",
                    "Accuracy", "Precision", "Recall", "F1-Score",
                    "True Positives", "False Negatives"),
  Model.1       = c(
    round(as.numeric(logLik(m1)), 2),
    round(pseudo_r2(m1), 4),
    round(AIC(m1), 2),
    round(res1$accuracy,  4),
    round(res1$precision, 4),
    round(res1$recall,    4),
    round(res1$f1,        4),
    res1$TP, res1$FN
  ),
  Model.2       = c(
    round(as.numeric(logLik(m2)), 2),
    round(pseudo_r2(m2), 4),
    round(AIC(m2), 2),
    round(res2$accuracy,  4),
    round(res2$precision, 4),
    round(res2$recall,    4),
    round(res2$f1,        4),
    res2$TP, res2$FN
  ),
  Model.3       = c(
    round(as.numeric(logLik(m3)), 2),
    round(pseudo_r2(m3), 4),
    round(AIC(m3), 2),
    round(res3$accuracy,  4),
    round(res3$precision, 4),
    round(res3$recall,    4),
    round(res3$f1,        4),
    res3$TP, res3$FN
  )
)

comparison %>%
  kable(caption = "Model Comparison — All Metrics",
        col.names = c("Metric", "Model 1", "Model 2", "Model 3")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(0, bold = TRUE, background = "#1F497D", color = "white") %>%
  row_spec(c(4, 7), bold = TRUE)

Model Comparison — All Metrics
Metric	Model 1	Model 2	Model 3
Log-Likelihood	-7214.8900	-7199.6100	-5654.0300
McFadden R²	0.0001	0.0022	0.2164
AIC	14433.7900	14405.2200	11354.0700
Accuracy	0.5329	0.5418	0.7096
Precision	NA	0.5140	0.6929
Recall	0.0000	0.3511	0.6796
F1-Score	NA	0.4172	0.6862
True Positives	0.0000	733.0000	1419.0000
False Negatives	2088.0000	1355.0000	669.0000

7.2 ROC Curves

roc1 <- roc(res1$actual, res1$probs, quiet = TRUE)
roc2 <- roc(res2$actual, res2$probs, quiet = TRUE)
roc3 <- roc(res3$actual, res3$probs, quiet = TRUE)

cat(sprintf("AUC — Model 1: %.4f\n", auc(roc1)))

## AUC — Model 1: 0.4976

cat(sprintf("AUC — Model 2: %.4f\n", auc(roc2)))

## AUC — Model 2: 0.5312

cat(sprintf("AUC — Model 3: %.4f\n", auc(roc3)))

## AUC — Model 3: 0.7891

# Build tidy data frame for ggplot
roc_df <- bind_rows(
  data.frame(fpr = 1 - roc1$specificities, tpr = roc1$sensitivities,
             Model = sprintf("M1 (AUC=%.3f)", auc(roc1))),
  data.frame(fpr = 1 - roc2$specificities, tpr = roc2$sensitivities,
             Model = sprintf("M2 (AUC=%.3f)", auc(roc2))),
  data.frame(fpr = 1 - roc3$specificities, tpr = roc3$sensitivities,
             Model = sprintf("M3 (AUC=%.3f)", auc(roc3)))
)

ggplot(roc_df, aes(x = fpr, y = tpr, colour = Model)) +
  geom_line(linewidth = 1.1) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", colour = "grey50") +
  scale_colour_manual(values = c("#95A5A6", "#E67E22", "#2E86AB")) +
  labs(
    title  = "ROC Curves — Logistic Regression Models",
    x      = "False Positive Rate (1 − Specificity)",
    y      = "True Positive Rate (Sensitivity)",
    colour = NULL
  ) +
  theme_minimal(base_size = 13) +
  theme(legend.position = c(0.75, 0.25))

ROC Curves — All Three Models

7.3 Performance Bar Chart

perf_long <- data.frame(
  Model  = rep(c("Model 1", "Model 2", "Model 3"), each = 3),
  Metric = rep(c("Accuracy", "Recall", "F1-Score"), 3),
  Value  = c(
    res1$accuracy, res1$recall, res1$f1,
    res2$accuracy, res2$recall, res2$f1,
    res3$accuracy, res3$recall, res3$f1
  )
)

ggplot(perf_long, aes(x = Model, y = Value, fill = Metric)) +
  geom_col(position = "dodge", width = 0.6) +
  geom_text(aes(label = sprintf("%.3f", Value)),
            position = position_dodge(0.6), vjust = -0.4, size = 3.5) +
  scale_fill_manual(values = c(
    Accuracy  = "#2E86AB",
    Recall    = "#E74C3C",
    "F1-Score"= "#27AE60"
  )) +
  scale_y_continuous(limits = c(0, 0.85), labels = percent_format()) +
  labs(
    title = "Model Performance Comparison",
    x = NULL, y = "Score", fill = NULL
  ) +
  theme_minimal(base_size = 13)

F1-Score, Accuracy, and Recall by Model

8 Discussion

8.1 Which Model is Best?

Model 3 is unambiguously the best-performing specification on every criterion:

Statistical fit: McFadden R² = 0.2164 (vs. 0.0022 for M2). AIC reduced by 3051 units from M2 to M3. The likelihood ratio test is highly significant (p < 0.001).
Predictive performance: F1-score of 0.686 vs. 0.417 for M2 — a 63% relative improvement. Model 3 correctly identifies 1419 leavers versus 733 in Model 2.
Balance: Precision (0.693) and Recall (0.68) are near-equal, indicating the model does not systematically over-predict or under-predict attrition.

8.2 Key Findings

Monthly Income is not a significant predictor in any model. This challenges the common managerial assumption that pay is the primary driver of attrition. Even in the full model (p = 0.856), income plays no independent role once structural and environmental factors are controlled for.

The five strongest predictors in Model 3 (by |z|):

Predictor	Odds Ratio	Direction	Interpretation
Job Level	0.422	Negative	Higher seniority → 57.8% lower attrition odds
Remote Work	0.525	Negative	Remote access halves attrition odds
Marital Status	1.798	Positive	Certain statuses elevate risk by ~80%
Gender	0.771	Negative	One gender shows ~23% lower attrition odds
Distance from Home	1.281	Positive	Greater commute raises odds by 28%

9 Managerial Implications

Career progression is the primary retention lever. Job Level (OR = 0.422) and Number of Promotions (OR = 0.786) are among the strongest protective factors. Structured career ladders, clear promotion criteria, and internal mobility programmes represent the highest-ROI retention investments.
Remote work should be treated as a retention benefit. Employees with remote access have approximately half the attrition odds of on-site workers. Organisations restricting remote work risk significantly elevated turnover.
Commute burden matters. Distance from Home (OR = 1.281) is a significant risk factor, suggesting value in commuter support, hybrid scheduling, or location-sensitive hiring.
Work-life balance and job satisfaction must be actively monitored. Both variables are highly significant (p < 0.001), reinforcing the value of regular pulse surveys and managerial check-ins.
Overtime should be managed carefully. Overtime (OR = 1.165 in the full model) independently predicts attrition, even after controlling for 20 other variables.
Predictive intervention is feasible. Using Model 3, HR teams can score employees monthly and flag individuals exceeding a chosen risk threshold (e.g., 0.60) for proactive retention conversations — before a resignation decision is finalised.

10 Session Information

sessionInfo()

## R version 4.5.1 (2025-06-13)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sonoma 14.3.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Asia/Taipei
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] gridExtra_2.3    scales_1.4.0     pROC_1.19.0.1    kableExtra_1.4.0
##  [5] knitr_1.50       caret_7.0-1      lattice_0.22-7   lubridate_1.9.4 
##  [9] forcats_1.0.1    stringr_1.5.2    dplyr_1.1.4      purrr_1.1.0     
## [13] readr_2.1.5      tidyr_1.3.1      tibble_3.3.0     ggplot2_4.0.2   
## [17] tidyverse_2.0.0 
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1     viridisLite_0.4.2    timeDate_4051.111   
##  [4] farver_2.1.2         S7_0.2.0             fastmap_1.2.0       
##  [7] digest_0.6.37        rpart_4.1.24         timechange_0.3.0    
## [10] lifecycle_1.0.4      survival_3.8-3       magrittr_2.0.4      
## [13] compiler_4.5.1       rlang_1.1.6          sass_0.4.10         
## [16] tools_4.5.1          yaml_2.3.10          data.table_1.17.8   
## [19] labeling_0.4.3       plyr_1.8.9           xml2_1.4.0          
## [22] RColorBrewer_1.1-3   withr_3.0.2          nnet_7.3-20         
## [25] grid_4.5.1           stats4_4.5.1         e1071_1.7-17        
## [28] future_1.67.0        globals_0.18.0       iterators_1.0.14    
## [31] MASS_7.3-65          cli_3.6.5            rmarkdown_2.29      
## [34] generics_0.1.4       rstudioapi_0.17.1    future.apply_1.20.0 
## [37] reshape2_1.4.5       tzdb_0.5.0           proxy_0.4-29        
## [40] cachem_1.1.0         splines_4.5.1        parallel_4.5.1      
## [43] vctrs_0.6.5          hardhat_1.4.2        Matrix_1.7-3        
## [46] jsonlite_2.0.0       hms_1.1.3            listenv_0.9.1       
## [49] systemfonts_1.3.2    foreach_1.5.2        gower_1.0.2         
## [52] jquerylib_0.1.4      recipes_1.3.1        glue_1.8.0          
## [55] parallelly_1.45.1    codetools_0.2-20     stringi_1.8.7       
## [58] gtable_0.3.6         pillar_1.11.1        htmltools_0.5.8.1   
## [61] ipred_0.9-15         lava_1.8.2           R6_2.6.1            
## [64] textshaping_1.0.3    evaluate_1.0.5       bslib_0.9.0         
## [67] class_7.3-23         Rcpp_1.1.0           svglite_2.2.2       
## [70] nlme_3.1-168         prodlim_2025.04.28   xfun_0.53           
## [73] pkgconfig_2.0.3      ModelMetrics_1.2.2.2

End of Report

Employee Attrition Prediction: A Logistic Regression Analysis

Kaggle Employee Attrition Dataset

Faiz Haikal Nugraha Sunarto (114035108)

2026-04-25