1 Introduction

Dataset: Employee Attrition Dataset — Kaggle
Reference: Boehmke, B. — Hands-On Machine Learning with R, Ch. 5


2 Setup

pkgs <- c("tidyverse", "caret", "knitr", "kableExtra",
          "gridExtra", "pROC", "broom")
invisible(lapply(pkgs, function(p) {
  if (!requireNamespace(p, quietly = TRUE)) install.packages(p)
}))

library(tidyverse)
library(caret)
library(knitr)
library(kableExtra)
library(gridExtra)
library(pROC)
library(broom)

theme_set(theme_minimal(base_size = 12))

3 Data Preparation

3.1 Loading & Cleaning

# File browser will open — select train.csv first, then test.csv
raw_train <- read.csv(file.choose(), stringsAsFactors = TRUE)
raw_test  <- read.csv(file.choose(), stringsAsFactors = TRUE)

# Standardise column names
names(raw_train) <- make.names(names(raw_train))
names(raw_test)  <- make.names(names(raw_test))

# Auto-detect Attrition column and its positive class
attr_col <- names(raw_train)[tolower(names(raw_train)) == "attrition"][1]

# Detect positive class label (Left / Yes / 1)
vals <- as.character(unique(raw_train[[attr_col]]))
pos_label <- vals[vals %in% c("Left","Yes","1","TRUE")][1]
neg_label <- vals[!vals %in% c("Left","Yes","1","TRUE")][1]
cat("Attrition values found:", paste(vals, collapse=", "),
    "\n→ Positive class:", pos_label, "| Negative class:", neg_label, "\n")
## Attrition values found: Stayed, Left 
## → Positive class: Left | Negative class: Stayed
clean_df <- function(df) {
  names(df) <- make.names(names(df))
  df[[attr_col]] <- factor(
    as.character(df[[attr_col]]),
    levels = c(neg_label, pos_label)
  )
  df
}

train_raw <- clean_df(raw_train)
test_raw  <- clean_df(raw_test)

# Combine for cleaning, then re-split
df_all <- bind_rows(train_raw, test_raw)

# Remove near-zero variance columns (excluding target)
nzv <- nearZeroVar(df_all %>% select(-all_of(attr_col)), names = TRUE)
drop_cols <- c(nzv, "Employee.ID", "EmployeeCount", "EmployeeNumber",
               "Over18", "StandardHours", "id", "ID")
cat("Dropped columns:", paste(intersect(drop_cols, names(df_all)), collapse=", "), "\n")
## Dropped columns: Leadership.Opportunities, Employee.ID
df_all <- df_all %>% select(-any_of(drop_cols))

# Re-split
n_train   <- nrow(train_raw)
train_df  <- df_all[1:n_train, ]
test_df   <- df_all[(n_train+1):nrow(df_all), ]

cat("\nDimensions — Train:", nrow(train_df), "x", ncol(train_df),
    "| Test:", nrow(test_df), "x", ncol(test_df), "\n")
## 
## Dimensions — Train: 14900 x 22 | Test: 59598 x 22
glimpse(train_df)
## Rows: 14,900
## Columns: 22
## $ Age                      <int> 36, 35, 50, 58, 39, 45, 22, 34, 48, 55, 32, 2…
## $ Gender                   <fct> Male, Male, Male, Male, Male, Female, Female,…
## $ Years.at.Company         <int> 13, 7, 7, 44, 24, 30, 5, 15, 40, 16, 12, 15, …
## $ Job.Role                 <fct> Healthcare, Education, Education, Media, Educ…
## $ Monthly.Income           <int> 8029, 4563, 5583, 5525, 4604, 8104, 8700, 110…
## $ Work.Life.Balance        <fct> Excellent, Good, Fair, Fair, Good, Fair, Good…
## $ Job.Satisfaction         <fct> High, High, High, Very High, High, High, High…
## $ Performance.Rating       <fct> Average, Average, Average, High, Average, Ave…
## $ Number.of.Promotions     <int> 1, 1, 3, 0, 0, 0, 0, 1, 0, 0, 0, 2, 3, 0, 1, …
## $ Overtime                 <fct> Yes, Yes, Yes, Yes, Yes, No, No, No, No, No, …
## $ Distance.from.Home       <int> 83, 55, 14, 43, 47, 38, 2, 9, 65, 31, 28, 35,…
## $ Education.Level          <fct> Master’s Degree, Associate Degree, Associate …
## $ Marital.Status           <fct> Married, Single, Divorced, Single, Married, D…
## $ Number.of.Dependents     <int> 1, 4, 2, 4, 6, 0, 0, 4, 1, 1, 1, 1, 3, 0, 0, …
## $ Job.Level                <fct> Mid, Entry, Senior, Entry, Mid, Senior, Mid, …
## $ Company.Size             <fct> Large, Medium, Medium, Medium, Large, Large, …
## $ Company.Tenure           <int> 22, 27, 76, 96, 45, 75, 48, 16, 52, 46, 57, 9…
## $ Remote.Work              <fct> No, No, No, No, Yes, No, No, No, No, No, No, …
## $ Innovation.Opportunities <fct> No, No, Yes, No, No, No, No, No, No, No, No, …
## $ Company.Reputation       <fct> Poor, Good, Good, Poor, Good, Good, Poor, Goo…
## $ Employee.Recognition     <fct> Medium, High, Low, Low, High, Low, High, Low,…
## $ Attrition                <fct> Stayed, Left, Stayed, Left, Stayed, Stayed, S…

3.2 Class Balance

train_df %>%
  count(.data[[attr_col]]) %>%
  mutate(Percentage = scales::percent(n / sum(n), accuracy = 0.1)) %>%
  rename(Class = 1, Count = n) %>%
  kable(caption = "Table 1 - Attrition Class Distribution") %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)
Table 1 - Attrition Class Distribution
Class Count Percentage
Stayed 7868 52.8%
Left 7032 47.2%
ggplot(train_df, aes(x = .data[[attr_col]], fill = .data[[attr_col]])) +
  geom_bar(width = 0.5, show.legend = FALSE) +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5) +
  scale_fill_manual(values = setNames(c("#74b9ff","#d63031"), c(neg_label, pos_label))) +
  labs(title = "Figure 1 - Attrition Class Distribution",
       x = "Attrition", y = "Count")


4 Exploratory Data Analysis

Before modelling, we examine the two key predictors featured in Models 1 and 2.

4.1 Monthly Income by Attrition

# Auto-detect income column
income_col <- names(train_df)[grepl("income", tolower(names(train_df)))][1]
cat("Income column:", income_col, "\n")
## Income column: Monthly.Income
ggplot(train_df, aes(x = .data[[attr_col]], y = .data[[income_col]],
                     fill = .data[[attr_col]])) +
  geom_boxplot(alpha = 0.8, outlier.shape = 21, show.legend = FALSE) +
  scale_fill_manual(values = setNames(c("#74b9ff","#d63031"), c(neg_label, pos_label))) +
  scale_y_continuous(labels = scales::dollar_format()) +
  labs(title = "Figure 2 - Monthly Income by Attrition Status",
       x = "Attrition", y = "Monthly Income (USD)")

Observation: Employees who left tend to earn lower monthly incomes, suggesting income is a meaningful predictor of attrition.

4.2 Overtime by Attrition

# Auto-detect overtime column
ot_col <- names(train_df)[grepl("overtime", gsub("\\.","", tolower(names(train_df))))][1]
cat("Overtime column:", ot_col, "\n")
## Overtime column: Overtime
if (!is.na(ot_col)) {
  train_df %>%
    count(.data[[ot_col]], .data[[attr_col]]) %>%
    group_by(.data[[ot_col]]) %>%
    mutate(pct = n / sum(n)) %>%
    ggplot(aes(x = .data[[ot_col]], y = pct, fill = .data[[attr_col]])) +
    geom_col(position = "fill", width = 0.5) +
    scale_y_continuous(labels = scales::percent) +
    scale_fill_manual(values = setNames(c("#74b9ff","#d63031"), c(neg_label, pos_label))) +
    labs(title = "Figure 3 - Attrition Rate by Overtime Status",
         x = "Works Overtime", y = "Proportion", fill = "Attrition")
}

Observation: Employees who work overtime leave at a substantially higher rate, confirming overtime as a strong predictor of attrition.


5 Train / Test Split

We use the original Kaggle train/test split (80/20).

cat(sprintf("Training set : %d rows  (%.0f%%)\nTest set     : %d rows  (%.0f%%)\n",
            nrow(train_df), 100 * nrow(train_df) / (nrow(train_df) + nrow(test_df)),
            nrow(test_df),  100 * nrow(test_df)  / (nrow(train_df) + nrow(test_df))))
## Training set : 14900 rows  (20%)
## Test set     : 59598 rows  (80%)

6 Logistic Regression Models

Logistic regression models the log-odds of an event as a linear combination of predictors:

\[\log\left(\frac{p}{1-p}\right) = \beta_0 + \beta_1 x_1 + \cdots + \beta_k x_k\]

Coefficients are exponentiated to obtain odds ratios (OR): OR > 1 increases attrition risk, OR < 1 decreases it.

f1 <- as.formula(paste(attr_col, "~", income_col))

if (!is.na(ot_col) && ot_col %in% names(train_df)) {
  f2 <- as.formula(paste(attr_col, "~", income_col, "+", ot_col))
} else {
  f2 <- f1
}

f3 <- as.formula(paste(attr_col, "~ ."))

cat("Model 1:", deparse(f1), "\n")
## Model 1: Attrition ~ Monthly.Income
cat("Model 2:", deparse(f2), "\n")
## Model 2: Attrition ~ Monthly.Income + Overtime
cat("Model 3:", deparse(f3), "\n")
## Model 3: Attrition ~ .

6.1 Model 1 — attrition ~ MonthlyIncome

A simple baseline using income alone.

m1 <- glm(f1, data = train_df, family = binomial(link = "logit"))

tidy(m1, exponentiate = TRUE, conf.int = TRUE) %>%
  mutate(across(where(is.numeric), ~ round(.x, 5))) %>%
  rename(Term = term, `Odds Ratio` = estimate,
         `p-value` = p.value, `CI 2.5%` = conf.low, `CI 97.5%` = conf.high) %>%
  kable(caption = "Table 2 - Model 1 Coefficients (Odds Ratios)") %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)
Table 2 - Model 1 Coefficients (Odds Ratios)
Term Odds Ratio std.error statistic p-value CI 2.5% CI 97.5%
(Intercept) 0.94331 0.05782 -1.00941 0.31278 0.84222 1.05649
Monthly.Income 0.99999 0.00001 -0.97326 0.33042 0.99998 1.00001
cat("AIC:", round(AIC(m1), 2),
    "| Null deviance:", round(m1$null.deviance, 2),
    "| Residual deviance:", round(m1$deviance, 2))
## AIC: 20611.91 | Null deviance: 20608.86 | Residual deviance: 20607.91

Interpretation: An OR < 1 for Monthly Income means higher earners are less likely to leave.

6.2 Model 2 — attrition ~ MonthlyIncome + Overtime

Adding overtime status to capture work-life balance effects.

m2 <- glm(f2, data = train_df, family = binomial(link = "logit"))

tidy(m2, exponentiate = TRUE, conf.int = TRUE) %>%
  mutate(across(where(is.numeric), ~ round(.x, 5))) %>%
  rename(Term = term, `Odds Ratio` = estimate,
         `p-value` = p.value, `CI 2.5%` = conf.low, `CI 97.5%` = conf.high) %>%
  kable(caption = "Table 3 - Model 2 Coefficients (Odds Ratios)") %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)
Table 3 - Model 2 Coefficients (Odds Ratios)
Term Odds Ratio std.error statistic p-value CI 2.5% CI 97.5%
(Intercept) 0.86879 0.05910 -2.38013 0.01731 0.77373 0.97545
Monthly.Income 0.99999 0.00001 -0.94195 0.34622 0.99998 1.00001
OvertimeYes 1.27740 0.03495 7.00411 0.00000 1.19283 1.36801
cat("AIC:", round(AIC(m2), 2),
    "| Null deviance:", round(m2$null.deviance, 2),
    "| Residual deviance:", round(m2$deviance, 2))
## AIC: 20564.8 | Null deviance: 20608.86 | Residual deviance: 20558.8

Interpretation: The OvertimeYes OR quantifies how much more likely overtime workers are to leave, holding income constant.

6.3 Model 3 — attrition ~ . (All Variables)

The full model uses every available feature.

m3 <- glm(f3, data = train_df, family = binomial(link = "logit"))

tidy(m3, exponentiate = TRUE, conf.int = TRUE) %>%
  mutate(across(where(is.numeric), ~ round(.x, 5))) %>%
  arrange(p.value) %>%
  rename(Term = term, `Odds Ratio` = estimate,
         `p-value` = p.value, `CI 2.5%` = conf.low, `CI 97.5%` = conf.high) %>%
  kable(caption = "Table 4 - Model 3 Coefficients (sorted by p-value)") %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) %>%
  scroll_box(height = "350px")
Table 4 - Model 3 Coefficients (sorted by p-value)
Term Odds Ratio std.error statistic p-value CI 2.5% CI 97.5%
GenderMale 0.56713 0.04180 -13.56905 0.00000 0.52245 0.61547
Years.at.Company 0.98511 0.00239 -6.28506 0.00000 0.98051 0.98972
Work.Life.BalanceFair 3.81484 0.06291 21.28436 0.00000 3.37363 4.31714
Work.Life.BalancePoor 4.92146 0.07577 21.03315 0.00000 4.24443 5.71234
Job.SatisfactionLow 1.86251 0.07056 8.81455 0.00000 1.62229 2.13924
Job.SatisfactionVery High 1.78466 0.05466 10.59696 0.00000 1.60358 1.98679
Performance.RatingBelow Average 1.40718 0.06038 5.65760 0.00000 1.25024 1.58412
Performance.RatingLow 1.69577 0.09523 5.54609 0.00000 1.40764 2.04477
Number.of.Promotions 0.76202 0.02110 -12.88158 0.00000 0.73109 0.79412
OvertimeYes 1.42880 0.04401 8.10771 0.00000 1.31081 1.55765
Distance.from.Home 1.00983 0.00073 13.44129 0.00000 1.00840 1.01128
Education.LevelPhD 0.17016 0.11495 -15.40721 0.00000 0.13552 0.21270
Marital.StatusMarried 0.72122 0.05936 -5.50579 0.00000 0.64202 0.81024
Marital.StatusSingle 4.78559 0.06489 24.12728 0.00000 4.21574 5.43689
Number.of.Dependents 0.87974 0.01345 -9.52394 0.00000 0.85681 0.90321
Job.LevelMid 0.35595 0.04583 -22.53957 0.00000 0.32530 0.38932
Job.LevelSenior 0.06708 0.06585 -41.02980 0.00000 0.05891 0.07626
Remote.WorkYes 0.15801 0.05947 -31.02502 0.00000 0.14052 0.17742
Company.ReputationFair 1.76149 0.08136 6.95908 0.00000 1.50218 2.06650
Company.ReputationPoor 2.07553 0.08090 9.02672 0.00000 1.77165 2.43281
Work.Life.BalanceGood 1.28950 0.05946 4.27633 0.00002 1.14784 1.44913
Age 0.99253 0.00201 -3.73112 0.00019 0.98863 0.99645
Job.RoleMedia 0.78207 0.07226 -3.40176 0.00067 0.67874 0.90100
Company.SizeSmall 1.20528 0.05982 3.12099 0.00180 1.07198 1.35531
Innovation.OpportunitiesYes 0.86968 0.05654 -2.46944 0.01353 0.77837 0.97153
Job.RoleTechnology 0.81662 0.09837 -2.05951 0.03945 0.67337 0.99022
Job.RoleFinance 0.83397 0.09753 -1.86145 0.06268 0.68880 1.00961
Education.LevelBachelor’s Degree 0.90383 0.05543 -1.82407 0.06814 0.81076 1.00754
Job.SatisfactionMedium 1.09886 0.05529 1.70518 0.08816 0.98599 1.22461
(Intercept) 1.31433 0.17194 1.58969 0.11191 0.93831 1.84110
Job.RoleHealthcare 0.88026 0.08542 -1.49309 0.13541 0.74454 1.04066
Education.LevelMaster’s Degree 0.91556 0.06132 -1.43872 0.15023 0.81185 1.03246
Performance.RatingHigh 1.07547 0.05296 1.37371 0.16953 0.96941 1.19312
Education.LevelHigh School 0.92558 0.06184 -1.25058 0.21109 0.81991 1.04483
Monthly.Income 1.00001 0.00002 0.89586 0.37033 0.99998 1.00005
Company.SizeMedium 0.97431 0.05473 -0.47551 0.63443 0.87522 1.08466
Employee.RecognitionLow 1.02014 0.05292 0.37680 0.70632 0.91963 1.13165
Employee.RecognitionVery High 0.97910 0.10328 -0.20454 0.83793 0.79950 1.19859
Company.ReputationGood 1.00566 0.07258 0.07781 0.93798 0.87241 1.15957
Employee.RecognitionMedium 0.99569 0.05568 -0.07751 0.93821 0.89274 1.11052
Company.Tenure 1.00007 0.00091 0.07573 0.93963 0.99828 1.00186
cat("AIC:", round(AIC(m3), 2),
    "| Null deviance:", round(m3$null.deviance, 2),
    "| Residual deviance:", round(m3$deviance, 2))
## AIC: 14347.82 | Null deviance: 20608.86 | Residual deviance: 14265.82

6.4 AIC Comparison

A lower AIC indicates a better balance between goodness-of-fit and model complexity.

aic_tbl <- tibble(
  Model = c("Model 1: MonthlyIncome",
            "Model 2: MonthlyIncome + Overtime",
            "Model 3: All Variables"),
  Parameters = c(length(coef(m1)), length(coef(m2)), length(coef(m3))),
  AIC = round(c(AIC(m1), AIC(m2), AIC(m3)), 2),
  `Residual Deviance` = round(c(m1$deviance, m2$deviance, m3$deviance), 2)
)

aic_tbl %>%
  kable(caption = "Table 5 - AIC and Deviance Comparison") %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) %>%
  row_spec(which.min(aic_tbl$AIC), bold = TRUE, color = "white", background = "#2c7fb8")
Table 5 - AIC and Deviance Comparison
Model Parameters AIC Residual Deviance
Model 1: MonthlyIncome 2 20611.91 20607.91
Model 2: MonthlyIncome + Overtime 3 20564.80 20558.80
Model 3: All Variables 41 14347.82 14265.82

Note: The highlighted row has the lowest AIC. Lower AIC = better model fit without unnecessary complexity.


7 Confusion Matrices on Test Data

7.1 Metric Reference

Metric Formula Meaning
Accuracy (TP + TN) / N Overall correct predictions
Sensitivity TP / (TP + FN) Correctly identified leavers
Specificity TN / (TN + FP) Correctly identified stayers
Precision TP / (TP + FP) Of predicted leavers, truly left
Kappa Agreement beyond chance (0 = random)
predict_class <- function(model, newdata, threshold = 0.5) {
  probs  <- predict(model, newdata = newdata, type = "response")
  labels <- ifelse(probs >= threshold, pos_label, neg_label)
  factor(labels, levels = c(neg_label, pos_label))
}

7.2 Model 1

pred1 <- predict_class(m1, test_df)
cm1   <- confusionMatrix(pred1, test_df[[attr_col]], positive = pos_label)
print(cm1)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Stayed  Left
##     Stayed  31260 28338
##     Left        0     0
##                                           
##                Accuracy : 0.5245          
##                  95% CI : (0.5205, 0.5285)
##     No Information Rate : 0.5245          
##     P-Value [Acc > NIR] : 0.5017          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.0000          
##             Specificity : 1.0000          
##          Pos Pred Value :    NaN          
##          Neg Pred Value : 0.5245          
##              Prevalence : 0.4755          
##          Detection Rate : 0.0000          
##    Detection Prevalence : 0.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : Left            
## 

7.3 Model 2

pred2 <- predict_class(m2, test_df)
cm2   <- confusionMatrix(pred2, test_df[[attr_col]], positive = pos_label)
print(cm2)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Stayed  Left
##     Stayed  21837 18315
##     Left     9423 10023
##                                           
##                Accuracy : 0.5346          
##                  95% CI : (0.5306, 0.5386)
##     No Information Rate : 0.5245          
##     P-Value [Acc > NIR] : 4.349e-07       
##                                           
##                   Kappa : 0.053           
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.3537          
##             Specificity : 0.6986          
##          Pos Pred Value : 0.5154          
##          Neg Pred Value : 0.5439          
##              Prevalence : 0.4755          
##          Detection Rate : 0.1682          
##    Detection Prevalence : 0.3263          
##       Balanced Accuracy : 0.5261          
##                                           
##        'Positive' Class : Left            
## 

7.4 Model 3

pred3 <- predict_class(m3, test_df)
cm3   <- confusionMatrix(pred3, test_df[[attr_col]], positive = pos_label)
print(cm3)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Stayed  Left
##     Stayed  23974  7586
##     Left     7286 20752
##                                          
##                Accuracy : 0.7505         
##                  95% CI : (0.747, 0.7539)
##     No Information Rate : 0.5245         
##     P-Value [Acc > NIR] : < 2e-16        
##                                          
##                   Kappa : 0.4995         
##                                          
##  Mcnemar's Test P-Value : 0.01421        
##                                          
##             Sensitivity : 0.7323         
##             Specificity : 0.7669         
##          Pos Pred Value : 0.7401         
##          Neg Pred Value : 0.7596         
##              Prevalence : 0.4755         
##          Detection Rate : 0.3482         
##    Detection Prevalence : 0.4705         
##       Balanced Accuracy : 0.7496         
##                                          
##        'Positive' Class : Left           
## 

7.5 Confusion Matrix Heatmaps

plot_cm <- function(cm, title) {
  tbl <- as.data.frame(cm$table)
  names(tbl) <- c("Predicted", "Actual", "Freq")
  tbl <- tbl %>%
    group_by(Actual) %>%
    mutate(pct = scales::percent(Freq / sum(Freq), accuracy = 1))

  ggplot(tbl, aes(x = Actual, y = Predicted, fill = Freq)) +
    geom_tile(colour = "white", linewidth = 1) +
    geom_text(aes(label = paste0(Freq, "\n(", pct, ")")),
              size = 4.5, fontface = "bold") +
    scale_fill_gradient(low = "#dfe6e9", high = "#2c7fb8") +
    labs(title = title, x = "Actual", y = "Predicted") +
    theme_minimal(base_size = 12) +
    theme(legend.position = "none",
          plot.title = element_text(face = "bold", hjust = 0.5))
}

grid.arrange(
  plot_cm(cm1, "Model 1\nMonthlyIncome"),
  plot_cm(cm2, "Model 2\nMonthlyIncome + Overtime"),
  plot_cm(cm3, "Model 3\nAll Variables"),
  ncol = 3,
  top = grid::textGrob("Figure 4 - Confusion Matrices (Test Set)",
                       gp = grid::gpar(fontsize = 13, fontface = "bold"))
)


8 ROC Curves & AUC

The ROC curve plots sensitivity against (1 − specificity) across all thresholds. AUC summarises discriminative ability: 0.5 = random, 1.0 = perfect.

prob1 <- predict(m1, test_df, type = "response")
prob2 <- predict(m2, test_df, type = "response")
prob3 <- predict(m3, test_df, type = "response")

roc1 <- roc(test_df[[attr_col]], prob1, levels = c(neg_label, pos_label), quiet = TRUE)
roc2 <- roc(test_df[[attr_col]], prob2, levels = c(neg_label, pos_label), quiet = TRUE)
roc3 <- roc(test_df[[attr_col]], prob3, levels = c(neg_label, pos_label), quiet = TRUE)

roc_df <- bind_rows(
  data.frame(FPR = 1 - roc1$specificities, TPR = roc1$sensitivities,
             Model = sprintf("Model 1 (AUC = %.3f)", auc(roc1))),
  data.frame(FPR = 1 - roc2$specificities, TPR = roc2$sensitivities,
             Model = sprintf("Model 2 (AUC = %.3f)", auc(roc2))),
  data.frame(FPR = 1 - roc3$specificities, TPR = roc3$sensitivities,
             Model = sprintf("Model 3 (AUC = %.3f)", auc(roc3)))
)

ggplot(roc_df, aes(x = FPR, y = TPR, colour = Model)) +
  geom_line(linewidth = 1) +
  geom_abline(linetype = "dashed", colour = "grey50") +
  scale_colour_manual(values = c("#e17055","#0984e3","#00b894")) +
  labs(title = "Figure 5 - ROC Curves (Test Set)",
       x = "False Positive Rate (1 - Specificity)",
       y = "True Positive Rate (Sensitivity)",
       colour = NULL) +
  theme(legend.position = c(0.65, 0.2),
        legend.background = element_rect(fill = "white", colour = "grey80"))

Interpretation: A model with AUC closer to 1.0 better discriminates between employees who stay and those who leave, regardless of the chosen threshold.


9 Summary Comparison

extract_metrics <- function(cm, roc_obj, model_name) {
  tibble(
    Model       = model_name,
    Accuracy    = cm$overall["Accuracy"],
    Sensitivity = cm$byClass["Sensitivity"],
    Specificity = cm$byClass["Specificity"],
    Precision   = cm$byClass["Pos Pred Value"],
    Kappa       = cm$overall["Kappa"],
    AUC         = as.numeric(auc(roc_obj))
  )
}

comparison <- bind_rows(
  extract_metrics(cm1, roc1, "Model 1: MonthlyIncome"),
  extract_metrics(cm2, roc2, "Model 2: MonthlyIncome + Overtime"),
  extract_metrics(cm3, roc3, "Model 3: All Variables")
)

comparison %>%
  mutate(across(where(is.numeric), ~ round(.x, 4))) %>%
  kable(caption = "Table 6 - Full Performance Comparison on Test Set") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"),
                full_width = TRUE) %>%
  row_spec(which.max(comparison$AUC), bold = TRUE,
           color = "white", background = "#2c7fb8")
Table 6 - Full Performance Comparison on Test Set
Model Accuracy Sensitivity Specificity Precision Kappa AUC
Model 1: MonthlyIncome 0.5245 0.0000 1.0000 NaN 0.0000 0.5065
Model 2: MonthlyIncome + Overtime 0.5346 0.3537 0.6986 0.5154 0.0530 0.5300
Model 3: All Variables 0.7505 0.7323 0.7669 0.7401 0.4995 0.8405

10 Conclusions

  1. Model complexity matters. The full model (Model 3) achieves the highest AUC and accuracy, confirming that using all available employee features substantially improves attrition prediction beyond income alone.

  2. Overtime is a key driver. Adding overtime status in Model 2 meaningfully improves on the income-only baseline (lower AIC, improved sensitivity). Employees working overtime are significantly more likely to leave, even after controlling for income.

  3. Sensitivity vs. specificity trade-off. All three models are better at identifying stayers (high specificity) than leavers (lower sensitivity). Lowering the 0.5 threshold would catch more true attrition cases at the cost of more false alarms.

  4. Practical recommendation. For deployment, Model 3 is preferred for predictive accuracy. Model 2 may be sufficient when interpretability and data availability are priorities — it uses only two variables yet captures much of the signal.


11 Session Info

sessionInfo()
## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: Asia/Ulaanbaatar
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] broom_1.0.10     pROC_1.19.0.1    gridExtra_2.3    kableExtra_1.4.0
##  [5] knitr_1.50       caret_7.0-1      lattice_0.22-7   lubridate_1.9.4 
##  [9] forcats_1.0.1    stringr_1.5.2    dplyr_1.1.4      purrr_1.1.0     
## [13] readr_2.1.5      tidyr_1.3.1      tibble_3.3.0     ggplot2_4.0.0   
## [17] tidyverse_2.0.0 
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1     viridisLite_0.4.2    timeDate_4041.110   
##  [4] farver_2.1.2         S7_0.2.0             fastmap_1.2.0       
##  [7] digest_0.6.37        rpart_4.1.24         timechange_0.3.0    
## [10] lifecycle_1.0.4      survival_3.8-3       magrittr_2.0.4      
## [13] compiler_4.5.1       rlang_1.1.6          sass_0.4.10         
## [16] tools_4.5.1          yaml_2.3.10          data.table_1.17.8   
## [19] labeling_0.4.3       plyr_1.8.9           xml2_1.4.0          
## [22] RColorBrewer_1.1-3   withr_3.0.2          nnet_7.3-20         
## [25] grid_4.5.1           stats4_4.5.1         e1071_1.7-17        
## [28] future_1.67.0        globals_0.18.0       scales_1.4.0        
## [31] iterators_1.0.14     MASS_7.3-65          cli_3.6.5           
## [34] rmarkdown_2.30       generics_0.1.4       rstudioapi_0.17.1   
## [37] future.apply_1.20.0  tzdb_0.5.0           reshape2_1.4.5      
## [40] proxy_0.4-29         cachem_1.1.0         splines_4.5.1       
## [43] parallel_4.5.1       vctrs_0.6.5          hardhat_1.4.2       
## [46] Matrix_1.7-4         jsonlite_2.0.0       hms_1.1.3           
## [49] listenv_0.9.1        systemfonts_1.3.2    foreach_1.5.2       
## [52] gower_1.0.2          jquerylib_0.1.4      recipes_1.3.1       
## [55] glue_1.8.0           parallelly_1.45.1    codetools_0.2-20    
## [58] stringi_1.8.7        gtable_0.3.6         pillar_1.11.1       
## [61] htmltools_0.5.8.1    ipred_0.9-15         lava_1.8.1          
## [64] R6_2.6.1             textshaping_1.0.3    evaluate_1.0.5      
## [67] backports_1.5.0      bslib_0.9.0          class_7.3-23        
## [70] Rcpp_1.1.0           svglite_2.2.2        nlme_3.1-168        
## [73] prodlim_2025.04.28   xfun_0.53            ModelMetrics_1.2.2.2
## [76] pkgconfig_2.0.3