1 Prerequisites

pkgs <- c("tidyverse", "caret", "pROC", "knitr", "kableExtra")
new_pkgs <- pkgs[!pkgs %in% installed.packages()[, "Package"]]
if (length(new_pkgs)) install.packages(new_pkgs, repos = "https://cloud.r-project.org")

library(tidyverse)
library(caret)
library(pROC)
library(knitr)
library(kableExtra)

2 Data

The dataset is from Kaggle: Employee Attrition Dataset.
Place train.csv in the same folder as this .Rmd before knitting.

df <- read.csv("train.csv")

cat("Dimensions:", nrow(df), "rows x", ncol(df), "columns\n")
## Dimensions: 59598 rows x 24 columns
cat("Column names:\n")
## Column names:
print(names(df))
##  [1] "Employee.ID"              "Age"                     
##  [3] "Gender"                   "Years.at.Company"        
##  [5] "Job.Role"                 "Monthly.Income"          
##  [7] "Work.Life.Balance"        "Job.Satisfaction"        
##  [9] "Performance.Rating"       "Number.of.Promotions"    
## [11] "Overtime"                 "Distance.from.Home"      
## [13] "Education.Level"          "Marital.Status"          
## [15] "Number.of.Dependents"     "Job.Level"               
## [17] "Company.Size"             "Company.Tenure"          
## [19] "Remote.Work"              "Leadership.Opportunities"
## [21] "Innovation.Opportunities" "Company.Reputation"      
## [23] "Employee.Recognition"     "Attrition"
cat("\nAttrition values:\n")
## 
## Attrition values:
print(unique(df$Attrition))
## [1] "Stayed" "Left"
# Actual column names from CSV:
#   Monthly.Income, Overtime, Attrition (values: "Left" / "Stayed")

df <- df %>%
  mutate(
    Attrition      = factor(Attrition, levels = c("Left", "Stayed")),
    Overtime       = factor(Overtime),
    Monthly.Income = as.numeric(Monthly.Income)
  ) %>%
  mutate(across(where(is.character), as.factor)) %>%
  select(-Employee.ID)   # remove ID column — not useful for modelling

cat("Attrition distribution:\n")
## Attrition distribution:
print(table(df$Attrition))
## 
##   Left Stayed 
##  28338  31260

3 Train / Test Split

Following the HOML reference, we use a 70 / 30 stratified split.

set.seed(2024)

train_idx <- createDataPartition(df$Attrition, p = 0.70, list = FALSE)
train_df  <- df[ train_idx, ]
test_df   <- df[-train_idx, ]

cat("Training set:", nrow(train_df), "rows\n")
## Training set: 41719 rows
cat("Test set    :", nrow(test_df),  "rows\n")
## Test set    : 17879 rows

4 Logistic Regression Models

We estimate three logistic regression models using glm() with family = binomial.

4.1 Model 1 – Monthly Income Only

m1 <- glm(Attrition ~ Monthly.Income,
          data   = train_df,
          family = binomial)
summary(m1)
## 
## Call:
## glm(formula = Attrition ~ Monthly.Income, family = binomial, 
##     data = train_df)
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)  
## (Intercept)    1.735e-02  3.468e-02   0.500   0.6169  
## Monthly.Income 1.106e-05  4.554e-06   2.427   0.0152 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 57735  on 41718  degrees of freedom
## Residual deviance: 57729  on 41717  degrees of freedom
## AIC: 57733
## 
## Number of Fisher Scoring iterations: 3

4.2 Model 2 – Monthly Income + Overtime

m2 <- glm(Attrition ~ Monthly.Income + Overtime,
          data   = train_df,
          family = binomial)
summary(m2)
## 
## Call:
## glm(formula = Attrition ~ Monthly.Income + Overtime, family = binomial, 
##     data = train_df)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     1.022e-01  3.542e-02   2.884  0.00392 ** 
## Monthly.Income  1.105e-05  4.562e-06   2.421  0.01548 *  
## OvertimeYes    -2.572e-01  2.089e-02 -12.311  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 57735  on 41718  degrees of freedom
## Residual deviance: 57577  on 41716  degrees of freedom
## AIC: 57583
## 
## Number of Fisher Scoring iterations: 3

4.3 Model 3 – All Variables

m3 <- glm(Attrition ~ .,
          data   = train_df,
          family = binomial)
summary(m3)
## 
## Call:
## glm(formula = Attrition ~ ., family = binomial, data = train_df)
## 
## Coefficients:
##                                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                      -2.983e-01  1.025e-01  -2.911  0.00361 ** 
## Age                               6.244e-03  1.199e-03   5.209 1.90e-07 ***
## GenderMale                        6.282e-01  2.476e-02  25.373  < 2e-16 ***
## Years.at.Company                  1.229e-02  1.400e-03   8.784  < 2e-16 ***
## Job.RoleFinance                   7.385e-02  5.767e-02   1.281  0.20031    
## Job.RoleHealthcare                1.019e-01  5.057e-02   2.014  0.04396 *  
## Job.RoleMedia                     1.295e-01  4.316e-02   3.000  0.00270 ** 
## Job.RoleTechnology                9.033e-02  5.787e-02   1.561  0.11856    
## Monthly.Income                    6.588e-06  9.835e-06   0.670  0.50294    
## Work.Life.BalanceFair            -1.338e+00  3.728e-02 -35.895  < 2e-16 ***
## Work.Life.BalanceGood            -3.154e-01  3.521e-02  -8.957  < 2e-16 ***
## Work.Life.BalancePoor            -1.554e+00  4.482e-02 -34.673  < 2e-16 ***
## Job.SatisfactionLow              -4.775e-01  4.259e-02 -11.212  < 2e-16 ***
## Job.SatisfactionMedium            3.104e-03  3.244e-02   0.096  0.92377    
## Job.SatisfactionVery High        -4.883e-01  3.226e-02 -15.136  < 2e-16 ***
## Performance.RatingBelow Average  -3.331e-01  3.504e-02  -9.508  < 2e-16 ***
## Performance.RatingHigh            2.252e-02  3.163e-02   0.712  0.47645    
## Performance.RatingLow            -5.984e-01  5.792e-02 -10.331  < 2e-16 ***
## Number.of.Promotions              2.513e-01  1.241e-02  20.248  < 2e-16 ***
## OvertimeYes                      -3.667e-01  2.604e-02 -14.084  < 2e-16 ***
## Distance.from.Home               -9.923e-03  4.341e-04 -22.856  < 2e-16 ***
## Education.LevelBachelor’s Degree -3.722e-02  3.296e-02  -1.129  0.25887    
## Education.LevelHigh School       -2.158e-02  3.666e-02  -0.589  0.55609    
## Education.LevelMaster’s Degree   -4.315e-02  3.634e-02  -1.187  0.23509    
## Education.LevelPhD                1.521e+00  6.534e-02  23.282  < 2e-16 ***
## Marital.StatusMarried             2.757e-01  3.533e-02   7.804 6.02e-15 ***
## Marital.StatusSingle             -1.517e+00  3.839e-02 -39.526  < 2e-16 ***
## Number.of.Dependents              1.611e-01  7.967e-03  20.224  < 2e-16 ***
## Job.LevelMid                      9.803e-01  2.699e-02  36.317  < 2e-16 ***
## Job.LevelSenior                   2.594e+00  3.883e-02  66.795  < 2e-16 ***
## Company.SizeMedium               -1.061e-02  3.233e-02  -0.328  0.74276    
## Company.SizeSmall                -2.152e-01  3.531e-02  -6.096 1.08e-09 ***
## Company.Tenure                    4.434e-04  5.368e-04   0.826  0.40880    
## Remote.WorkYes                    1.744e+00  3.463e-02  50.360  < 2e-16 ***
## Leadership.OpportunitiesYes       1.693e-01  5.659e-02   2.991  0.00278 ** 
## Innovation.OpportunitiesYes       1.684e-01  3.309e-02   5.090 3.59e-07 ***
## Company.ReputationFair           -4.367e-01  4.738e-02  -9.217  < 2e-16 ***
## Company.ReputationGood            8.509e-02  4.228e-02   2.013  0.04415 *  
## Company.ReputationPoor           -7.210e-01  4.746e-02 -15.192  < 2e-16 ***
## Employee.RecognitionLow          -3.523e-02  3.127e-02  -1.127  0.25987    
## Employee.RecognitionMedium       -5.222e-02  3.301e-02  -1.582  0.11361    
## Employee.RecognitionVery High     3.398e-02  6.002e-02   0.566  0.57135    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 57735  on 41718  degrees of freedom
## Residual deviance: 40644  on 41677  degrees of freedom
## AIC: 40728
## 
## Number of Fisher Scoring iterations: 5

5 Confusion Matrices on Test Data

Predicted probabilities are converted to class labels using the default 0.5 threshold.

predict_class <- function(model, newdata, threshold = 0.5) {
  probs <- predict(model, newdata = newdata, type = "response")
  factor(ifelse(probs >= threshold, "Left", "Stayed"),
         levels = c("Left", "Stayed"))
}

pred1 <- predict_class(m1, test_df)
pred2 <- predict_class(m2, test_df)
pred3 <- predict_class(m3, test_df)
plot_cm <- function(pred, truth, title) {
  cm <- confusionMatrix(pred, truth, positive = "Left")

  as.data.frame(cm$table) %>%
    dplyr::rename(Predicted = Prediction, Actual = Reference, Count = Freq) %>%
    group_by(Predicted) %>%
    mutate(Pct = Count / sum(Count) * 100) %>%
    ungroup() %>%
    ggplot(aes(x = Actual, y = Predicted, fill = Count)) +
    geom_tile(color = "white", linewidth = 1.2) +
    geom_text(aes(label = paste0(Count, "\n(", round(Pct, 1), "%)")),
              size = 5, fontface = "bold", color = "white") +
    scale_fill_gradient(low = "#4575b4", high = "#d73027") +
    labs(
      title    = title,
      subtitle = paste0(
        "Accuracy: ",    round(cm$overall["Accuracy"]    * 100, 2), "%  |  ",
        "Sensitivity: ", round(cm$byClass["Sensitivity"] * 100, 2), "%  |  ",
        "Specificity: ", round(cm$byClass["Specificity"] * 100, 2), "%"
      ),
      x = "Actual", y = "Predicted"
    ) +
    theme_minimal(base_size = 14) +
    theme(
      legend.position = "none",
      plot.title      = element_text(face = "bold", hjust = 0.5),
      plot.subtitle   = element_text(hjust = 0.5, color = "grey40"),
      panel.grid      = element_blank()
    )
}

5.1 Model 1 (Monthly Income)

cm1 <- confusionMatrix(pred1, test_df$Attrition, positive = "Left")
print(cm1)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Left Stayed
##     Left   8501   9378
##     Stayed    0      0
##                                           
##                Accuracy : 0.4755          
##                  95% CI : (0.4681, 0.4828)
##     No Information Rate : 0.5245          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.4755          
##          Neg Pred Value :    NaN          
##              Prevalence : 0.4755          
##          Detection Rate : 0.4755          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : Left            
## 
plot_cm(pred1, test_df$Attrition, "Model 1: Attrition ~ Monthly.Income")

5.2 Model 2 (Monthly Income + Overtime)

cm2 <- confusionMatrix(pred2, test_df$Attrition, positive = "Left")
print(cm2)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Left Stayed
##     Left   5586   6552
##     Stayed 2915   2826
##                                           
##                Accuracy : 0.4705          
##                  95% CI : (0.4632, 0.4778)
##     No Information Rate : 0.5245          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : -0.0407         
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.6571          
##             Specificity : 0.3013          
##          Pos Pred Value : 0.4602          
##          Neg Pred Value : 0.4922          
##              Prevalence : 0.4755          
##          Detection Rate : 0.3124          
##    Detection Prevalence : 0.6789          
##       Balanced Accuracy : 0.4792          
##                                           
##        'Positive' Class : Left            
## 
plot_cm(pred2, test_df$Attrition, "Model 2: Attrition ~ Monthly.Income + Overtime")

5.3 Model 3 (All Variables)

cm3 <- confusionMatrix(pred3, test_df$Attrition, positive = "Left")
print(cm3)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Left Stayed
##     Left   2195   7196
##     Stayed 6306   2182
##                                           
##                Accuracy : 0.2448          
##                  95% CI : (0.2385, 0.2512)
##     No Information Rate : 0.5245          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : -0.5066         
##                                           
##  Mcnemar's Test P-Value : 1.998e-14       
##                                           
##             Sensitivity : 0.2582          
##             Specificity : 0.2327          
##          Pos Pred Value : 0.2337          
##          Neg Pred Value : 0.2571          
##              Prevalence : 0.4755          
##          Detection Rate : 0.1228          
##    Detection Prevalence : 0.5253          
##       Balanced Accuracy : 0.2454          
##                                           
##        'Positive' Class : Left            
## 
plot_cm(pred3, test_df$Attrition, "Model 3: Attrition ~ . (All Variables)")

6 Model Comparison

extract_metrics <- function(cm, model_name) {
  tibble(
    Model            = model_name,
    Accuracy         = round(cm$overall["Accuracy"]       * 100, 2),
    Sensitivity      = round(cm$byClass["Sensitivity"]    * 100, 2),
    Specificity      = round(cm$byClass["Specificity"]    * 100, 2),
    `Pos Pred Value` = round(cm$byClass["Pos Pred Value"] * 100, 2),
    `Neg Pred Value` = round(cm$byClass["Neg Pred Value"] * 100, 2),
    Kappa            = round(cm$overall["Kappa"], 4)
  )
}

bind_rows(
  extract_metrics(cm1, "M1: Monthly.Income"),
  extract_metrics(cm2, "M2: + Overtime"),
  extract_metrics(cm3, "M3: All Variables")
) %>%
  kbl(caption = "Performance Metrics on Test Set (30% hold-out)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE) %>%
  row_spec(3, bold = TRUE, color = "white", background = "#2c7bb6")
Performance Metrics on Test Set (30% hold-out)
Model Accuracy Sensitivity Specificity Pos Pred Value Neg Pred Value Kappa
M1: Monthly.Income 47.55 100.00 0.00 47.55 NaN 0.0000
M2: + Overtime 47.05 65.71 30.13 46.02 49.22 -0.0407
M3: All Variables 24.48 25.82 23.27 23.37 25.71 -0.5066

7 ROC Curves

roc1 <- roc(test_df$Attrition, predict(m1, test_df, type = "response"),
            levels = c("Stayed", "Left"), direction = "<")
roc2 <- roc(test_df$Attrition, predict(m2, test_df, type = "response"),
            levels = c("Stayed", "Left"), direction = "<")
roc3 <- roc(test_df$Attrition, predict(m3, test_df, type = "response"),
            levels = c("Stayed", "Left"), direction = "<")

ggroc(list(
  "M1: Monthly.Income" = roc1,
  "M2: + Overtime"     = roc2,
  "M3: All Variables"  = roc3
), linewidth = 1.2) +
  geom_abline(slope = 1, intercept = 1, linetype = "dashed", color = "grey60") +
  scale_color_manual(
    name   = "Model",
    values = c("#e41a1c", "#377eb8", "#4daf4a"),
    labels = c(
      paste0("M1: Monthly.Income  (AUC = ", round(auc(roc1), 3), ")"),
      paste0("M2: + Overtime      (AUC = ", round(auc(roc2), 3), ")"),
      paste0("M3: All Variables   (AUC = ", round(auc(roc3), 3), ")")
    )
  ) +
  labs(
    title    = "ROC Curves - Three Logistic Regression Models",
    subtitle = "Evaluated on test set (30% hold-out)",
    x = "1 - Specificity (False Positive Rate)",
    y = "Sensitivity (True Positive Rate)"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    legend.position = "bottom",
    plot.title      = element_text(face = "bold", hjust = 0.5),
    plot.subtitle   = element_text(hjust = 0.5, color = "grey50")
  )