Employee attrition classification dataset on kaggle

attrition <- read.csv("train.csv", stringsAsFactors = TRUE)
glimpse(attrition)

## Rows: 59,598
## Columns: 24
## $ Employee.ID              <int> 8410, 64756, 30257, 65791, 65026, 24368, 6497…
## $ Age                      <int> 31, 59, 24, 36, 56, 38, 47, 48, 57, 24, 30, 2…
## $ Gender                   <fct> Male, Female, Female, Female, Male, Female, M…
## $ Years.at.Company         <int> 19, 4, 10, 7, 41, 3, 23, 16, 44, 1, 12, 6, 38…
## $ Job.Role                 <fct> Education, Media, Healthcare, Education, Educ…
## $ Monthly.Income           <int> 5390, 5534, 8159, 3989, 4821, 9977, 3681, 112…
## $ Work.Life.Balance        <fct> Excellent, Poor, Good, Good, Fair, Fair, Fair…
## $ Job.Satisfaction         <fct> Medium, High, High, High, Very High, High, Hi…
## $ Performance.Rating       <fct> Average, Low, Low, High, Average, Below Avera…
## $ Number.of.Promotions     <int> 2, 3, 0, 1, 0, 3, 1, 2, 1, 1, 1, 2, 1, 4, 0, …
## $ Overtime                 <fct> No, No, No, No, Yes, No, Yes, No, Yes, Yes, N…
## $ Distance.from.Home       <int> 22, 21, 11, 27, 71, 37, 75, 5, 39, 57, 51, 26…
## $ Education.Level          <fct> Associate Degree, Master’s Degree, Bachelor’s…
## $ Marital.Status           <fct> Married, Divorced, Married, Single, Divorced,…
## $ Number.of.Dependents     <int> 0, 3, 3, 2, 0, 0, 3, 4, 4, 4, 1, 0, 0, 2, 0, …
## $ Job.Level                <fct> Mid, Mid, Mid, Mid, Senior, Mid, Entry, Entry…
## $ Company.Size             <fct> Medium, Medium, Medium, Small, Medium, Medium…
## $ Company.Tenure           <int> 89, 21, 74, 50, 68, 47, 93, 88, 75, 45, 17, 3…
## $ Remote.Work              <fct> No, No, No, Yes, No, No, No, No, No, No, No, …
## $ Leadership.Opportunities <fct> No, No, No, No, No, No, No, No, No, No, No, N…
## $ Innovation.Opportunities <fct> No, No, No, No, No, Yes, No, No, No, Yes, No,…
## $ Company.Reputation       <fct> Excellent, Fair, Poor, Good, Fair, Fair, Good…
## $ Employee.Recognition     <fct> Medium, Low, Low, Medium, Medium, High, Mediu…
## $ Attrition                <fct> Stayed, Stayed, Stayed, Stayed, Stayed, Left,…

attrition <- read.csv("train.csv", stringsAsFactors = TRUE)

# Drop the ID column
attrition <- attrition %>% select(-Employee.ID)

# Set "Left" as the positive class 
attrition$Attrition <- relevel(attrition$Attrition, ref = "Stayed")

glimpse(attrition)

## Rows: 59,598
## Columns: 23
## $ Age                      <int> 31, 59, 24, 36, 56, 38, 47, 48, 57, 24, 30, 2…
## $ Gender                   <fct> Male, Female, Female, Female, Male, Female, M…
## $ Years.at.Company         <int> 19, 4, 10, 7, 41, 3, 23, 16, 44, 1, 12, 6, 38…
## $ Job.Role                 <fct> Education, Media, Healthcare, Education, Educ…
## $ Monthly.Income           <int> 5390, 5534, 8159, 3989, 4821, 9977, 3681, 112…
## $ Work.Life.Balance        <fct> Excellent, Poor, Good, Good, Fair, Fair, Fair…
## $ Job.Satisfaction         <fct> Medium, High, High, High, Very High, High, Hi…
## $ Performance.Rating       <fct> Average, Low, Low, High, Average, Below Avera…
## $ Number.of.Promotions     <int> 2, 3, 0, 1, 0, 3, 1, 2, 1, 1, 1, 2, 1, 4, 0, …
## $ Overtime                 <fct> No, No, No, No, Yes, No, Yes, No, Yes, Yes, N…
## $ Distance.from.Home       <int> 22, 21, 11, 27, 71, 37, 75, 5, 39, 57, 51, 26…
## $ Education.Level          <fct> Associate Degree, Master’s Degree, Bachelor’s…
## $ Marital.Status           <fct> Married, Divorced, Married, Single, Divorced,…
## $ Number.of.Dependents     <int> 0, 3, 3, 2, 0, 0, 3, 4, 4, 4, 1, 0, 0, 2, 0, …
## $ Job.Level                <fct> Mid, Mid, Mid, Mid, Senior, Mid, Entry, Entry…
## $ Company.Size             <fct> Medium, Medium, Medium, Small, Medium, Medium…
## $ Company.Tenure           <int> 89, 21, 74, 50, 68, 47, 93, 88, 75, 45, 17, 3…
## $ Remote.Work              <fct> No, No, No, Yes, No, No, No, No, No, No, No, …
## $ Leadership.Opportunities <fct> No, No, No, No, No, No, No, No, No, No, No, N…
## $ Innovation.Opportunities <fct> No, No, No, No, No, Yes, No, No, No, Yes, No,…
## $ Company.Reputation       <fct> Excellent, Fair, Poor, Good, Fair, Fair, Good…
## $ Employee.Recognition     <fct> Medium, Low, Low, Medium, Medium, High, Mediu…
## $ Attrition                <fct> Stayed, Stayed, Stayed, Stayed, Stayed, Left,…

1. Train/Test Split

set.seed(123)
split    <- initial_split(attrition, prop = 0.7, strata = "Attrition")
train_df <- training(split)
test_df  <- testing(split)

cat("Training set:", nrow(train_df), "rows\n")

## Training set: 41718 rows

cat("Test set:    ", nrow(test_df),  "rows\n")

## Test set:     17880 rows

2. Fit Three Logistic Regression Models

lr_model <- logistic_reg() %>%
  set_engine("glm") %>%
  set_mode("classification")

Model 1: Attrition ~ Monthly.Income

model1 <- lr_model %>%
  fit(Attrition ~ Monthly.Income, data = train_df)

tidy(model1)

## # A tibble: 2 × 5
##   term             estimate  std.error statistic p.value
##   <chr>               <dbl>      <dbl>     <dbl>   <dbl>
## 1 (Intercept)    -0.00840   0.0347        -0.242 0.808  
## 2 Monthly.Income -0.0000123 0.00000456    -2.70  0.00692

Model 2: Attrition ~ Monthly.Income + Overtime

model2 <- lr_model %>%
  fit(Attrition ~ Monthly.Income + Overtime, data = train_df)

tidy(model2)

## # A tibble: 3 × 5
##   term             estimate  std.error statistic  p.value
##   <chr>               <dbl>      <dbl>     <dbl>    <dbl>
## 1 (Intercept)    -0.0810    0.0354         -2.29 2.23e- 2
## 2 Monthly.Income -0.0000119 0.00000456     -2.60 9.32e- 3
## 3 OvertimeYes     0.211     0.0209         10.1  5.80e-24

Model 3: Attrition ~ (all variables)

model3 <- lr_model %>%
  fit(Attrition ~ ., data = train_df)

tidy(model3)

## # A tibble: 42 × 5
##    term                     estimate  std.error statistic   p.value
##    <chr>                       <dbl>      <dbl>     <dbl>     <dbl>
##  1 (Intercept)            0.269      0.103          2.62  8.77e-  3
##  2 Age                   -0.00694    0.00120       -5.79  7.17e-  9
##  3 GenderMale            -0.628      0.0248       -25.3   3.66e-141
##  4 Years.at.Company      -0.0134     0.00140       -9.56  1.20e- 21
##  5 Job.RoleFinance       -0.136      0.0579        -2.34  1.92e-  2
##  6 Job.RoleHealthcare    -0.107      0.0506        -2.10  3.54e-  2
##  7 Job.RoleMedia         -0.0866     0.0430        -2.01  4.40e-  2
##  8 Job.RoleTechnology    -0.121      0.0579        -2.09  3.65e-  2
##  9 Monthly.Income        -0.00000143 0.00000986    -0.145 8.85e-  1
## 10 Work.Life.BalanceFair  1.32       0.0374        35.4   3.92e-274
## # ℹ 32 more rows

3. Confusion Matrices on Test Data

make_conf_matrix <- function(model, test_data, label) {
  preds <- predict(model, new_data = test_data, type = "class") %>%
    bind_cols(test_data %>% select(Attrition))
  
  cm  <- conf_mat(preds, truth = Attrition, estimate = .pred_class)
  acc <- accuracy(preds, truth = Attrition, estimate = .pred_class)
  
  cat("\n===========================================\n")
  cat(label, "\n")
  cat("===========================================\n")
  print(cm)
  cat("\nAccuracy:", round(acc$.estimate, 4), "\n")
  
  return(cm)
}

cm1 <- make_conf_matrix(model1, test_df, "Model 1: Attrition ~ Monthly.Income")

## 
## ===========================================
## Model 1: Attrition ~ Monthly.Income 
## ===========================================
##           Truth
## Prediction Stayed Left
##     Stayed   9378 8502
##     Left        0    0
## 
## Accuracy: 0.5245

cm2 <- make_conf_matrix(model2, test_df, "Model 2: Attrition ~ Monthly.Income + Overtime")

## 
## ===========================================
## Model 2: Attrition ~ Monthly.Income + Overtime 
## ===========================================
##           Truth
## Prediction Stayed Left
##     Stayed   6761 5587
##     Left     2617 2915
## 
## Accuracy: 0.5412

cm3 <- make_conf_matrix(model3, test_df, "Model 3: Attrition ~ . (all variables)")

## 
## ===========================================
## Model 3: Attrition ~ . (all variables) 
## ===========================================
##           Truth
## Prediction Stayed Left
##     Stayed   7194 2221
##     Left     2184 6281
## 
## Accuracy: 0.7536

4. Visualize the Best Model’s Confusion Matrix

autoplot(cm3, type = "heatmap") +
  scale_fill_gradient(low = "white", high = "steelblue") +
  labs(title = "Confusion Matrix — Model 3 (all variables)")

5. Model Comparison

## 5. Model Comparison

get_metrics <- function(model, test_data, model_name) {
  preds <- predict(model, new_data = test_data, type = "class") %>%
    bind_cols(test_data %>% select(Attrition))
  
  tibble(
    Model       = model_name,
    Accuracy    = accuracy(preds, truth = Attrition, estimate = .pred_class)$.estimate,
    Sensitivity = sens(preds,     truth = Attrition, estimate = .pred_class)$.estimate,
    Specificity = spec(preds,     truth = Attrition, estimate = .pred_class)$.estimate
  )
}

comparison <- bind_rows(
  get_metrics(model1, test_df, "Model 1: Monthly.Income"),
  get_metrics(model2, test_df, "Model 2: Monthly.Income + Overtime"),
  get_metrics(model3, test_df, "Model 3: All variables")
)

knitr::kable(comparison, digits = 4, caption = "Logistic Regression Model Comparison")

Logistic Regression Model Comparison
Model	Accuracy	Sensitivity	Specificity
Model 1: Monthly.Income	0.5245	1.0000	0.0000
Model 2: Monthly.Income + Overtime	0.5412	0.7209	0.3429
Model 3: All variables	0.7536	0.7671	0.7388

Interpretation

The full model (Model 3) substantially outperforms the simpler models, achieving roughly 75% accuracy on the test set. Model 1 (Monthly.Income alone) and Model 2 (adding Overtime) capture only a fraction of the signal, because employee attrition is driven by many interacting factors — job role, work-life balance, satisfaction scores, tenure, and recognition, not just income.

The Model 3 confusion matrix is well-balanced: the model is roughly equally good at identifying employees who stayed (specificity ≈ 77%) and those who left (sensitivity ≈ 74%). For an HR application this is the right tradeoff — you don’t want a model that only catches obvious leavers while missing borderline cases.