attrition <- read.csv("train.csv", stringsAsFactors = TRUE)
glimpse(attrition)
## Rows: 59,598
## Columns: 24
## $ Employee.ID <int> 8410, 64756, 30257, 65791, 65026, 24368, 6497…
## $ Age <int> 31, 59, 24, 36, 56, 38, 47, 48, 57, 24, 30, 2…
## $ Gender <fct> Male, Female, Female, Female, Male, Female, M…
## $ Years.at.Company <int> 19, 4, 10, 7, 41, 3, 23, 16, 44, 1, 12, 6, 38…
## $ Job.Role <fct> Education, Media, Healthcare, Education, Educ…
## $ Monthly.Income <int> 5390, 5534, 8159, 3989, 4821, 9977, 3681, 112…
## $ Work.Life.Balance <fct> Excellent, Poor, Good, Good, Fair, Fair, Fair…
## $ Job.Satisfaction <fct> Medium, High, High, High, Very High, High, Hi…
## $ Performance.Rating <fct> Average, Low, Low, High, Average, Below Avera…
## $ Number.of.Promotions <int> 2, 3, 0, 1, 0, 3, 1, 2, 1, 1, 1, 2, 1, 4, 0, …
## $ Overtime <fct> No, No, No, No, Yes, No, Yes, No, Yes, Yes, N…
## $ Distance.from.Home <int> 22, 21, 11, 27, 71, 37, 75, 5, 39, 57, 51, 26…
## $ Education.Level <fct> Associate Degree, Master’s Degree, Bachelor’s…
## $ Marital.Status <fct> Married, Divorced, Married, Single, Divorced,…
## $ Number.of.Dependents <int> 0, 3, 3, 2, 0, 0, 3, 4, 4, 4, 1, 0, 0, 2, 0, …
## $ Job.Level <fct> Mid, Mid, Mid, Mid, Senior, Mid, Entry, Entry…
## $ Company.Size <fct> Medium, Medium, Medium, Small, Medium, Medium…
## $ Company.Tenure <int> 89, 21, 74, 50, 68, 47, 93, 88, 75, 45, 17, 3…
## $ Remote.Work <fct> No, No, No, Yes, No, No, No, No, No, No, No, …
## $ Leadership.Opportunities <fct> No, No, No, No, No, No, No, No, No, No, No, N…
## $ Innovation.Opportunities <fct> No, No, No, No, No, Yes, No, No, No, Yes, No,…
## $ Company.Reputation <fct> Excellent, Fair, Poor, Good, Fair, Fair, Good…
## $ Employee.Recognition <fct> Medium, Low, Low, Medium, Medium, High, Mediu…
## $ Attrition <fct> Stayed, Stayed, Stayed, Stayed, Stayed, Left,…
attrition <- read.csv("train.csv", stringsAsFactors = TRUE)
# Drop the ID column
attrition <- attrition %>% select(-Employee.ID)
# Set "Left" as the positive class
attrition$Attrition <- relevel(attrition$Attrition, ref = "Stayed")
glimpse(attrition)
## Rows: 59,598
## Columns: 23
## $ Age <int> 31, 59, 24, 36, 56, 38, 47, 48, 57, 24, 30, 2…
## $ Gender <fct> Male, Female, Female, Female, Male, Female, M…
## $ Years.at.Company <int> 19, 4, 10, 7, 41, 3, 23, 16, 44, 1, 12, 6, 38…
## $ Job.Role <fct> Education, Media, Healthcare, Education, Educ…
## $ Monthly.Income <int> 5390, 5534, 8159, 3989, 4821, 9977, 3681, 112…
## $ Work.Life.Balance <fct> Excellent, Poor, Good, Good, Fair, Fair, Fair…
## $ Job.Satisfaction <fct> Medium, High, High, High, Very High, High, Hi…
## $ Performance.Rating <fct> Average, Low, Low, High, Average, Below Avera…
## $ Number.of.Promotions <int> 2, 3, 0, 1, 0, 3, 1, 2, 1, 1, 1, 2, 1, 4, 0, …
## $ Overtime <fct> No, No, No, No, Yes, No, Yes, No, Yes, Yes, N…
## $ Distance.from.Home <int> 22, 21, 11, 27, 71, 37, 75, 5, 39, 57, 51, 26…
## $ Education.Level <fct> Associate Degree, Master’s Degree, Bachelor’s…
## $ Marital.Status <fct> Married, Divorced, Married, Single, Divorced,…
## $ Number.of.Dependents <int> 0, 3, 3, 2, 0, 0, 3, 4, 4, 4, 1, 0, 0, 2, 0, …
## $ Job.Level <fct> Mid, Mid, Mid, Mid, Senior, Mid, Entry, Entry…
## $ Company.Size <fct> Medium, Medium, Medium, Small, Medium, Medium…
## $ Company.Tenure <int> 89, 21, 74, 50, 68, 47, 93, 88, 75, 45, 17, 3…
## $ Remote.Work <fct> No, No, No, Yes, No, No, No, No, No, No, No, …
## $ Leadership.Opportunities <fct> No, No, No, No, No, No, No, No, No, No, No, N…
## $ Innovation.Opportunities <fct> No, No, No, No, No, Yes, No, No, No, Yes, No,…
## $ Company.Reputation <fct> Excellent, Fair, Poor, Good, Fair, Fair, Good…
## $ Employee.Recognition <fct> Medium, Low, Low, Medium, Medium, High, Mediu…
## $ Attrition <fct> Stayed, Stayed, Stayed, Stayed, Stayed, Left,…
set.seed(123)
split <- initial_split(attrition, prop = 0.7, strata = "Attrition")
train_df <- training(split)
test_df <- testing(split)
cat("Training set:", nrow(train_df), "rows\n")
## Training set: 41718 rows
cat("Test set: ", nrow(test_df), "rows\n")
## Test set: 17880 rows
lr_model <- logistic_reg() %>%
set_engine("glm") %>%
set_mode("classification")
model1 <- lr_model %>%
fit(Attrition ~ Monthly.Income, data = train_df)
tidy(model1)
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -0.00840 0.0347 -0.242 0.808
## 2 Monthly.Income -0.0000123 0.00000456 -2.70 0.00692
model2 <- lr_model %>%
fit(Attrition ~ Monthly.Income + Overtime, data = train_df)
tidy(model2)
## # A tibble: 3 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -0.0810 0.0354 -2.29 2.23e- 2
## 2 Monthly.Income -0.0000119 0.00000456 -2.60 9.32e- 3
## 3 OvertimeYes 0.211 0.0209 10.1 5.80e-24
model3 <- lr_model %>%
fit(Attrition ~ ., data = train_df)
tidy(model3)
## # A tibble: 42 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 0.269 0.103 2.62 8.77e- 3
## 2 Age -0.00694 0.00120 -5.79 7.17e- 9
## 3 GenderMale -0.628 0.0248 -25.3 3.66e-141
## 4 Years.at.Company -0.0134 0.00140 -9.56 1.20e- 21
## 5 Job.RoleFinance -0.136 0.0579 -2.34 1.92e- 2
## 6 Job.RoleHealthcare -0.107 0.0506 -2.10 3.54e- 2
## 7 Job.RoleMedia -0.0866 0.0430 -2.01 4.40e- 2
## 8 Job.RoleTechnology -0.121 0.0579 -2.09 3.65e- 2
## 9 Monthly.Income -0.00000143 0.00000986 -0.145 8.85e- 1
## 10 Work.Life.BalanceFair 1.32 0.0374 35.4 3.92e-274
## # ℹ 32 more rows
make_conf_matrix <- function(model, test_data, label) {
preds <- predict(model, new_data = test_data, type = "class") %>%
bind_cols(test_data %>% select(Attrition))
cm <- conf_mat(preds, truth = Attrition, estimate = .pred_class)
acc <- accuracy(preds, truth = Attrition, estimate = .pred_class)
cat("\n===========================================\n")
cat(label, "\n")
cat("===========================================\n")
print(cm)
cat("\nAccuracy:", round(acc$.estimate, 4), "\n")
return(cm)
}
cm1 <- make_conf_matrix(model1, test_df, "Model 1: Attrition ~ Monthly.Income")
##
## ===========================================
## Model 1: Attrition ~ Monthly.Income
## ===========================================
## Truth
## Prediction Stayed Left
## Stayed 9378 8502
## Left 0 0
##
## Accuracy: 0.5245
cm2 <- make_conf_matrix(model2, test_df, "Model 2: Attrition ~ Monthly.Income + Overtime")
##
## ===========================================
## Model 2: Attrition ~ Monthly.Income + Overtime
## ===========================================
## Truth
## Prediction Stayed Left
## Stayed 6761 5587
## Left 2617 2915
##
## Accuracy: 0.5412
cm3 <- make_conf_matrix(model3, test_df, "Model 3: Attrition ~ . (all variables)")
##
## ===========================================
## Model 3: Attrition ~ . (all variables)
## ===========================================
## Truth
## Prediction Stayed Left
## Stayed 7194 2221
## Left 2184 6281
##
## Accuracy: 0.7536
autoplot(cm3, type = "heatmap") +
scale_fill_gradient(low = "white", high = "steelblue") +
labs(title = "Confusion Matrix — Model 3 (all variables)")
## 5. Model Comparison
get_metrics <- function(model, test_data, model_name) {
preds <- predict(model, new_data = test_data, type = "class") %>%
bind_cols(test_data %>% select(Attrition))
tibble(
Model = model_name,
Accuracy = accuracy(preds, truth = Attrition, estimate = .pred_class)$.estimate,
Sensitivity = sens(preds, truth = Attrition, estimate = .pred_class)$.estimate,
Specificity = spec(preds, truth = Attrition, estimate = .pred_class)$.estimate
)
}
comparison <- bind_rows(
get_metrics(model1, test_df, "Model 1: Monthly.Income"),
get_metrics(model2, test_df, "Model 2: Monthly.Income + Overtime"),
get_metrics(model3, test_df, "Model 3: All variables")
)
knitr::kable(comparison, digits = 4, caption = "Logistic Regression Model Comparison")
| Model | Accuracy | Sensitivity | Specificity |
|---|---|---|---|
| Model 1: Monthly.Income | 0.5245 | 1.0000 | 0.0000 |
| Model 2: Monthly.Income + Overtime | 0.5412 | 0.7209 | 0.3429 |
| Model 3: All variables | 0.7536 | 0.7671 | 0.7388 |
The full model (Model 3) substantially outperforms the simpler models, achieving roughly 75% accuracy on the test set. Model 1 (Monthly.Income alone) and Model 2 (adding Overtime) capture only a fraction of the signal, because employee attrition is driven by many interacting factors — job role, work-life balance, satisfaction scores, tenure, and recognition, not just income.
The Model 3 confusion matrix is well-balanced: the model is roughly equally good at identifying employees who stayed (specificity ≈ 77%) and those who left (sensitivity ≈ 74%). For an HR application this is the right tradeoff — you don’t want a model that only catches obvious leavers while missing borderline cases.