1. Data Preparation

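We load the data from `testa.csv`. Before modeling, categorical variables must be stored as factors, and the target variable `Attrition` must have its levels ordered so that "Stayed" is the reference level and "Left" is the event whose probability the models estimate.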

library(tidyverse)
library(rsample)   
library(caret)     

# Load the raw data; the relative path means 'testa.csv' must sit in the
# working directory.
df <- read_csv("testa.csv")

# Clean the column name, drop `Employee ID` so that a `~ .` formula cannot pick
# it up as a predictor, and fix the variable types before splitting.
df_clean <- df %>%
  rename(MonthlyIncome = `Monthly Income`) %>%
  select(-`Employee ID`) %>%
  mutate(
    # "Stayed" comes first, so it is the reference level and a binomial glm()
    # models the probability of "Left".
    Attrition = factor(Attrition, levels = c("Stayed", "Left")),
    Overtime = as.factor(Overtime),
    # Convert every remaining text column to a factor *before* the split so the
    # training and test sets share one set of levels. Left as character, the
    # levels would be derived from the training rows only and predict() would
    # fail on any test-set value that was not sampled into training.
    across(where(is.character), as.factor)
  )

# Reproducible 70/30 split, stratified on the outcome.
set.seed(123)
split <- initial_split(df_clean, prop = 0.7, strata = "Attrition")
train_data <- training(split)
test_data  <- testing(split)

2. Model Estimation

We estimate three models with increasing complexity to see how adding predictors improves our ability to predict attrition.

# Model 1 -- monthly income is the only predictor
m1 <- glm(
  formula = Attrition ~ MonthlyIncome,
  family  = "binomial",
  data    = train_data
)

# Model 2 -- monthly income plus overtime status
m2 <- glm(
  formula = Attrition ~ MonthlyIncome + Overtime,
  family  = "binomial",
  data    = train_data
)

# Model 3 -- every remaining column in the training data (`.` expands to all
# variables other than the response)
m3 <- glm(
  formula = Attrition ~ .,
  family  = "binomial",
  data    = train_data
)

3. Confusion Matrices

The confusion matrix compares the model’s predictions against the actual outcomes in the test set.

# Evaluation function: score a fitted model on held-out data and print a
# labelled confusion matrix (predicted class vs. actual `Attrition`).
#
# Args:
#   model:     Fitted model for which predict(..., type = "response") returns
#              the probability of the "Left" class (e.g. a binomial glm).
#   label:     Heading printed above the table.
#   data:      Data frame to score; must contain an `Attrition` factor with
#              levels "Stayed" and "Left". Defaults to the global `test_data`.
#   threshold: Probability cut-off; observations with a predicted probability
#              strictly above it are classified as "Left". Defaults to 0.5.
#
# Returns:
#   The confusion table (rows = prediction, columns = reference), invisibly.
get_results <- function(model, label, data = test_data, threshold = 0.5) {
  stopifnot(
    is.numeric(threshold),
    length(threshold) == 1L,
    threshold >= 0,
    threshold <= 1
  )

  probs <- predict(model, newdata = data, type = "response")

  # Fix the level set explicitly so the table keeps both rows even when the
  # model never predicts one of the classes.
  preds <- factor(
    ifelse(probs > threshold, "Left", "Stayed"),
    levels = c("Stayed", "Left")
  )

  cat("\n---", label, "---\n")
  cm_table <- confusionMatrix(preds, data$Attrition, positive = "Left")$table
  print(cm_table)
  invisible(cm_table)
}

# Model 1: the "Left" prediction row is all zeros, i.e. every test observation
# is classified as "Stayed" (2361 of 4471 correct, about 52.8%).
get_results(m1, "Model 1: Monthly Income")
## 
## --- Model 1: Monthly Income ---
##           Reference
## Prediction Stayed Left
##     Stayed   2361 2110
##     Left        0    0
# Model 2: some observations are now classified as "Left"
# (1657 + 748 = 2405 of 4471 correct, about 53.8%).
get_results(m2, "Model 2: Monthly Income + Overtime")
## 
## --- Model 2: Monthly Income + Overtime ---
##           Reference
## Prediction Stayed Left
##     Stayed   1657 1362
##     Left      704  748
# Model 3: most observations fall on the diagonal
# (1795 + 1570 = 3365 of 4471 correct, about 75.3%).
get_results(m3, "Model 3: All Variables")
## 
## --- Model 3: All Variables ---
##           Reference
## Prediction Stayed Left
##     Stayed   1795  540
##     Left      566 1570