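We load the data from testa.csv. Categorical variables must be
stored as factors, and the target variable Attrition must have its levels
ordered as "Stayed" then "Left", so that the logistic regressions below model the probability of leaving.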
library(tidyverse)
library(rsample)
library(caret)
# Load the raw employee data from 'testa.csv'.
df <- read_csv("testa.csv")

# Clean the data: give the income column a syntactic name, drop the
# identifier column so it is not picked up by formulas such as
# `Attrition ~ .`, and make sure every categorical column is a factor.
df_clean <- df %>%
  rename(MonthlyIncome = `Monthly Income`) %>%
  select(-`Employee ID`) %>%
  mutate(
    # "Stayed" is the first (reference) level, so binomial models fitted to
    # Attrition estimate the probability of "Left".
    Attrition = factor(Attrition, levels = c("Stayed", "Left")),
    Overtime = as.factor(Overtime)
  ) %>%
  # Convert the remaining text columns on the full data set, before the
  # split, so the training and test sets share identical factor levels.
  # Otherwise a category seen only in the test set breaks predict().
  mutate(across(where(is.character), as.factor))

# Stratified 70/30 train/test split; the seed makes the split reproducible.
set.seed(123)
split <- initial_split(df_clean, prop = 0.7, strata = "Attrition")
train_data <- training(split)
test_data <- testing(split)
We estimate three logistic regression models of increasing complexity to see whether adding predictors improves our ability to predict attrition.
# All three models are logistic regressions fitted on the training set only.

# Model 1: monthly income as the only predictor
m1 <- glm(
  Attrition ~ MonthlyIncome,
  family = binomial(link = "logit"),
  data = train_data
)

# Model 2: monthly income plus overtime status
m2 <- glm(
  Attrition ~ MonthlyIncome + Overtime,
  family = binomial(link = "logit"),
  data = train_data
)

# Model 3: every remaining column of the training data as a predictor
m3 <- glm(
  Attrition ~ .,
  family = binomial(link = "logit"),
  data = train_data
)
The confusion matrix compares each model's predictions (rows) against the actual outcomes (columns) in the test set. An employee is predicted to have left when the fitted probability exceeds 0.5.
# Evaluation function: print a labelled confusion matrix for a fitted model.
#
# Arguments:
#   model     Fitted model; predict(model, type = "response") is assumed to
#             return the predicted probability of "Left".
#   label     Heading printed above the confusion matrix.
#   data      Data frame to score. It must contain the observed `Attrition`
#             factor with levels "Stayed" and "Left". Defaults to the global
#             `test_data`, which keeps existing two-argument calls working.
#   threshold Probability cutoff; rows with a predicted probability strictly
#             above it are classified as "Left". Defaults to 0.5.
#
# Returns the confusion table (predictions in rows, observed outcomes in
# columns) invisibly, so it can be captured without being printed twice.
get_results <- function(model, label, data = test_data, threshold = 0.5) {
  stopifnot(
    is.numeric(threshold),
    length(threshold) == 1L,
    threshold >= 0,
    threshold <= 1
  )

  probs <- predict(model, newdata = data, type = "response")
  # Fix the level order so the table always has both classes, even when the
  # model never predicts one of them.
  preds <- factor(
    ifelse(probs > threshold, "Left", "Stayed"),
    levels = c("Stayed", "Left")
  )

  cat("\n---", label, "---\n")
  cm_table <- confusionMatrix(preds, data$Attrition, positive = "Left")$table
  print(cm_table)
  invisible(cm_table)
}
# Score the income-only model on the held-out test set. In the table below it
# never predicts "Left": all 4471 test rows are classified as "Stayed".
get_results(m1, "Model 1: Monthly Income")
##
## --- Model 1: Monthly Income ---
## Reference
## Prediction Stayed Left
## Stayed 2361 2110
## Left 0 0
# Score the income + overtime model on the held-out test set. In the table
# below, 1657 + 748 of the 4471 test rows are classified correctly (about 54%).
get_results(m2, "Model 2: Monthly Income + Overtime")
##
## --- Model 2: Monthly Income + Overtime ---
## Reference
## Prediction Stayed Left
## Stayed 1657 1362
## Left 704 748
# Score the all-variables model on the held-out test set. In the table below,
# 1795 + 1570 of the 4471 test rows are classified correctly (about 75%).
get_results(m3, "Model 3: All Variables")
##
## --- Model 3: All Variables ---
## Reference
## Prediction Stayed Left
## Stayed 1795 540
## Left 566 1570