library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(caret)

## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

Step 1 — Load Data

# Read the training and hold-out samples from CSV files located in the
# current working directory.
train <- read.csv(file = "train.csv")
test  <- read.csv(file = "test.csv")

# Report the number of rows and columns of each sample.
cat("Train dimensions:", nrow(train), ncol(train), "\n")

## Train dimensions: 59598 24

cat("Test dimensions: ", nrow(test), ncol(test), "\n")

## Test dimensions:  14900 24

# Preview the first six rows of the training sample.
head(x = train, n = 6L)

##   Employee.ID Age Gender Years.at.Company   Job.Role Monthly.Income
## 1        8410  31   Male               19  Education           5390
## 2       64756  59 Female                4      Media           5534
## 3       30257  24 Female               10 Healthcare           8159
## 4       65791  36 Female                7  Education           3989
## 5       65026  56   Male               41  Education           4821
## 6       24368  38 Female                3 Technology           9977
##   Work.Life.Balance Job.Satisfaction Performance.Rating Number.of.Promotions
## 1         Excellent           Medium            Average                    2
## 2              Poor             High                Low                    3
## 3              Good             High                Low                    0
## 4              Good             High               High                    1
## 5              Fair        Very High            Average                    0
## 6              Fair             High      Below Average                    3
##   Overtime Distance.from.Home   Education.Level Marital.Status
## 1       No                 22  Associate Degree        Married
## 2       No                 21   Master’s Degree       Divorced
## 3       No                 11 Bachelor’s Degree        Married
## 4       No                 27       High School         Single
## 5      Yes                 71       High School       Divorced
## 6       No                 37 Bachelor’s Degree        Married
##   Number.of.Dependents Job.Level Company.Size Company.Tenure Remote.Work
## 1                    0       Mid       Medium             89          No
## 2                    3       Mid       Medium             21          No
## 3                    3       Mid       Medium             74          No
## 4                    2       Mid        Small             50         Yes
## 5                    0    Senior       Medium             68          No
## 6                    0       Mid       Medium             47          No
##   Leadership.Opportunities Innovation.Opportunities Company.Reputation
## 1                       No                       No          Excellent
## 2                       No                       No               Fair
## 3                       No                       No               Poor
## 4                       No                       No               Good
## 5                       No                       No               Fair
## 6                       No                      Yes               Fair
##   Employee.Recognition Attrition
## 1               Medium    Stayed
## 2                  Low    Stayed
## 3                  Low    Stayed
## 4               Medium    Stayed
## 5               Medium    Stayed
## 6                 High      Left

Step 2 — Prepare Data

# Encode the outcome as a factor in both samples.
train[["Attrition"]] <- as.factor(train[["Attrition"]])
test[["Attrition"]]  <- as.factor(test[["Attrition"]])

# Convert every remaining text column to a factor so glm() treats it as a
# categorical predictor.
train <- mutate(train, across(where(is.character), as.factor))
test  <- mutate(test, across(where(is.character), as.factor))

# Remove the row identifier so it is not picked up by the `Attrition ~ .`
# formula as a predictor.
train[["Employee.ID"]] <- NULL
test[["Employee.ID"]]  <- NULL

# Class balance of the outcome in the training sample.
cat("Attrition distribution in train:\n")

## Attrition distribution in train:

table(train[["Attrition"]])

## 
##   Left Stayed 
##  28338  31260

# Class balance of the outcome in the test sample.
cat("\nAttrition distribution in test:\n")

## 
## Attrition distribution in test:

table(test[["Attrition"]])

## 
##   Left Stayed 
##   7032   7868

## 
##   Left Stayed 
##   7032   7868

Step 3 — Logistic Regression Models

# Three logistic regressions of increasing size, all fitted on the training
# sample. With a factor response, glm() treats the first level ("Left") as
# failure, so each model estimates the probability of "Stayed".

# Model 1: income only.
model1 <- glm(
  formula = Attrition ~ Monthly.Income,
  family  = binomial(link = "logit"),
  data    = train
)

# Model 2: income plus the overtime indicator.
model2 <- glm(
  formula = Attrition ~ Monthly.Income + Overtime,
  family  = binomial(link = "logit"),
  data    = train
)

# Model 3: every remaining column as a predictor.
model3 <- glm(
  formula = Attrition ~ .,
  family  = binomial(link = "logit"),
  data    = train
)

cat("Models estimated successfully.\n")

## Models estimated successfully.

Step 4 — Confusion Matrices

# Score a data frame with a fitted two-class binomial glm.
#
# glm() with a factor response treats the first level as "failure" and models
# the probability of the other level. The label assigned above the cutoff is
# therefore read from the model's own response levels instead of being
# hard-coded, so predictions stay correct if the level order ever changes.
#
# model:   a fitted binomial glm whose response is a two-level factor
# newdata: data frame to score; must contain the Attrition outcome column
# cutoff:  probability above which the second response level is predicted
#
# Returns a list with `prob` (fitted probabilities of the second level) and
# `pred` (predicted classes as a factor using the levels of
# newdata$Attrition, as confusionMatrix() requires matching levels).
classify_attrition <- function(model, newdata, cutoff = 0.5) {
  # The response is the first column of the model frame.
  outcome_levels <- levels(model.frame(model)[[1L]])
  stopifnot(length(outcome_levels) == 2L)

  prob <- predict(model, newdata = newdata, type = "response")
  labels <- ifelse(prob > cutoff, outcome_levels[2L], outcome_levels[1L])

  list(
    prob = prob,
    pred = factor(labels, levels = levels(newdata$Attrition))
  )
}

scored1 <- classify_attrition(model1, test)
prob1 <- scored1$prob
pred1 <- scored1$pred

scored2 <- classify_attrition(model2, test)
prob2 <- scored2$prob
pred2 <- scored2$pred

scored3 <- classify_attrition(model3, test)
prob3 <- scored3$prob
pred3 <- scored3$pred

cat("=== Model 1: Attrition ~ MonthlyIncome ===\n")

## === Model 1: Attrition ~ MonthlyIncome ===

# Cross-tabulate predicted against observed classes on the test sample.
# No `positive` argument is given, so caret reports "Left" (the first factor
# level) as the positive class.
cm1 <- confusionMatrix(data = pred1, reference = test[["Attrition"]])
print(cm1)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Left Stayed
##     Left      0      0
##     Stayed 7032   7868
##                                         
##                Accuracy : 0.5281        
##                  95% CI : (0.52, 0.5361)
##     No Information Rate : 0.5281        
##     P-Value [Acc > NIR] : 0.5033        
##                                         
##                   Kappa : 0             
##                                         
##  Mcnemar's Test P-Value : <2e-16        
##                                         
##             Sensitivity : 0.0000        
##             Specificity : 1.0000        
##          Pos Pred Value :    NaN        
##          Neg Pred Value : 0.5281        
##              Prevalence : 0.4719        
##          Detection Rate : 0.0000        
##    Detection Prevalence : 0.0000        
##       Balanced Accuracy : 0.5000        
##                                         
##        'Positive' Class : Left          
##

cat("\n=== Model 2: Attrition ~ MonthlyIncome + Overtime ===\n")

## 
## === Model 2: Attrition ~ MonthlyIncome + Overtime ===

# Cross-tabulate predicted against observed classes on the test sample.
cm2 <- confusionMatrix(data = pred2, reference = test[["Attrition"]])
print(cm2)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Left Stayed
##     Left   2502   2381
##     Stayed 4530   5487
##                                           
##                Accuracy : 0.5362          
##                  95% CI : (0.5281, 0.5442)
##     No Information Rate : 0.5281          
##     P-Value [Acc > NIR] : 0.02397         
##                                           
##                   Kappa : 0.0541          
##                                           
##  Mcnemar's Test P-Value : < 2e-16         
##                                           
##             Sensitivity : 0.3558          
##             Specificity : 0.6974          
##          Pos Pred Value : 0.5124          
##          Neg Pred Value : 0.5478          
##              Prevalence : 0.4719          
##          Detection Rate : 0.1679          
##    Detection Prevalence : 0.3277          
##       Balanced Accuracy : 0.5266          
##                                           
##        'Positive' Class : Left            
##

cat("\n=== Model 3: Attrition ~ All Variables ===\n")

## 
## === Model 3: Attrition ~ All Variables ===

# Cross-tabulate predicted against observed classes on the test sample.
cm3 <- confusionMatrix(data = pred3, reference = test[["Attrition"]])
print(cm3)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Left Stayed
##     Left   5177   1788
##     Stayed 1855   6080
##                                           
##                Accuracy : 0.7555          
##                  95% CI : (0.7485, 0.7624)
##     No Information Rate : 0.5281          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.5092          
##                                           
##  Mcnemar's Test P-Value : 0.2742          
##                                           
##             Sensitivity : 0.7362          
##             Specificity : 0.7728          
##          Pos Pred Value : 0.7433          
##          Neg Pred Value : 0.7662          
##              Prevalence : 0.4719          
##          Detection Rate : 0.3474          
##    Detection Prevalence : 0.4674          
##       Balanced Accuracy : 0.7545          
##                                           
##        'Positive' Class : Left            
##

Step 5 — Model Comparison

# Side-by-side summary of test-set accuracy and Cohen's kappa, both taken
# from the `overall` vector of each confusion matrix and rounded to four
# decimals.
cat("=== Model Comparison ===\n\n")

## === Model Comparison ===

cat("Model 1 - Attrition ~ MonthlyIncome\n")

## Model 1 - Attrition ~ MonthlyIncome

cat("  Accuracy:", round(cm1$overall[["Accuracy"]], digits = 4), "\n")

##   Accuracy: 0.5281

cat("  Kappa:   ", round(cm1$overall[["Kappa"]], digits = 4), "\n\n")

##   Kappa:    0

cat("Model 2 - Attrition ~ MonthlyIncome + Overtime\n")

## Model 2 - Attrition ~ MonthlyIncome + Overtime

cat("  Accuracy:", round(cm2$overall[["Accuracy"]], digits = 4), "\n")

##   Accuracy: 0.5362

cat("  Kappa:   ", round(cm2$overall[["Kappa"]], digits = 4), "\n\n")

##   Kappa:    0.0541

cat("Model 3 - Attrition ~ All Variables\n")

## Model 3 - Attrition ~ All Variables

cat("  Accuracy:", round(cm3$overall[["Accuracy"]], digits = 4), "\n")

##   Accuracy: 0.7555

cat("  Kappa:   ", round(cm3$overall[["Kappa"]], digits = 4), "\n")

##   Kappa:    0.5092

Interpretation:

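Model 1 (MonthlyIncome only): Accuracy = 52.81%, Kappa = 0. The model predicts “Stayed” for every employee, so its accuracy simply equals the no-information rate (52.81%) and its sensitivity for “Left” is 0. It is completely useless as a classifier: MonthlyIncome alone cannot predict attrition.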
Model 2 (MonthlyIncome + Overtime): Accuracy = 53.62%, Kappa = 0.05. A very slight improvement by adding Overtime, but still very weak predictive power.
Model 3 (All variables): Accuracy = 75.55%, Kappa = 0.51. By far the best model, with moderate agreement beyond chance and balanced sensitivity (73.62%) and specificity (77.28%). Using all available variables captures the multifactorial nature of employee attrition.

Conclusion: Employee attrition is a multifactorial phenomenon. A single predictor like MonthlyIncome is insufficient. The full model with all variables achieves 75.55% accuracy and should be preferred for prediction.

Homework7

Kouassi Yoan Henoc

2026-04-22

Step 1 — Load Data

Step 2 — Prepare Data

Step 3 — Logistic Regression Models

Step 4 — Confusion Matrices

Step 5 — Model Comparison