library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
# Read the training and test data from CSV files; the paths are relative to
# the current working directory.
train <- read.csv("train.csv")
test <- read.csv("test.csv")
# Print the dimensions of each data frame (dim() gives rows, then columns).
cat("Train dimensions:", dim(train), "\n")
## Train dimensions: 59598 24
cat("Test dimensions: ", dim(test), "\n")
## Test dimensions: 14900 24
# Show the first rows of the training data to inspect column names and values.
head(train)
## Employee.ID Age Gender Years.at.Company Job.Role Monthly.Income
## 1 8410 31 Male 19 Education 5390
## 2 64756 59 Female 4 Media 5534
## 3 30257 24 Female 10 Healthcare 8159
## 4 65791 36 Female 7 Education 3989
## 5 65026 56 Male 41 Education 4821
## 6 24368 38 Female 3 Technology 9977
## Work.Life.Balance Job.Satisfaction Performance.Rating Number.of.Promotions
## 1 Excellent Medium Average 2
## 2 Poor High Low 3
## 3 Good High Low 0
## 4 Good High High 1
## 5 Fair Very High Average 0
## 6 Fair High Below Average 3
## Overtime Distance.from.Home Education.Level Marital.Status
## 1 No 22 Associate Degree Married
## 2 No 21 Master’s Degree Divorced
## 3 No 11 Bachelor’s Degree Married
## 4 No 27 High School Single
## 5 Yes 71 High School Divorced
## 6 No 37 Bachelor’s Degree Married
## Number.of.Dependents Job.Level Company.Size Company.Tenure Remote.Work
## 1 0 Mid Medium 89 No
## 2 3 Mid Medium 21 No
## 3 3 Mid Medium 74 No
## 4 2 Mid Small 50 Yes
## 5 0 Senior Medium 68 No
## 6 0 Mid Medium 47 No
## Leadership.Opportunities Innovation.Opportunities Company.Reputation
## 1 No No Excellent
## 2 No No Fair
## 3 No No Poor
## 4 No No Good
## 5 No No Fair
## 6 No Yes Fair
## Employee.Recognition Attrition
## 1 Medium Stayed
## 2 Low Stayed
## 3 Low Stayed
## 4 Medium Stayed
## 5 Medium Stayed
## 6 High Left
# Prepare train and test for modelling.
#
# 1. Turn the response and every other character column of the training set
#    into factors.
# 2. Re-encode the matching test columns with the *training* levels. Building
#    the factors independently on each set derives the levels from whatever
#    values each file happens to contain, so the two sets could end up with
#    different level sets or orderings.
# 3. Drop Employee.ID (presumably a row identifier) so that a formula such as
#    `Attrition ~ .` does not pick it up as a predictor.

# Re-encode the factor columns of `target` using the levels found in `reference`.
#
# @param target    Data frame whose columns are re-encoded.
# @param reference Data frame whose factor columns define the allowed levels.
# @return `target` with every column that is a factor in `reference` converted
#   to a factor carrying exactly the reference levels. Stops with an error if
#   `target` contains a non-missing value that is not a reference level.
align_factor_levels <- function(target, reference) {
  is_ref_factor <- vapply(reference, is.factor, logical(1))
  shared_cols <- intersect(names(reference)[is_ref_factor], names(target))
  for (col in shared_cols) {
    ref_levels <- levels(reference[[col]])
    seen <- unique(as.character(target[[col]]))
    unseen <- seen[!is.na(seen) & !seen %in% ref_levels]
    if (length(unseen) > 0) {
      # Fail here with a clear message instead of later inside predict().
      stop(
        "Column '", col, "' has values not present in the reference data: ",
        paste(unseen, collapse = ", "),
        call. = FALSE
      )
    }
    target[[col]] <- factor(as.character(target[[col]]), levels = ref_levels)
  }
  target
}

train$Attrition <- as.factor(train$Attrition)
train <- train %>% mutate(across(where(is.character), as.factor))

test <- align_factor_levels(test, train)
# Character columns that are not factors in train keep the original treatment.
test <- test %>% mutate(across(where(is.character), as.factor))

# `[[<-` with NULL removes the column if present and always leaves a data frame.
train[["Employee.ID"]] <- NULL
test[["Employee.ID"]] <- NULL
# Class balance of the response: counts per Attrition level, first for the
# training set and then for the test set.
cat("Attrition distribution in train:\n")
## Attrition distribution in train:
table(train$Attrition)
##
## Left Stayed
## 28338 31260
cat("\nAttrition distribution in test:\n")
##
## Attrition distribution in test:
table(test$Attrition)
##
## Left Stayed
## 7032 7868
# Fit three logistic regressions of increasing size on the training data.
# With a factor response, a binomial glm() treats the first factor level as
# "failure", so fitted probabilities refer to the second level of Attrition.

# Model 1: monthly income as the only predictor.
model1 <- glm(
  formula = Attrition ~ Monthly.Income,
  family = binomial(link = "logit"),
  data = train
)

# Model 2: monthly income plus the overtime indicator.
model2 <- glm(
  formula = Attrition ~ Monthly.Income + Overtime,
  family = binomial(link = "logit"),
  data = train
)

# Model 3: every remaining column of `train` as a predictor.
model3 <- glm(
  formula = Attrition ~ .,
  family = binomial(link = "logit"),
  data = train
)

cat("Models estimated successfully.\n")
## Models estimated successfully.
# Turn fitted probabilities into class labels.
#
# A binomial glm() with a factor response models the probability of the
# response NOT being its first level, so a probability above the threshold
# maps to the second level and anything else to the first. Taking the labels
# from the level order (instead of hard-coding "Stayed"/"Left") keeps the
# mapping correct if the levels are ever renamed or reordered.
#
# @param prob         Numeric vector of predicted probabilities.
# @param class_levels Character vector of length 2: the response levels, in
#   the order used when the model was fitted.
# @param threshold    Cut-off; probabilities strictly above it are assigned
#   `class_levels[2]`. Defaults to 0.5.
# @return A factor the same length as `prob` with levels `class_levels`;
#   missing probabilities yield NA.
classify_prob <- function(prob, class_levels, threshold = 0.5) {
  stopifnot(
    is.numeric(prob),
    is.character(class_levels),
    length(class_levels) == 2L
  )
  labels <- ifelse(prob > threshold, class_levels[2], class_levels[1])
  factor(labels, levels = class_levels)
}

# Level order of the response the models were trained on.
attrition_levels <- levels(train$Attrition)

# Predicted probabilities on the test set and the resulting class labels.
prob1 <- predict(model1, newdata = test, type = "response")
pred1 <- classify_prob(prob1, attrition_levels)

prob2 <- predict(model2, newdata = test, type = "response")
pred2 <- classify_prob(prob2, attrition_levels)

prob3 <- predict(model3, newdata = test, type = "response")
pred3 <- classify_prob(prob3, attrition_levels)
# Evaluate Model 1 on the test set.
cat("=== Model 1: Attrition ~ MonthlyIncome ===\n")
## === Model 1: Attrition ~ MonthlyIncome ===
# Cross-tabulate predicted against observed test labels. No `positive` class
# is passed; the printed result reports 'Left' as the positive class.
cm1 <- confusionMatrix(pred1, test$Attrition)
print(cm1)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Left Stayed
## Left 0 0
## Stayed 7032 7868
##
## Accuracy : 0.5281
## 95% CI : (0.52, 0.5361)
## No Information Rate : 0.5281
## P-Value [Acc > NIR] : 0.5033
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.0000
## Specificity : 1.0000
## Pos Pred Value : NaN
## Neg Pred Value : 0.5281
## Prevalence : 0.4719
## Detection Rate : 0.0000
## Detection Prevalence : 0.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : Left
##
# Evaluate Model 2 on the test set.
cat("\n=== Model 2: Attrition ~ MonthlyIncome + Overtime ===\n")
##
## === Model 2: Attrition ~ MonthlyIncome + Overtime ===
# Cross-tabulate predicted against observed test labels. No `positive` class
# is passed; the printed result reports 'Left' as the positive class.
cm2 <- confusionMatrix(pred2, test$Attrition)
print(cm2)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Left Stayed
## Left 2502 2381
## Stayed 4530 5487
##
## Accuracy : 0.5362
## 95% CI : (0.5281, 0.5442)
## No Information Rate : 0.5281
## P-Value [Acc > NIR] : 0.02397
##
## Kappa : 0.0541
##
## Mcnemar's Test P-Value : < 2e-16
##
## Sensitivity : 0.3558
## Specificity : 0.6974
## Pos Pred Value : 0.5124
## Neg Pred Value : 0.5478
## Prevalence : 0.4719
## Detection Rate : 0.1679
## Detection Prevalence : 0.3277
## Balanced Accuracy : 0.5266
##
## 'Positive' Class : Left
##
# Evaluate Model 3 on the test set.
cat("\n=== Model 3: Attrition ~ All Variables ===\n")
##
## === Model 3: Attrition ~ All Variables ===
# Cross-tabulate predicted against observed test labels. No `positive` class
# is passed; the printed result reports 'Left' as the positive class.
cm3 <- confusionMatrix(pred3, test$Attrition)
print(cm3)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Left Stayed
## Left 5177 1788
## Stayed 1855 6080
##
## Accuracy : 0.7555
## 95% CI : (0.7485, 0.7624)
## No Information Rate : 0.5281
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.5092
##
## Mcnemar's Test P-Value : 0.2742
##
## Sensitivity : 0.7362
## Specificity : 0.7728
## Pos Pred Value : 0.7433
## Neg Pred Value : 0.7662
## Prevalence : 0.4719
## Detection Rate : 0.3474
## Detection Prevalence : 0.4674
## Balanced Accuracy : 0.7545
##
## 'Positive' Class : Left
##
# Side-by-side summary of the three models: for each confusion-matrix result,
# print the "Accuracy" and "Kappa" entries of its `overall` vector, rounded to
# four decimals.
cat("=== Model Comparison ===\n\n")
## === Model Comparison ===
cat("Model 1 - Attrition ~ MonthlyIncome\n")
## Model 1 - Attrition ~ MonthlyIncome
cat(" Accuracy:", round(cm1$overall["Accuracy"], 4), "\n")
## Accuracy: 0.5281
cat(" Kappa: ", round(cm1$overall["Kappa"], 4), "\n\n")
## Kappa: 0
cat("Model 2 - Attrition ~ MonthlyIncome + Overtime\n")
## Model 2 - Attrition ~ MonthlyIncome + Overtime
cat(" Accuracy:", round(cm2$overall["Accuracy"], 4), "\n")
## Accuracy: 0.5362
cat(" Kappa: ", round(cm2$overall["Kappa"], 4), "\n\n")
## Kappa: 0.0541
cat("Model 3 - Attrition ~ All Variables\n")
## Model 3 - Attrition ~ All Variables
cat(" Accuracy:", round(cm3$overall["Accuracy"], 4), "\n")
## Accuracy: 0.7555
cat(" Kappa: ", round(cm3$overall["Kappa"], 4), "\n")
## Kappa: 0.5092
Interpretation:
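Model 1 (MonthlyIncome only): Accuracy = 52.81%, Kappa = 0. The model predicts “Stayed” for every employee in the test set, so its accuracy simply equals the No Information Rate (52.81%, the share of “Stayed” cases) and its sensitivity for “Left” is 0. It adds nothing over always guessing the majority class: MonthlyIncome alone cannot predict attrition.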
Model 2 (MonthlyIncome + Overtime): Accuracy = 53.62%, Kappa = 0.05. Adding Overtime yields only a very slight improvement; predictive power remains very weak.
Model 3 (All variables): Accuracy = 75.55%, Kappa = 0.51. By far the best model with moderate-to-good agreement. Using all available variables captures the multifactorial nature of employee attrition.
Conclusion: Employee attrition is a multifactorial phenomenon. A single predictor like MonthlyIncome is insufficient. The full model with all variables achieves 75.55% accuracy and should be preferred for prediction.