library(tidyverse)
library(caret)
train <- read.csv("train.csv", stringsAsFactors = TRUE)
test <- read.csv("test.csv", stringsAsFactors = TRUE)
# Recode Attrition: "Left" = Yes, "Stayed" = No
train$Attrition <- factor(ifelse(train$Attrition == "Left", "Yes", "No"), levels = c("No", "Yes"))
test$Attrition <- factor(ifelse(test$Attrition == "Left", "Yes", "No"), levels = c("No", "Yes"))
# Remove ID column
train <- train %>% select(-Employee.ID)
test <- test %>% select(-Employee.ID)
# Confirm Attrition distribution
table(train$Attrition)
##
## No Yes
## 31260 28338
table(test$Attrition)
##
## No Yes
## 7868 7032
# Model 1: Attrition ~ Monthly.Income
model1 <- glm(Attrition ~ Monthly.Income,
data = train, family = binomial)
# Model 2: Attrition ~ Monthly.Income + Overtime
model2 <- glm(Attrition ~ Monthly.Income + Overtime,
data = train, family = binomial)
# Model 3: Attrition ~ . (all variables)
model3 <- glm(Attrition ~ .,
data = train, family = binomial)
summary(model1)
##
## Call:
## glm(formula = Attrition ~ Monthly.Income, family = binomial,
## data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.081e-02 2.902e-02 -0.717 0.47334
## Monthly.Income -1.059e-05 3.813e-06 -2.777 0.00548 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 82477 on 59597 degrees of freedom
## Residual deviance: 82469 on 59596 degrees of freedom
## AIC: 82473
##
## Number of Fisher Scoring iterations: 3
summary(model2)
##
## Call:
## glm(formula = Attrition ~ Monthly.Income + Overtime, family = binomial,
## data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.001e-01 2.965e-02 -3.375 0.000737 ***
## Monthly.Income -1.038e-05 3.819e-06 -2.717 0.006578 **
## OvertimeYes 2.374e-01 1.750e-02 13.566 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 82477 on 59597 degrees of freedom
## Residual deviance: 82285 on 59595 degrees of freedom
## AIC: 82291
##
## Number of Fisher Scoring iterations: 3
summary(model3)
##
## Call:
## glm(formula = Attrition ~ ., family = binomial, data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.563e-01 8.586e-02 2.985 0.002836 **
## Age -5.998e-03 1.004e-03 -5.974 2.31e-09 ***
## GenderMale -6.262e-01 2.078e-02 -30.131 < 2e-16 ***
## Years.at.Company -1.360e-02 1.174e-03 -11.588 < 2e-16 ***
## Job.RoleFinance -9.223e-02 4.838e-02 -1.906 0.056596 .
## Job.RoleHealthcare -7.214e-02 4.238e-02 -1.702 0.088761 .
## Job.RoleMedia -9.867e-02 3.616e-02 -2.729 0.006353 **
## Job.RoleTechnology -8.476e-02 4.847e-02 -1.749 0.080344 .
## Monthly.Income -6.718e-06 8.246e-06 -0.815 0.415242
## Work.Life.BalanceFair 1.327e+00 3.129e-02 42.410 < 2e-16 ***
## Work.Life.BalanceGood 2.934e-01 2.958e-02 9.916 < 2e-16 ***
## Work.Life.BalancePoor 1.509e+00 3.757e-02 40.167 < 2e-16 ***
## Job.SatisfactionLow 4.880e-01 3.572e-02 13.664 < 2e-16 ***
## Job.SatisfactionMedium 1.133e-02 2.721e-02 0.417 0.676953
## Job.SatisfactionVery High 4.988e-01 2.701e-02 18.469 < 2e-16 ***
## Performance.RatingBelow Average 3.339e-01 2.948e-02 11.324 < 2e-16 ***
## Performance.RatingHigh 5.078e-03 2.647e-02 0.192 0.847882
## Performance.RatingLow 5.972e-01 4.847e-02 12.321 < 2e-16 ***
## Number.of.Promotions -2.492e-01 1.043e-02 -23.900 < 2e-16 ***
## OvertimeYes 3.512e-01 2.189e-02 16.043 < 2e-16 ***
## Distance.from.Home 9.950e-03 3.636e-04 27.365 < 2e-16 ***
## Education.LevelBachelor’s Degree 4.581e-02 2.757e-02 1.662 0.096584 .
## Education.LevelHigh School 3.035e-02 3.072e-02 0.988 0.323037
## Education.LevelMaster’s Degree 3.087e-02 3.048e-02 1.013 0.311008
## Education.LevelPhD -1.564e+00 5.447e-02 -28.714 < 2e-16 ***
## Marital.StatusMarried -2.559e-01 2.957e-02 -8.654 < 2e-16 ***
## Marital.StatusSingle 1.573e+00 3.220e-02 48.863 < 2e-16 ***
## Number.of.Dependents -1.573e-01 6.660e-03 -23.621 < 2e-16 ***
## Job.LevelMid -1.003e+00 2.266e-02 -44.271 < 2e-16 ***
## Job.LevelSenior -2.616e+00 3.262e-02 -80.221 < 2e-16 ***
## Company.SizeMedium 6.143e-03 2.713e-02 0.226 0.820849
## Company.SizeSmall 2.063e-01 2.958e-02 6.975 3.06e-12 ***
## Company.Tenure -2.036e-04 4.501e-04 -0.452 0.651100
## Remote.WorkYes -1.775e+00 2.916e-02 -60.888 < 2e-16 ***
## Leadership.OpportunitiesYes -1.627e-01 4.753e-02 -3.424 0.000618 ***
## Innovation.OpportunitiesYes -1.410e-01 2.784e-02 -5.064 4.10e-07 ***
## Company.ReputationFair 4.698e-01 3.962e-02 11.858 < 2e-16 ***
## Company.ReputationGood -6.027e-02 3.535e-02 -1.705 0.088173 .
## Company.ReputationPoor 7.564e-01 3.972e-02 19.043 < 2e-16 ***
## Employee.RecognitionLow 3.956e-02 2.620e-02 1.510 0.131020
## Employee.RecognitionMedium 4.354e-02 2.769e-02 1.572 0.115845
## Employee.RecognitionVery High -8.300e-02 5.019e-02 -1.654 0.098177 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 82477 on 59597 degrees of freedom
## Residual deviance: 57833 on 59556 degrees of freedom
## AIC: 57917
##
## Number of Fisher Scoring iterations: 5
predict_class <- function(model, newdata, threshold = 0.5) {
probs <- predict(model, newdata = newdata, type = "response")
factor(ifelse(probs >= threshold, "Yes", "No"), levels = c("No", "Yes"))
}
pred1 <- predict_class(model1, test)
pred2 <- predict_class(model2, test)
pred3 <- predict_class(model3, test)
cm1 <- confusionMatrix(pred1, test$Attrition, positive = "Yes")
print(cm1)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 7868 7032
## Yes 0 0
##
## Accuracy : 0.5281
## 95% CI : (0.52, 0.5361)
## No Information Rate : 0.5281
## P-Value [Acc > NIR] : 0.5033
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.0000
## Specificity : 1.0000
## Pos Pred Value : NaN
## Neg Pred Value : 0.5281
## Prevalence : 0.4719
## Detection Rate : 0.0000
## Detection Prevalence : 0.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : Yes
##
cm2 <- confusionMatrix(pred2, test$Attrition, positive = "Yes")
print(cm2)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 5487 4530
## Yes 2381 2502
##
## Accuracy : 0.5362
## 95% CI : (0.5281, 0.5442)
## No Information Rate : 0.5281
## P-Value [Acc > NIR] : 0.02397
##
## Kappa : 0.0541
##
## Mcnemar's Test P-Value : < 2e-16
##
## Sensitivity : 0.3558
## Specificity : 0.6974
## Pos Pred Value : 0.5124
## Neg Pred Value : 0.5478
## Prevalence : 0.4719
## Detection Rate : 0.1679
## Detection Prevalence : 0.3277
## Balanced Accuracy : 0.5266
##
## 'Positive' Class : Yes
##
cm3 <- confusionMatrix(pred3, test$Attrition, positive = "Yes")
print(cm3)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 6080 1855
## Yes 1788 5177
##
## Accuracy : 0.7555
## 95% CI : (0.7485, 0.7624)
## No Information Rate : 0.5281
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.5092
##
## Mcnemar's Test P-Value : 0.2742
##
## Sensitivity : 0.7362
## Specificity : 0.7728
## Pos Pred Value : 0.7433
## Neg Pred Value : 0.7662
## Prevalence : 0.4719
## Detection Rate : 0.3474
## Detection Prevalence : 0.4674
## Balanced Accuracy : 0.7545
##
## 'Positive' Class : Yes
##
metrics <- data.frame(
Model = c("Model 1", "Model 2", "Model 3"),
Accuracy = c(cm1$overall["Accuracy"],
cm2$overall["Accuracy"],
cm3$overall["Accuracy"]),
Sensitivity = c(cm1$byClass["Sensitivity"],
cm2$byClass["Sensitivity"],
cm3$byClass["Sensitivity"]),
Specificity = c(cm1$byClass["Specificity"],
cm2$byClass["Specificity"],
cm3$byClass["Specificity"])
)
print(metrics)
## Model Accuracy Sensitivity Specificity
## 1 Model 1 0.5280537 0.0000000 1.0000000
## 2 Model 2 0.5361745 0.3558020 0.6973818
## 3 Model 3 0.7555034 0.7362059 0.7727504
Three logistic regression models were estimated using the Employee Attrition dataset and evaluated on a held-out test set.
Model 1
(Attrition ~ Monthly.Income) performed poorly, predicting
every employee as “Stayed” and achieving an accuracy of only 52.8% with
zero sensitivity — meaning it failed to identify any employee who
actually left.
Model 2
(Attrition ~ Monthly.Income + Overtime) showed a slight
improvement with 53.6% accuracy and 35.6% sensitivity, suggesting that
Overtime adds some predictive power beyond income alone.
Model 3 (Attrition ~ .) using all
available variables performed significantly better, achieving 75.6%
accuracy, 73.6% sensitivity, and 77.3% specificity. This indicates that
employee attrition is best predicted using a combination of demographic,
job-related, and organizational factors rather than any single variable.
```