1. Load Libraries

library(tidyverse)
library(caret)

2. Load & Prepare Data

train <- read.csv("train.csv", stringsAsFactors = TRUE)
test  <- read.csv("test.csv",  stringsAsFactors = TRUE)

# Recode Attrition: "Left" = Yes, "Stayed" = No
train$Attrition <- factor(ifelse(train$Attrition == "Left", "Yes", "No"), levels = c("No", "Yes"))
test$Attrition  <- factor(ifelse(test$Attrition  == "Left", "Yes", "No"), levels = c("No", "Yes"))

# Remove ID column
train <- train %>% select(-Employee.ID)
test  <- test  %>% select(-Employee.ID)

# Confirm Attrition distribution
table(train$Attrition)
## 
##    No   Yes 
## 31260 28338
table(test$Attrition)
## 
##   No  Yes 
## 7868 7032

3. Fit Three Logistic Regression Models

# Model 1: Attrition ~ Monthly.Income
model1 <- glm(Attrition ~ Monthly.Income,
              data = train, family = binomial)

# Model 2: Attrition ~ Monthly.Income + Overtime
model2 <- glm(Attrition ~ Monthly.Income + Overtime,
              data = train, family = binomial)

# Model 3: Attrition ~ . (all variables)
model3 <- glm(Attrition ~ .,
              data = train, family = binomial)

Model 1 Summary

summary(model1)
## 
## Call:
## glm(formula = Attrition ~ Monthly.Income, family = binomial, 
##     data = train)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)   
## (Intercept)    -2.081e-02  2.902e-02  -0.717  0.47334   
## Monthly.Income -1.059e-05  3.813e-06  -2.777  0.00548 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 82477  on 59597  degrees of freedom
## Residual deviance: 82469  on 59596  degrees of freedom
## AIC: 82473
## 
## Number of Fisher Scoring iterations: 3

Model 2 Summary

summary(model2)
## 
## Call:
## glm(formula = Attrition ~ Monthly.Income + Overtime, family = binomial, 
##     data = train)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -1.001e-01  2.965e-02  -3.375 0.000737 ***
## Monthly.Income -1.038e-05  3.819e-06  -2.717 0.006578 ** 
## OvertimeYes     2.374e-01  1.750e-02  13.566  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 82477  on 59597  degrees of freedom
## Residual deviance: 82285  on 59595  degrees of freedom
## AIC: 82291
## 
## Number of Fisher Scoring iterations: 3

Model 3 Summary

summary(model3)
## 
## Call:
## glm(formula = Attrition ~ ., family = binomial, data = train)
## 
## Coefficients:
##                                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                       2.563e-01  8.586e-02   2.985 0.002836 ** 
## Age                              -5.998e-03  1.004e-03  -5.974 2.31e-09 ***
## GenderMale                       -6.262e-01  2.078e-02 -30.131  < 2e-16 ***
## Years.at.Company                 -1.360e-02  1.174e-03 -11.588  < 2e-16 ***
## Job.RoleFinance                  -9.223e-02  4.838e-02  -1.906 0.056596 .  
## Job.RoleHealthcare               -7.214e-02  4.238e-02  -1.702 0.088761 .  
## Job.RoleMedia                    -9.867e-02  3.616e-02  -2.729 0.006353 ** 
## Job.RoleTechnology               -8.476e-02  4.847e-02  -1.749 0.080344 .  
## Monthly.Income                   -6.718e-06  8.246e-06  -0.815 0.415242    
## Work.Life.BalanceFair             1.327e+00  3.129e-02  42.410  < 2e-16 ***
## Work.Life.BalanceGood             2.934e-01  2.958e-02   9.916  < 2e-16 ***
## Work.Life.BalancePoor             1.509e+00  3.757e-02  40.167  < 2e-16 ***
## Job.SatisfactionLow               4.880e-01  3.572e-02  13.664  < 2e-16 ***
## Job.SatisfactionMedium            1.133e-02  2.721e-02   0.417 0.676953    
## Job.SatisfactionVery High         4.988e-01  2.701e-02  18.469  < 2e-16 ***
## Performance.RatingBelow Average   3.339e-01  2.948e-02  11.324  < 2e-16 ***
## Performance.RatingHigh            5.078e-03  2.647e-02   0.192 0.847882    
## Performance.RatingLow             5.972e-01  4.847e-02  12.321  < 2e-16 ***
## Number.of.Promotions             -2.492e-01  1.043e-02 -23.900  < 2e-16 ***
## OvertimeYes                       3.512e-01  2.189e-02  16.043  < 2e-16 ***
## Distance.from.Home                9.950e-03  3.636e-04  27.365  < 2e-16 ***
## Education.LevelBachelor’s Degree  4.581e-02  2.757e-02   1.662 0.096584 .  
## Education.LevelHigh School        3.035e-02  3.072e-02   0.988 0.323037    
## Education.LevelMaster’s Degree    3.087e-02  3.048e-02   1.013 0.311008    
## Education.LevelPhD               -1.564e+00  5.447e-02 -28.714  < 2e-16 ***
## Marital.StatusMarried            -2.559e-01  2.957e-02  -8.654  < 2e-16 ***
## Marital.StatusSingle              1.573e+00  3.220e-02  48.863  < 2e-16 ***
## Number.of.Dependents             -1.573e-01  6.660e-03 -23.621  < 2e-16 ***
## Job.LevelMid                     -1.003e+00  2.266e-02 -44.271  < 2e-16 ***
## Job.LevelSenior                  -2.616e+00  3.262e-02 -80.221  < 2e-16 ***
## Company.SizeMedium                6.143e-03  2.713e-02   0.226 0.820849    
## Company.SizeSmall                 2.063e-01  2.958e-02   6.975 3.06e-12 ***
## Company.Tenure                   -2.036e-04  4.501e-04  -0.452 0.651100    
## Remote.WorkYes                   -1.775e+00  2.916e-02 -60.888  < 2e-16 ***
## Leadership.OpportunitiesYes      -1.627e-01  4.753e-02  -3.424 0.000618 ***
## Innovation.OpportunitiesYes      -1.410e-01  2.784e-02  -5.064 4.10e-07 ***
## Company.ReputationFair            4.698e-01  3.962e-02  11.858  < 2e-16 ***
## Company.ReputationGood           -6.027e-02  3.535e-02  -1.705 0.088173 .  
## Company.ReputationPoor            7.564e-01  3.972e-02  19.043  < 2e-16 ***
## Employee.RecognitionLow           3.956e-02  2.620e-02   1.510 0.131020    
## Employee.RecognitionMedium        4.354e-02  2.769e-02   1.572 0.115845    
## Employee.RecognitionVery High    -8.300e-02  5.019e-02  -1.654 0.098177 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 82477  on 59597  degrees of freedom
## Residual deviance: 57833  on 59556  degrees of freedom
## AIC: 57917
## 
## Number of Fisher Scoring iterations: 5

4. Predict on Test Data

predict_class <- function(model, newdata, threshold = 0.5) {
  probs <- predict(model, newdata = newdata, type = "response")
  factor(ifelse(probs >= threshold, "Yes", "No"), levels = c("No", "Yes"))
}

pred1 <- predict_class(model1, test)
pred2 <- predict_class(model2, test)
pred3 <- predict_class(model3, test)

5. Confusion Matrices

Model 1: Attrition ~ Monthly.Income

cm1 <- confusionMatrix(pred1, test$Attrition, positive = "Yes")
print(cm1)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  7868 7032
##        Yes    0    0
##                                         
##                Accuracy : 0.5281        
##                  95% CI : (0.52, 0.5361)
##     No Information Rate : 0.5281        
##     P-Value [Acc > NIR] : 0.5033        
##                                         
##                   Kappa : 0             
##                                         
##  Mcnemar's Test P-Value : <2e-16        
##                                         
##             Sensitivity : 0.0000        
##             Specificity : 1.0000        
##          Pos Pred Value :    NaN        
##          Neg Pred Value : 0.5281        
##              Prevalence : 0.4719        
##          Detection Rate : 0.0000        
##    Detection Prevalence : 0.0000        
##       Balanced Accuracy : 0.5000        
##                                         
##        'Positive' Class : Yes           
## 

Model 2: Attrition ~ Monthly.Income + Overtime

cm2 <- confusionMatrix(pred2, test$Attrition, positive = "Yes")
print(cm2)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  5487 4530
##        Yes 2381 2502
##                                           
##                Accuracy : 0.5362          
##                  95% CI : (0.5281, 0.5442)
##     No Information Rate : 0.5281          
##     P-Value [Acc > NIR] : 0.02397         
##                                           
##                   Kappa : 0.0541          
##                                           
##  Mcnemar's Test P-Value : < 2e-16         
##                                           
##             Sensitivity : 0.3558          
##             Specificity : 0.6974          
##          Pos Pred Value : 0.5124          
##          Neg Pred Value : 0.5478          
##              Prevalence : 0.4719          
##          Detection Rate : 0.1679          
##    Detection Prevalence : 0.3277          
##       Balanced Accuracy : 0.5266          
##                                           
##        'Positive' Class : Yes             
## 

Model 3: Attrition ~ . (all variables)

cm3 <- confusionMatrix(pred3, test$Attrition, positive = "Yes")
print(cm3)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  6080 1855
##        Yes 1788 5177
##                                           
##                Accuracy : 0.7555          
##                  95% CI : (0.7485, 0.7624)
##     No Information Rate : 0.5281          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.5092          
##                                           
##  Mcnemar's Test P-Value : 0.2742          
##                                           
##             Sensitivity : 0.7362          
##             Specificity : 0.7728          
##          Pos Pred Value : 0.7433          
##          Neg Pred Value : 0.7662          
##              Prevalence : 0.4719          
##          Detection Rate : 0.3474          
##    Detection Prevalence : 0.4674          
##       Balanced Accuracy : 0.7545          
##                                           
##        'Positive' Class : Yes             
## 

6. Model Comparison

metrics <- data.frame(
  Model       = c("Model 1", "Model 2", "Model 3"),
  Accuracy    = c(cm1$overall["Accuracy"],
                  cm2$overall["Accuracy"],
                  cm3$overall["Accuracy"]),
  Sensitivity = c(cm1$byClass["Sensitivity"],
                  cm2$byClass["Sensitivity"],
                  cm3$byClass["Sensitivity"]),
  Specificity = c(cm1$byClass["Specificity"],
                  cm2$byClass["Specificity"],
                  cm3$byClass["Specificity"])
)

print(metrics)
##     Model  Accuracy Sensitivity Specificity
## 1 Model 1 0.5280537   0.0000000   1.0000000
## 2 Model 2 0.5361745   0.3558020   0.6973818
## 3 Model 3 0.7555034   0.7362059   0.7727504

7. Conclusion

Three logistic regression models were estimated using the Employee Attrition dataset and evaluated on a held-out test set.