Load Data

# Read the raw employee records from the CSV (path is relative to the
# working directory); column types are guessed by read_csv().
data <- read_csv(file = "employee_attrition.csv")
## Rows: 14900 Columns: 24
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (16): Gender, Job Role, Work-Life Balance, Job Satisfaction, Performance...
## dbl  (8): Employee ID, Age, Years at Company, Monthly Income, Number of Prom...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Normalise the column names with clean_names() so later code can refer to
# them as plain snake_case identifiers (e.g. `monthly_income`).
data <- data %>% clean_names()

# Print the resulting column names as a sanity check.
colnames(data)
##  [1] "employee_id"              "age"                     
##  [3] "gender"                   "years_at_company"        
##  [5] "job_role"                 "monthly_income"          
##  [7] "work_life_balance"        "job_satisfaction"        
##  [9] "performance_rating"       "number_of_promotions"    
## [11] "overtime"                 "distance_from_home"      
## [13] "education_level"          "marital_status"          
## [15] "number_of_dependents"     "job_level"               
## [17] "company_size"             "company_tenure"          
## [19] "remote_work"              "leadership_opportunities"
## [21] "innovation_opportunities" "company_reputation"      
## [23] "employee_recognition"     "attrition"

Prepare Data

# Store the outcome column as a factor; the partitioning, model fitting and
# confusion-matrix steps further down all work on this factor.
data$attrition <- as.factor(data$attrition)

Train-Test Split

# Fix the RNG seed so the random partition below is reproducible.
set.seed(123)

# Row indices for a 70% training sample drawn with respect to the outcome
# column; `list = FALSE` returns the indices as a matrix instead of a list.
train_index <- createDataPartition(
  y = data[["attrition"]],
  p = 0.7,
  list = FALSE
)

# Selected rows form the training set; all remaining rows are held out.
train <- data[train_index, ]
test <- data[-train_index, ]

Model 1: Income Only

# Baseline logistic regression: attrition explained by monthly income alone,
# fitted on the training rows only.
model1 <- glm(
  formula = attrition ~ monthly_income,
  family = binomial,
  data = train
)
summary(model1)
## 
## Call:
## glm(formula = attrition ~ monthly_income, family = binomial, 
##     data = train)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)  
## (Intercept)     1.167e-01  6.950e-02   1.679   0.0932 .
## monthly_income -5.999e-07  9.131e-06  -0.066   0.9476  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 14428  on 10430  degrees of freedom
## Residual deviance: 14428  on 10429  degrees of freedom
## AIC: 14432
## 
## Number of Fisher Scoring iterations: 3

Model 2: Income + Overtime

# Logistic regression on income plus overtime. The overtime column is accepted
# under either spelling ("over_time" wins when both exist); when neither is
# present the model falls back to income only and reports that via message().
if (!any(c("over_time", "overtime") %in% names(train))) {
  message("⚠️ No overtime variable found — fallback to income only")
  model2 <- glm(
    formula = attrition ~ monthly_income,
    family = binomial,
    data = train
  )
} else if ("over_time" %in% names(train)) {
  model2 <- glm(
    formula = attrition ~ monthly_income + over_time,
    family = binomial,
    data = train
  )
} else {
  model2 <- glm(
    formula = attrition ~ monthly_income + overtime,
    family = binomial,
    data = train
  )
}

summary(model2)
## 
## Call:
## glm(formula = attrition ~ monthly_income + overtime, family = binomial, 
##     data = train)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     1.977e-01  7.101e-02   2.784  0.00536 ** 
## monthly_income -7.202e-07  9.146e-06  -0.079  0.93723    
## overtimeYes    -2.425e-01  4.174e-02  -5.811 6.23e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 14428  on 10430  degrees of freedom
## Residual deviance: 14394  on 10428  degrees of freedom
## AIC: 14400
## 
## Number of Fisher Scoring iterations: 3

Model 3: All Variables

# Full logistic regression: `.` expands to every other column of `train`.
# NOTE(review): that includes `employee_id`, which appears among the fitted
# coefficients; an identifier is presumably not a meaningful predictor, so
# consider `attrition ~ . - employee_id` when this analysis is next re-run.
model3 <- glm(
  formula = attrition ~ .,
  family = binomial,
  data = train
)

summary(model3)
## 
## Call:
## glm(formula = attrition ~ ., family = binomial, data = train)
## 
## Coefficients:
##                                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                      -1.412e-01  2.126e-01  -0.664  0.50668    
## employee_id                       1.088e-06  1.159e-06   0.938  0.34821    
## age                               7.509e-03  2.417e-03   3.107  0.00189 ** 
## genderMale                        5.696e-01  5.020e-02  11.347  < 2e-16 ***
## years_at_company                  1.287e-02  2.859e-03   4.503 6.70e-06 ***
## job_roleFinance                   1.971e-01  1.188e-01   1.659  0.09705 .  
## job_roleHealthcare                1.758e-01  1.032e-01   1.703  0.08849 .  
## job_roleMedia                     2.609e-01  8.686e-02   3.004  0.00266 ** 
## job_roleTechnology                2.973e-01  1.194e-01   2.490  0.01277 *  
## monthly_income                   -3.879e-05  2.029e-05  -1.912  0.05590 .  
## work_life_balanceFair            -1.365e+00  7.590e-02 -17.984  < 2e-16 ***
## work_life_balanceGood            -3.233e-01  7.146e-02  -4.525 6.05e-06 ***
## work_life_balancePoor            -1.616e+00  9.069e-02 -17.816  < 2e-16 ***
## job_satisfactionLow              -5.752e-01  8.416e-02  -6.834 8.23e-12 ***
## job_satisfactionMedium           -1.169e-01  6.655e-02  -1.757  0.07900 .  
## job_satisfactionVery High        -6.094e-01  6.564e-02  -9.284  < 2e-16 ***
## performance_ratingBelow Average  -4.228e-01  7.231e-02  -5.848 4.97e-09 ***
## performance_ratingHigh           -1.418e-01  6.356e-02  -2.230  0.02572 *  
## performance_ratingLow            -4.868e-01  1.181e-01  -4.123 3.74e-05 ***
## number_of_promotions              2.875e-01  2.536e-02  11.337  < 2e-16 ***
## overtimeYes                      -3.698e-01  5.291e-02  -6.990 2.75e-12 ***
## distance_from_home               -9.571e-03  8.741e-04 -10.950  < 2e-16 ***
## education_levelBachelor’s Degree  1.056e-01  6.663e-02   1.585  0.11305    
## education_levelHigh School        1.298e-01  7.439e-02   1.744  0.08108 .  
## education_levelMaster’s Degree    2.016e-01  7.360e-02   2.739  0.00617 ** 
## education_levelPhD                1.878e+00  1.388e-01  13.532  < 2e-16 ***
## marital_statusMarried             2.975e-01  7.069e-02   4.209 2.57e-05 ***
## marital_statusSingle             -1.642e+00  7.769e-02 -21.133  < 2e-16 ***
## number_of_dependents              1.118e-01  1.617e-02   6.915 4.66e-12 ***
## job_levelMid                      1.063e+00  5.525e-02  19.237  < 2e-16 ***
## job_levelSenior                   2.754e+00  7.910e-02  34.815  < 2e-16 ***
## company_sizeMedium                5.505e-02  6.572e-02   0.838  0.40218    
## company_sizeSmall                -1.182e-01  7.180e-02  -1.646  0.09971 .  
## company_tenure                   -4.425e-04  1.093e-03  -0.405  0.68553    
## remote_workYes                    1.834e+00  7.111e-02  25.799  < 2e-16 ***
## leadership_opportunitiesYes       2.457e-01  1.159e-01   2.120  0.03404 *  
## innovation_opportunitiesYes       1.052e-01  6.817e-02   1.543  0.12283    
## company_reputationFair           -5.511e-01  9.735e-02  -5.661 1.50e-08 ***
## company_reputationGood            4.166e-03  8.703e-02   0.048  0.96182    
## company_reputationPoor           -6.616e-01  9.696e-02  -6.823 8.91e-12 ***
## employee_recognitionLow          -3.421e-02  6.344e-02  -0.539  0.58974    
## employee_recognitionMedium       -8.725e-03  6.700e-02  -0.130  0.89639    
## employee_recognitionVery High     6.296e-02  1.253e-01   0.502  0.61544    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 14427.6  on 10430  degrees of freedom
## Residual deviance:  9914.3  on 10388  degrees of freedom
## AIC: 10000
## 
## Number of Fisher Scoring iterations: 5

Predictions

# Predicted probabilities on the held-out rows. For a binomial glm with a
# factor response the first level counts as "failure", so these are the
# probabilities of the SECOND level of `attrition`.
prob1 <- predict(model1, newdata = test, type = "response")
prob2 <- predict(model2, newdata = test, type = "response")
prob3 <- predict(model3, newdata = test, type = "response")

# Outcome levels of the reference factor. Predictions are built with exactly
# these levels so each prediction vector has both classes even when one of
# them is never predicted (as happens for model 1).
lev <- levels(test$attrition)

# Convert predicted probabilities into a factor of class labels.
#   prob       numeric vector of predicted probabilities for `levels[2]`
#   levels     the two outcome levels, in the order of the fitted response
#   threshold  probabilities strictly above this value are labelled
#              `levels[2]`, everything else `levels[1]` (default 0.5)
# Returns a factor whose levels are exactly `levels`.
classify_prob <- function(prob, levels, threshold = 0.5) {
  # The second-level-vs-first-level rule only makes sense for two classes.
  stopifnot(length(levels) == 2L)
  factor(ifelse(prob > threshold, levels[2], levels[1]), levels = levels)
}

pred1 <- classify_prob(prob1, lev)
pred2 <- classify_prob(prob2, lev)
pred3 <- classify_prob(prob3, lev)

Confusion Matrices

Model 1

# Cross-tabulate model 1's predicted classes against the held-out outcomes.
confusionMatrix(data = pred1, reference = test$attrition)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Left Stayed
##     Left      0      0
##     Stayed 2109   2360
##                                           
##                Accuracy : 0.5281          
##                  95% CI : (0.5133, 0.5428)
##     No Information Rate : 0.5281          
##     P-Value [Acc > NIR] : 0.5061          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.0000          
##             Specificity : 1.0000          
##          Pos Pred Value :    NaN          
##          Neg Pred Value : 0.5281          
##              Prevalence : 0.4719          
##          Detection Rate : 0.0000          
##    Detection Prevalence : 0.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : Left            
## 

Model 2

# Cross-tabulate model 2's predicted classes against the held-out outcomes.
confusionMatrix(data = pred2, reference = test$attrition)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Left Stayed
##     Left    748    707
##     Stayed 1361   1653
##                                          
##                Accuracy : 0.5373         
##                  95% CI : (0.5225, 0.552)
##     No Information Rate : 0.5281         
##     P-Value [Acc > NIR] : 0.1124         
##                                          
##                   Kappa : 0.056          
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.3547         
##             Specificity : 0.7004         
##          Pos Pred Value : 0.5141         
##          Neg Pred Value : 0.5484         
##              Prevalence : 0.4719         
##          Detection Rate : 0.1674         
##    Detection Prevalence : 0.3256         
##       Balanced Accuracy : 0.5275         
##                                          
##        'Positive' Class : Left           
## 

Model 3

# Cross-tabulate model 3's predicted classes against the held-out outcomes.
confusionMatrix(data = pred3, reference = test$attrition)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Left Stayed
##     Left   1565    558
##     Stayed  544   1802
##                                          
##                Accuracy : 0.7534         
##                  95% CI : (0.7405, 0.766)
##     No Information Rate : 0.5281         
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.5054         
##                                          
##  Mcnemar's Test P-Value : 0.6953         
##                                          
##             Sensitivity : 0.7421         
##             Specificity : 0.7636         
##          Pos Pred Value : 0.7372         
##          Neg Pred Value : 0.7681         
##              Prevalence : 0.4719         
##          Detection Rate : 0.3502         
##    Detection Prevalence : 0.4751         
##       Balanced Accuracy : 0.7528         
##                                          
##        'Positive' Class : Left           
## 

Accuracy Comparison

# Side-by-side test-set accuracy: for each model, the share of held-out rows
# whose predicted class equals the observed outcome.
tibble(
  Model = c("Income Only", "Income + Overtime", "All Variables"),
  Accuracy = vapply(
    list(pred1, pred2, pred3),
    function(pred) mean(pred == test$attrition),
    numeric(1)
  )
)
## # A tibble: 3 × 2
##   Model             Accuracy
##   <chr>                <dbl>
## 1 Income Only          0.528
## 2 Income + Overtime    0.537
## 3 All Variables        0.753

Interpretation

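Income alone is a weak predictor here: Model 1 predicts "Stayed" for every employee, so its accuracy (0.528) is exactly the no-information rate. Adding a behavioral variable such as overtime improves classification only marginally (accuracy 0.537, not significantly above the no-information rate, p = 0.11), even though the overtime coefficient itself is highly significant. Using all variables raises accuracy to 0.753 (kappa 0.51). Note that the models estimate the probability of the second outcome level, "Stayed", so a negative coefficient (e.g. overtimeYes) means lower odds of staying, i.e. higher attrition.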


Conclusion

Logistic regression models show that employee attrition depends on multiple factors. Models with more variables generally perform better, but may be harder to interpret. Confusion matrices reveal how well each model distinguishes employees who leave versus those who stay.