1. Introduction

The goal of this analysis is to predict which employees are likely to leave the company (attrition) using their Age, Salary, and Years at Job.
Attrition is a binary variable (“Yes” for leaving, “No” for staying).

We will:

  1. Select a suitable regression model.
  2. Implement it step-by-step in R.
  3. Evaluate accuracy using validation metrics.
  4. Explain why the chosen method is appropriate.

2. Model Selection

Since the dependent variable (Attrition) is binary (“Yes”/“No”),
we will use Logistic Regression.

Justification: - Predicts probabilities for binary outcomes. - Interpretable (coefficients show how each variable affects attrition odds). - Works well with continuous predictors like age, salary, and experience.


3. Setup and Packages

# Install required packages (run once if not already installed)
# install.packages(c("caret", "pROC", "dplyr"))

library(caret)
library(pROC)
library(dplyr)

set.seed(42)
n <- 200
data <- data.frame(
Age = round(rnorm(n, mean = 35, sd = 8)),
Salary = round(rnorm(n, mean = 60000, sd = 12000)),
YearsAtJob = round(abs(rnorm(n, mean = 5, sd = 3)))
)

# Generate Attrition outcome (synthetic)

logodds <- -3 + 0.03*(data$Age) - 0.00003*(data$Salary) + 0.2*(data$YearsAtJob)
prob <- 1 / (1 + exp(-logodds))
data$Attrition <- ifelse(runif(n) < prob, "Yes", "No")
data$Attrition <- factor(data$Attrition, levels = c("No", "Yes"))

head(data)
##   Age Salary YearsAtJob Attrition
## 1  46  35989          9        No
## 2  30  64005          2        No
## 3  38  74056          5        No
## 4  40  84714          5        No
## 5  38  43478          3        No
## 6  34  46190          2        No
table(data$Attrition)
## 
##  No Yes 
## 185  15
set.seed(123)
train_index <- createDataPartition(data$Attrition, p = 0.7, list = FALSE)
train_data <- data[train_index, ]
test_data  <- data[-train_index, ]

dim(train_data)
## [1] 141   4
dim(test_data)
## [1] 59  4
model <- glm(Attrition ~ Age + Salary + YearsAtJob,
data = train_data,
family = binomial)

summary(model)
## 
## Call:
## glm(formula = Attrition ~ Age + Salary + YearsAtJob, family = binomial, 
##     data = train_data)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.688e+00  2.374e+00  -0.711    0.477
## Age          3.607e-02  4.255e-02   0.848    0.397
## Salary      -5.264e-05  3.241e-05  -1.624    0.104
## YearsAtJob   1.733e-01  1.109e-01   1.563    0.118
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 77.238  on 140  degrees of freedom
## Residual deviance: 71.956  on 137  degrees of freedom
## AIC: 79.956
## 
## Number of Fisher Scoring iterations: 6
# Predict probabilities and classes

pred_prob <- predict(model, newdata = test_data, type = "response")
pred_class <- ifelse(pred_prob > 0.5, "Yes", "No")
pred_class <- factor(pred_class, levels = c("No", "Yes"))

# Add predictions to test data

test_data$PredProb <- pred_prob
test_data$PredClass <- pred_class

head(test_data)
##    Age Salary YearsAtJob Attrition   PredProb PredClass
## 1   46  35989          9        No 0.41009578        No
## 2   30  64005          2        No 0.02586686        No
## 3   38  74056          5        No 0.03392460        No
## 9   51  52251          9        No 0.26130604        No
## 10  34  57775          1        No 0.03456548        No
## 15  34  65947          3        No 0.03188530        No
conf <- confusionMatrix(test_data$PredClass, test_data$Attrition, positive = "Yes")
conf
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  55   4
##        Yes  0   0
##                                           
##                Accuracy : 0.9322          
##                  95% CI : (0.8354, 0.9812)
##     No Information Rate : 0.9322          
##     P-Value [Acc > NIR] : 0.6289          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : 0.1336          
##                                           
##             Sensitivity : 0.0000          
##             Specificity : 1.0000          
##          Pos Pred Value :    NaN          
##          Neg Pred Value : 0.9322          
##              Prevalence : 0.0678          
##          Detection Rate : 0.0000          
##    Detection Prevalence : 0.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : Yes             
## 
roc_obj <- roc(as.numeric(test_data$Attrition == "Yes"), test_data$PredProb)
plot(roc_obj, main = paste("ROC Curve (AUC =", round(auc(roc_obj), 3), ")"))

auc(roc_obj)
## Area under the curve: 0.8182
set.seed(123)
fit_control <- trainControl(method = "cv", number = 10,
classProbs = TRUE,
summaryFunction = twoClassSummary)

cv_model <- train(Attrition ~ Age + Salary + YearsAtJob,
data = data,
method = "glm",
family = "binomial",
trControl = fit_control,
metric = "ROC")

cv_model
## Generalized Linear Model 
## 
## 200 samples
##   3 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results:
## 
##   ROC        Sens  Spec
##   0.7106725  1     0
threshold <- 0.3
pred_class_th <- ifelse(test_data$PredProb > threshold, "Yes", "No")
conf_th <- confusionMatrix(factor(pred_class_th, levels = c("No", "Yes")),
test_data$Attrition,
positive = "Yes")
conf_th
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  53   4
##        Yes  2   0
##                                           
##                Accuracy : 0.8983          
##                  95% CI : (0.7917, 0.9618)
##     No Information Rate : 0.9322          
##     P-Value [Acc > NIR] : 0.8967          
##                                           
##                   Kappa : -0.0473         
##                                           
##  Mcnemar's Test P-Value : 0.6831          
##                                           
##             Sensitivity : 0.0000          
##             Specificity : 0.9636          
##          Pos Pred Value : 0.0000          
##          Neg Pred Value : 0.9298          
##              Prevalence : 0.0678          
##          Detection Rate : 0.0000          
##    Detection Prevalence : 0.0339          
##       Balanced Accuracy : 0.4818          
##                                           
##        'Positive' Class : Yes             
## 
exp(coef(model))
## (Intercept)         Age      Salary  YearsAtJob 
##   0.1848282   1.0367260   0.9999474   1.1892558