The goal of this analysis is to predict which employees are
likely to leave the company (attrition) using their
Age, Salary, and Years at
Job.
Attrition is a binary variable (“Yes” for leaving, “No”
for staying).
We will:
Since the dependent variable (Attrition) is binary
(“Yes”/“No”),
we will use Logistic Regression.
Justification: - Predicts probabilities for binary outcomes. - Interpretable (coefficients show how each variable affects attrition odds). - Works well with continuous predictors like age, salary, and experience.
# Install required packages (run once if not already installed)
# install.packages(c("caret", "pROC", "dplyr"))
library(caret)
library(pROC)
library(dplyr)
set.seed(42)
n <- 200
data <- data.frame(
Age = round(rnorm(n, mean = 35, sd = 8)),
Salary = round(rnorm(n, mean = 60000, sd = 12000)),
YearsAtJob = round(abs(rnorm(n, mean = 5, sd = 3)))
)
# Generate Attrition outcome (synthetic)
logodds <- -3 + 0.03*(data$Age) - 0.00003*(data$Salary) + 0.2*(data$YearsAtJob)
prob <- 1 / (1 + exp(-logodds))
data$Attrition <- ifelse(runif(n) < prob, "Yes", "No")
data$Attrition <- factor(data$Attrition, levels = c("No", "Yes"))
head(data)
## Age Salary YearsAtJob Attrition
## 1 46 35989 9 No
## 2 30 64005 2 No
## 3 38 74056 5 No
## 4 40 84714 5 No
## 5 38 43478 3 No
## 6 34 46190 2 No
table(data$Attrition)
##
## No Yes
## 185 15
set.seed(123)
train_index <- createDataPartition(data$Attrition, p = 0.7, list = FALSE)
train_data <- data[train_index, ]
test_data <- data[-train_index, ]
dim(train_data)
## [1] 141 4
dim(test_data)
## [1] 59 4
model <- glm(Attrition ~ Age + Salary + YearsAtJob,
data = train_data,
family = binomial)
summary(model)
##
## Call:
## glm(formula = Attrition ~ Age + Salary + YearsAtJob, family = binomial,
## data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.688e+00 2.374e+00 -0.711 0.477
## Age 3.607e-02 4.255e-02 0.848 0.397
## Salary -5.264e-05 3.241e-05 -1.624 0.104
## YearsAtJob 1.733e-01 1.109e-01 1.563 0.118
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 77.238 on 140 degrees of freedom
## Residual deviance: 71.956 on 137 degrees of freedom
## AIC: 79.956
##
## Number of Fisher Scoring iterations: 6
# Predict probabilities and classes
pred_prob <- predict(model, newdata = test_data, type = "response")
pred_class <- ifelse(pred_prob > 0.5, "Yes", "No")
pred_class <- factor(pred_class, levels = c("No", "Yes"))
# Add predictions to test data
test_data$PredProb <- pred_prob
test_data$PredClass <- pred_class
head(test_data)
## Age Salary YearsAtJob Attrition PredProb PredClass
## 1 46 35989 9 No 0.41009578 No
## 2 30 64005 2 No 0.02586686 No
## 3 38 74056 5 No 0.03392460 No
## 9 51 52251 9 No 0.26130604 No
## 10 34 57775 1 No 0.03456548 No
## 15 34 65947 3 No 0.03188530 No
conf <- confusionMatrix(test_data$PredClass, test_data$Attrition, positive = "Yes")
conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 55 4
## Yes 0 0
##
## Accuracy : 0.9322
## 95% CI : (0.8354, 0.9812)
## No Information Rate : 0.9322
## P-Value [Acc > NIR] : 0.6289
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 0.1336
##
## Sensitivity : 0.0000
## Specificity : 1.0000
## Pos Pred Value : NaN
## Neg Pred Value : 0.9322
## Prevalence : 0.0678
## Detection Rate : 0.0000
## Detection Prevalence : 0.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : Yes
##
roc_obj <- roc(as.numeric(test_data$Attrition == "Yes"), test_data$PredProb)
plot(roc_obj, main = paste("ROC Curve (AUC =", round(auc(roc_obj), 3), ")"))
auc(roc_obj)
## Area under the curve: 0.8182
set.seed(123)
fit_control <- trainControl(method = "cv", number = 10,
classProbs = TRUE,
summaryFunction = twoClassSummary)
cv_model <- train(Attrition ~ Age + Salary + YearsAtJob,
data = data,
method = "glm",
family = "binomial",
trControl = fit_control,
metric = "ROC")
cv_model
## Generalized Linear Model
##
## 200 samples
## 3 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results:
##
## ROC Sens Spec
## 0.7106725 1 0
threshold <- 0.3
pred_class_th <- ifelse(test_data$PredProb > threshold, "Yes", "No")
conf_th <- confusionMatrix(factor(pred_class_th, levels = c("No", "Yes")),
test_data$Attrition,
positive = "Yes")
conf_th
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 53 4
## Yes 2 0
##
## Accuracy : 0.8983
## 95% CI : (0.7917, 0.9618)
## No Information Rate : 0.9322
## P-Value [Acc > NIR] : 0.8967
##
## Kappa : -0.0473
##
## Mcnemar's Test P-Value : 0.6831
##
## Sensitivity : 0.0000
## Specificity : 0.9636
## Pos Pred Value : 0.0000
## Neg Pred Value : 0.9298
## Prevalence : 0.0678
## Detection Rate : 0.0000
## Detection Prevalence : 0.0339
## Balanced Accuracy : 0.4818
##
## 'Positive' Class : Yes
##
exp(coef(model))
## (Intercept) Age Salary YearsAtJob
## 0.1848282 1.0367260 0.9999474 1.1892558