Logistic regression is suitable here because attrition is a binary outcome
(“Yes”/“No”).
The model estimates the probability that an employee leaves based on predictors
such as Age, Salary, and YearsAtJob.
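Concretely, the fitted model expresses the probability of leaving through the logistic function of a linear predictor,

p(Attrition = 1) = 1 / (1 + exp(-(b0 + b1*Age + b2*Salary + b3*YearsAtJob))),

so each coefficient shifts the log-odds of attrition rather than the probability directly.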
library(caret)
## Warning: package 'caret' was built under R version 4.5.2
## Loading required package: ggplot2
## Loading required package: lattice
library(pROC)
## Warning: package 'pROC' was built under R version 4.5.2
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
set.seed(1)
data <- data.frame(
  Age = c(25, 30, 45, 28, 35, 40, 50, 29, 31, 38),
  Salary = c(30000, 40000, 60000, 35000, 45000, 55000, 65000, 37000, 42000, 50000),
  YearsAtJob = c(1, 3, 10, 2, 5, 8, 12, 3, 4, 7),
  Attrition = c("Yes", "No", "Yes", "No", "No", "Yes", "No", "Yes", "No", "Yes")
)
# Convert target variable
data$Attrition <- as.factor(ifelse(data$Attrition == "Yes", 1, 0))
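With such a small illustrative sample it is worth confirming that both classes are represented before fitting; a quick base-R check (the toy data has 5 leavers and 5 stayers):

# Check the class balance of the recoded target
table(data$Attrition)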
# Fit logistic regression model
model <- glm(Attrition ~ Age + Salary + YearsAtJob, data = data, family = binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(model)
##
## Call:
## glm(formula = Attrition ~ Age + Salary + YearsAtJob, family = binomial,
## data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.718e+03 4.613e+05 0.004 0.997
## Age -5.430e+01 1.457e+04 -0.004 0.997
## Salary -1.801e-02 4.856e+00 -0.004 0.997
## YearsAtJob 1.806e+02 4.856e+04 0.004 0.997
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 13.8629 on 9 degrees of freedom
## Residual deviance: 6.5064 on 6 degrees of freedom
## AIC: 14.506
##
## Number of Fisher Scoring iterations: 20
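The glm.fit warning about fitted probabilities of 0 or 1, together with the enormous coefficients and standard errors, points to (quasi-)complete separation: with only ten rows the predictors can separate the classes almost perfectly, so the estimates are unstable and the p-values uninformative. If the coefficients are still to be read, they are usually interpreted as odds ratios; a minimal sketch (interpret with caution given the separation):

# Exponentiate coefficients to obtain odds ratios per one-unit change in each predictor
exp(coef(model))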
# Predictions
data$Predicted_Prob <- predict(model, type="response")
data$Predicted_Class <- ifelse(data$Predicted_Prob > 0.5, 1, 0)
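The same fitted model can also score a new record with predict(). The sketch below uses a hypothetical 33-year-old earning 48,000 with 4 years at the job; the values are assumed purely for illustration:

# Predict the attrition probability for a hypothetical new employee
new_emp <- data.frame(Age = 33, Salary = 48000, YearsAtJob = 4)
predict(model, newdata = new_emp, type = "response")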
# Confusion Matrix
confusionMatrix(as.factor(data$Predicted_Class), data$Attrition)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4 0
## 1 1 5
##
## Accuracy : 0.9
## 95% CI : (0.555, 0.9975)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 0.01074
##
## Kappa : 0.8
##
## Mcnemar's Test P-Value : 1.00000
##
## Sensitivity : 0.8000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.8333
## Prevalence : 0.5000
## Detection Rate : 0.4000
## Detection Prevalence : 0.4000
## Balanced Accuracy : 0.9000
##
## 'Positive' Class : 0
##
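Note that confusionMatrix() has treated “0” (stayed) as the positive class by default, so Sensitivity above measures how well non-leavers are detected. If attrition (“1”) is the event of interest, the positive level can be set explicitly; a sketch:

# Treat attrition (class "1") as the positive class when computing sensitivity/specificity
confusionMatrix(as.factor(data$Predicted_Class), data$Attrition, positive = "1")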
# ROC Curve and AUC
roc_obj <- roc(data$Attrition, data$Predicted_Prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_obj, col="blue", main="ROC Curve for Employee Attrition Model")
auc(roc_obj)
## Area under the curve: 0.92
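Rather than fixing the cutoff at 0.5, pROC can suggest a threshold that balances sensitivity and specificity (Youden's criterion by default); a sketch:

# Find the probability threshold that maximises sensitivity + specificity
coords(roc_obj, "best", ret = c("threshold", "sensitivity", "specificity"))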
# 10-fold Cross Validation
train_control <- trainControl(method="cv", number=10)
cv_model <- train(Attrition ~ Age + Salary + YearsAtJob,
                  data = data,
                  method = "glm",
                  family = "binomial",
                  trControl = train_control)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
cv_model
## Generalized Linear Model
##
## 10 samples
## 3 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 9, 9, 9, 9, 9, 9, ...
## Resampling results:
##
## Accuracy Kappa
## 0.5 0
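With only ten observations, 10-fold cross-validation leaves a single row in each test fold, so each resample's accuracy is either 0 or 1 and Kappa is undefined, which explains the warning about missing resampled performance measures and the uninformative overall Accuracy of 0.5. Leave-one-out cross-validation is the more natural resampling scheme at this sample size; a sketch under the same caret setup:

# Leave-one-out cross-validation: each observation is held out exactly once
loocv_control <- trainControl(method = "LOOCV")
loocv_model <- train(Attrition ~ Age + Salary + YearsAtJob,
                     data = data,
                     method = "glm",
                     family = "binomial",
                     trControl = loocv_control)
loocv_model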