# ---- Libraries & seed ----
library(caret)
library(pROC)
library(knitr)
set.seed(123)
# ---- Example dataset (replace with read.csv(...) to use your file) ----
# Synthetic employee records. Columns are drawn in the order Age, Salary,
# YearsAtJob, Attrition, so the values depend on the RNG state at this point.
n_rows <- 100
df <- data.frame(
  Age = sample(22:60, size = n_rows, replace = TRUE),
  Salary = sample(30000:90000, size = n_rows, replace = TRUE),
  YearsAtJob = sample(1:20, size = n_rows, replace = TRUE),
  # Sampling weights: 0.3 for "Yes", 0.7 for "No".
  Attrition = sample(
    c("Yes", "No"),
    size = n_rows,
    replace = TRUE,
    prob = c(0.3, 0.7)
  )
)
# ---- Data prep: encode the outcome and hold out a test set ----
# "No" is listed first so it becomes the reference level; a binomial glm()
# then models the probability of the second level, "Yes".
df[["Attrition"]] <- factor(df[["Attrition"]], levels = c("No", "Yes"))
# 70% of rows go to training, the remainder to testing. The partition is
# drawn on the outcome column; list = FALSE returns the row indices as a
# matrix instead of a list.
trainIndex <- createDataPartition(y = df[["Attrition"]], p = 0.7, list = FALSE)
train <- df[trainIndex, , drop = FALSE]
test <- df[-trainIndex, , drop = FALSE]
# ---- Fit logistic regression model ----
# Logistic regression of Attrition on the three numeric predictors, fitted on
# the training rows only. Attrition is a two-level factor ("No", "Yes"), so
# with the binomial family the fitted probabilities refer to the second level.
model <- glm(Attrition ~ Age + Salary + YearsAtJob, data = train, family = "binomial")
# Print a header line, then the coefficient table and deviance summary.
cat("=== Model summary ===\n")
## === Model summary ===
print(summary(model))
##
## Call:
## glm(formula = Attrition ~ Age + Salary + YearsAtJob, family = "binomial",
## data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.965e-01 1.527e+00 0.325 0.74504
## Age -5.283e-03 2.597e-02 -0.203 0.83881
## Salary 1.042e-05 1.555e-05 0.670 0.50271
## YearsAtJob -1.367e-01 4.942e-02 -2.766 0.00567 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 94.317 on 70 degrees of freedom
## Residual deviance: 85.277 on 67 degrees of freedom
## AIC: 93.277
##
## Number of Fisher Scoring iterations: 4
# ---- Predict probabilities and classes on test set ----
# Fitted values for the held-out rows; type = "response" puts them on the
# probability scale instead of the link scale.
pred_prob <- predict(model, newdata = test, type = "response")
# 0.5 cut-off: anything at or below it is "No", strictly above is "Yes".
pred_class <- ifelse(pred_prob <= 0.5, "No", "Yes")
# Side-by-side table of the observed outcome, the predicted class, the
# predicted probability (3 d.p.) and the predictors used by the model.
results <- data.frame(
  Actual = test[["Attrition"]],
  Predicted = factor(pred_class, levels = c("No", "Yes")),
  Probability = round(pred_prob, digits = 3),
  Age = test[["Age"]],
  Salary = test[["Salary"]],
  YearsAtJob = test[["YearsAtJob"]]
)
cat("\n=== Sample prediction results (first 10 rows) ===\n")
##
## === Sample prediction results (first 10 rows) ===
kable(head(results, n = 10), caption = "Sample Prediction Results")
## Table: Sample Prediction Results
##
## |   |Actual |Predicted | Probability| Age| Salary| YearsAtJob|
## |:--|:------|:---------|-----------:|---:|------:|----------:|
## |9  |Yes    |Yes       |       0.638|  48|  70502|          3|
## |13 |Yes    |No        |       0.205|  30|  47368|         16|
## |15 |No     |No        |       0.240|  56|  67000|         15|
## |22 |No     |No        |       0.340|  35|  63752|         12|
## |25 |No     |No        |       0.449|  33|  41283|          7|
## |26 |No     |No        |       0.272|  36|  46578|         13|
## |27 |No     |Yes       |       0.576|  53|  74195|          5|
## |48 |No     |Yes       |       0.518|  59|  67782|          6|
## |50 |Yes    |No        |       0.145|  36|  62952|         20|
## |53 |No     |Yes       |       0.505|  37|  78181|          8|
# ---- Confusion matrix & accuracy ----
# Attrition ("Yes") is the event of interest. Without `positive`,
# confusionMatrix() treats the first factor level ("No") as the positive
# class, so sensitivity, specificity and the predictive values would describe
# the "No" class instead of attrition.
confusion <- confusionMatrix(
  data = results$Predicted,
  reference = test$Attrition,
  positive = "Yes"
)
cat("\n=== Confusion Matrix & Metrics ===\n")
##
## === Confusion Matrix & Metrics ===
# The class-specific rows of the recorded output below (Sensitivity through
# Detection Prevalence, and the 'Positive' Class line) are restated for
# positive = "Yes" from the same 2x2 table; the overall statistics do not
# depend on the choice of positive class.
print(confusion)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 13 8
## Yes 5 3
##
## Accuracy : 0.5517
## 95% CI : (0.3569, 0.7355)
## No Information Rate : 0.6207
## P-Value [Acc > NIR] : 0.8310
##
## Kappa : -0.0053
##
## Mcnemar's Test P-Value : 0.5791
##
## Sensitivity : 0.2727
## Specificity : 0.7222
## Pos Pred Value : 0.3750
## Neg Pred Value : 0.6190
## Prevalence : 0.3793
## Detection Rate : 0.1034
## Detection Prevalence : 0.2759
## Balanced Accuracy : 0.4975
##
## 'Positive' Class : Yes
##
# ---- ROC curve & AUC ----
# Pin the class roles and the comparison direction: "No" is the control,
# "Yes" is the case, and a higher predicted probability should indicate "Yes".
# Left on its defaults, roc() chooses the direction from the group medians,
# which can flip the curve and report an optimistic AUC for a model that ranks
# cases below controls.
roc_obj <- roc(
  response = test$Attrition,
  predictor = pred_prob,
  levels = c("No", "Yes"),
  direction = "<"
)
# Compute the AUC once and reuse it for the console line and the plot title.
auc_value <- round(as.numeric(auc(roc_obj)), 3)
cat("\nAUC:", auc_value, "\n")
##
## AUC: 0.53
# NOTE(review): the value recorded above comes from a run that used roc()'s
# automatic direction selection; re-run to confirm it is unchanged.
# Plot ROC
plot(roc_obj, main = paste0("ROC Curve (AUC = ", auc_value, ")"))

# ---- Brief summary output ----
# Headline metrics: accuracy as a percentage (2 d.p.) and AUC (3 d.p.).
accuracy_pct <- round(100 * confusion$overall["Accuracy"], 2)
cat("\nModel Accuracy:", accuracy_pct, "%\n")
##
## Model Accuracy: 55.17 %
auc_rounded <- round(auc(roc_obj), 3)
cat("AUC (Area Under Curve):", auc_rounded, "\n")
## AUC (Area Under Curve): 0.53