Employee Attrition Prediction (single R chunk)

# ---- Libraries & seed ----
library(caret)
library(pROC)
library(knitr)
set.seed(123)

# ---- Example dataset (replace with read.csv(...) to use your file) ----
# Synthetic stand-in data: 100 employees with uniformly sampled numeric
# predictors and a "Yes"/"No" attrition label drawn with probabilities
# 0.3 / 0.7. The columns are drawn in the same order as they appear in the
# data frame so the random stream under the seed set above is unchanged.
n_obs <- 100
age_draw    <- sample(x = 22:60, size = n_obs, replace = TRUE)
salary_draw <- sample(x = 30000:90000, size = n_obs, replace = TRUE)
tenure_draw <- sample(x = 1:20, size = n_obs, replace = TRUE)
label_draw  <- sample(
  x = c("Yes", "No"),
  size = n_obs,
  replace = TRUE,
  prob = c(0.3, 0.7)
)
df <- data.frame(
  Age = age_draw,
  Salary = salary_draw,
  YearsAtJob = tenure_draw,
  Attrition = label_draw
)

# ---- Data prep: factor target & train/test split ----
# Encode the outcome as a two-level factor with "No" as the first level and
# "Yes" as the second.
df[["Attrition"]] <- factor(df[["Attrition"]], levels = c("No", "Yes"))

# Partition on the outcome: 70% of rows go to training, the rest to testing.
# createDataPartition() returns a one-column matrix here (list = FALSE);
# its single column holds the selected row positions.
trainIndex <- createDataPartition(y = df[["Attrition"]], p = 0.7, list = FALSE)
rows_train <- trainIndex[, 1]
train <- df[rows_train, ]
test  <- df[-rows_train, ]

# ---- Fit logistic regression model ----
# Binomial GLM (logistic regression) of Attrition on the three numeric
# predictors, fitted on the training rows only.
model <- glm(
  formula = Attrition ~ Age + Salary + YearsAtJob,
  family = "binomial",
  data = train
)
cat("=== Model summary ===\n")

## === Model summary ===

# Coefficient table, deviances and AIC for the fitted model.
print(summary(model))

## 
## Call:
## glm(formula = Attrition ~ Age + Salary + YearsAtJob, family = "binomial", 
##     data = train)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  4.965e-01  1.527e+00   0.325  0.74504   
## Age         -5.283e-03  2.597e-02  -0.203  0.83881   
## Salary       1.042e-05  1.555e-05   0.670  0.50271   
## YearsAtJob  -1.367e-01  4.942e-02  -2.766  0.00567 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 94.317  on 70  degrees of freedom
## Residual deviance: 85.277  on 67  degrees of freedom
## AIC: 93.277
## 
## Number of Fisher Scoring iterations: 4

# ---- Predict probabilities and classes on test set ----
# Fitted probabilities for the held-out rows (type = "response" returns
# values on the probability scale), then a hard label using a 0.5 cutoff.
cutoff <- 0.5
pred_prob <- predict(model, newdata = test, type = "response")
pred_class <- ifelse(pred_prob > cutoff, "Yes", "No")

# One row per test observation: true label, predicted label, rounded
# probability, and the predictor values that produced it.
results <- data.frame(
  Actual = test[["Attrition"]],
  Predicted = factor(pred_class, levels = c("No", "Yes")),
  Probability = round(pred_prob, digits = 3),
  Age = test[["Age"]],
  Salary = test[["Salary"]],
  YearsAtJob = test[["YearsAtJob"]]
)

cat("\n=== Sample prediction results (first 10 rows) ===\n")

## 
## === Sample prediction results (first 10 rows) ===

# Render the first ten rows as a captioned table.
kable(head(results, n = 10), caption = "Sample Prediction Results")

Sample Prediction Results
	Actual	Predicted	Probability	Age	Salary	YearsAtJob
9	Yes	Yes	0.638	48	70502	3
13	Yes	No	0.205	30	47368	16
15	No	No	0.240	56	67000	15
22	No	No	0.340	35	63752	12
25	No	No	0.449	33	41283	7
26	No	No	0.272	36	46578	13
27	No	Yes	0.576	53	74195	5
48	No	Yes	0.518	59	67782	6
50	Yes	No	0.145	36	62952	20
53	No	Yes	0.505	37	78181	8

# ---- Confusion matrix & accuracy ----
# Cross-tabulate predicted vs. actual labels on the test set.
# `positive = "Yes"` is set explicitly: confusionMatrix() otherwise treats the
# first factor level ("No") as the positive class, which makes sensitivity,
# specificity, PPV and NPV describe non-attrition instead of the attrition
# event this script is trying to detect. Accuracy, kappa and the 2x2 table
# itself do not depend on this choice.
confusion <- confusionMatrix(
  data = results$Predicted,
  reference = test$Attrition,
  positive = "Yes"
)
cat("\n=== Confusion Matrix & Metrics ===\n")

## 
## === Confusion Matrix & Metrics ===

print(confusion)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  13   8
##        Yes  5   3
##                                           
##                Accuracy : 0.5517          
##                  95% CI : (0.3569, 0.7355)
##     No Information Rate : 0.6207          
##     P-Value [Acc > NIR] : 0.8310          
##                                           
##                   Kappa : -0.0053         
##                                           
##  Mcnemar's Test P-Value : 0.5791          
##                                           
##             Sensitivity : 0.2727          
##             Specificity : 0.7222          
##          Pos Pred Value : 0.3750          
##          Neg Pred Value : 0.6190          
##              Prevalence : 0.3793          
##          Detection Rate : 0.1034          
##    Detection Prevalence : 0.2759          
##       Balanced Accuracy : 0.4975          
##                                           
##        'Positive' Class : Yes             
##

# ---- ROC curve & AUC ----
# ROC curve of the predicted probabilities against the true test labels.
# `levels` and `direction` are pinned instead of auto-detected: "No" is the
# control group, "Yes" the case group, and higher predicted probability is
# taken to indicate "Yes". With auto-detection pROC picks whichever direction
# the group medians suggest, which reports an optimistic AUC (>= 0.5) even
# when the model ranks cases below controls.
roc_obj <- roc(
  response = test$Attrition,
  predictor = pred_prob,
  levels = c("No", "Yes"),
  direction = "<"
)

# Rounded AUC, computed once and reused for the console line and plot title.
auc_value <- round(as.numeric(auc(roc_obj)), 3)
cat("\nAUC:", auc_value, "\n")

# NOTE(review): the echoed value below was recorded with auto-detected
# direction; re-render to confirm it is unchanged with direction = "<".
## 
## AUC: 0.53

# Plot ROC
plot(roc_obj, main = paste0("ROC Curve (AUC = ", auc_value, ")"))

# ---- Brief summary output ----
# Headline numbers: overall test accuracy as a percentage (two decimals)
# followed by the ROC AUC (three decimals).
accuracy_pct <- round(confusion$overall[["Accuracy"]] * 100, digits = 2)
cat("\nModel Accuracy:", accuracy_pct, "%\n")

## 
## Model Accuracy: 55.17 %

auc_rounded <- round(auc(roc_obj), digits = 3)
cat("AUC (Area Under Curve):", auc_rounded, "\n")

## AUC (Area Under Curve): 0.53