# ---- Libraries & seed ----
library(caret)
library(pROC)
library(knitr)
set.seed(123)

# ---- Example dataset (replace with read.csv(...) to use your file) ----
df <- data.frame(
  Age = sample(22:60, 100, replace = TRUE),
  Salary = sample(30000:90000, 100, replace = TRUE),
  YearsAtJob = sample(1:20, 100, replace = TRUE),
  Attrition = sample(c("Yes", "No"), 100, replace = TRUE, prob = c(0.3, 0.7))
)

# ---- Data prep: factor target & train/test split ----
df$Attrition <- factor(df$Attrition, levels = c("No", "Yes"))

# Split data: 70% training, 30% testing
trainIndex <- createDataPartition(df$Attrition, p = 0.7, list = FALSE)
train <- df[trainIndex, ]
test  <- df[-trainIndex, ]

# ---- Fit logistic regression model ----
model <- glm(Attrition ~ Age + Salary + YearsAtJob, data = train, family = "binomial")
cat("=== Model summary ===\n")
## === Model summary ===
print(summary(model))
## 
## Call:
## glm(formula = Attrition ~ Age + Salary + YearsAtJob, family = "binomial", 
##     data = train)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  4.965e-01  1.527e+00   0.325  0.74504   
## Age         -5.283e-03  2.597e-02  -0.203  0.83881   
## Salary       1.042e-05  1.555e-05   0.670  0.50271   
## YearsAtJob  -1.367e-01  4.942e-02  -2.766  0.00567 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 94.317  on 70  degrees of freedom
## Residual deviance: 85.277  on 67  degrees of freedom
## AIC: 93.277
## 
## Number of Fisher Scoring iterations: 4
# ---- Predict probabilities and classes on test set ----
pred_prob <- predict(model, test, type = "response")
pred_class <- ifelse(pred_prob > 0.5, "Yes", "No")

# Create result table
results <- data.frame(
  Actual = test$Attrition,
  Predicted = factor(pred_class, levels = c("No","Yes")),
  Probability = round(pred_prob, 3),
  Age = test$Age,
  Salary = test$Salary,
  YearsAtJob = test$YearsAtJob
)

cat("\n=== Sample prediction results (first 10 rows) ===\n")
## 
## === Sample prediction results (first 10 rows) ===
kable(head(results, 10), caption = "Sample Prediction Results")
Sample Prediction Results
Actual Predicted Probability Age Salary YearsAtJob
9 Yes Yes 0.638 48 70502 3
13 Yes No 0.205 30 47368 16
15 No No 0.240 56 67000 15
22 No No 0.340 35 63752 12
25 No No 0.449 33 41283 7
26 No No 0.272 36 46578 13
27 No Yes 0.576 53 74195 5
48 No Yes 0.518 59 67782 6
50 Yes No 0.145 36 62952 20
53 No Yes 0.505 37 78181 8
# ---- Confusion matrix & accuracy ----
confusion <- confusionMatrix(results$Predicted, test$Attrition)
cat("\n=== Confusion Matrix & Metrics ===\n")
## 
## === Confusion Matrix & Metrics ===
print(confusion)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  13   8
##        Yes  5   3
##                                           
##                Accuracy : 0.5517          
##                  95% CI : (0.3569, 0.7355)
##     No Information Rate : 0.6207          
##     P-Value [Acc > NIR] : 0.8310          
##                                           
##                   Kappa : -0.0053         
##                                           
##  Mcnemar's Test P-Value : 0.5791          
##                                           
##             Sensitivity : 0.7222          
##             Specificity : 0.2727          
##          Pos Pred Value : 0.6190          
##          Neg Pred Value : 0.3750          
##              Prevalence : 0.6207          
##          Detection Rate : 0.4483          
##    Detection Prevalence : 0.7241          
##       Balanced Accuracy : 0.4975          
##                                           
##        'Positive' Class : No              
## 
# ---- ROC curve & AUC ----
roc_obj <- roc(test$Attrition, pred_prob)
cat("\nAUC:", round(auc(roc_obj), 3), "\n")
## 
## AUC: 0.53
# Plot ROC
plot(roc_obj, main = paste0("ROC Curve (AUC = ", round(auc(roc_obj),3), ")"))

# ---- Brief summary output ----
cat("\nModel Accuracy:", round(confusion$overall['Accuracy'] * 100, 2), "%\n")
## 
## Model Accuracy: 55.17 %
cat("AUC (Area Under Curve):", round(auc(roc_obj), 3), "\n")
## AUC (Area Under Curve): 0.53