1. Load Required Libraries

library(tidyverse)
library(caret)
library(rsample)
library(pROC)

2. Load, Preprocess, and Split

# ── 2a. Load ──────────────────────────────────────────────────────────────────
attrition <- read.csv("test.csv", stringsAsFactors = FALSE)
cat("Raw dimensions:", nrow(attrition), "rows x", ncol(attrition), "cols\n")
## Raw dimensions: 14900 rows x 24 cols
# ── 2b. Clean all column names: spaces/hyphens → underscores, strip dots ─────
# read.csv() converts spaces to dots (e.g. "Monthly Income" -> "Monthly.Income")
# We normalise everything to snake_case for reliable referencing.
colnames(attrition) <- gsub("[. ]+", "_", trimws(colnames(attrition)))
colnames(attrition) <- gsub("_+", "_", colnames(attrition))   # collapse multiples
colnames(attrition) <- gsub("^_|_$", "", colnames(attrition)) # strip leading/trailing
cat("Cleaned column names:\n"); print(colnames(attrition))
## Cleaned column names:
##  [1] "Employee_ID"              "Age"                     
##  [3] "Gender"                   "Years_at_Company"        
##  [5] "Job_Role"                 "Monthly_Income"          
##  [7] "Work_Life_Balance"        "Job_Satisfaction"        
##  [9] "Performance_Rating"       "Number_of_Promotions"    
## [11] "Overtime"                 "Distance_from_Home"      
## [13] "Education_Level"          "Marital_Status"          
## [15] "Number_of_Dependents"     "Job_Level"               
## [17] "Company_Size"             "Company_Tenure"          
## [19] "Remote_Work"              "Leadership_Opportunities"
## [21] "Innovation_Opportunities" "Company_Reputation"      
## [23] "Employee_Recognition"     "Attrition"
# ── 2c. Standardise the target column name ────────────────────────────────────
target_col <- grep("^attrition$", colnames(attrition),
                   ignore.case = TRUE, value = TRUE)[1]
if (is.na(target_col)) stop("No Attrition column found. Check your CSV.")
cat("\nTarget column found as: '", target_col, "'\n", sep = "")
## 
## Target column found as: 'Attrition'
colnames(attrition)[colnames(attrition) == target_col] <- "Attrition"

# Convert to factor with explicit levels
attrition$Attrition <- factor(attrition$Attrition, levels = c("Stayed", "Left"))
cat("Attrition level check:", levels(attrition$Attrition), "\n")
## Attrition level check: Stayed Left
# ── 2c. Factorise Overtime ────────────────────────────────────────────────────
overtime_col <- grep("^overtime$", colnames(attrition),  # matches "Overtime" after cleaning
                     ignore.case = TRUE, value = TRUE)[1]
if (!is.na(overtime_col)) {
  attrition[[overtime_col]] <- as.factor(attrition[[overtime_col]])
  cat("Overtime column: '", overtime_col, "'\n", sep = "")
}
## Overtime column: 'Overtime'
# ── 2d. Remove ID columns ─────────────────────────────────────────────────────
id_cols <- c("Employee_ID", "EmployeeId", "EmployeeNumber", "ID", "Id")
attrition <- attrition[ , !(colnames(attrition) %in% id_cols), drop = FALSE]

# ── 2e. Drop zero-variance columns (NEVER include the target column) ────────
# Running nearZeroVar on the full df can incorrectly flag and drop Attrition
# when it is imbalanced. Restrict the check to predictors only.
predictors_only <- attrition[ , colnames(attrition) != "Attrition", drop = FALSE]
nzv_idx <- caret::nearZeroVar(predictors_only)
if (length(nzv_idx) > 0) {
  drop_cols <- colnames(predictors_only)[nzv_idx]
  cat("Dropping zero-variance cols:", paste(drop_cols, collapse = ", "), "\n")
  attrition <- attrition[ , !(colnames(attrition) %in% drop_cols), drop = FALSE]
}
## Dropping zero-variance cols: Leadership_Opportunities
# ── 2f. Convert remaining character columns to factors ────────────────────────
# Convert all character columns to factors
char_cols <- sapply(attrition, is.character)
attrition[char_cols] <- lapply(attrition[char_cols], as.factor)

# Explicitly ensure Attrition is a well-formed factor (not just any factor)
# This guards against it being silently converted to character by any step above
attrition$Attrition <- factor(as.character(attrition$Attrition),
                               levels = c("Stayed", "Left"))

# Verify Attrition survived all transformations
stopifnot("Attrition" %in% colnames(attrition))
cat("Attrition class:", class(attrition$Attrition),
    "| levels:", levels(attrition$Attrition), "\n")
## Attrition class: factor | levels: Stayed Left
cat("\nFinal column count:", ncol(attrition), "\n")
## 
## Final column count: 22
cat("Missing values per column:\n"); print(colSums(is.na(attrition)))
## Missing values per column:
##                      Age                   Gender         Years_at_Company 
##                        0                        0                        0 
##                 Job_Role           Monthly_Income        Work_Life_Balance 
##                        0                        0                        0 
##         Job_Satisfaction       Performance_Rating     Number_of_Promotions 
##                        0                        0                        0 
##                 Overtime       Distance_from_Home          Education_Level 
##                        0                        0                        0 
##           Marital_Status     Number_of_Dependents                Job_Level 
##                        0                        0                        0 
##             Company_Size           Company_Tenure              Remote_Work 
##                        0                        0                        0 
## Innovation_Opportunities       Company_Reputation     Employee_Recognition 
##                        0                        0                        0 
##                Attrition 
##                        0
# ── 2g. Train-Test Split (70/30) ──────────────────────────────────────────────
set.seed(123)

# Ensure Attrition is a proper factor before split (belt-and-suspenders)
attrition$Attrition <- factor(as.character(attrition$Attrition),
                               levels = c("Stayed", "Left"))

split      <- initial_split(attrition, prop = 0.7, strata = "Attrition")
train_data <- training(split)
test_data  <- testing(split)

# Re-apply factor levels after split — some rsample versions drop them
train_data$Attrition <- factor(as.character(train_data$Attrition),
                                levels = c("Stayed", "Left"))
test_data$Attrition  <- factor(as.character(test_data$Attrition),
                                levels = c("Stayed", "Left"))

cat("\nTraining rows:", nrow(train_data),
    "| Testing rows:", nrow(test_data), "\n")
## 
## Training rows: 10429 | Testing rows: 4471
cat("Train Attrition class:", class(train_data$Attrition), "| levels:", levels(train_data$Attrition), "\n")
## Train Attrition class: factor | levels: Stayed Left
cat("Attrition balance in train:\n"); print(table(train_data$Attrition))
## Attrition balance in train:
## 
## Stayed   Left 
##   5507   4922
cat("Attrition balance in test:\n");  print(table(test_data$Attrition))
## Attrition balance in test:
## 
## Stayed   Left 
##   2361   2110

5. Model 1 – Attrition ~ Monthly_Income

model1 <- glm(Attrition ~ Monthly_Income,
              data   = train_data,
              family = binomial)

summary(model1)
## 
## Call:
## glm(formula = Attrition ~ Monthly_Income, family = binomial, 
##     data = train_data)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)  
## (Intercept)    -1.310e-01  6.954e-02  -1.884   0.0596 .
## Monthly_Income  2.564e-06  9.141e-06   0.280   0.7791  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 14425  on 10428  degrees of freedom
## Residual deviance: 14425  on 10427  degrees of freedom
## AIC: 14429
## 
## Number of Fisher Scoring iterations: 3

6. Model 2 – Attrition ~ Monthly_Income + Overtime

# After column-name cleaning, "Overtime" has no spaces so it stays "Overtime"
# Use grep for safety in case of any remaining capitalisation variation
overtime_col <- grep("^overtime$", colnames(train_data),
                     ignore.case = TRUE, value = TRUE)[1]
if (is.na(overtime_col)) stop("Overtime column not found in training data.")

formula2 <- as.formula(paste("Attrition ~ Monthly_Income +", overtime_col))

model2 <- glm(formula2,
              data   = train_data,
              family = binomial)

summary(model2)
## 
## Call:
## glm(formula = formula2, family = binomial, data = train_data)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -2.104e-01  7.101e-02  -2.963  0.00305 ** 
## Monthly_Income  2.561e-06  9.156e-06   0.280  0.77973    
## OvertimeYes     2.400e-01  4.173e-02   5.751 8.86e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 14425  on 10428  degrees of freedom
## Residual deviance: 14392  on 10426  degrees of freedom
## AIC: 14398
## 
## Number of Fisher Scoring iterations: 3

7. Model 3 – Attrition ~ . (All Predictors)

# FIX: Suppress "fitted probabilities numerically 0 or 1" warning
#      that arises from perfect separation on some columns – it is
#      expected behaviour and does not invalidate the model.
model3 <- suppressWarnings(
  glm(Attrition ~ .,
      data   = train_data,
      family = binomial)
)

summary(model3)
## 
## Call:
## glm(formula = Attrition ~ ., family = binomial, data = train_data)
## 
## Coefficients:
##                                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                       1.429e-01  2.083e-01   0.686 0.492890    
## Age                              -7.592e-03  2.419e-03  -3.138 0.001700 ** 
## GenderMale                       -6.118e-01  5.031e-02 -12.162  < 2e-16 ***
## Years_at_Company                 -1.328e-02  2.863e-03  -4.639 3.50e-06 ***
## Job_RoleFinance                  -2.236e-01  1.182e-01  -1.892 0.058454 .  
## Job_RoleHealthcare               -1.588e-01  1.031e-01  -1.541 0.123378    
## Job_RoleMedia                    -2.883e-01  8.673e-02  -3.324 0.000886 ***
## Job_RoleTechnology               -2.880e-01  1.192e-01  -2.415 0.015720 *  
## Monthly_Income                    3.540e-05  2.021e-05   1.751 0.079873 .  
## Work_Life_BalanceFair             1.385e+00  7.561e-02  18.320  < 2e-16 ***
## Work_Life_BalanceGood             3.294e-01  7.105e-02   4.637 3.54e-06 ***
## Work_Life_BalancePoor             1.682e+00  9.098e-02  18.490  < 2e-16 ***
## Job_SatisfactionLow               6.545e-01  8.550e-02   7.655 1.93e-14 ***
## Job_SatisfactionMedium            1.072e-01  6.634e-02   1.616 0.106151    
## Job_SatisfactionVery High         6.375e-01  6.569e-02   9.705  < 2e-16 ***
## Performance_RatingBelow Average   4.063e-01  7.231e-02   5.619 1.92e-08 ***
## Performance_RatingHigh            1.368e-01  6.345e-02   2.156 0.031081 *  
## Performance_RatingLow             4.531e-01  1.168e-01   3.879 0.000105 ***
## Number_of_Promotions             -2.828e-01  2.548e-02 -11.100  < 2e-16 ***
## OvertimeYes                       3.599e-01  5.286e-02   6.810 9.78e-12 ***
## Distance_from_Home                1.004e-02  8.732e-04  11.493  < 2e-16 ***
## Education_LevelBachelor’s Degree -9.952e-02  6.636e-02  -1.500 0.133682    
## Education_LevelHigh School       -9.133e-02  7.429e-02  -1.229 0.218929    
## Education_LevelMaster’s Degree   -1.585e-01  7.366e-02  -2.152 0.031367 *  
## Education_LevelPhD               -1.912e+00  1.391e-01 -13.741  < 2e-16 ***
## Marital_StatusMarried            -3.238e-01  7.113e-02  -4.552 5.30e-06 ***
## Marital_StatusSingle              1.598e+00  7.794e-02  20.504  < 2e-16 ***
## Number_of_Dependents             -1.201e-01  1.614e-02  -7.446 9.64e-14 ***
## Job_LevelMid                     -1.062e+00  5.521e-02 -19.237  < 2e-16 ***
## Job_LevelSenior                  -2.725e+00  7.946e-02 -34.299  < 2e-16 ***
## Company_SizeMedium               -6.982e-02  6.580e-02  -1.061 0.288651    
## Company_SizeSmall                 1.079e-01  7.186e-02   1.502 0.133027    
## Company_Tenure                    3.363e-04  1.097e-03   0.307 0.759111    
## Remote_WorkYes                   -1.836e+00  7.105e-02 -25.845  < 2e-16 ***
## Innovation_OpportunitiesYes      -9.809e-02  6.820e-02  -1.438 0.150354    
## Company_ReputationFair            5.488e-01  9.745e-02   5.632 1.78e-08 ***
## Company_ReputationGood           -1.921e-02  8.720e-02  -0.220 0.825670    
## Company_ReputationPoor            6.555e-01  9.722e-02   6.742 1.56e-11 ***
## Employee_RecognitionLow           4.538e-02  6.343e-02   0.716 0.474272    
## Employee_RecognitionMedium        3.425e-03  6.695e-02   0.051 0.959204    
## Employee_RecognitionVery High    -1.076e-01  1.237e-01  -0.869 0.384601    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 14424.8  on 10428  degrees of freedom
## Residual deviance:  9912.6  on 10388  degrees of freedom
## AIC: 9994.6
## 
## Number of Fisher Scoring iterations: 5

8. Predictions on the Test Set

# Predicted probabilities
prob1 <- predict(model1, newdata = test_data, type = "response")
prob2 <- predict(model2, newdata = test_data, type = "response")
prob3 <- predict(model3, newdata = test_data, type = "response")

# FIX: Use factor() with the same levels as the outcome column
#      so confusionMatrix() never complains about level mismatches
make_pred <- function(prob) {
  factor(ifelse(prob > 0.5, "Left", "Stayed"), levels = c("Stayed", "Left"))
}

pred1 <- make_pred(prob1)
pred2 <- make_pred(prob2)
pred3 <- make_pred(prob3)

9. Confusion Matrices

cat("=== Model 1 Confusion Matrix ===\n")
## === Model 1 Confusion Matrix ===
cm1 <- confusionMatrix(pred1, test_data$Attrition, positive = "Left")
print(cm1)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Stayed Left
##     Stayed   2361 2110
##     Left        0    0
##                                           
##                Accuracy : 0.5281          
##                  95% CI : (0.5133, 0.5428)
##     No Information Rate : 0.5281          
##     P-Value [Acc > NIR] : 0.5061          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.0000          
##             Specificity : 1.0000          
##          Pos Pred Value :    NaN          
##          Neg Pred Value : 0.5281          
##              Prevalence : 0.4719          
##          Detection Rate : 0.0000          
##    Detection Prevalence : 0.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : Left            
## 
cat("\n=== Model 2 Confusion Matrix ===\n")
## 
## === Model 2 Confusion Matrix ===
cm2 <- confusionMatrix(pred2, test_data$Attrition, positive = "Left")
print(cm2)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Stayed Left
##     Stayed   1657 1362
##     Left      704  748
##                                           
##                Accuracy : 0.5379          
##                  95% CI : (0.5232, 0.5526)
##     No Information Rate : 0.5281          
##     P-Value [Acc > NIR] : 0.09622         
##                                           
##                   Kappa : 0.0573          
##                                           
##  Mcnemar's Test P-Value : < 2e-16         
##                                           
##             Sensitivity : 0.3545          
##             Specificity : 0.7018          
##          Pos Pred Value : 0.5152          
##          Neg Pred Value : 0.5489          
##              Prevalence : 0.4719          
##          Detection Rate : 0.1673          
##    Detection Prevalence : 0.3248          
##       Balanced Accuracy : 0.5282          
##                                           
##        'Positive' Class : Left            
## 
cat("\n=== Model 3 Confusion Matrix ===\n")
## 
## === Model 3 Confusion Matrix ===
cm3 <- confusionMatrix(pred3, test_data$Attrition, positive = "Left")
print(cm3)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Stayed Left
##     Stayed   1791  539
##     Left      570 1571
##                                          
##                Accuracy : 0.752          
##                  95% CI : (0.739, 0.7646)
##     No Information Rate : 0.5281         
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.5027         
##                                          
##  Mcnemar's Test P-Value : 0.3677         
##                                          
##             Sensitivity : 0.7445         
##             Specificity : 0.7586         
##          Pos Pred Value : 0.7338         
##          Neg Pred Value : 0.7687         
##              Prevalence : 0.4719         
##          Detection Rate : 0.3514         
##    Detection Prevalence : 0.4789         
##       Balanced Accuracy : 0.7516         
##                                          
##        'Positive' Class : Left           
## 

10. ROC Curves and AUC Comparison

# FIX: Set direction = "<" so pROC uses the correct orientation
#      (higher probability → more likely "Left" i.e. attrition)
roc1 <- roc(test_data$Attrition, prob1, levels = c("Stayed", "Left"), direction = "<")
roc2 <- roc(test_data$Attrition, prob2, levels = c("Stayed", "Left"), direction = "<")
roc3 <- roc(test_data$Attrition, prob3, levels = c("Stayed", "Left"), direction = "<")

auc1 <- auc(roc1)
auc2 <- auc(roc2)
auc3 <- auc(roc3)

cat("AUC Values:\n")
## AUC Values:
cat("  Model 1 (Monthly_Income only)       :", round(auc1, 4), "\n")
##   Model 1 (Monthly_Income only)       : 0.481
cat("  Model 2 (+ Overtime)               :", round(auc2, 4), "\n")
##   Model 2 (+ Overtime)               : 0.5166
cat("  Model 3 (All predictors)           :", round(auc3, 4), "\n")
##   Model 3 (All predictors)           : 0.839

ROC Curve Plot

# FIX: Plot all three curves in a single, clearly labelled figure
plot(roc1,
     col  = "steelblue",
     lwd  = 2,
     main = "ROC Curves – Logistic Regression Models",
     xlab = "False Positive Rate (1 – Specificity)",
     ylab = "True Positive Rate (Sensitivity)")

plot(roc2, col = "darkorange",  lwd = 2, add = TRUE)
plot(roc3, col = "forestgreen", lwd = 2, add = TRUE)

abline(a = 0, b = 1, lty = 2, col = "grey60")

legend("bottomright",
       legend = c(
         paste0("Model 1  AUC = ", round(auc1, 3)),
         paste0("Model 2  AUC = ", round(auc2, 3)),
         paste0("Model 3  AUC = ", round(auc3, 3))
       ),
       col = c("steelblue", "darkorange", "forestgreen"),
       lwd = 2,
       bty = "n")


11. Model Comparison Summary

# FIX: Collect key metrics in a tidy table for easy comparison
extract_metrics <- function(cm) {
  c(Accuracy    = cm$overall["Accuracy"],
    Sensitivity = cm$byClass["Sensitivity"],
    Specificity = cm$byClass["Specificity"],
    Precision   = cm$byClass["Pos Pred Value"],
    F1          = cm$byClass["F1"])
}

results <- rbind(
  Model1 = c(extract_metrics(cm1), AUC = as.numeric(auc1)),
  Model2 = c(extract_metrics(cm2), AUC = as.numeric(auc2)),
  Model3 = c(extract_metrics(cm3), AUC = as.numeric(auc3))
)

knitr::kable(round(results, 4),
             caption = "Performance Metrics Across Models")
Performance Metrics Across Models
Accuracy.Accuracy Sensitivity.Sensitivity Specificity.Specificity Precision.Pos Pred Value F1.F1 AUC
Model1 0.5281 0.0000 1.0000 NaN NA 0.4810
Model2 0.5379 0.3545 0.7018 0.5152 0.4200 0.5166
Model3 0.7520 0.7445 0.7586 0.7338 0.7391 0.8390