2. Load, Preprocess, and Split
# ── 2a. Load ──────────────────────────────────────────────────────────────────
# Read the raw CSV, keeping text columns as character (no automatic factors),
# then report the shape of the table that was loaded.
attrition <- read.csv(file = "test.csv", stringsAsFactors = FALSE)
cat("Raw dimensions:", dim(attrition)[1L], "rows x", dim(attrition)[2L], "cols\n")
## Raw dimensions: 14900 rows x 24 cols
# ── 2b. Clean all column names: dots/spaces → underscores (snake_case) ───────
# read.csv() has already replaced spaces (and other characters that are invalid
# in R names) with dots, e.g. "Monthly Income" -> "Monthly.Income".
# The three substitutions are chained inside local() so the working copy of the
# names does not linger in the workspace.
colnames(attrition) <- local({
  nm <- trimws(colnames(attrition))
  nm <- gsub("[. ]+", "_", nm)  # each run of dots/spaces becomes one underscore
  nm <- gsub("_+", "_", nm)     # collapse repeated underscores
  gsub("^_|_$", "", nm)         # strip a leading or trailing underscore
})
cat("Cleaned column names:\n")
print(colnames(attrition))
## Cleaned column names:
## [1] "Employee_ID" "Age"
## [3] "Gender" "Years_at_Company"
## [5] "Job_Role" "Monthly_Income"
## [7] "Work_Life_Balance" "Job_Satisfaction"
## [9] "Performance_Rating" "Number_of_Promotions"
## [11] "Overtime" "Distance_from_Home"
## [13] "Education_Level" "Marital_Status"
## [15] "Number_of_Dependents" "Job_Level"
## [17] "Company_Size" "Company_Tenure"
## [19] "Remote_Work" "Leadership_Opportunities"
## [21] "Innovation_Opportunities" "Company_Reputation"
## [23] "Employee_Recognition" "Attrition"
# ── 2c. Standardise the target column name ────────────────────────────────────
# Find the target column case-insensitively; `[1]` yields NA when nothing
# matches, which is turned into an explicit error.
target_col <- grep("^attrition$", colnames(attrition),
                   ignore.case = TRUE, value = TRUE)[1]
if (is.na(target_col)) stop("No Attrition column found. Check your CSV.")
cat("\nTarget column found as: '", target_col, "'\n", sep = "")
##
## Target column found as: 'Attrition'
colnames(attrition)[colnames(attrition) == target_col] <- "Attrition"
# Convert to factor with explicit levels.
# factor() silently maps every label that is not listed in `levels` to NA, so a
# differently spelled label (e.g. "left", " Stayed", "Yes") would quietly wipe
# out the target. Check the labels first and fail loudly instead.
target_levels <- c("Stayed", "Left")
unexpected_labels <- setdiff(
  unique(na.omit(as.character(attrition$Attrition))),
  target_levels
)
if (length(unexpected_labels) > 0) {
  stop("Unexpected Attrition labels: ",
       paste(unexpected_labels, collapse = ", "),
       ". Expected only: ", paste(target_levels, collapse = ", "), ".",
       call. = FALSE)
}
attrition$Attrition <- factor(attrition$Attrition, levels = target_levels)
cat("Attrition level check:", levels(attrition$Attrition), "\n")
## Attrition level check: Stayed Left
# ── 2c (cont.). Factorise Overtime ────────────────────────────────────────────
# Case-insensitive lookup of the overtime column ("Overtime" after cleaning).
# Indexing with [1] gives NA when no column matches, in which case the
# conversion is skipped.
overtime_col <- colnames(attrition)[
  grepl("^overtime$", colnames(attrition), ignore.case = TRUE)
][1]
if (!is.na(overtime_col)) {
  attrition[[overtime_col]] <- as.factor(attrition[[overtime_col]])
  cat("Overtime column: '", overtime_col, "'\n", sep = "")
}
## Overtime column: 'Overtime'
# ── 2d. Remove ID columns ─────────────────────────────────────────────────────
# Known spellings of an identifier column. Any column whose name appears in
# this list is removed; spellings that are absent from the data are ignored.
id_cols <- c("Employee_ID", "EmployeeId", "EmployeeNumber", "ID", "Id")
# match() returns NA for names that are not in id_cols, i.e. the columns to keep.
attrition <- attrition[, is.na(match(colnames(attrition), id_cols)), drop = FALSE]
# ── 2e. Drop near-zero-variance columns (NEVER include the target column) ────
# caret::nearZeroVar() flags *near*-zero-variance predictors, not only constant
# ones: with its default thresholds (freqCut = 95/5, uniqueCut = 10) a heavily
# imbalanced column is removed too, so the log message says "near-zero-variance"
# rather than "zero-variance".
# Running nearZeroVar on the full df can also flag and drop Attrition when it is
# imbalanced, so the check is restricted to predictors only.
predictors_only <- attrition[, colnames(attrition) != "Attrition", drop = FALSE]
nzv_idx <- caret::nearZeroVar(predictors_only)
if (length(nzv_idx) > 0) {
  # nzv_idx holds column positions within predictors_only; map them to names
  # before filtering the full data frame.
  drop_cols <- colnames(predictors_only)[nzv_idx]
  cat("Dropping near-zero-variance cols:", paste(drop_cols, collapse = ", "), "\n")
  attrition <- attrition[, !(colnames(attrition) %in% drop_cols), drop = FALSE]
}
## Dropping near-zero-variance cols: Leadership_Opportunities
# ── 2f. Convert remaining character columns to factors ────────────────────────
# Logical mask (one entry per column) marking the columns still stored as text;
# only those columns are replaced by their factor versions.
char_cols <- vapply(attrition, is.character, logical(1))
attrition[char_cols] <- lapply(attrition[char_cols], as.factor)
# Rebuild the target from its labels so the level order is exactly
# Stayed, Left regardless of what the steps above did to it.
attrition$Attrition <- factor(
  as.character(attrition$Attrition),
  levels = c("Stayed", "Left")
)
# Sanity check that the target column is present after all transformations.
# NOTE(review): the assignment just above already creates or overwrites the
# column, so this check cannot fail at this point.
stopifnot("Attrition" %in% colnames(attrition))
cat("Attrition class:", class(attrition$Attrition),
    "| levels:", levels(attrition$Attrition), "\n")
## Attrition class: factor | levels: Stayed Left
cat("\nFinal column count:", ncol(attrition), "\n")
##
## Final column count: 22
# Per-column count of missing values.
cat("Missing values per column:\n")
print(colSums(is.na(attrition)))
## Missing values per column:
## Age Gender Years_at_Company
## 0 0 0
## Job_Role Monthly_Income Work_Life_Balance
## 0 0 0
## Job_Satisfaction Performance_Rating Number_of_Promotions
## 0 0 0
## Overtime Distance_from_Home Education_Level
## 0 0 0
## Marital_Status Number_of_Dependents Job_Level
## 0 0 0
## Company_Size Company_Tenure Remote_Work
## 0 0 0
## Innovation_Opportunities Company_Reputation Employee_Recognition
## 0 0 0
## Attrition
## 0
# ── 2g. Train-Test Split (70/30) ──────────────────────────────────────────────
# Helper: rebuild the target as a factor with the level order Stayed, Left.
as_attrition_factor <- function(x) {
  factor(as.character(x), levels = c("Stayed", "Left"))
}
# Seed the RNG so the split below is reproducible.
set.seed(123)
# Normalise the target once more before splitting (belt-and-suspenders).
attrition$Attrition <- as_attrition_factor(attrition$Attrition)
# 70/30 split, stratified on the target.
split <- initial_split(attrition, prop = 0.7, strata = "Attrition")
train_data <- training(split)
test_data <- testing(split)
# Re-apply the factor levels on both partitions in case the split dropped them.
train_data$Attrition <- as_attrition_factor(train_data$Attrition)
test_data$Attrition <- as_attrition_factor(test_data$Attrition)
cat("\nTraining rows:", nrow(train_data),
    "| Testing rows:", nrow(test_data), "\n")
##
## Training rows: 10429 | Testing rows: 4471
cat("Train Attrition class:", class(train_data$Attrition), "| levels:", levels(train_data$Attrition), "\n")
## Train Attrition class: factor | levels: Stayed Left
# Class counts in each partition.
cat("Attrition balance in train:\n")
print(table(train_data$Attrition))
## Attrition balance in train:
##
## Stayed Left
## 5507 4922
cat("Attrition balance in test:\n")
print(table(test_data$Attrition))
## Attrition balance in test:
##
## Stayed Left
## 2361 2110
9. Confusion Matrices
cat("=== Model 1 Confusion Matrix ===\n")
## === Model 1 Confusion Matrix ===
# Cross-tabulate Model 1's predicted classes against the test labels, treating
# "Left" as the positive class.
# In the recorded output below every test row is predicted "Stayed" (the "Left"
# prediction row is 0 / 0), which is why Sensitivity is 0 and Pos Pred Value is NaN.
cm1 <- confusionMatrix(
  data = pred1,
  reference = test_data$Attrition,
  positive = "Left"
)
print(cm1)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Stayed Left
## Stayed 2361 2110
## Left 0 0
##
## Accuracy : 0.5281
## 95% CI : (0.5133, 0.5428)
## No Information Rate : 0.5281
## P-Value [Acc > NIR] : 0.5061
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.0000
## Specificity : 1.0000
## Pos Pred Value : NaN
## Neg Pred Value : 0.5281
## Prevalence : 0.4719
## Detection Rate : 0.0000
## Detection Prevalence : 0.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : Left
##
cat("\n=== Model 2 Confusion Matrix ===\n")
##
## === Model 2 Confusion Matrix ===
# Cross-tabulate Model 2's predicted classes against the test labels, treating
# "Left" as the positive class.
cm2 <- confusionMatrix(
  data = pred2,
  reference = test_data$Attrition,
  positive = "Left"
)
print(cm2)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Stayed Left
## Stayed 1657 1362
## Left 704 748
##
## Accuracy : 0.5379
## 95% CI : (0.5232, 0.5526)
## No Information Rate : 0.5281
## P-Value [Acc > NIR] : 0.09622
##
## Kappa : 0.0573
##
## Mcnemar's Test P-Value : < 2e-16
##
## Sensitivity : 0.3545
## Specificity : 0.7018
## Pos Pred Value : 0.5152
## Neg Pred Value : 0.5489
## Prevalence : 0.4719
## Detection Rate : 0.1673
## Detection Prevalence : 0.3248
## Balanced Accuracy : 0.5282
##
## 'Positive' Class : Left
##
cat("\n=== Model 3 Confusion Matrix ===\n")
##
## === Model 3 Confusion Matrix ===
# Cross-tabulate Model 3's predicted classes against the test labels, treating
# "Left" as the positive class.
cm3 <- confusionMatrix(
  data = pred3,
  reference = test_data$Attrition,
  positive = "Left"
)
print(cm3)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Stayed Left
## Stayed 1791 539
## Left 570 1571
##
## Accuracy : 0.752
## 95% CI : (0.739, 0.7646)
## No Information Rate : 0.5281
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.5027
##
## Mcnemar's Test P-Value : 0.3677
##
## Sensitivity : 0.7445
## Specificity : 0.7586
## Pos Pred Value : 0.7338
## Neg Pred Value : 0.7687
## Prevalence : 0.4719
## Detection Rate : 0.3514
## Detection Prevalence : 0.4789
## Balanced Accuracy : 0.7516
##
## 'Positive' Class : Left
##
10. ROC Curves and AUC Comparison
# Build one ROC object per model from the test labels and the model's scores.
# levels = c("Stayed", "Left") names the control and case classes, and
# direction = "<" fixes the orientation (higher score → more likely "Left",
# i.e. attrition) instead of leaving the choice to pROC.
roc1 <- roc(response = test_data$Attrition, predictor = prob1,
            levels = c("Stayed", "Left"), direction = "<")
roc2 <- roc(response = test_data$Attrition, predictor = prob2,
            levels = c("Stayed", "Left"), direction = "<")
roc3 <- roc(response = test_data$Attrition, predictor = prob3,
            levels = c("Stayed", "Left"), direction = "<")
# Area under each curve.
auc1 <- auc(roc1)
auc2 <- auc(roc2)
auc3 <- auc(roc3)
# Report the AUCs rounded to four decimals.
cat("AUC Values:\n")
## AUC Values:
cat(" Model 1 (Monthly_Income only) :", round(auc1, 4), "\n")
## Model 1 (Monthly_Income only) : 0.481
cat(" Model 2 (+ Overtime) :", round(auc2, 4), "\n")
## Model 2 (+ Overtime) : 0.5166
cat(" Model 3 (All predictors) :", round(auc3, 4), "\n")
## Model 3 (All predictors) : 0.839
ROC Curve Plot
# Plot all three test-set ROC curves in a single, clearly labelled figure.
# legacy.axes = TRUE makes the x-axis ticks read 1 - specificity (0 → 1), which
# is what the "False Positive Rate" label promises; pROC's default ticks show
# specificity decreasing from 1 to 0. The option only relabels the axis: the
# plotting coordinates stay (specificity, sensitivity).
plot(roc1,
     col = "steelblue",
     lwd = 2,
     legacy.axes = TRUE,
     main = "ROC Curves – Logistic Regression Models",
     xlab = "False Positive Rate (1 – Specificity)",
     ylab = "True Positive Rate (Sensitivity)")
plot(roc2, col = "darkorange", lwd = 2, add = TRUE)
plot(roc3, col = "forestgreen", lwd = 2, add = TRUE)
# Chance (no-discrimination) line. In specificity coordinates it is
# sensitivity = 1 - specificity, i.e. intercept 1 and slope -1; intercept 0 with
# slope 1 would draw the opposite diagonal across the curves.
abline(a = 1, b = -1, lty = 2, col = "grey60")
# Legend pairing each curve colour with its AUC (three decimals).
legend("bottomright",
       legend = c(
         paste0("Model 1 AUC = ", round(auc1, 3)),
         paste0("Model 2 AUC = ", round(auc2, 3)),
         paste0("Model 3 AUC = ", round(auc3, 3))
       ),
       col = c("steelblue", "darkorange", "forestgreen"),
       lwd = 2,
       bty = "n")
