library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(knitr)
library(kableExtra)
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(ggplot2)
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
df <- read.csv("test.csv", stringsAsFactors = FALSE, check.names = TRUE)
cat("Dataset dimensions:", nrow(df), "rows x", ncol(df), "columns\n")
## Dataset dimensions: 14900 rows x 24 columns
cat("Column names:\n")
## Column names:
print(names(df))
## [1] "Employee.ID" "Age"
## [3] "Gender" "Years.at.Company"
## [5] "Job.Role" "Monthly.Income"
## [7] "Work.Life.Balance" "Job.Satisfaction"
## [9] "Performance.Rating" "Number.of.Promotions"
## [11] "Overtime" "Distance.from.Home"
## [13] "Education.Level" "Marital.Status"
## [15] "Number.of.Dependents" "Job.Level"
## [17] "Company.Size" "Company.Tenure"
## [19] "Remote.Work" "Leadership.Opportunities"
## [21] "Innovation.Opportunities" "Company.Reputation"
## [23] "Employee.Recognition" "Attrition"
df %>%
count(Attrition) %>%
mutate(Pct = round(n / sum(n) * 100, 1)) %>%
kable(caption = "Outcome Variable Distribution", col.names = c("Attrition", "Count", "Pct (%)")) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Outcome Variable Distribution
|
Attrition
|
Count
|
Pct (%)
|
|
Left
|
7032
|
47.2
|
|
Stayed
|
7868
|
52.8
|
miss <- colSums(is.na(df))
if (sum(miss) == 0) {
cat("No missing values detected.\n")
} else {
print(miss[miss > 0])
}
## No missing values detected.
df$Attrition_bin <- ifelse(df$Attrition == "Left", 1, 0)
df$Overtime_bin <- ifelse(df$Overtime == "Yes", 1, 0)
cat_cols <- c(
"Gender", "Job.Role", "Work.Life.Balance", "Job.Satisfaction",
"Performance.Rating", "Education.Level", "Marital.Status",
"Job.Level", "Company.Size", "Remote.Work",
"Leadership.Opportunities", "Innovation.Opportunities",
"Company.Reputation", "Employee.Recognition"
)
for (col in cat_cols) {
if (col %in% names(df)) {
df[[paste0(col, "_enc")]] <- as.integer(factor(df[[col]]))
}
}
numeric_cols <- c(
"Age", "Years.at.Company", "Monthly.Income",
"Number.of.Promotions", "Distance.from.Home",
"Number.of.Dependents", "Company.Tenure"
)
enc_cols <- paste0(cat_cols[cat_cols %in% names(df)], "_enc")
all_vars <- c(numeric_cols, enc_cols, "Overtime_bin")
cat("Number of predictors in full model:", length(all_vars), "\n")
## Number of predictors in full model: 22
set.seed(42)
train_idx <- createDataPartition(df$Attrition_bin, p = 0.70, list = FALSE)
train_df <- df[ train_idx, ]
test_df <- df[-train_idx, ]
cat(sprintf(
"Train: %d rows (attrition rate: %.1f%%)\nTest: %d rows (attrition rate: %.1f%%)\n",
nrow(train_df), mean(train_df$Attrition_bin) * 100,
nrow(test_df), mean(test_df$Attrition_bin) * 100
))
## Train: 10430 rows (attrition rate: 47.4%)
## Test: 4470 rows (attrition rate: 46.7%)
df %>%
group_by(Overtime, Attrition) %>%
summarise(n = n(), .groups = "drop") %>%
group_by(Overtime) %>%
mutate(pct = n / sum(n)) %>%
ggplot(aes(x = Overtime, y = pct, fill = Attrition)) +
geom_col(position = "dodge", width = 0.6) +
scale_y_continuous(labels = percent_format()) +
scale_fill_manual(values = c("Left" = "#E74C3C", "Stayed" = "#2E86AB")) +
labs(
title = "Attrition Rate by Overtime Status",
x = "Overtime", y = "Proportion", fill = "Attrition"
) +
theme_minimal(base_size = 13)

ggplot(df, aes(x = Attrition, y = Monthly.Income, fill = Attrition)) +
geom_boxplot(alpha = 0.7, outlier.alpha = 0.3) +
scale_fill_manual(values = c("Left" = "#E74C3C", "Stayed" = "#2E86AB")) +
scale_y_continuous(labels = comma_format()) +
labs(
title = "Monthly Income Distribution by Attrition",
x = "Attrition Status", y = "Monthly Income"
) +
theme_minimal(base_size = 13) +
theme(legend.position = "none")

pseudo_r2 <- function(model) {
1 - (model$deviance / model$null.deviance)
}
eval_model <- function(model, test_data, outcome = "Attrition_bin",
threshold = 0.5, model_name = "Model") {
probs <- predict(model, newdata = test_data, type = "response")
preds <- factor(ifelse(probs >= threshold, 1, 0), levels = c(0, 1))
actual <- factor(test_data[[outcome]], levels = c(0, 1))
cm <- confusionMatrix(preds, actual, positive = "1")
TN <- cm$table[1, 1]; FP <- cm$table[2, 1]
FN <- cm$table[1, 2]; TP <- cm$table[2, 2]
list(
name = model_name,
probs = probs,
preds = preds,
actual = actual,
cm = cm,
TN = TN, FP = FP, FN = FN, TP = TP,
accuracy = as.numeric(cm$overall["Accuracy"]),
precision = as.numeric(cm$byClass["Precision"]),
recall = as.numeric(cm$byClass["Recall"]),
f1 = as.numeric(cm$byClass["F1"])
)
}
or_table <- function(model) {
coefs <- summary(model)$coefficients
or <- exp(coef(model))
ci <- exp(confint.default(model))
data.frame(
Variable = rownames(coefs),
Coefficient = round(coefs[, 1], 4),
Std.Error = round(coefs[, 2], 4),
z.stat = round(coefs[, 3], 3),
p.value = round(coefs[, 4], 4),
Odds.Ratio = round(or, 4),
CI.Lower = round(ci[, 1], 4),
CI.Upper = round(ci[, 2], 4),
Sig = ifelse(coefs[, 4] < .001, "***",
ifelse(coefs[, 4] < .01, "**",
ifelse(coefs[, 4] < .05, "*",
ifelse(coefs[, 4] < .1, ".", ""))))
)
}
m1 <- glm(Attrition_bin ~ Monthly.Income,
data = train_df,
family = binomial(link = "logit"))
summary(m1)
##
## Call:
## glm(formula = Attrition_bin ~ Monthly.Income, family = binomial(link = "logit"),
## data = train_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.485e-02 6.921e-02 -0.504 0.615
## Monthly.Income -9.488e-06 9.106e-06 -1.042 0.297
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 14431 on 10429 degrees of freedom
## Residual deviance: 14430 on 10428 degrees of freedom
## AIC: 14434
##
## Number of Fisher Scoring iterations: 3
cat(sprintf("Log-Likelihood: %.2f\n", logLik(m1)))
## Log-Likelihood: -7214.89
cat(sprintf("McFadden Pseudo R²: %.4f\n", pseudo_r2(m1)))
## McFadden Pseudo R²: 0.0001
cat(sprintf("AIC: %.2f\n", AIC(m1)))
## AIC: 14433.79
or_table(m1) %>%
kable(caption = "Model 1 — Coefficient Table with Odds Ratios",
digits = 4) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE) %>%
column_spec(9, bold = TRUE, color = "darkred")
Model 1 — Coefficient Table with Odds Ratios
|
|
Variable
|
Coefficient
|
Std.Error
|
z.stat
|
p.value
|
Odds.Ratio
|
CI.Lower
|
CI.Upper
|
Sig
|
|
(Intercept)
|
(Intercept)
|
-0.0349
|
0.0692
|
-0.504
|
0.6146
|
0.9657
|
0.8432
|
1.1061
|
|
|
Monthly.Income
|
Monthly.Income
|
0.0000
|
0.0000
|
-1.042
|
0.2974
|
1.0000
|
1.0000
|
1.0000
|
|
res1 <- eval_model(m1, test_df, model_name = "Model 1")
print(res1$cm$table)
## Reference
## Prediction 0 1
## 0 2382 2088
## 1 0 0
data.frame(
Metric = c("Accuracy", "Precision", "Recall", "F1-Score"),
Value = round(c(res1$accuracy, res1$precision, res1$recall, res1$f1), 4)
) %>%
kable(caption = "Model 1 — Performance Metrics") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Model 1 — Performance Metrics
|
Metric
|
Value
|
|
Accuracy
|
0.5329
|
|
Precision
|
NA
|
|
Recall
|
0.0000
|
|
F1-Score
|
NA
|
m2 <- glm(Attrition_bin ~ Monthly.Income + Overtime_bin,
data = train_df,
family = binomial(link = "logit"))
summary(m2)
##
## Call:
## glm(formula = Attrition_bin ~ Monthly.Income + Overtime_bin,
## family = binomial(link = "logit"), data = train_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.115e-01 7.069e-02 -1.577 0.115
## Monthly.Income -9.469e-06 9.120e-06 -1.038 0.299
## Overtime_bin 2.303e-01 4.168e-02 5.526 3.27e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 14431 on 10429 degrees of freedom
## Residual deviance: 14399 on 10427 degrees of freedom
## AIC: 14405
##
## Number of Fisher Scoring iterations: 3
cat(sprintf("Log-Likelihood: %.2f\n", logLik(m2)))
## Log-Likelihood: -7199.61
cat(sprintf("McFadden Pseudo R²: %.4f\n", pseudo_r2(m2)))
## McFadden Pseudo R²: 0.0022
cat(sprintf("AIC: %.2f\n", AIC(m2)))
## AIC: 14405.22
lrt_12 <- anova(m1, m2, test = "Chisq")
print(lrt_12)
## Analysis of Deviance Table
##
## Model 1: Attrition_bin ~ Monthly.Income
## Model 2: Attrition_bin ~ Monthly.Income + Overtime_bin
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 10428 14430
## 2 10427 14399 1 30.567 3.226e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
or_table(m2) %>%
kable(caption = "Model 2 — Coefficient Table with Odds Ratios",
digits = 4) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE) %>%
column_spec(9, bold = TRUE, color = "darkred")
Model 2 — Coefficient Table with Odds Ratios
|
|
Variable
|
Coefficient
|
Std.Error
|
z.stat
|
p.value
|
Odds.Ratio
|
CI.Lower
|
CI.Upper
|
Sig
|
|
(Intercept)
|
(Intercept)
|
-0.1115
|
0.0707
|
-1.577
|
0.1147
|
0.8945
|
0.7788
|
1.0274
|
|
|
Monthly.Income
|
Monthly.Income
|
0.0000
|
0.0000
|
-1.038
|
0.2992
|
1.0000
|
1.0000
|
1.0000
|
|
|
Overtime_bin
|
Overtime_bin
|
0.2303
|
0.0417
|
5.526
|
0.0000
|
1.2590
|
1.1603
|
1.3662
|
***
|
res2 <- eval_model(m2, test_df, model_name = "Model 2")
print(res2$cm$table)
## Reference
## Prediction 0 1
## 0 1689 1355
## 1 693 733
data.frame(
Metric = c("Accuracy", "Precision", "Recall", "F1-Score"),
Value = round(c(res2$accuracy, res2$precision, res2$recall, res2$f1), 4)
) %>%
kable(caption = "Model 2 — Performance Metrics") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Model 2 — Performance Metrics
|
Metric
|
Value
|
|
Accuracy
|
0.5418
|
|
Precision
|
0.5140
|
|
Recall
|
0.3511
|
|
F1-Score
|
0.4172
|
formula_m3 <- as.formula(
paste("Attrition_bin ~", paste(all_vars, collapse = " + "))
)
m3 <- glm(formula_m3,
data = train_df,
family = binomial(link = "logit"))
summary(m3)
##
## Call:
## glm(formula = formula_m3, family = binomial(link = "logit"),
## data = train_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.304e+00 2.746e-01 8.392 < 2e-16 ***
## Age -6.545e-03 2.228e-03 -2.938 0.00330 **
## Years.at.Company -1.205e-02 2.634e-03 -4.575 4.77e-06 ***
## Monthly.Income -2.330e-06 1.285e-05 -0.181 0.85612
## Number.of.Promotions -2.320e-01 2.339e-02 -9.919 < 2e-16 ***
## Distance.from.Home 8.571e-03 8.060e-04 10.634 < 2e-16 ***
## Number.of.Dependents -9.815e-02 1.502e-02 -6.533 6.43e-11 ***
## Company.Tenure 4.642e-04 1.010e-03 0.460 0.64573
## Gender_enc -5.207e-01 4.628e-02 -11.250 < 2e-16 ***
## Job.Role_enc -3.209e-02 1.877e-02 -1.710 0.08729 .
## Work.Life.Balance_enc 1.870e-01 2.448e-02 7.639 2.18e-14 ***
## Job.Satisfaction_enc 1.522e-01 1.884e-02 8.078 6.57e-16 ***
## Performance.Rating_enc 1.223e-01 2.404e-02 5.089 3.61e-07 ***
## Education.Level_enc -1.329e-01 1.910e-02 -6.957 3.47e-12 ***
## Marital.Status_enc 9.276e-01 3.596e-02 25.792 < 2e-16 ***
## Job.Level_enc -1.120e+00 3.336e-02 -33.586 < 2e-16 ***
## Company.Size_enc 1.048e-01 3.281e-02 3.195 0.00140 **
## Remote.Work_enc -1.609e+00 6.504e-02 -24.744 < 2e-16 ***
## Leadership.Opportunities_enc -3.110e-01 1.076e-01 -2.891 0.00385 **
## Innovation.Opportunities_enc -1.811e-01 6.286e-02 -2.881 0.00396 **
## Company.Reputation_enc 7.387e-02 2.660e-02 2.777 0.00548 **
## Employee.Recognition_enc 1.080e-02 2.705e-02 0.399 0.68978
## Overtime_bin 3.180e-01 4.887e-02 6.507 7.68e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 14431 on 10429 degrees of freedom
## Residual deviance: 11308 on 10407 degrees of freedom
## AIC: 11354
##
## Number of Fisher Scoring iterations: 4
cat(sprintf("Log-Likelihood: %.2f\n", logLik(m3)))
## Log-Likelihood: -5654.03
cat(sprintf("McFadden Pseudo R²: %.4f\n", pseudo_r2(m3)))
## McFadden Pseudo R²: 0.2164
cat(sprintf("AIC: %.2f\n", AIC(m3)))
## AIC: 11354.07
lrt_23 <- anova(m2, m3, test = "Chisq")
print(lrt_23)
## Analysis of Deviance Table
##
## Model 1: Attrition_bin ~ Monthly.Income + Overtime_bin
## Model 2: Attrition_bin ~ Age + Years.at.Company + Monthly.Income + Number.of.Promotions +
## Distance.from.Home + Number.of.Dependents + Company.Tenure +
## Gender_enc + Job.Role_enc + Work.Life.Balance_enc + Job.Satisfaction_enc +
## Performance.Rating_enc + Education.Level_enc + Marital.Status_enc +
## Job.Level_enc + Company.Size_enc + Remote.Work_enc + Leadership.Opportunities_enc +
## Innovation.Opportunities_enc + Company.Reputation_enc + Employee.Recognition_enc +
## Overtime_bin
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 10427 14399
## 2 10407 11308 20 3091.2 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
or_table(m3) %>%
kable(caption = "Model 3 — Full Coefficient Table with Odds Ratios",
digits = 4) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE) %>%
column_spec(9, bold = TRUE, color = "darkred") %>%
row_spec(which(or_table(m3)$Sig %in% c("***", "**", "*")),
background = "#F0FFF0")
Model 3 — Full Coefficient Table with Odds Ratios
|
|
Variable
|
Coefficient
|
Std.Error
|
z.stat
|
p.value
|
Odds.Ratio
|
CI.Lower
|
CI.Upper
|
Sig
|
|
(Intercept)
|
(Intercept)
|
2.3042
|
0.2746
|
8.392
|
0.0000
|
10.0160
|
5.8477
|
17.1554
|
***
|
|
Age
|
Age
|
-0.0065
|
0.0022
|
-2.938
|
0.0033
|
0.9935
|
0.9891
|
0.9978
|
**
|
|
Years.at.Company
|
Years.at.Company
|
-0.0120
|
0.0026
|
-4.575
|
0.0000
|
0.9880
|
0.9829
|
0.9931
|
***
|
|
Monthly.Income
|
Monthly.Income
|
0.0000
|
0.0000
|
-0.181
|
0.8561
|
1.0000
|
1.0000
|
1.0000
|
|
|
Number.of.Promotions
|
Number.of.Promotions
|
-0.2320
|
0.0234
|
-9.919
|
0.0000
|
0.7929
|
0.7574
|
0.8301
|
***
|
|
Distance.from.Home
|
Distance.from.Home
|
0.0086
|
0.0008
|
10.634
|
0.0000
|
1.0086
|
1.0070
|
1.0102
|
***
|
|
Number.of.Dependents
|
Number.of.Dependents
|
-0.0981
|
0.0150
|
-6.533
|
0.0000
|
0.9065
|
0.8802
|
0.9336
|
***
|
|
Company.Tenure
|
Company.Tenure
|
0.0005
|
0.0010
|
0.460
|
0.6457
|
1.0005
|
0.9985
|
1.0024
|
|
|
Gender_enc
|
Gender_enc
|
-0.5207
|
0.0463
|
-11.250
|
0.0000
|
0.5941
|
0.5426
|
0.6505
|
***
|
|
Job.Role_enc
|
Job.Role_enc
|
-0.0321
|
0.0188
|
-1.710
|
0.0873
|
0.9684
|
0.9334
|
1.0047
|
.
|
|
Work.Life.Balance_enc
|
Work.Life.Balance_enc
|
0.1870
|
0.0245
|
7.639
|
0.0000
|
1.2057
|
1.1492
|
1.2649
|
***
|
|
Job.Satisfaction_enc
|
Job.Satisfaction_enc
|
0.1522
|
0.0188
|
8.078
|
0.0000
|
1.1644
|
1.1222
|
1.2082
|
***
|
|
Performance.Rating_enc
|
Performance.Rating_enc
|
0.1223
|
0.0240
|
5.089
|
0.0000
|
1.1301
|
1.0781
|
1.1847
|
***
|
|
Education.Level_enc
|
Education.Level_enc
|
-0.1329
|
0.0191
|
-6.957
|
0.0000
|
0.8756
|
0.8434
|
0.9090
|
***
|
|
Marital.Status_enc
|
Marital.Status_enc
|
0.9276
|
0.0360
|
25.792
|
0.0000
|
2.5284
|
2.3563
|
2.7130
|
***
|
|
Job.Level_enc
|
Job.Level_enc
|
-1.1204
|
0.0334
|
-33.586
|
0.0000
|
0.3262
|
0.3055
|
0.3482
|
***
|
|
Company.Size_enc
|
Company.Size_enc
|
0.1048
|
0.0328
|
3.195
|
0.0014
|
1.1105
|
1.0413
|
1.1843
|
**
|
|
Remote.Work_enc
|
Remote.Work_enc
|
-1.6094
|
0.0650
|
-24.744
|
0.0000
|
0.2000
|
0.1761
|
0.2272
|
***
|
|
Leadership.Opportunities_enc
|
Leadership.Opportunities_enc
|
-0.3110
|
0.1076
|
-2.891
|
0.0038
|
0.7327
|
0.5934
|
0.9047
|
**
|
|
Innovation.Opportunities_enc
|
Innovation.Opportunities_enc
|
-0.1811
|
0.0629
|
-2.881
|
0.0040
|
0.8343
|
0.7376
|
0.9437
|
**
|
|
Company.Reputation_enc
|
Company.Reputation_enc
|
0.0739
|
0.0266
|
2.777
|
0.0055
|
1.0767
|
1.0220
|
1.1343
|
**
|
|
Employee.Recognition_enc
|
Employee.Recognition_enc
|
0.0108
|
0.0271
|
0.399
|
0.6898
|
1.0109
|
0.9587
|
1.0659
|
|
|
Overtime_bin
|
Overtime_bin
|
0.3180
|
0.0489
|
6.507
|
0.0000
|
1.3743
|
1.2488
|
1.5124
|
***
|
ot3 <- or_table(m3)
ot3 %>%
filter(Variable != "(Intercept)") %>%
mutate(abs_z = abs(z.stat)) %>%
arrange(desc(abs_z)) %>%
head(12) %>%
ggplot(aes(x = reorder(Variable, abs_z), y = abs_z,
fill = ifelse(Coefficient > 0, "Positive", "Negative"))) +
geom_col(width = 0.7) +
coord_flip() +
scale_fill_manual(values = c("Positive" = "#E74C3C", "Negative" = "#2E86AB"),
name = "Direction") +
labs(
title = "Top 12 Predictors of Attrition (Full Model)",
x = NULL, y = "|z-statistic|"
) +
theme_minimal(base_size = 13)

res3 <- eval_model(m3, test_df, model_name = "Model 3")
print(res3$cm$table)
## Reference
## Prediction 0 1
## 0 1753 669
## 1 629 1419
data.frame(
Metric = c("Accuracy", "Precision", "Recall", "F1-Score"),
Value = round(c(res3$accuracy, res3$precision, res3$recall, res3$f1), 4)
) %>%
kable(caption = "Model 3 — Performance Metrics") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Model 3 — Performance Metrics
|
Metric
|
Value
|
|
Accuracy
|
0.7096
|
|
Precision
|
0.6929
|
|
Recall
|
0.6796
|
|
F1-Score
|
0.6862
|
plot_cm <- function(res) {
df_cm <- data.frame(
Actual = factor(c("Stayed","Stayed","Left","Left"),
levels = c("Left","Stayed")),
Predicted = factor(c("Stayed","Left","Stayed","Left"),
levels = c("Stayed","Left")),
n = c(res$TN, res$FP, res$FN, res$TP),
label = c("TN","FP","FN","TP")
)
ggplot(df_cm, aes(x = Predicted, y = Actual, fill = label)) +
geom_tile(color = "white", linewidth = 1.2) +
geom_text(aes(label = paste0(label, "\n", n)), size = 5, fontface = "bold") +
scale_fill_manual(values = c(
TN = "#AED6F1", FP = "#F1948A", FN = "#F1948A", TP = "#A9DFBF"
)) +
labs(title = res$name, x = "Predicted", y = "Actual") +
theme_minimal(base_size = 12) +
theme(legend.position = "none",
plot.title = element_text(face = "bold", hjust = 0.5))
}
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
grid.arrange(plot_cm(res1), plot_cm(res2), plot_cm(res3), ncol = 3)

comparison <- data.frame(
Metric = c("Log-Likelihood", "McFadden R²", "AIC",
"Accuracy", "Precision", "Recall", "F1-Score",
"True Positives", "False Negatives"),
Model.1 = c(
round(as.numeric(logLik(m1)), 2),
round(pseudo_r2(m1), 4),
round(AIC(m1), 2),
round(res1$accuracy, 4),
round(res1$precision, 4),
round(res1$recall, 4),
round(res1$f1, 4),
res1$TP, res1$FN
),
Model.2 = c(
round(as.numeric(logLik(m2)), 2),
round(pseudo_r2(m2), 4),
round(AIC(m2), 2),
round(res2$accuracy, 4),
round(res2$precision, 4),
round(res2$recall, 4),
round(res2$f1, 4),
res2$TP, res2$FN
),
Model.3 = c(
round(as.numeric(logLik(m3)), 2),
round(pseudo_r2(m3), 4),
round(AIC(m3), 2),
round(res3$accuracy, 4),
round(res3$precision, 4),
round(res3$recall, 4),
round(res3$f1, 4),
res3$TP, res3$FN
)
)
comparison %>%
kable(caption = "Model Comparison — All Metrics",
col.names = c("Metric", "Model 1", "Model 2", "Model 3")) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
row_spec(0, bold = TRUE, background = "#1F497D", color = "white") %>%
row_spec(c(4, 7), bold = TRUE)
Model Comparison — All Metrics
|
Metric
|
Model 1
|
Model 2
|
Model 3
|
|
Log-Likelihood
|
-7214.8900
|
-7199.6100
|
-5654.0300
|
|
McFadden R²
|
0.0001
|
0.0022
|
0.2164
|
|
AIC
|
14433.7900
|
14405.2200
|
11354.0700
|
|
Accuracy
|
0.5329
|
0.5418
|
0.7096
|
|
Precision
|
NA
|
0.5140
|
0.6929
|
|
Recall
|
0.0000
|
0.3511
|
0.6796
|
|
F1-Score
|
NA
|
0.4172
|
0.6862
|
|
True Positives
|
0.0000
|
733.0000
|
1419.0000
|
|
False Negatives
|
2088.0000
|
1355.0000
|
669.0000
|
roc1 <- roc(res1$actual, res1$probs, quiet = TRUE)
roc2 <- roc(res2$actual, res2$probs, quiet = TRUE)
roc3 <- roc(res3$actual, res3$probs, quiet = TRUE)
cat(sprintf("AUC — Model 1: %.4f\n", auc(roc1)))
## AUC — Model 1: 0.4976
cat(sprintf("AUC — Model 2: %.4f\n", auc(roc2)))
## AUC — Model 2: 0.5312
cat(sprintf("AUC — Model 3: %.4f\n", auc(roc3)))
## AUC — Model 3: 0.7891
roc_df <- bind_rows(
data.frame(fpr = 1 - roc1$specificities, tpr = roc1$sensitivities,
Model = sprintf("M1 (AUC=%.3f)", auc(roc1))),
data.frame(fpr = 1 - roc2$specificities, tpr = roc2$sensitivities,
Model = sprintf("M2 (AUC=%.3f)", auc(roc2))),
data.frame(fpr = 1 - roc3$specificities, tpr = roc3$sensitivities,
Model = sprintf("M3 (AUC=%.3f)", auc(roc3)))
)
ggplot(roc_df, aes(x = fpr, y = tpr, colour = Model)) +
geom_line(linewidth = 1.1) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", colour = "grey50") +
scale_colour_manual(values = c("#95A5A6", "#E67E22", "#2E86AB")) +
labs(
title = "ROC Curves — Logistic Regression Models",
x = "False Positive Rate (1 − Specificity)",
y = "True Positive Rate (Sensitivity)",
colour = NULL
) +
theme_minimal(base_size = 13) +
theme(legend.position = c(0.75, 0.25))

perf_long <- data.frame(
Model = rep(c("Model 1", "Model 2", "Model 3"), each = 3),
Metric = rep(c("Accuracy", "Recall", "F1-Score"), 3),
Value = c(
res1$accuracy, res1$recall, res1$f1,
res2$accuracy, res2$recall, res2$f1,
res3$accuracy, res3$recall, res3$f1
)
)
ggplot(perf_long, aes(x = Model, y = Value, fill = Metric)) +
geom_col(position = "dodge", width = 0.6) +
geom_text(aes(label = sprintf("%.3f", Value)),
position = position_dodge(0.6), vjust = -0.4, size = 3.5) +
scale_fill_manual(values = c(
Accuracy = "#2E86AB",
Recall = "#E74C3C",
"F1-Score"= "#27AE60"
)) +
scale_y_continuous(limits = c(0, 0.85), labels = percent_format()) +
labs(
title = "Model Performance Comparison",
x = NULL, y = "Score", fill = NULL
) +
theme_minimal(base_size = 13)
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_col()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_text()`).

sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: x86_64-apple-darwin20
## Running under: macOS Tahoe 26.3
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: Asia/Taipei
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] gridExtra_2.3 scales_1.4.0 pROC_1.19.0.1 kableExtra_1.4.0
## [5] knitr_1.50 caret_7.0-1 lattice_0.22-7 lubridate_1.9.4
## [9] forcats_1.0.1 stringr_1.5.2 dplyr_1.2.0 purrr_1.1.0
## [13] readr_2.1.5 tidyr_1.3.1 tibble_3.3.0 ggplot2_4.0.2
## [17] tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.2.1 viridisLite_0.4.2 timeDate_4041.110
## [4] farver_2.1.2 S7_0.2.0 fastmap_1.2.0
## [7] digest_0.6.37 rpart_4.1.24 timechange_0.3.0
## [10] lifecycle_1.0.5 survival_3.8-3 magrittr_2.0.4
## [13] compiler_4.5.1 rlang_1.1.7 sass_0.4.10
## [16] tools_4.5.1 yaml_2.3.10 data.table_1.17.8
## [19] labeling_0.4.3 plyr_1.8.9 xml2_1.4.0
## [22] RColorBrewer_1.1-3 withr_3.0.2 nnet_7.3-20
## [25] grid_4.5.1 stats4_4.5.1 e1071_1.7-17
## [28] future_1.67.0 globals_0.18.0 iterators_1.0.14
## [31] MASS_7.3-65 cli_3.6.5 rmarkdown_2.29
## [34] generics_0.1.4 rstudioapi_0.17.1 future.apply_1.20.0
## [37] reshape2_1.4.5 tzdb_0.5.0 proxy_0.4-29
## [40] cachem_1.1.0 splines_4.5.1 parallel_4.5.1
## [43] vctrs_0.7.1 hardhat_1.4.2 Matrix_1.7-3
## [46] jsonlite_2.0.0 hms_1.1.3 listenv_0.9.1
## [49] systemfonts_1.3.2 foreach_1.5.2 gower_1.0.2
## [52] jquerylib_0.1.4 recipes_1.3.1 glue_1.8.0
## [55] parallelly_1.45.1 codetools_0.2-20 stringi_1.8.7
## [58] gtable_0.3.6 pillar_1.11.1 htmltools_0.5.8.1
## [61] ipred_0.9-15 lava_1.8.1 R6_2.6.1
## [64] textshaping_1.0.3 evaluate_1.0.5 bslib_0.9.0
## [67] class_7.3-23 Rcpp_1.1.0 svglite_2.2.2
## [70] nlme_3.1-168 prodlim_2025.04.28 xfun_0.53
## [73] pkgconfig_2.0.3 ModelMetrics_1.2.2.2