library(tidyverse)      
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)          
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(knitr)          
library(kableExtra)     
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(pROC)           
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(ggplot2)        
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
df <- read.csv("test.csv", stringsAsFactors = FALSE, check.names = TRUE)

cat("Dataset dimensions:", nrow(df), "rows x", ncol(df), "columns\n")
## Dataset dimensions: 14900 rows x 24 columns
cat("Column names:\n")
## Column names:
print(names(df))
##  [1] "Employee.ID"              "Age"                     
##  [3] "Gender"                   "Years.at.Company"        
##  [5] "Job.Role"                 "Monthly.Income"          
##  [7] "Work.Life.Balance"        "Job.Satisfaction"        
##  [9] "Performance.Rating"       "Number.of.Promotions"    
## [11] "Overtime"                 "Distance.from.Home"      
## [13] "Education.Level"          "Marital.Status"          
## [15] "Number.of.Dependents"     "Job.Level"               
## [17] "Company.Size"             "Company.Tenure"          
## [19] "Remote.Work"              "Leadership.Opportunities"
## [21] "Innovation.Opportunities" "Company.Reputation"      
## [23] "Employee.Recognition"     "Attrition"
df %>%
  count(Attrition) %>%
  mutate(Pct = round(n / sum(n) * 100, 1)) %>%
  kable(caption = "Outcome Variable Distribution", col.names = c("Attrition", "Count", "Pct (%)")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Outcome Variable Distribution
Attrition Count Pct (%)
Left 7032 47.2
Stayed 7868 52.8
miss <- colSums(is.na(df))
if (sum(miss) == 0) {
  cat("No missing values detected.\n")
} else {
  print(miss[miss > 0])
}
## No missing values detected.
df$Attrition_bin <- ifelse(df$Attrition == "Left", 1, 0)

df$Overtime_bin <- ifelse(df$Overtime == "Yes", 1, 0)

cat_cols <- c(
  "Gender", "Job.Role", "Work.Life.Balance", "Job.Satisfaction",
  "Performance.Rating", "Education.Level", "Marital.Status",
  "Job.Level", "Company.Size", "Remote.Work",
  "Leadership.Opportunities", "Innovation.Opportunities",
  "Company.Reputation", "Employee.Recognition"
)

for (col in cat_cols) {
  if (col %in% names(df)) {
    df[[paste0(col, "_enc")]] <- as.integer(factor(df[[col]]))
  }
}


numeric_cols <- c(
  "Age", "Years.at.Company", "Monthly.Income",
  "Number.of.Promotions", "Distance.from.Home",
  "Number.of.Dependents", "Company.Tenure"
)

enc_cols <- paste0(cat_cols[cat_cols %in% names(df)], "_enc")
all_vars <- c(numeric_cols, enc_cols, "Overtime_bin")

cat("Number of predictors in full model:", length(all_vars), "\n")
## Number of predictors in full model: 22
set.seed(42)

train_idx <- createDataPartition(df$Attrition_bin, p = 0.70, list = FALSE)
train_df  <- df[ train_idx, ]
test_df   <- df[-train_idx, ]

cat(sprintf(
  "Train: %d rows (attrition rate: %.1f%%)\nTest:  %d rows (attrition rate: %.1f%%)\n",
  nrow(train_df), mean(train_df$Attrition_bin) * 100,
  nrow(test_df),  mean(test_df$Attrition_bin)  * 100
))
## Train: 10430 rows (attrition rate: 47.4%)
## Test:  4470 rows (attrition rate: 46.7%)
df %>%
  group_by(Overtime, Attrition) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(Overtime) %>%
  mutate(pct = n / sum(n)) %>%
  ggplot(aes(x = Overtime, y = pct, fill = Attrition)) +
  geom_col(position = "dodge", width = 0.6) +
  scale_y_continuous(labels = percent_format()) +
  scale_fill_manual(values = c("Left" = "#E74C3C", "Stayed" = "#2E86AB")) +
  labs(
    title = "Attrition Rate by Overtime Status",
    x = "Overtime", y = "Proportion", fill = "Attrition"
  ) +
  theme_minimal(base_size = 13)

ggplot(df, aes(x = Attrition, y = Monthly.Income, fill = Attrition)) +
  geom_boxplot(alpha = 0.7, outlier.alpha = 0.3) +
  scale_fill_manual(values = c("Left" = "#E74C3C", "Stayed" = "#2E86AB")) +
  scale_y_continuous(labels = comma_format()) +
  labs(
    title = "Monthly Income Distribution by Attrition",
    x = "Attrition Status", y = "Monthly Income"
  ) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "none")

pseudo_r2 <- function(model) {
  1 - (model$deviance / model$null.deviance)
}

eval_model <- function(model, test_data, outcome = "Attrition_bin",
                       threshold = 0.5, model_name = "Model") {

  probs  <- predict(model, newdata = test_data, type = "response")
  preds  <- factor(ifelse(probs >= threshold, 1, 0), levels = c(0, 1))
  actual <- factor(test_data[[outcome]], levels = c(0, 1))

  cm  <- confusionMatrix(preds, actual, positive = "1")
  TN  <- cm$table[1, 1]; FP <- cm$table[2, 1]
  FN  <- cm$table[1, 2]; TP <- cm$table[2, 2]

  list(
    name      = model_name,
    probs     = probs,
    preds     = preds,
    actual    = actual,
    cm        = cm,
    TN = TN, FP = FP, FN = FN, TP = TP,
    accuracy  = as.numeric(cm$overall["Accuracy"]),
    precision = as.numeric(cm$byClass["Precision"]),
    recall    = as.numeric(cm$byClass["Recall"]),
    f1        = as.numeric(cm$byClass["F1"])
  )
}

or_table <- function(model) {
  coefs <- summary(model)$coefficients
  or    <- exp(coef(model))
  ci    <- exp(confint.default(model))
  data.frame(
    Variable   = rownames(coefs),
    Coefficient = round(coefs[, 1], 4),
    Std.Error  = round(coefs[, 2], 4),
    z.stat     = round(coefs[, 3], 3),
    p.value    = round(coefs[, 4], 4),
    Odds.Ratio = round(or, 4),
    CI.Lower   = round(ci[, 1], 4),
    CI.Upper   = round(ci[, 2], 4),
    Sig        = ifelse(coefs[, 4] < .001, "***",
                 ifelse(coefs[, 4] < .01,  "**",
                 ifelse(coefs[, 4] < .05,  "*",
                 ifelse(coefs[, 4] < .1,   ".", ""))))
  )
}
m1 <- glm(Attrition_bin ~ Monthly.Income,
          data   = train_df,
          family = binomial(link = "logit"))

summary(m1)
## 
## Call:
## glm(formula = Attrition_bin ~ Monthly.Income, family = binomial(link = "logit"), 
##     data = train_df)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)
## (Intercept)    -3.485e-02  6.921e-02  -0.504    0.615
## Monthly.Income -9.488e-06  9.106e-06  -1.042    0.297
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 14431  on 10429  degrees of freedom
## Residual deviance: 14430  on 10428  degrees of freedom
## AIC: 14434
## 
## Number of Fisher Scoring iterations: 3
cat(sprintf("Log-Likelihood:    %.2f\n", logLik(m1)))
## Log-Likelihood:    -7214.89
cat(sprintf("McFadden Pseudo R²: %.4f\n", pseudo_r2(m1)))
## McFadden Pseudo R²: 0.0001
cat(sprintf("AIC:               %.2f\n",  AIC(m1)))
## AIC:               14433.79
or_table(m1) %>%
  kable(caption = "Model 1 — Coefficient Table with Odds Ratios",
        digits   = 4) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE) %>%
  column_spec(9, bold = TRUE, color = "darkred")
Model 1 — Coefficient Table with Odds Ratios
Variable Coefficient Std.Error z.stat p.value Odds.Ratio CI.Lower CI.Upper Sig
(Intercept) (Intercept) -0.0349 0.0692 -0.504 0.6146 0.9657 0.8432 1.1061
Monthly.Income Monthly.Income 0.0000 0.0000 -1.042 0.2974 1.0000 1.0000 1.0000
res1 <- eval_model(m1, test_df, model_name = "Model 1")
print(res1$cm$table)
##           Reference
## Prediction    0    1
##          0 2382 2088
##          1    0    0
data.frame(
  Metric    = c("Accuracy", "Precision", "Recall", "F1-Score"),
  Value     = round(c(res1$accuracy, res1$precision, res1$recall, res1$f1), 4)
) %>%
  kable(caption = "Model 1 — Performance Metrics") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Model 1 — Performance Metrics
Metric Value
Accuracy 0.5329
Precision NA
Recall 0.0000
F1-Score NA
m2 <- glm(Attrition_bin ~ Monthly.Income + Overtime_bin,
          data   = train_df,
          family = binomial(link = "logit"))

summary(m2)
## 
## Call:
## glm(formula = Attrition_bin ~ Monthly.Income + Overtime_bin, 
##     family = binomial(link = "logit"), data = train_df)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -1.115e-01  7.069e-02  -1.577    0.115    
## Monthly.Income -9.469e-06  9.120e-06  -1.038    0.299    
## Overtime_bin    2.303e-01  4.168e-02   5.526 3.27e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 14431  on 10429  degrees of freedom
## Residual deviance: 14399  on 10427  degrees of freedom
## AIC: 14405
## 
## Number of Fisher Scoring iterations: 3
cat(sprintf("Log-Likelihood:    %.2f\n", logLik(m2)))
## Log-Likelihood:    -7199.61
cat(sprintf("McFadden Pseudo R²: %.4f\n", pseudo_r2(m2)))
## McFadden Pseudo R²: 0.0022
cat(sprintf("AIC:               %.2f\n",  AIC(m2)))
## AIC:               14405.22
lrt_12 <- anova(m1, m2, test = "Chisq")
print(lrt_12)
## Analysis of Deviance Table
## 
## Model 1: Attrition_bin ~ Monthly.Income
## Model 2: Attrition_bin ~ Monthly.Income + Overtime_bin
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1     10428      14430                          
## 2     10427      14399  1   30.567 3.226e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
or_table(m2) %>%
  kable(caption = "Model 2 — Coefficient Table with Odds Ratios",
        digits   = 4) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE) %>%
  column_spec(9, bold = TRUE, color = "darkred")
Model 2 — Coefficient Table with Odds Ratios
Variable Coefficient Std.Error z.stat p.value Odds.Ratio CI.Lower CI.Upper Sig
(Intercept) (Intercept) -0.1115 0.0707 -1.577 0.1147 0.8945 0.7788 1.0274
Monthly.Income Monthly.Income 0.0000 0.0000 -1.038 0.2992 1.0000 1.0000 1.0000
Overtime_bin Overtime_bin 0.2303 0.0417 5.526 0.0000 1.2590 1.1603 1.3662 ***
res2 <- eval_model(m2, test_df, model_name = "Model 2")
print(res2$cm$table)
##           Reference
## Prediction    0    1
##          0 1689 1355
##          1  693  733
data.frame(
  Metric = c("Accuracy", "Precision", "Recall", "F1-Score"),
  Value  = round(c(res2$accuracy, res2$precision, res2$recall, res2$f1), 4)
) %>%
  kable(caption = "Model 2 — Performance Metrics") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Model 2 — Performance Metrics
Metric Value
Accuracy 0.5418
Precision 0.5140
Recall 0.3511
F1-Score 0.4172
formula_m3 <- as.formula(
  paste("Attrition_bin ~", paste(all_vars, collapse = " + "))
)

m3 <- glm(formula_m3,
          data   = train_df,
          family = binomial(link = "logit"))

summary(m3)
## 
## Call:
## glm(formula = formula_m3, family = binomial(link = "logit"), 
##     data = train_df)
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   2.304e+00  2.746e-01   8.392  < 2e-16 ***
## Age                          -6.545e-03  2.228e-03  -2.938  0.00330 ** 
## Years.at.Company             -1.205e-02  2.634e-03  -4.575 4.77e-06 ***
## Monthly.Income               -2.330e-06  1.285e-05  -0.181  0.85612    
## Number.of.Promotions         -2.320e-01  2.339e-02  -9.919  < 2e-16 ***
## Distance.from.Home            8.571e-03  8.060e-04  10.634  < 2e-16 ***
## Number.of.Dependents         -9.815e-02  1.502e-02  -6.533 6.43e-11 ***
## Company.Tenure                4.642e-04  1.010e-03   0.460  0.64573    
## Gender_enc                   -5.207e-01  4.628e-02 -11.250  < 2e-16 ***
## Job.Role_enc                 -3.209e-02  1.877e-02  -1.710  0.08729 .  
## Work.Life.Balance_enc         1.870e-01  2.448e-02   7.639 2.18e-14 ***
## Job.Satisfaction_enc          1.522e-01  1.884e-02   8.078 6.57e-16 ***
## Performance.Rating_enc        1.223e-01  2.404e-02   5.089 3.61e-07 ***
## Education.Level_enc          -1.329e-01  1.910e-02  -6.957 3.47e-12 ***
## Marital.Status_enc            9.276e-01  3.596e-02  25.792  < 2e-16 ***
## Job.Level_enc                -1.120e+00  3.336e-02 -33.586  < 2e-16 ***
## Company.Size_enc              1.048e-01  3.281e-02   3.195  0.00140 ** 
## Remote.Work_enc              -1.609e+00  6.504e-02 -24.744  < 2e-16 ***
## Leadership.Opportunities_enc -3.110e-01  1.076e-01  -2.891  0.00385 ** 
## Innovation.Opportunities_enc -1.811e-01  6.286e-02  -2.881  0.00396 ** 
## Company.Reputation_enc        7.387e-02  2.660e-02   2.777  0.00548 ** 
## Employee.Recognition_enc      1.080e-02  2.705e-02   0.399  0.68978    
## Overtime_bin                  3.180e-01  4.887e-02   6.507 7.68e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 14431  on 10429  degrees of freedom
## Residual deviance: 11308  on 10407  degrees of freedom
## AIC: 11354
## 
## Number of Fisher Scoring iterations: 4
cat(sprintf("Log-Likelihood:    %.2f\n", logLik(m3)))
## Log-Likelihood:    -5654.03
cat(sprintf("McFadden Pseudo R²: %.4f\n", pseudo_r2(m3)))
## McFadden Pseudo R²: 0.2164
cat(sprintf("AIC:               %.2f\n",  AIC(m3)))
## AIC:               11354.07
lrt_23 <- anova(m2, m3, test = "Chisq")
print(lrt_23)
## Analysis of Deviance Table
## 
## Model 1: Attrition_bin ~ Monthly.Income + Overtime_bin
## Model 2: Attrition_bin ~ Age + Years.at.Company + Monthly.Income + Number.of.Promotions + 
##     Distance.from.Home + Number.of.Dependents + Company.Tenure + 
##     Gender_enc + Job.Role_enc + Work.Life.Balance_enc + Job.Satisfaction_enc + 
##     Performance.Rating_enc + Education.Level_enc + Marital.Status_enc + 
##     Job.Level_enc + Company.Size_enc + Remote.Work_enc + Leadership.Opportunities_enc + 
##     Innovation.Opportunities_enc + Company.Reputation_enc + Employee.Recognition_enc + 
##     Overtime_bin
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1     10427      14399                          
## 2     10407      11308 20   3091.2 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
or_table(m3) %>%
  kable(caption = "Model 3 — Full Coefficient Table with Odds Ratios",
        digits   = 4) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE) %>%
  column_spec(9, bold = TRUE, color = "darkred") %>%
  row_spec(which(or_table(m3)$Sig %in% c("***", "**", "*")),
           background = "#F0FFF0")
Model 3 — Full Coefficient Table with Odds Ratios
Variable Coefficient Std.Error z.stat p.value Odds.Ratio CI.Lower CI.Upper Sig
(Intercept) (Intercept) 2.3042 0.2746 8.392 0.0000 10.0160 5.8477 17.1554 ***
Age Age -0.0065 0.0022 -2.938 0.0033 0.9935 0.9891 0.9978 **
Years.at.Company Years.at.Company -0.0120 0.0026 -4.575 0.0000 0.9880 0.9829 0.9931 ***
Monthly.Income Monthly.Income 0.0000 0.0000 -0.181 0.8561 1.0000 1.0000 1.0000
Number.of.Promotions Number.of.Promotions -0.2320 0.0234 -9.919 0.0000 0.7929 0.7574 0.8301 ***
Distance.from.Home Distance.from.Home 0.0086 0.0008 10.634 0.0000 1.0086 1.0070 1.0102 ***
Number.of.Dependents Number.of.Dependents -0.0981 0.0150 -6.533 0.0000 0.9065 0.8802 0.9336 ***
Company.Tenure Company.Tenure 0.0005 0.0010 0.460 0.6457 1.0005 0.9985 1.0024
Gender_enc Gender_enc -0.5207 0.0463 -11.250 0.0000 0.5941 0.5426 0.6505 ***
Job.Role_enc Job.Role_enc -0.0321 0.0188 -1.710 0.0873 0.9684 0.9334 1.0047 .
Work.Life.Balance_enc Work.Life.Balance_enc 0.1870 0.0245 7.639 0.0000 1.2057 1.1492 1.2649 ***
Job.Satisfaction_enc Job.Satisfaction_enc 0.1522 0.0188 8.078 0.0000 1.1644 1.1222 1.2082 ***
Performance.Rating_enc Performance.Rating_enc 0.1223 0.0240 5.089 0.0000 1.1301 1.0781 1.1847 ***
Education.Level_enc Education.Level_enc -0.1329 0.0191 -6.957 0.0000 0.8756 0.8434 0.9090 ***
Marital.Status_enc Marital.Status_enc 0.9276 0.0360 25.792 0.0000 2.5284 2.3563 2.7130 ***
Job.Level_enc Job.Level_enc -1.1204 0.0334 -33.586 0.0000 0.3262 0.3055 0.3482 ***
Company.Size_enc Company.Size_enc 0.1048 0.0328 3.195 0.0014 1.1105 1.0413 1.1843 **
Remote.Work_enc Remote.Work_enc -1.6094 0.0650 -24.744 0.0000 0.2000 0.1761 0.2272 ***
Leadership.Opportunities_enc Leadership.Opportunities_enc -0.3110 0.1076 -2.891 0.0038 0.7327 0.5934 0.9047 **
Innovation.Opportunities_enc Innovation.Opportunities_enc -0.1811 0.0629 -2.881 0.0040 0.8343 0.7376 0.9437 **
Company.Reputation_enc Company.Reputation_enc 0.0739 0.0266 2.777 0.0055 1.0767 1.0220 1.1343 **
Employee.Recognition_enc Employee.Recognition_enc 0.0108 0.0271 0.399 0.6898 1.0109 0.9587 1.0659
Overtime_bin Overtime_bin 0.3180 0.0489 6.507 0.0000 1.3743 1.2488 1.5124 ***
ot3 <- or_table(m3)
ot3 %>%
  filter(Variable != "(Intercept)") %>%
  mutate(abs_z = abs(z.stat)) %>%
  arrange(desc(abs_z)) %>%
  head(12) %>%
  ggplot(aes(x = reorder(Variable, abs_z), y = abs_z,
             fill = ifelse(Coefficient > 0, "Positive", "Negative"))) +
  geom_col(width = 0.7) +
  coord_flip() +
  scale_fill_manual(values = c("Positive" = "#E74C3C", "Negative" = "#2E86AB"),
                    name = "Direction") +
  labs(
    title = "Top 12 Predictors of Attrition (Full Model)",
    x = NULL, y = "|z-statistic|"
  ) +
  theme_minimal(base_size = 13)

res3 <- eval_model(m3, test_df, model_name = "Model 3")
print(res3$cm$table)
##           Reference
## Prediction    0    1
##          0 1753  669
##          1  629 1419
data.frame(
  Metric = c("Accuracy", "Precision", "Recall", "F1-Score"),
  Value  = round(c(res3$accuracy, res3$precision, res3$recall, res3$f1), 4)
) %>%
  kable(caption = "Model 3 — Performance Metrics") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Model 3 — Performance Metrics
Metric Value
Accuracy 0.7096
Precision 0.6929
Recall 0.6796
F1-Score 0.6862
plot_cm <- function(res) {
  df_cm <- data.frame(
    Actual    = factor(c("Stayed","Stayed","Left","Left"),
                       levels = c("Left","Stayed")),
    Predicted = factor(c("Stayed","Left","Stayed","Left"),
                       levels = c("Stayed","Left")),
    n         = c(res$TN, res$FP, res$FN, res$TP),
    label     = c("TN","FP","FN","TP")
  )
  ggplot(df_cm, aes(x = Predicted, y = Actual, fill = label)) +
    geom_tile(color = "white", linewidth = 1.2) +
    geom_text(aes(label = paste0(label, "\n", n)), size = 5, fontface = "bold") +
    scale_fill_manual(values = c(
      TN = "#AED6F1", FP = "#F1948A", FN = "#F1948A", TP = "#A9DFBF"
    )) +
    labs(title = res$name, x = "Predicted", y = "Actual") +
    theme_minimal(base_size = 12) +
    theme(legend.position = "none",
          plot.title = element_text(face = "bold", hjust = 0.5))
}

library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
grid.arrange(plot_cm(res1), plot_cm(res2), plot_cm(res3), ncol = 3)

comparison <- data.frame(
  Metric        = c("Log-Likelihood", "McFadden R²", "AIC",
                    "Accuracy", "Precision", "Recall", "F1-Score",
                    "True Positives", "False Negatives"),
  Model.1       = c(
    round(as.numeric(logLik(m1)), 2),
    round(pseudo_r2(m1), 4),
    round(AIC(m1), 2),
    round(res1$accuracy,  4),
    round(res1$precision, 4),
    round(res1$recall,    4),
    round(res1$f1,        4),
    res1$TP, res1$FN
  ),
  Model.2       = c(
    round(as.numeric(logLik(m2)), 2),
    round(pseudo_r2(m2), 4),
    round(AIC(m2), 2),
    round(res2$accuracy,  4),
    round(res2$precision, 4),
    round(res2$recall,    4),
    round(res2$f1,        4),
    res2$TP, res2$FN
  ),
  Model.3       = c(
    round(as.numeric(logLik(m3)), 2),
    round(pseudo_r2(m3), 4),
    round(AIC(m3), 2),
    round(res3$accuracy,  4),
    round(res3$precision, 4),
    round(res3$recall,    4),
    round(res3$f1,        4),
    res3$TP, res3$FN
  )
)

comparison %>%
  kable(caption = "Model Comparison — All Metrics",
        col.names = c("Metric", "Model 1", "Model 2", "Model 3")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(0, bold = TRUE, background = "#1F497D", color = "white") %>%
  row_spec(c(4, 7), bold = TRUE)
Model Comparison — All Metrics
Metric Model 1 Model 2 Model 3
Log-Likelihood -7214.8900 -7199.6100 -5654.0300
McFadden R² 0.0001 0.0022 0.2164
AIC 14433.7900 14405.2200 11354.0700
Accuracy 0.5329 0.5418 0.7096
Precision NA 0.5140 0.6929
Recall 0.0000 0.3511 0.6796
F1-Score NA 0.4172 0.6862
True Positives 0.0000 733.0000 1419.0000
False Negatives 2088.0000 1355.0000 669.0000
roc1 <- roc(res1$actual, res1$probs, quiet = TRUE)
roc2 <- roc(res2$actual, res2$probs, quiet = TRUE)
roc3 <- roc(res3$actual, res3$probs, quiet = TRUE)

cat(sprintf("AUC — Model 1: %.4f\n", auc(roc1)))
## AUC — Model 1: 0.4976
cat(sprintf("AUC — Model 2: %.4f\n", auc(roc2)))
## AUC — Model 2: 0.5312
cat(sprintf("AUC — Model 3: %.4f\n", auc(roc3)))
## AUC — Model 3: 0.7891
roc_df <- bind_rows(
  data.frame(fpr = 1 - roc1$specificities, tpr = roc1$sensitivities,
             Model = sprintf("M1 (AUC=%.3f)", auc(roc1))),
  data.frame(fpr = 1 - roc2$specificities, tpr = roc2$sensitivities,
             Model = sprintf("M2 (AUC=%.3f)", auc(roc2))),
  data.frame(fpr = 1 - roc3$specificities, tpr = roc3$sensitivities,
             Model = sprintf("M3 (AUC=%.3f)", auc(roc3)))
)

ggplot(roc_df, aes(x = fpr, y = tpr, colour = Model)) +
  geom_line(linewidth = 1.1) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", colour = "grey50") +
  scale_colour_manual(values = c("#95A5A6", "#E67E22", "#2E86AB")) +
  labs(
    title  = "ROC Curves — Logistic Regression Models",
    x      = "False Positive Rate (1 − Specificity)",
    y      = "True Positive Rate (Sensitivity)",
    colour = NULL
  ) +
  theme_minimal(base_size = 13) +
  theme(legend.position = c(0.75, 0.25))

perf_long <- data.frame(
  Model  = rep(c("Model 1", "Model 2", "Model 3"), each = 3),
  Metric = rep(c("Accuracy", "Recall", "F1-Score"), 3),
  Value  = c(
    res1$accuracy, res1$recall, res1$f1,
    res2$accuracy, res2$recall, res2$f1,
    res3$accuracy, res3$recall, res3$f1
  )
)

ggplot(perf_long, aes(x = Model, y = Value, fill = Metric)) +
  geom_col(position = "dodge", width = 0.6) +
  geom_text(aes(label = sprintf("%.3f", Value)),
            position = position_dodge(0.6), vjust = -0.4, size = 3.5) +
  scale_fill_manual(values = c(
    Accuracy  = "#2E86AB",
    Recall    = "#E74C3C",
    "F1-Score"= "#27AE60"
  )) +
  scale_y_continuous(limits = c(0, 0.85), labels = percent_format()) +
  labs(
    title = "Model Performance Comparison",
    x = NULL, y = "Score", fill = NULL
  ) +
  theme_minimal(base_size = 13)
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_col()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_text()`).

sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: x86_64-apple-darwin20
## Running under: macOS Tahoe 26.3
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Asia/Taipei
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] gridExtra_2.3    scales_1.4.0     pROC_1.19.0.1    kableExtra_1.4.0
##  [5] knitr_1.50       caret_7.0-1      lattice_0.22-7   lubridate_1.9.4 
##  [9] forcats_1.0.1    stringr_1.5.2    dplyr_1.2.0      purrr_1.1.0     
## [13] readr_2.1.5      tidyr_1.3.1      tibble_3.3.0     ggplot2_4.0.2   
## [17] tidyverse_2.0.0 
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1     viridisLite_0.4.2    timeDate_4041.110   
##  [4] farver_2.1.2         S7_0.2.0             fastmap_1.2.0       
##  [7] digest_0.6.37        rpart_4.1.24         timechange_0.3.0    
## [10] lifecycle_1.0.5      survival_3.8-3       magrittr_2.0.4      
## [13] compiler_4.5.1       rlang_1.1.7          sass_0.4.10         
## [16] tools_4.5.1          yaml_2.3.10          data.table_1.17.8   
## [19] labeling_0.4.3       plyr_1.8.9           xml2_1.4.0          
## [22] RColorBrewer_1.1-3   withr_3.0.2          nnet_7.3-20         
## [25] grid_4.5.1           stats4_4.5.1         e1071_1.7-17        
## [28] future_1.67.0        globals_0.18.0       iterators_1.0.14    
## [31] MASS_7.3-65          cli_3.6.5            rmarkdown_2.29      
## [34] generics_0.1.4       rstudioapi_0.17.1    future.apply_1.20.0 
## [37] reshape2_1.4.5       tzdb_0.5.0           proxy_0.4-29        
## [40] cachem_1.1.0         splines_4.5.1        parallel_4.5.1      
## [43] vctrs_0.7.1          hardhat_1.4.2        Matrix_1.7-3        
## [46] jsonlite_2.0.0       hms_1.1.3            listenv_0.9.1       
## [49] systemfonts_1.3.2    foreach_1.5.2        gower_1.0.2         
## [52] jquerylib_0.1.4      recipes_1.3.1        glue_1.8.0          
## [55] parallelly_1.45.1    codetools_0.2-20     stringi_1.8.7       
## [58] gtable_0.3.6         pillar_1.11.1        htmltools_0.5.8.1   
## [61] ipred_0.9-15         lava_1.8.1           R6_2.6.1            
## [64] textshaping_1.0.3    evaluate_1.0.5       bslib_0.9.0         
## [67] class_7.3-23         Rcpp_1.1.0           svglite_2.2.2       
## [70] nlme_3.1-168         prodlim_2025.04.28   xfun_0.53           
## [73] pkgconfig_2.0.3      ModelMetrics_1.2.2.2