HW07_Raffy

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(caret)

## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

library(knitr)          
library(kableExtra)

## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows

library(pROC)

## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

library(ggplot2)        
library(scales)

## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor

df <- read.csv("test.csv", stringsAsFactors = FALSE, check.names = TRUE)

cat("Dataset dimensions:", nrow(df), "rows x", ncol(df), "columns\n")

## Dataset dimensions: 14900 rows x 24 columns

cat("Column names:\n")

## Column names:

print(names(df))

##  [1] "Employee.ID"              "Age"                     
##  [3] "Gender"                   "Years.at.Company"        
##  [5] "Job.Role"                 "Monthly.Income"          
##  [7] "Work.Life.Balance"        "Job.Satisfaction"        
##  [9] "Performance.Rating"       "Number.of.Promotions"    
## [11] "Overtime"                 "Distance.from.Home"      
## [13] "Education.Level"          "Marital.Status"          
## [15] "Number.of.Dependents"     "Job.Level"               
## [17] "Company.Size"             "Company.Tenure"          
## [19] "Remote.Work"              "Leadership.Opportunities"
## [21] "Innovation.Opportunities" "Company.Reputation"      
## [23] "Employee.Recognition"     "Attrition"

df %>%
  count(Attrition) %>%
  mutate(Pct = round(n / sum(n) * 100, 1)) %>%
  kable(caption = "Outcome Variable Distribution", col.names = c("Attrition", "Count", "Pct (%)")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Outcome Variable Distribution
Attrition	Count	Pct (%)
Left	7032	47.2
Stayed	7868	52.8

miss <- colSums(is.na(df))
if (sum(miss) == 0) {
  cat("No missing values detected.\n")
} else {
  print(miss[miss > 0])
}

## No missing values detected.

df$Attrition_bin <- ifelse(df$Attrition == "Left", 1, 0)

df$Overtime_bin <- ifelse(df$Overtime == "Yes", 1, 0)

cat_cols <- c(
  "Gender", "Job.Role", "Work.Life.Balance", "Job.Satisfaction",
  "Performance.Rating", "Education.Level", "Marital.Status",
  "Job.Level", "Company.Size", "Remote.Work",
  "Leadership.Opportunities", "Innovation.Opportunities",
  "Company.Reputation", "Employee.Recognition"
)

for (col in cat_cols) {
  if (col %in% names(df)) {
    df[[paste0(col, "_enc")]] <- as.integer(factor(df[[col]]))
  }
}


numeric_cols <- c(
  "Age", "Years.at.Company", "Monthly.Income",
  "Number.of.Promotions", "Distance.from.Home",
  "Number.of.Dependents", "Company.Tenure"
)

enc_cols <- paste0(cat_cols[cat_cols %in% names(df)], "_enc")
all_vars <- c(numeric_cols, enc_cols, "Overtime_bin")

cat("Number of predictors in full model:", length(all_vars), "\n")

## Number of predictors in full model: 22

set.seed(42)

train_idx <- createDataPartition(df$Attrition_bin, p = 0.70, list = FALSE)
train_df  <- df[ train_idx, ]
test_df   <- df[-train_idx, ]

cat(sprintf(
  "Train: %d rows (attrition rate: %.1f%%)\nTest:  %d rows (attrition rate: %.1f%%)\n",
  nrow(train_df), mean(train_df$Attrition_bin) * 100,
  nrow(test_df),  mean(test_df$Attrition_bin)  * 100
))

## Train: 10430 rows (attrition rate: 47.4%)
## Test:  4470 rows (attrition rate: 46.7%)

df %>%
  group_by(Overtime, Attrition) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(Overtime) %>%
  mutate(pct = n / sum(n)) %>%
  ggplot(aes(x = Overtime, y = pct, fill = Attrition)) +
  geom_col(position = "dodge", width = 0.6) +
  scale_y_continuous(labels = percent_format()) +
  scale_fill_manual(values = c("Left" = "#E74C3C", "Stayed" = "#2E86AB")) +
  labs(
    title = "Attrition Rate by Overtime Status",
    x = "Overtime", y = "Proportion", fill = "Attrition"
  ) +
  theme_minimal(base_size = 13)

ggplot(df, aes(x = Attrition, y = Monthly.Income, fill = Attrition)) +
  geom_boxplot(alpha = 0.7, outlier.alpha = 0.3) +
  scale_fill_manual(values = c("Left" = "#E74C3C", "Stayed" = "#2E86AB")) +
  scale_y_continuous(labels = comma_format()) +
  labs(
    title = "Monthly Income Distribution by Attrition",
    x = "Attrition Status", y = "Monthly Income"
  ) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "none")

pseudo_r2 <- function(model) {
  1 - (model$deviance / model$null.deviance)
}

eval_model <- function(model, test_data, outcome = "Attrition_bin",
                       threshold = 0.5, model_name = "Model") {

  probs  <- predict(model, newdata = test_data, type = "response")
  preds  <- factor(ifelse(probs >= threshold, 1, 0), levels = c(0, 1))
  actual <- factor(test_data[[outcome]], levels = c(0, 1))

  cm  <- confusionMatrix(preds, actual, positive = "1")
  TN  <- cm$table[1, 1]; FP <- cm$table[2, 1]
  FN  <- cm$table[1, 2]; TP <- cm$table[2, 2]

  list(
    name      = model_name,
    probs     = probs,
    preds     = preds,
    actual    = actual,
    cm        = cm,
    TN = TN, FP = FP, FN = FN, TP = TP,
    accuracy  = as.numeric(cm$overall["Accuracy"]),
    precision = as.numeric(cm$byClass["Precision"]),
    recall    = as.numeric(cm$byClass["Recall"]),
    f1        = as.numeric(cm$byClass["F1"])
  )
}

or_table <- function(model) {
  coefs <- summary(model)$coefficients
  or    <- exp(coef(model))
  ci    <- exp(confint.default(model))
  data.frame(
    Variable   = rownames(coefs),
    Coefficient = round(coefs[, 1], 4),
    Std.Error  = round(coefs[, 2], 4),
    z.stat     = round(coefs[, 3], 3),
    p.value    = round(coefs[, 4], 4),
    Odds.Ratio = round(or, 4),
    CI.Lower   = round(ci[, 1], 4),
    CI.Upper   = round(ci[, 2], 4),
    Sig        = ifelse(coefs[, 4] < .001, "***",
                 ifelse(coefs[, 4] < .01,  "**",
                 ifelse(coefs[, 4] < .05,  "*",
                 ifelse(coefs[, 4] < .1,   ".", ""))))
  )
}

m1 <- glm(Attrition_bin ~ Monthly.Income,
          data   = train_df,
          family = binomial(link = "logit"))

summary(m1)

## 
## Call:
## glm(formula = Attrition_bin ~ Monthly.Income, family = binomial(link = "logit"), 
##     data = train_df)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)
## (Intercept)    -3.485e-02  6.921e-02  -0.504    0.615
## Monthly.Income -9.488e-06  9.106e-06  -1.042    0.297
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 14431  on 10429  degrees of freedom
## Residual deviance: 14430  on 10428  degrees of freedom
## AIC: 14434
## 
## Number of Fisher Scoring iterations: 3

cat(sprintf("Log-Likelihood:    %.2f\n", logLik(m1)))

## Log-Likelihood:    -7214.89

cat(sprintf("McFadden Pseudo R²: %.4f\n", pseudo_r2(m1)))

## McFadden Pseudo R²: 0.0001

cat(sprintf("AIC:               %.2f\n",  AIC(m1)))

## AIC:               14433.79

or_table(m1) %>%
  kable(caption = "Model 1 — Coefficient Table with Odds Ratios",
        digits   = 4) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE) %>%
  column_spec(9, bold = TRUE, color = "darkred")

Model 1 — Coefficient Table with Odds Ratios
	Variable	Coefficient	Std.Error	z.stat	p.value	Odds.Ratio	CI.Lower	CI.Upper	Sig
(Intercept)	(Intercept)	-0.0349	0.0692	-0.504	0.6146	0.9657	0.8432	1.1061
Monthly.Income	Monthly.Income	0.0000	0.0000	-1.042	0.2974	1.0000	1.0000	1.0000

res1 <- eval_model(m1, test_df, model_name = "Model 1")
print(res1$cm$table)

##           Reference
## Prediction    0    1
##          0 2382 2088
##          1    0    0

data.frame(
  Metric    = c("Accuracy", "Precision", "Recall", "F1-Score"),
  Value     = round(c(res1$accuracy, res1$precision, res1$recall, res1$f1), 4)
) %>%
  kable(caption = "Model 1 — Performance Metrics") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Model 1 — Performance Metrics
Metric	Value
Accuracy	0.5329
Precision	NA
Recall	0.0000
F1-Score	NA

m2 <- glm(Attrition_bin ~ Monthly.Income + Overtime_bin,
          data   = train_df,
          family = binomial(link = "logit"))

summary(m2)

## 
## Call:
## glm(formula = Attrition_bin ~ Monthly.Income + Overtime_bin, 
##     family = binomial(link = "logit"), data = train_df)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -1.115e-01  7.069e-02  -1.577    0.115    
## Monthly.Income -9.469e-06  9.120e-06  -1.038    0.299    
## Overtime_bin    2.303e-01  4.168e-02   5.526 3.27e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 14431  on 10429  degrees of freedom
## Residual deviance: 14399  on 10427  degrees of freedom
## AIC: 14405
## 
## Number of Fisher Scoring iterations: 3

cat(sprintf("Log-Likelihood:    %.2f\n", logLik(m2)))

## Log-Likelihood:    -7199.61

cat(sprintf("McFadden Pseudo R²: %.4f\n", pseudo_r2(m2)))

## McFadden Pseudo R²: 0.0022

cat(sprintf("AIC:               %.2f\n",  AIC(m2)))

## AIC:               14405.22

lrt_12 <- anova(m1, m2, test = "Chisq")
print(lrt_12)

## Analysis of Deviance Table
## 
## Model 1: Attrition_bin ~ Monthly.Income
## Model 2: Attrition_bin ~ Monthly.Income + Overtime_bin
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1     10428      14430                          
## 2     10427      14399  1   30.567 3.226e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

or_table(m2) %>%
  kable(caption = "Model 2 — Coefficient Table with Odds Ratios",
        digits   = 4) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE) %>%
  column_spec(9, bold = TRUE, color = "darkred")

Model 2 — Coefficient Table with Odds Ratios
	Variable	Coefficient	Std.Error	z.stat	p.value	Odds.Ratio	CI.Lower	CI.Upper	Sig
(Intercept)	(Intercept)	-0.1115	0.0707	-1.577	0.1147	0.8945	0.7788	1.0274
Monthly.Income	Monthly.Income	0.0000	0.0000	-1.038	0.2992	1.0000	1.0000	1.0000
Overtime_bin	Overtime_bin	0.2303	0.0417	5.526	0.0000	1.2590	1.1603	1.3662	***

res2 <- eval_model(m2, test_df, model_name = "Model 2")
print(res2$cm$table)

##           Reference
## Prediction    0    1
##          0 1689 1355
##          1  693  733

data.frame(
  Metric = c("Accuracy", "Precision", "Recall", "F1-Score"),
  Value  = round(c(res2$accuracy, res2$precision, res2$recall, res2$f1), 4)
) %>%
  kable(caption = "Model 2 — Performance Metrics") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Model 2 — Performance Metrics
Metric	Value
Accuracy	0.5418
Precision	0.5140
Recall	0.3511
F1-Score	0.4172

formula_m3 <- as.formula(
  paste("Attrition_bin ~", paste(all_vars, collapse = " + "))
)

m3 <- glm(formula_m3,
          data   = train_df,
          family = binomial(link = "logit"))

summary(m3)

## 
## Call:
## glm(formula = formula_m3, family = binomial(link = "logit"), 
##     data = train_df)
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   2.304e+00  2.746e-01   8.392  < 2e-16 ***
## Age                          -6.545e-03  2.228e-03  -2.938  0.00330 ** 
## Years.at.Company             -1.205e-02  2.634e-03  -4.575 4.77e-06 ***
## Monthly.Income               -2.330e-06  1.285e-05  -0.181  0.85612    
## Number.of.Promotions         -2.320e-01  2.339e-02  -9.919  < 2e-16 ***
## Distance.from.Home            8.571e-03  8.060e-04  10.634  < 2e-16 ***
## Number.of.Dependents         -9.815e-02  1.502e-02  -6.533 6.43e-11 ***
## Company.Tenure                4.642e-04  1.010e-03   0.460  0.64573    
## Gender_enc                   -5.207e-01  4.628e-02 -11.250  < 2e-16 ***
## Job.Role_enc                 -3.209e-02  1.877e-02  -1.710  0.08729 .  
## Work.Life.Balance_enc         1.870e-01  2.448e-02   7.639 2.18e-14 ***
## Job.Satisfaction_enc          1.522e-01  1.884e-02   8.078 6.57e-16 ***
## Performance.Rating_enc        1.223e-01  2.404e-02   5.089 3.61e-07 ***
## Education.Level_enc          -1.329e-01  1.910e-02  -6.957 3.47e-12 ***
## Marital.Status_enc            9.276e-01  3.596e-02  25.792  < 2e-16 ***
## Job.Level_enc                -1.120e+00  3.336e-02 -33.586  < 2e-16 ***
## Company.Size_enc              1.048e-01  3.281e-02   3.195  0.00140 ** 
## Remote.Work_enc              -1.609e+00  6.504e-02 -24.744  < 2e-16 ***
## Leadership.Opportunities_enc -3.110e-01  1.076e-01  -2.891  0.00385 ** 
## Innovation.Opportunities_enc -1.811e-01  6.286e-02  -2.881  0.00396 ** 
## Company.Reputation_enc        7.387e-02  2.660e-02   2.777  0.00548 ** 
## Employee.Recognition_enc      1.080e-02  2.705e-02   0.399  0.68978    
## Overtime_bin                  3.180e-01  4.887e-02   6.507 7.68e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 14431  on 10429  degrees of freedom
## Residual deviance: 11308  on 10407  degrees of freedom
## AIC: 11354
## 
## Number of Fisher Scoring iterations: 4

cat(sprintf("Log-Likelihood:    %.2f\n", logLik(m3)))

## Log-Likelihood:    -5654.03

cat(sprintf("McFadden Pseudo R²: %.4f\n", pseudo_r2(m3)))

## McFadden Pseudo R²: 0.2164

cat(sprintf("AIC:               %.2f\n",  AIC(m3)))

## AIC:               11354.07

lrt_23 <- anova(m2, m3, test = "Chisq")
print(lrt_23)

## Analysis of Deviance Table
## 
## Model 1: Attrition_bin ~ Monthly.Income + Overtime_bin
## Model 2: Attrition_bin ~ Age + Years.at.Company + Monthly.Income + Number.of.Promotions + 
##     Distance.from.Home + Number.of.Dependents + Company.Tenure + 
##     Gender_enc + Job.Role_enc + Work.Life.Balance_enc + Job.Satisfaction_enc + 
##     Performance.Rating_enc + Education.Level_enc + Marital.Status_enc + 
##     Job.Level_enc + Company.Size_enc + Remote.Work_enc + Leadership.Opportunities_enc + 
##     Innovation.Opportunities_enc + Company.Reputation_enc + Employee.Recognition_enc + 
##     Overtime_bin
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1     10427      14399                          
## 2     10407      11308 20   3091.2 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

or_table(m3) %>%
  kable(caption = "Model 3 — Full Coefficient Table with Odds Ratios",
        digits   = 4) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE) %>%
  column_spec(9, bold = TRUE, color = "darkred") %>%
  row_spec(which(or_table(m3)$Sig %in% c("***", "**", "*")),
           background = "#F0FFF0")

Model 3 — Full Coefficient Table with Odds Ratios
	Variable	Coefficient	Std.Error	z.stat	p.value	Odds.Ratio	CI.Lower	CI.Upper	Sig
(Intercept)	(Intercept)	2.3042	0.2746	8.392	0.0000	10.0160	5.8477	17.1554	***
Age	Age	-0.0065	0.0022	-2.938	0.0033	0.9935	0.9891	0.9978	**
Years.at.Company	Years.at.Company	-0.0120	0.0026	-4.575	0.0000	0.9880	0.9829	0.9931	***
Monthly.Income	Monthly.Income	0.0000	0.0000	-0.181	0.8561	1.0000	1.0000	1.0000
Number.of.Promotions	Number.of.Promotions	-0.2320	0.0234	-9.919	0.0000	0.7929	0.7574	0.8301	***
Distance.from.Home	Distance.from.Home	0.0086	0.0008	10.634	0.0000	1.0086	1.0070	1.0102	***
Number.of.Dependents	Number.of.Dependents	-0.0981	0.0150	-6.533	0.0000	0.9065	0.8802	0.9336	***
Company.Tenure	Company.Tenure	0.0005	0.0010	0.460	0.6457	1.0005	0.9985	1.0024
Gender_enc	Gender_enc	-0.5207	0.0463	-11.250	0.0000	0.5941	0.5426	0.6505	***
Job.Role_enc	Job.Role_enc	-0.0321	0.0188	-1.710	0.0873	0.9684	0.9334	1.0047	.
Work.Life.Balance_enc	Work.Life.Balance_enc	0.1870	0.0245	7.639	0.0000	1.2057	1.1492	1.2649	***
Job.Satisfaction_enc	Job.Satisfaction_enc	0.1522	0.0188	8.078	0.0000	1.1644	1.1222	1.2082	***
Performance.Rating_enc	Performance.Rating_enc	0.1223	0.0240	5.089	0.0000	1.1301	1.0781	1.1847	***
Education.Level_enc	Education.Level_enc	-0.1329	0.0191	-6.957	0.0000	0.8756	0.8434	0.9090	***
Marital.Status_enc	Marital.Status_enc	0.9276	0.0360	25.792	0.0000	2.5284	2.3563	2.7130	***
Job.Level_enc	Job.Level_enc	-1.1204	0.0334	-33.586	0.0000	0.3262	0.3055	0.3482	***
Company.Size_enc	Company.Size_enc	0.1048	0.0328	3.195	0.0014	1.1105	1.0413	1.1843	**
Remote.Work_enc	Remote.Work_enc	-1.6094	0.0650	-24.744	0.0000	0.2000	0.1761	0.2272	***
Leadership.Opportunities_enc	Leadership.Opportunities_enc	-0.3110	0.1076	-2.891	0.0038	0.7327	0.5934	0.9047	**
Innovation.Opportunities_enc	Innovation.Opportunities_enc	-0.1811	0.0629	-2.881	0.0040	0.8343	0.7376	0.9437	**
Company.Reputation_enc	Company.Reputation_enc	0.0739	0.0266	2.777	0.0055	1.0767	1.0220	1.1343	**
Employee.Recognition_enc	Employee.Recognition_enc	0.0108	0.0271	0.399	0.6898	1.0109	0.9587	1.0659
Overtime_bin	Overtime_bin	0.3180	0.0489	6.507	0.0000	1.3743	1.2488	1.5124	***

ot3 <- or_table(m3)
ot3 %>%
  filter(Variable != "(Intercept)") %>%
  mutate(abs_z = abs(z.stat)) %>%
  arrange(desc(abs_z)) %>%
  head(12) %>%
  ggplot(aes(x = reorder(Variable, abs_z), y = abs_z,
             fill = ifelse(Coefficient > 0, "Positive", "Negative"))) +
  geom_col(width = 0.7) +
  coord_flip() +
  scale_fill_manual(values = c("Positive" = "#E74C3C", "Negative" = "#2E86AB"),
                    name = "Direction") +
  labs(
    title = "Top 12 Predictors of Attrition (Full Model)",
    x = NULL, y = "|z-statistic|"
  ) +
  theme_minimal(base_size = 13)

res3 <- eval_model(m3, test_df, model_name = "Model 3")
print(res3$cm$table)

##           Reference
## Prediction    0    1
##          0 1753  669
##          1  629 1419

data.frame(
  Metric = c("Accuracy", "Precision", "Recall", "F1-Score"),
  Value  = round(c(res3$accuracy, res3$precision, res3$recall, res3$f1), 4)
) %>%
  kable(caption = "Model 3 — Performance Metrics") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Model 3 — Performance Metrics
Metric	Value
Accuracy	0.7096
Precision	0.6929
Recall	0.6796
F1-Score	0.6862

plot_cm <- function(res) {
  df_cm <- data.frame(
    Actual    = factor(c("Stayed","Stayed","Left","Left"),
                       levels = c("Left","Stayed")),
    Predicted = factor(c("Stayed","Left","Stayed","Left"),
                       levels = c("Stayed","Left")),
    n         = c(res$TN, res$FP, res$FN, res$TP),
    label     = c("TN","FP","FN","TP")
  )
  ggplot(df_cm, aes(x = Predicted, y = Actual, fill = label)) +
    geom_tile(color = "white", linewidth = 1.2) +
    geom_text(aes(label = paste0(label, "\n", n)), size = 5, fontface = "bold") +
    scale_fill_manual(values = c(
      TN = "#AED6F1", FP = "#F1948A", FN = "#F1948A", TP = "#A9DFBF"
    )) +
    labs(title = res$name, x = "Predicted", y = "Actual") +
    theme_minimal(base_size = 12) +
    theme(legend.position = "none",
          plot.title = element_text(face = "bold", hjust = 0.5))
}

library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

grid.arrange(plot_cm(res1), plot_cm(res2), plot_cm(res3), ncol = 3)

comparison <- data.frame(
  Metric        = c("Log-Likelihood", "McFadden R²", "AIC",
                    "Accuracy", "Precision", "Recall", "F1-Score",
                    "True Positives", "False Negatives"),
  Model.1       = c(
    round(as.numeric(logLik(m1)), 2),
    round(pseudo_r2(m1), 4),
    round(AIC(m1), 2),
    round(res1$accuracy,  4),
    round(res1$precision, 4),
    round(res1$recall,    4),
    round(res1$f1,        4),
    res1$TP, res1$FN
  ),
  Model.2       = c(
    round(as.numeric(logLik(m2)), 2),
    round(pseudo_r2(m2), 4),
    round(AIC(m2), 2),
    round(res2$accuracy,  4),
    round(res2$precision, 4),
    round(res2$recall,    4),
    round(res2$f1,        4),
    res2$TP, res2$FN
  ),
  Model.3       = c(
    round(as.numeric(logLik(m3)), 2),
    round(pseudo_r2(m3), 4),
    round(AIC(m3), 2),
    round(res3$accuracy,  4),
    round(res3$precision, 4),
    round(res3$recall,    4),
    round(res3$f1,        4),
    res3$TP, res3$FN
  )
)

comparison %>%
  kable(caption = "Model Comparison — All Metrics",
        col.names = c("Metric", "Model 1", "Model 2", "Model 3")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(0, bold = TRUE, background = "#1F497D", color = "white") %>%
  row_spec(c(4, 7), bold = TRUE)

Model Comparison — All Metrics
Metric	Model 1	Model 2	Model 3
Log-Likelihood	-7214.8900	-7199.6100	-5654.0300
McFadden R²	0.0001	0.0022	0.2164
AIC	14433.7900	14405.2200	11354.0700
Accuracy	0.5329	0.5418	0.7096
Precision	NA	0.5140	0.6929
Recall	0.0000	0.3511	0.6796
F1-Score	NA	0.4172	0.6862
True Positives	0.0000	733.0000	1419.0000
False Negatives	2088.0000	1355.0000	669.0000

roc1 <- roc(res1$actual, res1$probs, quiet = TRUE)
roc2 <- roc(res2$actual, res2$probs, quiet = TRUE)
roc3 <- roc(res3$actual, res3$probs, quiet = TRUE)

cat(sprintf("AUC — Model 1: %.4f\n", auc(roc1)))

## AUC — Model 1: 0.4976

cat(sprintf("AUC — Model 2: %.4f\n", auc(roc2)))

## AUC — Model 2: 0.5312

cat(sprintf("AUC — Model 3: %.4f\n", auc(roc3)))

## AUC — Model 3: 0.7891

roc_df <- bind_rows(
  data.frame(fpr = 1 - roc1$specificities, tpr = roc1$sensitivities,
             Model = sprintf("M1 (AUC=%.3f)", auc(roc1))),
  data.frame(fpr = 1 - roc2$specificities, tpr = roc2$sensitivities,
             Model = sprintf("M2 (AUC=%.3f)", auc(roc2))),
  data.frame(fpr = 1 - roc3$specificities, tpr = roc3$sensitivities,
             Model = sprintf("M3 (AUC=%.3f)", auc(roc3)))
)

ggplot(roc_df, aes(x = fpr, y = tpr, colour = Model)) +
  geom_line(linewidth = 1.1) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", colour = "grey50") +
  scale_colour_manual(values = c("#95A5A6", "#E67E22", "#2E86AB")) +
  labs(
    title  = "ROC Curves — Logistic Regression Models",
    x      = "False Positive Rate (1 − Specificity)",
    y      = "True Positive Rate (Sensitivity)",
    colour = NULL
  ) +
  theme_minimal(base_size = 13) +
  theme(legend.position = c(0.75, 0.25))

perf_long <- data.frame(
  Model  = rep(c("Model 1", "Model 2", "Model 3"), each = 3),
  Metric = rep(c("Accuracy", "Recall", "F1-Score"), 3),
  Value  = c(
    res1$accuracy, res1$recall, res1$f1,
    res2$accuracy, res2$recall, res2$f1,
    res3$accuracy, res3$recall, res3$f1
  )
)

ggplot(perf_long, aes(x = Model, y = Value, fill = Metric)) +
  geom_col(position = "dodge", width = 0.6) +
  geom_text(aes(label = sprintf("%.3f", Value)),
            position = position_dodge(0.6), vjust = -0.4, size = 3.5) +
  scale_fill_manual(values = c(
    Accuracy  = "#2E86AB",
    Recall    = "#E74C3C",
    "F1-Score"= "#27AE60"
  )) +
  scale_y_continuous(limits = c(0, 0.85), labels = percent_format()) +
  labs(
    title = "Model Performance Comparison",
    x = NULL, y = "Score", fill = NULL
  ) +
  theme_minimal(base_size = 13)

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_col()`).

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_text()`).

sessionInfo()

## R version 4.5.1 (2025-06-13)
## Platform: x86_64-apple-darwin20
## Running under: macOS Tahoe 26.3
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Asia/Taipei
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] gridExtra_2.3    scales_1.4.0     pROC_1.19.0.1    kableExtra_1.4.0
##  [5] knitr_1.50       caret_7.0-1      lattice_0.22-7   lubridate_1.9.4 
##  [9] forcats_1.0.1    stringr_1.5.2    dplyr_1.2.0      purrr_1.1.0     
## [13] readr_2.1.5      tidyr_1.3.1      tibble_3.3.0     ggplot2_4.0.2   
## [17] tidyverse_2.0.0 
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1     viridisLite_0.4.2    timeDate_4041.110   
##  [4] farver_2.1.2         S7_0.2.0             fastmap_1.2.0       
##  [7] digest_0.6.37        rpart_4.1.24         timechange_0.3.0    
## [10] lifecycle_1.0.5      survival_3.8-3       magrittr_2.0.4      
## [13] compiler_4.5.1       rlang_1.1.7          sass_0.4.10         
## [16] tools_4.5.1          yaml_2.3.10          data.table_1.17.8   
## [19] labeling_0.4.3       plyr_1.8.9           xml2_1.4.0          
## [22] RColorBrewer_1.1-3   withr_3.0.2          nnet_7.3-20         
## [25] grid_4.5.1           stats4_4.5.1         e1071_1.7-17        
## [28] future_1.67.0        globals_0.18.0       iterators_1.0.14    
## [31] MASS_7.3-65          cli_3.6.5            rmarkdown_2.29      
## [34] generics_0.1.4       rstudioapi_0.17.1    future.apply_1.20.0 
## [37] reshape2_1.4.5       tzdb_0.5.0           proxy_0.4-29        
## [40] cachem_1.1.0         splines_4.5.1        parallel_4.5.1      
## [43] vctrs_0.7.1          hardhat_1.4.2        Matrix_1.7-3        
## [46] jsonlite_2.0.0       hms_1.1.3            listenv_0.9.1       
## [49] systemfonts_1.3.2    foreach_1.5.2        gower_1.0.2         
## [52] jquerylib_0.1.4      recipes_1.3.1        glue_1.8.0          
## [55] parallelly_1.45.1    codetools_0.2-20     stringi_1.8.7       
## [58] gtable_0.3.6         pillar_1.11.1        htmltools_0.5.8.1   
## [61] ipred_0.9-15         lava_1.8.1           R6_2.6.1            
## [64] textshaping_1.0.3    evaluate_1.0.5       bslib_0.9.0         
## [67] class_7.3-23         Rcpp_1.1.0           svglite_2.2.2       
## [70] nlme_3.1-168         prodlim_2025.04.28   xfun_0.53           
## [73] pkgconfig_2.0.3      ModelMetrics_1.2.2.2

HW07_Raffy

114035129_Mochammad Raffy Anugerah

2026-04-26