Introduction

This analysis develops and evaluates multiple predictive models for 30-day all-cause readmission following isolated coronary artery bypass graft (CABG) surgery. The dataset is a synthetic digital twin comprising 10,000 patient records with 16 variables, calibrated to published Ontario cardiac surgery benchmarks (target readmission rate ~13%).

Each model type is trained in two configurations:

  • Pre-operative models use only variables available before surgery (demographics, comorbidities, baseline labs, EuroSCORE II). These are actionable for pre-surgical risk stratification.
  • Full models add the five post-operative clinical endpoints (AF, AKI, DSWI, stroke, chronic pain), capturing the complete clinical picture at or after discharge.

Six classification algorithms are evaluated: Logistic Regression, K-Nearest Neighbors, Support Vector Machine, Naive Bayes, Decision Tree, and Random Forest.

Environment Setup

Package Installation and Loading

if (!require("pacman")) install.packages("pacman")
pacman::p_load(
  rstudioapi,    # Working directory
  dplyr,         # Data manipulation
  caTools,       # Stratified train/test split
  caret,         # Confusion matrix
  class,         # K-NN
  e1071,         # SVM, Naive Bayes
  rpart,         # Decision Tree
  rpart.plot,    # Decision Tree plots
  randomForest,  # Random Forest
  pROC,          # ROC curves
  ggplot2,       # Visualization
  gridExtra,     # Multiple plots
  knitr,         # Table formatting
  scales         # Plot formatting
)

# Set working directory to the folder containing this RMD file
# When knitting, knitr automatically sets the working directory to the RMD location
# The rstudioapi call is a fallback for interactive use only
if (interactive() && nchar(rstudioapi::getActiveDocumentContext()$path) > 0) {
  setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
}
cat("Working directory:", getwd(), "\n")
## Working directory: C:/Users/HusnaMalik/Downloads/Group 4 - Code

Data Loading and Exploration

dataset <- read.csv("cabg_digital_twin_dataset.csv")
cat("Original dimensions:", dim(dataset)[1], "rows,", dim(dataset)[2], "columns\n\n")
## Original dimensions: 10000 rows, 16 columns
cat("=== Missing Values Per Column ===\n")
## === Missing Values Per Column ===
print(colSums(is.na(dataset)))
##            Readmission_30Day                          Age 
##                            0                          183 
##                          Sex                  Postal_Code 
##                            0                            0 
##                 EuroSCORE_II            Ejection_Fraction 
##                          166                          206 
##             Serum_Creatinine               Pre_Op_Albumin 
##                          204                          197 
##                     Diabetes                 Hypertension 
##                          190                          193 
##                     Prior_MI                    PostOp_AF 
##                          182                          211 
##                   PostOp_AKI Deep_Sternal_Wound_Infection 
##                          202                          179 
##                PostOp_Stroke          PostOp_Chronic_Pain 
##                          195                          201
cat("\n=== Target Variable Distribution ===\n")
## 
## === Target Variable Distribution ===
cat("Readmission rate:", round(mean(dataset$Readmission_30Day, na.rm = TRUE) * 100, 1), "%\n")
## Readmission rate: 13 %
table(dataset$Readmission_30Day)
## 
##    0    1 
## 8700 1300
summary(dataset)
##  Readmission_30Day      Age            Sex            Postal_Code       
##  Min.   :0.00      Min.   :40.00   Length:10000       Length:10000      
##  1st Qu.:0.00      1st Qu.:60.00   Class :character   Class :character  
##  Median :0.00      Median :66.00   Mode  :character   Mode  :character  
##  Mean   :0.13      Mean   :65.91                                        
##  3rd Qu.:0.00      3rd Qu.:72.00                                        
##  Max.   :1.00      Max.   :90.00                                        
##                    NA's   :183                                          
##   EuroSCORE_II    Ejection_Fraction Serum_Creatinine Pre_Op_Albumin
##  Min.   : 0.500   Min.   :15.00     Min.   :0.500    Min.   :1.6   
##  1st Qu.: 3.200   1st Qu.:47.00     1st Qu.:0.800    1st Qu.:3.5   
##  Median : 4.300   Median :55.00     Median :1.100    Median :3.8   
##  Mean   : 4.686   Mean   :54.33     Mean   :1.138    Mean   :3.8   
##  3rd Qu.: 5.700   3rd Qu.:63.00     3rd Qu.:1.400    3rd Qu.:4.1   
##  Max.   :20.800   Max.   :70.00     Max.   :4.100    Max.   :5.0   
##  NA's   :166      NA's   :206       NA's   :204      NA's   :197   
##     Diabetes       Hypertension       Prior_MI        PostOp_AF     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :1.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.3525   Mean   :0.7459   Mean   :0.3423   Mean   :0.2644  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  NA's   :190      NA's   :193      NA's   :182      NA's   :211     
##    PostOp_AKI      Deep_Sternal_Wound_Infection PostOp_Stroke   
##  Min.   :0.00000   Min.   :0.00000              Min.   :0.0000  
##  1st Qu.:0.00000   1st Qu.:0.00000              1st Qu.:0.0000  
##  Median :0.00000   Median :0.00000              Median :0.0000  
##  Mean   :0.05777   Mean   :0.02994              Mean   :0.0154  
##  3rd Qu.:0.00000   3rd Qu.:0.00000              3rd Qu.:0.0000  
##  Max.   :1.00000   Max.   :1.00000              Max.   :1.0000  
##  NA's   :202       NA's   :179                  NA's   :195     
##  PostOp_Chronic_Pain
##  Min.   :0.0000     
##  1st Qu.:0.0000     
##  Median :0.0000     
##  Mean   :0.2895     
##  3rd Qu.:1.0000     
##  Max.   :1.0000     
##  NA's   :201
str(dataset)
## 'data.frame':    10000 obs. of  16 variables:
##  $ Readmission_30Day           : int  0 0 0 0 0 1 0 0 1 0 ...
##  $ Age                         : int  63 63 67 77 49 76 73 65 71 62 ...
##  $ Sex                         : chr  "F" "F" "" "M" ...
##  $ Postal_Code                 : chr  "K0L" "M7V" "P6L" "L4S" ...
##  $ EuroSCORE_II                : num  6 4.8 9.6 7.6 3.9 2.9 4 3 4.4 1.7 ...
##  $ Ejection_Fraction           : int  51 40 70 53 59 70 57 55 46 53 ...
##  $ Serum_Creatinine            : num  1.5 1.4 1.5 1 1.3 1 1.3 1.7 1.4 2.1 ...
##  $ Pre_Op_Albumin              : num  3.4 NA 4.7 3.6 3.7 4.2 3.7 4.9 3.5 NA ...
##  $ Diabetes                    : int  0 1 1 0 1 0 0 0 0 1 ...
##  $ Hypertension                : int  1 0 1 1 1 0 1 1 1 1 ...
##  $ Prior_MI                    : int  0 0 0 0 0 0 0 0 1 0 ...
##  $ PostOp_AF                   : int  0 0 1 1 0 1 0 1 0 1 ...
##  $ PostOp_AKI                  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Deep_Sternal_Wound_Infection: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PostOp_Stroke               : int  0 0 0 0 0 NA 0 0 0 0 ...
##  $ PostOp_Chronic_Pain         : int  0 0 0 1 0 0 0 0 NA 1 ...

Data Preprocessing

data_clean <- dataset

# --- Impute missing values ---
# Sex: replace NA/empty with mode (M)
data_clean$Sex[is.na(data_clean$Sex) | data_clean$Sex == ""] <- "M"

# Numeric and binary columns: median imputation
numeric_cols <- c("Age", "EuroSCORE_II", "Ejection_Fraction", "Serum_Creatinine",
                  "Pre_Op_Albumin", "Diabetes", "Hypertension", "Prior_MI",
                  "PostOp_AF", "PostOp_AKI", "Deep_Sternal_Wound_Infection",
                  "PostOp_Stroke", "PostOp_Chronic_Pain")

for (col in numeric_cols) {
  data_clean[[col]][is.na(data_clean[[col]])] <- median(data_clean[[col]], na.rm = TRUE)
}

cat("Missing values after imputation:", sum(is.na(data_clean[, names(data_clean) != "Postal_Code"])), "\n")
## Missing values after imputation: 0
# --- Convert to appropriate types ---
data_clean$Readmission_30Day <- factor(data_clean$Readmission_30Day, levels = c(0, 1))
data_clean$Sex <- factor(data_clean$Sex, levels = c("M", "F"))

binary_cols <- c("Diabetes", "Hypertension", "Prior_MI", "PostOp_AF", "PostOp_AKI",
                 "Deep_Sternal_Wound_Infection", "PostOp_Stroke", "PostOp_Chronic_Pain")

for (col in binary_cols) {
  data_clean[[col]] <- factor(as.integer(round(data_clean[[col]])), levels = c(0, 1))
}

cat("\n=== Post-Preprocessing Validation ===\n")
## 
## === Post-Preprocessing Validation ===
cat("Readmission rate preserved:", round(mean(as.numeric(as.character(data_clean$Readmission_30Day))) * 100, 1), "%\n")
## Readmission rate preserved: 13 %
cat("Sex distribution: M =", round(mean(data_clean$Sex == "M") * 100, 1), "%, F =", round(mean(data_clean$Sex == "F") * 100, 1), "%\n")
## Sex distribution: M = 78.3 %, F = 21.6 %
str(data_clean)
## 'data.frame':    10000 obs. of  16 variables:
##  $ Readmission_30Day           : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 1 2 1 ...
##  $ Age                         : int  63 63 67 77 49 76 73 65 71 62 ...
##  $ Sex                         : Factor w/ 2 levels "M","F": 2 2 1 1 1 1 1 1 1 1 ...
##  $ Postal_Code                 : chr  "K0L" "M7V" "P6L" "L4S" ...
##  $ EuroSCORE_II                : num  6 4.8 9.6 7.6 3.9 2.9 4 3 4.4 1.7 ...
##  $ Ejection_Fraction           : num  51 40 70 53 59 70 57 55 46 53 ...
##  $ Serum_Creatinine            : num  1.5 1.4 1.5 1 1.3 1 1.3 1.7 1.4 2.1 ...
##  $ Pre_Op_Albumin              : num  3.4 3.8 4.7 3.6 3.7 4.2 3.7 4.9 3.5 3.8 ...
##  $ Diabetes                    : Factor w/ 2 levels "0","1": 1 2 2 1 2 1 1 1 1 2 ...
##  $ Hypertension                : Factor w/ 2 levels "0","1": 2 1 2 2 2 1 2 2 2 2 ...
##  $ Prior_MI                    : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 2 1 ...
##  $ PostOp_AF                   : Factor w/ 2 levels "0","1": 1 1 2 2 1 2 1 2 1 2 ...
##  $ PostOp_AKI                  : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Deep_Sternal_Wound_Infection: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ PostOp_Stroke               : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ PostOp_Chronic_Pain         : Factor w/ 2 levels "0","1": 1 1 1 2 1 1 1 1 1 2 ...

Train/Test Split

set.seed(123)
split <- sample.split(data_clean$Readmission_30Day, SplitRatio = 2/3)
train_data <- subset(data_clean, split == TRUE)
test_data  <- subset(data_clean, split == FALSE)

cat("Training set:", nrow(train_data), "rows\n")
## Training set: 6667 rows
cat("Test set:", nrow(test_data), "rows\n")
## Test set: 3333 rows
cat("Training readmission rate:", round(mean(as.numeric(as.character(train_data$Readmission_30Day))) * 100, 1), "%\n")
## Training readmission rate: 13 %
cat("Test readmission rate:", round(mean(as.numeric(as.character(test_data$Readmission_30Day))) * 100, 1), "%\n")
## Test readmission rate: 13 %

Predictor Definitions

# Pre-operative predictors (available before surgery)
preop_vars <- c("Age", "Sex", "EuroSCORE_II", "Ejection_Fraction",
                "Serum_Creatinine", "Pre_Op_Albumin",
                "Diabetes", "Hypertension", "Prior_MI")

# Post-operative endpoints
postop_vars <- c("PostOp_AF", "PostOp_AKI", "Deep_Sternal_Wound_Infection",
                 "PostOp_Stroke", "PostOp_Chronic_Pain")

# Full model predictors
full_vars <- c(preop_vars, postop_vars)

# Formulas
formula_preop <- as.formula(paste("Readmission_30Day ~", paste(preop_vars, collapse = " + ")))
formula_full  <- as.formula(paste("Readmission_30Day ~", paste(full_vars, collapse = " + ")))

cat("Pre-op predictors:", length(preop_vars), "\n")
## Pre-op predictors: 9
cat("Full predictors:", length(full_vars), "\n")
## Full predictors: 14

Numeric Data for KNN and SVM

KNN and SVM require all-numeric inputs. Sex is converted to 0/1 and continuous features are scaled.

# Helper: convert factor data to numeric for KNN/SVM
make_numeric <- function(df, vars) {
  out <- data.frame(row.names = 1:nrow(df))
  for (v in vars) {
    if (is.factor(df[[v]])) {
      out[[v]] <- as.numeric(as.character(df[[v]]))
      # For Sex: M=0, F=1
      if (v == "Sex") out[[v]] <- ifelse(df[[v]] == "F", 1, 0)
    } else {
      out[[v]] <- as.numeric(df[[v]])
    }
  }
  return(out)
}

train_num_preop <- make_numeric(train_data, preop_vars)
test_num_preop  <- make_numeric(test_data, preop_vars)
train_num_full  <- make_numeric(train_data, full_vars)
test_num_full   <- make_numeric(test_data, full_vars)

# Scale continuous features (preserving binary columns as 0/1)
continuous_vars <- c("Age", "EuroSCORE_II", "Ejection_Fraction", "Serum_Creatinine", "Pre_Op_Albumin")

scale_data <- function(train_df, test_df, cont_vars) {
  for (v in cont_vars) {
    m <- mean(train_df[[v]], na.rm = TRUE)
    s <- sd(train_df[[v]], na.rm = TRUE)
    if (s > 0) {
      train_df[[v]] <- (train_df[[v]] - m) / s
      test_df[[v]]  <- (test_df[[v]] - m) / s
    }
  }
  return(list(train = train_df, test = test_df))
}

scaled_preop <- scale_data(train_num_preop, test_num_preop, continuous_vars)
scaled_full  <- scale_data(train_num_full, test_num_full, continuous_vars)

Helper Functions

# Extract predicted probabilities for the positive class (readmitted = 1)
get_probs <- function(model, newdata, model_type, num_newdata = NULL) {
  if (model_type == "logistic") {
    return(predict(model, newdata = newdata, type = "response"))
  } else if (model_type == "knn") {
    # KNN returns class; probabilities come from the prob attribute
    # This is handled separately in the KNN training section
    return(NULL)
  } else if (model_type == "svm") {
    pred <- predict(model, newdata = num_newdata, probability = TRUE)
    probs <- attr(pred, "probabilities")
    return(probs[, "1"])
  } else if (model_type == "nb") {
    pred <- predict(model, newdata = newdata, type = "raw")
    return(pred[, "1"])
  } else if (model_type == "tree") {
    return(predict(model, newdata = newdata, type = "prob")[, "1"])
  } else if (model_type == "rf") {
    return(predict(model, newdata = newdata, type = "prob")[, "1"])
  }
}

# Evaluate at a given threshold
evaluate_at_threshold <- function(actual, probs, threshold = 0.5) {
  actual_num <- as.numeric(as.character(actual))
  predicted <- ifelse(probs >= threshold, 1, 0)
  
  TP <- sum(predicted == 1 & actual_num == 1)
  TN <- sum(predicted == 0 & actual_num == 0)
  FP <- sum(predicted == 1 & actual_num == 0)
  FN <- sum(predicted == 0 & actual_num == 1)
  
  accuracy    <- (TP + TN) / (TP + TN + FP + FN)
  sensitivity <- ifelse((TP + FN) == 0, 0, TP / (TP + FN))
  specificity <- ifelse((TN + FP) == 0, 0, TN / (TN + FP))
  
  return(list(accuracy = accuracy, sensitivity = sensitivity, specificity = specificity))
}

# Find Youden's J optimal threshold
find_youden_threshold <- function(actual, probs) {
  roc_obj <- roc(as.numeric(as.character(actual)), probs, levels = c(0, 1), quiet = TRUE)
  best <- coords(roc_obj, "best", best.method = "youden", ret = c("threshold", "sensitivity", "specificity"))
  return(list(threshold = best$threshold[1], sensitivity = best$sensitivity[1], 
              specificity = best$specificity[1], auc = as.numeric(auc(roc_obj))))
}

Model Training: Pre-Operative Predictors

Logistic Regression (Pre-Op)

logit_preop <- glm(formula_preop, data = train_data, family = binomial)
cat("=== Logistic Regression (Pre-Op) Summary ===\n")
## === Logistic Regression (Pre-Op) Summary ===
summary(logit_preop)
## 
## Call:
## glm(formula = formula_preop, family = binomial, data = train_data)
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -1.6940678  0.4630802  -3.658 0.000254 ***
## Age                0.0107179  0.0042397   2.528 0.011472 *  
## SexF               0.3255209  0.0836410   3.892 9.95e-05 ***
## EuroSCORE_II       0.0353438  0.0169043   2.091 0.036544 *  
## Ejection_Fraction  0.0008493  0.0034169   0.249 0.803706    
## Serum_Creatinine   0.0301018  0.0958239   0.314 0.753417    
## Pre_Op_Albumin    -0.3523557  0.0737299  -4.779 1.76e-06 ***
## Diabetes1          0.1142533  0.0784659   1.456 0.145369    
## Hypertension1      0.0336808  0.0852573   0.395 0.692807    
## Prior_MI1          0.0606529  0.0768445   0.789 0.429940    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5153.2  on 6666  degrees of freedom
## Residual deviance: 5100.8  on 6657  degrees of freedom
## AIC: 5120.8
## 
## Number of Fisher Scoring iterations: 4
cat("\n=== Odds Ratios ===\n")
## 
## === Odds Ratios ===
print(round(exp(coef(logit_preop)), 3))
##       (Intercept)               Age              SexF      EuroSCORE_II 
##             0.184             1.011             1.385             1.036 
## Ejection_Fraction  Serum_Creatinine    Pre_Op_Albumin         Diabetes1 
##             1.001             1.031             0.703             1.121 
##     Hypertension1         Prior_MI1 
##             1.034             1.063
probs_logit_preop <- predict(logit_preop, newdata = test_data, type = "response")

K-Nearest Neighbors (Pre-Op)

knn_pred_preop <- knn(
  train = scaled_preop$train,
  test  = scaled_preop$test,
  cl    = train_data$Readmission_30Day,
  k     = 10,
  prob  = TRUE
)

# Extract probabilities for the positive class
probs_knn_preop <- attr(knn_pred_preop, "prob")
probs_knn_preop <- ifelse(knn_pred_preop == "1", probs_knn_preop, 1 - probs_knn_preop)

cat("=== KNN (Pre-Op) Class Distribution ===\n")
## === KNN (Pre-Op) Class Distribution ===
table(knn_pred_preop)
## knn_pred_preop
##    0    1 
## 3319   14

Support Vector Machine (Pre-Op)

# SVM needs numeric data; use unscaled numeric (SVM with linear kernel handles scaling internally)
svm_preop <- svm(
  x = train_num_preop,
  y = train_data$Readmission_30Day,
  type = "C-classification",
  kernel = "linear",
  probability = TRUE
)

cat("=== SVM (Pre-Op) Summary ===\n")
## === SVM (Pre-Op) Summary ===
summary(svm_preop)
## 
## Call:
## svm.default(x = train_num_preop, y = train_data$Readmission_30Day, 
##     type = "C-classification", kernel = "linear", probability = TRUE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
## 
## Number of Support Vectors:  1927
## 
##  ( 1060 867 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1
svm_pred_preop <- predict(svm_preop, newdata = test_num_preop, probability = TRUE)
probs_svm_preop <- attr(svm_pred_preop, "probabilities")[, "1"]

Naive Bayes (Pre-Op)

nb_preop <- naiveBayes(formula_preop, data = train_data)

cat("=== Naive Bayes (Pre-Op) Model ===\n")
## === Naive Bayes (Pre-Op) Model ===
print(nb_preop)
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##         0         1 
## 0.8699565 0.1300435 
## 
## Conditional probabilities:
##    Age
## Y       [,1]     [,2]
##   0 65.79069 8.873018
##   1 66.71050 8.980755
## 
##    Sex
## Y           M         F
##   0 0.7903448 0.2096552
##   1 0.7335640 0.2664360
## 
##    EuroSCORE_II
## Y       [,1]     [,2]
##   0 4.645086 2.085400
##   1 4.832526 2.198494
## 
##    Ejection_Fraction
## Y       [,1]     [,2]
##   0 54.25776 10.93811
##   1 54.16955 11.29052
## 
##    Serum_Creatinine
## Y       [,1]      [,2]
##   0 1.137017 0.4026303
##   1 1.151557 0.4092259
## 
##    Pre_Op_Albumin
## Y       [,1]      [,2]
##   0 3.812534 0.4949649
##   1 3.727566 0.4937147
## 
##    Diabetes
## Y           0         1
##   0 0.6581034 0.3418966
##   1 0.6320646 0.3679354
## 
##    Hypertension
## Y           0         1
##   0 0.2498276 0.7501724
##   1 0.2422145 0.7577855
## 
##    Prior_MI
## Y           0         1
##   0 0.6656897 0.3343103
##   1 0.6516724 0.3483276
probs_nb_preop <- predict(nb_preop, newdata = test_data, type = "raw")[, "1"]

Decision Tree (Pre-Op)

tree_preop <- rpart(formula_preop, data = train_data, method = "class")

cat("=== Decision Tree (Pre-Op) Complexity Table ===\n")
## === Decision Tree (Pre-Op) Complexity Table ===
printcp(tree_preop)
## 
## Classification tree:
## rpart(formula = formula_preop, data = train_data, method = "class")
## 
## Variables actually used in tree construction:
## character(0)
## 
## Root node error: 867/6667 = 0.13004
## 
## n= 6667 
## 
##   CP nsplit rel error xerror xstd
## 1  0      0         1      0    0
# Only plot if the tree has splits
if (nrow(tree_preop$frame) > 1) {
  rpart.plot(tree_preop, main = "Decision Tree (Pre-Op)")
} else {
  cat("Note: Tree produced no splits (root-only). All patients predicted as majority class.\n")
}
## Note: Tree produced no splits (root-only). All patients predicted as majority class.
probs_tree_preop <- predict(tree_preop, newdata = test_data, type = "prob")[, "1"]

Random Forest (Pre-Op)

set.seed(123)
rf_preop <- randomForest(
  formula_preop, data = train_data,
  ntree = 500, importance = TRUE
)

cat("=== Random Forest (Pre-Op) ===\n")
## === Random Forest (Pre-Op) ===
print(rf_preop)
## 
## Call:
##  randomForest(formula = formula_preop, data = train_data, ntree = 500,      importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 13.15%
## Confusion matrix:
##      0  1 class.error
## 0 5787 13 0.002241379
## 1  864  3 0.996539792
varImpPlot(rf_preop, main = "Variable Importance (Pre-Op Random Forest)")

probs_rf_preop <- predict(rf_preop, newdata = test_data, type = "prob")[, "1"]

Model Training: Full Predictors (Pre-Op + Post-Op)

Logistic Regression (Full)

logit_full <- glm(formula_full, data = train_data, family = binomial)
cat("=== Logistic Regression (Full) Summary ===\n")
## === Logistic Regression (Full) Summary ===
summary(logit_full)
## 
## Call:
## glm(formula = formula_full, family = binomial, data = train_data)
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -1.749519   0.469246  -3.728 0.000193 ***
## Age                            0.009249   0.004328   2.137 0.032586 *  
## SexF                           0.338001   0.083902   4.028 5.61e-05 ***
## EuroSCORE_II                   0.034201   0.016905   2.023 0.043057 *  
## Ejection_Fraction              0.001542   0.003434   0.449 0.653421    
## Serum_Creatinine               0.022294   0.096139   0.232 0.816617    
## Pre_Op_Albumin                -0.357740   0.074042  -4.832 1.35e-06 ***
## Diabetes1                      0.092019   0.079033   1.164 0.244295    
## Hypertension1                  0.028181   0.085456   0.330 0.741576    
## Prior_MI1                      0.065245   0.077048   0.847 0.397101    
## PostOp_AF1                     0.300150   0.081016   3.705 0.000212 ***
## PostOp_AKI1                    0.263192   0.143811   1.830 0.067231 .  
## Deep_Sternal_Wound_Infection1  0.518557   0.187105   2.771 0.005580 ** 
## PostOp_Stroke1                 0.364967   0.278778   1.309 0.190478    
## PostOp_Chronic_Pain1           0.115169   0.080694   1.427 0.153517    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5153.2  on 6666  degrees of freedom
## Residual deviance: 5073.7  on 6652  degrees of freedom
## AIC: 5103.7
## 
## Number of Fisher Scoring iterations: 4
cat("\n=== Odds Ratios ===\n")
## 
## === Odds Ratios ===
print(round(exp(coef(logit_full)), 3))
##                   (Intercept)                           Age 
##                         0.174                         1.009 
##                          SexF                  EuroSCORE_II 
##                         1.402                         1.035 
##             Ejection_Fraction              Serum_Creatinine 
##                         1.002                         1.023 
##                Pre_Op_Albumin                     Diabetes1 
##                         0.699                         1.096 
##                 Hypertension1                     Prior_MI1 
##                         1.029                         1.067 
##                    PostOp_AF1                   PostOp_AKI1 
##                         1.350                         1.301 
## Deep_Sternal_Wound_Infection1                PostOp_Stroke1 
##                         1.680                         1.440 
##          PostOp_Chronic_Pain1 
##                         1.122
cat("\n=== 95% Confidence Intervals for Odds Ratios ===\n")
## 
## === 95% Confidence Intervals for Odds Ratios ===
print(round(exp(confint(logit_full)), 3))
##                               2.5 % 97.5 %
## (Intercept)                   0.069  0.435
## Age                           1.001  1.018
## SexF                          1.188  1.651
## EuroSCORE_II                  1.001  1.069
## Ejection_Fraction             0.995  1.008
## Serum_Creatinine              0.845  1.232
## Pre_Op_Albumin                0.605  0.808
## Diabetes1                     0.938  1.279
## Hypertension1                 0.871  1.218
## Prior_MI1                     0.917  1.240
## PostOp_AF1                    1.151  1.581
## PostOp_AKI1                   0.974  1.713
## Deep_Sternal_Wound_Infection1 1.149  2.397
## PostOp_Stroke1                0.806  2.421
## PostOp_Chronic_Pain1          0.957  1.313
probs_logit_full <- predict(logit_full, newdata = test_data, type = "response")

K-Nearest Neighbors (Full)

knn_pred_full <- knn(
  train = scaled_full$train,
  test  = scaled_full$test,
  cl    = train_data$Readmission_30Day,
  k     = 10,
  prob  = TRUE
)

probs_knn_full <- attr(knn_pred_full, "prob")
probs_knn_full <- ifelse(knn_pred_full == "1", probs_knn_full, 1 - probs_knn_full)

cat("=== KNN (Full) Class Distribution ===\n")
## === KNN (Full) Class Distribution ===
table(knn_pred_full)
## knn_pred_full
##    0    1 
## 3323   10

Support Vector Machine (Full)

svm_full <- svm(
  x = train_num_full,
  y = train_data$Readmission_30Day,
  type = "C-classification",
  kernel = "linear",
  probability = TRUE
)

cat("=== SVM (Full) Summary ===\n")
## === SVM (Full) Summary ===
summary(svm_full)
## 
## Call:
## svm.default(x = train_num_full, y = train_data$Readmission_30Day, 
##     type = "C-classification", kernel = "linear", probability = TRUE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
## 
## Number of Support Vectors:  2123
## 
##  ( 1256 867 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1
svm_pred_full <- predict(svm_full, newdata = test_num_full, probability = TRUE)
probs_svm_full <- attr(svm_pred_full, "probabilities")[, "1"]

Naive Bayes (Full)

nb_full <- naiveBayes(formula_full, data = train_data)

cat("=== Naive Bayes (Full) Model ===\n")
## === Naive Bayes (Full) Model ===
print(nb_full)
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##         0         1 
## 0.8699565 0.1300435 
## 
## Conditional probabilities:
##    Age
## Y       [,1]     [,2]
##   0 65.79069 8.873018
##   1 66.71050 8.980755
## 
##    Sex
## Y           M         F
##   0 0.7903448 0.2096552
##   1 0.7335640 0.2664360
## 
##    EuroSCORE_II
## Y       [,1]     [,2]
##   0 4.645086 2.085400
##   1 4.832526 2.198494
## 
##    Ejection_Fraction
## Y       [,1]     [,2]
##   0 54.25776 10.93811
##   1 54.16955 11.29052
## 
##    Serum_Creatinine
## Y       [,1]      [,2]
##   0 1.137017 0.4026303
##   1 1.151557 0.4092259
## 
##    Pre_Op_Albumin
## Y       [,1]      [,2]
##   0 3.812534 0.4949649
##   1 3.727566 0.4937147
## 
##    Diabetes
## Y           0         1
##   0 0.6581034 0.3418966
##   1 0.6320646 0.3679354
## 
##    Hypertension
## Y           0         1
##   0 0.2498276 0.7501724
##   1 0.2422145 0.7577855
## 
##    Prior_MI
## Y           0         1
##   0 0.6656897 0.3343103
##   1 0.6516724 0.3483276
## 
##    PostOp_AF
## Y           0         1
##   0 0.7537931 0.2462069
##   1 0.6908881 0.3091119
## 
##    PostOp_AKI
## Y            0          1
##   0 0.94362069 0.05637931
##   1 0.92618224 0.07381776
## 
##    Deep_Sternal_Wound_Infection
## Y            0          1
##   0 0.97327586 0.02672414
##   1 0.95617070 0.04382930
## 
##    PostOp_Stroke
## Y            0          1
##   0 0.98655172 0.01344828
##   1 0.98154556 0.01845444
## 
##    PostOp_Chronic_Pain
## Y           0         1
##   0 0.7200000 0.2800000
##   1 0.7001153 0.2998847
probs_nb_full <- predict(nb_full, newdata = test_data, type = "raw")[, "1"]

Decision Tree (Full)

tree_full <- rpart(formula_full, data = train_data, method = "class")

cat("=== Decision Tree (Full) Complexity Table ===\n")
## === Decision Tree (Full) Complexity Table ===
printcp(tree_full)
## 
## Classification tree:
## rpart(formula = formula_full, data = train_data, method = "class")
## 
## Variables actually used in tree construction:
## character(0)
## 
## Root node error: 867/6667 = 0.13004
## 
## n= 6667 
## 
##   CP nsplit rel error xerror xstd
## 1  0      0         1      0    0
if (nrow(tree_full$frame) > 1) {
  rpart.plot(tree_full, main = "Decision Tree (Full Model)")
} else {
  cat("Note: Tree produced no splits (root-only). All patients predicted as majority class.\n")
}
## Note: Tree produced no splits (root-only). All patients predicted as majority class.
probs_tree_full <- predict(tree_full, newdata = test_data, type = "prob")[, "1"]

Random Forest (Full)

set.seed(123)
rf_full <- randomForest(
  formula_full, data = train_data,
  ntree = 500, importance = TRUE
)

cat("=== Random Forest (Full) ===\n")
## === Random Forest (Full) ===
print(rf_full)
## 
## Call:
##  randomForest(formula = formula_full, data = train_data, ntree = 500,      importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 13%
## Confusion matrix:
##      0 1 class.error
## 0 5800 0           0
## 1  867 0           1
varImpPlot(rf_full, main = "Variable Importance (Full Random Forest)")

probs_rf_full <- predict(rf_full, newdata = test_data, type = "prob")[, "1"]

Model Evaluation

Performance at Default Threshold (0.5)

At the standard 0.5 probability threshold, class imbalance (~13% readmission rate) causes most models to predict the majority class for all patients. This is expected and is documented here for completeness.

# Collect all probability vectors
all_probs <- list(
  Logistic_PreOp = probs_logit_preop, Logistic_Full = probs_logit_full,
  KNN_PreOp = probs_knn_preop,        KNN_Full = probs_knn_full,
  SVM_PreOp = probs_svm_preop,        SVM_Full = probs_svm_full,
  NaiveBayes_PreOp = probs_nb_preop,  NaiveBayes_Full = probs_nb_full,
  DecisionTree_PreOp = probs_tree_preop, DecisionTree_Full = probs_tree_full,
  RandomForest_PreOp = probs_rf_preop,   RandomForest_Full = probs_rf_full
)

# Evaluate all at 0.5
default_results <- data.frame(
  Model = character(), Accuracy = numeric(),
  Sensitivity = numeric(), Specificity = numeric(),
  stringsAsFactors = FALSE
)

for (name in names(all_probs)) {
  eval <- evaluate_at_threshold(test_data$Readmission_30Day, all_probs[[name]], 0.5)
  default_results <- rbind(default_results, data.frame(
    Model = name, Accuracy = round(eval$accuracy, 4),
    Sensitivity = round(eval$sensitivity, 4),
    Specificity = round(eval$specificity, 4),
    stringsAsFactors = FALSE
  ))
}

cat("=== Model Performance at Default Threshold (0.5) ===\n")
## === Model Performance at Default Threshold (0.5) ===
cat("Note: Near-zero sensitivity is expected with 13% positive class rate.\n\n")
## Note: Near-zero sensitivity is expected with 13% positive class rate.
print(default_results, row.names = FALSE)
##               Model Accuracy Sensitivity Specificity
##      Logistic_PreOp   0.8701      0.0000      1.0000
##       Logistic_Full   0.8701      0.0000      1.0000
##           KNN_PreOp   0.8650      0.0069      0.9931
##            KNN_Full   0.8665      0.0046      0.9952
##           SVM_PreOp   0.8701      0.0000      1.0000
##            SVM_Full   0.8701      0.0000      1.0000
##    NaiveBayes_PreOp   0.8695      0.0000      0.9993
##     NaiveBayes_Full   0.8695      0.0000      0.9993
##  DecisionTree_PreOp   0.8701      0.0000      1.0000
##   DecisionTree_Full   0.8701      0.0000      1.0000
##  RandomForest_PreOp   0.8689      0.0000      0.9986
##   RandomForest_Full   0.8698      0.0000      0.9997

ROC Analysis and AUC

The ROC curve evaluates discriminative ability across all possible thresholds, independent of any single cutoff.

# Compute ROC and AUC for all models
roc_results <- data.frame(
  Model = character(), AUC = numeric(), stringsAsFactors = FALSE
)

roc_objects <- list()

for (name in names(all_probs)) {
  roc_obj <- roc(as.numeric(as.character(test_data$Readmission_30Day)),
                 all_probs[[name]], levels = c(0, 1), quiet = TRUE)
  roc_objects[[name]] <- roc_obj
  roc_results <- rbind(roc_results, data.frame(
    Model = name, AUC = round(as.numeric(auc(roc_obj)), 4),
    stringsAsFactors = FALSE
  ))
}

roc_results <- roc_results[order(-roc_results$AUC), ]
cat("=== AUC Rankings ===\n")
## === AUC Rankings ===
print(roc_results, row.names = FALSE)
##               Model    AUC
##     NaiveBayes_Full 0.5613
##       Logistic_Full 0.5579
##    NaiveBayes_PreOp 0.5392
##      Logistic_PreOp 0.5361
##   RandomForest_Full 0.5339
##           KNN_PreOp 0.5200
##            KNN_Full 0.5184
##           SVM_PreOp 0.5177
##  RandomForest_PreOp 0.5127
##            SVM_Full 0.5120
##  DecisionTree_PreOp 0.5000
##   DecisionTree_Full 0.5000

Combined ROC Curves (Pre-Op Models)

preop_models <- c("Logistic_PreOp", "KNN_PreOp", "SVM_PreOp",
                  "NaiveBayes_PreOp", "DecisionTree_PreOp", "RandomForest_PreOp")
colors_preop <- c("blue", "darkgreen", "red", "purple", "orange", "brown")

plot(roc_objects[[preop_models[1]]], col = colors_preop[1], lwd = 2,
     main = "ROC Curves: Pre-Operative Models")
for (i in 2:length(preop_models)) {
  lines(roc_objects[[preop_models[i]]], col = colors_preop[i], lwd = 2)
}
abline(a = 0, b = 1, lty = 3, col = "gray")
legend("bottomright",
       legend = paste0(gsub("_PreOp", "", preop_models), " (AUC=",
                       round(sapply(preop_models, function(m) as.numeric(auc(roc_objects[[m]]))), 3), ")"),
       col = colors_preop, lwd = 2, cex = 0.8)

Combined ROC Curves (Full Models)

full_models <- c("Logistic_Full", "KNN_Full", "SVM_Full",
                 "NaiveBayes_Full", "DecisionTree_Full", "RandomForest_Full")
colors_full <- c("darkblue", "forestgreen", "darkred", "darkorchid", "darkorange", "saddlebrown")

plot(roc_objects[[full_models[1]]], col = colors_full[1], lwd = 2,
     main = "ROC Curves: Full Models (Pre-Op + Post-Op)")
for (i in 2:length(full_models)) {
  lines(roc_objects[[full_models[i]]], col = colors_full[i], lwd = 2)
}
abline(a = 0, b = 1, lty = 3, col = "gray")
legend("bottomright",
       legend = paste0(gsub("_Full", "", full_models), " (AUC=",
                       round(sapply(full_models, function(m) as.numeric(auc(roc_objects[[m]]))), 3), ")"),
       col = colors_full, lwd = 2, cex = 0.8)

Threshold Optimization (Youden’s J)

The default 0.5 threshold fails because predicted probabilities cluster well below 0.5 given the 13% base rate. Youden’s J statistic (Sensitivity + Specificity - 1) identifies the threshold that maximizes the combined ability to detect both readmitted and non-readmitted patients.

youden_results <- data.frame(
  Model = character(), Optimal_Threshold = numeric(),
  Sensitivity = numeric(), Specificity = numeric(),
  Accuracy = numeric(), AUC = numeric(),
  stringsAsFactors = FALSE
)

for (name in names(all_probs)) {
  yj <- find_youden_threshold(test_data$Readmission_30Day, all_probs[[name]])
  eval_opt <- evaluate_at_threshold(test_data$Readmission_30Day, all_probs[[name]], yj$threshold)
  
  youden_results <- rbind(youden_results, data.frame(
    Model = name,
    Optimal_Threshold = round(yj$threshold, 4),
    Sensitivity = round(eval_opt$sensitivity, 4),
    Specificity = round(eval_opt$specificity, 4),
    Accuracy = round(eval_opt$accuracy, 4),
    AUC = round(yj$auc, 4),
    stringsAsFactors = FALSE
  ))
}

cat("=== Model Performance at Youden's J Optimized Threshold ===\n\n")
## === Model Performance at Youden's J Optimized Threshold ===
print(youden_results, row.names = FALSE)
##               Model Optimal_Threshold Sensitivity Specificity Accuracy    AUC
##      Logistic_PreOp            0.1224      0.5982      0.4645   0.4818 0.5361
##       Logistic_Full            0.1202      0.6374      0.4700   0.4917 0.5579
##           KNN_PreOp            0.2864      0.1617      0.8769   0.7840 0.5200
##            KNN_Full            0.1909      0.3603      0.6745   0.6337 0.5184
##           SVM_PreOp            0.1357      0.2864      0.6648   0.6157 0.5177
##            SVM_Full            0.1301      0.3441      0.6097   0.5752 0.5120
##    NaiveBayes_PreOp            0.1313      0.4688      0.6066   0.5887 0.5392
##     NaiveBayes_Full            0.1104      0.7136      0.3862   0.4287 0.5613
##  DecisionTree_PreOp              -Inf      1.0000      0.0000   0.1299 0.5000
##   DecisionTree_Full              -Inf      1.0000      0.0000   0.1299 0.5000
##  RandomForest_PreOp            0.2190      0.1894      0.8534   0.7672 0.5127
##   RandomForest_Full            0.0930      0.6143      0.4579   0.4782 0.5339

Best Model Selection

cat("=== BEST MODEL BY METRIC (Youden-Optimized) ===\n\n")
## === BEST MODEL BY METRIC (Youden-Optimized) ===
best_acc  <- youden_results[which.max(youden_results$Accuracy), ]
best_sens <- youden_results[which.max(youden_results$Sensitivity), ]
best_spec <- youden_results[which.max(youden_results$Specificity), ]
best_auc  <- youden_results[which.max(youden_results$AUC), ]

cat("Best Accuracy:    ", best_acc$Model, " (", best_acc$Accuracy, " at threshold ", best_acc$Optimal_Threshold, ")\n")
## Best Accuracy:     KNN_PreOp  ( 0.784  at threshold  0.2864 )
cat("Best Sensitivity: ", best_sens$Model, " (", best_sens$Sensitivity, " at threshold ", best_sens$Optimal_Threshold, ")\n")
## Best Sensitivity:  DecisionTree_PreOp  ( 1  at threshold  -Inf )
cat("Best Specificity: ", best_spec$Model, " (", best_spec$Specificity, " at threshold ", best_spec$Optimal_Threshold, ")\n")
## Best Specificity:  KNN_PreOp  ( 0.8769  at threshold  0.2864 )
cat("Best AUC:         ", best_auc$Model, " (", best_auc$AUC, ")\n")
## Best AUC:          NaiveBayes_Full  ( 0.5613 )

Comparison Visualization

library(tidyr)

plot_data <- youden_results %>%
  select(Model, Accuracy, Sensitivity, Specificity, AUC) %>%
  pivot_longer(cols = c(Accuracy, Sensitivity, Specificity, AUC),
               names_to = "Metric", values_to = "Value")

# Separate Pre-Op and Full for cleaner labels
plot_data$Config <- ifelse(grepl("PreOp", plot_data$Model), "Pre-Op", "Full")
plot_data$Algorithm <- gsub("_PreOp|_Full", "", plot_data$Model)

ggplot(plot_data, aes(x = Algorithm, y = Value, fill = Config)) +
  geom_bar(stat = "identity", position = "dodge", alpha = 0.8) +
  facet_wrap(~ Metric, scales = "free_y") +
  geom_text(aes(label = round(Value, 3)), position = position_dodge(width = 0.9),
            vjust = -0.3, size = 2.5) +
  labs(title = "Model Performance Comparison (Youden-Optimized Thresholds)",
       x = "Algorithm", y = "Value", fill = "Configuration") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"),
        axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_fill_manual(values = c("Pre-Op" = "#3498db", "Full" = "#e74c3c"))

Threshold Sensitivity Analysis (Best Model)

# Use the best AUC model for threshold sweep
best_model_name <- best_auc$Model
best_probs <- all_probs[[best_model_name]]

thresholds <- seq(0.05, 0.50, by = 0.01)
sweep_results <- data.frame(Threshold = numeric(), Accuracy = numeric(),
                            Sensitivity = numeric(), Specificity = numeric())

for (t in thresholds) {
  ev <- evaluate_at_threshold(test_data$Readmission_30Day, best_probs, t)
  sweep_results <- rbind(sweep_results, data.frame(
    Threshold = t, Accuracy = ev$accuracy,
    Sensitivity = ev$sensitivity, Specificity = ev$specificity
  ))
}

# Youden's J for each threshold
sweep_results$Youden_J <- sweep_results$Sensitivity + sweep_results$Specificity - 1

plot(sweep_results$Threshold, sweep_results$Sensitivity, type = "l", col = "red", lwd = 2,
     ylim = c(0, 1), xlab = "Classification Threshold", ylab = "Metric Value",
     main = paste0("Sensitivity-Specificity Trade-off\n(", best_model_name, ")"))
lines(sweep_results$Threshold, sweep_results$Specificity, col = "blue", lwd = 2)
lines(sweep_results$Threshold, sweep_results$Accuracy, col = "darkgreen", lwd = 2)
lines(sweep_results$Threshold, sweep_results$Youden_J, col = "orange", lwd = 2, lty = 2)
abline(v = best_auc$Optimal_Threshold, col = "gray", lty = 3)
legend("right", legend = c("Sensitivity", "Specificity", "Accuracy", "Youden's J", "Optimal Threshold"),
       col = c("red", "blue", "darkgreen", "orange", "gray"),
       lwd = 2, lty = c(1, 1, 1, 2, 3), cex = 0.8)

Predicted Probability Distribution (Logistic Full)

This histogram shows why the default 0.5 threshold produces zero sensitivity. The predicted probabilities for both classes cluster between 0.05 and 0.25, well below 0.5. At a 0.5 cutoff, no patient is predicted as readmitted. The Youden-optimized threshold (~0.12) sits where the two distributions overlap, balancing sensitivity and specificity.

prob_df <- data.frame(
  Probability = probs_logit_full,
  Actual = factor(ifelse(as.numeric(as.character(test_data$Readmission_30Day)) == 1,
                         "Readmitted", "Not Readmitted"),
                  levels = c("Not Readmitted", "Readmitted"))
)

ggplot(prob_df, aes(x = Probability, fill = Actual)) +
  geom_histogram(bins = 50, alpha = 0.6, position = "identity") +
  geom_vline(xintercept = 0.5, color = "red", linetype = "dashed", size = 1) +
  geom_vline(xintercept = youden_results$Optimal_Threshold[youden_results$Model == "Logistic_Full"],
             color = "darkgreen", linetype = "dashed", size = 1) +
  annotate("text", x = 0.48, y = Inf, label = "Default (0.5)", color = "red",
           hjust = 1, vjust = 2, size = 3.5) +
  annotate("text", x = youden_results$Optimal_Threshold[youden_results$Model == "Logistic_Full"] + 0.02,
           y = Inf, label = "Youden Optimal", color = "darkgreen",
           hjust = 0, vjust = 2, size = 3.5) +
  scale_fill_manual(values = c("Not Readmitted" = "#3498db", "Readmitted" = "#e74c3c")) +
  labs(title = "Distribution of Predicted Probabilities (Logistic Regression Full)",
       subtitle = "Why the 0.5 threshold fails: all predictions cluster below 0.25",
       x = "Predicted Probability of Readmission", y = "Count", fill = "Actual Outcome") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"),
        plot.subtitle = element_text(hjust = 0.5),
        legend.position = "bottom")


Model Selection and Interpretation

cat("================================================================\n")
## ================================================================
cat("              MODEL SELECTION SUMMARY\n")
##               MODEL SELECTION SUMMARY
cat("================================================================\n\n")
## ================================================================
cat("DISCRIMINATIVE ABILITY (AUC):\n")
## DISCRIMINATIVE ABILITY (AUC):
cat("  Best overall model:", best_auc$Model, "(AUC =", best_auc$AUC, ")\n\n")
##   Best overall model: NaiveBayes_Full (AUC = 0.5613 )
cat("KEY FINDING -- CLASS IMBALANCE:\n")
## KEY FINDING -- CLASS IMBALANCE:
cat("  At the standard 0.5 threshold, all models achieve ~87% accuracy\n")
##   At the standard 0.5 threshold, all models achieve ~87% accuracy
cat("  with near-zero sensitivity. This is not a model failure; it\n")
##   with near-zero sensitivity. This is not a model failure; it
cat("  reflects the challenge of predicting a 13% base-rate event with\n")
##   reflects the challenge of predicting a 13% base-rate event with
cat("  modest predictor effect sizes. Youden's J threshold optimization\n")
##   modest predictor effect sizes. Youden's J threshold optimization
cat("  reveals the models' true discriminative ability.\n\n")
##   reveals the models' true discriminative ability.
cat("PRE-OP vs FULL MODEL COMPARISON:\n")
## PRE-OP vs FULL MODEL COMPARISON:
cat("  Full models (with post-op complications) generally achieve higher\n")
##   Full models (with post-op complications) generally achieve higher
cat("  AUC than pre-op-only models, confirming that post-operative events\n")
##   AUC than pre-op-only models, confirming that post-operative events
cat("  (AF, DSWI, stroke, AKI, chronic pain) carry additional predictive\n")
##   (AF, DSWI, stroke, AKI, chronic pain) carry additional predictive
cat("  information for readmission risk beyond baseline characteristics.\n\n")
##   information for readmission risk beyond baseline characteristics.
cat("CLINICALLY SIGNIFICANT PREDICTORS (Full Logistic Model):\n")
## CLINICALLY SIGNIFICANT PREDICTORS (Full Logistic Model):
cat("  See odds ratios above. Key predictors expected based on the CABG\n")
##   See odds ratios above. Key predictors expected based on the CABG
cat("  readmission literature include: Pre_Op_Albumin (protective),\n")
##   readmission literature include: Pre_Op_Albumin (protective),
cat("  Female Sex (risk factor), DSWI (strongest complication risk),\n")
##   Female Sex (risk factor), DSWI (strongest complication risk),
cat("  PostOp_Stroke, PostOp_AF, EuroSCORE_II, Diabetes, and Prior_MI.\n\n")
##   PostOp_Stroke, PostOp_AF, EuroSCORE_II, Diabetes, and Prior_MI.
cat("RECOMMENDATION:\n")
## RECOMMENDATION:
cat("  The Logistic Regression Full model is recommended as the primary\n")
##   The Logistic Regression Full model is recommended as the primary
cat("  model. It produces interpretable odds ratios consistent with\n")
##   model. It produces interpretable odds ratios consistent with
cat("  published literature, competitive AUC, and coefficients that can\n")
##   published literature, competitive AUC, and coefficients that can
cat("  directly inform clinical decision-making. For deployment, a\n")
##   directly inform clinical decision-making. For deployment, a
cat("  threshold calibrated via Youden's J (or a clinically defined\n")
##   threshold calibrated via Youden's J (or a clinically defined
cat("  cost-benefit analysis) should replace the default 0.5 cutoff.\n")
##   cost-benefit analysis) should replace the default 0.5 cutoff.
cat("================================================================\n")
## ================================================================

Next Best Actions

cat("NEXT BEST ACTIONS BASED ON MODEL RESULTS\n")
## NEXT BEST ACTIONS BASED ON MODEL RESULTS
cat("=========================================\n\n")
## =========================================
cat("1. IMPLEMENT A RISK-SCORING TOOL WITH OPTIMIZED THRESHOLD\n")
## 1. IMPLEMENT A RISK-SCORING TOOL WITH OPTIMIZED THRESHOLD
cat("   Deploy the full logistic model as a discharge risk score.\n")
##    Deploy the full logistic model as a discharge risk score.
cat("   Use a threshold calibrated by Youden's J rather than 0.50\n")
##    Use a threshold calibrated by Youden's J rather than 0.50
cat("   to maximize sensitivity for identifying high-risk patients,\n")
##    to maximize sensitivity for identifying high-risk patients,
cat("   accepting a higher false-positive rate as the cost of prevention.\n\n")
##    accepting a higher false-positive rate as the cost of prevention.
cat("2. PRIORITIZE NUTRITIONAL OPTIMIZATION\n")
## 2. PRIORITIZE NUTRITIONAL OPTIMIZATION
cat("   Pre-operative albumin is a modifiable predictor. Routine\n")
##    Pre-operative albumin is a modifiable predictor. Routine
cat("   nutritional screening and supplementation for patients with\n")
##    nutritional screening and supplementation for patients with
cat("   albumin below 3.5 g/dL before surgery is evidence-based.\n\n")
##    albumin below 3.5 g/dL before surgery is evidence-based.
cat("3. ENHANCED SURVEILLANCE FOR POST-OP COMPLICATIONS\n")
## 3. ENHANCED SURVEILLANCE FOR POST-OP COMPLICATIONS
cat("   DSWI and post-op stroke are strong readmission predictors.\n")
##    DSWI and post-op stroke are strong readmission predictors.
cat("   Patients who develop these complications should be enrolled\n")
##    Patients who develop these complications should be enrolled
cat("   in structured follow-up (phone calls at 7 and 14 days,\n")
##    in structured follow-up (phone calls at 7 and 14 days,
cat("   virtual care visits) per CCN quality standards.\n\n")
##    virtual care visits) per CCN quality standards.
cat("4. ADDRESS THE SEX DISPARITY\n")
## 4. ADDRESS THE SEX DISPARITY
cat("   Female sex is an independent predictor of readmission\n")
##    Female sex is an independent predictor of readmission
cat("   (consistent with published literature, OR ~1.29). This\n")
##    (consistent with published literature, OR ~1.29). This
cat("   warrants sex-specific discharge planning protocols.\n\n")
##    warrants sex-specific discharge planning protocols.
cat("5. ENRICH THE DATASET FOR FUTURE MODELING\n")
## 5. ENRICH THE DATASET FOR FUTURE MODELING
cat("   Modest AUC values suggest important predictors are missing.\n")
##    Modest AUC values suggest important predictors are missing.
cat("   Social determinants (income, social support, distance from\n")
##    Social determinants (income, social support, distance from
cat("   hospital), length of stay, and discharge disposition are\n")
##    hospital), length of stay, and discharge disposition are
cat("   known readmission predictors that could improve performance.\n")
##    known readmission predictors that could improve performance.

Export Results

write.csv(as.data.frame(youden_results), "cabg_model_comparison_results.csv", row.names = FALSE)
write.csv(as.data.frame(default_results), "cabg_model_default_threshold_results.csv", row.names = FALSE)
cat("Results exported to CSV.\n")
## Results exported to CSV.

Session Information

sessionInfo()
## R version 4.5.2 (2025-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=English_Canada.utf8  LC_CTYPE=English_Canada.utf8   
## [3] LC_MONETARY=English_Canada.utf8 LC_NUMERIC=C                   
## [5] LC_TIME=English_Canada.utf8    
## 
## time zone: America/Toronto
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] tidyr_1.3.2          scales_1.4.0         knitr_1.51          
##  [4] gridExtra_2.3        pROC_1.19.0.1        randomForest_4.7-1.2
##  [7] rpart.plot_3.1.4     rpart_4.1.24         e1071_1.7-17        
## [10] class_7.3-23         caret_7.0-1          lattice_0.22-7      
## [13] ggplot2_4.0.2        caTools_1.18.3       dplyr_1.2.0         
## [16] rstudioapi_0.18.0    pacman_0.5.1        
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6         xfun_0.56            bslib_0.10.0        
##  [4] recipes_1.3.1        vctrs_0.7.1          tools_4.5.2         
##  [7] bitops_1.0-9         generics_0.1.4       stats4_4.5.2        
## [10] parallel_4.5.2       proxy_0.4-29         tibble_3.3.1        
## [13] ModelMetrics_1.2.2.2 pkgconfig_2.0.3      Matrix_1.7-4        
## [16] data.table_1.18.2.1  RColorBrewer_1.1-3   S7_0.2.1            
## [19] lifecycle_1.0.5      compiler_4.5.2       farver_2.1.2        
## [22] stringr_1.6.0        codetools_0.2-20     htmltools_0.5.9     
## [25] sass_0.4.10          yaml_2.3.12          prodlim_2025.04.28  
## [28] pillar_1.11.1        jquerylib_0.1.4      MASS_7.3-65         
## [31] cachem_1.1.0         gower_1.0.2          iterators_1.0.14    
## [34] foreach_1.5.2        nlme_3.1-168         parallelly_1.46.1   
## [37] lava_1.8.2           tidyselect_1.2.1     digest_0.6.39       
## [40] stringi_1.8.7        future_1.69.0        reshape2_1.4.5      
## [43] purrr_1.2.1          listenv_0.10.0       labeling_0.4.3      
## [46] splines_4.5.2        fastmap_1.2.0        grid_4.5.2          
## [49] cli_3.6.5            magrittr_2.0.4       survival_3.8-3      
## [52] future.apply_1.20.2  withr_3.0.2          lubridate_1.9.5     
## [55] timechange_0.4.0     rmarkdown_2.30       globals_0.19.0      
## [58] otel_0.2.0           nnet_7.3-20          timeDate_4052.112   
## [61] evaluate_1.0.5       hardhat_1.4.2        rlang_1.1.7         
## [64] Rcpp_1.1.1           glue_1.8.0           ipred_0.9-15        
## [67] jsonlite_2.0.0       R6_2.6.1             plyr_1.8.9