library(randomForest)
library(fastshap)
library(dplyr)
library(caret)
library(pROC)
library(PRROC)
library(prg)
library(tidyr)
library(patchwork)
library(cowplot)
library(ggplot2)

1. Data

df_test     <- read.csv("~/Downloads/read-ai/df_test_f_v2.csv")
df_external <- read.csv("~/Downloads/read-ai/df_external_f_v2.csv")

cat(sprintf("Study 1 (internal): n=%d  IR=%d  Non-IR=%d  prevalence=%.3f\n",
  nrow(df_test),
  sum(df_test$IR_label == "IR"),
  sum(df_test$IR_label == "Non-IR"),
  mean(df_test$IR_label == "IR")))
## Study 1 (internal): n=97  IR=32  Non-IR=65  prevalence=0.330
cat(sprintf("Study 2 (external): n=%d  IR=%d  Non-IR=%d  prevalence=%.3f\n",
  nrow(df_external),
  sum(df_external$IR_label == "IR"),
  sum(df_external$IR_label == "Non-IR"),
  mean(df_external$IR_label == "IR")))
## Study 2 (external): n=61  IR=19  Non-IR=42  prevalence=0.311

2. Feature sets

feature_sets <- list(
  CGM_model = c(
    "bmi", "mean_fasting_glucose", "sd_fasting_glucose",
    "excursions_per_day", "rec_time_50pct_median",
    "overall_max_glucose", "age", "whr"
  ),
  Smartwatch_model = c(
    "stress_overall_daily_mean_stress", "sd_hr_day_night_diff",
    "age", "bmi", "whr"
  ),
  Baseline_model = c("bmi", "age", "whr")
)

model_names <- c("Model CGM", "Model smartwatch", "Model baseline")

for (nm in names(feature_sets)) {
  p <- length(feature_sets[[nm]])
  cat(sprintf("  %-20s  p = %d   mtry = %d\n", nm, p, floor(sqrt(p))))
}
##   CGM_model             p = 8   mtry = 2
##   Smartwatch_model      p = 5   mtry = 2
##   Baseline_model        p = 3   mtry = 1

3. Helper functions

standardize <- function(df, model_name) {
  df <- as.data.frame(df)
  df$Model <- model_name
  df
}

# AUPRG — Precision-Recall Gain curve (Flach & Kull, NeurIPS 2015).
# Random classifier → 0;  perfect classifier → 1.
# install.packages("prg") if needed.
compute_auprg <- function(labels, scores) {
  tryCatch({
    curve <- prg::create_prg_curve(labels, scores)
    prg::calc_auprg(curve)
  }, error = function(e) NA_real_)
}

4. Training function (bootstrap loop)

Trains an RF on the 70% training partition of each bootstrap split. mtry = floor(sqrt(p)) throughout; upsampling applied within each split. SHAP values are computed on each 30% holdout for internal visualisation.

train_model_with_shap <- function(features, df, train_idx) {

  X <- df %>%
    dplyr::select(all_of(features)) %>%
    mutate(across(where(is.numeric), as.double))

  y <- factor(trimws(df$IR_label))
  levels(y) <- make.names(levels(y))

  X_train <- X[train_idx, ];  X_test <- X[-train_idx, ]
  y_train <- y[train_idx];    y_test  <- y[-train_idx]

  p    <- length(features)
  mtry <- floor(sqrt(p))

  ctrl <- trainControl(
    method = "repeatedcv", number = 5, repeats = 20,
    sampling = "up", classProbs = TRUE,
    summaryFunction = twoClassSummary
  )

  model <- train(
    x = X_train, y = y_train,
    method = "rf", trControl = ctrl,
    preProcess = "medianImpute",
    tuneGrid = expand.grid(mtry = mtry),
    ntree = 300, metric = "ROC"
  )

  prob_test <- predict(model, X_test, type = "prob")[, "IR"]
  pred_test <- predict(model, X_test)

  cm_test  <- confusionMatrix(pred_test, y_test)
  auc_test <- as.numeric(auc(roc(y_test, prob_test,
                                  levels = c("Non.IR","IR"))))
  pr_test  <- pr.curve(scores.class0  = prob_test,
                       weights.class0 = ifelse(y_test == "IR", 1, 0),
                       curve = FALSE)

  internal_holdout_metrics <- data.frame(
    Accuracy          = cm_test$overall["Accuracy"],
    Sensitivity       = cm_test$byClass["Sensitivity"],
    Specificity       = cm_test$byClass["Specificity"],
    Balanced_Accuracy = (cm_test$byClass["Sensitivity"] +
                         cm_test$byClass["Specificity"]) / 2,
    Precision         = cm_test$byClass["Pos Pred Value"],
    F1 = 2 * cm_test$byClass["Pos Pred Value"] *
             cm_test$byClass["Sensitivity"] /
         (cm_test$byClass["Pos Pred Value"] +
          cm_test$byClass["Sensitivity"]),
    AUC   = auc_test,
    PRAUC = pr_test$auc.integral,
    AUPRG = compute_auprg(as.integer(y_test == "IR"), prob_test)
  )

  pfun <- function(object, newdata)
    predict(object, newdata, type = "prob")[, "IR"]

  shap_values <- fastshap::explain(
    object = model, X = as.data.frame(X_test),
    pred_wrapper = pfun, nsim = 50, adjust = TRUE
  )

  list(
    model                    = model,
    features                 = features,
    internal_holdout_metrics = internal_holdout_metrics,
    shap_validation          = shap_values,
    X_test_df                = as.data.frame(X_test)
  )
}

5. Fast final-model function

Fits the RF directly (no caret CV) on all internal data for a given seed. Hyperparameters are fixed a priori so cross-validation is not required here. Pipeline: medianImpute → upSample → randomForest.

fit_rf_seed <- function(features, df_train, seed) {
  set.seed(seed)
  p    <- length(features)
  mtry <- floor(sqrt(p))

  X_tr <- df_train %>%
    dplyr::select(all_of(features)) %>%
    mutate(across(where(is.numeric), as.double))
  y_tr <- factor(make.names(trimws(df_train$IR_label)))

  imp      <- caret::preProcess(X_tr, method = "medianImpute")
  X_tr_imp <- predict(imp, X_tr)
  up_data  <- caret::upSample(x = X_tr_imp, y = y_tr, yname = ".outcome")

  rf <- randomForest::randomForest(
    x = up_data[, names(X_tr_imp), drop = FALSE],
    y = up_data$.outcome,
    ntree = 300, mtry = mtry, nodesize = 1
  )

  list(rf = rf, imp = imp, features = features)
}

predict_rf <- function(fit_obj, df_new) {
  X_new <- df_new %>%
    dplyr::select(all_of(fit_obj$features)) %>%
    mutate(across(where(is.numeric), as.double))
  X_imp <- predict(fit_obj$imp, X_new)
  predict(fit_obj$rf, X_imp, type = "prob")[, "IR"]
}

6. Multi-seed external evaluation function

Fits 20 independent RF models (seeds 1–20) on all internal data. Ensemble = mean predicted probability across seeds per patient. AUC-ROC CI: DeLong on ensemble. AUPRC / AUPRG CI: 2,000-iteration bootstrap.

evaluate_multiseed <- function(features, df_train, df_ext,
                                N_SEEDS = 20, n_boot = 2000) {
  y_ext <- factor(make.names(trimws(df_ext$IR_label)))

  all_probs  <- matrix(NA_real_, nrow = nrow(df_ext), ncol = N_SEEDS)
  all_aucs   <- numeric(N_SEEDS)
  all_auprcs <- numeric(N_SEEDS)
  all_auprgs <- numeric(N_SEEDS)

  for (s in seq_len(N_SEEDS)) {
    fit  <- fit_rf_seed(features, df_train, seed = s)
    prob <- predict_rf(fit, df_ext)
    all_probs[, s] <- prob

    roc_s         <- roc(y_ext, prob, levels = c("Non.IR","IR"), quiet = TRUE)
    all_aucs[s]   <- as.numeric(auc(roc_s))
    all_auprcs[s] <- pr.curve(scores.class0  = prob,
                               weights.class0 = ifelse(y_ext == "IR", 1, 0),
                               curve = FALSE)$auc.integral
    all_auprgs[s] <- compute_auprg(as.integer(y_ext == "IR"), prob)
  }

  ens <- rowMeans(all_probs)

  # AUC-ROC: DeLong CI on ensemble
  roc_ens <- roc(y_ext, ens, levels = c("Non.IR","IR"), quiet = TRUE)
  ens_auc <- as.numeric(auc(roc_ens))
  dl      <- tryCatch(
    as.numeric(ci.auc(roc_ens, method = "delong", conf.level = 0.95)),
    error = function(e) c(NA, NA, NA))

  # AUPRC / AUPRG: point estimates on ensemble
  ens_auprc <- pr.curve(scores.class0  = ens,
                         weights.class0 = ifelse(y_ext == "IR", 1, 0),
                         curve = FALSE)$auc.integral
  ens_auprg <- compute_auprg(as.integer(y_ext == "IR"), ens)

  # Bootstrap CI for AUPRC and AUPRG on ensemble
  set.seed(42)
  n_ext <- length(y_ext)
  prauc_boot <- auprg_boot <- numeric(n_boot)
  for (i in seq_len(n_boot)) {
    idx <- sample(n_ext, n_ext, replace = TRUE)
    yb  <- y_ext[idx]; pb <- ens[idx]
    if (length(unique(yb)) < 2) { prauc_boot[i] <- NA; auprg_boot[i] <- NA; next }
    prauc_boot[i] <- tryCatch(
      pr.curve(scores.class0=pb, weights.class0=ifelse(yb=="IR",1,0),
               curve=FALSE)$auc.integral, error=function(e) NA_real_)
    auprg_boot[i] <- tryCatch(
      compute_auprg(as.integer(yb=="IR"), pb), error=function(e) NA_real_)
  }

  # Classification metrics on ensemble at 0.5 threshold
  ens_pred <- factor(ifelse(ens >= 0.5, "IR", "Non.IR"),
                     levels = c("Non.IR","IR"))
  cm_ens   <- confusionMatrix(ens_pred, y_ext)

  list(
    auc_mean    = round(mean(all_aucs),  3),
    auc_sd      = round(sd(all_aucs),    3),
    auc_range   = range(all_aucs),
    auprc_mean  = round(mean(all_auprcs),3),
    auprc_sd    = round(sd(all_auprcs),  3),
    auprg_mean  = round(mean(all_auprgs),3),
    auprg_sd    = round(sd(all_auprgs),  3),
    all_aucs    = all_aucs,
    all_auprcs  = all_auprcs,
    all_auprgs  = all_auprgs,
    ens_auc     = round(ens_auc,  3),
    ens_auc_lo  = round(dl[1],    3),
    ens_auc_hi  = round(dl[3],    3),
    ens_auprc   = round(ens_auprc,3),
    ens_auprc_lo= round(quantile(prauc_boot,0.025,na.rm=TRUE),3),
    ens_auprc_hi= round(quantile(prauc_boot,0.975,na.rm=TRUE),3),
    ens_auprg   = round(ens_auprg,3),
    ens_auprg_lo= round(quantile(auprg_boot,0.025,na.rm=TRUE),3),
    ens_auprg_hi= round(quantile(auprg_boot,0.975,na.rm=TRUE),3),
    ensemble_prob = ens,
    y_ext         = y_ext,
    cm            = cm_ens,
    N_SEEDS       = N_SEEDS
  )
}

7. Bootstrap loop — internal metrics

100 stratified 70/30 splits. AUC-ROC, AUPRC, and AUPRG are collected per iteration; CIs are 2.5–97.5 percentiles across iterations.

all_results  <- list()
all_shap_val <- list()

set.seed(123)
n_iter <- 100

for (i in seq_len(n_iter)) {
  y <- factor(trimws(df_test$IR_label))
  levels(y) <- make.names(levels(y))
  train_idx <- createDataPartition(y, p = 0.7, list = FALSE)

  CGM_model  <- train_model_with_shap(feature_sets$CGM_model,       df_test, train_idx)
  SW_model   <- train_model_with_shap(feature_sets$Smartwatch_model, df_test, train_idx)
  Base_model <- train_model_with_shap(feature_sets$Baseline_model,   df_test, train_idx)

  trained_models <- list(CGM_model, SW_model, Base_model)

  iter_results <- dplyr::bind_rows(
    standardize(CGM_model$internal_holdout_metrics,  "Model CGM")        |> mutate(Dataset = "Internal"),
    standardize(SW_model$internal_holdout_metrics,   "Model smartwatch") |> mutate(Dataset = "Internal"),
    standardize(Base_model$internal_holdout_metrics, "Model baseline")   |> mutate(Dataset = "Internal")
  ) %>% mutate(iteration = i)

  all_results[[i]] <- iter_results

  for (m in seq_along(trained_models)) {
    mdl <- trained_models[[m]]
    all_shap_val[[length(all_shap_val) + 1]] <- list(
      iteration  = i,
      model_name = model_names[m],
      shap       = mdl$shap_validation,
      X_df       = mdl$X_test_df
    )
  }

  if (i %% 10 == 0) cat(sprintf("  Iteration %d / %d\n", i, n_iter))
}
##   Iteration 10 / 100
##   Iteration 20 / 100
##   Iteration 30 / 100
##   Iteration 40 / 100
##   Iteration 50 / 100
##   Iteration 60 / 100
##   Iteration 70 / 100
##   Iteration 80 / 100
##   Iteration 90 / 100
##   Iteration 100 / 100
bootstrap_results <- dplyr::bind_rows(all_results)

final_summary <- bootstrap_results %>%
  group_by(Model, Dataset) %>%
  summarise(
    across(
      c(Accuracy, Sensitivity, Specificity, Balanced_Accuracy,
        Precision, F1, AUC, PRAUC, AUPRG),
      list(mean  = ~mean(.x,            na.rm = TRUE),
           lower = ~quantile(.x, 0.025, na.rm = TRUE),
           upper = ~quantile(.x, 0.975, na.rm = TRUE)),
      .names = "{.col}_{.fn}"
    ),
    .groups = "drop"
  )

cat("\nInternal performance summary (mean [2.5%, 97.5%]):\n")
## 
## Internal performance summary (mean [2.5%, 97.5%]):
print(
  final_summary %>%
    dplyr::select(Model,
                  AUC_mean,   AUC_lower,   AUC_upper,
                  PRAUC_mean, PRAUC_lower, PRAUC_upper,
                  AUPRG_mean, AUPRG_lower, AUPRG_upper) %>%
    dplyr::rename(
      AUC_ROC   = AUC_mean,   AUC_lo  = AUC_lower,   AUC_hi  = AUC_upper,
      AUPRC     = PRAUC_mean, AUPRC_lo= PRAUC_lower,  AUPRC_hi= PRAUC_upper,
      AUPRG_lo  = AUPRG_lower,AUPRG_hi= AUPRG_upper
    ) %>%
    mutate(across(where(is.numeric), ~round(.x, 3)))
)
## # A tibble: 3 × 10
##   Model        AUC_ROC AUC_lo AUC_hi AUPRC AUPRC_lo AUPRC_hi AUPRG_mean AUPRG_lo
##   <chr>          <dbl>  <dbl>  <dbl> <dbl>    <dbl>    <dbl>      <dbl>    <dbl>
## 1 Model CGM      0.833  0.684  0.953 0.725    0.488    0.915      0.742    0.363
## 2 Model basel…   0.817  0.655  0.933 0.644    0.419    0.853      0.68     0.359
## 3 Model smart…   0.812  0.663  0.933 0.637    0.43     0.875      0.677    0.356
## # ℹ 1 more variable: AUPRG_hi <dbl>
write.csv(bootstrap_results, "bootstrap_results.csv", row.names = FALSE)
write.csv(final_summary,     "bootstrap_summary.csv", row.names = FALSE)

8. External validation — 20-seed ensemble

Each model is refitted 20 times (seeds 1–20) on all 97 internal samples. The ensemble prediction is the mean probability across seeds per patient.

cat("Evaluating external cohort (20 seeds per model)...\n")
## Evaluating external cohort (20 seeds per model)...
ms_CGM  <- evaluate_multiseed(feature_sets$CGM_model,       df_test, df_external)
ms_SW   <- evaluate_multiseed(feature_sets$Smartwatch_model, df_test, df_external)
ms_Base <- evaluate_multiseed(feature_sets$Baseline_model,   df_test, df_external)

ms_list <- list(
  "CGM model"        = ms_CGM,
  "Smartwatch model" = ms_SW,
  "Baseline model"   = ms_Base
)

ms_summary <- dplyr::bind_rows(lapply(names(ms_list), function(nm) {
  ms <- ms_list[[nm]]
  data.frame(
    Model        = nm,
    AUC_mean     = ms$auc_mean,     AUC_sd       = ms$auc_sd,
    Ens_AUC      = ms$ens_auc,      Ens_AUC_lo   = ms$ens_auc_lo,
    Ens_AUC_hi   = ms$ens_auc_hi,
    AUPRC_mean   = ms$auprc_mean,   AUPRC_sd     = ms$auprc_sd,
    Ens_AUPRC    = ms$ens_auprc,    Ens_AUPRC_lo = ms$ens_auprc_lo,
    Ens_AUPRC_hi = ms$ens_auprc_hi,
    AUPRG_mean   = ms$auprg_mean,   AUPRG_sd     = ms$auprg_sd,
    Ens_AUPRG    = ms$ens_auprg,    Ens_AUPRG_lo = ms$ens_auprg_lo,
    Ens_AUPRG_hi = ms$ens_auprg_hi,
    stringsAsFactors = FALSE
  )
}))

cat("\nExternal validation results:\n")
## 
## External validation results:
print(ms_summary %>% mutate(across(where(is.numeric), ~round(.x, 3))))
##                     Model AUC_mean AUC_sd Ens_AUC Ens_AUC_lo Ens_AUC_hi
## 2.5%...1        CGM model    0.878  0.012   0.890      0.789      0.990
## 2.5%...2 Smartwatch model    0.794  0.011   0.797      0.674      0.920
## 2.5%...3   Baseline model    0.753  0.009   0.763      0.625      0.901
##          AUPRC_mean AUPRC_sd Ens_AUPRC Ens_AUPRC_lo Ens_AUPRC_hi AUPRG_mean
## 2.5%...1      0.802    0.032     0.825        0.633        0.954      0.866
## 2.5%...2      0.698    0.019     0.708        0.486        0.858      0.725
## 2.5%...3      0.568    0.035     0.582        0.373        0.826      0.633
##          AUPRG_sd Ens_AUPRG Ens_AUPRG_lo Ens_AUPRG_hi
## 2.5%...1    0.033     0.883        0.715        0.983
## 2.5%...2    0.025     0.740        0.364        0.934
## 2.5%...3    0.026     0.679        0.321        0.901
ext_probs <- list(
  "CGM model"        = ms_CGM$ensemble_prob,
  "Smartwatch model" = ms_SW$ensemble_prob,
  "Baseline model"   = ms_Base$ensemble_prob
)
y_ext_fac <- ms_CGM$y_ext
y_bin     <- as.integer(y_ext_fac == "IR")
ir_prev   <- mean(y_bin)

Full classification metrics on ensemble

compute_ext_full <- function(y_true, prob, model_name, n_boot = 2000, seed = 42) {
  n    <- length(y_true)
  pred <- factor(ifelse(prob >= 0.5, "IR", "Non.IR"), levels = c("Non.IR","IR"))
  cm   <- confusionMatrix(pred, y_true)
  sens <- as.numeric(cm$byClass["Sensitivity"])
  spec <- as.numeric(cm$byClass["Specificity"])
  prec <- as.numeric(cm$byClass["Pos Pred Value"])
  f1   <- ifelse(is.na(prec)|prec+sens==0, NA_real_, 2*prec*sens/(prec+sens))
  acc  <- as.numeric(cm$overall["Accuracy"])
  auprc<- pr.curve(scores.class0=prob, weights.class0=ifelse(y_true=="IR",1,0),
                   curve=FALSE)$auc.integral
  auprg<- compute_auprg(as.integer(y_true=="IR"), prob)

  set.seed(seed)
  mat <- matrix(NA_real_, nrow=n_boot, ncol=8,
                dimnames=list(NULL, c("sens","spec","prec","f1","acc",
                                      "balacc","prauc","auprg")))
  for (i in seq_len(n_boot)) {
    idx  <- sample(n, n, replace=TRUE)
    yb   <- y_true[idx]; pb_cls <- pred[idx]; pb_pr <- prob[idx]
    if (length(unique(yb)) < 2) next
    cm_b <- confusionMatrix(pb_cls, yb)
    s_b  <- as.numeric(cm_b$byClass["Sensitivity"])
    sp_b <- as.numeric(cm_b$byClass["Specificity"])
    pr_b <- as.numeric(cm_b$byClass["Pos Pred Value"])
    f1_b <- ifelse(is.na(pr_b)|pr_b+s_b==0, NA_real_, 2*pr_b*s_b/(pr_b+s_b))
    prauc_b <- tryCatch(
      pr.curve(scores.class0=pb_pr, weights.class0=ifelse(yb=="IR",1,0),
               curve=FALSE)$auc.integral, error=function(e) NA_real_)
    auprg_b <- tryCatch(
      compute_auprg(as.integer(yb=="IR"), pb_pr), error=function(e) NA_real_)
    mat[i,] <- c(s_b, sp_b, pr_b, f1_b, as.numeric(cm_b$overall["Accuracy"]),
                 (s_b+sp_b)/2, prauc_b, auprg_b)
  }
  ci <- apply(mat, 2, quantile, probs=c(0.025,0.975), na.rm=TRUE)

  data.frame(
    Model            = model_name,
    Sensitivity      = round(sens,3), Sensitivity_lo   = round(ci["2.5%","sens"],3),  Sensitivity_hi   = round(ci["97.5%","sens"],3),
    Specificity      = round(spec,3), Specificity_lo   = round(ci["2.5%","spec"],3),  Specificity_hi   = round(ci["97.5%","spec"],3),
    Balanced_Accuracy= round((sens+spec)/2,3),
    BalAcc_lo        = round(ci["2.5%","balacc"],3),   BalAcc_hi        = round(ci["97.5%","balacc"],3),
    Precision        = round(prec,3), Precision_lo     = round(ci["2.5%","prec"],3),  Precision_hi     = round(ci["97.5%","prec"],3),
    F1               = round(f1,3),   F1_lo            = round(ci["2.5%","f1"],3),    F1_hi            = round(ci["97.5%","f1"],3),
    Accuracy         = round(acc,3),  Accuracy_lo      = round(ci["2.5%","acc"],3),   Accuracy_hi      = round(ci["97.5%","acc"],3),
    AUPRC            = round(auprc,3),AUPRC_lo         = round(ci["2.5%","prauc"],3), AUPRC_hi         = round(ci["97.5%","prauc"],3),
    AUPRG            = round(auprg,3),AUPRG_lo         = round(ci["2.5%","auprg"],3), AUPRG_hi         = round(ci["97.5%","auprg"],3),
    stringsAsFactors = FALSE
  )
}

ext_full <- dplyr::bind_rows(lapply(names(ext_probs), function(nm)
  compute_ext_full(y_ext_fac, ext_probs[[nm]], nm)
)) %>%
  left_join(ms_summary %>%
              dplyr::select(Model, AUC_mean, AUC_sd, Ens_AUC,
                            Ens_AUC_lo, Ens_AUC_hi,
                            AUPRC_mean, AUPRC_sd, Ens_AUPRC,
                            Ens_AUPRC_lo, Ens_AUPRC_hi,
                            AUPRG_mean, AUPRG_sd, Ens_AUPRG,
                            Ens_AUPRG_lo, Ens_AUPRG_hi),
            by = "Model")

cat("Full external metrics:\n")
## Full external metrics:
print(ext_full %>% dplyr::select(Model, AUC_mean, AUC_sd, Ens_AUC_lo, Ens_AUC_hi,
                                  AUPRC_mean, AUPRC_sd, AUPRG_mean, AUPRG_sd,
                                  Sensitivity, Specificity, Balanced_Accuracy, F1))
##              Model AUC_mean AUC_sd Ens_AUC_lo Ens_AUC_hi AUPRC_mean AUPRC_sd
## 1        CGM model    0.878  0.012      0.789      0.990      0.802    0.032
## 2 Smartwatch model    0.794  0.011      0.674      0.920      0.698    0.019
## 3   Baseline model    0.753  0.009      0.625      0.901      0.568    0.035
##   AUPRG_mean AUPRG_sd Sensitivity Specificity Balanced_Accuracy    F1
## 1      0.866    0.033       0.842       0.810             0.826 0.744
## 2      0.725    0.025       0.737       0.714             0.726 0.622
## 3      0.633    0.026       0.737       0.690             0.714 0.609
write.csv(ext_full, "external_results.csv", row.names = FALSE)

9. Seed stability

seed_df <- dplyr::bind_rows(lapply(names(ms_list), function(nm) {
  data.frame(Model = nm,
             AUC   = ms_list[[nm]]$all_aucs,
             AUPRC = ms_list[[nm]]$all_auprcs,
             AUPRG = ms_list[[nm]]$all_auprgs)
})) %>% mutate(Model = factor(Model, levels = names(ms_list)))

model_colors <- c("CGM model"="#1f77b4","Smartwatch model"="#2ca02c",
                  "Baseline model"="#d62728")
model_labels <- c("CGM model"="CGM","Smartwatch model"="Smartwatch",
                  "Baseline model"="Baseline")

summ_seed <- seed_df %>%
  group_by(Model) %>%
  summarise(m_auc=mean(AUC),   sd_auc=sd(AUC),   mx_auc=max(AUC),
            m_auprc=mean(AUPRC),sd_auprc=sd(AUPRC),mx_auprc=max(AUPRC),
            m_auprg=mean(AUPRG),sd_auprg=sd(AUPRG),mx_auprg=max(AUPRG),
            .groups="drop")

make_seed_panel <- function(df, summ, y_var, y_lab, y_ref=NULL, title="") {
  m_col  <- paste0("m_",  tolower(y_var))
  sd_col <- paste0("sd_", tolower(y_var))
  mx_col <- paste0("mx_", tolower(y_var))
  p <- ggplot(df, aes(x=Model, y=.data[[y_var]], colour=Model)) +
    geom_jitter(width=0.12, alpha=0.65, size=2, show.legend=FALSE) +
    geom_crossbar(data=summ,
                  aes(x=Model, y=.data[[m_col]],
                      ymin=.data[[m_col]]-.data[[sd_col]],
                      ymax=.data[[m_col]]+.data[[sd_col]]),
                  width=0.35, linewidth=0.8, fill=NA, show.legend=FALSE) +
    geom_text(data=summ,
              aes(x=Model, y=.data[[mx_col]]+0.02,
                  label=sprintf("%.3f\n\u00b1%.3f", .data[[m_col]], .data[[sd_col]])),
              size=3, vjust=0, show.legend=FALSE) +
    scale_colour_manual(values=model_colors) +
    scale_x_discrete(labels=model_labels) +
    coord_cartesian(ylim=c(0.35,1.08)) +
    labs(title=title, x=NULL, y=y_lab) +
    theme_bw(base_size=11) +
    theme(plot.title=element_text(face="bold",size=11),
          panel.grid.minor=element_blank(),
          panel.grid.major.x=element_blank())
  if (!is.null(y_ref))
    p <- p + geom_hline(yintercept=y_ref, linetype="dashed",
                        colour="grey50", linewidth=0.7)
  p
}

p_seed_auc   <- make_seed_panel(seed_df, summ_seed, "AUC",   "AUC-ROC",  0.5,    "A  AUC-ROC")
p_seed_auprc <- make_seed_panel(seed_df, summ_seed, "AUPRC", "AUPRC",    ir_prev,"B  AUPRC") +
  annotate("text", x=0.6, y=ir_prev+0.05, label=sprintf("Chance (%.2f)", ir_prev),
           size=3, colour="grey50")
p_seed_auprg <- make_seed_panel(seed_df, summ_seed, "AUPRG", "AUPRG",    0,      "C  AUPRG") +
  annotate("text", x=0.6, y=0.05, label="Chance = 0", size=3, colour="grey50")

p_seed_auc | p_seed_auprc | p_seed_auprg


10. Decision curve analysis

Net benefit computed manually: NB(t) = TP/n − FP/n × t/(1−t). All models are evaluated on ensemble probabilities from the 20-seed fit.

n_ext      <- length(y_bin)
thresholds <- seq(0.05, 0.50, by = 0.01)

nb_fn <- function(y, prob, t) {
  pred <- as.integer(prob >= t)
  tp   <- sum(pred == 1 & y == 1)
  fp   <- sum(pred == 1 & y == 0)
  tp / n_ext - fp / n_ext * t / (1 - t)
}

dca_df <- do.call(rbind, lapply(thresholds, function(t) {
  data.frame(
    threshold   = t,
    model       = c("CGM","SW","Baseline","All","None"),
    net_benefit = c(
      nb_fn(y_bin, ext_probs[["CGM model"]],        t),
      nb_fn(y_bin, ext_probs[["Smartwatch model"]], t),
      nb_fn(y_bin, ext_probs[["Baseline model"]],   t),
      ir_prev - (1 - ir_prev) * t / (1 - t),
      0),
    stringsAsFactors = FALSE)
}))

p_dca <- ggplot(dca_df, aes(x=threshold, y=net_benefit,
                             colour=model, linetype=model)) +
  geom_line(linewidth=0.9) +
  scale_colour_manual(
    values = c("CGM"="#1f77b4","SW"="#2ca02c","Baseline"="#d62728",
               "All"="#888888","None"="#bbbbbb"),
    labels = c("CGM"="CGM model","SW"="Smartwatch model",
               "Baseline"="Baseline","All"="Treat all","None"="Treat none")) +
  scale_linetype_manual(
    values = c("CGM"="solid","SW"="dashed","Baseline"="dotdash",
               "All"="dotted","None"="dotted"),
    labels = c("CGM"="CGM model","SW"="Smartwatch model",
               "Baseline"="Baseline","All"="Treat all","None"="Treat none")) +
  geom_hline(yintercept=0, colour="black", linewidth=0.5) +
  geom_vline(xintercept=ir_prev, colour="black",
             linetype="dashed", linewidth=0.6, alpha=0.4) +
  annotate("text", x=ir_prev+0.01, y=0.38,
           label=sprintf("Prevalence\n(%.2f)", ir_prev),
           size=3, colour="grey40", hjust=0) +
  coord_cartesian(xlim=c(0.05,0.50), ylim=c(-0.06,0.42)) +
  labs(x="Threshold probability", y="Net benefit",
       colour=NULL, linetype=NULL) +
  theme_bw(base_size=12) +
  theme(legend.position="right", panel.grid.minor=element_blank())

print(p_dca)

ggsave("dca.pdf", p_dca, width=7.5, height=5, device=cairo_pdf)
## Warning in grSoftVersion(): unable to load shared object '/Library/Frameworks/R.framework/Resources/modules//R_X11.so':
##   dlopen(/Library/Frameworks/R.framework/Resources/modules//R_X11.so, 0x0006): Library not loaded: /opt/X11/lib/libSM.6.dylib
##   Referenced from: <7ECC4104-EC6A-38FD-9BEA-BFE0B870925C> /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/modules/R_X11.so
##   Reason: tried: '/opt/X11/lib/libSM.6.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/X11/lib/libSM.6.dylib' (no such file), '/opt/X11/lib/libSM.6.dylib' (no such file), '/Library/Frameworks/R.framework/Resources/lib/libSM.6.dylib' (no such file), '/Library/Java/JavaVirtualMachines/jdk-11.0.18+10/Contents/Home/lib/server/libSM.6.dylib' (no such file)
## Warning in cairoVersion(): unable to load shared object '/Library/Frameworks/R.framework/Resources/library/grDevices/libs//cairo.so':
##   dlopen(/Library/Frameworks/R.framework/Resources/library/grDevices/libs//cairo.so, 0x0006): Library not loaded: /opt/X11/lib/libXrender.1.dylib
##   Referenced from: <263B701F-2238-3478-A7D2-B7513231EF98> /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/grDevices/libs/cairo.so
##   Reason: tried: '/opt/X11/lib/libXrender.1.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/X11/lib/libXrender.1.dylib' (no such file), '/opt/X11/lib/libXrender.1.dylib' (no such file), '/Library/Frameworks/R.framework/Resources/lib/libXrender.1.dylib' (no such file), '/Library/Java/JavaVirtualMachines/jdk-11.0.18+10/Contents/Home/lib/server/libXrender.1.dylib' (no such file)
## Warning in device(filename = "dca.pdf", width = 7.5, height = 5, bg = "white"):
## failed to load cairo DLL
cat(sprintf("\nNet benefit at prevalence threshold (%.2f):\n", ir_prev))
## 
## Net benefit at prevalence threshold (0.31):
for (m in c("CGM","SW","Baseline","All","None")) {
  sub <- dca_df[dca_df$model == m, ]
  nb  <- sub$net_benefit[which.min(abs(sub$threshold - ir_prev))]
  cat(sprintf("  %-10s  NB = %.4f\n", m, nb))
}
##   CGM         NB = 0.1314
##   SW          NB = 0.1224
##   Baseline    NB = 0.1003
##   All         NB = 0.0021
##   None        NB = 0.0000
write.csv(dca_df, "dca_results.csv", row.names = FALSE)

11. SHAP feature names

feature_name_map <- c(
  "bmi"                              = "BMI",
  "mean_fasting_glucose"             = "Mean Fasting Glucose",
  "sd_fasting_glucose"               = "SD Fasting Glucose",
  "excursions_per_day"               = "Glucose Excursions / Day",
  "rec_time_50pct_median"            = "Recovery Time (50th pct)",
  "overall_max_glucose"              = "Overall Max Glucose",
  "age"                              = "Age",
  "whr"                              = "WHR",
  "stress_overall_daily_mean_stress" = "Mean Stress (HRV-based)",
  "sd_hr_day_night_diff"             = "HR Day-Night Variability"
)

rename_features <- function(x)
  ifelse(x %in% names(feature_name_map), feature_name_map[x], x)

12. SHAP helpers

aggregate_shap <- function(shap_list, model_filter) {
  entries <- Filter(function(x) x$model_name == model_filter, shap_list)
  mat     <- do.call(rbind, lapply(entries,
               function(e) colMeans(abs(as.data.frame(e$shap)))))
  data.frame(feature=colnames(mat), mean_shap=colMeans(mat),
             sd_shap=apply(mat,2,sd), row.names=NULL) %>%
    arrange(desc(mean_shap)) %>%
    mutate(pct_share = mean_shap / sum(mean_shap) * 100,
           feature   = rename_features(feature))
}

collect_shap_long <- function(shap_list, model_filter) {
  entries <- Filter(function(x) x$model_name == model_filter, shap_list)
  bind_rows(lapply(entries, function(e) {
    sd <- as.data.frame(e$shap) %>%
      mutate(obs=row_number(), iteration=e$iteration) %>%
      pivot_longer(-c(obs,iteration), names_to="feature", values_to="shap_value") %>%
      mutate(feature=rename_features(feature))
    fd <- as.data.frame(e$X_df) %>%
      mutate(obs=row_number()) %>%
      pivot_longer(-obs, names_to="feature", values_to="feature_value") %>%
      mutate(feature=rename_features(feature))
    left_join(sd, fd, by=c("obs","feature"))
  }))
}

rank_stability_df <- function(shap_list, model_filter) {
  entries  <- Filter(function(x) x$model_name == model_filter, shap_list)
  mat      <- do.call(rbind, lapply(entries,
               function(e) colMeans(abs(as.data.frame(e$shap)))))
  rank_mat <- t(apply(mat,1,rank))
  data.frame(feature=colnames(rank_mat), mean_rank=colMeans(rank_mat),
             iqr_rank=apply(rank_mat,2,IQR), row.names=NULL) %>%
    arrange(mean_rank) %>%
    mutate(feature=rename_features(feature))
}

panel_beeswarm <- function(shap_list, model_filter,
                           shared_x_range=NULL, is_external=FALSE) {
  agg  <- aggregate_shap(shap_list, model_filter)
  long <- collect_shap_long(shap_list, model_filter)
  lmap <- agg %>%
    mutate(label=paste0(feature," (",round(pct_share,1),"%)")) %>%
    dplyr::select(feature, label)
  long <- long %>%
    left_join(lmap, by="feature") %>%
    group_by(feature) %>%
    mutate(feat_scaled=(feature_value-min(feature_value,na.rm=TRUE)) /
             (max(feature_value,na.rm=TRUE)-min(feature_value,na.rm=TRUE)+1e-9)) %>%
    ungroup() %>%
    mutate(label=factor(label, levels=lmap$label[match(rev(agg$feature),lmap$feature)]))
  p <- ggplot(long, aes(x=shap_value, y=label, colour=feat_scaled)) +
    geom_jitter(height=0.18, alpha=ifelse(is_external,0.9,0.5), size=1.5, stroke=0) +
    scale_colour_gradient(low="#3B82C4", high="#E84545",
                          name="Feature\nvalue", breaks=c(0,1), labels=c("Low","High")) +
    geom_vline(xintercept=0, linetype="dashed", colour="grey50", linewidth=0.4) +
    labs(title=model_filter, x="SHAP value", y=NULL) +
    theme_bw(base_size=11) +
    theme(plot.title=element_text(face="bold",hjust=0.5,size=11),
          legend.position="none", panel.grid.minor=element_blank(),
          axis.text.y=element_text(size=9))
  if (!is.null(shared_x_range)) p <- p + coord_cartesian(xlim=shared_x_range)
  p
}

panel_bar <- function(shap_list, model_filter, shared_x_max=NULL) {
  agg <- aggregate_shap(shap_list, model_filter) %>%
    mutate(label=paste0(feature," (",round(pct_share,1),"%)"),
           label=factor(label, levels=rev(paste0(feature," (",round(pct_share,1),"%)"))),
           ymin_eb=pmax(mean_shap-sd_shap,0), ymax_eb=mean_shap+sd_shap)
  p <- ggplot(agg, aes(x=mean_shap, y=label)) +
    geom_col(fill="#4C72B0", alpha=0.85) +
    geom_errorbarh(aes(xmin=ymin_eb,xmax=ymax_eb), height=0.3, colour="grey30") +
    labs(title=model_filter, x="Mean |SHAP|", y=NULL) +
    theme_bw(base_size=11) +
    theme(plot.title=element_text(face="bold",hjust=0.5,size=11),
          panel.grid.minor=element_blank(), axis.text.y=element_text(size=9))
  if (!is.null(shared_x_max)) p <- p + coord_cartesian(xlim=c(0,shared_x_max))
  p
}

panel_rank <- function(shap_list, model_filter, shared_x_range=NULL) {
  rs <- rank_stability_df(shap_list, model_filter) %>%
    mutate(label=factor(feature, levels=rev(feature)))
  p <- ggplot(rs, aes(x=mean_rank, y=label)) +
    geom_point(size=3, colour="#2ca02c") +
    geom_errorbarh(aes(xmin=mean_rank-iqr_rank/2,xmax=mean_rank+iqr_rank/2),
                   height=0.3, colour="grey40") +
    scale_x_reverse() +
    labs(title=model_filter, x="Mean rank (lower = more important)", y=NULL) +
    theme_bw(base_size=11) +
    theme(plot.title=element_text(face="bold",hjust=0.5,size=11),
          panel.grid.minor=element_blank(), axis.text.y=element_text(size=9))
  if (!is.null(shared_x_range)) p <- p + coord_cartesian(xlim=shared_x_range)
  p
}

make_shap_figure <- function(shap_list, dataset_label, model_names_vec,
                              plot_type=c("beeswarm","bar","rank"),
                              panel_labels=LETTERS[seq_along(model_names_vec)]) {
  plot_type <- match.arg(plot_type)
  if (plot_type=="beeswarm") {
    all_vals <- unlist(lapply(model_names_vec, function(mn)
      unlist(lapply(Filter(function(x) x$model_name==mn, shap_list),
                   function(e) as.numeric(as.matrix(e$shap))))))
    srng <- c(-max(abs(all_vals),na.rm=TRUE)*1.05,
               max(abs(all_vals),na.rm=TRUE)*1.05)
  }
  if (plot_type=="bar") {
    am <- unlist(lapply(model_names_vec, function(mn) {
      a <- aggregate_shap(shap_list,mn); a$mean_shap+a$sd_shap }))
    sxmax <- max(am,na.rm=TRUE)*1.05
  }
  if (plot_type=="rank") {
    nf   <- max(unlist(lapply(model_names_vec, function(mn)
      nrow(rank_stability_df(shap_list,mn)))))
    srng <- c(nf+0.5,0.5)
  }
  panels <- lapply(seq_along(model_names_vec), function(j) {
    mn <- model_names_vec[rev(seq_along(model_names_vec))[j]]
    p  <- switch(plot_type,
      beeswarm=panel_beeswarm(shap_list,mn,shared_x_range=srng),
      bar=panel_bar(shap_list,mn,shared_x_max=sxmax),
      rank=panel_rank(shap_list,mn,shared_x_range=srng))
    p + labs(tag=panel_labels[j]) +
      theme(plot.tag=element_text(face="bold",size=13))
  })
  subtitle <- switch(plot_type,
    beeswarm="Each point = one observation x one bootstrap iteration. Labels = mean |SHAP| share (%).",
    bar="Error bars = ±1 SD across bootstrap iterations. Labels = mean |SHAP| share (%).",
    rank="Error bars = IQR across bootstrap iterations. Leftward = more important.")
  if (plot_type=="beeswarm") {
    dummy <- ggplot(data.frame(x=0,y=0,v=c(0,1)),aes(x,y,colour=v)) +
      geom_point() +
      scale_colour_gradient(low="#3B82C4",high="#E84545",name="Feature\nvalue",
                            breaks=c(0,1),labels=c("Low","High")) +
      theme(legend.position="right",legend.title=element_text(size=9),
            legend.text=element_text(size=8),legend.key.height=unit(1.2,"cm"))
    leg <- cowplot::get_legend(dummy)
    combined <- (Reduce(`|`,panels)|patchwork::wrap_elements(leg)) +
      plot_layout(widths=c(rep(1,length(panels)),0.15))
  } else {
    combined <- Reduce(`|`,panels)
  }
  combined + plot_annotation(
    title=paste0(switch(plot_type,beeswarm="SHAP Summary",
                        bar="Mean |SHAP| Feature Importance",
                        rank="Feature Rank Stability")," — ",dataset_label),
    caption=subtitle,
    theme=theme(plot.title=element_text(face="bold",hjust=0.5,size=13),
                plot.caption=element_text(size=10,colour="grey40",hjust=0.5)))
}

13. SHAP — internal validation

Aggregated across 100 bootstrap iterations. Each iteration contributes one SHAP matrix from its own model trained on 70% of the internal data.

for (pt in c("beeswarm","bar","rank")) {
  fig <- make_shap_figure(all_shap_val, "Internal Validation", model_names, pt)
  print(fig)
  ggsave(sprintf("shap_%s_internal.pdf", pt), fig,
         width=15, height=5.5, device=cairo_pdf)
}


14. SHAP — external test set

SHAP values are averaged across all 20 seeds: for each seed the model is refitted on all internal data and SHAP is computed for the 61 external patients. The 20 matrices are averaged element-wise before plotting. This ensures the SHAP figure reflects the same ensemble used for reported metrics.

compute_ensemble_shap <- function(features, df_train, df_ext,
                                   N_SEEDS = 20, nsim = 50) {
  p    <- length(features)
  mtry <- floor(sqrt(p))

  X_ext_raw <- df_ext %>%
    dplyr::select(all_of(features)) %>%
    mutate(across(where(is.numeric), as.double))

  shap_matrices <- vector("list", N_SEEDS)
  X_ext_imp_ref <- NULL

  for (s in seq_len(N_SEEDS)) {
    set.seed(s)
    X_tr <- df_train %>%
      dplyr::select(all_of(features)) %>%
      mutate(across(where(is.numeric), as.double))
    y_tr <- factor(make.names(trimws(df_train$IR_label)))

    imp      <- caret::preProcess(X_tr, method = "medianImpute")
    X_tr_imp <- predict(imp, X_tr)
    X_ext_imp <- predict(imp, X_ext_raw)

    up_data  <- caret::upSample(x=X_tr_imp, y=y_tr, yname=".outcome")
    rf       <- randomForest::randomForest(
      x=up_data[,names(X_tr_imp),drop=FALSE], y=up_data$.outcome,
      ntree=300, mtry=mtry, nodesize=1)

    shap_s <- fastshap::explain(
      object       = rf,
      X            = as.data.frame(X_ext_imp),
      pred_wrapper = function(object, newdata)
        predict(object, newdata, type="prob")[,"IR"],
      nsim   = nsim,
      adjust = TRUE
    )

    shap_matrices[[s]] <- as.matrix(shap_s)
    if (s == 1) X_ext_imp_ref <- as.data.frame(X_ext_imp)

    cat(sprintf("    seed %2d / %d\n", s, N_SEEDS))
  }

  shap_avg <- as.data.frame(Reduce("+", shap_matrices) / N_SEEDS)
  list(shap = shap_avg, X_df = X_ext_imp_ref)
}

cat("Computing ensemble SHAP for external cohort (20 seeds x 3 models)...\n")
## Computing ensemble SHAP for external cohort (20 seeds x 3 models)...
cat("  CGM model:\n")
##   CGM model:
es_CGM  <- compute_ensemble_shap(feature_sets$CGM_model,       df_test, df_external)
##     seed  1 / 20
##     seed  2 / 20
##     seed  3 / 20
##     seed  4 / 20
##     seed  5 / 20
##     seed  6 / 20
##     seed  7 / 20
##     seed  8 / 20
##     seed  9 / 20
##     seed 10 / 20
##     seed 11 / 20
##     seed 12 / 20
##     seed 13 / 20
##     seed 14 / 20
##     seed 15 / 20
##     seed 16 / 20
##     seed 17 / 20
##     seed 18 / 20
##     seed 19 / 20
##     seed 20 / 20
cat("  Smartwatch model:\n")
##   Smartwatch model:
es_SW   <- compute_ensemble_shap(feature_sets$Smartwatch_model, df_test, df_external)
##     seed  1 / 20
##     seed  2 / 20
##     seed  3 / 20
##     seed  4 / 20
##     seed  5 / 20
##     seed  6 / 20
##     seed  7 / 20
##     seed  8 / 20
##     seed  9 / 20
##     seed 10 / 20
##     seed 11 / 20
##     seed 12 / 20
##     seed 13 / 20
##     seed 14 / 20
##     seed 15 / 20
##     seed 16 / 20
##     seed 17 / 20
##     seed 18 / 20
##     seed 19 / 20
##     seed 20 / 20
cat("  Baseline model:\n")
##   Baseline model:
es_Base <- compute_ensemble_shap(feature_sets$Baseline_model,   df_test, df_external)
##     seed  1 / 20
##     seed  2 / 20
##     seed  3 / 20
##     seed  4 / 20
##     seed  5 / 20
##     seed  6 / 20
##     seed  7 / 20
##     seed  8 / 20
##     seed  9 / 20
##     seed 10 / 20
##     seed 11 / 20
##     seed 12 / 20
##     seed 13 / 20
##     seed 14 / 20
##     seed 15 / 20
##     seed 16 / 20
##     seed 17 / 20
##     seed 18 / 20
##     seed 19 / 20
##     seed 20 / 20
all_shap_ext_final <- list(
  list(iteration=1, model_name="Model CGM",        shap=es_CGM$shap,  X_df=es_CGM$X_df),
  list(iteration=1, model_name="Model smartwatch", shap=es_SW$shap,   X_df=es_SW$X_df),
  list(iteration=1, model_name="Model baseline",   shap=es_Base$shap, X_df=es_Base$X_df)
)
make_shap_ext_figure <- function(shap_list, model_names_vec,
                                  panel_labels=LETTERS[seq_along(model_names_vec)]) {
  all_vals <- unlist(lapply(model_names_vec, function(mn)
    unlist(lapply(Filter(function(x) x$model_name==mn, shap_list),
                 function(e) as.numeric(as.matrix(e$shap))))))
  xmax <- max(abs(all_vals),na.rm=TRUE)*1.05; srng <- c(-xmax,xmax)
  panels <- lapply(seq_along(model_names_vec), function(j) {
    panel_beeswarm(shap_list,model_names_vec[j],
                   shared_x_range=srng, is_external=TRUE) +
      labs(tag=panel_labels[j]) +
      theme(plot.tag=element_text(face="bold",size=13))
  })
  dummy <- ggplot(data.frame(x=0,y=0,v=c(0,1)),aes(x,y,colour=v)) +
    geom_point() +
    scale_colour_gradient(low="#3B82C4",high="#E84545",name="Feature\nvalue",
                          breaks=c(0,1),labels=c("Low","High")) +
    theme(legend.position="right",legend.title=element_text(size=9),
          legend.text=element_text(size=8),legend.key.height=unit(1.2,"cm"))
  leg <- cowplot::get_legend(dummy)
  (Reduce(`|`,panels)|patchwork::wrap_elements(leg)) +
    plot_layout(widths=c(rep(1,length(panels)),0.15)) +
    plot_annotation(
      title   = "SHAP Summary — External Test Set",
      caption = paste0("Each point = one external patient. ",
                       "SHAP values averaged across 20 independently fitted models. ",
                       "Labels = mean |SHAP| share (%)."),
      theme   = theme(
        plot.title  = element_text(face="bold",hjust=0.5,size=13),
        plot.caption= element_text(size=10,colour="grey40",hjust=0.5)))
}

fig_ext <- make_shap_ext_figure(all_shap_ext_final, model_names)
print(fig_ext)

ggsave("shap_beeswarm_external.pdf", fig_ext, width=15, height=5.5, device=cairo_pdf)

15. Combined summary figure

(p_dca        + labs(title="A  Decision curve analysis")) |
(p_seed_auc   + labs(title="B  AUC-ROC seed stability"))  |
(p_seed_auprc + labs(title="C  AUPRC seed stability"))    |
(p_seed_auprg + labs(title="D  AUPRG seed stability"))


16. Session info

sessionInfo()
## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sequoia 15.5
## 
## Matrix products: default
## BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Europe/Zurich
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] cowplot_1.2.0        patchwork_1.3.2      tidyr_1.3.2         
##  [4] prg_0.5.1            PRROC_1.4            rlang_1.2.0         
##  [7] pROC_1.19.0.1        caret_7.0-1          lattice_0.22-7      
## [10] ggplot2_4.0.1        dplyr_1.2.0          fastshap_0.1.1      
## [13] randomForest_4.7-1.2
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1     timeDate_4052.112    farver_2.1.2        
##  [4] S7_0.2.1             fastmap_1.2.0        digest_0.6.39       
##  [7] rpart_4.1.24         timechange_0.3.0     lifecycle_1.0.5     
## [10] survival_3.8-3       magrittr_2.0.4       compiler_4.5.2      
## [13] sass_0.4.10          tools_4.5.2          utf8_1.2.6          
## [16] yaml_2.3.12          data.table_1.18.0    knitr_1.51          
## [19] labeling_0.4.3       plyr_1.8.9           RColorBrewer_1.1-3  
## [22] withr_3.0.2          purrr_1.2.0          nnet_7.3-20         
## [25] grid_4.5.2           stats4_4.5.2         e1071_1.7-17        
## [28] future_1.69.0        globals_0.19.1       scales_1.4.0        
## [31] iterators_1.0.14     MASS_7.3-65          cli_3.6.6           
## [34] rmarkdown_2.30       generics_0.1.4       otel_0.2.0          
## [37] rstudioapi_0.17.1    future.apply_1.20.2  reshape2_1.4.5      
## [40] cachem_1.1.0         proxy_0.4-29         stringr_1.6.0       
## [43] splines_4.5.2        parallel_4.5.2       vctrs_0.7.2         
## [46] hardhat_1.4.2        Matrix_1.7-4         jsonlite_2.0.0      
## [49] listenv_0.10.1       foreach_1.5.2        gower_1.0.2         
## [52] jquerylib_0.1.4      recipes_1.3.1        glue_1.8.0          
## [55] parallelly_1.46.1    codetools_0.2-20     lubridate_1.9.4     
## [58] stringi_1.8.7        gtable_0.3.6         tibble_3.3.0        
## [61] pillar_1.11.1        htmltools_0.5.9      ipred_0.9-15        
## [64] lava_1.8.2           R6_2.6.1             evaluate_1.0.5      
## [67] bslib_0.9.0          class_7.3-23         Rcpp_1.1.1          
## [70] nlme_3.1-168         prodlim_2026.03.11   xfun_0.55           
## [73] pkgconfig_2.0.3      ModelMetrics_1.2.2.2