library(randomForest)   # direct RF fitting for final models (fast, no CV)
library(fastshap)
library(dplyr)
library(caret)
library(pROC)
library(PRROC)
library(prg)            # Precision-Recall Gain curves (Flach & Kull 2015)
library(tidyr)
library(patchwork)
library(cowplot)
library(ggplot2)

1 · Data

df_test     <- read.csv("~/Downloads/read-ai/df_test_f_v2.csv")
df_external <- read.csv("~/Downloads/read-ai/df_external_f_v2.csv")
df_sleep    <- read.csv("~/Downloads/sleep_participant_summary-3.csv")

# Keep only the 3 sleep features selected by forward-selection analysis
sleep_cols <- c("participant_id",
                "mean_midpoint_clock_h",
                "sri",
                "social_jetlag_h")

df_test     <- left_join(df_test,     df_sleep[, sleep_cols], by = "participant_id")
df_external <- left_join(df_external, df_sleep[, sleep_cols], by = "participant_id")

cat(sprintf(
  "Internal : n=%d  IR=%d  Non-IR=%d  sleep_missing=%d\n",
  nrow(df_test),
  sum(df_test$IR_label == "IR"),
  sum(df_test$IR_label == "Non-IR"),
  sum(is.na(df_test$mean_midpoint_clock_h))
))
## Internal : n=97  IR=32  Non-IR=65  sleep_missing=2
cat(sprintf(
  "External : n=%d  IR=%d  Non-IR=%d  sleep_missing=%d\n",
  nrow(df_external),
  sum(df_external$IR_label == "IR"),
  sum(df_external$IR_label == "Non-IR"),
  sum(is.na(df_external$mean_midpoint_clock_h))
))
## External : n=61  IR=19  Non-IR=42  sleep_missing=0

2 · Feature sets & model names

feature_sets <- list(
  CGM_model = c(
    "bmi", "mean_fasting_glucose", "sd_fasting_glucose",
    "excursions_per_day", "rec_time_50pct_median",
    "overall_max_glucose", "age", "whr"
  ),
  Smartwatch_only_model = c(
    "stress_overall_daily_mean_stress", "sd_hr_day_night_diff",
    "age", "bmi", "whr"
  ),
  Baseline_model = c("bmi", "age", "whr")
)

model_names <- c("Model CGM", "Model smartwatch", "Model baseline")

for (nm in names(feature_sets)) {
  p <- length(feature_sets[[nm]])
  cat(sprintf("  %-25s  p=%d  mtry=%d\n", nm, p, floor(sqrt(p))))
}
##   CGM_model                  p=8  mtry=2
##   Smartwatch_only_model      p=5  mtry=2
##   Baseline_model             p=3  mtry=1

3 · Helper functions

standardize <- function(df, model_name) {
  df <- as.data.frame(df); df$Model <- model_name; df
}

# AUPRG — Precision-Recall Gain curve (Flach & Kull, NeurIPS 2015)
compute_auprg <- function(labels, scores) {
  tryCatch({
    curve <- prg::create_prg_curve(labels, scores)
    prg::calc_auprg(curve)
  }, error = function(e) NA_real_)
}

compute_metrics <- function(y_true, y_pred, y_prob,
                            dataset = "Unknown",
                            n_boot  = 2000,
                            seed    = 42) {

  cm     <- confusionMatrix(y_pred, y_true)
  acc    <- as.numeric(cm$overall["Accuracy"])
  sens   <- as.numeric(cm$byClass["Sensitivity"])
  spec   <- as.numeric(cm$byClass["Specificity"])
  balacc <- (sens + spec) / 2
  prec   <- as.numeric(cm$byClass["Pos Pred Value"])
  f1     <- ifelse(is.na(prec) | prec + sens == 0, NA_real_,
                   2 * prec * sens / (prec + sens))

  roc_obj <- roc(y_true, y_prob, levels = c("Non.IR", "IR"), quiet = TRUE)
  auc_val <- as.numeric(auc(roc_obj))

  pr_obj <- pr.curve(scores.class0  = y_prob,
                     weights.class0 = ifelse(y_true == "IR", 1, 0),
                     curve = FALSE)
  prauc  <- pr_obj$auc.integral

  auc_lo <- NA_real_; auc_hi <- NA_real_; auc_method <- "DeLong"
  tryCatch({
    dl     <- as.numeric(ci.auc(roc_obj, method = "delong", conf.level = 0.95))
    auc_lo <- dl[1]; auc_hi <- dl[3]
  }, error = function(e) { auc_method <<- "Bootstrap" })

  set.seed(seed)
  n   <- length(y_true)
  mat <- matrix(NA_real_, nrow = n_boot, ncol = 7,
                dimnames = list(NULL,
                  c("acc","sens","spec","balacc","prec","f1","prauc")))

  for (i in seq_len(n_boot)) {
    idx_b  <- sample(n, n, replace = TRUE)
    yb     <- y_true[idx_b]; pb_cls <- y_pred[idx_b]; pb_pr <- y_prob[idx_b]
    if (length(unique(yb)) < 2) next
    cm_b  <- confusionMatrix(pb_cls, yb)
    s_b   <- as.numeric(cm_b$byClass["Sensitivity"])
    sp_b  <- as.numeric(cm_b$byClass["Specificity"])
    pr_b  <- as.numeric(cm_b$byClass["Pos Pred Value"])
    f1_b  <- ifelse(is.na(pr_b) | pr_b + s_b == 0, NA_real_,
                    2 * pr_b * s_b / (pr_b + s_b))
    prauc_b <- tryCatch(
      pr.curve(scores.class0  = pb_pr,
               weights.class0 = ifelse(yb == "IR", 1, 0),
               curve = FALSE)$auc.integral,
      error = function(e) NA_real_)
    mat[i, ] <- c(as.numeric(cm_b$overall["Accuracy"]),
                  s_b, sp_b, (s_b + sp_b) / 2, pr_b, f1_b, prauc_b)
  }
  ci <- apply(mat, 2, quantile, probs = c(0.025, 0.975), na.rm = TRUE)

  if (is.na(auc_lo)) {
    auc_lo <- ci["2.5%",  "auc"]
    auc_hi <- ci["97.5%", "auc"]
  }

  data.frame(
    Dataset            = dataset,
    AUC_ROC            = round(auc_val, 3),
    AUC_lo             = round(auc_lo,  3),
    AUC_hi             = round(auc_hi,  3),
    AUC_CI_method      = auc_method,
    AUPRC              = round(prauc,   3),
    AUPRC_lo           = round(ci["2.5%",  "prauc"], 3),
    AUPRC_hi           = round(ci["97.5%", "prauc"], 3),
    Sensitivity        = round(sens,    3),
    Sensitivity_lo     = round(ci["2.5%",  "sens"],  3),
    Sensitivity_hi     = round(ci["97.5%", "sens"],  3),
    Specificity        = round(spec,    3),
    Specificity_lo     = round(ci["2.5%",  "spec"],  3),
    Specificity_hi     = round(ci["97.5%", "spec"],  3),
    Balanced_Accuracy  = round(balacc,  3),
    BalAcc_lo          = round(ci["2.5%",  "balacc"],3),
    BalAcc_hi          = round(ci["97.5%", "balacc"],3),
    Precision          = round(prec,    3),
    Precision_lo       = round(ci["2.5%",  "prec"],  3),
    Precision_hi       = round(ci["97.5%", "prec"],  3),
    F1                 = round(f1,      3),
    F1_lo              = round(ci["2.5%",  "f1"],    3),
    F1_hi              = round(ci["97.5%", "f1"],    3),
    Accuracy           = round(acc,     3),
    Accuracy_lo        = round(ci["2.5%",  "acc"],   3),
    Accuracy_hi        = round(ci["97.5%", "acc"],   3),
    stringsAsFactors   = FALSE
  )
}

4 · Bootstrap training function (internal, with SHAP)

train_model_with_shap <- function(features, df, train_idx) {

  missing <- setdiff(features, names(df))
  if (length(missing) > 0)
    stop(paste("Missing columns:", paste(missing, collapse = ", ")))

  X <- df %>%
    dplyr::select(all_of(features)) %>%
    mutate(across(where(is.numeric), as.double))

  y <- factor(trimws(df$IR_label))
  levels(y) <- make.names(levels(y))

  X_train <- X[train_idx, ];  X_test <- X[-train_idx, ]
  y_train <- y[train_idx];    y_test  <- y[-train_idx]

  p    <- length(features)
  mtry <- floor(sqrt(p))

  ctrl <- trainControl(
    method          = "repeatedcv",
    number          = 5,
    repeats         = 20,
    sampling        = "up",
    classProbs      = TRUE,
    summaryFunction = twoClassSummary
  )

  model <- train(
    x          = X_train,
    y          = y_train,
    method     = "rf",
    trControl  = ctrl,
    preProcess = "medianImpute",
    tuneGrid   = expand.grid(mtry = mtry),
    ntree      = 300,
    metric     = "ROC"
  )

  prob_test <- predict(model, X_test, type = "prob")[, "IR"]
  pred_test <- predict(model, X_test)

  cm_test   <- confusionMatrix(pred_test, y_test)
  roc_test  <- roc(y_test, prob_test, levels = c("Non.IR", "IR"))
  auc_test  <- as.numeric(auc(roc_test))
  pr_test   <- pr.curve(scores.class0  = prob_test,
                        weights.class0 = ifelse(y_test == "IR", 1, 0),
                        curve = FALSE)

  internal_holdout_metrics <- data.frame(
    Accuracy          = cm_test$overall["Accuracy"],
    Sensitivity       = cm_test$byClass["Sensitivity"],
    Specificity       = cm_test$byClass["Specificity"],
    Balanced_Accuracy = (cm_test$byClass["Sensitivity"] +
                         cm_test$byClass["Specificity"]) / 2,
    Precision         = cm_test$byClass["Pos Pred Value"],
    F1 = 2 * cm_test$byClass["Pos Pred Value"] * cm_test$byClass["Sensitivity"] /
         (cm_test$byClass["Pos Pred Value"] + cm_test$byClass["Sensitivity"]),
    AUC   = auc_test,
    PRAUC = pr_test$auc.integral,
    AUPRG = compute_auprg(as.integer(y_test == "IR"), prob_test)
  )

  pfun <- function(object, newdata)
    predict(object, newdata, type = "prob")[, "IR"]

  shap_values <- fastshap::explain(
    object       = model,
    X            = as.data.frame(X_test),
    pred_wrapper = pfun,
    nsim         = 50,
    adjust       = TRUE
  )

  list(
    model                    = model,
    features                 = features,
    internal_holdout_metrics = internal_holdout_metrics,
    shap_validation          = shap_values,
    X_test_df                = as.data.frame(X_test)
  )
}

5 · Fast multi-seed final model function

evaluate_multiseed_fast <- function(features, df_train, df_ext,
                                    N_SEEDS = 20, ntree = 300) {
  p    <- length(features)
  mtry <- floor(sqrt(p))

  X_ext_raw <- df_ext %>%
    dplyr::select(all_of(features)) %>%
    mutate(across(where(is.numeric), as.double))

  y_ext <- factor(make.names(trimws(df_ext$IR_label)))

  all_probs  <- matrix(NA_real_, nrow = nrow(df_ext), ncol = N_SEEDS)
  all_aucs   <- numeric(N_SEEDS)
  all_auprcs <- numeric(N_SEEDS)

  for (s in seq_len(N_SEEDS)) {
    set.seed(s)

    X_tr <- df_train %>%
      dplyr::select(all_of(features)) %>%
      mutate(across(where(is.numeric), as.double))
    y_tr <- factor(make.names(trimws(df_train$IR_label)))

    imp      <- caret::preProcess(X_tr, method = "medianImpute")
    X_tr_imp <- predict(imp, X_tr)
    X_ex_imp <- predict(imp, X_ext_raw)

    up_data  <- caret::upSample(x = X_tr_imp, y = y_tr, yname = ".outcome")
    y_up     <- up_data$.outcome
    X_up     <- up_data[, names(X_tr_imp), drop = FALSE]

    rf <- randomForest::randomForest(
      x = X_up, y = y_up,
      ntree = ntree, mtry = mtry, nodesize = 1
    )

    prob <- predict(rf, X_ex_imp, type = "prob")[, "IR"]
    all_probs[, s] <- prob

    roc_s        <- roc(y_ext, prob, levels = c("Non.IR","IR"), quiet = TRUE)
    all_aucs[s]  <- as.numeric(auc(roc_s))
    all_auprcs[s] <- pr.curve(scores.class0  = prob,
                               weights.class0 = ifelse(y_ext == "IR", 1, 0),
                               curve = FALSE)$auc.integral
  }

  all_auprgs <- sapply(seq_len(N_SEEDS), function(s) {
    compute_auprg(as.integer(y_ext == "IR"), all_probs[, s])
  })

  ensemble_prob <- rowMeans(all_probs)

  auc_mean  <- mean(all_aucs);  auc_sd <- sd(all_aucs)
  auprc_mean <- mean(all_auprcs); auprc_sd <- sd(all_auprcs)
  auprg_mean <- mean(all_auprgs); auprg_sd <- sd(all_auprgs)

  roc_ens <- roc(y_ext, ensemble_prob, levels = c("Non.IR","IR"), quiet = TRUE)
  ens_auc <- as.numeric(auc(roc_ens))
  dl      <- tryCatch(
    as.numeric(ci.auc(roc_ens, method = "delong", conf.level = 0.95)),
    error = function(e) c(NA, NA, NA)
  )

  ens_auprc <- pr.curve(scores.class0  = ensemble_prob,
                         weights.class0 = ifelse(y_ext == "IR", 1, 0),
                         curve = FALSE)$auc.integral
  n_ext <- length(y_ext)
  set.seed(42)
  prauc_boot <- replicate(2000, {
    idx <- sample(n_ext, n_ext, replace = TRUE)
    yb  <- y_ext[idx]; pb <- ensemble_prob[idx]
    if (length(unique(yb)) < 2) return(NA_real_)
    tryCatch(
      pr.curve(scores.class0  = pb,
               weights.class0 = ifelse(yb == "IR", 1, 0),
               curve = FALSE)$auc.integral,
      error = function(e) NA_real_)
  })
  auprc_lo <- quantile(prauc_boot, 0.025, na.rm = TRUE)
  auprc_hi <- quantile(prauc_boot, 0.975, na.rm = TRUE)

  ens_auprg <- compute_auprg(as.integer(y_ext == "IR"), ensemble_prob)
  set.seed(43)
  auprg_boot <- replicate(2000, {
    idx <- sample(n_ext, n_ext, replace = TRUE)
    yb  <- y_ext[idx]; pb <- ensemble_prob[idx]
    if (length(unique(yb)) < 2) return(NA_real_)
    tryCatch(compute_auprg(as.integer(yb == "IR"), pb), error = function(e) NA_real_)
  })
  auprg_lo <- quantile(auprg_boot, 0.025, na.rm = TRUE)
  auprg_hi <- quantile(auprg_boot, 0.975, na.rm = TRUE)

  ens_pred <- factor(ifelse(ensemble_prob >= 0.5, "IR", "Non.IR"),
                     levels = c("Non.IR", "IR"))
  cm_ens   <- confusionMatrix(ens_pred, y_ext)

  list(
    auc_mean    = round(auc_mean,   3), auc_sd      = round(auc_sd,     3),
    auc_lo_seed = round(min(all_aucs),  3), auc_hi_seed = round(max(all_aucs),  3),
    auprc_mean  = round(auprc_mean, 3), auprc_sd    = round(auprc_sd,   3),
    auprg_mean  = round(auprg_mean, 3), auprg_sd    = round(auprg_sd,   3),
    all_aucs    = all_aucs, all_auprcs = all_auprcs, all_auprgs = all_auprgs,
    ens_auc     = round(ens_auc,    3), ens_auc_lo  = round(dl[1],      3), ens_auc_hi  = round(dl[3], 3),
    ens_auprc   = round(ens_auprc,  3), ens_auprc_lo = round(auprc_lo,  3), ens_auprc_hi = round(auprc_hi, 3),
    ens_auprg   = round(ens_auprg,  3), ens_auprg_lo = round(auprg_lo,  3), ens_auprg_hi = round(auprg_hi, 3),
    ensemble_prob = ensemble_prob, y_ext = y_ext, cm = cm_ens, N_SEEDS = N_SEEDS
  )
}

6 · Bootstrap loop (100 iterations — internal metrics)

all_results  <- list()
all_shap_val <- list()

set.seed(123)
n_iter <- 100

for (i in seq_len(n_iter)) {
  y <- factor(trimws(df_test$IR_label))
  levels(y) <- make.names(levels(y))
  train_idx <- createDataPartition(y, p = 0.7, list = FALSE)

  CGM_model  <- train_model_with_shap(feature_sets$CGM_model,            df_test, train_idx)
  SW_model   <- train_model_with_shap(feature_sets$Smartwatch_only_model, df_test, train_idx)
  Base_model <- train_model_with_shap(feature_sets$Baseline_model,        df_test, train_idx)

  trained_models <- list(CGM_model, SW_model, Base_model)

  iter_results <- dplyr::bind_rows(
    standardize(CGM_model$internal_holdout_metrics,  "Model CGM")        |> mutate(Dataset = "Internal"),
    standardize(SW_model$internal_holdout_metrics,   "Model smartwatch") |> mutate(Dataset = "Internal"),
    standardize(Base_model$internal_holdout_metrics, "Model baseline")   |> mutate(Dataset = "Internal")
  ) %>% mutate(iteration = i)

  all_results[[i]] <- iter_results

  for (m in seq_along(trained_models)) {
    mdl <- trained_models[[m]]
    all_shap_val[[length(all_shap_val) + 1]] <- list(
      iteration  = i,
      model_name = model_names[m],
      shap       = mdl$shap_validation,
      X_df       = mdl$X_test_df
    )
  }

  if (i %% 10 == 0) cat(sprintf("  Iteration %d / %d\n", i, n_iter))
}
##   Iteration 10 / 100
##   Iteration 20 / 100
##   Iteration 30 / 100
##   Iteration 40 / 100
##   Iteration 50 / 100
##   Iteration 60 / 100
##   Iteration 70 / 100
##   Iteration 80 / 100
##   Iteration 90 / 100
##   Iteration 100 / 100
bootstrap_results <- dplyr::bind_rows(all_results)

final_summary <- bootstrap_results %>%
  group_by(Model, Dataset) %>%
  summarise(
    across(
      c(Accuracy, Sensitivity, Specificity, Balanced_Accuracy,
        Precision, F1, AUC, PRAUC, AUPRG),
      list(mean  = ~mean(.x,            na.rm = TRUE),
           lower = ~quantile(.x, 0.025, na.rm = TRUE),
           upper = ~quantile(.x, 0.975, na.rm = TRUE)),
      .names = "{.col}_{.fn}"
    ),
    .groups = "drop"
  )

cat("\nInternal bootstrap summary (mean over 100 iterations, 2.5-97.5% CI):\n")
## 
## Internal bootstrap summary (mean over 100 iterations, 2.5-97.5% CI):
print(
  final_summary %>%
    dplyr::select(Model,
                  AUC_mean,   AUC_lower,   AUC_upper,
                  PRAUC_mean, PRAUC_lower, PRAUC_upper,
                  AUPRG_mean, AUPRG_lower, AUPRG_upper,
                  Sensitivity_mean, Specificity_mean, Balanced_Accuracy_mean) %>%
    dplyr::rename(
      AUC_ROC_mean = AUC_mean,   AUC_ROC_lo = AUC_lower,    AUC_ROC_hi = AUC_upper,
      AUPRC_mean   = PRAUC_mean, AUPRC_lo   = PRAUC_lower,  AUPRC_hi   = PRAUC_upper,
      AUPRG_lo     = AUPRG_lower, AUPRG_hi  = AUPRG_upper,
      Sens_mean    = Sensitivity_mean,
      Spec_mean    = Specificity_mean,
      BalAcc_mean  = Balanced_Accuracy_mean
    ) %>%
    mutate(across(where(is.numeric), ~round(.x, 3)))
)
## # A tibble: 3 × 13
##   Model          AUC_ROC_mean AUC_ROC_lo AUC_ROC_hi AUPRC_mean AUPRC_lo AUPRC_hi
##   <chr>                 <dbl>      <dbl>      <dbl>      <dbl>    <dbl>    <dbl>
## 1 Model CGM             0.833      0.684      0.953      0.725    0.488    0.915
## 2 Model baseline        0.817      0.655      0.933      0.644    0.419    0.853
## 3 Model smartwa…        0.812      0.663      0.933      0.637    0.43     0.875
## # ℹ 6 more variables: AUPRG_mean <dbl>, AUPRG_lo <dbl>, AUPRG_hi <dbl>,
## #   Sens_mean <dbl>, Spec_mean <dbl>, BalAcc_mean <dbl>
write.csv(bootstrap_results, "bootstrap_results_v3.csv", row.names = FALSE)
write.csv(final_summary,     "bootstrap_summary_v3.csv", row.names = FALSE)

7 · External validation — 20-seed ensemble

cat("Fitting 20-seed ensembles for external evaluation...\n")
## Fitting 20-seed ensembles for external evaluation...
ms_CGM  <- evaluate_multiseed_fast(feature_sets$CGM_model,            df_test, df_external)
ms_SW   <- evaluate_multiseed_fast(feature_sets$Smartwatch_only_model, df_test, df_external)
ms_Base <- evaluate_multiseed_fast(feature_sets$Baseline_model,        df_test, df_external)

ms_list <- list(
  "CGM model"        = ms_CGM,
  "Smartwatch model" = ms_SW,
  "Baseline model"   = ms_Base
)

ms_summary <- dplyr::bind_rows(lapply(names(ms_list), function(nm) {
  ms <- ms_list[[nm]]
  data.frame(
    Model        = nm,
    AUC_mean     = ms$auc_mean,      AUC_sd       = ms$auc_sd,
    AUC_lo_seed  = ms$auc_lo_seed,   AUC_hi_seed  = ms$auc_hi_seed,
    Ens_AUC      = ms$ens_auc,       Ens_AUC_lo   = ms$ens_auc_lo,    Ens_AUC_hi   = ms$ens_auc_hi,
    AUPRC_mean   = ms$auprc_mean,    AUPRC_sd     = ms$auprc_sd,
    Ens_AUPRC    = ms$ens_auprc,     Ens_AUPRC_lo = ms$ens_auprc_lo,  Ens_AUPRC_hi = ms$ens_auprc_hi,
    AUPRG_mean   = ms$auprg_mean,    AUPRG_sd     = ms$auprg_sd,
    Ens_AUPRG    = ms$ens_auprg,     Ens_AUPRG_lo = ms$ens_auprg_lo,  Ens_AUPRG_hi = ms$ens_auprg_hi,
    stringsAsFactors = FALSE
  )
}))

cat("\nExternal — AUC-ROC and AUPRC (20-seed ensemble):\n")
## 
## External — AUC-ROC and AUPRC (20-seed ensemble):
print(ms_summary %>% mutate(across(where(is.numeric), ~round(.x, 3))))
##                     Model AUC_mean AUC_sd AUC_lo_seed AUC_hi_seed Ens_AUC
## 2.5%...1        CGM model    0.878  0.012       0.858       0.900   0.890
## 2.5%...2 Smartwatch model    0.794  0.011       0.772       0.811   0.797
## 2.5%...3   Baseline model    0.753  0.009       0.737       0.776   0.763
##          Ens_AUC_lo Ens_AUC_hi AUPRC_mean AUPRC_sd Ens_AUPRC Ens_AUPRC_lo
## 2.5%...1      0.789      0.990      0.802    0.032     0.825        0.633
## 2.5%...2      0.674      0.920      0.698    0.019     0.708        0.486
## 2.5%...3      0.625      0.901      0.568    0.035     0.582        0.373
##          Ens_AUPRC_hi AUPRG_mean AUPRG_sd Ens_AUPRG Ens_AUPRG_lo Ens_AUPRG_hi
## 2.5%...1        0.954      0.866    0.033     0.883        0.706        0.981
## 2.5%...2        0.858      0.725    0.025     0.740        0.340        0.934
## 2.5%...3        0.826      0.633    0.026     0.679        0.313        0.899
# Ensemble probabilities and labels — used downstream in sections 8, 9, 14
ext_probs <- list(
  "CGM model"        = ms_CGM$ensemble_prob,
  "Smartwatch model" = ms_SW$ensemble_prob,
  "Baseline model"   = ms_Base$ensemble_prob
)
y_ext_fac <- ms_CGM$y_ext
y_bin     <- as.integer(y_ext_fac == "IR")
ir_prev   <- mean(y_bin)

7b · Seed stability plot

seed_df <- dplyr::bind_rows(lapply(names(ms_list), function(nm) {
  data.frame(
    Model = nm,
    AUC   = ms_list[[nm]]$all_aucs,
    AUPRC = ms_list[[nm]]$all_auprcs,
    AUPRG = ms_list[[nm]]$all_auprgs
  )
})) %>% mutate(Model = factor(Model, levels = names(ms_list)))

model_colors <- c(
  "CGM model"        = "#1f77b4",
  "Smartwatch model" = "#2ca02c",
  "Baseline model"   = "#d62728"
)

summ_seed <- seed_df %>%
  group_by(Model) %>%
  summarise(m_auc   = mean(AUC),   sd_auc   = sd(AUC),   mx_auc   = max(AUC),
            m_auprc = mean(AUPRC), sd_auprc = sd(AUPRC), mx_auprc = max(AUPRC),
            m_auprg = mean(AUPRG), sd_auprg = sd(AUPRG), mx_auprg = max(AUPRG),
            .groups = "drop")

make_seed_panel <- function(df, summ, y_var, y_lab, y_ref = NULL, title = "") {
  p <- ggplot(df, aes(x = Model, y = .data[[y_var]], colour = Model)) +
    geom_jitter(width = 0.12, alpha = 0.65, size = 2, show.legend = FALSE) +
    geom_crossbar(data = summ,
                  aes(x = Model, y = .data[[paste0("m_", tolower(y_var))]],
                      ymin = .data[[paste0("m_", tolower(y_var))]] - .data[[paste0("sd_", tolower(y_var))]],
                      ymax = .data[[paste0("m_", tolower(y_var))]] + .data[[paste0("sd_", tolower(y_var))]]),
                  width = 0.35, linewidth = 0.8, fill = NA, show.legend = FALSE) +
    geom_text(data = summ,
              aes(x = Model, y = .data[[paste0("mx_", tolower(y_var))]] + 0.018,
                  label = sprintf("%.3f\n\u00b1%.3f",
                                  .data[[paste0("m_", tolower(y_var))]],
                                  .data[[paste0("sd_", tolower(y_var))]])),
              size = 3, vjust = 0, show.legend = FALSE) +
    scale_colour_manual(values = model_colors) +
    scale_x_discrete(labels = c("CGM model"        = "CGM",
                                 "Smartwatch model" = "Smartwatch",
                                 "Baseline model"   = "Baseline")) +
    coord_cartesian(ylim = c(0.40, 1.05)) +
    labs(title = title, x = NULL, y = y_lab) +
    theme_bw(base_size = 12) +
    theme(plot.title = element_text(face = "bold", size = 11),
          panel.grid.minor = element_blank(),
          panel.grid.major.x = element_blank())
  if (!is.null(y_ref))
    p <- p + geom_hline(yintercept = y_ref, linetype = "dashed",
                        colour = "grey50", linewidth = 0.7)
  p
}

p_seed_auc   <- make_seed_panel(seed_df, summ_seed, "AUC",   "External AUC-ROC", 0.5,    "A  AUC-ROC")
p_seed_auprc <- make_seed_panel(seed_df, summ_seed, "AUPRC", "External AUPRC",   ir_prev,"B  AUPRC") +
  annotate("text", x = 0.6, y = ir_prev + 0.04,
           label = sprintf("Chance (%.2f)", ir_prev), size = 3, colour = "grey50")
p_seed_auprg <- make_seed_panel(seed_df, summ_seed, "AUPRG", "External AUPRG",   0,      "C  AUPRG") +
  annotate("text", x = 0.6, y = 0.04, label = "Chance = 0", size = 3, colour = "grey50")

p_seed_combined <- p_seed_auc | p_seed_auprc | p_seed_auprg
print(p_seed_combined)

ggsave("seed_stability_v3.pdf", p_seed_combined, width = 16, height = 5, device = cairo_pdf)
## Warning in grSoftVersion(): unable to load shared object '/Library/Frameworks/R.framework/Resources/modules//R_X11.so':
##   dlopen(/Library/Frameworks/R.framework/Resources/modules//R_X11.so, 0x0006): Library not loaded: /opt/X11/lib/libSM.6.dylib
##   Referenced from: <7ECC4104-EC6A-38FD-9BEA-BFE0B870925C> /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/modules/R_X11.so
##   Reason: tried: '/opt/X11/lib/libSM.6.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/X11/lib/libSM.6.dylib' (no such file), '/opt/X11/lib/libSM.6.dylib' (no such file), '/Library/Frameworks/R.framework/Resources/lib/libSM.6.dylib' (no such file), '/Library/Java/JavaVirtualMachines/jdk-11.0.18+10/Contents/Home/lib/server/libSM.6.dylib' (no such file)
## Warning in cairoVersion(): unable to load shared object '/Library/Frameworks/R.framework/Resources/library/grDevices/libs//cairo.so':
##   dlopen(/Library/Frameworks/R.framework/Resources/library/grDevices/libs//cairo.so, 0x0006): Library not loaded: /opt/X11/lib/libXrender.1.dylib
##   Referenced from: <263B701F-2238-3478-A7D2-B7513231EF98> /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/grDevices/libs/cairo.so
##   Reason: tried: '/opt/X11/lib/libXrender.1.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/X11/lib/libXrender.1.dylib' (no such file), '/opt/X11/lib/libXrender.1.dylib' (no such file), '/Library/Frameworks/R.framework/Resources/lib/libXrender.1.dylib' (no such file), '/Library/Java/JavaVirtualMachines/jdk-11.0.18+10/Contents/Home/lib/server/libXrender.1.dylib' (no such file)
## Warning in device(filename = "seed_stability_v3.pdf", width = 16, height = 5, :
## failed to load cairo DLL

8 · External validation — full metric table (ensemble)

# ── BUG FIX: n <- length(y_true) added at top of function ──────────────────
# Previously n was undefined inside this function, causing sample(n, n, ...) to
# fail with "invalid 'size' argument" when n resolved to NULL from parent scope.
# ────────────────────────────────────────────────────────────────────────────
compute_ext_full <- function(y_true, prob, model_name, n_boot = 2000, seed = 42) {

  n <- length(y_true)   # FIX: define n locally — was missing in original

  pred  <- factor(ifelse(prob >= 0.5, "IR", "Non.IR"), levels = c("Non.IR","IR"))
  cm    <- confusionMatrix(pred, y_true)
  sens  <- as.numeric(cm$byClass["Sensitivity"])
  spec  <- as.numeric(cm$byClass["Specificity"])
  prec  <- as.numeric(cm$byClass["Pos Pred Value"])
  f1    <- ifelse(is.na(prec)|prec+sens==0, NA_real_, 2*prec*sens/(prec+sens))
  acc   <- as.numeric(cm$overall["Accuracy"])
  auprc <- pr.curve(scores.class0  = prob,
                    weights.class0 = ifelse(y_true == "IR", 1, 0),
                    curve = FALSE)$auc.integral
  auprg <- compute_auprg(as.integer(y_true == "IR"), prob)

  set.seed(seed)
  mat <- matrix(NA_real_, nrow = n_boot, ncol = 9,
                dimnames = list(NULL,
                  c("sens","spec","prec","f1","acc","balacc","prauc","auprg","auc_roc")))

  for (i in seq_len(n_boot)) {
    idx    <- sample(n, n, replace = TRUE)
    yb     <- y_true[idx]; pb_cls <- pred[idx]; pb_pr <- prob[idx]
    if (length(unique(yb)) < 2) next
    cm_b <- confusionMatrix(pb_cls, yb)
    s_b  <- as.numeric(cm_b$byClass["Sensitivity"])
    sp_b <- as.numeric(cm_b$byClass["Specificity"])
    pr_b <- as.numeric(cm_b$byClass["Pos Pred Value"])
    f1_b <- ifelse(is.na(pr_b)|pr_b+s_b==0, NA_real_, 2*pr_b*s_b/(pr_b+s_b))
    prauc_b <- tryCatch(
      pr.curve(scores.class0=pb_pr, weights.class0=ifelse(yb=="IR",1,0),
               curve=FALSE)$auc.integral, error=function(e) NA_real_)
    auprg_b <- tryCatch(
      compute_auprg(as.integer(yb == "IR"), pb_pr), error=function(e) NA_real_)
    auc_b <- tryCatch(
      as.numeric(auc(roc(yb, pb_pr, levels=c("Non.IR","IR"), quiet=TRUE))),
      error=function(e) NA_real_)
    mat[i,] <- c(s_b, sp_b, pr_b, f1_b, as.numeric(cm_b$overall["Accuracy"]),
                 (s_b+sp_b)/2, prauc_b, auprg_b, auc_b)
  }
  ci <- apply(mat, 2, quantile, probs=c(0.025,0.975), na.rm=TRUE)

  data.frame(
    Model                 = model_name,
    Sensitivity           = round(sens, 3),
    Sensitivity_lo        = round(ci["2.5%",  "sens"],   3),
    Sensitivity_hi        = round(ci["97.5%", "sens"],   3),
    Specificity           = round(spec, 3),
    Specificity_lo        = round(ci["2.5%",  "spec"],   3),
    Specificity_hi        = round(ci["97.5%", "spec"],   3),
    Balanced_Accuracy     = round((sens+spec)/2, 3),
    Balanced_Accuracy_lo  = round(ci["2.5%",  "balacc"], 3),
    Balanced_Accuracy_hi  = round(ci["97.5%", "balacc"], 3),
    Precision             = round(prec, 3),
    Precision_lo          = round(ci["2.5%",  "prec"],   3),
    Precision_hi          = round(ci["97.5%", "prec"],   3),
    F1                    = round(f1,   3),
    F1_lo                 = round(ci["2.5%",  "f1"],     3),
    F1_hi                 = round(ci["97.5%", "f1"],     3),
    Accuracy              = round(acc,  3),
    Accuracy_lo           = round(ci["2.5%",  "acc"],    3),
    Accuracy_hi           = round(ci["97.5%", "acc"],    3),
    AUPRC                 = round(auprc, 3),
    AUPRC_lo              = round(ci["2.5%",  "prauc"],  3),
    AUPRC_hi              = round(ci["97.5%", "prauc"],  3),
    AUPRG                 = round(auprg, 3),
    AUPRG_lo              = round(ci["2.5%",  "auprg"],  3),
    AUPRG_hi              = round(ci["97.5%", "auprg"],  3),
    stringsAsFactors = FALSE
  )
}

ext_full <- dplyr::bind_rows(lapply(names(ext_probs), function(nm)
  compute_ext_full(y_ext_fac, ext_probs[[nm]], nm)
))

# Merge in AUC-ROC columns from ms_summary
ext_full <- ext_full %>%
  left_join(ms_summary %>%
              dplyr::select(Model, AUC_mean, AUC_sd, Ens_AUC, Ens_AUC_lo, Ens_AUC_hi),
            by = "Model")

cat("\nFull external metrics (ensemble of 20 seeds):\n")
## 
## Full external metrics (ensemble of 20 seeds):
print(ext_full %>% dplyr::select(
  Model, AUC_mean, AUC_sd, Ens_AUC, Ens_AUC_lo, Ens_AUC_hi,
  AUPRC, AUPRC_lo, AUPRC_hi,
  Sensitivity, Sensitivity_lo, Sensitivity_hi,
  Specificity, Specificity_lo, Specificity_hi,
  Balanced_Accuracy, F1, Accuracy
))
##              Model AUC_mean AUC_sd Ens_AUC Ens_AUC_lo Ens_AUC_hi AUPRC AUPRC_lo
## 1        CGM model    0.878  0.012   0.890      0.789      0.990 0.825    0.633
## 2 Smartwatch model    0.794  0.011   0.797      0.674      0.920 0.708    0.486
## 3   Baseline model    0.753  0.009   0.763      0.625      0.901 0.582    0.373
##   AUPRC_hi Sensitivity Sensitivity_lo Sensitivity_hi Specificity Specificity_lo
## 1    0.954       0.842          0.667          1.000       0.810          0.690
## 2    0.858       0.737          0.526          0.923       0.714          0.575
## 3    0.826       0.737          0.526          0.933       0.690          0.550
##   Specificity_hi Balanced_Accuracy    F1 Accuracy
## 1          0.923             0.826 0.744    0.820
## 2          0.844             0.726 0.622    0.721
## 3          0.821             0.714 0.609    0.705
write.csv(ext_full, "external_full_v3.csv", row.names = FALSE)

combined_metrics <- dplyr::bind_rows(
  final_summary %>%
    dplyr::select(Model, AUC_mean, AUC_lower, AUC_upper,
                  PRAUC_mean, PRAUC_lower, PRAUC_upper,
                  AUPRG_mean, AUPRG_lower, AUPRG_upper) %>%
    mutate(Dataset = "Internal (100-iter bootstrap)") %>%
    dplyr::rename(AUC_ROC  = AUC_mean,   AUC_lo   = AUC_lower,   AUC_hi   = AUC_upper,
                  AUPRC    = PRAUC_mean,  AUPRC_lo = PRAUC_lower, AUPRC_hi = PRAUC_upper,
                  AUPRG_lo = AUPRG_lower, AUPRG_hi = AUPRG_upper),
  ext_full %>%
    dplyr::select(Model, AUC_mean, Ens_AUC_lo, Ens_AUC_hi,
                  AUPRC, AUPRC_lo, AUPRC_hi, AUPRG, AUPRG_lo, AUPRG_hi) %>%
    mutate(Dataset = "External (20-seed ensemble)") %>%
    dplyr::rename(AUC_ROC = AUC_mean, AUC_lo = Ens_AUC_lo, AUC_hi = Ens_AUC_hi)
) %>%
  dplyr::select(Model, Dataset, AUC_ROC, AUC_lo, AUC_hi,
                AUPRC, AUPRC_lo, AUPRC_hi, AUPRG, AUPRG_lo, AUPRG_hi) %>%
  mutate(across(where(is.numeric), ~round(.x, 3)))

cat("\nCombined AUC-ROC + AUPRC + AUPRG (internal & external):\n")
## 
## Combined AUC-ROC + AUPRC + AUPRG (internal & external):
print(combined_metrics)
## # A tibble: 6 × 11
##   Model    Dataset AUC_ROC AUC_lo AUC_hi AUPRC AUPRC_lo AUPRC_hi  AUPRG AUPRG_lo
##   <chr>    <chr>     <dbl>  <dbl>  <dbl> <dbl>    <dbl>    <dbl>  <dbl>    <dbl>
## 1 Model C… Intern…   0.833  0.684  0.953 0.725    0.488    0.915 NA        0.363
## 2 Model b… Intern…   0.817  0.655  0.933 0.644    0.419    0.853 NA        0.359
## 3 Model s… Intern…   0.812  0.663  0.933 0.637    0.43     0.875 NA        0.356
## 4 CGM mod… Extern…   0.878  0.789  0.99  0.825    0.633    0.954  0.883    0.715
## 5 Smartwa… Extern…   0.794  0.674  0.92  0.708    0.486    0.858  0.74     0.364
## 6 Baselin… Extern…   0.753  0.625  0.901 0.582    0.373    0.826  0.679    0.321
## # ℹ 1 more variable: AUPRG_hi <dbl>
write.csv(combined_metrics, "combined_metrics_v3.csv", row.names = FALSE)

8b · AUC-ROC + AUPRC + AUPRG comparison plot

model_order <- c("Model CGM","Model smartwatch","Model baseline")

plot_df <- combined_metrics %>%
  mutate(Model = factor(Model, levels = model_order)) %>%
  tidyr::pivot_longer(cols = c(AUC_ROC, AUPRC),
                      names_to = "Metric", values_to = "Value") %>%
  mutate(lo = ifelse(Metric == "AUC_ROC", AUC_lo,   AUPRC_lo),
         hi = ifelse(Metric == "AUC_ROC", AUC_hi,   AUPRC_hi),
         Metric  = factor(Metric, levels = c("AUC_ROC","AUPRC"),
                           labels = c("AUC-ROC","AUPRC")),
         Dataset = factor(Dataset, levels = c("Internal (100-iter bootstrap)",
                                               "External (20-seed ensemble)")))

ref_lines <- data.frame(
  Metric = c("AUC-ROC","AUPRC"),
  xint   = c(0.5, ir_prev)
)

p_metrics <- ggplot(plot_df,
                    aes(x = Value, y = Model, colour = Model,
                        shape = Dataset, group = interaction(Model, Dataset))) +
  geom_errorbarh(aes(xmin = lo, xmax = hi), height = 0.2, linewidth = 0.7,
                 position = position_dodge(width = 0.55)) +
  geom_point(size = 3.5, position = position_dodge(width = 0.55)) +
  geom_vline(data = ref_lines, aes(xintercept = xint),
             linetype = "dashed", colour = "grey60", linewidth = 0.7) +
  scale_colour_manual(values = model_colors, guide = "none") +
  scale_shape_manual(values = c("Internal (100-iter bootstrap)" = 16,
                                 "External (20-seed ensemble)"   = 17),
                     name = NULL) +
  scale_y_discrete(labels = c("Model CGM"        = "CGM model",
                               "Model smartwatch" = "Smartwatch model",
                               "Model baseline"   = "Baseline (anthropometrics)")) +
  facet_wrap(~Metric, scales = "free_x") +
  scale_x_continuous(limits = c(0.35, 1.02), breaks = seq(0.4, 1.0, 0.1)) +
  labs(title = "AUC-ROC and AUPRC — internal bootstrap vs external 20-seed ensemble",
       x = "Value (95% CI)", y = NULL) +
  theme_bw(base_size = 12) +
  theme(plot.title = element_text(face = "bold", size = 11),
        strip.text = element_text(face = "bold", size = 11),
        strip.background = element_rect(fill = "grey93"),
        panel.grid.minor = element_blank(),
        legend.position = "bottom")
## Warning: `geom_errorbarh()` was deprecated in ggplot2 4.0.0.
## ℹ Please use the `orientation` argument of `geom_errorbar()` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
print(p_metrics)
## `height` was translated to `width`.
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_vline()`).

ggsave("auc_auprc_plot_v3.pdf", p_metrics, width = 11, height = 5, device = cairo_pdf)
## Warning in device(filename = "auc_auprc_plot_v3.pdf", width = 11, height = 5, :
## failed to load cairo DLL
## `height` was translated to `width`.
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_vline()`).
cat(sprintf("\nChance AUPRC (= prevalence):  internal=%.2f  external=%.2f\n",
            mean(df_test$IR_label == "IR"), ir_prev))
## 
## Chance AUPRC (= prevalence):  internal=0.33  external=0.31

9 · Decision Curve Analysis

n_ext      <- length(y_bin)
thresholds <- seq(0.05, 0.50, by = 0.01)

nb_fn <- function(y, prob, t) {
  pred <- as.integer(prob >= t)
  tp   <- sum(pred == 1 & y == 1)
  fp   <- sum(pred == 1 & y == 0)
  tp / n_ext - fp / n_ext * t / (1 - t)
}

dca_df <- do.call(rbind, lapply(thresholds, function(t) {
  data.frame(
    threshold   = t,
    model       = c("CGM","SW","Baseline","All","None"),
    net_benefit = c(
      nb_fn(y_bin, ext_probs[["CGM model"]],        t),
      nb_fn(y_bin, ext_probs[["Smartwatch model"]], t),
      nb_fn(y_bin, ext_probs[["Baseline model"]],   t),
      ir_prev - (1 - ir_prev) * t / (1 - t),
      0
    ),
    stringsAsFactors = FALSE
  )
}))

p_dca <- ggplot(dca_df, aes(x = threshold, y = net_benefit,
                             colour = model, linetype = model)) +
  geom_line(linewidth = 0.9) +
  scale_colour_manual(
    values = c("CGM"="#1f77b4","SW"="#2ca02c","Baseline"="#d62728",
               "All"="#888888","None"="#bbbbbb"),
    labels = c("CGM"="CGM model","SW"="Smartwatch model",
               "Baseline"="Baseline model","All"="Treat all","None"="Treat none")) +
  scale_linetype_manual(
    values = c("CGM"="solid","SW"="dashed","Baseline"="dotdash",
               "All"="dotted","None"="dotted"),
    labels = c("CGM"="CGM model","SW"="Smartwatch model",
               "Baseline"="Baseline model","All"="Treat all","None"="Treat none")) +
  geom_hline(yintercept = 0, colour = "black", linewidth = 0.5) +
  geom_vline(xintercept = ir_prev, colour = "black",
             linetype = "dashed", linewidth = 0.6, alpha = 0.45) +
  annotate("text", x = ir_prev + 0.01, y = 0.38,
           label = sprintf("Prevalence\n(%.2f)", ir_prev),
           size = 3, colour = "grey40", hjust = 0) +
  coord_cartesian(xlim = c(0.05, 0.50), ylim = c(-0.06, 0.42)) +
  labs(title = "Decision curve analysis — external cohort (n=61)",
       x = "Threshold probability", y = "Net benefit",
       colour = NULL, linetype = NULL) +
  theme_bw(base_size = 12) +
  theme(legend.position = "right",
        plot.title = element_text(face = "bold", size = 12),
        panel.grid.minor = element_blank())

print(p_dca)

ggsave("dca_v3.pdf", p_dca, width = 8, height = 5, device = cairo_pdf)
## Warning in device(filename = "dca_v3.pdf", width = 8, height = 5, bg =
## "white"): failed to load cairo DLL
cat(sprintf("\nNet benefit at prevalence threshold (%.2f):\n", ir_prev))
## 
## Net benefit at prevalence threshold (0.31):
for (m in c("CGM","SW","Baseline","All","None")) {
  sub <- dca_df[dca_df$model == m, ]
  nb  <- sub$net_benefit[which.min(abs(sub$threshold - ir_prev))]
  cat(sprintf("  %-12s  NB = %.4f\n", m, nb))
}
##   CGM           NB = 0.1314
##   SW            NB = 0.1224
##   Baseline      NB = 0.1003
##   All           NB = 0.0021
##   None          NB = 0.0000
write.csv(dca_df, "dca_v3.csv", row.names = FALSE)

10 · Seed stability summary

seed_df %>%
  group_by(Model) %>%
  summarise(
    AUC_mean   = round(mean(AUC),   3), AUC_sd    = round(sd(AUC),   3),
    AUC_min    = round(min(AUC),    3), AUC_max   = round(max(AUC),  3),
    AUPRC_mean = round(mean(AUPRC), 3), AUPRC_sd  = round(sd(AUPRC), 3),
    .groups = "drop") %>%
  print()
## # A tibble: 3 × 7
##   Model            AUC_mean AUC_sd AUC_min AUC_max AUPRC_mean AUPRC_sd
##   <fct>               <dbl>  <dbl>   <dbl>   <dbl>      <dbl>    <dbl>
## 1 CGM model           0.878  0.012   0.858   0.9        0.802    0.032
## 2 Smartwatch model    0.794  0.011   0.772   0.811      0.698    0.019
## 3 Baseline model      0.753  0.009   0.737   0.776      0.568    0.035

11 · SHAP feature name map

feature_name_map <- c(
  "bmi"                              = "BMI",
  "mean_fasting_glucose"             = "Mean Fasting Glucose",
  "sd_fasting_glucose"               = "SD Fasting Glucose",
  "excursions_per_day"               = "Glucose Excursions/Day",
  "rec_time_50pct_median"            = "Recovery Time (50th pct)",
  "overall_max_glucose"              = "Overall Max Glucose",
  "age"                              = "Age",
  "whr"                              = "WHR",
  "stress_overall_daily_mean_stress" = "Mean Stress (HRV-based)",
  "sd_hr_day_night_diff"             = "HR Day-Night Variability",
  "mean_midpoint_clock_h"            = "Sleep Midpoint",
  "sri"                              = "Sleep Regularity Index",
  "social_jetlag_h"                  = "Social Jetlag"
)

rename_features <- function(x)
  ifelse(x %in% names(feature_name_map), feature_name_map[x], x)

12 · SHAP helper functions

aggregate_shap <- function(shap_list, model_filter) {
  entries <- Filter(function(x) x$model_name == model_filter, shap_list)
  mat     <- do.call(rbind, lapply(entries, function(e)
    colMeans(abs(as.data.frame(e$shap)))))
  data.frame(feature   = colnames(mat),
             mean_shap = colMeans(mat),
             sd_shap   = apply(mat, 2, sd),
             row.names = NULL) %>%
    arrange(desc(mean_shap)) %>%
    mutate(pct_share = mean_shap / sum(mean_shap) * 100,
           feature   = rename_features(feature))
}

collect_shap_long <- function(shap_list, model_filter) {
  entries <- Filter(function(x) x$model_name == model_filter, shap_list)
  bind_rows(lapply(entries, function(e) {
    sd <- as.data.frame(e$shap) %>%
      mutate(obs = row_number(), iteration = e$iteration) %>%
      pivot_longer(-c(obs,iteration), names_to="feature", values_to="shap_value") %>%
      mutate(feature = rename_features(feature))
    fd <- as.data.frame(e$X_df) %>%
      mutate(obs = row_number()) %>%
      pivot_longer(-obs, names_to="feature", values_to="feature_value") %>%
      mutate(feature = rename_features(feature))
    left_join(sd, fd, by = c("obs","feature"))
  }))
}

rank_stability_df <- function(shap_list, model_filter) {
  entries  <- Filter(function(x) x$model_name == model_filter, shap_list)
  mat      <- do.call(rbind, lapply(entries, function(e)
    colMeans(abs(as.data.frame(e$shap)))))
  rank_mat <- t(apply(mat, 1, rank))
  data.frame(feature   = colnames(rank_mat),
             mean_rank = colMeans(rank_mat),
             iqr_rank  = apply(rank_mat, 2, IQR),
             row.names = NULL) %>%
    arrange(mean_rank) %>%
    mutate(feature = rename_features(feature))
}

panel_beeswarm <- function(shap_list, model_filter,
                           shared_x_range = NULL, is_external = FALSE) {
  agg  <- aggregate_shap(shap_list, model_filter)
  long <- collect_shap_long(shap_list, model_filter)
  lmap <- agg %>%
    mutate(label = paste0(feature," (",round(pct_share,1),"%)")) %>%
    dplyr::select(feature, label)
  lev  <- rev(agg$feature)
  long <- long %>%
    left_join(lmap, by = "feature") %>%
    group_by(feature) %>%
    mutate(feat_scaled = (feature_value - min(feature_value, na.rm=TRUE)) /
             (max(feature_value, na.rm=TRUE) - min(feature_value, na.rm=TRUE) + 1e-9)) %>%
    ungroup() %>%
    mutate(label = factor(label, levels = lmap$label[match(lev, lmap$feature)]))
  p <- ggplot(long, aes(x=shap_value, y=label, colour=feat_scaled)) +
    geom_jitter(height=0.18, alpha=ifelse(is_external,0.9,0.5), size=1.5, stroke=0) +
    scale_colour_gradient(low="#3B82C4", high="#E84545", name="Feature\nvalue",
                          breaks=c(0,1), labels=c("Low","High")) +
    geom_vline(xintercept=0, linetype="dashed", colour="grey50", linewidth=0.4) +
    labs(title=model_filter, x="SHAP value", y=NULL) +
    theme_bw(base_size=11) +
    theme(plot.title=element_text(face="bold",hjust=0.5,size=11),
          legend.position="none", panel.grid.minor=element_blank(),
          axis.text.y=element_text(size=9))
  if (!is.null(shared_x_range)) p <- p + coord_cartesian(xlim=shared_x_range)
  p
}

panel_bar <- function(shap_list, model_filter, shared_x_max = NULL) {
  agg <- aggregate_shap(shap_list, model_filter) %>%
    mutate(label   = paste0(feature," (",round(pct_share,1),"%)"),
           label   = factor(label, levels=rev(paste0(feature," (",round(pct_share,1),"%)"))),
           ymin_eb = pmax(mean_shap-sd_shap,0), ymax_eb = mean_shap+sd_shap)
  p <- ggplot(agg, aes(x=mean_shap, y=label)) +
    geom_col(fill="#4C72B0", alpha=0.85) +
    geom_errorbarh(aes(xmin=ymin_eb, xmax=ymax_eb), height=0.3, colour="grey30") +
    labs(title=model_filter, x="Mean |SHAP| value", y=NULL) +
    theme_bw(base_size=11) +
    theme(plot.title=element_text(face="bold",hjust=0.5,size=11),
          panel.grid.minor=element_blank(), axis.text.y=element_text(size=9))
  if (!is.null(shared_x_max)) p <- p + coord_cartesian(xlim=c(0,shared_x_max))
  p
}

panel_rank <- function(shap_list, model_filter, shared_x_range = NULL) {
  rs <- rank_stability_df(shap_list, model_filter) %>%
    mutate(label = factor(feature, levels=rev(feature)))
  p <- ggplot(rs, aes(x=mean_rank, y=label)) +
    geom_point(size=3, colour="#2ca02c") +
    geom_errorbarh(aes(xmin=mean_rank-iqr_rank/2, xmax=mean_rank+iqr_rank/2),
                   height=0.3, colour="grey40") +
    scale_x_reverse() +
    labs(title=model_filter, x="Mean rank (lower=more important)", y=NULL) +
    theme_bw(base_size=11) +
    theme(plot.title=element_text(face="bold",hjust=0.5,size=11),
          panel.grid.minor=element_blank(), axis.text.y=element_text(size=9))
  if (!is.null(shared_x_range)) p <- p + coord_cartesian(xlim=shared_x_range)
  p
}

make_shap_figure <- function(shap_list, dataset_label, model_names_vec,
                              plot_type = c("beeswarm","bar","rank"),
                              panel_labels = LETTERS[seq_along(model_names_vec)]) {
  plot_type <- match.arg(plot_type)
  if (plot_type == "beeswarm") {
    all_vals  <- unlist(lapply(model_names_vec, function(mn) {
      unlist(lapply(Filter(function(x) x$model_name==mn, shap_list),
                   function(e) as.numeric(as.matrix(e$shap))))
    }))
    xmax <- max(abs(all_vals), na.rm=TRUE)*1.05
    srng <- c(-xmax, xmax)
  }
  if (plot_type == "bar") {
    am <- unlist(lapply(model_names_vec, function(mn) {
      a <- aggregate_shap(shap_list, mn); a$mean_shap + a$sd_shap
    }))
    sxmax <- max(am, na.rm=TRUE)*1.05
  }
  if (plot_type == "rank") {
    nf   <- max(unlist(lapply(model_names_vec, function(mn)
      nrow(rank_stability_df(shap_list, mn)))))
    srng <- c(nf+0.5, 0.5)
  }
  panels <- lapply(seq_along(model_names_vec), function(j) {
    mn <- model_names_vec[rev(seq_along(model_names_vec))[j]]
    p  <- switch(plot_type,
      beeswarm = panel_beeswarm(shap_list, mn, shared_x_range=srng),
      bar      = panel_bar(shap_list, mn, shared_x_max=sxmax),
      rank     = panel_rank(shap_list, mn, shared_x_range=srng))
    p + labs(tag=panel_labels[j]) +
      theme(plot.tag=element_text(face="bold",size=13))
  })
  subtitle <- switch(plot_type,
    beeswarm = "Each point = one observation x one bootstrap iteration. Labels show mean |SHAP| share (%).",
    bar      = "Error bars = +/-1 SD across bootstrap iterations. Labels show mean |SHAP| share (%).",
    rank     = "Error bars = IQR of rank across bootstrap iterations. Leftward = more important.")
  if (plot_type == "beeswarm") {
    dummy <- ggplot(data.frame(x=0,y=0,v=c(0,1)),aes(x,y,colour=v)) +
      geom_point() +
      scale_colour_gradient(low="#3B82C4",high="#E84545",name="Feature\nvalue",
                            breaks=c(0,1),labels=c("Low","High")) +
      theme(legend.position="right",legend.title=element_text(size=9),
            legend.text=element_text(size=8),legend.key.height=unit(1.2,"cm"))
    leg <- cowplot::get_legend(dummy)
    combined <- (Reduce(`|`, panels) | patchwork::wrap_elements(leg)) +
      plot_layout(widths=c(rep(1,length(panels)),0.15))
  } else {
    combined <- Reduce(`|`, panels)
  }
  combined + plot_annotation(
    title   = paste0(switch(plot_type, beeswarm="SHAP Summary",
                            bar="Mean |SHAP| Feature Importance",
                            rank="Feature Rank Stability"), " — ", dataset_label),
    caption = subtitle,
    theme   = theme(plot.title  =element_text(face="bold",hjust=0.5,size=13),
                    plot.caption=element_text(size=10,colour="grey40",hjust=0.5)))
}

13 · SHAP figures — internal validation

for (pt in c("beeswarm","bar","rank")) {
  fig <- make_shap_figure(all_shap_val, "Internal Validation", model_names, pt)
  print(fig)
  ggsave(sprintf("shap_%s_internal_v3.pdf", pt), fig,
         width=20, height=5.5, device=cairo_pdf)
}


14 · Single-seed models for external SHAP

fit_shap_model <- function(features, df_train, df_ext, seed = 1) {
  set.seed(seed)
  X_tr <- df_train %>% dplyr::select(all_of(features)) %>%
    mutate(across(where(is.numeric), as.double))
  y_tr <- factor(make.names(trimws(df_train$IR_label)))
  imp       <- caret::preProcess(X_tr, method = "medianImpute")
  X_tr_imp  <- predict(imp, X_tr)
  up_data   <- caret::upSample(x=X_tr_imp, y=y_tr, yname=".outcome")
  rf        <- randomForest::randomForest(
    x=up_data[,names(X_tr_imp),drop=FALSE], y=up_data$.outcome,
    ntree=300, mtry=floor(sqrt(length(features))), nodesize=1)
  X_ext_imp <- predict(imp, df_ext %>%
    dplyr::select(all_of(features)) %>%
    mutate(across(where(is.numeric), as.double)))
  shap <- fastshap::explain(
    object=rf, X=as.data.frame(X_ext_imp),
    pred_wrapper=function(object, newdata) predict(object, newdata, type="prob")[,"IR"],
    nsim=50, adjust=TRUE)
  list(shap=shap, X_df=as.data.frame(X_ext_imp))
}

cat("Fitting single-seed models for external SHAP (seed=1)...\n")
## Fitting single-seed models for external SHAP (seed=1)...
sr_CGM  <- fit_shap_model(feature_sets$CGM_model,            df_test, df_external)
sr_SW   <- fit_shap_model(feature_sets$Smartwatch_only_model, df_test, df_external)
sr_Base <- fit_shap_model(feature_sets$Baseline_model,        df_test, df_external)

all_shap_ext_final <- list(
  list(iteration=1, model_name="Model CGM",        shap=sr_CGM$shap,  X_df=sr_CGM$X_df),
  list(iteration=1, model_name="Model smartwatch", shap=sr_SW$shap,   X_df=sr_SW$X_df),
  list(iteration=1, model_name="Model baseline",   shap=sr_Base$shap, X_df=sr_Base$X_df)
)

15 · SHAP figures — external test set

make_shap_ext_figure <- function(shap_list, model_names_vec,
                                  panel_labels=LETTERS[seq_along(model_names_vec)]) {
  all_vals <- unlist(lapply(model_names_vec, function(mn)
    unlist(lapply(Filter(function(x) x$model_name==mn, shap_list),
                 function(e) as.numeric(as.matrix(e$shap))))))
  xmax <- max(abs(all_vals),na.rm=TRUE)*1.05; srng <- c(-xmax,xmax)
  panels <- lapply(seq_along(model_names_vec), function(j) {
    panel_beeswarm(shap_list,model_names_vec[j],shared_x_range=srng,is_external=TRUE) +
      labs(tag=panel_labels[j]) +
      theme(plot.tag=element_text(face="bold",size=13))
  })
  dummy <- ggplot(data.frame(x=0,y=0,v=c(0,1)),aes(x,y,colour=v)) +
    geom_point() +
    scale_colour_gradient(low="#3B82C4",high="#E84545",name="Feature\nvalue",
                          breaks=c(0,1),labels=c("Low","High")) +
    theme(legend.position="right",legend.title=element_text(size=9),
          legend.text=element_text(size=8),legend.key.height=unit(1.2,"cm"))
  leg <- cowplot::get_legend(dummy)
  (Reduce(`|`, panels) | patchwork::wrap_elements(leg)) +
    plot_layout(widths=c(rep(1,length(panels)),0.15)) +
    plot_annotation(
      title   = "SHAP Summary — External Test Set",
      caption = "Each point = one external patient. Model trained on all internal data (seed=1). Labels show |SHAP| share (%).",
      theme   = theme(plot.title  =element_text(face="bold",hjust=0.5,size=13),
                      plot.caption=element_text(size=10,colour="grey40",hjust=0.5)))
}

fig_ext <- make_shap_ext_figure(all_shap_ext_final, model_names)
print(fig_ext)

ggsave("shap_beeswarm_external_v3.pdf", fig_ext, width=20, height=5.5, device=cairo_pdf)

16 · Combined summary figure

p_combined <- (p_dca  + labs(title="A  Decision curve analysis")) |
              (p_seed_auc   + labs(title="B  AUC-ROC stability"))  |
              (p_seed_auprc + labs(title="C  AUPRC stability"))    |
              (p_seed_auprg + labs(title="D  AUPRG stability"))

print(p_combined)

ggsave("summary_combined_v3.pdf", p_combined, width=20, height=5, device=cairo_pdf)
## Warning in device(filename = "summary_combined_v3.pdf", width = 20, height = 5,
## : failed to load cairo DLL

17 · Session info

sessionInfo()
## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sequoia 15.5
## 
## Matrix products: default
## BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Europe/Zurich
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] cowplot_1.2.0        patchwork_1.3.2      tidyr_1.3.2         
##  [4] prg_0.5.1            PRROC_1.4            rlang_1.2.0         
##  [7] pROC_1.19.0.1        caret_7.0-1          lattice_0.22-7      
## [10] ggplot2_4.0.1        dplyr_1.2.0          fastshap_0.1.1      
## [13] randomForest_4.7-1.2
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1     timeDate_4052.112    farver_2.1.2        
##  [4] S7_0.2.1             fastmap_1.2.0        digest_0.6.39       
##  [7] rpart_4.1.24         timechange_0.3.0     lifecycle_1.0.5     
## [10] survival_3.8-3       magrittr_2.0.4       compiler_4.5.2      
## [13] sass_0.4.10          tools_4.5.2          utf8_1.2.6          
## [16] yaml_2.3.12          data.table_1.18.0    knitr_1.51          
## [19] labeling_0.4.3       plyr_1.8.9           RColorBrewer_1.1-3  
## [22] withr_3.0.2          purrr_1.2.0          nnet_7.3-20         
## [25] grid_4.5.2           stats4_4.5.2         e1071_1.7-17        
## [28] future_1.69.0        globals_0.19.1       scales_1.4.0        
## [31] iterators_1.0.14     MASS_7.3-65          cli_3.6.6           
## [34] rmarkdown_2.30       generics_0.1.4       otel_0.2.0          
## [37] rstudioapi_0.17.1    future.apply_1.20.2  reshape2_1.4.5      
## [40] cachem_1.1.0         proxy_0.4-29         stringr_1.6.0       
## [43] splines_4.5.2        parallel_4.5.2       vctrs_0.7.2         
## [46] hardhat_1.4.2        Matrix_1.7-4         jsonlite_2.0.0      
## [49] listenv_0.10.1       foreach_1.5.2        gower_1.0.2         
## [52] jquerylib_0.1.4      recipes_1.3.1        glue_1.8.0          
## [55] parallelly_1.46.1    codetools_0.2-20     lubridate_1.9.4     
## [58] stringi_1.8.7        gtable_0.3.6         tibble_3.3.0        
## [61] pillar_1.11.1        htmltools_0.5.9      ipred_0.9-15        
## [64] lava_1.8.2           R6_2.6.1             evaluate_1.0.5      
## [67] bslib_0.9.0          class_7.3-23         Rcpp_1.1.1          
## [70] nlme_3.1-168         prodlim_2026.03.11   xfun_0.55           
## [73] pkgconfig_2.0.3      ModelMetrics_1.2.2.2