library(randomForest)
library(fastshap)
library(dplyr)
library(caret)
library(pROC)
library(PRROC)
library(prg)
library(tidyr)
library(patchwork)
library(cowplot)
library(ggplot2)

knitr::opts_chunk$set(message = FALSE, warning = FALSE, fig.align = "center", out.width = "100%")

1 Data Loading

Two AI-READI cohorts: Study 1 (internal, n=97) and Study 2 (external, n=61).

df_test     <- read.csv("~/Downloads/read-ai/df_test_f_v2.csv")
df_external <- read.csv("~/Downloads/read-ai/df_external_f_v2.csv")

cat(sprintf("Study 1: n=%d, IR=%d (%.1f%%)\n", 
            nrow(df_test), sum(df_test$IR_label=="IR"), 
            mean(df_test$IR_label=="IR")*100))
## Study 1: n=97, IR=32 (33.0%)
cat(sprintf("Study 2: n=%d, IR=%d (%.1f%%)\n", 
            nrow(df_external), sum(df_external$IR_label=="IR"), 
            mean(df_external$IR_label=="IR")*100))
## Study 2: n=61, IR=19 (31.1%)

2 Feature Sets

Three modality-specific feature sets with fixed mtry = floor(sqrt(p)).

feature_sets <- list(
  CGM_model = c("bmi", "mean_fasting_glucose", "sd_fasting_glucose",
                "excursions_per_day", "rec_time_50pct_median",
                "overall_max_glucose", "age", "whr"),
  Smartwatch_model = c("stress_overall_daily_mean_stress", "sd_hr_day_night_diff",
                       "age", "bmi", "whr"),
  Baseline_model = c("bmi", "age", "whr")
)

for (nm in names(feature_sets)) {
  p <- length(feature_sets[[nm]])
  cat(sprintf("%-20s: p=%d, mtry=%d\n", nm, p, floor(sqrt(p))))
}
## CGM_model           : p=8, mtry=2
## Smartwatch_model    : p=5, mtry=2
## Baseline_model      : p=3, mtry=1

3 Model Fitting Functions

Pipeline: median imputation → upsampling → random forest (200 trees).

fit_rf_seed <- function(features, df_train, seed) {
  set.seed(seed)
  p    <- length(features)
  mtry <- floor(sqrt(p))
  
  X_tr <- df_train %>%
    dplyr::select(all_of(features)) %>%
    mutate(across(where(is.numeric), as.double))
  y_tr <- factor(make.names(trimws(df_train$IR_label)))
  
  imp      <- caret::preProcess(X_tr, method = "medianImpute")
  X_tr_imp <- predict(imp, X_tr)
  up_data  <- caret::upSample(x = X_tr_imp, y = y_tr, yname = ".outcome")
  
  rf <- randomForest::randomForest(
    x = up_data[, names(X_tr_imp), drop = FALSE],
    y = up_data$.outcome,
    ntree = 200, mtry = mtry, nodesize = 1
  )
  
  list(rf = rf, imp = imp, features = features)
}

predict_rf <- function(fit_obj, df_new) {
  X_new <- df_new %>%
    dplyr::select(all_of(fit_obj$features)) %>%
    mutate(across(where(is.numeric), as.double))
  X_imp <- predict(fit_obj$imp, X_new)
  predict(fit_obj$rf, X_imp, type = "prob")[, "IR"]
}

compute_auprg <- function(labels, scores) {
  tryCatch(prg::calc_auprg(prg::create_prg_curve(labels, scores)), 
           error = function(e) NA_real_)
}

4 Internal Validation (5 Bootstrap Iterations)

Five stratified 70/30 splits on Study 1 for this debug/review copy.

set.seed(123)
# Original full-run code:
# n_iter <- 200
# added: keep this review copy fast while preserving the same workflow.
n_iter <- 5
bootstrap_seeds <- sample.int(99999L, n_iter)

all_results <- list()
all_shap_val <- list()

for (i in seq_len(n_iter)) {
  y <- factor(trimws(df_test$IR_label))
  levels(y) <- make.names(levels(y))
  train_idx <- createDataPartition(y, p = 0.7, list = FALSE)
  
  test_set  <- df_test[-train_idx, ]
  train_set <- df_test[ train_idx, ]
  y_test    <- factor(make.names(trimws(test_set$IR_label)))
  
  iter_seed <- i * 7L
  
  CGM_fit  <- fit_rf_seed(feature_sets$CGM_model,        train_set, iter_seed)
  SW_fit   <- fit_rf_seed(feature_sets$Smartwatch_model, train_set, iter_seed)
  Base_fit <- fit_rf_seed(feature_sets$Baseline_model,   train_set, iter_seed)
  fits     <- list(CGM_fit, SW_fit, Base_fit)
  
  model_names <- c("CGM model", "Smartwatch model", "Baseline model")
  
  iter_rows <- lapply(seq_along(fits), function(m) {
    fit  <- fits[[m]]
    prob <- predict_rf(fit, test_set)
    pred <- factor(ifelse(prob >= 0.5, "IR", "Non.IR"), levels = c("Non.IR","IR"))
    if (length(unique(as.integer(y_test))) < 2) return(NULL)
    
    cm   <- confusionMatrix(pred, y_test)
    sens <- as.numeric(cm$byClass["Sensitivity"])
    spec <- as.numeric(cm$byClass["Specificity"])
    prec <- as.numeric(cm$byClass["Pos Pred Value"])
    
    data.frame(
      Model = model_names[m],
      Accuracy = as.numeric(cm$overall["Accuracy"]),
      Sensitivity = sens,
      Specificity = spec,
      Precision = prec,
      F1 = ifelse(is.na(prec)|prec+sens==0, NA_real_, 2*prec*sens/(prec+sens)),
      AUC = as.numeric(auc(roc(y_test, prob, levels=c("Non.IR","IR"), quiet=TRUE))),
      PRAUC = pr.curve(scores.class0=prob, weights.class0=ifelse(y_test=="IR",1,0), 
                       curve=FALSE)$auc.integral,
      AUPRG = compute_auprg(as.integer(y_test=="IR"), prob),
      iteration = i
    )
  })
  
  all_results[[i]] <- dplyr::bind_rows(iter_rows)
  
  # SHAP values
  for (m in seq_along(fits)) {
    fit <- fits[[m]]
    X_test_raw <- test_set %>%
      dplyr::select(all_of(fit$features)) %>%
      mutate(across(where(is.numeric), as.double))
    X_test_imp <- predict(fit$imp, X_test_raw)
    
    shap_s <- tryCatch(
      fastshap::explain(
        object = fit$rf,
        X = as.data.frame(X_test_imp),
        pred_wrapper = function(object, newdata) 
          predict(object, newdata, type="prob")[,"IR"],
        nsim = 50, adjust = TRUE
      ),
      error = function(e) NULL
    )
    
    if (!is.null(shap_s))
      all_shap_val[[length(all_shap_val)+1]] <- list(
        iteration = i,
        model_name = model_names[m],
        shap = shap_s,
        X_df = as.data.frame(X_test_imp)
      )
  }
}

bootstrap_results <- dplyr::bind_rows(all_results)

4.1 Internal Performance Summary

final_summary <- bootstrap_results %>%
  group_by(Model) %>%
  summarise(
    across(c(AUC, PRAUC, AUPRG),
           list(mean = ~mean(.x, na.rm=TRUE),
                lo = ~quantile(.x, 0.025, na.rm=TRUE),
                hi = ~quantile(.x, 0.975, na.rm=TRUE)),
           .names = "{.col}_{.fn}"),
    .groups = "drop"
  )

final_summary %>%
  select(Model, starts_with("AUC"), starts_with("PRAUC"), starts_with("AUPRG")) %>%
  mutate(across(where(is.numeric), ~round(.x, 3)))

5 External Validation (5 Bootstrap Iterations)

Five models trained on entire Study 1, evaluated on Study 2 bootstrap resamples.

evaluate_external_bootstrap <- function(features, df_train, df_ext, seeds_vec, 
                                        model_label = NULL, nsim_shap = 50) {
  
  y_ext_fac <- factor(make.names(trimws(df_ext$IR_label)))
  y_ext_bin <- as.integer(y_ext_fac == "IR")
  n_ext     <- length(y_ext_bin)
  pos_idx   <- which(y_ext_bin == 1)
  neg_idx   <- which(y_ext_bin == 0)
  n_iter    <- length(seeds_vec)
  
  iter_aucs   <- numeric(n_iter)
  iter_auprcs <- numeric(n_iter)
  iter_auprgs <- numeric(n_iter)
  
  all_probs_full <- matrix(NA_real_, nrow = n_ext, ncol = n_iter)
  shap_matrices  <- vector("list", n_iter)
  X_ext_ref      <- NULL
  
  for (i in seq_len(n_iter)) {
    seed <- seeds_vec[i]
    fit  <- fit_rf_seed(features, df_train, seed = seed)
    
    X_ext_raw <- df_ext %>%
      dplyr::select(all_of(features)) %>%
      mutate(across(where(is.numeric), as.double))
    X_ext_imp <- predict(fit$imp, X_ext_raw)
    if (i == 1) X_ext_ref <- as.data.frame(X_ext_imp)
    
    prob_full <- predict(fit$rf, X_ext_imp, type="prob")[,"IR"]
    all_probs_full[,i] <- prob_full
    
    set.seed(seed)
    boot_idx  <- c(sample(pos_idx, length(pos_idx), replace=TRUE),
                   sample(neg_idx, length(neg_idx), replace=TRUE))
    y_boot    <- y_ext_bin[boot_idx]
    prob_boot <- prob_full[boot_idx]
    
    if (length(unique(y_boot)) < 2) {
      iter_aucs[i] <- iter_auprcs[i] <- iter_auprgs[i] <- NA_real_
    } else {
      iter_aucs[i] <- as.numeric(
        auc(roc(y_boot, prob_boot, levels=c(0,1), direction="<", quiet=TRUE))
      )
      iter_auprcs[i] <- tryCatch(
        pr.curve(scores.class0=prob_boot, weights.class0=y_boot, 
                 curve=FALSE)$auc.integral,
        error = function(e) NA_real_
      )
      iter_auprgs[i] <- compute_auprg(y_boot, prob_boot)
    }
    
    shap_matrices[[i]] <- tryCatch(
      as.matrix(fastshap::explain(
        object = fit$rf,
        X = as.data.frame(X_ext_imp),
        pred_wrapper = function(object, newdata) 
          predict(object, newdata, type="prob")[,"IR"],
        nsim = nsim_shap, adjust = TRUE
      )),
      error = function(e) NULL
    )
  }
  
  ens      <- rowMeans(all_probs_full, na.rm=TRUE)
  ens_pred <- factor(ifelse(ens >= 0.5, "IR", "Non.IR"), levels=c("Non.IR","IR"))
  
  ens_auc <- as.numeric(
    auc(roc(y_ext_fac, ens, levels=c("Non.IR","IR"), quiet=TRUE))
  )
  ens_auprc <- pr.curve(scores.class0=ens, weights.class0=y_ext_bin, 
                        curve=FALSE)$auc.integral
  ens_auprg <- compute_auprg(y_ext_bin, ens)
  
  valid_shap <- Filter(Negate(is.null), shap_matrices)
  ensemble_shap <- if (length(valid_shap) > 0)
    as.data.frame(Reduce("+", valid_shap) / length(valid_shap))
  else NULL
  
  shap_list_per_iter <- lapply(seq_len(n_iter), function(i) {
    if (is.null(shap_matrices[[i]])) return(NULL)
    list(iteration = i, model_name = model_label,
         shap = as.data.frame(shap_matrices[[i]]), X_df = X_ext_ref)
  })
  shap_list_per_iter <- Filter(Negate(is.null), shap_list_per_iter)
  
  list(
    auc_mean = round(mean(iter_aucs, na.rm=TRUE), 3),
    auc_lo = round(quantile(iter_aucs, 0.025, na.rm=TRUE), 3),
    auc_hi = round(quantile(iter_aucs, 0.975, na.rm=TRUE), 3),
    auprc_mean = round(mean(iter_auprcs, na.rm=TRUE), 3),
    auprc_lo = round(quantile(iter_auprcs, 0.025, na.rm=TRUE), 3),
    auprc_hi = round(quantile(iter_auprcs, 0.975, na.rm=TRUE), 3),
    auprg_mean = round(mean(iter_auprgs, na.rm=TRUE), 3),
    auprg_lo = round(quantile(iter_auprgs, 0.025, na.rm=TRUE), 3),
    auprg_hi = round(quantile(iter_auprgs, 0.975, na.rm=TRUE), 3),
    iter_aucs = iter_aucs,
    iter_auprcs = iter_auprcs,
    iter_auprgs = iter_auprgs,
    all_probs_matrix = all_probs_full,
    ens_auc = round(ens_auc, 3),
    ens_auprc = round(ens_auprc, 3),
    ens_auprg = round(ens_auprg, 3),
    ensemble_prob = ens,
    y_ext = y_ext_fac,
    cm = confusionMatrix(ens_pred, y_ext_fac),
    ensemble_shap = ensemble_shap,
    shap_list_per_iter = shap_list_per_iter,
    X_ext_ref = X_ext_ref,
    model_label = model_label
  )
}

ext_CGM  <- evaluate_external_bootstrap(
  feature_sets$CGM_model, df_test, df_external, bootstrap_seeds, "CGM model")
ext_SW   <- evaluate_external_bootstrap(
  feature_sets$Smartwatch_model, df_test, df_external, bootstrap_seeds, "Smartwatch model")
ext_Base <- evaluate_external_bootstrap(
  feature_sets$Baseline_model, df_test, df_external, bootstrap_seeds, "Baseline model")

ext_list <- list("CGM model" = ext_CGM, "Smartwatch model" = ext_SW, 
                 "Baseline model" = ext_Base)

5.1 External Performance Summary

ext_summary <- dplyr::bind_rows(lapply(names(ext_list), function(nm) {
  r <- ext_list[[nm]]
  data.frame(
    Model = nm,
    AUC_mean = r$auc_mean, AUC_lo = r$auc_lo, AUC_hi = r$auc_hi,
    AUPRC_mean = r$auprc_mean, AUPRC_lo = r$auprc_lo, AUPRC_hi = r$auprc_hi,
    AUPRG_mean = r$auprg_mean, AUPRG_lo = r$auprg_lo, AUPRG_hi = r$auprg_hi
  )
}))

ext_summary

6 SHAP Feature Importance

feature_name_map <- c(
  "bmi" = "BMI",
  "mean_fasting_glucose" = "Mean Fasting Glucose",
  "sd_fasting_glucose" = "SD Fasting Glucose",
  "excursions_per_day" = "Glucose Excursions / Day",
  "rec_time_50pct_median" = "Recovery Time (50th pct)",
  "overall_max_glucose" = "Overall Max Glucose",
  "age" = "Age",
  "whr" = "WHR",
  "stress_overall_daily_mean_stress" = "Mean Stress (HRV)",
  "sd_hr_day_night_diff" = "HR Day-Night Variability"
)

rename_features <- function(x) 
  ifelse(x %in% names(feature_name_map), feature_name_map[x], x)

aggregate_shap <- function(shap_list, model_filter) {
  entries <- Filter(function(x) x$model_name == model_filter, shap_list)
  mat <- do.call(rbind, lapply(entries, 
           function(e) colMeans(abs(as.data.frame(e$shap)))))
  data.frame(feature = colnames(mat), mean_shap = colMeans(mat),
             sd_shap = apply(mat, 2, sd), row.names = NULL) %>%
    arrange(desc(mean_shap)) %>%
    mutate(pct_share = mean_shap / sum(mean_shap) * 100,
           feature = rename_features(feature))
}

collect_shap_long <- function(shap_list, model_filter) {
  entries <- Filter(function(x) x$model_name == model_filter, shap_list)
  bind_rows(lapply(entries, function(e) {
    sd <- as.data.frame(e$shap) %>%
      mutate(obs = row_number(), iteration = e$iteration) %>%
      pivot_longer(-c(obs, iteration), names_to = "feature", values_to = "shap_value") %>%
      mutate(feature = rename_features(feature))
    fd <- as.data.frame(e$X_df) %>%
      mutate(obs = row_number()) %>%
      pivot_longer(-obs, names_to = "feature", values_to = "feature_value") %>%
      mutate(feature = rename_features(feature))
    left_join(sd, fd, by = c("obs", "feature"))
  }))
}

panel_beeswarm <- function(shap_list, model_filter, shared_x_range = NULL, 
                          is_external = FALSE, base_size = 14) {
  agg  <- aggregate_shap(shap_list, model_filter)
  long <- collect_shap_long(shap_list, model_filter)
  lmap <- agg %>%
    mutate(label = paste0(feature, " (", round(pct_share, 1), "%)")) %>%
    dplyr::select(feature, label)
  long <- long %>%
    left_join(lmap, by = "feature") %>%
    group_by(feature) %>%
    mutate(feat_scaled = (feature_value - min(feature_value, na.rm=TRUE)) /
             (max(feature_value, na.rm=TRUE) - min(feature_value, na.rm=TRUE) + 1e-9)) %>%
    ungroup() %>%
    mutate(label = factor(label, 
             levels = lmap$label[match(rev(agg$feature), lmap$feature)]))
  
  title_clean <- gsub("^Model ", "", model_filter)
  title_clean <- paste0(toupper(substring(title_clean, 1, 1)), 
                        substring(title_clean, 2))
  
  p <- ggplot(long, aes(x = shap_value, y = label, colour = feat_scaled)) +
    geom_jitter(height = 0.20, 
                alpha = ifelse(is_external, 0.80, 0.35),
                size = ifelse(is_external, 0.9, 0.6), stroke = 0) +
    scale_colour_gradient(low = "#3B82C4", high = "#E84545",
                          name = "Feature\nvalue", breaks = c(0,1), 
                          labels = c("Low","High")) +
    geom_vline(xintercept = 0, linetype = "dashed", 
               colour = "grey60", linewidth = 0.4) +
    labs(title = title_clean, x = "SHAP value", y = NULL) +
    scale_y_discrete(expand = expansion(add = 0.5)) +
    theme_bw(base_size = base_size) +
    theme(
      plot.title = element_text(face = "bold", hjust = 0.5, size = base_size),
      axis.text.y = element_text(size = base_size-2, face = "bold"),
      axis.text.x = element_text(size = base_size-4),
      axis.title.x = element_text(size = base_size-3),
      legend.position = "none",
      panel.grid.minor = element_blank()
    )
  
  if (!is.null(shared_x_range))
    p <- p + coord_cartesian(xlim = shared_x_range)
  
  p
}

6.1 SHAP Beeswarm — Internal

model_names <- c("CGM model", "Smartwatch model", "Baseline model")

all_vals <- unlist(lapply(model_names, function(mn)
  unlist(lapply(Filter(function(x) x$model_name == mn, all_shap_val),
               function(e) as.numeric(as.matrix(e$shap))))))
srng <- c(-max(abs(all_vals), na.rm=TRUE)*1.05, max(abs(all_vals), na.rm=TRUE)*1.05)

panels <- lapply(seq_along(model_names), function(j) {
  panel_beeswarm(all_shap_val, model_names[j], shared_x_range = srng) +
    labs(tag = LETTERS[j]) +
    theme(plot.tag = element_text(face = "bold", size = 15))
})

dummy <- ggplot(data.frame(x=0, y=0, v=c(0,1)), aes(x, y, colour=v)) +
  geom_point() +
  scale_colour_gradient(low = "#3B82C4", high = "#E84545",
                        name = "Feature\nvalue", breaks = c(0,1), 
                        labels = c("Low","High")) +
  theme(legend.position = "right",
        legend.title = element_text(size = 10),
        legend.text = element_text(size = 9),
        legend.key.height = unit(1.2, "cm"))
leg <- cowplot::get_legend(dummy)

(Reduce(`|`, panels) | patchwork::wrap_elements(leg)) +
  plot_layout(widths = c(rep(1, length(panels)), 0.15)) +
  plot_annotation(
    title = "SHAP Beeswarm — Internal Validation",
    theme = theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 14)))

6.2 SHAP Beeswarm — External

all_shap_ext <- list(
  list(iteration = 1, model_name = "CGM model",
       shap = ext_CGM$ensemble_shap, X_df = ext_CGM$X_ext_ref),
  list(iteration = 1, model_name = "Smartwatch model",
       shap = ext_SW$ensemble_shap, X_df = ext_SW$X_ext_ref),
  list(iteration = 1, model_name = "Baseline model",
       shap = ext_Base$ensemble_shap, X_df = ext_Base$X_ext_ref)
)

all_vals_ext <- unlist(lapply(model_names, function(mn)
  unlist(lapply(Filter(function(x) x$model_name == mn, all_shap_ext),
               function(e) as.numeric(as.matrix(e$shap))))))
srng_ext <- c(-max(abs(all_vals_ext), na.rm=TRUE)*1.05, 
              max(abs(all_vals_ext), na.rm=TRUE)*1.05)

panels_ext <- lapply(seq_along(model_names), function(j) {
  panel_beeswarm(all_shap_ext, model_names[j], shared_x_range = srng_ext, 
                is_external = TRUE) +
    labs(tag = LETTERS[j]) +
    theme(plot.tag = element_text(face = "bold", size = 15))
})

(Reduce(`|`, panels_ext) | patchwork::wrap_elements(leg)) +
  plot_layout(widths = c(rep(1, length(panels_ext)), 0.15)) +
  plot_annotation(
    title = "SHAP Beeswarm — External Validation",
    theme = theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 14)))

7 Final Publication Figure: ROC + Decision Curve

# ==============================================================================
# PANEL A: ROC CURVE WITH MONOTONIC 95% CI
# ==============================================================================

y_ext_fac <- ext_CGM$y_ext
y_ext_bin <- as.integer(y_ext_fac == "IR")
ir_prev   <- mean(y_ext_bin)

# Function to compute MONOTONIC ROC curve with 95% CI from bootstrap
roc_band_monotonic <- function(prob_matrix, y_fac, n_boot = 200) {
  fpr_grid <- seq(0, 1, by = 0.01)
  n_grid   <- length(fpr_grid)
  tpr_mat  <- matrix(NA_real_, nrow = n_grid, ncol = n_boot)
  
  for (i in seq_len(n_boot)) {
    prob_i <- prob_matrix[, i]
    if (all(is.na(prob_i))) next
    
    r <- tryCatch(
      pROC::roc(y_fac, prob_i, levels = c("Non.IR", "IR"), 
                direction = "<", quiet = TRUE),
      error = function(e) NULL
    )
    if (is.null(r)) next
    
    fpr_i <- 1 - r$specificities
    tpr_i <- r$sensitivities
    
    # Sort by FPR (required for proper interpolation)
    ord   <- order(fpr_i)
    fpr_i <- fpr_i[ord]
    tpr_i <- tpr_i[ord]
    
    # Interpolate onto common grid
    tpr_interp <- approx(fpr_i, tpr_i, xout = fpr_grid, 
                         method = "linear", rule = 2)$y
    
    # ENFORCE MONOTONICITY
    tpr_interp <- cummax(tpr_interp)
    tpr_mat[, i] <- tpr_interp
  }
  
  tpr_mean <- rowMeans(tpr_mat, na.rm = TRUE)
  tpr_lo   <- apply(tpr_mat, 1, quantile, probs = 0.025, na.rm = TRUE)
  tpr_hi   <- apply(tpr_mat, 1, quantile, probs = 0.975, na.rm = TRUE)
  
  # Final enforcement on mean
  tpr_mean <- cummax(tpr_mean)
  tpr_lo   <- cummax(tpr_lo)
  tpr_hi   <- cummax(tpr_hi)
  
  data.frame(fpr = fpr_grid, tpr = tpr_mean, lo = tpr_lo, hi = tpr_hi)
}

# Original full-run code:
# band_cgm  <- roc_band_monotonic(ext_CGM$all_probs_matrix,  y_ext_fac, 200)
# band_sw   <- roc_band_monotonic(ext_SW$all_probs_matrix,   y_ext_fac, 200)
# band_base <- roc_band_monotonic(ext_Base$all_probs_matrix, y_ext_fac, 200)
# added: use the active iteration count so this 5-iteration copy stays consistent.
band_cgm  <- roc_band_monotonic(ext_CGM$all_probs_matrix,  y_ext_fac, n_iter)
band_sw   <- roc_band_monotonic(ext_SW$all_probs_matrix,   y_ext_fac, n_iter)
band_base <- roc_band_monotonic(ext_Base$all_probs_matrix, y_ext_fac, n_iter)

band_cgm$model  <- "CGM model"
band_sw$model   <- "Smartwatch model"
band_base$model <- "Baseline model"

band_df <- rbind(band_cgm, band_sw, band_base) %>%
  mutate(model = factor(model, levels = c("Baseline model", "Smartwatch model", "CGM model")))

# Original code: plotted ROC from ensemble-averaged probabilities. This did
# not match the shaded CI, which is computed from per-seed ROC curves.
# make_roc_step <- function(prob, y_fac, model_name) {
#   r <- pROC::roc(y_fac, prob, levels = c("Non.IR", "IR"), direction = "<", quiet = TRUE)
#   data.frame(fpr = 1 - r$specificities, tpr = r$sensitivities, model = model_name)
# }
#
# ext_probs <- list(
#   CGM = ext_CGM$ensemble_prob,
#   Smartwatch = ext_SW$ensemble_prob,
#   Baseline = ext_Base$ensemble_prob
# )
#
# step_df <- rbind(
#   make_roc_step(ext_probs$CGM,        y_ext_fac, "CGM model"),
#   make_roc_step(ext_probs$Smartwatch, y_ext_fac, "Smartwatch model"),
#   make_roc_step(ext_probs$Baseline,   y_ext_fac, "Baseline model")
# ) %>% mutate(model = factor(model, levels = c("Baseline model", "Smartwatch model", "CGM model")))
#
# added: plot the pointwise mean ROC from the same per-seed curves that define
# the shaded confidence bands, and sort explicitly before geom_step().
step_df <- band_df %>%
  dplyr::select(fpr, tpr, model) %>%
  dplyr::arrange(model, fpr, tpr)

# AUC stats for legend
compute_auc_stats <- function(prob_matrix, y_bin) {
  aucs <- apply(prob_matrix, 2, function(p) {
    if (all(is.na(p))) return(NA_real_)
    tryCatch({
      roc_obj <- pROC::roc(y_bin, p, levels = c(0, 1), direction = "<", quiet = TRUE)
      as.numeric(pROC::auc(roc_obj))
    }, error = function(e) NA_real_)
  })
  list(mean = mean(aucs, na.rm=TRUE), sd = sd(aucs, na.rm=TRUE),
       lo = quantile(aucs, 0.025, na.rm=TRUE), hi = quantile(aucs, 0.975, na.rm=TRUE))
}

auc_cgm  <- compute_auc_stats(ext_CGM$all_probs_matrix,  y_ext_bin)
auc_sw   <- compute_auc_stats(ext_SW$all_probs_matrix,   y_ext_bin)
auc_base <- compute_auc_stats(ext_Base$all_probs_matrix, y_ext_bin)

legend_labels <- c(
  "Baseline model" = sprintf("Baseline model  AUC = %.3f ±%.3f [%.3f–%.3f]",
                             auc_base$mean, auc_base$sd, auc_base$lo, auc_base$hi),
  "Smartwatch model" = sprintf("Smartwatch model  AUC = %.3f ±%.3f [%.3f–%.3f]",
                               auc_sw$mean, auc_sw$sd, auc_sw$lo, auc_sw$hi),
  "CGM model" = sprintf("CGM model  AUC = %.3f ±%.3f [%.3f–%.3f]",
                        auc_cgm$mean, auc_cgm$sd, auc_cgm$lo, auc_cgm$hi)
)

model_cols <- c("CGM model" = "#1f77b4", "Smartwatch model" = "#2ca02c", 
                "Baseline model" = "#d62728")

p_roc <- ggplot() +
  geom_ribbon(data = band_df, aes(x = fpr, ymin = lo, ymax = hi, fill = model),
              alpha = 0.20, show.legend = FALSE) +
  geom_step(data = step_df, aes(x = fpr, y = tpr, colour = model),
            linewidth = 1.2, direction = "hv") +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", 
              colour = "grey50", linewidth = 0.8) +
  annotate("text", x = 0.70, y = 0.63, label = "Chance",
           angle = 38, colour = "grey50", size = 5.5) +
  # Original code hard-coded the full-run iteration count:
  # annotate("text", x = 0.02, y = 0.96,
  #          label = "Shaded band = 95% CI\nacross 200 bootstraps",
  #          hjust = 0, vjust = 1, size = 5, colour = "grey40") +
  # added: report the actual iteration count used by this script.
  annotate("text", x = 0.02, y = 0.96,
           label = sprintf("Shaded band = 95%% CI\nacross %d bootstraps", n_iter),
           hjust = 0, vjust = 1, size = 5, colour = "grey40") +
  scale_colour_manual(values = model_cols, labels = legend_labels, name = NULL) +
  scale_fill_manual(values = model_cols, guide = "none") +
  scale_x_continuous(breaks = seq(0, 1, 0.2), limits = c(0, 1),
                     expand = expansion(mult = 0.01)) +
  scale_y_continuous(breaks = seq(0, 1, 0.2), limits = c(0, 1),
                     expand = expansion(mult = 0.01)) +
  coord_equal() +
  labs(title = "A", x = "1 − Specificity (FPR)", y = "Sensitivity (TPR)") +
  theme_bw(base_size = 18) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0, size = 28),
    axis.title = element_text(size = 20, face = "bold"),
    axis.text = element_text(size = 18),
    legend.position = c(0.62, 0.22),
    legend.background = element_rect(fill = "white", colour = "grey70", linewidth = 0.5),
    legend.text = element_text(size = 14),
    legend.spacing.y = unit(6, "pt"),
    panel.grid.minor = element_blank()
  )

# ==============================================================================
# PANEL B: DECISION CURVE ANALYSIS
# ==============================================================================

# added: keep ensemble probabilities for the DCA panel only. These were
# previously also used for the ROC line, which created the mismatch above.
ext_probs <- list(
  CGM = ext_CGM$ensemble_prob,
  Smartwatch = ext_SW$ensemble_prob,
  Baseline = ext_Base$ensemble_prob
)

nb_fn <- function(y, prob, t) {
  pred <- as.integer(prob >= t)
  tp   <- sum(pred == 1 & y == 1)
  fp   <- sum(pred == 1 & y == 0)
  tp / length(y) - fp / length(y) * t / (1 - t)
}

thresholds <- seq(0.05, 0.50, by = 0.01)

dca_df <- do.call(rbind, lapply(thresholds, function(t) {
  data.frame(
    threshold = t,
    model = c("CGM", "Smartwatch", "Baseline", "Treat all", "Treat none"),
    net_benefit = c(
      nb_fn(y_ext_bin, ext_probs$CGM, t),
      nb_fn(y_ext_bin, ext_probs$Smartwatch, t),
      nb_fn(y_ext_bin, ext_probs$Baseline, t),
      ir_prev - (1 - ir_prev) * t / (1 - t),
      0
    )
  )
}))

p_dca <- ggplot(dca_df, aes(x = threshold, y = net_benefit, 
                            colour = model, linetype = model)) +
  geom_line(linewidth = 1.2) +
  scale_colour_manual(
    values = c("CGM" = "#1f77b4", "Smartwatch" = "#2ca02c", 
               "Baseline" = "#d62728", "Treat all" = "#888888", 
               "Treat none" = "#bbbbbb"),
    labels = c("CGM" = "CGM model", "Smartwatch" = "Smartwatch model",
               "Baseline" = "Baseline", "Treat all" = "Treat all", 
               "Treat none" = "Treat none")
  ) +
  scale_linetype_manual(
    values = c("CGM" = "solid", "Smartwatch" = "dashed", 
               "Baseline" = "dotdash", "Treat all" = "dotted", 
               "Treat none" = "dotted"),
    labels = c("CGM" = "CGM model", "Smartwatch" = "Smartwatch model",
               "Baseline" = "Baseline", "Treat all" = "Treat all", 
               "Treat none" = "Treat none")
  ) +
  geom_hline(yintercept = 0, colour = "black", linewidth = 0.6) +
  geom_vline(xintercept = ir_prev, colour = "grey40",
             linetype = "dashed", linewidth = 0.8, alpha = 0.4) +
  annotate("text", x = ir_prev + 0.01, y = 0.38,
           label = sprintf("Prevalence\n(%.2f)", ir_prev),
           size = 5.5, colour = "grey30", hjust = 0) +
  coord_cartesian(xlim = c(0.05, 0.50), ylim = c(-0.06, 0.42)) +
  labs(title = "B", x = "Threshold probability", y = "Net benefit",
       colour = NULL, linetype = NULL) +
  theme_bw(base_size = 18) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0, size = 28),
    axis.title = element_text(size = 20, face = "bold"),
    axis.text = element_text(size = 18),
    legend.position = "right",
    legend.text = element_text(size = 14),
    legend.spacing.y = unit(4, "pt"),
    panel.grid.minor = element_blank()
  )

# COMBINE
fig_final <- p_roc | p_dca
print(fig_final)

# Original full-run output filenames:
# ggsave("figure_final_roc_dca.pdf", fig_final, width = 16, height = 7.5, device = cairo_pdf)
# ggsave("figure_final_roc_dca.png", fig_final, width = 16, height = 7.5, dpi = 300)
# added: write separate files so this 5-iteration review run does not overwrite
# the full-run outputs.
ggsave("figure_final_roc_dca_5iter.pdf", fig_final, width = 16, height = 7.5, device = cairo_pdf)
ggsave("figure_final_roc_dca_5iter.png", fig_final, width = 16, height = 7.5, dpi = 300)

All analyses complete. Final figure saved as figure_final_roc_dca_5iter.pdf and .png