library(randomForest) # direct RF fitting for final models (fast, no CV)
library(fastshap)
library(dplyr)
library(caret)
library(pROC)
library(PRROC)
library(prg) # Precision-Recall Gain curves (Flach & Kull 2015)
library(tidyr)
library(patchwork)
library(cowplot)
library(ggplot2)
1 · Data
df_test <- read.csv("~/Downloads/read-ai/df_test_f_v2.csv")
df_external <- read.csv("~/Downloads/read-ai/df_external_f_v2.csv")
df_sleep <- read.csv("~/Downloads/sleep_participant_summary-3.csv")
# Keep only the 3 sleep features selected by forward-selection analysis
sleep_cols <- c("participant_id",
"mean_midpoint_clock_h",
"sri",
"social_jetlag_h")
df_test <- left_join(df_test, df_sleep[, sleep_cols], by = "participant_id")
df_external <- left_join(df_external, df_sleep[, sleep_cols], by = "participant_id")
cat(sprintf(
"Internal : n=%d IR=%d Non-IR=%d sleep_missing=%d\n",
nrow(df_test),
sum(df_test$IR_label == "IR"),
sum(df_test$IR_label == "Non-IR"),
sum(is.na(df_test$mean_midpoint_clock_h))
))
## Internal : n=97 IR=32 Non-IR=65 sleep_missing=2
cat(sprintf(
"External : n=%d IR=%d Non-IR=%d sleep_missing=%d\n",
nrow(df_external),
sum(df_external$IR_label == "IR"),
sum(df_external$IR_label == "Non-IR"),
sum(is.na(df_external$mean_midpoint_clock_h))
))
## External : n=61 IR=19 Non-IR=42 sleep_missing=0
2 · Feature sets & model names
feature_sets <- list(
CGM_model = c(
"bmi", "mean_fasting_glucose", "sd_fasting_glucose",
"excursions_per_day", "rec_time_50pct_median",
"overall_max_glucose", "age", "whr"
),
Smartwatch_only_model = c(
"stress_overall_daily_mean_stress", "sd_hr_day_night_diff",
"age", "bmi", "whr"
),
Baseline_model = c("bmi", "age", "whr")
)
model_names <- c("Model CGM", "Model smartwatch", "Model baseline")
for (nm in names(feature_sets)) {
p <- length(feature_sets[[nm]])
cat(sprintf(" %-25s p=%d mtry=%d\n", nm, p, floor(sqrt(p))))
}
## CGM_model p=8 mtry=2
## Smartwatch_only_model p=5 mtry=2
## Baseline_model p=3 mtry=1
3 · Helper functions
standardize <- function(df, model_name) {
df <- as.data.frame(df); df$Model <- model_name; df
}
# AUPRG — Precision-Recall Gain curve (Flach & Kull, NeurIPS 2015)
compute_auprg <- function(labels, scores) {
tryCatch({
curve <- prg::create_prg_curve(labels, scores)
prg::calc_auprg(curve)
}, error = function(e) NA_real_)
}
compute_metrics <- function(y_true, y_pred, y_prob,
dataset = "Unknown",
n_boot = 2000,
seed = 42) {
cm <- confusionMatrix(y_pred, y_true)
acc <- as.numeric(cm$overall["Accuracy"])
sens <- as.numeric(cm$byClass["Sensitivity"])
spec <- as.numeric(cm$byClass["Specificity"])
balacc <- (sens + spec) / 2
prec <- as.numeric(cm$byClass["Pos Pred Value"])
f1 <- ifelse(is.na(prec) | prec + sens == 0, NA_real_,
2 * prec * sens / (prec + sens))
roc_obj <- roc(y_true, y_prob, levels = c("Non.IR", "IR"), quiet = TRUE)
auc_val <- as.numeric(auc(roc_obj))
pr_obj <- pr.curve(scores.class0 = y_prob,
weights.class0 = ifelse(y_true == "IR", 1, 0),
curve = FALSE)
prauc <- pr_obj$auc.integral
auc_lo <- NA_real_; auc_hi <- NA_real_; auc_method <- "DeLong"
tryCatch({
dl <- as.numeric(ci.auc(roc_obj, method = "delong", conf.level = 0.95))
auc_lo <- dl[1]; auc_hi <- dl[3]
}, error = function(e) { auc_method <<- "Bootstrap" })
set.seed(seed)
n <- length(y_true)
mat <- matrix(NA_real_, nrow = n_boot, ncol = 7,
dimnames = list(NULL,
c("acc","sens","spec","balacc","prec","f1","prauc")))
for (i in seq_len(n_boot)) {
idx_b <- sample(n, n, replace = TRUE)
yb <- y_true[idx_b]; pb_cls <- y_pred[idx_b]; pb_pr <- y_prob[idx_b]
if (length(unique(yb)) < 2) next
cm_b <- confusionMatrix(pb_cls, yb)
s_b <- as.numeric(cm_b$byClass["Sensitivity"])
sp_b <- as.numeric(cm_b$byClass["Specificity"])
pr_b <- as.numeric(cm_b$byClass["Pos Pred Value"])
f1_b <- ifelse(is.na(pr_b) | pr_b + s_b == 0, NA_real_,
2 * pr_b * s_b / (pr_b + s_b))
prauc_b <- tryCatch(
pr.curve(scores.class0 = pb_pr,
weights.class0 = ifelse(yb == "IR", 1, 0),
curve = FALSE)$auc.integral,
error = function(e) NA_real_)
mat[i, ] <- c(as.numeric(cm_b$overall["Accuracy"]),
s_b, sp_b, (s_b + sp_b) / 2, pr_b, f1_b, prauc_b)
}
ci <- apply(mat, 2, quantile, probs = c(0.025, 0.975), na.rm = TRUE)
if (is.na(auc_lo)) {
auc_lo <- ci["2.5%", "auc"]
auc_hi <- ci["97.5%", "auc"]
}
data.frame(
Dataset = dataset,
AUC_ROC = round(auc_val, 3),
AUC_lo = round(auc_lo, 3),
AUC_hi = round(auc_hi, 3),
AUC_CI_method = auc_method,
AUPRC = round(prauc, 3),
AUPRC_lo = round(ci["2.5%", "prauc"], 3),
AUPRC_hi = round(ci["97.5%", "prauc"], 3),
Sensitivity = round(sens, 3),
Sensitivity_lo = round(ci["2.5%", "sens"], 3),
Sensitivity_hi = round(ci["97.5%", "sens"], 3),
Specificity = round(spec, 3),
Specificity_lo = round(ci["2.5%", "spec"], 3),
Specificity_hi = round(ci["97.5%", "spec"], 3),
Balanced_Accuracy = round(balacc, 3),
BalAcc_lo = round(ci["2.5%", "balacc"],3),
BalAcc_hi = round(ci["97.5%", "balacc"],3),
Precision = round(prec, 3),
Precision_lo = round(ci["2.5%", "prec"], 3),
Precision_hi = round(ci["97.5%", "prec"], 3),
F1 = round(f1, 3),
F1_lo = round(ci["2.5%", "f1"], 3),
F1_hi = round(ci["97.5%", "f1"], 3),
Accuracy = round(acc, 3),
Accuracy_lo = round(ci["2.5%", "acc"], 3),
Accuracy_hi = round(ci["97.5%", "acc"], 3),
stringsAsFactors = FALSE
)
}
4 · Bootstrap training function (internal, with SHAP)
train_model_with_shap <- function(features, df, train_idx) {
missing <- setdiff(features, names(df))
if (length(missing) > 0)
stop(paste("Missing columns:", paste(missing, collapse = ", ")))
X <- df %>%
dplyr::select(all_of(features)) %>%
mutate(across(where(is.numeric), as.double))
y <- factor(trimws(df$IR_label))
levels(y) <- make.names(levels(y))
X_train <- X[train_idx, ]; X_test <- X[-train_idx, ]
y_train <- y[train_idx]; y_test <- y[-train_idx]
p <- length(features)
mtry <- floor(sqrt(p))
ctrl <- trainControl(
method = "repeatedcv",
number = 5,
repeats = 20,
sampling = "up",
classProbs = TRUE,
summaryFunction = twoClassSummary
)
model <- train(
x = X_train,
y = y_train,
method = "rf",
trControl = ctrl,
preProcess = "medianImpute",
tuneGrid = expand.grid(mtry = mtry),
ntree = 300,
metric = "ROC"
)
prob_test <- predict(model, X_test, type = "prob")[, "IR"]
pred_test <- predict(model, X_test)
cm_test <- confusionMatrix(pred_test, y_test)
roc_test <- roc(y_test, prob_test, levels = c("Non.IR", "IR"))
auc_test <- as.numeric(auc(roc_test))
pr_test <- pr.curve(scores.class0 = prob_test,
weights.class0 = ifelse(y_test == "IR", 1, 0),
curve = FALSE)
internal_holdout_metrics <- data.frame(
Accuracy = cm_test$overall["Accuracy"],
Sensitivity = cm_test$byClass["Sensitivity"],
Specificity = cm_test$byClass["Specificity"],
Balanced_Accuracy = (cm_test$byClass["Sensitivity"] +
cm_test$byClass["Specificity"]) / 2,
Precision = cm_test$byClass["Pos Pred Value"],
F1 = 2 * cm_test$byClass["Pos Pred Value"] * cm_test$byClass["Sensitivity"] /
(cm_test$byClass["Pos Pred Value"] + cm_test$byClass["Sensitivity"]),
AUC = auc_test,
PRAUC = pr_test$auc.integral,
AUPRG = compute_auprg(as.integer(y_test == "IR"), prob_test)
)
pfun <- function(object, newdata)
predict(object, newdata, type = "prob")[, "IR"]
shap_values <- fastshap::explain(
object = model,
X = as.data.frame(X_test),
pred_wrapper = pfun,
nsim = 50,
adjust = TRUE
)
list(
model = model,
features = features,
internal_holdout_metrics = internal_holdout_metrics,
shap_validation = shap_values,
X_test_df = as.data.frame(X_test)
)
}
5 · Fast multi-seed final model function
evaluate_multiseed_fast <- function(features, df_train, df_ext,
N_SEEDS = 20, ntree = 300) {
p <- length(features)
mtry <- floor(sqrt(p))
X_ext_raw <- df_ext %>%
dplyr::select(all_of(features)) %>%
mutate(across(where(is.numeric), as.double))
y_ext <- factor(make.names(trimws(df_ext$IR_label)))
all_probs <- matrix(NA_real_, nrow = nrow(df_ext), ncol = N_SEEDS)
all_aucs <- numeric(N_SEEDS)
all_auprcs <- numeric(N_SEEDS)
for (s in seq_len(N_SEEDS)) {
set.seed(s)
X_tr <- df_train %>%
dplyr::select(all_of(features)) %>%
mutate(across(where(is.numeric), as.double))
y_tr <- factor(make.names(trimws(df_train$IR_label)))
imp <- caret::preProcess(X_tr, method = "medianImpute")
X_tr_imp <- predict(imp, X_tr)
X_ex_imp <- predict(imp, X_ext_raw)
up_data <- caret::upSample(x = X_tr_imp, y = y_tr, yname = ".outcome")
y_up <- up_data$.outcome
X_up <- up_data[, names(X_tr_imp), drop = FALSE]
rf <- randomForest::randomForest(
x = X_up, y = y_up,
ntree = ntree, mtry = mtry, nodesize = 1
)
prob <- predict(rf, X_ex_imp, type = "prob")[, "IR"]
all_probs[, s] <- prob
roc_s <- roc(y_ext, prob, levels = c("Non.IR","IR"), quiet = TRUE)
all_aucs[s] <- as.numeric(auc(roc_s))
all_auprcs[s] <- pr.curve(scores.class0 = prob,
weights.class0 = ifelse(y_ext == "IR", 1, 0),
curve = FALSE)$auc.integral
}
all_auprgs <- sapply(seq_len(N_SEEDS), function(s) {
compute_auprg(as.integer(y_ext == "IR"), all_probs[, s])
})
ensemble_prob <- rowMeans(all_probs)
auc_mean <- mean(all_aucs); auc_sd <- sd(all_aucs)
auprc_mean <- mean(all_auprcs); auprc_sd <- sd(all_auprcs)
auprg_mean <- mean(all_auprgs); auprg_sd <- sd(all_auprgs)
roc_ens <- roc(y_ext, ensemble_prob, levels = c("Non.IR","IR"), quiet = TRUE)
ens_auc <- as.numeric(auc(roc_ens))
dl <- tryCatch(
as.numeric(ci.auc(roc_ens, method = "delong", conf.level = 0.95)),
error = function(e) c(NA, NA, NA)
)
ens_auprc <- pr.curve(scores.class0 = ensemble_prob,
weights.class0 = ifelse(y_ext == "IR", 1, 0),
curve = FALSE)$auc.integral
n_ext <- length(y_ext)
set.seed(42)
prauc_boot <- replicate(2000, {
idx <- sample(n_ext, n_ext, replace = TRUE)
yb <- y_ext[idx]; pb <- ensemble_prob[idx]
if (length(unique(yb)) < 2) return(NA_real_)
tryCatch(
pr.curve(scores.class0 = pb,
weights.class0 = ifelse(yb == "IR", 1, 0),
curve = FALSE)$auc.integral,
error = function(e) NA_real_)
})
auprc_lo <- quantile(prauc_boot, 0.025, na.rm = TRUE)
auprc_hi <- quantile(prauc_boot, 0.975, na.rm = TRUE)
ens_auprg <- compute_auprg(as.integer(y_ext == "IR"), ensemble_prob)
set.seed(43)
auprg_boot <- replicate(2000, {
idx <- sample(n_ext, n_ext, replace = TRUE)
yb <- y_ext[idx]; pb <- ensemble_prob[idx]
if (length(unique(yb)) < 2) return(NA_real_)
tryCatch(compute_auprg(as.integer(yb == "IR"), pb), error = function(e) NA_real_)
})
auprg_lo <- quantile(auprg_boot, 0.025, na.rm = TRUE)
auprg_hi <- quantile(auprg_boot, 0.975, na.rm = TRUE)
ens_pred <- factor(ifelse(ensemble_prob >= 0.5, "IR", "Non.IR"),
levels = c("Non.IR", "IR"))
cm_ens <- confusionMatrix(ens_pred, y_ext)
list(
auc_mean = round(auc_mean, 3), auc_sd = round(auc_sd, 3),
auc_lo_seed = round(min(all_aucs), 3), auc_hi_seed = round(max(all_aucs), 3),
auprc_mean = round(auprc_mean, 3), auprc_sd = round(auprc_sd, 3),
auprg_mean = round(auprg_mean, 3), auprg_sd = round(auprg_sd, 3),
all_aucs = all_aucs, all_auprcs = all_auprcs, all_auprgs = all_auprgs,
ens_auc = round(ens_auc, 3), ens_auc_lo = round(dl[1], 3), ens_auc_hi = round(dl[3], 3),
ens_auprc = round(ens_auprc, 3), ens_auprc_lo = round(auprc_lo, 3), ens_auprc_hi = round(auprc_hi, 3),
ens_auprg = round(ens_auprg, 3), ens_auprg_lo = round(auprg_lo, 3), ens_auprg_hi = round(auprg_hi, 3),
ensemble_prob = ensemble_prob, y_ext = y_ext, cm = cm_ens, N_SEEDS = N_SEEDS
)
}
6 · Bootstrap loop (100 iterations — internal metrics)
all_results <- list()
all_shap_val <- list()
set.seed(123)
n_iter <- 100
for (i in seq_len(n_iter)) {
y <- factor(trimws(df_test$IR_label))
levels(y) <- make.names(levels(y))
train_idx <- createDataPartition(y, p = 0.7, list = FALSE)
CGM_model <- train_model_with_shap(feature_sets$CGM_model, df_test, train_idx)
SW_model <- train_model_with_shap(feature_sets$Smartwatch_only_model, df_test, train_idx)
Base_model <- train_model_with_shap(feature_sets$Baseline_model, df_test, train_idx)
trained_models <- list(CGM_model, SW_model, Base_model)
iter_results <- dplyr::bind_rows(
standardize(CGM_model$internal_holdout_metrics, "Model CGM") |> mutate(Dataset = "Internal"),
standardize(SW_model$internal_holdout_metrics, "Model smartwatch") |> mutate(Dataset = "Internal"),
standardize(Base_model$internal_holdout_metrics, "Model baseline") |> mutate(Dataset = "Internal")
) %>% mutate(iteration = i)
all_results[[i]] <- iter_results
for (m in seq_along(trained_models)) {
mdl <- trained_models[[m]]
all_shap_val[[length(all_shap_val) + 1]] <- list(
iteration = i,
model_name = model_names[m],
shap = mdl$shap_validation,
X_df = mdl$X_test_df
)
}
if (i %% 10 == 0) cat(sprintf(" Iteration %d / %d\n", i, n_iter))
}
## Iteration 10 / 100
## Iteration 20 / 100
## Iteration 30 / 100
## Iteration 40 / 100
## Iteration 50 / 100
## Iteration 60 / 100
## Iteration 70 / 100
## Iteration 80 / 100
## Iteration 90 / 100
## Iteration 100 / 100
bootstrap_results <- dplyr::bind_rows(all_results)
final_summary <- bootstrap_results %>%
group_by(Model, Dataset) %>%
summarise(
across(
c(Accuracy, Sensitivity, Specificity, Balanced_Accuracy,
Precision, F1, AUC, PRAUC, AUPRG),
list(mean = ~mean(.x, na.rm = TRUE),
lower = ~quantile(.x, 0.025, na.rm = TRUE),
upper = ~quantile(.x, 0.975, na.rm = TRUE)),
.names = "{.col}_{.fn}"
),
.groups = "drop"
)
cat("\nInternal bootstrap summary (mean over 100 iterations, 2.5-97.5% CI):\n")
##
## Internal bootstrap summary (mean over 100 iterations, 2.5-97.5% CI):
print(
final_summary %>%
dplyr::select(Model,
AUC_mean, AUC_lower, AUC_upper,
PRAUC_mean, PRAUC_lower, PRAUC_upper,
AUPRG_mean, AUPRG_lower, AUPRG_upper,
Sensitivity_mean, Specificity_mean, Balanced_Accuracy_mean) %>%
dplyr::rename(
AUC_ROC_mean = AUC_mean, AUC_ROC_lo = AUC_lower, AUC_ROC_hi = AUC_upper,
AUPRC_mean = PRAUC_mean, AUPRC_lo = PRAUC_lower, AUPRC_hi = PRAUC_upper,
AUPRG_lo = AUPRG_lower, AUPRG_hi = AUPRG_upper,
Sens_mean = Sensitivity_mean,
Spec_mean = Specificity_mean,
BalAcc_mean = Balanced_Accuracy_mean
) %>%
mutate(across(where(is.numeric), ~round(.x, 3)))
)
## # A tibble: 3 × 13
## Model AUC_ROC_mean AUC_ROC_lo AUC_ROC_hi AUPRC_mean AUPRC_lo AUPRC_hi
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Model CGM 0.833 0.684 0.953 0.725 0.488 0.915
## 2 Model baseline 0.817 0.655 0.933 0.644 0.419 0.853
## 3 Model smartwa… 0.812 0.663 0.933 0.637 0.43 0.875
## # ℹ 6 more variables: AUPRG_mean <dbl>, AUPRG_lo <dbl>, AUPRG_hi <dbl>,
## # Sens_mean <dbl>, Spec_mean <dbl>, BalAcc_mean <dbl>
write.csv(bootstrap_results, "bootstrap_results_v3.csv", row.names = FALSE)
write.csv(final_summary, "bootstrap_summary_v3.csv", row.names = FALSE)
7 · External validation — 20-seed ensemble
cat("Fitting 20-seed ensembles for external evaluation...\n")
## Fitting 20-seed ensembles for external evaluation...
ms_CGM <- evaluate_multiseed_fast(feature_sets$CGM_model, df_test, df_external)
ms_SW <- evaluate_multiseed_fast(feature_sets$Smartwatch_only_model, df_test, df_external)
ms_Base <- evaluate_multiseed_fast(feature_sets$Baseline_model, df_test, df_external)
ms_list <- list(
"CGM model" = ms_CGM,
"Smartwatch model" = ms_SW,
"Baseline model" = ms_Base
)
ms_summary <- dplyr::bind_rows(lapply(names(ms_list), function(nm) {
ms <- ms_list[[nm]]
data.frame(
Model = nm,
AUC_mean = ms$auc_mean, AUC_sd = ms$auc_sd,
AUC_lo_seed = ms$auc_lo_seed, AUC_hi_seed = ms$auc_hi_seed,
Ens_AUC = ms$ens_auc, Ens_AUC_lo = ms$ens_auc_lo, Ens_AUC_hi = ms$ens_auc_hi,
AUPRC_mean = ms$auprc_mean, AUPRC_sd = ms$auprc_sd,
Ens_AUPRC = ms$ens_auprc, Ens_AUPRC_lo = ms$ens_auprc_lo, Ens_AUPRC_hi = ms$ens_auprc_hi,
AUPRG_mean = ms$auprg_mean, AUPRG_sd = ms$auprg_sd,
Ens_AUPRG = ms$ens_auprg, Ens_AUPRG_lo = ms$ens_auprg_lo, Ens_AUPRG_hi = ms$ens_auprg_hi,
stringsAsFactors = FALSE
)
}))
cat("\nExternal — AUC-ROC and AUPRC (20-seed ensemble):\n")
##
## External — AUC-ROC and AUPRC (20-seed ensemble):
print(ms_summary %>% mutate(across(where(is.numeric), ~round(.x, 3))))
## Model AUC_mean AUC_sd AUC_lo_seed AUC_hi_seed Ens_AUC
## 2.5%...1 CGM model 0.878 0.012 0.858 0.900 0.890
## 2.5%...2 Smartwatch model 0.794 0.011 0.772 0.811 0.797
## 2.5%...3 Baseline model 0.753 0.009 0.737 0.776 0.763
## Ens_AUC_lo Ens_AUC_hi AUPRC_mean AUPRC_sd Ens_AUPRC Ens_AUPRC_lo
## 2.5%...1 0.789 0.990 0.802 0.032 0.825 0.633
## 2.5%...2 0.674 0.920 0.698 0.019 0.708 0.486
## 2.5%...3 0.625 0.901 0.568 0.035 0.582 0.373
## Ens_AUPRC_hi AUPRG_mean AUPRG_sd Ens_AUPRG Ens_AUPRG_lo Ens_AUPRG_hi
## 2.5%...1 0.954 0.866 0.033 0.883 0.706 0.981
## 2.5%...2 0.858 0.725 0.025 0.740 0.340 0.934
## 2.5%...3 0.826 0.633 0.026 0.679 0.313 0.899
# Ensemble probabilities and labels — used downstream in sections 8, 9, 14
ext_probs <- list(
"CGM model" = ms_CGM$ensemble_prob,
"Smartwatch model" = ms_SW$ensemble_prob,
"Baseline model" = ms_Base$ensemble_prob
)
y_ext_fac <- ms_CGM$y_ext
y_bin <- as.integer(y_ext_fac == "IR")
ir_prev <- mean(y_bin)
7b · Seed stability plot
seed_df <- dplyr::bind_rows(lapply(names(ms_list), function(nm) {
data.frame(
Model = nm,
AUC = ms_list[[nm]]$all_aucs,
AUPRC = ms_list[[nm]]$all_auprcs,
AUPRG = ms_list[[nm]]$all_auprgs
)
})) %>% mutate(Model = factor(Model, levels = names(ms_list)))
model_colors <- c(
"CGM model" = "#1f77b4",
"Smartwatch model" = "#2ca02c",
"Baseline model" = "#d62728"
)
summ_seed <- seed_df %>%
group_by(Model) %>%
summarise(m_auc = mean(AUC), sd_auc = sd(AUC), mx_auc = max(AUC),
m_auprc = mean(AUPRC), sd_auprc = sd(AUPRC), mx_auprc = max(AUPRC),
m_auprg = mean(AUPRG), sd_auprg = sd(AUPRG), mx_auprg = max(AUPRG),
.groups = "drop")
make_seed_panel <- function(df, summ, y_var, y_lab, y_ref = NULL, title = "") {
p <- ggplot(df, aes(x = Model, y = .data[[y_var]], colour = Model)) +
geom_jitter(width = 0.12, alpha = 0.65, size = 2, show.legend = FALSE) +
geom_crossbar(data = summ,
aes(x = Model, y = .data[[paste0("m_", tolower(y_var))]],
ymin = .data[[paste0("m_", tolower(y_var))]] - .data[[paste0("sd_", tolower(y_var))]],
ymax = .data[[paste0("m_", tolower(y_var))]] + .data[[paste0("sd_", tolower(y_var))]]),
width = 0.35, linewidth = 0.8, fill = NA, show.legend = FALSE) +
geom_text(data = summ,
aes(x = Model, y = .data[[paste0("mx_", tolower(y_var))]] + 0.018,
label = sprintf("%.3f\n\u00b1%.3f",
.data[[paste0("m_", tolower(y_var))]],
.data[[paste0("sd_", tolower(y_var))]])),
size = 3, vjust = 0, show.legend = FALSE) +
scale_colour_manual(values = model_colors) +
scale_x_discrete(labels = c("CGM model" = "CGM",
"Smartwatch model" = "Smartwatch",
"Baseline model" = "Baseline")) +
coord_cartesian(ylim = c(0.40, 1.05)) +
labs(title = title, x = NULL, y = y_lab) +
theme_bw(base_size = 12) +
theme(plot.title = element_text(face = "bold", size = 11),
panel.grid.minor = element_blank(),
panel.grid.major.x = element_blank())
if (!is.null(y_ref))
p <- p + geom_hline(yintercept = y_ref, linetype = "dashed",
colour = "grey50", linewidth = 0.7)
p
}
p_seed_auc <- make_seed_panel(seed_df, summ_seed, "AUC", "External AUC-ROC", 0.5, "A AUC-ROC")
p_seed_auprc <- make_seed_panel(seed_df, summ_seed, "AUPRC", "External AUPRC", ir_prev,"B AUPRC") +
annotate("text", x = 0.6, y = ir_prev + 0.04,
label = sprintf("Chance (%.2f)", ir_prev), size = 3, colour = "grey50")
p_seed_auprg <- make_seed_panel(seed_df, summ_seed, "AUPRG", "External AUPRG", 0, "C AUPRG") +
annotate("text", x = 0.6, y = 0.04, label = "Chance = 0", size = 3, colour = "grey50")
p_seed_combined <- p_seed_auc | p_seed_auprc | p_seed_auprg
print(p_seed_combined)

ggsave("seed_stability_v3.pdf", p_seed_combined, width = 16, height = 5, device = cairo_pdf)
## Warning in grSoftVersion(): unable to load shared object '/Library/Frameworks/R.framework/Resources/modules//R_X11.so':
## dlopen(/Library/Frameworks/R.framework/Resources/modules//R_X11.so, 0x0006): Library not loaded: /opt/X11/lib/libSM.6.dylib
## Referenced from: <7ECC4104-EC6A-38FD-9BEA-BFE0B870925C> /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/modules/R_X11.so
## Reason: tried: '/opt/X11/lib/libSM.6.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/X11/lib/libSM.6.dylib' (no such file), '/opt/X11/lib/libSM.6.dylib' (no such file), '/Library/Frameworks/R.framework/Resources/lib/libSM.6.dylib' (no such file), '/Library/Java/JavaVirtualMachines/jdk-11.0.18+10/Contents/Home/lib/server/libSM.6.dylib' (no such file)
## Warning in cairoVersion(): unable to load shared object '/Library/Frameworks/R.framework/Resources/library/grDevices/libs//cairo.so':
## dlopen(/Library/Frameworks/R.framework/Resources/library/grDevices/libs//cairo.so, 0x0006): Library not loaded: /opt/X11/lib/libXrender.1.dylib
## Referenced from: <263B701F-2238-3478-A7D2-B7513231EF98> /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/library/grDevices/libs/cairo.so
## Reason: tried: '/opt/X11/lib/libXrender.1.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/X11/lib/libXrender.1.dylib' (no such file), '/opt/X11/lib/libXrender.1.dylib' (no such file), '/Library/Frameworks/R.framework/Resources/lib/libXrender.1.dylib' (no such file), '/Library/Java/JavaVirtualMachines/jdk-11.0.18+10/Contents/Home/lib/server/libXrender.1.dylib' (no such file)
## Warning in device(filename = "seed_stability_v3.pdf", width = 16, height = 5, :
## failed to load cairo DLL
8 · External validation — full metric table (ensemble)
# ── BUG FIX: n <- length(y_true) added at top of function ──────────────────
# Previously n was undefined inside this function, causing sample(n, n, ...) to
# fail with "invalid 'size' argument" when n resolved to NULL from parent scope.
# ────────────────────────────────────────────────────────────────────────────
compute_ext_full <- function(y_true, prob, model_name, n_boot = 2000, seed = 42) {
n <- length(y_true) # FIX: define n locally — was missing in original
pred <- factor(ifelse(prob >= 0.5, "IR", "Non.IR"), levels = c("Non.IR","IR"))
cm <- confusionMatrix(pred, y_true)
sens <- as.numeric(cm$byClass["Sensitivity"])
spec <- as.numeric(cm$byClass["Specificity"])
prec <- as.numeric(cm$byClass["Pos Pred Value"])
f1 <- ifelse(is.na(prec)|prec+sens==0, NA_real_, 2*prec*sens/(prec+sens))
acc <- as.numeric(cm$overall["Accuracy"])
auprc <- pr.curve(scores.class0 = prob,
weights.class0 = ifelse(y_true == "IR", 1, 0),
curve = FALSE)$auc.integral
auprg <- compute_auprg(as.integer(y_true == "IR"), prob)
set.seed(seed)
mat <- matrix(NA_real_, nrow = n_boot, ncol = 9,
dimnames = list(NULL,
c("sens","spec","prec","f1","acc","balacc","prauc","auprg","auc_roc")))
for (i in seq_len(n_boot)) {
idx <- sample(n, n, replace = TRUE)
yb <- y_true[idx]; pb_cls <- pred[idx]; pb_pr <- prob[idx]
if (length(unique(yb)) < 2) next
cm_b <- confusionMatrix(pb_cls, yb)
s_b <- as.numeric(cm_b$byClass["Sensitivity"])
sp_b <- as.numeric(cm_b$byClass["Specificity"])
pr_b <- as.numeric(cm_b$byClass["Pos Pred Value"])
f1_b <- ifelse(is.na(pr_b)|pr_b+s_b==0, NA_real_, 2*pr_b*s_b/(pr_b+s_b))
prauc_b <- tryCatch(
pr.curve(scores.class0=pb_pr, weights.class0=ifelse(yb=="IR",1,0),
curve=FALSE)$auc.integral, error=function(e) NA_real_)
auprg_b <- tryCatch(
compute_auprg(as.integer(yb == "IR"), pb_pr), error=function(e) NA_real_)
auc_b <- tryCatch(
as.numeric(auc(roc(yb, pb_pr, levels=c("Non.IR","IR"), quiet=TRUE))),
error=function(e) NA_real_)
mat[i,] <- c(s_b, sp_b, pr_b, f1_b, as.numeric(cm_b$overall["Accuracy"]),
(s_b+sp_b)/2, prauc_b, auprg_b, auc_b)
}
ci <- apply(mat, 2, quantile, probs=c(0.025,0.975), na.rm=TRUE)
data.frame(
Model = model_name,
Sensitivity = round(sens, 3),
Sensitivity_lo = round(ci["2.5%", "sens"], 3),
Sensitivity_hi = round(ci["97.5%", "sens"], 3),
Specificity = round(spec, 3),
Specificity_lo = round(ci["2.5%", "spec"], 3),
Specificity_hi = round(ci["97.5%", "spec"], 3),
Balanced_Accuracy = round((sens+spec)/2, 3),
Balanced_Accuracy_lo = round(ci["2.5%", "balacc"], 3),
Balanced_Accuracy_hi = round(ci["97.5%", "balacc"], 3),
Precision = round(prec, 3),
Precision_lo = round(ci["2.5%", "prec"], 3),
Precision_hi = round(ci["97.5%", "prec"], 3),
F1 = round(f1, 3),
F1_lo = round(ci["2.5%", "f1"], 3),
F1_hi = round(ci["97.5%", "f1"], 3),
Accuracy = round(acc, 3),
Accuracy_lo = round(ci["2.5%", "acc"], 3),
Accuracy_hi = round(ci["97.5%", "acc"], 3),
AUPRC = round(auprc, 3),
AUPRC_lo = round(ci["2.5%", "prauc"], 3),
AUPRC_hi = round(ci["97.5%", "prauc"], 3),
AUPRG = round(auprg, 3),
AUPRG_lo = round(ci["2.5%", "auprg"], 3),
AUPRG_hi = round(ci["97.5%", "auprg"], 3),
stringsAsFactors = FALSE
)
}
ext_full <- dplyr::bind_rows(lapply(names(ext_probs), function(nm)
compute_ext_full(y_ext_fac, ext_probs[[nm]], nm)
))
# Merge in AUC-ROC columns from ms_summary
ext_full <- ext_full %>%
left_join(ms_summary %>%
dplyr::select(Model, AUC_mean, AUC_sd, Ens_AUC, Ens_AUC_lo, Ens_AUC_hi),
by = "Model")
cat("\nFull external metrics (ensemble of 20 seeds):\n")
##
## Full external metrics (ensemble of 20 seeds):
print(ext_full %>% dplyr::select(
Model, AUC_mean, AUC_sd, Ens_AUC, Ens_AUC_lo, Ens_AUC_hi,
AUPRC, AUPRC_lo, AUPRC_hi,
Sensitivity, Sensitivity_lo, Sensitivity_hi,
Specificity, Specificity_lo, Specificity_hi,
Balanced_Accuracy, F1, Accuracy
))
## Model AUC_mean AUC_sd Ens_AUC Ens_AUC_lo Ens_AUC_hi AUPRC AUPRC_lo
## 1 CGM model 0.878 0.012 0.890 0.789 0.990 0.825 0.633
## 2 Smartwatch model 0.794 0.011 0.797 0.674 0.920 0.708 0.486
## 3 Baseline model 0.753 0.009 0.763 0.625 0.901 0.582 0.373
## AUPRC_hi Sensitivity Sensitivity_lo Sensitivity_hi Specificity Specificity_lo
## 1 0.954 0.842 0.667 1.000 0.810 0.690
## 2 0.858 0.737 0.526 0.923 0.714 0.575
## 3 0.826 0.737 0.526 0.933 0.690 0.550
## Specificity_hi Balanced_Accuracy F1 Accuracy
## 1 0.923 0.826 0.744 0.820
## 2 0.844 0.726 0.622 0.721
## 3 0.821 0.714 0.609 0.705
write.csv(ext_full, "external_full_v3.csv", row.names = FALSE)
combined_metrics <- dplyr::bind_rows(
final_summary %>%
dplyr::select(Model, AUC_mean, AUC_lower, AUC_upper,
PRAUC_mean, PRAUC_lower, PRAUC_upper,
AUPRG_mean, AUPRG_lower, AUPRG_upper) %>%
mutate(Dataset = "Internal (100-iter bootstrap)") %>%
dplyr::rename(AUC_ROC = AUC_mean, AUC_lo = AUC_lower, AUC_hi = AUC_upper,
AUPRC = PRAUC_mean, AUPRC_lo = PRAUC_lower, AUPRC_hi = PRAUC_upper,
AUPRG_lo = AUPRG_lower, AUPRG_hi = AUPRG_upper),
ext_full %>%
dplyr::select(Model, AUC_mean, Ens_AUC_lo, Ens_AUC_hi,
AUPRC, AUPRC_lo, AUPRC_hi, AUPRG, AUPRG_lo, AUPRG_hi) %>%
mutate(Dataset = "External (20-seed ensemble)") %>%
dplyr::rename(AUC_ROC = AUC_mean, AUC_lo = Ens_AUC_lo, AUC_hi = Ens_AUC_hi)
) %>%
dplyr::select(Model, Dataset, AUC_ROC, AUC_lo, AUC_hi,
AUPRC, AUPRC_lo, AUPRC_hi, AUPRG, AUPRG_lo, AUPRG_hi) %>%
mutate(across(where(is.numeric), ~round(.x, 3)))
cat("\nCombined AUC-ROC + AUPRC + AUPRG (internal & external):\n")
##
## Combined AUC-ROC + AUPRC + AUPRG (internal & external):
print(combined_metrics)
## # A tibble: 6 × 11
## Model Dataset AUC_ROC AUC_lo AUC_hi AUPRC AUPRC_lo AUPRC_hi AUPRG AUPRG_lo
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Model C… Intern… 0.833 0.684 0.953 0.725 0.488 0.915 NA 0.363
## 2 Model b… Intern… 0.817 0.655 0.933 0.644 0.419 0.853 NA 0.359
## 3 Model s… Intern… 0.812 0.663 0.933 0.637 0.43 0.875 NA 0.356
## 4 CGM mod… Extern… 0.878 0.789 0.99 0.825 0.633 0.954 0.883 0.715
## 5 Smartwa… Extern… 0.794 0.674 0.92 0.708 0.486 0.858 0.74 0.364
## 6 Baselin… Extern… 0.753 0.625 0.901 0.582 0.373 0.826 0.679 0.321
## # ℹ 1 more variable: AUPRG_hi <dbl>
write.csv(combined_metrics, "combined_metrics_v3.csv", row.names = FALSE)
8b · AUC-ROC + AUPRC + AUPRG comparison plot
model_order <- c("Model CGM","Model smartwatch","Model baseline")
plot_df <- combined_metrics %>%
mutate(Model = factor(Model, levels = model_order)) %>%
tidyr::pivot_longer(cols = c(AUC_ROC, AUPRC),
names_to = "Metric", values_to = "Value") %>%
mutate(lo = ifelse(Metric == "AUC_ROC", AUC_lo, AUPRC_lo),
hi = ifelse(Metric == "AUC_ROC", AUC_hi, AUPRC_hi),
Metric = factor(Metric, levels = c("AUC_ROC","AUPRC"),
labels = c("AUC-ROC","AUPRC")),
Dataset = factor(Dataset, levels = c("Internal (100-iter bootstrap)",
"External (20-seed ensemble)")))
ref_lines <- data.frame(
Metric = c("AUC-ROC","AUPRC"),
xint = c(0.5, ir_prev)
)
p_metrics <- ggplot(plot_df,
aes(x = Value, y = Model, colour = Model,
shape = Dataset, group = interaction(Model, Dataset))) +
geom_errorbarh(aes(xmin = lo, xmax = hi), height = 0.2, linewidth = 0.7,
position = position_dodge(width = 0.55)) +
geom_point(size = 3.5, position = position_dodge(width = 0.55)) +
geom_vline(data = ref_lines, aes(xintercept = xint),
linetype = "dashed", colour = "grey60", linewidth = 0.7) +
scale_colour_manual(values = model_colors, guide = "none") +
scale_shape_manual(values = c("Internal (100-iter bootstrap)" = 16,
"External (20-seed ensemble)" = 17),
name = NULL) +
scale_y_discrete(labels = c("Model CGM" = "CGM model",
"Model smartwatch" = "Smartwatch model",
"Model baseline" = "Baseline (anthropometrics)")) +
facet_wrap(~Metric, scales = "free_x") +
scale_x_continuous(limits = c(0.35, 1.02), breaks = seq(0.4, 1.0, 0.1)) +
labs(title = "AUC-ROC and AUPRC — internal bootstrap vs external 20-seed ensemble",
x = "Value (95% CI)", y = NULL) +
theme_bw(base_size = 12) +
theme(plot.title = element_text(face = "bold", size = 11),
strip.text = element_text(face = "bold", size = 11),
strip.background = element_rect(fill = "grey93"),
panel.grid.minor = element_blank(),
legend.position = "bottom")
## Warning: `geom_errorbarh()` was deprecated in ggplot2 4.0.0.
## ℹ Please use the `orientation` argument of `geom_errorbar()` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
print(p_metrics)
## `height` was translated to `width`.
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_vline()`).

ggsave("auc_auprc_plot_v3.pdf", p_metrics, width = 11, height = 5, device = cairo_pdf)
## Warning in device(filename = "auc_auprc_plot_v3.pdf", width = 11, height = 5, :
## failed to load cairo DLL
## `height` was translated to `width`.
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_vline()`).
cat(sprintf("\nChance AUPRC (= prevalence): internal=%.2f external=%.2f\n",
mean(df_test$IR_label == "IR"), ir_prev))
##
## Chance AUPRC (= prevalence): internal=0.33 external=0.31
9 · Decision Curve Analysis
n_ext <- length(y_bin)
thresholds <- seq(0.05, 0.50, by = 0.01)
nb_fn <- function(y, prob, t) {
pred <- as.integer(prob >= t)
tp <- sum(pred == 1 & y == 1)
fp <- sum(pred == 1 & y == 0)
tp / n_ext - fp / n_ext * t / (1 - t)
}
dca_df <- do.call(rbind, lapply(thresholds, function(t) {
data.frame(
threshold = t,
model = c("CGM","SW","Baseline","All","None"),
net_benefit = c(
nb_fn(y_bin, ext_probs[["CGM model"]], t),
nb_fn(y_bin, ext_probs[["Smartwatch model"]], t),
nb_fn(y_bin, ext_probs[["Baseline model"]], t),
ir_prev - (1 - ir_prev) * t / (1 - t),
0
),
stringsAsFactors = FALSE
)
}))
p_dca <- ggplot(dca_df, aes(x = threshold, y = net_benefit,
colour = model, linetype = model)) +
geom_line(linewidth = 0.9) +
scale_colour_manual(
values = c("CGM"="#1f77b4","SW"="#2ca02c","Baseline"="#d62728",
"All"="#888888","None"="#bbbbbb"),
labels = c("CGM"="CGM model","SW"="Smartwatch model",
"Baseline"="Baseline model","All"="Treat all","None"="Treat none")) +
scale_linetype_manual(
values = c("CGM"="solid","SW"="dashed","Baseline"="dotdash",
"All"="dotted","None"="dotted"),
labels = c("CGM"="CGM model","SW"="Smartwatch model",
"Baseline"="Baseline model","All"="Treat all","None"="Treat none")) +
geom_hline(yintercept = 0, colour = "black", linewidth = 0.5) +
geom_vline(xintercept = ir_prev, colour = "black",
linetype = "dashed", linewidth = 0.6, alpha = 0.45) +
annotate("text", x = ir_prev + 0.01, y = 0.38,
label = sprintf("Prevalence\n(%.2f)", ir_prev),
size = 3, colour = "grey40", hjust = 0) +
coord_cartesian(xlim = c(0.05, 0.50), ylim = c(-0.06, 0.42)) +
labs(title = "Decision curve analysis — external cohort (n=61)",
x = "Threshold probability", y = "Net benefit",
colour = NULL, linetype = NULL) +
theme_bw(base_size = 12) +
theme(legend.position = "right",
plot.title = element_text(face = "bold", size = 12),
panel.grid.minor = element_blank())
print(p_dca)

ggsave("dca_v3.pdf", p_dca, width = 8, height = 5, device = cairo_pdf)
## Warning in device(filename = "dca_v3.pdf", width = 8, height = 5, bg =
## "white"): failed to load cairo DLL
cat(sprintf("\nNet benefit at prevalence threshold (%.2f):\n", ir_prev))
##
## Net benefit at prevalence threshold (0.31):
for (m in c("CGM","SW","Baseline","All","None")) {
sub <- dca_df[dca_df$model == m, ]
nb <- sub$net_benefit[which.min(abs(sub$threshold - ir_prev))]
cat(sprintf(" %-12s NB = %.4f\n", m, nb))
}
## CGM NB = 0.1314
## SW NB = 0.1224
## Baseline NB = 0.1003
## All NB = 0.0021
## None NB = 0.0000
write.csv(dca_df, "dca_v3.csv", row.names = FALSE)
10 · Seed stability summary
seed_df %>%
group_by(Model) %>%
summarise(
AUC_mean = round(mean(AUC), 3), AUC_sd = round(sd(AUC), 3),
AUC_min = round(min(AUC), 3), AUC_max = round(max(AUC), 3),
AUPRC_mean = round(mean(AUPRC), 3), AUPRC_sd = round(sd(AUPRC), 3),
.groups = "drop") %>%
print()
## # A tibble: 3 × 7
## Model AUC_mean AUC_sd AUC_min AUC_max AUPRC_mean AUPRC_sd
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 CGM model 0.878 0.012 0.858 0.9 0.802 0.032
## 2 Smartwatch model 0.794 0.011 0.772 0.811 0.698 0.019
## 3 Baseline model 0.753 0.009 0.737 0.776 0.568 0.035
11 · SHAP feature name map
feature_name_map <- c(
"bmi" = "BMI",
"mean_fasting_glucose" = "Mean Fasting Glucose",
"sd_fasting_glucose" = "SD Fasting Glucose",
"excursions_per_day" = "Glucose Excursions/Day",
"rec_time_50pct_median" = "Recovery Time (50th pct)",
"overall_max_glucose" = "Overall Max Glucose",
"age" = "Age",
"whr" = "WHR",
"stress_overall_daily_mean_stress" = "Mean Stress (HRV-based)",
"sd_hr_day_night_diff" = "HR Day-Night Variability",
"mean_midpoint_clock_h" = "Sleep Midpoint",
"sri" = "Sleep Regularity Index",
"social_jetlag_h" = "Social Jetlag"
)
rename_features <- function(x)
ifelse(x %in% names(feature_name_map), feature_name_map[x], x)
12 · SHAP helper functions
aggregate_shap <- function(shap_list, model_filter) {
entries <- Filter(function(x) x$model_name == model_filter, shap_list)
mat <- do.call(rbind, lapply(entries, function(e)
colMeans(abs(as.data.frame(e$shap)))))
data.frame(feature = colnames(mat),
mean_shap = colMeans(mat),
sd_shap = apply(mat, 2, sd),
row.names = NULL) %>%
arrange(desc(mean_shap)) %>%
mutate(pct_share = mean_shap / sum(mean_shap) * 100,
feature = rename_features(feature))
}
collect_shap_long <- function(shap_list, model_filter) {
entries <- Filter(function(x) x$model_name == model_filter, shap_list)
bind_rows(lapply(entries, function(e) {
sd <- as.data.frame(e$shap) %>%
mutate(obs = row_number(), iteration = e$iteration) %>%
pivot_longer(-c(obs,iteration), names_to="feature", values_to="shap_value") %>%
mutate(feature = rename_features(feature))
fd <- as.data.frame(e$X_df) %>%
mutate(obs = row_number()) %>%
pivot_longer(-obs, names_to="feature", values_to="feature_value") %>%
mutate(feature = rename_features(feature))
left_join(sd, fd, by = c("obs","feature"))
}))
}
rank_stability_df <- function(shap_list, model_filter) {
entries <- Filter(function(x) x$model_name == model_filter, shap_list)
mat <- do.call(rbind, lapply(entries, function(e)
colMeans(abs(as.data.frame(e$shap)))))
rank_mat <- t(apply(mat, 1, rank))
data.frame(feature = colnames(rank_mat),
mean_rank = colMeans(rank_mat),
iqr_rank = apply(rank_mat, 2, IQR),
row.names = NULL) %>%
arrange(mean_rank) %>%
mutate(feature = rename_features(feature))
}
panel_beeswarm <- function(shap_list, model_filter,
shared_x_range = NULL, is_external = FALSE) {
agg <- aggregate_shap(shap_list, model_filter)
long <- collect_shap_long(shap_list, model_filter)
lmap <- agg %>%
mutate(label = paste0(feature," (",round(pct_share,1),"%)")) %>%
dplyr::select(feature, label)
lev <- rev(agg$feature)
long <- long %>%
left_join(lmap, by = "feature") %>%
group_by(feature) %>%
mutate(feat_scaled = (feature_value - min(feature_value, na.rm=TRUE)) /
(max(feature_value, na.rm=TRUE) - min(feature_value, na.rm=TRUE) + 1e-9)) %>%
ungroup() %>%
mutate(label = factor(label, levels = lmap$label[match(lev, lmap$feature)]))
p <- ggplot(long, aes(x=shap_value, y=label, colour=feat_scaled)) +
geom_jitter(height=0.18, alpha=ifelse(is_external,0.9,0.5), size=1.5, stroke=0) +
scale_colour_gradient(low="#3B82C4", high="#E84545", name="Feature\nvalue",
breaks=c(0,1), labels=c("Low","High")) +
geom_vline(xintercept=0, linetype="dashed", colour="grey50", linewidth=0.4) +
labs(title=model_filter, x="SHAP value", y=NULL) +
theme_bw(base_size=11) +
theme(plot.title=element_text(face="bold",hjust=0.5,size=11),
legend.position="none", panel.grid.minor=element_blank(),
axis.text.y=element_text(size=9))
if (!is.null(shared_x_range)) p <- p + coord_cartesian(xlim=shared_x_range)
p
}
panel_bar <- function(shap_list, model_filter, shared_x_max = NULL) {
agg <- aggregate_shap(shap_list, model_filter) %>%
mutate(label = paste0(feature," (",round(pct_share,1),"%)"),
label = factor(label, levels=rev(paste0(feature," (",round(pct_share,1),"%)"))),
ymin_eb = pmax(mean_shap-sd_shap,0), ymax_eb = mean_shap+sd_shap)
p <- ggplot(agg, aes(x=mean_shap, y=label)) +
geom_col(fill="#4C72B0", alpha=0.85) +
geom_errorbarh(aes(xmin=ymin_eb, xmax=ymax_eb), height=0.3, colour="grey30") +
labs(title=model_filter, x="Mean |SHAP| value", y=NULL) +
theme_bw(base_size=11) +
theme(plot.title=element_text(face="bold",hjust=0.5,size=11),
panel.grid.minor=element_blank(), axis.text.y=element_text(size=9))
if (!is.null(shared_x_max)) p <- p + coord_cartesian(xlim=c(0,shared_x_max))
p
}
panel_rank <- function(shap_list, model_filter, shared_x_range = NULL) {
rs <- rank_stability_df(shap_list, model_filter) %>%
mutate(label = factor(feature, levels=rev(feature)))
p <- ggplot(rs, aes(x=mean_rank, y=label)) +
geom_point(size=3, colour="#2ca02c") +
geom_errorbarh(aes(xmin=mean_rank-iqr_rank/2, xmax=mean_rank+iqr_rank/2),
height=0.3, colour="grey40") +
scale_x_reverse() +
labs(title=model_filter, x="Mean rank (lower=more important)", y=NULL) +
theme_bw(base_size=11) +
theme(plot.title=element_text(face="bold",hjust=0.5,size=11),
panel.grid.minor=element_blank(), axis.text.y=element_text(size=9))
if (!is.null(shared_x_range)) p <- p + coord_cartesian(xlim=shared_x_range)
p
}
make_shap_figure <- function(shap_list, dataset_label, model_names_vec,
plot_type = c("beeswarm","bar","rank"),
panel_labels = LETTERS[seq_along(model_names_vec)]) {
plot_type <- match.arg(plot_type)
if (plot_type == "beeswarm") {
all_vals <- unlist(lapply(model_names_vec, function(mn) {
unlist(lapply(Filter(function(x) x$model_name==mn, shap_list),
function(e) as.numeric(as.matrix(e$shap))))
}))
xmax <- max(abs(all_vals), na.rm=TRUE)*1.05
srng <- c(-xmax, xmax)
}
if (plot_type == "bar") {
am <- unlist(lapply(model_names_vec, function(mn) {
a <- aggregate_shap(shap_list, mn); a$mean_shap + a$sd_shap
}))
sxmax <- max(am, na.rm=TRUE)*1.05
}
if (plot_type == "rank") {
nf <- max(unlist(lapply(model_names_vec, function(mn)
nrow(rank_stability_df(shap_list, mn)))))
srng <- c(nf+0.5, 0.5)
}
panels <- lapply(seq_along(model_names_vec), function(j) {
mn <- model_names_vec[rev(seq_along(model_names_vec))[j]]
p <- switch(plot_type,
beeswarm = panel_beeswarm(shap_list, mn, shared_x_range=srng),
bar = panel_bar(shap_list, mn, shared_x_max=sxmax),
rank = panel_rank(shap_list, mn, shared_x_range=srng))
p + labs(tag=panel_labels[j]) +
theme(plot.tag=element_text(face="bold",size=13))
})
subtitle <- switch(plot_type,
beeswarm = "Each point = one observation x one bootstrap iteration. Labels show mean |SHAP| share (%).",
bar = "Error bars = +/-1 SD across bootstrap iterations. Labels show mean |SHAP| share (%).",
rank = "Error bars = IQR of rank across bootstrap iterations. Leftward = more important.")
if (plot_type == "beeswarm") {
dummy <- ggplot(data.frame(x=0,y=0,v=c(0,1)),aes(x,y,colour=v)) +
geom_point() +
scale_colour_gradient(low="#3B82C4",high="#E84545",name="Feature\nvalue",
breaks=c(0,1),labels=c("Low","High")) +
theme(legend.position="right",legend.title=element_text(size=9),
legend.text=element_text(size=8),legend.key.height=unit(1.2,"cm"))
leg <- cowplot::get_legend(dummy)
combined <- (Reduce(`|`, panels) | patchwork::wrap_elements(leg)) +
plot_layout(widths=c(rep(1,length(panels)),0.15))
} else {
combined <- Reduce(`|`, panels)
}
combined + plot_annotation(
title = paste0(switch(plot_type, beeswarm="SHAP Summary",
bar="Mean |SHAP| Feature Importance",
rank="Feature Rank Stability"), " — ", dataset_label),
caption = subtitle,
theme = theme(plot.title =element_text(face="bold",hjust=0.5,size=13),
plot.caption=element_text(size=10,colour="grey40",hjust=0.5)))
}
14 · Single-seed models for external SHAP
fit_shap_model <- function(features, df_train, df_ext, seed = 1) {
set.seed(seed)
X_tr <- df_train %>% dplyr::select(all_of(features)) %>%
mutate(across(where(is.numeric), as.double))
y_tr <- factor(make.names(trimws(df_train$IR_label)))
imp <- caret::preProcess(X_tr, method = "medianImpute")
X_tr_imp <- predict(imp, X_tr)
up_data <- caret::upSample(x=X_tr_imp, y=y_tr, yname=".outcome")
rf <- randomForest::randomForest(
x=up_data[,names(X_tr_imp),drop=FALSE], y=up_data$.outcome,
ntree=300, mtry=floor(sqrt(length(features))), nodesize=1)
X_ext_imp <- predict(imp, df_ext %>%
dplyr::select(all_of(features)) %>%
mutate(across(where(is.numeric), as.double)))
shap <- fastshap::explain(
object=rf, X=as.data.frame(X_ext_imp),
pred_wrapper=function(object, newdata) predict(object, newdata, type="prob")[,"IR"],
nsim=50, adjust=TRUE)
list(shap=shap, X_df=as.data.frame(X_ext_imp))
}
cat("Fitting single-seed models for external SHAP (seed=1)...\n")
## Fitting single-seed models for external SHAP (seed=1)...
sr_CGM <- fit_shap_model(feature_sets$CGM_model, df_test, df_external)
sr_SW <- fit_shap_model(feature_sets$Smartwatch_only_model, df_test, df_external)
sr_Base <- fit_shap_model(feature_sets$Baseline_model, df_test, df_external)
all_shap_ext_final <- list(
list(iteration=1, model_name="Model CGM", shap=sr_CGM$shap, X_df=sr_CGM$X_df),
list(iteration=1, model_name="Model smartwatch", shap=sr_SW$shap, X_df=sr_SW$X_df),
list(iteration=1, model_name="Model baseline", shap=sr_Base$shap, X_df=sr_Base$X_df)
)
15 · SHAP figures — external test set
make_shap_ext_figure <- function(shap_list, model_names_vec,
panel_labels=LETTERS[seq_along(model_names_vec)]) {
all_vals <- unlist(lapply(model_names_vec, function(mn)
unlist(lapply(Filter(function(x) x$model_name==mn, shap_list),
function(e) as.numeric(as.matrix(e$shap))))))
xmax <- max(abs(all_vals),na.rm=TRUE)*1.05; srng <- c(-xmax,xmax)
panels <- lapply(seq_along(model_names_vec), function(j) {
panel_beeswarm(shap_list,model_names_vec[j],shared_x_range=srng,is_external=TRUE) +
labs(tag=panel_labels[j]) +
theme(plot.tag=element_text(face="bold",size=13))
})
dummy <- ggplot(data.frame(x=0,y=0,v=c(0,1)),aes(x,y,colour=v)) +
geom_point() +
scale_colour_gradient(low="#3B82C4",high="#E84545",name="Feature\nvalue",
breaks=c(0,1),labels=c("Low","High")) +
theme(legend.position="right",legend.title=element_text(size=9),
legend.text=element_text(size=8),legend.key.height=unit(1.2,"cm"))
leg <- cowplot::get_legend(dummy)
(Reduce(`|`, panels) | patchwork::wrap_elements(leg)) +
plot_layout(widths=c(rep(1,length(panels)),0.15)) +
plot_annotation(
title = "SHAP Summary — External Test Set",
caption = "Each point = one external patient. Model trained on all internal data (seed=1). Labels show |SHAP| share (%).",
theme = theme(plot.title =element_text(face="bold",hjust=0.5,size=13),
plot.caption=element_text(size=10,colour="grey40",hjust=0.5)))
}
fig_ext <- make_shap_ext_figure(all_shap_ext_final, model_names)
print(fig_ext)

ggsave("shap_beeswarm_external_v3.pdf", fig_ext, width=20, height=5.5, device=cairo_pdf)
17 · Session info
sessionInfo()
## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sequoia 15.5
##
## Matrix products: default
## BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: Europe/Zurich
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] cowplot_1.2.0 patchwork_1.3.2 tidyr_1.3.2
## [4] prg_0.5.1 PRROC_1.4 rlang_1.2.0
## [7] pROC_1.19.0.1 caret_7.0-1 lattice_0.22-7
## [10] ggplot2_4.0.1 dplyr_1.2.0 fastshap_0.1.1
## [13] randomForest_4.7-1.2
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.2.1 timeDate_4052.112 farver_2.1.2
## [4] S7_0.2.1 fastmap_1.2.0 digest_0.6.39
## [7] rpart_4.1.24 timechange_0.3.0 lifecycle_1.0.5
## [10] survival_3.8-3 magrittr_2.0.4 compiler_4.5.2
## [13] sass_0.4.10 tools_4.5.2 utf8_1.2.6
## [16] yaml_2.3.12 data.table_1.18.0 knitr_1.51
## [19] labeling_0.4.3 plyr_1.8.9 RColorBrewer_1.1-3
## [22] withr_3.0.2 purrr_1.2.0 nnet_7.3-20
## [25] grid_4.5.2 stats4_4.5.2 e1071_1.7-17
## [28] future_1.69.0 globals_0.19.1 scales_1.4.0
## [31] iterators_1.0.14 MASS_7.3-65 cli_3.6.6
## [34] rmarkdown_2.30 generics_0.1.4 otel_0.2.0
## [37] rstudioapi_0.17.1 future.apply_1.20.2 reshape2_1.4.5
## [40] cachem_1.1.0 proxy_0.4-29 stringr_1.6.0
## [43] splines_4.5.2 parallel_4.5.2 vctrs_0.7.2
## [46] hardhat_1.4.2 Matrix_1.7-4 jsonlite_2.0.0
## [49] listenv_0.10.1 foreach_1.5.2 gower_1.0.2
## [52] jquerylib_0.1.4 recipes_1.3.1 glue_1.8.0
## [55] parallelly_1.46.1 codetools_0.2-20 lubridate_1.9.4
## [58] stringi_1.8.7 gtable_0.3.6 tibble_3.3.0
## [61] pillar_1.11.1 htmltools_0.5.9 ipred_0.9-15
## [64] lava_1.8.2 R6_2.6.1 evaluate_1.0.5
## [67] bslib_0.9.0 class_7.3-23 Rcpp_1.1.1
## [70] nlme_3.1-168 prodlim_2026.03.11 xfun_0.55
## [73] pkgconfig_2.0.3 ModelMetrics_1.2.2.2