📥 Data loading and label creation
# Load and clean
data <- read_excel("matrix.xlsx") %>% clean_names()
# Build binary label (benign vs cancer)
data <- data %>%
mutate(
disease_type = tolower(trimws(disease_type)),
label = case_when(
disease_type %in% c("lymphoma", "other cancer") ~ "cancer",
disease_type == "benign" ~ "benign",
TRUE ~ NA_character_
)
) %>%
mutate(label = factor(label, levels = c("benign", "cancer")))
# Quick sanity check
table(data$label, useNA = "ifany")
##
## benign cancer
## 22 94
# Feature columns of interest
features <- c(
"mutation_score","sv_score","cn_score","bcr_clonality","tcr_clonality",
"ebv","m_tb","tumor_fraction","composite_genomic_score"
)
📊 ROC analysis table (R)
library(knitr)
# Function to run ROC and return scalars
run_one_roc <- function(df, feature, label_col = "label") {
v <- df[[feature]]
y <- df[[label_col]]
ok <- complete.cases(v, y)
v <- as.numeric(v[ok])
y <- droplevels(y[ok])
n_benign <- sum(y == "benign")
n_cancer <- sum(y == "cancer")
n <- length(y)
if (length(unique(y)) < 2 || n_benign < 2 || n_cancer < 2) {
return(list(
feature = feature, n = n, n_benign = n_benign, n_cancer = n_cancer,
auc = NA_real_, threshold = NA_real_, sensitivity = NA_real_,
specificity = NA_real_, ties = NA, roc_obj = NULL
))
}
r <- roc(
response = y,
predictor = v,
levels = c("benign", "cancer"),
direction = "auto",
quiet = TRUE
)
coords_best <- as.data.frame(
coords(r, x = "best", ret = c("threshold", "sensitivity", "specificity"),
best.method = "youden")
)
ties_flag <- ifelse(nrow(coords_best) > 1, TRUE, FALSE)
list(
feature = feature,
n = n, n_benign = n_benign, n_cancer = n_cancer,
auc = as.numeric(auc(r)[1]),
threshold = coords_best$threshold[1],
sensitivity = coords_best$sensitivity[1],
specificity = coords_best$specificity[1],
ties = ties_flag,
roc_obj = r
)
}
roc_list <- lapply(features, function(f) run_one_roc(data, f))
results_table <- do.call(
rbind,
lapply(roc_list, function(x) {
data.frame(
Feature = x$feature,
N = x$n,
N_benign = x$n_benign,
N_cancer = x$n_cancer,
AUC = round(x$auc, 3),
Threshold = round(x$threshold, 3),
Sensitivity = round(x$sensitivity, 3),
Specificity = round(x$specificity, 3),
Tied_best = ifelse(is.na(x$ties), NA, x$ties)
)
})
)
kable(results_table,
caption = "ROC results (Youden’s J): AUC, optimal threshold, sensitivity, specificity",
digits = 3)
ROC results (Youden’s J): AUC, optimal threshold, sensitivity,
specificity
| mutation_score |
116 |
22 |
94 |
0.885 |
3.500 |
0.745 |
0.909 |
FALSE |
| sv_score |
116 |
22 |
94 |
0.590 |
0.500 |
0.181 |
1.000 |
FALSE |
| cn_score |
116 |
22 |
94 |
0.705 |
2.500 |
0.511 |
0.909 |
FALSE |
| bcr_clonality |
116 |
22 |
94 |
0.622 |
0.062 |
0.245 |
1.000 |
FALSE |
| tcr_clonality |
116 |
22 |
94 |
0.587 |
0.024 |
0.511 |
0.682 |
FALSE |
| ebv |
116 |
22 |
94 |
0.665 |
0.500 |
0.330 |
1.000 |
FALSE |
| m_tb |
116 |
22 |
94 |
0.337 |
-Inf |
1.000 |
0.000 |
TRUE |
| tumor_fraction |
116 |
22 |
94 |
0.861 |
0.032 |
0.798 |
0.864 |
FALSE |
| composite_genomic_score |
116 |
22 |
94 |
0.883 |
7.453 |
0.798 |
0.909 |
FALSE |
write.csv(results_table, "ROC_results.csv", row.names = FALSE)
🎨 ROC curves (R, ggplot)
exclude_from_plot <- c("ebv", "m_tb")
roc_named <- setNames(roc_list, features)
roc_keep <- roc_named[!names(roc_named) %in% exclude_from_plot]
roc_keep <- roc_keep[!vapply(roc_keep, function(x) is.null(x$roc_obj), logical(1))]
roc_plot <- lapply(roc_keep, function(x) {
r <- x$roc_obj
if (as.numeric(auc(r)) < 0.5) {
r <- roc(response = r$response,
predictor = -r$predictor,
levels = c("benign", "cancer"),
quiet = TRUE)
}
r
})
auc_vals <- vapply(roc_plot, function(r) as.numeric(auc(r)), numeric(1))
roc_plot <- roc_plot[order(auc_vals, decreasing = TRUE)]
auc_vals <- auc_vals[order(auc_vals, decreasing = TRUE)]
auc_labels <- paste0(names(roc_plot), " (AUC = ", sprintf("%.2f", auc_vals), ")")
names(roc_plot) <- auc_labels
g <- ggroc(roc_plot, aes = "colour") +
scale_color_brewer(palette = "Set1", name = NULL) +
scale_x_continuous(breaks = seq(0, 1, 0.2), limits = c(0,1), expand = c(0,0)) +
scale_y_continuous(breaks = seq(0, 1, 0.2), limits = c(0,1), expand = c(0,0)) +
labs(
title = "ROC Curves — Benign vs Cancer",
x = "False Positive Rate (1 - Specificity)",
y = "True Positive Rate (Sensitivity)"
) +
theme_minimal(base_size = 14) +
theme(
legend.position = c(0.25, 0.35),
legend.key.size = unit(0.4, "cm"),
legend.text = element_text(size = 8),
legend.background = element_rect(fill = "white", colour = "grey80"),
plot.title = element_text(hjust = 0.5)
)
print(g)

📦 Boxplot of Composite Genomic Score
g_box <- ggplot(data, aes(x = disease_type, y = composite_genomic_score, fill = disease_type)) +
geom_boxplot(outlier.shape = NA, alpha = 0.7) +
geom_jitter(width = 0.2, alpha = 0.6, size = 1.5) +
scale_fill_brewer(palette = "Set2") +
labs(
title = "Composite Genomic Score by Disease Group",
x = "Disease Group",
y = "Composite Genomic Score"
) +
theme_classic(base_size = 14) +
theme(legend.position = "none")
# Commented out stat_compare_means for stability
# comparisons <- list(
# c("benign", "lymphoma"),
# c("lymphoma", "other cancer"),
# c("benign", "other cancer")
# )
# g_box <- g_box + stat_compare_means(comparisons = comparisons, method = "wilcox.test")
print(g_box)

ggsave("Composite_Boxplot.png", g_box, width = 6, height = 5, dpi = 300)
🐍 Python ROC analysis (reticulate)
import matplotlib
matplotlib.use("Agg") # non-interactive backend
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score
df = pd.read_excel("matrix.xlsx")
y = df["Disease_type"].str.strip().str.lower().map(
{"benign": 0, "lymphoma": 1, "other cancer": 1}
).values
features = [
"Mutation_score", "CN_score", "SV_score",
"BCR_clonality", "TCR_clonality", "tumor_fraction"
]
feature_aucs = {}
for f in features:
scores = df[f].values
auc = roc_auc_score(y, scores)
feature_aucs[f] = auc
sorted_features = sorted(feature_aucs, key=feature_aucs.get, reverse=True)
plt.figure(figsize=(7,6))
for f in sorted_features:
scores = df[f].values
fpr, tpr, thresholds = roc_curve(y, scores)
auc = feature_aucs[f]
legend_name = "Tumor_fraction" if f == "tumor_fraction" else f
plt.plot(fpr, tpr, lw=2, label=f"{legend_name} (AUC = {auc:.2f})")
plt.plot([0,1],[0,1],"k--", lw=1)
plt.xlabel("False Positive Rate (1 - Specificity)")
plt.ylabel("True Positive Rate (Sensitivity)")
plt.title("ROC Curves for Individual Features (Benign vs Cancer)")
plt.legend(loc="lower right", fontsize=9)
plt.grid(alpha=0.3)
plt.show()

plt.savefig("ROC_features.png", dpi=300, bbox_inches="tight")
# Composite score
comp_scores = df["Composite_genomic_score"].values
fpr, tpr, thresholds = roc_curve(y, comp_scores)
auc = roc_auc_score(y, comp_scores)
j_scores = tpr - fpr
j_best_idx = j_scores.argmax()
opt_thr = thresholds[j_best_idx]
opt_fpr, opt_tpr = fpr[j_best_idx], tpr[j_best_idx]
plt.figure(figsize=(7,6))
plt.plot(fpr, tpr, color="blue", lw=2, label=f"Composite (AUC = {auc:.2f})")
plt.plot([0,1],[0,1],"k--", lw=1)
plt.scatter(opt_fpr, opt_tpr, color="red", s=60, zorder=5,
label=f"Optimal threshold = {opt_thr:.2f}")
plt.xlabel("False Positive Rate (1 - Specificity)")
plt.ylabel("True Positive Rate (Sensitivity)")
plt.title("ROC Curve for Composite Genomic Score")
plt.legend(loc="lower right", fontsize=9)
plt.grid(alpha=0.3)
plt.show()

plt.savefig("ROC_composite.png", dpi=300, bbox_inches="tight")
