📥 Data loading and label creation

# Load and clean
data <- read_excel("matrix.xlsx") %>% clean_names()

# Build binary label (benign vs cancer)
data <- data %>%
  mutate(
    disease_type = tolower(trimws(disease_type)),
    label = case_when(
      disease_type %in% c("lymphoma", "other cancer") ~ "cancer",
      disease_type == "benign" ~ "benign",
      TRUE ~ NA_character_
    )
  ) %>%
  mutate(label = factor(label, levels = c("benign", "cancer")))

# Quick sanity check
table(data$label, useNA = "ifany")
## 
## benign cancer 
##     22     94
# Feature columns of interest
features <- c(
  "mutation_score","sv_score","cn_score","bcr_clonality","tcr_clonality",
  "ebv","m_tb","tumor_fraction","composite_genomic_score"
)

📊 ROC analysis table (R)

library(knitr)

# Function to run ROC and return scalars
run_one_roc <- function(df, feature, label_col = "label") {
  v <- df[[feature]]
  y <- df[[label_col]]
  
  ok <- complete.cases(v, y)
  v <- as.numeric(v[ok])
  y <- droplevels(y[ok])
  
  n_benign <- sum(y == "benign")
  n_cancer <- sum(y == "cancer")
  n <- length(y)
  
  if (length(unique(y)) < 2 || n_benign < 2 || n_cancer < 2) {
    return(list(
      feature = feature, n = n, n_benign = n_benign, n_cancer = n_cancer,
      auc = NA_real_, threshold = NA_real_, sensitivity = NA_real_,
      specificity = NA_real_, ties = NA, roc_obj = NULL
    ))
  }
  
  r <- roc(
    response = y,
    predictor = v,
    levels = c("benign", "cancer"),
    direction = "auto",
    quiet = TRUE
  )
  
  coords_best <- as.data.frame(
    coords(r, x = "best", ret = c("threshold", "sensitivity", "specificity"),
           best.method = "youden")
  )
  
  ties_flag <- ifelse(nrow(coords_best) > 1, TRUE, FALSE)
  
  list(
    feature = feature,
    n = n, n_benign = n_benign, n_cancer = n_cancer,
    auc = as.numeric(auc(r)[1]),
    threshold = coords_best$threshold[1],
    sensitivity = coords_best$sensitivity[1],
    specificity = coords_best$specificity[1],
    ties = ties_flag,
    roc_obj = r
  )
}

roc_list <- lapply(features, function(f) run_one_roc(data, f))

results_table <- do.call(
  rbind,
  lapply(roc_list, function(x) {
    data.frame(
      Feature     = x$feature,
      N           = x$n,
      N_benign    = x$n_benign,
      N_cancer    = x$n_cancer,
      AUC         = round(x$auc, 3),
      Threshold   = round(x$threshold, 3),
      Sensitivity = round(x$sensitivity, 3),
      Specificity = round(x$specificity, 3),
      Tied_best   = ifelse(is.na(x$ties), NA, x$ties)
    )
  })
)

kable(results_table,
      caption = "ROC results (Youden’s J): AUC, optimal threshold, sensitivity, specificity",
      digits = 3)
ROC results (Youden’s J): AUC, optimal threshold, sensitivity, specificity
Feature N N_benign N_cancer AUC Threshold Sensitivity Specificity Tied_best
mutation_score 116 22 94 0.885 3.500 0.745 0.909 FALSE
sv_score 116 22 94 0.590 0.500 0.181 1.000 FALSE
cn_score 116 22 94 0.705 2.500 0.511 0.909 FALSE
bcr_clonality 116 22 94 0.622 0.062 0.245 1.000 FALSE
tcr_clonality 116 22 94 0.587 0.024 0.511 0.682 FALSE
ebv 116 22 94 0.665 0.500 0.330 1.000 FALSE
m_tb 116 22 94 0.337 -Inf 1.000 0.000 TRUE
tumor_fraction 116 22 94 0.861 0.032 0.798 0.864 FALSE
composite_genomic_score 116 22 94 0.883 7.453 0.798 0.909 FALSE
write.csv(results_table, "ROC_results.csv", row.names = FALSE)

🎨 ROC curves (R, ggplot)

exclude_from_plot <- c("ebv", "m_tb")

roc_named <- setNames(roc_list, features)
roc_keep <- roc_named[!names(roc_named) %in% exclude_from_plot]
roc_keep <- roc_keep[!vapply(roc_keep, function(x) is.null(x$roc_obj), logical(1))]

roc_plot <- lapply(roc_keep, function(x) {
  r <- x$roc_obj
  if (as.numeric(auc(r)) < 0.5) {
    r <- roc(response = r$response,
             predictor = -r$predictor,
             levels = c("benign", "cancer"),
             quiet = TRUE)
  }
  r
})

auc_vals <- vapply(roc_plot, function(r) as.numeric(auc(r)), numeric(1))
roc_plot <- roc_plot[order(auc_vals, decreasing = TRUE)]
auc_vals <- auc_vals[order(auc_vals, decreasing = TRUE)]

auc_labels <- paste0(names(roc_plot), " (AUC = ", sprintf("%.2f", auc_vals), ")")
names(roc_plot) <- auc_labels

g <- ggroc(roc_plot, aes = "colour") +
  scale_color_brewer(palette = "Set1", name = NULL) +
  scale_x_continuous(breaks = seq(0, 1, 0.2), limits = c(0,1), expand = c(0,0)) +
  scale_y_continuous(breaks = seq(0, 1, 0.2), limits = c(0,1), expand = c(0,0)) +
  labs(
    title = "ROC Curves — Benign vs Cancer",
    x = "False Positive Rate (1 - Specificity)",
    y = "True Positive Rate (Sensitivity)"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    legend.position = c(0.25, 0.35),
    legend.key.size = unit(0.4, "cm"),
    legend.text = element_text(size = 8),
    legend.background = element_rect(fill = "white", colour = "grey80"),
    plot.title = element_text(hjust = 0.5)
  )

print(g)

📦 Boxplot of Composite Genomic Score

g_box <- ggplot(data, aes(x = disease_type, y = composite_genomic_score, fill = disease_type)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.7) +
  geom_jitter(width = 0.2, alpha = 0.6, size = 1.5) +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "Composite Genomic Score by Disease Group",
    x = "Disease Group",
    y = "Composite Genomic Score"
  ) +
  theme_classic(base_size = 14) +
  theme(legend.position = "none")

# Commented out stat_compare_means for stability
# comparisons <- list(
#   c("benign", "lymphoma"),
#   c("lymphoma", "other cancer"),
#   c("benign", "other cancer")
# )
# g_box <- g_box + stat_compare_means(comparisons = comparisons, method = "wilcox.test")

print(g_box)

ggsave("Composite_Boxplot.png", g_box, width = 6, height = 5, dpi = 300)

🐍 Python ROC analysis (reticulate)

import matplotlib
matplotlib.use("Agg")  # non-interactive backend

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score

df = pd.read_excel("matrix.xlsx")

y = df["Disease_type"].str.strip().str.lower().map(
    {"benign": 0, "lymphoma": 1, "other cancer": 1}
).values

features = [
    "Mutation_score", "CN_score", "SV_score",
    "BCR_clonality", "TCR_clonality", "tumor_fraction"
]

feature_aucs = {}
for f in features:
    scores = df[f].values
    auc = roc_auc_score(y, scores)
    feature_aucs[f] = auc

sorted_features = sorted(feature_aucs, key=feature_aucs.get, reverse=True)

plt.figure(figsize=(7,6))
for f in sorted_features:
    scores = df[f].values
    fpr, tpr, thresholds = roc_curve(y, scores)
    auc = feature_aucs[f]
    legend_name = "Tumor_fraction" if f == "tumor_fraction" else f
    plt.plot(fpr, tpr, lw=2, label=f"{legend_name} (AUC = {auc:.2f})")

plt.plot([0,1],[0,1],"k--", lw=1)
plt.xlabel("False Positive Rate (1 - Specificity)")
plt.ylabel("True Positive Rate (Sensitivity)")
plt.title("ROC Curves for Individual Features (Benign vs Cancer)")
plt.legend(loc="lower right", fontsize=9)
plt.grid(alpha=0.3)
plt.show()

plt.savefig("ROC_features.png", dpi=300, bbox_inches="tight")

# Composite score
comp_scores = df["Composite_genomic_score"].values
fpr, tpr, thresholds = roc_curve(y, comp_scores)
auc = roc_auc_score(y, comp_scores)

j_scores = tpr - fpr
j_best_idx = j_scores.argmax()
opt_thr = thresholds[j_best_idx]
opt_fpr, opt_tpr = fpr[j_best_idx], tpr[j_best_idx]

plt.figure(figsize=(7,6))
plt.plot(fpr, tpr, color="blue", lw=2, label=f"Composite (AUC = {auc:.2f})")
plt.plot([0,1],[0,1],"k--", lw=1)
plt.scatter(opt_fpr, opt_tpr, color="red", s=60, zorder=5,
            label=f"Optimal threshold = {opt_thr:.2f}")
plt.xlabel("False Positive Rate (1 - Specificity)")
plt.ylabel("True Positive Rate (Sensitivity)")
plt.title("ROC Curve for Composite Genomic Score")
plt.legend(loc="lower right", fontsize=9)
plt.grid(alpha=0.3)
plt.show()

plt.savefig("ROC_composite.png", dpi=300, bbox_inches="tight")