openDVP_zscoring_scDVP

Author

Daniela Valdes, 19.12.2025

Since we expect analysis methods to have highly differing depth coverages and thus differing ranges in relative LFQ values, we attempt to correct by z-scoring per approach, to reveal that all methods capture important biological differences.

This likely did not work since this protein subset could be an artifact of the filtering based on method (scDVP, cnDVP, DVP) coverage and not of acute biological relevance

# load transformed, filtered and imputed matrix of all measurements
protein_data = read.delim("/Users/dvaldes/Downloads/20251219_2134_DV_adatadata.txt", header = TRUE, sep = "\t")
metadata_df = read.delim("/Users/dvaldes/Downloads/20251219_2134_DV_adatametadata.txt", header = TRUE, sep = "\t")
counts = t(as.matrix(protein_data))
colnamies = as.character(unlist(counts[1, ]))
counts = counts[-1, , drop = FALSE]
rownamies = rownames(counts)
counts = apply(counts, 2, as.numeric)
rownames(counts) = rownamies
colnames(counts) = colnamies

cat("Number of proteins in matrix:", (nrow((counts))))
Number of proteins in matrix: 84
# make sce for easier downstream analyses 
sce = SingleCellExperiment(assays = list(counts = counts), 
                           colData = metadata_df)

# Separate per approach and do a z-scoring that accounds for method-specific variations in protein abundances
sce_approach1 = sce[, sce$approach == "scDVP"]
counts_approach1 = as.matrix(counts(sce_approach1))
colnames1 = colnames(counts_approach1)
rownames1 = rownames(counts_approach1)
counts_approach1 = apply(counts_approach1, 2, as.numeric)
colnames(counts_approach1) = colnames1
rownames(counts_approach1) = rownames1
z_approach1 = t(scale(t(counts_approach1)))

sce_approach2 = sce[, sce$approach == "cnDVP"]
counts_approach2 = as.matrix(counts(sce_approach2))
colnames2 = colnames(counts_approach2)
rownames2 = rownames(counts_approach2)
counts_approach2 = apply(counts_approach2, 2, as.numeric)
colnames(counts_approach2) = colnames2
rownames(counts_approach2) = rownames2
z_approach2 = t(scale(t(counts_approach2)))

sce_approach3 = sce[, sce$approach == "DVP"]
counts_approach3 = as.matrix(counts(sce_approach3))
colnames3 = colnames(counts_approach3)
counts_approach3 = counts_approach3
rownames3 = rownames(counts_approach3)
counts_approach3 = apply(counts_approach3, 2, as.numeric)
colnames(counts_approach3) = colnames3
rownames(counts_approach3) = rownames3
z_approach3 = t(scale(t(counts_approach3)))

# Join 
corrected_mat = cbind(z_approach1, z_approach2, z_approach3)  

In the original paper data as well as this new dataset, PC2 appears to separate between the biological replicates (primary vs relapse, i.e. 991 vs 992).

The PCA shows this approach also corrects for biological replicate in this subset of proteins, supporting the notion that this protein subset is more an artifact of the filtering based on coverage and not of acute biological relevance (i.e. important biology was lost with this filtering).

# PCA
pca = prcomp(t(corrected_mat))
pca_df = as.data.frame(pca$x)
pca_df$approach = metadata_df$approach
pca_df$sample_id = as.factor(metadata_df$sample_id)

explained = summary(pca)$importance[2, 1:2] * 100

label_cols = c(DVP = "blue3", scDVP = "green4", cnDVP = "orange2")  

a = ggplot(pca_df, aes(x = PC1, y = PC2, fill = approach, color = approach, shape = sample_id)) +
  geom_point(size = 3, alpha = 0.8) +
  stat_ellipse(type = "norm", linetype = "dashed") +
  scale_color_manual(values = label_cols)+
  labs(
    x = paste0("PC1 (", round(explained[1], 1), "% variance)"),
    y = paste0("PC2 (", round(explained[2], 1), "% variance)"),
    title = "PCA zscore per method"
  ) +
  theme_minimal(base_size = 14) +
  theme(legend.position = "right")

pca = prcomp(t(counts))
pca_df = as.data.frame(pca$x)
pca_df$approach = metadata_df$approach
pca_df$sample_id = as.factor(metadata_df$sample_id)

explained = summary(pca)$importance[2, 1:2] * 100


b = ggplot(pca_df, aes(x = PC1, y = PC2, fill = approach, color = approach, shape = sample_id)) +
  geom_point(size = 3, alpha = 0.8) +
  stat_ellipse(type = "norm", linetype = "dashed") +
  scale_color_manual(values = label_cols)+
  labs(
    x = paste0("PC1 (", round(explained[1], 1), "% variance)"),
    y = paste0("PC2 (", round(explained[2], 1), "% variance)"),
    title = "PCA original"
  ) +
  theme_minimal(base_size = 14) +
  theme(legend.position = "right")

cowplot::plot_grid(b, a, nrow = 2)

This is also supperted within this protein subset by heatmap and cluster analysis of zscored proteins and also when subjected to COMBAT batch correction.

# do combat correction
batch = as.factor(metadata_df$approach) 
model = model.matrix(~ sample_id, data = metadata_df)  # to preserve biology
expr_bc = ComBat(dat = counts, batch = batch, mod = model)
ann_col= data.frame(
  approach = metadata_df$approach,
  sample_id = factor(as.character(metadata_df$sample_id))
)
rownames(ann_col) = metadata_df$Sample_filepath

scale_max = 3
scale_min = -3
my_breaks = seq(scale_min, scale_max, length.out = 100)

sample_id_levels = levels(ann_col$sample_id)
ann_colors = list(
  approach = c(DVP = "blue3", scDVP = "green4", cnDVP = "orange2"),
  sample_id = c("991" = "purple", "992" = "cyan3")  
)

pheatmap(counts,
         main = "Abundances row zscore",
         scale = "row", 
         show_rownames = FALSE, 
         show_colnames = FALSE, 
         annotation_col = ann_col, 
         annotation_colors = ann_colors,
         breaks = my_breaks)

pheatmap(corrected_mat,
         main = "Abundances method zscored",
         scale = "none", 
         show_rownames = FALSE, 
         show_colnames = FALSE, 
         annotation_col = ann_col, 
         annotation_colors = ann_colors,
         breaks = my_breaks)

pheatmap(expr_bc,
         main = "Abundances COMBAT",
         scale = "row", 
         show_rownames = FALSE, 
         show_colnames = FALSE, 
         annotation_col = ann_col, 
         annotation_colors = ann_colors,
         breaks = my_breaks)