# ================================
# 1. Load libraries
# ================================
library(pheatmap)
library(EnhancedVolcano)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.3
## Loading required package: ggrepel
## Warning: package 'ggrepel' was built under R version 4.3.3
library(tidyverse)
## Warning: package 'tidyr' was built under R version 4.3.2
## Warning: package 'purrr' was built under R version 4.3.3
## Warning: package 'lubridate' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0.9000 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4 ✔ tibble 3.2.1
## ✔ purrr 1.0.4 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(M3C)
library(ggplot2)
library(limma)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(variancePartition)
## Loading required package: BiocParallel
## Warning: package 'BiocParallel' was built under R version 4.3.1
##
## Attaching package: 'variancePartition'
##
## The following object is masked from 'package:limma':
##
## topTable
library(BiocParallel)
library(gridExtra)
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
library(ggrepel)
library(patchwork)
## Warning: package 'patchwork' was built under R version 4.3.3
# ================================
# 2. Load data
# ================================
CSF_data <- read.csv("~/Desktop/KV1.3_CSF.PAP1.30july2025/CSF_PAP1_Proteinid_50percent_IMPUTATION_matrix.unique.csv",
row.names = 1)
CSF_metadata <- readRDS("~/Desktop/KV1.3_CSF.PAP1.30july2025/CSF.KV1.3.Proteomics.Analysis/CSF_PAP1_Metadata.rds")
CSF_metadata <- CSF_metadata[colnames(CSF_data), ] # match order
# ================================
# 3. Define blood contamination markers (mouse) in lowercase
# ================================
#blood_markers_mouse <- c("cd24", "cd235a", "cd45", "ter119", "glya", "cd71","cd49d", "hla-dr", "cd33", "cd11b", "hbb", "spi1", "alas2", "hba2")
#blood_markers_mouse <- c("Hba", "Hbb", "Hbd", "Alb", "Tf", "Fga", "Fgb", "Fgg", "Hp", "Hpx", "Apoa1", "Apoa2", "Apoe")
blood_markers_mouse <- c("Alb", "Ighg1", "Ighg2b", "Ighg2c", "Ighg3", "Ighm", "Trf", "Fga", "Fgb", "Fgg", "Hba-a1", "Hbb-b1", "C3", "C4a", "C4b", "Serpina1a", "A2m", "Apoa1", "Apob", "Apoe", "Hp", "Fth1", "Ftl1", "Cp", "F8", "Vwf", "F2", "Serpinc1", "Plg", "Vegfa")
blood_markers <- blood_markers_mouse[blood_markers_mouse %in% rownames(CSF_data)]
if (length(blood_markers) == 0) stop("No blood contamination markers found in dataset!")
# ================================
# 4. Blood contamination score
# ================================
blood_matrix <- t(CSF_data[blood_markers, ])
blood_score <- rowMeans(blood_matrix)
CSF_metadata$blood_score <- blood_score
# ================================
# 5. VPA BEFORE correction
# ================================
form <- ~ blood_score + (1|Group)
vpa_before <- fitExtractVarPartModel(as.matrix(CSF_data), form, CSF_metadata, BPPARAM = SnowParam(4))
## Dividing work into 100 chunks...
##
## Total:25 s
p1 <- plotVarPart(vpa_before) + ggtitle("VPA - BEFORE Blood Correction")
p1

blood_var_before <- mean(vpa_before["blood_score", ])
## Warning in mean.default(vpa_before["blood_score", ]): argument is not numeric
## or logical: returning NA
# ================================
# 6. PCA BEFORE correction (annotated)
# ================================
pca_before <- prcomp(t(CSF_data), scale. = TRUE)
p_pca_before <- fviz_pca_ind(pca_before, geom = "point", habillage = CSF_metadata$Group, addEllipses = TRUE) +
ggtitle("PCA - BEFORE Blood Correction")
blood_loadings_before <- as.data.frame(pca_before$rotation)
blood_loadings_before$Protein <- rownames(blood_loadings_before)
blood_annot_before <- blood_loadings_before %>% filter(tolower(Protein) %in% blood_markers_mouse)
p_pca_before <- p_pca_before +
geom_text_repel(data = blood_annot_before, aes(x = PC1, y = PC2, label = Protein),
color = "red", size = 10, inherit.aes = FALSE)
p_pca_before
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse

# ================================
# 7. Regress out blood effect
# ================================
corrected_data <- removeBatchEffect(CSF_data, covariates = CSF_metadata$blood_score)
# ================================
# 8. VPA AFTER correction
# ================================
vpa_after <- fitExtractVarPartModel(as.matrix(corrected_data), form, CSF_metadata, BPPARAM = SnowParam(4))
## Dividing work into 100 chunks...
##
## Total:23 s
head(vpa_after, 5)
## Group blood_score Residuals
## Gbp6 0.3023920 0.0213845351 0.6762234
## Nova2 0.1334427 0.0027416719 0.8638157
## Aspg 0.3912072 0.0187368495 0.5900559
## Cul4b 0.5615592 0.0465022580 0.3919386
## Arfgef2 0.1272203 0.0002068904 0.8725728
# Check for NA values
summary(vpa_after)
## Group blood_score Residuals
## Min. :0.0000 Min. :0.00000 Min. :0.1370
## 1st Qu.:0.1360 1st Qu.:0.00074 1st Qu.:0.5304
## Median :0.2947 Median :0.00676 Median :0.6932
## Mean :0.2952 Mean :0.01291 Mean :0.6919
## 3rd Qu.:0.4485 3rd Qu.:0.02036 3rd Qu.:0.8619
## Max. :0.8610 Max. :0.11456 Max. :1.0000
## NA's :176 NA's :176 NA's :176
# Replace NA with 0 (or filter them out)
vpa_after_clean <- vpa_after
vpa_after_clean[is.na(vpa_after_clean)] <- 0
# Re-plot VPA after cleaning
p2 <- plotVarPart(vpa_after_clean) +
ggtitle("VPA - AFTER Blood Correction")
p2

# Recalculate blood variance effect
blood_var_after <- mean(vpa_after_clean["blood_score", ], na.rm = TRUE)
## Warning in mean.default(vpa_after_clean["blood_score", ], na.rm = TRUE):
## argument is not numeric or logical: returning NA
variance_reduction <- ((blood_var_before - blood_var_after) / blood_var_before) * 100
cat("Blood effect variance reduced by:", round(variance_reduction, 2), "%\n")
## Blood effect variance reduced by: NA %
# Annotated plot
p2_annotated <- p2 +
annotate("text",
x = 2, y = max(vpa_after_clean, na.rm = TRUE) * 0.9,
label = paste0("Blood effect ↓ ", round(variance_reduction, 1), "%"),
color = "red", size = 5, fontface = "bold")
p2_annotated

# 9. PCA AFTER correction (annotated)
# ================================
pca_after <- prcomp(t(corrected_data), scale. = TRUE)
p_pca_after <- fviz_pca_ind(pca_after, geom = "point", habillage = CSF_metadata$Group, addEllipses = TRUE) +
ggtitle("PCA - AFTER Blood Correction")
blood_loadings_after <- as.data.frame(pca_after$rotation)
blood_loadings_after$Protein <- rownames(blood_loadings_after)
blood_annot_after <- blood_loadings_after %>% filter(tolower(Protein) %in% blood_markers_mouse)
p_pca_after <- p_pca_after +
geom_text_repel(data = blood_annot_after, aes(x = PC1, y = PC2, label = Protein),
color = "blue", size = 4, inherit.aes = FALSE)
# Combine PCA plots
p_pca_combined <- p_pca_before | p_pca_after
p_pca_combined
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse

# 10. Blood protein paired boxplot
# ================================
before_blood <- colMeans(CSF_data[blood_markers, ])
after_blood <- colMeans(corrected_data[blood_markers, ])
wilcox_result <- wilcox.test(before_blood, after_blood, paired = TRUE)
cat("Wilcoxon signed-rank test p-value for blood marker reduction:", wilcox_result$p.value, "\n")
## Wilcoxon signed-rank test p-value for blood marker reduction: 1.907349e-06
blood_df <- data.frame(
Sample = rep(colnames(CSF_data), 2),
Level = c(before_blood, after_blood),
Stage = rep(c("Before", "After"), each = length(before_blood))
)
head(blood_df, 5)
## Sample Level Stage
## 1 CSF_PAP1_SHK.P1 23.09521 Before
## 2 CSF_PAP1_SHK.P2 24.70939 Before
## 3 CSF_PAP1_SHK.P3 25.95392 Before
## 4 CSF_PAP1_SHK.C1 26.75385 Before
## 5 CSF_PAP1_SHK.C2 26.90857 Before
p3 <- ggplot(blood_df, aes(x = Stage, y = Level, group = Sample)) +
geom_line(alpha = 0.5) +
geom_point(aes(color = Stage), size = 3) +
theme_minimal() +
ggtitle(paste0("Blood Protein Levels (p = ", signif(wilcox_result$p.value, 3), ")")) +
ylab("Mean Blood Protein Intensity")
p3

# ================================
# 11. Combine all plots into one figure
# ================================
final_plot <- (p1 | p2_annotated) / (p_pca_combined | p3)
final_plot
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse

# ================================
# 12. Save results
# ================================
ggsave("~/Desktop/KV1.3_CSF.PAP1.30july2025/CSF_BloodEffect_VPA_PCA_Annotated.png",
final_plot, width = 16, height = 12, dpi = 300)
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
write.csv(corrected_data,
"~/Desktop/KV1.3_CSF.PAP1.30july2025/CSF_PAP1_Proteinid_50percent_IMPUTATION_matrix.blood_corrected.csv")
# 1 Clean up NA values
vpa_before_clean <- vpa_before
vpa_before_clean[is.na(vpa_before_clean)] <- 0
vpa_after_clean <- vpa_after
vpa_after_clean[is.na(vpa_after_clean)] <- 0
# 2 Extract numeric vectors for blood score variance (as column, not row)
blood_var_before_vec <- vpa_before_clean$blood_score
blood_var_after_vec <- vpa_after_clean$blood_score
# 3️ Calculate mean blood variance reduction
blood_var_before <- mean(blood_var_before_vec, na.rm = TRUE)
blood_var_after <- mean(blood_var_after_vec, na.rm = TRUE)
variance_reduction <- ((blood_var_before - blood_var_after) / blood_var_before) * 100
cat("Blood effect variance reduced by:", round(variance_reduction, 2), "%\n")
## Blood effect variance reduced by: 98 %
# 4 QC: Top 50 proteins BEFORE correction
top_idx <- order(blood_var_before_vec, decreasing = TRUE)[1:50]
qc_df <- data.frame(
Protein = rownames(vpa_before_clean)[top_idx],
BloodVar_Before = blood_var_before_vec[top_idx],
BloodVar_After = blood_var_after_vec[top_idx]
)
print(qc_df)
## Protein BloodVar_Before BloodVar_After
## 1 Plg 0.9667348 4.209198e-03
## 2 Cfb 0.9653926 6.101827e-04
## 3 Fgg 0.9628480 1.238944e-03
## 4 Apoa4 0.9614168 2.971136e-29
## 5 Fgb 0.9605727 4.548948e-03
## 6 Serpinf2 0.9602677 2.495213e-04
## 7 C3 0.9593456 1.140119e-28
## 8 C5 0.9590446 1.522844e-03
## 9 Fga 0.9590346 2.090209e-04
## 10 Ahsg 0.9577168 0.000000e+00
## 11 Arhgap31 0.9574615 4.666708e-04
## 12 F2 0.9571769 1.111867e-03
## 13 Cfi 0.9570279 1.588331e-03
## 14 GCAB 0.9567476 0.000000e+00
## 15 Cela1 0.9540920 1.954916e-04
## 16 C1s1 0.9531240 1.659745e-03
## 17 Ighg2b 0.9515195 8.233682e-29
## 18 Ambp 0.9510674 5.929704e-06
## 19 Vwf 0.9505519 3.972312e-27
## 20 Itih2 0.9482417 3.145124e-05
## 21 Gpx3 0.9470157 4.368753e-03
## 22 Igkc 0.9467733 3.138888e-28
## 23 Hpx 0.9463409 4.429426e-03
## 24 Itih1 0.9458954 1.174404e-06
## 25 Gpld1 0.9452034 8.164253e-05
## 26 Tf 0.9448900 7.581785e-27
## 27 Serpina1b 0.9428779 9.285564e-31
## 28 Klkb1 0.9403044 1.974471e-03
## 29 Serpina10 0.9394209 2.049979e-04
## 30 Hgfac 0.9386660 9.589659e-05
## 31 Gsn 0.9381400 1.090296e-05
## 32 Kng1 0.9367674 4.288541e-03
## 33 Inhca 0.9366817 1.531680e-02
## 34 Postn 0.9356554 2.368516e-03
## 35 Igkv5-48 0.9342478 3.280640e-27
## 36 Mug1 0.9331267 0.000000e+00
## 37 Cpn2 0.9316329 2.612944e-05
## 38 Selenop 0.9284976 3.512434e-03
## 39 Igkv6-17 0.9283687 2.097635e-28
## 40 Spp2 0.9269293 2.763294e-03
## 41 F13b 0.9259803 6.671188e-27
## 42 Masp1 0.9258608 1.250388e-03
## 43 Apoh 0.9258043 2.537687e-02
## 44 Serpina3m 0.9257657 3.439177e-05
## 45 Azgp1 0.9256004 2.891170e-03
## 46 Il1rap 0.9250053 1.725561e-03
## 47 Alb 0.9245062 9.526116e-04
## 48 F13a1 0.9241661 9.736577e-06
## 49 B2m 0.9221890 3.148021e-02
## 50 Igfals 0.9216181 2.360624e-02
# 5 Plot VPA AFTER correction
p2 <- plotVarPart(vpa_after_clean) +
ggtitle("VPA - AFTER Blood Correction")
p2_annotated <- p2 +
annotate("text",
x = 2, y = max(blood_var_after_vec, na.rm = TRUE) * 0.9,
label = paste0("Blood effect ↓ ", round(variance_reduction, 1), "%"),
color = "red", size = 5, fontface = "bold")
print(p2_annotated)

library(tidyverse)
# Convert to percentage for plotting
qc_df_long <- qc_df %>%
pivot_longer(cols = c(BloodVar_Before, BloodVar_After),
names_to = "Stage", values_to = "Variance") %>%
group_by(Protein) %>%
mutate(Variance_Percent = (Variance / sum(Variance, na.rm = TRUE)) * 100) %>%
ungroup()
# Factor ordering by variance before correction (largest to smallest)
qc_df_long$Protein <- factor(qc_df_long$Protein,
levels = qc_df$Protein[order(qc_df$BloodVar_Before, decreasing = TRUE)])
# Plot stacked bar (percentage-based)
ggplot(qc_df_long, aes(x = Protein, y = Variance_Percent, fill = Stage)) +
geom_bar(stat = "identity", position = "stack") +
scale_fill_manual(values = c("BloodVar_Before" = "red", "BloodVar_After" = "blue")) +
labs(
title = "Top 50 Proteins - Blood Effect Variance (Before vs After Correction)",
x = "Proteins",
y = "Variance Contribution (%)",
fill = "Stage"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 6),
plot.title = element_text(size = 14, face = "bold")
)

library(variancePartition)
# Reorder proteins by blood effect before correction
GroupSortOrder <- order(vpa_before_clean[["blood_score"]], decreasing = TRUE)
# Reorder rows for all variance components
for (i in ls(vpa_before_clean)) {
vpa_before_clean[[i]] <- vpa_before_clean[[i]][GroupSortOrder]
}
rownames(vpa_before_clean) <- rownames(vpa_before_clean)[GroupSortOrder]
# Plot top 50 proteins based on blood variance (BEFORE correction)
plotPercentBars(vpa_before_clean[1:50, ]) +
ggtitle("Top 50 Proteins Associated with Blood Effect (Before Correction)")

# After correction
GroupSortOrder <- order(vpa_after[["blood_score"]], decreasing = TRUE)
for (i in ls(vpa_after)) {
vpa_after[[i]] <- vpa_after[[i]][GroupSortOrder]
}
rownames(vpa_after) <- rownames(vpa_after)[GroupSortOrder]
plotPercentBars(vpa_after[1:50, ]) +
ggtitle("Top 50 Proteins Associated with Blood Effect (After Correction)")

library(variancePartition)
library(patchwork) # for side-by-side plots
# Sort proteins by blood variance BEFORE correction
GroupSortOrder <- order(vpa_before_clean[["blood_score"]], decreasing = TRUE)
for (i in ls(vpa_before_clean)) {
vpa_before_clean[[i]] <- vpa_before_clean[[i]][GroupSortOrder]
}
rownames(vpa_before_clean) <- rownames(vpa_before_clean)[GroupSortOrder]
# Plot BEFORE correction
p_before <- plotPercentBars(vpa_before_clean[1:50, ]) +
ggtitle("Top 50 Proteins (Blood Effect - BEFORE Correction)")
# Sort proteins by blood variance AFTER correction
GroupSortOrder <- order(vpa_after[["blood_score"]], decreasing = TRUE)
for (i in ls(vpa_after)) {
vpa_after[[i]] <- vpa_after[[i]][GroupSortOrder]
}
rownames(vpa_after) <- rownames(vpa_after)[GroupSortOrder]
# Plot AFTER correction
p_after <- plotPercentBars(vpa_after[1:50, ]) +
ggtitle("Top 50 Proteins (Blood Effect - AFTER Correction)")
# Combine the two plots side-by-side
combined_plot <- p_before | p_after
combined_plot

library(ggplot2)
library(patchwork) # to combine plots
# Perform PCA before correction
pca_before <- prcomp(t(as.matrix(CSF_data)), scale. = TRUE)
pca_before_df <- data.frame(
PC1 = pca_before$x[, 1],
PC2 = pca_before$x[, 2],
Group = CSF_metadata$Group
)
# Plot BEFORE correction
p_pca_before <- ggplot(pca_before_df, aes(x = PC1, y = PC2, color = Group)) +
geom_point(size = 4) +
xlab(paste0("PC1 (", round(summary(pca_before)$importance[2, 1] * 100, 1), "% variance)")) +
ylab(paste0("PC2 (", round(summary(pca_before)$importance[2, 2] * 100, 1), "% variance)")) +
ggtitle("PCA - BEFORE Blood Correction") +
theme_bw()
# Perform PCA after correction
pca_after <- prcomp(t(as.matrix(corrected_data)), scale. = TRUE)
pca_after_df <- data.frame(
PC1 = pca_after$x[, 1],
PC2 = pca_after$x[, 2],
Group = CSF_metadata$Group
)
# Plot AFTER correction
p_pca_after <- ggplot(pca_after_df, aes(x = PC1, y = PC2, color = Group)) +
geom_point(size = 4) +
xlab(paste0("PC1 (", round(summary(pca_after)$importance[2, 1] * 100, 1), "% variance)")) +
ylab(paste0("PC2 (", round(summary(pca_after)$importance[2, 2] * 100, 1), "% variance)")) +
ggtitle("PCA - AFTER Blood Correction") +
theme_bw()
# Combine the two PCA plots
combined_pca <- p_pca_before | p_pca_after
combined_pca

# ===========================
library(ggplot2)
df <- CSF_metadata
ggplot(df, aes(x = Group, y = blood_score, fill = Group)) +
geom_boxplot() +
theme_minimal() +
labs(title = "Blood Score by Group", x = "Group", y = "Blood Score") +
theme(legend.position = "none")
