# ================================
# 1. Load libraries
# ================================
library(pheatmap)
library(EnhancedVolcano)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.3
## Loading required package: ggrepel
## Warning: package 'ggrepel' was built under R version 4.3.3
library(tidyverse)
## Warning: package 'tidyr' was built under R version 4.3.2
## Warning: package 'purrr' was built under R version 4.3.3
## Warning: package 'lubridate' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4          ✔ readr     2.1.5     
## ✔ forcats   1.0.0.9000     ✔ stringr   1.5.1     
## ✔ lubridate 1.9.4          ✔ tibble    3.2.1     
## ✔ purrr     1.0.4          ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(M3C)
library(ggplot2)
library(limma)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(variancePartition)
## Loading required package: BiocParallel
## Warning: package 'BiocParallel' was built under R version 4.3.1
## 
## Attaching package: 'variancePartition'
## 
## The following object is masked from 'package:limma':
## 
##     topTable
library(BiocParallel)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
library(ggrepel)
library(patchwork)
## Warning: package 'patchwork' was built under R version 4.3.3
# ================================
# 2. Load data
# ================================
CSF_data <- read.csv("~/Desktop/KV1.3_CSF.PAP1.30july2025/CSF_PAP1_Proteinid_50percent_IMPUTATION_matrix.unique.csv",
                     row.names = 1)
CSF_metadata <- readRDS("~/Desktop/KV1.3_CSF.PAP1.30july2025/CSF.KV1.3.Proteomics.Analysis/CSF_PAP1_Metadata.rds")
CSF_metadata <- CSF_metadata[colnames(CSF_data), ]  # match order

# ================================
# 3. Define blood contamination markers (mouse) in lowercase
# ================================
#blood_markers_mouse <- c("cd24", "cd235a", "cd45", "ter119", "glya", "cd71","cd49d", "hla-dr", "cd33", "cd11b", "hbb", "spi1", "alas2", "hba2")

#blood_markers_mouse <- c("Hba", "Hbb", "Hbd", "Alb", "Tf", "Fga", "Fgb", "Fgg", "Hp", "Hpx", "Apoa1", "Apoa2", "Apoe")
blood_markers_mouse <- c("Alb", "Ighg1", "Ighg2b", "Ighg2c", "Ighg3", "Ighm", "Trf", "Fga", "Fgb", "Fgg", "Hba-a1", "Hbb-b1", "C3", "C4a", "C4b", "Serpina1a", "A2m", "Apoa1", "Apob", "Apoe", "Hp", "Fth1", "Ftl1", "Cp", "F8", "Vwf", "F2", "Serpinc1", "Plg", "Vegfa")

blood_markers <- blood_markers_mouse[blood_markers_mouse %in% rownames(CSF_data)]

if (length(blood_markers) == 0) stop("No blood contamination markers found in dataset!")

# ================================
# 4. Blood contamination score
# ================================
blood_matrix <- t(CSF_data[blood_markers, ])
blood_score <- rowMeans(blood_matrix)
CSF_metadata$blood_score <- blood_score

# ================================
# 5. VPA BEFORE correction
# ================================
form <- ~ blood_score + (1|Group)
vpa_before <- fitExtractVarPartModel(as.matrix(CSF_data), form, CSF_metadata, BPPARAM = SnowParam(4))
## Dividing work into 100 chunks...
## 
## Total:25 s
p1 <- plotVarPart(vpa_before) + ggtitle("VPA - BEFORE Blood Correction")
p1

blood_var_before <- mean(vpa_before["blood_score", ])
## Warning in mean.default(vpa_before["blood_score", ]): argument is not numeric
## or logical: returning NA
# ================================
# 6. PCA BEFORE correction (annotated)
# ================================
pca_before <- prcomp(t(CSF_data), scale. = TRUE)
p_pca_before <- fviz_pca_ind(pca_before, geom = "point", habillage = CSF_metadata$Group, addEllipses = TRUE) +
                ggtitle("PCA - BEFORE Blood Correction")

blood_loadings_before <- as.data.frame(pca_before$rotation)
blood_loadings_before$Protein <- rownames(blood_loadings_before)
blood_annot_before <- blood_loadings_before %>% filter(tolower(Protein) %in% blood_markers_mouse)

p_pca_before <- p_pca_before +
  geom_text_repel(data = blood_annot_before, aes(x = PC1, y = PC2, label = Protein),
                  color = "red", size = 10, inherit.aes = FALSE)


p_pca_before
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse

# ================================
# 7. Regress out blood effect
# ================================
corrected_data <- removeBatchEffect(CSF_data, covariates = CSF_metadata$blood_score)

# ================================
# 8. VPA AFTER correction
# ================================
vpa_after <- fitExtractVarPartModel(as.matrix(corrected_data), form, CSF_metadata, BPPARAM = SnowParam(4))
## Dividing work into 100 chunks...
## 
## Total:23 s
head(vpa_after, 5)
##             Group  blood_score Residuals
## Gbp6    0.3023920 0.0213845351 0.6762234
## Nova2   0.1334427 0.0027416719 0.8638157
## Aspg    0.3912072 0.0187368495 0.5900559
## Cul4b   0.5615592 0.0465022580 0.3919386
## Arfgef2 0.1272203 0.0002068904 0.8725728
# Check for NA values
summary(vpa_after)
##      Group         blood_score        Residuals     
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.1370  
##  1st Qu.:0.1360   1st Qu.:0.00074   1st Qu.:0.5304  
##  Median :0.2947   Median :0.00676   Median :0.6932  
##  Mean   :0.2952   Mean   :0.01291   Mean   :0.6919  
##  3rd Qu.:0.4485   3rd Qu.:0.02036   3rd Qu.:0.8619  
##  Max.   :0.8610   Max.   :0.11456   Max.   :1.0000  
##  NA's   :176      NA's   :176       NA's   :176
# Replace NA with 0 (or filter them out)
vpa_after_clean <- vpa_after
vpa_after_clean[is.na(vpa_after_clean)] <- 0

# Re-plot VPA after cleaning
p2 <- plotVarPart(vpa_after_clean) + 
  ggtitle("VPA - AFTER Blood Correction")

p2

# Recalculate blood variance effect
blood_var_after <- mean(vpa_after_clean["blood_score", ], na.rm = TRUE)
## Warning in mean.default(vpa_after_clean["blood_score", ], na.rm = TRUE):
## argument is not numeric or logical: returning NA
variance_reduction <- ((blood_var_before - blood_var_after) / blood_var_before) * 100

cat("Blood effect variance reduced by:", round(variance_reduction, 2), "%\n")
## Blood effect variance reduced by: NA %
# Annotated plot
p2_annotated <- p2 +
  annotate("text",
           x = 2, y = max(vpa_after_clean, na.rm = TRUE) * 0.9,
           label = paste0("Blood effect ↓ ", round(variance_reduction, 1), "%"),
           color = "red", size = 5, fontface = "bold")

p2_annotated

# 9. PCA AFTER correction (annotated)
# ================================
pca_after <- prcomp(t(corrected_data), scale. = TRUE)
p_pca_after <- fviz_pca_ind(pca_after, geom = "point", habillage = CSF_metadata$Group, addEllipses = TRUE) +
               ggtitle("PCA - AFTER Blood Correction")

blood_loadings_after <- as.data.frame(pca_after$rotation)
blood_loadings_after$Protein <- rownames(blood_loadings_after)
blood_annot_after <- blood_loadings_after %>% filter(tolower(Protein) %in% blood_markers_mouse)

p_pca_after <- p_pca_after +
  geom_text_repel(data = blood_annot_after, aes(x = PC1, y = PC2, label = Protein),
                  color = "blue", size = 4, inherit.aes = FALSE)

# Combine PCA plots
p_pca_combined <- p_pca_before | p_pca_after

p_pca_combined
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse

# 10. Blood protein paired boxplot
# ================================
before_blood <- colMeans(CSF_data[blood_markers, ])
after_blood  <- colMeans(corrected_data[blood_markers, ])


wilcox_result <- wilcox.test(before_blood, after_blood, paired = TRUE)
cat("Wilcoxon signed-rank test p-value for blood marker reduction:", wilcox_result$p.value, "\n")
## Wilcoxon signed-rank test p-value for blood marker reduction: 1.907349e-06
blood_df <- data.frame(
  Sample = rep(colnames(CSF_data), 2),
  Level = c(before_blood, after_blood),
  Stage = rep(c("Before", "After"), each = length(before_blood))
)

head(blood_df, 5)
##            Sample    Level  Stage
## 1 CSF_PAP1_SHK.P1 23.09521 Before
## 2 CSF_PAP1_SHK.P2 24.70939 Before
## 3 CSF_PAP1_SHK.P3 25.95392 Before
## 4 CSF_PAP1_SHK.C1 26.75385 Before
## 5 CSF_PAP1_SHK.C2 26.90857 Before
p3 <- ggplot(blood_df, aes(x = Stage, y = Level, group = Sample)) +
  geom_line(alpha = 0.5) +
  geom_point(aes(color = Stage), size = 3) +
  theme_minimal() +
  ggtitle(paste0("Blood Protein Levels (p = ", signif(wilcox_result$p.value, 3), ")")) +
  ylab("Mean Blood Protein Intensity")

p3

# ================================
# 11. Combine all plots into one figure
# ================================
final_plot <- (p1 | p2_annotated) / (p_pca_combined | p3)
final_plot
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse

# ================================
# 12. Save results
# ================================
ggsave("~/Desktop/KV1.3_CSF.PAP1.30july2025/CSF_BloodEffect_VPA_PCA_Annotated.png",
       final_plot, width = 16, height = 12, dpi = 300)
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
write.csv(corrected_data,
          "~/Desktop/KV1.3_CSF.PAP1.30july2025/CSF_PAP1_Proteinid_50percent_IMPUTATION_matrix.blood_corrected.csv")
# 1 Clean up NA values
vpa_before_clean <- vpa_before
vpa_before_clean[is.na(vpa_before_clean)] <- 0

vpa_after_clean <- vpa_after
vpa_after_clean[is.na(vpa_after_clean)] <- 0

# 2 Extract numeric vectors for blood score variance (as column, not row)
blood_var_before_vec <- vpa_before_clean$blood_score
blood_var_after_vec  <- vpa_after_clean$blood_score

# 3️ Calculate mean blood variance reduction
blood_var_before <- mean(blood_var_before_vec, na.rm = TRUE)
blood_var_after  <- mean(blood_var_after_vec, na.rm = TRUE)

variance_reduction <- ((blood_var_before - blood_var_after) / blood_var_before) * 100
cat("Blood effect variance reduced by:", round(variance_reduction, 2), "%\n")
## Blood effect variance reduced by: 98 %
# 4 QC: Top 50 proteins BEFORE correction
top_idx <- order(blood_var_before_vec, decreasing = TRUE)[1:50]

qc_df <- data.frame(
  Protein = rownames(vpa_before_clean)[top_idx],
  BloodVar_Before = blood_var_before_vec[top_idx],
  BloodVar_After  = blood_var_after_vec[top_idx]
)

print(qc_df)
##      Protein BloodVar_Before BloodVar_After
## 1        Plg       0.9667348   4.209198e-03
## 2        Cfb       0.9653926   6.101827e-04
## 3        Fgg       0.9628480   1.238944e-03
## 4      Apoa4       0.9614168   2.971136e-29
## 5        Fgb       0.9605727   4.548948e-03
## 6   Serpinf2       0.9602677   2.495213e-04
## 7         C3       0.9593456   1.140119e-28
## 8         C5       0.9590446   1.522844e-03
## 9        Fga       0.9590346   2.090209e-04
## 10      Ahsg       0.9577168   0.000000e+00
## 11  Arhgap31       0.9574615   4.666708e-04
## 12        F2       0.9571769   1.111867e-03
## 13       Cfi       0.9570279   1.588331e-03
## 14      GCAB       0.9567476   0.000000e+00
## 15     Cela1       0.9540920   1.954916e-04
## 16      C1s1       0.9531240   1.659745e-03
## 17    Ighg2b       0.9515195   8.233682e-29
## 18      Ambp       0.9510674   5.929704e-06
## 19       Vwf       0.9505519   3.972312e-27
## 20     Itih2       0.9482417   3.145124e-05
## 21      Gpx3       0.9470157   4.368753e-03
## 22      Igkc       0.9467733   3.138888e-28
## 23       Hpx       0.9463409   4.429426e-03
## 24     Itih1       0.9458954   1.174404e-06
## 25     Gpld1       0.9452034   8.164253e-05
## 26        Tf       0.9448900   7.581785e-27
## 27 Serpina1b       0.9428779   9.285564e-31
## 28     Klkb1       0.9403044   1.974471e-03
## 29 Serpina10       0.9394209   2.049979e-04
## 30     Hgfac       0.9386660   9.589659e-05
## 31       Gsn       0.9381400   1.090296e-05
## 32      Kng1       0.9367674   4.288541e-03
## 33     Inhca       0.9366817   1.531680e-02
## 34     Postn       0.9356554   2.368516e-03
## 35  Igkv5-48       0.9342478   3.280640e-27
## 36      Mug1       0.9331267   0.000000e+00
## 37      Cpn2       0.9316329   2.612944e-05
## 38   Selenop       0.9284976   3.512434e-03
## 39  Igkv6-17       0.9283687   2.097635e-28
## 40      Spp2       0.9269293   2.763294e-03
## 41      F13b       0.9259803   6.671188e-27
## 42     Masp1       0.9258608   1.250388e-03
## 43      Apoh       0.9258043   2.537687e-02
## 44 Serpina3m       0.9257657   3.439177e-05
## 45     Azgp1       0.9256004   2.891170e-03
## 46    Il1rap       0.9250053   1.725561e-03
## 47       Alb       0.9245062   9.526116e-04
## 48     F13a1       0.9241661   9.736577e-06
## 49       B2m       0.9221890   3.148021e-02
## 50    Igfals       0.9216181   2.360624e-02
# 5 Plot VPA AFTER correction
p2 <- plotVarPart(vpa_after_clean) +
  ggtitle("VPA - AFTER Blood Correction")

p2_annotated <- p2 +
  annotate("text",
           x = 2, y = max(blood_var_after_vec, na.rm = TRUE) * 0.9,
           label = paste0("Blood effect ↓ ", round(variance_reduction, 1), "%"),
           color = "red", size = 5, fontface = "bold")

print(p2_annotated)

library(tidyverse)

# Convert to percentage for plotting
qc_df_long <- qc_df %>%
  pivot_longer(cols = c(BloodVar_Before, BloodVar_After),
               names_to = "Stage", values_to = "Variance") %>%
  group_by(Protein) %>%
  mutate(Variance_Percent = (Variance / sum(Variance, na.rm = TRUE)) * 100) %>%
  ungroup()

# Factor ordering by variance before correction (largest to smallest)
qc_df_long$Protein <- factor(qc_df_long$Protein,
                              levels = qc_df$Protein[order(qc_df$BloodVar_Before, decreasing = TRUE)])

# Plot stacked bar (percentage-based)
ggplot(qc_df_long, aes(x = Protein, y = Variance_Percent, fill = Stage)) +
  geom_bar(stat = "identity", position = "stack") +
  scale_fill_manual(values = c("BloodVar_Before" = "red", "BloodVar_After" = "blue")) +
  labs(
    title = "Top 50 Proteins - Blood Effect Variance (Before vs After Correction)",
    x = "Proteins",
    y = "Variance Contribution (%)",
    fill = "Stage"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 6),
    plot.title = element_text(size = 14, face = "bold")
  )

library(variancePartition)

# Reorder proteins by blood effect before correction
GroupSortOrder <- order(vpa_before_clean[["blood_score"]], decreasing = TRUE)

# Reorder rows for all variance components
for (i in ls(vpa_before_clean)) { 
    vpa_before_clean[[i]] <- vpa_before_clean[[i]][GroupSortOrder]
}
rownames(vpa_before_clean) <- rownames(vpa_before_clean)[GroupSortOrder]

# Plot top 50 proteins based on blood variance (BEFORE correction)
plotPercentBars(vpa_before_clean[1:50, ]) +
    ggtitle("Top 50 Proteins Associated with Blood Effect (Before Correction)")

# After correction
GroupSortOrder <- order(vpa_after[["blood_score"]], decreasing = TRUE)

for (i in ls(vpa_after)) { 
    vpa_after[[i]] <- vpa_after[[i]][GroupSortOrder]
}
rownames(vpa_after) <- rownames(vpa_after)[GroupSortOrder]

plotPercentBars(vpa_after[1:50, ]) +
    ggtitle("Top 50 Proteins Associated with Blood Effect (After Correction)")

library(variancePartition)
library(patchwork)   # for side-by-side plots

# Sort proteins by blood variance BEFORE correction
GroupSortOrder <- order(vpa_before_clean[["blood_score"]], decreasing = TRUE)
for (i in ls(vpa_before_clean)) { 
    vpa_before_clean[[i]] <- vpa_before_clean[[i]][GroupSortOrder]
}
rownames(vpa_before_clean) <- rownames(vpa_before_clean)[GroupSortOrder]

# Plot BEFORE correction
p_before <- plotPercentBars(vpa_before_clean[1:50, ]) +
    ggtitle("Top 50 Proteins (Blood Effect - BEFORE Correction)")

# Sort proteins by blood variance AFTER correction
GroupSortOrder <- order(vpa_after[["blood_score"]], decreasing = TRUE)
for (i in ls(vpa_after)) { 
    vpa_after[[i]] <- vpa_after[[i]][GroupSortOrder]
}
rownames(vpa_after) <- rownames(vpa_after)[GroupSortOrder]

# Plot AFTER correction
p_after <- plotPercentBars(vpa_after[1:50, ]) +
    ggtitle("Top 50 Proteins (Blood Effect - AFTER Correction)")

# Combine the two plots side-by-side
combined_plot <- p_before | p_after
combined_plot

library(ggplot2)
library(patchwork)  # to combine plots

# Perform PCA before correction
pca_before <- prcomp(t(as.matrix(CSF_data)), scale. = TRUE)
pca_before_df <- data.frame(
  PC1 = pca_before$x[, 1],
  PC2 = pca_before$x[, 2],
  Group = CSF_metadata$Group
)

# Plot BEFORE correction
p_pca_before <- ggplot(pca_before_df, aes(x = PC1, y = PC2, color = Group)) +
  geom_point(size = 4) +
  xlab(paste0("PC1 (", round(summary(pca_before)$importance[2, 1] * 100, 1), "% variance)")) +
  ylab(paste0("PC2 (", round(summary(pca_before)$importance[2, 2] * 100, 1), "% variance)")) +
  ggtitle("PCA - BEFORE Blood Correction") +
  theme_bw()

# Perform PCA after correction
pca_after <- prcomp(t(as.matrix(corrected_data)), scale. = TRUE)
pca_after_df <- data.frame(
  PC1 = pca_after$x[, 1],
  PC2 = pca_after$x[, 2],
  Group = CSF_metadata$Group
)

# Plot AFTER correction
p_pca_after <- ggplot(pca_after_df, aes(x = PC1, y = PC2, color = Group)) +
  geom_point(size = 4) +
  xlab(paste0("PC1 (", round(summary(pca_after)$importance[2, 1] * 100, 1), "% variance)")) +
  ylab(paste0("PC2 (", round(summary(pca_after)$importance[2, 2] * 100, 1), "% variance)")) +
  ggtitle("PCA - AFTER Blood Correction") +
  theme_bw()

# Combine the two PCA plots
combined_pca <- p_pca_before | p_pca_after
combined_pca

# ===========================
library(ggplot2)
df <- CSF_metadata
ggplot(df, aes(x = Group, y = blood_score,  fill = Group)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "Blood Score by Group", x = "Group", y = "Blood Score") +
  theme(legend.position = "none")