library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.3
## Warning: package 'tidyr' was built under R version 4.3.2
## Warning: package 'purrr' was built under R version 4.3.3
## Warning: package 'lubridate' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4          ✔ readr     2.1.5     
## ✔ forcats   1.0.0.9000     ✔ stringr   1.5.1     
## ✔ ggplot2   3.5.2          ✔ tibble    3.2.1     
## ✔ lubridate 1.9.4          ✔ tidyr     1.3.1     
## ✔ purrr     1.0.4          
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(writexl)
## Warning: package 'writexl' was built under R version 4.3.3
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 4.3.3
library(gplots)
## Warning: package 'gplots' was built under R version 4.3.3
## 
## Attaching package: 'gplots'
## 
## The following object is masked from 'package:stats':
## 
##     lowess
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.3.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(variancePartition)
## Loading required package: limma
## Loading required package: BiocParallel
## Warning: package 'BiocParallel' was built under R version 4.3.1
## 
## Attaching package: 'variancePartition'
## 
## The following object is masked from 'package:limma':
## 
##     topTable
library(limma)
library(BiocParallel)
library(ComplexHeatmap)
## Warning: package 'ComplexHeatmap' was built under R version 4.3.1
## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.18.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
## 
## If you use it in published research, please cite either one:
## - Gu, Z. Complex Heatmap Visualization. iMeta 2022.
## - Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##     genomic data. Bioinformatics 2016.
## 
## 
## The new InteractiveComplexHeatmap package can directly export static 
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
library(org.Hs.eg.db)
## Loading required package: AnnotationDbi
## Warning: package 'AnnotationDbi' was built under R version 4.3.2
## Loading required package: stats4
## Loading required package: BiocGenerics
## Warning: package 'BiocGenerics' was built under R version 4.3.1
## 
## Attaching package: 'BiocGenerics'
## 
## The following object is masked from 'package:limma':
## 
##     plotMA
## 
## The following object is masked from 'package:pROC':
## 
##     var
## 
## The following object is masked from 'package:randomForest':
## 
##     combine
## 
## The following objects are masked from 'package:lubridate':
## 
##     intersect, setdiff, union
## 
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## 
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## 
## Loading required package: Biobase
## Warning: package 'Biobase' was built under R version 4.3.1
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Loading required package: IRanges
## Warning: package 'IRanges' was built under R version 4.3.1
## Loading required package: S4Vectors
## Warning: package 'S4Vectors' was built under R version 4.3.2
## 
## Attaching package: 'S4Vectors'
## 
## The following object is masked from 'package:gplots':
## 
##     space
## 
## The following objects are masked from 'package:lubridate':
## 
##     second, second<-
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## 
## The following object is masked from 'package:tidyr':
## 
##     expand
## 
## The following object is masked from 'package:utils':
## 
##     findMatches
## 
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## 
## 
## Attaching package: 'IRanges'
## 
## The following object is masked from 'package:lubridate':
## 
##     %within%
## 
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## 
## The following object is masked from 'package:purrr':
## 
##     reduce
## 
## 
## Attaching package: 'AnnotationDbi'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
library(clusterProfiler)
## Warning: package 'clusterProfiler' was built under R version 4.3.1
## 
## Registered S3 methods overwritten by 'treeio':
##   method              from    
##   MRCA.phylo          tidytree
##   MRCA.treedata       tidytree
##   Nnode.treedata      tidytree
##   Ntip.treedata       tidytree
##   ancestor.phylo      tidytree
##   ancestor.treedata   tidytree
##   child.phylo         tidytree
##   child.treedata      tidytree
##   full_join.phylo     tidytree
##   full_join.treedata  tidytree
##   groupClade.phylo    tidytree
##   groupClade.treedata tidytree
##   groupOTU.phylo      tidytree
##   groupOTU.treedata   tidytree
##   inner_join.phylo    tidytree
##   inner_join.treedata tidytree
##   is.rooted.treedata  tidytree
##   nodeid.phylo        tidytree
##   nodeid.treedata     tidytree
##   nodelab.phylo       tidytree
##   nodelab.treedata    tidytree
##   offspring.phylo     tidytree
##   offspring.treedata  tidytree
##   parent.phylo        tidytree
##   parent.treedata     tidytree
##   root.treedata       tidytree
##   rootnode.phylo      tidytree
##   sibling.phylo       tidytree
## clusterProfiler v4.8.3  For help: https://yulab-smu.top/biomedical-knowledge-mining-book/
## 
## If you use clusterProfiler in published research, please cite:
## T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan, X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal enrichment tool for interpreting omics data. The Innovation. 2021, 2(3):100141
## 
## Attaching package: 'clusterProfiler'
## 
## The following object is masked from 'package:AnnotationDbi':
## 
##     select
## 
## The following object is masked from 'package:IRanges':
## 
##     slice
## 
## The following object is masked from 'package:S4Vectors':
## 
##     rename
## 
## The following object is masked from 'package:purrr':
## 
##     simplify
## 
## The following object is masked from 'package:stats':
## 
##     filter
library(pheatmap)
## 
## Attaching package: 'pheatmap'
## 
## The following object is masked from 'package:ComplexHeatmap':
## 
##     pheatmap
library(EnhancedVolcano)
library(tidyverse)
library(dplyr)
library(clusterProfiler)
library(org.Hs.eg.db)
library(dplyr)
library(preprocessCore)
library(M3C)

#import csf TMT data 
all.csf <- readRDS("~/Desktop/Resilience.Study_Talk/CSF.Pro.resilience/all.csf.rds")
all_CSF_metadata_used <- read.csv("~/Desktop/Resilience.Study_Talk/CSF.Pro.resilience/all.CSF.metadata.used.csv", row.names = 1)
metadata <- all_CSF_metadata_used
row.names(metadata) <- colnames(all.csf)
head(metadata , 5)
##                     samples    Condition
## CSF.FAD.1.8_1 CSF.FAD.1.8_1 FAD.1.8month
## CSF.FAD.1.8_2 CSF.FAD.1.8_2 FAD.1.8month
## CSF.FAD.1.8_3 CSF.FAD.1.8_3 FAD.1.8month
## CSF.FAD.1.8_4 CSF.FAD.1.8_4 FAD.1.8month
## CSF.FAD.1.8_5 CSF.FAD.1.8_5 FAD.1.8month
table(metadata$Condition)
## 
##  FAD.1.8month FAD.10.2month FAD.14.4month  FAD.3.1month    FAD.6month 
##             8             8             6             9             9 
##   wt.1.8month  wt.10.2month  wt.14.4month   wt.3.1month     wt.6month 
##             9             9            10             8            10
#import Brain metadata

Brain.5xFAD.Metadata <- read.csv('/Users/usri/Desktop/Ali.Brain.VPA_04.2025/Brain.5xFAD.Metadata.csv', row.names = 1)
dim(Brain.5xFAD.Metadata)
## [1] 86  8
head(Brain.5xFAD.Metadata)
##            Treatment MSbatch.channel           RNAIDs UniqueID Age Sex Genotype
## FAD.1.8M.1  FAD.1.8M        b02.130C p20189_s081_1828     1828 1.8   M      FAD
## FAD.1.8M.2  FAD.1.8M        b03.127N p20189_s080_1827     1827 1.8   M      FAD
## FAD.1.8M.3  FAD.1.8M        b03.129C p20189_s083_1823     1823 1.8   F      FAD
## FAD.1.8M.4  FAD.1.8M        b04.129C p20189_s079_1825     1825 1.8   M      FAD
## FAD.1.8M.5  FAD.1.8M        b04.131C p20189_s084_1824     1824 1.8   F      FAD
## FAD.1.8M.6  FAD.1.8M        b05.128C p20189_s082_1830     1830 1.8   M      FAD
##            Batch
## FAD.1.8M.1     2
## FAD.1.8M.2     3
## FAD.1.8M.3     3
## FAD.1.8M.4     4
## FAD.1.8M.5     4
## FAD.1.8M.6     5
##
all_CSF_metadata_used$Age <- Brain.5xFAD.Metadata$Age
all_CSF_metadata_used$Genotype <- Brain.5xFAD.Metadata$Genotype
all_CSF_metadata_used$Treatment <- Brain.5xFAD.Metadata$Treatment
all_CSF_metadata_used$UniqueID <- Brain.5xFAD.Metadata$UniqueID
all_CSF_metadata_used$Batch <- Brain.5xFAD.Metadata$Batch
all_CSF_metadata_used$MSbatch.channel <- Brain.5xFAD.Metadata$MSbatch.channel
all_CSF_metadata_used$RNAIDs <- Brain.5xFAD.Metadata$RNAIDs
all_CSF_metadata_used$Sex <- Brain.5xFAD.Metadata$Sex
saveRDS(all_CSF_metadata_used, file = 'all_CSF_metadata_used.rds')
write.csv(all_CSF_metadata_used, file = 'all_CSF_metadata_used.csv')
write.csv(Brain.5xFAD.Metadata, file = 'Brain.5xFAD.Metadata.csv')
cleanDat.original <- all.csf
numericMeta <- all_CSF_metadata_used  #This is the traits dataframe 

regvars.vp<-data.frame(numericMeta)
regvars.vp$Sex<-factor(regvars.vp$Sex)
regvars.vp$Genotype<-factor(regvars.vp$Genotype)
regvars.vp$Batch <- factor(regvars.vp$Batch)
form <- ~Age+(1|Batch)+(1|Sex)+(1|Genotype)
library(variancePartition)
regvars.vp.final <- regvars.vp
cleanDat.noNA.unreg <-impute::impute.knn(as.matrix(cleanDat.original))$data     #This cleanDat.original is the dataframe contained in the input file that is undergoing clean-up
## Cluster size 3721 broken into 2907 814 
## Cluster size 2907 broken into 972 1935 
## Done cluster 972 
## Cluster size 1935 broken into 487 1448 
## Done cluster 487 
## Done cluster 1448 
## Done cluster 1935 
## Done cluster 2907 
## Done cluster 814
dim(cleanDat.noNA.unreg )
## [1] 3721   86
saveRDS(cleanDat.noNA.unreg, file = 'cleanDat.noNA.unreg.CSF.unreg.rds')

varPart1 <- fitExtractVarPartModel(cleanDat.noNA.unreg, form, regvars.vp)
## Dividing work into 100 chunks...
## 
## Total:70 s
vp1 <- sortCols(varPart1,FUN=max,last= c("Batch","Age","Sex","Genotype","Residuals"))
saveRDS(vp1, file = 'vp2.BEFORE.Regression.FINAL.rds')
pdf(file="upasna_mouse_VariancePartition_Unregressed_with_imputation.FINAL.pdf")
    par(mfrow=c(1,1))

# Before regression
#   cleanDat.noNA.unreg<-na.omit(cleanDat.original)  #.unreg)
    plotVarPart( vp1, main="WITHOUT regression" )

    DxSortOrder<-order(vp1[["Batch"]],decreasing=TRUE)
    #rownames(cleanDat.noNA.unreg)[DxSortOrder][1:50]
    for (i in ls(vp1)) { vp1[[i]]<-vp1[[i]][DxSortOrder]; }
    rownames(vp1)<-rownames(vp1)[DxSortOrder]

    plotPercentBars( vp1[1:50,]) + ggtitle( "Top Batch-covariate proteins, w/o Regression" )

    SexSortOrder<-order(vp1[["Sex"]],decreasing=TRUE)
    #rownames(cleanDat.noNA.unreg)[SexSortOrder][1:50]
    for (i in ls(vp1)) { vp1[[i]]<-vp1[[i]][SexSortOrder]; }
    rownames(vp1)<-rownames(vp1)[SexSortOrder]

    plotPercentBars( vp1[1:50,]) + ggtitle( "Top Sex-covariate proteins, w/o Regression" )

    AgeSortOrder<-order(vp1[["Age"]],decreasing=TRUE)
    #rownames(cleanDat.noNA.unreg)[AgeSortOrder][1:50]
    for (i in ls(vp1)) { vp1[[i]]<-vp1[[i]][AgeSortOrder]; }
    rownames(vp1)<-rownames(vp1)[AgeSortOrder]

    plotPercentBars( vp1[1:50,]) + ggtitle( "Top Age-covariate proteins, w/o Regression" )

    CogSortOrder<-order(vp1[["Genotype"]],decreasing=TRUE)
    #rownames(cleanDat.noNA.unreg)[CogSortOrder][1:50]
    for (i in ls(vp1)) { vp1[[i]]<-vp1[[i]][CogSortOrder]; }
    rownames(vp1)<-rownames(vp1)[CogSortOrder]

    plotPercentBars( vp1[1:50,]) + ggtitle( "Top Genotype-covariate proteins, w/o Regression" )
plotPercentBars( vp1[1:50,]) + ggtitle( "Top Genotype-covariate proteins, w/o Regression" )

plotPercentBars( vp1[1:50,]) + ggtitle( "Top Age-covariate proteins, w/o Regression" )

plotPercentBars( vp1[1:50,]) + ggtitle( "Top Sex-covariate proteins, w/o Regression" )

plotPercentBars( vp1[1:50,]) + ggtitle( "Top Batch-covariate proteins, w/o Regression" )

cleanDat <- cleanDat.noNA.unreg
Exp.data.noNA.12samples <- cleanDat.noNA.unreg
cleanDat.noNA <-impute::impute.knn(as.matrix(cleanDat))$data            #This cleanDat is the variable containing the data that has been regressed and corrected for age, sex and batch effects, used to verify that regression is working without error
## Cluster size 3721 broken into 2926 795 
## Cluster size 2926 broken into 2138 788 
## Cluster size 2138 broken into 1268 870 
## Done cluster 1268 
## Done cluster 870 
## Done cluster 2138 
## Done cluster 788 
## Done cluster 2926 
## Done cluster 795
varPart2 <- fitExtractVarPartModel(cleanDat.noNA, form, regvars.vp)
## Dividing work into 100 chunks...
## 
## Total:69 s
saveRDS(varPart2, file = 'varPart2.rds')
vp2 <- sortCols(varPart2,FUN=max,last= c("Batch","Age","Sex","Genotype","Residuals"))
saveRDS(vp2, file = 'vp2.After.Regression.FINAL.rds')

pdf(file="Upasna_mouse_VariancePartition-regressed_all_with imputation.FINAL.pdf")
par(mfrow=c(1,1))

# Post-correction (or regression)
#   cleanDat.noNA<-na.omit(cleanDat)
plotVarPart( vp2, main="Effects Stripped by Bootstrap Regression, Age, Sex and Batch" )

DxSortOrder<-order(vp2[["Batch"]],decreasing=TRUE)
#rownames(cleanDat.noNA)[DxSortOrder][1:50]
for (i in ls(vp2)) { vp2[[i]]<-vp2[[i]][DxSortOrder]; }
rownames(vp2)<-rownames(vp2)[DxSortOrder]

plotPercentBars( vp2[1:50,]) + ggtitle( "Top Batch-covariate proteins, Post-Regression, Age, Sex and Batch" )

SexSortOrder<-order(vp2[["Sex"]],decreasing=TRUE)
#rownames(cleanDat.noNA)[SexSortOrder][1:50]
for (i in ls(vp2)) { vp2[[i]]<-vp2[[i]][SexSortOrder]; }
rownames(vp2)<-rownames(vp2)[SexSortOrder]

plotPercentBars( vp2[1:50,]) + ggtitle( "Top Sex-covariate proteins, Post-Regression, Age, Sex and Batch" )

AgeSortOrder<-order(vp2[["Age"]],decreasing=TRUE)
#rownames(cleanDat.noNA)[AgeSortOrder][1:50]
for (i in ls(vp2)) { vp2[[i]]<-vp2[[i]][AgeSortOrder]; }
rownames(vp2)<-rownames(vp2)[AgeSortOrder]

plotPercentBars( vp2[1:50,]) + ggtitle( "Top Age-covariate proteins, Post-Regression, Age, Sex and Batch" )

CogSortOrder<-order(vp2[["Genotype"]],decreasing=TRUE)
#rownames(cleanDat.noNA)[CogSortOrder][1:50]
for (i in ls(vp2)) { vp2[[i]]<-vp2[[i]][CogSortOrder]; }
rownames(vp2)<-rownames(vp2)[CogSortOrder]

plotPercentBars( vp2[1:50,]) + ggtitle( "Top Genotype-covariate proteins, Post-Regression, Age, Sex and Batch" )

dev.off()
## quartz_off_screen 
##                 2
plotPercentBars( vp2[1:50,]) + ggtitle( "Top Batch-covariate proteins, Post-Regression, Age, Sex and Batch" )

plotPercentBars( vp2[1:50,]) + ggtitle( "Top Sex-covariate proteins, Post-Regression, Age, Sex and Batch" )

plotPercentBars( vp2[1:50,]) + ggtitle( "Top Age-covariate proteins, Post-Regression, Age, Sex and Batch" )

plotPercentBars( vp2[1:50,]) + ggtitle( "Top Genotype-covariate proteins, Post-Regression, Age, Sex and Batch" )

library(ggplot2)

# Save PDF
pdf("VPA_Before_Adjustment_Top50_PerCovariate.pdf", width=12, height=10)
par(mfrow=c(1,1))

# Batch
BatchOrder <- order(vp1[["Batch"]], decreasing=TRUE)
BatchTop50 <- rownames(vp1)[BatchOrder][1:50]
saveRDS(vp1[BatchTop50, ], "VP1_Top50_Batch.rds")
write.csv(vp1[BatchTop50, ], "VP1_Top50_Batch.csv")
plotPercentBars(vp1[BatchTop50, ]) + ggtitle("Top 50 Batch-affected Proteins (Before Adjustment)")

# Age
AgeOrder <- order(vp1[["Age"]], decreasing=TRUE)
AgeTop50 <- rownames(vp1)[AgeOrder][1:50]
saveRDS(vp1[AgeTop50, ], "VP1_Top50_Age.rds")
write.csv(vp1[AgeTop50, ], "VP1_Top50_Age.csv")
plotPercentBars(vp1[AgeTop50, ]) + ggtitle("Top 50 Age-affected Proteins (Before Adjustment)")

# Sex
SexOrder <- order(vp1[["Sex"]], decreasing=TRUE)
SexTop50 <- rownames(vp1)[SexOrder][1:50]
saveRDS(vp1[SexTop50, ], "VP1_Top50_Sex.rds")
write.csv(vp1[SexTop50, ], "VP1_Top50_Sex.csv")
plotPercentBars(vp1[SexTop50, ]) + ggtitle("Top 50 Sex-affected Proteins (Before Adjustment)")

# Genotype
GenotypeOrder <- order(vp1[["Genotype"]], decreasing=TRUE)
GenotypeTop50 <- rownames(vp1)[GenotypeOrder][1:50]
saveRDS(vp1[GenotypeTop50, ], "VP1_Top50_Genotype.rds")
write.csv(vp1[GenotypeTop50, ], "VP1_Top50_Genotype.csv")
plotPercentBars(vp1[GenotypeTop50, ]) + ggtitle("Top 50 Genotype-affected Proteins (Before Adjustment)")

dev.off()
## quartz_off_screen 
##                 2
color_list <- c('skyblue', 'red', 'yellow', 'green', 'purple', 'orange', 'pink', 'brown', 'grey', 'blue')
# Perform PCA with the specified color vector and other parameters
pca(cleanDat.noNA,
    colvec = color_list,
    labels = regvars.vp.final$Batch,
    controlscale = TRUE,
    scale = 3,
    printres = TRUE,
    printwidth = 25,
    legendtitle = "Batch")
## ***PCA wrapper function***
## running...
## printing PCA to current directory...
## done.

# Perform PCA with the specified color vector and other parameters
pca(cleanDat.noNA,
    colvec = color_list,
    labels = regvars.vp.final$Treatment,
    controlscale = TRUE,
    scale = 3,
    printres = TRUE,
    printwidth = 25,
    legendtitle = "Treatment")
## ***PCA wrapper function***
## running...
## printing PCA to current directory...
## done.

### Setup
regvars.vp <- data.frame(regvars.vp.final)
regvars.vp$Sex <- factor(regvars.vp$Sex)
regvars.vp$Genotype <- factor(regvars.vp$Genotype)
regvars.vp$Batch <- factor(regvars.vp$Batch)
regvars.vp$Age <- factor(regvars.vp$Age)
saveRDS(regvars.vp, file = 'regvars.vp.rds')
# MODEL FORMULA: All random effects
form <- ~(1|Age) + (1|Batch) + (1|Sex) + (1|Genotype)

info <- regvars.vp.final

# Calculate variance for each gene
gene_variances <- apply(Exp.data.noNA.12samples, 1, var)

# Filter genes with non-zero variance
non_zero_variance_genes <- gene_variances > 0
Exp.data.filtered <- Exp.data.noNA.12samples[non_zero_variance_genes, ]

# VPA
varPart <- fitExtractVarPartModel(Exp.data.filtered, form, regvars.vp)
## Dividing work into 100 chunks...
## 
## Total:79 s
# Sort
vp <- sortCols(varPart)

# Bar plot
plotPercentBars(vp[1:50, ])

# Violin plot
plotVarPart(vp) + ggtitle('Variation Partition Analysis')

saveRDS(vp, file = 'vp.afterbatch.correction.rds')


library(dplyr)
library(tidyr)
library(ggplot2)
library(variancePartition)
#Step 1: Prepare the variance partition dataframe
# Assume your variance partition result is in `vp`
vp_df <- as.data.frame(vp) 
vp_df$Gene <- rownames(vp_df)  # Save gene names into a column

# Reshape to long format (needed for ggplot)
vp_long <- pivot_longer(vp_df, cols = -Gene, names_to = "Variable", values_to = "Variance")
head(vp_long, 10)
## # A tibble: 10 × 3
##    Gene  Variable  Variance
##    <chr> <chr>        <dbl>
##  1 Pzp   Batch     0       
##  2 Pzp   Age       0       
##  3 Pzp   Sex       0       
##  4 Pzp   Genotype  0       
##  5 Pzp   Residuals 1       
##  6 Alb   Batch     0.000354
##  7 Alb   Age       0.0140  
##  8 Alb   Sex       0.00776 
##  9 Alb   Genotype  0       
## 10 Alb   Residuals 0.978
#Step 2: Find the top 10 proteins per variable
# For each variable, find top 10 genes
top10_labels <- vp_long %>%
  group_by(Variable) %>%
  top_n(10, Variance) %>%
  ungroup()
top10_labels
## # A tibble: 338 × 3
##    Gene      Variable  Variance
##    <chr>     <chr>        <dbl>
##  1 Pzp       Residuals        1
##  2 Apob      Residuals        1
##  3 Serpina1b Residuals        1
##  4 Cp.1      Residuals        1
##  5 Hbbt1     Residuals        1
##  6 Hbbt1.1   Residuals        1
##  7 Hbbt1.2   Residuals        1
##  8 Cfh       Residuals        1
##  9 Hbbt1.3   Residuals        1
## 10 Hbbt1.4   Residuals        1
## # ℹ 328 more rows
#Step 3: Plot the violin plot and add labels
# Basic violin plot
p <- ggplot(vp_long, aes(x = Variable, y = Variance)) +
  geom_violin(aes(fill = Variable), trim = FALSE, color = "black", alpha = 0.5) +
  geom_boxplot(width = 0.1, fill = "white", outlier.size = 0.5) +
  geom_point(data = top10_labels, aes(x = Variable, y = Variance), color = "red", size = 1.5) +
  geom_text(data = top10_labels, aes(x = Variable, y = Variance, label = Gene),
            hjust = -0.1, size = 2, angle = 45, color = "black") +
  theme_bw() +
  ggtitle("5xFAD CSF: VPA with Top 10 Proteins Labeled") +
  ylab("Variance explained (%)") +
  xlab("") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

p

library(dplyr)
library(tidyr)
library(ggplot2)

# 1. Prepare data
vp_df <- as.data.frame(vp)
vp_df$Gene <- rownames(vp_df)

# 2. Reshape to long format
vp_long <- pivot_longer(vp_df, cols = -Gene, names_to = "Variable", values_to = "Variance")

# 3. Find top 10 genes per variable
top10 <- vp_long %>%
  group_by(Variable) %>%
  slice_max(order_by = Variance, n = 15) %>%
  ungroup()
top10
## # A tibble: 358 × 3
##    Gene      Variable Variance
##    <chr>     <chr>       <dbl>
##  1 Hapln2    Age         0.601
##  2 Ctsd      Age         0.381
##  3 Fah       Age         0.358
##  4 Ifit1     Age         0.343
##  5 Sprr1a    Age         0.337
##  6 Mobp      Age         0.318
##  7 Ppp1r14a  Age         0.304
##  8 Serpinb6d Age         0.302
##  9 Col1a1    Age         0.298
## 10 Acan      Age         0.278
## # ℹ 348 more rows
#Plot for Age:
top10_age <- top10 %>% filter(Variable == "Age")

ggplot(top10_age, aes(x = Variable, y = Variance)) +
  geom_violin(fill = "red", trim = FALSE, color = "black", alpha = 0.6) +
  geom_point(color = "black", size = 2) +
  geom_text(aes(label = Gene), vjust = -0.5, size = 3, angle = 45, check_overlap = TRUE) +
  theme_bw() +
  ggtitle("Top 10 Proteins - Age") +
  ylab("Variance explained (%)") +
  xlab("") +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank())

##BY Genotype

#Plot for Genotype:
top10_genotype <- top10 %>% filter(Variable == "Genotype")

ggplot(top10_genotype, aes(x = Variable, y = Variance)) +
  geom_violin(fill = "green", trim = FALSE, color = "black", alpha = 0.6) +
  geom_point(color = "black", size = 2) +
  geom_text(aes(label = Gene), vjust = -0.5, size = 3, angle = 45, check_overlap = TRUE) +
  theme_bw() +
  ggtitle("Top 10 Proteins - Genotype") +
  ylab("Variance explained (%)") +
  xlab("") +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank())

####Plot for Sex:
top10_sex <- top10 %>% filter(Variable == "Sex")

ggplot(top10_sex, aes(x = Variable, y = Variance)) +
  geom_violin(fill = "orange", trim = FALSE, color = "black", alpha = 0.6) +
  geom_point(color = "black", size = 2) +
  geom_text(aes(label = Gene), vjust = -0.5, size = 3, angle = 45, check_overlap = TRUE) +
  theme_bw() +
  ggtitle("Top 10 Proteins - Sex") +
  ylab("Variance explained (%)") +
  xlab("") +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank())

##Plot for Batch:
top10_batch <- top10 %>% filter(Variable == "Batch")

ggplot(top10_batch, aes(x = Variable, y = Variance)) +
  geom_violin(fill = "brown", trim = FALSE, color = "black", alpha = 0.6) +
  geom_point(color = "black", size = 2) +
  geom_text(aes(label = Gene), vjust = -0.5, size = 3, angle = 45, check_overlap = TRUE) +
  theme_bw() +
  ggtitle("Top 10 Proteins - Batch") +
  ylab("Variance explained (%)") +
  xlab("") +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank())