library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.3
## Warning: package 'tidyr' was built under R version 4.3.2
## Warning: package 'purrr' was built under R version 4.3.3
## Warning: package 'lubridate' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0.9000 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(writexl)
## Warning: package 'writexl' was built under R version 4.3.3
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 4.3.3
library(gplots)
## Warning: package 'gplots' was built under R version 4.3.3
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.3.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(variancePartition)
## Loading required package: limma
## Loading required package: BiocParallel
## Warning: package 'BiocParallel' was built under R version 4.3.1
##
## Attaching package: 'variancePartition'
##
## The following object is masked from 'package:limma':
##
## topTable
library(limma)
library(BiocParallel)
library(ComplexHeatmap)
## Warning: package 'ComplexHeatmap' was built under R version 4.3.1
## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.18.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
##
## If you use it in published research, please cite either one:
## - Gu, Z. Complex Heatmap Visualization. iMeta 2022.
## - Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional
## genomic data. Bioinformatics 2016.
##
##
## The new InteractiveComplexHeatmap package can directly export static
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
library(org.Hs.eg.db)
## Loading required package: AnnotationDbi
## Warning: package 'AnnotationDbi' was built under R version 4.3.2
## Loading required package: stats4
## Loading required package: BiocGenerics
## Warning: package 'BiocGenerics' was built under R version 4.3.1
##
## Attaching package: 'BiocGenerics'
##
## The following object is masked from 'package:limma':
##
## plotMA
##
## The following object is masked from 'package:pROC':
##
## var
##
## The following object is masked from 'package:randomForest':
##
## combine
##
## The following objects are masked from 'package:lubridate':
##
## intersect, setdiff, union
##
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
##
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
##
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
##
## Loading required package: Biobase
## Warning: package 'Biobase' was built under R version 4.3.1
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Loading required package: IRanges
## Warning: package 'IRanges' was built under R version 4.3.1
## Loading required package: S4Vectors
## Warning: package 'S4Vectors' was built under R version 4.3.2
##
## Attaching package: 'S4Vectors'
##
## The following object is masked from 'package:gplots':
##
## space
##
## The following objects are masked from 'package:lubridate':
##
## second, second<-
##
## The following objects are masked from 'package:dplyr':
##
## first, rename
##
## The following object is masked from 'package:tidyr':
##
## expand
##
## The following object is masked from 'package:utils':
##
## findMatches
##
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
##
##
## Attaching package: 'IRanges'
##
## The following object is masked from 'package:lubridate':
##
## %within%
##
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
##
## The following object is masked from 'package:purrr':
##
## reduce
##
##
## Attaching package: 'AnnotationDbi'
##
## The following object is masked from 'package:dplyr':
##
## select
library(clusterProfiler)
## Warning: package 'clusterProfiler' was built under R version 4.3.1
##
## Registered S3 methods overwritten by 'treeio':
## method from
## MRCA.phylo tidytree
## MRCA.treedata tidytree
## Nnode.treedata tidytree
## Ntip.treedata tidytree
## ancestor.phylo tidytree
## ancestor.treedata tidytree
## child.phylo tidytree
## child.treedata tidytree
## full_join.phylo tidytree
## full_join.treedata tidytree
## groupClade.phylo tidytree
## groupClade.treedata tidytree
## groupOTU.phylo tidytree
## groupOTU.treedata tidytree
## inner_join.phylo tidytree
## inner_join.treedata tidytree
## is.rooted.treedata tidytree
## nodeid.phylo tidytree
## nodeid.treedata tidytree
## nodelab.phylo tidytree
## nodelab.treedata tidytree
## offspring.phylo tidytree
## offspring.treedata tidytree
## parent.phylo tidytree
## parent.treedata tidytree
## root.treedata tidytree
## rootnode.phylo tidytree
## sibling.phylo tidytree
## clusterProfiler v4.8.3 For help: https://yulab-smu.top/biomedical-knowledge-mining-book/
##
## If you use clusterProfiler in published research, please cite:
## T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan, X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal enrichment tool for interpreting omics data. The Innovation. 2021, 2(3):100141
##
## Attaching package: 'clusterProfiler'
##
## The following object is masked from 'package:AnnotationDbi':
##
## select
##
## The following object is masked from 'package:IRanges':
##
## slice
##
## The following object is masked from 'package:S4Vectors':
##
## rename
##
## The following object is masked from 'package:purrr':
##
## simplify
##
## The following object is masked from 'package:stats':
##
## filter
library(pheatmap)
##
## Attaching package: 'pheatmap'
##
## The following object is masked from 'package:ComplexHeatmap':
##
## pheatmap
library(EnhancedVolcano)
library(tidyverse)
library(dplyr)
library(clusterProfiler)
library(org.Hs.eg.db)
library(dplyr)
library(preprocessCore)
library(M3C)
#import csf TMT data
all.csf <- readRDS("~/Desktop/Resilience.Study_Talk/CSF.Pro.resilience/all.csf.rds")
all_CSF_metadata_used <- read.csv("~/Desktop/Resilience.Study_Talk/CSF.Pro.resilience/all.CSF.metadata.used.csv", row.names = 1)
metadata <- all_CSF_metadata_used
row.names(metadata) <- colnames(all.csf)
head(metadata , 5)
## samples Condition
## CSF.FAD.1.8_1 CSF.FAD.1.8_1 FAD.1.8month
## CSF.FAD.1.8_2 CSF.FAD.1.8_2 FAD.1.8month
## CSF.FAD.1.8_3 CSF.FAD.1.8_3 FAD.1.8month
## CSF.FAD.1.8_4 CSF.FAD.1.8_4 FAD.1.8month
## CSF.FAD.1.8_5 CSF.FAD.1.8_5 FAD.1.8month
table(metadata$Condition)
##
## FAD.1.8month FAD.10.2month FAD.14.4month FAD.3.1month FAD.6month
## 8 8 6 9 9
## wt.1.8month wt.10.2month wt.14.4month wt.3.1month wt.6month
## 9 9 10 8 10
#import Brain metadata
Brain.5xFAD.Metadata <- read.csv('/Users/usri/Desktop/Ali.Brain.VPA_04.2025/Brain.5xFAD.Metadata.csv', row.names = 1)
dim(Brain.5xFAD.Metadata)
## [1] 86 8
head(Brain.5xFAD.Metadata)
## Treatment MSbatch.channel RNAIDs UniqueID Age Sex Genotype
## FAD.1.8M.1 FAD.1.8M b02.130C p20189_s081_1828 1828 1.8 M FAD
## FAD.1.8M.2 FAD.1.8M b03.127N p20189_s080_1827 1827 1.8 M FAD
## FAD.1.8M.3 FAD.1.8M b03.129C p20189_s083_1823 1823 1.8 F FAD
## FAD.1.8M.4 FAD.1.8M b04.129C p20189_s079_1825 1825 1.8 M FAD
## FAD.1.8M.5 FAD.1.8M b04.131C p20189_s084_1824 1824 1.8 F FAD
## FAD.1.8M.6 FAD.1.8M b05.128C p20189_s082_1830 1830 1.8 M FAD
## Batch
## FAD.1.8M.1 2
## FAD.1.8M.2 3
## FAD.1.8M.3 3
## FAD.1.8M.4 4
## FAD.1.8M.5 4
## FAD.1.8M.6 5
##
all_CSF_metadata_used$Age <- Brain.5xFAD.Metadata$Age
all_CSF_metadata_used$Genotype <- Brain.5xFAD.Metadata$Genotype
all_CSF_metadata_used$Treatment <- Brain.5xFAD.Metadata$Treatment
all_CSF_metadata_used$UniqueID <- Brain.5xFAD.Metadata$UniqueID
all_CSF_metadata_used$Batch <- Brain.5xFAD.Metadata$Batch
all_CSF_metadata_used$MSbatch.channel <- Brain.5xFAD.Metadata$MSbatch.channel
all_CSF_metadata_used$RNAIDs <- Brain.5xFAD.Metadata$RNAIDs
all_CSF_metadata_used$Sex <- Brain.5xFAD.Metadata$Sex
saveRDS(all_CSF_metadata_used, file = 'all_CSF_metadata_used.rds')
write.csv(all_CSF_metadata_used, file = 'all_CSF_metadata_used.csv')
write.csv(Brain.5xFAD.Metadata, file = 'Brain.5xFAD.Metadata.csv')
cleanDat.original <- all.csf
numericMeta <- all_CSF_metadata_used #This is the traits dataframe
regvars.vp<-data.frame(numericMeta)
regvars.vp$Sex<-factor(regvars.vp$Sex)
regvars.vp$Genotype<-factor(regvars.vp$Genotype)
regvars.vp$Batch <- factor(regvars.vp$Batch)
form <- ~Age+(1|Batch)+(1|Sex)+(1|Genotype)
library(variancePartition)
regvars.vp.final <- regvars.vp
cleanDat.noNA.unreg <-impute::impute.knn(as.matrix(cleanDat.original))$data #This cleanDat.original is the dataframe contained in the input file that is undergoing clean-up
## Cluster size 3721 broken into 2907 814
## Cluster size 2907 broken into 972 1935
## Done cluster 972
## Cluster size 1935 broken into 487 1448
## Done cluster 487
## Done cluster 1448
## Done cluster 1935
## Done cluster 2907
## Done cluster 814
dim(cleanDat.noNA.unreg )
## [1] 3721 86
saveRDS(cleanDat.noNA.unreg, file = 'cleanDat.noNA.unreg.CSF.unreg.rds')
varPart1 <- fitExtractVarPartModel(cleanDat.noNA.unreg, form, regvars.vp)
## Dividing work into 100 chunks...
##
## Total:70 s
vp1 <- sortCols(varPart1,FUN=max,last= c("Batch","Age","Sex","Genotype","Residuals"))
saveRDS(vp1, file = 'vp2.BEFORE.Regression.FINAL.rds')
pdf(file="upasna_mouse_VariancePartition_Unregressed_with_imputation.FINAL.pdf")
par(mfrow=c(1,1))
# Before regression
# cleanDat.noNA.unreg<-na.omit(cleanDat.original) #.unreg)
plotVarPart( vp1, main="WITHOUT regression" )
DxSortOrder<-order(vp1[["Batch"]],decreasing=TRUE)
#rownames(cleanDat.noNA.unreg)[DxSortOrder][1:50]
for (i in ls(vp1)) { vp1[[i]]<-vp1[[i]][DxSortOrder]; }
rownames(vp1)<-rownames(vp1)[DxSortOrder]
plotPercentBars( vp1[1:50,]) + ggtitle( "Top Batch-covariate proteins, w/o Regression" )
SexSortOrder<-order(vp1[["Sex"]],decreasing=TRUE)
#rownames(cleanDat.noNA.unreg)[SexSortOrder][1:50]
for (i in ls(vp1)) { vp1[[i]]<-vp1[[i]][SexSortOrder]; }
rownames(vp1)<-rownames(vp1)[SexSortOrder]
plotPercentBars( vp1[1:50,]) + ggtitle( "Top Sex-covariate proteins, w/o Regression" )
AgeSortOrder<-order(vp1[["Age"]],decreasing=TRUE)
#rownames(cleanDat.noNA.unreg)[AgeSortOrder][1:50]
for (i in ls(vp1)) { vp1[[i]]<-vp1[[i]][AgeSortOrder]; }
rownames(vp1)<-rownames(vp1)[AgeSortOrder]
plotPercentBars( vp1[1:50,]) + ggtitle( "Top Age-covariate proteins, w/o Regression" )
CogSortOrder<-order(vp1[["Genotype"]],decreasing=TRUE)
#rownames(cleanDat.noNA.unreg)[CogSortOrder][1:50]
for (i in ls(vp1)) { vp1[[i]]<-vp1[[i]][CogSortOrder]; }
rownames(vp1)<-rownames(vp1)[CogSortOrder]
plotPercentBars( vp1[1:50,]) + ggtitle( "Top Genotype-covariate proteins, w/o Regression" )
plotPercentBars( vp1[1:50,]) + ggtitle( "Top Genotype-covariate proteins, w/o Regression" )

plotPercentBars( vp1[1:50,]) + ggtitle( "Top Age-covariate proteins, w/o Regression" )

plotPercentBars( vp1[1:50,]) + ggtitle( "Top Sex-covariate proteins, w/o Regression" )

plotPercentBars( vp1[1:50,]) + ggtitle( "Top Batch-covariate proteins, w/o Regression" )

cleanDat <- cleanDat.noNA.unreg
Exp.data.noNA.12samples <- cleanDat.noNA.unreg
cleanDat.noNA <-impute::impute.knn(as.matrix(cleanDat))$data #This cleanDat is the variable containing the data that has been regressed and corrected for age, sex and batch effects, used to verify that regression is working without error
## Cluster size 3721 broken into 2926 795
## Cluster size 2926 broken into 2138 788
## Cluster size 2138 broken into 1268 870
## Done cluster 1268
## Done cluster 870
## Done cluster 2138
## Done cluster 788
## Done cluster 2926
## Done cluster 795
varPart2 <- fitExtractVarPartModel(cleanDat.noNA, form, regvars.vp)
## Dividing work into 100 chunks...
##
## Total:69 s
saveRDS(varPart2, file = 'varPart2.rds')
vp2 <- sortCols(varPart2,FUN=max,last= c("Batch","Age","Sex","Genotype","Residuals"))
saveRDS(vp2, file = 'vp2.After.Regression.FINAL.rds')
pdf(file="Upasna_mouse_VariancePartition-regressed_all_with imputation.FINAL.pdf")
par(mfrow=c(1,1))
# Post-correction (or regression)
# cleanDat.noNA<-na.omit(cleanDat)
plotVarPart( vp2, main="Effects Stripped by Bootstrap Regression, Age, Sex and Batch" )
DxSortOrder<-order(vp2[["Batch"]],decreasing=TRUE)
#rownames(cleanDat.noNA)[DxSortOrder][1:50]
for (i in ls(vp2)) { vp2[[i]]<-vp2[[i]][DxSortOrder]; }
rownames(vp2)<-rownames(vp2)[DxSortOrder]
plotPercentBars( vp2[1:50,]) + ggtitle( "Top Batch-covariate proteins, Post-Regression, Age, Sex and Batch" )
SexSortOrder<-order(vp2[["Sex"]],decreasing=TRUE)
#rownames(cleanDat.noNA)[SexSortOrder][1:50]
for (i in ls(vp2)) { vp2[[i]]<-vp2[[i]][SexSortOrder]; }
rownames(vp2)<-rownames(vp2)[SexSortOrder]
plotPercentBars( vp2[1:50,]) + ggtitle( "Top Sex-covariate proteins, Post-Regression, Age, Sex and Batch" )
AgeSortOrder<-order(vp2[["Age"]],decreasing=TRUE)
#rownames(cleanDat.noNA)[AgeSortOrder][1:50]
for (i in ls(vp2)) { vp2[[i]]<-vp2[[i]][AgeSortOrder]; }
rownames(vp2)<-rownames(vp2)[AgeSortOrder]
plotPercentBars( vp2[1:50,]) + ggtitle( "Top Age-covariate proteins, Post-Regression, Age, Sex and Batch" )
CogSortOrder<-order(vp2[["Genotype"]],decreasing=TRUE)
#rownames(cleanDat.noNA)[CogSortOrder][1:50]
for (i in ls(vp2)) { vp2[[i]]<-vp2[[i]][CogSortOrder]; }
rownames(vp2)<-rownames(vp2)[CogSortOrder]
plotPercentBars( vp2[1:50,]) + ggtitle( "Top Genotype-covariate proteins, Post-Regression, Age, Sex and Batch" )
dev.off()
## quartz_off_screen
## 2
plotPercentBars( vp2[1:50,]) + ggtitle( "Top Batch-covariate proteins, Post-Regression, Age, Sex and Batch" )

plotPercentBars( vp2[1:50,]) + ggtitle( "Top Sex-covariate proteins, Post-Regression, Age, Sex and Batch" )

plotPercentBars( vp2[1:50,]) + ggtitle( "Top Age-covariate proteins, Post-Regression, Age, Sex and Batch" )

plotPercentBars( vp2[1:50,]) + ggtitle( "Top Genotype-covariate proteins, Post-Regression, Age, Sex and Batch" )

library(ggplot2)
# Save PDF
pdf("VPA_Before_Adjustment_Top50_PerCovariate.pdf", width=12, height=10)
par(mfrow=c(1,1))
# Batch
BatchOrder <- order(vp1[["Batch"]], decreasing=TRUE)
BatchTop50 <- rownames(vp1)[BatchOrder][1:50]
saveRDS(vp1[BatchTop50, ], "VP1_Top50_Batch.rds")
write.csv(vp1[BatchTop50, ], "VP1_Top50_Batch.csv")
plotPercentBars(vp1[BatchTop50, ]) + ggtitle("Top 50 Batch-affected Proteins (Before Adjustment)")
# Age
AgeOrder <- order(vp1[["Age"]], decreasing=TRUE)
AgeTop50 <- rownames(vp1)[AgeOrder][1:50]
saveRDS(vp1[AgeTop50, ], "VP1_Top50_Age.rds")
write.csv(vp1[AgeTop50, ], "VP1_Top50_Age.csv")
plotPercentBars(vp1[AgeTop50, ]) + ggtitle("Top 50 Age-affected Proteins (Before Adjustment)")
# Sex
SexOrder <- order(vp1[["Sex"]], decreasing=TRUE)
SexTop50 <- rownames(vp1)[SexOrder][1:50]
saveRDS(vp1[SexTop50, ], "VP1_Top50_Sex.rds")
write.csv(vp1[SexTop50, ], "VP1_Top50_Sex.csv")
plotPercentBars(vp1[SexTop50, ]) + ggtitle("Top 50 Sex-affected Proteins (Before Adjustment)")
# Genotype
GenotypeOrder <- order(vp1[["Genotype"]], decreasing=TRUE)
GenotypeTop50 <- rownames(vp1)[GenotypeOrder][1:50]
saveRDS(vp1[GenotypeTop50, ], "VP1_Top50_Genotype.rds")
write.csv(vp1[GenotypeTop50, ], "VP1_Top50_Genotype.csv")
plotPercentBars(vp1[GenotypeTop50, ]) + ggtitle("Top 50 Genotype-affected Proteins (Before Adjustment)")
dev.off()
## quartz_off_screen
## 2
color_list <- c('skyblue', 'red', 'yellow', 'green', 'purple', 'orange', 'pink', 'brown', 'grey', 'blue')
# Perform PCA with the specified color vector and other parameters
pca(cleanDat.noNA,
colvec = color_list,
labels = regvars.vp.final$Batch,
controlscale = TRUE,
scale = 3,
printres = TRUE,
printwidth = 25,
legendtitle = "Batch")
## ***PCA wrapper function***
## running...
## printing PCA to current directory...
## done.

# Perform PCA with the specified color vector and other parameters
pca(cleanDat.noNA,
colvec = color_list,
labels = regvars.vp.final$Treatment,
controlscale = TRUE,
scale = 3,
printres = TRUE,
printwidth = 25,
legendtitle = "Treatment")
## ***PCA wrapper function***
## running...
## printing PCA to current directory...
## done.

### Setup
regvars.vp <- data.frame(regvars.vp.final)
regvars.vp$Sex <- factor(regvars.vp$Sex)
regvars.vp$Genotype <- factor(regvars.vp$Genotype)
regvars.vp$Batch <- factor(regvars.vp$Batch)
regvars.vp$Age <- factor(regvars.vp$Age)
saveRDS(regvars.vp, file = 'regvars.vp.rds')
# MODEL FORMULA: All random effects
form <- ~(1|Age) + (1|Batch) + (1|Sex) + (1|Genotype)
info <- regvars.vp.final
# Calculate variance for each gene
gene_variances <- apply(Exp.data.noNA.12samples, 1, var)
# Filter genes with non-zero variance
non_zero_variance_genes <- gene_variances > 0
Exp.data.filtered <- Exp.data.noNA.12samples[non_zero_variance_genes, ]
# VPA
varPart <- fitExtractVarPartModel(Exp.data.filtered, form, regvars.vp)
## Dividing work into 100 chunks...
##
## Total:79 s
# Sort
vp <- sortCols(varPart)
# Bar plot
plotPercentBars(vp[1:50, ])

# Violin plot
plotVarPart(vp) + ggtitle('Variation Partition Analysis')

saveRDS(vp, file = 'vp.afterbatch.correction.rds')
library(dplyr)
library(tidyr)
library(ggplot2)
library(variancePartition)
#Step 1: Prepare the variance partition dataframe
# Assume your variance partition result is in `vp`
vp_df <- as.data.frame(vp)
vp_df$Gene <- rownames(vp_df) # Save gene names into a column
# Reshape to long format (needed for ggplot)
vp_long <- pivot_longer(vp_df, cols = -Gene, names_to = "Variable", values_to = "Variance")
head(vp_long, 10)
## # A tibble: 10 × 3
## Gene Variable Variance
## <chr> <chr> <dbl>
## 1 Pzp Batch 0
## 2 Pzp Age 0
## 3 Pzp Sex 0
## 4 Pzp Genotype 0
## 5 Pzp Residuals 1
## 6 Alb Batch 0.000354
## 7 Alb Age 0.0140
## 8 Alb Sex 0.00776
## 9 Alb Genotype 0
## 10 Alb Residuals 0.978
#Step 2: Find the top 10 proteins per variable
# For each variable, find top 10 genes
top10_labels <- vp_long %>%
group_by(Variable) %>%
top_n(10, Variance) %>%
ungroup()
top10_labels
## # A tibble: 338 × 3
## Gene Variable Variance
## <chr> <chr> <dbl>
## 1 Pzp Residuals 1
## 2 Apob Residuals 1
## 3 Serpina1b Residuals 1
## 4 Cp.1 Residuals 1
## 5 Hbbt1 Residuals 1
## 6 Hbbt1.1 Residuals 1
## 7 Hbbt1.2 Residuals 1
## 8 Cfh Residuals 1
## 9 Hbbt1.3 Residuals 1
## 10 Hbbt1.4 Residuals 1
## # ℹ 328 more rows
#Step 3: Plot the violin plot and add labels
# Basic violin plot
p <- ggplot(vp_long, aes(x = Variable, y = Variance)) +
geom_violin(aes(fill = Variable), trim = FALSE, color = "black", alpha = 0.5) +
geom_boxplot(width = 0.1, fill = "white", outlier.size = 0.5) +
geom_point(data = top10_labels, aes(x = Variable, y = Variance), color = "red", size = 1.5) +
geom_text(data = top10_labels, aes(x = Variable, y = Variance, label = Gene),
hjust = -0.1, size = 2, angle = 45, color = "black") +
theme_bw() +
ggtitle("5xFAD CSF: VPA with Top 10 Proteins Labeled") +
ylab("Variance explained (%)") +
xlab("") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
p

library(dplyr)
library(tidyr)
library(ggplot2)
# 1. Prepare data
vp_df <- as.data.frame(vp)
vp_df$Gene <- rownames(vp_df)
# 2. Reshape to long format
vp_long <- pivot_longer(vp_df, cols = -Gene, names_to = "Variable", values_to = "Variance")
# 3. Find top 10 genes per variable
top10 <- vp_long %>%
group_by(Variable) %>%
slice_max(order_by = Variance, n = 15) %>%
ungroup()
top10
## # A tibble: 358 × 3
## Gene Variable Variance
## <chr> <chr> <dbl>
## 1 Hapln2 Age 0.601
## 2 Ctsd Age 0.381
## 3 Fah Age 0.358
## 4 Ifit1 Age 0.343
## 5 Sprr1a Age 0.337
## 6 Mobp Age 0.318
## 7 Ppp1r14a Age 0.304
## 8 Serpinb6d Age 0.302
## 9 Col1a1 Age 0.298
## 10 Acan Age 0.278
## # ℹ 348 more rows
#Plot for Age:
top10_age <- top10 %>% filter(Variable == "Age")
ggplot(top10_age, aes(x = Variable, y = Variance)) +
geom_violin(fill = "red", trim = FALSE, color = "black", alpha = 0.6) +
geom_point(color = "black", size = 2) +
geom_text(aes(label = Gene), vjust = -0.5, size = 3, angle = 45, check_overlap = TRUE) +
theme_bw() +
ggtitle("Top 10 Proteins - Age") +
ylab("Variance explained (%)") +
xlab("") +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank())

##BY Genotype
#Plot for Genotype:
top10_genotype <- top10 %>% filter(Variable == "Genotype")
ggplot(top10_genotype, aes(x = Variable, y = Variance)) +
geom_violin(fill = "green", trim = FALSE, color = "black", alpha = 0.6) +
geom_point(color = "black", size = 2) +
geom_text(aes(label = Gene), vjust = -0.5, size = 3, angle = 45, check_overlap = TRUE) +
theme_bw() +
ggtitle("Top 10 Proteins - Genotype") +
ylab("Variance explained (%)") +
xlab("") +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank())

####Plot for Sex:
top10_sex <- top10 %>% filter(Variable == "Sex")
ggplot(top10_sex, aes(x = Variable, y = Variance)) +
geom_violin(fill = "orange", trim = FALSE, color = "black", alpha = 0.6) +
geom_point(color = "black", size = 2) +
geom_text(aes(label = Gene), vjust = -0.5, size = 3, angle = 45, check_overlap = TRUE) +
theme_bw() +
ggtitle("Top 10 Proteins - Sex") +
ylab("Variance explained (%)") +
xlab("") +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank())

##Plot for Batch:
top10_batch <- top10 %>% filter(Variable == "Batch")
ggplot(top10_batch, aes(x = Variable, y = Variance)) +
geom_violin(fill = "brown", trim = FALSE, color = "black", alpha = 0.6) +
geom_point(color = "black", size = 2) +
geom_text(aes(label = Gene), vjust = -0.5, size = 3, angle = 45, check_overlap = TRUE) +
theme_bw() +
ggtitle("Top 10 Proteins - Batch") +
ylab("Variance explained (%)") +
xlab("") +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank())
