if (!require("devtools")) {
install.packages("devtools")
}
devtools::install_github("shbrief/PCAGenomicSignatures")
library(PCAGenomicSignatures)
Currently, you can download PCAGenomicSignatures (711.2 MB) from Google Cloud bucket using AnVIL Bioconductor/R package. This model is built from 1,399 studies (containing 75,433 samples) and 7,951 common genes from each of 1,399 study’s top 90% varying genes based on thier study-level standard deviation.
dat_dir <- "~/data2/PCAGenomicSignatureLibrary/refinebioRseq"
PCAmodel <- readRDS(file.path(dat_dir, "PCAmodel_1399/refinebioRseq_PCAmodel_hclust.rds"))
# PCAmodel <- readRDS(file.path(dat_dir, "PCAmodel_1399_C2/refinebioRseq_PCAmodel_hclust_C2.rds"))
# PCAmodel <- readRDS(file.path(dat_dir, "PCAmodel_536/refinebioRseq_PCAmodel.rds"))
# PCAmodel <- readRDS(file.path(dat_dir, "PCAmodel_536_C2/refinebioRseq_PCAmodel.rds"))
Human B-cell expression dataset The human B-cell dataset (Gene Expression Omnibus series GSE2350) consists of 211 normal and tumor human B-cell phenotypes whose expression was profiled on Affymatrix HG-U95Av2 arrays, and it is contained in an ExpressionSet object with 6,249 features x 211 samples.
suppressPackageStartupMessages(library(bcellViper))
data(bcellViper)
dset
#> ExpressionSet (storageMode: lockedEnvironment)
#> assayData: 6249 features, 211 samples
#> element names: exprs
#> protocolData: none
#> phenoData
#> sampleNames: GSM44075 GSM44078 ... GSM44302 (211 total)
#> varLabels: sampleID description detailed_description
#> varMetadata: labelDescription
#> featureData: none
#> experimentData: use 'experimentData(object)'
#> Annotation:
dataset <- exprs(dset)
You can provide your own expression dataset in any of these formats: simple matrix, ExpressionSet, or SummarizedExperiment. Just make sure that genes in rows are in a ‘symbol’ format.
heatmapTable outputs a two panel table: top panel represents average silhouette
width (avg.sw) and the bottom panel represents the validation score.
You can subset the validation output in two ways - cutoff value and top ranked.
If you specify cutoff value though cutoff argument of heatmapTable, any validation
result above cutoff value will be shown. If you specify the number of top validation
results (= n) through num.out argument of heatmapTable, the output will be a n-columned
heatmap table.
val_all <- validate(dataset, PCAmodel)
heatmapTable(val_all, num.out = 7)
heatmapTable(val_all, scoreCutoff = 0.55)
Under the default condition, plotValidate plots all non single-element clusters’
validation results in a single graph, where x-axis represent average Silhouette
width of the PCcluster (a quality control measure of the signature) and y-axis
represent validation score. We recommend users to focus on PCclusters with higher
validation score and use avgerage silhouette width as a secondary criteria.
plotValidate(val_all, interactive = FALSE)
plotValidate(val_all, interactive = TRUE, minClusterSize = 4)
You can hover each data point for more information:
- sw : the average silhouette width of the clutser
- score : the top validation score between 8 PCs of the dataset and the cluster
- cl_size : the size of the cluster, represented by the dot size
- cl_num : the PCcluster number of the model. You need this index to find more information
about the cluster.
- PC : Test dataset’s PC number that validate each PCcluster. Because we used
top 8 PCs of the test dataset, there are 8 categories.
If you double-click the PC legend on the right, you will enter an individual display mode where you can add an additional group of data point by single-click.
You can draw a wordcloud with the enriched MeSH term of PCclusters that validate
your dataset. validatedIndex function will output the validated index based on
num.out or *Cutoff arguments in a same way as heatmapTable. Here we select the
top 5 validated PCclusters.
heatmapTable(val_all, num.out = 4, scoreCutoff = 0.5, swCutoff = 0)
Because 2nd-4th PCclusters (2516, 2847, 1210) show high validation scores with positive
average silhouette widths, we draw wordclouds of those PCclusters using drawWordclout
function. You need to provide PCAmodel and the index of the PCcluster you are interested in.
validatedSig <- validatedSignatures(val_all, num.out = 5, scoreCutoff = 0.5, swCutoff = 0)
validated_ind <- gsub("PCcluster", "", colnames(validatedSig))
validated_ind <- as.numeric(validated_ind)
set.seed(1)
drawWordcloud(PCAmodel, validated_ind[1])
#> Warning in wordcloud(words = all$word, freq = all$freq, scale = scale, max.words
#> = Inf, : Protein-Arginine N-Methyltransferases could not be fit on page. It will
#> not be plotted.
drawWordcloud(PCAmodel, validated_ind[2])
drawWordcloud(PCAmodel, validated_ind[3])
drawWordcloud(PCAmodel, validated_ind[4])
res1 <- msigdb_gsea(validated_ind[1], PCAmodel, category = "C2")
#>
#> Attaching package: 'S4Vectors'
#> The following object is masked from 'package:PCAGenomicSignatures':
#>
#> metadata
#> The following object is masked from 'package:base':
#>
#> expand.grid
#>
#> Excluded 1444 from.IDs without a corresponding to.ID
#> Registered S3 methods overwritten by 'clusterProfiler.dplyr':
#> method from
#> arrange.compareClusterResult clusterProfiler
#> arrange.enrichResult clusterProfiler
#> arrange.gseaResult clusterProfiler
#> filter.compareClusterResult clusterProfiler
#> filter.enrichResult clusterProfiler
#> filter.gseaResult clusterProfiler
#> group_by.compareClusterResult clusterProfiler
#> group_by.enrichResult clusterProfiler
#> group_by.gseaResult clusterProfiler
#> mutate.compareClusterResult clusterProfiler
#> mutate.enrichResult clusterProfiler
#> mutate.gseaResult clusterProfiler
#> rename.compareClusterResult clusterProfiler
#> rename.enrichResult clusterProfiler
#> rename.gseaResult clusterProfiler
#> select.compareClusterResult clusterProfiler
#> select.enrichResult clusterProfiler
#> select.gseaResult clusterProfiler
#> slice.compareClusterResult clusterProfiler
#> slice.enrichResult clusterProfiler
#> slice.gseaResult clusterProfiler
#> summarise.compareClusterResult clusterProfiler
#> summarise.enrichResult clusterProfiler
#> summarise.gseaResult clusterProfiler
#> Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
#> gseaParam, : There are duplicate gene names, fgsea may produce unexpected
#> results.
#> Warning: The `add` argument of `group_by()` is deprecated as of dplyr 1.0.0.
#> Please use the `.add` argument instead.
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_warnings()` to see where this warning was generated.
dim(res1)
#> [1] 5 13
gseaSub <- subsetGSEA(res1, n = 20)
gseaBarplot(gseaRes = gseaSub)
res_os <- gseaNetwork(gseaRes = gseaSub, similarity_metric = "overlap_similarity", similarity_cutoff = 0.3)
res_js <- gseaNetwork(gseaRes = gseaSub, similarity_metric = "jaccard_similarity", similarity_cutoff = 0.3)
visNetwork::visSave(res_os, file = "res_os.html")
htmltools::includeHTML("res_os.html")
visNetwork::visSave(res_js, file = "res_js.html")
htmltools::includeHTML("res_js.html")
res2 <- msigdb_gsea(validated_ind[2], PCAmodel, category = "C2")
#> Excluded 1444 from.IDs without a corresponding to.ID
#> Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
#> gseaParam, : There are duplicate gene names, fgsea may produce unexpected
#> results.
#> Warning in fgseaMultilevel(...): For some of the pathways the P-values were
#> likely overestimated. For such pathways log2err is set to NA.
dim(res2)
#> [1] 23 13
gseaSub <- subsetGSEA(res2, n = 20)
gseaBarplot(gseaRes = gseaSub)
gseaNetwork(gseaRes = gseaSub, similarity_metric = "overlap_similarity", similarity_cutoff = 0.3)
gseaNetwork(gseaRes = gseaSub, similarity_metric = "jaccard_similarity", similarity_cutoff = 0.3)
res3 <- msigdb_gsea(validated_ind[3], PCAmodel, category = "C2")
#> Excluded 1444 from.IDs without a corresponding to.ID
#> Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
#> gseaParam, : There are duplicate gene names, fgsea may produce unexpected
#> results.
dim(res3)
#> [1] 14 13
gseaSub <- subsetGSEA(res3, n = 20)
gseaBarplot(gseaRes = gseaSub)
gseaNetwork(gseaRes = gseaSub, similarity_metric = "overlap_similarity", similarity_cutoff = 0.3)
gseaNetwork(gseaRes = gseaSub, similarity_metric = "jaccard_similarity", similarity_cutoff = 0.3)
res4 <- msigdb_gsea(validated_ind[4], PCAmodel, category = "C2")
#> Excluded 1444 from.IDs without a corresponding to.ID
#> Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
#> gseaParam, : There are duplicate gene names, fgsea may produce unexpected
#> results.
dim(res4)
#> [1] 12 13
gseaSub <- subsetGSEA(res4, n = 20)
gseaBarplot(gseaRes = gseaSub)
gseaNetwork(gseaRes = gseaSub, similarity_metric = "overlap_similarity", similarity_cutoff = 0.3)
gseaNetwork(gseaRes = gseaSub, similarity_metric = "jaccard_similarity", similarity_cutoff = 0.3)