Contents

Setup

Install and load package

if (!require("devtools")) {
  install.packages("devtools")
}
devtools::install_github("shbrief/PCAGenomicSignatures")
library(PCAGenomicSignatures)

Download PCAmodel

Currently, you can download PCAGenomicSignatures (711.2 MB) from Google Cloud bucket using AnVIL Bioconductor/R package. This model is built from 1,399 studies (containing 75,433 samples) and 7,951 common genes from each of 1,399 study’s top 90% varying genes based on thier study-level standard deviation.

dat_dir <- "~/data2/PCAGenomicSignatureLibrary/refinebioRseq"
PCAmodel <- readRDS(file.path(dat_dir, "PCAmodel_1399/refinebioRseq_PCAmodel_hclust.rds"))
# PCAmodel <- readRDS(file.path(dat_dir, "PCAmodel_1399_C2/refinebioRseq_PCAmodel_hclust_C2.rds"))
# PCAmodel <- readRDS(file.path(dat_dir, "PCAmodel_536/refinebioRseq_PCAmodel.rds"))
# PCAmodel <- readRDS(file.path(dat_dir, "PCAmodel_536_C2/refinebioRseq_PCAmodel.rds"))

Example dataset

Human B-cell expression dataset The human B-cell dataset (Gene Expression Omnibus series GSE2350) consists of 211 normal and tumor human B-cell phenotypes whose expression was profiled on Affymatrix HG-U95Av2 arrays, and it is contained in an ExpressionSet object with 6,249 features x 211 samples.

suppressPackageStartupMessages(library(bcellViper))
data(bcellViper)
dset
#> ExpressionSet (storageMode: lockedEnvironment)
#> assayData: 6249 features, 211 samples 
#>   element names: exprs 
#> protocolData: none
#> phenoData
#>   sampleNames: GSM44075 GSM44078 ... GSM44302 (211 total)
#>   varLabels: sampleID description detailed_description
#>   varMetadata: labelDescription
#> featureData: none
#> experimentData: use 'experimentData(object)'
#> Annotation:
dataset <- exprs(dset)

You can provide your own expression dataset in any of these formats: simple matrix, ExpressionSet, or SummarizedExperiment. Just make sure that genes in rows are in a ‘symbol’ format.

Validate

HeatmapTable

heatmapTable outputs a two panel table: top panel represents average silhouette width (avg.sw) and the bottom panel represents the validation score.

You can subset the validation output in two ways - cutoff value and top ranked. If you specify cutoff value though cutoff argument of heatmapTable, any validation result above cutoff value will be shown. If you specify the number of top validation results (= n) through num.out argument of heatmapTable, the output will be a n-columned heatmap table.

val_all <- validate(dataset, PCAmodel)  
heatmapTable(val_all, num.out = 7)

heatmapTable(val_all, scoreCutoff = 0.55)

Interactive Graph

Under the default condition, plotValidate plots all non single-element clusters’ validation results in a single graph, where x-axis represent average Silhouette width of the PCcluster (a quality control measure of the signature) and y-axis represent validation score. We recommend users to focus on PCclusters with higher validation score and use avgerage silhouette width as a secondary criteria.

plotValidate(val_all, interactive = FALSE)

plotValidate(val_all, interactive = TRUE, minClusterSize = 4)


You can hover each data point for more information:
- sw : the average silhouette width of the clutser
- score : the top validation score between 8 PCs of the dataset and the cluster
- cl_size : the size of the cluster, represented by the dot size
- cl_num : the PCcluster number of the model. You need this index to find more information about the cluster.
- PC : Test dataset’s PC number that validate each PCcluster. Because we used top 8 PCs of the test dataset, there are 8 categories.

If you double-click the PC legend on the right, you will enter an individual display mode where you can add an additional group of data point by single-click.

MeSH terms in wordcloud

You can draw a wordcloud with the enriched MeSH term of PCclusters that validate your dataset. validatedIndex function will output the validated index based on num.out or *Cutoff arguments in a same way as heatmapTable. Here we select the top 5 validated PCclusters.

heatmapTable(val_all, num.out = 4, scoreCutoff = 0.5, swCutoff = 0)

Because 2nd-4th PCclusters (2516, 2847, 1210) show high validation scores with positive average silhouette widths, we draw wordclouds of those PCclusters using drawWordclout function. You need to provide PCAmodel and the index of the PCcluster you are interested in.

validatedSig <- validatedSignatures(val_all, num.out = 5, scoreCutoff = 0.5, swCutoff = 0)
validated_ind <- gsub("PCcluster", "", colnames(validatedSig)) 
validated_ind <- as.numeric(validated_ind)

set.seed(1)
drawWordcloud(PCAmodel, validated_ind[1])
#> Warning in wordcloud(words = all$word, freq = all$freq, scale = scale, max.words
#> = Inf, : Protein-Arginine N-Methyltransferases could not be fit on page. It will
#> not be plotted.

drawWordcloud(PCAmodel, validated_ind[2])

drawWordcloud(PCAmodel, validated_ind[3])

drawWordcloud(PCAmodel, validated_ind[4])

GSEA (Coming soon…)

res1 <- msigdb_gsea(validated_ind[1], PCAmodel, category = "C2")
#> 
#> Attaching package: 'S4Vectors'
#> The following object is masked from 'package:PCAGenomicSignatures':
#> 
#>     metadata
#> The following object is masked from 'package:base':
#> 
#>     expand.grid
#> 
#> Excluded 1444 from.IDs without a corresponding to.ID
#> Registered S3 methods overwritten by 'clusterProfiler.dplyr':
#>   method                         from           
#>   arrange.compareClusterResult   clusterProfiler
#>   arrange.enrichResult           clusterProfiler
#>   arrange.gseaResult             clusterProfiler
#>   filter.compareClusterResult    clusterProfiler
#>   filter.enrichResult            clusterProfiler
#>   filter.gseaResult              clusterProfiler
#>   group_by.compareClusterResult  clusterProfiler
#>   group_by.enrichResult          clusterProfiler
#>   group_by.gseaResult            clusterProfiler
#>   mutate.compareClusterResult    clusterProfiler
#>   mutate.enrichResult            clusterProfiler
#>   mutate.gseaResult              clusterProfiler
#>   rename.compareClusterResult    clusterProfiler
#>   rename.enrichResult            clusterProfiler
#>   rename.gseaResult              clusterProfiler
#>   select.compareClusterResult    clusterProfiler
#>   select.enrichResult            clusterProfiler
#>   select.gseaResult              clusterProfiler
#>   slice.compareClusterResult     clusterProfiler
#>   slice.enrichResult             clusterProfiler
#>   slice.gseaResult               clusterProfiler
#>   summarise.compareClusterResult clusterProfiler
#>   summarise.enrichResult         clusterProfiler
#>   summarise.gseaResult           clusterProfiler
#> Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
#> gseaParam, : There are duplicate gene names, fgsea may produce unexpected
#> results.
#> Warning: The `add` argument of `group_by()` is deprecated as of dplyr 1.0.0.
#> Please use the `.add` argument instead.
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_warnings()` to see where this warning was generated.
dim(res1)
#> [1]  5 13
gseaSub <- subsetGSEA(res1, n = 20)
gseaBarplot(gseaRes = gseaSub)

res_os <- gseaNetwork(gseaRes = gseaSub, similarity_metric = "overlap_similarity", similarity_cutoff = 0.3)
res_js <- gseaNetwork(gseaRes = gseaSub, similarity_metric = "jaccard_similarity", similarity_cutoff = 0.3)

visNetwork::visSave(res_os, file = "res_os.html")
htmltools::includeHTML("res_os.html")
visNetwork

visNetwork::visSave(res_js, file = "res_js.html")
htmltools::includeHTML("res_js.html")
visNetwork
res2 <- msigdb_gsea(validated_ind[2], PCAmodel, category = "C2")
#> Excluded 1444 from.IDs without a corresponding to.ID
#> Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
#> gseaParam, : There are duplicate gene names, fgsea may produce unexpected
#> results.
#> Warning in fgseaMultilevel(...): For some of the pathways the P-values were
#> likely overestimated. For such pathways log2err is set to NA.
dim(res2)
#> [1] 23 13
gseaSub <- subsetGSEA(res2, n = 20)
gseaBarplot(gseaRes = gseaSub)

gseaNetwork(gseaRes = gseaSub, similarity_metric = "overlap_similarity", similarity_cutoff = 0.3)
gseaNetwork(gseaRes = gseaSub, similarity_metric = "jaccard_similarity", similarity_cutoff = 0.3)
res3 <- msigdb_gsea(validated_ind[3], PCAmodel, category = "C2")
#> Excluded 1444 from.IDs without a corresponding to.ID
#> Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
#> gseaParam, : There are duplicate gene names, fgsea may produce unexpected
#> results.
dim(res3)
#> [1] 14 13
gseaSub <- subsetGSEA(res3, n = 20)
gseaBarplot(gseaRes = gseaSub)

gseaNetwork(gseaRes = gseaSub, similarity_metric = "overlap_similarity", similarity_cutoff = 0.3)
gseaNetwork(gseaRes = gseaSub, similarity_metric = "jaccard_similarity", similarity_cutoff = 0.3)
res4 <- msigdb_gsea(validated_ind[4], PCAmodel, category = "C2")
#> Excluded 1444 from.IDs without a corresponding to.ID
#> Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
#> gseaParam, : There are duplicate gene names, fgsea may produce unexpected
#> results.
dim(res4)
#> [1] 12 13
gseaSub <- subsetGSEA(res4, n = 20)
gseaBarplot(gseaRes = gseaSub)

gseaNetwork(gseaRes = gseaSub, similarity_metric = "overlap_similarity", similarity_cutoff = 0.3)
gseaNetwork(gseaRes = gseaSub, similarity_metric = "jaccard_similarity", similarity_cutoff = 0.3)