1 Load packages
2 B cell dataset
3 GenomicSignature
4 Validate
- 4.1 Validation heatmapTable
- 4.2 Studies involved
5 Silhouette Width
6 MeSH
- 6.1 MeSH terms for each clusters
- 6.2 MeSH terms observed more than once
7 GSEA

1 Load packages

suppressPackageStartupMessages({
  library(magrittr)
  library(factoextra)
  library(rsvd)
  library(grid)
  library(SummarizedExperiment)
  library(GenomicSignatures)
  library(RColorBrewer)
  library(wordcloud)
  library(org.Hs.eg.db)
  library(msigdbr)
  library(clusterProfiler)
})

source('~/data2/Genomic_Super_Signature/refinebio/methods/4_Hierarchical_Clustering_func.R')

# color setup
colfunc <- colorRampPalette(c("white", "red"))
n <- 20
col <- colfunc(n)[c(1,2,n)]

For visualization of enrichment results

suppressPackageStartupMessages({
  library(clusterProfiler.dplyr)
  library(ggstance)
  library(enrichplot)
  library(forcats)
  library(ggplot2)
})
source('~/data2/Genomic_Super_Signature/refinebio/R/func_viz.R')

2 B cell dataset

Human B-cell expression dataset The human B-cell dataset (Gene Expression Omnibus series GSE2350) consists of 211 normal and tumor human B-cell phenotypes whose expression was profiled on Affymatrix HG-U95Av2 arrays, and it is contained in an ExpressionSet object with 6,249 features x 211 samples.

library(bcellViper)
data(bcellViper)
dset
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 6249 features, 211 samples 
##   element names: exprs 
## protocolData: none
## phenoData
##   sampleNames: GSM44075 GSM44078 ... GSM44302 (211 total)
##   varLabels: sampleID description detailed_description
##   varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation:
dataset <- exprs(dset)

3 GenomicSignature

3.1 Clustering

Distance matrix here is calculated from 13,570 PCs composed of 677 refine.bio datasets (13,540 PCs) + PC1s from 10 negative controls (10 PCs) + training subset of TCGA-COAD (20 PCs). The number of clusters to cut dendrogram of hierarchical clustering is defined in refinebio/methods/5_Hierarchical_Clustering_with_SpikeInTCGA_moreNegControl.Rmd.

dat_dir <- "~/data2/Genomic_Super_Signature/refinebio/methods/5_Hierarchical_Clustering_with_SpikeInTCGA_moreNegControl"
all <- readRDS(file.path(dat_dir, "all_10.rds"))
res.dist <- readRDS(file.path(dat_dir, "res_dist_10.rds"))

# cut the tree
res.hcut <- factoextra::hcut(res.dist, k = round(nrow(all)/2.25,0), hc_funct = "hclust", 
                             hc_method = "ward.D", hc_metric = "spearman")

max(table(res.hcut$cluster))   # 28
sum(table(res.hcut$cluster) == 1)   # 2331

dat_dir <- "~/data2/PCAGenomicSignatureLibrary/refinebioRseq/immunologic"
all <- readRDS(file.path(dat_dir, "allZ_immunologic.rds")) %>% t
res.dist <- readRDS(file.path(dat_dir, "res_dist.rds"))

# cut the tree
res.hcut <- factoextra::hcut(res.dist, k = round(nrow(all)/2.25,0), hc_funct = "hclust", 
                             hc_method = "ward.D", hc_metric = "spearman")

max(table(res.hcut$cluster))   # 26
sum(table(res.hcut$cluster) == 1)   # 2165

3.2 Calculate avgLoading

source('~/data2/GenomicSuperSignature/R/buildAvgLoading_temp.R')
PCclusters <- buildAvgLoading(t(all), clustering = FALSE, cluster = res.hcut$cluster, iter.max = 100)
names(PCclusters)

3.3 Pre-build PCAmodel

genesets <- "immunologic"
# wd <- "~/data2/PCAGenomicSignatureLibrary/refinebioRseq/canonicalPathways"
# fname <- "refinebioRseq_canonicalPathways_PCAmodel_hclust_allGenes.rds"
wd <- file.path("~/data2/PCAGenomicSignatureLibrary/refinebioRseq", genesets)
fname <- paste0("refinebioRseq_",genesets,"_PCAmodel_hclust.rds")
PCAmodel <- readRDS(file.path(wd, fname))

4 Validate

4.1 Validation heatmapTable

val_all <- validate(dataset, PCAmodel)  
## Warning in if (class(dataset) == "ExpressionSet") {: the condition has length >
## 1 and only the first element will be used
## Warning in if (class(dataset) %in% c("SummarizedExperiment",
## "RangedSummarizedExperiment")) {: the condition has length > 1 and only the
## first element will be used
## Warning in if (class(dataset) == "matrix") {: the condition has length > 1 and
## only the first element will be used
cutoff <- 0.55
k <- sapply(colnames(val_all), function(x) {sum(val_all[1,x] > cutoff) != 0})
validated_ind <- which(k=="TRUE")
heatmapTable(val_all[1, names(k)[validated_ind], drop=FALSE], 
             column_title = paste0("PCAmodel (cutoff ", cutoff, ")"), 
             colors = col)

Top validation scores are all from PC2 of the dataset.

val_all[, names(k)[validated_ind], drop=FALSE]
##       PCcluster152 PCcluster216 PCcluster409 PCcluster474 PCcluster1035
## score    0.5926301    0.6036905    0.5977377    0.5797338     0.5610756
## PC       2.0000000    2.0000000    2.0000000    2.0000000     2.0000000

Average loadings most closely releated to top 8 PCs of dataset.

val_PCs <- validate(dataset, PCAmodel, maxFrom = "avgLoading")  
## Warning in if (class(dataset) == "ExpressionSet") {: the condition has length >
## 1 and only the first element will be used
## Warning in if (class(dataset) %in% c("SummarizedExperiment",
## "RangedSummarizedExperiment")) {: the condition has length > 1 and only the
## first element will be used
## Warning in if (class(dataset) == "matrix") {: the condition has length > 1 and
## only the first element will be used
val_PCs
##                    PC1         PC2         PC3          PC4          PC5
## score        0.3134023   0.6036905   0.3043408    0.2792233    0.3053959
## avgLoading 100.0000000 216.0000000 595.0000000 3159.0000000 1126.0000000
##                    PC6        PC7          PC8
## score        0.3088051   0.313912    0.3689585
## avgLoading 306.0000000 100.000000 1015.0000000

4.2 Studies involved

studies <- studies(PCAmodel)[validated_ind]
studies
## $Cl6018_152
## [1] "ERP008608" "ERP022034" "SRP057244" "SRP066118" "SRP071854" "SRP075396"
## [7] "SRP119676" "SRP144003" "SRP144515"
## 
## $Cl6018_216
## [1] "ERP011411" "SRP072919" "SRP080367" "SRP092004" "SRP092795" "SRP124967"
## [7] "SRP132553" "SRP170684"
## 
## $Cl6018_409
##  [1] "ERP022766" "SRP011546" "SRP047476" "SRP061881" "SRP066450" "SRP101809"
##  [7] "SRP109107" "SRP125396" "SRP128610" "SRP162300" "SRP163514" "SRP178543"
## [13] "SRP185955"
## 
## $Cl6018_474
## [1] "ERP023424" "ERP105662" "SRP093990" "SRP094756" "SRP098715" "SRP101784"
## [7] "SRP117629"
## 
## $Cl6018_1035
##  [1] "SRP015640" "SRP050499" "SRP055810" "SRP071854" "SRP074894" "SRP076334"
##  [7] "SRP082321" "SRP082522" "SRP094958" "SRP106788" "SRP119800" "SRP149535"
## [13] "SRP156532" "SRP187053" "SRP213794"

5 Silhouette Width

Silhouette width ranges from -1 to 1 for each observation in your data and can be interpreted as follows:
- Values close to 1 suggest that the observation is well matched to the assigned cluster
- Values close to 0 suggest that the observation is borderline matched between two clusters
- Values close to -1 suggest that the observations may be assigned to the wrong cluster

Can I use silhouette width to assign the level of validity for avgLoadings?

x <- metadata(PCAmodel)$cluster
silh_res <- cluster::silhouette(x, res.dist)

summary(silh_res)$si.summary
##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
## -0.3014270 -0.0393883 -0.0044084 -0.0006789  0.0115878  0.9745160

cl_silh_width <- summary(silh_res)$clus.avg.widths
cl_silh_width[validated_ind] 
##         152         216         409         474        1035 
## -0.06239464  0.03437512 -0.07936130  0.14822858  0.01708513

6 MeSH

6.1 MeSH terms for each clusters

mesh_terms <- readRDS("~/data2/Genomic_Super_Signature/refinebio/data/MeSH_terms_889refinebio.rds")

for (i in seq_along(validated_ind)) {
  unique_studies <- which(PCclusters$cluster == validated_ind[i]) %>% names %>% gsub(".PC.*$", "", .) %>% unique
  mesh_terms_sub <- mesh_terms[which(mesh_terms$identifier %in% unique_studies),]
  mesh <- table(mesh_terms_sub$name) %>% sort(., decreasing = TRUE)

  fname <- paste0("PCcluster", validated_ind[i])
  assign(fname, mesh)
}

6.2 MeSH terms observed more than once

source('~/data2/GenomicSuperSignature/R/drawWordcloud.R')

set.seed(1)
for (i in seq_along(validated_ind)) {
  wc <- drawWordcloud(PCAmodel, validated_ind[i], weighted = TRUE, rm.noise = 3) 
  print(wc)
}
## `summarise()` ungrouping output (override with `.groups` argument)
## NULL
## `summarise()` ungrouping output (override with `.groups` argument)

## NULL
## `summarise()` ungrouping output (override with `.groups` argument)

## NULL
## `summarise()` ungrouping output (override with `.groups` argument)

## NULL
## `summarise()` ungrouping output (override with `.groups` argument)

## NULL

7 GSEA

7.1 MSigDB C2

# lapply(validated_ind, msigdb_gsea, PCAmodel, "C2")
msigdb_gsea(validated_ind[1], PCAmodel, category = "C2")

msigdb_gsea(validated_ind[2], PCAmodel, category = "C2")

msigdb_gsea(validated_ind[3], PCAmodel, category = "C2")

msigdb_gsea(validated_ind[4], PCAmodel, category = "C2")

msigdb_gsea(validated_ind[5], PCAmodel, category = "C2")

# msigdb_gsea(validated_ind[6], PCAmodel, category = "C2")

7.2 MSigDB C6

msigdb_gsea(validated_ind[1], PCAmodel, category = "C6")
msigdb_gsea(validated_ind[2], PCAmodel, category = "C6")
msigdb_gsea(validated_ind[3], PCAmodel, category = "C6")
msigdb_gsea(validated_ind[4], PCAmodel, category = "C6")
msigdb_gsea(validated_ind[5], PCAmodel, category = "C6")
# msigdb_gsea(validated_ind[6], PCAmodel, category = "C6")

7.3 MSigDB C7

msigdb_gsea(validated_ind[1], PCAmodel, category = "C7")

msigdb_gsea(validated_ind[2], PCAmodel, category = "C7")

msigdb_gsea(validated_ind[3], PCAmodel, category = "C7")

msigdb_gsea(validated_ind[4], PCAmodel, category = "C7")

msigdb_gsea(validated_ind[5], PCAmodel, category = "C7")

# msigdb_gsea(validated_ind[6], PCAmodel, category = "C7")

7.4 Disease Ontology

Visualize GSEA results for Disaese Ontology (= output from gseDO)

do_dotplot(validated_ind[2], PCAmodel)
do_dotplot(validated_ind[4], PCAmodel)

Validate avgLoading with B-cell dataset

2020-06-24

Contents