Contents

1 Load packages

suppressPackageStartupMessages({
  library(magrittr)
  library(factoextra)
  library(rsvd)
  library(grid)
  library(SummarizedExperiment)
  library(GenomicSignatures)
  library(RColorBrewer)
  library(wordcloud)
  library(org.Hs.eg.db)
  library(msigdbr)
  library(clusterProfiler)
})

source('~/data2/Genomic_Super_Signature/refinebio/methods/4_Hierarchical_Clustering_func.R')
# color setup
colfunc <- colorRampPalette(c("white", "red"))
n <- 20
col <- colfunc(n)[c(1,2,n)]

For visualization of enrichment results

suppressPackageStartupMessages({
  library(clusterProfiler.dplyr)
  library(ggstance)
  library(enrichplot)
  library(forcats)
  library(ggplot2)
})
source('~/data2/Genomic_Super_Signature/refinebio/R/func_viz.R')

2 B cell dataset

Human B-cell expression dataset The human B-cell dataset (Gene Expression Omnibus series GSE2350) consists of 211 normal and tumor human B-cell phenotypes whose expression was profiled on Affymatrix HG-U95Av2 arrays, and it is contained in an ExpressionSet object with 6,249 features x 211 samples.

library(bcellViper)
data(bcellViper)
dset
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 6249 features, 211 samples 
##   element names: exprs 
## protocolData: none
## phenoData
##   sampleNames: GSM44075 GSM44078 ... GSM44302 (211 total)
##   varLabels: sampleID description detailed_description
##   varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation:
dataset <- exprs(dset)

3 GenomicSignature

3.1 Clustering

Distance matrix here is calculated from 13,570 PCs composed of 677 refine.bio datasets (13,540 PCs) + PC1s from 10 negative controls (10 PCs) + training subset of TCGA-COAD (20 PCs). The number of clusters to cut dendrogram of hierarchical clustering is defined in refinebio/methods/5_Hierarchical_Clustering_with_SpikeInTCGA_moreNegControl.Rmd.

dat_dir <- "~/data2/Genomic_Super_Signature/refinebio/methods/5_Hierarchical_Clustering_with_SpikeInTCGA_moreNegControl"
all <- readRDS(file.path(dat_dir, "all_10.rds"))
res.dist <- readRDS(file.path(dat_dir, "res_dist_10.rds"))

# cut the tree
res.hcut <- factoextra::hcut(res.dist, k = round(nrow(all)/2.25,0), hc_funct = "hclust", 
                             hc_method = "ward.D", hc_metric = "spearman")

max(table(res.hcut$cluster))   # 28
sum(table(res.hcut$cluster) == 1)   # 2331

3.2 Calculate avgLoading

source('~/data2/GenomicSuperSignature/R/buildAvgLoading_temp.R')
PCclusters <- buildAvgLoading(t(all), clustering = FALSE, cluster = res.hcut$cluster, iter.max = 100)
names(PCclusters)

3.3 Pre-build PCAmodel

wd <- "~/data2/PCAGenomicSignatureLibrary/refinebioRseq/canonicalPathways"
fname <- "refinebioRseq_canonicalPathways_PCAmodel_hclust.rds"
PCAmodel <- readRDS(file.path(wd, fname))

4 Validate

4.1 Validation heatmapTable

val_all <- validate(dataset, PCAmodel)  
## Warning in if (class(dataset) == "ExpressionSet") {: the condition has length >
## 1 and only the first element will be used
## Warning in if (class(dataset) %in% c("SummarizedExperiment",
## "RangedSummarizedExperiment")) {: the condition has length > 1 and only the
## first element will be used
## Warning in if (class(dataset) == "matrix") {: the condition has length > 1 and
## only the first element will be used
cutoff <- 0.6
k <- sapply(colnames(val_all), function(x) {sum(val_all[,x] > cutoff) != 0})
validated_ind <- which(k=="TRUE")
heatmapTable(val_all[, names(k)[validated_ind], drop=FALSE], 
             column_title = paste0("PCAmodel (cutoff ", cutoff, ")"), 
             colors = col)

4.2 Studies involved

studies <- studies(PCAmodel)[validated_ind]
studies
## $Cl6031_208
##  [1] "ERP011411" "ERP021140" "SRP074198" "SRP074715" "SRP080367" "SRP092795"
##  [7] "SRP093793" "SRP102549" "SRP107198" "SRP124967" "SRP127035" "SRP132553"
## [13] "SRP149847" "SRP170684" "SRP186685" "SRP201347"
## 
## $Cl6031_457
## [1] "ERP023424" "SRP093990" "SRP095405" "SRP144647"
## 
## $Cl6031_604
## [1] "ERP105662" "SRP028567" "SRP093990" "SRP098715" "SRP101784" "SRP117629"
## 
## $Cl6031_990
##  [1] "SRP014320" "SRP064259" "SRP072919" "SRP075396" "SRP078505" "SRP092004"
##  [7] "SRP094756" "SRP098926" "SRP111343" "SRP113755" "SRP122507" "SRP132553"
## 
## $Cl6031_1746
##  [1] "SRP055810" "SRP071854" "SRP073318" "SRP074894" "SRP075613" "SRP076334"
##  [7] "SRP082321" "SRP082522" "SRP108039" "SRP108807" "SRP119800" "SRP125154"
## [13] "SRP132313" "SRP133058" "SRP144750" "SRP154483" "SRP179781"
## 
## $Cl6031_1934
##  [1] "SRP059035" "SRP066834" "SRP096727" "SRP124299" "SRP128610" "SRP148472"
##  [7] "SRP154573" "SRP161553" "SRP162300" "SRP178543"

5 MeSH

5.1 MeSH terms for each clusters

mesh_terms <- readRDS("~/data2/Genomic_Super_Signature/refinebio/data/MeSH_terms_889refinebio.rds")

for (i in seq_along(validated_ind)) {
  unique_studies <- which(PCclusters$cluster == validated_ind[i]) %>% names %>% gsub(".PC.*$", "", .) %>% unique
  mesh_terms_sub <- mesh_terms[which(mesh_terms$identifier %in% unique_studies),]
  mesh <- table(mesh_terms_sub$name) %>% sort(., decreasing = TRUE)

  fname <- paste0("PCcluster", validated_ind[i])
  assign(fname, mesh)
}

5.2 MeSH terms observed more than once

for (ind in validated_ind) {
  wc <- drawWordcloud(PCAmodel, ind, rm.noise = 4, weighted = TRUE) 
  print(wc)
}

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

6 GSEA

6.1 MSigDB C2

# lapply(validated_ind, msigdb_gsea, PCAmodel, "C2")
msigdb_gsea(validated_ind[1], PCAmodel, category = "C2")

msigdb_gsea(validated_ind[2], PCAmodel, category = "C2")

msigdb_gsea(validated_ind[3], PCAmodel, category = "C2")

msigdb_gsea(validated_ind[4], PCAmodel, category = "C2")

msigdb_gsea(validated_ind[5], PCAmodel, category = "C2")

msigdb_gsea(validated_ind[6], PCAmodel, category = "C2")

6.2 MSigDB C6

msigdb_gsea(validated_ind[1], PCAmodel, category = "C6")
msigdb_gsea(validated_ind[2], PCAmodel, category = "C6")
msigdb_gsea(validated_ind[3], PCAmodel, category = "C6")
msigdb_gsea(validated_ind[4], PCAmodel, category = "C6")
msigdb_gsea(validated_ind[5], PCAmodel, category = "C6")
msigdb_gsea(validated_ind[6], PCAmodel, category = "C6")

6.3 Disease Ontology

Visualize GSEA results for Disaese Ontology (= output from gseDO)

do_dotplot(validated_ind[2], PCAmodel)
do_dotplot(validated_ind[4], PCAmodel)