suppressPackageStartupMessages({
library(magrittr)
library(factoextra)
library(rsvd)
library(grid)
library(SummarizedExperiment)
library(GenomicSignatures)
library(RColorBrewer)
library(wordcloud)
library(org.Hs.eg.db)
library(msigdbr)
library(clusterProfiler)
})
source('~/data2/Genomic_Super_Signature/refinebio/methods/4_Hierarchical_Clustering_func.R')
# color setup
colfunc <- colorRampPalette(c("white", "red"))
n <- 20
col <- colfunc(n)[c(1,2,n)]
For visualization of enrichment results
suppressPackageStartupMessages({
library(clusterProfiler.dplyr)
library(ggstance)
library(enrichplot)
library(forcats)
library(ggplot2)
})
source('~/data2/Genomic_Super_Signature/refinebio/R/func_viz.R')
Human B-cell expression dataset The human B-cell dataset (Gene Expression Omnibus series GSE2350) consists of 211 normal and tumor human B-cell phenotypes whose expression was profiled on Affymatrix HG-U95Av2 arrays, and it is contained in an ExpressionSet object with 6,249 features x 211 samples.
library(bcellViper)
data(bcellViper)
dset
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 6249 features, 211 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: GSM44075 GSM44078 ... GSM44302 (211 total)
## varLabels: sampleID description detailed_description
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation:
dataset <- exprs(dset)
Distance matrix here is calculated from 13,570 PCs composed of 677 refine.bio
datasets (13,540 PCs) + PC1s from 10 negative controls (10 PCs) + training subset
of TCGA-COAD (20 PCs). The number of clusters to cut dendrogram of hierarchical
clustering is defined in
refinebio/methods/5_Hierarchical_Clustering_with_SpikeInTCGA_moreNegControl.Rmd.
dat_dir <- "~/data2/Genomic_Super_Signature/refinebio/methods/5_Hierarchical_Clustering_with_SpikeInTCGA_moreNegControl"
all <- readRDS(file.path(dat_dir, "all_10.rds"))
res.dist <- readRDS(file.path(dat_dir, "res_dist_10.rds"))
# cut the tree
res.hcut <- factoextra::hcut(res.dist, k = round(nrow(all)/2.25,0), hc_funct = "hclust",
hc_method = "ward.D", hc_metric = "spearman")
max(table(res.hcut$cluster)) # 28
sum(table(res.hcut$cluster) == 1) # 2331
source('~/data2/GenomicSuperSignature/R/buildAvgLoading_temp.R')
PCclusters <- buildAvgLoading(t(all), clustering = FALSE, cluster = res.hcut$cluster, iter.max = 100)
names(PCclusters)
wd <- "~/data2/PCAGenomicSignatureLibrary/refinebioRseq/canonicalPathways"
fname <- "refinebioRseq_canonicalPathways_PCAmodel_hclust.rds"
PCAmodel <- readRDS(file.path(wd, fname))
val_all <- validate(dataset, PCAmodel)
## Warning in if (class(dataset) == "ExpressionSet") {: the condition has length >
## 1 and only the first element will be used
## Warning in if (class(dataset) %in% c("SummarizedExperiment",
## "RangedSummarizedExperiment")) {: the condition has length > 1 and only the
## first element will be used
## Warning in if (class(dataset) == "matrix") {: the condition has length > 1 and
## only the first element will be used
cutoff <- 0.6
k <- sapply(colnames(val_all), function(x) {sum(val_all[,x] > cutoff) != 0})
validated_ind <- which(k=="TRUE")
heatmapTable(val_all[, names(k)[validated_ind], drop=FALSE],
column_title = paste0("PCAmodel (cutoff ", cutoff, ")"),
colors = col)
studies <- studies(PCAmodel)[validated_ind]
studies
## $Cl6031_208
## [1] "ERP011411" "ERP021140" "SRP074198" "SRP074715" "SRP080367" "SRP092795"
## [7] "SRP093793" "SRP102549" "SRP107198" "SRP124967" "SRP127035" "SRP132553"
## [13] "SRP149847" "SRP170684" "SRP186685" "SRP201347"
##
## $Cl6031_457
## [1] "ERP023424" "SRP093990" "SRP095405" "SRP144647"
##
## $Cl6031_604
## [1] "ERP105662" "SRP028567" "SRP093990" "SRP098715" "SRP101784" "SRP117629"
##
## $Cl6031_990
## [1] "SRP014320" "SRP064259" "SRP072919" "SRP075396" "SRP078505" "SRP092004"
## [7] "SRP094756" "SRP098926" "SRP111343" "SRP113755" "SRP122507" "SRP132553"
##
## $Cl6031_1746
## [1] "SRP055810" "SRP071854" "SRP073318" "SRP074894" "SRP075613" "SRP076334"
## [7] "SRP082321" "SRP082522" "SRP108039" "SRP108807" "SRP119800" "SRP125154"
## [13] "SRP132313" "SRP133058" "SRP144750" "SRP154483" "SRP179781"
##
## $Cl6031_1934
## [1] "SRP059035" "SRP066834" "SRP096727" "SRP124299" "SRP128610" "SRP148472"
## [7] "SRP154573" "SRP161553" "SRP162300" "SRP178543"
mesh_terms <- readRDS("~/data2/Genomic_Super_Signature/refinebio/data/MeSH_terms_889refinebio.rds")
for (i in seq_along(validated_ind)) {
unique_studies <- which(PCclusters$cluster == validated_ind[i]) %>% names %>% gsub(".PC.*$", "", .) %>% unique
mesh_terms_sub <- mesh_terms[which(mesh_terms$identifier %in% unique_studies),]
mesh <- table(mesh_terms_sub$name) %>% sort(., decreasing = TRUE)
fname <- paste0("PCcluster", validated_ind[i])
assign(fname, mesh)
}
for (ind in validated_ind) {
wc <- drawWordcloud(PCAmodel, ind, rm.noise = 4, weighted = TRUE)
print(wc)
}
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
# lapply(validated_ind, msigdb_gsea, PCAmodel, "C2")
msigdb_gsea(validated_ind[1], PCAmodel, category = "C2")
msigdb_gsea(validated_ind[2], PCAmodel, category = "C2")
msigdb_gsea(validated_ind[3], PCAmodel, category = "C2")
msigdb_gsea(validated_ind[4], PCAmodel, category = "C2")
msigdb_gsea(validated_ind[5], PCAmodel, category = "C2")
msigdb_gsea(validated_ind[6], PCAmodel, category = "C2")
msigdb_gsea(validated_ind[1], PCAmodel, category = "C6")
msigdb_gsea(validated_ind[2], PCAmodel, category = "C6")
msigdb_gsea(validated_ind[3], PCAmodel, category = "C6")
msigdb_gsea(validated_ind[4], PCAmodel, category = "C6")
msigdb_gsea(validated_ind[5], PCAmodel, category = "C6")
msigdb_gsea(validated_ind[6], PCAmodel, category = "C6")
Visualize GSEA results for Disaese Ontology (= output from gseDO)
do_dotplot(validated_ind[2], PCAmodel)
do_dotplot(validated_ind[4], PCAmodel)