suppressPackageStartupMessages({
library(magrittr)
library(factoextra)
library(rsvd)
library(grid)
library(SummarizedExperiment)
library(GenomicSignatures)
library(RColorBrewer)
library(wordcloud)
library(org.Hs.eg.db)
library(msigdbr)
library(clusterProfiler)
})
source('~/data2/Genomic_Super_Signature/refinebio/methods/4_Hierarchical_Clustering_func.R')
# color setup
colfunc <- colorRampPalette(c("white", "red"))
n <- 20
col <- colfunc(n)[c(1,2,n)]
For visualization of enrichment results
suppressPackageStartupMessages({
library(clusterProfiler.dplyr)
library(ggstance)
library(enrichplot)
library(forcats)
library(ggplot2)
})
source('~/data2/Genomic_Super_Signature/refinebio/R/func_viz.R')
Human B-cell expression dataset The human B-cell dataset (Gene Expression Omnibus series GSE2350) consists of 211 normal and tumor human B-cell phenotypes whose expression was profiled on Affymatrix HG-U95Av2 arrays, and it is contained in an ExpressionSet object with 6,249 features x 211 samples.
library(bcellViper)
data(bcellViper)
dset
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 6249 features, 211 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: GSM44075 GSM44078 ... GSM44302 (211 total)
## varLabels: sampleID description detailed_description
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation:
dataset <- exprs(dset)
Distance matrix here is calculated from 13,570 PCs composed of 677 refine.bio
datasets (13,540 PCs) + PC1s from 10 negative controls (10 PCs) + training subset
of TCGA-COAD (20 PCs). The number of clusters to cut dendrogram of hierarchical
clustering is defined in
refinebio/methods/5_Hierarchical_Clustering_with_SpikeInTCGA_moreNegControl.Rmd.
dat_dir <- "~/data2/Genomic_Super_Signature/refinebio/methods/5_Hierarchical_Clustering_with_SpikeInTCGA_moreNegControl"
all <- readRDS(file.path(dat_dir, "all_10.rds"))
res.dist <- readRDS(file.path(dat_dir, "res_dist_10.rds"))
# cut the tree
res.hcut <- factoextra::hcut(res.dist, k = round(nrow(all)/2.25,0), hc_funct = "hclust",
hc_method = "ward.D", hc_metric = "spearman")
max(table(res.hcut$cluster)) # 28
sum(table(res.hcut$cluster) == 1) # 2331
dat_dir <- "~/data2/PCAGenomicSignatureLibrary/refinebioRseq/immunologic"
all <- readRDS(file.path(dat_dir, "allZ_immunologic.rds")) %>% t
res.dist <- readRDS(file.path(dat_dir, "res_dist.rds"))
# cut the tree
res.hcut <- factoextra::hcut(res.dist, k = round(nrow(all)/2.25,0), hc_funct = "hclust",
hc_method = "ward.D", hc_metric = "spearman")
max(table(res.hcut$cluster)) # 26
sum(table(res.hcut$cluster) == 1) # 2165
source('~/data2/GenomicSuperSignature/R/buildAvgLoading_temp.R')
PCclusters <- buildAvgLoading(t(all), clustering = FALSE, cluster = res.hcut$cluster, iter.max = 100)
names(PCclusters)
genesets <- "immunologic"
# wd <- "~/data2/PCAGenomicSignatureLibrary/refinebioRseq/canonicalPathways"
# fname <- "refinebioRseq_canonicalPathways_PCAmodel_hclust_allGenes.rds"
wd <- file.path("~/data2/PCAGenomicSignatureLibrary/refinebioRseq", genesets)
fname <- paste0("refinebioRseq_",genesets,"_PCAmodel_hclust.rds")
PCAmodel <- readRDS(file.path(wd, fname))
val_all <- validate(dataset, PCAmodel)
## Warning in if (class(dataset) == "ExpressionSet") {: the condition has length >
## 1 and only the first element will be used
## Warning in if (class(dataset) %in% c("SummarizedExperiment",
## "RangedSummarizedExperiment")) {: the condition has length > 1 and only the
## first element will be used
## Warning in if (class(dataset) == "matrix") {: the condition has length > 1 and
## only the first element will be used
cutoff <- 0.55
k <- sapply(colnames(val_all), function(x) {sum(val_all[1,x] > cutoff) != 0})
validated_ind <- which(k=="TRUE")
heatmapTable(val_all[1, names(k)[validated_ind], drop=FALSE],
column_title = paste0("PCAmodel (cutoff ", cutoff, ")"),
colors = col)
Top validation scores are all from PC2 of the dataset.
val_all[, names(k)[validated_ind], drop=FALSE]
## PCcluster152 PCcluster216 PCcluster409 PCcluster474 PCcluster1035
## score 0.5926301 0.6036905 0.5977377 0.5797338 0.5610756
## PC 2.0000000 2.0000000 2.0000000 2.0000000 2.0000000
Average loadings most closely releated to top 8 PCs of dataset.
val_PCs <- validate(dataset, PCAmodel, maxFrom = "avgLoading")
## Warning in if (class(dataset) == "ExpressionSet") {: the condition has length >
## 1 and only the first element will be used
## Warning in if (class(dataset) %in% c("SummarizedExperiment",
## "RangedSummarizedExperiment")) {: the condition has length > 1 and only the
## first element will be used
## Warning in if (class(dataset) == "matrix") {: the condition has length > 1 and
## only the first element will be used
val_PCs
## PC1 PC2 PC3 PC4 PC5
## score 0.3134023 0.6036905 0.3043408 0.2792233 0.3053959
## avgLoading 100.0000000 216.0000000 595.0000000 3159.0000000 1126.0000000
## PC6 PC7 PC8
## score 0.3088051 0.313912 0.3689585
## avgLoading 306.0000000 100.000000 1015.0000000
studies <- studies(PCAmodel)[validated_ind]
studies
## $Cl6018_152
## [1] "ERP008608" "ERP022034" "SRP057244" "SRP066118" "SRP071854" "SRP075396"
## [7] "SRP119676" "SRP144003" "SRP144515"
##
## $Cl6018_216
## [1] "ERP011411" "SRP072919" "SRP080367" "SRP092004" "SRP092795" "SRP124967"
## [7] "SRP132553" "SRP170684"
##
## $Cl6018_409
## [1] "ERP022766" "SRP011546" "SRP047476" "SRP061881" "SRP066450" "SRP101809"
## [7] "SRP109107" "SRP125396" "SRP128610" "SRP162300" "SRP163514" "SRP178543"
## [13] "SRP185955"
##
## $Cl6018_474
## [1] "ERP023424" "ERP105662" "SRP093990" "SRP094756" "SRP098715" "SRP101784"
## [7] "SRP117629"
##
## $Cl6018_1035
## [1] "SRP015640" "SRP050499" "SRP055810" "SRP071854" "SRP074894" "SRP076334"
## [7] "SRP082321" "SRP082522" "SRP094958" "SRP106788" "SRP119800" "SRP149535"
## [13] "SRP156532" "SRP187053" "SRP213794"
Silhouette width ranges from -1 to 1 for each observation in your data and can be
interpreted as follows:
- Values close to 1 suggest that the observation is well matched to the assigned cluster
- Values close to 0 suggest that the observation is borderline matched between two clusters
- Values close to -1 suggest that the observations may be assigned to the wrong cluster
Can I use silhouette width to assign the level of validity for avgLoadings?
x <- metadata(PCAmodel)$cluster
silh_res <- cluster::silhouette(x, res.dist)
summary(silh_res)$si.summary
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.3014270 -0.0393883 -0.0044084 -0.0006789 0.0115878 0.9745160
cl_silh_width <- summary(silh_res)$clus.avg.widths
cl_silh_width[validated_ind]
## 152 216 409 474 1035
## -0.06239464 0.03437512 -0.07936130 0.14822858 0.01708513
mesh_terms <- readRDS("~/data2/Genomic_Super_Signature/refinebio/data/MeSH_terms_889refinebio.rds")
for (i in seq_along(validated_ind)) {
unique_studies <- which(PCclusters$cluster == validated_ind[i]) %>% names %>% gsub(".PC.*$", "", .) %>% unique
mesh_terms_sub <- mesh_terms[which(mesh_terms$identifier %in% unique_studies),]
mesh <- table(mesh_terms_sub$name) %>% sort(., decreasing = TRUE)
fname <- paste0("PCcluster", validated_ind[i])
assign(fname, mesh)
}
source('~/data2/GenomicSuperSignature/R/drawWordcloud.R')
set.seed(1)
for (i in seq_along(validated_ind)) {
wc <- drawWordcloud(PCAmodel, validated_ind[i], weighted = TRUE, rm.noise = 3)
print(wc)
}
## `summarise()` ungrouping output (override with `.groups` argument)
## NULL
## `summarise()` ungrouping output (override with `.groups` argument)
## NULL
## `summarise()` ungrouping output (override with `.groups` argument)
## NULL
## `summarise()` ungrouping output (override with `.groups` argument)
## NULL
## `summarise()` ungrouping output (override with `.groups` argument)
## NULL
# lapply(validated_ind, msigdb_gsea, PCAmodel, "C2")
msigdb_gsea(validated_ind[1], PCAmodel, category = "C2")
msigdb_gsea(validated_ind[2], PCAmodel, category = "C2")
msigdb_gsea(validated_ind[3], PCAmodel, category = "C2")
msigdb_gsea(validated_ind[4], PCAmodel, category = "C2")
msigdb_gsea(validated_ind[5], PCAmodel, category = "C2")
# msigdb_gsea(validated_ind[6], PCAmodel, category = "C2")
msigdb_gsea(validated_ind[1], PCAmodel, category = "C6")
msigdb_gsea(validated_ind[2], PCAmodel, category = "C6")
msigdb_gsea(validated_ind[3], PCAmodel, category = "C6")
msigdb_gsea(validated_ind[4], PCAmodel, category = "C6")
msigdb_gsea(validated_ind[5], PCAmodel, category = "C6")
# msigdb_gsea(validated_ind[6], PCAmodel, category = "C6")
msigdb_gsea(validated_ind[1], PCAmodel, category = "C7")
msigdb_gsea(validated_ind[2], PCAmodel, category = "C7")
msigdb_gsea(validated_ind[3], PCAmodel, category = "C7")
msigdb_gsea(validated_ind[4], PCAmodel, category = "C7")
msigdb_gsea(validated_ind[5], PCAmodel, category = "C7")
# msigdb_gsea(validated_ind[6], PCAmodel, category = "C7")
Visualize GSEA results for Disaese Ontology (= output from gseDO)
do_dotplot(validated_ind[2], PCAmodel)
do_dotplot(validated_ind[4], PCAmodel)