1 PCAmodel

# Directory that holds the saved PCAmodel objects (.rds files).
dat_dir <- "~/data2/PCAGenomicSignatureLibrary/refinebioRseq"
# Exactly one model is read into `PCAmodel`; the commented-out readRDS() calls
# point at alternative model files under the same directory.
# NOTE(review): the "_C2" suffix presumably refers to the MSigDB C2 collection
# used for GSEA later in this report -- confirm.
# PCAmodel <- readRDS(file.path(dat_dir, "PCAmodel_1399/refinebioRseq_PCAmodel_hclust.rds"))
PCAmodel <- readRDS(file.path(dat_dir, "PCAmodel_1399_C2/refinebioRseq_PCAmodel_hclust_C2.rds"))
# PCAmodel <- readRDS(file.path(dat_dir, "PCAmodel_536/refinebioRseq_PCAmodel.rds"))
# PCAmodel <- readRDS(file.path(dat_dir, "PCAmodel_536_C2/refinebioRseq_PCAmodel.rds"))

2 TCGA

# load() restores the object `TCGA_validationDatasets` into the workspace.
load("~/data2/GenomicSuperSignature/data/TCGA_validationDatasets.rda")
names(TCGA_validationDatasets)
## [1] "COAD" "BRCA" "LUAD" "READ" "OV"
# Pick the COAD dataset by name instead of by position, so the selection does
# not silently change if the order of the list ever does.
dataset <- TCGA_validationDatasets[["COAD"]]

3 Extra functions

# Count, or compute the share of, GSEA result IDs that match a keyword.
#
# Arguments:
#   gseaRes     Object whose `$ID` element holds the IDs to search.
#   keyword     Pattern passed on to stringr::str_detect().
#   proportion  FALSE (default) returns the number of matching IDs; any other
#               value returns matches divided by the total number of IDs.
#
# Returns a single number. An empty `ID` yields 0 in both modes (rather than
# the NaN produced by 0 / 0), and NA IDs are counted as non-matches instead of
# turning the whole result into NA.
pathwaySearch <- function(gseaRes, keyword, proportion = FALSE) {
  ids <- unlist(as.list(gseaRes$ID), use.names = FALSE)
  n_total <- length(ids)
  want_count <- isFALSE(proportion)

  # Nothing to search: return early so proportion mode never divides by zero.
  if (n_total == 0L) {
    return(if (want_count) 0L else 0)
  }

  n_hits <- sum(stringr::str_detect(ids, keyword), na.rm = TRUE)
  if (want_count) {
    n_hits
  } else {
    n_hits / n_total
  }
}

4 Apply PCAmodel

# Validate the selected TCGA dataset against the loaded PCAmodel. `val_all` is
# the input to every table and plot call in this section.
val_all <- validate(dataset, PCAmodel)

# Print the validated signatures with num.out = 10; the captured output below
# has the columns score, PC, sw, cl_size and cl_num.
validatedSignatures(val_all, num.out = 10)
##                   score PC          sw cl_size cl_num
## PCcluster1285 0.6979823  4  0.37984982       5   1285
## PCcluster52   0.6973840  3  0.07131795       8     52
## PCcluster2746 0.6882357  3 -0.02113886      10   2746
## PCcluster4751 0.6817435  2  0.05977570       7   4751
## PCcluster480  0.6730446  3 -0.07261584      11    480
## PCcluster972  0.6640647  3 -0.03666227       8    972
## PCcluster6189 0.6561188  4  0.04593779       6   6189
## PCcluster1934 0.6526794  5  0.11082432       2   1934
## PCcluster4    0.6390947  3  0.04745134      31      4
## PCcluster2778 0.6327733  6  0.13484688       6   2778
# Same validation result passed to heatmapTable(), again with num.out = 10.
heatmapTable(val_all, num.out = 10)


# Keep only the indices (indexOnly = TRUE) of the signatures selected with
# scoreCutoff = 0.5 and swCutoff = 0. `validated_ind` is what the wordcloud and
# GSEA code later in this file iterates over.
# NOTE(review): presumably the cutoffs are lower bounds on the `score` and `sw`
# columns shown above; the exact filtering rules and defaults are not visible
# here -- confirm against the package documentation.
validated_ind <- validatedSignatures(val_all, scoreCutoff = 0.5, swCutoff = 0, indexOnly = TRUE)
heatmapTable(val_all, scoreCutoff = 0.5, swCutoff = 0)

# Validation plot in interactive mode, called with minClusterSize = 4.
plotValidate(val_all, interactive = TRUE, minClusterSize = 4)

4.1 Wordcloud

# Fix the RNG seed before drawing.
# NOTE(review): presumably the wordcloud layout is randomised -- confirm.
set.seed(1)
# One wordcloud per validated signature index.
for (cluster_ind in validated_ind) {
  drawWordcloud(PCAmodel, cluster_ind)
}

4.2 GSEA

# Run msigdb_gsea() (category "C2") once per validated signature index and
# collect the results in a list named "PCcluster_<index>".
gseaRes_all <- setNames(
  vector(mode = "list", length = length(validated_ind)),
  paste0("PCcluster_", validated_ind)
)

for (k in seq_along(validated_ind)) {
  gseaRes_all[[k]] <- msigdb_gsea(validated_ind[k], PCAmodel, category = "C2")
}
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following object is masked from 'package:PCAGenomicSignatures':
## 
##     metadata
## The following object is masked from 'package:base':
## 
##     expand.grid
## 
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## 
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## Encountered 4 from.IDs with >1 corresponding to.ID
## (the first to.ID was chosen for each of them)
## Excluded 24 from.IDs without a corresponding to.ID
## Registered S3 methods overwritten by 'clusterProfiler.dplyr':
##   method                         from           
##   arrange.compareClusterResult   clusterProfiler
##   arrange.enrichResult           clusterProfiler
##   arrange.gseaResult             clusterProfiler
##   filter.compareClusterResult    clusterProfiler
##   filter.enrichResult            clusterProfiler
##   filter.gseaResult              clusterProfiler
##   group_by.compareClusterResult  clusterProfiler
##   group_by.enrichResult          clusterProfiler
##   group_by.gseaResult            clusterProfiler
##   mutate.compareClusterResult    clusterProfiler
##   mutate.enrichResult            clusterProfiler
##   mutate.gseaResult              clusterProfiler
##   rename.compareClusterResult    clusterProfiler
##   rename.enrichResult            clusterProfiler
##   rename.gseaResult              clusterProfiler
##   select.compareClusterResult    clusterProfiler
##   select.enrichResult            clusterProfiler
##   select.gseaResult              clusterProfiler
##   slice.compareClusterResult     clusterProfiler
##   slice.enrichResult             clusterProfiler
##   slice.gseaResult               clusterProfiler
##   summarise.compareClusterResult clusterProfiler
##   summarise.enrichResult         clusterProfiler
##   summarise.gseaResult           clusterProfiler
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
## gseaParam, : There are duplicate gene names, fgsea may produce unexpected
## results.
## Warning in fgseaMultilevel(...): For some pathways, in reality P-values are less
## than 1e-10. You can set the `eps` argument to zero for better estimation.
## Warning: The `add` argument of `group_by()` is deprecated as of dplyr 1.0.0.
## Please use the `.add` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Encountered 4 from.IDs with >1 corresponding to.ID
## (the first to.ID was chosen for each of them)
## Excluded 24 from.IDs without a corresponding to.ID
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
## gseaParam, : There are duplicate gene names, fgsea may produce unexpected
## results.
## Warning in fgseaMultilevel(...): There were 6 pathways for which P-values were
## not calculated properly due to unbalanced (positive and negative) gene-level
## statistic values.
## Warning in fgseaMultilevel(...): For some of the pathways the P-values were
## likely overestimated. For such pathways log2err is set to NA.
## Encountered 4 from.IDs with >1 corresponding to.ID
## (the first to.ID was chosen for each of them)
## Excluded 24 from.IDs without a corresponding to.ID
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
## gseaParam, : There are duplicate gene names, fgsea may produce unexpected
## results.
## Warning in fgseaMultilevel(...): There were 6 pathways for which P-values were
## not calculated properly due to unbalanced (positive and negative) gene-level
## statistic values.
## Warning in fgseaMultilevel(...): For some of the pathways the P-values were
## likely overestimated. For such pathways log2err is set to NA.
## Warning in fgseaMultilevel(...): For some pathways, in reality P-values are less
## than 1e-10. You can set the `eps` argument to zero for better estimation.
## Encountered 4 from.IDs with >1 corresponding to.ID
## (the first to.ID was chosen for each of them)
## Excluded 24 from.IDs without a corresponding to.ID
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
## gseaParam, : There are duplicate gene names, fgsea may produce unexpected
## results.
## Encountered 4 from.IDs with >1 corresponding to.ID
## (the first to.ID was chosen for each of them)
## Excluded 24 from.IDs without a corresponding to.ID
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
## gseaParam, : There are duplicate gene names, fgsea may produce unexpected
## results.

## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
## gseaParam, : For some pathways, in reality P-values are less than 1e-10. You can
## set the `eps` argument to zero for better estimation.

# vapply() instead of sapply(): the result shape and type are checked, and an
# empty `gseaRes_all` yields an empty matrix/vector rather than `list()`.

# Dimensions (rows, columns) of each GSEA result.
vapply(gseaRes_all, dim, integer(2))
##      PCcluster_1285 PCcluster_52 PCcluster_4751 PCcluster_6189 PCcluster_1934
## [1,]            481           93            264            434           1028
## [2,]             13           13             13             13             13
# Number of IDs matching "COLO" in each result.
vapply(gseaRes_all, pathwaySearch, integer(1), keyword = "COLO")
## PCcluster_1285   PCcluster_52 PCcluster_4751 PCcluster_6189 PCcluster_1934 
##              1              0              1              1              6
# Same search, expressed as a share of all IDs in each result.
vapply(
  gseaRes_all, pathwaySearch, numeric(1),
  keyword = "COLO", proportion = TRUE
)
## PCcluster_1285   PCcluster_52 PCcluster_4751 PCcluster_6189 PCcluster_1934 
##    0.002079002    0.000000000    0.003787879    0.002304147    0.005836576

# For each GSEA result, take the subsetGSEA(n = 20) subset and print it twice:
# once as a bar plot and once as a network.
for (res in gseaRes_all) {
  gseaSub <- subsetGSEA(res, n = 20)
  print(gseaBarplot(gseaRes = gseaSub))
  print(
    gseaNetwork(
      gseaRes = gseaSub,
      similarity_metric = "overlap_similarity",
      similarity_cutoff = 0.3
    )
  )
}

Validate TCGA data with MSigDB C2

2020-08-12

Contents

1 PCAmodel

2 TCGA

3 Extra functions

4 Apply PCAmodel

4.1 Wordcloud

4.2 GSEA