1 PCAmodel

dat_dir <- "~/data2/PCAGenomicSignatureLibrary/refinebioRseq"
# PCAmodel <- readRDS(file.path(dat_dir, "PCAmodel_1399/refinebioRseq_PCAmodel_hclust.rds"))
# PCAmodel <- readRDS(file.path(dat_dir, "PCAmodel_1399_C2/refinebioRseq_PCAmodel_hclust_C2.rds"))
# PCAmodel <- readRDS(file.path(dat_dir, "PCAmodel_536/refinebioRseq_PCAmodel.rds"))
PCAmodel <- readRDS(file.path(dat_dir, "PCAmodel_536_C2/refinebioRseq_PCAmodel.rds"))

2 TCGA

load("~/data2/GenomicSuperSignature/data/TCGA_validationDatasets.rda")
names(TCGA_validationDatasets)
## [1] "COAD" "BRCA" "LUAD" "READ" "OV"
dataset <- TCGA_validationDatasets[[1]]   # COAD

3 Extra functions

pathwaySearch <- function(gseaRes, keyword, proportion = FALSE) {
  names <- as.list(gseaRes$ID) %>% unlist
  pathwayOfInterest <- stringr::str_detect(names, keyword) %>% sum 
  if (isFALSE(proportion)) {
    return(pathwayOfInterest)
  } else {
    prop <- pathwayOfInterest/length(names)
    return(prop)
  }
}

4 Apply PCAmodel

val_all <- validate(dataset, PCAmodel)

validatedSignatures(val_all, num.out = 10)
##                   score PC            sw cl_size cl_num
## PCcluster551  0.7095595  4  0.3961096771       6    551
## PCcluster1277 0.6777265  4  0.4366502443       7   1277
## PCcluster448  0.6400697  3  0.1411814846      15    448
## PCcluster583  0.6102776  3 -0.0593527494      15    583
## PCcluster891  0.6101585  6  0.0658258820       5    891
## PCcluster179  0.6092750  4  0.0264377885       5    179
## PCcluster781  0.6040598  5 -0.0020920251       4    781
## PCcluster739  0.6026471  3  0.1859284441       7    739
## PCcluster115  0.5993255  3  0.0008686181      12    115
## PCcluster259  0.5967371  3 -0.0453980649      10    259
heatmapTable(val_all, num.out = 10)


validated_ind <- validatedSignatures(val_all, scoreCutoff = 0.5, swCutoff = 0, indexOnly = TRUE)
heatmapTable(val_all, scoreCutoff = 0.5, swCutoff = 0)

plotValidate(val_all, interactive = TRUE, minClusterSize = 4)

4.1 Wordcloud

set.seed(1)
for (ind in validated_ind) {
  drawWordcloud(PCAmodel, ind)
}

4.2 GSEA

gseaRes_all <- vector(mode = "list", length = length(validated_ind))
names(gseaRes_all) <- paste0("PCcluster_", validated_ind)

for (i in seq_along(validated_ind)) {
  res <- msigdb_gsea(validated_ind[i], PCAmodel, category = "C2")
  gseaRes_all[[i]] <- res
}
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following object is masked from 'package:PCAGenomicSignatures':
## 
##     metadata
## The following object is masked from 'package:base':
## 
##     expand.grid
## 
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## 
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## Encountered 4 from.IDs with >1 corresponding to.ID
## (the first to.ID was chosen for each of them)
## Excluded 24 from.IDs without a corresponding to.ID
## Registered S3 methods overwritten by 'clusterProfiler.dplyr':
##   method                         from           
##   arrange.compareClusterResult   clusterProfiler
##   arrange.enrichResult           clusterProfiler
##   arrange.gseaResult             clusterProfiler
##   filter.compareClusterResult    clusterProfiler
##   filter.enrichResult            clusterProfiler
##   filter.gseaResult              clusterProfiler
##   group_by.compareClusterResult  clusterProfiler
##   group_by.enrichResult          clusterProfiler
##   group_by.gseaResult            clusterProfiler
##   mutate.compareClusterResult    clusterProfiler
##   mutate.enrichResult            clusterProfiler
##   mutate.gseaResult              clusterProfiler
##   rename.compareClusterResult    clusterProfiler
##   rename.enrichResult            clusterProfiler
##   rename.gseaResult              clusterProfiler
##   select.compareClusterResult    clusterProfiler
##   select.enrichResult            clusterProfiler
##   select.gseaResult              clusterProfiler
##   slice.compareClusterResult     clusterProfiler
##   slice.enrichResult             clusterProfiler
##   slice.gseaResult               clusterProfiler
##   summarise.compareClusterResult clusterProfiler
##   summarise.enrichResult         clusterProfiler
##   summarise.gseaResult           clusterProfiler
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
## gseaParam, : There are duplicate gene names, fgsea may produce unexpected
## results.
## Warning in fgseaMultilevel(...): For some pathways, in reality P-values are less
## than 1e-10. You can set the `eps` argument to zero for better estimation.
## Warning: The `add` argument of `group_by()` is deprecated as of dplyr 1.0.0.
## Please use the `.add` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Encountered 4 from.IDs with >1 corresponding to.ID
## (the first to.ID was chosen for each of them)
## Excluded 24 from.IDs without a corresponding to.ID
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
## gseaParam, : There are duplicate gene names, fgsea may produce unexpected
## results.
## Warning in fgseaMultilevel(...): For some pathways, in reality P-values are less
## than 1e-10. You can set the `eps` argument to zero for better estimation.
## Encountered 4 from.IDs with >1 corresponding to.ID
## (the first to.ID was chosen for each of them)
## Excluded 24 from.IDs without a corresponding to.ID
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
## gseaParam, : There are duplicate gene names, fgsea may produce unexpected
## results.
## Encountered 4 from.IDs with >1 corresponding to.ID
## (the first to.ID was chosen for each of them)
## Excluded 24 from.IDs without a corresponding to.ID
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
## gseaParam, : There are duplicate gene names, fgsea may produce unexpected
## results.
## Warning in fgseaMultilevel(...): There were 16 pathways for which P-values were
## not calculated properly due to unbalanced (positive and negative) gene-level
## statistic values.
## Warning in fgseaMultilevel(...): For some of the pathways the P-values were
## likely overestimated. For such pathways log2err is set to NA.
## Warning in fgseaMultilevel(...): For some pathways, in reality P-values are less
## than 1e-10. You can set the `eps` argument to zero for better estimation.
## Encountered 4 from.IDs with >1 corresponding to.ID
## (the first to.ID was chosen for each of them)
## Excluded 24 from.IDs without a corresponding to.ID
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
## gseaParam, : There are duplicate gene names, fgsea may produce unexpected
## results.

sapply(gseaRes_all, dim)
##      PCcluster_551 PCcluster_1277 PCcluster_448 PCcluster_891 PCcluster_179
## [1,]           366            419            61           362           111
## [2,]            13             13            13            13            13
sapply(gseaRes_all, pathwaySearch, "COLO")
##  PCcluster_551 PCcluster_1277  PCcluster_448  PCcluster_891  PCcluster_179 
##              0              1              0              2              0
sapply(gseaRes_all, pathwaySearch, "COLO", proportion = TRUE)
##  PCcluster_551 PCcluster_1277  PCcluster_448  PCcluster_891  PCcluster_179 
##    0.000000000    0.002386635    0.000000000    0.005524862    0.000000000

for (i in seq_along(gseaRes_all)) {
  res <- gseaRes_all[[i]]
  gseaSub <- subsetGSEA(res, n = 20)
  print(gseaBarplot(gseaRes = gseaSub))
  print(gseaNetwork(gseaRes = gseaSub, similarity_metric = "overlap_similarity", similarity_cutoff = 0.3))
}

Validate TCGA data with MSigDB C2

2020-08-12

Contents

1 PCAmodel

2 TCGA

3 Extra functions

4 Apply PCAmodel

4.1 Wordcloud

4.2 GSEA