1 PCAmodel

dat_dir <- "~/data2/PCAGenomicSignatureLibrary/refinebioRseq"
# PCAmodel <- readRDS(file.path(dat_dir, "PCAmodel_1399/refinebioRseq_PCAmodel_hclust.rds"))
# PCAmodel <- readRDS(file.path(dat_dir, "PCAmodel_1399_C2/refinebioRseq_PCAmodel_hclust_C2.rds"))
PCAmodel <- readRDS(file.path(dat_dir, "PCAmodel_536/refinebioRseq_PCAmodel.rds"))
# PCAmodel <- readRDS(file.path(dat_dir, "PCAmodel_536_C2/refinebioRseq_PCAmodel.rds"))

2 TCGA

load("~/data2/GenomicSuperSignature/data/TCGA_validationDatasets.rda")
names(TCGA_validationDatasets)
## [1] "COAD" "BRCA" "LUAD" "READ" "OV"
dataset <- TCGA_validationDatasets[[1]]   # COAD

3 Extra functions

pathwaySearch <- function(gseaRes, keyword, proportion = FALSE) {
  names <- as.list(gseaRes$ID) %>% unlist
  pathwayOfInterest <- stringr::str_detect(names, keyword) %>% sum 
  if (isFALSE(proportion)) {
    return(pathwayOfInterest)
  } else {
    prop <- pathwayOfInterest/length(names)
    return(prop)
  }
}

4 Apply PCAmodel

val_all <- validate(dataset, PCAmodel)

validatedSignatures(val_all, num.out = 10)
##                  score PC           sw cl_size cl_num
## PCcluster312 0.7440772  3 -0.066746154      24    312
## PCcluster832 0.7273877  4 -0.026991654       5    832
## PCcluster188 0.6837048  4  0.066205326       6    188
## PCcluster468 0.6771226  3  0.124024704       7    468
## PCcluster21  0.6685406  3  0.207640365       4     21
## PCcluster119 0.6591724  3 -0.006215226      13    119
## PCcluster684 0.6588895  3 -0.064556198      19    684
## PCcluster579 0.6586892  4  0.200188957       6    579
## PCcluster192 0.6512816  2 -0.040692944       7    192
## PCcluster758 0.6415406  3 -0.071998965      10    758
heatmapTable(val_all, num.out = 10)


validated_ind <- validatedSignatures(val_all, scoreCutoff = 0.5, swCutoff = 0, indexOnly = TRUE)
heatmapTable(val_all, scoreCutoff = 0.5, swCutoff = 0)

plotValidate(val_all, interactive = TRUE, minClusterSize = 4)

4.1 Wordcloud

set.seed(1)
for (ind in validated_ind) {
  drawWordcloud(PCAmodel, ind)
}

4.2 GSEA

gseaRes_all <- vector(mode = "list", length = length(validated_ind))
names(gseaRes_all) <- paste0("PCcluster_", validated_ind)

for (i in seq_along(validated_ind)) {
  res <- msigdb_gsea(validated_ind[i], PCAmodel, category = "C2")
  gseaRes_all[[i]] <- res
}
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following object is masked from 'package:PCAGenomicSignatures':
## 
##     metadata
## The following object is masked from 'package:base':
## 
##     expand.grid
## 
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## 
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## Encountered 1 from.IDs with >1 corresponding to.ID
## (the first to.ID was chosen for each of them)
## Excluded 2744 from.IDs without a corresponding to.ID
## Registered S3 methods overwritten by 'clusterProfiler.dplyr':
##   method                         from           
##   arrange.compareClusterResult   clusterProfiler
##   arrange.enrichResult           clusterProfiler
##   arrange.gseaResult             clusterProfiler
##   filter.compareClusterResult    clusterProfiler
##   filter.enrichResult            clusterProfiler
##   filter.gseaResult              clusterProfiler
##   group_by.compareClusterResult  clusterProfiler
##   group_by.enrichResult          clusterProfiler
##   group_by.gseaResult            clusterProfiler
##   mutate.compareClusterResult    clusterProfiler
##   mutate.enrichResult            clusterProfiler
##   mutate.gseaResult              clusterProfiler
##   rename.compareClusterResult    clusterProfiler
##   rename.enrichResult            clusterProfiler
##   rename.gseaResult              clusterProfiler
##   select.compareClusterResult    clusterProfiler
##   select.enrichResult            clusterProfiler
##   select.gseaResult              clusterProfiler
##   slice.compareClusterResult     clusterProfiler
##   slice.enrichResult             clusterProfiler
##   slice.gseaResult               clusterProfiler
##   summarise.compareClusterResult clusterProfiler
##   summarise.enrichResult         clusterProfiler
##   summarise.gseaResult           clusterProfiler
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
## gseaParam, : There are duplicate gene names, fgsea may produce unexpected
## results.
## no term enriched under specific pvalueCutoff...
## Warning: The `add` argument of `group_by()` is deprecated as of dplyr 1.0.0.
## Please use the `.add` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Encountered 1 from.IDs with >1 corresponding to.ID
## (the first to.ID was chosen for each of them)
## Excluded 2744 from.IDs without a corresponding to.ID
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
## gseaParam, : There are duplicate gene names, fgsea may produce unexpected
## results.
## Encountered 1 from.IDs with >1 corresponding to.ID
## (the first to.ID was chosen for each of them)
## Excluded 2744 from.IDs without a corresponding to.ID
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
## gseaParam, : There are duplicate gene names, fgsea may produce unexpected
## results.
## Warning in fgseaMultilevel(...): For some of the pathways the P-values were
## likely overestimated. For such pathways log2err is set to NA.
## no term enriched under specific pvalueCutoff...
## Encountered 1 from.IDs with >1 corresponding to.ID
## (the first to.ID was chosen for each of them)
## Excluded 2744 from.IDs without a corresponding to.ID
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
## gseaParam, : There are duplicate gene names, fgsea may produce unexpected
## results.
## Encountered 1 from.IDs with >1 corresponding to.ID
## (the first to.ID was chosen for each of them)
## Excluded 2744 from.IDs without a corresponding to.ID
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
## gseaParam, : There are duplicate gene names, fgsea may produce unexpected
## results.
## no term enriched under specific pvalueCutoff...

sapply(gseaRes_all, dim)
##      PCcluster_188 PCcluster_468 PCcluster_21 PCcluster_579 PCcluster_1166
## [1,]             0            65            0           616              0
## [2,]            10            13           10            13             10
sapply(gseaRes_all, pathwaySearch, "COLO")
##  PCcluster_188  PCcluster_468   PCcluster_21  PCcluster_579 PCcluster_1166 
##              0              1              0              2              0
sapply(gseaRes_all, pathwaySearch, "COLO", proportion = TRUE)
##  PCcluster_188  PCcluster_468   PCcluster_21  PCcluster_579 PCcluster_1166 
##            NaN    0.015384615            NaN    0.003246753            NaN

for (i in seq_along(gseaRes_all)) {
  res <- gseaRes_all[[i]]
  gseaSub <- subsetGSEA(res, n = 20)
  print(gseaBarplot(gseaRes = gseaSub))
  print(gseaNetwork(gseaRes = gseaSub, similarity_metric = "overlap_similarity", similarity_cutoff = 0.3))
}
## [1] "No pathway is enriched"
## [1] "No pathway is enriched"

## [1] "No pathway is enriched"
## [1] "No pathway is enriched"

## [1] "No pathway is enriched"
## [1] "No pathway is enriched"

Validate TCGA data with MSigDB C2

2020-08-12

Contents

1 PCAmodel

2 TCGA

3 Extra functions

4 Apply PCAmodel

4.1 Wordcloud

4.2 GSEA