suppressPackageStartupMessages({
  library(magrittr)
  library(clusterProfiler)
  library(SummarizedExperiment)
  library(PCAGenomicSignatures)
  library(AnnotationDbi)
  library(org.Hs.eg.db)
  library(DOSE)
  library(msigdbr)
})

1 TCGA-COAD

keyword <- "colo"

TCGA_COAD_Tumor <- readRDS("~/data2/GenomicSuperSignature/data/TCGA_COAD_Tumor.rds")
tcga_coad_raw <- assay(TCGA_COAD_Tumor)
a <- scater::calculateTPM(tcga_coad_raw)   # raw counts to TPM
b <- log2(a + 1)
b <- b[-which(rowSums(b) == 0),]   # remove 569 non-expressing genes from 23,368 genes
c <- rmNaInf(b)
d <- rowNorm(c)
par(mfrow = c(2,2))
hist(a, main = "a. TPM")
hist(b, main = "b. log2(TPM+1)")
hist(c, main = "c. rmNaInf")
hist(d, main = "d. row normalized")

geneList_b <- rowSums(b)[order(rowSums(b), decreasing = TRUE)]
geneList_d <- rowSums(d)[order(rowSums(d), decreasing = TRUE)]

names(geneList_b) <- AnnotationDbi::mapIds(org.Hs.eg.db, keys = names(geneList_b), 
                                           column = "ENTREZID", keytype = "SYMBOL")
names(geneList_d) <- AnnotationDbi::mapIds(org.Hs.eg.db, keys = names(geneList_d), 
                                           column = "ENTREZID", keytype = "SYMBOL")
geneList <- geneList_b

2 WikiPathways analysis

wpgmtfile <- "~/data2/Genomic_Super_Signature/GSEA/data/wikipathways-20200510-gmt-Homo_sapiens.gmt"
wp2gene <- read.gmt(wpgmtfile)
wp2gene <- wp2gene %>% tidyr::separate(term, c("name","version","wpid","org"), "%")
wpid2gene <- wp2gene %>% dplyr::select(wpid, gene) #TERM2GENE
wpid2name <- wp2gene %>% dplyr::select(wpid, name) #TERM2NAME

## GSEA with wikiPathways
wp2 <- GSEA(geneList, TERM2GENE = wpid2gene, TERM2NAME = wpid2name, verbose = FALSE)
## wikiPathways containing keyword
length(grep(keyword, unique(wpid2name$name), ignore.case = TRUE))
## [1] 4
unique(wpid2name$name)[grep(keyword, unique(wpid2name$name), ignore.case = TRUE)]
## [1] "LncRNA involvement in canonical Wnt signaling and colorectal cancer"
## [2] "Metabolic reprogramming in colon cancer"                            
## [3] "Chromosomal and microsatellite instability in colorectal cancer "   
## [4] "Epithelial to mesenchymal transition in colorectal cancer"
## wikiGSEA output containing keyword
dim(wp2)
## [1] 207  11
grep(keyword, wp2$Description, ignore.case = TRUE)
## [1]  12 133 151
wp2$Description[grep(keyword, wp2$Description, ignore.case = TRUE)]
## [1] "Metabolic reprogramming in colon cancer"                         
## [2] "Epithelial to mesenchymal transition in colorectal cancer"       
## [3] "Chromosomal and microsatellite instability in colorectal cancer "

3 MSigDB C2

library(msigdbr)
m_c2 <- msigdbr(species = "Homo sapiens", category = "C2") %>% 
  dplyr::select(gs_name, entrez_gene)
length(unique(m_c2$gs_name))  # the number of unique gene sets in MSigDB C2 database
## [1] 5529
length(grep(keyword, unique(m_c2$gs_name), ignore.case = TRUE))  # the number of keyword containing gene sets
## [1] 29

msC2_2 <- GSEA(geneList, TERM2GENE = m_c2, pvalueCutoff = 0.05)
dim(msC2_2)
## [1] 2080   11

3.1 Sorted by NES

MSigDB C2 that contain the keyword (ordered by NES).

## [1] "The number of enriched pathways = 2080"
## [1] "The number of enriched pathways containing keyword = 7"

Index containing the keyword

ind2
## [1]  188  261  554  974 1173 1382 1933

3.2 Sorted by qvalues

MSigDB C2 that contain the keyword (ordered by qvalues).

## [1] "The number of enriched pathways = 2080"
## [1] "The number of enriched pathways containing keyword = 7"

Index containing the keyword

ind2
## [1]   41  218  474 1179 1344 1896 1963

4 GO GSEA

Gene Set Enrichment Analysis of Gene Ontology

library(clusterProfiler)
ego2 <- gseGO(geneList     = geneList,
              OrgDb        = org.Hs.eg.db,
              ont          = "BP",
              nPerm        = 1000,
              minGSSize    = 50,
              maxGSSize    = 500,
              pvalueCutoff = 0.01,
              verbose      = FALSE)
nrow(ego2)
## [1] 987
# grep(keyword, ego2$Description, ignore.case = TRUE)