suppressPackageStartupMessages({
  library(magrittr)
  library(clusterProfiler)
  library(SummarizedExperiment)
  library(PCAGenomicSignatures)
  library(AnnotationDbi)
  library(org.Hs.eg.db)
  library(DOSE)
  library(msigdbr)
})

1 TCGA-COAD

1.1 Normalization

keyword <- "colo"

TCGA_COAD_Tumor <- readRDS("~/data2/GenomicSuperSignature/data/TCGA_COAD_Tumor.rds")
tcga_coad_raw <- assay(TCGA_COAD_Tumor)
a <- scater::calculateTPM(tcga_coad_raw)   # raw counts to TPM
b <- log2(a + 1)
b <- b[-which(rowSums(b) == 0),]   # remove 569 non-expressing genes from 23,368 genes
c <- rmNaInf(b)
d <- rowNorm(c)

1.2 PCA

rownames(b) <- AnnotationDbi::mapIds(org.Hs.eg.db, keys = rownames(b),
                                     column = "ENTREZID", keytype = "SYMBOL")
e <- prcomp(t(b))
f <- prcomp(t(d))

PCARes <- e
PCNum <- 1

## Target geneList
geneList <- PCARes$rotation[,PCNum]   
geneList <- sort(geneList, decreasing = TRUE)

2 WikiPathways analysis

wpgmtfile <- "~/data2/Genomic_Super_Signature/GSEA/data/wikipathways-20200510-gmt-Homo_sapiens.gmt"
wp2gene <- read.gmt(wpgmtfile)
wp2gene <- wp2gene %>% tidyr::separate(term, c("name","version","wpid","org"), "%")
wpid2gene <- wp2gene %>% dplyr::select(wpid, gene) #TERM2GENE
wpid2name <- wp2gene %>% dplyr::select(wpid, name) #TERM2NAME

## GSEA with wikiPathways
wp2 <- GSEA(geneList, TERM2GENE = wpid2gene, TERM2NAME = wpid2name, verbose = FALSE)

## wikiPathways containing keyword
length(grep(keyword, unique(wpid2name$name), ignore.case = TRUE))

## [1] 4

unique(wpid2name$name)[grep(keyword, unique(wpid2name$name), ignore.case = TRUE)]

## [1] "LncRNA involvement in canonical Wnt signaling and colorectal cancer"
## [2] "Metabolic reprogramming in colon cancer"                            
## [3] "Chromosomal and microsatellite instability in colorectal cancer "   
## [4] "Epithelial to mesenchymal transition in colorectal cancer"

## wikiGSEA output containing keyword
dim(wp2)

## [1] 80 11

grep(keyword, wp2$Description, ignore.case = TRUE)

## [1] 10

wp2$Description[grep(keyword, wp2$Description, ignore.case = TRUE)]

## [1] "Metabolic reprogramming in colon cancer"

3 MSigDB C2

library(msigdbr)
m_c2 <- msigdbr(species = "Homo sapiens", category = "C2") %>% 
  dplyr::select(gs_name, entrez_gene)
length(unique(m_c2$gs_name))  # the number of unique gene sets in MSigDB C2 database
## [1] 5529
length(grep(keyword, unique(m_c2$gs_name), ignore.case = TRUE))  # the number of keyword containing gene sets
## [1] 29

msC2_2 <- GSEA(geneList, TERM2GENE = m_c2, pvalueCutoff = 0.05)

dim(msC2_2)
## [1] 1434   11

3.1 Sorted by NES

MSigDB C2 that contain the keyword (ordered by NES).

## [1] "The number of enriched pathways = 1434"

## [1] "The number of enriched pathways containing keyword = 8"

Index containing the keyword

ind2
## [1]  106  384  797 1021 1044 1097 1172 1309

3.2 Sorted by qvalues

MSigDB C2 that contain the keyword (ordered by qvalues).

## [1] "The number of enriched pathways = 1434"

## [1] "The number of enriched pathways containing keyword = 8"

Index containing the keyword

ind2
## [1]  126  204  262  561  620  933 1101 1184

4 GO GSEA

Gene Set Enrichment Analysis of Gene Ontology

library(clusterProfiler)
ego2 <- gseGO(geneList     = geneList,
              OrgDb        = org.Hs.eg.db,
              ont          = "BP",
              nPerm        = 1000,
              minGSSize    = 50,
              maxGSSize    = 500,
              pvalueCutoff = 0.01,
              verbose      = FALSE)

nrow(ego2)
## [1] 0
# grep(keyword, ego2$Description, ignore.case = TRUE)

Effect of normalization on GSEA

2020-08-18