suppressPackageStartupMessages({
library(magrittr)
library(clusterProfiler)
library(SummarizedExperiment)
library(PCAGenomicSignatures)
library(AnnotationDbi)
library(org.Hs.eg.db)
library(DOSE)
library(msigdbr)
})
keyword <- "colo"
TCGA_COAD_Tumor <- readRDS("~/data2/GenomicSuperSignature/data/TCGA_COAD_Tumor.rds")
tcga_coad_raw <- assay(TCGA_COAD_Tumor)
a <- scater::calculateTPM(tcga_coad_raw) # raw counts to TPM
b <- log2(a + 1)
b <- b[-which(rowSums(b) == 0),] # remove 569 non-expressing genes from 23,368 genes
c <- rmNaInf(b)
d <- rowNorm(c)
rownames(b) <- AnnotationDbi::mapIds(org.Hs.eg.db, keys = rownames(b),
column = "ENTREZID", keytype = "SYMBOL")
e <- prcomp(t(b))
f <- prcomp(t(d))
PCARes <- e
PCNum <- 1
## Target geneList
geneList <- PCARes$rotation[,PCNum]
geneList <- sort(geneList, decreasing = TRUE)
wpgmtfile <- "~/data2/Genomic_Super_Signature/GSEA/data/wikipathways-20200510-gmt-Homo_sapiens.gmt"
wp2gene <- read.gmt(wpgmtfile)
wp2gene <- wp2gene %>% tidyr::separate(term, c("name","version","wpid","org"), "%")
wpid2gene <- wp2gene %>% dplyr::select(wpid, gene) #TERM2GENE
wpid2name <- wp2gene %>% dplyr::select(wpid, name) #TERM2NAME
## GSEA with wikiPathways
wp2 <- GSEA(geneList, TERM2GENE = wpid2gene, TERM2NAME = wpid2name, verbose = FALSE)
## wikiPathways containing keyword
length(grep(keyword, unique(wpid2name$name), ignore.case = TRUE))
## [1] 4
unique(wpid2name$name)[grep(keyword, unique(wpid2name$name), ignore.case = TRUE)]
## [1] "LncRNA involvement in canonical Wnt signaling and colorectal cancer"
## [2] "Metabolic reprogramming in colon cancer"
## [3] "Chromosomal and microsatellite instability in colorectal cancer "
## [4] "Epithelial to mesenchymal transition in colorectal cancer"
## wikiGSEA output containing keyword
dim(wp2)
## [1] 80 11
grep(keyword, wp2$Description, ignore.case = TRUE)
## [1] 10
wp2$Description[grep(keyword, wp2$Description, ignore.case = TRUE)]
## [1] "Metabolic reprogramming in colon cancer"
library(msigdbr)
m_c2 <- msigdbr(species = "Homo sapiens", category = "C2") %>%
dplyr::select(gs_name, entrez_gene)
length(unique(m_c2$gs_name)) # the number of unique gene sets in MSigDB C2 database
## [1] 5529
length(grep(keyword, unique(m_c2$gs_name), ignore.case = TRUE)) # the number of keyword containing gene sets
## [1] 29
msC2_2 <- GSEA(geneList, TERM2GENE = m_c2, pvalueCutoff = 0.05)
dim(msC2_2)
## [1] 1434 11
MSigDB C2 that contain the keyword (ordered by NES).
## [1] "The number of enriched pathways = 1434"
## [1] "The number of enriched pathways containing keyword = 8"
Index containing the keyword
ind2
## [1] 106 384 797 1021 1044 1097 1172 1309
MSigDB C2 that contain the keyword (ordered by qvalues).
## [1] "The number of enriched pathways = 1434"
## [1] "The number of enriched pathways containing keyword = 8"
Index containing the keyword
ind2
## [1] 126 204 262 561 620 933 1101 1184
Gene Set Enrichment Analysis of Gene Ontology
library(clusterProfiler)
ego2 <- gseGO(geneList = geneList,
OrgDb = org.Hs.eg.db,
ont = "BP",
nPerm = 1000,
minGSSize = 50,
maxGSSize = 500,
pvalueCutoff = 0.01,
verbose = FALSE)
nrow(ego2)
## [1] 0
# grep(keyword, ego2$Description, ignore.case = TRUE)