suppressPackageStartupMessages({
library(magrittr)
library(clusterProfiler)
library(SummarizedExperiment)
library(PCAGenomicSignatures)
library(AnnotationDbi)
library(org.Hs.eg.db)
library(DOSE)
library(msigdbr)
})
keyword <- "colo"
TCGA_COAD_Tumor <- readRDS("~/data2/GenomicSuperSignature/data/TCGA_COAD_Tumor.rds")
tcga_coad_raw <- assay(TCGA_COAD_Tumor)
a <- scater::calculateTPM(tcga_coad_raw) # raw counts to TPM
b <- log2(a + 1)
b <- b[-which(rowSums(b) == 0),] # remove 569 non-expressing genes from 23,368 genes
c <- rmNaInf(b)
d <- rowNorm(c)
par(mfrow = c(2,2))
hist(a, main = "a. TPM")
hist(b, main = "b. log2(TPM+1)")
hist(c, main = "c. rmNaInf")
hist(d, main = "d. row normalized")
geneList_b <- rowSums(b)[order(rowSums(b), decreasing = TRUE)]
geneList_d <- rowSums(d)[order(rowSums(d), decreasing = TRUE)]
names(geneList_b) <- AnnotationDbi::mapIds(org.Hs.eg.db, keys = names(geneList_b),
column = "ENTREZID", keytype = "SYMBOL")
names(geneList_d) <- AnnotationDbi::mapIds(org.Hs.eg.db, keys = names(geneList_d),
column = "ENTREZID", keytype = "SYMBOL")
geneList <- geneList_b
wpgmtfile <- "~/data2/Genomic_Super_Signature/GSEA/data/wikipathways-20200510-gmt-Homo_sapiens.gmt"
wp2gene <- read.gmt(wpgmtfile)
wp2gene <- wp2gene %>% tidyr::separate(term, c("name","version","wpid","org"), "%")
wpid2gene <- wp2gene %>% dplyr::select(wpid, gene) #TERM2GENE
wpid2name <- wp2gene %>% dplyr::select(wpid, name) #TERM2NAME
## GSEA with wikiPathways
wp2 <- GSEA(geneList, TERM2GENE = wpid2gene, TERM2NAME = wpid2name, verbose = FALSE)
## wikiPathways containing keyword
length(grep(keyword, unique(wpid2name$name), ignore.case = TRUE))
## [1] 4
unique(wpid2name$name)[grep(keyword, unique(wpid2name$name), ignore.case = TRUE)]
## [1] "LncRNA involvement in canonical Wnt signaling and colorectal cancer"
## [2] "Metabolic reprogramming in colon cancer"
## [3] "Chromosomal and microsatellite instability in colorectal cancer "
## [4] "Epithelial to mesenchymal transition in colorectal cancer"
## wikiGSEA output containing keyword
dim(wp2)
## [1] 207 11
grep(keyword, wp2$Description, ignore.case = TRUE)
## [1] 12 133 151
wp2$Description[grep(keyword, wp2$Description, ignore.case = TRUE)]
## [1] "Metabolic reprogramming in colon cancer"
## [2] "Epithelial to mesenchymal transition in colorectal cancer"
## [3] "Chromosomal and microsatellite instability in colorectal cancer "
library(msigdbr)
m_c2 <- msigdbr(species = "Homo sapiens", category = "C2") %>%
dplyr::select(gs_name, entrez_gene)
length(unique(m_c2$gs_name)) # the number of unique gene sets in MSigDB C2 database
## [1] 5529
length(grep(keyword, unique(m_c2$gs_name), ignore.case = TRUE)) # the number of keyword containing gene sets
## [1] 29
msC2_2 <- GSEA(geneList, TERM2GENE = m_c2, pvalueCutoff = 0.05)
dim(msC2_2)
## [1] 2080 11
MSigDB C2 that contain the keyword (ordered by NES).
## [1] "The number of enriched pathways = 2080"
## [1] "The number of enriched pathways containing keyword = 7"
Index containing the keyword
ind2
## [1] 188 261 554 974 1173 1382 1933
MSigDB C2 that contain the keyword (ordered by qvalues).
## [1] "The number of enriched pathways = 2080"
## [1] "The number of enriched pathways containing keyword = 7"
Index containing the keyword
ind2
## [1] 41 218 474 1179 1344 1896 1963
Gene Set Enrichment Analysis of Gene Ontology
library(clusterProfiler)
ego2 <- gseGO(geneList = geneList,
OrgDb = org.Hs.eg.db,
ont = "BP",
nPerm = 1000,
minGSSize = 50,
maxGSSize = 500,
pvalueCutoff = 0.01,
verbose = FALSE)
nrow(ego2)
## [1] 987
# grep(keyword, ego2$Description, ignore.case = TRUE)