suppressPackageStartupMessages({
library(magrittr)
library(clusterProfiler)
library(SummarizedExperiment)
library(PCAGenomicSignatures)
library(AnnotationDbi)
library(org.Hs.eg.db)
library(DOSE)
library(msigdbr)
})
# Top 90% varying genes from 1,399 studies
topGenes_7951 <- readRDS("~/data2/Genomic_Super_Signature/refinebio/data/topGenes_7951.rds")
# Top 90% varying genes from 536 studies
topGenes_13934 <- readRDS("~/data2/Genomic_Super_Signature/refinebio/data/topGenes_13934.rds")
keyword <- "colo"
TCGA_COAD_Tumor <- readRDS("~/data2/GenomicSuperSignature/data/TCGA_COAD_Tumor.rds")
tcga_coad_raw <- assay(TCGA_COAD_Tumor)
a <- scater::calculateTPM(tcga_coad_raw) # raw counts to TPM
b <- log2(a + 1)
b <- b[-which(rowSums(b) == 0),] # remove 569 non-expressing genes from 23,368 genes
b <- b[intersect(rownames(b), topGenes_13934),] # subset genes
rownames(b) <- AnnotationDbi::mapIds(org.Hs.eg.db, keys = rownames(b),
column = "ENTREZID", keytype = "SYMBOL")
c <- rmNaInf(b)
d <- rowNorm(c)
e <- prcomp(t(b))
f <- prcomp(t(d))
PCARes <- e
PCNum <- 2
## Target geneList
geneList <- PCARes$rotation[,PCNum]
geneList <- sort(geneList, decreasing = TRUE)
wpgmtfile <- "~/data2/Genomic_Super_Signature/GSEA/data/wikipathways-20200510-gmt-Homo_sapiens.gmt"
wp2gene <- read.gmt(wpgmtfile)
wp2gene <- wp2gene %>% tidyr::separate(term, c("name","version","wpid","org"), "%")
wpid2gene <- wp2gene %>% dplyr::select(wpid, gene) #TERM2GENE
wpid2name <- wp2gene %>% dplyr::select(wpid, name) #TERM2NAME
## GSEA with wikiPathways
wp2 <- GSEA(geneList, TERM2GENE = wpid2gene, TERM2NAME = wpid2name, verbose = FALSE,
pvalueCutoff = 0.05, minGSSize = 10, maxGSSize = 500)
## wikiPathways containing keyword
length(grep(keyword, unique(wpid2name$name), ignore.case = TRUE))
## [1] 4
unique(wpid2name$name)[grep(keyword, unique(wpid2name$name), ignore.case = TRUE)]
## [1] "LncRNA involvement in canonical Wnt signaling and colorectal cancer"
## [2] "Metabolic reprogramming in colon cancer"
## [3] "Chromosomal and microsatellite instability in colorectal cancer "
## [4] "Epithelial to mesenchymal transition in colorectal cancer"
## wikiGSEA output containing keyword
dim(wp2)
## [1] 90 11
grep(keyword, wp2$Description, ignore.case = TRUE)
## integer(0)
wp2$Description[grep(keyword, wp2$Description, ignore.case = TRUE)]
## character(0)
library(msigdbr)
m_c2 <- msigdbr(species = "Homo sapiens", category = "C2") %>%
dplyr::select(gs_name, entrez_gene)
length(unique(m_c2$gs_name)) # the number of unique gene sets in MSigDB C2 database
## [1] 5529
length(grep(keyword, unique(m_c2$gs_name), ignore.case = TRUE)) # the number of keyword containing gene sets
## [1] 29
msC2_2 <- GSEA(geneList, TERM2GENE = m_c2, pvalueCutoff = 0.05)
dim(msC2_2)
## [1] 1412 11
MSigDB C2 that contain the keyword (ordered by NES).
## [1] "The number of enriched pathways = 1412"
## [1] "The number of enriched pathways containing keyword = 7"
Index containing the keyword
ind2
## [1] 45 426 670 782 843 1305 1405
MSigDB C2 that contain the keyword (ordered by qvalues).
## [1] "The number of enriched pathways = 1412"
## [1] "The number of enriched pathways containing keyword = 7"
Index containing the keyword
ind2
## [1] 128 159 185 793 1081 1155 1239
Gene Set Enrichment Analysis of Gene Ontology
library(clusterProfiler)
ego2 <- gseGO(geneList = geneList,
OrgDb = org.Hs.eg.db,
ont = "BP",
nPerm = 1000,
minGSSize = 50,
maxGSSize = 500,
pvalueCutoff = 0.01,
verbose = FALSE)
nrow(ego2)
## [1] 544
# grep(keyword, ego2$Description, ignore.case = TRUE)