This R Notebook describes the pipeline for RNA-seq analysis with differential expression (DESeq), ORA (Over-Representation) and GSEA (Gene Set Enrichment Analysis).
First, we need to create two data frames: (1)
GSE123564_data
with the expression data and the sample
names (control vs mutant); and (2) a GSE123564_metadata
containing meta data with the sample information. Both data are
available at Gene
Expression Omnibus GSE123564
library(DESeq2)
library(dplyr)
library(ggpubr)
library(pheatmap)
library(stringr)
# Data analysis on GEO: GSE123564 - retina from patients
# with Hbs1L deficiency(retinal development with delayed
# development and retinal pigmentary deposits)
patient1_GSE123564 <- read.delim("/home/bad23/Documents/R-studies/Data_Analysis/Bioinformatic-Analysis-DSMZ/GSE123564/Samples/GSM3507209_patient_rep1_mRNA-seq.counts.txt",
header = T, sep = "")
patient2_GSE123564 <- read.delim("/home/bad23/Documents/R-studies/Data_Analysis/Bioinformatic-Analysis-DSMZ/GSE123564/Samples/GSM3507210_patient_rep2_mRNA-seq.counts.txt",
header = T, sep = "")
patient3_GSE123564 <- read.delim("/home/bad23//Documents/R-studies/Data_Analysis/Bioinformatic-Analysis-DSMZ/GSE123564/Samples/GSM3507211_patient_rep3_mRNA-seq.counts.txt",
header = T, sep = "")
patient4_GSE123564 <- read.delim("/home/bad23//Documents/R-studies/Data_Analysis/Bioinformatic-Analysis-DSMZ/GSE123564/Samples/GSM3507212_patient_rep4_mRNA-seq.counts.txt",
header = T, sep = "")
patient5_GSE123564 <- read.delim("/home/bad23/Documents/R-studies/Data_Analysis/Bioinformatic-Analysis-DSMZ/GSE123564/Samples/GSM3507212_patient_rep4_mRNA-seq.counts.txt",
header = T, sep = "")
patient6_GSE123564 <- read.delim("/home/bad23/Documents/R-studies/Data_Analysis/Bioinformatic-Analysis-DSMZ/GSE123564/Samples/GSM3507213_patient_rep5_mRNA-seq.counts.txt",
header = T, sep = "")
healthy1_GSE123564 <- read.delim("/home/bad23/Documents/R-studies/Data_Analysis/Bioinformatic-Analysis-DSMZ/GSE123564/Samples/GSM3507215_healthy_individual-1_rep1_mRNA-seq.counts.txt",
header = T, sep = "")
healthy2_GSE123564 <- read.delim("/home/bad23/Documents/R-studies/Data_Analysis/Bioinformatic-Analysis-DSMZ/GSE123564/Samples/GSM3507216_healthy_individual-1_rep2_mRNA-seq.counts.txt",
header = T, sep = "")
healthy3_GSE123564 <- read.delim("/home/bad23/Documents/R-studies/Data_Analysis/Bioinformatic-Analysis-DSMZ/GSE123564/Samples/GSM3507217_healthy_individual-1_rep3_mRNA-seq.counts.txt",
header = T, sep = "")
healthy4_GSE123564 <- read.delim("/home/bad23/Documents/R-studies/Data_Analysis/Bioinformatic-Analysis-DSMZ/GSE123564/Samples/GSM3507218_healthy_individual-2_rep1_mRNA-seq.counts.txt",
header = T, sep = "")
healthy5_GSE123564 <- read.delim("/home/bad23/Documents/R-studies/Data_Analysis/Bioinformatic-Analysis-DSMZ/GSE123564/Samples/GSM3507219_healthy_individual-2_rep2_mRNA-seq.counts.txt",
header = T, sep = "")
healthy6_GSE123564 <- read.delim("/home/bad23/Documents/R-studies/Data_Analysis/Bioinformatic-Analysis-DSMZ/GSE123564/Samples/GSM3507220_healthy_individual-2_rep3_mRNA-seq.counts.txt",
header = T, sep = "")
GSE123564_genes <- patient1_GSE123564[2:nrow(patient1_GSE123564),
1]
GSE123564_data <- cbind(patient1_GSE123564[, c(1, 7)], patient2_GSE123564[,
7], patient3_GSE123564[, 7], patient4_GSE123564[, 7], patient5_GSE123564[,
7], patient6_GSE123564[, 7], healthy1_GSE123564[, 7], healthy2_GSE123564[,
7], healthy3_GSE123564[, 7], healthy4_GSE123564[, 7], healthy5_GSE123564[,
7], healthy6_GSE123564[, 7])
colnames(GSE123564_data) <- c("gene_id", "mutation", "mutation",
"mutation", "mutation", "mutation", "mutation", "control",
"control", "control", "control", "control", "control")
GSE123564_data <- GSE123564_data[2:nrow(GSE123564_data), ]
GSE123564_data <- matrix(as.numeric((unlist(GSE123564_data[,
2:13]))), ncol = ncol(GSE123564_data[, 2:13]))
colnames(GSE123564_data) <- c("Patient_1", "Patient_2", "Patient_3",
"Patient_4", "Patient_5", "Patient_6", "Healthy_1", "Healthy_2",
"Healthy_3", "Healthy_4", "Healthy_5", "Healthy_6")
GSE123564_metadata <- data.frame(Condition = c("Mutant", "Mutant",
"Mutant", "Mutant", "Mutant", "Mutant", "Control", "Control",
"Control", "Control", "Control", "Control"), Sample = colnames(GSE123564_data))
head(GSE123564_data)
## Patient_1 Patient_2 Patient_3 Patient_4 Patient_5 Patient_6 Healthy_1
## [1,] 0 0 0 0 0 0 0
## [2,] 0 0 0 0 0 0 0
## [3,] 0 0 0 0 0 0 0
## [4,] 1410 1928 1426 1299 1299 1130 820
## [5,] 2611 4338 4775 3960 3960 4276 2626
## [6,] 256 413 390 429 429 571 201
## Healthy_2 Healthy_3 Healthy_4 Healthy_5 Healthy_6
## [1,] 0 0 0 0 0
## [2,] 0 0 0 0 0
## [3,] 0 0 0 0 0
## [4,] 498 478 166 387 556
## [5,] 2168 2084 613 1853 2295
## [6,] 151 144 75 226 278
## Condition Sample
## 1 Mutant Patient_1
## 2 Mutant Patient_2
## 3 Mutant Patient_3
## 4 Mutant Patient_4
## 5 Mutant Patient_5
## 6 Mutant Patient_6
cts
and coldata
tables.To run the statistical analysis, we need to create a
numeric cts
data frame, and a metadata
table coldata
. Both tables should have important params to
further be used as input for DESeq2.
cts <- matrix(as.numeric(GSE123564_data), ncol = ncol(GSE123564_data))
row.names(cts) <- GSE123564_genes
class(cts)
## [1] "matrix" "array"
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## log2 fold change (MLE): Condition Mutant vs Control
## Wald test p-value: Condition Mutant vs Control
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## OR4F5 0.000 NA NA NA NA NA
## OR4F29 0.000 NA NA NA NA NA
## OR4F16 0.000 NA NA NA NA NA
## SAMD11 850.774 1.079469 0.1701953 6.34253 2.26018e-10 7.63934e-10
## NOC2L 2674.022 0.511913 0.0696001 7.35506 1.90845e-13 7.87757e-13
## KLHL17 272.819 0.592730 0.2213172 2.67819 7.40206e-03 1.21827e-02
## converting counts to integer mode
## 1 2 3 4 5 6 7
## OR4F5 6.304496 6.304496 6.304496 6.304496 6.304496 6.304496 6.304496
## OR4F29 6.304496 6.304496 6.304496 6.304496 6.304496 6.304496 6.304496
## OR4F16 6.304496 6.304496 6.304496 6.304496 6.304496 6.304496 6.304496
## SAMD11 10.850864 10.623302 10.115579 10.191607 10.191607 9.925705 9.640657
## NOC2L 11.683739 11.713613 11.714521 11.668704 11.668704 11.671843 11.123074
## KLHL17 8.818493 8.810708 8.659464 8.909468 8.909468 9.133888 8.208435
## 8 9 10 11 12
## OR4F5 6.304496 6.304496 6.304496 6.304496 6.304496
## OR4F29 6.304496 6.304496 6.304496 6.304496 6.304496
## OR4F16 6.304496 6.304496 6.304496 6.304496 6.304496
## SAMD11 9.216625 9.228861 9.584815 9.425198 9.507513
## NOC2L 11.038030 11.054819 11.251227 11.416899 11.312274
## KLHL17 8.080256 8.083888 8.722805 8.842158 8.757643
This analysis is based on NYU Next-Generation Sequencing Analysis Tutorial
First, we will need to install some important packages:
clusterProfiler
: Universal interface for gene
functional and enrichment results tool.pathview
: Pathway based data integration and
visualization tool.wordcloud
: Analyze the frequency of a word in textual
content using visualization.enrichplot
: Implements several visualization methods
for interpreting functional enrichment results obtained from ORA or GSEA
analysis.org.Hs.eg.db
: Genome wide annotation for Human,
primarily based on mapping using Entrez Gene identifiers. For other
animal models, search for specific genomic annotation on Bioconductor# First, we want the log2 fold-change list from res derived
# of DESEq
original_gene_list <- res$log2FoldChange
# Properly name the vector
names(original_gene_list) <- rownames(res)
# Omit NA values
gene_list <- na.omit(original_gene_list)
# Sort the list in decreasing order (required for
# clusterProfiler)
gene_list = sort(gene_list, decreasing = TRUE)
# Extract significant results (pvalue < 0.05 or padj <
# 0.05)
sig_genes_df <- subset(res, padj < 0.05)
# From significant results, we want to filter on log2fold
# change
genes <- sig_genes_df$log2FoldChange
names(genes) <- rownames(sig_genes_df) # Name the vector
genes <- na.omit(genes) # omit NA
genes <- names(genes)[abs(genes) > 2] # Filtering genes with log2fold > 2
To create a enrichGO object, important params are neccessary:
Ontology Options
: “BP
” (biological
process), “MF
” (molecular function),
“CC
”cellular component]keyType
: This is the source of the annotation (gene
ids). The options vary for each annotation. In the example of
org.Dm.eg.db
, the options are:## [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS"
## [6] "ENTREZID" "ENZYME" "EVIDENCE" "EVIDENCEALL" "GENENAME"
## [11] "GENETYPE" "GO" "GOALL" "IPI" "MAP"
## [16] "OMIM" "ONTOLOGY" "ONTOLOGYALL" "PATH" "PFAM"
## [21] "PMID" "PROSITE" "REFSEQ" "SYMBOL" "UCSCKG"
## [26] "UNIPROT"
go_enrich <- enrichGO(gene = genes, universe = names(gene_list),
OrgDb = org.Hs.eg.db, keyType = "SYMBOL", readable = TRUE,
ont = "ALL", pvalueCutoff = 0.01, qvalueCutoff = 0.001)
go_enrich
## #
## # over-representation test
## #
## #...@organism Homo sapiens
## #...@ontology GOALL
## #...@keytype SYMBOL
## #...@gene chr [1:1624] "HES5" "SMIM1" "AJAP1" "SLC2A5" "CASZ1" "C1orf167" "NPPB" ...
## #...pvalues adjusted by 'BH' with cutoff <0.01
## #...180 enriched terms found
## 'data.frame': 180 obs. of 10 variables:
## $ ONTOLOGY : chr "BP" "BP" "BP" "BP" ...
## $ ID : chr "GO:0048562" "GO:0044057" "GO:0007389" "GO:0003002" ...
## $ Description: chr "embryonic organ morphogenesis" "regulation of system process" "pattern specification process" "regionalization" ...
## $ GeneRatio : chr "63/1412" "94/1412" "83/1412" "76/1412" ...
## $ BgRatio : chr "274/14590" "500/14590" "421/14590" "378/14590" ...
## $ pvalue : num 4.49e-11 1.65e-10 1.82e-10 4.23e-10 4.73e-10 ...
## $ p.adjust : num 2.40e-07 3.24e-07 3.24e-07 5.06e-07 5.06e-07 ...
## $ qvalue : num 1.97e-07 2.66e-07 2.66e-07 4.15e-07 4.15e-07 ...
## $ geneID : chr "ALX3/EFNA1/VANGL2/SOX11/SIX2/EFEMP1/OTX1/MYO3B/DLX2/HOXD10/HOXD4/HOXD3/RARB/PLS1/CHRNA9/FGF10/HAND1/FOXF2/TFAP2"| __truncated__ "NPPB/KCND3/NPR1/ATP1A2/CASQ1/NOS1AP/RGS4/RGS2/MYBPH/TGFB2/AGT/POMC/UCN/TACR1/INHBB/IGFBP5/INHA/LMCD1/CAV3/OXTR/"| __truncated__ "HES5/NBL1/WLS/ALX3/VANGL2/PBX1/PLXNA2/SIX2/OTX1/EN1/DLX1/DLX2/HOXD13/HOXD12/HOXD11/HOXD10/HOXD8/HOXD4/HOXD3/NRP"| __truncated__ "HES5/NBL1/WLS/VANGL2/PBX1/PLXNA2/SIX2/OTX1/EN1/DLX1/DLX2/HOXD13/HOXD11/HOXD10/HOXD8/HOXD4/HOXD3/NRP2/ROBO2/BMPR"| __truncated__ ...
## $ Count : int 63 94 83 76 87 36 78 81 64 36 ...
## #...Citation
## T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan, X Fu, S Liu, X Bo, and G Yu.
## clusterProfiler 4.0: A universal enrichment tool for interpreting omics data.
## The Innovation. 2021, 2(3):100141
After the results, it is possible to perform different filtering
using go_enrich
dataset. Here, I’m performing an analysis
based on count
values.
go_enrich_filtered <- go_enrich
go_enrich_filtered_results <- go_enrich_filtered@result
# Sort GO terms by counting results
go_enrich_filtered_results <- go_enrich_filtered_results %>%
arrange(p.adjust)
go_enrich_filtered@result <- go_enrich_filtered_results
# Extract genes for cleaning view
core_genes <- str_split(as.data.frame(go_enrich_filtered_results)[,
"geneID"], "/")
core_genes_top <- stringi::stri_list2matrix(core_genes) # Transform list into matrix - conserving proper list characteristics
core_genes_top <- core_genes_top[1:10, ]
core_genes_top <- as.list(as.data.frame(core_genes_top)) # Re-transforming list
# putting new gene list into go_enrich again This is an
# important process for processing cnetplot
filtered_core_genes_top <- sapply(lapply(core_genes_top, function(x) x[x %in%
genes]), paste, collapse = "/")
go_enrich_filtered@result$geneID <- filtered_core_genes_top
upsetplot(go_enrich_filtered, n = 9, layout = "mds") + ggtitle("Upset-plot with filtered GO enrichment data")
barplot(go_enrich, drop = TRUE, showCategory = 15, title = "GO Biological Pathways",
font.size = 8) + facet_grid(ONTOLOGY ~ ., scale = "free") +
ggtitle("Bar-plot with raw GO enrichment data")
barplot(go_enrich_filtered, drop = TRUE, showCategory = 15, title = "GO Biological Pathways",
font.size = 8) + facet_grid(ONTOLOGY ~ ., scale = "free") +
ggtitle("Bar-plot with filtered GO enrichment data")
dotplot(go_enrich, showCategory = 10) + ggtitle("dotplot for ORA") +
facet_grid(ONTOLOGY ~ ., scale = "free") + ggtitle("Dot-plot with raw GO enrichment data")
dotplot(go_enrich_filtered, showCategory = 10) + ggtitle("dotplot for ORA") +
facet_grid(ONTOLOGY ~ ., scale = "free") + ggtitle("Dot-plot with filtered GO enrichment data")
emdf <- pairwise_termsim(go_enrich_filtered)
emapplot(emdf, showCategory = 15) + ggtitle("Enrichmend-map with raw GO enrichment data")
emdf <- pairwise_termsim(go_enrich)
emapplot(emdf, showCategory = 15) + ggtitle("Enrichment-map with filtered GO enrichment data")
OBS: Different forms for netplot analysis can be found here. It is also possible to export R graphs as vectors and edit, as this example.
cnetplot(go_enrich, categorySize = "pvalue", foldChange = gene_list) +
ggtitle("Standard cnet-plot with raw GO enrichment data")
cnetplot(go_enrich_filtered, categorySize = "pvalue", foldChange = gene_list) +
ggtitle("Standard cnet-plot with filtered GO enrichment data")
cnetplot(go_enrich, circular = TRUE, color.params = list(foldChange = gene_list),
colorEdge = TRUE, showCategory = 8, cex.params = list(gene_node = 0.01),
node_label = "all", cex_label_category = 0.6, cex_label_gene = 0.9) +
ggtitle("Round cnet-plot with raw GO enrichment data")
cnetplot(go_enrich_filtered, circular = TRUE, color.params = list(foldChange = gene_list),
colorEdge = TRUE, showCategory = 8, cex.params = list(gene_node = 0.01),
node_label = "all", cex_label_category = 0.6, cex_label_gene = 0.9) +
ggtitle("Round cnet-plot with filtered GO enrichment data")
The goal of GSEA is to detect situations where many genes in a gene
set change in a coordinated way, even when individual changes are small
in magnitude. First, we will set up the differential expression table
obtained with DESEq2
, containing: Gene ID, log2
fold change values,and adjusted p-values.
Then, we need to load the important libraries.
library(clusterProfiler)
library(msigdbr)
library(org.Hs.eg.db) # This is for Homo sapies annontation. For mouse, is library(org.Mm.eg.db)
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:GenomicRanges':
##
## subtract
Is also important to getting familiar with MSigDB gene set avaiable
with msigdbr
, because it allow us to acess these gene sets
in a format compatible with the package we’ll use for analysis,
clusterProfiler
. First, check the organisms the package
supports, using msigdbr_species()
. As the data we are
analyzing comes from human samples, we shoulhd obtain
only the gene sets relevants to H.sapies, by specifying
species = "Homo sapiens"
(for mouse samples, you have to
change just species = "Mus musculus"
).The category will
depends on the analysis. There are many categories:
H
: Hallmark gene sets are coherently
expressed signatures derived by aggregating many MSigDB gene sets to
represent well-defined biological states or processes.C1
: Positional gene sets corresponding
to human chromosome cytogenetic bands.C2
: Curated gene sets from online
pathway databases, publications in PubMed, and knowledge of domain
experts.C3
: Regulatory target gene sets based
on gene target predictions for microRNA seed sequences and predicted
transcription factor binding sites.C4
: Computational gene sets defined by
mining large collections of cancer-oriented microarray data.MH
: Mouse-ortholog hallmark gene sets
are versions of gene sets in the MSigDB Hallmarks collection mapped to
their mouse orthologs.M1
: Positional gene sets corresponding
to mouse chromossome cytogenetic bands.And more. For informations check-it msigdb website
hs_hallmark_sets <- msigdbr(
species = "Homo sapiens", # Replace with species name relevant to your data
category = "H" )
# Check if there are duplicates on the DESEq results we are going to analyze.
any(duplicated(rownames(dge_mapped_df))) # If it is false, no data processing is necessary
## [1] FALSE
# Let's create a named vector ranked based on the log2 fold change values
lfc_vector <- dge_mapped_df$log2FoldChange
names(lfc_vector) <- rownames(dge_mapped_df)
# We need to sort the log2 fold change values in descending order here
lfc_vector <- sort(lfc_vector, decreasing = TRUE)
head(lfc_vector)
## ADD2 CACNG4 HOXB6 HOXB9 GRP HOXB5
## 11.278668 11.100574 10.956444 10.534387 10.238942 9.758519
Now we are going to run GSEA
set.seed(2020) # Set the seed so our results are reproducible:
gsea_results <- GSEA(
geneList = lfc_vector, # Ordered ranked gene list
minGSSize = 25, # Minimum gene set size
maxGSSize = 500, # Maximum gene set set
pvalueCutoff = 0.05, # p-value cutoff
eps = 0, # Boundary for calculating the p value
seed = TRUE, # Set seed to make results reproducible
pAdjustMethod = "BH", # Benjamini-Hochberg correction
TERM2GENE = dplyr::select(
hs_hallmark_sets,
gs_name,
gene_symbol
)
)
## preparing geneSet collections...
## GSEA analysis...
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (5.63% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## leading edge analysis...
## done...
gsea_result_df <- data.frame(gsea_results@result)
gsea_result_df %>%
# This returns the 3 rows with the largest NES values
dplyr::slice_max(NES, n = 3)
## ID Description setSize
## HALLMARK_E2F_TARGETS HALLMARK_E2F_TARGETS HALLMARK_E2F_TARGETS 198
## HALLMARK_G2M_CHECKPOINT HALLMARK_G2M_CHECKPOINT HALLMARK_G2M_CHECKPOINT 194
## HALLMARK_MYC_TARGETS_V1 HALLMARK_MYC_TARGETS_V1 HALLMARK_MYC_TARGETS_V1 196
## enrichmentScore NES pvalue p.adjust
## HALLMARK_E2F_TARGETS 0.7140609 2.840478 5.462311e-31 2.731156e-29
## HALLMARK_G2M_CHECKPOINT 0.7008499 2.751088 2.045073e-27 5.112682e-26
## HALLMARK_MYC_TARGETS_V1 0.5199776 2.053363 1.269521e-10 2.115868e-09
## qvalue rank leading_edge
## HALLMARK_E2F_TARGETS 1.839936e-29 3809 tags=75%, list=22%, signal=59%
## HALLMARK_G2M_CHECKPOINT 3.444333e-26 2710 tags=58%, list=16%, signal=50%
## HALLMARK_MYC_TARGETS_V1 1.425427e-09 4448 tags=53%, list=26%, signal=39%
## core_enrichment
## HALLMARK_E2F_TARGETS ORC6/CDK1/PLK4/TCF19/AURKB/MYBL2/KIF18B/PLK1/E2F8/CDCA8/RAD51AP1/CDC25A/DEPDC1/PSMC3IP/UBE2T/HMMR/SPAG5/KIF4A/DIAPH3/KIF2C/UBE2S/BUB1B/ASF1B/SPC25/MAD2L1/MMS22L/CKS1B/RFC3/CENPE/ESPL1/TRIP13/STMN1/CENPM/TOP2A/RACGAP1/GINS1/LMNB1/TACC3/MELK/RRM2/MKI67/DSCC1/CDCA3/BARD1/ATAD2/POLA2/DLGAP5/CDC20/HELLS/BIRC5/PRPS1/AURKA/DCK/HMGB2/SUV39H1/KIF22/CCNB2/RNASEH2A/MCM5/EZH2/PTTG1/GINS3/TIMELESS/MCM2/RFC2/TMPO/MCM3/CTPS1/BRCA2/BRCA1/MCM7/MCM4/GINS4/DCLRE1B/POLE/JPT1/POLD1/TIPIN/LIG1/SMC1A/ANP32E/SPC24/CSE1L/CDKN3/USP1/PCNA/KPNA2/MXD3/CKS2/DDX39A/RANBP1/TUBG1/DONSON/HMGB3/UBR7/TK1/POLD2/LYAR/DEK/SMC4/MSH2/CIT/LBR/NME1/DCTPP1/PAICS/CCP110/PA2G4/XPO1/MYC/EXOSC8/CBX5/NOP56/GSPT1/NASP/RPA2/SNRPB/SLBP/NUP205/RPA1/NUP107/ZW10/POP7/EIF2S1/CNOT9/HUS1/RAD51C/BRMS1L/RAN/CCNE1/CHEK1/DNMT1/TUBB/POLD3/EED/NUP153/ILF3/SRSF2/UNG/TRA2B/SRSF1/SYNCRIP/HNRNPD/MCM6/DUT/SMC6/CDKN1A/CTCF
## HALLMARK_G2M_CHECKPOINT EGF/E2F1/ORC6/CDC6/TTK/CDK1/UBE2C/PLK4/AURKB/RAD54L/MYBL2/RPS6KA5/CDC45/PRC1/PLK1/POLQ/EXO1/CDC25A/CCNA2/GINS2/E2F2/HMMR/KIF4A/CCND1/KIF2C/UBE2S/MAD2L1/TRAIP/KNL1/CKS1B/PBK/CENPE/ESPL1/STIL/KIF20B/CENPF/STMN1/NEK2/KIF23/TOP2A/CENPA/RACGAP1/LMNB1/TACC3/NUSAP1/TPX2/CCNF/KIF11/MKI67/BARD1/FBXO5/TROAP/BUB1/POLA2/CDC20/BIRC5/SMC2/RBL1/AURKA/DBF4/SLC7A5/CHAF1A/SUV39H1/KIF22/CCNB2/KIF15/NDC80/MCM5/EZH2/PTTG1/MCM2/TMPO/NSD2/MCM3/INCENP/BRCA2/DTYMK/POLE/JPT1/CBX1/SLC7A1/SMC1A/TGFB1/CDKN3/KPNA2/CKS2/CDC7/MAP3K20/DMD/DDX39A/ODC1/SAP30/HMGB3/DR1/KIF5B/SMC4/LBR/HOXC10/MTF2/BUB3/DKC1/E2F3/HMGN2/TFDP1/CASP8AP2/XPO1/MYC/HSPA8/ORC5/GSPT1/NASP/RPA2/MEIS2
## HALLMARK_MYC_TARGETS_V1 CDC45/CCNA2/MAD2L1/CDC20/MCM5/MCM2/CTPS1/EIF1AX/RFC4/MCM7/MCM4/CDK2/IFRD1/USP1/PCNA/KPNA2/POLE3/TYMS/RANBP1/SRM/ODC1/POLD2/YWHAQ/EIF4E/DDX21/DEK/RRP9/VBP1/ILF2/NME1/BUB3/CSTF2/PSMD14/RRM1/HSPE1/TFDP1/ABCE1/PA2G4/XPO1/MYC/HNRNPR/NOP56/GSPT1/NDUFAB1/NOP16/TXNL4A/GLO1/HDAC2/EIF3J/CCT5/EIF4G2/NCBP1/CCT2/SRSF7/PSMD1/PTGES3/LDHA/SNRPD1/EIF2S1/PSMA7/TCP1/RAN/SRPK1/HPRT1/HNRNPA2B1/RUVBL2/SNRPA1/PSMD3/AIMP2/HSPD1/SRSF2/SRSF3/PSMB2/SET/HNRNPA3/TRA2B/SRSF1/SYNCRIP/HNRNPD/PHB/NHP2/MCM6/DUT/DDX18/CANX/SNRPG/RNPS1/LSM2/PPIA/CCT7/GOT2/PSMC6/XRCC6/G3BP1/SERBP1/CYC1/PSMC4/UBA2/PSMD7/PPM1G/KPNB1/LSM7/GNL3
most_positive_nes_plot <- enrichplot::gseaplot(
gsea_results,
geneSetID = "HALLMARK_E2F_TARGETS",
title = "E2F_TARGETS",
color.line = "#FFCE30"
)
most_positive_nes_plot
gsea_result_df %>%
# This returns the 3 rows with the lowest NES values
dplyr::slice_min(NES, n = 3)
## ID
## HALLMARK_INTERFERON_ALPHA_RESPONSE HALLMARK_INTERFERON_ALPHA_RESPONSE
## HALLMARK_INTERFERON_GAMMA_RESPONSE HALLMARK_INTERFERON_GAMMA_RESPONSE
## HALLMARK_IL6_JAK_STAT3_SIGNALING HALLMARK_IL6_JAK_STAT3_SIGNALING
## Description setSize
## HALLMARK_INTERFERON_ALPHA_RESPONSE HALLMARK_INTERFERON_ALPHA_RESPONSE 95
## HALLMARK_INTERFERON_GAMMA_RESPONSE HALLMARK_INTERFERON_GAMMA_RESPONSE 191
## HALLMARK_IL6_JAK_STAT3_SIGNALING HALLMARK_IL6_JAK_STAT3_SIGNALING 79
## enrichmentScore NES pvalue
## HALLMARK_INTERFERON_ALPHA_RESPONSE -0.6719990 -2.009001 1.823305e-08
## HALLMARK_INTERFERON_GAMMA_RESPONSE -0.5805272 -1.845527 9.745889e-09
## HALLMARK_IL6_JAK_STAT3_SIGNALING -0.5758590 -1.679564 2.920490e-04
## p.adjust qvalue rank
## HALLMARK_INTERFERON_ALPHA_RESPONSE 1.823305e-07 1.228332e-07 3741
## HALLMARK_INTERFERON_GAMMA_RESPONSE 1.218236e-07 8.207064e-08 3741
## HALLMARK_IL6_JAK_STAT3_SIGNALING 1.327495e-03 8.943127e-04 3639
## leading_edge
## HALLMARK_INTERFERON_ALPHA_RESPONSE tags=60%, list=22%, signal=47%
## HALLMARK_INTERFERON_GAMMA_RESPONSE tags=50%, list=22%, signal=39%
## HALLMARK_IL6_JAK_STAT3_SIGNALING tags=48%, list=21%, signal=38%
## core_enrichment
## HALLMARK_INTERFERON_ALPHA_RESPONSE PSME1/TRIM25/IRF9/PSMB8/GBP2/NMI/TRIM21/IRF7/IRF2/IFITM2/CXCL11/IL15/CMPK2/TRAFD1/HLA-C/IRF1/IFIT3/ELF1/IFITM3/PARP14/LY6E/GBP4/PARP9/LPAR6/DHX58/PSMB9/PLSCR1/IL7/OASL/DDX60/STAT2/LGALS3BP/NCOA7/IFIT2/HERC6/IFI35/IFITM1/UBE2L6/GMPR/IFI44L/RSAD2/UBA7/IFIH1/IFI27/BATF2/EPSTI1/SELL/LAMP3/BST2/CD74/CASP1/ISG20/CXCL10/TMEM140/C1S/RTP4/OAS1
## HALLMARK_INTERFERON_GAMMA_RESPONSE PSME1/TRIM25/IL18BP/IRF9/RAPGEF6/PSMB8/RNF213/PSMB10/CCL7/NMI/TRIM21/IRF7/IRF2/SAMHD1/XCL1/STAT3/IFITM2/CXCL11/FCGR1A/CD86/IRF8/IL15/CMPK2/STAT1/TRAFD1/CASP4/METTL7B/APOL6/IRF1/IFIT3/XAF1/TNFAIP2/IFITM3/PARP14/LY6E/GBP4/DHX58/PSMB9/CCL2/PLA2G4A/SOD2/PLSCR1/IL7/OASL/DDX60/STAT2/IL15RA/HLA-G/IL2RB/LGALS3BP/IL6/SOCS1/IFIT2/HERC6/FGL2/HLA-DMA/IFI35/CMKLR1/CD40/UBE2L6/IFI44L/RSAD2/ARID5B/CIITA/IFIH1/IL10RA/IRF5/IFI27/BATF2/TNFAIP6/IFIT1/EPSTI1/SELP/OAS2/CFH/SOCS3/ICAM1/GCH1/BST2/MX2/CD74/HLA-DRB1/CASP1/TNFSF10/ISG20/IRF4/CXCL10/CFB/P2RY14/SECTM1/SERPING1/C1S/C1R/RTP4/VCAM1
## HALLMARK_IL6_JAK_STAT3_SIGNALING ITGB3/IRF9/CCL7/STAT3/OSMR/HMOX1/CXCL11/CSF2/DNTT/STAT1/IL17RA/IRF1/IL10RB/ACVRL1/IL7/INHBE/STAT2/IL15RA/IL6/SOCS1/LTB/IL6ST/CD9/TNF/IL12RB1/IL1R2/PIK3R5/MAP3K8/PLA2G2A/SOCS3/IL17RB/IL1R1/CXCL10/IL18R1/TLR2/A2M/CCR1/CSF3R
most_negative_nes_plot <- enrichplot::gseaplot(
gsea_results,
geneSetID = "HALLMARK_IL6_JAK_STAT3_SIGNALING",
title = "IL6_JAK_STAT3_SIGNALING",
color.line = "#288BA8"
)
most_negative_nes_plot