This R Notebook describes the pipeline for RNA-seq analysis with differential expression (DESeq), ORA (Over-Representation) and GSEA (Gene Set Enrichment Analysis).

Introduction to RNA-seq Analysis

1. Preparing the data.

First, we need to create two data frames: (1) GSE123564_data with the expression data and the sample names (control vs mutant); and (2) a GSE123564_metadata containing meta data with the sample information. Both data are available at Gene Expression Omnibus GSE123564

library(DESeq2)
library(dplyr)
library(ggpubr)
library(pheatmap)
library(stringr)

# Data analysis on GEO: GSE123564 - retina from patients
# with Hbs1L deficiency(retinal development with delayed
# development and retinal pigmentary deposits)
patient1_GSE123564 <- read.delim("/home/bad23/Documents/R-studies/Data_Analysis/Bioinformatic-Analysis-DSMZ/GSE123564/Samples/GSM3507209_patient_rep1_mRNA-seq.counts.txt",
    header = T, sep = "")
patient2_GSE123564 <- read.delim("/home/bad23/Documents/R-studies/Data_Analysis/Bioinformatic-Analysis-DSMZ/GSE123564/Samples/GSM3507210_patient_rep2_mRNA-seq.counts.txt",
    header = T, sep = "")
patient3_GSE123564 <- read.delim("/home/bad23//Documents/R-studies/Data_Analysis/Bioinformatic-Analysis-DSMZ/GSE123564/Samples/GSM3507211_patient_rep3_mRNA-seq.counts.txt",
    header = T, sep = "")
patient4_GSE123564 <- read.delim("/home/bad23//Documents/R-studies/Data_Analysis/Bioinformatic-Analysis-DSMZ/GSE123564/Samples/GSM3507212_patient_rep4_mRNA-seq.counts.txt",
    header = T, sep = "")
patient5_GSE123564 <- read.delim("/home/bad23/Documents/R-studies/Data_Analysis/Bioinformatic-Analysis-DSMZ/GSE123564/Samples/GSM3507212_patient_rep4_mRNA-seq.counts.txt",
    header = T, sep = "")
patient6_GSE123564 <- read.delim("/home/bad23/Documents/R-studies/Data_Analysis/Bioinformatic-Analysis-DSMZ/GSE123564/Samples/GSM3507213_patient_rep5_mRNA-seq.counts.txt",
    header = T, sep = "")
healthy1_GSE123564 <- read.delim("/home/bad23/Documents/R-studies/Data_Analysis/Bioinformatic-Analysis-DSMZ/GSE123564/Samples/GSM3507215_healthy_individual-1_rep1_mRNA-seq.counts.txt",
    header = T, sep = "")
healthy2_GSE123564 <- read.delim("/home/bad23/Documents/R-studies/Data_Analysis/Bioinformatic-Analysis-DSMZ/GSE123564/Samples/GSM3507216_healthy_individual-1_rep2_mRNA-seq.counts.txt",
    header = T, sep = "")
healthy3_GSE123564 <- read.delim("/home/bad23/Documents/R-studies/Data_Analysis/Bioinformatic-Analysis-DSMZ/GSE123564/Samples/GSM3507217_healthy_individual-1_rep3_mRNA-seq.counts.txt",
    header = T, sep = "")
healthy4_GSE123564 <- read.delim("/home/bad23/Documents/R-studies/Data_Analysis/Bioinformatic-Analysis-DSMZ/GSE123564/Samples/GSM3507218_healthy_individual-2_rep1_mRNA-seq.counts.txt",
    header = T, sep = "")
healthy5_GSE123564 <- read.delim("/home/bad23/Documents/R-studies/Data_Analysis/Bioinformatic-Analysis-DSMZ/GSE123564/Samples/GSM3507219_healthy_individual-2_rep2_mRNA-seq.counts.txt",
    header = T, sep = "")
healthy6_GSE123564 <- read.delim("/home/bad23/Documents/R-studies/Data_Analysis/Bioinformatic-Analysis-DSMZ/GSE123564/Samples/GSM3507220_healthy_individual-2_rep3_mRNA-seq.counts.txt",
    header = T, sep = "")
GSE123564_genes <- patient1_GSE123564[2:nrow(patient1_GSE123564),
    1]


GSE123564_data <- cbind(patient1_GSE123564[, c(1, 7)], patient2_GSE123564[,
    7], patient3_GSE123564[, 7], patient4_GSE123564[, 7], patient5_GSE123564[,
    7], patient6_GSE123564[, 7], healthy1_GSE123564[, 7], healthy2_GSE123564[,
    7], healthy3_GSE123564[, 7], healthy4_GSE123564[, 7], healthy5_GSE123564[,
    7], healthy6_GSE123564[, 7])
colnames(GSE123564_data) <- c("gene_id", "mutation", "mutation",
    "mutation", "mutation", "mutation", "mutation", "control",
    "control", "control", "control", "control", "control")

GSE123564_data <- GSE123564_data[2:nrow(GSE123564_data), ]
GSE123564_data <- matrix(as.numeric((unlist(GSE123564_data[,
    2:13]))), ncol = ncol(GSE123564_data[, 2:13]))
colnames(GSE123564_data) <- c("Patient_1", "Patient_2", "Patient_3",
    "Patient_4", "Patient_5", "Patient_6", "Healthy_1", "Healthy_2",
    "Healthy_3", "Healthy_4", "Healthy_5", "Healthy_6")

GSE123564_metadata <- data.frame(Condition = c("Mutant", "Mutant",
    "Mutant", "Mutant", "Mutant", "Mutant", "Control", "Control",
    "Control", "Control", "Control", "Control"), Sample = colnames(GSE123564_data))
head(GSE123564_data)
##      Patient_1 Patient_2 Patient_3 Patient_4 Patient_5 Patient_6 Healthy_1
## [1,]         0         0         0         0         0         0         0
## [2,]         0         0         0         0         0         0         0
## [3,]         0         0         0         0         0         0         0
## [4,]      1410      1928      1426      1299      1299      1130       820
## [5,]      2611      4338      4775      3960      3960      4276      2626
## [6,]       256       413       390       429       429       571       201
##      Healthy_2 Healthy_3 Healthy_4 Healthy_5 Healthy_6
## [1,]         0         0         0         0         0
## [2,]         0         0         0         0         0
## [3,]         0         0         0         0         0
## [4,]       498       478       166       387       556
## [5,]      2168      2084       613      1853      2295
## [6,]       151       144        75       226       278
head(GSE123564_metadata)
##   Condition    Sample
## 1    Mutant Patient_1
## 2    Mutant Patient_2
## 3    Mutant Patient_3
## 4    Mutant Patient_4
## 5    Mutant Patient_5
## 6    Mutant Patient_6

2. Creating cts and coldata tables.

To run the statistical analysis, we need to create a numeric cts data frame, and a metadata table coldata. Both tables should have important params to further be used as input for DESeq2.

cts <- matrix(as.numeric(GSE123564_data), ncol = ncol(GSE123564_data))
row.names(cts) <- GSE123564_genes
class(cts)
## [1] "matrix" "array"
coldata <- GSE123564_metadata

3. Perform DESEq2 normalization and transformation

3.1 Statistical analysis

res <- DESeqDataSetFromMatrix(countData = round(cts), colData = coldata,
    design = ~Condition)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
res <- DESeq(res)
res <- results(res, contrast = c("Condition", "Mutant", "Control"))
head(res)
## log2 fold change (MLE): Condition Mutant vs Control 
## Wald test p-value: Condition Mutant vs Control 
## DataFrame with 6 rows and 6 columns
##         baseMean log2FoldChange     lfcSE      stat      pvalue        padj
##        <numeric>      <numeric> <numeric> <numeric>   <numeric>   <numeric>
## OR4F5      0.000             NA        NA        NA          NA          NA
## OR4F29     0.000             NA        NA        NA          NA          NA
## OR4F16     0.000             NA        NA        NA          NA          NA
## SAMD11   850.774       1.079469 0.1701953   6.34253 2.26018e-10 7.63934e-10
## NOC2L   2674.022       0.511913 0.0696001   7.35506 1.90845e-13 7.87757e-13
## KLHL17   272.819       0.592730 0.2213172   2.67819 7.40206e-03 1.21827e-02

3.2 Symple DESEq2 normalization on counts table

res_norm <- DESeqDataSetFromMatrix(countData = round(cts), colData = coldata,
    design = ~1)
## converting counts to integer mode
res_norm <- vst(res_norm)
res_norm <- assay(res_norm)
head(res_norm)
##                1         2         3         4         5         6         7
## OR4F5   6.304496  6.304496  6.304496  6.304496  6.304496  6.304496  6.304496
## OR4F29  6.304496  6.304496  6.304496  6.304496  6.304496  6.304496  6.304496
## OR4F16  6.304496  6.304496  6.304496  6.304496  6.304496  6.304496  6.304496
## SAMD11 10.850864 10.623302 10.115579 10.191607 10.191607  9.925705  9.640657
## NOC2L  11.683739 11.713613 11.714521 11.668704 11.668704 11.671843 11.123074
## KLHL17  8.818493  8.810708  8.659464  8.909468  8.909468  9.133888  8.208435
##                8         9        10        11        12
## OR4F5   6.304496  6.304496  6.304496  6.304496  6.304496
## OR4F29  6.304496  6.304496  6.304496  6.304496  6.304496
## OR4F16  6.304496  6.304496  6.304496  6.304496  6.304496
## SAMD11  9.216625  9.228861  9.584815  9.425198  9.507513
## NOC2L  11.038030 11.054819 11.251227 11.416899 11.312274
## KLHL17  8.080256  8.083888  8.722805  8.842158  8.757643

4. Perform ORA analysis

This analysis is based on NYU Next-Generation Sequencing Analysis Tutorial

4.1 Installing packages

First, we will need to install some important packages:

  • clusterProfiler: Universal interface for gene functional and enrichment results tool.
  • pathview: Pathway based data integration and visualization tool.
  • wordcloud: Analyze the frequency of a word in textual content using visualization.
  • enrichplot: Implements several visualization methods for interpreting functional enrichment results obtained from ORA or GSEA analysis.
  • org.Hs.eg.db: Genome wide annotation for Human, primarily based on mapping using Entrez Gene identifiers. For other animal models, search for specific genomic annotation on Bioconductor

4.2 Prepare input

# First, we want the log2 fold-change list from res derived
# of DESEq
original_gene_list <- res$log2FoldChange
# Properly name the vector
names(original_gene_list) <- rownames(res)
# Omit NA values
gene_list <- na.omit(original_gene_list)
# Sort the list in decreasing order (required for
# clusterProfiler)
gene_list = sort(gene_list, decreasing = TRUE)
# Extract significant results (pvalue < 0.05 or padj <
# 0.05)
sig_genes_df <- subset(res, padj < 0.05)
# From significant results, we want to filter on log2fold
# change
genes <- sig_genes_df$log2FoldChange
names(genes) <- rownames(sig_genes_df)  # Name the vector
genes <- na.omit(genes)  # omit NA
genes <- names(genes)[abs(genes) > 2]  # Filtering genes with log2fold > 2

4.3 Create a enrichGO object

To create a enrichGO object, important params are neccessary:

  • Ontology Options: “BP” (biological process), “MF” (molecular function), “CC”cellular component]
  • keyType: This is the source of the annotation (gene ids). The options vary for each annotation. In the example of org.Dm.eg.db, the options are:
# Check which options are available with the keytypes
# command, for example
keytypes(org.Hs.eg.db)
##  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS"
##  [6] "ENTREZID"     "ENZYME"       "EVIDENCE"     "EVIDENCEALL"  "GENENAME"    
## [11] "GENETYPE"     "GO"           "GOALL"        "IPI"          "MAP"         
## [16] "OMIM"         "ONTOLOGY"     "ONTOLOGYALL"  "PATH"         "PFAM"        
## [21] "PMID"         "PROSITE"      "REFSEQ"       "SYMBOL"       "UCSCKG"      
## [26] "UNIPROT"
go_enrich <- enrichGO(gene = genes, universe = names(gene_list),
    OrgDb = org.Hs.eg.db, keyType = "SYMBOL", readable = TRUE,
    ont = "ALL", pvalueCutoff = 0.01, qvalueCutoff = 0.001)
go_enrich
## #
## # over-representation test
## #
## #...@organism     Homo sapiens 
## #...@ontology     GOALL 
## #...@keytype      SYMBOL 
## #...@gene     chr [1:1624] "HES5" "SMIM1" "AJAP1" "SLC2A5" "CASZ1" "C1orf167" "NPPB" ...
## #...pvalues adjusted by 'BH' with cutoff <0.01 
## #...180 enriched terms found
## 'data.frame':    180 obs. of  10 variables:
##  $ ONTOLOGY   : chr  "BP" "BP" "BP" "BP" ...
##  $ ID         : chr  "GO:0048562" "GO:0044057" "GO:0007389" "GO:0003002" ...
##  $ Description: chr  "embryonic organ morphogenesis" "regulation of system process" "pattern specification process" "regionalization" ...
##  $ GeneRatio  : chr  "63/1412" "94/1412" "83/1412" "76/1412" ...
##  $ BgRatio    : chr  "274/14590" "500/14590" "421/14590" "378/14590" ...
##  $ pvalue     : num  4.49e-11 1.65e-10 1.82e-10 4.23e-10 4.73e-10 ...
##  $ p.adjust   : num  2.40e-07 3.24e-07 3.24e-07 5.06e-07 5.06e-07 ...
##  $ qvalue     : num  1.97e-07 2.66e-07 2.66e-07 4.15e-07 4.15e-07 ...
##  $ geneID     : chr  "ALX3/EFNA1/VANGL2/SOX11/SIX2/EFEMP1/OTX1/MYO3B/DLX2/HOXD10/HOXD4/HOXD3/RARB/PLS1/CHRNA9/FGF10/HAND1/FOXF2/TFAP2"| __truncated__ "NPPB/KCND3/NPR1/ATP1A2/CASQ1/NOS1AP/RGS4/RGS2/MYBPH/TGFB2/AGT/POMC/UCN/TACR1/INHBB/IGFBP5/INHA/LMCD1/CAV3/OXTR/"| __truncated__ "HES5/NBL1/WLS/ALX3/VANGL2/PBX1/PLXNA2/SIX2/OTX1/EN1/DLX1/DLX2/HOXD13/HOXD12/HOXD11/HOXD10/HOXD8/HOXD4/HOXD3/NRP"| __truncated__ "HES5/NBL1/WLS/VANGL2/PBX1/PLXNA2/SIX2/OTX1/EN1/DLX1/DLX2/HOXD13/HOXD11/HOXD10/HOXD8/HOXD4/HOXD3/NRP2/ROBO2/BMPR"| __truncated__ ...
##  $ Count      : int  63 94 83 76 87 36 78 81 64 36 ...
## #...Citation
##  T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan, X Fu, S Liu, X Bo, and G Yu.
##  clusterProfiler 4.0: A universal enrichment tool for interpreting omics data.
##  The Innovation. 2021, 2(3):100141
# This analysis is 'raw', without any filtering

4.4 EnrichGO filtering

After the results, it is possible to perform different filtering using go_enrich dataset. Here, I’m performing an analysis based on count values.

go_enrich_filtered <- go_enrich
go_enrich_filtered_results <- go_enrich_filtered@result

# Sort GO terms by counting results
go_enrich_filtered_results <- go_enrich_filtered_results %>%
    arrange(p.adjust)
go_enrich_filtered@result <- go_enrich_filtered_results

# Extract genes for cleaning view
core_genes <- str_split(as.data.frame(go_enrich_filtered_results)[,
    "geneID"], "/")
core_genes_top <- stringi::stri_list2matrix(core_genes)  # Transform list into matrix - conserving proper list characteristics
core_genes_top <- core_genes_top[1:10, ]
core_genes_top <- as.list(as.data.frame(core_genes_top))  # Re-transforming list

# putting new gene list into go_enrich again This is an
# important process for processing cnetplot
filtered_core_genes_top <- sapply(lapply(core_genes_top, function(x) x[x %in%
    genes]), paste, collapse = "/")
go_enrich_filtered@result$geneID <- filtered_core_genes_top

4.3 Making graphs

a. Upset plot

upsetplot(go_enrich, n = 9, layout = "mds") + ggtitle("Upset-plot with raw GO enrichment data")

upsetplot(go_enrich_filtered, n = 9, layout = "mds") + ggtitle("Upset-plot with filtered GO enrichment data")

b. Barplot

barplot(go_enrich, drop = TRUE, showCategory = 15, title = "GO Biological Pathways",
    font.size = 8) + facet_grid(ONTOLOGY ~ ., scale = "free") +
    ggtitle("Bar-plot with raw GO enrichment data")

barplot(go_enrich_filtered, drop = TRUE, showCategory = 15, title = "GO Biological Pathways",
    font.size = 8) + facet_grid(ONTOLOGY ~ ., scale = "free") +
    ggtitle("Bar-plot with filtered GO enrichment data")

c. Dotplot

dotplot(go_enrich, showCategory = 10) + ggtitle("dotplot for ORA") +
    facet_grid(ONTOLOGY ~ ., scale = "free") + ggtitle("Dot-plot with raw GO enrichment data")

dotplot(go_enrich_filtered, showCategory = 10) + ggtitle("dotplot for ORA") +
    facet_grid(ONTOLOGY ~ ., scale = "free") + ggtitle("Dot-plot with filtered GO enrichment data")

d. Enrichment map

emdf <- pairwise_termsim(go_enrich_filtered)
emapplot(emdf, showCategory = 15) + ggtitle("Enrichmend-map with raw GO enrichment data")

emdf <- pairwise_termsim(go_enrich)
emapplot(emdf, showCategory = 15) + ggtitle("Enrichment-map with filtered GO enrichment data")

e. Category Netplot

OBS: Different forms for netplot analysis can be found here. It is also possible to export R graphs as vectors and edit, as this example.

cnetplot(go_enrich, categorySize = "pvalue", foldChange = gene_list) +
    ggtitle("Standard cnet-plot with raw GO enrichment data")

cnetplot(go_enrich_filtered, categorySize = "pvalue", foldChange = gene_list) +
    ggtitle("Standard cnet-plot with filtered GO enrichment data")

cnetplot(go_enrich, circular = TRUE, color.params = list(foldChange = gene_list),
    colorEdge = TRUE, showCategory = 8, cex.params = list(gene_node = 0.01),
    node_label = "all", cex_label_category = 0.6, cex_label_gene = 0.9) +
    ggtitle("Round cnet-plot with raw GO enrichment data")

cnetplot(go_enrich_filtered, circular = TRUE, color.params = list(foldChange = gene_list),
    colorEdge = TRUE, showCategory = 8, cex.params = list(gene_node = 0.01),
    node_label = "all", cex_label_category = 0.6, cex_label_gene = 0.9) +
    ggtitle("Round cnet-plot with filtered GO enrichment data")

5. Perform GSEA analysis

The goal of GSEA is to detect situations where many genes in a gene set change in a coordinated way, even when individual changes are small in magnitude. First, we will set up the differential expression table obtained with DESEq2, containing: Gene ID, log2 fold change values,and adjusted p-values.

dge_mapped_df <- data.frame(res)

Then, we need to load the important libraries.

library(clusterProfiler)
library(msigdbr)
library(org.Hs.eg.db)  # This is for Homo sapies annontation. For mouse, is library(org.Mm.eg.db)
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:GenomicRanges':
## 
##     subtract

Is also important to getting familiar with MSigDB gene set avaiable with msigdbr, because it allow us to acess these gene sets in a format compatible with the package we’ll use for analysis, clusterProfiler. First, check the organisms the package supports, using msigdbr_species(). As the data we are analyzing comes from human samples, we shoulhd obtain only the gene sets relevants to H.sapies, by specifying species = "Homo sapiens" (for mouse samples, you have to change just species = "Mus musculus").The category will depends on the analysis. There are many categories:

  • H: Hallmark gene sets are coherently expressed signatures derived by aggregating many MSigDB gene sets to represent well-defined biological states or processes.
  • C1: Positional gene sets corresponding to human chromosome cytogenetic bands.
  • C2: Curated gene sets from online pathway databases, publications in PubMed, and knowledge of domain experts.
  • C3: Regulatory target gene sets based on gene target predictions for microRNA seed sequences and predicted transcription factor binding sites.
  • C4: Computational gene sets defined by mining large collections of cancer-oriented microarray data.
  • MH: Mouse-ortholog hallmark gene sets are versions of gene sets in the MSigDB Hallmarks collection mapped to their mouse orthologs.
  • M1: Positional gene sets corresponding to mouse chromossome cytogenetic bands.

And more. For informations check-it msigdb website

hs_hallmark_sets <- msigdbr(
  species = "Homo sapiens", # Replace with species name relevant to your data
  category = "H" ) 
  
# Check if there are duplicates on the DESEq results we are going to analyze.
any(duplicated(rownames(dge_mapped_df))) # If it is false, no data processing is necessary
## [1] FALSE
# Let's create a named vector ranked based on the log2 fold change values
lfc_vector <- dge_mapped_df$log2FoldChange
names(lfc_vector) <- rownames(dge_mapped_df)

# We need to sort the log2 fold change values in descending order here
lfc_vector <- sort(lfc_vector, decreasing = TRUE)

head(lfc_vector)
##      ADD2    CACNG4     HOXB6     HOXB9       GRP     HOXB5 
## 11.278668 11.100574 10.956444 10.534387 10.238942  9.758519

Now we are going to run GSEA

set.seed(2020) # Set the seed so our results are reproducible:

gsea_results <- GSEA(
  geneList = lfc_vector, # Ordered ranked gene list
  minGSSize = 25, # Minimum gene set size
  maxGSSize = 500, # Maximum gene set set
  pvalueCutoff = 0.05, # p-value cutoff
  eps = 0, # Boundary for calculating the p value
  seed = TRUE, # Set seed to make results reproducible
  pAdjustMethod = "BH", # Benjamini-Hochberg correction
  TERM2GENE = dplyr::select(
    hs_hallmark_sets,
    gs_name,
    gene_symbol
  )
)
## preparing geneSet collections...
## GSEA analysis...
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (5.63% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## leading edge analysis...
## done...
gsea_result_df <- data.frame(gsea_results@result)


gsea_result_df %>%
    # This returns the 3 rows with the largest NES values
    dplyr::slice_max(NES, n = 3)
##                                              ID             Description setSize
## HALLMARK_E2F_TARGETS       HALLMARK_E2F_TARGETS    HALLMARK_E2F_TARGETS     198
## HALLMARK_G2M_CHECKPOINT HALLMARK_G2M_CHECKPOINT HALLMARK_G2M_CHECKPOINT     194
## HALLMARK_MYC_TARGETS_V1 HALLMARK_MYC_TARGETS_V1 HALLMARK_MYC_TARGETS_V1     196
##                         enrichmentScore      NES       pvalue     p.adjust
## HALLMARK_E2F_TARGETS          0.7140609 2.840478 5.462311e-31 2.731156e-29
## HALLMARK_G2M_CHECKPOINT       0.7008499 2.751088 2.045073e-27 5.112682e-26
## HALLMARK_MYC_TARGETS_V1       0.5199776 2.053363 1.269521e-10 2.115868e-09
##                               qvalue rank                   leading_edge
## HALLMARK_E2F_TARGETS    1.839936e-29 3809 tags=75%, list=22%, signal=59%
## HALLMARK_G2M_CHECKPOINT 3.444333e-26 2710 tags=58%, list=16%, signal=50%
## HALLMARK_MYC_TARGETS_V1 1.425427e-09 4448 tags=53%, list=26%, signal=39%
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            core_enrichment
## HALLMARK_E2F_TARGETS    ORC6/CDK1/PLK4/TCF19/AURKB/MYBL2/KIF18B/PLK1/E2F8/CDCA8/RAD51AP1/CDC25A/DEPDC1/PSMC3IP/UBE2T/HMMR/SPAG5/KIF4A/DIAPH3/KIF2C/UBE2S/BUB1B/ASF1B/SPC25/MAD2L1/MMS22L/CKS1B/RFC3/CENPE/ESPL1/TRIP13/STMN1/CENPM/TOP2A/RACGAP1/GINS1/LMNB1/TACC3/MELK/RRM2/MKI67/DSCC1/CDCA3/BARD1/ATAD2/POLA2/DLGAP5/CDC20/HELLS/BIRC5/PRPS1/AURKA/DCK/HMGB2/SUV39H1/KIF22/CCNB2/RNASEH2A/MCM5/EZH2/PTTG1/GINS3/TIMELESS/MCM2/RFC2/TMPO/MCM3/CTPS1/BRCA2/BRCA1/MCM7/MCM4/GINS4/DCLRE1B/POLE/JPT1/POLD1/TIPIN/LIG1/SMC1A/ANP32E/SPC24/CSE1L/CDKN3/USP1/PCNA/KPNA2/MXD3/CKS2/DDX39A/RANBP1/TUBG1/DONSON/HMGB3/UBR7/TK1/POLD2/LYAR/DEK/SMC4/MSH2/CIT/LBR/NME1/DCTPP1/PAICS/CCP110/PA2G4/XPO1/MYC/EXOSC8/CBX5/NOP56/GSPT1/NASP/RPA2/SNRPB/SLBP/NUP205/RPA1/NUP107/ZW10/POP7/EIF2S1/CNOT9/HUS1/RAD51C/BRMS1L/RAN/CCNE1/CHEK1/DNMT1/TUBB/POLD3/EED/NUP153/ILF3/SRSF2/UNG/TRA2B/SRSF1/SYNCRIP/HNRNPD/MCM6/DUT/SMC6/CDKN1A/CTCF
## HALLMARK_G2M_CHECKPOINT                                                                                                                                                                                                                               EGF/E2F1/ORC6/CDC6/TTK/CDK1/UBE2C/PLK4/AURKB/RAD54L/MYBL2/RPS6KA5/CDC45/PRC1/PLK1/POLQ/EXO1/CDC25A/CCNA2/GINS2/E2F2/HMMR/KIF4A/CCND1/KIF2C/UBE2S/MAD2L1/TRAIP/KNL1/CKS1B/PBK/CENPE/ESPL1/STIL/KIF20B/CENPF/STMN1/NEK2/KIF23/TOP2A/CENPA/RACGAP1/LMNB1/TACC3/NUSAP1/TPX2/CCNF/KIF11/MKI67/BARD1/FBXO5/TROAP/BUB1/POLA2/CDC20/BIRC5/SMC2/RBL1/AURKA/DBF4/SLC7A5/CHAF1A/SUV39H1/KIF22/CCNB2/KIF15/NDC80/MCM5/EZH2/PTTG1/MCM2/TMPO/NSD2/MCM3/INCENP/BRCA2/DTYMK/POLE/JPT1/CBX1/SLC7A1/SMC1A/TGFB1/CDKN3/KPNA2/CKS2/CDC7/MAP3K20/DMD/DDX39A/ODC1/SAP30/HMGB3/DR1/KIF5B/SMC4/LBR/HOXC10/MTF2/BUB3/DKC1/E2F3/HMGN2/TFDP1/CASP8AP2/XPO1/MYC/HSPA8/ORC5/GSPT1/NASP/RPA2/MEIS2
## HALLMARK_MYC_TARGETS_V1                                                                                                                                                                                                                                                                                 CDC45/CCNA2/MAD2L1/CDC20/MCM5/MCM2/CTPS1/EIF1AX/RFC4/MCM7/MCM4/CDK2/IFRD1/USP1/PCNA/KPNA2/POLE3/TYMS/RANBP1/SRM/ODC1/POLD2/YWHAQ/EIF4E/DDX21/DEK/RRP9/VBP1/ILF2/NME1/BUB3/CSTF2/PSMD14/RRM1/HSPE1/TFDP1/ABCE1/PA2G4/XPO1/MYC/HNRNPR/NOP56/GSPT1/NDUFAB1/NOP16/TXNL4A/GLO1/HDAC2/EIF3J/CCT5/EIF4G2/NCBP1/CCT2/SRSF7/PSMD1/PTGES3/LDHA/SNRPD1/EIF2S1/PSMA7/TCP1/RAN/SRPK1/HPRT1/HNRNPA2B1/RUVBL2/SNRPA1/PSMD3/AIMP2/HSPD1/SRSF2/SRSF3/PSMB2/SET/HNRNPA3/TRA2B/SRSF1/SYNCRIP/HNRNPD/PHB/NHP2/MCM6/DUT/DDX18/CANX/SNRPG/RNPS1/LSM2/PPIA/CCT7/GOT2/PSMC6/XRCC6/G3BP1/SERBP1/CYC1/PSMC4/UBA2/PSMD7/PPM1G/KPNB1/LSM7/GNL3
most_positive_nes_plot <- enrichplot::gseaplot(
    gsea_results,
    geneSetID = "HALLMARK_E2F_TARGETS",
    title = "E2F_TARGETS",
    color.line = "#FFCE30"
)
most_positive_nes_plot

gsea_result_df %>%
  # This returns the 3 rows with the lowest NES values
  dplyr::slice_min(NES, n = 3)
##                                                                    ID
## HALLMARK_INTERFERON_ALPHA_RESPONSE HALLMARK_INTERFERON_ALPHA_RESPONSE
## HALLMARK_INTERFERON_GAMMA_RESPONSE HALLMARK_INTERFERON_GAMMA_RESPONSE
## HALLMARK_IL6_JAK_STAT3_SIGNALING     HALLMARK_IL6_JAK_STAT3_SIGNALING
##                                                           Description setSize
## HALLMARK_INTERFERON_ALPHA_RESPONSE HALLMARK_INTERFERON_ALPHA_RESPONSE      95
## HALLMARK_INTERFERON_GAMMA_RESPONSE HALLMARK_INTERFERON_GAMMA_RESPONSE     191
## HALLMARK_IL6_JAK_STAT3_SIGNALING     HALLMARK_IL6_JAK_STAT3_SIGNALING      79
##                                    enrichmentScore       NES       pvalue
## HALLMARK_INTERFERON_ALPHA_RESPONSE      -0.6719990 -2.009001 1.823305e-08
## HALLMARK_INTERFERON_GAMMA_RESPONSE      -0.5805272 -1.845527 9.745889e-09
## HALLMARK_IL6_JAK_STAT3_SIGNALING        -0.5758590 -1.679564 2.920490e-04
##                                        p.adjust       qvalue rank
## HALLMARK_INTERFERON_ALPHA_RESPONSE 1.823305e-07 1.228332e-07 3741
## HALLMARK_INTERFERON_GAMMA_RESPONSE 1.218236e-07 8.207064e-08 3741
## HALLMARK_IL6_JAK_STAT3_SIGNALING   1.327495e-03 8.943127e-04 3639
##                                                      leading_edge
## HALLMARK_INTERFERON_ALPHA_RESPONSE tags=60%, list=22%, signal=47%
## HALLMARK_INTERFERON_GAMMA_RESPONSE tags=50%, list=22%, signal=39%
## HALLMARK_IL6_JAK_STAT3_SIGNALING   tags=48%, list=21%, signal=38%
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   core_enrichment
## HALLMARK_INTERFERON_ALPHA_RESPONSE                                                                                                                                                                                                                                               PSME1/TRIM25/IRF9/PSMB8/GBP2/NMI/TRIM21/IRF7/IRF2/IFITM2/CXCL11/IL15/CMPK2/TRAFD1/HLA-C/IRF1/IFIT3/ELF1/IFITM3/PARP14/LY6E/GBP4/PARP9/LPAR6/DHX58/PSMB9/PLSCR1/IL7/OASL/DDX60/STAT2/LGALS3BP/NCOA7/IFIT2/HERC6/IFI35/IFITM1/UBE2L6/GMPR/IFI44L/RSAD2/UBA7/IFIH1/IFI27/BATF2/EPSTI1/SELL/LAMP3/BST2/CD74/CASP1/ISG20/CXCL10/TMEM140/C1S/RTP4/OAS1
## HALLMARK_INTERFERON_GAMMA_RESPONSE PSME1/TRIM25/IL18BP/IRF9/RAPGEF6/PSMB8/RNF213/PSMB10/CCL7/NMI/TRIM21/IRF7/IRF2/SAMHD1/XCL1/STAT3/IFITM2/CXCL11/FCGR1A/CD86/IRF8/IL15/CMPK2/STAT1/TRAFD1/CASP4/METTL7B/APOL6/IRF1/IFIT3/XAF1/TNFAIP2/IFITM3/PARP14/LY6E/GBP4/DHX58/PSMB9/CCL2/PLA2G4A/SOD2/PLSCR1/IL7/OASL/DDX60/STAT2/IL15RA/HLA-G/IL2RB/LGALS3BP/IL6/SOCS1/IFIT2/HERC6/FGL2/HLA-DMA/IFI35/CMKLR1/CD40/UBE2L6/IFI44L/RSAD2/ARID5B/CIITA/IFIH1/IL10RA/IRF5/IFI27/BATF2/TNFAIP6/IFIT1/EPSTI1/SELP/OAS2/CFH/SOCS3/ICAM1/GCH1/BST2/MX2/CD74/HLA-DRB1/CASP1/TNFSF10/ISG20/IRF4/CXCL10/CFB/P2RY14/SECTM1/SERPING1/C1S/C1R/RTP4/VCAM1
## HALLMARK_IL6_JAK_STAT3_SIGNALING                                                                                                                                                                                                                                                                                                                                                                    ITGB3/IRF9/CCL7/STAT3/OSMR/HMOX1/CXCL11/CSF2/DNTT/STAT1/IL17RA/IRF1/IL10RB/ACVRL1/IL7/INHBE/STAT2/IL15RA/IL6/SOCS1/LTB/IL6ST/CD9/TNF/IL12RB1/IL1R2/PIK3R5/MAP3K8/PLA2G2A/SOCS3/IL17RB/IL1R1/CXCL10/IL18R1/TLR2/A2M/CCR1/CSF3R
most_negative_nes_plot <- enrichplot::gseaplot(
  gsea_results,
  geneSetID = "HALLMARK_IL6_JAK_STAT3_SIGNALING",
  title = "IL6_JAK_STAT3_SIGNALING",
  color.line = "#288BA8"
)
most_negative_nes_plot