#GSEA analysis
rm(list = ls())
#The analysis is performed by:
#1. ranking all genes in the data set
#2. identifying the rank positions of all members of the gene set in the ranked data set
#3. calculating an enrichment score (ES) that represents the difference between the observed rankings and
# that which would be expected assuming a random rank distribution.
##################################method_1
library(fgsea)
library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------- tidyverse 1.3.0 --
## √ ggplot2 3.3.2 √ purrr 0.3.4
## √ tibble 3.0.3 √ dplyr 1.0.1
## √ tidyr 1.1.1 √ stringr 1.4.0
## √ readr 1.3.1 √ forcats 0.5.0
## -- Conflicts ---------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
####################1.1 Ranking Data
data_gene <- read.csv("gene_list.csv",header = T,stringsAsFactors = F)[,c(7,3)]
dim(data_gene) #[1] 8630 2
## [1] 8630 2
head(data_gene)
## entrez_id fdr
## 1 2268 1.87000e-07
## 2 2519 9.18533e-04
## 3 57185 4.88000e-08
## 4 9957 6.01000e-18
## 5 9108 4.61000e-08
## 6 952 1.66000e-11
data_gene_1 <- na.omit(data_gene)
dim(data_gene_1)
## [1] 3912 2
length(unique(data_gene_1$entrez_id)) #[1] 3911
## [1] 3911
data_gene_2 <- aggregate(data_gene_1[2], list(entrez_id = data_gene_1$entrez_id), FUN = mean)
length(unique(data_gene_2$entrez_id)) #[1] 3911
## [1] 3911
head(data_gene_2)
## entrez_id fdr
## 1 9 1.41e-05
## 2 15 4.82e-06
## 3 21 1.91e-05
## 4 28 7.38e-13
## 5 34 9.29e-06
## 6 40 2.42e-07
##############
rankData <- data_gene_2$fdr
names(rankData) <- data_gene_2$entrez_id
head(rankData,6)
## 9 15 21 28 34 40
## 1.41e-05 4.82e-06 1.91e-05 7.38e-13 9.29e-06 2.42e-07
class(rankData)
## [1] "numeric"
rankData <- sort(rankData,decreasing=F)
#1.2 Load pathways
data_pathway <- read.csv("pathways_BP_database.csv",header = T,stringsAsFactors = F)[,-1]
head(data_pathway)
## gene_id PATHWAY_NAME
## 1 7076 Inhibition of matrix metalloproteinases
## 2 7078 Inhibition of matrix metalloproteinases
## 3 8434 Inhibition of matrix metalloproteinases
## 4 3265 Inhibition of matrix metalloproteinases
## 5 7077 Inhibition of matrix metalloproteinases
## 6 4323 Inhibition of matrix metalloproteinases
dim(data_pathway) #[1] 93107 2
## [1] 93107 2
length(unique(data_pathway$PATHWAY_NAME)) #[1] 1883
## [1] 1883
#convert list format
pathway_1 <- list()
for (i in 1:length(unique(data_pathway$PATHWAY_NAME))) {
#i = 1
#print(i/length(unique(data_pathway$PATHWAY_NAME)))
data_1 <- data_pathway[data_pathway$PATHWAY_NAME == unique(data_pathway$PATHWAY_NAME)[i],]
data_1 <- data_1$gene_id
pathway_1[[i]] <- data_1
names(pathway_1)[i] <- unique(data_pathway$PATHWAY_NAME)[i]
}
length(pathway_1) #[1] 1883
## [1] 1883
head(pathway_1,2)
## $`Inhibition of matrix metalloproteinases`
## [1] 7076 7078 8434 3265 7077 4323 4313 7079 4318
##
## $`Adhesion and diapedesis of granulocytes`
## [1] 3576 3684 5175 727 3458 3689 3552 3383 6402 3384 3683 6404 6403 1440 7124
#1.3 Conduct analysis
fgseaRes <- fgsea(pathways = pathway_1,
stats = rankData,
minSize = 15,
maxSize = 500,
scoreType = "pos")
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (20.3% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
head(fgseaRes[order(pval), ])
## pathway pval padj
## 1: PI3K class IB pathway in neutrophils 7.501648e-05 0.02017943
## 2: Retrograde endocannabinoid signaling 2.682372e-03 0.28032890
## 3: GPCR ligand binding 3.126345e-03 0.28032890
## 4: Gastrin-CREB signaling pathway via PKC and MAPK 5.763040e-03 0.31805278
## 5: Carbohydrate metabolism 5.911762e-03 0.31805278
## 6: Serotonergic synapse 7.101536e-03 0.31838552
## log2err ES NES size leadingEdge
## 1: 0.5384341 0.9682727 1.383565 15 5579,5588
## 2: 0.4317077 0.8407896 1.231352 26 2893,3763,5579,2570,2559,2568,...
## 3: 0.4317077 0.7545818 1.137751 70 643,2847,1236,7481,2798,117579,...
## 4: 0.4070179 0.7923994 1.175258 39 2847,2798,2784,56413,2922
## 5: 0.4070179 0.8234955 1.207419 27 9951,221914,8509,4958,176,10165,...
## 6: 0.4070179 0.8196702 1.200422 26 3763,5579,283748,100137049,5613,2784,...
#1.4 Enrichment score plot
plotEnrichment(pathway_1[["GPCR ligand binding"]],
rankData) + labs(title="GPCR ligand binding")

#Or make a table plot for a bunch of selected pathways:
#plot using ggplot2
pathway <- pathway_1[["GPCR ligand binding"]]
class(pathway)
## [1] "integer"
ranks <- rankData
head(ranks, 20)
## 7409 489 4070 7454 257106 54941 440686 146330
## 2.12e-41 2.14e-36 2.14e-36 4.06e-32 2.14e-31 1.20e-30 1.31e-30 1.52e-29
## 2709 3071 11151 126129 53836 100506544 10659 10320
## 1.70e-29 1.85e-29 2.07e-29 2.07e-29 2.87e-29 2.87e-29 1.85e-28 4.82e-28
## 388849 83706 10278 165
## 2.85e-27 3.61e-27 5.48e-27 1.62e-26
stats <- ranks
gseaParam = 1
ticksSize = 0.2
rnk <- rank(-stats)
ord <- order(rnk)
statsAdj <- stats[ord]
statsAdj <- sign(statsAdj) * (abs(statsAdj) ^ gseaParam)
statsAdj <- statsAdj / max(abs(statsAdj))
pathway <- unname(as.vector(na.omit(match(pathway, names(statsAdj)))))
pathway <- sort(pathway)
gseaRes <- calcGseaStat(statsAdj, selectedStats = pathway,
returnAllExtremes = TRUE)
bottoms <- gseaRes$bottoms
tops <- gseaRes$tops
n <- length(statsAdj)
xs <- as.vector(rbind(pathway - 1, pathway))
ys <- as.vector(rbind(bottoms, tops))
toPlot <- data.frame(x=c(0, xs, n + 1), y=c(0, ys, 0))
diff <- (max(tops) - min(bottoms)) / 8
# Getting rid of NOTEs
x=y=NULL
##############################Plot
ggplot(toPlot, aes(x=x, y=y)) +
geom_point(color="yellowgreen", size=0.1) +
geom_hline(yintercept=0, colour="gray60") +
geom_line(color="yellowgreen",size = 1) +
geom_segment(data=data.frame(x=pathway),
mapping=aes(x=x, y=-diff/2-0.2,
xend=x, yend=diff/2-0.2),
size=ticksSize, colour="darkblue") +
geom_hline(yintercept=-diff/2-0.2, colour="black", linetype="solid") +
geom_hline(yintercept=diff/2-0.2, colour="black", linetype="solid") +
theme(panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=1),
panel.grid = element_blank(),
axis.text = element_text(size= 10, color = "black",family = "sans"),
axis.title = element_text(size=10, color = "black",family = "sans")) +
labs(x="Rank in Ordered Dataset", y="Enrichment score (ES)")+
scale_x_continuous(expand = c(0.01,0.01),) +
scale_y_continuous(expand = c(0.001,0.001),
labels = round(seq(min(bottoms),max(tops)+0.1,0.1),1),
breaks = round(seq(min(bottoms),max(tops)+0.1,0.1),1))+
annotate(geom="text", x=1500, y= 0.1, color="black",family = "sans",size=4,
label= paste0("Pval = ",round(fgseaRes[fgseaRes$pathway == "GPCR ligand binding",]$pval[1],3)))+
annotate(geom="text", x=1500, y=0.2, color="black",family = "sans",size=4,
label= paste0("FDR = ",round(fgseaRes[fgseaRes$pathway == "GPCR ligand binding",]$padj[1],3)))+
annotate(geom="text", x=1500, y=0.3, color="black",family = "sans",size=4,
label= paste0("NES = ",round(fgseaRes[fgseaRes$pathway == "GPCR ligand binding",]$NES[1],3)))

ggsave(paste0(Sys.Date(),fgseaRes[fgseaRes$pathway == "GPCR ligand binding",]$pathway[1],".tiff"),
plot = last_plot(), device = NULL,
scale = 1, width = 10, height = 10, units ="cm",dpi = 300, limitsize = TRUE,compression = "lzw")
#GSEA table plot
#topPathwaysUp <- fgseaRes[ES > 0][head(order(pval), n=10), pathway]
#topPathwaysDown <- fgseaRes[ES < 0][head(order(pval), n=2),pathway]
#topPathways <- c(topPathwaysUp, rev(topPathwaysDown))
#plotGseaTable(pathway_1[topPathwaysUp], rankData, fgseaRes,
# gseaParam= 0.5)