#GSEA analysis
rm(list = ls())
#The analysis is performed by:
#1. ranking all genes in the data set
#2. identifying the rank positions of all members of the gene set in the ranked data set
#3. calculating an enrichment score (ES) that represents the difference between the observed rankings and 
#   that which would be expected assuming a random rank distribution.
##################################method_1
library(fgsea)
library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------- tidyverse 1.3.0 --
## √ ggplot2 3.3.2     √ purrr   0.3.4
## √ tibble  3.0.3     √ dplyr   1.0.1
## √ tidyr   1.1.1     √ stringr 1.4.0
## √ readr   1.3.1     √ forcats 0.5.0
## -- Conflicts ---------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
####################1.1  Ranking Data
data_gene <- read.csv("gene_list.csv",header = T,stringsAsFactors = F)[,c(7,3)]
dim(data_gene) #[1] 8630   2
## [1] 8630    2
head(data_gene)
##   entrez_id         fdr
## 1      2268 1.87000e-07
## 2      2519 9.18533e-04
## 3     57185 4.88000e-08
## 4      9957 6.01000e-18
## 5      9108 4.61000e-08
## 6       952 1.66000e-11
data_gene_1 <- na.omit(data_gene)
dim(data_gene_1)
## [1] 3912    2
length(unique(data_gene_1$entrez_id)) #[1] 3911
## [1] 3911
data_gene_2 <- aggregate(data_gene_1[2], list(entrez_id = data_gene_1$entrez_id), FUN = mean)
length(unique(data_gene_2$entrez_id)) #[1] 3911
## [1] 3911
head(data_gene_2)
##   entrez_id      fdr
## 1         9 1.41e-05
## 2        15 4.82e-06
## 3        21 1.91e-05
## 4        28 7.38e-13
## 5        34 9.29e-06
## 6        40 2.42e-07
##############
rankData <- data_gene_2$fdr
names(rankData) <- data_gene_2$entrez_id
head(rankData,6)
##        9       15       21       28       34       40 
## 1.41e-05 4.82e-06 1.91e-05 7.38e-13 9.29e-06 2.42e-07
class(rankData) 
## [1] "numeric"
rankData <- sort(rankData,decreasing=F)
#1.2 Load pathways
data_pathway <- read.csv("pathways_BP_database.csv",header = T,stringsAsFactors = F)[,-1]
head(data_pathway)
##   gene_id                            PATHWAY_NAME
## 1    7076 Inhibition of matrix metalloproteinases
## 2    7078 Inhibition of matrix metalloproteinases
## 3    8434 Inhibition of matrix metalloproteinases
## 4    3265 Inhibition of matrix metalloproteinases
## 5    7077 Inhibition of matrix metalloproteinases
## 6    4323 Inhibition of matrix metalloproteinases
dim(data_pathway)  #[1] 93107     2
## [1] 93107     2
length(unique(data_pathway$PATHWAY_NAME)) #[1] 1883
## [1] 1883
#convert list format
pathway_1 <- list()
for (i in 1:length(unique(data_pathway$PATHWAY_NAME))) {
  #i = 1
  #print(i/length(unique(data_pathway$PATHWAY_NAME)))
  data_1 <- data_pathway[data_pathway$PATHWAY_NAME == unique(data_pathway$PATHWAY_NAME)[i],]
  data_1 <- data_1$gene_id
  pathway_1[[i]] <- data_1
  names(pathway_1)[i] <- unique(data_pathway$PATHWAY_NAME)[i]
}
length(pathway_1) #[1] 1883
## [1] 1883
head(pathway_1,2)
## $`Inhibition of matrix metalloproteinases`
## [1] 7076 7078 8434 3265 7077 4323 4313 7079 4318
## 
## $`Adhesion and diapedesis of granulocytes`
##  [1] 3576 3684 5175  727 3458 3689 3552 3383 6402 3384 3683 6404 6403 1440 7124
#1.3 Conduct analysis
fgseaRes <- fgsea(pathways = pathway_1, 
                  stats    = rankData,
                  minSize  = 15,
                  maxSize  = 500,
                  scoreType = "pos")
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (20.3% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
head(fgseaRes[order(pval), ])
##                                            pathway         pval       padj
## 1:            PI3K class IB pathway in neutrophils 7.501648e-05 0.02017943
## 2:            Retrograde endocannabinoid signaling 2.682372e-03 0.28032890
## 3:                             GPCR ligand binding 3.126345e-03 0.28032890
## 4: Gastrin-CREB signaling pathway via PKC and MAPK 5.763040e-03 0.31805278
## 5:                         Carbohydrate metabolism 5.911762e-03 0.31805278
## 6:                            Serotonergic synapse 7.101536e-03 0.31838552
##      log2err        ES      NES size                              leadingEdge
## 1: 0.5384341 0.9682727 1.383565   15                                5579,5588
## 2: 0.4317077 0.8407896 1.231352   26        2893,3763,5579,2570,2559,2568,...
## 3: 0.4317077 0.7545818 1.137751   70       643,2847,1236,7481,2798,117579,...
## 4: 0.4070179 0.7923994 1.175258   39                2847,2798,2784,56413,2922
## 5: 0.4070179 0.8234955 1.207419   27      9951,221914,8509,4958,176,10165,...
## 6: 0.4070179 0.8196702 1.200422   26 3763,5579,283748,100137049,5613,2784,...
#1.4 Enrichment score plot
plotEnrichment(pathway_1[["GPCR ligand binding"]],
               rankData) + labs(title="GPCR ligand binding")

#Or make a table plot for a bunch of selected pathways:

#plot using ggplot2
pathway <-  pathway_1[["GPCR ligand binding"]]
class(pathway)
## [1] "integer"
ranks <- rankData
head(ranks, 20)
##      7409       489      4070      7454    257106     54941    440686    146330 
##  2.12e-41  2.14e-36  2.14e-36  4.06e-32  2.14e-31  1.20e-30  1.31e-30  1.52e-29 
##      2709      3071     11151    126129     53836 100506544     10659     10320 
##  1.70e-29  1.85e-29  2.07e-29  2.07e-29  2.87e-29  2.87e-29  1.85e-28  4.82e-28 
##    388849     83706     10278       165 
##  2.85e-27  3.61e-27  5.48e-27  1.62e-26
stats <- ranks
gseaParam = 1
ticksSize = 0.2
rnk <- rank(-stats)
ord <- order(rnk)
statsAdj <- stats[ord]
statsAdj <- sign(statsAdj) * (abs(statsAdj) ^ gseaParam)
statsAdj <- statsAdj / max(abs(statsAdj))

pathway <- unname(as.vector(na.omit(match(pathway, names(statsAdj)))))
pathway <- sort(pathway)

gseaRes <- calcGseaStat(statsAdj, selectedStats = pathway,
                        returnAllExtremes = TRUE)

bottoms <- gseaRes$bottoms
tops <- gseaRes$tops

n <- length(statsAdj)
xs <- as.vector(rbind(pathway - 1, pathway))
ys <- as.vector(rbind(bottoms, tops))
toPlot <- data.frame(x=c(0, xs, n + 1), y=c(0, ys, 0))
diff <- (max(tops) - min(bottoms)) / 8
# Getting rid of NOTEs
x=y=NULL 
##############################Plot
ggplot(toPlot, aes(x=x, y=y)) +
  geom_point(color="yellowgreen", size=0.1) +
  geom_hline(yintercept=0, colour="gray60") +
  geom_line(color="yellowgreen",size = 1) +
  geom_segment(data=data.frame(x=pathway),
               mapping=aes(x=x, y=-diff/2-0.2,
                           xend=x, yend=diff/2-0.2),
               size=ticksSize, colour="darkblue") + 
  geom_hline(yintercept=-diff/2-0.2, colour="black", linetype="solid") +
  geom_hline(yintercept=diff/2-0.2, colour="black", linetype="solid") +
  theme(panel.background = element_blank(),
        panel.border = element_rect(colour = "black", fill=NA, size=1),
        panel.grid = element_blank(),
        axis.text   = element_text(size= 10, color = "black",family = "sans"),
        axis.title  = element_text(size=10, color = "black",family = "sans")) +
  labs(x="Rank in Ordered Dataset", y="Enrichment score (ES)")+ 
  scale_x_continuous(expand = c(0.01,0.01),) +
  scale_y_continuous(expand = c(0.001,0.001),
                     labels = round(seq(min(bottoms),max(tops)+0.1,0.1),1), 
                     breaks = round(seq(min(bottoms),max(tops)+0.1,0.1),1))+
  annotate(geom="text", x=1500, y= 0.1, color="black",family = "sans",size=4,
           label= paste0("Pval = ",round(fgseaRes[fgseaRes$pathway == "GPCR ligand binding",]$pval[1],3)))+
  annotate(geom="text", x=1500, y=0.2, color="black",family = "sans",size=4,
           label= paste0("FDR = ",round(fgseaRes[fgseaRes$pathway == "GPCR ligand binding",]$padj[1],3)))+
  annotate(geom="text", x=1500, y=0.3, color="black",family = "sans",size=4,
           label= paste0("NES = ",round(fgseaRes[fgseaRes$pathway == "GPCR ligand binding",]$NES[1],3)))

ggsave(paste0(Sys.Date(),fgseaRes[fgseaRes$pathway == "GPCR ligand binding",]$pathway[1],".tiff"), 
       plot = last_plot(), device = NULL, 
       scale = 1, width = 10, height = 10, units ="cm",dpi = 300, limitsize = TRUE,compression = "lzw")

#GSEA table plot
#topPathwaysUp <- fgseaRes[ES > 0][head(order(pval), n=10), pathway]
#topPathwaysDown <- fgseaRes[ES < 0][head(order(pval), n=2),pathway]
#topPathways <- c(topPathwaysUp, rev(topPathwaysDown))
#plotGseaTable(pathway_1[topPathwaysUp], rankData, fgseaRes, 
#              gseaParam= 0.5)