GO analysis

libraries

library(readxl)
library(clusterProfiler)
library(gtools)
library(xlsx)
library(org.Hs.eg.db)
library(enrichplot)
library(ggplot2)
library(stringr)
library(tibble)
library(ggupset)
library(DOSE)

Working directory

getwd();
## [1] "/Users/zhuofanmou/All documents/UOE/PhD work/my project/raw_CEL_analysis/workspace"
working_directory = "~/All documents/UOE/PhD work/my project/raw_CEL_analysis/workspace";   
setwd(working_directory);

Importing previous DGE result and information

input_table_PC = load(file = 'table_PC.RData')
input_table_PC
## [1] "table_PC"

volcano plot

DEG=table_PC
logFC_cutoff <- log2(1.5)  #LogFC Threshold
DEG$change = as.factor(ifelse(DEG$adj.P.Val < 0.1 & abs(DEG$logFC) > logFC_cutoff,
                              ifelse(DEG$logFC > logFC_cutoff ,'UP','DOWN'),'NOT')     #!if logFC>cutoff+P<0.05->UP, if not then abs(lofFC)>cutoff+P<0.05->down, else->NOT;
)
this_tile <- paste0('Cutoff for logFC is ',round(logFC_cutoff,3),
                    '\nThe number of up gene is ',nrow(DEG[DEG$change =='UP',]) ,
                    '\nThe number of down gene is ',nrow(DEG[DEG$change =='DOWN',])
)
this_tile
## [1] "Cutoff for logFC is 0.585\nThe number of up gene is 47\nThe number of down gene is 2"
head(DEG)
g = ggplot(data=DEG, aes(x=logFC, y=-log10(adj.P.Val), color=change)) + 
  geom_point(alpha=0.4, size=1.75) + 
  theme_set(theme_set(theme_bw(base_size=20)))+
  xlab("log2 fold change") + ylab("-log10 FDR corrected p-value") +
  ggtitle( this_tile  ) + theme(plot.title = element_text(size=15,hjust = 0.5))+
  scale_colour_manual(values = c('blue','black','red'))  ## corresponding to the levels(res$change)
print(g)

table(DEG$change)
## 
##  DOWN   NOT    UP 
##     2 22116    47

DGE result

head(DEG)
write.xlsx(DEG, file = "Prostate_cancer_DGE.xlsx", sheetName = "DEG_result_22165", append = FALSE)

Extracting differentially expressed genes

# transfer from gene symbol to EntrzID
gene_for_up_down_set_PC <- bitr(unique(DEG$SYMBOL), fromType = "SYMBOL",
                                      toType = c("ENTREZID"),
                                      OrgDb = org.Hs.eg.db)             #19949 by EntrezID only and 47 DE genes; 20912 by c(EntrezID and Ensembl) and 53 DE genes
## 'select()' returned 1:many mapping between keys and columns
## Warning in bitr(unique(DEG$SYMBOL), fromType = "SYMBOL", toType =
## c("ENTREZID"), : 10.01% of input gene IDs are fail to map...
head(gene_for_up_down_set_PC)
dim(gene_for_up_down_set_PC)
## [1] 19949     2
head(DEG)
dim(DEG)
## [1] 22165    11
DEG_for_up_down_set_PC=merge(DEG,gene_for_up_down_set_PC,by.y='SYMBOL',by.x='SYMBOL') #merge symbol with entrezID
head(DEG_for_up_down_set_PC)
dim(DEG_for_up_down_set_PC)
## [1] 19949    12
table(DEG_for_up_down_set_PC$change)
## 
##  DOWN   NOT    UP 
##     2 19902    45
gene_up_PC= DEG_for_up_down_set_PC[DEG_for_up_down_set_PC$change == 'UP','ENTREZID']      #up regulated geneset, denoted by EntrezID
gene_down_PC=DEG_for_up_down_set_PC[DEG_for_up_down_set_PC$change == 'DOWN','ENTREZID']   #down regulated geneset, denoted by EntrezID
gene_diff_PC=c(gene_up_PC,gene_down_PC)   #upgene+downgene
length(gene_diff_PC)
## [1] 47
ids_updown_PC = DEG_for_up_down_set_PC$ENTREZID %in% gene_diff_PC
gene_diff_list_PC= subset(DEG_for_up_down_set_PC, ids_updown_PC)
dim(gene_diff_list_PC)
## [1] 47 12
#write.xlsx(gene_diff_list_PC, file = "Prostate_cancer_DGE.xlsx", sheetName = "diff_genesONLY", append = TRUE, row.names = FALSE) # MUST be TRUE otherwise will update the previous excel

GO classification

gene_PC = gene_diff_list_PC   # DE genes (no duplicated IDs)
# gene.df_PC = bitr(gene_PC$SYMBOL, fromType = "SYMBOL",
#         toType = c("ENSEMBL", "ENTREZID"),
#         OrgDb = org.Hs.eg.db)   # 53; many mapping on the ENSEMBL
# head(gene.df_PC)
# dim(gene.df_PC)

gene_PC = gene_PC$ENTREZID

ggo_BP_PC = groupGO(gene     = gene_PC,
               OrgDb    = org.Hs.eg.db,
               ont      = "BP",      # BP,CC,MF
               level    = 3,         # level 3
               readable = TRUE)
ggo_CC_PC = groupGO(gene     = gene_PC,
               OrgDb    = org.Hs.eg.db,
               ont      = "CC",      # BP,CC,MF
               level    = 3,         # level 3
               readable = TRUE)
ggo_MF_PC = groupGO(gene     = gene_PC,
               OrgDb    = org.Hs.eg.db,
               ont      = "MF",      # BP,CC,MF
               level    = 3,         # level 3
               readable = TRUE)


head(ggo_BP_PC)
head(ggo_CC_PC)
head(ggo_MF_PC)
# write.xlsx(ggo_BP_PC@result, file = "GO_enrichment_results_PC.xlsx", sheetName = "ggo_BP_PC", append = FALSE)
# write.xlsx(ggo_CC_PC@result, file = "GO_enrichment_results_PC.xlsx", sheetName = "ggo_CC_PC", append = TRUE)
# write.xlsx(ggo_MF_PC@result, file = "GO_enrichment_results_PC.xlsx", sheetName = "ggo_MF_PC", append = TRUE)

GO over-representation analysis

We apply the over-representation analysis on the total of \(\color{blue}{\text{47}}\) deferentially expressed genes.

ego_ORA_PC <- enrichGO(gene     = gene_PC,
                OrgDb         = org.Hs.eg.db,
                keyType       = 'ENTREZID',  #specify the 'keytype' to the input ID type
                ont           = "ALL",
                pAdjustMethod = "BH",
                pvalueCutoff  = 0.01,
                qvalueCutoff  = 0.05)
ego_ORA_PC <- setReadable(ego_ORA_PC, OrgDb = org.Hs.eg.db) # convert EntrezID to gene symbol

ego_ORA_90cutoff <- enrichGO(gene     = gene_PC,
                OrgDb         = org.Hs.eg.db,
                keyType       = 'ENTREZID',
                ont           = "ALL",
                pAdjustMethod = "BH",
                pvalueCutoff  = 0.9,
                qvalueCutoff  = 0.9)
ego_ORA_90cutoff <- setReadable(ego_ORA_90cutoff, OrgDb = org.Hs.eg.db) # convert EntrezID to gene symbol; either add 'ENTRZID'
head(ego_ORA_PC)
head(ego_ORA_90cutoff)
# write.xlsx(ego_ORA_PC@result, file = "GO_enrichment_results_PC.xlsx", sheetName = "ego_ORA_ALL_PC", append = TRUE)
# write.xlsx(ego_ORA_90cutoff@result, file = "GO_enrichment_results_PC.xlsx", sheetName = "ego_ORA_90cutoff_ALL_PC", append = TRUE)

GO GSEA on the total input gene (with unique EntrezID)

ORA approach is useful to find genes where the difference is large, but it will not detect a situation where the difference is small, but evidenced in coordinated way in a set of related genes. Gene Set Enrichment Analysis (GSEA)(Subramanian et al. 2005) directly addresses this limitation. All genes can be used in GSEA; GSEA aggregates the per gene statistics across genes within a gene set, therefore making it possible to detect situations where all genes in a predefined set change in a small but coordinated way. Thus, we will apply the GSEA on the overall \(\color{blue}{\text{19949}}\) genes which have the unique EntrezID. Note that there is no duplicated gene IDs as we have already collapsed during the pre-processing step. (\(\color{red}{\text{Q:beneficial?}}\))

dframe_GO_PC = DEG_for_up_down_set_PC[order(DEG_for_up_down_set_PC$logFC, decreasing = TRUE), c(12,5)]  # we re-order the dataframe in decreasing order based on the log2FC and then only include columns: ENTREZ and logFC


# we MUST creat a numeric vector as geneList
original_geneList_PC <- dframe_GO_PC$logFC   # log2FC

names(original_geneList_PC) <- dframe_GO_PC$ENTREZID # (row)name the vector
# omit any NA values 
geneList_PC <-na.omit(original_geneList_PC)  #should have any NA here

geneList_PC = sort(geneList_PC, decreasing = TRUE) # sort the list in decreasing order (required for clusterProfiler); final geneset list; NO need actually

# GO GSEA
 ego_GSEA_PC <- gseGO(geneList     = geneList_PC,
              OrgDb        = org.Hs.eg.db,
              ont          = "ALL",
              nPerm        = 1000,
              minGSSize    = 100,
              maxGSSize    = 500,
              pvalueCutoff = 0.05,
              verbose      = FALSE)
ego_GSEA_PC <- setReadable(ego_GSEA_PC, OrgDb = org.Hs.eg.db) # convert EntrezID to gene symbol; either add 'ENTRZID'
head(ego_GSEA_PC)
#write.xlsx(ego_GSEA_PC@result, file = "GO_enrichment_results_PC.xlsx", sheetName = "ego_GSEA_ALL_PC", append = TRUE)

# ego_GSEA_PC2 <- gseGO(geneList     = geneList_PC,
#               OrgDb        = org.Hs.eg.db,
#               keyType = "ENTREZID",
#               ont          = "ALL",
#               nPerm        = 1000,
#               minGSSize    = 10,
#               maxGSSize    = 500,
#               pvalueCutoff = 0.05,
#               pAdjustMethod = "none",
#               verbose      = FALSE )

NCG_GSEA_PC = gseNCG(geneList_PC, nPerm=10000)

Visulisation

We show some possible plots that can be produced from the enrichment results we obtained.

Barplot

Bar plot visualises gene count or ratio as bar height and colored by enrichment scores such as adjusted p values.

barplot(ego_ORA_PC, split = "ONTOLOGY")+facet_grid(ONTOLOGY~., scale = "free")+ggtitle("Barplot for GO_ORA_PC")
barplot(ego_ORA_90cutoff, split = "ONTOLOGY")+facet_grid(ONTOLOGY~., scale = "free")+ggtitle("Barplot for GO_ORA_90cuttoff_PC")

Barplot

Barplot_90cutoff

Dotplot

Similar to bar plot, it visualises pre-defined number of GO enriched terms simultaneously and the size of the ‘bubble’ depends on the number of genes in that GO term.

dotplot(ego_ORA_PC, showCategory=15) + ggtitle("Dotplot for GO_ORA_PC")
dotplot(ego_ORA_90cutoff, showCategory=15) + ggtitle("Dotplot for GO_ORA_90cutoff_PC")

Dotplot

Dotplot_90cutoff

Enrichmap

Enrichment map organises enriched terms into a network with edges weighted by the ratio of overlapping gene sets. Mutually overlapping gene sets are tending to cluster together, making it easy to identify functional modules.

# par(mfrow=c(1,2))
# emapplot(ego2_3rd)+ ggtitle("Enrichment map for GO_ORA_3rd")
#emapplot(ego_ORA_PC,layout="kk")+ ggtitle("Enrichment map for GO_ORA_PC")
emapplot(ego_ORA_90cutoff,layout="kk")+ ggtitle("Enrichment map for GO_ORA_90cutoffPC")
# emapplot(ego2_2nd_90cutoff,layout="nicely", showCategory = 20)+ ggtitle("Enrichment map for GO_ORA_3rd_90cutoff")

Enrichmap

GCN

Gene-concept network shows the linkage of genes and corresponding biological concepts as a network. ‘clusterProfiler’ provides also a circular version of the network.

par(mfrow=c(1,2))

# cnetplot(ego2_3rd_90cutoff) + ggtitle("Gene-Concept Network for GO_ORA_3rd_90cutoff")
cnetplot(ego_ORA_90cutoff,foldChange=geneList_PC, showCategory =15 ) + ggtitle("Gene-Concept Network for GO_ORA_90cutoff_PC") # with log2fc
cnetplot(ego_ORA_90cutoff,foldChange=geneList_PC, circular = TRUE, colorEdge = TRUE, showCategory = 15) + ggtitle("Gene-Concept Network for GO_ORA_90cutoff_PC")#with log2fc
# cnetplot(ego3_GSEA, foldChange=geneList_sheet2, showCategory = 10) + ggtitle("Gene-Concept Network for GO_GSEA")
# 
# cnetplot(ego2_1st_90cutoff, circular = TRUE, colorEdge = TRUE, showCategory = 10)
# cnetplot(ego2_1st_90cutoff, foldChange=geneList_sheet2, circular = TRUE, colorEdge = TRUE, showCategory = 10) + ggtitle("Gene-Concept Network for GO_ORA_1st_90cutoff")#with log2fc
# cnetplot(ego3_GSEA, foldChange=geneList_sheet2, circular = TRUE, colorEdge = TRUE, showCategory = 3)

GCN_circle

Heatmap

Heatmap can be used to visualise the relationship between selected genes and corresponding biological concepts.

heatplot(ego_ORA_90cutoff, foldChange=geneList_PC) + ggtitle("Heatmap for GO_ORA_90cutoff_PC")

Heatmap

Upsetplot

The upsetplot is an alternative to cnetplot for visualizing the complex association between genes and gene sets. It emphasizes the gene overlapping among different gene sets.

upsetplot(ego_ORA_90cutoff)+ ggtitle("Upsetplot for GO_ORA_90cutoff_PC")

Upset

Ridgeplot

Ridge line plot is used for showing the expression distribution of GSEA result \(\color{red}{\text{works only on gsea results}}\)

# ridge plot works only on gsea results
ridgeplot(ego_GSEA_PC)+ ggtitle("Ridge for GO_GSEA_PC")
ridgeplot(NCG_GSEA_PC)+ ggtitle("Ridge for NCG_GSEA_PC")
## Picking joint bandwidth of 0.121

ridge_NCG

GSEA

Some classic running score and pre-ranked list of GSEA result for enriched terms (‘mitotic sister chromatid segregation’ as an example here)

gseaplot(ego_GSEA_PC, geneSetID = 1, title = ego_GSEA_PC$Description[1])
gseaplot2(ego_GSEA_PC,pvalue_table = TRUE, geneSetID = 1:3)

GSEA 1st gene set

GSEA plot for the first three gene sets

Reference

http://yulab-smu.top/clusterProfiler-book/chapter5.html https://learn.gencore.bio.nyu.edu/rna-seq-analysis/gene-set-enrichment-analysis/