library(readxl)
library(clusterProfiler)
library(gtools)
library(xlsx)
library(org.Hs.eg.db)
library(enrichplot)
library(ggplot2)
library(stringr)
library(tibble)
library(ggupset)
library(DOSE)
getwd();
## [1] "/Users/zhuofanmou/All documents/UOE/PhD work/my project/raw_CEL_analysis/workspace"
working_directory = "~/All documents/UOE/PhD work/my project/raw_CEL_analysis/workspace";
setwd(working_directory);
input_table_PC = load(file = 'table_PC.RData')
input_table_PC
## [1] "table_PC"
DEG=table_PC
logFC_cutoff <- log2(1.5) #LogFC Threshold
DEG$change = as.factor(ifelse(DEG$adj.P.Val < 0.1 & abs(DEG$logFC) > logFC_cutoff,
ifelse(DEG$logFC > logFC_cutoff ,'UP','DOWN'),'NOT') #!if logFC>cutoff+P<0.05->UP, if not then abs(lofFC)>cutoff+P<0.05->down, else->NOT;
)
this_tile <- paste0('Cutoff for logFC is ',round(logFC_cutoff,3),
'\nThe number of up gene is ',nrow(DEG[DEG$change =='UP',]) ,
'\nThe number of down gene is ',nrow(DEG[DEG$change =='DOWN',])
)
this_tile
## [1] "Cutoff for logFC is 0.585\nThe number of up gene is 47\nThe number of down gene is 2"
head(DEG)
g = ggplot(data=DEG, aes(x=logFC, y=-log10(adj.P.Val), color=change)) +
geom_point(alpha=0.4, size=1.75) +
theme_set(theme_set(theme_bw(base_size=20)))+
xlab("log2 fold change") + ylab("-log10 FDR corrected p-value") +
ggtitle( this_tile ) + theme(plot.title = element_text(size=15,hjust = 0.5))+
scale_colour_manual(values = c('blue','black','red')) ## corresponding to the levels(res$change)
print(g)
table(DEG$change)
##
## DOWN NOT UP
## 2 22116 47
head(DEG)
write.xlsx(DEG, file = "Prostate_cancer_DGE.xlsx", sheetName = "DEG_result_22165", append = FALSE)
# transfer from gene symbol to EntrzID
gene_for_up_down_set_PC <- bitr(unique(DEG$SYMBOL), fromType = "SYMBOL",
toType = c("ENTREZID"),
OrgDb = org.Hs.eg.db) #19949 by EntrezID only and 47 DE genes; 20912 by c(EntrezID and Ensembl) and 53 DE genes
## 'select()' returned 1:many mapping between keys and columns
## Warning in bitr(unique(DEG$SYMBOL), fromType = "SYMBOL", toType =
## c("ENTREZID"), : 10.01% of input gene IDs are fail to map...
head(gene_for_up_down_set_PC)
dim(gene_for_up_down_set_PC)
## [1] 19949 2
head(DEG)
dim(DEG)
## [1] 22165 11
DEG_for_up_down_set_PC=merge(DEG,gene_for_up_down_set_PC,by.y='SYMBOL',by.x='SYMBOL') #merge symbol with entrezID
head(DEG_for_up_down_set_PC)
dim(DEG_for_up_down_set_PC)
## [1] 19949 12
table(DEG_for_up_down_set_PC$change)
##
## DOWN NOT UP
## 2 19902 45
gene_up_PC= DEG_for_up_down_set_PC[DEG_for_up_down_set_PC$change == 'UP','ENTREZID'] #up regulated geneset, denoted by EntrezID
gene_down_PC=DEG_for_up_down_set_PC[DEG_for_up_down_set_PC$change == 'DOWN','ENTREZID'] #down regulated geneset, denoted by EntrezID
gene_diff_PC=c(gene_up_PC,gene_down_PC) #upgene+downgene
length(gene_diff_PC)
## [1] 47
ids_updown_PC = DEG_for_up_down_set_PC$ENTREZID %in% gene_diff_PC
gene_diff_list_PC= subset(DEG_for_up_down_set_PC, ids_updown_PC)
dim(gene_diff_list_PC)
## [1] 47 12
#write.xlsx(gene_diff_list_PC, file = "Prostate_cancer_DGE.xlsx", sheetName = "diff_genesONLY", append = TRUE, row.names = FALSE) # MUST be TRUE otherwise will update the previous excel
gene_PC = gene_diff_list_PC # DE genes (no duplicated IDs)
# gene.df_PC = bitr(gene_PC$SYMBOL, fromType = "SYMBOL",
# toType = c("ENSEMBL", "ENTREZID"),
# OrgDb = org.Hs.eg.db) # 53; many mapping on the ENSEMBL
# head(gene.df_PC)
# dim(gene.df_PC)
gene_PC = gene_PC$ENTREZID
ggo_BP_PC = groupGO(gene = gene_PC,
OrgDb = org.Hs.eg.db,
ont = "BP", # BP,CC,MF
level = 3, # level 3
readable = TRUE)
ggo_CC_PC = groupGO(gene = gene_PC,
OrgDb = org.Hs.eg.db,
ont = "CC", # BP,CC,MF
level = 3, # level 3
readable = TRUE)
ggo_MF_PC = groupGO(gene = gene_PC,
OrgDb = org.Hs.eg.db,
ont = "MF", # BP,CC,MF
level = 3, # level 3
readable = TRUE)
head(ggo_BP_PC)
head(ggo_CC_PC)
head(ggo_MF_PC)
# write.xlsx(ggo_BP_PC@result, file = "GO_enrichment_results_PC.xlsx", sheetName = "ggo_BP_PC", append = FALSE)
# write.xlsx(ggo_CC_PC@result, file = "GO_enrichment_results_PC.xlsx", sheetName = "ggo_CC_PC", append = TRUE)
# write.xlsx(ggo_MF_PC@result, file = "GO_enrichment_results_PC.xlsx", sheetName = "ggo_MF_PC", append = TRUE)
We apply the over-representation analysis on the total of \(\color{blue}{\text{47}}\) deferentially expressed genes.
ego_ORA_PC <- enrichGO(gene = gene_PC,
OrgDb = org.Hs.eg.db,
keyType = 'ENTREZID', #specify the 'keytype' to the input ID type
ont = "ALL",
pAdjustMethod = "BH",
pvalueCutoff = 0.01,
qvalueCutoff = 0.05)
ego_ORA_PC <- setReadable(ego_ORA_PC, OrgDb = org.Hs.eg.db) # convert EntrezID to gene symbol
ego_ORA_90cutoff <- enrichGO(gene = gene_PC,
OrgDb = org.Hs.eg.db,
keyType = 'ENTREZID',
ont = "ALL",
pAdjustMethod = "BH",
pvalueCutoff = 0.9,
qvalueCutoff = 0.9)
ego_ORA_90cutoff <- setReadable(ego_ORA_90cutoff, OrgDb = org.Hs.eg.db) # convert EntrezID to gene symbol; either add 'ENTRZID'
head(ego_ORA_PC)
head(ego_ORA_90cutoff)
# write.xlsx(ego_ORA_PC@result, file = "GO_enrichment_results_PC.xlsx", sheetName = "ego_ORA_ALL_PC", append = TRUE)
# write.xlsx(ego_ORA_90cutoff@result, file = "GO_enrichment_results_PC.xlsx", sheetName = "ego_ORA_90cutoff_ALL_PC", append = TRUE)
ORA approach is useful to find genes where the difference is large, but it will not detect a situation where the difference is small, but evidenced in coordinated way in a set of related genes. Gene Set Enrichment Analysis (GSEA)(Subramanian et al. 2005) directly addresses this limitation. All genes can be used in GSEA; GSEA aggregates the per gene statistics across genes within a gene set, therefore making it possible to detect situations where all genes in a predefined set change in a small but coordinated way. Thus, we will apply the GSEA on the overall \(\color{blue}{\text{19949}}\) genes which have the unique EntrezID. Note that there is no duplicated gene IDs as we have already collapsed during the pre-processing step. (\(\color{red}{\text{Q:beneficial?}}\))
dframe_GO_PC = DEG_for_up_down_set_PC[order(DEG_for_up_down_set_PC$logFC, decreasing = TRUE), c(12,5)] # we re-order the dataframe in decreasing order based on the log2FC and then only include columns: ENTREZ and logFC
# we MUST creat a numeric vector as geneList
original_geneList_PC <- dframe_GO_PC$logFC # log2FC
names(original_geneList_PC) <- dframe_GO_PC$ENTREZID # (row)name the vector
# omit any NA values
geneList_PC <-na.omit(original_geneList_PC) #should have any NA here
geneList_PC = sort(geneList_PC, decreasing = TRUE) # sort the list in decreasing order (required for clusterProfiler); final geneset list; NO need actually
# GO GSEA
ego_GSEA_PC <- gseGO(geneList = geneList_PC,
OrgDb = org.Hs.eg.db,
ont = "ALL",
nPerm = 1000,
minGSSize = 100,
maxGSSize = 500,
pvalueCutoff = 0.05,
verbose = FALSE)
ego_GSEA_PC <- setReadable(ego_GSEA_PC, OrgDb = org.Hs.eg.db) # convert EntrezID to gene symbol; either add 'ENTRZID'
head(ego_GSEA_PC)
#write.xlsx(ego_GSEA_PC@result, file = "GO_enrichment_results_PC.xlsx", sheetName = "ego_GSEA_ALL_PC", append = TRUE)
# ego_GSEA_PC2 <- gseGO(geneList = geneList_PC,
# OrgDb = org.Hs.eg.db,
# keyType = "ENTREZID",
# ont = "ALL",
# nPerm = 1000,
# minGSSize = 10,
# maxGSSize = 500,
# pvalueCutoff = 0.05,
# pAdjustMethod = "none",
# verbose = FALSE )
NCG_GSEA_PC = gseNCG(geneList_PC, nPerm=10000)
We show some possible plots that can be produced from the enrichment results we obtained.
Bar plot visualises gene count or ratio as bar height and colored by enrichment scores such as adjusted p values.
barplot(ego_ORA_PC, split = "ONTOLOGY")+facet_grid(ONTOLOGY~., scale = "free")+ggtitle("Barplot for GO_ORA_PC")
barplot(ego_ORA_90cutoff, split = "ONTOLOGY")+facet_grid(ONTOLOGY~., scale = "free")+ggtitle("Barplot for GO_ORA_90cuttoff_PC")
Barplot
Barplot_90cutoff
Similar to bar plot, it visualises pre-defined number of GO enriched terms simultaneously and the size of the ‘bubble’ depends on the number of genes in that GO term.
dotplot(ego_ORA_PC, showCategory=15) + ggtitle("Dotplot for GO_ORA_PC")
dotplot(ego_ORA_90cutoff, showCategory=15) + ggtitle("Dotplot for GO_ORA_90cutoff_PC")
Dotplot
Dotplot_90cutoff
Enrichment map organises enriched terms into a network with edges weighted by the ratio of overlapping gene sets. Mutually overlapping gene sets are tending to cluster together, making it easy to identify functional modules.
# par(mfrow=c(1,2))
# emapplot(ego2_3rd)+ ggtitle("Enrichment map for GO_ORA_3rd")
#emapplot(ego_ORA_PC,layout="kk")+ ggtitle("Enrichment map for GO_ORA_PC")
emapplot(ego_ORA_90cutoff,layout="kk")+ ggtitle("Enrichment map for GO_ORA_90cutoffPC")
# emapplot(ego2_2nd_90cutoff,layout="nicely", showCategory = 20)+ ggtitle("Enrichment map for GO_ORA_3rd_90cutoff")
Enrichmap
Gene-concept network shows the linkage of genes and corresponding biological concepts as a network. ‘clusterProfiler’ provides also a circular version of the network.
par(mfrow=c(1,2))
# cnetplot(ego2_3rd_90cutoff) + ggtitle("Gene-Concept Network for GO_ORA_3rd_90cutoff")
cnetplot(ego_ORA_90cutoff,foldChange=geneList_PC, showCategory =15 ) + ggtitle("Gene-Concept Network for GO_ORA_90cutoff_PC") # with log2fc
cnetplot(ego_ORA_90cutoff,foldChange=geneList_PC, circular = TRUE, colorEdge = TRUE, showCategory = 15) + ggtitle("Gene-Concept Network for GO_ORA_90cutoff_PC")#with log2fc
# cnetplot(ego3_GSEA, foldChange=geneList_sheet2, showCategory = 10) + ggtitle("Gene-Concept Network for GO_GSEA")
#
# cnetplot(ego2_1st_90cutoff, circular = TRUE, colorEdge = TRUE, showCategory = 10)
# cnetplot(ego2_1st_90cutoff, foldChange=geneList_sheet2, circular = TRUE, colorEdge = TRUE, showCategory = 10) + ggtitle("Gene-Concept Network for GO_ORA_1st_90cutoff")#with log2fc
# cnetplot(ego3_GSEA, foldChange=geneList_sheet2, circular = TRUE, colorEdge = TRUE, showCategory = 3)
GCN_circle
Heatmap can be used to visualise the relationship between selected genes and corresponding biological concepts.
heatplot(ego_ORA_90cutoff, foldChange=geneList_PC) + ggtitle("Heatmap for GO_ORA_90cutoff_PC")
Heatmap
The upsetplot is an alternative to cnetplot for visualizing the complex association between genes and gene sets. It emphasizes the gene overlapping among different gene sets.
upsetplot(ego_ORA_90cutoff)+ ggtitle("Upsetplot for GO_ORA_90cutoff_PC")
Upset
Ridge line plot is used for showing the expression distribution of GSEA result \(\color{red}{\text{works only on gsea results}}\)
# ridge plot works only on gsea results
ridgeplot(ego_GSEA_PC)+ ggtitle("Ridge for GO_GSEA_PC")
ridgeplot(NCG_GSEA_PC)+ ggtitle("Ridge for NCG_GSEA_PC")
## Picking joint bandwidth of 0.121
ridge_NCG
Some classic running score and pre-ranked list of GSEA result for enriched terms (‘mitotic sister chromatid segregation’ as an example here)
gseaplot(ego_GSEA_PC, geneSetID = 1, title = ego_GSEA_PC$Description[1])
gseaplot2(ego_GSEA_PC,pvalue_table = TRUE, geneSetID = 1:3)
GSEA 1st gene set
GSEA plot for the first three gene sets