This pipeline is developed for batch processing for GO and KEGG enrichment derived from WGCNA module analysis. Plase check the format of node/module results derived from co-expression network analysis
devtools::install_github('GuangchuangYu/clusterProfiler')
BiocManager::install("Rgraphviz")
BiocManager::install("topGO")
BiocManager::install("clusterProfiler")
BiocManager::install("GOplot")
library(dplyr)
library(ggplot2)
library(reshape2)
library(topGO)
library(plyr)
library(tidyr)
library(scales)
library(clusterProfiler)
library(GOplot)
library(stringr)
### Load genome-wide GO list
geneID2GO <- readMappings("/Users/leon/Documents/Project/Sorghum/0_Genome/Sb_geneid2go.map")
geneNames <- names(geneID2GO)
head(geneNames, 20)
## [1] "Sobic.001G000100" "Sobic.001G000200" "Sobic.001G000400" "Sobic.001G000700"
## [5] "Sobic.001G000700" "Sobic.001G000800" "Sobic.001G000900" "Sobic.001G001000"
## [9] "Sobic.001G001066" "Sobic.001G001132" "Sobic.001G001200" "Sobic.001G001300"
## [13] "Sobic.001G001400" "Sobic.001G001800" "Sobic.001G001900" "Sobic.001G002000"
## [17] "Sobic.001G002200" "Sobic.001G002300" "Sobic.001G002400" "Sobic.001G002500"
#Load sample_list for reading respective phenotype
sample.list <- read.table("/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/7_WGCNA/vsd100/Node/node.list",
sep ="\t", header = F)
sample.list
for (i in 1:nrow(sample.list)){
sample.df <- read.csv(paste0("/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/7_WGCNA/vsd100/Node/",
sample.list[i,1]), header = T)
if(nrow(sample.df) != 0){
assign(sample.list[i,1], sample.df)
}
}
#Create Meta phenotype sheet by combining muliple files
merge_list <- lapply(ls(pattern = "edges"), get)
MOdule_merge <- bind_rows(merge_list)
MOdule_merge <- MOdule_merge[,c(1,2)]
colnames(MOdule_merge) <- c("id","module")
write.csv(MOdule_merge, "/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/7_WGCNA/vsd100/Module_merge.csv", quote = F, row.names = F)
head(MOdule_merge, 30)
### generate the unique list of module information
module_colors <- data.frame(module = unique(MOdule_merge$module))
module_colors <- na.omit(module_colors)
module_colors
#Load sample_list for reading respective phenotype
sample.list <- read.table("/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/16_AME/GO_list",
sep ="\t", header = F)
for (i in 1:nrow(sample.list)){
sample.df <- read.table(paste0("/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/16_AME/AME_peak/",
sample.list[i,1]), header = F)
if(nrow(sample.df) != 0){
assign(sample.list[i,1], sample.df)
}
}
merge_list <- lapply(ls(pattern = "clean_GO"), get)
MOdule_merge <- bind_rows(merge_list)
MOdule_merge <- MOdule_merge[,c(1,2)]
colnames(MOdule_merge) <- c("module","Gene")
MOdule_clean <- MOdule_merge[,c(2,1)]
MOdule_merge <- read.csv("/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/7_WGCNA/vsd100/Module_merge.csv", header = T)
### Load gene function
Gene_Function <- read.csv("/Users/leon/Documents/Project/Sorghum/0_Genome/Sorghun_annotation_info.clean.csv", header = T)
Gene_sub <- data.frame(matrix(ncol = 6, nrow = nrow(MOdule_merge)))
colnames(Gene_sub) <- c("transcriptName", "Best.hit.arabi.name", "arabi.symbol", "arabi.defline", "KO", "GO")
head(Gene_sub)
for (i in 1:nrow(MOdule_merge)){
temp.df <- Gene_Function[Gene_Function$transcriptName == MOdule_merge[i,1],
c("transcriptName", "Best.hit.arabi.name", "arabi.symbol", "arabi.defline", "KO", "GO")]
if(nrow(temp.df) != 0 ){
Gene_sub[i,] <- temp.df
} else {
Gene_sub[i,] = "NA"
}
}
Gene_info <- cbind(MOdule_merge, Gene_sub)
### write to pdf
write.table(Gene_info,
"/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/7_WGCNA/vsd100/Functional_annotation.txt",
row.names = F, quote = F, sep = "\t")
head(Gene_info)
NOTE: Please remove the transcripts isoform information before doing this step
MOdule_clean <- read.csv("/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/7_WGCNA/vsd100/Module_merge_clean.csv", header=T)
colnames(MOdule_clean) <- c("Gene", "module")
head(MOdule_clean,20)
### for AME peak analysis
module_colors <- data.frame(color = unique(MOdule_merge$module))
### for Modification analysis
MOdule_clean <- read.csv("/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/11_MODs_profile/1_MODs_transcripts_GO-enrichemnt_May10-2022.csv", header = T)
### for module enrichement
module_colors <- data.frame(color = unique(MOdule_clean$module))
for (i in 1:nrow(module_colors)){
myInterestingGenes <- MOdule_clean[MOdule_clean$module == module_colors[i,1],1]
geneList <- factor(as.integer(geneNames %in% myInterestingGenes))
names(geneList) <- geneNames
#Build the topGOdata matrix for each ontology categories
GOdata_BP <- new("topGOdata", ontology = "BP", allGenes = geneList,
annot = annFUN.gene2GO, gene2GO = geneID2GO,nodeSize=5)
GOdata_CC <- new("topGOdata", ontology = "CC", allGenes = geneList,
annot = annFUN.gene2GO, gene2GO = geneID2GO,nodeSize=5)
GOdata_MF <- new("topGOdata", ontology = "MF", allGenes = geneList,
annot = annFUN.gene2GO, gene2GO = geneID2GO, nodeSize=5)
#Perform enrichemnt analysis
resultTopGO.BP <- runTest(GOdata_BP, algorithm = "classic", statistic = "fisher")
resultTopGO.CC <- runTest(GOdata_CC, algorithm = "classic", statistic = "fisher" )
resultTopGO.MF <- runTest(GOdata_MF, algorithm = "classic", statistic = "fisher" )
allBPGO = genesInTerm(GOdata_BP)
allCCGO = genesInTerm(GOdata_CC)
allMFGO = genesInTerm(GOdata_MF)
### Load function for extracting genes associated with a GO ID
BP_ANOTATION = lapply(allBPGO,function(x) x[x %in% myInterestingGenes])
CC_ANOTATION = lapply(allCCGO,function(x) x[x %in% myInterestingGenes])
MF_ANOTATION = lapply(allMFGO,function(x) x[x %in% myInterestingGenes])
#Transform data and save into csv
BP_Res <- GenTable(GOdata_BP, classicFisher = resultTopGO.BP, topNodes = 20)
BP_Res$Class = "Biological_Process"
for (x in 1:nrow(BP_Res)){
BP_Res[x,8] <- str_c(BP_ANOTATION[[BP_Res[x,1]]],collapse = ",")
}
CC_Res <- GenTable(GOdata_CC, classicFisher = resultTopGO.CC, topNodes = 20)
CC_Res$Class = "Cellular_component"
for (x in 1:nrow(CC_Res)){
CC_Res[x,8] <- str_c(CC_ANOTATION[[CC_Res[x,1]]],collapse = ",")
}
MF_Res <- GenTable(GOdata_MF, classicFisher = resultTopGO.MF, topNodes = 20)
MF_Res$Class = "Molecular_function"
for (x in 1:nrow(MF_Res)){
MF_Res[x,8] <- str_c(MF_ANOTATION[[MF_Res[x,1]]],collapse = ",")
}
#Combine three types of GO for output
GO_combine <- rbind(MF_Res,CC_Res,BP_Res)
GO_combine$classicFisher <- as.numeric(GO_combine$classicFisher)
### filter enriched term using adjusted P-value
GO_combine <- GO_combine[GO_combine$classicFisher < 0.05 ,]
names(GO_combine)[8] = "GeneList"
write.csv(GO_combine,
paste0("/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/8_GO/vsd100/GO_vsd100_",module_colors[i,1],".csv"), sep ="\t",
quote = F, row.names = F)
#write.csv(GO_combine,
#paste0("/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/16_AME/",module_colors[i,1],"peak_GO.csv"), sep ="\t",
#quote = F, row.names = F)
#write.csv(GO_combine,
#paste0("/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/11_MODs_profile/MODs_GO/",module_colors[i,1],"specific_GO.csv"), sep ="\t",
#quote = F, row.names = F)
}
GO_combine
ggdata <- GO_results[,2:6]
ggdata$Q.value = as.numeric(ggdata$Q.value)
ggdata$Name <- factor(ggdata$Name, levels=ggdata[order(ggdata$GO.Type, -ggdata$Q.value, decreasing = T), ]$Name)
### Plot into barplot
ggplot(ggdata, aes(Name, -log10(Q.value), fill = GO.Type)) +
geom_bar(stat='identity') + coord_flip() +
theme(legend.position = "bottom",axis.title.y = element_blank()) +
theme_bw()
showSigOfNodes(GOdata_MF, score(resultTopGO.MF), firstSigNodes = 3, useInfo = 'all')
K2gene <- read.table("/Users/leon/Documents/Project/Sorghum/0_Genome/Sb_geneid2KEGG.map",header=T,sep="\t")
K2Ko <- read.table("/Users/leon/Documents/Project/Sorghum/0_Genome/Gh_k2ko.map",header=T,sep="\t")
head(K2gene,50)
head(K2Ko,50)
gene2ko=merge(K2gene,K2Ko,by="K")
write.table(gene2ko[,c(3,2)],"/Users/leon/Documents/Project/Sorghum/0_Genome/Sb_gene2ko.tab",row.names = F,sep = "\t")
head(gene2ko,50)
term2gene <- read.table("/Users/leon/Documents/Project/Sorghum/0_Genome/Sb_gene2ko.tab",header=T,sep="\t")
term2name <- read.csv("/Users/leon/Documents/Project/Sorghum/0_Genome/At_k2ko_description.csv", header = T)
head(term2gene,20)
head(term2name,20)
### Perform KEGG enrichment
for (i in 1:nrow(module_colors)){
KEGG_gene <- as.factor(MOdule_clean[MOdule_clean$module == module_colors[i,1],1])
KEGG <- enricher(KEGG_gene,TERM2GENE=term2gene,TERM2NAME=term2name,
pAdjustMethod = "bonferroni")
write.csv(na.omit(data.frame(KEGG)),
paste0("/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/9_KEGG/vsd100/KEGG_vsd100",module_colors[i,1],".csv"), row.names = F)
}
example <- data.frame(KEGG)
example