This pipeline is developed for batch processing for GO and KEGG enrichment derived from WGCNA module analysis. Plase check the format of node/module results derived from co-expression network analysis

Load packages

devtools::install_github('GuangchuangYu/clusterProfiler')
BiocManager::install("Rgraphviz")
BiocManager::install("topGO")
BiocManager::install("clusterProfiler")
BiocManager::install("GOplot")
library(dplyr)
library(ggplot2)
library(reshape2)
library(topGO)
library(plyr)
library(tidyr)
library(scales)
library(clusterProfiler)
library(GOplot)
library(stringr)

Load gene annocation information

### Load genome-wide GO list
geneID2GO <- readMappings("/Users/leon/Documents/Project/Sorghum/0_Genome/Sb_geneid2go.map")
geneNames <- names(geneID2GO)
head(geneNames, 20)
##  [1] "Sobic.001G000100" "Sobic.001G000200" "Sobic.001G000400" "Sobic.001G000700"
##  [5] "Sobic.001G000700" "Sobic.001G000800" "Sobic.001G000900" "Sobic.001G001000"
##  [9] "Sobic.001G001066" "Sobic.001G001132" "Sobic.001G001200" "Sobic.001G001300"
## [13] "Sobic.001G001400" "Sobic.001G001800" "Sobic.001G001900" "Sobic.001G002000"
## [17] "Sobic.001G002200" "Sobic.001G002300" "Sobic.001G002400" "Sobic.001G002500"

Load sample from WGCNA network

#Load sample_list for reading respective phenotype
sample.list <- read.table("/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/7_WGCNA/vsd100/Node/node.list", 
                          sep ="\t", header = F)
sample.list 
for (i in 1:nrow(sample.list)){
  sample.df <- read.csv(paste0("/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/7_WGCNA/vsd100/Node/",
                               sample.list[i,1]), header = T)
  if(nrow(sample.df) != 0){
    assign(sample.list[i,1], sample.df)
  } 
}
#Create Meta phenotype sheet by combining muliple files
merge_list <- lapply(ls(pattern = "edges"), get)
MOdule_merge <- bind_rows(merge_list)
MOdule_merge <- MOdule_merge[,c(1,2)]
colnames(MOdule_merge) <- c("id","module")

write.csv(MOdule_merge, "/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/7_WGCNA/vsd100/Module_merge.csv", quote = F, row.names = F)
head(MOdule_merge, 30)
### generate the unique list of module information
module_colors <- data.frame(module = unique(MOdule_merge$module))
module_colors <- na.omit(module_colors)
module_colors

Load sample for AME targeted downstream genes (OPTIONAL WITH THE STEP ABOVE)

#Load sample_list for reading respective phenotype
sample.list <- read.table("/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/16_AME/GO_list", 
                          sep ="\t", header = F)

for (i in 1:nrow(sample.list)){
  sample.df <- read.table(paste0("/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/16_AME/AME_peak/",
                               sample.list[i,1]), header = F)
  if(nrow(sample.df) != 0){
    assign(sample.list[i,1], sample.df)
  } 
}

merge_list <- lapply(ls(pattern = "clean_GO"), get)
MOdule_merge <- bind_rows(merge_list)
MOdule_merge <- MOdule_merge[,c(1,2)]
colnames(MOdule_merge) <- c("module","Gene")


MOdule_clean <- MOdule_merge[,c(2,1)]

Wrap gene function information to each module

MOdule_merge <- read.csv("/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/7_WGCNA/vsd100/Module_merge.csv", header = T)
### Load gene function
Gene_Function <- read.csv("/Users/leon/Documents/Project/Sorghum/0_Genome/Sorghun_annotation_info.clean.csv", header = T)
Gene_sub <- data.frame(matrix(ncol = 6, nrow = nrow(MOdule_merge)))
colnames(Gene_sub) <- c("transcriptName", "Best.hit.arabi.name", "arabi.symbol", "arabi.defline", "KO", "GO")
head(Gene_sub)

for (i in 1:nrow(MOdule_merge)){
        temp.df <- Gene_Function[Gene_Function$transcriptName == MOdule_merge[i,1],
                c("transcriptName", "Best.hit.arabi.name", "arabi.symbol", "arabi.defline", "KO", "GO")]
        if(nrow(temp.df) != 0 ){
           Gene_sub[i,] <- temp.df
        } else {
        Gene_sub[i,] = "NA"
        }
}

Gene_info <- cbind(MOdule_merge, Gene_sub)
### write to pdf
write.table(Gene_info, 
          "/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/7_WGCNA/vsd100/Functional_annotation.txt", 
          row.names = F, quote = F, sep = "\t")

head(Gene_info)

Reload the information of gene and module

NOTE: Please remove the transcripts isoform information before doing this step

MOdule_clean <- read.csv("/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/7_WGCNA/vsd100/Module_merge_clean.csv", header=T)
colnames(MOdule_clean) <- c("Gene", "module")

head(MOdule_clean,20)

Optional input for other analysis

### for AME peak analysis
module_colors <- data.frame(color = unique(MOdule_merge$module)) 

### for Modification analysis
MOdule_clean <- read.csv("/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/11_MODs_profile/1_MODs_transcripts_GO-enrichemnt_May10-2022.csv", header = T)

Perform functional enrichment

### for module enrichement
module_colors <- data.frame(color = unique(MOdule_clean$module)) 

for (i in 1:nrow(module_colors)){
  myInterestingGenes <- MOdule_clean[MOdule_clean$module == module_colors[i,1],1]
  geneList <- factor(as.integer(geneNames %in% myInterestingGenes))
  names(geneList) <- geneNames
  
  #Build the topGOdata matrix for each ontology categories
  GOdata_BP <- new("topGOdata", ontology = "BP", allGenes = geneList,
                    annot = annFUN.gene2GO, gene2GO = geneID2GO,nodeSize=5)

  GOdata_CC <- new("topGOdata", ontology = "CC", allGenes = geneList,
                    annot = annFUN.gene2GO, gene2GO = geneID2GO,nodeSize=5)

  GOdata_MF <- new("topGOdata", ontology = "MF", allGenes = geneList,
                    annot = annFUN.gene2GO, gene2GO = geneID2GO, nodeSize=5)
  
  #Perform enrichemnt analysis 
  resultTopGO.BP <- runTest(GOdata_BP, algorithm = "classic", statistic = "fisher")
  resultTopGO.CC <- runTest(GOdata_CC, algorithm = "classic", statistic = "fisher" )
  resultTopGO.MF <- runTest(GOdata_MF, algorithm = "classic", statistic = "fisher" )
  
  allBPGO = genesInTerm(GOdata_BP)
  allCCGO = genesInTerm(GOdata_CC)
  allMFGO = genesInTerm(GOdata_MF)
  
  ### Load function for extracting genes associated with a GO ID
  BP_ANOTATION = lapply(allBPGO,function(x) x[x %in% myInterestingGenes])
  CC_ANOTATION = lapply(allCCGO,function(x) x[x %in% myInterestingGenes])
  MF_ANOTATION = lapply(allMFGO,function(x) x[x %in% myInterestingGenes])
  
  
  #Transform data and save into csv
  BP_Res <- GenTable(GOdata_BP, classicFisher = resultTopGO.BP,  topNodes = 20)
  BP_Res$Class = "Biological_Process"
  for (x in 1:nrow(BP_Res)){
    BP_Res[x,8] <- str_c(BP_ANOTATION[[BP_Res[x,1]]],collapse = ",")
  }
  
  CC_Res <- GenTable(GOdata_CC, classicFisher = resultTopGO.CC,  topNodes =  20)
  CC_Res$Class = "Cellular_component"
  for (x in 1:nrow(CC_Res)){
    CC_Res[x,8] <- str_c(CC_ANOTATION[[CC_Res[x,1]]],collapse = ",")
  }

  MF_Res <- GenTable(GOdata_MF, classicFisher = resultTopGO.MF,  topNodes =  20)
  MF_Res$Class = "Molecular_function"
  for (x in 1:nrow(MF_Res)){
    MF_Res[x,8] <- str_c(MF_ANOTATION[[MF_Res[x,1]]],collapse = ",")
  }

  #Combine three types of GO for output
  GO_combine <- rbind(MF_Res,CC_Res,BP_Res)
  GO_combine$classicFisher <- as.numeric(GO_combine$classicFisher)
  
  ### filter enriched term using adjusted P-value
  GO_combine <- GO_combine[GO_combine$classicFisher < 0.05 ,] 
  names(GO_combine)[8] = "GeneList"
  
  write.csv(GO_combine, 
            paste0("/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/8_GO/vsd100/GO_vsd100_",module_colors[i,1],".csv"), sep ="\t",
            quote = F, row.names = F)
   #write.csv(GO_combine, 
            #paste0("/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/16_AME/",module_colors[i,1],"peak_GO.csv"), sep ="\t",
            #quote = F, row.names = F)
   
  #write.csv(GO_combine, 
            #paste0("/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/11_MODs_profile/MODs_GO/",module_colors[i,1],"specific_GO.csv"), sep ="\t",
            #quote = F, row.names = F)
}

GO_combine

Plot results (Plot for module of interests)

ggdata <- GO_results[,2:6]
ggdata$Q.value = as.numeric(ggdata$Q.value)
ggdata$Name <- factor(ggdata$Name, levels=ggdata[order(ggdata$GO.Type, -ggdata$Q.value, decreasing = T), ]$Name) 

### Plot into barplot
ggplot(ggdata, aes(Name, -log10(Q.value), fill = GO.Type)) +
    geom_bar(stat='identity') + coord_flip() +
    theme(legend.position = "bottom",axis.title.y = element_blank()) +
    theme_bw()

Plot GO class plot (Just show one example)

showSigOfNodes(GOdata_MF, score(resultTopGO.MF), firstSigNodes = 3, useInfo = 'all')

Load KEGG pathway data

K2gene <- read.table("/Users/leon/Documents/Project/Sorghum/0_Genome/Sb_geneid2KEGG.map",header=T,sep="\t")
K2Ko <- read.table("/Users/leon/Documents/Project/Sorghum/0_Genome/Gh_k2ko.map",header=T,sep="\t")
head(K2gene,50)
head(K2Ko,50)
gene2ko=merge(K2gene,K2Ko,by="K")
write.table(gene2ko[,c(3,2)],"/Users/leon/Documents/Project/Sorghum/0_Genome/Sb_gene2ko.tab",row.names = F,sep = "\t")
head(gene2ko,50)

Perform the KEGG enrichement analysis

term2gene <- read.table("/Users/leon/Documents/Project/Sorghum/0_Genome/Sb_gene2ko.tab",header=T,sep="\t")
term2name <- read.csv("/Users/leon/Documents/Project/Sorghum/0_Genome/At_k2ko_description.csv", header = T)
head(term2gene,20)
head(term2name,20)
### Perform KEGG enrichment
for (i in 1:nrow(module_colors)){
  KEGG_gene <- as.factor(MOdule_clean[MOdule_clean$module == module_colors[i,1],1])
  KEGG <- enricher(KEGG_gene,TERM2GENE=term2gene,TERM2NAME=term2name, 
               pAdjustMethod = "bonferroni")
  write.csv(na.omit(data.frame(KEGG)), 
            paste0("/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/9_KEGG/vsd100/KEGG_vsd100",module_colors[i,1],".csv"), row.names = F)
}

example <- data.frame(KEGG)
example