GO_KEGG_enrichemnt

Load packages
Load gene annocation information
Load sample from WGCNA network
Load sample for AME targeted downstream genes (OPTIONAL WITH THE STEP ABOVE)
Wrap gene function information to each module
Reload the information of gene and module
Optional input for other analysis
Perform functional enrichment
Plot results (Plot for module of interests)
Plot GO class plot (Just show one example)
Load KEGG pathway data
Perform the KEGG enrichement analysis

This pipeline is developed for batch processing for GO and KEGG enrichment derived from WGCNA module analysis. Plase check the format of node/module results derived from co-expression network analysis

Load packages

devtools::install_github('GuangchuangYu/clusterProfiler')
BiocManager::install("Rgraphviz")
BiocManager::install("topGO")
BiocManager::install("clusterProfiler")
BiocManager::install("GOplot")

library(dplyr)
library(ggplot2)
library(reshape2)
library(topGO)
library(plyr)
library(tidyr)
library(scales)
library(clusterProfiler)
library(GOplot)
library(stringr)

Load gene annocation information

### Load genome-wide GO list
geneID2GO <- readMappings("/Users/leon/Documents/Project/Sorghum/0_Genome/Sb_geneid2go.map")
geneNames <- names(geneID2GO)
head(geneNames, 20)

##  [1] "Sobic.001G000100" "Sobic.001G000200" "Sobic.001G000400" "Sobic.001G000700"
##  [5] "Sobic.001G000700" "Sobic.001G000800" "Sobic.001G000900" "Sobic.001G001000"
##  [9] "Sobic.001G001066" "Sobic.001G001132" "Sobic.001G001200" "Sobic.001G001300"
## [13] "Sobic.001G001400" "Sobic.001G001800" "Sobic.001G001900" "Sobic.001G002000"
## [17] "Sobic.001G002200" "Sobic.001G002300" "Sobic.001G002400" "Sobic.001G002500"

Load sample from WGCNA network

#Load sample_list for reading respective phenotype
sample.list <- read.table("/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/7_WGCNA/vsd100/Node/node.list", 
                          sep ="\t", header = F)
sample.list

for (i in 1:nrow(sample.list)){
  sample.df <- read.csv(paste0("/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/7_WGCNA/vsd100/Node/",
                               sample.list[i,1]), header = T)
  if(nrow(sample.df) != 0){
    assign(sample.list[i,1], sample.df)
  } 
}

#Create Meta phenotype sheet by combining muliple files
merge_list <- lapply(ls(pattern = "edges"), get)
MOdule_merge <- bind_rows(merge_list)
MOdule_merge <- MOdule_merge[,c(1,2)]
colnames(MOdule_merge) <- c("id","module")

write.csv(MOdule_merge, "/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/7_WGCNA/vsd100/Module_merge.csv", quote = F, row.names = F)
head(MOdule_merge, 30)

### generate the unique list of module information
module_colors <- data.frame(module = unique(MOdule_merge$module))
module_colors <- na.omit(module_colors)
module_colors

Load sample for AME targeted downstream genes (OPTIONAL WITH THE STEP ABOVE)

#Load sample_list for reading respective phenotype
sample.list <- read.table("/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/16_AME/GO_list", 
                          sep ="\t", header = F)

for (i in 1:nrow(sample.list)){
  sample.df <- read.table(paste0("/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/16_AME/AME_peak/",
                               sample.list[i,1]), header = F)
  if(nrow(sample.df) != 0){
    assign(sample.list[i,1], sample.df)
  } 
}

merge_list <- lapply(ls(pattern = "clean_GO"), get)
MOdule_merge <- bind_rows(merge_list)
MOdule_merge <- MOdule_merge[,c(1,2)]
colnames(MOdule_merge) <- c("module","Gene")


MOdule_clean <- MOdule_merge[,c(2,1)]

Wrap gene function information to each module

MOdule_merge <- read.csv("/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/7_WGCNA/vsd100/Module_merge.csv", header = T)
### Load gene function
Gene_Function <- read.csv("/Users/leon/Documents/Project/Sorghum/0_Genome/Sorghun_annotation_info.clean.csv", header = T)
Gene_sub <- data.frame(matrix(ncol = 6, nrow = nrow(MOdule_merge)))
colnames(Gene_sub) <- c("transcriptName", "Best.hit.arabi.name", "arabi.symbol", "arabi.defline", "KO", "GO")
head(Gene_sub)

for (i in 1:nrow(MOdule_merge)){
        temp.df <- Gene_Function[Gene_Function$transcriptName == MOdule_merge[i,1],
                c("transcriptName", "Best.hit.arabi.name", "arabi.symbol", "arabi.defline", "KO", "GO")]
        if(nrow(temp.df) != 0 ){
           Gene_sub[i,] <- temp.df
        } else {
        Gene_sub[i,] = "NA"
        }
}

Gene_info <- cbind(MOdule_merge, Gene_sub)
### write to pdf
write.table(Gene_info, 
          "/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/7_WGCNA/vsd100/Functional_annotation.txt", 
          row.names = F, quote = F, sep = "\t")

head(Gene_info)

Reload the information of gene and module

NOTE: Please remove the transcripts isoform information before doing this step

MOdule_clean <- read.csv("/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/7_WGCNA/vsd100/Module_merge_clean.csv", header=T)
colnames(MOdule_clean) <- c("Gene", "module")

head(MOdule_clean,20)

Optional input for other analysis

### for AME peak analysis
module_colors <- data.frame(color = unique(MOdule_merge$module)) 

### for Modification analysis
MOdule_clean <- read.csv("/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/11_MODs_profile/1_MODs_transcripts_GO-enrichemnt_May10-2022.csv", header = T)

Perform functional enrichment

### for module enrichement
module_colors <- data.frame(color = unique(MOdule_clean$module)) 

for (i in 1:nrow(module_colors)){
  myInterestingGenes <- MOdule_clean[MOdule_clean$module == module_colors[i,1],1]
  geneList <- factor(as.integer(geneNames %in% myInterestingGenes))
  names(geneList) <- geneNames
  
  #Build the topGOdata matrix for each ontology categories
  GOdata_BP <- new("topGOdata", ontology = "BP", allGenes = geneList,
                    annot = annFUN.gene2GO, gene2GO = geneID2GO,nodeSize=5)

  GOdata_CC <- new("topGOdata", ontology = "CC", allGenes = geneList,
                    annot = annFUN.gene2GO, gene2GO = geneID2GO,nodeSize=5)

  GOdata_MF <- new("topGOdata", ontology = "MF", allGenes = geneList,
                    annot = annFUN.gene2GO, gene2GO = geneID2GO, nodeSize=5)
  
  #Perform enrichemnt analysis 
  resultTopGO.BP <- runTest(GOdata_BP, algorithm = "classic", statistic = "fisher")
  resultTopGO.CC <- runTest(GOdata_CC, algorithm = "classic", statistic = "fisher" )
  resultTopGO.MF <- runTest(GOdata_MF, algorithm = "classic", statistic = "fisher" )
  
  allBPGO = genesInTerm(GOdata_BP)
  allCCGO = genesInTerm(GOdata_CC)
  allMFGO = genesInTerm(GOdata_MF)
  
  ### Load function for extracting genes associated with a GO ID
  BP_ANOTATION = lapply(allBPGO,function(x) x[x %in% myInterestingGenes])
  CC_ANOTATION = lapply(allCCGO,function(x) x[x %in% myInterestingGenes])
  MF_ANOTATION = lapply(allMFGO,function(x) x[x %in% myInterestingGenes])
  
  
  #Transform data and save into csv
  BP_Res <- GenTable(GOdata_BP, classicFisher = resultTopGO.BP,  topNodes = 20)
  BP_Res$Class = "Biological_Process"
  for (x in 1:nrow(BP_Res)){
    BP_Res[x,8] <- str_c(BP_ANOTATION[[BP_Res[x,1]]],collapse = ",")
  }
  
  CC_Res <- GenTable(GOdata_CC, classicFisher = resultTopGO.CC,  topNodes =  20)
  CC_Res$Class = "Cellular_component"
  for (x in 1:nrow(CC_Res)){
    CC_Res[x,8] <- str_c(CC_ANOTATION[[CC_Res[x,1]]],collapse = ",")
  }

  MF_Res <- GenTable(GOdata_MF, classicFisher = resultTopGO.MF,  topNodes =  20)
  MF_Res$Class = "Molecular_function"
  for (x in 1:nrow(MF_Res)){
    MF_Res[x,8] <- str_c(MF_ANOTATION[[MF_Res[x,1]]],collapse = ",")
  }

  #Combine three types of GO for output
  GO_combine <- rbind(MF_Res,CC_Res,BP_Res)
  GO_combine$classicFisher <- as.numeric(GO_combine$classicFisher)
  
  ### filter enriched term using adjusted P-value
  GO_combine <- GO_combine[GO_combine$classicFisher < 0.05 ,] 
  names(GO_combine)[8] = "GeneList"
  
  write.csv(GO_combine, 
            paste0("/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/8_GO/vsd100/GO_vsd100_",module_colors[i,1],".csv"), sep ="\t",
            quote = F, row.names = F)
   #write.csv(GO_combine, 
            #paste0("/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/16_AME/",module_colors[i,1],"peak_GO.csv"), sep ="\t",
            #quote = F, row.names = F)
   
  #write.csv(GO_combine, 
            #paste0("/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/11_MODs_profile/MODs_GO/",module_colors[i,1],"specific_GO.csv"), sep ="\t",
            #quote = F, row.names = F)
}

GO_combine

Plot results (Plot for module of interests)

ggdata <- GO_results[,2:6]
ggdata$Q.value = as.numeric(ggdata$Q.value)
ggdata$Name <- factor(ggdata$Name, levels=ggdata[order(ggdata$GO.Type, -ggdata$Q.value, decreasing = T), ]$Name) 

### Plot into barplot
ggplot(ggdata, aes(Name, -log10(Q.value), fill = GO.Type)) +
    geom_bar(stat='identity') + coord_flip() +
    theme(legend.position = "bottom",axis.title.y = element_blank()) +
    theme_bw()

Plot GO class plot (Just show one example)

showSigOfNodes(GOdata_MF, score(resultTopGO.MF), firstSigNodes = 3, useInfo = 'all')

Load KEGG pathway data

K2gene <- read.table("/Users/leon/Documents/Project/Sorghum/0_Genome/Sb_geneid2KEGG.map",header=T,sep="\t")
K2Ko <- read.table("/Users/leon/Documents/Project/Sorghum/0_Genome/Gh_k2ko.map",header=T,sep="\t")
head(K2gene,50)

head(K2Ko,50)

gene2ko=merge(K2gene,K2Ko,by="K")
write.table(gene2ko[,c(3,2)],"/Users/leon/Documents/Project/Sorghum/0_Genome/Sb_gene2ko.tab",row.names = F,sep = "\t")
head(gene2ko,50)

Perform the KEGG enrichement analysis

term2gene <- read.table("/Users/leon/Documents/Project/Sorghum/0_Genome/Sb_gene2ko.tab",header=T,sep="\t")
term2name <- read.csv("/Users/leon/Documents/Project/Sorghum/0_Genome/At_k2ko_description.csv", header = T)
head(term2gene,20)

head(term2name,20)

### Perform KEGG enrichment
for (i in 1:nrow(module_colors)){
  KEGG_gene <- as.factor(MOdule_clean[MOdule_clean$module == module_colors[i,1],1])
  KEGG <- enricher(KEGG_gene,TERM2GENE=term2gene,TERM2NAME=term2name, 
               pAdjustMethod = "bonferroni")
  write.csv(na.omit(data.frame(KEGG)), 
            paste0("/Users/Leon/OneDrive - Cornell University/Projects/9_Sorghum/9_KEGG/vsd100/KEGG_vsd100",module_colors[i,1],".csv"), row.names = F)
}

example <- data.frame(KEGG)

example

GO_KEGG_enrichemnt_pipeline

Liang Yu, Andrew Nelson

2022/2/7