Enrichment

设置工作路径

setwd("~/Desktop/bioinfor/RNAseqR/GO_Enrichment")

1 创建注释文件包

1.1 直接下载注释文件。

一般已发表的基因组都会给注释好的文件，直接下载即可。

比如：月季基因组的四个注释文件（https://www.rosaceae.org/species/rosa/chinensis/genome_v1.0）。

InterPro注释使用 InterProScan 分析 Rosa_chinensis 转录本以分配 InterPro 域

OB_IRP <- read.csv("Rosa_chinensis_v1.0_IRP.csv")
head(OB_IRP,3)

##         Query     Match                      Description
## 1 RC0G0000100 IPR001356                  Homeobox domain
## 2 RC0G0000100 IPR009057 Homeobox-like domain superfamily
## 3 RC0G0000200 IPR001356                  Homeobox domain

GO注释

OB_GO <- read.csv("Rosa_chinensis_v1.0_GO.csv")
head(OB_GO,3)

##         Query      Match                    Description
## 1 RC0G0000100 GO:0003677 Molecular Function:DNA binding
## 2 RC0G0000200 GO:0003677 Molecular Function:DNA binding
## 3 RC0G0000300 GO:0005524 Molecular Function:ATP binding

KEGG_orthologs注释

OB_KEGG_orthologs <- read.csv("Rosa_chinensis_v1.0_KEGG_orthologs.csv")
head(OB_KEGG_orthologs,3)

##         Query  Match
## 1 RC0G0000400 K08909
## 2 RC0G0001100 K00975
## 3 RC0G0001500 K19307
##                                                    Description
## 1 light-harvesting complex I chlorophyll a/b binding protein 3
## 2        glucose-1-phosphate adenylyltransferase [EC:2.7.7.27]
## 3    25S rRNA (uracil2634-N3)-methyltransferase [EC:2.1.1.313]

KEGG_pathways注释

OB_KEGG_pathways <- read.csv("Rosa_chinensis_v1.0_KEGG_pathways.csv")
head(OB_KEGG_pathways,3)

##         Query   Match                       Description
## 1 RC0G0000400 ko00196 Photosynthesis - antenna proteins
## 2 RC0G0000900 ko03460            Fanconi anemia pathway
## 3 RC0G0001100 ko00500     Starch and sucrose metabolism

1.2 使用EggNOG-mapper v2进行注释

功能注释、直系同源分配和域预测

如果没有注释文件，就需要自己注释，这时需要准备蛋白序列文件。

比如：下载月季基因组的蛋白文件Rosa chinensis v1.0_prot.fasta.gz（https://www.rosaceae.org/species/rosa/chinensis/genome_v1.0）。

步骤如下：

打开网站：http://eggnog-mapper.embl.de
Upload sequences (比如：上传Rosa chinensis v1.0_prot.fasta.gz)
填写Email address
选择数据库Database（eggNOG 5是基于5090个生物体和2502个病毒的分层、功能和系统发育注释的直系同源资源。
设置参数Search filters
submit
打开邮箱Click to manage your job
start job
注释完成，下载注释文件（tsv或者exel）。
转换格式，提取所需要的列

原始注释tsv文件

ob.emapper <- read.delim("MM_h8ls6nrd.emapper.annotations.tsv", comment.char="#")
head(ob.emapper, 1)

##   RC0G0000100 X57918.XP_004289091.1 X2.13e.20 X93.6
## 1 RC0G0000200   3988.XP_002530735.1   6.4e-28   112
##   X2CMU5.1.root.2QRYV.2759.Eukaryota.37TDI.33090.Viridiplantae.3GJEB.35493.Streptophyta.4JKKC.91835.fabids
## 1  2CMU5@1|root,2QRYV@2759|Eukaryota,37TDI@33090|Viridiplantae,3GJEB@35493|Streptophyta,4JKKC@91835|fabids
##   X35493.Streptophyta K Homeodomain WUS
## 1  35493|Streptophyta K Homeodomain WUS
##                                                                                                                                                                                                                                                                                                                                                                                                                                              GO.0000003.GO.0001763.GO.0003006.GO.0003674.GO.0003700.GO.0006355.GO.0007275.GO.0008150.GO.0009653.GO.0009791.GO.0009888.GO.0009889.GO.0009908.GO.0009933.GO.0009987.GO.0010014.GO.0010016.GO.0010223.GO.0010346.GO.0010468.GO.0010556.GO.0019219.GO.00192 ...
## 1 GO:0000003,GO:0001763,GO:0003006,GO:0003674,GO:0003700,GO:0006355,GO:0007275,GO:0008150,GO:0009653,GO:0009791,GO:0009888,GO:0009889,GO:0009908,GO:0009933,GO:0009987,GO:0010014,GO:0010016,GO:0010223,GO:0010346,GO:0010468,GO:0010556,GO:0019219,GO:0019222,GO:0019827,GO:0022412,GO:0022414,GO:0031323,GO:0031326,GO:0032501,GO:0032502,GO:0032504,GO:0048367,GO:0048437,GO:0048438,GO:0048443,GO:0048466,GO:0048507,GO:0048532,GO:0048608,GO:0048646,GO:0048653,GO:0048731,GO:0048827,GO:0048856,GO:0050789,GO:0050794,GO:0051171,GO:0051252,GO:0060255,GO:0061458,GO:0065007,GO:0080090,GO:0080166,GO:0090506,GO:0090567,GO:0098727,GO:0099402,GO:0140110,GO:1903506,GO:1905393,GO:2000112,GO:2001141
##   X. X..1 X..2 X..3 X..4 X..5 X..6 X..7 X..8 X..9 Homeobox
## 1  -    -    -    -    -    -    -    -    -    - Homeobox

转换格式需要tidyverse()包

install.packages("tidyverse") #如没有包先安装，如有无需重复安装。

suppressWarnings(suppressMessages({library(tidyverse)}))
#suppressWarnings(suppressMessages())抑制R包加载时的警告和其他信息

转换后的文件格式

ob.emapper.m <- read_delim(
  file = 'MM_h8ls6nrd.emapper.annotations.tsv',"\t", 
  escape_double = FALSE, col_names = FALSE,
  comment = "#", trim_ws = TRUE) %>%   #行以"#"开头的行将被视为注释，
  dplyr::select(GID = X1, 
                Gene_Symbol = X21, 
                GO = X10, 
                KO = X12, 
                Pathway = X13, 
                OG = X21, 
                Gene_Name = X8) #筛选需要的列，并设置名称

## Rows: 32301 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (19): X1, X2, X5, X6, X7, X8, X9, X10, X11, X12, X13, X14, X15, X16, X17...
## dbl  (2): X3, X4
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

head(ob.emapper.m)

## # A tibble: 6 × 7
##   GID         Gene_Symbol     GO                   KO    Pathway OG    Gene_Name
##   <chr>       <chr>           <chr>                <chr> <chr>   <chr> <chr>    
## 1 RC0G0000100 Homeobox        GO:0000003,GO:00017… -     -       Home… Homeodom…
## 2 RC0G0000200 Homeobox        GO:0000003,GO:00017… -     -       Home… Homeodom…
## 3 RC0G0000300 DEAD,Helicase_C GO:0000373,GO:00003… ko:K… ko0111… DEAD… DEAD-box…
## 4 RC0G0000400 Chloroa_b-bind  GO:0003674,GO:00054… ko:K… ko0019… Chlo… The ligh…
## 5 RC0G0000500 EMP70           GO:0005575,GO:00056… ko:K… -       EMP70 Belongs …
## 6 RC0G0000600 Myb_DNA-binding -                    -     -       Myb_… Myb fami…

2 创建和安装自己的 “OrgDB” R packages

安装所需R包

if (!requireNamespace("BiocManager", quietly = TRUE))
  install.packages("BiocManager")
BiocManager::install("clusterProfiler")
BiocManager::install("topGO")
BiocManager::install("magrittr")
BiocManager::install("AnnotationForge")

2.1 创建”OrgDB”R包

在ob.emapper.m中提取基因信息，并且去掉gene_name空值的数据

gene_info <- dplyr::select(ob.emapper.m,GID, Gene_Name) %>%
  dplyr::filter(!is.na(Gene_Name))
head(gene_info)

## # A tibble: 6 × 2
##   GID         Gene_Name                                                         
##   <chr>       <chr>                                                             
## 1 RC0G0000100 Homeodomain                                                       
## 2 RC0G0000200 Homeodomain                                                       
## 3 RC0G0000300 DEAD-box ATP-dependent RNA helicase                               
## 4 RC0G0000400 The light-harvesting complex (LHC) functions as a light receptor,…
## 5 RC0G0000500 Belongs to the nonaspanin (TM9SF) (TC 9.A.2) family               
## 6 RC0G0000600 Myb family transcription factor

注意：如果是直接下载的注释文件，可根据ob.emapper.m文件格式进行排版。

提取GO信息，以逗号进行分裂，去掉空值

gene2go <- dplyr::select(ob.emapper.m, GID, GO) %>%
  separate_rows(GO, sep = ',', convert = F) %>%
  filter(GO != "-") %>%
  filter(!is.na(GO)) %>%
  mutate(EVIDENCE = 'IEA') 
head(gene2go)

## # A tibble: 6 × 3
##   GID         GO         EVIDENCE
##   <chr>       <chr>      <chr>   
## 1 RC0G0000100 GO:0000003 IEA     
## 2 RC0G0000100 GO:0001763 IEA     
## 3 RC0G0000100 GO:0003006 IEA     
## 4 RC0G0000100 GO:0003674 IEA     
## 5 RC0G0000100 GO:0003700 IEA     
## 6 RC0G0000100 GO:0006355 IEA

AnnotationForge::makeOrgPackage(gene_info=gene_info, #基因信息
                                go=gene2go,#go信息
                                maintainer='<ningning.zhou@ynu.edu.cn>',#邮箱，不能是qq和网易邮箱
                                author='Mory',#创建者
                                outputDir=".",#输出路径
                                tax_id=6231,#ncbi中构建的这个物种的taxonomy ID
                                genus="Nematoda",#属
                                species="celegans",#种
                                goTable="go",
                                version="1.0")
pkgbuild::build('./org.RcOB.eg.db', dest_path = "./")# 打包

2.2 安装”OrgDB”R包

dir.create('R_Library', recursive = T)
install.packages('org.Ncelegans.eg.db', 
                 repos = NULL, 
                 lib = 'R_Library',
                 type = "source")

install.packages("readxl")

3 GO富集分析

差异genelist

genelist <- read.csv("NPIvsPI_sigGenelist.csv", sep="")
#如果导入的文件是xlsx格式，需要加载library(readxl)

library(clusterProfiler)

##

## clusterProfiler v4.10.0  For help: https://yulab-smu.top/biomedical-knowledge-mining-book/
## 
## If you use clusterProfiler in published research, please cite:
## T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan, X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal enrichment tool for interpreting omics data. The Innovation. 2021, 2(3):100141

## 
## Attaching package: 'clusterProfiler'

## The following object is masked from 'package:purrr':
## 
##     simplify

## The following object is masked from 'package:stats':
## 
##     filter

library(org.Ncelegans.eg.db, lib = 'R_Library') #如果在前面没有设置路径，需要在lib = ""中加入路径

## Loading required package: AnnotationDbi

## Loading required package: stats4

## Loading required package: BiocGenerics

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:lubridate':
## 
##     intersect, setdiff, union

## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min

## Loading required package: Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## Loading required package: IRanges

## Loading required package: S4Vectors

## 
## Attaching package: 'S4Vectors'

## The following object is masked from 'package:clusterProfiler':
## 
##     rename

## The following objects are masked from 'package:lubridate':
## 
##     second, second<-

## The following objects are masked from 'package:dplyr':
## 
##     first, rename

## The following object is masked from 'package:tidyr':
## 
##     expand

## The following object is masked from 'package:utils':
## 
##     findMatches

## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname

## 
## Attaching package: 'IRanges'

## The following object is masked from 'package:clusterProfiler':
## 
##     slice

## The following object is masked from 'package:lubridate':
## 
##     %within%

## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice

## The following object is masked from 'package:purrr':
## 
##     reduce

## 
## Attaching package: 'AnnotationDbi'

## The following object is masked from 'package:clusterProfiler':
## 
##     select

## The following object is masked from 'package:dplyr':
## 
##     select

##

gene = as.vector(as.matrix(genelist$GeneID))

de_ego <- enrichGO(gene = gene,
                   OrgDb = org.Ncelegans.eg.db,
                   keyType = 'GID',
                   ont = 'ALL', #BP MF CC
                   qvalueCutoff = 0.01,
                   pvalueCutoff = 0.01,
                   pAdjustMethod = "BH" #   one of "holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", "none"
)

head(de_ego)

##            ONTOLOGY         ID                                   Description
## GO:0010481       BP GO:0010481                       epidermal cell division
## GO:0010482       BP GO:0010482         regulation of epidermal cell division
## GO:0042814       BP GO:0042814                         monopolar cell growth
## GO:0048530       BP GO:0048530                           fruit morphogenesis
## GO:2000039       BP GO:2000039          regulation of trichome morphogenesis
## GO:0030856       BP GO:0030856 regulation of epithelial cell differentiation
##            GeneRatio  BgRatio       pvalue    p.adjust      qvalue
## GO:0010481     5/500 11/14528 1.841261e-05 0.006886316 0.006632415
## GO:0010482     5/500 11/14528 1.841261e-05 0.006886316 0.006632415
## GO:0042814     5/500 11/14528 1.841261e-05 0.006886316 0.006632415
## GO:0048530     5/500 11/14528 1.841261e-05 0.006886316 0.006632415
## GO:2000039     5/500 11/14528 1.841261e-05 0.006886316 0.006632415
## GO:0030856     5/500 12/14528 3.067219e-05 0.007169623 0.006905278
##                                                                 geneID Count
## GO:0010481 RC1G0366400/RC1G0366500/RC1G0366800/RC1G0367300/RC1G0367400     5
## GO:0010482 RC1G0366400/RC1G0366500/RC1G0366800/RC1G0367300/RC1G0367400     5
## GO:0042814 RC1G0366400/RC1G0366500/RC1G0366800/RC1G0367300/RC1G0367400     5
## GO:0048530 RC1G0366400/RC1G0366500/RC1G0366800/RC1G0367300/RC1G0367400     5
## GO:2000039 RC1G0366400/RC1G0366500/RC1G0366800/RC1G0367300/RC1G0367400     5
## GO:0030856 RC1G0366400/RC1G0366500/RC1G0366800/RC1G0367300/RC1G0367400     5

保存文件

write.csv(de_ego, file = "NPIvsPI_GO.csv") #保存为csv文件

#rmredunego <- simplify(de_ego, cutoff=0.7, by="p.adjust", select_fun=min)
de_ego_df <- as.data.frame(de_ego)
barplot(de_ego, showCategory = 15)

enrichplot::dotplot(de_ego, showCategory = 15)

BiocManager::install("GOSemSim")

4 KEGG 富集

解决R版本太高会出现的不兼容问题，如没有该问题请忽略

library(R.utils)
R.utils::setOption("clusterProfiler.download.method","auto")

4.1 准备 TERM2GENE

将ko号与基因ID对应上

pathway2gene <- dplyr::select(ob.emapper.m, Pathway, GID) %>%
  separate_rows(Pathway, sep = ',', convert = F)%>%
  filter(str_detect(Pathway, 'ko'))%>%
  mutate(Pathway = str_remove(Pathway, 'ko'))#去掉ko

将ko号和path_name对应上

library(magrittr)

get_path2name <- function(){
  keggpathid2name.df <- clusterProfiler:::kegg_list("pathway")
  keggpathid2name.df[,1] %<>% gsub("path:map", "", .)
  colnames(keggpathid2name.df) <- c("path_id","path_name")
  return(keggpathid2name.df)
}

kegg_list <- read_csv("kegg_list.csv")

get_path2name <- function(){
  keggpathid2name.df <- kegg_list
  keggpathid2name.df[,1] %<>% gsub("path:map", "", .)
  colnames(keggpathid2name.df) <- c("path_id","path_name")
  return(keggpathid2name.df)
}

pathway2name <- get_path2name()
pathway2name <- dplyr::select(pathway2name, path_id, path_name) %>%
  filter(str_detect(path_id, 'map'))%>%
  mutate(path_id = str_remove(path_id, 'map'))

library(clusterProfiler)
de_ekp <- enricher(gene=gene,
                   TERM2GENE = pathway2gene,
                   TERM2NAME = pathway2name,
                   pvalueCutoff = 0.05,
                   qvalueCutoff = 0.05,
                   pAdjustMethod  = "BH")
de_ekp_df <- as.data.frame(de_ekp)
de_ekp_df <- left_join(de_ekp_df,pathway2des,by = "ID")

导出数据

write.csv(de_ekp_df, file = "NPIvsPI_kegg.csv") 
write.csv(pathway2name, file = "pathway2name.csv") 
#openxlsx::write.xlsx(de_ekp_df,"UP0H_kegg.xlsx")
#openxlsx::write.xlsx(pathway2name,"pathway2name.xlsx")

可视化

barplot(de_ekp, showCa = 20)
enrichplot::dotplot(de_ekp, showCategory = 20)