设置工作路径
setwd("~/Desktop/bioinfor/RNAseqR/GO_Enrichment")
一般已发表的基因组都会给注释好的文件,直接下载即可。
比如:月季基因组的四个注释文件(https://www.rosaceae.org/species/rosa/chinensis/genome_v1.0)。
OB_IRP <- read.csv("Rosa_chinensis_v1.0_IRP.csv")
head(OB_IRP,3)
## Query Match Description
## 1 RC0G0000100 IPR001356 Homeobox domain
## 2 RC0G0000100 IPR009057 Homeobox-like domain superfamily
## 3 RC0G0000200 IPR001356 Homeobox domain
OB_GO <- read.csv("Rosa_chinensis_v1.0_GO.csv")
head(OB_GO,3)
## Query Match Description
## 1 RC0G0000100 GO:0003677 Molecular Function:DNA binding
## 2 RC0G0000200 GO:0003677 Molecular Function:DNA binding
## 3 RC0G0000300 GO:0005524 Molecular Function:ATP binding
OB_KEGG_orthologs <- read.csv("Rosa_chinensis_v1.0_KEGG_orthologs.csv")
head(OB_KEGG_orthologs,3)
## Query Match
## 1 RC0G0000400 K08909
## 2 RC0G0001100 K00975
## 3 RC0G0001500 K19307
## Description
## 1 light-harvesting complex I chlorophyll a/b binding protein 3
## 2 glucose-1-phosphate adenylyltransferase [EC:2.7.7.27]
## 3 25S rRNA (uracil2634-N3)-methyltransferase [EC:2.1.1.313]
OB_KEGG_pathways <- read.csv("Rosa_chinensis_v1.0_KEGG_pathways.csv")
head(OB_KEGG_pathways,3)
## Query Match Description
## 1 RC0G0000400 ko00196 Photosynthesis - antenna proteins
## 2 RC0G0000900 ko03460 Fanconi anemia pathway
## 3 RC0G0001100 ko00500 Starch and sucrose metabolism
功能注释、直系同源分配和域预测
如果没有注释文件,就需要自己注释,这时需要准备蛋白序列文件。
比如:下载月季基因组的蛋白文件Rosa chinensis v1.0_prot.fasta.gz(https://www.rosaceae.org/species/rosa/chinensis/genome_v1.0)。
步骤如下:
Upload sequences (比如:上传Rosa chinensis v1.0_prot.fasta.gz)
填写Email address
选择数据库Database(eggNOG 5是基于5090个生物体和2502个病毒的分层、功能和系统发育注释的直系同源资源。
设置参数Search filters
submit
打开邮箱Click to manage your job
start job
注释完成,下载注释文件(tsv或者exel)。
转换格式,提取所需要的列
原始注释tsv文件
ob.emapper <- read.delim("MM_h8ls6nrd.emapper.annotations.tsv", comment.char="#")
head(ob.emapper, 1)
## RC0G0000100 X57918.XP_004289091.1 X2.13e.20 X93.6
## 1 RC0G0000200 3988.XP_002530735.1 6.4e-28 112
## X2CMU5.1.root.2QRYV.2759.Eukaryota.37TDI.33090.Viridiplantae.3GJEB.35493.Streptophyta.4JKKC.91835.fabids
## 1 2CMU5@1|root,2QRYV@2759|Eukaryota,37TDI@33090|Viridiplantae,3GJEB@35493|Streptophyta,4JKKC@91835|fabids
## X35493.Streptophyta K Homeodomain WUS
## 1 35493|Streptophyta K Homeodomain WUS
## GO.0000003.GO.0001763.GO.0003006.GO.0003674.GO.0003700.GO.0006355.GO.0007275.GO.0008150.GO.0009653.GO.0009791.GO.0009888.GO.0009889.GO.0009908.GO.0009933.GO.0009987.GO.0010014.GO.0010016.GO.0010223.GO.0010346.GO.0010468.GO.0010556.GO.0019219.GO.00192 ...
## 1 GO:0000003,GO:0001763,GO:0003006,GO:0003674,GO:0003700,GO:0006355,GO:0007275,GO:0008150,GO:0009653,GO:0009791,GO:0009888,GO:0009889,GO:0009908,GO:0009933,GO:0009987,GO:0010014,GO:0010016,GO:0010223,GO:0010346,GO:0010468,GO:0010556,GO:0019219,GO:0019222,GO:0019827,GO:0022412,GO:0022414,GO:0031323,GO:0031326,GO:0032501,GO:0032502,GO:0032504,GO:0048367,GO:0048437,GO:0048438,GO:0048443,GO:0048466,GO:0048507,GO:0048532,GO:0048608,GO:0048646,GO:0048653,GO:0048731,GO:0048827,GO:0048856,GO:0050789,GO:0050794,GO:0051171,GO:0051252,GO:0060255,GO:0061458,GO:0065007,GO:0080090,GO:0080166,GO:0090506,GO:0090567,GO:0098727,GO:0099402,GO:0140110,GO:1903506,GO:1905393,GO:2000112,GO:2001141
## X. X..1 X..2 X..3 X..4 X..5 X..6 X..7 X..8 X..9 Homeobox
## 1 - - - - - - - - - - Homeobox
转换格式需要tidyverse()包
install.packages("tidyverse") #如没有包先安装,如有无需重复安装。
suppressWarnings(suppressMessages({library(tidyverse)}))
#suppressWarnings(suppressMessages())抑制R包加载时的警告和其他信息
转换后的文件格式
ob.emapper.m <- read_delim(
file = 'MM_h8ls6nrd.emapper.annotations.tsv',"\t",
escape_double = FALSE, col_names = FALSE,
comment = "#", trim_ws = TRUE) %>% #行以"#"开头的行将被视为注释,
dplyr::select(GID = X1,
Gene_Symbol = X21,
GO = X10,
KO = X12,
Pathway = X13,
OG = X21,
Gene_Name = X8) #筛选需要的列,并设置名称
## Rows: 32301 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (19): X1, X2, X5, X6, X7, X8, X9, X10, X11, X12, X13, X14, X15, X16, X17...
## dbl (2): X3, X4
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(ob.emapper.m)
## # A tibble: 6 × 7
## GID Gene_Symbol GO KO Pathway OG Gene_Name
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 RC0G0000100 Homeobox GO:0000003,GO:00017… - - Home… Homeodom…
## 2 RC0G0000200 Homeobox GO:0000003,GO:00017… - - Home… Homeodom…
## 3 RC0G0000300 DEAD,Helicase_C GO:0000373,GO:00003… ko:K… ko0111… DEAD… DEAD-box…
## 4 RC0G0000400 Chloroa_b-bind GO:0003674,GO:00054… ko:K… ko0019… Chlo… The ligh…
## 5 RC0G0000500 EMP70 GO:0005575,GO:00056… ko:K… - EMP70 Belongs …
## 6 RC0G0000600 Myb_DNA-binding - - - Myb_… Myb fami…
安装所需R包
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("clusterProfiler")
BiocManager::install("topGO")
BiocManager::install("magrittr")
BiocManager::install("AnnotationForge")
在ob.emapper.m中提取基因信息,并且去掉gene_name空值的数据
gene_info <- dplyr::select(ob.emapper.m,GID, Gene_Name) %>%
dplyr::filter(!is.na(Gene_Name))
head(gene_info)
## # A tibble: 6 × 2
## GID Gene_Name
## <chr> <chr>
## 1 RC0G0000100 Homeodomain
## 2 RC0G0000200 Homeodomain
## 3 RC0G0000300 DEAD-box ATP-dependent RNA helicase
## 4 RC0G0000400 The light-harvesting complex (LHC) functions as a light receptor,…
## 5 RC0G0000500 Belongs to the nonaspanin (TM9SF) (TC 9.A.2) family
## 6 RC0G0000600 Myb family transcription factor
注意:如果是直接下载的注释文件,可根据ob.emapper.m文件格式进行排版。
提取GO信息,以逗号进行分裂,去掉空值
gene2go <- dplyr::select(ob.emapper.m, GID, GO) %>%
separate_rows(GO, sep = ',', convert = F) %>%
filter(GO != "-") %>%
filter(!is.na(GO)) %>%
mutate(EVIDENCE = 'IEA')
head(gene2go)
## # A tibble: 6 × 3
## GID GO EVIDENCE
## <chr> <chr> <chr>
## 1 RC0G0000100 GO:0000003 IEA
## 2 RC0G0000100 GO:0001763 IEA
## 3 RC0G0000100 GO:0003006 IEA
## 4 RC0G0000100 GO:0003674 IEA
## 5 RC0G0000100 GO:0003700 IEA
## 6 RC0G0000100 GO:0006355 IEA
AnnotationForge::makeOrgPackage(gene_info=gene_info, #基因信息
go=gene2go,#go信息
maintainer='<ningning.zhou@ynu.edu.cn>',#邮箱,不能是qq和网易邮箱
author='Mory',#创建者
outputDir=".",#输出路径
tax_id=6231,#ncbi中构建的这个物种的taxonomy ID
genus="Nematoda",#属
species="celegans",#种
goTable="go",
version="1.0")
pkgbuild::build('./org.RcOB.eg.db', dest_path = "./")# 打包
dir.create('R_Library', recursive = T)
install.packages('org.Ncelegans.eg.db',
repos = NULL,
lib = 'R_Library',
type = "source")
install.packages("readxl")
差异genelist
genelist <- read.csv("NPIvsPI_sigGenelist.csv", sep="")
#如果导入的文件是xlsx格式,需要加载library(readxl)
library(clusterProfiler)
##
## clusterProfiler v4.10.0 For help: https://yulab-smu.top/biomedical-knowledge-mining-book/
##
## If you use clusterProfiler in published research, please cite:
## T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan, X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal enrichment tool for interpreting omics data. The Innovation. 2021, 2(3):100141
##
## Attaching package: 'clusterProfiler'
## The following object is masked from 'package:purrr':
##
## simplify
## The following object is masked from 'package:stats':
##
## filter
library(org.Ncelegans.eg.db, lib = 'R_Library') #如果在前面没有设置路径,需要在lib = ""中加入路径
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:lubridate':
##
## intersect, setdiff, union
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: IRanges
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:clusterProfiler':
##
## rename
## The following objects are masked from 'package:lubridate':
##
## second, second<-
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:tidyr':
##
## expand
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:clusterProfiler':
##
## slice
## The following object is masked from 'package:lubridate':
##
## %within%
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## The following object is masked from 'package:purrr':
##
## reduce
##
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:clusterProfiler':
##
## select
## The following object is masked from 'package:dplyr':
##
## select
##
gene = as.vector(as.matrix(genelist$GeneID))
de_ego <- enrichGO(gene = gene,
OrgDb = org.Ncelegans.eg.db,
keyType = 'GID',
ont = 'ALL', #BP MF CC
qvalueCutoff = 0.01,
pvalueCutoff = 0.01,
pAdjustMethod = "BH" # one of "holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", "none"
)
head(de_ego)
## ONTOLOGY ID Description
## GO:0010481 BP GO:0010481 epidermal cell division
## GO:0010482 BP GO:0010482 regulation of epidermal cell division
## GO:0042814 BP GO:0042814 monopolar cell growth
## GO:0048530 BP GO:0048530 fruit morphogenesis
## GO:2000039 BP GO:2000039 regulation of trichome morphogenesis
## GO:0030856 BP GO:0030856 regulation of epithelial cell differentiation
## GeneRatio BgRatio pvalue p.adjust qvalue
## GO:0010481 5/500 11/14528 1.841261e-05 0.006886316 0.006632415
## GO:0010482 5/500 11/14528 1.841261e-05 0.006886316 0.006632415
## GO:0042814 5/500 11/14528 1.841261e-05 0.006886316 0.006632415
## GO:0048530 5/500 11/14528 1.841261e-05 0.006886316 0.006632415
## GO:2000039 5/500 11/14528 1.841261e-05 0.006886316 0.006632415
## GO:0030856 5/500 12/14528 3.067219e-05 0.007169623 0.006905278
## geneID Count
## GO:0010481 RC1G0366400/RC1G0366500/RC1G0366800/RC1G0367300/RC1G0367400 5
## GO:0010482 RC1G0366400/RC1G0366500/RC1G0366800/RC1G0367300/RC1G0367400 5
## GO:0042814 RC1G0366400/RC1G0366500/RC1G0366800/RC1G0367300/RC1G0367400 5
## GO:0048530 RC1G0366400/RC1G0366500/RC1G0366800/RC1G0367300/RC1G0367400 5
## GO:2000039 RC1G0366400/RC1G0366500/RC1G0366800/RC1G0367300/RC1G0367400 5
## GO:0030856 RC1G0366400/RC1G0366500/RC1G0366800/RC1G0367300/RC1G0367400 5
保存文件
write.csv(de_ego, file = "NPIvsPI_GO.csv") #保存为csv文件
#rmredunego <- simplify(de_ego, cutoff=0.7, by="p.adjust", select_fun=min)
de_ego_df <- as.data.frame(de_ego)
barplot(de_ego, showCategory = 15)
enrichplot::dotplot(de_ego, showCategory = 15)
BiocManager::install("GOSemSim")
解决R版本太高会出现的不兼容问题,如没有该问题请忽略
library(R.utils)
R.utils::setOption("clusterProfiler.download.method","auto")
4.1 准备 TERM2GENE
将ko号与基因ID对应上
pathway2gene <- dplyr::select(ob.emapper.m, Pathway, GID) %>%
separate_rows(Pathway, sep = ',', convert = F)%>%
filter(str_detect(Pathway, 'ko'))%>%
mutate(Pathway = str_remove(Pathway, 'ko'))#去掉ko
将ko号和path_name对应上
library(magrittr)
get_path2name <- function(){
keggpathid2name.df <- clusterProfiler:::kegg_list("pathway")
keggpathid2name.df[,1] %<>% gsub("path:map", "", .)
colnames(keggpathid2name.df) <- c("path_id","path_name")
return(keggpathid2name.df)
}
kegg_list <- read_csv("kegg_list.csv")
get_path2name <- function(){
keggpathid2name.df <- kegg_list
keggpathid2name.df[,1] %<>% gsub("path:map", "", .)
colnames(keggpathid2name.df) <- c("path_id","path_name")
return(keggpathid2name.df)
}
pathway2name <- get_path2name()
pathway2name <- dplyr::select(pathway2name, path_id, path_name) %>%
filter(str_detect(path_id, 'map'))%>%
mutate(path_id = str_remove(path_id, 'map'))
library(clusterProfiler)
de_ekp <- enricher(gene=gene,
TERM2GENE = pathway2gene,
TERM2NAME = pathway2name,
pvalueCutoff = 0.05,
qvalueCutoff = 0.05,
pAdjustMethod = "BH")
de_ekp_df <- as.data.frame(de_ekp)
de_ekp_df <- left_join(de_ekp_df,pathway2des,by = "ID")
导出数据
write.csv(de_ekp_df, file = "NPIvsPI_kegg.csv")
write.csv(pathway2name, file = "pathway2name.csv")
#openxlsx::write.xlsx(de_ekp_df,"UP0H_kegg.xlsx")
#openxlsx::write.xlsx(pathway2name,"pathway2name.xlsx")
可视化
barplot(de_ekp, showCa = 20)
enrichplot::dotplot(de_ekp, showCategory = 20)