近年来,大量生物学数据已可在公共数据库中获取。要进行全面的生物信息学分析,需要能够方便地访问这些宝贵的数据资源,并将其可靠地整合到数据分析流程中。biomaRt 包为越来越多基于 BioMart 软件套件构建的数据库(例如 Ensembl、UniProt 和 HapMap)提供了接口。借助该包,用户无需了解底层数据库结构,也无需编写复杂的 SQL 查询,即可用统一的方式检索大量数据。
biomaRt 包可以在 R 中直接访问多种数据库,从而省去了手动下载并读入数据的繁琐步骤。
## Working directory and package setup
# Keep the path in one place so the create and setwd steps cannot drift apart.
work_dir <- "D:/R/biomaRt"
# Create the directory only when it is missing: an unconditional dir.create()
# warns "already exists" on every re-run. recursive = TRUE also creates any
# missing parent directories so the setwd() below does not fail on a fresh
# machine.
if (!dir.exists(work_dir)) {
  dir.create(work_dir, recursive = TRUE)
}
setwd(work_dir)
library("biomaRt")
# Show the BioMart services that are available to query.
listMarts()
## biomart version
## 1 ENSEMBL_MART_ENSEMBL Ensembl Genes 97
## 2 ENSEMBL_MART_MOUSE Mouse strains 97
## 3 ENSEMBL_MART_SNP Ensembl Variation 97
## 4 ENSEMBL_MART_FUNCGEN Ensembl Regulation 97
# Select the "ensembl" mart; the resulting object is passed to later calls.
ensembl <- useMart(biomart = "ensembl")
## See which datasets the selected mart offers
datasets <- listDatasets(mart = ensembl)
dim(datasets)
## [1] 190 3
head(datasets, n = 6L)
## dataset description
## 1 abrachyrhynchus_gene_ensembl Pink-footed goose genes (ASM259213v1)
## 2 acalliptera_gene_ensembl Eastern happy genes (fAstCal1.2)
## 3 acarolinensis_gene_ensembl Anole lizard genes (AnoCar2.0)
## 4 acitrinellus_gene_ensembl Midas cichlid genes (Midas_v5)
## 5 ahaastii_gene_ensembl Great spotted kiwi genes (aptHaa1)
## 6 amelanoleuca_gene_ensembl Panda genes (ailMel1)
## version
## 1 ASM259213v1
## 2 fAstCal1.2
## 3 AnoCar2.0
## 4 Midas_v5
## 5 aptHaa1
## 6 ailMel1
## Pick the dataset of interest and rebind the mart object to it
ensembl <- useDataset(dataset = "hsapiens_gene_ensembl", mart = ensembl)
## Alternatively, choose mart and dataset in a single call:
# ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
## Arguments of getBM()
## Filters: restrict which records are returned
filters <- listFilters(mart = ensembl) # filters usable with this dataset
filters[seq_len(5), ]
## name description
## 1 chromosome_name Chromosome/scaffold name
## 2 start Start
## 3 end End
## 4 band_start Band Start
## 5 band_end Band End
## Attributes: the fields that can be retrieved
attributes <- listAttributes(mart = ensembl)
dim(attributes)
## [1] 3450 3
attributes[seq_len(5), ]
## name description page
## 1 ensembl_gene_id Gene stable ID feature_page
## 2 ensembl_gene_id_version Gene stable ID version feature_page
## 3 ensembl_transcript_id Transcript stable ID feature_page
## 4 ensembl_transcript_id_version Transcript stable ID version feature_page
## 5 ensembl_peptide_id Protein stable ID feature_page
# Microarray (chip) annotation attributes; the row positions are hard-coded
# and match the attribute table printed below.
attributes[seq(104, 136), ]
## name
## 104 affy_hc_g110
## 105 affy_hg_focus
## 106 affy_hg_u133a
## 107 affy_hg_u133a_2
## 108 affy_hg_u133b
## 109 affy_hg_u133_plus_2
## 110 affy_hg_u95a
## 111 affy_hg_u95av2
## 112 affy_hg_u95b
## 113 affy_hg_u95c
## 114 affy_hg_u95d
## 115 affy_hg_u95e
## 116 affy_hta_2_0
## 117 affy_huex_1_0_st_v2
## 118 affy_hugenefl
## 119 affy_hugene_1_0_st_v1
## 120 affy_hugene_2_0_st_v1
## 121 affy_primeview
## 122 affy_u133_x3p
## 123 agilent_cgh_44b
## 124 agilent_gpl6848
## 125 agilent_sureprint_g3_ge_8x60k
## 126 agilent_sureprint_g3_ge_8x60k_v2
## 127 agilent_wholegenome
## 128 agilent_wholegenome_4x44k_v1
## 129 agilent_wholegenome_4x44k_v2
## 130 codelink_codelink
## 131 illumina_humanht_12_v3
## 132 illumina_humanht_12_v4
## 133 illumina_humanref_8_v3
## 134 illumina_humanwg_6_v1
## 135 illumina_humanwg_6_v2
## 136 illumina_humanwg_6_v3
## description page
## 104 AFFY HC G110 probe feature_page
## 105 AFFY HG Focus probe feature_page
## 106 AFFY HG U133A probe feature_page
## 107 AFFY HG U133A 2 probe feature_page
## 108 AFFY HG U133B probe feature_page
## 109 AFFY HG U133 Plus 2 probe feature_page
## 110 AFFY HG U95A probe feature_page
## 111 AFFY HG U95Av2 probe feature_page
## 112 AFFY HG U95B probe feature_page
## 113 AFFY HG U95C probe feature_page
## 114 AFFY HG U95D probe feature_page
## 115 AFFY HG U95E probe feature_page
## 116 AFFY HTA 2 0 probe feature_page
## 117 AFFY HuEx 1 0 st v2 probe feature_page
## 118 AFFY HuGeneFL probe feature_page
## 119 AFFY HuGene 1 0 st v1 probe feature_page
## 120 AFFY HuGene 2 0 st v1 probe feature_page
## 121 AFFY PrimeView probe feature_page
## 122 AFFY U133 X3P probe feature_page
## 123 AGILENT CGH 44b probe feature_page
## 124 AGILENT GPL6848 probe feature_page
## 125 AGILENT SurePrint G3 GE 8x60k probe feature_page
## 126 AGILENT SurePrint G3 GE 8x60k v2 probe feature_page
## 127 AGILENT WholeGenome probe feature_page
## 128 AGILENT WholeGenome 4x44k v1 probe feature_page
## 129 AGILENT WholeGenome 4x44k v2 probe feature_page
## 130 CODELINK CODELINK probe feature_page
## 131 ILLUMINA HumanHT 12 V3 probe feature_page
## 132 ILLUMINA HumanHT 12 V4 probe feature_page
## 133 ILLUMINA HumanRef 8 V3 probe feature_page
## 134 ILLUMINA HumanWG 6 V1 probe feature_page
## 135 ILLUMINA HumanWG 6 V2 probe feature_page
## 136 ILLUMINA HumanWG 6 V3 probe feature_page
## Every attribute whose description mentions "probe"
prod <- attributes[["description"]]
is_probe <- grepl("probe", prod, fixed = TRUE) # logical mask, one per row
probe <- attributes[is_probe, ]
str(probe)
## 'data.frame': 34 obs. of 3 variables:
## $ name : chr "affy_hc_g110" "affy_hg_focus" "affy_hg_u133a" "affy_hg_u133a_2" ...
## $ description: chr "AFFY HC G110 probe" "AFFY HG Focus probe" "AFFY HG U133A probe" "AFFY HG U133A 2 probe" ...
## $ page : chr "feature_page" "feature_page" "feature_page" "feature_page" ...
print(probe)
## name
## 104 affy_hc_g110
## 105 affy_hg_focus
## 106 affy_hg_u133a
## 107 affy_hg_u133a_2
## 108 affy_hg_u133b
## 109 affy_hg_u133_plus_2
## 110 affy_hg_u95a
## 111 affy_hg_u95av2
## 112 affy_hg_u95b
## 113 affy_hg_u95c
## 114 affy_hg_u95d
## 115 affy_hg_u95e
## 116 affy_hta_2_0
## 117 affy_huex_1_0_st_v2
## 118 affy_hugenefl
## 119 affy_hugene_1_0_st_v1
## 120 affy_hugene_2_0_st_v1
## 121 affy_primeview
## 122 affy_u133_x3p
## 123 agilent_cgh_44b
## 124 agilent_gpl6848
## 125 agilent_sureprint_g3_ge_8x60k
## 126 agilent_sureprint_g3_ge_8x60k_v2
## 127 agilent_wholegenome
## 128 agilent_wholegenome_4x44k_v1
## 129 agilent_wholegenome_4x44k_v2
## 130 codelink_codelink
## 131 illumina_humanht_12_v3
## 132 illumina_humanht_12_v4
## 133 illumina_humanref_8_v3
## 134 illumina_humanwg_6_v1
## 135 illumina_humanwg_6_v2
## 136 illumina_humanwg_6_v3
## 137 phalanx_onearray
## description page
## 104 AFFY HC G110 probe feature_page
## 105 AFFY HG Focus probe feature_page
## 106 AFFY HG U133A probe feature_page
## 107 AFFY HG U133A 2 probe feature_page
## 108 AFFY HG U133B probe feature_page
## 109 AFFY HG U133 Plus 2 probe feature_page
## 110 AFFY HG U95A probe feature_page
## 111 AFFY HG U95Av2 probe feature_page
## 112 AFFY HG U95B probe feature_page
## 113 AFFY HG U95C probe feature_page
## 114 AFFY HG U95D probe feature_page
## 115 AFFY HG U95E probe feature_page
## 116 AFFY HTA 2 0 probe feature_page
## 117 AFFY HuEx 1 0 st v2 probe feature_page
## 118 AFFY HuGeneFL probe feature_page
## 119 AFFY HuGene 1 0 st v1 probe feature_page
## 120 AFFY HuGene 2 0 st v1 probe feature_page
## 121 AFFY PrimeView probe feature_page
## 122 AFFY U133 X3P probe feature_page
## 123 AGILENT CGH 44b probe feature_page
## 124 AGILENT GPL6848 probe feature_page
## 125 AGILENT SurePrint G3 GE 8x60k probe feature_page
## 126 AGILENT SurePrint G3 GE 8x60k v2 probe feature_page
## 127 AGILENT WholeGenome probe feature_page
## 128 AGILENT WholeGenome 4x44k v1 probe feature_page
## 129 AGILENT WholeGenome 4x44k v2 probe feature_page
## 130 CODELINK CODELINK probe feature_page
## 131 ILLUMINA HumanHT 12 V3 probe feature_page
## 132 ILLUMINA HumanHT 12 V4 probe feature_page
## 133 ILLUMINA HumanRef 8 V3 probe feature_page
## 134 ILLUMINA HumanWG 6 V1 probe feature_page
## 135 ILLUMINA HumanWG 6 V2 probe feature_page
## 136 ILLUMINA HumanWG 6 V3 probe feature_page
## 137 PHALANX OneArray probe feature_page
## Suppose you start from a set of Affy probe IDs
affyids <- c("202763_at", "209310_s_at", "207500_at")
getBM(
  # columns to return
  attributes = c(
    "affy_hg_u133_plus_2", "hgnc_symbol", "chromosome_name",
    "start_position", "end_position", "band"
  ),
  # field that the input values are matched against
  filters = "affy_hg_u133_plus_2",
  # the input values themselves
  values = affyids,
  # mart/dataset object to query
  mart = ensembl
)
## affy_hg_u133_plus_2 hgnc_symbol chromosome_name start_position
## 1 209310_s_at CASP4 11 104942866
## 2 207500_at CASP5 11 104994235
## 3 202763_at CASP3 4 184627696
## end_position band
## 1 104969366 q22.3
## 2 105023168 q22.3
## 3 184649509 q35.1
# Look up GO term accessions for two Entrez gene IDs.
entrez <- c("673", "837")
goids <- getBM(
  attributes = c("entrezgene_id", "go_id"), # columns to return
  filters = "entrezgene_id",                # field the inputs are matched on
  values = entrez,                          # input IDs
  mart = ensembl
)
head(goids, n = 6L)
## entrezgene_id go_id
## 1 673 GO:0005524
## 2 673 GO:0007165
## 3 673 GO:0006468
## 4 673 GO:0035556
## 5 673 GO:0004672
## 6 673 GO:0046872
# Combine two filters: GO terms and chromosome names.
go <- c("GO:0051330", "GO:0000080", "GO:0000114", "GO:0000082")
# Written as strings: mixing numbers with "Y" in c() yields a character vector.
chrom <- c("17", "20", "Y")
getBM(
  attributes = "hgnc_symbol",             # column to return
  filters = c("go", "chromosome_name"),   # one filter name per list entry
  values = list(go, chrom),               # values, in the same order as filters
  mart = ensembl
)
## hgnc_symbol
## 1 RPA1
## 2 CDC6
## 3 MCM8
## 4 CRLF3
## 5 CDK3
## 6 RPS6KB1
# Genes annotated with a single GO term.
getBM(
  attributes = c("entrezgene_id", "hgnc_symbol"), # columns to return
  filters = "go",                                 # field the input is matched on
  values = "GO:0004707",                          # input GO accession
  mart = ensembl
)
## entrezgene_id hgnc_symbol
## 1 225689 MAPK15
## 2 5597 MAPK6
## 3 5598 MAPK7
## 4 51701 NLK
## 5 5601 MAPK9
## 6 1432 MAPK14
## 7 5594 MAPK1
## 8 5599 MAPK8
## 9 5595 MAPK3
## 10 5602 MAPK10
## 11 5603 MAPK13
## 12 5600 MAPK11
## 13 6300 MAPK12
## 14 5596 MAPK4
## Check which fields are available for retrieval
attributes <- listAttributes(mart = ensembl)
dim(attributes)
## [1] 3450 3
attributes[seq_len(5), ]
## name description page
## 1 ensembl_gene_id Gene stable ID feature_page
## 2 ensembl_gene_id_version Gene stable ID version feature_page
## 3 ensembl_transcript_id Transcript stable ID feature_page
## 4 ensembl_transcript_id_version Transcript stable ID version feature_page
## 5 ensembl_peptide_id Protein stable ID feature_page
# grepl() returns one TRUE/FALSE per row, which is used here as a row mask.
attributes[grepl("go_id", attributes[["name"]], fixed = TRUE), ]
## name description page
## 41 go_id GO term accession feature_page
# Same lookup, this time matching on the description column.
attributes[grepl("KEGG", attributes[["description"]], fixed = TRUE), ]
## name description page
## 68 kegg_enzyme KEGG Pathway and Enzyme ID feature_page
## Same query with an extra gene_biotype column
getBM(
  attributes = c("entrezgene_id", "hgnc_symbol", "gene_biotype"), # columns
  filters = "go",                                                 # match field
  values = "GO:0004707",                                          # input value
  mart = ensembl
)
## entrezgene_id hgnc_symbol gene_biotype
## 1 225689 MAPK15 protein_coding
## 2 5597 MAPK6 protein_coding
## 3 5598 MAPK7 protein_coding
## 4 51701 NLK protein_coding
## 5 5601 MAPK9 protein_coding
## 6 1432 MAPK14 protein_coding
## 7 5594 MAPK1 protein_coding
## 8 5599 MAPK8 protein_coding
## 9 5595 MAPK3 protein_coding
## 10 5602 MAPK10 protein_coding
## 11 5603 MAPK13 protein_coding
## 12 5600 MAPK11 protein_coding
## 13 6300 MAPK12 protein_coding
## 14 5596 MAPK4 protein_coding