已知基因的 ENTREZID，需要下载该基因对应的蛋白序列。
R 语言的 rentrez 包封装了 NCBI 数据库的 API，可以用来批量下载数据。由于 ENTREZID 是 NCBI Gene 数据库中基因的唯一 ID，我们需要先通过这个 ID 找到蛋白（protein）数据库中对应的 ID，再据此下载数据即可。步骤主要分成以下几步。
用 `id + [UID]` 的形式可以唯一地检索到该基因在 protein 数据库中对应的蛋白 ID，再通过蛋白 ID 下载 FASTA 格式的蛋白序列。
# 构建示例ID
entrezID <- c("8241", "4008", "51025", "399687", "9782")
# Load the package that provides the entrez_* functions used below
library(rentrez)
# Single-gene demo: look up the protein records linked to one gene
proteinID <- entrez_link(
  dbfrom = "gene",
  db = "protein",
  id = "8241[UID]"
)
# One gene may have several transcripts, so several protein IDs can come back
print(proteinID$links$gene_protein_refseq)
## [1] "1370525043" "1370525040" "1370525038" "1370525036" "1370525034"
## [6] "1370525032" "1370525030" "1370525028" "1370525026" "1034675411"
## [11] "1034675409" "530421602" "530421600" "530421598" "325120986"
## [16] "325120984" "325120982" "23111018" "20127479"
# Download the FASTA record for the first linked RefSeq protein and show it
firstProteinID <- proteinID$links$gene_protein_refseq[1]
fasta1 <- entrez_fetch(
  db = "protein",
  id = firstProteinID,
  rettype = "fasta"
)
print(fasta1)
## [1] ">XP_024308233.1 RNA-binding protein 10 isoform X13 [Homo sapiens]\nMDSILGALAPYAVLSSSNVRVIKDKQTQLNRGFAFIQLSTIVEAAQLLQILQALHPPLTIDGKTINVEFA\nKGSKRDMASNEGSRISAASVASTAIAAAQWAISQASQGGEGTWATSEEPPVDYSYYQQDEGYGNSQGTES\nSLYAHGYLKGTKGPGITGTKGDPTGAGPEASLEPGADSVSMQAFSRAQPGAAPGIYQQSAEASSSQGTAA\nNSQVSEPCGYVSRGGRQAAGVAWADTEPCSPVWPHDQSYTIMSPAVLKSELQSPTHPSSALPPATSPTAQ\nESYSQYPVPDVSTYQYDETSGYYYDPQTGLYYDPNSQYYYNAQSQQYLYWDGERRTYVPALEQSADGHKE\nTGAPSKEGKEKKEKHKTKTAQQIAKDMERWARSLNKQKENFKNSFQPISSLRDDERRESATADAGYAILE\nKKGALAERQHTSMDLPKLASDDRPSPPRGLVAAYSGESDSEEEQERGGPEREEKLTDWQKLACLLCRRQF\nPSKEALIRHQQLSGLHKQNLEIHRRAHLSENELEALEKNDMEQMKYRDRAAERREKYGIPEPPEPKRRKY\nGGISTASVDFEQPTRDGLGSDNIGSRMLQAMGWKEGSGLGRKKQGIVTPIEAQTRVRGSGLGARGSSYGV\nTSTESYKETLHKTMVTRFNEAQ\n\n"
如果有多个基因，构建一个循环即可：
# Fetch the RefSeq protein FASTA records for every gene in `entrezID`.
# `ProteinSeq` ends up as a character vector with one element per gene,
# named by gene ID. Each element holds the FASTA text returned for all of
# that gene's linked proteins, or NA when no linked protein was found.
ProteinSeq <- rep(NA_character_, length(entrezID))
names(ProteinSeq) <- entrezID
for (k in seq_along(entrezID)) {
  index <- paste0(entrezID[k], "[UID]")
  proteinID <- entrez_link(dbfrom = "gene", db = "protein", id = index)
  refseqID <- proteinID$links$gene_protein_refseq
  # A gene without RefSeq protein links has nothing to fetch; leave its
  # slot as NA and keep going instead of letting one gene stop the batch.
  if (length(refseqID) == 0) {
    warning(
      "No RefSeq protein links found for gene ", entrezID[k],
      call. = FALSE
    )
    next
  }
  ProteinSeq[k] <- entrez_fetch(
    db = "protein",
    id = refseqID,
    rettype = "fasta"
  )
}