We already know how to use entrez_fetch to download a single FASTA sequence, but we often need to download many FASTA sequences so we can study them together. This code shows how to download multiple FASTA sequences at once.
Load the necessary libraries
library(compbio4all)
library(rentrez)
library(seqinr)
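Make a table of identifiers for the DIO1 gene in different organisms. Each row holds the NCBI protein accession, UniProt ID, PDB ID, species name, common name, and gene name for one organism.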
# Build the DIO1 lookup table: one row per organism, six fields per row
# (NCBI protein accession, UniProt ID, PDB ID, species, common name, gene name).
# Missing identifiers are stored as the literal string "NA", not R's NA value.
#
# Row 4: accession NP_001091083 is returned by NCBI as a Gallus gallus
# (chicken) record, so the row is labelled as chicken to match the sequence
# that is actually downloaded. It was previously labelled "Rattus norvegicus" /
# "Rat" with UniProt ID "P24389" and gene name "Dio1".
# NOTE(review): if rat was the intended organism, replace the accession with
# the rat RefSeq protein accession and restore those rat values instead. The
# chicken UniProt ID has not been looked up, hence "NA" -- TODO confirm.
dio1_table <- c(
  "NP_000783",    "P49895", "NA", "Homo sapiens",       "Human",      "DIO1",
  "NP_001116123", "NA",     "NA", "Pan troglodytes",    "Chimpanzee", "DIO1",
  "NP_031886",    "Q61153", "NA", "Mus musculus",       "Mouse",      "DIO1",
  "NP_001091083", "NA",     "NA", "Gallus gallus",      "Chicken",    "DIO1",
  "NP_001243226", "Q2QEI3", "NA", "Xenopus tropicalis", "Frog",       "dio1",
  "NP_001007284", "F1R7E6", "NA", "Danio rerio",        "Fish",       "dio1"
)

# Reshape the flat vector into a matrix, filling row by row. Fixing the number
# of columns (rather than rows) means adding another organism above does not
# require editing this call.
dio1_table_matrix <- matrix(dio1_table, byrow = TRUE, ncol = 6)

# Convert the matrix to a data frame, keeping every column as character.
dio1_table <- as.data.frame(dio1_table_matrix, stringsAsFactors = FALSE)

# Name the columns of the data frame using colnames().
colnames(dio1_table) <- c(
  "ncbi.protein.accession",
  "UniProt.id",
  "PDB",
  "species",
  "common.name",
  "gene.name"
)
Use the entrez_fetch_list function to download all of the FASTA sequences from the NCBI protein database. This function lets us download many sequences at once and stores them together in a list.
# Download the FASTA record for every accession in the table from the NCBI
# "protein" database. The result is kept in a list; in the output shown below
# each element is named after its accession and holds one FASTA string.
dio1s_list <- compbio4all::entrez_fetch_list(
  db = "protein",
  id = dio1_table[["ncbi.protein.accession"]],
  rettype = "fasta"
)
Display the list to show all of the sequences
# Evaluate the list at top level so that it is auto-printed, showing every
# downloaded record under its element name.
dio1s_list
## $NP_000783
## [1] ">NP_000783.2 type I iodothyronine deiodinase isoform a [Homo sapiens]\nMGLPQPGLWLKRLWVLLEVAVHVVVGKVLLILFPDRVKRNILAMGEKTGMTRNPHFSHDNWIPTFFSTQY\nFWFVLKVRWQRLEDTTELGGLAPNCPVVRLSGQRCNIWEFMQGNRPLVLNFGSCTUPSFMFKFDQFKRLI\nEDFSSIADFLVIYIEEAHASDGWAFKNNMDIRNHQNLQDRLQAAHLLLARSPQCPVVVDTMQNQSSQLYA\nALPERLYIIQEGRILYKGKSGPWNYNPEEVRAVLEKLHS\n\n"
##
## $NP_001116123
## [1] ">NP_001116123.2 type I iodothyronine deiodinase [Pan troglodytes]\nMGLPQPGLWLKRLWVLLEVAVHVVVGKVLLILFPDRVKRNILAMGEKTGMTRNPHFSHDNWIPTFFSTQY\nFWFVLKVRWQRLEDTTELGGLAPNCPVVHLSGQRCNIWEFMQGNRPLVLNFGSCTUPSFMFKFDQFKRLI\nEDFSSIADFLVIYIEEAHASDGWAFKNNMDIRNHQNLQDRLQAAHLLLARSPQCPVVVDTMQNQSSQLYA\nALPERLYVIQEGRILYKGKSGPWNYNPEEVRAVLEKLHS\n\n"
##
## $NP_031886
## [1] ">NP_031886.3 type I iodothyronine deiodinase [Mus musculus]\nMGLPQLWLWLKRLVIFLQVALEVAVGKVLMTLFPGRVKQSILAMGQKTGMARNPRFAPDNWVPTFFSIQY\nFWFVLKVRWQRLEDRAEFGGLAPNCTVVCLSGQKCNIWDFIQGSRPLVLNFGSCTUPSFLLKFDQFKRLV\nDDFASTADFLIIYIEEAHATDGWAFKNNVDIRQHRSLQERVRAARMLLARSPQCPVVVDTMQNQSSQLYA\nALPERLYVIQEGRICYKGKAGPWNYNPEEVRAVLEKLCTPPRHVPQL\n\n"
##
## $NP_001091083
## [1] ">NP_001091083.2 type I iodothyronine deiodinase [Gallus gallus]\nMLSIGVLLHKLLILLQVTLSVVVGKTMMILFPDATKRYILKLGEKSRMNQNPKFSYENWGPTFFSFQYLL\nFVLKVKWRRLEDEAHEGRPAPNTPVVALNGEMQHLFSFMRDNRPLILNFGSCTUPSFMLKFDEFNKLVKD\nFSSIADFLIIYIEEAHAVDGWAFRNNVVIKNHRSLEDRKTAAQFLQQKNPLCPVVLDTMENLSSSKYAAL\nPERLYILQAGNVIYKGGVGPWNYHPQEIRAVLEKLK\n\n"
##
## $NP_001243226
## [1] ">NP_001243226.1 type I iodothyronine deiodinase isoform 1 [Xenopus tropicalis]\nMESLLQTIKLMVRFIQKTMIFFFLFIYVVVGKVLMFFFPQTMASVLKSRFETTGVHDPKFQYEDWGPTFF\nTYKFLRSVLEIMWLRLEDEAFVGHSAPNTPVIDLNGELHHIWDYLQGTRPLVLNFGSCTUPPFLFRLGEF\nNKLVNDFSSIADFLIIYIDEAHAADEWALKNNLHIKKHRCLQDRLAAAKRLLEELPSCPVVLDTMSNLCS\nAKYAALPERLYILQEGKIIYKGKMGPWGYKPEEVHSVLEKKK\n\n"
##
## $NP_001007284
## [1] ">NP_001007284.3 type I iodothyronine deiodinase isoform 1 [Danio rerio]\nMGSAVGFALRKLFVYISAVLMVCAAILRMSMLKLLSFISPGRMRKIHMKMGERTTMTQNPKFRYEDWGPA\nFFSLAFIKTLFFVNWCSLGLEAFEGHSAPDSALITLDRQKTSVHRFLKGNRPLVLSFGSCTUPPFLYKLD\nEFKQLVKDFSNVADFLIVYLAEAHATDAWAFKNNVDISVHKNLEERLAAARTLLKEDPPCPVVVDEMNNI\nTASKYGALPERLYVIQSGKVIYQGGIGPWGYKPEEVRKVLEKSK\n\n"