Get your query ids from file. Here i use fie named Streptomyces_Protein_ID_for_DNA_seq.txt containing my query protein ids. File contains one column (no header) where each row belongs to one id.
query_ids <- read_delim("Streptomyces_Protein_ID_for_DNA_seq.txt" , col_names = F, delim = "\t") %>% pull(1)
## Parsed with column specification:
## cols(
## X1 = col_character()
## )
query_ids %>% sample(5)
## [1] "OMP15435.1" "WP_043476645.1" "WP_094052900.1" "WP_028800626.1"
## [5] "WP_093909732.1"
Before we get our query protein sequences, we have to download the target db of cds from NCBI.
As per the NCBI guideline WP_* id belongs to non redundant protein sequences and therefore while querying them through batch entrez, it doesnโt return any result for cds fasta output. To know more about Wp_* id type, please refer NCBI guidelines. Alternative is to use NCBI elink and efetch function to download the cds fasta.
Using elink and efetch together first we have to download CDS sequences. Downloaded fasta file contains all the sequences given under one Wp_* and many other sequences. Therefore, we have to get rid of redundant and non required sequences.
Below is the command to download CDS fasta using elink and efetch utility.
elink -target nuccore -db protein -name protein_nuccore -id WP_114529907|efetch -format fasta_cds_na > all_cds.fasta
Once the CDS downloaded, below I use R script to filter the sequences for query ids.
fasta_cds <- Biostrings::readDNAStringSet("all_cds2.fasta")
fasta_cds
## A DNAStringSet instance of length 1371135
## width seq names
## [1] 264 CTGTGTTTATTTCAAAGG...TTCCGACTTTATCAGAG lcl|NZ_QTTG010000...
## [2] 1686 TTGAGTCATCCGCATCCT...TCGTGGTGGAGGTCTGA lcl|NZ_QTTG010000...
## [3] 900 ATGGCTCCGACCTCGACT...GCGGGGTACAGCTCTGA lcl|NZ_QTTG010000...
## [4] 735 GTGACCTCCGGCGACGCC...GCCGTGTGGCGCCGTAG lcl|NZ_QTTG010000...
## [5] 237 ATGTTCCGTCGGCGTGAA...CCTCCCAGGGGCACTGA lcl|NZ_QTTG010000...
## ... ... ...
## [1371131] 423 ATGACGTACATCACCCGA...TCGCCCTGGACAGGTGA lcl|NZ_KQ948020.1...
## [1371132] 1251 GTGAGTCTCCTGCCGGAG...GGGAGCGGGCGGCGTGA lcl|NZ_KQ948020.1...
## [1371133] 1434 ATGAGCATCAACACACCG...GCCTGCACCAGCAGTGA lcl|NZ_KQ948020.1...
## [1371134] 933 TTGAGCCTGCGGCAGATG...GCGAGCTGCCCGCCTGA lcl|NZ_KQ948020.1...
## [1371135] 0 lcl|NZ_KQ948020.1...
fasta_cds_ids <- base::names(fasta_cds)
fasta_cds_ids %>% head()
## [1] "lcl|NZ_QTTG01000001.1_cds_1 [locus_tag=CLW14_RS00015] [protein=hypothetical protein] [pseudo=true] [partial=3'] [location=3770..>4033] [gbkey=CDS]"
## [2] "lcl|NZ_QTTG01000001.1_cds_WP_115905474.1_2 [locus_tag=CLW14_RS00020] [protein=ribonuclease J] [protein_id=WP_115905474.1] [location=complement(4194..5879)] [gbkey=CDS]"
## [3] "lcl|NZ_QTTG01000001.1_cds_WP_115905475.1_3 [gene=dapA] [locus_tag=CLW14_RS00025] [protein=4-hydroxy-tetrahydrodipicolinate synthase] [protein_id=WP_115905475.1] [location=complement(6021..6920)] [gbkey=CDS]"
## [4] "lcl|NZ_QTTG01000001.1_cds_WP_115905476.1_4 [locus_tag=CLW14_RS00030] [protein=FAD-dependent thymidylate synthase] [protein_id=WP_115905476.1] [location=complement(7130..7864)] [gbkey=CDS]"
## [5] "lcl|NZ_QTTG01000001.1_cds_WP_115905477.1_5 [locus_tag=CLW14_RS00035] [protein=hypothetical protein] [protein_id=WP_115905477.1] [location=8034..8270] [gbkey=CDS]"
## [6] "lcl|NZ_QTTG01000001.1_cds_WP_115905478.1_6 [locus_tag=CLW14_RS00040] [protein=hypothetical protein] [protein_id=WP_115905478.1] [location=8361..8912] [gbkey=CDS]"
## from original header get protein id for each seq and assign as sequence id
get_prot_id <- as_mapper(~ gsub(pattern = ".*\\[protein_id=(.*?)\\].*" , replacement = "\\1" ,x = .))
pp_id <-map_chr(fasta_cds_ids ,get_prot_id)
pp_id %>% sample(10)
## [1] "WP_099944679.1"
## [2] "WP_143603202.1"
## [3] "WP_143600966.1"
## [4] "WP_052229905.1"
## [5] "WP_097257548.1"
## [6] "lcl|NZ_QEOG01000001.1_cds_2635 [locus_tag=C8R38_RS13245] [protein=transposase] [pseudo=true] [partial=3'] [location=complement(<2839365..2839730)] [gbkey=CDS]"
## [7] "WP_108952783.1"
## [8] "WP_054232629.1"
## [9] "WP_097287354.1"
## [10] "WP_111329647.1"
length(pp_id)
## [1] 1371135
names(fasta_cds) <- pp_id
fasta_cds_uniq <- fasta_cds[ ! (names(fasta_cds) %>% duplicated())]
query_fasta_cds <- fasta_cds_uniq [names(fasta_cds_uniq) %in% query_ids]
query_fasta_cds
## A DNAStringSet instance of length 514
## width seq names
## [1] 846 ATGCGTCGTCCCTTTGTCCG...CGGGGGCAGGGCTCACTGA WP_115909503.1
## [2] 792 ATGTTCGGGTACAACCGTGC...CGCACAGGTCAACTCGTAG WP_115909964.1
## [3] 792 ATGTTCGGGCTCAGACGTGC...CGCGCAGGTCAACTCGTAA WP_066997879.1
## [4] 795 ATGGCGCGGTCGCGGTCGTG...TATCGCGCGGTCGTCGTAG WP_066974658.1
## [5] 810 ATGCGACCGACCGTACGCGC...GGCGCACGGCGCCGGCTGA WP_066974204.1
## ... ... ...
## [510] 789 ATGTTCGGGCTCACCCGTGC...CGAGCAGGTCGAGTCCTGA WP_062716689.1
## [511] 885 ATGCGTCGTCCCATTGCCCG...AGCGCCTCAGAGGCGCTGA WP_062238494.1
## [512] 792 ATGTTCGGGCTCACACGTGC...CGCGCAGGTCGGCTCGTGA WP_014675804.1
## [513] 792 ATGTTCGCGCTCAACCGTGC...CGCGCAGGTCAACTCGTAA WP_055635421.1
## [514] 837 ATGCGGTGTCGCATTGCCCG...CGAGGCCCGCGGGGGCTGA WP_053190916.1
not_found <- query_ids[! query_ids %in% names(query_fasta_cds)]
not_found %>% length()
## [1] 3
## write fasta
Biostrings::writeXStringSet(query_fasta_cds , filepath = "./query_id_cds2.fasta")
write_delim(path = "./query_id_not_found.txt" , x = not_found %>% as.data.frame() , delim = "\t")