Get CDS seq from NCBI protein id

Get query id

Get your query ids from file. Here i use fie named Streptomyces_Protein_ID_for_DNA_seq.txt containing my query protein ids. File contains one column (no header) where each row belongs to one id.

query_ids <- read_delim("Streptomyces_Protein_ID_for_DNA_seq.txt" , col_names = F, delim = "\t") %>% pull(1)

## Parsed with column specification:
## cols(
##   X1 = col_character()
## )

query_ids %>% sample(5)

## [1] "OMP15435.1"     "WP_043476645.1" "WP_094052900.1" "WP_028800626.1"
## [5] "WP_093909732.1"

Prepare cds file db

Before we get our query protein sequences, we have to download the target db of cds from NCBI.

As per the NCBI guideline WP_* id belongs to non redundant protein sequences and therefore while querying them through batch entrez, it doesn’t return any result for cds fasta output. To know more about Wp_* id type, please refer NCBI guidelines. Alternative is to use NCBI elink and efetch function to download the cds fasta.

Using elink and efetch together first we have to download CDS sequences. Downloaded fasta file contains all the sequences given under one Wp_* and many other sequences. Therefore, we have to get rid of redundant and non required sequences.

Below is the command to download CDS fasta using elink and efetch utility.

elink -target nuccore -db protein -name protein_nuccore -id WP_114529907|efetch -format fasta_cds_na > all_cds.fasta

Once the CDS downloaded, below I use R script to filter the sequences for query ids.

get sequences in R

fasta_cds <- Biostrings::readDNAStringSet("all_cds2.fasta")
fasta_cds

##   A DNAStringSet instance of length 1371135
##           width seq                                    names               
##       [1]   264 CTGTGTTTATTTCAAAGG...TTCCGACTTTATCAGAG lcl|NZ_QTTG010000...
##       [2]  1686 TTGAGTCATCCGCATCCT...TCGTGGTGGAGGTCTGA lcl|NZ_QTTG010000...
##       [3]   900 ATGGCTCCGACCTCGACT...GCGGGGTACAGCTCTGA lcl|NZ_QTTG010000...
##       [4]   735 GTGACCTCCGGCGACGCC...GCCGTGTGGCGCCGTAG lcl|NZ_QTTG010000...
##       [5]   237 ATGTTCCGTCGGCGTGAA...CCTCCCAGGGGCACTGA lcl|NZ_QTTG010000...
##       ...   ... ...
## [1371131]   423 ATGACGTACATCACCCGA...TCGCCCTGGACAGGTGA lcl|NZ_KQ948020.1...
## [1371132]  1251 GTGAGTCTCCTGCCGGAG...GGGAGCGGGCGGCGTGA lcl|NZ_KQ948020.1...
## [1371133]  1434 ATGAGCATCAACACACCG...GCCTGCACCAGCAGTGA lcl|NZ_KQ948020.1...
## [1371134]   933 TTGAGCCTGCGGCAGATG...GCGAGCTGCCCGCCTGA lcl|NZ_KQ948020.1...
## [1371135]     0                                        lcl|NZ_KQ948020.1...

fasta_cds_ids <- base::names(fasta_cds) 
fasta_cds_ids %>% head()

## [1] "lcl|NZ_QTTG01000001.1_cds_1 [locus_tag=CLW14_RS00015] [protein=hypothetical protein] [pseudo=true] [partial=3'] [location=3770..>4033] [gbkey=CDS]"                                                            
## [2] "lcl|NZ_QTTG01000001.1_cds_WP_115905474.1_2 [locus_tag=CLW14_RS00020] [protein=ribonuclease J] [protein_id=WP_115905474.1] [location=complement(4194..5879)] [gbkey=CDS]"                                       
## [3] "lcl|NZ_QTTG01000001.1_cds_WP_115905475.1_3 [gene=dapA] [locus_tag=CLW14_RS00025] [protein=4-hydroxy-tetrahydrodipicolinate synthase] [protein_id=WP_115905475.1] [location=complement(6021..6920)] [gbkey=CDS]"
## [4] "lcl|NZ_QTTG01000001.1_cds_WP_115905476.1_4 [locus_tag=CLW14_RS00030] [protein=FAD-dependent thymidylate synthase] [protein_id=WP_115905476.1] [location=complement(7130..7864)] [gbkey=CDS]"                   
## [5] "lcl|NZ_QTTG01000001.1_cds_WP_115905477.1_5 [locus_tag=CLW14_RS00035] [protein=hypothetical protein] [protein_id=WP_115905477.1] [location=8034..8270] [gbkey=CDS]"                                             
## [6] "lcl|NZ_QTTG01000001.1_cds_WP_115905478.1_6 [locus_tag=CLW14_RS00040] [protein=hypothetical protein] [protein_id=WP_115905478.1] [location=8361..8912] [gbkey=CDS]"

Clean fasta headers

## from original header get protein id for each seq and assign as sequence id 
get_prot_id <- as_mapper(~ gsub(pattern = ".*\\[protein_id=(.*?)\\].*" , replacement = "\\1" ,x = .))
pp_id <-map_chr(fasta_cds_ids ,get_prot_id)
pp_id %>% sample(10)

##  [1] "WP_099944679.1"                                                                                                                                                
##  [2] "WP_143603202.1"                                                                                                                                                
##  [3] "WP_143600966.1"                                                                                                                                                
##  [4] "WP_052229905.1"                                                                                                                                                
##  [5] "WP_097257548.1"                                                                                                                                                
##  [6] "lcl|NZ_QEOG01000001.1_cds_2635 [locus_tag=C8R38_RS13245] [protein=transposase] [pseudo=true] [partial=3'] [location=complement(<2839365..2839730)] [gbkey=CDS]"
##  [7] "WP_108952783.1"                                                                                                                                                
##  [8] "WP_054232629.1"                                                                                                                                                
##  [9] "WP_097287354.1"                                                                                                                                                
## [10] "WP_111329647.1"

length(pp_id)

## [1] 1371135

Use cleaned headers as fasta headers

names(fasta_cds) <- pp_id

Remove redundant sequences

fasta_cds_uniq <- fasta_cds[ ! (names(fasta_cds) %>% duplicated())]

Get cds for query ids

query_fasta_cds <- fasta_cds_uniq [names(fasta_cds_uniq) %in% query_ids] 
query_fasta_cds

##   A DNAStringSet instance of length 514
##       width seq                                        names               
##   [1]   846 ATGCGTCGTCCCTTTGTCCG...CGGGGGCAGGGCTCACTGA WP_115909503.1
##   [2]   792 ATGTTCGGGTACAACCGTGC...CGCACAGGTCAACTCGTAG WP_115909964.1
##   [3]   792 ATGTTCGGGCTCAGACGTGC...CGCGCAGGTCAACTCGTAA WP_066997879.1
##   [4]   795 ATGGCGCGGTCGCGGTCGTG...TATCGCGCGGTCGTCGTAG WP_066974658.1
##   [5]   810 ATGCGACCGACCGTACGCGC...GGCGCACGGCGCCGGCTGA WP_066974204.1
##   ...   ... ...
## [510]   789 ATGTTCGGGCTCACCCGTGC...CGAGCAGGTCGAGTCCTGA WP_062716689.1
## [511]   885 ATGCGTCGTCCCATTGCCCG...AGCGCCTCAGAGGCGCTGA WP_062238494.1
## [512]   792 ATGTTCGGGCTCACACGTGC...CGCGCAGGTCGGCTCGTGA WP_014675804.1
## [513]   792 ATGTTCGCGCTCAACCGTGC...CGCGCAGGTCAACTCGTAA WP_055635421.1
## [514]   837 ATGCGGTGTCGCATTGCCCG...CGAGGCCCGCGGGGGCTGA WP_053190916.1

Check how many query ids does not return cds sequences

not_found <- query_ids[! query_ids %in% names(query_fasta_cds)]

not_found %>% length()

## [1] 3

Write results in a file

## write fasta 
Biostrings::writeXStringSet(query_fasta_cds , filepath = "./query_id_cds2.fasta")

write_delim(path =  "./query_id_not_found.txt" , x = not_found %>% as.data.frame() , delim = "\t")