Preparation

Necessary libraries

library(compbio4all)
library(rentrez)
library(seqinr)

Data

A table of accession numbers

# Build the DIO1 accession table as a data frame, one row per species.
#
# Each row is entered as six character fields, in the order given by
# dio1_columns. Note: the "NA" entries are literal strings, not R missing
# values (presumably placeholders for ids that were not filled in).
dio1_columns <- c("ncbi.protein.accession", "UniProt.id", "PDB",
                  "species", "common.name", "gene.name")

dio1_values <- c(
  "NP_000783",    "P49895", "NA", "Homo sapiens",       "Human",      "DIO1",
  "NP_001116123", "NA",     "NA", "Pan troglodytes",    "Chimpanzee", "DIO1",
  "NP_031886",    "Q61153", "NA", "Mus musculus",       "Mouse",      "DIO1",
  "NP_001091083", "P24389", "NA", "Rattus norvegicus",  "Rat",        "Dio1",
  "NP_001243226", "Q2QEI3", "NA", "Xenopus tropicalis", "Frog",       "dio1",
  "NP_001007284", "F1R7E6", "NA", "Danio rerio",        "Fish",       "dio1"
)

# Fail early if a row has too few or too many fields; matrix() would
# otherwise recycle values and only warn.
stopifnot(length(dio1_values) %% length(dio1_columns) == 0)

# Reshape the flat vector row by row. The row count follows from the number
# of columns, so adding a species does not require editing a hard-coded count.
dio1_table_matrix <- matrix(dio1_values,
                            byrow = TRUE,
                            ncol = length(dio1_columns))

# Convert the matrix to a data frame, keeping every column as character,
# then label the columns with colnames().
dio1_table <- as.data.frame(dio1_table_matrix, stringsAsFactors = FALSE)
colnames(dio1_table) <- dio1_columns

Get FASTA sequences

Download the FASTA sequence for each accession number in the table made above.

# Request FASTA-format records from the "protein" database, passing every
# NCBI protein accession in the table as the ids to fetch.
dio1s_list <- compbio4all::entrez_fetch_list(
  db = "protein",
  id = dio1_table[["ncbi.protein.accession"]],
  rettype = "fasta"
)

Pull out focal sequence

# Take the first fetched record; the first accession in the table is the
# Homo sapiens entry.
dio1s_human <- dio1s_list[[1L]]

Evaluated with parse = T

# Run fasta_cleaner() on the human record with parse = TRUE.
# TRUE is spelled out because T is an ordinary variable that can be
# reassigned, whereas TRUE is a reserved word.
dio1s_human_vector <- compbio4all::fasta_cleaner(dio1s_human,
                                                 parse = TRUE)

# Print the result: a character vector with one letter per element.
dio1s_human_vector
##   [1] "M" "G" "L" "P" "Q" "P" "G" "L" "W" "L" "K" "R" "L" "W" "V" "L" "L" "E"
##  [19] "V" "A" "V" "H" "V" "V" "V" "G" "K" "V" "L" "L" "I" "L" "F" "P" "D" "R"
##  [37] "V" "K" "R" "N" "I" "L" "A" "M" "G" "E" "K" "T" "G" "M" "T" "R" "N" "P"
##  [55] "H" "F" "S" "H" "D" "N" "W" "I" "P" "T" "F" "F" "S" "T" "Q" "Y" "F" "W"
##  [73] "F" "V" "L" "K" "V" "R" "W" "Q" "R" "L" "E" "D" "T" "T" "E" "L" "G" "G"
##  [91] "L" "A" "P" "N" "C" "P" "V" "V" "R" "L" "S" "G" "Q" "R" "C" "N" "I" "W"
## [109] "E" "F" "M" "Q" "G" "N" "R" "P" "L" "V" "L" "N" "F" "G" "S" "C" "T" "U"
## [127] "P" "S" "F" "M" "F" "K" "F" "D" "Q" "F" "K" "R" "L" "I" "E" "D" "F" "S"
## [145] "S" "I" "A" "D" "F" "L" "V" "I" "Y" "I" "E" "E" "A" "H" "A" "S" "D" "G"
## [163] "W" "A" "F" "K" "N" "N" "M" "D" "I" "R" "N" "H" "Q" "N" "L" "Q" "D" "R"
## [181] "L" "Q" "A" "A" "H" "L" "L" "L" "A" "R" "S" "P" "Q" "C" "P" "V" "V" "V"
## [199] "D" "T" "M" "Q" "N" "Q" "S" "S" "Q" "L" "Y" "A" "A" "L" "P" "E" "R" "L"
## [217] "Y" "I" "I" "Q" "E" "G" "R" "I" "L" "Y" "K" "G" "K" "S" "G" "P" "W" "N"
## [235] "Y" "N" "P" "E" "E" "V" "R" "A" "V" "L" "E" "K" "L" "H" "S"

Evaluated with parse = F

# Run fasta_cleaner() on the human record again, this time with
# parse = FALSE. This overwrites the dio1s_human_vector assigned earlier.
# FALSE is spelled out because F is an ordinary variable that can be
# reassigned, whereas FALSE is a reserved word.
dio1s_human_vector <- compbio4all::fasta_cleaner(dio1s_human,
                                                 parse = FALSE)

# Print the result: a single string holding the whole sequence.
dio1s_human_vector
## [1] "MGLPQPGLWLKRLWVLLEVAVHVVVGKVLLILFPDRVKRNILAMGEKTGMTRNPHFSHDNWIPTFFSTQYFWFVLKVRWQRLEDTTELGGLAPNCPVVRLSGQRCNIWEFMQGNRPLVLNFGSCTUPSFMFKFDQFKRLIEDFSSIADFLVIYIEEAHASDGWAFKNNMDIRNHQNLQDRLQAAHLLLARSPQCPVVVDTMQNQSSQLYAALPERLYIIQEGRILYKGKSGPWNYNPEEVRAVLEKLHS"

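When parse = T, fasta_cleaner() returns a character vector with one element per amino acid (249 elements here). When parse = F, it returns a character vector of length one that holds the entire sequence as a single string.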