Preparation

Necessary libraries

library(compbio4all)
library(rentrez)
library(seqinr)

Data

A table of accession numbers

# Accession table for DIO1, one record per species. The values are written
# row-wise so each record reads left to right in the order given by
# dio1_fields. Identifiers that are not available are stored as real NA
# values (not the string "NA") so that is.na() can detect them.
dio1_fields <- c("ncbi.protein.accession", "UniProt.id", "PDB",
                 "species", "common.name", "gene.name")

dio1_values <- c("NP_000783",    "P49895", NA, "Homo sapiens",       "Human",      "DIO1",
                 "NP_001116123", NA,       NA, "Pan troglodytes",    "Chimpanzee", "DIO1",
                 "NP_031886",    "Q61153", NA, "Mus musculus",       "Mouse",      "DIO1",
                 "NP_001091083", "P24389", NA, "Rattus norvegicus",  "Rat",        "Dio1",
                 "NP_001243226", "Q2QEI3", NA, "Xenopus tropicalis", "Frog",       "dio1",
                 "NP_001007284", "F1R7E6", NA, "Danio rerio",        "Fish",       "dio1")

# every record must supply exactly one value per field, otherwise matrix()
# would silently recycle or misalign the values
stopifnot(length(dio1_values) %% length(dio1_fields) == 0)

# convert the vector to a matrix using matrix(), filling row by row; the
# number of rows follows from the number of fields, so adding a species only
# requires adding a line of values above
dio1_table_matrix <- matrix(dio1_values, byrow = TRUE,
                            ncol = length(dio1_fields))

# convert the matrix to a data frame, keeping every column as character
dio1_table <- as.data.frame(dio1_table_matrix, stringsAsFactors = FALSE)

# name the columns of the data frame using colnames()
colnames(dio1_table) <- dio1_fields

Get FASTA sequences

Download the FASTA sequence for every accession number in the table made above.

# Request a FASTA record from the NCBI "protein" database for each accession
# in the table.
# NOTE(review): presumably the result is a list with one element per id, in
# the same order as the ids -- confirm against the compbio4all documentation.
dio1s_list <- compbio4all::entrez_fetch_list(
  db = "protein",
  id = dio1_table[["ncbi.protein.accession"]],
  rettype = "fasta"
)

Pull out focal sequence

# Take the first downloaded record as the focal sequence. The human accession
# is the first row of the accession table; this assumes the downloaded list
# keeps the table's row order.
dio1s_human <- dio1s_list[[1L]]

Clean Sequence

# Clean the raw FASTA record of the focal (human) sequence.
# NOTE(review): with parse = TRUE the result is presumably a character vector
# holding one residue per element -- confirm against the compbio4all docs.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
dio1s_human_vector <- compbio4all::fasta_cleaner(dio1s_human,
                                                 parse = TRUE)

Use the table() function

# Count how many times each distinct value occurs in the cleaned sequence
# vector; at top level the resulting table is printed automatically.
table(dio1s_human_vector)
## dio1s_human_vector
##  A  C  D  E  F  G  H  I  K  L  M  N  P  Q  R  S  T  U  V  W  Y 
## 13  4 10 13 15 14  7 12 11 30  7 14 14 14 16 13  8  1 19  8  6

The table() function displays the frequency (number of occurrences) of each amino acid in the polypeptide sequence.