library(compbio4all)
library(rentrez)
library(seqinr)
Create a table of accession numbers for the DIO1 protein from six species.
# Build the DIO1 accession table: one record per species, six fields per
# record. Missing identifiers are stored as the literal string "NA" (not R's
# missing value NA), so is.na() will not flag them.
dio1_columns <- c("ncbi.protein.accession", "UniProt.id", "PDB",
                  "species", "common.name", "gene.name")

# Flat character vector holding the records back to back, in column order.
dio1_table <- c(
  "NP_000783", "P49895", "NA", "Homo sapiens", "Human", "DIO1",
  "NP_001116123", "NA", "NA", "Pan troglodytes", "Chimpanzee", "DIO1",
  "NP_031886", "Q61153", "NA", "Mus musculus", "Mouse", "DIO1",
  "NP_001091083", "P24389", "NA", "Rattus norvegicus", "Rat", "Dio1",
  "NP_001243226", "Q2QEI3", "NA", "Xenopus tropicalis", "Frog", "dio1",
  "NP_001007284", "F1R7E6", "NA", "Danio rerio", "Fish", "dio1"
)

# Reshape the vector into a matrix, filling row by row. The column count is
# taken from the column names rather than hard-coding the number of rows, so
# adding another record above does not require changing this call.
dio1_table_matrix <- matrix(dio1_table,
                            byrow = TRUE,
                            ncol = length(dio1_columns))

# Convert the matrix to a data frame, keeping every field as character.
dio1_table <- as.data.frame(dio1_table_matrix, stringsAsFactors = FALSE)

# Name the columns of the data frame.
colnames(dio1_table) <- dio1_columns
Download the protein sequences (in FASTA format) for all accession numbers in the table made above.
# Request a FASTA record from the NCBI "protein" database for every accession
# in the table. Requires network access.
# NOTE(review): presumably returns one list entry per accession, in the same
# order as `id` -- confirm against the compbio4all documentation.
dio1s_list <- compbio4all::entrez_fetch_list(
  db = "protein",
  id = dio1_table$ncbi.protein.accession,
  rettype = "fasta"
)

# Take the first entry of the result; the first accession in the table is the
# Homo sapiens record (NP_000783).
dio1s_human <- dio1s_list[[1]]

# Clean the raw FASTA entry. With parse = TRUE the result is tabulated below
# as single-letter residue codes, so it appears to be a vector with one
# element per residue.
dio1s_human_vector <- compbio4all::fasta_cleaner(dio1s_human, parse = TRUE)

# Count how many times each distinct residue occurs in the sequence.
table(dio1s_human_vector)
## dio1s_human_vector
## A C D E F G H I K L M N P Q R S T U V W Y
## 13 4 10 13 15 14 7 12 11 30 7 14 14 14 16 13 8 1 19 8 6
The table() function displays the frequency (count) of each amino acid in the polypeptide sequence.