Preparation

Necessary libraries

library(compbio4all)
library(rentrez)
library(seqinr)

Data

A table of accession numbers

# Accession table for DIO1 sequences, entered as one flat character vector.
# Each record is six consecutive fields:
#   NCBI protein accession, UniProt id, PDB id, species, common name, gene name
# NOTE(review): missing ids are stored as the literal string "NA", not a real
# NA, so is.na() will not detect them. Left unchanged to preserve the table's
# contents for any downstream code.
dio1_table <- c(
  "NP_000783",    "P49895", "NA", "Homo sapiens",       "Human",      "DIO1",
  "NP_001116123", "NA",     "NA", "Pan troglodytes",    "Chimpanzee", "DIO1",
  "NP_031886",    "Q61153", "NA", "Mus musculus",       "Mouse",      "DIO1",
  "NP_001091083", "P24389", "NA", "Rattus norvegicus",  "Rat",        "Dio1",
  "NP_001243226", "Q2QEI3", "NA", "Xenopus tropicalis", "Frog",       "dio1",
  "NP_001007284", "F1R7E6", "NA", "Danio rerio",        "Fish",       "dio1"
)

# Reshape the flat vector into a matrix, filling row by row. The number of
# columns (fields per record) is fixed, so the row count is derived from the
# data and adding another species does not require editing this call.
dio1_table_matrix <- matrix(dio1_table, byrow = TRUE, ncol = 6)

# Convert the matrix to a data frame, keeping every column as character.
dio1_table <- as.data.frame(dio1_table_matrix, stringsAsFactors = FALSE)

# Name the columns of the data frame using colnames().
colnames(dio1_table) <- c(
  "ncbi.protein.accession", "UniProt.id", "PDB",
  "species", "common.name", "gene.name"
)

Get FASTA sequences

Download the FASTA sequence for every accession number in the table made above.

# Request one FASTA record per NCBI protein accession listed in dio1_table.
# NOTE(review): presumably this queries NCBI over the network - confirm.
dio1s_list <- compbio4all::entrez_fetch_list(
  db      = "protein",
  id      = dio1_table[["ncbi.protein.accession"]],
  rettype = "fasta"
)

Pull out focal sequence

# Take the first fetched record as the focal sequence (the human entry,
# NP_000783, which is the first accession in the table).
dio1s_human <- dio1s_list[[1L]]

Evaluate the focal sequence

# Inspect how the focal sequence is stored. Lines beginning with `##` are the
# console output recorded when each call was run.
is(dio1s_human)
## [1] "character"           "vector"              "data.frameRowLabels"
## [4] "SuperClassMethod"
is.vector(dio1s_human)
## [1] TRUE
is.list(dio1s_human)
## [1] FALSE
# A length of one means the entire FASTA record is held in a single string.
length(dio1s_human)
## [1] 1
dim(dio1s_human)
## NULL
# The character count covers the whole string printed below, including the
# header line and the newline characters, not only the sequence letters.
nchar(dio1s_human)
## [1] 324
dio1s_human
## [1] ">NP_000783.2 type I iodothyronine deiodinase isoform a [Homo sapiens]\nMGLPQPGLWLKRLWVLLEVAVHVVVGKVLLILFPDRVKRNILAMGEKTGMTRNPHFSHDNWIPTFFSTQY\nFWFVLKVRWQRLEDTTELGGLAPNCPVVRLSGQRCNIWEFMQGNRPLVLNFGSCTUPSFMFKFDQFKRLI\nEDFSSIADFLVIYIEEAHASDGWAFKNNMDIRNHQNLQDRLQAAHLLLARSPQCPVVVDTMQNQSSQLYA\nALPERLYIIQEGRILYKGKSGPWNYNPEEVRAVLEKLHS\n\n"

Prepare sequence for dotPlot

dotPlot() wants your sequence data to be in a vector like this:

# Five elements, one letter in each.
c("A", "C", "M", "R", "T")
## [1] "A" "C" "M" "R" "T"

It does NOT want a single string.

# A single element holding all five letters as one string.
c("ACMRT")
## [1] "ACMRT"

To set up a vector for dotPlot() you can use compbio4all::fasta_cleaner() with parse = TRUE, which takes a vector holding the whole sequence as a single string (“ACMRT”) and splits it into a vector where each letter is in a separate slot (“A”,“C”,“M”,“R”,“T”).

# Turn the single-string FASTA record into a vector with one letter per
# element, the form dotPlot() needs. parse is spelled out as TRUE because `T`
# is an ordinary variable that can be reassigned, whereas TRUE is reserved.
dio1s_human_vector <- compbio4all::fasta_cleaner(dio1s_human,
                                                 parse = TRUE)  # NOTE: parse = TRUE

Build dotPlot

# Plot the human sequence against itself.
seqinr::dotPlot(
  seq1 = dio1s_human_vector,
  seq2 = dio1s_human_vector
)