library(compbio4all)
library(rentrez)
library(seqinr)
A table of accession numbers
# Build a lookup table of DIO1 orthologs, one record per species.
# Each record has six fields, in this order:
#   NCBI protein accession, UniProt ID, PDB ID, species, common name, gene name.
# Identifiers that are not available are stored as the literal string "NA"
# (a character value, not R's missing-value NA), so is.na() will not flag them.
dio1_table <- c(
  "NP_000783",    "P49895", "NA", "Homo sapiens",       "Human",      "DIO1",
  "NP_001116123", "NA",     "NA", "Pan troglodytes",    "Chimpanzee", "DIO1",
  "NP_031886",    "Q61153", "NA", "Mus musculus",       "Mouse",      "DIO1",
  "NP_001091083", "P24389", "NA", "Rattus norvegicus",  "Rat",        "Dio1",
  "NP_001243226", "Q2QEI3", "NA", "Xenopus tropicalis", "Frog",       "dio1",
  "NP_001007284", "F1R7E6", "NA", "Danio rerio",        "Fish",       "dio1"
)

# Column names, in the same order as the fields of each record above.
dio1_column_names <- c(
  "ncbi.protein.accession", "UniProt.id", "PDB",
  "species", "common.name", "gene.name"
)

# Reshape the flat vector into a matrix, filling row by row so that each
# record becomes one row. The shape is derived from the number of columns,
# so adding a species does not require updating a hard-coded row count.
dio1_table_matrix <- matrix(
  dio1_table,
  ncol = length(dio1_column_names),
  byrow = TRUE
)

# Convert the matrix to a data frame, keeping every column as character,
# then label the columns with colnames().
dio1_table <- as.data.frame(dio1_table_matrix, stringsAsFactors = FALSE)
colnames(dio1_table) <- dio1_column_names
Download the sequences for all accession numbers in the table made above.
# Request a FASTA record from the NCBI "protein" database for every accession
# in the table. NOTE(review): the result is presumably a list with one element
# per accession, in table order -- confirm against entrez_fetch_list().
dio1s_list <- compbio4all::entrez_fetch_list(
  db = "protein",
  id = dio1_table[["ncbi.protein.accession"]],
  rettype = "fasta"
)

# Keep the first element, i.e. the record for the first accession in the
# table (the human sequence).
dio1s_human <- dio1s_list[[1]]
# Inspect the object holding the human record. Lines starting with "##" are
# the console output of the statement directly above them.

# is() with a single argument lists the object's class and what it extends.
is(dio1s_human)
## [1] "character" "vector" "data.frameRowLabels"
## [4] "SuperClassMethod"
# It is a plain (atomic) vector ...
is.vector(dio1s_human)
## [1] TRUE
# ... and not a list.
is.list(dio1s_human)
## [1] FALSE
# The vector has a single element: the whole FASTA record is one string.
length(dio1s_human)
## [1] 1
# A plain vector carries no dimensions, hence NULL.
dim(dio1s_human)
## NULL
# Number of characters in that one string; the count covers the header line
# and the embedded "\n" line breaks as well as the residues.
nchar(dio1s_human)
## [1] 324
# Print the raw record: a ">" header line followed by the sequence, with
# "\n" separating the lines.
dio1s_human
## [1] ">NP_000783.2 type I iodothyronine deiodinase isoform a [Homo sapiens]\nMGLPQPGLWLKRLWVLLEVAVHVVVGKVLLILFPDRVKRNILAMGEKTGMTRNPHFSHDNWIPTFFSTQY\nFWFVLKVRWQRLEDTTELGGLAPNCPVVRLSGQRCNIWEFMQGNRPLVLNFGSCTUPSFMFKFDQFKRLI\nEDFSSIADFLVIYIEEAHASDGWAFKNNMDIRNHQNLQDRLQAAHLLLARSPQCPVVVDTMQNQSSQLYA\nALPERLYIIQEGRILYKGKSGPWNYNPEEVRAVLEKLHS\n\n"
dotPlot() wants your sequence data to be in a vector like this:
# Example: a character vector of length 5, one letter per element.
c("A","C","M","R","T")
## [1] "A" "C" "M" "R" "T"
It does NOT want a single string.
# Example: a character vector of length 1 holding all five letters in one string.
c("ACMRT")
## [1] "ACMRT"
To set up a vector for dotPlot() you can use compbio4all::fasta_cleaner() with parse = TRUE, which takes a vector holding the whole sequence as a single string (“ACMRT”) and splits it into a vector where each letter is in a separate slot (“A”,“C”,“M”,“R”,“T”).
# Convert the human FASTA record into the form dotPlot() needs.
# With parse = TRUE, fasta_cleaner() returns the sequence as a character
# vector with one letter per element instead of a single long string.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
dio1s_human_vector <- compbio4all::fasta_cleaner(
  dio1s_human,
  parse = TRUE
)

# Dot plot of the human sequence against itself.
seqinr::dotPlot(dio1s_human_vector, dio1s_human_vector)