This code compiles summary information about the gene DOCK3.
This gene is specifically expressed in the central nervous system (CNS). It encodes a member of the DOCK (dedicator of cytokinesis) family of guanine nucleotide exchange factors (GEFs). This protein, dedicator of cytokinesis 3 (DOCK3), is also known as modifier of cell adhesion (MOCA) and presenilin-binding protein (PBP). The DOCK3 and DOCK1, -2 and -4 share several conserved amino acids in their DHR-2 (DOCK homology region 2) domains that are required for GEF activity, and bind directly to WAVE proteins [Wiskott-Aldrich syndrome protein (WASP) family Verprolin-homologous proteins] via their DHR-1 domains. The DOCK3 induces axonal outgrowth in CNS by stimulating membrane recruitment of the WAVE complex and activating the small G protein Rac1. This gene is associated with an attention deficit hyperactivity disorder-like phenotype by a complex chromosomal rearrangement. [provided by RefSeq, Aug 2010].
Refseq Gene: https://www.ncbi.nlm.nih.gov/gene/1795 Refseq Homologene: https://www.ncbi.nlm.nih.gov/homologene/21030 Other resources consulted includes: Uniprot: https://www.uniprot.org/uniprot/Q8IZD9
#install.packages("BiocManager")
library(BiocManager)
## Bioconductor version '3.13' is out-of-date; the current release version '3.14'
## is available with R version '4.1'; see https://bioconductor.org/install
#install("drawProteins")
#BiocManager::install("drawProteins")
library(drawProteins)
# github packages
library(compbio4all)
library(ggmsa)
## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
## ggmsa v0.99.5 Document: http://yulab-smu.top/ggmsa/
##
## If you use ggmsa in published research, please cite: DOI: 10.18129/B9.bioc.ggmsa
# CRAN packages
library(rentrez)
library(seqinr)
library(ape)
##
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
##
## as.alignment, consensus
library(pander)
library(ggplot2)
library(drawProteins)
# github packages
library(compbio4all)
library(ggmsa)
# Bioconductor packages
## msa
### The msa package is having problems on some platforms
### You can skip the msa steps if necessary. The msa output
### is used to make a distance matrix and then phylogenetics trees,
### but I provide code to build the matrix by hand so
### you can proceed even if msa doesn't work for you.
#BiocManager::install("msa")
library(msa)
## Loading required package: Biostrings
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: XVector
## Loading required package: GenomeInfoDb
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
##
## complement
## The following object is masked from 'package:seqinr':
##
## translate
## The following object is masked from 'package:base':
##
## strsplit
##
## Attaching package: 'msa'
## The following object is masked from 'package:BiocManager':
##
## version
library(drawProteins)
## Biostrings
#install.packages("Biostrings")
library(Biostrings)
#install.packages("HGNChelper")
library(HGNChelper)
# CRAN packages
library(rentrez)
library(seqinr)
library(ape)
# RefSeq Uniprot PDB sci name common name gene name
dock3_table<-c("NP_004938.1","Q8IZD9","NA","Homo sapiens","Human","DOCK3",
"XP_516488.4","NA","NA", "Pan troglodytes","Chimpanzee","DOCK3",
"NP_700462.2", "Q8CIQ7","NA","Mus musculus","Mouse","DOCK3",
"XP_001089458.2","Q8HXW5","NA","Macaca mulatta","Cynomolgus monkey","DOCK3",
"XP_533813.4","Q6RH31","NA","Canis Lupus","Dog","DOCK3",
"XP_002697118.2","Q9XT97","NA","Bos Taurus","Bovine","DOCK3",
"ENSP00000266037","Q8IZD9","NA","Homo Sapiens","Human","DOCK3",
"XM_032320975.1","NA","NA","Mustela erminea","stoat","DOCK3",
"NW_020847977.1","NA","NA","Tachysurus fulvidraco","Catfish","DOCK3",
"XP_006243927.1","P97887","NA","Rattus norvegicus", "rat","DOCK3")
#
## [1] 10 10
## [1] "data.frame" "list" "oldClass" "vector"
## [5] "list_OR_List" "vector_OR_Vector" "vector_OR_factor"
## [1] "V1" "V2" "V3" "V4" "V5" "V6" "V7" "V8" "V9" "V10"
matrix(data = dock3_table, nrow = 10, ncol = 6, byrow = TRUE,
dimnames = NULL)
## [,1] [,2] [,3] [,4]
## [1,] "NP_004938.1" "Q8IZD9" "NA" "Homo sapiens"
## [2,] "XP_516488.4" "NA" "NA" "Pan troglodytes"
## [3,] "NP_700462.2" "Q8CIQ7" "NA" "Mus musculus"
## [4,] "XP_001089458.2" "Q8HXW5" "NA" "Macaca mulatta"
## [5,] "XP_533813.4" "Q6RH31" "NA" "Canis Lupus"
## [6,] "XP_002697118.2" "Q9XT97" "NA" "Bos Taurus"
## [7,] "ENSP00000266037" "Q8IZD9" "NA" "Homo Sapiens"
## [8,] "XM_032320975.1" "NA" "NA" "Mustela erminea"
## [9,] "NW_020847977.1" "NA" "NA" "Tachysurus fulvidraco"
## [10,] "XP_006243927.1" "P97887" "NA" "Rattus norvegicus"
## [,5] [,6]
## [1,] "Human" "DOCK3"
## [2,] "Chimpanzee" "DOCK3"
## [3,] "Mouse" "DOCK3"
## [4,] "Cynomolgus monkey" "DOCK3"
## [5,] "Dog" "DOCK3"
## [6,] "Bovine" "DOCK3"
## [7,] "Human" "DOCK3"
## [8,] "stoat" "DOCK3"
## [9,] "Catfish" "DOCK3"
## [10,] "rat" "DOCK3"
length(dock3_table)
## [1] 60
dock3_table[[1]]
## [1] "NP_004938.1"
for(i in 1:length(dock3_table)){
dock3_table[[i]] <- compbio4all::fasta_cleaner(dock3_table[[i]], parse = F)
}
library(BiocManager)
#install("drawProteins")
library(drawProteins)
library(ggplot2)
library(drawProteins)
Q8IZD9_json <- drawProteins::get_features("Q8IZD9")
## [1] "Download has worked"
is(Q8IZD9_json)
## [1] "list" "vector" "list_OR_List" "vector_OR_Vector"
## [5] "vector_OR_factor"
my_prot_df <- drawProteins::feature_to_dataframe(Q8IZD9_json)
is(my_prot_df)
## [1] "data.frame" "list" "oldClass" "vector"
## [5] "list_OR_List" "vector_OR_Vector" "vector_OR_factor"
my_prot_df[,-2]
## type begin end length accession entryName taxid order
## featuresTemp CHAIN 1 2030 2029 Q8IZD9 DOCK3_HUMAN 9606 1
## featuresTemp.1 DOMAIN 6 67 61 Q8IZD9 DOCK3_HUMAN 9606 1
## featuresTemp.2 DOMAIN 421 599 178 Q8IZD9 DOCK3_HUMAN 9606 1
## featuresTemp.3 DOMAIN 1228 1635 407 Q8IZD9 DOCK3_HUMAN 9606 1
## featuresTemp.4 REGION 1641 1662 21 Q8IZD9 DOCK3_HUMAN 9606 1
## featuresTemp.5 REGION 1734 1771 37 Q8IZD9 DOCK3_HUMAN 9606 1
## featuresTemp.6 REGION 1849 1927 78 Q8IZD9 DOCK3_HUMAN 9606 1
## featuresTemp.7 REGION 1951 2030 79 Q8IZD9 DOCK3_HUMAN 9606 1
## featuresTemp.8 MOTIF 1970 1976 6 Q8IZD9 DOCK3_HUMAN 9606 1
## featuresTemp.9 COMPBIAS 1734 1765 31 Q8IZD9 DOCK3_HUMAN 9606 1
## featuresTemp.10 COMPBIAS 1871 1920 49 Q8IZD9 DOCK3_HUMAN 9606 1
## featuresTemp.11 COMPBIAS 1964 1980 16 Q8IZD9 DOCK3_HUMAN 9606 1
## featuresTemp.12 COMPBIAS 1981 2005 24 Q8IZD9 DOCK3_HUMAN 9606 1
## featuresTemp.13 MOD_RES 1658 1658 0 Q8IZD9 DOCK3_HUMAN 9606 1
## featuresTemp.14 VARIANT 128 2030 1902 Q8IZD9 DOCK3_HUMAN 9606 1
## featuresTemp.15 VARIANT 392 392 0 Q8IZD9 DOCK3_HUMAN 9606 1
## featuresTemp.16 VARIANT 1296 1296 0 Q8IZD9 DOCK3_HUMAN 9606 1
## featuresTemp.17 VARIANT 1674 1674 0 Q8IZD9 DOCK3_HUMAN 9606 1
my_canvas <- draw_canvas(my_prot_df)
my_canvas <- draw_chains(my_canvas, my_prot_df,
label_size = 2.5)
my_canvas <- draw_domains(my_canvas, my_prot_df)
my_canvas
Q8IZD9_FASTA <- rentrez::entrez_fetch(id ="Q8IZD9" ,
db = "protein",
rettype="fasta")
Q8IZD9_vector <- fasta_cleaner(Q8IZD9_FASTA)
Q8IZD9_FASTA_str <- fasta_cleaner(Q8IZD9_FASTA,
parse = F)
length(Q8IZD9_FASTA)
## [1] 1
nchar(Q8IZD9_FASTA)
## [1] 2228
str(Q8IZD9_FASTA)
## chr ">sp|Q8IZD9.1|DOCK3_HUMAN RecName: Full=Dedicator of cytokinesis protein 3; AltName: Full=Modifier of cell adhes"| __truncated__
str(Q8IZD9_vector)
## chr [1:2030] "M" "W" "T" "P" "T" "E" "E" "E" "K" "Y" "G" "V" "V" "I" "C" ...
str(Q8IZD9_FASTA_str)
## chr "MWTPTEEEKYGVVICSFRGSVPQGLVLEIGETVQILEKCEGWYRGVSTKKPNVKGIFPANYIHLKKAIVSNRGQYETVVPLEDSIVTEVTATLQEWASLWKQLYVKHKVDL"| __truncated__
align <- pairwiseAlignment(Q8IZD9_FASTA_str,
Q8IZD9_FASTA_str,
type = "global")
# set up 2 x 2 grid, make margins things
par(mfrow = c(2,2),
mar = c(0,0,2,1))
# plot 1: Defaults
dotPlot(Q8IZD9_vector, Q8IZD9_vector,
wsize = 1,
nmatch = 1,
main = "")
# plot 2 size = 10, nmatch = 1
dotPlot(Q8IZD9_vector, Q8IZD9_vector,
wsize = 10,
nmatch = 1,
main = "")
# plot 3: size = 10, nmatch = 5
dotPlot(Q8IZD9_vector, Q8IZD9_vector,
wsize = 10,
nmatch = 5,
main = "")
# plot 4: size = 20, nmatch = 5
dotPlot(Q8IZD9_vector, Q8IZD9_vector,
wsize = 20,
nmatch = 5,
main = "")
# reset par() - run this or other plots will be small!
par(mfrow = c(1,1),
mar = c(4,4,4,4))
par(mfrow = c(1,1),
mar = c(4,4,4,4))
dotPlot(Q8IZD9_vector,
Q8IZD9_vector,
wsize = 20,
wstep = 1,
nmatch = 5
)
aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
"L","K","M","F","P","S","T","W","Y","V")
## alpha proteins
alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91,
221, 249, 48, 123, 82, 122, 119, 33, 63, 167)
## beta proteins
beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120,
177, 115, 16, 85, 127, 341, 253, 44, 110, 229)
## alpha + beta
a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
110, 112, 25, 52, 71, 126, 117, 30, 108, 123)
## alpha/beta
a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239,
339, 321, 91, 158, 188, 327, 238, 72, 130, 378)
data.frame(aa.1.1, alpha, beta, a.plus.b, a.div.b)
## aa.1.1 alpha beta a.plus.b a.div.b
## 1 A 285 203 175 361
## 2 R 53 67 78 146
## 3 N 97 139 120 183
## 4 D 163 121 111 244
## 5 C 22 75 74 63
## 6 Q 67 122 74 114
## 7 E 134 86 86 257
## 8 G 197 297 171 377
## 9 H 111 49 33 107
## 10 I 91 120 93 239
## 11 L 221 177 110 339
## 12 K 249 115 112 321
## 13 M 48 16 25 91
## 14 F 123 85 52 158
## 15 P 82 127 71 188
## 16 S 122 341 126 327
## 17 T 119 253 117 238
## 18 W 33 44 30 72
## 19 Y 63 110 108 130
## 20 V 167 229 123 378
# convert them to frequencies
alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)
## dataframe
aa.prop <- data.frame(alpha.prop,
beta.prop,
a.plus.b.prop,
a.div.b)
## row labels
row.names(aa.prop) <- aa.1.1
pander::pander(aa.prop)
| Â | alpha.prop | beta.prop | a.plus.b.prop | a.div.b |
|---|---|---|---|---|
| A | 0.1165 | 0.07313 | 0.09264 | 0.08331 |
| R | 0.02166 | 0.02414 | 0.04129 | 0.03369 |
| N | 0.03964 | 0.05007 | 0.06353 | 0.04223 |
| D | 0.06661 | 0.04359 | 0.05876 | 0.05631 |
| C | 0.008991 | 0.02702 | 0.03917 | 0.01454 |
| Q | 0.02738 | 0.04395 | 0.03917 | 0.02631 |
| E | 0.05476 | 0.03098 | 0.04553 | 0.05931 |
| G | 0.08051 | 0.107 | 0.09052 | 0.08701 |
| H | 0.04536 | 0.01765 | 0.01747 | 0.02469 |
| I | 0.03719 | 0.04323 | 0.04923 | 0.05516 |
| L | 0.09031 | 0.06376 | 0.05823 | 0.07824 |
| K | 0.1018 | 0.04143 | 0.05929 | 0.07408 |
| M | 0.01962 | 0.005764 | 0.01323 | 0.021 |
| F | 0.05027 | 0.03062 | 0.02753 | 0.03646 |
| P | 0.03351 | 0.04575 | 0.03759 | 0.04339 |
| S | 0.04986 | 0.1228 | 0.0667 | 0.07547 |
| T | 0.04863 | 0.09114 | 0.06194 | 0.05493 |
| W | 0.01349 | 0.01585 | 0.01588 | 0.01662 |
| Y | 0.02575 | 0.03963 | 0.05717 | 0.03 |
| V | 0.06825 | 0.08249 | 0.06511 | 0.08724 |
plot(aa.prop,panel = panel.smooth)
names(dock3_table)
## NULL
length(dock3_table)
## [1] 60
dock3_table[1]
## [1] "NP_004938.1"
# Make each entry of the list into a vector
dock3_table
## [1] "NP_004938.1" "Q8IZD9" "NA"
## [4] "Homo sapiens" "Human" "DOCK3"
## [7] "XP_516488.4" "NA" "NA"
## [10] "Pan troglodytes" "Chimpanzee" "DOCK3"
## [13] "NP_700462.2" "Q8CIQ7" "NA"
## [16] "Mus musculus" "Mouse" "DOCK3"
## [19] "XP_001089458.2" "Q8HXW5" "NA"
## [22] "Macaca mulatta" "Cynomolgus monkey" "DOCK3"
## [25] "XP_533813.4" "Q6RH31" "NA"
## [28] "Canis Lupus" "Dog" "DOCK3"
## [31] "XP_002697118.2" "Q9XT97" "NA"
## [34] "Bos Taurus" "Bovine" "DOCK3"
## [37] "ENSP00000266037" "Q8IZD9" "NA"
## [40] "Homo Sapiens" "Human" "DOCK3"
## [43] "XM_032320975.1" "NA" "NA"
## [46] "Mustela erminea" "stoat" "DOCK3"
## [49] "NW_020847977.1" "NA" "NA"
## [52] "Tachysurus fulvidraco" "Catfish" "DOCK3"
## [55] "XP_006243927.1" "P97887" "NA"
## [58] "Rattus norvegicus" "rat" "DOCK3"
human <- unlist(dock3_table[1])
chimpanzee <- unlist(dock3_table[2])
mouse <- unlist(dock3_table[3])
monkey <- unlist(dock3_table[4])
dog <- unlist(dock3_table[5])
bovine <- unlist(dock3_table[6])
human <- unlist(dock3_table[7])
stoat <- unlist(dock3_table[8])
catfish <- unlist(dock3_table[9])
rat <- unlist(dock3_table[10])
dock3_vector <- rep(NA, length(dock3_table))
for(i in 1:length(Q8IZD9_vector)){
Q8IZD9_vector[i] <- dock3_table[i]}
# name the vector
names(Q8IZD9_vector) <- names(dock3_table)
pairwise alignment for human, chimpanze, mouse and rat.
#Chimps: XP_516488.4
chimps_fasta <- rentrez::entrez_fetch(db = "protein",
id = "XP_516488.4",
rettype = "fasta")
#Human: NP_004938.1
human_fasta <- rentrez::entrez_fetch(db = "protein",
id = "NP_004938.1",
rettype = "fasta")
#Fruit_Fly: NP_700462.2
mouse_fasta <- rentrez::entrez_fetch(db = "protein",
id = "NP_700462.2",
rettype = "fasta")
#Cattle: XP_001089458
cynomolgus_monkey_fasta <- rentrez::entrez_fetch(db = "protein",
id = "XP_001089458",
rettype = "fasta")
#BiocManager::install("fasta_cleaner")
chimps_vector <- fasta_cleaner(chimps_fasta)
human_vector <- fasta_cleaner(human_fasta)
mouse_vector <- fasta_cleaner(mouse_fasta)
cynomolgus_monkey_vector <- fasta_cleaner(cynomolgus_monkey_fasta)
data(package="Biostrings")
chimps_string <- paste(chimps_vector,collapse = "")
human_string <- paste(human_vector,collapse = "" )
mouse_string <- paste(mouse_vector,collapse = "")
cynomolgus_monkey_string <- paste(cynomolgus_monkey_vector,collapse = "")
chimps_string <- toupper(chimps_string)
human_string <- toupper(human_string)
mouse_string <- toupper(mouse_string)
cynomolgus_monkey_string <- toupper(cynomolgus_monkey_string)
data(BLOSUM50)
#CHIMPS VS OTHER
chimps_vs_human <- Biostrings::pairwiseAlignment(chimps_string,
human_string,
substitutionMatrix = BLOSUM50,
gapOpening = -8,
gapExtension = -2,
scoreOnly = FALSE)
chimps_vs_mouse <- Biostrings::pairwiseAlignment(chimps_string,
mouse_string,
substitutionMatrix = BLOSUM50,
gapOpening = -8,
gapExtension = -2,
scoreOnly = FALSE)
chimps_vs_cynomolgus_monkey <- Biostrings::pairwiseAlignment(chimps_string,
cynomolgus_monkey_string ,
substitutionMatrix = BLOSUM50,
gapOpening = -8,
gapExtension = -2,
scoreOnly = FALSE)
pid(chimps_vs_human)
## [1] 98.20301
pid(chimps_vs_mouse)
## [1] 96.45459
pid(chimps_vs_cynomolgus_monkey)
## [1] 97.13453
#HUMAN VS OTHER
human_vs_mouse <- Biostrings::pairwiseAlignment(human_string,
mouse_string,
substitutionMatrix = BLOSUM50,
gapOpening = -8,
gapExtension = -2,
scoreOnly = FALSE)
human_vs_cynomolgus_monkey <- Biostrings::pairwiseAlignment(human_string,
cynomolgus_monkey_string ,
substitutionMatrix = BLOSUM50,
gapOpening = -8,
gapExtension = -2,
scoreOnly = FALSE)
pid(human_vs_mouse)
## [1] 98.12808
pid(human_vs_cynomolgus_monkey)
## [1] 98.37438
#mouse VS OTHER
mouse_vs_cynomolgus_monkey <- Biostrings::pairwiseAlignment(mouse_string,
cynomolgus_monkey_string ,
substitutionMatrix = BLOSUM50,
gapOpening = -8,
gapExtension = -2,
scoreOnly = FALSE)
pid(mouse_vs_cynomolgus_monkey)
## [1] 96.74877
pids <- c(1, NA, NA, NA,
98.20301, 1, NA, NA,
96.45459, 98.12808, 1, NA,
97.13453, 98.37438, 96.74877, 1)
mat <- matrix(pids, nrow = 4, byrow = T)
row.names(mat) <- c("CHIMPS","HUMANS","MOUSE","CYNOMOLGUS_MONKEY")
colnames(mat) <- c("CHIMPS","HUMANS","MOUSE","CYNOMOLGUS_MONKEY")
pander::pander(mat)
| Â | CHIMPS | HUMANS | MOUSE | CYNOMOLGUS_MONKEY |
|---|---|---|---|---|
| CHIMPS | 1 | NA | NA | NA |
| HUMANS | 98.2 | 1 | NA | NA |
| MOUSE | 96.45 | 98.13 | 1 | NA |
| CYNOMOLGUS_MONKEY | 97.13 | 98.37 | 96.75 | 1 |
#PID methods comparison
#chimps vs human
pid(chimps_vs_human, type = "PID1")
## [1] 98.20301
pid(chimps_vs_human, type = "PID2")
## [1] 99.60591
pid(chimps_vs_human, type = "PID3")
## [1] 99.60591
pid(chimps_vs_human, type = "PID4")
## [1] 98.89949
pids_comparison <- c("PID1", 98.23875, "(aligned_positions_PLUS_internal_gap_positions)",
"PID2", 98.23875, "(aligned_positions)",
"PID3", 98.23875, "(length_shorter_sequence)",
"PID4", 98.23875, "(average_length_of_the_two_sequences)")
mat <- matrix(pids_comparison, nrow = 4, byrow = T)
row.names(mat) <- c("1","2","3","4")
colnames(mat) <- c("Method","PID","denominator")
pander::pander(mat)
| Method | PID | denominator |
|---|---|---|
| PID1 | 98.23875 | (aligned_positions_PLUS_internal_gap_positions) |
| PID2 | 98.23875 | (aligned_positions) |
| PID3 | 98.23875 | (length_shorter_sequence) |
| PID4 | 98.23875 | (average_length_of_the_two_sequences) |
#Multiple sequence alignment
a1 <- entrez_fetch(db = "protein",
id = "NP_004938.1",
rettype = "fasta")
a2 <- entrez_fetch(db = "protein",
id = "XP_516488.4",
rettype = "fasta")
a3 <- entrez_fetch(db = "protein",
id = "NP_700462.2",
rettype = "fasta")
a4 <- entrez_fetch(db = "protein",
id = "NP_001008222.1",
rettype = "fasta")
a5 <- entrez_fetch(db = "protein",
id = "XP_001089458.2",
rettype = "fasta")
a1 <- fasta_cleaner(a1, parse = F)
a2 <- fasta_cleaner(a2, parse = F)
a3 <- fasta_cleaner(a3, parse = F)
a4 <- fasta_cleaner(a4, parse = F)
a5 <- fasta_cleaner(a5, parse = F)
TABLE <- c("NP_004938.1", "Homo_sapiens", "a1",
"XP_516488.4", "Pan_troglodytes", "a2",
"NP_700462.2", "Mus_musculus", "a3",
"NP_001008222.1", "Homo_sapiens", "a4",
"XP_001089458.2", "Macaca_mulatta", "a5" )
TABLE_matrix <- matrix(TABLE,
byrow = T,
nrow = 5)
table <- data.frame(TABLE_matrix,
stringsAsFactors = F)
names(table) <- c("accession", "name.orig","name.new")
table$accession
## [1] "NP_004938.1" "XP_516488.4" "NP_700462.2" "NP_001008222.1"
## [5] "XP_001089458.2"
LIST <- entrez_fetch(db = "protein",
id = table$accession,
rettype = "fasta")
cat(LIST)
## >NP_004938.1 dedicator of cytokinesis protein 3 [Homo sapiens]
## MWTPTEEEKYGVVICSFRGSVPQGLVLEIGETVQILEKCEGWYRGVSTKKPNVKGIFPANYIHLKKAIVS
## NRGQYETVVPLEDSIVTEVTATLQEWASLWKQLYVKHKVDLFYKLRHVMNELIDLRRQLLSGHLTQDQVR
## EVKRHITVRLDWGNEHLGLDLVPRKDFEVVDSDQISVSDLYKMHLSSRQSVQQSTSQVDTMRPRHGETCR
## MPVPHHFFLSLKSFTYNTIGEDTDVFFSLYDMREGKQISERFLVRLNKNGGPRNPEKIERMCALFTDLSS
## KDMKRDLYIVAHVIRIGRMLLNDSKKGPPHLHYRRPYGCAVLSILDVLQSLTEVKEEKDFVLKVYTCNNE
## SEWSQIHENIIRKSSAKYSAPSASHGLIISLQLLRGDMEQIRRENPMIFNRGLAITRKLGFPDVIMPGDI
## RNDLYLTLEKGDFERGGKSVQKNIEVTMYVLYADGEILKDCISLGSGEPNRSSYHSFVLYHSNSPRWGEI
## IKLPIPIDRFRGSHLRFEFRHCSTKDKGEKKLFGFAFSTLMRDDGTTLSDDIHELYVYKCDENSTFNNHA
## LYLGLPCCKEDYNGCPNIPSSLIFQRSTKESFFISTQLSSTKLTQNVDLLALLKWKAFPDRIMDVLGRLR
## HVSGEEIVKFLQDILDTLFVILDDNTEKYGLLVFQSLVFIINLLRDIKYFHFRPVMDTYIQKHFAGALAY
## KELIRCLKWYMDCSAELIRQDHIQEAMRALEYLFKFIVQSRILYSRATCGMEEEQFRSSIQELFQSIRFV
## LSLDSRNSETLLFTQAALLNSFPTIFDELLQMFTVQEVAEFVRGTLGSMPSTVHIGQSMDVVKLQSIART
## VDSRLFSFSESRRILLPVVLHHIHLHLRQQKELLICSGILGSIFSIVKTSSLEADVMEEVEMMVESLLDV
## LLQTLLTIMSKSHAQEAVRGQRCPQCTAEITGEYVSCLLSLLRQMCDTHFQHLLDNFQSKDELKEFLLKI
## FCVFRNLMKMSVFPRDWMVMRLLTSNIIVTTVQYLSSALHKNFTETDFDFKVWNSYFSLAVLFINQPSLQ
## LEIITSAKRKKILDKYGDMRVMMAYELFSMWQNLGEHKIHFIPGMIGPFLGVTLVPQPEVRNIMIPIFHD
## MMDWEQRKNGNFKQVEAELIDKLDSMVSEGKGDESYRELFSLLTQLFGPYPSLLEKVEQETWRETGISFV
## TSVTRLMERLLDYRDCMKGEETENKKIGCTVNLMNFYKSEINKEEMYIRYIHKLCDMHLQAENYTEAAFT
## LLLYCELLQWEDRPLREFLHYPSQTEWQRKEGLCRKIIHYFNKGKSWEFGIPLCRELACQYESLYDYQSL
## SWIRKMEASYYDNIMEQQRLEPEFFRVGFYGRKFPFFLRNKEYVCRGHDYERLEAFQQRMLSEFPQAVAM
## QHPNHPDDAILQCDAQYLQIYAVTPIPDYVDVLQMDRVPDRVKSFYRVNNVRKFRYDRPFHKGPKDKENE
## FKSLWIERTTLTLTHSLPGISRWFEVERRELVEVSPLENAIQVVENKNQELRSLISQYQHKQVHGNINLL
## SMCLNGVIDAAVNGGIARYQEAFFDKDYINKHPGDAEKITQLKELMQEQVHVLGVGLAVHEKFVHPEMRP
## LHKKLIDQFQMMRASLYHEFPGLDKLSPACSGTSTPRGNVLASHSPMSPESIKMTHRHSPMNLMGTGRHS
## SSSLSSHASSEAGNMVMLGDGSMGDAPEDLYHHMQLAYPNPRYQGSVTNVSVLSSSQASPSSSSLSSTHS
## APSQMITSAPSSARGSPSLPDKYRHAREMMLLLPTYRDRPSSAMYPAAILENGQPPNFQRALFQQVVGAC
## KPCSDPNLSVAEKGHYSLHFDAFHHPLGDTPPALPARTLRKSPLHPIPASPTSPQSGLDGSNSTLSGSAS
## SGVSSLSESNFGHSSEAPPRTDTMDSMPSQAWNADEDLEPPYLPVHYSLSESAVLDSIKAQPCRSHSAPG
## CVIPQDPMDPPALPPKPYHPRLPALEHDEGVLLREETERPRGLHRKAPLPPGSAKEEQARMAWEHGRGEQ
##
## >XP_516488.4 PREDICTED: dedicator of cytokinesis protein 3 [Pan troglodytes]
## MEENEENMEPKDLQQEHSDYRMLRANGVEGCSPELRKEWSDVICSFRGSVPQGLVLEIGETVQILEKCEG
## WYRGVSTKKPNVKGIFPANYIHLKKAIVSNRGQYETVVPLEDSIVTEVTATLQEWASLWKQLYVKHKVDL
## FYKLRHVMNELIDLRRQLLSGHLTQDQVREVKRHITVRLDWGNEHLGLDLVPRKDFEVVDSDQISVSDLY
## KMHLSSRQSVQQSTSQVDTMRPRHGETCRMPVPHHFFLSLKSFTYNTIGEDTDVFFSLYDMREGKQISER
## FLVRLNKNGGPRNPEKIERMCALFTDLSSKDMKRDLYIVAHVIRIGRMLLNDSKKGPPHLHYRRPYGCAV
## LSILDVLQSLTEVKEEKDFVLKVYTCNNESEWSQIHENIIRKSSAKYSAPSASHGLIISLQLLRGDMEQI
## RRENPMIFNRGLAITRKLGFPDVIMPGDIRNDLYLTLEKGDFERGGKSVQKNIEVTMYVLYADGEILKDC
## ISLGSGEPNRSSYHSFVLYHSNSPRWGEIIKLPIPIDRFRGSHLRFEFRHCSTKDKGEKKLFGFAFSPLM
## RDDGTTLSDDIHELYVYKCDENSTFNNHALYLGLPCCKEDYNGCPNIPSSLIFQRSTKESFFISTQLSST
## KLTQNVDLLALLKWKAFPDRIMDVLGRLRHVSGEEIVKFLQDILDTLFVILDDNTEKYGLLVFQSLVFII
## NLLRDIKYFHFRPVMDTYIQKHFAGALAYKELIRCLKWYMDCSAELIRQDHIQEAMRALEYLFKFIVQSR
## ILYSRATCGMEEEQFRSSIQELFQSIRFVLSLDSRNSETLLFTQAALLNSFPTIFDELLQMFTVQEVAEF
## VRGTLGSMPSTVHIGQSMDVVKLQSIARTVDSRLFSFSESRRILLPVVLHHIHLHLRQQKELLICSGILG
## SIFSIVKTSSLEADVMEEVEMMVESLLDVLLQTLLTIMSKSHAQEAVRGQRCPQCTAEITGEYVSCLLSL
## LRQMCDTHFQHLLDNFQSKDELKEFLLKIFCVFRNLMKMSVFPRDWMVMRLLTSNIIVTTVQYLSSALHK
## NFTETDFDFKVWNSYFSLAVLFINQPSLQLEIITSAKRKKILDKYGDMRVMMAYELFSMWQNLGEHKIHF
## IPGMIGPFLGVTLVPQPEVRNIMIPIFHDMMDWEQRKNGNFKQVEAELIDKLDSMVSEGKGDESYRELFS
## LLTQLFGPYPSLLEKVEQETWRETGISFVTSVTRLMERLLDYRDCMKGEETENKKIGCTVNLMNFYKSEI
## NKEEMYIRYIHKLCDMHLQAENYTEAAFTLLLYCELLQWEDRPLREFLHYPSQTEWQRKEGLCRKIIHYF
## NKGKSWEFGIPLCRELACQYESLYDYQSLSWIRKMEASYYDNIMEQQRLEPEFFRVGFYGRKFPFFLRNK
## EYVCRGHDYERLEAFQQRMLSEFPQAVAMQHPNHPDDAILQCDAQYLQIYAVTPIPDYVDVLQMDRVPDR
## VKSFYRVNNVRKFRYDRPFHKGPKDKENEFKSLWIERTTLTLTHSLPGISRWFEVERRELVEVSPLENAI
## QVVENKNQELRSLISQYQHKQVHGNINLLSMCLNGVIDAAVNGGIARYQEAFFDKDYINKHPGDAEKITQ
## LKELMQEQVHVLGVGLAVHEKFVHPEMRPLHKKLIDQFQMMRASLYHEFPGLDKLSPACSGTNTPRGNVL
## ASHSPMSPESIKMTHRHSPMNLMGTGRHSSSSLSSHASSEAGNMVMLGDGSMGDAPEDLYHHMQLAYPNP
## RYQGSVTNVSVLSSSQASPSSSSLSSTHSAPSQMITSAPSSARGSPSLPDKYRHAREMMLLLPTYRDRPS
## SAMYPAAILENGQPPNFQRALFQQVVGACKPCSDPNLSVAEKGHYSLHFDAFHHPLGDTPPALPARTLRK
## SPLHPIPASPTSPQSGLDGSNSTLSGSASSGVSSLSESNFGHSSEAPPRTDTMDSMPSQAWNADEDLEPP
## YLPVHYSLSESAVLDSIKAQPCRSHSAPGCVIPQDPMDPPALPPKPYHPRLPALEHDEGVLLREETERPR
## GLHRKAPLPPGSAKEEQARMAWEHGRGEQ
##
## >NP_700462.2 dedicator of cytokinesis protein 3 [Mus musculus]
## MWTPTEEEKYGVVICSFRGSVPQGLVLEIGETVQILEKCEGWYRGVSTKKPNVKGLFPANYIHLKKAIVS
## NRGQYETVVPLEDSIVTEVTTTLQEWASLWKQLYVKHKVDLFYKLRHVMNELIDLRRQLLSGHLTQDQVR
## EVKRHITVRLDWGNEHLGLDLVPRKDFEVVDSDQISVSDLYKMHLSSRQSVQQSTSQVDTMRPRHGETCR
## MPVPHHFFFSLKSFTYNTIGEDSDVFFSLYDMREGKQISERFLVRLNKNGGPRNPEKIERMCALFTDLSS
## KDMKRDLYIVAHVIRIGRMLLNDSKKGPAHLHYRRPYGCAVLSILDVLQSLTELKEEKDFVLKVYTCNNE
## SEWTQIHENIIRKSSTKYSAPSASHGLIISLQLFRGDMEQIRRENPMIFNRGLAITRKLGFPDVIMPGDI
## RNDLYLTLEKGDFERGGKSVQKNIEVTMYVLYADGEILKDCISLGSGEPNRSSYHSFVLYHSNSPRWGEI
## IKLPIPIDRFRGSHLRFEFRHCSTKDKGEKKLFGFAFSPLMRDDGTTLSDDIHELYVYKCDENSTFNNHA
## LYLGLPCCKEDYNGCPNIPSSLIFQRSAKESFFISTQLSSTKLTQNVDLLALLKWKAFPDRIMDILGRLR
## HVSGEEIVKFLQDILDTLFVILDDNTEKYGLLVFQSLVFIINLLRDIKYFHFRPVMDTYIQKHFAGALAY
## KELIRCLKWYMDCSAELIRQDHIQEAMRALEYLFKFIVQSRILYSRATCGMEEEQFRSSIQELFQSIRFV
## LSLDSRNSETLLFTQAALLNSFPTIFDELLQMFTVQEVAEFVRGTLGSMPSTVHIGQSMDVVKLQSIART
## VDSRLFSFSESRRILLPVVLHHIHLHLRQQKELLICSGILGSIFSIVKTSSLEADVMEEVEMMVESLLDV
## LLQTLLTIMSKSHAQEAVRGQRCPQCTAEITGEYVSCLLSLLRQMCDTHFQHLLDNFQSKDELKEFLLKI
## FCVFRNLMKMSVFPRDWMVMRLLTSNIIVTTVQYLSSALHKNFTETDFDFKVWNSYFSLAVLFINQPSLQ
## LEIITSAKRKKILDKYGDMRVMMAYELFSMWQNLGDHKIHFIPGMIGPFLGVTLVPQPEVRNIMIPIFHD
## MMDWEQRKNGNFKQVEAELIDKLDSMVSEGKGDESYRELFGLLTQLFGPYPSLLEKVEQETWRETGISFV
## TSVTRLMERLLDYRDCMKGEETENKKVGCTVNLMNFYKSEINKEEMYIRYIHKLCDMHLQAENYTEAAFT
## LLLYCELLQWEDRPLREFLHYPSQTEWQRKEGLCRKIIHYFNKGKSWEFGIPLCRELACQYESLYDYQSL
## SWIRKMEASYYDNIIEQQRLEPEFFRVGFYGRKFPFFLRNKEYVCRGHDYERLEAFQQRMLSEFPQAVAM
## QHPNHPDDAILQCDAQYLQIYAVTPIPDYVDVLQMDRVPDRVKSFYRVNNVRKFRYDRPFHKGPKDKDNE
## FKSLWIERTTLTLTHSLPGISRWFEVERRELVEVSPLENAIQVVENKNQELRALISQYQHKQVHGNINLL
## SMCLNGVIDAAVNGGIARYQEAFFDKDYITKHPGDAEKISQLKELMQEQVHVLGVGLAVHEKFVHPEMRP
## LHKKLIDQFQMMRASLYHEFPGLDKLSPACSGTSTPRGNVLASHSPMSPENIKMTHRHSPMNLMGTGRHS
## SSSLSSHASSEAGNMMMMGDNSMGEAPEDLYHHMQLAYHNPRYQGSVTNVSVLSSSQASPSSSSLSSTHS
## APSQMITSAPSSTRGSPSLPDKYRHAREMMLLLPTHRDRPSSAMYPAAILENGQPPNFQRALFQQVVGAC
## KPCSDPNLSMAEKGHYSLHFDAFHHPLGDTPPALPARTLRKSPLHPIPASPTSPQSGLDGSNSTLSGSAS
## SGVSSLSESNFGHSSEAPPRTDTMDSMPSQAWNGDEDLEPPYLPVHYSLSESAVLDAIKSQPCRSHSAPG
## CVLPQDPMDPPALPPKPYHPRLPALEHDEGMLLREEAERPRGLHRKASLPPGSVKEEQARLAWEHGRGEQ
##
## >NP_001008222.1 alpha-amylase 1A precursor [Homo sapiens]
## MKLFWLLFTIGFCWAQYSSNTQQGRTSIVHLFEWRWVDIALECERYLAPKGFGGVQVSPPNENVAIHNPF
## RPWWERYQPVSYKLCTRSGNEDEFRNMVTRCNNVGVRIYVDAVINHMCGNAVSAGTSSTCGSYFNPGSRD
## FPAVPYSGWDFNDGKCKTGSGDIENYNDATQVRDCRLSGLLDLALGKDYVRSKIAEYMNHLIDIGVAGFR
## IDASKHMWPGDIKAILDKLHNLNSNWFPEGSKPFIYQEVIDLGGEPIKSSDYFGNGRVTEFKYGAKLGTV
## IRKWNGEKMSYLKNWGEGWGFMPSDRALVFVDNHDNQRGHGAGGASILTFWDARLYKMAVGFMLAHPYGF
## TRVMSSYRWPRYFENGKDVNDWVGPPNDNGVTKEVTINPDTTCGNDWVCEHRWRQIRNMVNFRNVVDGQP
## FTNWYDNGSNQVAFGRGNRGFIVFNNDDWTFSLTLQTGLPAGTYCDVISGDKINGNCTGIKIYVSDDGKA
## HFSISNSAEDPFIAIHAESKL
##
## >XP_001089458.2 PREDICTED: dedicator of cytokinesis protein 3-like [Macaca mulatta]
## MLIGVFILAFLVICSFRGSVPQGLVLEIGETVQILEKCEGWYRGVSTKKPNVKGIFPANYIHLKKAIVSN
## RGQYETVVPLEDSIVTEVTATLQEWASLWKQLYVKHKVDLFYKLRHVMNELIDLRRQLLSGHLTQDQVRE
## VKRHITVRLDWGNEHLGLDLVPRKDFEVVDSDQISVSDLYKMHLSSRQSVQQSTSQVDTMRPRHGETCRM
## PVPHHFFLSLKSFTYNTIGEDTDVFFSLYDMREGKQISERFLVRLNKNGGPRNPEKIERMCALFTDLSSK
## DMKRDLYIVAHVIRIGRMLLNDSKKGPPHLHYRRPYGCAVLSILDVLQSLTEVKEEKDFVLKVYTCNNES
## EWSQIHENIIRKSSAKYSAPSASHGLIISLQLLRGDMEQIRRENPMIFNRGLAITRKLGFPDVIMPGDIR
## NDLYLTLEKGDFERGGKSVQKNIEVTMYVLYADGEILKDCISLGSGEPNRSSYHSFVLYHSNSPRWGEII
## KLPIPIDRFRGSHLRFEFRHCSTKDKGEKKLFGFAFSPLMRDDGTTLSDDIHELYVYKCDENSTFNNHAL
## YLGLPCCKEDYNGCPNIPSSLIFQRSTKXXXXXDLLALLKWKAFPDRIMDVLGRLRHVSGEEIVKFLQDI
## LDTLFVILDDNTEKYGLLVFQSLVFIINLLRDIKYFHFRPVMDTYIQKHFAGALAYKELIRCLKWYMDCS
## AELIRQDHIQEAMRALEYLFKFIVQSRILYSRATCGMEEEQFRSSIQELFQSIRFVLSLDSRNSETLLFT
## QAALLNSFPTIFDELLQMFTVQEVAEFVRGTLGSMPSTVHIGQSMDVVKLQSIARTVDSRLFSFSESRRI
## LLPVVLHHIHLHLRQQKELLICSGILGSIFSIVKTSSLEADVMEEVEMMVESLLDVLLQTLLTIMSKSHA
## QEAVRGQRCPQCTAEITGEYVSCLLSLLRQMCDTHFQHLLDNFQSKDELKEFLLKIFCVFRNLMKMSVFP
## RDWMVMRLLTSNIIVTTVQYLSSALHKNFTETDFDFKVWNSYFSLAVLFINQPSLQLEIITSVKRKKILD
## KYGDMRVMMAYELFSMWQNLGEHKIHFIPGMIGPFLGVTLVPQPEVRNIMIPIFHDMMDWEQRKNGNFKQ
## VEAELIDKLDSMVSEGKGDESYRELFSLLTQLFGPYPSLLEKVEQETWRETGISFVTSVTRLMERLLDYR
## DCMKGEETENKKIGCTVNLMNFYKSEINKEEMYIRYIHKLCDMHLQAENYTEAAFTLLLYCELLQWEDRP
## LREFLHYPSQTEWQRKEGLCRKIIHYFNKGKSWEFGIPLCRELACQYESLYDYQSLSWIRKMEASYYDNI
## MEQQRLEPEFFRVGFYGRKFPFFLRNKEYVCRGHDYERLEAFQQRMLSEFPQAVAMQHPNHPDDAILQCD
## AQYLQIYAVTPIPDYVDVLQMDRVPDRVKSFYRVNNVRKFRYDRPFHKGPKDKENEFKSLWIERTTLTLT
## HSLPGISRWFEVERRELVEVSPLENAIQVVENKNQELRSLISQYQHKQVHGNINLLSMCLNGVIDAAVNG
## GIARYQEAFFDKDYINKHPGDAEKITQLKELMQEQVHVLGVGLAVHEKFVHPEMRPLHKKLIDQFQMMRA
## SLYHEFPGLDKLSPACSGTSTPRGNVLASHSPMSPESIKMTHRHSPMNLMGTGRHSSSSLSSHASSEAGN
## MVMLGDGSMGDAPEDLYHHMQLAYPNPRYQGSVTNVSVLSSSQASPSSSSLSSTHSAPSQMITSAPSSAR
## GSPSLPDKYRHAREMMLLLPTYRDRPSSAMYPAAILENGQPPNFQRALFQQVVGACKPCSDPNLSVAEKG
## HYSLHFDAFHHPLGDTPPALPARTLRKSPLHPIPASPTSPQSGLDGSNSTLSGSASSGVSSLSESNFGHS
## SEVPPRTDTMDSMPSQAWNADEDLEPPYLPVHYSLSESAVLDSIKAQPCRSHSAPGCVIPQDPMDPPALP
## PKPYHPRLPALEHDEGVLLREETERPRGLHRKASLPPGSAKEEQARMAWEHGRGEQ
entrez_fetch_list <- function(db, id, rettype, ...){
#setup list for storing output
n.seq <- length(id)
list.output <- as.list(rep(NA, n.seq))
names(list.output) <- id
# get output
for(i in 1:length(id)){
list.output[[i]] <- rentrez::entrez_fetch(db = db,
id = id[i],
rettype = rettype)
}
return(list.output)
}
list <- entrez_fetch_list(db = "protein",
id =table$accession,
rettype = "fasta")
list[[1]]
## [1] ">NP_004938.1 dedicator of cytokinesis protein 3 [Homo sapiens]\nMWTPTEEEKYGVVICSFRGSVPQGLVLEIGETVQILEKCEGWYRGVSTKKPNVKGIFPANYIHLKKAIVS\nNRGQYETVVPLEDSIVTEVTATLQEWASLWKQLYVKHKVDLFYKLRHVMNELIDLRRQLLSGHLTQDQVR\nEVKRHITVRLDWGNEHLGLDLVPRKDFEVVDSDQISVSDLYKMHLSSRQSVQQSTSQVDTMRPRHGETCR\nMPVPHHFFLSLKSFTYNTIGEDTDVFFSLYDMREGKQISERFLVRLNKNGGPRNPEKIERMCALFTDLSS\nKDMKRDLYIVAHVIRIGRMLLNDSKKGPPHLHYRRPYGCAVLSILDVLQSLTEVKEEKDFVLKVYTCNNE\nSEWSQIHENIIRKSSAKYSAPSASHGLIISLQLLRGDMEQIRRENPMIFNRGLAITRKLGFPDVIMPGDI\nRNDLYLTLEKGDFERGGKSVQKNIEVTMYVLYADGEILKDCISLGSGEPNRSSYHSFVLYHSNSPRWGEI\nIKLPIPIDRFRGSHLRFEFRHCSTKDKGEKKLFGFAFSTLMRDDGTTLSDDIHELYVYKCDENSTFNNHA\nLYLGLPCCKEDYNGCPNIPSSLIFQRSTKESFFISTQLSSTKLTQNVDLLALLKWKAFPDRIMDVLGRLR\nHVSGEEIVKFLQDILDTLFVILDDNTEKYGLLVFQSLVFIINLLRDIKYFHFRPVMDTYIQKHFAGALAY\nKELIRCLKWYMDCSAELIRQDHIQEAMRALEYLFKFIVQSRILYSRATCGMEEEQFRSSIQELFQSIRFV\nLSLDSRNSETLLFTQAALLNSFPTIFDELLQMFTVQEVAEFVRGTLGSMPSTVHIGQSMDVVKLQSIART\nVDSRLFSFSESRRILLPVVLHHIHLHLRQQKELLICSGILGSIFSIVKTSSLEADVMEEVEMMVESLLDV\nLLQTLLTIMSKSHAQEAVRGQRCPQCTAEITGEYVSCLLSLLRQMCDTHFQHLLDNFQSKDELKEFLLKI\nFCVFRNLMKMSVFPRDWMVMRLLTSNIIVTTVQYLSSALHKNFTETDFDFKVWNSYFSLAVLFINQPSLQ\nLEIITSAKRKKILDKYGDMRVMMAYELFSMWQNLGEHKIHFIPGMIGPFLGVTLVPQPEVRNIMIPIFHD\nMMDWEQRKNGNFKQVEAELIDKLDSMVSEGKGDESYRELFSLLTQLFGPYPSLLEKVEQETWRETGISFV\nTSVTRLMERLLDYRDCMKGEETENKKIGCTVNLMNFYKSEINKEEMYIRYIHKLCDMHLQAENYTEAAFT\nLLLYCELLQWEDRPLREFLHYPSQTEWQRKEGLCRKIIHYFNKGKSWEFGIPLCRELACQYESLYDYQSL\nSWIRKMEASYYDNIMEQQRLEPEFFRVGFYGRKFPFFLRNKEYVCRGHDYERLEAFQQRMLSEFPQAVAM\nQHPNHPDDAILQCDAQYLQIYAVTPIPDYVDVLQMDRVPDRVKSFYRVNNVRKFRYDRPFHKGPKDKENE\nFKSLWIERTTLTLTHSLPGISRWFEVERRELVEVSPLENAIQVVENKNQELRSLISQYQHKQVHGNINLL\nSMCLNGVIDAAVNGGIARYQEAFFDKDYINKHPGDAEKITQLKELMQEQVHVLGVGLAVHEKFVHPEMRP\nLHKKLIDQFQMMRASLYHEFPGLDKLSPACSGTSTPRGNVLASHSPMSPESIKMTHRHSPMNLMGTGRHS\nSSSLSSHASSEAGNMVMLGDGSMGDAPEDLYHHMQLAYPNPRYQGSVTNVSVLSSSQASPSSSSLSSTHS\nAPSQMITSAPSSARGSPSLPDKYRHAREMMLLLPTYRDRPSSAMYPAAILENGQPPNFQRALFQQVVGAC\nKPCSDPNLSVAEKGHYSLHFDAFHHPLGDTPPALPARTLRKSPLHPIPASPTSPQSGLDGSNSTLSGSAS\nSGVSSLSESNFGHSSEAPPRTDTMDSMPSQAWNADEDLEPPYLPVHYSLSESAVLDSIKAQPCRSHSAPG\nCVIPQDPMDPPALPPKPYHPRLPALEHDEGVLLREETERPRGLHRKAPLPPGSAKEEQARMAWEHGRGEQ\n\n"
list[[1]] <- fasta_cleaner(list[[1]], parse = F)
list[[2]] <- fasta_cleaner(list[[2]], parse = F)
list[[3]] <- fasta_cleaner(list[[3]], parse = F)
list[[4]] <- fasta_cleaner(list[[4]], parse = F)
list[[5]] <- fasta_cleaner(list[[5]], parse = F)
length(list)
## [1] 5
list_vector <- rep(NA, length(list))
list_vector
## [1] NA NA NA NA NA
for(i in 1:length(list_vector)){
list_vector[i] <- list[[i]]}
names(list_vector) <- names(list)
list_vector_ss <- Biostrings::AAStringSet(list_vector)
list_align <- msa(list_vector_ss,
method = "ClustalW")
## use default substitution matrix
list_align
## CLUSTAL 2.1
##
## Call:
## msa(list_vector_ss, method = "ClustalW")
##
## MsaAAMultipleAlignment with 5 rows and 2059 columns
## aln names
## [1] --------------------------...KAPLPPGSAKEEQARMAWEHGRGEQ NP_004938.1
## [2] MEENEENMEPKDLQQEHSDYRMLRAN...KAPLPPGSAKEEQARMAWEHGRGEQ XP_516488.4
## [3] --------------------------...KASLPPGSAKEEQARMAWEHGRGEQ XP_001089458.2
## [4] --------------------------...KASLPPGSVKEEQARLAWEHGRGEQ NP_700462.2
## [5] --------------------------...------------------------- NP_001008222.1
## Con --------------------------...KA?LPPGSAKEEQARMAWEHGRGEQ Consensus
class(list_align)
## [1] "MsaAAMultipleAlignment"
## attr(,"package")
## [1] "msa"
class(list_align) <- "AAMultipleAlignment"
list_align_seqinr <- msaConvert(list_align,
type = "seqinr::alignment")
compbio4all::print_msa(alignment = list_align_seqinr,
chunksize = 60)
## [1] "-----------------------------MWTPTEEEKYGVVICSFRGSVPQGLVLEIGE 0"
## [1] "MEENEENMEPKDLQQEHSDYRMLRANGVEGCSPELRKEWSDVICSFRGSVPQGLVLEIGE 0"
## [1] "------------------------------MLIGVFILAFLVICSFRGSVPQGLVLEIGE 0"
## [1] "-----------------------------MWTPTEEEKYGVVICSFRGSVPQGLVLEIGE 0"
## [1] "------------------------------------------------------------ 0"
## [1] " "
## [1] "TVQILEKCEGWYRGVSTKKPNVKGIFPANYIHLKKAIVSNRGQYETVVPLEDSIVTEVTA 0"
## [1] "TVQILEKCEGWYRGVSTKKPNVKGIFPANYIHLKKAIVSNRGQYETVVPLEDSIVTEVTA 0"
## [1] "TVQILEKCEGWYRGVSTKKPNVKGIFPANYIHLKKAIVSNRGQYETVVPLEDSIVTEVTA 0"
## [1] "TVQILEKCEGWYRGVSTKKPNVKGLFPANYIHLKKAIVSNRGQYETVVPLEDSIVTEVTT 0"
## [1] "------------------------------------------------------------ 0"
## [1] " "
## [1] "TLQEWASLWKQLYVKHKVDLFYKLRHVMNELIDLRRQLLSGHLTQDQVREVKRHITVRLD 0"
## [1] "TLQEWASLWKQLYVKHKVDLFYKLRHVMNELIDLRRQLLSGHLTQDQVREVKRHITVRLD 0"
## [1] "TLQEWASLWKQLYVKHKVDLFYKLRHVMNELIDLRRQLLSGHLTQDQVREVKRHITVRLD 0"
## [1] "TLQEWASLWKQLYVKHKVDLFYKLRHVMNELIDLRRQLLSGHLTQDQVREVKRHITVRLD 0"
## [1] "-----------------MKLFWLLFTIG----------FCWAQYSSNTQQGRTSIVHLFE 0"
## [1] " "
## [1] "WGNEHLGLDLVPRKDFEVVDSDQISVSDLYKMHLSSRQSVQQSTSQVDTMRPRHGETCRM 0"
## [1] "WGNEHLGLDLVPRKDFEVVDSDQISVSDLYKMHLSSRQSVQQSTSQVDTMRPRHGETCRM 0"
## [1] "WGNEHLGLDLVPRKDFEVVDSDQISVSDLYKMHLSSRQSVQQSTSQVDTMRPRHGETCRM 0"
## [1] "WGNEHLGLDLVPRKDFEVVDSDQISVSDLYKMHLSSRQSVQQSTSQVDTMRPRHGETCRM 0"
## [1] "WRWVDIALECER------------------------------------------------ 0"
## [1] " "
## [1] "PVPHHFFLSLKSFTYNTIGEDTDVFFSLYDMREGKQISERFLVRLNKNGGPRNPEKIERM 0"
## [1] "PVPHHFFLSLKSFTYNTIGEDTDVFFSLYDMREGKQISERFLVRLNKNGGPRNPEKIERM 0"
## [1] "PVPHHFFLSLKSFTYNTIGEDTDVFFSLYDMREGKQISERFLVRLNKNGGPRNPEKIERM 0"
## [1] "PVPHHFFFSLKSFTYNTIGEDSDVFFSLYDMREGKQISERFLVRLNKNGGPRNPEKIERM 0"
## [1] "------YLAPKGFGGVQVSPPNENVAIHNPFRPWWERYQPVSYKLCTRSG---------- 0"
## [1] " "
## [1] "CALFTDLSSKDMKRDLYIVAHVIRIGRMLLNDSKKGPPHLHYRRPYGCAVLSILDVLQSL 0"
## [1] "CALFTDLSSKDMKRDLYIVAHVIRIGRMLLNDSKKGPPHLHYRRPYGCAVLSILDVLQSL 0"
## [1] "CALFTDLSSKDMKRDLYIVAHVIRIGRMLLNDSKKGPPHLHYRRPYGCAVLSILDVLQSL 0"
## [1] "CALFTDLSSKDMKRDLYIVAHVIRIGRMLLNDSKKGPAHLHYRRPYGCAVLSILDVLQSL 0"
## [1] "------------------------------------------------------------ 0"
## [1] " "
## [1] "TEVKEEKDFVLKVYTCNNESEWSQIHENIIRKSSAKYSAPSASHGLIISLQLLRGDMEQI 0"
## [1] "TEVKEEKDFVLKVYTCNNESEWSQIHENIIRKSSAKYSAPSASHGLIISLQLLRGDMEQI 0"
## [1] "TEVKEEKDFVLKVYTCNNESEWSQIHENIIRKSSAKYSAPSASHGLIISLQLLRGDMEQI 0"
## [1] "TELKEEKDFVLKVYTCNNESEWTQIHENIIRKSSTKYSAPSASHGLIISLQLFRGDMEQI 0"
## [1] "----NEDEFRNMVTRCNNVGVRIYVDAVINHMCGNAVSAGTSST---------------- 0"
## [1] " "
## [1] "RRENPMIFNRGLAITRKLGFPDVIMPGDIRNDLYLTLEKGDFERGGKSVQKNIEVTMYVL 0"
## [1] "RRENPMIFNRGLAITRKLGFPDVIMPGDIRNDLYLTLEKGDFERGGKSVQKNIEVTMYVL 0"
## [1] "RRENPMIFNRGLAITRKLGFPDVIMPGDIRNDLYLTLEKGDFERGGKSVQKNIEVTMYVL 0"
## [1] "RRENPMIFNRGLAITRKLGFPDVIMPGDIRNDLYLTLEKGDFERGGKSVQKNIEVTMYVL 0"
## [1] "---CGSYFNP-----GSRDFPAVPYSG-------WDFNDGKCKTGSGDIEN--------- 0"
## [1] " "
## [1] "YADGEILKDCISLGSGEPNRSSYHSFVLYHSNSPRWGEIIKLPIPIDRFRGSHLRFEFRH 0"
## [1] "YADGEILKDCISLGSGEPNRSSYHSFVLYHSNSPRWGEIIKLPIPIDRFRGSHLRFEFRH 0"
## [1] "YADGEILKDCISLGSGEPNRSSYHSFVLYHSNSPRWGEIIKLPIPIDRFRGSHLRFEFRH 0"
## [1] "YADGEILKDCISLGSGEPNRSSYHSFVLYHSNSPRWGEIIKLPIPIDRFRGSHLRFEFRH 0"
## [1] "YNDATQVRDCRLSG------------------------LLDLALGKDYVR---------- 0"
## [1] " "
## [1] "CSTKDKGEKKLFGFAFSTLMRDDGTTLSDDIHELYVYKCDENSTFNNHALYLGLPCCKED 0"
## [1] "CSTKDKGEKKLFGFAFSPLMRDDGTTLSDDIHELYVYKCDENSTFNNHALYLGLPCCKED 0"
## [1] "CSTKDKGEKKLFGFAFSPLMRDDGTTLSDDIHELYVYKCDENSTFNNHALYLGLPCCKED 0"
## [1] "CSTKDKGEKKLFGFAFSPLMRDDGTTLSDDIHELYVYKCDENSTFNNHALYLGLPCCKED 0"
## [1] "---------------------------------------SKIAEYMNHLIDIGVAGFRID 0"
## [1] " "
## [1] "YNGCPNIPSSLIFQRSTKESFFISTQLSSTKLTQNVDLLALLKWKAFPDRIMDVLGRLRH 0"
## [1] "YNGCPNIPSSLIFQRSTKESFFISTQLSSTKLTQNVDLLALLKWKAFPDRIMDVLGRLRH 0"
## [1] "YNGCPNIPSSLIFQRSTKXXXXX-------------DLLALLKWKAFPDRIMDVLGRLRH 0"
## [1] "YNGCPNIPSSLIFQRSAKESFFISTQLSSTKLTQNVDLLALLKWKAFPDRIMDILGRLRH 0"
## [1] "AS-----------------------------------------KHMWPGDIKAILDKLHN 0"
## [1] " "
## [1] "VSGEEIVKFLQDILDTLFVILDDNTEKYGLLVFQSLVFIINLLRDIKYFHFRPVMDTYIQ 0"
## [1] "VSGEEIVKFLQDILDTLFVILDDNTEKYGLLVFQSLVFIINLLRDIKYFHFRPVMDTYIQ 0"
## [1] "VSGEEIVKFLQDILDTLFVILDDNTEKYGLLVFQSLVFIINLLRDIKYFHFRPVMDTYIQ 0"
## [1] "VSGEEIVKFLQDILDTLFVILDDNTEKYGLLVFQSLVFIINLLRDIKYFHFRPVMDTYIQ 0"
## [1] "LNSN-------------------------------------------------------- 0"
## [1] " "
## [1] "KHFAGALAYKELIRCLKWYMDCSAELIRQDHIQEAMRALEYLFKFIVQSRILYSRATCGM 0"
## [1] "KHFAGALAYKELIRCLKWYMDCSAELIRQDHIQEAMRALEYLFKFIVQSRILYSRATCGM 0"
## [1] "KHFAGALAYKELIRCLKWYMDCSAELIRQDHIQEAMRALEYLFKFIVQSRILYSRATCGM 0"
## [1] "KHFAGALAYKELIRCLKWYMDCSAELIRQDHIQEAMRALEYLFKFIVQSRILYSRATCGM 0"
## [1] "-----------------WFPEGSKPFIYQEVIDLGGEPIK-------------------- 0"
## [1] " "
## [1] "EEEQFRSSIQELFQSIRFVLSLDSRNSETLLFTQAALLNSFPTIFDELLQMFTVQEVAEF 0"
## [1] "EEEQFRSSIQELFQSIRFVLSLDSRNSETLLFTQAALLNSFPTIFDELLQMFTVQEVAEF 0"
## [1] "EEEQFRSSIQELFQSIRFVLSLDSRNSETLLFTQAALLNSFPTIFDELLQMFTVQEVAEF 0"
## [1] "EEEQFRSSIQELFQSIRFVLSLDSRNSETLLFTQAALLNSFPTIFDELLQMFTVQEVAEF 0"
## [1] "-----------------------------------------------SSDYFGNGRVTEF 0"
## [1] " "
## [1] "VRGTLGSMPSTVHIGQSMDVVKLQSIARTVDSRLFSFSESRRILLPVVLHHIHLHLRQQK 0"
## [1] "VRGTLGSMPSTVHIGQSMDVVKLQSIARTVDSRLFSFSESRRILLPVVLHHIHLHLRQQK 0"
## [1] "VRGTLGSMPSTVHIGQSMDVVKLQSIARTVDSRLFSFSESRRILLPVVLHHIHLHLRQQK 0"
## [1] "VRGTLGSMPSTVHIGQSMDVVKLQSIARTVDSRLFSFSESRRILLPVVLHHIHLHLRQQK 0"
## [1] "KYG-----------------AKLGTVIRKWNGEKMSYLKN-------------------- 0"
## [1] " "
## [1] "ELLICSGILGSIFSIVKTSSLEADVMEEVEMMVESLLDVLLQTLLTIMSKSHAQEAVRGQ 0"
## [1] "ELLICSGILGSIFSIVKTSSLEADVMEEVEMMVESLLDVLLQTLLTIMSKSHAQEAVRGQ 0"
## [1] "ELLICSGILGSIFSIVKTSSLEADVMEEVEMMVESLLDVLLQTLLTIMSKSHAQEAVRGQ 0"
## [1] "ELLICSGILGSIFSIVKTSSLEADVMEEVEMMVESLLDVLLQTLLTIMSKSHAQEAVRGQ 0"
## [1] "--------WGEGWGFMPSD----------------------RALVFVDNHDNQRGHGAGG 0"
## [1] " "
## [1] "RCPQCTAEITGEYVSCLLSLLRQMCDTHFQHLLDNFQSKDELKEFLLKIFCVFRNLMKMS 0"
## [1] "RCPQCTAEITGEYVSCLLSLLRQMCDTHFQHLLDNFQSKDELKEFLLKIFCVFRNLMKMS 0"
## [1] "RCPQCTAEITGEYVSCLLSLLRQMCDTHFQHLLDNFQSKDELKEFLLKIFCVFRNLMKMS 0"
## [1] "RCPQCTAEITGEYVSCLLSLLRQMCDTHFQHLLDNFQSKDELKEFLLKIFCVFRNLMKMS 0"
## [1] "-----------------------------ASILTFWDAR----------------LYKMA 0"
## [1] " "
## [1] "VFPRDWMVMRLLTSNIIVTTVQYLSSALHKNFTETDFDFKVWNSYFSLAVLFINQPSLQL 0"
## [1] "VFPRDWMVMRLLTSNIIVTTVQYLSSALHKNFTETDFDFKVWNSYFSLAVLFINQPSLQL 0"
## [1] "VFPRDWMVMRLLTSNIIVTTVQYLSSALHKNFTETDFDFKVWNSYFSLAVLFINQPSLQL 0"
## [1] "VFPRDWMVMRLLTSNIIVTTVQYLSSALHKNFTETDFDFKVWNSYFSLAVLFINQPSLQL 0"
## [1] "VG------------------------------------------------FMLAHP---- 0"
## [1] " "
## [1] "EIITSAKRKKILDKYGDMRVMMAYELFSMWQNLGEHKIHFIPGMIGPFLGVTLVPQPEVR 0"
## [1] "EIITSAKRKKILDKYGDMRVMMAYELFSMWQNLGEHKIHFIPGMIGPFLGVTLVPQPEVR 0"
## [1] "EIITSVKRKKILDKYGDMRVMMAYELFSMWQNLGEHKIHFIPGMIGPFLGVTLVPQPEVR 0"
## [1] "EIITSAKRKKILDKYGDMRVMMAYELFSMWQNLGDHKIHFIPGMIGPFLGVTLVPQPEVR 0"
## [1] "--------------YGFTRVMSSYRWPRYFENG--------------------------- 0"
## [1] " "
## [1] "NIMIPIFHDMMDWEQRKNGNFKQVEAELIDKLDSMVSEGKGDESYRELFSLLTQLFGPYP 0"
## [1] "NIMIPIFHDMMDWEQRKNGNFKQVEAELIDKLDSMVSEGKGDESYRELFSLLTQLFGPYP 0"
## [1] "NIMIPIFHDMMDWEQRKNGNFKQVEAELIDKLDSMVSEGKGDESYRELFSLLTQLFGPYP 0"
## [1] "NIMIPIFHDMMDWEQRKNGNFKQVEAELIDKLDSMVSEGKGDESYRELFGLLTQLFGPYP 0"
## [1] "-------KDVNDWVGPPND--NGVTKEVTINPDTTCGN---------------------- 0"
## [1] " "
## [1] "SLLEKVEQETWRETGISFVTSVTRLMERLLDYRDCMKGEETENKKIGCTVNLMNFYKSEI 0"
## [1] "SLLEKVEQETWRETGISFVTSVTRLMERLLDYRDCMKGEETENKKIGCTVNLMNFYKSEI 0"
## [1] "SLLEKVEQETWRETGISFVTSVTRLMERLLDYRDCMKGEETENKKIGCTVNLMNFYKSEI 0"
## [1] "SLLEKVEQETWRETGISFVTSVTRLMERLLDYRDCMKGEETENKKVGCTVNLMNFYKSEI 0"
## [1] "---DWVCEHRWRQIRN------------MVNFRNVVDGQ-----------PFTNWYDNGS 0"
## [1] " "
## [1] "NKEEMYIRYIHKLCDMHLQAENYTEAAFTLLLYCELLQWEDRPLREFLHYPSQTEWQRKE 0"
## [1] "NKEEMYIRYIHKLCDMHLQAENYTEAAFTLLLYCELLQWEDRPLREFLHYPSQTEWQRKE 0"
## [1] "NKEEMYIRYIHKLCDMHLQAENYTEAAFTLLLYCELLQWEDRPLREFLHYPSQTEWQRKE 0"
## [1] "NKEEMYIRYIHKLCDMHLQAENYTEAAFTLLLYCELLQWEDRPLREFLHYPSQTEWQRKE 0"
## [1] "N---------------------------------------------------QVAFGRGN 0"
## [1] " "
## [1] "GLCRKIIHYFNKGKSWEFGIPLCRELACQYESLYDYQSLSWIRKMEASYYDNIMEQQRLE 0"
## [1] "GLCRKIIHYFNKGKSWEFGIPLCRELACQYESLYDYQSLSWIRKMEASYYDNIMEQQRLE 0"
## [1] "GLCRKIIHYFNKGKSWEFGIPLCRELACQYESLYDYQSLSWIRKMEASYYDNIMEQQRLE 0"
## [1] "GLCRKIIHYFNKGKSWEFGIPLCRELACQYESLYDYQSLSWIRKMEASYYDNIIEQQRLE 0"
## [1] "---RGFIVFNN--DDWTF-------------------SLTLQTGLPAGTYCDVISGDKIN 0"
## [1] " "
## [1] "PEFFRVGFYGRKFPFFLRNKEYVCRGHDYERLEAFQQRMLSEFPQAVAMQHPNHPDDAIL 0"
## [1] "PEFFRVGFYGRKFPFFLRNKEYVCRGHDYERLEAFQQRMLSEFPQAVAMQHPNHPDDAIL 0"
## [1] "PEFFRVGFYGRKFPFFLRNKEYVCRGHDYERLEAFQQRMLSEFPQAVAMQHPNHPDDAIL 0"
## [1] "PEFFRVGFYGRKFPFFLRNKEYVCRGHDYERLEAFQQRMLSEFPQAVAMQHPNHPDDAIL 0"
## [1] "GNCTGIKIY-------------------------VSDDGKAHFSISNSAEDP-------- 0"
## [1] " "
## [1] "QCDAQYLQIYAVTPIPDYVDVLQMDRVPDRVKSFYRVNNVRKFRYDRPFHKGPKDKENEF 0"
## [1] "QCDAQYLQIYAVTPIPDYVDVLQMDRVPDRVKSFYRVNNVRKFRYDRPFHKGPKDKENEF 0"
## [1] "QCDAQYLQIYAVTPIPDYVDVLQMDRVPDRVKSFYRVNNVRKFRYDRPFHKGPKDKENEF 0"
## [1] "QCDAQYLQIYAVTPIPDYVDVLQMDRVPDRVKSFYRVNNVRKFRYDRPFHKGPKDKDNEF 0"
## [1] "-----FIAIHAESKL--------------------------------------------- 0"
## [1] " "
## [1] "KSLWIERTTLTLTHSLPGISRWFEVERRELVEVSPLENAIQVVENKNQELRSLISQYQHK 0"
## [1] "KSLWIERTTLTLTHSLPGISRWFEVERRELVEVSPLENAIQVVENKNQELRSLISQYQHK 0"
## [1] "KSLWIERTTLTLTHSLPGISRWFEVERRELVEVSPLENAIQVVENKNQELRSLISQYQHK 0"
## [1] "KSLWIERTTLTLTHSLPGISRWFEVERRELVEVSPLENAIQVVENKNQELRALISQYQHK 0"
## [1] "------------------------------------------------------------ 0"
## [1] " "
## [1] "QVHGNINLLSMCLNGVIDAAVNGGIARYQEAFFDKDYINKHPGDAEKITQLKELMQEQVH 0"
## [1] "QVHGNINLLSMCLNGVIDAAVNGGIARYQEAFFDKDYINKHPGDAEKITQLKELMQEQVH 0"
## [1] "QVHGNINLLSMCLNGVIDAAVNGGIARYQEAFFDKDYINKHPGDAEKITQLKELMQEQVH 0"
## [1] "QVHGNINLLSMCLNGVIDAAVNGGIARYQEAFFDKDYITKHPGDAEKISQLKELMQEQVH 0"
## [1] "------------------------------------------------------------ 0"
## [1] " "
## [1] "VLGVGLAVHEKFVHPEMRPLHKKLIDQFQMMRASLYHEFPGLDKLSPACSGTSTPRGNVL 0"
## [1] "VLGVGLAVHEKFVHPEMRPLHKKLIDQFQMMRASLYHEFPGLDKLSPACSGTNTPRGNVL 0"
## [1] "VLGVGLAVHEKFVHPEMRPLHKKLIDQFQMMRASLYHEFPGLDKLSPACSGTSTPRGNVL 0"
## [1] "VLGVGLAVHEKFVHPEMRPLHKKLIDQFQMMRASLYHEFPGLDKLSPACSGTSTPRGNVL 0"
## [1] "------------------------------------------------------------ 0"
## [1] " "
## [1] "ASHSPMSPESIKMTHRHSPMNLMGTGRHSSSSLSSHASSEAGNMVMLGDGSMGDAPEDLY 0"
## [1] "ASHSPMSPESIKMTHRHSPMNLMGTGRHSSSSLSSHASSEAGNMVMLGDGSMGDAPEDLY 0"
## [1] "ASHSPMSPESIKMTHRHSPMNLMGTGRHSSSSLSSHASSEAGNMVMLGDGSMGDAPEDLY 0"
## [1] "ASHSPMSPENIKMTHRHSPMNLMGTGRHSSSSLSSHASSEAGNMMMMGDNSMGEAPEDLY 0"
## [1] "------------------------------------------------------------ 0"
## [1] " "
## [1] "HHMQLAYPNPRYQGSVTNVSVLSSSQASPSSSSLSSTHSAPSQMITSAPSSARGSPSLPD 0"
## [1] "HHMQLAYPNPRYQGSVTNVSVLSSSQASPSSSSLSSTHSAPSQMITSAPSSARGSPSLPD 0"
## [1] "HHMQLAYPNPRYQGSVTNVSVLSSSQASPSSSSLSSTHSAPSQMITSAPSSARGSPSLPD 0"
## [1] "HHMQLAYHNPRYQGSVTNVSVLSSSQASPSSSSLSSTHSAPSQMITSAPSSTRGSPSLPD 0"
## [1] "------------------------------------------------------------ 0"
## [1] " "
## [1] "KYRHAREMMLLLPTYRDRPSSAMYPAAILENGQPPNFQRALFQQVVGACKPCSDPNLSVA 0"
## [1] "KYRHAREMMLLLPTYRDRPSSAMYPAAILENGQPPNFQRALFQQVVGACKPCSDPNLSVA 0"
## [1] "KYRHAREMMLLLPTYRDRPSSAMYPAAILENGQPPNFQRALFQQVVGACKPCSDPNLSVA 0"
## [1] "KYRHAREMMLLLPTHRDRPSSAMYPAAILENGQPPNFQRALFQQVVGACKPCSDPNLSMA 0"
## [1] "------------------------------------------------------------ 0"
## [1] " "
## [1] "EKGHYSLHFDAFHHPLGDTPPALPARTLRKSPLHPIPASPTSPQSGLDGSNSTLSGSASS 0"
## [1] "EKGHYSLHFDAFHHPLGDTPPALPARTLRKSPLHPIPASPTSPQSGLDGSNSTLSGSASS 0"
## [1] "EKGHYSLHFDAFHHPLGDTPPALPARTLRKSPLHPIPASPTSPQSGLDGSNSTLSGSASS 0"
## [1] "EKGHYSLHFDAFHHPLGDTPPALPARTLRKSPLHPIPASPTSPQSGLDGSNSTLSGSASS 0"
## [1] "------------------------------------------------------------ 0"
## [1] " "
## [1] "GVSSLSESNFGHSSEAPPRTDTMDSMPSQAWNADEDLEPPYLPVHYSLSESAVLDSIKAQ 0"
## [1] "GVSSLSESNFGHSSEAPPRTDTMDSMPSQAWNADEDLEPPYLPVHYSLSESAVLDSIKAQ 0"
## [1] "GVSSLSESNFGHSSEVPPRTDTMDSMPSQAWNADEDLEPPYLPVHYSLSESAVLDSIKAQ 0"
## [1] "GVSSLSESNFGHSSEAPPRTDTMDSMPSQAWNGDEDLEPPYLPVHYSLSESAVLDAIKSQ 0"
## [1] "------------------------------------------------------------ 0"
## [1] " "
## [1] "PCRSHSAPGCVIPQDPMDPPALPPKPYHPRLPALEHDEGVLLREETERPRGLHRKAPLPP 0"
## [1] "PCRSHSAPGCVIPQDPMDPPALPPKPYHPRLPALEHDEGVLLREETERPRGLHRKAPLPP 0"
## [1] "PCRSHSAPGCVIPQDPMDPPALPPKPYHPRLPALEHDEGVLLREETERPRGLHRKASLPP 0"
## [1] "PCRSHSAPGCVLPQDPMDPPALPPKPYHPRLPALEHDEGMLLREEAERPRGLHRKASLPP 0"
## [1] "------------------------------------------------------------ 0"
## [1] " "
## [1] "GSAKEEQARMAWEHGRGEQ 41"
## [1] "GSAKEEQARMAWEHGRGEQ 41"
## [1] "GSAKEEQARMAWEHGRGEQ 41"
## [1] "GSVKEEQARLAWEHGRGEQ 41"
## [1] "------------------- 41"
## [1] " "
class(list_align) <- "AAMultipleAlignment"
ggmsa::ggmsa(list_align,
start = 1,
end = 50)
#Distance matrix
list_subset_dist <- seqinr::dist.alignment(list_align_seqinr,
matrix = "identity")
list_subset_dist
## NP_004938.1 XP_516488.4 XP_001089458.2 NP_700462.2
## XP_516488.4 0.08002463
## XP_001089458.2 0.08636536 0.08636536
## NP_700462.2 0.13681817 0.15536387 0.15609608
## NP_001008222.1 0.88916065 0.88916065 0.88916065 0.88805952
#Phylogenetic tree for all sequences
tree_subset <- nj(list_subset_dist)
#rooted
plot.phylo(tree_subset, main="Phylogenetic Tree",
use.edge.length = F)
mtext(text = "DOCK3 family gene tree - rooted, no branch lenths")