Introduction
This code compiles summary information about the gene CISH cytokine inducible SH2 containing protein
It also generates alignments and a phylogeneitc tree to indicating the evolutionary relationship betweeen the human version of the gene and its homologs in other species.
n
library(BiocManager)
## Bioconductor version '3.13' is out-of-date; the current release version '3.14'
## is available with R version '4.1'; see https://bioconductor.org/install
install("drawProteins")
## Bioconductor version 3.13 (BiocManager 1.30.16), R 4.1.1 (2021-08-10)
## Warning: package(s) not installed when version(s) same as current; use `force = TRUE` to
## re-install: 'drawProteins'
## Old packages: 'backports', 'brio', 'broom', 'car', 'cli', 'conquer',
## 'corrplot', 'cpp11', 'crayon', 'credentials', 'crosstalk', 'data.table',
## 'desc', 'devtools', 'diffobj', 'digest', 'fs', 'generics', 'gert', 'glue',
## 'Hmisc', 'htmlTable', 'knitr', 'maps', 'Matrix', 'memoise', 'mgcv', 'mime',
## 'nloptr', 'openxlsx', 'pillar', 'pkgbuild', 'pkgload', 'plotly', 'rcmdcheck',
## 'RcppArmadillo', 'readr', 'remotes', 'rio', 'rlang', 'rsconnect',
## 'S4Vectors', 'sessioninfo', 'sp', 'stringi', 'testthat', 'tibble', 'tidyr',
## 'tinytex', 'tzdb', 'usethis', 'viridis', 'vroom', 'withr', 'xfun', 'xml2',
## 'yulab.utils'
library(drawProteins)
# github packages
library(compbio4all)
library(ggmsa)
## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
# CRAN packages
library(rentrez)
library(seqinr)
library(ape)
##
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
##
## as.alignment, consensus
library(pander)
library(ggplot2)
library(msa)
## Loading required package: Biostrings
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: XVector
## Loading required package: GenomeInfoDb
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
##
## complement
## The following object is masked from 'package:seqinr':
##
## translate
## The following object is masked from 'package:base':
##
## strsplit
##
## Attaching package: 'msa'
## The following object is masked from 'package:BiocManager':
##
## version
library(drawProteins)
## Biostrings
library(Biostrings)
library(HGNChelper)
HGNChelper::checkGeneSymbols(x = c("CISH"))
## Maps last updated on: Thu Oct 24 12:31:05 2019
## x Approved Suggested.Symbol
## 1 CISH TRUE CISH
CISH_table<-c("NP_659508.1", "Q9NSE2", "NA", "Homo sapiens", "Human", "CISH",
"XP_003309858.1", "H2QMP4", "NA", "P.troglodytes", "Chimpanzee", "CISH",
"NP_034025.1", "Q62225", "NA", "Mus musculus", "Mouse", "CISH",
"NP_113992.1", "B1WBX9", "NA", "R.norvegicus", "Rat", "CISH",
"NP_989957.1", "Q9PW70", "NA", "G.gallus", "birds", "CISH",
"NP_001107161.1", "A9ULE1", "NA", "X.tropicalis", "frog ", "cish",
"XP_541873.3", "F1PSF4", "NA", "C.lupus", "wolf", "CISH",
"NP_001070085.1", "Q08BW5", "NA", "D.rerio", "zebrafish", "cish",
"XP_001097824.1", "NA", "NA", "M.mulatta", "Monkey", "CISH")
refseq <- c("NP_659508.1","XP_003309858.1","NP_034025.1", "NP_113992.1", "NP_989957.1" ,"NP_001107161.1", "XP_541873.3","NP_001070085.1", "XP_001097824.1")
UniProt.id<-c("Q9NSE2", "H2QMP4","Q62225", "B1WBX9","Q9PW70","A9ULE1","F1PSF4","Q08BW5","NA" )
PDB <- c("NA","NA","NA","NA","NA","NA","NA","NA","NA","NA")
species <- c("Homo sapiens", "P.troglodytes", "Mus musculus", "R.norvegicus", "G.gallus", "X.tropicalis","C.lupus", "D.rerio", "M.mulatta")
common.name <- c("Human","Chimpanzee","Mouse","Rat","birds", "frog ","wolf","zebrafish", "Monkey" )
gene.name <- c("CISH","CISH","CISH","CISH","CISH","cish","CISH","cish","CISH")
CISH_table_matrix<-matrix(CISH_table, byrow = T, nrow = 9)
CISH_table <- data.frame(CISH_table_matrix, stringsAsFactors = F)
names(CISH_table) <- c("refseq", "UniProt.id","PDB", "species", "common.name", "gene.name")
CISH_table
## refseq UniProt.id PDB species common.name gene.name
## 1 NP_659508.1 Q9NSE2 NA Homo sapiens Human CISH
## 2 XP_003309858.1 H2QMP4 NA P.troglodytes Chimpanzee CISH
## 3 NP_034025.1 Q62225 NA Mus musculus Mouse CISH
## 4 NP_113992.1 B1WBX9 NA R.norvegicus Rat CISH
## 5 NP_989957.1 Q9PW70 NA G.gallus birds CISH
## 6 NP_001107161.1 A9ULE1 NA X.tropicalis frog cish
## 7 XP_541873.3 F1PSF4 NA C.lupus wolf CISH
## 8 NP_001070085.1 Q08BW5 NA D.rerio zebrafish cish
## 9 XP_001097824.1 NA NA M.mulatta Monkey CISH
pander::pander(CISH_table)
| NP_659508.1 |
Q9NSE2 |
NA |
Homo sapiens |
Human |
CISH |
| XP_003309858.1 |
H2QMP4 |
NA |
P.troglodytes |
Chimpanzee |
CISH |
| NP_034025.1 |
Q62225 |
NA |
Mus musculus |
Mouse |
CISH |
| NP_113992.1 |
B1WBX9 |
NA |
R.norvegicus |
Rat |
CISH |
| NP_989957.1 |
Q9PW70 |
NA |
G.gallus |
birds |
CISH |
| NP_001107161.1 |
A9ULE1 |
NA |
X.tropicalis |
frog |
cish |
| XP_541873.3 |
F1PSF4 |
NA |
C.lupus |
wolf |
CISH |
| NP_001070085.1 |
Q08BW5 |
NA |
D.rerio |
zebrafish |
cish |
| XP_001097824.1 |
NA |
NA |
M.mulatta |
Monkey |
CISH |
CISH_list <- compbio4all::entrez_fetch_list(db = "protein",
id = CISH_table$refseq,
rettype = "fasta")
length(CISH_list)
## [1] 9
CISH_list[[1]]
## [1] ">NP_659508.1 cytokine-inducible SH2-containing protein isoform 2 [Homo sapiens]\nMVLCVQGPRPLLAVERTGQRPLWAPSLELPKPVMQPLPAGAFLEEVAEGTPAQTESEPKVLDPEEDLLCI\nAKTFSYLRESGWYWGSITASEARQHLQKMPEGTFLVRDSTHPSYLFTLSVKTTRGPTNVRIEYADSSFRL\nDSNCLSRPRILAFPDVVSLVQHYVASCTADTRSDSPDPAPTPALPMPKEDAPSDPALPAPPPATAVHLKL\nVQPFVRRSSARSLQHLCRLVINRLVADVDCLPLPRRMADYLRQYPFQL\n\n"
for(i in 1:length(CISH_list)){
CISH_list[[i]] <- compbio4all::fasta_cleaner(CISH_list[[i]], parse = F)
}
Q9NSE2_json <- drawProteins::get_features("Q9NSE2")
## [1] "Download has worked"
my_prot_df <- drawProteins::feature_to_dataframe(Q9NSE2_json)
my_canvas <- draw_canvas(my_prot_df)
my_canvas <- draw_chains(my_canvas, my_prot_df,
label_size = 2.5)
my_canvas <- draw_domains(my_canvas, my_prot_df)
my_canvas

CISH_vector <- fasta_cleaner(CISH_list)
par(mfrow = c(2,2),
mar = c(2,2,2,1))
dotPlot(CISH_vector,
CISH_vector,
wsize = 1,
nmatch = 1,
main = "CISH HUMAN VECTOR")
dotPlot(CISH_vector,
CISH_vector,
wsize = 10,
nmatch = 1,
main = "CISH - size = 10, nmatch = 1")
dotPlot(CISH_vector,
CISH_vector,
wsize = 10,
nmatch = 5,
main = "CISH - size = 10, nmatch = 15")
dotPlot(CISH_vector,
CISH_vector,
wsize = 20,
nmatch = 5,
main = "CISH - size = 20, nmatch = 5")

aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
"L","K","M","F","P","S","T","W","Y","V")
## alpha proteins
alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91,
221, 249, 48, 123, 82, 122, 119, 33, 63, 167)
## beta proteins
beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120,
177, 115, 16, 85, 127, 341, 253, 44, 110, 229)
## alpha + beta
a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
110, 112, 25, 52, 71, 126, 117, 30, 108, 123)
## alpha/beta
a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239,
339, 321, 91, 158, 188, 327, 238, 72, 130, 378)
data.frame(aa.1.1, alpha, beta, a.plus.b, a.div.b)
## aa.1.1 alpha beta a.plus.b a.div.b
## 1 A 285 203 175 361
## 2 R 53 67 78 146
## 3 N 97 139 120 183
## 4 D 163 121 111 244
## 5 C 22 75 74 63
## 6 Q 67 122 74 114
## 7 E 134 86 86 257
## 8 G 197 297 171 377
## 9 H 111 49 33 107
## 10 I 91 120 93 239
## 11 L 221 177 110 339
## 12 K 249 115 112 321
## 13 M 48 16 25 91
## 14 F 123 85 52 158
## 15 P 82 127 71 188
## 16 S 122 341 126 327
## 17 T 119 253 117 238
## 18 W 33 44 30 72
## 19 Y 63 110 108 130
## 20 V 167 229 123 378
# convert them to frequencies
alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)
## dataframe
aa.prop <- data.frame(alpha.prop,
beta.prop,
a.plus.b.prop,
a.div.b)
## row labels
row.names(aa.prop) <- aa.1.1
pander::pander(aa.prop)
| A |
0.1165 |
0.07313 |
0.09264 |
0.08331 |
| R |
0.02166 |
0.02414 |
0.04129 |
0.03369 |
| N |
0.03964 |
0.05007 |
0.06353 |
0.04223 |
| D |
0.06661 |
0.04359 |
0.05876 |
0.05631 |
| C |
0.008991 |
0.02702 |
0.03917 |
0.01454 |
| Q |
0.02738 |
0.04395 |
0.03917 |
0.02631 |
| E |
0.05476 |
0.03098 |
0.04553 |
0.05931 |
| G |
0.08051 |
0.107 |
0.09052 |
0.08701 |
| H |
0.04536 |
0.01765 |
0.01747 |
0.02469 |
| I |
0.03719 |
0.04323 |
0.04923 |
0.05516 |
| L |
0.09031 |
0.06376 |
0.05823 |
0.07824 |
| K |
0.1018 |
0.04143 |
0.05929 |
0.07408 |
| M |
0.01962 |
0.005764 |
0.01323 |
0.021 |
| F |
0.05027 |
0.03062 |
0.02753 |
0.03646 |
| P |
0.03351 |
0.04575 |
0.03759 |
0.04339 |
| S |
0.04986 |
0.1228 |
0.0667 |
0.07547 |
| T |
0.04863 |
0.09114 |
0.06194 |
0.05493 |
| W |
0.01349 |
0.01585 |
0.01588 |
0.01662 |
| Y |
0.02575 |
0.03963 |
0.05717 |
0.03 |
| V |
0.06825 |
0.08249 |
0.06511 |
0.08724 |
## determine the number of each amino acid in protein.
table(CISH_vector)
## CISH_vector
## A C D E F G H I K L M N P Q R S T V W Y
## 23 6 14 14 8 8 5 5 7 32 5 3 32 11 20 21 15 19 3 7
CISH_human_table <- table(CISH_vector)/length(CISH_vector)
## A Function to convert a table into a vector
table_to_vector <- function(CISH_human_table){
table_names <- attr(CISH_human_table, "dimnames")[[1]]
table_vect <- as.vector(CISH_human_table)
names(table_vect) <- table_names
return(table_vect)
}
CISH.human.aa.freq <- table_to_vector(CISH_human_table)
CISH.human.aa.freq
## A C D E F G H
## 0.08914729 0.02325581 0.05426357 0.05426357 0.03100775 0.03100775 0.01937984
## I K L M N P Q
## 0.01937984 0.02713178 0.12403101 0.01937984 0.01162791 0.12403101 0.04263566
## R S T V W Y
## 0.07751938 0.08139535 0.05813953 0.07364341 0.01162791 0.02713178
## Check for the presence of “U” (unknown aa.)
aa.names <- names(CISH.human.aa.freq)
any(aa.names == "U")
## [1] FALSE
i.U <- which(aa.names == "U")
aa.names[i.U]
## character(0)
CISH.human.aa.freq[i.U]
## named numeric(0)
aa.prop$CISH.human.aa.freq <- CISH.human.aa.freq
pander::pander(aa.prop)
| A |
0.1165 |
0.07313 |
0.09264 |
0.08331 |
0.08915 |
| R |
0.02166 |
0.02414 |
0.04129 |
0.03369 |
0.02326 |
| N |
0.03964 |
0.05007 |
0.06353 |
0.04223 |
0.05426 |
| D |
0.06661 |
0.04359 |
0.05876 |
0.05631 |
0.05426 |
| C |
0.008991 |
0.02702 |
0.03917 |
0.01454 |
0.03101 |
| Q |
0.02738 |
0.04395 |
0.03917 |
0.02631 |
0.03101 |
| E |
0.05476 |
0.03098 |
0.04553 |
0.05931 |
0.01938 |
| G |
0.08051 |
0.107 |
0.09052 |
0.08701 |
0.01938 |
| H |
0.04536 |
0.01765 |
0.01747 |
0.02469 |
0.02713 |
| I |
0.03719 |
0.04323 |
0.04923 |
0.05516 |
0.124 |
| L |
0.09031 |
0.06376 |
0.05823 |
0.07824 |
0.01938 |
| K |
0.1018 |
0.04143 |
0.05929 |
0.07408 |
0.01163 |
| M |
0.01962 |
0.005764 |
0.01323 |
0.021 |
0.124 |
| F |
0.05027 |
0.03062 |
0.02753 |
0.03646 |
0.04264 |
| P |
0.03351 |
0.04575 |
0.03759 |
0.04339 |
0.07752 |
| S |
0.04986 |
0.1228 |
0.0667 |
0.07547 |
0.0814 |
| T |
0.04863 |
0.09114 |
0.06194 |
0.05493 |
0.05814 |
| W |
0.01349 |
0.01585 |
0.01588 |
0.01662 |
0.07364 |
| Y |
0.02575 |
0.03963 |
0.05717 |
0.03 |
0.01163 |
| V |
0.06825 |
0.08249 |
0.06511 |
0.08724 |
0.02713 |
# Corrleation used in Chou adn Zhange 1992.
chou_cor <- function(x,y){
numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)
}
# Cosine similarity used in Higgs and Attwood (2005).
chou_cosine <- function(z.1, z.2){
z.1.abs <- sqrt(sum(z.1^2))
z.2.abs <- sqrt(sum(z.2^2))
my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs)
return(my.cosine)
}
# Calculate correlation between each column
corr.alpha <- chou_cor(aa.prop[,5], aa.prop[,1])
corr.beta <- chou_cor(aa.prop[,5], aa.prop[,2])
corr.apb <- chou_cor(aa.prop[,5], aa.prop[,3])
corr.adb <- chou_cor(aa.prop[,5], aa.prop[,4])
# Calculate cosine similarity
cos.alpha <- chou_cosine(aa.prop[,5], aa.prop[,1])
cos.beta <- chou_cosine(aa.prop[,5], aa.prop[,2])
cos.apb <- chou_cosine(aa.prop[,5], aa.prop[,3])
cos.adb <- chou_cosine(aa.prop[,5], aa.prop[,4])
# Calculate distance.
aa.prop.flipped <- t(aa.prop)
round(aa.prop.flipped,2)
## A R N D C Q E G H I L K
## alpha.prop 0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09 0.10
## beta.prop 0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06 0.04
## a.plus.b.prop 0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06 0.06
## a.div.b 0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08 0.07
## CISH.human.aa.freq 0.09 0.02 0.05 0.05 0.03 0.03 0.02 0.02 0.03 0.12 0.02 0.01
## M F P S T W Y V
## alpha.prop 0.02 0.05 0.03 0.05 0.05 0.01 0.03 0.07
## beta.prop 0.01 0.03 0.05 0.12 0.09 0.02 0.04 0.08
## a.plus.b.prop 0.01 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## a.div.b 0.02 0.04 0.04 0.08 0.05 0.02 0.03 0.09
## CISH.human.aa.freq 0.12 0.04 0.08 0.08 0.06 0.07 0.01 0.03
# distance matrix
dist(aa.prop.flipped, method = "euclidean")
## alpha.prop beta.prop a.plus.b.prop a.div.b
## beta.prop 0.13342098
## a.plus.b.prop 0.09281824 0.08289406
## a.div.b 0.06699039 0.08659174 0.06175113
## CISH.human.aa.freq 0.21707581 0.20735229 0.19228630 0.19476457
# Individual distances using dist()
dist.alpha <- dist((aa.prop.flipped[c(1,5),]), method = "euclidean")
dist.beta <- dist((aa.prop.flipped[c(2,5),]), method = "euclidean")
dist.apb <- dist((aa.prop.flipped[c(3,5),]), method = "euclidean")
dist.adb <- dist((aa.prop.flipped[c(4,5),]), method = "euclidean")
# Compile the information. Rounding makes it easier to read
## fold types
fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")
## data
corr.sim <- round(c(corr.alpha,corr.beta,corr.apb,corr.adb),5)
cosine.sim <- round(c(cos.alpha,cos.beta,cos.apb,cos.adb),5)
Euclidean.dist <- round(c(dist.alpha,dist.beta,dist.apb,dist.adb),5)
## summary
sim.sum <- c("","","most.sim","")
dist.sum <- c("","","min.dist","")
df <- data.frame(fold.type,
corr.sim ,
cosine.sim ,
Euclidean.dist ,
sim.sum ,
dist.sum )
# display output
pander::pander(df)
| alpha |
0.6639 |
0.6639 |
0.2171 |
|
|
| beta |
0.6968 |
0.6968 |
0.2074 |
|
|
| alpha plus beta |
0.7247 |
0.7247 |
0.1923 |
most.sim |
min.dist |
| alpha/beta |
0.721 |
0.721 |
0.1948 |
|
|
names(CISH_list)
## [1] "NP_659508.1" "XP_003309858.1" "NP_034025.1" "NP_113992.1"
## [5] "NP_989957.1" "NP_001107161.1" "XP_541873.3" "NP_001070085.1"
## [9] "XP_001097824.1"
length(CISH_list)
## [1] 9
CISH_list[1]
## $NP_659508.1
## [1] "MVLCVQGPRPLLAVERTGQRPLWAPSLELPKPVMQPLPAGAFLEEVAEGTPAQTESEPKVLDPEEDLLCIAKTFSYLRESGWYWGSITASEARQHLQKMPEGTFLVRDSTHPSYLFTLSVKTTRGPTNVRIEYADSSFRLDSNCLSRPRILAFPDVVSLVQHYVASCTADTRSDSPDPAPTPALPMPKEDAPSDPALPAPPPATAVHLKLVQPFVRRSSARSLQHLCRLVINRLVADVDCLPLPRRMADYLRQYPFQL"
# Make each entry of the list into a vector
CISH_table
## refseq UniProt.id PDB species common.name gene.name
## 1 NP_659508.1 Q9NSE2 NA Homo sapiens Human CISH
## 2 XP_003309858.1 H2QMP4 NA P.troglodytes Chimpanzee CISH
## 3 NP_034025.1 Q62225 NA Mus musculus Mouse CISH
## 4 NP_113992.1 B1WBX9 NA R.norvegicus Rat CISH
## 5 NP_989957.1 Q9PW70 NA G.gallus birds CISH
## 6 NP_001107161.1 A9ULE1 NA X.tropicalis frog cish
## 7 XP_541873.3 F1PSF4 NA C.lupus wolf CISH
## 8 NP_001070085.1 Q08BW5 NA D.rerio zebrafish cish
## 9 XP_001097824.1 NA NA M.mulatta Monkey CISH
human <- unlist(CISH_list[1])
mouse <- unlist(CISH_list[2])
rat <- unlist(CISH_list[3])
frog <- unlist(CISH_list[4])
fruit_fly <- unlist(CISH_list[5])
roundworm <- unlist(CISH_list[6])
chimpanze <- unlist(CISH_list[7])
monkey <- unlist(CISH_list[8])
cattle <- unlist(CISH_list[9])
pest <- unlist(CISH_list[10])
# name the vector
## names(cul5_vector) <- names(cul5_list)
data(BLOSUM50)
align01.07 <- Biostrings::pairwiseAlignment(human,
chimpanze,
substitutionMatrix = BLOSUM50,
gapOpening = -2,
gapExtension = -8,
scoreOnly = FALSE)
align01.02 <- Biostrings::pairwiseAlignment(human,
mouse,
substitutionMatrix = BLOSUM50,
gapOpening = -2,
gapExtension = -8,
scoreOnly = FALSE)
align01.03 <- Biostrings::pairwiseAlignment(human,
rat,
substitutionMatrix = BLOSUM50,
gapOpening = -2,
gapExtension = -8,
scoreOnly = FALSE)
align07.02 <- Biostrings::pairwiseAlignment(chimpanze,
mouse,
substitutionMatrix = BLOSUM50,
gapOpening = -2,
gapExtension = -8,
scoreOnly = FALSE)
align07.03 <- Biostrings::pairwiseAlignment(chimpanze,
rat,
substitutionMatrix = BLOSUM50,
gapOpening = -2,
gapExtension = -8,
scoreOnly = FALSE)
align02.03 <- Biostrings::pairwiseAlignment(mouse,
rat,
substitutionMatrix = BLOSUM50,
gapOpening = -2,
gapExtension = -8,
scoreOnly = FALSE)
Biostrings::pid(align01.07)
## [1] 90.69767
Biostrings::pid(align01.02)
## [1] 99.6124
Biostrings::pid(align01.03)
## [1] 90.69767
pids <- c(1,NA,NA,NA,
pid(align01.07), 1, NA, NA,
pid(align01.02), pid(align07.02), 1, NA,
pid(align01.03), pid(align07.03), pid(align02.03), 1)
mat <- matrix(pids, nrow = 4, byrow = T)
row.names(mat) <- c("Homo","Pan","Mouse","Rat")
colnames(mat) <- c("Homo","Pan","Mouse","Rat")
pander::pander(mat)
| Homo |
1 |
NA |
NA |
NA |
| Pan |
90.7 |
1 |
NA |
NA |
| Mouse |
99.61 |
90.31 |
1 |
NA |
| Rat |
90.7 |
89.11 |
91.09 |
1 |
human_PID1 <- pid(align01.07,type ="PID1" )
human_PID2 <- pid(align01.07,type ="PID2" )
human_PID3 <- pid(align01.07,type ="PID3" )
human_PID4 <- pid(align01.07,type ="PID4" )
method <- c("PID1","PID2","PID3","PID4")
PID_methods <- c(human_PID1,human_PID2,human_PID3,human_PID4)
denominator <- c("(aligned positions + internal gap positions)",
"(aligned positions)",
"(length shorter sequence)",
"(average length of the two sequences)")
PID_comparsions <- data.frame(method,PID=PID_methods,denominator)
PID_comparsions
## method PID denominator
## 1 PID1 90.69767 (aligned positions + internal gap positions)
## 2 PID2 91.05058 (aligned positions)
## 3 PID3 91.05058 (length shorter sequence)
## 4 PID4 90.87379 (average length of the two sequences)
hShroom3 <- rentrez::entrez_fetch(db = "protein",
id = "NP_065910",
rettype = "fasta")
cat(hShroom3)
## >NP_065910.3 protein Shroom3 [Homo sapiens]
## MMRTTEDFHKPSATLNSNTATKGRYIYLEAFLEGGAPWGFTLKGGLEHGEPLIISKVEEGGKADTLSSKL
## QAGDEVVHINEVTLSSSRKEAVSLVKGSYKTLRLVVRRDVCTDPGHADTGASNFVSPEHLTSGPQHRKAA
## WSGGVKLRLKHRRSEPAGRPHSWHTTKSGEKQPDASMMQISQGMIGPPWHQSYHSSSSTSDLSNYDHAYL
## RRSPDQCSSQGSMESLEPSGAYPPCHLSPAKSTGSIDQLSHFHNKRDSAYSSFSTSSSILEYPHPGISGR
## ERSGSMDNTSARGGLLEGMRQADIRYVKTVYDTRRGVSAEYEVNSSALLLQGREARASANGQGYDKWSNI
## PRGKGVPPPSWSQQCPSSLETATDNLPPKVGAPLPPARSDSYAAFRHRERPSSWSSLDQKRLCRPQANSL
## GSLKSPFIEEQLHTVLEKSPENSPPVKPKHNYTQKAQPGQPLLPTSIYPVPSLEPHFAQVPQPSVSSNGM
## LYPALAKESGYIAPQGACNKMATIDENGNQNGSGRPGFAFCQPLEHDLLSPVEKKPEATAKYVPSKVHFC
## SVPENEEDASLKRHLTPPQGNSPHSNERKSTHSNKPSSHPHSLKCPQAQAWQAGEDKRSSRLSEPWEGDF
## QEDHNANLWRRLEREGLGQSLSGNFGKTKSAFSSLQNIPESLRRHSSLELGRGTQEGYPGGRPTCAVNTK
## AEDPGRKAAPDLGSHLDRQVSYPRPEGRTGASASFNSTDPSPEEPPAPSHPHTSSLGRRGPGPGSASALQ
## GFQYGKPHCSVLEKVSKFEQREQGSQRPSVGGSGFGHNYRPHRTVSTSSTSGNDFEETKAHIRFSESAEP
## LGNGEQHFKNGELKLEEASRQPCGQQLSGGASDSGRGPQRPDARLLRSQSTFQLSSEPEREPEWRDRPGS
## PESPLLDAPFSRAYRNSIKDAQSRVLGATSFRRRDLELGAPVASRSWRPRPSSAHVGLRSPEASASASPH
## TPRERHSVTPAEGDLARPVPPAARRGARRRLTPEQKKRSYSEPEKMNEVGIVEEAEPAPLGPQRNGMRFP
## ESSVADRRRLFERDGKACSTLSLSGPELKQFQQSALADYIQRKTGKRPTSAAGCSLQEPGPLRERAQSAY
## LQPGPAALEGSGLASASSLSSLREPSLQPRREATLLPATVAETQQAPRDRSSSFAGGRRLGERRRGDLLS
## GANGGTRGTQRGDETPREPSSWGARAGKSMSAEDLLERSDVLAGPVHVRSRSSPATADKRQDVLLGQDSG
## FGLVKDPCYLAGPGSRSLSCSERGQEEMLPLFHHLTPRWGGSGCKAIGDSSVPSECPGTLDHQRQASRTP
## CPRPPLAGTQGLVTDTRAAPLTPIGTPLPSAIPSGYCSQDGQTGRQPLPPYTPAMMHRSNGHTLTQPPGP
## RGCEGDGPEHGVEEGTRKRVSLPQWPPPSRAKWAHAAREDSLPEESSAPDFANLKHYQKQQSLPSLCSTS
## DPDTPLGAPSTPGRISLRISESVLRDSPPPHEDYEDEVFVRDPHPKATSSPTFEPLPPPPPPPPSQETPV
## YSMDDFPPPPPHTVCEAQLDSEDPEGPRPSFNKLSKVTIARERHMPGAAHVVGSQTLASRLQTSIKGSEA
## ESTPPSFMSVHAQLAGSLGGQPAPIQTQSLSHDPVSGTQGLEKKVSPDPQKSSEDIRTEALAKEIVHQDK
## SLADILDPDSRLKTTMDLMEGLFPRDVNLLKENSVKRKAIQRTVSSSGCEGKRNEDKEAVSMLVNCPAYY
## SVSAPKAELLNKIKEMPAEVNEEEEQADVNEKKAELIGSLTHKLETLQEAKGSLLTDIKLNNALGEEVEA
## LISELCKPNEFDKYRMFIGDLDKVVNLLLSLSGRLARVENVLSGLGEDASNEERSSLYEKRKILAGQHED
## ARELKENLDRRERVVLGILANYLSEEQLQDYQHFVKMKSTLLIEQRKLDDKIKLGQEQVKCLLESLPSDF
## IPKAGALALPPNLTSEPIPAGGCTFSGIFPTLTSPL
nchar(hShroom3)
## [1] 2070
mShroom3a <- entrez_fetch(db = "protein",
id = "AAF13269",
rettype = "fasta")
# Human shroom 2 (H. sapiens)
hShroom2 <- entrez_fetch(db = "protein",
id = "CAA58534",
rettype = "fasta")
# Sea-urchin shroom
sShroom <- entrez_fetch(db = "protein",
id = "XP_783573",
rettype = "fasta")
nchar(hShroom3)
## [1] 2070
nchar(mShroom3a)
## [1] 2083
nchar(sShroom)
## [1] 1758
nchar(hShroom2)
## [1] 1673
fasta_cleaner <- function(fasta_object, parse = TRUE){
fasta_object <- sub("^(>)(.*?)(\\n)(.*)(\\n\\n)","\\4",fasta_object)
fasta_object <- gsub("\n", "", fasta_object)
if(parse == TRUE){
fasta_object <- stringr::str_split(fasta_object,
pattern = "",
simplify = FALSE)
}
return(fasta_object[[1]])
}
fasta_cleaner
## function(fasta_object, parse = TRUE){
##
## fasta_object <- sub("^(>)(.*?)(\\n)(.*)(\\n\\n)","\\4",fasta_object)
## fasta_object <- gsub("\n", "", fasta_object)
##
## if(parse == TRUE){
## fasta_object <- stringr::str_split(fasta_object,
## pattern = "",
## simplify = FALSE)
## }
##
## return(fasta_object[[1]])
## }
hShroom3 <- fasta_cleaner(hShroom3, parse = F)
mShroom3a <- fasta_cleaner(mShroom3a, parse = F)
hShroom2 <- fasta_cleaner(hShroom2, parse = F)
sShroom <- fasta_cleaner(sShroom, parse = F)
hShroom3
## [1] "MMRTTEDFHKPSATLNSNTATKGRYIYLEAFLEGGAPWGFTLKGGLEHGEPLIISKVEEGGKADTLSSKLQAGDEVVHINEVTLSSSRKEAVSLVKGSYKTLRLVVRRDVCTDPGHADTGASNFVSPEHLTSGPQHRKAAWSGGVKLRLKHRRSEPAGRPHSWHTTKSGEKQPDASMMQISQGMIGPPWHQSYHSSSSTSDLSNYDHAYLRRSPDQCSSQGSMESLEPSGAYPPCHLSPAKSTGSIDQLSHFHNKRDSAYSSFSTSSSILEYPHPGISGRERSGSMDNTSARGGLLEGMRQADIRYVKTVYDTRRGVSAEYEVNSSALLLQGREARASANGQGYDKWSNIPRGKGVPPPSWSQQCPSSLETATDNLPPKVGAPLPPARSDSYAAFRHRERPSSWSSLDQKRLCRPQANSLGSLKSPFIEEQLHTVLEKSPENSPPVKPKHNYTQKAQPGQPLLPTSIYPVPSLEPHFAQVPQPSVSSNGMLYPALAKESGYIAPQGACNKMATIDENGNQNGSGRPGFAFCQPLEHDLLSPVEKKPEATAKYVPSKVHFCSVPENEEDASLKRHLTPPQGNSPHSNERKSTHSNKPSSHPHSLKCPQAQAWQAGEDKRSSRLSEPWEGDFQEDHNANLWRRLEREGLGQSLSGNFGKTKSAFSSLQNIPESLRRHSSLELGRGTQEGYPGGRPTCAVNTKAEDPGRKAAPDLGSHLDRQVSYPRPEGRTGASASFNSTDPSPEEPPAPSHPHTSSLGRRGPGPGSASALQGFQYGKPHCSVLEKVSKFEQREQGSQRPSVGGSGFGHNYRPHRTVSTSSTSGNDFEETKAHIRFSESAEPLGNGEQHFKNGELKLEEASRQPCGQQLSGGASDSGRGPQRPDARLLRSQSTFQLSSEPEREPEWRDRPGSPESPLLDAPFSRAYRNSIKDAQSRVLGATSFRRRDLELGAPVASRSWRPRPSSAHVGLRSPEASASASPHTPRERHSVTPAEGDLARPVPPAARRGARRRLTPEQKKRSYSEPEKMNEVGIVEEAEPAPLGPQRNGMRFPESSVADRRRLFERDGKACSTLSLSGPELKQFQQSALADYIQRKTGKRPTSAAGCSLQEPGPLRERAQSAYLQPGPAALEGSGLASASSLSSLREPSLQPRREATLLPATVAETQQAPRDRSSSFAGGRRLGERRRGDLLSGANGGTRGTQRGDETPREPSSWGARAGKSMSAEDLLERSDVLAGPVHVRSRSSPATADKRQDVLLGQDSGFGLVKDPCYLAGPGSRSLSCSERGQEEMLPLFHHLTPRWGGSGCKAIGDSSVPSECPGTLDHQRQASRTPCPRPPLAGTQGLVTDTRAAPLTPIGTPLPSAIPSGYCSQDGQTGRQPLPPYTPAMMHRSNGHTLTQPPGPRGCEGDGPEHGVEEGTRKRVSLPQWPPPSRAKWAHAAREDSLPEESSAPDFANLKHYQKQQSLPSLCSTSDPDTPLGAPSTPGRISLRISESVLRDSPPPHEDYEDEVFVRDPHPKATSSPTFEPLPPPPPPPPSQETPVYSMDDFPPPPPHTVCEAQLDSEDPEGPRPSFNKLSKVTIARERHMPGAAHVVGSQTLASRLQTSIKGSEAESTPPSFMSVHAQLAGSLGGQPAPIQTQSLSHDPVSGTQGLEKKVSPDPQKSSEDIRTEALAKEIVHQDKSLADILDPDSRLKTTMDLMEGLFPRDVNLLKENSVKRKAIQRTVSSSGCEGKRNEDKEAVSMLVNCPAYYSVSAPKAELLNKIKEMPAEVNEEEEQADVNEKKAELIGSLTHKLETLQEAKGSLLTDIKLNNALGEEVEALISELCKPNEFDKYRMFIGDLDKVVNLLLSLSGRLARVENVLSGLGEDASNEERSSLYEKRKILAGQHEDARELKENLDRRERVVLGILANYLSEEQLQDYQHFVKMKSTLLIEQRKLDDKIKLGQEQVKCLLESLPSDFIPKAGALALPPNLTSEPIPAGGCTFSGIFPTLTSPL"
nchar(hShroom3)
## [1] 1996
library(Biostrings)
align.h3.vs.m3a <- Biostrings::pairwiseAlignment (
hShroom3,
mShroom3a)
align.h3.vs.m3a
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MMRTTEDFHKPSATLN-SNTATKGRYIYLEAFLE...KAGALALPPNLTSEPIPAGGCTFSGIFPTLTSPL
## subject: MK-TPENLEEPSATPNPSRTPTE-RFVYLEALLE...KAGAISLPPALTGHATPGGTSVFGGVFPTLTSPL
## score: 2189.934
Biostrings::pid(align.h3.vs.m3a)
## [1] 70.56511
align.h3.vs.h2 <- Biostrings::pairwiseAlignment(
hShroom3,
hShroom2)
score(align.h3.vs.h2)
## [1] -5673.853
Biostrings::pid(align.h3.vs.h2)
## [1] 33.83277
shroom_table <- c("CAA78718" , "X. laevis Apx" , "xShroom1",
"NP_597713" , "H. sapiens APXL2" , "hShroom1",
"CAA58534" , "H. sapiens APXL", "hShroom2",
"ABD19518" , "M. musculus Apxl" , "mShroom2",
"AAF13269" , "M. musculus ShroomL" , "mShroom3a",
"AAF13270" , "M. musculus ShroomS" , "mShroom3b",
"NP_065910", "H. sapiens Shroom" , "hShroom3",
"ABD59319" , "X. laevis Shroom-like", "xShroom3",
"NP_065768", "H. sapiens KIAA1202" , "hShroom4a",
"AAK95579" , "H. sapiens SHAP-A" , "hShroom4b",
#"DQ435686" , "M. musculus KIAA1202" , "mShroom4",
"ABA81834" , "D. melanogaster Shroom", "dmShroom",
"EAA12598" , "A. gambiae Shroom", "agShroom",
"XP_392427" , "A. mellifera Shroom" , "amShroom",
"XP_783573" , "S. purpuratus Shroom" , "spShroom")
# convert to matrix
shroom_table_matrix <- matrix(shroom_table,
byrow = T,
nrow = 14)
# convert to dataframe
shroom_table <- data.frame(shroom_table_matrix,
stringsAsFactors = F)
# setting up columns of new created dataframe
names(shroom_table) <- c("accession", "name.orig","name.new")
# Create simplified species names
shroom_table$spp <- "Homo"
shroom_table$spp[grep("laevis",shroom_table$name.orig)] <- "Xenopus"
shroom_table$spp[grep("musculus",shroom_table$name.orig)] <- "Mus"
shroom_table$spp[grep("melanogaster",shroom_table$name.orig)] <- "Drosophila"
shroom_table$spp[grep("gambiae",shroom_table$name.orig)] <- "mosquito"
shroom_table$spp[grep("mellifera",shroom_table$name.orig)] <- "bee"
shroom_table$spp[grep("purpuratus",shroom_table$name.orig)] <- "sea urchin"
shroom_table
## accession name.orig name.new spp
## 1 CAA78718 X. laevis Apx xShroom1 Xenopus
## 2 NP_597713 H. sapiens APXL2 hShroom1 Homo
## 3 CAA58534 H. sapiens APXL hShroom2 Homo
## 4 ABD19518 M. musculus Apxl mShroom2 Mus
## 5 AAF13269 M. musculus ShroomL mShroom3a Mus
## 6 AAF13270 M. musculus ShroomS mShroom3b Mus
## 7 NP_065910 H. sapiens Shroom hShroom3 Homo
## 8 ABD59319 X. laevis Shroom-like xShroom3 Xenopus
## 9 NP_065768 H. sapiens KIAA1202 hShroom4a Homo
## 10 AAK95579 H. sapiens SHAP-A hShroom4b Homo
## 11 ABA81834 D. melanogaster Shroom dmShroom Drosophila
## 12 EAA12598 A. gambiae Shroom agShroom mosquito
## 13 XP_392427 A. mellifera Shroom amShroom bee
## 14 XP_783573 S. purpuratus Shroom spShroom sea urchin
shroom_table$accession
## [1] "CAA78718" "NP_597713" "CAA58534" "ABD19518" "AAF13269" "AAF13270"
## [7] "NP_065910" "ABD59319" "NP_065768" "AAK95579" "ABA81834" "EAA12598"
## [13] "XP_392427" "XP_783573"
shrooms <- rentrez::entrez_fetch(db = "protein",
id = shroom_table$accession,
rettype = "fasta")
larp <- entrez_fetch(db = "protein",
id = "NP_291029.2",
rettype = "fasta")
larp_clean <- fasta_cleaner(larp, parse = F)
larp
## [1] ">NP_291029.2 la-related protein 1 isoform 2 [Homo sapiens]\nMATQVEPLLPGGATLLQAEEHGGLVRKKPPPAPEGKGEPGPNDVRGGEPDGSARRPRPPCAKPHKEGTGQ\nQERESPRPLQLPGAEGPAISDGEEGGGEPGAGGGAAGAAGAGRRDFVEAPPPKVNPWTKNALPPVLTTVN\nGQSPPEHSAPAKVVRAAVPKQRKGSKVGDFGDAINWPTPGEIAHKSVQPQSHKPQPTRKLPPKKDMKEQE\nKGEGSDSKESPKTKSDESGEEKNGDEDCQRGGQKKKGNKHKWVPLQIDMKPEVPREKLASRPTRPPEPRH\nIPANRGEIKGSESATYVPVAPPTPAWQPEIKPEPAWHDQDETSSVKSDGAGGARASFRGRGRGRGRGRGR\nGRGGTRTHFDYQFGYRKFDGVEGPRTPKYMNNITYYFDNVSSTELYSVDQELLKDYIKRQIEYYFSVDNL\nERDFFLRRKMDADGFLPITLIASFHRVQALTTDISLIFAALKDSKVVEIVDEKVRRREEPEKWPLPPIVD\nYSQTDFSQLLNCPEFVPRQHYQKETESAPGSPRAVTPVPTKTEEVSNLKTLPKGLSASLPDLDSENWIEV\nKKRPRPSPARPKKSEESRFSHLTSLPQQLPSQQLMSKDQDEQEELDFLFDEEMEQMDGRKNTFTAWSDEE\nSDYEIDDRDVNKILIVTQTPHYMRRHPGGDRTGNHTSRAKMSAELAKVINDGLFYYEQDLWAEKFEPEYS\nQIKQEVENFKKVNMISREQFDTLTPEPPVDPNQEVPPGPPRFQQVPTDALANKLFGAPEPSTIARSLPTT\nVPESPNYRNTRTPRTPRTPQLKDSSQTSRFYPVVKEGRTLDAKMPRKRKTRHSSNPPLESHVGWVMDSRE\nHRPRTASISSSPSEGTPTVGSYGCTPQSLPKFQHPSHELLKENGFTQHVYHKYRRRCLNERKRLGIGQSQ\nEMNTLFRFWSFFLRDHFNKKMYEEFKQLALEDAKEGYRYGLECLFRYYSYGLEKKFRLDIFKDFQEETVK\nDYEAGQLYGLEKFWAFLKYSKAKNLDIDPKLQEYLGKFRRLEDFRVDPPMGEEGNHKRHSVVAGGGGGEG\nRKRCPSQSSSRPAAMISQPPTPPTGQPVREDAKWTSQHSNTQTLGK\n\n"
nchar(larp)
## [1] 1172
nchar(larp_clean)
## [1] 1096
cat(shrooms)
## >CAA78718.1 apical protein [Xenopus laevis]
## MSAFGNTIERWNIKSTGVIAGLGHSERISPVRSMTTLVDSAYSSFSGSSYVPEYQNSFQHDGCHYNDEQL
## SYMDSEYVRAIYNPSLLDKDGVYNDIVSEHGSSKVALSGRSSSSLCSDNTTSVHRTSPAKLDNYVTNLDS
## EKNIYGDPINMKHKQNRPNHKAYGLQRNSPTGINSLQEKENQLYNPSNFMEIKDNYFGRSLDVLQADGDI
## MTQDSYTQNALYFPQNQPDQYRNTQYPGANRMSKEQFKVNDVQKSNEENTERDGPYLTKDGQFVQGQYAS
## DVRTSFKNIRRSLKKSASGKIVAHDSQGSCWIMKPGKDTPSFNSEGTITDMDYDNREQWDIRKSRLSTRA
## SQSLYYESNEDVSGPPLKAMNSKNEVDQTLSFQKDATVKSIPLLSQQLQQEKCKSHPLSDLNCEKITKAS
## TPMLYHLAGGRHSAFIAPVHNTNPAQQEKLKLESKTLERMNNISVLQLSEPRPDNHKLPKNKSLTQLADL
## HDSVEGGNSGNLNSSAEESLMNDYIEKLKVAQKKVLRETSFKRKDLQMSLPCRFKLNPPKRPTIDHFRSY
## SSSSANEESAYLQTKNSADSSYKKDDTEKVAVTRIGGRKRITKEQKKLCYSEPEKLDHLGIQKSNFAWKE
## EPTFANRREMSDSDISANRIKYLESKERTNSSSNLSKTELKQIQHNALVQYMERKTNQRPNSNPQVQMER
## TSLGLPNYNEWSIYSSETSSSDASQKYLRRRSAGASSSYDATVTWNDRFGKTSPLGRSAAEKTAGVQRKT
## FSDQRTLDGSQEHLEGSSPSLSQKTSKSTHNEQVSYVNMEFLPSSHSKNHMYNDRLTVPGDGTSAESGRM
## FVSKSRGKSMEEIGTTDIVKLAELSHSSDQLYHIKGPVISSRLENTRTTAASHQDRLLASTQIETGNLPR
## QTHQESVVGPCRSDLANLGQEAHSWPLRASDVSPGTDNPCSSSPSAEVQPGAPEPLHCLQTEDEVFTPAS
## TARNEEPNSTAFSYLLSTGKPVSQGEATALSFTFLPEQDRLEHPIVSETTPSSESDENVSDAAAEKETTT
## TQLPETSNVNKPLGFTVDNQEVEGDGEPMQPEFIDSSKQLELSSLPSSQVNIMQTAEPYLGDKNIGNEQK
## TEDLEQKSKNPEEDDLPKVKLKSPEDEILEELVKEIVAKDKSLLNCLQPVSVRESAMDLMKSLFPMDVTA
## AEKSRTRGLLGKDKGETLKKNNSDLESSSKLPSKITGMLQKRPDGESLDDITLKKMELLSKIGSKLEDLC
## EQREFLLSDISKNTTNGNNMQTMVKELCKPNEFERYMMFIGDLEKVVSLLFSLSTRLTRVENSLSKVDEN
## TDAEEMQSLKERHNLLSSQREDAKDLKANLDRREQVVTGILVKYLNEEQLQDYKHFVRLKTSLLIEQKNL
## EEKIKVYEEQFESIHNSLPP
##
## >NP_597713.2 protein Shroom1 isoform 2 [Homo sapiens]
## MEALGPGGDRASPASSTSSLDLWHLSMRADSAYSSFSAASGGPEPRTQSPGTDLLPYLDWDYVRVVWGGP
## GPAPPDAALCTSPRPRPAVAARSGPQPTEVPGTPGPLNRQATPLLYALAAEAEAAAQAAEPPSPPASRAA
## YRQRLQGAQRRVLRETSFQRKELRMSLPARLRPTVPARPPATHPRSASLSHPGGEGEPARSRAPAPGTAG
## RGPLANQQRKWCFSEPGKLDRVGRGGGPARECLGEACSSSGLPGPEPLEFQHPALAKFEDHEVGWLPETQ
## PQGSMNLDSGSLKLGDAFRPASRSRSASGEVLGSWGGSGGTIPIVQAVPQGAETPRPLFQTKLSRFLPQK
## EAAVMYPAELPQSSPADSEQRVSETCIVPAWLPSLPDEVFLEEAPLVRMRSPPDPHASQGPPASVHASDQ
## PYGTGLGQRTGQVTVPTEYPLHECPGTAGADDCWQGVNGSVGISRPTSHTPTGTANDNIPTIDPTGLTTN
## PPTAAESDLLKPVPADALGLSGNDTPGPSHNTALARGTGQPGSRPTWPSQCLEELVQELARLDPSLCDPL
## ASQPSPEPPLGLLDGLIPLAEVRAAMRPACGEAGEEAASTFEPGSYQFSFTQLLPAPREETRLENPATHP
## VLDQPCGQGLPAPNNSIQGKKVELAARLQKMLQDLHTEQERLQGEAQAWARRQAALEAAVRQACAPQELE
## RFSRFMADLERVLGLLLLLGSRLARVRRALARAASDSDPDEQRLRLLQRQEEDAKELKEHVARRERAVRE
## VLVRALPVEELRVYCALLAGKAAVLAQQRNLDERIRLLQDQLDAIRDDLGHHAPSPSPARPPGTCPPVQP
## PFPLLLT
##
## >CAA58534.1 APXL [Homo sapiens]
## MEGAEPRARPERLAEAETRAADGGRLVEVQLSGGAPWGFTLKGGREHGEPLVITKIEEGSKAAAVDKLLA
## GDEIVGINDIGLSGFRQEAICLVKGSHKTLKLVVKRRSELGWRPHSWHATKFSDSHPELAASPFTSTSGC
## PSWSGRHHASSSSHDLSSSWEQTNLQRTLDHFSSLGSVDSLDHPSSRLSVAKSNSSIDHLGSHSKRDSAY
## GSFSTSSSTPDHTLSKADTSSAENILYTVGLWEAPRQGGRQAQAAGDPQGSEEKLSCFPPRVPGDSGKGP
## RPEYNAEPKLAAPGRSNFGPVWYVPDKKKAPSSPPPPPPPLRSDSFAATKSHEKAQGPVFSEAAAAQHFT
## ALAQAQPRGDRRPELTDRPWRSAHPGSLGKGSGGPGCPQEAHADGSWPPSKDGASSRLQASLSSSDVRFP
## QSPHSGRHPPLYSDHSPLCADSLGQEPGAASFQNDSPPQVRGLSSCDQKLGSGWQGPRPCVQGDLQAAQL
## WAGCWPSDTALGALESLPPPTVGQSPRHHLPQPEGPPDARETGRCYPLDKGAEGCSAGAQEPPRASRAEK
## ASQRLAASITWADGESSRICPQETPLLHSLTQEGKRRPESSPEDSATRPPPFDAHVGKPTRRSDRFATTL
## RNEIQMHRAKLQKSRSTVALTAAGEAEDGTGRWRAGLGGGTQEGPLAGTYKDHLKEAQARVLRATSFKRR
## DLDPNPGDLYPESLEHRMGDPDTVPHFWEAGLAQPPSSTSGGPHPPRIGGRRRFTAEQKLKSYSEPEKMN
## EVGLTRGYSPHQHPRTSEDTVGTFADRWKFFEETSKPVPQRPAQKQALHGIPRDKPERPRTAGRTCEGTE
## PWSRTTSLGDSLNAHSAAEKAGTSDLPRRLGTFAEYQASWKEQRKPLEARSSGRCHSADDILDVSLDPQE
## RPQHVHGRSRSSPSTDHYKQEASVELRRQAGDPGEPREELPSAVRAEEGQSTPRQADAQCREGSPGSQQH
## PPSQKAPNPPTFSELSHCRGAPELPREGRGRAGTLPRDYRYSEESTPADLGPRAQSPGSPLHARGQDSWP
## VSSALLSKRPAPQRPPPPKREPRRYRATDGAPADAPVGVLGRPFPTPSPASLDVYVARLSLSHSPSVFSS
## AQPQDTPKATVCERGSQHVSGDASRPLPEALLPPKQQHLRLQTATMETSRSPSPQFAPQKLTDKPPLLIQ
## DEDSTRIERVMDNNTTVKMVPIKIVHSESQPEKESRQSLACPAEPPALPHGLEKDQIKTLSTSEQFYSRF
## CLYTRQGAEPEAPHRAQPAEPQPLGTQVPPEKDRCTSPPGLSYMKAKEKTVEDLKSEELAREIVGKDKSL
## ADILDPSVKIKTTMDLMEGIFPKDEHLLEEAQQRRKLLPKIPSPRSTEERKEEPSVPAAVSLATNSTYYS
## TSAPKAELLIKMKDLQEQQEHEEDSGSDLDHDLSVKKQELIESISRKLQVLREARESLLEDVQANTVLGA
## EVEAIVKGVCKPSEFDKFRMFIGDLDKVVNLLLSLSGRLARVENALNNLDDGASPGDRQSLLEKQRVLIQ
## QHEDAKELKENLDRRERIVFDILANYLSEESLADYEHFVKMKSALIIEQRELEDKIHLGEEQLKCLLDSL
## QPERGK
##
## >ABD19518.1 Apxl protein [Mus musculus]
## MEGAEPRARPERLAEAEAPATDGVRLVEVQLSGGAPWGFTLKGGREHGEPLVITKIEEGSKAAAVDKLLA
## GDEIVAINDVSLSGFRQEAICLVKGSHKTLKLVVKRKSDPSWRPHSWHATKYFDVHPEPAASLFLNTSGS
## PSWKSQHQASSSSHDLSGSWEHTSLQRTSDHFSSMGSIDSLDHSSQLYPSGHLSSAKSNSSIDHLGGHSK
## RDSAYGSFSTCSSTPDHTLPKADASSTENILYKVGLWEASRPGSSRQSQSTGDPQGLQDRPSCSIPRVPG
## NSSKSPRPEDNVEPKIATHGRSNFGPVWYVPDKKKAPSPPPLGLPLRSDSFSVAARGHEKARGPPFSDLA
## SMQHFITLPHVQPRGDHRMETTDRQWKLTHLSSGKEIGNVGYQSEGHLDCRWLCSDDRAGRPSGPPGRLQ
## FSDVHFLKSYHGSQHQQQCSDESPRAPSSPRELLHITSGGGLQEPPEPSQDDNPTQVRWPGSAHQKLDDR
## GRSHYFPGSLRQPVQGSAQVVIPRGDYWHSDTTPVDLEYPLLRPVGQRTYLQQHEETPASHEKEGYHQLN
## AGIEGCCSGIQEPPRASRTVRTGLQCPSNDFKLVDGESGRISCQRTPMLHSLTQDGTWRPGNSKDCGNDK
## PPLFDAQVGKPTRRSDRFATTLRNEIQMRRAKLQKSKSTVTLAGDSEAEDCAGDWRADVGAVPEGSFPST
## YKEHLKEAQTRVLKATSFQRRDLDPTPADQYSGPSEHRTFDHSASSSLSSFPGEPDSAPRFCETGLAKAP
## SSGVGVPHVLRIGGRKRFTAEQKLKSYSEPEKINEVGLSGDHRPHPTVRTPEDTVGTFADRWKFFEETSK
## SLLQKAGHRQVHCGLPXEKAERPQTGHHECESTEPWFQKRSLATSCGEILSDRKVEKASEKLNPPRRLGT
## FAEYQASWKEQKKPLEARSSGRYHSADDILDAGLDQQQRPQYIHERSRSSPSTDHYSQEVPVEPNRQAED
## SGDHKEAILCTLQAEEGCSAPSAQPQDSQHVNEDTTFPQPETQLSSKCQHLQTSAMETSRSPSPQFAPQK
## LTDKPPLLIHEDNSARIERVMDNNTTVKMVPIKIVHSESQPEKESRQSLSCPAELPPLPSGLERDQIKTL
## STSEQCYSRFCVYTRQEVEAPHRARPPEPRPPXTPAPPVRDSCSSPPSLNYGKAKEKTMDDLKSEELARE
## IVGKDKSLADILDPSVKIKTTMDLMEGIFPKDEYLLKEAQQRRKLLPKSPYPEHRGQETGPRYARGCVLG
## HLSTYYSTSAPKAELLIKMKDLQEPEEYSAGDLDHDLSVKKQELIDSISRKLQVLREARESLLEDIQANN
## ALGDEVEAIVKDVCKPNEFDKFRMFIGDLDKVVNLLLSLSGRLARVENALNNLDDNPSPGDRQSLLEKQR
## VLTQQHEDAKELKENLDRRERIVFDILATYLSEENLADYEHFVKMKSALIIEQRELEDKIHLGEEQLKCL
## FDSLQPERSK
##
## >AAF13269.1 PDZ domain actin binding protein Shroom [Mus musculus]
## MKTPENLEEPSATPNPSRTPTERFVYLEALLEGGAPWGFTLKGGLERGEPLIISKIEEGGKADSVSSGLQ
## AGDEVIHINEVALSSPRREAVSLVKGSYKTLRLVVRRDVCAAPGHADPGTSKSLSSELLTCSPQHRKATW
## SGGVKLRLKQRCSEPATRPHSWHTTKFGETQPDVSMMQISQGTMGPPWHQSYHSSSSTSDLSNYDHAYLR
## RSPDQCSSQGSMESLEPSGGYPPCHLLSPAKSTSSIDQLGHLHNKRDSAYSSFSTSSSIFEYPPPGGSAR
## ERSGSMDVISARGGLLEGMRQADIRYVKTVYDTRRGVSSEYEVNPSALLLQGRDAHASADSQGCAKWHSI
## PRGKGTPSPSWSQQCSGSLETATDNLPQKAGAPLPPTRSDSYAAFRHRERPSSWSSLDHKRFCRPQTNSS
## GSQKTPFAEDQLHTVPERSPENSPPVKSKHNYTQKAQPGQPLLPTGIYPVPSPEPHFAQVPQPSVSSNGT
## VYPALVKESGYTAAQGTCNKMATLDENGNQNEASRPGFAFCQPLEHNSVTPVEKRPEPTAKYIYKVHFSS
## VPENEDSSLKRHITPPHGHSPYPSERKNIHGGSRACSNHHSLSSPQAQALHVGDDRKPSRLSQPWEGDFQ
## EDHNANLRQKVEREGQGQGLSGNSGRTRSAFSSLQNIPESLRRQSNVELGEAQEVHPGGRSKVEDPGRKA
## GASDIRGYLDRSVSYPRPEGKMNAVDSVHSADSRYEESPAPALPQTSGASQRRLSSSSSAAPQYRKPHCS
## VLEKVSRIEEREQGRHRPLSVGSSAYGPGNRPGRTGPTPSTSSSDLDDPKAGSVHFSESTEHLRNGEQNP
## PNGEAKQEEASRPQCSHLIRRAPADGRGPPARGGEPSRPEARLLRSQSTFQLYSEAEREASWSEDRPGTP
## ESPLLDAPFSRAYRNSIKDAQSRVLGATSFRRRDLEPGTPATSRPWRPRPASAHVGMRSPEAAVPSSSPH
## TPRERHSVTPAAPQAARRGPRRRLTVEQKKRSYSEPEKMNEVGVSEEAEPTPCGPPRPAQPRFSESTVAD
## RRRIFERDGKACSTLSLSGPELKQFQQNALADYIQRKTGKRPTGAASHTGGRAARARTERLPPGRPRGAR
## WPRLASACSLSSLREPEALPRKEHTHPSAADGPQAPRDRSSSFASGRLVGERRRWDPQVPRQLLSGANCE
## PRGVQRMDGAPGGPPSWGMVAGKAGKSKSAEDLLERSDTLAVPVHVRSKSSPTSDKKGQDVLLREGSNFG
## FVKDPCCLAGPGPRSLSCSDKGQNELALPLHHHTPCWNGSGCKATVASSAPPESSGAADHLKQRRAPGPR
## PLSAGMHGHFPDARAASLSSPLPSPVPSASPVPSSYRSQLAMDQQTGQQPPSSPASAVTQPTSPRSLELS
## SPAYGLGEGMWKRTSLPQRPPPPWVKWAHAVREDGLAEDTLAPEFANLKHYRNQPSRPSSCSTSDPDTPG
## RISLRISESALQPSPPPRGDYDDEVFMKDLHPKVTSSPTFEALPPPPPPSPPSEEPLVNGTDDFPPPPPP
## QALCEVLLDGEASTEAGSGPCRIPRVMVTREGHVPGAAHSEGSQIMTATPPQTSAKGSEAESNTPSSASA
## QPQLNGSPGKQLCPSQTRNLTYEPVERTQDLGKKTHAEPQKTSEDIRTEALAKEIVHQDKSLADILDPDS
## RMKTTMDLMEGLFPGDASVLMDSGAKRKALDITARRAGCEAKASDHKEAVSVLVNCPAYYSVSAAKAELL
## NKIKDMPEELQEEEGQEDVYEKKAELIGSLTHKLESLQEAKGSLLTDIKLNNALGEEVEALISELCKPNE
## FDKYKMFIGDLDKVVNLLLSLSGRLARVENVLRGLGEDASKEERSSLNEKRKVLAGQHEDARELKENLDR
## RERVVLDILANYLSAEQLQDYQHFVKMKSTLLIEQRKLDDKIKLGQEQVRCLLESLPSDFRPKAGAISLP
## PALTGHATPGGTSVFGGVFPTLTSPL
##
## >AAF13270.1 actin binding protein ShroomS [Mus musculus]
## MMQISQGTMGPPWHQSYHSSSSTSDLSNYDHAYLRRSPDQCSSQGSMESLEPSGGYPPCHLLSPAKSTSS
## IDQLGHLHNKRDSAYSSFSTSSSIFEYPPPGGSARERSGSMDVISARGGLLEGMRQADIRYVKTVYDTRR
## GVSSEYEVNPSALLLQGRDAHASADSQGCAKWHSIPRGKGTPSPSWSQQCSGSLETATDNLPQKAGAPLP
## PTRSDSYAAFRHRERPSSWSSLDHKRFCRPQTNSSGSQKTPFAEDQLHTVPERSPENSPPVKSKHNYTQK
## AQPGQPLLPTGIYPVPSPEPHFAQVPQPSVSSNGTVYPALVKESGYTAAQGTCNKMATLDENGNQNEASR
## PGFAFCQPLEHNSVTPVEKRPEPTAKYIYKVHFSSVPENEDSSLKRHITPPHGHSPYPSERKNIHGGSRA
## CSNHHSLSSPQAQALHVGDDRKPSRLSQPWEGDFQEDHNANLRQKVEREGQGQGLSGNSGRTRSAFSSLQ
## NIPESLRRQSNVELGEAQEVHPGGRSKVEDPGRKAGASDIRGYLDRSVSYPRPEGKMNAVDSVHSADSRY
## EESPAPALPQTSGASQRRLSSSSSAAPQYRKPHCSVLEKVSRIEEREQGRHRPLSVGSSAYGPGNRPGRT
## GPTPSTSSSDLDDPKAGSVHFSESTEHLRNGEQNPPNGEAKQEEASRPQCSHLIRRAPADGRGPPARGGE
## PSRPEARLLRSQSTFQLYSEAEREASWSEDRPGTPESPLLDAPFSRAYRNSIKDAQSRVLGATSFRRRDL
## EPGTPATSRPWRPRPASAHVGMRSPEAAVPSSSPHTPRERHSVTPAAPQAARRGPRRRLTVEQKKRSYSE
## PEKMNEVGVSEEAEPTPCGPPRPAQPRFSESTVADRRRIFERDGKACSTLSLSGPELKQFQQNALADYIQ
## RKTGKRPTGAASHTGGRAARARTERLPPGRPRGARWPRLASACSLSSLREPEALPRKEHTHPSAADGPQA
## PRDRSSSFASGRLVGERRRWDPQVPRQLLSGANCEPRGVQRMDGAPGGPPSWGMVAGKAGKSKSAEDLLE
## RSDTLAVPVHVRSKSSPTSDKKGQDVLLREGSNFGFVKDPCCLAGPGPRSLSCSDKGQNELALPLHHHTP
## CWNGSGCKATVASSAPPESSGAADHLKQRRAPGPRPLSAGMHGHFPDARAASLSSPLPSPVPSASPVPSS
## YRSQLAMDQQTGQQPPSSPASAVTQPTSPRSLELSSPAYGLGEGMWKRTSLPQRPPPPWVKWAHAVREDG
## LAEDTLAPEFANLKHYRNQPSRPSSCSTSDPDTPGRISLRISESALQPSPPPRGDYDDEVFMKDLHPKVT
## SSPTFEALPPPPPPSPPSEEPLVNGTDDFPPPPPPQALCEVLLDGEASTEAGSGPCRIPRVMVTREGHVP
## GAAHSEGSQIMTATPPQTSAKGSEAESNTPSSASAQPQLNGSPGKQLCPSQTRNLTYEPVERTQDLGKKT
## HAEPQKTSEDIRTEALAKEIVHQDKSLADILDPDSRMKTTMDLMEGLFPGDASVLMDSGAKRKALDITAR
## RAGCEAKASDHKEAVSVLVNCPAYYSVSAAKAELLNKIKDMPEELQEEEGQEDVYEKKAELIGSLTHKLE
## SLQEAKGSLLTDIKLNNALGEEVEALISELCKPNEFDKYKMFIGDLDKVVNLLLSLSGRLARVENVLRGL
## GEDASKEERSSLNEKRKVLAGQHEDARELKENLDRRERVVLDILANYLSAEQLQDYQHFVKMKSTLLIEQ
## RKLDDKIKLGQEQVRCLLESLPSDFRPKAGAISLPPALTGHATPGGTSVFGGVFPTLTSPL
##
## >NP_065910.3 protein Shroom3 [Homo sapiens]
## MMRTTEDFHKPSATLNSNTATKGRYIYLEAFLEGGAPWGFTLKGGLEHGEPLIISKVEEGGKADTLSSKL
## QAGDEVVHINEVTLSSSRKEAVSLVKGSYKTLRLVVRRDVCTDPGHADTGASNFVSPEHLTSGPQHRKAA
## WSGGVKLRLKHRRSEPAGRPHSWHTTKSGEKQPDASMMQISQGMIGPPWHQSYHSSSSTSDLSNYDHAYL
## RRSPDQCSSQGSMESLEPSGAYPPCHLSPAKSTGSIDQLSHFHNKRDSAYSSFSTSSSILEYPHPGISGR
## ERSGSMDNTSARGGLLEGMRQADIRYVKTVYDTRRGVSAEYEVNSSALLLQGREARASANGQGYDKWSNI
## PRGKGVPPPSWSQQCPSSLETATDNLPPKVGAPLPPARSDSYAAFRHRERPSSWSSLDQKRLCRPQANSL
## GSLKSPFIEEQLHTVLEKSPENSPPVKPKHNYTQKAQPGQPLLPTSIYPVPSLEPHFAQVPQPSVSSNGM
## LYPALAKESGYIAPQGACNKMATIDENGNQNGSGRPGFAFCQPLEHDLLSPVEKKPEATAKYVPSKVHFC
## SVPENEEDASLKRHLTPPQGNSPHSNERKSTHSNKPSSHPHSLKCPQAQAWQAGEDKRSSRLSEPWEGDF
## QEDHNANLWRRLEREGLGQSLSGNFGKTKSAFSSLQNIPESLRRHSSLELGRGTQEGYPGGRPTCAVNTK
## AEDPGRKAAPDLGSHLDRQVSYPRPEGRTGASASFNSTDPSPEEPPAPSHPHTSSLGRRGPGPGSASALQ
## GFQYGKPHCSVLEKVSKFEQREQGSQRPSVGGSGFGHNYRPHRTVSTSSTSGNDFEETKAHIRFSESAEP
## LGNGEQHFKNGELKLEEASRQPCGQQLSGGASDSGRGPQRPDARLLRSQSTFQLSSEPEREPEWRDRPGS
## PESPLLDAPFSRAYRNSIKDAQSRVLGATSFRRRDLELGAPVASRSWRPRPSSAHVGLRSPEASASASPH
## TPRERHSVTPAEGDLARPVPPAARRGARRRLTPEQKKRSYSEPEKMNEVGIVEEAEPAPLGPQRNGMRFP
## ESSVADRRRLFERDGKACSTLSLSGPELKQFQQSALADYIQRKTGKRPTSAAGCSLQEPGPLRERAQSAY
## LQPGPAALEGSGLASASSLSSLREPSLQPRREATLLPATVAETQQAPRDRSSSFAGGRRLGERRRGDLLS
## GANGGTRGTQRGDETPREPSSWGARAGKSMSAEDLLERSDVLAGPVHVRSRSSPATADKRQDVLLGQDSG
## FGLVKDPCYLAGPGSRSLSCSERGQEEMLPLFHHLTPRWGGSGCKAIGDSSVPSECPGTLDHQRQASRTP
## CPRPPLAGTQGLVTDTRAAPLTPIGTPLPSAIPSGYCSQDGQTGRQPLPPYTPAMMHRSNGHTLTQPPGP
## RGCEGDGPEHGVEEGTRKRVSLPQWPPPSRAKWAHAAREDSLPEESSAPDFANLKHYQKQQSLPSLCSTS
## DPDTPLGAPSTPGRISLRISESVLRDSPPPHEDYEDEVFVRDPHPKATSSPTFEPLPPPPPPPPSQETPV
## YSMDDFPPPPPHTVCEAQLDSEDPEGPRPSFNKLSKVTIARERHMPGAAHVVGSQTLASRLQTSIKGSEA
## ESTPPSFMSVHAQLAGSLGGQPAPIQTQSLSHDPVSGTQGLEKKVSPDPQKSSEDIRTEALAKEIVHQDK
## SLADILDPDSRLKTTMDLMEGLFPRDVNLLKENSVKRKAIQRTVSSSGCEGKRNEDKEAVSMLVNCPAYY
## SVSAPKAELLNKIKEMPAEVNEEEEQADVNEKKAELIGSLTHKLETLQEAKGSLLTDIKLNNALGEEVEA
## LISELCKPNEFDKYRMFIGDLDKVVNLLLSLSGRLARVENVLSGLGEDASNEERSSLYEKRKILAGQHED
## ARELKENLDRRERVVLGILANYLSEEQLQDYQHFVKMKSTLLIEQRKLDDKIKLGQEQVKCLLESLPSDF
## IPKAGALALPPNLTSEPIPAGGCTFSGIFPTLTSPL
##
## >ABD59319.1 shroom-like protein [Xenopus laevis]
## MMQVSQGTIGSPWHQAYHSSSSTSDLSGYNHEFLRRSPDQYSSRGSMESLDQASAAYHHHLPPAKSTNCI
## DQLVHLHNKRDSAYSSFSTNASIPEYRSSPFSKERSYSMESMHSRNSSGQEGIKHADIKYIKTVYDVQRG
## ISEEYEVNSSSVKNRNYSRQPAYNRHSIGPHGRLEQSRFFSESGGFERAAPMPPTRSDSYALTRHHERPN
## SWSSLDQNRNFRTPKAAGLHSTNTSSNAAQQPKHVHGDGHLHPVLERSPESSPLIKPKQVYSETPQPGQP
## MLPTGIYPVPAPEPHFAHAPQPPKNNNGRLYPALAKEGSYGAKSSEKVLPFSEPNKNEKDTQNLRSKSVG
## QYPMNHSVKEREKKQEGPTGFAHYKLHFTAGPDISTSSLTNDRNDQQPLRLDNIDINEQQKNGTKVAEEF
## SVYAHPAFQNEWSDSKTKQDIASSDIIGLHRNSLSSDAHGEHEYHNHFNIASSSHNKMDERSNRQADHRK
## KLESLSFTVHADEADGPSSNPLKPDESPSPSQKKSYDFTRRRLSSSSSQSSKTDGNKLSSVFDKVCKIEQ
## REHENHRSQFLCGNINQSGLSTRGQNNKGSFTMVEEIRNKFISQDQTPNPNEWRRLSSSHSNEKVTGMHQ
## LTRQGIVYGLQTGDAQKQMPEKQAEKMHSYNQEQNILQAVPDDDNRSFNSQTMPNKEDDWQCAAQDTLGF
## NRAYRNSVKDAQCKVLEATSYRRKDLEISPPHYKKPEKNVRPASAPFRKKSSSLSPHAPKERHSVTPTDN
## CASIQESQGVFFPSRIGAKRRITAEQKKRSYSEPEKMNEVGASESESAPLTVSKMEPVASFSENSVADRR
## RIFEREGKACSTINLSKPQLKQLQQNALADYIERKTGRRPSSQETRLLKERSQSTYFSGSIMDNQSMTST
## SSMNSLNEHNLSYRHREPLSKTGRVSSTLPPGLTGFFDLSSFENNPEYPENRSRSSSFAHQLRSERLLDH
## RSKVEFGKGRETNKPKEVSLQSDDDVIITSSRRHGKSASAEDLLDRLPQPPALHVRSRSSPASDMKSREY
## MSRQEVGNKTSYASASNKEIRSIKSNHFEQMSFTPSFKNHIDTGEDPVPENSSTIQRSAQLENQRNTKTQ
## SISGIYSPHPETKQEPLALPIHSVPAKVTQTSLAHATFDYITAEEYLYSGKRGKESASPTDNKEISDQEW
## CLPENSSSEDLNDPERFAKYTSAQRPQSFETKSGNSINETVQQNKSSGPTAGPKFSTSWKSNGMWSSGSS
## EAETTFNHGKISLHISESCLQPQSPMTGQEDEGDDEVFVKEQDTESFSGTFVPPSPPPFPPPSLEDALLK
## QRIEKFPLVPNTLDEIWENTEEASTQVKVKSNERYLQCASEYTASTESSGSYLLNSGITKRDTDGPLLRL
## SSIVPAPEPLASPVDPTKPIEEQETQPHGADTSILQSSEGNFNPSDSQSTLPHVRSELMSSEDAKSQELA
## KEIVTKDKSLANILDPDSRMKTTMDLMEGLFTKSSSALKEKNQKRKAKKQIDNIIAPESEXKEEKRETLD
## NASNYSAYYSTSAPKAELLRKMKTIHSQIGGKEEQFDVNEKKAELISSLTCKLEVLKDAKESLIDDIKLN
## NSLGEEVETQIETLCKPNEFDKYKMFIGDLDKVVNLLLSLSGRLARVENALSSLGEDASAEERKTWNEKK
## KQLCGQHEDARELKENLDRREKLVMDFLGNYLTGEEFAHYQHFVKMKSALLIEQRELDDKIKLGQEQLRC
## LTESLPSDYLISMKVSLPEERRSSLGNKSLPPPLTSSL
##
## >NP_065768.2 protein Shroom4 [Homo sapiens]
## MENRPGSFQYVPVQLQGGAPWGFTLKGGLEHCEPLTVSKIEDGGKAALSQKMRTGDELVNINGTPLYGSR
## QEALILIKGSFRILKLIVRRRNAPVSRPHSWHVAKLLEGCPEAATTMHFPSEAFSLSWHSGCNTSDVCVQ
## WCPLSRHCSTEKSSSIGSMESLEQPGQATYESHLLPIDQNMYPNQRDSAYSSFSASSNASDCALSLRPEE
## PASTDCIMQGPGPTKAPSGRPNVAETSGGSRRTNGGHLTPSSQMSSRPQEGYQSGPAKAVRGPPQPPVRR
## DSLQASRAQLLNGEQRRASEPVVPLPQKEKLSLEPVLPARNPNRFCCLSGHDQVTSEGHQNCEFSQPPES
## SQQGSEHLLMQASTKAVGSPKACDRASSVDSNPLNEASAELAKASFGRPPHLIGPTGHRHSAPEQLLASH
## LQHVHLDTRGSKGMELPPVQDGHQWTLSPLHSSHKGKKSPCPPTGGTHDQSSKERKTRQVDDRSLVLGHQ
## SQSSPPHGEADGHPSEKGFLDPNRTSRAASELANQQPSASGSLVQQATDCSSTTKAASGTEAGEEGDSEP
## KECSRMGGRRSGGTRGRSIQNRRKSERFATNLRNEIQRRKAQLQKSKGPLSQLCDTKEPVEETQEPPESP
## PLTASNTSLLSSCKKPPSPRDKLFNKSMMLRARSSECLSQAPESHESRTGLEGRISPGQRPGQSSLGLNT
## WWKAPDPSSSDPEKAHAHCGVRGGHWRWSPEHNSQPLVAAAMEGPSNPGDNKELKASTAQAGEDAILLPF
## ADRRKFFEESSKSLSTSHLPGLTTHSNKTFTQRPKPIDQNFQPMSSSCRELRRHPMDQSYHSADQPYHAT
## DQSYHSMSPLQSETPTYSECFASKGLENSMCCKPLHCGDFDYHRTCSYSCSVQGALVHDPCIYCSGEICP
## ALLKRNMMPNCYNCRCHHHQCIRCSVCYHNPQHSALEDSSLAPGNTWKPRKLTVQEFPGDKWNPITGNRK
## TSQSGREMAHSKTSFSWATPFHPCLENPALDLSSYRAISSLDLLGDFKHALKKSEETSVYEEGSSLASMP
## HPLRSRAFSESHISLAPQSTRAWGQHRRELFSKGDETQSDLLGARKKAFPPPRPPPPNWEKYRLFRAAQQ
## QKQQQQQQKQQEEEEEEEEEEEEEEEEEEEEAEEEEEELPPQYFSSETSGSCALNPEEVLEQPQPLSFGH
## LEGSRQGSQSVPAEQESFALHSSDFLPPIRGHLGSQPEQAQPPCYYGIGGLWRTSGQEATESAKQEFQHF
## SPPSGAPGIPTSYSAYYNISVAKAELLNKLKDQPEMAEIGLGEEEVDHELAQKKIQLIESISRKLSVLRE
## AQRGLLEDINANSALGEEVEANLKAVCKSNEFEKYHLFVGDLDKVVNLLLSLSGRLARVENALNSIDSEA
## NQEKLVLIEKKQQLTGQLADAKELKEHVDRREKLVFGMVSRYLPQDQLQDYQHFVKMKSALIIEQRELEE
## KIKLGEEQLKCLRESLLLGPSNF
##
## >AAK95579.1 SHAP-A, partial [Homo sapiens]
## MHFPSEAFSLSWHSGCNTSDVCVQWCPLSRHCSTEKSSSIGSMESLEQPGQATYESHLLPIDQNMYPNQR
## DSAYSSFSASSNASDCALSLRPEEPASTDCIMQGPGPTKAPSGRPNVAETSGGSRRTNGGHLTPSSQMSS
## RPQEGYQSGPAKAVRGPPQPPVRRDSLQASRAQLLNGEQRRASEPVVPLPQKEKLSLEPVLPARNPNRFC
## CLSGHDQVTSEGHQNCEFSQPPESSQQGSEHLLMQASTKAVGSPKACDRASSVDSNPLNEASAELAKASF
## GRPPHLIGPTGHRHSAPEQLLASHLQHVHLDTRGSKGMELPPVQDGHQWTLSPLHSSHKGKKSPCPPTGG
## THDQSSKERKTRQVDDRSLVLGHQSQSSPPHGEADGHPSEKGFLDPNRTSRAASELANQQPSASGSLVQQ
## ATDCSSTTKAASGTEAGEEGDSEPKECSRMGGRRSGGTRGRSIQNRRKSERFATNLRNEIQRRKAQLQKS
## KGPLSQLCDTKEPVEETQEPPESPPLTASNTSLLSSCKKPPSPRDKLFNKSMMLRARSSECLSQAPESHE
## SRTGLEGRISPGQRPGQSSLGLNTWWKAPDPSSSDPEKAHAHCGVRGGHWRWSPEHNSQPLVAAAMEGPS
## NPGDNKELKASTAQAGEDAILLPFADRRKFFEESSKSLSTSHLPGLTTHSNKTFTQRPKPIDQNFQPMSS
## SCRELRRHPMDQSYHSADQPYHA
##
## >ABA81834.1 LP13775p [Drosophila melanogaster]
## MKMRNHKENGNGSEMGESTKSLAKMEPENNNKISVVSVSKLLLKDSNGANSRSSNSNASFSSASVAGSVQ
## DDLPHHNSSSSQLGQQHGSSLDQCGLTQAGLEEYNNRSSSYYDQTAFHHQKQPSYAQSEGYHSYVSSSDS
## TSATPFLDKLRQESDLLSRQSHHWSENDLSSVCSNSVAPSPIPLLARQSHSHSHSHAHSHSNSHGHSHGH
## AHSASSSSSSNNNSNGSATNNNNNNSSESTSSTETLKWLGSMSDISEASHATGYSAISESVSSSQRIVHS
## SRVPTPKRHHSESVLYLHNNEEQGDSSPTASNSSQMMISEEANGEESPPSVQPLRIQHRHSPSYPPVHTS
## MVLHHFQQQQQQQQDYQHPSRHHTNQSTLSTQSSLLELASPTEKPRSLMGQSHSMGDLQQKNPHQNPMLG
## RSAGQQHKSSISVTISSSEAVVTIAPQPPAGKPSKLQLSLGKSEALSCSTPNMGEQSPTNSIDSYRSNHR
## LFPVSTYTEPVHSNTSQYVQHPKPQFSSGLHKSAKLPVITPAGATVQPTWHSVAERINDFERSQLGEPPK
## FAYLEPTKTHRLSNPALKALQKNAVQSYVERQQQQQKEEQQLLRPHSQSYQACHVERKSLPNNLSPIMVG
## LPTGSNSASTRDCSSPTPPPPPRRSGSLLPNLLRRSSSASDYAEFRELHQAQGQVKGPSIRNISNAEKIS
## FNDCGMPPPPPPPRGRLAVPTRRTSSATEYAPMRDKLLLQQAAALAHQQHHPQQHRHAQPPHVPPERPPK
## HPNLRVPSPELPPPPQSELDISYTFDEPLPPPPPPEVLQPRPPPSPNRRNCFAGASTRRTTYEAPPPTAI
## VAAKVPPLVPKKPTSLQHKHLANGGGGSRKRPHHATPQPILENVASPVAPPPPLLPRARSTAHDNVIASN
## LESNQQKRSNSKASYLPRQSLEKLNNTDPDHGIYKLTLTSNEDLVAHTKPSYGVTGKLPNNLPDVLPLGV
## KLHQQPKLQPGSPNGDANVTLRYGSNNNLTGNSPTVAPPPYYGGGQRYSTPVLGQGYGKSSKPVTPQQYT
## RSQSYDVKHTSAVTMPTMSQSHVDLKQAAHDLETTLEEVLPTATPTPTPTPTPTPPRLSPASSHSDCSLS
## TSSLECTINPIATPIPKPEAHIFRAEVISTTLNTNPLTTPPKPAMNRQESLRENIEKITQLQSVLMSAHL
## CDASLLGGYTTPLITSPTASFANEPLMTPPLPPSPPPPLEPEEEEEQEENDVHDKQPEIEELQLMQRSEL
## VLMVNPKPSTTDMACQTDELEDRDTDLEAAREEHQTRTTLQPRQRQPIELDYEQMSRELVKLLPPGDKIA
## DILTPKICKPTSQYVSNLYNPDVPLRLAKRDVGTSTLMRMKSITSSAEIRVVSVELQLAEPSEEPTNLIK
## QKMDELIKHLNQKIVSLKREQQTISEECSANDRLGQDLFAKLAEKVRPSEASKFRTHVDAVGNITSLLLS
## LSERLAQTESSLETRQQERGALESKRDLLYEQMEEAQRLKSDIERRGVSIAGLLAKNLSADMCADYDYFI
## NMKAKLIADARDLAVRIKGSEEQLSSLSDALVQSDC
##
## >EAA12598.4 AGAP008245-PA, partial [Anopheles gambiae str. PEST]
## IPFSSSPKNRSNSKASYLPRQPRDKLHSDPDHGSYKLTLTSNEDCINHNTGEIITASPKCNLPDVLPPGV
## KYSLYSTNNNNNNNNNSVSNNNSINNHHNGIKPKPHSAPIISTANSLKSLFNFSTSSSTTSTSSSDAAKD
## RDGPQTPATGPPPALVGNFEQQQRQHQHDATVLPPPTGGSTVAGAERAPNEPALDSEASSASTSTRDDDA
## LSSNDAAPATVPVVVVAKEQEGESCPSTAEPLVNGVGVGGVSEHTISGSPAALERVTKEINLSPVVGDAV
## ACEPPSPLPLQRTEIVLRVQAPTSEAASQTDSDDAGLARGFAELTIDCGRRAKDQQDATTVSQQCSNGAS
## TSVATSTTSPIGSPPGTPPSGKEQQGQKFFAPLSSSSSPPPPPPSTPRKLHPEEIDCDKLSHDLVSQLSP
## SDKLHTILAPKTFKSSSDYVSDLFNIQIAPRPLKKDASTATPTETTVANGRRSLSITASQRQQLVSKCKG
## EEAVKKNQEELVQRLGKKLLVLTNEQTNIAEESNANDLLGNDVALKVTQKVRPADASKFRSYVDDVGYIT
## MLLLSLSGRLARTDNALHMIDANHPDKKILEAKRERLLEQLDEAKQLKDDIDQRGATIARILEQSLTIEE
## YADYDYFINMKAKLIVDSREIADKIKLGEEQLAALKDTLVQSEC
##
## >XP_392427.4 PREDICTED: hypothetical protein LOC408897 [Apis mellifera]
## MTELQPSPPGYRVQDEAPGPPSCPPASYKYASHGHGSEAANFKSTSSSYPQEGYGGLKQSPSRTVPPNEY
## YRRRNDGRRSTENEHEAGNATKKATIPGYNESHKKNTTSYKNDSGYSESLGFDSYTLPLNERDEASPPPT
## PPVRDASSLKGVCYGPGHEKYPSWPSAPERHPDEDVHGSGHSGSHRSKSWTDHTNYPKEKPAQYTRPHTK
## RPNPAFTQQLKTVMERCEKIPAETFESRNRGNVTEEEPRLWPRVDREGKALGDAEYVVPSPPEREQPQTA
## QTLSHADLEAYVRSYQVDPQVSQVDIYRESTLTQAGLEEYTRVQHSQQASYAQSEGYHSYVSSVDSTTNT
## PFLDRLRRDSEAVAQRPTSTWEDSTSREGRDSVVTTSSGSASSSETLKWHGSMSDVSVSSGLPARQDRTS
## DRWHHGSLSDVSSVNGGVLSQKAGSNGCRDKWQGSMSDVSTSCGLSPTAKGRHGGEKWPDKWQVVSMNDR
## NKQSQGRSSLPAVINHCNASSNAADVSSAMRESKWQESSIDEESLVDKSQVSSSGATASMQWDNSMRIEG
## DKYGSLAQPMPQSPIRQQIGSTTPQSPENWNHPIHGSMSDVSQVNGLSCSKQLIAHSARVQTPQRHHSES
## VLYLDRERNQRKLYPVATTQPQLDSAQTSQRMPPALPSQQISVAERINELEKQQQQQQQQQQQQQQQQQQ
## QMRYTYLDPEKRHRVSDPTLKAIQKKALLSFYERHHQASWRSEPQLAQGSQTIAAPQSPPPQPPPRPRPP
## SSRRASSASDYASGAWRENGNRNQNQGNVGELSSPKHQHSNSCGSLSTDLLGPVIVGPAISIDDWVPERP
## PKKPHLRNVYNDRVPSPDLPPPSPPTVTENEVHDCDDPLPPPPPELSDDCFNDATTTATTATATAATTAH
## HHHQSSEESKIRDRSCDRHKLERHSIRRSKHGSKRDYEKLSSGKSSPNSAAKTSMSHQQAEQEQQHQFVR
## GGIALKHVESEMVLENGVSAGFATHRMVATGRSSLRYPSAQKLMMNGRVTPARRISEERGFSARPPAQLP
## DLISQRYTDSGNQRPAPQVPVEPRIIRQESMRVDGTRIDVSGTLLRNDSAQRLESSSGQRPQPQVAKNAD
## KSGSNNQTTRPNYLPVPENSKCASKYLETGNHGFTIGSPVKTGYEANSKMYPSESSPQKYHEPPKYVANN
## HHQSQRHSGDGTQRGNYYALPPKYIDAPKQKPQPQCPTDRYGGSSNASPSSPPPPPLAPRQNTASRKSLP
## PPPRPAPPHALQGSQSKASYLAYRRERGAPDTEGSYKRTMSPTSRLEDWPPPRDHDDPVLLRVTPHHPLQ
## HHHHHHHNNHHHQQQPELSKSHSVDALHHRLEERSAQQQQQQQQQQQQQQQQQQQQQQQQQQNSAEMIGK
## LSHDLNKKLQLNDARSRTSENNNNDNLEDRHQHHHYHHHHHHHYHQERRHDYEQRKERQSPQSIEVLNDR
## NRQLERERRKLAASCEPLSQNREKQNRSMEIPSSVVEENLFRERSATIEMTSQNIELLNRRNEKRTNVTS
## TTTSTITTCITTSSTTTAMTITTDSCWPRSLEVQSSIEASPVSEKPTLPTSSPPRSPDQVEGDSSRGAVP
## RRSGSCSSSSSSSSSSSSSSSSSSSSSSSSSSGKSSDLHISPRNLSNSFSKNENSFLNNRKVSSPTQTDN
## TGSSNRSSSPVRSNQTKEIELRMDRSSLSSKSRSGGRSSTSSVSSCVSKASSSSSSRNSPVEEDISFSPM
## SPCVSPQPGIEGLTLLQRTEVVLRVNTATSDVASQTDIPETTEIESSTVKIREILLCRKKLPEEIECEEL
## GRDLASQLNPNDKLVPLLVPAPEHKKPTDYVTGLFRVEATLHPRPKRRSSLEEPTTPCSDNGDEEKKHES
## IPSTPLSADSTSPLSPTSAYFTTSEGKARFLTRYSRDVTVEGSTRQEDVPPIIPTNSLDLRQKKEELMMS
## LDKKLVVLRAEQEAVREEGEVNEALGARVATRISAVARPAEASKYRLHVEEVGKITSLLLGLSGRLARAE
## NALYGMPAEHAERKILESKRDKLMDQLEEAKILKSNIDKRSVNVSTILSKYLNEEEFADYQHFINMKAKL
## IVDGREIQDKVKLGEEQLAALREAID
##
## >XP_783573.4 protein Shroom3 isoform X5 [Strongylocentrotus purpuratus]
## MMKDAMYPTTTSTTSSSVNPLPKEVAEQKPVNTKRVRKRESQPGSPRPKSWHTDVRTLSQPDLSRMPQHS
## RQRHGEQTQPRYRNPPPTQYNKFHSSSDSSFMMSSYEEKTGYHQHGRSTGNINNNSAEDTIEPLPGHVQK
## KREAFERTIMSQSTDKINTEDQYGDVYSKRYSKGKEAITQGVNPKLRNIRHDLEPAETYPKVVVATHIHS
## VQSKAVLGRVPDSSDQTGQKYGGAQDVNIYAVQMPERQIASSADSSVTRNYAQAHSNLSGNPQTSYVQST
## FGSNPHSSSFATHHGEIRKVPPAIPKREDSKTKTQAYSDHVKSSSWPVSTISSETTCTLTVCPTILPSDL
## PPVKLTKTEKLQKSPTHSVTQSNPNQNSNSTDQHIVQAKRIWIDDASEHEFNFEDSKMLSSDNTLNTNTR
## PPSPPVRDNENGNYKPKQSKTTRSSDDRFNTSDHLILDYRSFLEKTEQQQENLKSIQPVNSVPESKNKRE
## LFTRTHDFSKATQQEESSPMAQTQPARESSTRNSWYQEKKKQRKRSSLSSEDSLNFSEFDLNKKNLGQNP
## ARTWRNPSESRESTTSDLHPQLTGHPQQPQQQQPVEPRISSHSRQSSDLDNPNAPRKRTPISPSLMEETF
## RMEQEPEQKTFTEKVERTVNRQESRDSRKSGIFDQNDDCQLEKLQPDGNLSENSILRRLEREGSFKNNVN
## LDPQRSEGNETSARKTMPDTKRDLRNLGLSEDAFKQDHRPKSNHYKSGSFTHDSRGNAGDPRLMRTAPVP
## SSHQRTHSIDTYNRNPPRRHESFERRGNPVSRESSFSEHKKSKSDSDQHPKADQQKKKMSDPINKPQNVR
## KTSDPENRQQIWDALKGFVHNRRSPPGTSPASSRPPSMSGSEQSLYRSDMYHSTSSLASGYSSSRHYPQD
## SLSSIGSSFSHPLHQPQDSGFGSNSDISQVRGPHSPSQTGVVSPKDVRIAASIAHSSSMSSSGPQYQQTT
## NERRRSHQVQRPPHQTKHLTSRMSLDSINLPNSRNQQEKMRPNRSPQDKFEFSPTRQISPSPSYNVHVAE
## RVSISSLKEEESNKEGTVFYESLQSERTETEVNHRVFRYPPRSDQTSSSGGQRTSPKSSTVHPPVKSLSM
## DPSYQELELSPPPPPTPLSPLDGRGNMEFPPPPPELAPASNTKRSSPKQEPSEQTVRQATQGGSMPLTSP
## ERITPSFAEQLQQAPSLIHVQVDQVPSAKGEETTPSISPNSIISGRSSPDNHDVEPATSPQQVPRLQSVQ
## ENIAFTDRKDSPVLIRPATLLDSPTRCSQAEPEPLDVAEEEAFDACDGGSNICDSGSNIFTREQEKTDKE
## LREVSNPVLKWVLQALTPSDTVLSDLFPLPRSKTSRSDTMTMDVTTPTKSESEMVMEQSSPSECVNLVLS
## SSRYLRISPAKAIILQRAQTMNKSDDLGNNNTELRKTQEELVDRIGKKVEDIKDLQKEVAEEMSNLEDMG
## RQVMDSVKATCKASEYNKCNMYIADIERVTKLLLSLSRRLNKVESVLGSIENSEEEEKVNLEKLKVTVNS
## KYQDAKMLKESITGRHSTISSMLLNKISNDQHDNFTYYIQMLPRHLIMGQELEDKVKLGEEQLEALGESL
## KQMSLSSDSGSSRDTNGNVSHGFKEEAATSSSSNGIGGPEQLNSNATSSYC
shrooms_list <- compbio4all::entrez_fetch_list(db = "protein",
id = shroom_table$accession,
rettype = "fasta")
is(shrooms_list)
## [1] "list" "vector" "list_OR_List" "vector_OR_Vector"
## [5] "vector_OR_factor"
length(shrooms_list)
## [1] 14
nchar(shrooms_list)
## CAA78718 NP_597713 CAA58534 ABD19518 AAF13269 AAF13270 NP_065910 ABD59319
## 1486 915 1673 1543 2083 1895 2070 1864
## NP_065768 AAK95579 ABA81834 EAA12598 XP_392427 XP_783573
## 1560 778 1647 750 2230 1758
entrez_fetch_list
## function (db, id, rettype, ...)
## {
## n.seq <- length(id)
## list.output <- as.list(rep(NA, n.seq))
## names(list.output) <- id
## for (i in 1:length(id)) {
## list.output[[i]] <- rentrez::entrez_fetch(db = db, id = id[i],
## rettype = rettype)
## }
## return(list.output)
## }
## <bytecode: 0x7fb145e02588>
## <environment: namespace:compbio4all>
length(shrooms_list)
## [1] 14
for(i in 1:length(shrooms_list)){
shrooms_list[[i]] <- fasta_cleaner(shrooms_list[[i]], parse = F)
}
# creating a new vector with the same length as shrooms list
shrooms_vector <- rep(NA, length(shrooms_list))
# assigning every item in shrooms list to new-created vector
for(i in 1:length(shrooms_vector)){
shrooms_vector[i] <- shrooms_list[[i]]
}
# renaming all items in new vector
names(shrooms_vector) <- names(shrooms_list)
shrooms_vector_ss <- Biostrings::AAStringSet(shrooms_vector)
shrooms_vector_ss
## AAStringSet object of length 14:
## width seq names
## [1] 1420 MSAFGNTIERWNIKSTGVIAGLG...NLEEKIKVYEEQFESIHNSLPP CAA78718
## [2] 847 MEALGPGGDRASPASSTSSLDLW...PSPARPPGTCPPVQPPFPLLLT NP_597713
## [3] 1616 MEGAEPRARPERLAEAETRAADG...KIHLGEEQLKCLLDSLQPERGK CAA58534
## [4] 1480 MEGAEPRARPERLAEAEAPATDG...KIHLGEEQLKCLFDSLQPERSK ABD19518
## [5] 1986 MKTPENLEEPSATPNPSRTPTER...GHATPGGTSVFGGVFPTLTSPL AAF13269
## ... ... ...
## [10] 723 MHFPSEAFSLSWHSGCNTSDVCV...CRELRRHPMDQSYHSADQPYHA AAK95579
## [11] 1576 MKMRNHKENGNGSEMGESTKSLA...VRIKGSEEQLSSLSDALVQSDC ABA81834
## [12] 674 IPFSSSPKNRSNSKASYLPRQPR...DKIKLGEEQLAALKDTLVQSEC EAA12598
## [13] 2126 MTELQPSPPGYRVQDEAPGPPSC...REIQDKVKLGEEQLAALREAID XP_392427
## [14] 1661 MMKDAMYPTTTSTTSSSVNPLPK...SSSSNGIGGPEQLNSNATSSYC XP_783573
library(msa)
shrooms_align <-msa (shrooms_vector_ss,
method = "ClustalW")
## use default substitution matrix
shrooms_align
## CLUSTAL 2.1
##
## Call:
## msa(shrooms_vector_ss, method = "ClustalW")
##
## MsaAAMultipleAlignment with 14 rows and 2252 columns
## aln names
## [1] -------------------------...------------------------- NP_065768
## [2] -------------------------...------------------------- AAK95579
## [3] -------------------------...SVFGGVFPTLTSPL----------- AAF13269
## [4] -------------------------...SVFGGVFPTLTSPL----------- AAF13270
## [5] -------------------------...CTFSGIFPTLTSPL----------- NP_065910
## [6] -------------------------...NKS--LPPPLTSSL----------- ABD59319
## [7] -------------------------...------------------------- CAA58534
## [8] -------------------------...------------------------- ABD19518
## [9] -------------------------...LT----------------------- NP_597713
## [10] -------------------------...------------------------- CAA78718
## [11] -------------------------...------------------------- EAA12598
## [12] -------------------------...------------------------- ABA81834
## [13] MTELQPSPPGYRVQDEAPGPPSCPP...------------------------- XP_392427
## [14] -------------------------...AATSSSSNGIGGPEQLNSNATSSYC XP_783573
## Con -------------------------...------------------------- Consensus
shrooms_align
## CLUSTAL 2.1
##
## Call:
## msa(shrooms_vector_ss, method = "ClustalW")
##
## MsaAAMultipleAlignment with 14 rows and 2252 columns
## aln names
## [1] -------------------------...------------------------- NP_065768
## [2] -------------------------...------------------------- AAK95579
## [3] -------------------------...SVFGGVFPTLTSPL----------- AAF13269
## [4] -------------------------...SVFGGVFPTLTSPL----------- AAF13270
## [5] -------------------------...CTFSGIFPTLTSPL----------- NP_065910
## [6] -------------------------...NKS--LPPPLTSSL----------- ABD59319
## [7] -------------------------...------------------------- CAA58534
## [8] -------------------------...------------------------- ABD19518
## [9] -------------------------...LT----------------------- NP_597713
## [10] -------------------------...------------------------- CAA78718
## [11] -------------------------...------------------------- EAA12598
## [12] -------------------------...------------------------- ABA81834
## [13] MTELQPSPPGYRVQDEAPGPPSCPP...------------------------- XP_392427
## [14] -------------------------...AATSSSSNGIGGPEQLNSNATSSYC XP_783573
## Con -------------------------...------------------------- Consensus
# WHAT IS THE LINE BELOW DOING? (its tricky - do your best)
class(shrooms_align) <- "AAMultipleAlignment"
# WHAT IS THE LINE BELOW DOING? This is simpler
shrooms_align_seqinr <- msaConvert(shrooms_align, type = "seqinr::alignment")
library(ggmsa)
ggmsa::ggmsa(shrooms_align, # shrooms_align, NOT shrooms_align_seqinr
start = 2000,
end = 2100)

CISH_subset_dist <- seqinr::dist.alignment(shrooms_align_seqinr,
matrix = "identity")
is(CISH_subset_dist)
## [1] "dist" "oldClass"
class(CISH_subset_dist)
## [1] "dist"
CISH_subset_dist_alt <- matrix(data = NA,
nrow = 5,
ncol = 5)
distances <- c(0.8260049,
0.8478722, 0.9000568,
0.9244596, 0.9435187, 0.9372139,
0.9238779, 0.9370038, 0.9323225,0.9413209)
CISH_subset_dist_alt[lower.tri(CISH_subset_dist_alt)] <- distances
seqnames <- c("EAA12598","ABA81834","XP_392427", "XP_783573","CAA78718")
colnames(CISH_subset_dist_alt) <- seqnames
row.names(CISH_subset_dist_alt) <- seqnames
CISH_subset_dist_alt <- as.dist(CISH_subset_dist_alt)
CISH_subset_dist <- CISH_subset_dist_alt
CISH_subset_dist_rounded <- round(CISH_subset_dist,
digits = 3)
CISH_subset_dist_rounded
## EAA12598 ABA81834 XP_392427 XP_783573
## ABA81834 0.826
## XP_392427 0.848 0.944
## XP_783573 0.900 0.937 0.937
## CAA78718 0.924 0.924 0.932 0.941
tree_subset <- nj(CISH_subset_dist)
plot.phylo(tree_subset, main="Phylogenetic Tree",
type = "unrooted",
use.edge.length = F)
mtext(text = "Shroom family gene tree - UNrooted, no branch lengths")

# plot tree
plot.phylo(tree_subset, main="Phylogenetic Tree",
use.edge.length = F)
mtext(text = "Shroom family gene tree - rooted, no branch lenths")

# plot tree
plot.phylo(tree_subset, main="Phylogenetic Tree",
type = "unrooted",
use.edge.length = T)
# add label
mtext(text = "Shroom family gene tree - rooted, with branch lenths")
