Introduction

This code compiles summary information about the gene CISH cytokine inducible SH2 containing protein

It also generates alignments and a phylogeneitc tree to indicating the evolutionary relationship betweeen the human version of the gene and its homologs in other species.

n

library(BiocManager)
## Bioconductor version '3.13' is out-of-date; the current release version '3.14'
##   is available with R version '4.1'; see https://bioconductor.org/install
install("drawProteins")
## Bioconductor version 3.13 (BiocManager 1.30.16), R 4.1.1 (2021-08-10)
## Warning: package(s) not installed when version(s) same as current; use `force = TRUE` to
##   re-install: 'drawProteins'
## Old packages: 'backports', 'brio', 'broom', 'car', 'cli', 'conquer',
##   'corrplot', 'cpp11', 'crayon', 'credentials', 'crosstalk', 'data.table',
##   'desc', 'devtools', 'diffobj', 'digest', 'fs', 'generics', 'gert', 'glue',
##   'Hmisc', 'htmlTable', 'knitr', 'maps', 'Matrix', 'memoise', 'mgcv', 'mime',
##   'nloptr', 'openxlsx', 'pillar', 'pkgbuild', 'pkgload', 'plotly', 'rcmdcheck',
##   'RcppArmadillo', 'readr', 'remotes', 'rio', 'rlang', 'rsconnect',
##   'S4Vectors', 'sessioninfo', 'sp', 'stringi', 'testthat', 'tibble', 'tidyr',
##   'tinytex', 'tzdb', 'usethis', 'viridis', 'vroom', 'withr', 'xfun', 'xml2',
##   'yulab.utils'
library(drawProteins)
# github packages
library(compbio4all)
library(ggmsa)
## Registered S3 methods overwritten by 'ggalt':
##   method                  from   
##   grid.draw.absoluteGrob  ggplot2
##   grobHeight.absoluteGrob ggplot2
##   grobWidth.absoluteGrob  ggplot2
##   grobX.absoluteGrob      ggplot2
##   grobY.absoluteGrob      ggplot2
# CRAN packages
library(rentrez)
library(seqinr)
library(ape)
## 
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
## 
##     as.alignment, consensus
library(pander)


library(ggplot2)

library(msa)
## Loading required package: Biostrings
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: XVector
## Loading required package: GenomeInfoDb
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
## 
##     complement
## The following object is masked from 'package:seqinr':
## 
##     translate
## The following object is masked from 'package:base':
## 
##     strsplit
## 
## Attaching package: 'msa'
## The following object is masked from 'package:BiocManager':
## 
##     version
library(drawProteins)

## Biostrings
library(Biostrings)

library(HGNChelper)
HGNChelper::checkGeneSymbols(x = c("CISH"))
## Maps last updated on: Thu Oct 24 12:31:05 2019
##      x Approved Suggested.Symbol
## 1 CISH     TRUE             CISH
CISH_table<-c("NP_659508.1",    "Q9NSE2", "NA", "Homo sapiens",  "Human",      "CISH",
              "XP_003309858.1", "H2QMP4", "NA", "P.troglodytes", "Chimpanzee", "CISH",
              "NP_034025.1",    "Q62225", "NA", "Mus musculus",  "Mouse",      "CISH",
              "NP_113992.1",    "B1WBX9", "NA", "R.norvegicus",  "Rat",        "CISH",
              "NP_989957.1",    "Q9PW70", "NA", "G.gallus",      "birds",      "CISH",
              "NP_001107161.1", "A9ULE1", "NA", "X.tropicalis",  "frog ",      "cish",
              "XP_541873.3",    "F1PSF4", "NA", "C.lupus",       "wolf",       "CISH", 
              "NP_001070085.1", "Q08BW5", "NA", "D.rerio",       "zebrafish",  "cish",
              "XP_001097824.1", "NA",     "NA", "M.mulatta",     "Monkey",     "CISH")

refseq <- c("NP_659508.1","XP_003309858.1","NP_034025.1", "NP_113992.1", "NP_989957.1" ,"NP_001107161.1", "XP_541873.3","NP_001070085.1", "XP_001097824.1")

UniProt.id<-c("Q9NSE2", "H2QMP4","Q62225", "B1WBX9","Q9PW70","A9ULE1","F1PSF4","Q08BW5","NA" )

PDB <- c("NA","NA","NA","NA","NA","NA","NA","NA","NA","NA")

species <- c("Homo sapiens", "P.troglodytes", "Mus musculus", "R.norvegicus", "G.gallus", "X.tropicalis","C.lupus", "D.rerio",  "M.mulatta")

common.name <- c("Human","Chimpanzee","Mouse","Rat","birds", "frog ","wolf","zebrafish", "Monkey" )

gene.name <- c("CISH","CISH","CISH","CISH","CISH","cish","CISH","cish","CISH")

CISH_table_matrix<-matrix(CISH_table, byrow = T, nrow = 9)

CISH_table <- data.frame(CISH_table_matrix, stringsAsFactors = F)
names(CISH_table) <- c("refseq", "UniProt.id","PDB", "species", "common.name", "gene.name")


CISH_table
##           refseq UniProt.id PDB       species common.name gene.name
## 1    NP_659508.1     Q9NSE2  NA  Homo sapiens       Human      CISH
## 2 XP_003309858.1     H2QMP4  NA P.troglodytes  Chimpanzee      CISH
## 3    NP_034025.1     Q62225  NA  Mus musculus       Mouse      CISH
## 4    NP_113992.1     B1WBX9  NA  R.norvegicus         Rat      CISH
## 5    NP_989957.1     Q9PW70  NA      G.gallus       birds      CISH
## 6 NP_001107161.1     A9ULE1  NA  X.tropicalis       frog       cish
## 7    XP_541873.3     F1PSF4  NA       C.lupus        wolf      CISH
## 8 NP_001070085.1     Q08BW5  NA       D.rerio   zebrafish      cish
## 9 XP_001097824.1         NA  NA     M.mulatta      Monkey      CISH
pander::pander(CISH_table)
refseq UniProt.id PDB species common.name gene.name
NP_659508.1 Q9NSE2 NA Homo sapiens Human CISH
XP_003309858.1 H2QMP4 NA P.troglodytes Chimpanzee CISH
NP_034025.1 Q62225 NA Mus musculus Mouse CISH
NP_113992.1 B1WBX9 NA R.norvegicus Rat CISH
NP_989957.1 Q9PW70 NA G.gallus birds CISH
NP_001107161.1 A9ULE1 NA X.tropicalis frog cish
XP_541873.3 F1PSF4 NA C.lupus wolf CISH
NP_001070085.1 Q08BW5 NA D.rerio zebrafish cish
XP_001097824.1 NA NA M.mulatta Monkey CISH
CISH_list <- compbio4all::entrez_fetch_list(db = "protein", 
                          id = CISH_table$refseq, 
                          rettype = "fasta")
length(CISH_list)
## [1] 9
CISH_list[[1]]
## [1] ">NP_659508.1 cytokine-inducible SH2-containing protein isoform 2 [Homo sapiens]\nMVLCVQGPRPLLAVERTGQRPLWAPSLELPKPVMQPLPAGAFLEEVAEGTPAQTESEPKVLDPEEDLLCI\nAKTFSYLRESGWYWGSITASEARQHLQKMPEGTFLVRDSTHPSYLFTLSVKTTRGPTNVRIEYADSSFRL\nDSNCLSRPRILAFPDVVSLVQHYVASCTADTRSDSPDPAPTPALPMPKEDAPSDPALPAPPPATAVHLKL\nVQPFVRRSSARSLQHLCRLVINRLVADVDCLPLPRRMADYLRQYPFQL\n\n"
for(i in 1:length(CISH_list)){
  CISH_list[[i]] <- compbio4all::fasta_cleaner(CISH_list[[i]], parse = F)
}
Q9NSE2_json  <- drawProteins::get_features("Q9NSE2")
## [1] "Download has worked"
my_prot_df <- drawProteins::feature_to_dataframe(Q9NSE2_json)

my_canvas <- draw_canvas(my_prot_df)  
my_canvas <- draw_chains(my_canvas, my_prot_df, 
                         label_size = 2.5)
my_canvas <- draw_domains(my_canvas, my_prot_df)
my_canvas

CISH_vector <- fasta_cleaner(CISH_list)

par(mfrow = c(2,2), 
    mar = c(2,2,2,1))

dotPlot(CISH_vector, 
        CISH_vector, 
        wsize = 1, 
        nmatch = 1, 
        main = "CISH HUMAN VECTOR")

dotPlot(CISH_vector, 
        CISH_vector, 
        wsize = 10, 
        nmatch = 1, 
        main = "CISH - size = 10, nmatch = 1")

dotPlot(CISH_vector, 
        CISH_vector, 
        wsize = 10, 
        nmatch = 5, 
        main = "CISH - size = 10, nmatch = 15")

dotPlot(CISH_vector, 
        CISH_vector, 
        wsize = 20, 
        nmatch = 5, 
        main = "CISH - size = 20, nmatch = 5")

aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
            "L","K","M","F","P","S","T","W","Y","V")
## alpha proteins
alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91, 
           221, 249, 48, 123, 82, 122, 119, 33, 63, 167)
## beta proteins
beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120, 
          177, 115, 16, 85, 127, 341, 253, 44, 110, 229)
## alpha + beta
a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
              110, 112, 25, 52, 71, 126, 117, 30, 108, 123)
## alpha/beta
a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239, 
             339, 321, 91, 158, 188, 327, 238, 72, 130, 378)

data.frame(aa.1.1, alpha, beta, a.plus.b, a.div.b)
##    aa.1.1 alpha beta a.plus.b a.div.b
## 1       A   285  203      175     361
## 2       R    53   67       78     146
## 3       N    97  139      120     183
## 4       D   163  121      111     244
## 5       C    22   75       74      63
## 6       Q    67  122       74     114
## 7       E   134   86       86     257
## 8       G   197  297      171     377
## 9       H   111   49       33     107
## 10      I    91  120       93     239
## 11      L   221  177      110     339
## 12      K   249  115      112     321
## 13      M    48   16       25      91
## 14      F   123   85       52     158
## 15      P    82  127       71     188
## 16      S   122  341      126     327
## 17      T   119  253      117     238
## 18      W    33   44       30      72
## 19      Y    63  110      108     130
## 20      V   167  229      123     378
# convert them to frequencies
alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)

## dataframe
aa.prop <- data.frame(alpha.prop,
                      beta.prop,
                      a.plus.b.prop,
                      a.div.b)
## row labels
row.names(aa.prop) <- aa.1.1

pander::pander(aa.prop)
  alpha.prop beta.prop a.plus.b.prop a.div.b
A 0.1165 0.07313 0.09264 0.08331
R 0.02166 0.02414 0.04129 0.03369
N 0.03964 0.05007 0.06353 0.04223
D 0.06661 0.04359 0.05876 0.05631
C 0.008991 0.02702 0.03917 0.01454
Q 0.02738 0.04395 0.03917 0.02631
E 0.05476 0.03098 0.04553 0.05931
G 0.08051 0.107 0.09052 0.08701
H 0.04536 0.01765 0.01747 0.02469
I 0.03719 0.04323 0.04923 0.05516
L 0.09031 0.06376 0.05823 0.07824
K 0.1018 0.04143 0.05929 0.07408
M 0.01962 0.005764 0.01323 0.021
F 0.05027 0.03062 0.02753 0.03646
P 0.03351 0.04575 0.03759 0.04339
S 0.04986 0.1228 0.0667 0.07547
T 0.04863 0.09114 0.06194 0.05493
W 0.01349 0.01585 0.01588 0.01662
Y 0.02575 0.03963 0.05717 0.03
V 0.06825 0.08249 0.06511 0.08724
## determine the number of each amino acid in protein.
table(CISH_vector)
## CISH_vector
##  A  C  D  E  F  G  H  I  K  L  M  N  P  Q  R  S  T  V  W  Y 
## 23  6 14 14  8  8  5  5  7 32  5  3 32 11 20 21 15 19  3  7
CISH_human_table <- table(CISH_vector)/length(CISH_vector)

## A Function to convert a table into a vector
table_to_vector <- function(CISH_human_table){
  table_names <- attr(CISH_human_table, "dimnames")[[1]]
  table_vect <- as.vector(CISH_human_table)
  names(table_vect) <- table_names
  return(table_vect)
}

CISH.human.aa.freq <- table_to_vector(CISH_human_table)
CISH.human.aa.freq
##          A          C          D          E          F          G          H 
## 0.08914729 0.02325581 0.05426357 0.05426357 0.03100775 0.03100775 0.01937984 
##          I          K          L          M          N          P          Q 
## 0.01937984 0.02713178 0.12403101 0.01937984 0.01162791 0.12403101 0.04263566 
##          R          S          T          V          W          Y 
## 0.07751938 0.08139535 0.05813953 0.07364341 0.01162791 0.02713178
## Check for the presence of “U” (unknown aa.)
aa.names <- names(CISH.human.aa.freq)
any(aa.names == "U")
## [1] FALSE
i.U <- which(aa.names == "U")
aa.names[i.U]
## character(0)
CISH.human.aa.freq[i.U]
## named numeric(0)
aa.prop$CISH.human.aa.freq <- CISH.human.aa.freq
pander::pander(aa.prop)
  alpha.prop beta.prop a.plus.b.prop a.div.b CISH.human.aa.freq
A 0.1165 0.07313 0.09264 0.08331 0.08915
R 0.02166 0.02414 0.04129 0.03369 0.02326
N 0.03964 0.05007 0.06353 0.04223 0.05426
D 0.06661 0.04359 0.05876 0.05631 0.05426
C 0.008991 0.02702 0.03917 0.01454 0.03101
Q 0.02738 0.04395 0.03917 0.02631 0.03101
E 0.05476 0.03098 0.04553 0.05931 0.01938
G 0.08051 0.107 0.09052 0.08701 0.01938
H 0.04536 0.01765 0.01747 0.02469 0.02713
I 0.03719 0.04323 0.04923 0.05516 0.124
L 0.09031 0.06376 0.05823 0.07824 0.01938
K 0.1018 0.04143 0.05929 0.07408 0.01163
M 0.01962 0.005764 0.01323 0.021 0.124
F 0.05027 0.03062 0.02753 0.03646 0.04264
P 0.03351 0.04575 0.03759 0.04339 0.07752
S 0.04986 0.1228 0.0667 0.07547 0.0814
T 0.04863 0.09114 0.06194 0.05493 0.05814
W 0.01349 0.01585 0.01588 0.01662 0.07364
Y 0.02575 0.03963 0.05717 0.03 0.01163
V 0.06825 0.08249 0.06511 0.08724 0.02713
# Corrleation used in Chou adn Zhange 1992.
chou_cor <- function(x,y){
  numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)
}

# Cosine similarity used in Higgs and Attwood (2005). 
chou_cosine <- function(z.1, z.2){
  z.1.abs <- sqrt(sum(z.1^2))
  z.2.abs <- sqrt(sum(z.2^2))
  my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs)
  return(my.cosine)
}
# Calculate correlation between each column
corr.alpha <- chou_cor(aa.prop[,5], aa.prop[,1])
corr.beta  <- chou_cor(aa.prop[,5], aa.prop[,2])
corr.apb   <- chou_cor(aa.prop[,5], aa.prop[,3])
corr.adb   <- chou_cor(aa.prop[,5], aa.prop[,4])

# Calculate cosine similarity
cos.alpha <- chou_cosine(aa.prop[,5], aa.prop[,1])
cos.beta  <- chou_cosine(aa.prop[,5], aa.prop[,2])
cos.apb   <- chou_cosine(aa.prop[,5], aa.prop[,3])
cos.adb   <- chou_cosine(aa.prop[,5], aa.prop[,4])

# Calculate distance.
aa.prop.flipped <- t(aa.prop)
round(aa.prop.flipped,2)
##                       A    R    N    D    C    Q    E    G    H    I    L    K
## alpha.prop         0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09 0.10
## beta.prop          0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06 0.04
## a.plus.b.prop      0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06 0.06
## a.div.b            0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08 0.07
## CISH.human.aa.freq 0.09 0.02 0.05 0.05 0.03 0.03 0.02 0.02 0.03 0.12 0.02 0.01
##                       M    F    P    S    T    W    Y    V
## alpha.prop         0.02 0.05 0.03 0.05 0.05 0.01 0.03 0.07
## beta.prop          0.01 0.03 0.05 0.12 0.09 0.02 0.04 0.08
## a.plus.b.prop      0.01 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## a.div.b            0.02 0.04 0.04 0.08 0.05 0.02 0.03 0.09
## CISH.human.aa.freq 0.12 0.04 0.08 0.08 0.06 0.07 0.01 0.03
# distance matrix 
dist(aa.prop.flipped, method = "euclidean")
##                    alpha.prop  beta.prop a.plus.b.prop    a.div.b
## beta.prop          0.13342098                                    
## a.plus.b.prop      0.09281824 0.08289406                         
## a.div.b            0.06699039 0.08659174    0.06175113           
## CISH.human.aa.freq 0.21707581 0.20735229    0.19228630 0.19476457
# Individual distances using dist()
dist.alpha <- dist((aa.prop.flipped[c(1,5),]),  method = "euclidean")
dist.beta  <- dist((aa.prop.flipped[c(2,5),]),  method = "euclidean")
dist.apb   <- dist((aa.prop.flipped[c(3,5),]),  method = "euclidean")
dist.adb  <- dist((aa.prop.flipped[c(4,5),]), method = "euclidean")

# Compile the information. Rounding makes it easier to read
## fold types
fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")

## data
corr.sim <- round(c(corr.alpha,corr.beta,corr.apb,corr.adb),5)
cosine.sim <- round(c(cos.alpha,cos.beta,cos.apb,cos.adb),5)
Euclidean.dist <- round(c(dist.alpha,dist.beta,dist.apb,dist.adb),5)

## summary
sim.sum <- c("","","most.sim","")
dist.sum <- c("","","min.dist","")

df <- data.frame(fold.type,
           corr.sim ,
           cosine.sim ,
           Euclidean.dist ,
           sim.sum ,
           dist.sum )

# display output
pander::pander(df)
fold.type corr.sim cosine.sim Euclidean.dist sim.sum dist.sum
alpha 0.6639 0.6639 0.2171
beta 0.6968 0.6968 0.2074
alpha plus beta 0.7247 0.7247 0.1923 most.sim min.dist
alpha/beta 0.721 0.721 0.1948
names(CISH_list)
## [1] "NP_659508.1"    "XP_003309858.1" "NP_034025.1"    "NP_113992.1"   
## [5] "NP_989957.1"    "NP_001107161.1" "XP_541873.3"    "NP_001070085.1"
## [9] "XP_001097824.1"
length(CISH_list)
## [1] 9
CISH_list[1]
## $NP_659508.1
## [1] "MVLCVQGPRPLLAVERTGQRPLWAPSLELPKPVMQPLPAGAFLEEVAEGTPAQTESEPKVLDPEEDLLCIAKTFSYLRESGWYWGSITASEARQHLQKMPEGTFLVRDSTHPSYLFTLSVKTTRGPTNVRIEYADSSFRLDSNCLSRPRILAFPDVVSLVQHYVASCTADTRSDSPDPAPTPALPMPKEDAPSDPALPAPPPATAVHLKLVQPFVRRSSARSLQHLCRLVINRLVADVDCLPLPRRMADYLRQYPFQL"
# Make each entry of the list into a vector
CISH_table
##           refseq UniProt.id PDB       species common.name gene.name
## 1    NP_659508.1     Q9NSE2  NA  Homo sapiens       Human      CISH
## 2 XP_003309858.1     H2QMP4  NA P.troglodytes  Chimpanzee      CISH
## 3    NP_034025.1     Q62225  NA  Mus musculus       Mouse      CISH
## 4    NP_113992.1     B1WBX9  NA  R.norvegicus         Rat      CISH
## 5    NP_989957.1     Q9PW70  NA      G.gallus       birds      CISH
## 6 NP_001107161.1     A9ULE1  NA  X.tropicalis       frog       cish
## 7    XP_541873.3     F1PSF4  NA       C.lupus        wolf      CISH
## 8 NP_001070085.1     Q08BW5  NA       D.rerio   zebrafish      cish
## 9 XP_001097824.1         NA  NA     M.mulatta      Monkey      CISH
human <- unlist(CISH_list[1])
mouse <- unlist(CISH_list[2])
rat <- unlist(CISH_list[3])
frog <- unlist(CISH_list[4])
fruit_fly <- unlist(CISH_list[5])
roundworm <- unlist(CISH_list[6])
chimpanze <- unlist(CISH_list[7])
monkey <- unlist(CISH_list[8])
cattle <- unlist(CISH_list[9])
pest <- unlist(CISH_list[10])

# name the vector
## names(cul5_vector) <- names(cul5_list)
data(BLOSUM50)

align01.07 <- Biostrings::pairwiseAlignment(human, 
                                            chimpanze,
                                            substitutionMatrix = BLOSUM50, 
                                            gapOpening = -2, 
                                            gapExtension = -8, 
                                            scoreOnly = FALSE)
align01.02 <- Biostrings::pairwiseAlignment(human, 
                                            mouse,
                                            substitutionMatrix = BLOSUM50, 
                                            gapOpening = -2, 
                                            gapExtension = -8, 
                                            scoreOnly = FALSE)
align01.03 <- Biostrings::pairwiseAlignment(human, 
                                            rat,
                                            substitutionMatrix = BLOSUM50, 
                                            gapOpening = -2, 
                                            gapExtension = -8, 
                                            scoreOnly = FALSE)

align07.02 <- Biostrings::pairwiseAlignment(chimpanze, 
                                            mouse,
                                            substitutionMatrix = BLOSUM50, 
                                            gapOpening = -2, 
                                            gapExtension = -8, 
                                            scoreOnly = FALSE)
align07.03 <- Biostrings::pairwiseAlignment(chimpanze, 
                                            rat,
                                            substitutionMatrix = BLOSUM50, 
                                            gapOpening = -2, 
                                            gapExtension = -8, 
                                            scoreOnly = FALSE)

align02.03 <- Biostrings::pairwiseAlignment(mouse, 
                                            rat,
                                            substitutionMatrix = BLOSUM50, 
                                            gapOpening = -2, 
                                            gapExtension = -8, 
                                            scoreOnly = FALSE)

Biostrings::pid(align01.07)
## [1] 90.69767
Biostrings::pid(align01.02)
## [1] 99.6124
Biostrings::pid(align01.03)
## [1] 90.69767
pids <- c(1,NA,NA,NA,
          pid(align01.07),          1,     NA,     NA,
          pid(align01.02), pid(align07.02),      1,     NA,
          pid(align01.03), pid(align07.03), pid(align02.03), 1)

mat <- matrix(pids, nrow = 4, byrow = T)
row.names(mat) <- c("Homo","Pan","Mouse","Rat")   
colnames(mat) <- c("Homo","Pan","Mouse","Rat")   
pander::pander(mat)  
  Homo Pan Mouse Rat
Homo 1 NA NA NA
Pan 90.7 1 NA NA
Mouse 99.61 90.31 1 NA
Rat 90.7 89.11 91.09 1
human_PID1 <- pid(align01.07,type ="PID1" )
human_PID2 <- pid(align01.07,type ="PID2" )
human_PID3 <- pid(align01.07,type ="PID3" )
human_PID4 <- pid(align01.07,type ="PID4" )

method <- c("PID1","PID2","PID3","PID4")
PID_methods <- c(human_PID1,human_PID2,human_PID3,human_PID4)
denominator <- c("(aligned positions + internal gap positions)",
                 "(aligned positions)",
                 "(length shorter sequence)",
                 "(average length of the two sequences)")

PID_comparsions <- data.frame(method,PID=PID_methods,denominator)
PID_comparsions
##   method      PID                                  denominator
## 1   PID1 90.69767 (aligned positions + internal gap positions)
## 2   PID2 91.05058                          (aligned positions)
## 3   PID3 91.05058                    (length shorter sequence)
## 4   PID4 90.87379        (average length of the two sequences)
hShroom3 <- rentrez::entrez_fetch(db = "protein", 
                          id = "NP_065910", 
                          rettype = "fasta")

cat(hShroom3)
## >NP_065910.3 protein Shroom3 [Homo sapiens]
## MMRTTEDFHKPSATLNSNTATKGRYIYLEAFLEGGAPWGFTLKGGLEHGEPLIISKVEEGGKADTLSSKL
## QAGDEVVHINEVTLSSSRKEAVSLVKGSYKTLRLVVRRDVCTDPGHADTGASNFVSPEHLTSGPQHRKAA
## WSGGVKLRLKHRRSEPAGRPHSWHTTKSGEKQPDASMMQISQGMIGPPWHQSYHSSSSTSDLSNYDHAYL
## RRSPDQCSSQGSMESLEPSGAYPPCHLSPAKSTGSIDQLSHFHNKRDSAYSSFSTSSSILEYPHPGISGR
## ERSGSMDNTSARGGLLEGMRQADIRYVKTVYDTRRGVSAEYEVNSSALLLQGREARASANGQGYDKWSNI
## PRGKGVPPPSWSQQCPSSLETATDNLPPKVGAPLPPARSDSYAAFRHRERPSSWSSLDQKRLCRPQANSL
## GSLKSPFIEEQLHTVLEKSPENSPPVKPKHNYTQKAQPGQPLLPTSIYPVPSLEPHFAQVPQPSVSSNGM
## LYPALAKESGYIAPQGACNKMATIDENGNQNGSGRPGFAFCQPLEHDLLSPVEKKPEATAKYVPSKVHFC
## SVPENEEDASLKRHLTPPQGNSPHSNERKSTHSNKPSSHPHSLKCPQAQAWQAGEDKRSSRLSEPWEGDF
## QEDHNANLWRRLEREGLGQSLSGNFGKTKSAFSSLQNIPESLRRHSSLELGRGTQEGYPGGRPTCAVNTK
## AEDPGRKAAPDLGSHLDRQVSYPRPEGRTGASASFNSTDPSPEEPPAPSHPHTSSLGRRGPGPGSASALQ
## GFQYGKPHCSVLEKVSKFEQREQGSQRPSVGGSGFGHNYRPHRTVSTSSTSGNDFEETKAHIRFSESAEP
## LGNGEQHFKNGELKLEEASRQPCGQQLSGGASDSGRGPQRPDARLLRSQSTFQLSSEPEREPEWRDRPGS
## PESPLLDAPFSRAYRNSIKDAQSRVLGATSFRRRDLELGAPVASRSWRPRPSSAHVGLRSPEASASASPH
## TPRERHSVTPAEGDLARPVPPAARRGARRRLTPEQKKRSYSEPEKMNEVGIVEEAEPAPLGPQRNGMRFP
## ESSVADRRRLFERDGKACSTLSLSGPELKQFQQSALADYIQRKTGKRPTSAAGCSLQEPGPLRERAQSAY
## LQPGPAALEGSGLASASSLSSLREPSLQPRREATLLPATVAETQQAPRDRSSSFAGGRRLGERRRGDLLS
## GANGGTRGTQRGDETPREPSSWGARAGKSMSAEDLLERSDVLAGPVHVRSRSSPATADKRQDVLLGQDSG
## FGLVKDPCYLAGPGSRSLSCSERGQEEMLPLFHHLTPRWGGSGCKAIGDSSVPSECPGTLDHQRQASRTP
## CPRPPLAGTQGLVTDTRAAPLTPIGTPLPSAIPSGYCSQDGQTGRQPLPPYTPAMMHRSNGHTLTQPPGP
## RGCEGDGPEHGVEEGTRKRVSLPQWPPPSRAKWAHAAREDSLPEESSAPDFANLKHYQKQQSLPSLCSTS
## DPDTPLGAPSTPGRISLRISESVLRDSPPPHEDYEDEVFVRDPHPKATSSPTFEPLPPPPPPPPSQETPV
## YSMDDFPPPPPHTVCEAQLDSEDPEGPRPSFNKLSKVTIARERHMPGAAHVVGSQTLASRLQTSIKGSEA
## ESTPPSFMSVHAQLAGSLGGQPAPIQTQSLSHDPVSGTQGLEKKVSPDPQKSSEDIRTEALAKEIVHQDK
## SLADILDPDSRLKTTMDLMEGLFPRDVNLLKENSVKRKAIQRTVSSSGCEGKRNEDKEAVSMLVNCPAYY
## SVSAPKAELLNKIKEMPAEVNEEEEQADVNEKKAELIGSLTHKLETLQEAKGSLLTDIKLNNALGEEVEA
## LISELCKPNEFDKYRMFIGDLDKVVNLLLSLSGRLARVENVLSGLGEDASNEERSSLYEKRKILAGQHED
## ARELKENLDRRERVVLGILANYLSEEQLQDYQHFVKMKSTLLIEQRKLDDKIKLGQEQVKCLLESLPSDF
## IPKAGALALPPNLTSEPIPAGGCTFSGIFPTLTSPL
nchar(hShroom3)
## [1] 2070
mShroom3a <- entrez_fetch(db = "protein", 
                          id = "AAF13269", 
                          rettype = "fasta")

# Human shroom 2 (H. sapiens)
hShroom2 <- entrez_fetch(db = "protein", 
                          id = "CAA58534", 
                          rettype = "fasta")


# Sea-urchin shroom
sShroom <- entrez_fetch(db = "protein", 
                          id = "XP_783573", 
                          rettype = "fasta")

nchar(hShroom3)
## [1] 2070
nchar(mShroom3a)
## [1] 2083
nchar(sShroom)
## [1] 1758
nchar(hShroom2)
## [1] 1673
fasta_cleaner <- function(fasta_object, parse = TRUE){

  fasta_object <- sub("^(>)(.*?)(\\n)(.*)(\\n\\n)","\\4",fasta_object)
  fasta_object <- gsub("\n", "", fasta_object)

  if(parse == TRUE){
    fasta_object <- stringr::str_split(fasta_object,
                                       pattern = "",
                                       simplify = FALSE)
  }

  return(fasta_object[[1]])
}

fasta_cleaner
## function(fasta_object, parse = TRUE){
## 
##   fasta_object <- sub("^(>)(.*?)(\\n)(.*)(\\n\\n)","\\4",fasta_object)
##   fasta_object <- gsub("\n", "", fasta_object)
## 
##   if(parse == TRUE){
##     fasta_object <- stringr::str_split(fasta_object,
##                                        pattern = "",
##                                        simplify = FALSE)
##   }
## 
##   return(fasta_object[[1]])
## }
hShroom3  <- fasta_cleaner(hShroom3,  parse = F)
mShroom3a <- fasta_cleaner(mShroom3a, parse = F)
hShroom2  <- fasta_cleaner(hShroom2,  parse = F)
sShroom   <- fasta_cleaner(sShroom,   parse = F)


hShroom3
## [1] "MMRTTEDFHKPSATLNSNTATKGRYIYLEAFLEGGAPWGFTLKGGLEHGEPLIISKVEEGGKADTLSSKLQAGDEVVHINEVTLSSSRKEAVSLVKGSYKTLRLVVRRDVCTDPGHADTGASNFVSPEHLTSGPQHRKAAWSGGVKLRLKHRRSEPAGRPHSWHTTKSGEKQPDASMMQISQGMIGPPWHQSYHSSSSTSDLSNYDHAYLRRSPDQCSSQGSMESLEPSGAYPPCHLSPAKSTGSIDQLSHFHNKRDSAYSSFSTSSSILEYPHPGISGRERSGSMDNTSARGGLLEGMRQADIRYVKTVYDTRRGVSAEYEVNSSALLLQGREARASANGQGYDKWSNIPRGKGVPPPSWSQQCPSSLETATDNLPPKVGAPLPPARSDSYAAFRHRERPSSWSSLDQKRLCRPQANSLGSLKSPFIEEQLHTVLEKSPENSPPVKPKHNYTQKAQPGQPLLPTSIYPVPSLEPHFAQVPQPSVSSNGMLYPALAKESGYIAPQGACNKMATIDENGNQNGSGRPGFAFCQPLEHDLLSPVEKKPEATAKYVPSKVHFCSVPENEEDASLKRHLTPPQGNSPHSNERKSTHSNKPSSHPHSLKCPQAQAWQAGEDKRSSRLSEPWEGDFQEDHNANLWRRLEREGLGQSLSGNFGKTKSAFSSLQNIPESLRRHSSLELGRGTQEGYPGGRPTCAVNTKAEDPGRKAAPDLGSHLDRQVSYPRPEGRTGASASFNSTDPSPEEPPAPSHPHTSSLGRRGPGPGSASALQGFQYGKPHCSVLEKVSKFEQREQGSQRPSVGGSGFGHNYRPHRTVSTSSTSGNDFEETKAHIRFSESAEPLGNGEQHFKNGELKLEEASRQPCGQQLSGGASDSGRGPQRPDARLLRSQSTFQLSSEPEREPEWRDRPGSPESPLLDAPFSRAYRNSIKDAQSRVLGATSFRRRDLELGAPVASRSWRPRPSSAHVGLRSPEASASASPHTPRERHSVTPAEGDLARPVPPAARRGARRRLTPEQKKRSYSEPEKMNEVGIVEEAEPAPLGPQRNGMRFPESSVADRRRLFERDGKACSTLSLSGPELKQFQQSALADYIQRKTGKRPTSAAGCSLQEPGPLRERAQSAYLQPGPAALEGSGLASASSLSSLREPSLQPRREATLLPATVAETQQAPRDRSSSFAGGRRLGERRRGDLLSGANGGTRGTQRGDETPREPSSWGARAGKSMSAEDLLERSDVLAGPVHVRSRSSPATADKRQDVLLGQDSGFGLVKDPCYLAGPGSRSLSCSERGQEEMLPLFHHLTPRWGGSGCKAIGDSSVPSECPGTLDHQRQASRTPCPRPPLAGTQGLVTDTRAAPLTPIGTPLPSAIPSGYCSQDGQTGRQPLPPYTPAMMHRSNGHTLTQPPGPRGCEGDGPEHGVEEGTRKRVSLPQWPPPSRAKWAHAAREDSLPEESSAPDFANLKHYQKQQSLPSLCSTSDPDTPLGAPSTPGRISLRISESVLRDSPPPHEDYEDEVFVRDPHPKATSSPTFEPLPPPPPPPPSQETPVYSMDDFPPPPPHTVCEAQLDSEDPEGPRPSFNKLSKVTIARERHMPGAAHVVGSQTLASRLQTSIKGSEAESTPPSFMSVHAQLAGSLGGQPAPIQTQSLSHDPVSGTQGLEKKVSPDPQKSSEDIRTEALAKEIVHQDKSLADILDPDSRLKTTMDLMEGLFPRDVNLLKENSVKRKAIQRTVSSSGCEGKRNEDKEAVSMLVNCPAYYSVSAPKAELLNKIKEMPAEVNEEEEQADVNEKKAELIGSLTHKLETLQEAKGSLLTDIKLNNALGEEVEALISELCKPNEFDKYRMFIGDLDKVVNLLLSLSGRLARVENVLSGLGEDASNEERSSLYEKRKILAGQHEDARELKENLDRRERVVLGILANYLSEEQLQDYQHFVKMKSTLLIEQRKLDDKIKLGQEQVKCLLESLPSDFIPKAGALALPPNLTSEPIPAGGCTFSGIFPTLTSPL"
nchar(hShroom3)
## [1] 1996
library(Biostrings)
align.h3.vs.m3a <- Biostrings::pairwiseAlignment (
                  hShroom3,
                  mShroom3a)

align.h3.vs.m3a
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MMRTTEDFHKPSATLN-SNTATKGRYIYLEAFLE...KAGALALPPNLTSEPIPAGGCTFSGIFPTLTSPL
## subject: MK-TPENLEEPSATPNPSRTPTE-RFVYLEALLE...KAGAISLPPALTGHATPGGTSVFGGVFPTLTSPL
## score: 2189.934
Biostrings::pid(align.h3.vs.m3a)
## [1] 70.56511
align.h3.vs.h2 <- Biostrings::pairwiseAlignment(
                  hShroom3,
                  hShroom2)

score(align.h3.vs.h2)
## [1] -5673.853
Biostrings::pid(align.h3.vs.h2)
## [1] 33.83277
shroom_table <- c("CAA78718" , "X. laevis Apx" ,         "xShroom1",
            "NP_597713" , "H. sapiens APXL2" ,     "hShroom1",
            "CAA58534" , "H. sapiens APXL",        "hShroom2",
            "ABD19518" , "M. musculus Apxl" ,      "mShroom2",
            "AAF13269" , "M. musculus ShroomL" ,   "mShroom3a",
            "AAF13270" , "M. musculus ShroomS" ,   "mShroom3b",
            "NP_065910", "H. sapiens Shroom" ,     "hShroom3",
            "ABD59319" , "X. laevis Shroom-like",  "xShroom3",
            "NP_065768", "H. sapiens KIAA1202" ,   "hShroom4a",
            "AAK95579" , "H. sapiens SHAP-A" ,     "hShroom4b",
            #"DQ435686" , "M. musculus KIAA1202" ,  "mShroom4",
            "ABA81834" , "D. melanogaster Shroom", "dmShroom",
            "EAA12598" , "A. gambiae Shroom",      "agShroom",
            "XP_392427" , "A. mellifera Shroom" ,  "amShroom",
            "XP_783573" , "S. purpuratus Shroom" , "spShroom") 


# convert to matrix
shroom_table_matrix <- matrix(shroom_table,
                                  byrow = T,
                                  nrow = 14)
# convert to dataframe
shroom_table <- data.frame(shroom_table_matrix, 
                     stringsAsFactors = F)

# setting up columns of new created dataframe 
names(shroom_table) <- c("accession", "name.orig","name.new")

# Create simplified species names
shroom_table$spp <- "Homo"
shroom_table$spp[grep("laevis",shroom_table$name.orig)] <- "Xenopus"
shroom_table$spp[grep("musculus",shroom_table$name.orig)] <- "Mus"
shroom_table$spp[grep("melanogaster",shroom_table$name.orig)] <- "Drosophila"
shroom_table$spp[grep("gambiae",shroom_table$name.orig)] <- "mosquito"
shroom_table$spp[grep("mellifera",shroom_table$name.orig)] <- "bee"
shroom_table$spp[grep("purpuratus",shroom_table$name.orig)] <- "sea urchin"


shroom_table
##    accession              name.orig  name.new        spp
## 1   CAA78718          X. laevis Apx  xShroom1    Xenopus
## 2  NP_597713       H. sapiens APXL2  hShroom1       Homo
## 3   CAA58534        H. sapiens APXL  hShroom2       Homo
## 4   ABD19518       M. musculus Apxl  mShroom2        Mus
## 5   AAF13269    M. musculus ShroomL mShroom3a        Mus
## 6   AAF13270    M. musculus ShroomS mShroom3b        Mus
## 7  NP_065910      H. sapiens Shroom  hShroom3       Homo
## 8   ABD59319  X. laevis Shroom-like  xShroom3    Xenopus
## 9  NP_065768    H. sapiens KIAA1202 hShroom4a       Homo
## 10  AAK95579      H. sapiens SHAP-A hShroom4b       Homo
## 11  ABA81834 D. melanogaster Shroom  dmShroom Drosophila
## 12  EAA12598      A. gambiae Shroom  agShroom   mosquito
## 13 XP_392427    A. mellifera Shroom  amShroom        bee
## 14 XP_783573   S. purpuratus Shroom  spShroom sea urchin
shroom_table$accession
##  [1] "CAA78718"  "NP_597713" "CAA58534"  "ABD19518"  "AAF13269"  "AAF13270" 
##  [7] "NP_065910" "ABD59319"  "NP_065768" "AAK95579"  "ABA81834"  "EAA12598" 
## [13] "XP_392427" "XP_783573"
shrooms <- rentrez::entrez_fetch(db = "protein", 
                          id = shroom_table$accession, 
                          rettype = "fasta")

larp <- entrez_fetch(db = "protein", 
                          id = "NP_291029.2", 
                          rettype = "fasta")
larp_clean <- fasta_cleaner(larp, parse = F)
larp
## [1] ">NP_291029.2 la-related protein 1 isoform 2 [Homo sapiens]\nMATQVEPLLPGGATLLQAEEHGGLVRKKPPPAPEGKGEPGPNDVRGGEPDGSARRPRPPCAKPHKEGTGQ\nQERESPRPLQLPGAEGPAISDGEEGGGEPGAGGGAAGAAGAGRRDFVEAPPPKVNPWTKNALPPVLTTVN\nGQSPPEHSAPAKVVRAAVPKQRKGSKVGDFGDAINWPTPGEIAHKSVQPQSHKPQPTRKLPPKKDMKEQE\nKGEGSDSKESPKTKSDESGEEKNGDEDCQRGGQKKKGNKHKWVPLQIDMKPEVPREKLASRPTRPPEPRH\nIPANRGEIKGSESATYVPVAPPTPAWQPEIKPEPAWHDQDETSSVKSDGAGGARASFRGRGRGRGRGRGR\nGRGGTRTHFDYQFGYRKFDGVEGPRTPKYMNNITYYFDNVSSTELYSVDQELLKDYIKRQIEYYFSVDNL\nERDFFLRRKMDADGFLPITLIASFHRVQALTTDISLIFAALKDSKVVEIVDEKVRRREEPEKWPLPPIVD\nYSQTDFSQLLNCPEFVPRQHYQKETESAPGSPRAVTPVPTKTEEVSNLKTLPKGLSASLPDLDSENWIEV\nKKRPRPSPARPKKSEESRFSHLTSLPQQLPSQQLMSKDQDEQEELDFLFDEEMEQMDGRKNTFTAWSDEE\nSDYEIDDRDVNKILIVTQTPHYMRRHPGGDRTGNHTSRAKMSAELAKVINDGLFYYEQDLWAEKFEPEYS\nQIKQEVENFKKVNMISREQFDTLTPEPPVDPNQEVPPGPPRFQQVPTDALANKLFGAPEPSTIARSLPTT\nVPESPNYRNTRTPRTPRTPQLKDSSQTSRFYPVVKEGRTLDAKMPRKRKTRHSSNPPLESHVGWVMDSRE\nHRPRTASISSSPSEGTPTVGSYGCTPQSLPKFQHPSHELLKENGFTQHVYHKYRRRCLNERKRLGIGQSQ\nEMNTLFRFWSFFLRDHFNKKMYEEFKQLALEDAKEGYRYGLECLFRYYSYGLEKKFRLDIFKDFQEETVK\nDYEAGQLYGLEKFWAFLKYSKAKNLDIDPKLQEYLGKFRRLEDFRVDPPMGEEGNHKRHSVVAGGGGGEG\nRKRCPSQSSSRPAAMISQPPTPPTGQPVREDAKWTSQHSNTQTLGK\n\n"
nchar(larp)
## [1] 1172
nchar(larp_clean)
## [1] 1096
cat(shrooms)
## >CAA78718.1 apical protein [Xenopus laevis]
## MSAFGNTIERWNIKSTGVIAGLGHSERISPVRSMTTLVDSAYSSFSGSSYVPEYQNSFQHDGCHYNDEQL
## SYMDSEYVRAIYNPSLLDKDGVYNDIVSEHGSSKVALSGRSSSSLCSDNTTSVHRTSPAKLDNYVTNLDS
## EKNIYGDPINMKHKQNRPNHKAYGLQRNSPTGINSLQEKENQLYNPSNFMEIKDNYFGRSLDVLQADGDI
## MTQDSYTQNALYFPQNQPDQYRNTQYPGANRMSKEQFKVNDVQKSNEENTERDGPYLTKDGQFVQGQYAS
## DVRTSFKNIRRSLKKSASGKIVAHDSQGSCWIMKPGKDTPSFNSEGTITDMDYDNREQWDIRKSRLSTRA
## SQSLYYESNEDVSGPPLKAMNSKNEVDQTLSFQKDATVKSIPLLSQQLQQEKCKSHPLSDLNCEKITKAS
## TPMLYHLAGGRHSAFIAPVHNTNPAQQEKLKLESKTLERMNNISVLQLSEPRPDNHKLPKNKSLTQLADL
## HDSVEGGNSGNLNSSAEESLMNDYIEKLKVAQKKVLRETSFKRKDLQMSLPCRFKLNPPKRPTIDHFRSY
## SSSSANEESAYLQTKNSADSSYKKDDTEKVAVTRIGGRKRITKEQKKLCYSEPEKLDHLGIQKSNFAWKE
## EPTFANRREMSDSDISANRIKYLESKERTNSSSNLSKTELKQIQHNALVQYMERKTNQRPNSNPQVQMER
## TSLGLPNYNEWSIYSSETSSSDASQKYLRRRSAGASSSYDATVTWNDRFGKTSPLGRSAAEKTAGVQRKT
## FSDQRTLDGSQEHLEGSSPSLSQKTSKSTHNEQVSYVNMEFLPSSHSKNHMYNDRLTVPGDGTSAESGRM
## FVSKSRGKSMEEIGTTDIVKLAELSHSSDQLYHIKGPVISSRLENTRTTAASHQDRLLASTQIETGNLPR
## QTHQESVVGPCRSDLANLGQEAHSWPLRASDVSPGTDNPCSSSPSAEVQPGAPEPLHCLQTEDEVFTPAS
## TARNEEPNSTAFSYLLSTGKPVSQGEATALSFTFLPEQDRLEHPIVSETTPSSESDENVSDAAAEKETTT
## TQLPETSNVNKPLGFTVDNQEVEGDGEPMQPEFIDSSKQLELSSLPSSQVNIMQTAEPYLGDKNIGNEQK
## TEDLEQKSKNPEEDDLPKVKLKSPEDEILEELVKEIVAKDKSLLNCLQPVSVRESAMDLMKSLFPMDVTA
## AEKSRTRGLLGKDKGETLKKNNSDLESSSKLPSKITGMLQKRPDGESLDDITLKKMELLSKIGSKLEDLC
## EQREFLLSDISKNTTNGNNMQTMVKELCKPNEFERYMMFIGDLEKVVSLLFSLSTRLTRVENSLSKVDEN
## TDAEEMQSLKERHNLLSSQREDAKDLKANLDRREQVVTGILVKYLNEEQLQDYKHFVRLKTSLLIEQKNL
## EEKIKVYEEQFESIHNSLPP
## 
## >NP_597713.2 protein Shroom1 isoform 2 [Homo sapiens]
## MEALGPGGDRASPASSTSSLDLWHLSMRADSAYSSFSAASGGPEPRTQSPGTDLLPYLDWDYVRVVWGGP
## GPAPPDAALCTSPRPRPAVAARSGPQPTEVPGTPGPLNRQATPLLYALAAEAEAAAQAAEPPSPPASRAA
## YRQRLQGAQRRVLRETSFQRKELRMSLPARLRPTVPARPPATHPRSASLSHPGGEGEPARSRAPAPGTAG
## RGPLANQQRKWCFSEPGKLDRVGRGGGPARECLGEACSSSGLPGPEPLEFQHPALAKFEDHEVGWLPETQ
## PQGSMNLDSGSLKLGDAFRPASRSRSASGEVLGSWGGSGGTIPIVQAVPQGAETPRPLFQTKLSRFLPQK
## EAAVMYPAELPQSSPADSEQRVSETCIVPAWLPSLPDEVFLEEAPLVRMRSPPDPHASQGPPASVHASDQ
## PYGTGLGQRTGQVTVPTEYPLHECPGTAGADDCWQGVNGSVGISRPTSHTPTGTANDNIPTIDPTGLTTN
## PPTAAESDLLKPVPADALGLSGNDTPGPSHNTALARGTGQPGSRPTWPSQCLEELVQELARLDPSLCDPL
## ASQPSPEPPLGLLDGLIPLAEVRAAMRPACGEAGEEAASTFEPGSYQFSFTQLLPAPREETRLENPATHP
## VLDQPCGQGLPAPNNSIQGKKVELAARLQKMLQDLHTEQERLQGEAQAWARRQAALEAAVRQACAPQELE
## RFSRFMADLERVLGLLLLLGSRLARVRRALARAASDSDPDEQRLRLLQRQEEDAKELKEHVARRERAVRE
## VLVRALPVEELRVYCALLAGKAAVLAQQRNLDERIRLLQDQLDAIRDDLGHHAPSPSPARPPGTCPPVQP
## PFPLLLT
## 
## >CAA58534.1 APXL [Homo sapiens]
## MEGAEPRARPERLAEAETRAADGGRLVEVQLSGGAPWGFTLKGGREHGEPLVITKIEEGSKAAAVDKLLA
## GDEIVGINDIGLSGFRQEAICLVKGSHKTLKLVVKRRSELGWRPHSWHATKFSDSHPELAASPFTSTSGC
## PSWSGRHHASSSSHDLSSSWEQTNLQRTLDHFSSLGSVDSLDHPSSRLSVAKSNSSIDHLGSHSKRDSAY
## GSFSTSSSTPDHTLSKADTSSAENILYTVGLWEAPRQGGRQAQAAGDPQGSEEKLSCFPPRVPGDSGKGP
## RPEYNAEPKLAAPGRSNFGPVWYVPDKKKAPSSPPPPPPPLRSDSFAATKSHEKAQGPVFSEAAAAQHFT
## ALAQAQPRGDRRPELTDRPWRSAHPGSLGKGSGGPGCPQEAHADGSWPPSKDGASSRLQASLSSSDVRFP
## QSPHSGRHPPLYSDHSPLCADSLGQEPGAASFQNDSPPQVRGLSSCDQKLGSGWQGPRPCVQGDLQAAQL
## WAGCWPSDTALGALESLPPPTVGQSPRHHLPQPEGPPDARETGRCYPLDKGAEGCSAGAQEPPRASRAEK
## ASQRLAASITWADGESSRICPQETPLLHSLTQEGKRRPESSPEDSATRPPPFDAHVGKPTRRSDRFATTL
## RNEIQMHRAKLQKSRSTVALTAAGEAEDGTGRWRAGLGGGTQEGPLAGTYKDHLKEAQARVLRATSFKRR
## DLDPNPGDLYPESLEHRMGDPDTVPHFWEAGLAQPPSSTSGGPHPPRIGGRRRFTAEQKLKSYSEPEKMN
## EVGLTRGYSPHQHPRTSEDTVGTFADRWKFFEETSKPVPQRPAQKQALHGIPRDKPERPRTAGRTCEGTE
## PWSRTTSLGDSLNAHSAAEKAGTSDLPRRLGTFAEYQASWKEQRKPLEARSSGRCHSADDILDVSLDPQE
## RPQHVHGRSRSSPSTDHYKQEASVELRRQAGDPGEPREELPSAVRAEEGQSTPRQADAQCREGSPGSQQH
## PPSQKAPNPPTFSELSHCRGAPELPREGRGRAGTLPRDYRYSEESTPADLGPRAQSPGSPLHARGQDSWP
## VSSALLSKRPAPQRPPPPKREPRRYRATDGAPADAPVGVLGRPFPTPSPASLDVYVARLSLSHSPSVFSS
## AQPQDTPKATVCERGSQHVSGDASRPLPEALLPPKQQHLRLQTATMETSRSPSPQFAPQKLTDKPPLLIQ
## DEDSTRIERVMDNNTTVKMVPIKIVHSESQPEKESRQSLACPAEPPALPHGLEKDQIKTLSTSEQFYSRF
## CLYTRQGAEPEAPHRAQPAEPQPLGTQVPPEKDRCTSPPGLSYMKAKEKTVEDLKSEELAREIVGKDKSL
## ADILDPSVKIKTTMDLMEGIFPKDEHLLEEAQQRRKLLPKIPSPRSTEERKEEPSVPAAVSLATNSTYYS
## TSAPKAELLIKMKDLQEQQEHEEDSGSDLDHDLSVKKQELIESISRKLQVLREARESLLEDVQANTVLGA
## EVEAIVKGVCKPSEFDKFRMFIGDLDKVVNLLLSLSGRLARVENALNNLDDGASPGDRQSLLEKQRVLIQ
## QHEDAKELKENLDRRERIVFDILANYLSEESLADYEHFVKMKSALIIEQRELEDKIHLGEEQLKCLLDSL
## QPERGK
## 
## >ABD19518.1 Apxl protein [Mus musculus]
## MEGAEPRARPERLAEAEAPATDGVRLVEVQLSGGAPWGFTLKGGREHGEPLVITKIEEGSKAAAVDKLLA
## GDEIVAINDVSLSGFRQEAICLVKGSHKTLKLVVKRKSDPSWRPHSWHATKYFDVHPEPAASLFLNTSGS
## PSWKSQHQASSSSHDLSGSWEHTSLQRTSDHFSSMGSIDSLDHSSQLYPSGHLSSAKSNSSIDHLGGHSK
## RDSAYGSFSTCSSTPDHTLPKADASSTENILYKVGLWEASRPGSSRQSQSTGDPQGLQDRPSCSIPRVPG
## NSSKSPRPEDNVEPKIATHGRSNFGPVWYVPDKKKAPSPPPLGLPLRSDSFSVAARGHEKARGPPFSDLA
## SMQHFITLPHVQPRGDHRMETTDRQWKLTHLSSGKEIGNVGYQSEGHLDCRWLCSDDRAGRPSGPPGRLQ
## FSDVHFLKSYHGSQHQQQCSDESPRAPSSPRELLHITSGGGLQEPPEPSQDDNPTQVRWPGSAHQKLDDR
## GRSHYFPGSLRQPVQGSAQVVIPRGDYWHSDTTPVDLEYPLLRPVGQRTYLQQHEETPASHEKEGYHQLN
## AGIEGCCSGIQEPPRASRTVRTGLQCPSNDFKLVDGESGRISCQRTPMLHSLTQDGTWRPGNSKDCGNDK
## PPLFDAQVGKPTRRSDRFATTLRNEIQMRRAKLQKSKSTVTLAGDSEAEDCAGDWRADVGAVPEGSFPST
## YKEHLKEAQTRVLKATSFQRRDLDPTPADQYSGPSEHRTFDHSASSSLSSFPGEPDSAPRFCETGLAKAP
## SSGVGVPHVLRIGGRKRFTAEQKLKSYSEPEKINEVGLSGDHRPHPTVRTPEDTVGTFADRWKFFEETSK
## SLLQKAGHRQVHCGLPXEKAERPQTGHHECESTEPWFQKRSLATSCGEILSDRKVEKASEKLNPPRRLGT
## FAEYQASWKEQKKPLEARSSGRYHSADDILDAGLDQQQRPQYIHERSRSSPSTDHYSQEVPVEPNRQAED
## SGDHKEAILCTLQAEEGCSAPSAQPQDSQHVNEDTTFPQPETQLSSKCQHLQTSAMETSRSPSPQFAPQK
## LTDKPPLLIHEDNSARIERVMDNNTTVKMVPIKIVHSESQPEKESRQSLSCPAELPPLPSGLERDQIKTL
## STSEQCYSRFCVYTRQEVEAPHRARPPEPRPPXTPAPPVRDSCSSPPSLNYGKAKEKTMDDLKSEELARE
## IVGKDKSLADILDPSVKIKTTMDLMEGIFPKDEYLLKEAQQRRKLLPKSPYPEHRGQETGPRYARGCVLG
## HLSTYYSTSAPKAELLIKMKDLQEPEEYSAGDLDHDLSVKKQELIDSISRKLQVLREARESLLEDIQANN
## ALGDEVEAIVKDVCKPNEFDKFRMFIGDLDKVVNLLLSLSGRLARVENALNNLDDNPSPGDRQSLLEKQR
## VLTQQHEDAKELKENLDRRERIVFDILATYLSEENLADYEHFVKMKSALIIEQRELEDKIHLGEEQLKCL
## FDSLQPERSK
## 
## >AAF13269.1 PDZ domain actin binding protein Shroom [Mus musculus]
## MKTPENLEEPSATPNPSRTPTERFVYLEALLEGGAPWGFTLKGGLERGEPLIISKIEEGGKADSVSSGLQ
## AGDEVIHINEVALSSPRREAVSLVKGSYKTLRLVVRRDVCAAPGHADPGTSKSLSSELLTCSPQHRKATW
## SGGVKLRLKQRCSEPATRPHSWHTTKFGETQPDVSMMQISQGTMGPPWHQSYHSSSSTSDLSNYDHAYLR
## RSPDQCSSQGSMESLEPSGGYPPCHLLSPAKSTSSIDQLGHLHNKRDSAYSSFSTSSSIFEYPPPGGSAR
## ERSGSMDVISARGGLLEGMRQADIRYVKTVYDTRRGVSSEYEVNPSALLLQGRDAHASADSQGCAKWHSI
## PRGKGTPSPSWSQQCSGSLETATDNLPQKAGAPLPPTRSDSYAAFRHRERPSSWSSLDHKRFCRPQTNSS
## GSQKTPFAEDQLHTVPERSPENSPPVKSKHNYTQKAQPGQPLLPTGIYPVPSPEPHFAQVPQPSVSSNGT
## VYPALVKESGYTAAQGTCNKMATLDENGNQNEASRPGFAFCQPLEHNSVTPVEKRPEPTAKYIYKVHFSS
## VPENEDSSLKRHITPPHGHSPYPSERKNIHGGSRACSNHHSLSSPQAQALHVGDDRKPSRLSQPWEGDFQ
## EDHNANLRQKVEREGQGQGLSGNSGRTRSAFSSLQNIPESLRRQSNVELGEAQEVHPGGRSKVEDPGRKA
## GASDIRGYLDRSVSYPRPEGKMNAVDSVHSADSRYEESPAPALPQTSGASQRRLSSSSSAAPQYRKPHCS
## VLEKVSRIEEREQGRHRPLSVGSSAYGPGNRPGRTGPTPSTSSSDLDDPKAGSVHFSESTEHLRNGEQNP
## PNGEAKQEEASRPQCSHLIRRAPADGRGPPARGGEPSRPEARLLRSQSTFQLYSEAEREASWSEDRPGTP
## ESPLLDAPFSRAYRNSIKDAQSRVLGATSFRRRDLEPGTPATSRPWRPRPASAHVGMRSPEAAVPSSSPH
## TPRERHSVTPAAPQAARRGPRRRLTVEQKKRSYSEPEKMNEVGVSEEAEPTPCGPPRPAQPRFSESTVAD
## RRRIFERDGKACSTLSLSGPELKQFQQNALADYIQRKTGKRPTGAASHTGGRAARARTERLPPGRPRGAR
## WPRLASACSLSSLREPEALPRKEHTHPSAADGPQAPRDRSSSFASGRLVGERRRWDPQVPRQLLSGANCE
## PRGVQRMDGAPGGPPSWGMVAGKAGKSKSAEDLLERSDTLAVPVHVRSKSSPTSDKKGQDVLLREGSNFG
## FVKDPCCLAGPGPRSLSCSDKGQNELALPLHHHTPCWNGSGCKATVASSAPPESSGAADHLKQRRAPGPR
## PLSAGMHGHFPDARAASLSSPLPSPVPSASPVPSSYRSQLAMDQQTGQQPPSSPASAVTQPTSPRSLELS
## SPAYGLGEGMWKRTSLPQRPPPPWVKWAHAVREDGLAEDTLAPEFANLKHYRNQPSRPSSCSTSDPDTPG
## RISLRISESALQPSPPPRGDYDDEVFMKDLHPKVTSSPTFEALPPPPPPSPPSEEPLVNGTDDFPPPPPP
## QALCEVLLDGEASTEAGSGPCRIPRVMVTREGHVPGAAHSEGSQIMTATPPQTSAKGSEAESNTPSSASA
## QPQLNGSPGKQLCPSQTRNLTYEPVERTQDLGKKTHAEPQKTSEDIRTEALAKEIVHQDKSLADILDPDS
## RMKTTMDLMEGLFPGDASVLMDSGAKRKALDITARRAGCEAKASDHKEAVSVLVNCPAYYSVSAAKAELL
## NKIKDMPEELQEEEGQEDVYEKKAELIGSLTHKLESLQEAKGSLLTDIKLNNALGEEVEALISELCKPNE
## FDKYKMFIGDLDKVVNLLLSLSGRLARVENVLRGLGEDASKEERSSLNEKRKVLAGQHEDARELKENLDR
## RERVVLDILANYLSAEQLQDYQHFVKMKSTLLIEQRKLDDKIKLGQEQVRCLLESLPSDFRPKAGAISLP
## PALTGHATPGGTSVFGGVFPTLTSPL
## 
## >AAF13270.1 actin binding protein ShroomS [Mus musculus]
## MMQISQGTMGPPWHQSYHSSSSTSDLSNYDHAYLRRSPDQCSSQGSMESLEPSGGYPPCHLLSPAKSTSS
## IDQLGHLHNKRDSAYSSFSTSSSIFEYPPPGGSARERSGSMDVISARGGLLEGMRQADIRYVKTVYDTRR
## GVSSEYEVNPSALLLQGRDAHASADSQGCAKWHSIPRGKGTPSPSWSQQCSGSLETATDNLPQKAGAPLP
## PTRSDSYAAFRHRERPSSWSSLDHKRFCRPQTNSSGSQKTPFAEDQLHTVPERSPENSPPVKSKHNYTQK
## AQPGQPLLPTGIYPVPSPEPHFAQVPQPSVSSNGTVYPALVKESGYTAAQGTCNKMATLDENGNQNEASR
## PGFAFCQPLEHNSVTPVEKRPEPTAKYIYKVHFSSVPENEDSSLKRHITPPHGHSPYPSERKNIHGGSRA
## CSNHHSLSSPQAQALHVGDDRKPSRLSQPWEGDFQEDHNANLRQKVEREGQGQGLSGNSGRTRSAFSSLQ
## NIPESLRRQSNVELGEAQEVHPGGRSKVEDPGRKAGASDIRGYLDRSVSYPRPEGKMNAVDSVHSADSRY
## EESPAPALPQTSGASQRRLSSSSSAAPQYRKPHCSVLEKVSRIEEREQGRHRPLSVGSSAYGPGNRPGRT
## GPTPSTSSSDLDDPKAGSVHFSESTEHLRNGEQNPPNGEAKQEEASRPQCSHLIRRAPADGRGPPARGGE
## PSRPEARLLRSQSTFQLYSEAEREASWSEDRPGTPESPLLDAPFSRAYRNSIKDAQSRVLGATSFRRRDL
## EPGTPATSRPWRPRPASAHVGMRSPEAAVPSSSPHTPRERHSVTPAAPQAARRGPRRRLTVEQKKRSYSE
## PEKMNEVGVSEEAEPTPCGPPRPAQPRFSESTVADRRRIFERDGKACSTLSLSGPELKQFQQNALADYIQ
## RKTGKRPTGAASHTGGRAARARTERLPPGRPRGARWPRLASACSLSSLREPEALPRKEHTHPSAADGPQA
## PRDRSSSFASGRLVGERRRWDPQVPRQLLSGANCEPRGVQRMDGAPGGPPSWGMVAGKAGKSKSAEDLLE
## RSDTLAVPVHVRSKSSPTSDKKGQDVLLREGSNFGFVKDPCCLAGPGPRSLSCSDKGQNELALPLHHHTP
## CWNGSGCKATVASSAPPESSGAADHLKQRRAPGPRPLSAGMHGHFPDARAASLSSPLPSPVPSASPVPSS
## YRSQLAMDQQTGQQPPSSPASAVTQPTSPRSLELSSPAYGLGEGMWKRTSLPQRPPPPWVKWAHAVREDG
## LAEDTLAPEFANLKHYRNQPSRPSSCSTSDPDTPGRISLRISESALQPSPPPRGDYDDEVFMKDLHPKVT
## SSPTFEALPPPPPPSPPSEEPLVNGTDDFPPPPPPQALCEVLLDGEASTEAGSGPCRIPRVMVTREGHVP
## GAAHSEGSQIMTATPPQTSAKGSEAESNTPSSASAQPQLNGSPGKQLCPSQTRNLTYEPVERTQDLGKKT
## HAEPQKTSEDIRTEALAKEIVHQDKSLADILDPDSRMKTTMDLMEGLFPGDASVLMDSGAKRKALDITAR
## RAGCEAKASDHKEAVSVLVNCPAYYSVSAAKAELLNKIKDMPEELQEEEGQEDVYEKKAELIGSLTHKLE
## SLQEAKGSLLTDIKLNNALGEEVEALISELCKPNEFDKYKMFIGDLDKVVNLLLSLSGRLARVENVLRGL
## GEDASKEERSSLNEKRKVLAGQHEDARELKENLDRRERVVLDILANYLSAEQLQDYQHFVKMKSTLLIEQ
## RKLDDKIKLGQEQVRCLLESLPSDFRPKAGAISLPPALTGHATPGGTSVFGGVFPTLTSPL
## 
## >NP_065910.3 protein Shroom3 [Homo sapiens]
## MMRTTEDFHKPSATLNSNTATKGRYIYLEAFLEGGAPWGFTLKGGLEHGEPLIISKVEEGGKADTLSSKL
## QAGDEVVHINEVTLSSSRKEAVSLVKGSYKTLRLVVRRDVCTDPGHADTGASNFVSPEHLTSGPQHRKAA
## WSGGVKLRLKHRRSEPAGRPHSWHTTKSGEKQPDASMMQISQGMIGPPWHQSYHSSSSTSDLSNYDHAYL
## RRSPDQCSSQGSMESLEPSGAYPPCHLSPAKSTGSIDQLSHFHNKRDSAYSSFSTSSSILEYPHPGISGR
## ERSGSMDNTSARGGLLEGMRQADIRYVKTVYDTRRGVSAEYEVNSSALLLQGREARASANGQGYDKWSNI
## PRGKGVPPPSWSQQCPSSLETATDNLPPKVGAPLPPARSDSYAAFRHRERPSSWSSLDQKRLCRPQANSL
## GSLKSPFIEEQLHTVLEKSPENSPPVKPKHNYTQKAQPGQPLLPTSIYPVPSLEPHFAQVPQPSVSSNGM
## LYPALAKESGYIAPQGACNKMATIDENGNQNGSGRPGFAFCQPLEHDLLSPVEKKPEATAKYVPSKVHFC
## SVPENEEDASLKRHLTPPQGNSPHSNERKSTHSNKPSSHPHSLKCPQAQAWQAGEDKRSSRLSEPWEGDF
## QEDHNANLWRRLEREGLGQSLSGNFGKTKSAFSSLQNIPESLRRHSSLELGRGTQEGYPGGRPTCAVNTK
## AEDPGRKAAPDLGSHLDRQVSYPRPEGRTGASASFNSTDPSPEEPPAPSHPHTSSLGRRGPGPGSASALQ
## GFQYGKPHCSVLEKVSKFEQREQGSQRPSVGGSGFGHNYRPHRTVSTSSTSGNDFEETKAHIRFSESAEP
## LGNGEQHFKNGELKLEEASRQPCGQQLSGGASDSGRGPQRPDARLLRSQSTFQLSSEPEREPEWRDRPGS
## PESPLLDAPFSRAYRNSIKDAQSRVLGATSFRRRDLELGAPVASRSWRPRPSSAHVGLRSPEASASASPH
## TPRERHSVTPAEGDLARPVPPAARRGARRRLTPEQKKRSYSEPEKMNEVGIVEEAEPAPLGPQRNGMRFP
## ESSVADRRRLFERDGKACSTLSLSGPELKQFQQSALADYIQRKTGKRPTSAAGCSLQEPGPLRERAQSAY
## LQPGPAALEGSGLASASSLSSLREPSLQPRREATLLPATVAETQQAPRDRSSSFAGGRRLGERRRGDLLS
## GANGGTRGTQRGDETPREPSSWGARAGKSMSAEDLLERSDVLAGPVHVRSRSSPATADKRQDVLLGQDSG
## FGLVKDPCYLAGPGSRSLSCSERGQEEMLPLFHHLTPRWGGSGCKAIGDSSVPSECPGTLDHQRQASRTP
## CPRPPLAGTQGLVTDTRAAPLTPIGTPLPSAIPSGYCSQDGQTGRQPLPPYTPAMMHRSNGHTLTQPPGP
## RGCEGDGPEHGVEEGTRKRVSLPQWPPPSRAKWAHAAREDSLPEESSAPDFANLKHYQKQQSLPSLCSTS
## DPDTPLGAPSTPGRISLRISESVLRDSPPPHEDYEDEVFVRDPHPKATSSPTFEPLPPPPPPPPSQETPV
## YSMDDFPPPPPHTVCEAQLDSEDPEGPRPSFNKLSKVTIARERHMPGAAHVVGSQTLASRLQTSIKGSEA
## ESTPPSFMSVHAQLAGSLGGQPAPIQTQSLSHDPVSGTQGLEKKVSPDPQKSSEDIRTEALAKEIVHQDK
## SLADILDPDSRLKTTMDLMEGLFPRDVNLLKENSVKRKAIQRTVSSSGCEGKRNEDKEAVSMLVNCPAYY
## SVSAPKAELLNKIKEMPAEVNEEEEQADVNEKKAELIGSLTHKLETLQEAKGSLLTDIKLNNALGEEVEA
## LISELCKPNEFDKYRMFIGDLDKVVNLLLSLSGRLARVENVLSGLGEDASNEERSSLYEKRKILAGQHED
## ARELKENLDRRERVVLGILANYLSEEQLQDYQHFVKMKSTLLIEQRKLDDKIKLGQEQVKCLLESLPSDF
## IPKAGALALPPNLTSEPIPAGGCTFSGIFPTLTSPL
## 
## >ABD59319.1 shroom-like protein [Xenopus laevis]
## MMQVSQGTIGSPWHQAYHSSSSTSDLSGYNHEFLRRSPDQYSSRGSMESLDQASAAYHHHLPPAKSTNCI
## DQLVHLHNKRDSAYSSFSTNASIPEYRSSPFSKERSYSMESMHSRNSSGQEGIKHADIKYIKTVYDVQRG
## ISEEYEVNSSSVKNRNYSRQPAYNRHSIGPHGRLEQSRFFSESGGFERAAPMPPTRSDSYALTRHHERPN
## SWSSLDQNRNFRTPKAAGLHSTNTSSNAAQQPKHVHGDGHLHPVLERSPESSPLIKPKQVYSETPQPGQP
## MLPTGIYPVPAPEPHFAHAPQPPKNNNGRLYPALAKEGSYGAKSSEKVLPFSEPNKNEKDTQNLRSKSVG
## QYPMNHSVKEREKKQEGPTGFAHYKLHFTAGPDISTSSLTNDRNDQQPLRLDNIDINEQQKNGTKVAEEF
## SVYAHPAFQNEWSDSKTKQDIASSDIIGLHRNSLSSDAHGEHEYHNHFNIASSSHNKMDERSNRQADHRK
## KLESLSFTVHADEADGPSSNPLKPDESPSPSQKKSYDFTRRRLSSSSSQSSKTDGNKLSSVFDKVCKIEQ
## REHENHRSQFLCGNINQSGLSTRGQNNKGSFTMVEEIRNKFISQDQTPNPNEWRRLSSSHSNEKVTGMHQ
## LTRQGIVYGLQTGDAQKQMPEKQAEKMHSYNQEQNILQAVPDDDNRSFNSQTMPNKEDDWQCAAQDTLGF
## NRAYRNSVKDAQCKVLEATSYRRKDLEISPPHYKKPEKNVRPASAPFRKKSSSLSPHAPKERHSVTPTDN
## CASIQESQGVFFPSRIGAKRRITAEQKKRSYSEPEKMNEVGASESESAPLTVSKMEPVASFSENSVADRR
## RIFEREGKACSTINLSKPQLKQLQQNALADYIERKTGRRPSSQETRLLKERSQSTYFSGSIMDNQSMTST
## SSMNSLNEHNLSYRHREPLSKTGRVSSTLPPGLTGFFDLSSFENNPEYPENRSRSSSFAHQLRSERLLDH
## RSKVEFGKGRETNKPKEVSLQSDDDVIITSSRRHGKSASAEDLLDRLPQPPALHVRSRSSPASDMKSREY
## MSRQEVGNKTSYASASNKEIRSIKSNHFEQMSFTPSFKNHIDTGEDPVPENSSTIQRSAQLENQRNTKTQ
## SISGIYSPHPETKQEPLALPIHSVPAKVTQTSLAHATFDYITAEEYLYSGKRGKESASPTDNKEISDQEW
## CLPENSSSEDLNDPERFAKYTSAQRPQSFETKSGNSINETVQQNKSSGPTAGPKFSTSWKSNGMWSSGSS
## EAETTFNHGKISLHISESCLQPQSPMTGQEDEGDDEVFVKEQDTESFSGTFVPPSPPPFPPPSLEDALLK
## QRIEKFPLVPNTLDEIWENTEEASTQVKVKSNERYLQCASEYTASTESSGSYLLNSGITKRDTDGPLLRL
## SSIVPAPEPLASPVDPTKPIEEQETQPHGADTSILQSSEGNFNPSDSQSTLPHVRSELMSSEDAKSQELA
## KEIVTKDKSLANILDPDSRMKTTMDLMEGLFTKSSSALKEKNQKRKAKKQIDNIIAPESEXKEEKRETLD
## NASNYSAYYSTSAPKAELLRKMKTIHSQIGGKEEQFDVNEKKAELISSLTCKLEVLKDAKESLIDDIKLN
## NSLGEEVETQIETLCKPNEFDKYKMFIGDLDKVVNLLLSLSGRLARVENALSSLGEDASAEERKTWNEKK
## KQLCGQHEDARELKENLDRREKLVMDFLGNYLTGEEFAHYQHFVKMKSALLIEQRELDDKIKLGQEQLRC
## LTESLPSDYLISMKVSLPEERRSSLGNKSLPPPLTSSL
## 
## >NP_065768.2 protein Shroom4 [Homo sapiens]
## MENRPGSFQYVPVQLQGGAPWGFTLKGGLEHCEPLTVSKIEDGGKAALSQKMRTGDELVNINGTPLYGSR
## QEALILIKGSFRILKLIVRRRNAPVSRPHSWHVAKLLEGCPEAATTMHFPSEAFSLSWHSGCNTSDVCVQ
## WCPLSRHCSTEKSSSIGSMESLEQPGQATYESHLLPIDQNMYPNQRDSAYSSFSASSNASDCALSLRPEE
## PASTDCIMQGPGPTKAPSGRPNVAETSGGSRRTNGGHLTPSSQMSSRPQEGYQSGPAKAVRGPPQPPVRR
## DSLQASRAQLLNGEQRRASEPVVPLPQKEKLSLEPVLPARNPNRFCCLSGHDQVTSEGHQNCEFSQPPES
## SQQGSEHLLMQASTKAVGSPKACDRASSVDSNPLNEASAELAKASFGRPPHLIGPTGHRHSAPEQLLASH
## LQHVHLDTRGSKGMELPPVQDGHQWTLSPLHSSHKGKKSPCPPTGGTHDQSSKERKTRQVDDRSLVLGHQ
## SQSSPPHGEADGHPSEKGFLDPNRTSRAASELANQQPSASGSLVQQATDCSSTTKAASGTEAGEEGDSEP
## KECSRMGGRRSGGTRGRSIQNRRKSERFATNLRNEIQRRKAQLQKSKGPLSQLCDTKEPVEETQEPPESP
## PLTASNTSLLSSCKKPPSPRDKLFNKSMMLRARSSECLSQAPESHESRTGLEGRISPGQRPGQSSLGLNT
## WWKAPDPSSSDPEKAHAHCGVRGGHWRWSPEHNSQPLVAAAMEGPSNPGDNKELKASTAQAGEDAILLPF
## ADRRKFFEESSKSLSTSHLPGLTTHSNKTFTQRPKPIDQNFQPMSSSCRELRRHPMDQSYHSADQPYHAT
## DQSYHSMSPLQSETPTYSECFASKGLENSMCCKPLHCGDFDYHRTCSYSCSVQGALVHDPCIYCSGEICP
## ALLKRNMMPNCYNCRCHHHQCIRCSVCYHNPQHSALEDSSLAPGNTWKPRKLTVQEFPGDKWNPITGNRK
## TSQSGREMAHSKTSFSWATPFHPCLENPALDLSSYRAISSLDLLGDFKHALKKSEETSVYEEGSSLASMP
## HPLRSRAFSESHISLAPQSTRAWGQHRRELFSKGDETQSDLLGARKKAFPPPRPPPPNWEKYRLFRAAQQ
## QKQQQQQQKQQEEEEEEEEEEEEEEEEEEEEAEEEEEELPPQYFSSETSGSCALNPEEVLEQPQPLSFGH
## LEGSRQGSQSVPAEQESFALHSSDFLPPIRGHLGSQPEQAQPPCYYGIGGLWRTSGQEATESAKQEFQHF
## SPPSGAPGIPTSYSAYYNISVAKAELLNKLKDQPEMAEIGLGEEEVDHELAQKKIQLIESISRKLSVLRE
## AQRGLLEDINANSALGEEVEANLKAVCKSNEFEKYHLFVGDLDKVVNLLLSLSGRLARVENALNSIDSEA
## NQEKLVLIEKKQQLTGQLADAKELKEHVDRREKLVFGMVSRYLPQDQLQDYQHFVKMKSALIIEQRELEE
## KIKLGEEQLKCLRESLLLGPSNF
## 
## >AAK95579.1 SHAP-A, partial [Homo sapiens]
## MHFPSEAFSLSWHSGCNTSDVCVQWCPLSRHCSTEKSSSIGSMESLEQPGQATYESHLLPIDQNMYPNQR
## DSAYSSFSASSNASDCALSLRPEEPASTDCIMQGPGPTKAPSGRPNVAETSGGSRRTNGGHLTPSSQMSS
## RPQEGYQSGPAKAVRGPPQPPVRRDSLQASRAQLLNGEQRRASEPVVPLPQKEKLSLEPVLPARNPNRFC
## CLSGHDQVTSEGHQNCEFSQPPESSQQGSEHLLMQASTKAVGSPKACDRASSVDSNPLNEASAELAKASF
## GRPPHLIGPTGHRHSAPEQLLASHLQHVHLDTRGSKGMELPPVQDGHQWTLSPLHSSHKGKKSPCPPTGG
## THDQSSKERKTRQVDDRSLVLGHQSQSSPPHGEADGHPSEKGFLDPNRTSRAASELANQQPSASGSLVQQ
## ATDCSSTTKAASGTEAGEEGDSEPKECSRMGGRRSGGTRGRSIQNRRKSERFATNLRNEIQRRKAQLQKS
## KGPLSQLCDTKEPVEETQEPPESPPLTASNTSLLSSCKKPPSPRDKLFNKSMMLRARSSECLSQAPESHE
## SRTGLEGRISPGQRPGQSSLGLNTWWKAPDPSSSDPEKAHAHCGVRGGHWRWSPEHNSQPLVAAAMEGPS
## NPGDNKELKASTAQAGEDAILLPFADRRKFFEESSKSLSTSHLPGLTTHSNKTFTQRPKPIDQNFQPMSS
## SCRELRRHPMDQSYHSADQPYHA
## 
## >ABA81834.1 LP13775p [Drosophila melanogaster]
## MKMRNHKENGNGSEMGESTKSLAKMEPENNNKISVVSVSKLLLKDSNGANSRSSNSNASFSSASVAGSVQ
## DDLPHHNSSSSQLGQQHGSSLDQCGLTQAGLEEYNNRSSSYYDQTAFHHQKQPSYAQSEGYHSYVSSSDS
## TSATPFLDKLRQESDLLSRQSHHWSENDLSSVCSNSVAPSPIPLLARQSHSHSHSHAHSHSNSHGHSHGH
## AHSASSSSSSNNNSNGSATNNNNNNSSESTSSTETLKWLGSMSDISEASHATGYSAISESVSSSQRIVHS
## SRVPTPKRHHSESVLYLHNNEEQGDSSPTASNSSQMMISEEANGEESPPSVQPLRIQHRHSPSYPPVHTS
## MVLHHFQQQQQQQQDYQHPSRHHTNQSTLSTQSSLLELASPTEKPRSLMGQSHSMGDLQQKNPHQNPMLG
## RSAGQQHKSSISVTISSSEAVVTIAPQPPAGKPSKLQLSLGKSEALSCSTPNMGEQSPTNSIDSYRSNHR
## LFPVSTYTEPVHSNTSQYVQHPKPQFSSGLHKSAKLPVITPAGATVQPTWHSVAERINDFERSQLGEPPK
## FAYLEPTKTHRLSNPALKALQKNAVQSYVERQQQQQKEEQQLLRPHSQSYQACHVERKSLPNNLSPIMVG
## LPTGSNSASTRDCSSPTPPPPPRRSGSLLPNLLRRSSSASDYAEFRELHQAQGQVKGPSIRNISNAEKIS
## FNDCGMPPPPPPPRGRLAVPTRRTSSATEYAPMRDKLLLQQAAALAHQQHHPQQHRHAQPPHVPPERPPK
## HPNLRVPSPELPPPPQSELDISYTFDEPLPPPPPPEVLQPRPPPSPNRRNCFAGASTRRTTYEAPPPTAI
## VAAKVPPLVPKKPTSLQHKHLANGGGGSRKRPHHATPQPILENVASPVAPPPPLLPRARSTAHDNVIASN
## LESNQQKRSNSKASYLPRQSLEKLNNTDPDHGIYKLTLTSNEDLVAHTKPSYGVTGKLPNNLPDVLPLGV
## KLHQQPKLQPGSPNGDANVTLRYGSNNNLTGNSPTVAPPPYYGGGQRYSTPVLGQGYGKSSKPVTPQQYT
## RSQSYDVKHTSAVTMPTMSQSHVDLKQAAHDLETTLEEVLPTATPTPTPTPTPTPPRLSPASSHSDCSLS
## TSSLECTINPIATPIPKPEAHIFRAEVISTTLNTNPLTTPPKPAMNRQESLRENIEKITQLQSVLMSAHL
## CDASLLGGYTTPLITSPTASFANEPLMTPPLPPSPPPPLEPEEEEEQEENDVHDKQPEIEELQLMQRSEL
## VLMVNPKPSTTDMACQTDELEDRDTDLEAAREEHQTRTTLQPRQRQPIELDYEQMSRELVKLLPPGDKIA
## DILTPKICKPTSQYVSNLYNPDVPLRLAKRDVGTSTLMRMKSITSSAEIRVVSVELQLAEPSEEPTNLIK
## QKMDELIKHLNQKIVSLKREQQTISEECSANDRLGQDLFAKLAEKVRPSEASKFRTHVDAVGNITSLLLS
## LSERLAQTESSLETRQQERGALESKRDLLYEQMEEAQRLKSDIERRGVSIAGLLAKNLSADMCADYDYFI
## NMKAKLIADARDLAVRIKGSEEQLSSLSDALVQSDC
## 
## >EAA12598.4 AGAP008245-PA, partial [Anopheles gambiae str. PEST]
## IPFSSSPKNRSNSKASYLPRQPRDKLHSDPDHGSYKLTLTSNEDCINHNTGEIITASPKCNLPDVLPPGV
## KYSLYSTNNNNNNNNNSVSNNNSINNHHNGIKPKPHSAPIISTANSLKSLFNFSTSSSTTSTSSSDAAKD
## RDGPQTPATGPPPALVGNFEQQQRQHQHDATVLPPPTGGSTVAGAERAPNEPALDSEASSASTSTRDDDA
## LSSNDAAPATVPVVVVAKEQEGESCPSTAEPLVNGVGVGGVSEHTISGSPAALERVTKEINLSPVVGDAV
## ACEPPSPLPLQRTEIVLRVQAPTSEAASQTDSDDAGLARGFAELTIDCGRRAKDQQDATTVSQQCSNGAS
## TSVATSTTSPIGSPPGTPPSGKEQQGQKFFAPLSSSSSPPPPPPSTPRKLHPEEIDCDKLSHDLVSQLSP
## SDKLHTILAPKTFKSSSDYVSDLFNIQIAPRPLKKDASTATPTETTVANGRRSLSITASQRQQLVSKCKG
## EEAVKKNQEELVQRLGKKLLVLTNEQTNIAEESNANDLLGNDVALKVTQKVRPADASKFRSYVDDVGYIT
## MLLLSLSGRLARTDNALHMIDANHPDKKILEAKRERLLEQLDEAKQLKDDIDQRGATIARILEQSLTIEE
## YADYDYFINMKAKLIVDSREIADKIKLGEEQLAALKDTLVQSEC
## 
## >XP_392427.4 PREDICTED: hypothetical protein LOC408897 [Apis mellifera]
## MTELQPSPPGYRVQDEAPGPPSCPPASYKYASHGHGSEAANFKSTSSSYPQEGYGGLKQSPSRTVPPNEY
## YRRRNDGRRSTENEHEAGNATKKATIPGYNESHKKNTTSYKNDSGYSESLGFDSYTLPLNERDEASPPPT
## PPVRDASSLKGVCYGPGHEKYPSWPSAPERHPDEDVHGSGHSGSHRSKSWTDHTNYPKEKPAQYTRPHTK
## RPNPAFTQQLKTVMERCEKIPAETFESRNRGNVTEEEPRLWPRVDREGKALGDAEYVVPSPPEREQPQTA
## QTLSHADLEAYVRSYQVDPQVSQVDIYRESTLTQAGLEEYTRVQHSQQASYAQSEGYHSYVSSVDSTTNT
## PFLDRLRRDSEAVAQRPTSTWEDSTSREGRDSVVTTSSGSASSSETLKWHGSMSDVSVSSGLPARQDRTS
## DRWHHGSLSDVSSVNGGVLSQKAGSNGCRDKWQGSMSDVSTSCGLSPTAKGRHGGEKWPDKWQVVSMNDR
## NKQSQGRSSLPAVINHCNASSNAADVSSAMRESKWQESSIDEESLVDKSQVSSSGATASMQWDNSMRIEG
## DKYGSLAQPMPQSPIRQQIGSTTPQSPENWNHPIHGSMSDVSQVNGLSCSKQLIAHSARVQTPQRHHSES
## VLYLDRERNQRKLYPVATTQPQLDSAQTSQRMPPALPSQQISVAERINELEKQQQQQQQQQQQQQQQQQQ
## QMRYTYLDPEKRHRVSDPTLKAIQKKALLSFYERHHQASWRSEPQLAQGSQTIAAPQSPPPQPPPRPRPP
## SSRRASSASDYASGAWRENGNRNQNQGNVGELSSPKHQHSNSCGSLSTDLLGPVIVGPAISIDDWVPERP
## PKKPHLRNVYNDRVPSPDLPPPSPPTVTENEVHDCDDPLPPPPPELSDDCFNDATTTATTATATAATTAH
## HHHQSSEESKIRDRSCDRHKLERHSIRRSKHGSKRDYEKLSSGKSSPNSAAKTSMSHQQAEQEQQHQFVR
## GGIALKHVESEMVLENGVSAGFATHRMVATGRSSLRYPSAQKLMMNGRVTPARRISEERGFSARPPAQLP
## DLISQRYTDSGNQRPAPQVPVEPRIIRQESMRVDGTRIDVSGTLLRNDSAQRLESSSGQRPQPQVAKNAD
## KSGSNNQTTRPNYLPVPENSKCASKYLETGNHGFTIGSPVKTGYEANSKMYPSESSPQKYHEPPKYVANN
## HHQSQRHSGDGTQRGNYYALPPKYIDAPKQKPQPQCPTDRYGGSSNASPSSPPPPPLAPRQNTASRKSLP
## PPPRPAPPHALQGSQSKASYLAYRRERGAPDTEGSYKRTMSPTSRLEDWPPPRDHDDPVLLRVTPHHPLQ
## HHHHHHHNNHHHQQQPELSKSHSVDALHHRLEERSAQQQQQQQQQQQQQQQQQQQQQQQQQQNSAEMIGK
## LSHDLNKKLQLNDARSRTSENNNNDNLEDRHQHHHYHHHHHHHYHQERRHDYEQRKERQSPQSIEVLNDR
## NRQLERERRKLAASCEPLSQNREKQNRSMEIPSSVVEENLFRERSATIEMTSQNIELLNRRNEKRTNVTS
## TTTSTITTCITTSSTTTAMTITTDSCWPRSLEVQSSIEASPVSEKPTLPTSSPPRSPDQVEGDSSRGAVP
## RRSGSCSSSSSSSSSSSSSSSSSSSSSSSSSSGKSSDLHISPRNLSNSFSKNENSFLNNRKVSSPTQTDN
## TGSSNRSSSPVRSNQTKEIELRMDRSSLSSKSRSGGRSSTSSVSSCVSKASSSSSSRNSPVEEDISFSPM
## SPCVSPQPGIEGLTLLQRTEVVLRVNTATSDVASQTDIPETTEIESSTVKIREILLCRKKLPEEIECEEL
## GRDLASQLNPNDKLVPLLVPAPEHKKPTDYVTGLFRVEATLHPRPKRRSSLEEPTTPCSDNGDEEKKHES
## IPSTPLSADSTSPLSPTSAYFTTSEGKARFLTRYSRDVTVEGSTRQEDVPPIIPTNSLDLRQKKEELMMS
## LDKKLVVLRAEQEAVREEGEVNEALGARVATRISAVARPAEASKYRLHVEEVGKITSLLLGLSGRLARAE
## NALYGMPAEHAERKILESKRDKLMDQLEEAKILKSNIDKRSVNVSTILSKYLNEEEFADYQHFINMKAKL
## IVDGREIQDKVKLGEEQLAALREAID
## 
## >XP_783573.4 protein Shroom3 isoform X5 [Strongylocentrotus purpuratus]
## MMKDAMYPTTTSTTSSSVNPLPKEVAEQKPVNTKRVRKRESQPGSPRPKSWHTDVRTLSQPDLSRMPQHS
## RQRHGEQTQPRYRNPPPTQYNKFHSSSDSSFMMSSYEEKTGYHQHGRSTGNINNNSAEDTIEPLPGHVQK
## KREAFERTIMSQSTDKINTEDQYGDVYSKRYSKGKEAITQGVNPKLRNIRHDLEPAETYPKVVVATHIHS
## VQSKAVLGRVPDSSDQTGQKYGGAQDVNIYAVQMPERQIASSADSSVTRNYAQAHSNLSGNPQTSYVQST
## FGSNPHSSSFATHHGEIRKVPPAIPKREDSKTKTQAYSDHVKSSSWPVSTISSETTCTLTVCPTILPSDL
## PPVKLTKTEKLQKSPTHSVTQSNPNQNSNSTDQHIVQAKRIWIDDASEHEFNFEDSKMLSSDNTLNTNTR
## PPSPPVRDNENGNYKPKQSKTTRSSDDRFNTSDHLILDYRSFLEKTEQQQENLKSIQPVNSVPESKNKRE
## LFTRTHDFSKATQQEESSPMAQTQPARESSTRNSWYQEKKKQRKRSSLSSEDSLNFSEFDLNKKNLGQNP
## ARTWRNPSESRESTTSDLHPQLTGHPQQPQQQQPVEPRISSHSRQSSDLDNPNAPRKRTPISPSLMEETF
## RMEQEPEQKTFTEKVERTVNRQESRDSRKSGIFDQNDDCQLEKLQPDGNLSENSILRRLEREGSFKNNVN
## LDPQRSEGNETSARKTMPDTKRDLRNLGLSEDAFKQDHRPKSNHYKSGSFTHDSRGNAGDPRLMRTAPVP
## SSHQRTHSIDTYNRNPPRRHESFERRGNPVSRESSFSEHKKSKSDSDQHPKADQQKKKMSDPINKPQNVR
## KTSDPENRQQIWDALKGFVHNRRSPPGTSPASSRPPSMSGSEQSLYRSDMYHSTSSLASGYSSSRHYPQD
## SLSSIGSSFSHPLHQPQDSGFGSNSDISQVRGPHSPSQTGVVSPKDVRIAASIAHSSSMSSSGPQYQQTT
## NERRRSHQVQRPPHQTKHLTSRMSLDSINLPNSRNQQEKMRPNRSPQDKFEFSPTRQISPSPSYNVHVAE
## RVSISSLKEEESNKEGTVFYESLQSERTETEVNHRVFRYPPRSDQTSSSGGQRTSPKSSTVHPPVKSLSM
## DPSYQELELSPPPPPTPLSPLDGRGNMEFPPPPPELAPASNTKRSSPKQEPSEQTVRQATQGGSMPLTSP
## ERITPSFAEQLQQAPSLIHVQVDQVPSAKGEETTPSISPNSIISGRSSPDNHDVEPATSPQQVPRLQSVQ
## ENIAFTDRKDSPVLIRPATLLDSPTRCSQAEPEPLDVAEEEAFDACDGGSNICDSGSNIFTREQEKTDKE
## LREVSNPVLKWVLQALTPSDTVLSDLFPLPRSKTSRSDTMTMDVTTPTKSESEMVMEQSSPSECVNLVLS
## SSRYLRISPAKAIILQRAQTMNKSDDLGNNNTELRKTQEELVDRIGKKVEDIKDLQKEVAEEMSNLEDMG
## RQVMDSVKATCKASEYNKCNMYIADIERVTKLLLSLSRRLNKVESVLGSIENSEEEEKVNLEKLKVTVNS
## KYQDAKMLKESITGRHSTISSMLLNKISNDQHDNFTYYIQMLPRHLIMGQELEDKVKLGEEQLEALGESL
## KQMSLSSDSGSSRDTNGNVSHGFKEEAATSSSSNGIGGPEQLNSNATSSYC
shrooms_list <- compbio4all::entrez_fetch_list(db = "protein", 
                          id = shroom_table$accession, 
                          rettype = "fasta")

is(shrooms_list)
## [1] "list"             "vector"           "list_OR_List"     "vector_OR_Vector"
## [5] "vector_OR_factor"
length(shrooms_list)
## [1] 14
nchar(shrooms_list)
##  CAA78718 NP_597713  CAA58534  ABD19518  AAF13269  AAF13270 NP_065910  ABD59319 
##      1486       915      1673      1543      2083      1895      2070      1864 
## NP_065768  AAK95579  ABA81834  EAA12598 XP_392427 XP_783573 
##      1560       778      1647       750      2230      1758
entrez_fetch_list
## function (db, id, rettype, ...) 
## {
##     n.seq <- length(id)
##     list.output <- as.list(rep(NA, n.seq))
##     names(list.output) <- id
##     for (i in 1:length(id)) {
##         list.output[[i]] <- rentrez::entrez_fetch(db = db, id = id[i], 
##             rettype = rettype)
##     }
##     return(list.output)
## }
## <bytecode: 0x7fb145e02588>
## <environment: namespace:compbio4all>
length(shrooms_list)
## [1] 14
for(i in 1:length(shrooms_list)){
  shrooms_list[[i]] <- fasta_cleaner(shrooms_list[[i]], parse = F)
}

# creating a new vector with the same length as shrooms list
shrooms_vector <- rep(NA, length(shrooms_list))

# assigning every item in shrooms list to new-created vector
for(i in 1:length(shrooms_vector)){
  shrooms_vector[i] <- shrooms_list[[i]]
}

#  renaming all items in new vector
names(shrooms_vector) <- names(shrooms_list)

shrooms_vector_ss <- Biostrings::AAStringSet(shrooms_vector)

shrooms_vector_ss
## AAStringSet object of length 14:
##      width seq                                              names               
##  [1]  1420 MSAFGNTIERWNIKSTGVIAGLG...NLEEKIKVYEEQFESIHNSLPP CAA78718
##  [2]   847 MEALGPGGDRASPASSTSSLDLW...PSPARPPGTCPPVQPPFPLLLT NP_597713
##  [3]  1616 MEGAEPRARPERLAEAETRAADG...KIHLGEEQLKCLLDSLQPERGK CAA58534
##  [4]  1480 MEGAEPRARPERLAEAEAPATDG...KIHLGEEQLKCLFDSLQPERSK ABD19518
##  [5]  1986 MKTPENLEEPSATPNPSRTPTER...GHATPGGTSVFGGVFPTLTSPL AAF13269
##  ...   ... ...
## [10]   723 MHFPSEAFSLSWHSGCNTSDVCV...CRELRRHPMDQSYHSADQPYHA AAK95579
## [11]  1576 MKMRNHKENGNGSEMGESTKSLA...VRIKGSEEQLSSLSDALVQSDC ABA81834
## [12]   674 IPFSSSPKNRSNSKASYLPRQPR...DKIKLGEEQLAALKDTLVQSEC EAA12598
## [13]  2126 MTELQPSPPGYRVQDEAPGPPSC...REIQDKVKLGEEQLAALREAID XP_392427
## [14]  1661 MMKDAMYPTTTSTTSSSVNPLPK...SSSSNGIGGPEQLNSNATSSYC XP_783573
library(msa)
shrooms_align <-msa (shrooms_vector_ss,
                     method = "ClustalW")
## use default substitution matrix
shrooms_align
## CLUSTAL 2.1  
## 
## Call:
##    msa(shrooms_vector_ss, method = "ClustalW")
## 
## MsaAAMultipleAlignment with 14 rows and 2252 columns
##      aln                                                   names
##  [1] -------------------------...------------------------- NP_065768
##  [2] -------------------------...------------------------- AAK95579
##  [3] -------------------------...SVFGGVFPTLTSPL----------- AAF13269
##  [4] -------------------------...SVFGGVFPTLTSPL----------- AAF13270
##  [5] -------------------------...CTFSGIFPTLTSPL----------- NP_065910
##  [6] -------------------------...NKS--LPPPLTSSL----------- ABD59319
##  [7] -------------------------...------------------------- CAA58534
##  [8] -------------------------...------------------------- ABD19518
##  [9] -------------------------...LT----------------------- NP_597713
## [10] -------------------------...------------------------- CAA78718
## [11] -------------------------...------------------------- EAA12598
## [12] -------------------------...------------------------- ABA81834
## [13] MTELQPSPPGYRVQDEAPGPPSCPP...------------------------- XP_392427
## [14] -------------------------...AATSSSSNGIGGPEQLNSNATSSYC XP_783573
##  Con -------------------------...------------------------- Consensus
shrooms_align
## CLUSTAL 2.1  
## 
## Call:
##    msa(shrooms_vector_ss, method = "ClustalW")
## 
## MsaAAMultipleAlignment with 14 rows and 2252 columns
##      aln                                                   names
##  [1] -------------------------...------------------------- NP_065768
##  [2] -------------------------...------------------------- AAK95579
##  [3] -------------------------...SVFGGVFPTLTSPL----------- AAF13269
##  [4] -------------------------...SVFGGVFPTLTSPL----------- AAF13270
##  [5] -------------------------...CTFSGIFPTLTSPL----------- NP_065910
##  [6] -------------------------...NKS--LPPPLTSSL----------- ABD59319
##  [7] -------------------------...------------------------- CAA58534
##  [8] -------------------------...------------------------- ABD19518
##  [9] -------------------------...LT----------------------- NP_597713
## [10] -------------------------...------------------------- CAA78718
## [11] -------------------------...------------------------- EAA12598
## [12] -------------------------...------------------------- ABA81834
## [13] MTELQPSPPGYRVQDEAPGPPSCPP...------------------------- XP_392427
## [14] -------------------------...AATSSSSNGIGGPEQLNSNATSSYC XP_783573
##  Con -------------------------...------------------------- Consensus
# WHAT IS THE LINE BELOW DOING? (its tricky - do your best)
class(shrooms_align) <- "AAMultipleAlignment"

# WHAT IS THE LINE BELOW DOING? This is simpler
shrooms_align_seqinr <- msaConvert(shrooms_align, type = "seqinr::alignment")

library(ggmsa)
ggmsa::ggmsa(shrooms_align,   # shrooms_align, NOT shrooms_align_seqinr
      start = 2000, 
      end = 2100)

CISH_subset_dist <- seqinr::dist.alignment(shrooms_align_seqinr, 
                                       matrix = "identity")
is(CISH_subset_dist)
## [1] "dist"     "oldClass"
class(CISH_subset_dist)
## [1] "dist"
CISH_subset_dist_alt <- matrix(data = NA,
                              nrow = 5, 
                              ncol = 5)

distances <- c(0.8260049, 
               0.8478722, 0.9000568, 
               0.9244596, 0.9435187, 0.9372139, 
               0.9238779, 0.9370038, 0.9323225,0.9413209)
CISH_subset_dist_alt[lower.tri(CISH_subset_dist_alt)] <- distances

seqnames <- c("EAA12598","ABA81834","XP_392427", "XP_783573","CAA78718")
colnames(CISH_subset_dist_alt) <- seqnames
row.names(CISH_subset_dist_alt)  <- seqnames
CISH_subset_dist_alt <- as.dist(CISH_subset_dist_alt)
CISH_subset_dist <- CISH_subset_dist_alt

CISH_subset_dist_rounded <- round(CISH_subset_dist,
                              digits = 3)

CISH_subset_dist_rounded
##           EAA12598 ABA81834 XP_392427 XP_783573
## ABA81834     0.826                             
## XP_392427    0.848    0.944                    
## XP_783573    0.900    0.937     0.937          
## CAA78718     0.924    0.924     0.932     0.941
tree_subset <- nj(CISH_subset_dist)


plot.phylo(tree_subset, main="Phylogenetic Tree", 
            type = "unrooted", 
            use.edge.length = F)


mtext(text = "Shroom family gene tree - UNrooted, no branch lengths")

# plot tree
plot.phylo(tree_subset, main="Phylogenetic Tree", 
            use.edge.length = F)


mtext(text = "Shroom family gene tree - rooted, no branch lenths")

# plot tree
plot.phylo(tree_subset, main="Phylogenetic Tree", 
           type = "unrooted",
            use.edge.length = T)

# add label
mtext(text = "Shroom family gene tree - rooted, with branch lenths")