R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

#Introduction Amylases are secreted proteins that hydrolyze 1,4-alpha-glucoside bonds in oligosaccharides and polysaccharides, and thus catalyze the first step in digestion of dietary starch and glycogen. The human genome has a cluster of several amylase genes that are expressed at high levels in either salivary gland or pancreas. This gene encodes an amylase isoenzyme produced by the salivary gland. Alternative splicing results in multiple transcript variants encoding the same protein.

#Resources/ References: Refseq gene: https://www.ncbi.nlm.nih.gov/gene/11722 Homologene: https://www.ncbi.nlm.nih.gov/homologene/20179 UniProt: https://www.uniprot.org/uniprot/P00687 PDB: https://www.rcsb.org/structure/1RPK

library(BiocManager)
install("drawProteins")
## Bioconductor version 3.14 (BiocManager 1.30.16), R 4.1.1 (2021-08-10)
## Warning: package(s) not installed when version(s) same as current; use `force = TRUE` to
##   re-install: 'drawProteins'
## Old packages: 'glue', 'RcppArmadillo'
library(drawProteins)
# github packages
library(compbio4all)
library(ggmsa)
## Registered S3 methods overwritten by 'ggalt':
##   method                  from   
##   grid.draw.absoluteGrob  ggplot2
##   grobHeight.absoluteGrob ggplot2
##   grobWidth.absoluteGrob  ggplot2
##   grobX.absoluteGrob      ggplot2
##   grobY.absoluteGrob      ggplot2
## ggmsa v1.0.0  Document: http://yulab-smu.top/ggmsa/
## 
## If you use ggmsa in published research, please cite: DOI: 10.18129/B9.bioc.ggmsa
# CRAN packages
library(rentrez)
library(seqinr)
library(ape)
## 
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
## 
##     as.alignment, consensus
library(pander)


library(ggplot2)

library(msa)
## Loading required package: Biostrings
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Warning: package 'S4Vectors' was built under R version 4.1.2
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: XVector
## Loading required package: GenomeInfoDb
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
## 
##     complement
## The following object is masked from 'package:seqinr':
## 
##     translate
## The following object is masked from 'package:base':
## 
##     strsplit
## 
## Attaching package: 'msa'
## The following object is masked from 'package:BiocManager':
## 
##     version
library(drawProteins)

## Biostrings
library(Biostrings)

library(HGNChelper)
amy1_table <- c(
     "NP_000690.1",    "P04746", "1B2Y", "Homo sapiens",            "Human",         "AMY2A",
     "NP_001103628.1", "NA",     "NA",   "Pan troglodytes",         "chimpanzee",    "AMY2A",
     "XP_001109826.1", "NA",     "NA",   "Macaca mulatta",          "Rhesus monkey", "AMY2A",
     "XP_005642408.1", "J9PAL7", "NA",   "Canis lupus familiaris",  "dog",           "AMY2A",
     "XP_537942.2",    "J9PAL7", "NA",   "Canis lupus familiaris",  "dog",           "AMY2B",
     "NP_001030188.1", "Q3MHH8", "NA",   "Bos taurus",              "cattle",        "AMY2B",
     "XP_002686210.1", "F1MP21", "NA",   "Bos taurus",              "cattle",        "AMY2A",
     "ENSP00000359100","P0DUB6", "NA",   "Homo sapiens",            "human",         "AMY1A",
     "NP_001286518.1", "P08144", "NA",   "Drosophila melanogaster", "Fruit Fly",     "AMY-P",
     "XP_021013165.1", "A0A6P5PER8", "NA", "Mus caroli",            "Ryukyu mouse",  "AMY1")

#make a table
amy1_table <- matrix(data = amy1_table, nrow = 10, ncol = 6, byrow = TRUE, dimnames = NULL)

amy1_table
##       [,1]              [,2]         [,3]   [,4]                     
##  [1,] "NP_000690.1"     "P04746"     "1B2Y" "Homo sapiens"           
##  [2,] "NP_001103628.1"  "NA"         "NA"   "Pan troglodytes"        
##  [3,] "XP_001109826.1"  "NA"         "NA"   "Macaca mulatta"         
##  [4,] "XP_005642408.1"  "J9PAL7"     "NA"   "Canis lupus familiaris" 
##  [5,] "XP_537942.2"     "J9PAL7"     "NA"   "Canis lupus familiaris" 
##  [6,] "NP_001030188.1"  "Q3MHH8"     "NA"   "Bos taurus"             
##  [7,] "XP_002686210.1"  "F1MP21"     "NA"   "Bos taurus"             
##  [8,] "ENSP00000359100" "P0DUB6"     "NA"   "Homo sapiens"           
##  [9,] "NP_001286518.1"  "P08144"     "NA"   "Drosophila melanogaster"
## [10,] "XP_021013165.1"  "A0A6P5PER8" "NA"   "Mus caroli"             
##       [,5]            [,6]   
##  [1,] "Human"         "AMY2A"
##  [2,] "chimpanzee"    "AMY2A"
##  [3,] "Rhesus monkey" "AMY2A"
##  [4,] "dog"           "AMY2A"
##  [5,] "dog"           "AMY2B"
##  [6,] "cattle"        "AMY2B"
##  [7,] "cattle"        "AMY2A"
##  [8,] "human"         "AMY1A"
##  [9,] "Fruit Fly"     "AMY-P"
## [10,] "Ryukyu mouse"  "AMY1"
length(amy1_table)
## [1] 60
amy1_table[[1]]
## [1] "NP_000690.1"
##[1] ">NP_031472.2 alpha-amylase 1 precursor [Mus musculus] MKFFLLLSLIGFCWAQYDPHTQYGRTAIVHLFEWRWVDIAKECERYLAPNGFAGVQVSPPNENIVVHSPS RPWWERYQPISYKICSRSGNEDEFRDMVNRCNNVGVRIYVDAVINHMCGVGAQAGQSSTCGSYFNPNNRD FPGVPYSGFDFNDGKCRTASGGIENYQDAAQVRDCRLSGLLDLALEKDYVRTKVADYMNHLIDIGVAGFR LDASKHMWPGDIKAILDKLHNLNTKWFSQGSRPFIFQEVIDLGGEAVSSNEYFGNGRVTEFKYGAKLGKV MRKWDGEKMSYLKNWGEGWGLMPSDRALVFVDNHDNQRGHGAGGASILTFWDARLYKMAVGFMLAHPYGF TRVMSSYYWPRNFQNGKDVNDWVGPPNNNGKTKEVSINPDSTCGNDWICEHRWRQIRNMVAFRNVVNGQP FANWWDNDSNQVAFGRGNKGFIVFNNDDWALSETLQTGLPAGTYCDVISGDKVDGNCTGIKVYVGNDGKAHFSISNSAEDPFIAIHAESKI"

for(i in 1:length(amy1_table)){
  amy1_table[[i]] <- compbio4all::fasta_cleaner(amy1_table[[i]], parse = F)
}
#Protein diagram


P00687_json  <- drawProteins::get_features("P00687")
## [1] "Download has worked"
is(P00687_json)
## [1] "list"             "vector"           "list_OR_List"     "vector_OR_Vector"
## [5] "vector_OR_factor"
my_prot_df <- drawProteins::feature_to_dataframe(P00687_json)
is(my_prot_df)
## [1] "data.frame"       "list"             "oldClass"         "vector"          
## [5] "list_OR_List"     "vector_OR_Vector" "vector_OR_factor"
my_prot_df[,-2]
##                     type begin end length accession  entryName taxid order
## featuresTemp      SIGNAL     1  15     14    P00687 AMY1_MOUSE 10090     1
## featuresTemp.1     CHAIN    16 511    495    P00687 AMY1_MOUSE 10090     1
## featuresTemp.2  ACT_SITE   212 212      0    P00687 AMY1_MOUSE 10090     1
## featuresTemp.3  ACT_SITE   248 248      0    P00687 AMY1_MOUSE 10090     1
## featuresTemp.4     METAL   115 115      0    P00687 AMY1_MOUSE 10090     1
## featuresTemp.5     METAL   173 173      0    P00687 AMY1_MOUSE 10090     1
## featuresTemp.6     METAL   182 182      0    P00687 AMY1_MOUSE 10090     1
## featuresTemp.7     METAL   216 216      0    P00687 AMY1_MOUSE 10090     1
## featuresTemp.8   BINDING   210 210      0    P00687 AMY1_MOUSE 10090     1
## featuresTemp.9   BINDING   313 313      0    P00687 AMY1_MOUSE 10090     1
## featuresTemp.10  BINDING   352 352      0    P00687 AMY1_MOUSE 10090     1
## featuresTemp.11     SITE   315 315      0    P00687 AMY1_MOUSE 10090     1
## featuresTemp.12  MOD_RES    16  16      0    P00687 AMY1_MOUSE 10090     1
## featuresTemp.13 DISULFID    43 101     58    P00687 AMY1_MOUSE 10090     1
## featuresTemp.14 DISULFID    85 130     45    P00687 AMY1_MOUSE 10090     1
## featuresTemp.15 DISULFID   156 175     19    P00687 AMY1_MOUSE 10090     1
## featuresTemp.16 DISULFID   393 399      6    P00687 AMY1_MOUSE 10090     1
## featuresTemp.17 DISULFID   465 477     12    P00687 AMY1_MOUSE 10090     1
## featuresTemp.18 CONFLICT    29  29      0    P00687 AMY1_MOUSE 10090     1
## featuresTemp.19 CONFLICT   229 229      0    P00687 AMY1_MOUSE 10090     1
## featuresTemp.20 CONFLICT   441 441      0    P00687 AMY1_MOUSE 10090     1
my_canvas <- draw_canvas(my_prot_df)  
my_canvas <- draw_chains(my_canvas, my_prot_df, 
                         label_size = 2.5)
my_canvas <- draw_domains(my_canvas, my_prot_df)
my_canvas

#dot plot


P00687_FASTA <- rentrez::entrez_fetch(id =  "P00687",
                                      db = "protein", 
                                      rettype="fasta")
P00687_vector <- fasta_cleaner(P00687_FASTA)
P00687_FASTA_str <- fasta_cleaner(P00687_FASTA, 
                               parse = F)

str(P00687_FASTA)
##  chr ">sp|P00687.2|AMY1_MOUSE RecName: Full=Alpha-amylase 1; AltName: Full=1,4-alpha-D-glucan glucanohydrolase 1; Alt"| __truncated__
str(P00687_vector)
##  chr [1:511] "M" "K" "F" "F" "L" "L" "L" "S" "L" "I" "G" "F" "C" "W" "A" ...
str(P00687_FASTA_str)
##  chr "MKFFLLLSLIGFCWAQYDPHTQYGRTAIVHLFEWRWVDIAKECERYLAPNGFAGVQVSPPNENIVVHSPSRPWWERYQPISYKICSRSGNEDEFRDMVNRCNNVGVRIYVD"| __truncated__
align <- pairwiseAlignment(P00687_FASTA_str, 
                           P00687_FASTA_str, 
                           type = "global")
#Create a 2x2 panel to show different values for dotPlots settings


par(mfrow = c(2,2), 
    mar = c(0,0,2,1))

dotPlot(P00687_vector, P00687_vector, 
        wsize = 1, 
        nmatch = 1)

dotPlot(P00687_vector, P00687_vector, 
        wsize = 10, 
        nmatch = 1)

dotPlot(P00687_vector, P00687_vector, 
        wsize = 10, 
        nmatch = 5)

dotPlot(P00687_vector, P00687_vector, 
        wsize = 20,
        nmatch = 5)

par(mfrow = c(1,1), 
    mar = c(4,4,4,4))
#Create a single large plot with the best version of the plot

dotPlot(P00687_vector, P00687_vector, 
        wsize = 1, 
        nmatch = 1)

#Protein properties compiled from databases

1. Pfham:http://pfam.xfam.org/protein/P00687  the entire gene have 2 domains that are "Alpha-amylase" and "Alpha-amylase_C"
2. DisProt: no information available
3. RepeatDB: no information available
4. UniProt sub-cellular locations: Extracellular region or secreted
5. PDB secondary structural location: no PDB entry available
#Protein feature prediction

aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
            "L","K","M","F","P","S","T","W","Y","V")

alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91, 
           221, 249, 48, 123, 82, 122, 119, 33, 63, 167)

beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120, 
          177, 115, 16, 85, 127, 341, 253, 44, 110, 229)

a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
              110, 112, 25, 52, 71, 126, 117, 30, 108, 123)

a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239, 
             339, 321, 91, 158, 188, 327, 238, 72, 130, 378)

data.frame(aa.1.1, alpha, beta, a.plus.b, a.div.b)
##    aa.1.1 alpha beta a.plus.b a.div.b
## 1       A   285  203      175     361
## 2       R    53   67       78     146
## 3       N    97  139      120     183
## 4       D   163  121      111     244
## 5       C    22   75       74      63
## 6       Q    67  122       74     114
## 7       E   134   86       86     257
## 8       G   197  297      171     377
## 9       H   111   49       33     107
## 10      I    91  120       93     239
## 11      L   221  177      110     339
## 12      K   249  115      112     321
## 13      M    48   16       25      91
## 14      F   123   85       52     158
## 15      P    82  127       71     188
## 16      S   122  341      126     327
## 17      T   119  253      117     238
## 18      W    33   44       30      72
## 19      Y    63  110      108     130
## 20      V   167  229      123     378
alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)

aa.prop <- data.frame(alpha.prop,
                      beta.prop,
                      a.plus.b.prop,
                      a.div.b)
row.names(aa.prop) <- aa.1.1

aa.prop
##    alpha.prop   beta.prop a.plus.b.prop    a.div.b
## A 0.116469146 0.073126801    0.09264161 0.08331410
## R 0.021659174 0.024135447    0.04129169 0.03369490
## N 0.039640376 0.050072046    0.06352567 0.04223402
## D 0.066612178 0.043587896    0.05876125 0.05631202
## C 0.008990601 0.027017291    0.03917417 0.01453958
## Q 0.027380466 0.043948127    0.03917417 0.02630972
## E 0.054760932 0.030979827    0.04552673 0.05931225
## G 0.080506743 0.106988473    0.09052409 0.08700669
## H 0.045361667 0.017651297    0.01746956 0.02469421
## I 0.037188394 0.043227666    0.04923240 0.05515809
## L 0.090314671 0.063760807    0.05823187 0.07823679
## K 0.101757254 0.041426513    0.05929063 0.07408262
## M 0.019615856 0.005763689    0.01323452 0.02100162
## F 0.050265631 0.030619597    0.02752779 0.03646434
## P 0.033510421 0.045749280    0.03758602 0.04338795
## S 0.049856968 0.122838617    0.06670196 0.07546734
## T 0.048630977 0.091138329    0.06193753 0.05492730
## W 0.013485901 0.015850144    0.01588142 0.01661666
## Y 0.025745811 0.039625360    0.05717311 0.03000231
## V 0.068246833 0.082492795    0.06511382 0.08723748
plot(aa.prop,panel = panel.smooth)

#Percent Identity Comparisons (PID)


#Chimps: NP_001103628.1
chimps_fasta <- rentrez::entrez_fetch(db = "protein",
                        id = "NP_001103628.1",
                         rettype = "fasta")

#Human: NP_000690.1
human_fasta <- rentrez::entrez_fetch(db = "protein",
                        id = "NP_000690.1",
                         rettype = "fasta")

#Fruit_Fly: NP_001286518.1
fruit_fly_fasta <- rentrez::entrez_fetch(db = "protein",
                        id = "NP_001286518.1",
                         rettype = "fasta")

#Cattle: NP_001030188.1
cattle_fasta <- rentrez::entrez_fetch(db = "protein",
                        id = "NP_001030188.1",
                         rettype = "fasta")

chimps_vector   <- fasta_cleaner(chimps_fasta)
human_vector <- fasta_cleaner(human_fasta)
fruit_fly_vector   <- fasta_cleaner(fruit_fly_fasta)
cattle_vector <- fasta_cleaner(cattle_fasta)


chimps_vector
##   [1] "M" "K" "F" "F" "L" "L" "L" "L" "T" "I" "G" "F" "C" "W" "A" "Q" "Y" "S"
##  [19] "P" "N" "T" "Q" "Q" "G" "R" "T" "S" "I" "V" "H" "L" "F" "E" "W" "R" "W"
##  [37] "V" "D" "I" "A" "L" "E" "C" "E" "R" "Y" "L" "A" "P" "K" "G" "F" "G" "G"
##  [55] "V" "Q" "V" "S" "P" "P" "N" "E" "N" "V" "A" "I" "Y" "N" "P" "F" "R" "P"
##  [73] "W" "W" "E" "R" "Y" "Q" "P" "V" "S" "Y" "K" "L" "C" "T" "R" "S" "G" "N"
##  [91] "E" "D" "E" "F" "R" "N" "M" "V" "T" "R" "C" "N" "N" "V" "G" "V" "R" "I"
## [109] "Y" "V" "D" "A" "V" "I" "N" "H" "M" "C" "G" "N" "A" "V" "S" "A" "G" "T"
## [127] "S" "S" "T" "C" "G" "S" "Y" "F" "N" "P" "G" "S" "R" "D" "F" "P" "A" "V"
## [145] "P" "Y" "S" "G" "W" "D" "F" "N" "D" "G" "K" "C" "K" "T" "G" "S" "G" "D"
## [163] "I" "E" "N" "Y" "N" "D" "A" "T" "Q" "V" "R" "D" "C" "R" "L" "S" "G" "L"
## [181] "L" "D" "L" "A" "L" "E" "K" "D" "Y" "V" "R" "S" "K" "I" "A" "E" "Y" "M"
## [199] "N" "H" "L" "I" "D" "I" "G" "V" "A" "G" "F" "R" "L" "D" "A" "S" "K" "H"
## [217] "M" "W" "P" "G" "D" "I" "K" "A" "I" "L" "D" "K" "L" "H" "N" "L" "N" "S"
## [235] "N" "W" "F" "P" "E" "G" "S" "K" "P" "F" "I" "Y" "Q" "E" "V" "I" "D" "L"
## [253] "G" "G" "E" "P" "I" "K" "S" "N" "D" "Y" "F" "G" "N" "G" "R" "V" "T" "E"
## [271] "F" "K" "Y" "G" "A" "K" "L" "G" "T" "V" "I" "R" "K" "W" "N" "G" "E" "K"
## [289] "M" "S" "Y" "L" "K" "N" "W" "G" "E" "G" "W" "G" "F" "I" "P" "S" "D" "R"
## [307] "A" "L" "V" "F" "V" "D" "N" "H" "D" "N" "Q" "R" "G" "H" "G" "A" "G" "G"
## [325] "A" "S" "I" "L" "T" "F" "W" "D" "A" "R" "L" "Y" "K" "M" "A" "V" "G" "F"
## [343] "M" "L" "A" "H" "P" "Y" "G" "F" "T" "R" "V" "M" "S" "S" "Y" "R" "W" "P"
## [361] "R" "Q" "F" "Q" "N" "G" "N" "D" "V" "N" "D" "W" "V" "G" "P" "P" "N" "N"
## [379] "N" "G" "V" "I" "K" "E" "V" "T" "I" "N" "P" "D" "T" "T" "C" "G" "N" "D"
## [397] "W" "V" "C" "E" "H" "R" "W" "R" "Q" "I" "R" "N" "M" "V" "N" "F" "R" "N"
## [415] "V" "V" "D" "G" "Q" "P" "F" "X" "N" "W" "Y" "D" "N" "G" "X" "N" "Q" "V"
## [433] "X" "F" "G" "R" "G" "N" "R" "G" "F" "I" "V" "F" "N" "N" "D" "D" "W" "S"
## [451] "F" "S" "L" "T" "L" "Q" "T" "G" "L" "P" "A" "G" "T" "Y" "C" "D" "V" "I"
## [469] "S" "G" "D" "K" "I" "N" "G" "N" "C" "T" "G" "I" "K" "I" "Y" "V" "S" "D"
## [487] "D" "G" "K" "A" "H" "F" "S" "I" "S" "N" "S" "A" "E" "D" "P" "F" "I" "A"
## [505] "I" "H" "A" "E" "S" "K" "L"
human_vector
##   [1] "M" "K" "F" "F" "L" "L" "L" "F" "T" "I" "G" "F" "C" "W" "A" "Q" "Y" "S"
##  [19] "P" "N" "T" "Q" "Q" "G" "R" "T" "S" "I" "V" "H" "L" "F" "E" "W" "R" "W"
##  [37] "V" "D" "I" "A" "L" "E" "C" "E" "R" "Y" "L" "A" "P" "K" "G" "F" "G" "G"
##  [55] "V" "Q" "V" "S" "P" "P" "N" "E" "N" "V" "A" "I" "Y" "N" "P" "F" "R" "P"
##  [73] "W" "W" "E" "R" "Y" "Q" "P" "V" "S" "Y" "K" "L" "C" "T" "R" "S" "G" "N"
##  [91] "E" "D" "E" "F" "R" "N" "M" "V" "T" "R" "C" "N" "N" "V" "G" "V" "R" "I"
## [109] "Y" "V" "D" "A" "V" "I" "N" "H" "M" "C" "G" "N" "A" "V" "S" "A" "G" "T"
## [127] "S" "S" "T" "C" "G" "S" "Y" "F" "N" "P" "G" "S" "R" "D" "F" "P" "A" "V"
## [145] "P" "Y" "S" "G" "W" "D" "F" "N" "D" "G" "K" "C" "K" "T" "G" "S" "G" "D"
## [163] "I" "E" "N" "Y" "N" "D" "A" "T" "Q" "V" "R" "D" "C" "R" "L" "T" "G" "L"
## [181] "L" "D" "L" "A" "L" "E" "K" "D" "Y" "V" "R" "S" "K" "I" "A" "E" "Y" "M"
## [199] "N" "H" "L" "I" "D" "I" "G" "V" "A" "G" "F" "R" "L" "D" "A" "S" "K" "H"
## [217] "M" "W" "P" "G" "D" "I" "K" "A" "I" "L" "D" "K" "L" "H" "N" "L" "N" "S"
## [235] "N" "W" "F" "P" "A" "G" "S" "K" "P" "F" "I" "Y" "Q" "E" "V" "I" "D" "L"
## [253] "G" "G" "E" "P" "I" "K" "S" "S" "D" "Y" "F" "G" "N" "G" "R" "V" "T" "E"
## [271] "F" "K" "Y" "G" "A" "K" "L" "G" "T" "V" "I" "R" "K" "W" "N" "G" "E" "K"
## [289] "M" "S" "Y" "L" "K" "N" "W" "G" "E" "G" "W" "G" "F" "V" "P" "S" "D" "R"
## [307] "A" "L" "V" "F" "V" "D" "N" "H" "D" "N" "Q" "R" "G" "H" "G" "A" "G" "G"
## [325] "A" "S" "I" "L" "T" "F" "W" "D" "A" "R" "L" "Y" "K" "M" "A" "V" "G" "F"
## [343] "M" "L" "A" "H" "P" "Y" "G" "F" "T" "R" "V" "M" "S" "S" "Y" "R" "W" "P"
## [361] "R" "Q" "F" "Q" "N" "G" "N" "D" "V" "N" "D" "W" "V" "G" "P" "P" "N" "N"
## [379] "N" "G" "V" "I" "K" "E" "V" "T" "I" "N" "P" "D" "T" "T" "C" "G" "N" "D"
## [397] "W" "V" "C" "E" "H" "R" "W" "R" "Q" "I" "R" "N" "M" "V" "I" "F" "R" "N"
## [415] "V" "V" "D" "G" "Q" "P" "F" "T" "N" "W" "Y" "D" "N" "G" "S" "N" "Q" "V"
## [433] "A" "F" "G" "R" "G" "N" "R" "G" "F" "I" "V" "F" "N" "N" "D" "D" "W" "S"
## [451] "F" "S" "L" "T" "L" "Q" "T" "G" "L" "P" "A" "G" "T" "Y" "C" "D" "V" "I"
## [469] "S" "G" "D" "K" "I" "N" "G" "N" "C" "T" "G" "I" "K" "I" "Y" "V" "S" "D"
## [487] "D" "G" "K" "A" "H" "F" "S" "I" "S" "N" "S" "A" "E" "D" "P" "F" "I" "A"
## [505] "I" "H" "A" "E" "S" "K" "L"
fruit_fly_vector
##   [1] "M" "F" "L" "A" "K" "S" "I" "V" "C" "L" "A" "L" "L" "A" "V" "A" "N" "A"
##  [19] "Q" "F" "D" "T" "N" "Y" "A" "S" "G" "R" "S" "G" "M" "V" "H" "L" "F" "E"
##  [37] "W" "K" "W" "D" "D" "I" "A" "A" "E" "C" "E" "N" "F" "L" "G" "P" "N" "G"
##  [55] "Y" "A" "G" "V" "Q" "V" "S" "P" "V" "N" "E" "N" "A" "V" "K" "D" "S" "R"
##  [73] "P" "W" "W" "E" "R" "Y" "Q" "P" "I" "S" "Y" "K" "L" "E" "T" "R" "S" "G"
##  [91] "N" "E" "E" "Q" "F" "A" "S" "M" "V" "K" "R" "C" "N" "A" "V" "G" "V" "R"
## [109] "T" "Y" "V" "D" "V" "V" "F" "N" "H" "M" "A" "A" "D" "G" "G" "T" "Y" "G"
## [127] "T" "G" "G" "S" "T" "A" "S" "P" "S" "S" "K" "S" "Y" "P" "G" "V" "P" "Y"
## [145] "S" "S" "L" "D" "F" "N" "P" "T" "C" "A" "I" "S" "N" "Y" "N" "D" "A" "N"
## [163] "E" "V" "R" "N" "C" "E" "L" "V" "G" "L" "R" "D" "L" "N" "Q" "G" "N" "S"
## [181] "Y" "V" "Q" "D" "K" "V" "V" "E" "F" "L" "D" "H" "L" "I" "D" "L" "G" "V"
## [199] "A" "G" "F" "R" "V" "D" "A" "A" "K" "H" "M" "W" "P" "A" "D" "L" "A" "V"
## [217] "I" "Y" "G" "R" "L" "K" "N" "L" "N" "T" "D" "H" "G" "F" "A" "S" "G" "S"
## [235] "K" "A" "Y" "I" "V" "Q" "E" "V" "I" "D" "M" "G" "G" "E" "A" "I" "S" "K"
## [253] "S" "E" "Y" "T" "G" "L" "G" "A" "I" "T" "E" "F" "R" "H" "S" "D" "S" "I"
## [271] "G" "K" "V" "F" "R" "G" "K" "D" "Q" "L" "Q" "Y" "L" "T" "N" "W" "G" "T"
## [289] "A" "W" "G" "F" "A" "A" "S" "D" "R" "S" "L" "V" "F" "V" "D" "N" "H" "D"
## [307] "N" "Q" "R" "G" "H" "G" "A" "G" "G" "A" "D" "V" "L" "T" "Y" "K" "V" "P"
## [325] "K" "Q" "Y" "K" "M" "A" "S" "A" "F" "M" "L" "A" "H" "P" "F" "G" "T" "P"
## [343] "R" "V" "M" "S" "S" "F" "S" "F" "T" "D" "T" "D" "Q" "G" "P" "P" "T" "T"
## [361] "D" "G" "H" "N" "I" "A" "S" "P" "I" "F" "N" "S" "D" "N" "S" "C" "S" "G"
## [379] "G" "W" "V" "C" "E" "H" "R" "W" "R" "Q" "I" "Y" "N" "M" "V" "A" "F" "R"
## [397] "N" "T" "V" "G" "L" "D" "E" "I" "Q" "N" "W" "W" "D" "N" "G" "S" "N" "Q"
## [415] "I" "S" "F" "S" "R" "G" "S" "R" "G" "F" "V" "A" "F" "N" "N" "D" "N" "Y"
## [433] "D" "L" "N" "S" "S" "L" "Q" "T" "G" "L" "P" "A" "G" "T" "Y" "C" "D" "V"
## [451] "I" "S" "G" "S" "K" "S" "G" "S" "S" "C" "T" "G" "K" "T" "V" "T" "V" "G"
## [469] "S" "D" "G" "R" "A" "S" "I" "Y" "I" "G" "S" "S" "E" "D" "D" "G" "V" "L"
## [487] "A" "I" "H" "V" "N" "A" "K" "L"
cattle_vector
##   [1] "M" "K" "F" "F" "L" "L" "L" "S" "V" "I" "V" "F" "C" "W" "A" "Q" "Y" "A"
##  [19] "P" "H" "T" "K" "T" "G" "R" "T" "S" "I" "V" "H" "L" "F" "E" "W" "R" "W"
##  [37] "V" "D" "I" "A" "L" "E" "C" "E" "R" "Y" "L" "A" "P" "K" "G" "F" "G" "G"
##  [55] "V" "Q" "I" "S" "P" "P" "S" "E" "N" "A" "V" "I" "T" "D" "P" "S" "R" "P"
##  [73] "W" "W" "E" "R" "Y" "Q" "P" "V" "S" "Y" "K" "L" "C" "T" "R" "S" "G" "N"
##  [91] "E" "S" "E" "F" "K" "D" "M" "V" "T" "R" "C" "N" "N" "V" "G" "V" "R" "I"
## [109] "Y" "V" "D" "A" "V" "I" "N" "H" "M" "T" "G" "S" "G" "V" "S" "A" "G" "T"
## [127] "S" "S" "T" "C" "G" "S" "Y" "F" "N" "P" "G" "T" "R" "D" "F" "P" "A" "V"
## [145] "P" "Y" "S" "G" "W" "D" "F" "N" "D" "E" "K" "C" "N" "T" "G" "N" "G" "E"
## [163] "I" "K" "S" "Y" "D" "V" "A" "Y" "Q" "V" "R" "D" "C" "R" "L" "V" "G" "L"
## [181] "L" "D" "L" "A" "L" "A" "K" "D" "Y" "V" "R" "S" "T" "V" "A" "E" "Y" "L"
## [199] "N" "R" "L" "I" "D" "I" "G" "V" "A" "G" "F" "R" "I" "D" "A" "S" "K" "H"
## [217] "M" "W" "P" "G" "D" "I" "K" "A" "V" "L" "D" "K" "L" "H" "N" "L" "N" "T"
## [235] "S" "W" "F" "P" "E" "G" "S" "R" "P" "F" "I" "Y" "Q" "E" "V" "I" "D" "L"
## [253] "G" "G" "E" "T" "I" "T" "S" "S" "D" "Y" "F" "G" "N" "G" "R" "V" "T" "E"
## [271] "F" "K" "Y" "G" "V" "K" "L" "G" "T" "V" "L" "R" "K" "W" "S" "G" "E" "K"
## [289] "M" "A" "Y" "L" "K" "N" "W" "G" "E" "G" "W" "G" "F" "M" "P" "S" "D" "R"
## [307] "A" "L" "V" "F" "V" "D" "N" "H" "D" "N" "Q" "R" "G" "H" "G" "A" "G" "G"
## [325] "A" "S" "I" "L" "T" "F" "W" "D" "A" "R" "L" "Y" "K" "M" "G" "V" "G" "F"
## [343] "M" "L" "A" "H" "P" "Y" "G" "F" "T" "R" "V" "M" "S" "S" "Y" "H" "W" "P"
## [361] "R" "H" "F" "E" "D" "G" "K" "D" "V" "N" "D" "W" "V" "G" "P" "P" "N" "N"
## [379] "N" "G" "V" "I" "K" "E" "V" "T" "I" "N" "P" "D" "T" "T" "C" "G" "N" "G"
## [397] "W" "V" "C" "E" "H" "R" "W" "R" "Q" "I" "R" "N" "M" "V" "I" "F" "R" "N"
## [415] "V" "V" "D" "G" "Q" "P" "F" "T" "N" "W" "W" "D" "N" "G" "S" "N" "Q" "V"
## [433] "A" "F" "G" "R" "G" "N" "K" "G" "F" "I" "V" "F" "N" "N" "D" "D" "W" "A"
## [451] "L" "S" "A" "T" "L" "Q" "T" "G" "L" "P" "P" "G" "T" "Y" "C" "D" "V" "I"
## [469] "S" "G" "D" "K" "I" "G" "D" "N" "C" "T" "V" "I" "E" "I" "N" "V" "S" "C"
## [487] "D" "G" "N" "A" "Y" "F" "S" "I" "S" "N" "S" "A" "E" "D" "P" "F" "I" "A"
## [505] "I" "H" "T" "E" "S" "K" "L"
data(package="Biostrings")
chimps_string <- paste(chimps_vector,collapse = "") 
human_string <- paste(human_vector,collapse = "" )
fruit_fly_string <- paste(fruit_fly_vector,collapse = "")
cattle_string <- paste(cattle_vector,collapse = "")
chimps_string   <- toupper(chimps_string)
human_string   <- toupper(human_string)
fruit_fly_string   <- toupper(fruit_fly_string)
cattle_string   <- toupper(cattle_string)



chimps_string
## [1] "MKFFLLLLTIGFCWAQYSPNTQQGRTSIVHLFEWRWVDIALECERYLAPKGFGGVQVSPPNENVAIYNPFRPWWERYQPVSYKLCTRSGNEDEFRNMVTRCNNVGVRIYVDAVINHMCGNAVSAGTSSTCGSYFNPGSRDFPAVPYSGWDFNDGKCKTGSGDIENYNDATQVRDCRLSGLLDLALEKDYVRSKIAEYMNHLIDIGVAGFRLDASKHMWPGDIKAILDKLHNLNSNWFPEGSKPFIYQEVIDLGGEPIKSNDYFGNGRVTEFKYGAKLGTVIRKWNGEKMSYLKNWGEGWGFIPSDRALVFVDNHDNQRGHGAGGASILTFWDARLYKMAVGFMLAHPYGFTRVMSSYRWPRQFQNGNDVNDWVGPPNNNGVIKEVTINPDTTCGNDWVCEHRWRQIRNMVNFRNVVDGQPFXNWYDNGXNQVXFGRGNRGFIVFNNDDWSFSLTLQTGLPAGTYCDVISGDKINGNCTGIKIYVSDDGKAHFSISNSAEDPFIAIHAESKL"
human_string 
## [1] "MKFFLLLFTIGFCWAQYSPNTQQGRTSIVHLFEWRWVDIALECERYLAPKGFGGVQVSPPNENVAIYNPFRPWWERYQPVSYKLCTRSGNEDEFRNMVTRCNNVGVRIYVDAVINHMCGNAVSAGTSSTCGSYFNPGSRDFPAVPYSGWDFNDGKCKTGSGDIENYNDATQVRDCRLTGLLDLALEKDYVRSKIAEYMNHLIDIGVAGFRLDASKHMWPGDIKAILDKLHNLNSNWFPAGSKPFIYQEVIDLGGEPIKSSDYFGNGRVTEFKYGAKLGTVIRKWNGEKMSYLKNWGEGWGFVPSDRALVFVDNHDNQRGHGAGGASILTFWDARLYKMAVGFMLAHPYGFTRVMSSYRWPRQFQNGNDVNDWVGPPNNNGVIKEVTINPDTTCGNDWVCEHRWRQIRNMVIFRNVVDGQPFTNWYDNGSNQVAFGRGNRGFIVFNNDDWSFSLTLQTGLPAGTYCDVISGDKINGNCTGIKIYVSDDGKAHFSISNSAEDPFIAIHAESKL"
fruit_fly_string 
## [1] "MFLAKSIVCLALLAVANAQFDTNYASGRSGMVHLFEWKWDDIAAECENFLGPNGYAGVQVSPVNENAVKDSRPWWERYQPISYKLETRSGNEEQFASMVKRCNAVGVRTYVDVVFNHMAADGGTYGTGGSTASPSSKSYPGVPYSSLDFNPTCAISNYNDANEVRNCELVGLRDLNQGNSYVQDKVVEFLDHLIDLGVAGFRVDAAKHMWPADLAVIYGRLKNLNTDHGFASGSKAYIVQEVIDMGGEAISKSEYTGLGAITEFRHSDSIGKVFRGKDQLQYLTNWGTAWGFAASDRSLVFVDNHDNQRGHGAGGADVLTYKVPKQYKMASAFMLAHPFGTPRVMSSFSFTDTDQGPPTTDGHNIASPIFNSDNSCSGGWVCEHRWRQIYNMVAFRNTVGLDEIQNWWDNGSNQISFSRGSRGFVAFNNDNYDLNSSLQTGLPAGTYCDVISGSKSGSSCTGKTVTVGSDGRASIYIGSSEDDGVLAIHVNAKL"
cattle_string 
## [1] "MKFFLLLSVIVFCWAQYAPHTKTGRTSIVHLFEWRWVDIALECERYLAPKGFGGVQISPPSENAVITDPSRPWWERYQPVSYKLCTRSGNESEFKDMVTRCNNVGVRIYVDAVINHMTGSGVSAGTSSTCGSYFNPGTRDFPAVPYSGWDFNDEKCNTGNGEIKSYDVAYQVRDCRLVGLLDLALAKDYVRSTVAEYLNRLIDIGVAGFRIDASKHMWPGDIKAVLDKLHNLNTSWFPEGSRPFIYQEVIDLGGETITSSDYFGNGRVTEFKYGVKLGTVLRKWSGEKMAYLKNWGEGWGFMPSDRALVFVDNHDNQRGHGAGGASILTFWDARLYKMGVGFMLAHPYGFTRVMSSYHWPRHFEDGKDVNDWVGPPNNNGVIKEVTINPDTTCGNGWVCEHRWRQIRNMVIFRNVVDGQPFTNWWDNGSNQVAFGRGNKGFIVFNNDDWALSATLQTGLPPGTYCDVISGDKIGDNCTVIEINVSCDGNAYFSISNSAEDPFIAIHTESKL"
data(BLOSUM50)
#CHIMPS VS OTHER

chimps_vs_human <- Biostrings::pairwiseAlignment(chimps_string, 
                                               human_string,
                                               substitutionMatrix = BLOSUM50, 
                                               gapOpening = -8, 
                                               gapExtension = -2, 
                                               scoreOnly = FALSE)
chimps_vs_fruit_fly <- Biostrings::pairwiseAlignment(chimps_string, 
                                               fruit_fly_string,
                                               substitutionMatrix = BLOSUM50, 
                                               gapOpening = -8, 
                                               gapExtension = -2, 
                                               scoreOnly = FALSE)
chimps_vs_cattle <- Biostrings::pairwiseAlignment(chimps_string, 
                                               cattle_string ,
                                               substitutionMatrix = BLOSUM50, 
                                               gapOpening = -8, 
                                               gapExtension = -2, 
                                               scoreOnly = FALSE)


chimps_vs_human
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MKFFLLLLTIGFCWAQYSPNTQQGRTSIVHLFEW...TGIKIYVSDDGKAHFSISNSAEDPFIAIHAESKL
## subject: MKFFLLLFTIGFCWAQYSPNTQQGRTSIVHLFEW...TGIKIYVSDDGKAHFSISNSAEDPFIAIHAESKL
## score: 3593
chimps_vs_fruit_fly
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: M---KFFLLLLTIGFCWAQYSPNTQQGRTSIVHL...TGIKIYVSDDGKAHFSISNSAEDPFIAIHAESKL
## subject: MFLAKSIVCLALLAVANAQFDTNYASGRSGMVHL...TGKTVTVGSDGRASIYIGSSEDDGVLAIHVNAKL
## score: 1859
chimps_vs_cattle
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MKFFLLLLTIGFCWAQYSPNTQQGRTSIVHLFEW...TGIKIYVSDDGKAHFSISNSAEDPFIAIHAESKL
## subject: MKFFLLLSVIVFCWAQYAPHTKTGRTSIVHLFEW...TVIEINVSCDGNAYFSISNSAEDPFIAIHTESKL
## score: 3167
pid(chimps_vs_human)
## [1] 98.23875
pid(chimps_vs_fruit_fly)
## [1] 52.90698
pid(chimps_vs_cattle)
## [1] 85.3229
#HUMAN VS OTHER


human_vs_fruit_fly <- Biostrings::pairwiseAlignment(human_string, 
                                               fruit_fly_string,
                                               substitutionMatrix = BLOSUM50, 
                                               gapOpening = -8, 
                                               gapExtension = -2, 
                                               scoreOnly = FALSE)
human_vs_cattle <- Biostrings::pairwiseAlignment(human_string, 
                                               cattle_string ,
                                               substitutionMatrix = BLOSUM50, 
                                               gapOpening = -8, 
                                               gapExtension = -2, 
                                               scoreOnly = FALSE)
human_vs_fruit_fly
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: M---KFFLLLFTIGFCWAQYSPNTQQGRTSIVHL...TGIKIYVSDDGKAHFSISNSAEDPFIAIHAESKL
## subject: MFLAKSIVCLALLAVANAQFDTNYASGRSGMVHL...TGKTVTVGSDGRASIYIGSSEDDGVLAIHVNAKL
## score: 1875
human_vs_cattle
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MKFFLLLFTIGFCWAQYSPNTQQGRTSIVHLFEW...TGIKIYVSDDGKAHFSISNSAEDPFIAIHAESKL
## subject: MKFFLLLSVIVFCWAQYAPHTKTGRTSIVHLFEW...TVIEINVSCDGNAYFSISNSAEDPFIAIHTESKL
## score: 3190
pid(human_vs_fruit_fly)
## [1] 53.29457
pid(human_vs_cattle)
## [1] 86.10568
#FRUIT FLY VS OTHER

fruit_fly_vs_cattle <- Biostrings::pairwiseAlignment(fruit_fly_string, 
                                               cattle_string ,
                                               substitutionMatrix = BLOSUM50, 
                                               gapOpening = -8, 
                                               gapExtension = -2, 
                                               scoreOnly = FALSE)
fruit_fly_vs_cattle
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MFLAKSIVCLALLAVANAQFDTNYASGRSGMVHL...TGKTVTVGSDGRASIYIGSSEDDGVLAIHVNAKL
## subject: M---KFFLLLSVIVFCWAQYAPHTKTGRTSIVHL...TVIEINVSCDGNAYFSISNSAEDPFIAIHTESKL
## score: 1855
pid(fruit_fly_vs_cattle)
## [1] 52.90698
pids <- c(1,          NA,     NA,     NA,
          98.23875,    1,     NA,     NA,
          52.90698, 53.29457,  1,     NA,
          85.3229,  86.10568, 52.90698, 1)

mat <- matrix(pids, nrow = 4, byrow = T)
row.names(mat) <- c("CHIMPS","HUMANS","FRUIT_FLIES","CATTLES")   
colnames(mat) <- c("CHIMPS","HUMANS","FRUIT_FLIES","CATTLES")   
pander::pander(mat)  
  CHIMPS HUMANS FRUIT_FLIES CATTLES
CHIMPS 1 NA NA NA
HUMANS 98.24 1 NA NA
FRUIT_FLIES 52.91 53.29 1 NA
CATTLES 85.32 86.11 52.91 1
#PID methods comparison
#chimps vs human


pid(chimps_vs_human, type = "PID1")
## [1] 98.23875
pid(chimps_vs_human, type = "PID2")
## [1] 98.23875
pid(chimps_vs_human, type = "PID3")
## [1] 98.23875
pid(chimps_vs_human, type = "PID4")
## [1] 98.23875
pids_comparison <- c("PID1",  98.23875, "(aligned_positions_PLUS_internal_gap_positions)",
                     "PID2",  98.23875, "(aligned_positions)",
                     "PID3",  98.23875, "(length_shorter_sequence)",
                     "PID4",  98.23875, "(average_length_of_the_two_sequences)")

mat <- matrix(pids_comparison, nrow = 4, byrow = T)
row.names(mat) <- c("1","2","3","4")   
colnames(mat) <- c("Method","PID","denominator")   
pander::pander(mat)  
Method PID denominator
PID1 98.23875 (aligned_positions_PLUS_internal_gap_positions)
PID2 98.23875 (aligned_positions)
PID3 98.23875 (length_shorter_sequence)
PID4 98.23875 (average_length_of_the_two_sequences)
#Multiple sequence alignment


a1 <- entrez_fetch(db = "protein", 
                          id = "NP_000690.1", 
                          rettype = "fasta")
a2 <- entrez_fetch(db = "protein", 
                          id = "NP_001103628.1", 
                          rettype = "fasta")
a3 <- entrez_fetch(db = "protein", 
                          id = "NP_001030188.1", 
                          rettype = "fasta")
a4 <- entrez_fetch(db = "protein", 
                          id = "NP_001008222.1", 
                          rettype = "fasta")
a5 <- entrez_fetch(db = "protein", 
                          id = "NP_001286518.1", 
                          rettype = "fasta")

a1 <- fasta_cleaner(a1,  parse = F)
a2 <- fasta_cleaner(a2,  parse = F)
a3 <- fasta_cleaner(a3,  parse = F)
a4 <- fasta_cleaner(a4,  parse = F)
a5 <- fasta_cleaner(a5,  parse = F)
TABLE <- c("NP_000690.1",     "Homo_sapiens",            "a1",
           "NP_001103628.1",  "Pan_troglodytes",         "a2",
           "NP_001030188.1",  "Bos_taurus",              "a3",
           "NP_001008222.1",  "Homo_sapiens",            "a4",
           "NP_001286518.1",  "Drosophila_melanogaster", "a5" )


TABLE_matrix <- matrix(TABLE,
                                  byrow = T,
                                  nrow = 5)

table <- data.frame(TABLE_matrix, 
                     stringsAsFactors = F)


names(table) <- c("accession", "name.orig","name.new")
  
table
##        accession               name.orig name.new
## 1    NP_000690.1            Homo_sapiens       a1
## 2 NP_001103628.1         Pan_troglodytes       a2
## 3 NP_001030188.1              Bos_taurus       a3
## 4 NP_001008222.1            Homo_sapiens       a4
## 5 NP_001286518.1 Drosophila_melanogaster       a5
table$accession
## [1] "NP_000690.1"    "NP_001103628.1" "NP_001030188.1" "NP_001008222.1"
## [5] "NP_001286518.1"
LIST <- entrez_fetch(db = "protein", 
                          id = table$accession, 
                          rettype = "fasta")
cat(LIST)
## >NP_000690.1 pancreatic alpha-amylase precursor [Homo sapiens]
## MKFFLLLFTIGFCWAQYSPNTQQGRTSIVHLFEWRWVDIALECERYLAPKGFGGVQVSPPNENVAIYNPF
## RPWWERYQPVSYKLCTRSGNEDEFRNMVTRCNNVGVRIYVDAVINHMCGNAVSAGTSSTCGSYFNPGSRD
## FPAVPYSGWDFNDGKCKTGSGDIENYNDATQVRDCRLTGLLDLALEKDYVRSKIAEYMNHLIDIGVAGFR
## LDASKHMWPGDIKAILDKLHNLNSNWFPAGSKPFIYQEVIDLGGEPIKSSDYFGNGRVTEFKYGAKLGTV
## IRKWNGEKMSYLKNWGEGWGFVPSDRALVFVDNHDNQRGHGAGGASILTFWDARLYKMAVGFMLAHPYGF
## TRVMSSYRWPRQFQNGNDVNDWVGPPNNNGVIKEVTINPDTTCGNDWVCEHRWRQIRNMVIFRNVVDGQP
## FTNWYDNGSNQVAFGRGNRGFIVFNNDDWSFSLTLQTGLPAGTYCDVISGDKINGNCTGIKIYVSDDGKA
## HFSISNSAEDPFIAIHAESKL
## 
## >NP_001103628.1 pancreatic alpha-amylase precursor [Pan troglodytes]
## MKFFLLLLTIGFCWAQYSPNTQQGRTSIVHLFEWRWVDIALECERYLAPKGFGGVQVSPPNENVAIYNPF
## RPWWERYQPVSYKLCTRSGNEDEFRNMVTRCNNVGVRIYVDAVINHMCGNAVSAGTSSTCGSYFNPGSRD
## FPAVPYSGWDFNDGKCKTGSGDIENYNDATQVRDCRLSGLLDLALEKDYVRSKIAEYMNHLIDIGVAGFR
## LDASKHMWPGDIKAILDKLHNLNSNWFPEGSKPFIYQEVIDLGGEPIKSNDYFGNGRVTEFKYGAKLGTV
## IRKWNGEKMSYLKNWGEGWGFIPSDRALVFVDNHDNQRGHGAGGASILTFWDARLYKMAVGFMLAHPYGF
## TRVMSSYRWPRQFQNGNDVNDWVGPPNNNGVIKEVTINPDTTCGNDWVCEHRWRQIRNMVNFRNVVDGQP
## FXNWYDNGXNQVXFGRGNRGFIVFNNDDWSFSLTLQTGLPAGTYCDVISGDKINGNCTGIKIYVSDDGKA
## HFSISNSAEDPFIAIHAESKL
## 
## >NP_001030188.1 alpha-amylase 2B precursor [Bos taurus]
## MKFFLLLSVIVFCWAQYAPHTKTGRTSIVHLFEWRWVDIALECERYLAPKGFGGVQISPPSENAVITDPS
## RPWWERYQPVSYKLCTRSGNESEFKDMVTRCNNVGVRIYVDAVINHMTGSGVSAGTSSTCGSYFNPGTRD
## FPAVPYSGWDFNDEKCNTGNGEIKSYDVAYQVRDCRLVGLLDLALAKDYVRSTVAEYLNRLIDIGVAGFR
## IDASKHMWPGDIKAVLDKLHNLNTSWFPEGSRPFIYQEVIDLGGETITSSDYFGNGRVTEFKYGVKLGTV
## LRKWSGEKMAYLKNWGEGWGFMPSDRALVFVDNHDNQRGHGAGGASILTFWDARLYKMGVGFMLAHPYGF
## TRVMSSYHWPRHFEDGKDVNDWVGPPNNNGVIKEVTINPDTTCGNGWVCEHRWRQIRNMVIFRNVVDGQP
## FTNWWDNGSNQVAFGRGNKGFIVFNNDDWALSATLQTGLPPGTYCDVISGDKIGDNCTVIEINVSCDGNA
## YFSISNSAEDPFIAIHTESKL
## 
## >NP_001008222.1 alpha-amylase 1A precursor [Homo sapiens]
## MKLFWLLFTIGFCWAQYSSNTQQGRTSIVHLFEWRWVDIALECERYLAPKGFGGVQVSPPNENVAIHNPF
## RPWWERYQPVSYKLCTRSGNEDEFRNMVTRCNNVGVRIYVDAVINHMCGNAVSAGTSSTCGSYFNPGSRD
## FPAVPYSGWDFNDGKCKTGSGDIENYNDATQVRDCRLSGLLDLALGKDYVRSKIAEYMNHLIDIGVAGFR
## IDASKHMWPGDIKAILDKLHNLNSNWFPEGSKPFIYQEVIDLGGEPIKSSDYFGNGRVTEFKYGAKLGTV
## IRKWNGEKMSYLKNWGEGWGFMPSDRALVFVDNHDNQRGHGAGGASILTFWDARLYKMAVGFMLAHPYGF
## TRVMSSYRWPRYFENGKDVNDWVGPPNDNGVTKEVTINPDTTCGNDWVCEHRWRQIRNMVNFRNVVDGQP
## FTNWYDNGSNQVAFGRGNRGFIVFNNDDWTFSLTLQTGLPAGTYCDVISGDKINGNCTGIKIYVSDDGKA
## HFSISNSAEDPFIAIHAESKL
## 
## >NP_001286518.1 amylase proximal, isoform B [Drosophila melanogaster]
## MFLAKSIVCLALLAVANAQFDTNYASGRSGMVHLFEWKWDDIAAECENFLGPNGYAGVQVSPVNENAVKD
## SRPWWERYQPISYKLETRSGNEEQFASMVKRCNAVGVRTYVDVVFNHMAADGGTYGTGGSTASPSSKSYP
## GVPYSSLDFNPTCAISNYNDANEVRNCELVGLRDLNQGNSYVQDKVVEFLDHLIDLGVAGFRVDAAKHMW
## PADLAVIYGRLKNLNTDHGFASGSKAYIVQEVIDMGGEAISKSEYTGLGAITEFRHSDSIGKVFRGKDQL
## QYLTNWGTAWGFAASDRSLVFVDNHDNQRGHGAGGADVLTYKVPKQYKMASAFMLAHPFGTPRVMSSFSF
## TDTDQGPPTTDGHNIASPIFNSDNSCSGGWVCEHRWRQIYNMVAFRNTVGLDEIQNWWDNGSNQISFSRG
## SRGFVAFNNDNYDLNSSLQTGLPAGTYCDVISGSKSGSSCTGKTVTVGSDGRASIYIGSSEDDGVLAIHV
## NAKL
entrez_fetch_list <- function(db, id, rettype, ...){

  #setup list for storing output
  n.seq <- length(id)
  list.output <- as.list(rep(NA, n.seq))
  names(list.output) <- id

  # get output
  for(i in 1:length(id)){
    list.output[[i]] <- rentrez::entrez_fetch(db = db,
                                              id = id[i],
                                              rettype = rettype)
  }


  return(list.output)
}

list <- entrez_fetch_list(db = "protein", 
                          id =table$accession, 
                          rettype = "fasta")

list[[1]]
## [1] ">NP_000690.1 pancreatic alpha-amylase precursor [Homo sapiens]\nMKFFLLLFTIGFCWAQYSPNTQQGRTSIVHLFEWRWVDIALECERYLAPKGFGGVQVSPPNENVAIYNPF\nRPWWERYQPVSYKLCTRSGNEDEFRNMVTRCNNVGVRIYVDAVINHMCGNAVSAGTSSTCGSYFNPGSRD\nFPAVPYSGWDFNDGKCKTGSGDIENYNDATQVRDCRLTGLLDLALEKDYVRSKIAEYMNHLIDIGVAGFR\nLDASKHMWPGDIKAILDKLHNLNSNWFPAGSKPFIYQEVIDLGGEPIKSSDYFGNGRVTEFKYGAKLGTV\nIRKWNGEKMSYLKNWGEGWGFVPSDRALVFVDNHDNQRGHGAGGASILTFWDARLYKMAVGFMLAHPYGF\nTRVMSSYRWPRQFQNGNDVNDWVGPPNNNGVIKEVTINPDTTCGNDWVCEHRWRQIRNMVIFRNVVDGQP\nFTNWYDNGSNQVAFGRGNRGFIVFNNDDWSFSLTLQTGLPAGTYCDVISGDKINGNCTGIKIYVSDDGKA\nHFSISNSAEDPFIAIHAESKL\n\n"
list[[1]] <- fasta_cleaner(list[[1]], parse = F)
list[[2]] <- fasta_cleaner(list[[2]], parse = F)
list[[3]] <- fasta_cleaner(list[[3]], parse = F)
list[[4]] <- fasta_cleaner(list[[4]], parse = F)
list[[5]] <- fasta_cleaner(list[[5]], parse = F)

length(list)
## [1] 5
list_vector <- rep(NA, length(list))
list_vector
## [1] NA NA NA NA NA
for(i in 1:length(list_vector)){
  list_vector[i] <- list[[i]]}

names(list_vector) <- names(list)

list_vector_ss <- Biostrings::AAStringSet(list_vector)

list_align <- msa(list_vector_ss,
                     method = "ClustalW")
## use default substitution matrix
list_align
## CLUSTAL 2.1  
## 
## Call:
##    msa(list_vector_ss, method = "ClustalW")
## 
## MsaAAMultipleAlignment with 5 rows and 516 columns
##     aln                                                    names
## [1] ---MKFFLLLFTIGFCWAQYSPNTQQ...DGKAHFSISNSAEDPFIAIHAESKL NP_000690.1
## [2] ---MKFFLLLLTIGFCWAQYSPNTQQ...DGKAHFSISNSAEDPFIAIHAESKL NP_001103628.1
## [3] ---MKLFWLLFTIGFCWAQYSSNTQQ...DGKAHFSISNSAEDPFIAIHAESKL NP_001008222.1
## [4] ---MKFFLLLSVIVFCWAQYAPHTKT...DGNAYFSISNSAEDPFIAIHTESKL NP_001030188.1
## [5] MFLAKSIVCLALLAVANAQFDTNYAS...DGRASIYIGSSEDDGVLAIHVNAKL NP_001286518.1
## Con ---MKFFLLL?TIGFCWAQYSPNTQQ...DGKAHFSISNSAEDPFIAIHAESKL Consensus
class(list_align)
## [1] "MsaAAMultipleAlignment"
## attr(,"package")
## [1] "msa"
class(list_align) <- "AAMultipleAlignment"
list_align_seqinr <- msaConvert(list_align, 
                                   type = "seqinr::alignment")

compbio4all::print_msa(alignment = list_align_seqinr, 
          chunksize = 60)
## [1] "---MKFFLLLFTIGFCWAQYSPNTQQGRTSIVHLFEWRWVDIALECERYLAPKGFGGVQV 0"
## [1] "---MKFFLLLLTIGFCWAQYSPNTQQGRTSIVHLFEWRWVDIALECERYLAPKGFGGVQV 0"
## [1] "---MKLFWLLFTIGFCWAQYSSNTQQGRTSIVHLFEWRWVDIALECERYLAPKGFGGVQV 0"
## [1] "---MKFFLLLSVIVFCWAQYAPHTKTGRTSIVHLFEWRWVDIALECERYLAPKGFGGVQI 0"
## [1] "MFLAKSIVCLALLAVANAQFDTNYASGRSGMVHLFEWKWDDIAAECENFLGPNGYAGVQV 0"
## [1] " "
## [1] "SPPNENVAIYNPFRPWWERYQPVSYKLCTRSGNEDEFRNMVTRCNNVGVRIYVDAVINHM 0"
## [1] "SPPNENVAIYNPFRPWWERYQPVSYKLCTRSGNEDEFRNMVTRCNNVGVRIYVDAVINHM 0"
## [1] "SPPNENVAIHNPFRPWWERYQPVSYKLCTRSGNEDEFRNMVTRCNNVGVRIYVDAVINHM 0"
## [1] "SPPSENAVITDPSRPWWERYQPVSYKLCTRSGNESEFKDMVTRCNNVGVRIYVDAVINHM 0"
## [1] "SPVNENAVKDS--RPWWERYQPISYKLETRSGNEEQFASMVKRCNAVGVRTYVDVVFNHM 0"
## [1] " "
## [1] "CGNAVSAGTSSTCGSYFNPGSRDFPAVPYSGWDFNDGKCKTGSGDIENYNDATQVRDCRL 0"
## [1] "CGNAVSAGTSSTCGSYFNPGSRDFPAVPYSGWDFNDGKCKTGSGDIENYNDATQVRDCRL 0"
## [1] "CGNAVSAGTSSTCGSYFNPGSRDFPAVPYSGWDFNDGKCKTGSGDIENYNDATQVRDCRL 0"
## [1] "TGSGVSAGTSSTCGSYFNPGTRDFPAVPYSGWDFNDEKCNTGNGEIKSYDVAYQVRDCRL 0"
## [1] "AADG---GTYGTGGSTASPSSKSYPGVPYSSLDFN------PTCAISNYNDANEVRNCEL 0"
## [1] " "
## [1] "TGLLDLALEKDYVRSKIAEYMNHLIDIGVAGFRLDASKHMWPGDIKAILDKLHNLNSNW- 0"
## [1] "SGLLDLALEKDYVRSKIAEYMNHLIDIGVAGFRLDASKHMWPGDIKAILDKLHNLNSNW- 0"
## [1] "SGLLDLALGKDYVRSKIAEYMNHLIDIGVAGFRIDASKHMWPGDIKAILDKLHNLNSNW- 0"
## [1] "VGLLDLALAKDYVRSTVAEYLNRLIDIGVAGFRIDASKHMWPGDIKAVLDKLHNLNTSW- 0"
## [1] "VGLRDLNQGNSYVQDKVVEFLDHLIDLGVAGFRVDAAKHMWPADLAVIYGRLKNLNTDHG 0"
## [1] " "
## [1] "FPAGSKPFIYQEVIDLGGEPIKSSDYFGNGRVTEFKYGAKLGTVIRKWNGEKMSYLKNWG 0"
## [1] "FPEGSKPFIYQEVIDLGGEPIKSNDYFGNGRVTEFKYGAKLGTVIRKWNGEKMSYLKNWG 0"
## [1] "FPEGSKPFIYQEVIDLGGEPIKSSDYFGNGRVTEFKYGAKLGTVIRKWNGEKMSYLKNWG 0"
## [1] "FPEGSRPFIYQEVIDLGGETITSSDYFGNGRVTEFKYGVKLGTVLRKWSGEKMAYLKNWG 0"
## [1] "FASGSKAYIVQEVIDMGGEAISKSEYTGLGAITEFRHSDSIGKVFR--GKDQLQYLTNWG 0"
## [1] " "
## [1] "EGWGFVPSDRALVFVDNHDNQRGHGAGGASILTFWDARLYKMAVGFMLAHPYGFTRVMSS 0"
## [1] "EGWGFIPSDRALVFVDNHDNQRGHGAGGASILTFWDARLYKMAVGFMLAHPYGFTRVMSS 0"
## [1] "EGWGFMPSDRALVFVDNHDNQRGHGAGGASILTFWDARLYKMAVGFMLAHPYGFTRVMSS 0"
## [1] "EGWGFMPSDRALVFVDNHDNQRGHGAGGASILTFWDARLYKMGVGFMLAHPYGFTRVMSS 0"
## [1] "TAWGFAASDRSLVFVDNHDNQRGHGAGGADVLTYKVPKQYKMASAFMLAHPFGTPRVMSS 0"
## [1] " "
## [1] "YRWPRQFQNGNDVNDWVGPPNNNG-VIKEVTINPDTTCGNDWVCEHRWRQIRNMVIFRNV 0"
## [1] "YRWPRQFQNGNDVNDWVGPPNNNG-VIKEVTINPDTTCGNDWVCEHRWRQIRNMVNFRNV 0"
## [1] "YRWPRYFENGKDVNDWVGPPNDNG-VTKEVTINPDTTCGNDWVCEHRWRQIRNMVNFRNV 0"
## [1] "YHWPRHFEDGKDVNDWVGPPNNNG-VIKEVTINPDTTCGNGWVCEHRWRQIRNMVIFRNV 0"
## [1] "FSFTDTDQ---------GPPTTDGHNIASPIFNSDNSCSGGWVCEHRWRQIYNMVAFRNT 0"
## [1] " "
## [1] "VDGQPFTNWYDNGSNQVAFGRGNRGFIVFNNDDWSFSLTLQTGLPAGTYCDVISGDKING 0"
## [1] "VDGQPFXNWYDNGXNQVXFGRGNRGFIVFNNDDWSFSLTLQTGLPAGTYCDVISGDKING 0"
## [1] "VDGQPFTNWYDNGSNQVAFGRGNRGFIVFNNDDWTFSLTLQTGLPAGTYCDVISGDKING 0"
## [1] "VDGQPFTNWWDNGSNQVAFGRGNKGFIVFNNDDWALSATLQTGLPPGTYCDVISGDKIGD 0"
## [1] "VGLDEIQNWWDNGSNQISFSRGSRGFVAFNNDNYDLNSSLQTGLPAGTYCDVISGSKSGS 0"
## [1] " "
## [1] "NCTGIKIYVSDDGKAHFSISNSAEDPFIAIHAESKL 24"
## [1] "NCTGIKIYVSDDGKAHFSISNSAEDPFIAIHAESKL 24"
## [1] "NCTGIKIYVSDDGKAHFSISNSAEDPFIAIHAESKL 24"
## [1] "NCTVIEINVSCDGNAYFSISNSAEDPFIAIHTESKL 24"
## [1] "SCTGKTVTVGSDGRASIYIGSSEDDGVLAIHVNAKL 24"
## [1] " "
class(list_align) <- "AAMultipleAlignment"

ggmsa::ggmsa(list_align,   
      start = 1, 
      end = 50) 

#Distance matrix

list_subset_dist <- seqinr::dist.alignment(list_align_seqinr, 
                                       matrix = "identity")
list_subset_dist
##                NP_000690.1 NP_001103628.1 NP_001008222.1 NP_001030188.1
## NP_001103628.1   0.1086785                                             
## NP_001008222.1   0.1769496      0.1718358                              
## NP_001030188.1   0.3727509      0.3764735      0.3753667               
## NP_001286518.1   0.6676884      0.6682081      0.6692180      0.6753019
#Phylogenetic tree for all sequences


tree_subset <- nj(list_subset_dist)

#rooted
plot.phylo(tree_subset, main="Phylogenetic Tree", 
            use.edge.length = F)
mtext(text = "AMY1 family gene tree - rooted, no branch lenths")

#unrooted
plot.phylo(tree_subset, main="Phylogenetic Tree", 
            type = "unrooted", 
            use.edge.length = F)
mtext(text = "AMY1 family gene tree - UNrooted, no branch lengths")