This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.
#Introduction Amylases are secreted proteins that hydrolyze 1,4-alpha-glucoside bonds in oligosaccharides and polysaccharides, and thus catalyze the first step in digestion of dietary starch and glycogen. The human genome has a cluster of several amylase genes that are expressed at high levels in either salivary gland or pancreas. This gene encodes an amylase isoenzyme produced by the salivary gland. Alternative splicing results in multiple transcript variants encoding the same protein.
#Resources/ References: Refseq gene: https://www.ncbi.nlm.nih.gov/gene/11722 Homologene: https://www.ncbi.nlm.nih.gov/homologene/20179 UniProt: https://www.uniprot.org/uniprot/P00687 PDB: https://www.rcsb.org/structure/1RPK
library(BiocManager)
install("drawProteins")
## Bioconductor version 3.14 (BiocManager 1.30.16), R 4.1.1 (2021-08-10)
## Warning: package(s) not installed when version(s) same as current; use `force = TRUE` to
## re-install: 'drawProteins'
## Old packages: 'glue', 'RcppArmadillo'
library(drawProteins)
# github packages
library(compbio4all)
library(ggmsa)
## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
## ggmsa v1.0.0 Document: http://yulab-smu.top/ggmsa/
##
## If you use ggmsa in published research, please cite: DOI: 10.18129/B9.bioc.ggmsa
# CRAN packages
library(rentrez)
library(seqinr)
library(ape)
##
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
##
## as.alignment, consensus
library(pander)
library(ggplot2)
library(msa)
## Loading required package: Biostrings
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Warning: package 'S4Vectors' was built under R version 4.1.2
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: XVector
## Loading required package: GenomeInfoDb
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
##
## complement
## The following object is masked from 'package:seqinr':
##
## translate
## The following object is masked from 'package:base':
##
## strsplit
##
## Attaching package: 'msa'
## The following object is masked from 'package:BiocManager':
##
## version
library(drawProteins)
## Biostrings
library(Biostrings)
library(HGNChelper)
amy1_table <- c(
"NP_000690.1", "P04746", "1B2Y", "Homo sapiens", "Human", "AMY2A",
"NP_001103628.1", "NA", "NA", "Pan troglodytes", "chimpanzee", "AMY2A",
"XP_001109826.1", "NA", "NA", "Macaca mulatta", "Rhesus monkey", "AMY2A",
"XP_005642408.1", "J9PAL7", "NA", "Canis lupus familiaris", "dog", "AMY2A",
"XP_537942.2", "J9PAL7", "NA", "Canis lupus familiaris", "dog", "AMY2B",
"NP_001030188.1", "Q3MHH8", "NA", "Bos taurus", "cattle", "AMY2B",
"XP_002686210.1", "F1MP21", "NA", "Bos taurus", "cattle", "AMY2A",
"ENSP00000359100","P0DUB6", "NA", "Homo sapiens", "human", "AMY1A",
"NP_001286518.1", "P08144", "NA", "Drosophila melanogaster", "Fruit Fly", "AMY-P",
"XP_021013165.1", "A0A6P5PER8", "NA", "Mus caroli", "Ryukyu mouse", "AMY1")
#make a table
amy1_table <- matrix(data = amy1_table, nrow = 10, ncol = 6, byrow = TRUE, dimnames = NULL)
amy1_table
## [,1] [,2] [,3] [,4]
## [1,] "NP_000690.1" "P04746" "1B2Y" "Homo sapiens"
## [2,] "NP_001103628.1" "NA" "NA" "Pan troglodytes"
## [3,] "XP_001109826.1" "NA" "NA" "Macaca mulatta"
## [4,] "XP_005642408.1" "J9PAL7" "NA" "Canis lupus familiaris"
## [5,] "XP_537942.2" "J9PAL7" "NA" "Canis lupus familiaris"
## [6,] "NP_001030188.1" "Q3MHH8" "NA" "Bos taurus"
## [7,] "XP_002686210.1" "F1MP21" "NA" "Bos taurus"
## [8,] "ENSP00000359100" "P0DUB6" "NA" "Homo sapiens"
## [9,] "NP_001286518.1" "P08144" "NA" "Drosophila melanogaster"
## [10,] "XP_021013165.1" "A0A6P5PER8" "NA" "Mus caroli"
## [,5] [,6]
## [1,] "Human" "AMY2A"
## [2,] "chimpanzee" "AMY2A"
## [3,] "Rhesus monkey" "AMY2A"
## [4,] "dog" "AMY2A"
## [5,] "dog" "AMY2B"
## [6,] "cattle" "AMY2B"
## [7,] "cattle" "AMY2A"
## [8,] "human" "AMY1A"
## [9,] "Fruit Fly" "AMY-P"
## [10,] "Ryukyu mouse" "AMY1"
length(amy1_table)
## [1] 60
amy1_table[[1]]
## [1] "NP_000690.1"
##[1] ">NP_031472.2 alpha-amylase 1 precursor [Mus musculus] MKFFLLLSLIGFCWAQYDPHTQYGRTAIVHLFEWRWVDIAKECERYLAPNGFAGVQVSPPNENIVVHSPS RPWWERYQPISYKICSRSGNEDEFRDMVNRCNNVGVRIYVDAVINHMCGVGAQAGQSSTCGSYFNPNNRD FPGVPYSGFDFNDGKCRTASGGIENYQDAAQVRDCRLSGLLDLALEKDYVRTKVADYMNHLIDIGVAGFR LDASKHMWPGDIKAILDKLHNLNTKWFSQGSRPFIFQEVIDLGGEAVSSNEYFGNGRVTEFKYGAKLGKV MRKWDGEKMSYLKNWGEGWGLMPSDRALVFVDNHDNQRGHGAGGASILTFWDARLYKMAVGFMLAHPYGF TRVMSSYYWPRNFQNGKDVNDWVGPPNNNGKTKEVSINPDSTCGNDWICEHRWRQIRNMVAFRNVVNGQP FANWWDNDSNQVAFGRGNKGFIVFNNDDWALSETLQTGLPAGTYCDVISGDKVDGNCTGIKVYVGNDGKAHFSISNSAEDPFIAIHAESKI"
for(i in 1:length(amy1_table)){
amy1_table[[i]] <- compbio4all::fasta_cleaner(amy1_table[[i]], parse = F)
}
#Protein diagram
P00687_json <- drawProteins::get_features("P00687")
## [1] "Download has worked"
is(P00687_json)
## [1] "list" "vector" "list_OR_List" "vector_OR_Vector"
## [5] "vector_OR_factor"
my_prot_df <- drawProteins::feature_to_dataframe(P00687_json)
is(my_prot_df)
## [1] "data.frame" "list" "oldClass" "vector"
## [5] "list_OR_List" "vector_OR_Vector" "vector_OR_factor"
my_prot_df[,-2]
## type begin end length accession entryName taxid order
## featuresTemp SIGNAL 1 15 14 P00687 AMY1_MOUSE 10090 1
## featuresTemp.1 CHAIN 16 511 495 P00687 AMY1_MOUSE 10090 1
## featuresTemp.2 ACT_SITE 212 212 0 P00687 AMY1_MOUSE 10090 1
## featuresTemp.3 ACT_SITE 248 248 0 P00687 AMY1_MOUSE 10090 1
## featuresTemp.4 METAL 115 115 0 P00687 AMY1_MOUSE 10090 1
## featuresTemp.5 METAL 173 173 0 P00687 AMY1_MOUSE 10090 1
## featuresTemp.6 METAL 182 182 0 P00687 AMY1_MOUSE 10090 1
## featuresTemp.7 METAL 216 216 0 P00687 AMY1_MOUSE 10090 1
## featuresTemp.8 BINDING 210 210 0 P00687 AMY1_MOUSE 10090 1
## featuresTemp.9 BINDING 313 313 0 P00687 AMY1_MOUSE 10090 1
## featuresTemp.10 BINDING 352 352 0 P00687 AMY1_MOUSE 10090 1
## featuresTemp.11 SITE 315 315 0 P00687 AMY1_MOUSE 10090 1
## featuresTemp.12 MOD_RES 16 16 0 P00687 AMY1_MOUSE 10090 1
## featuresTemp.13 DISULFID 43 101 58 P00687 AMY1_MOUSE 10090 1
## featuresTemp.14 DISULFID 85 130 45 P00687 AMY1_MOUSE 10090 1
## featuresTemp.15 DISULFID 156 175 19 P00687 AMY1_MOUSE 10090 1
## featuresTemp.16 DISULFID 393 399 6 P00687 AMY1_MOUSE 10090 1
## featuresTemp.17 DISULFID 465 477 12 P00687 AMY1_MOUSE 10090 1
## featuresTemp.18 CONFLICT 29 29 0 P00687 AMY1_MOUSE 10090 1
## featuresTemp.19 CONFLICT 229 229 0 P00687 AMY1_MOUSE 10090 1
## featuresTemp.20 CONFLICT 441 441 0 P00687 AMY1_MOUSE 10090 1
my_canvas <- draw_canvas(my_prot_df)
my_canvas <- draw_chains(my_canvas, my_prot_df,
label_size = 2.5)
my_canvas <- draw_domains(my_canvas, my_prot_df)
my_canvas
#dot plot
P00687_FASTA <- rentrez::entrez_fetch(id = "P00687",
db = "protein",
rettype="fasta")
P00687_vector <- fasta_cleaner(P00687_FASTA)
P00687_FASTA_str <- fasta_cleaner(P00687_FASTA,
parse = F)
str(P00687_FASTA)
## chr ">sp|P00687.2|AMY1_MOUSE RecName: Full=Alpha-amylase 1; AltName: Full=1,4-alpha-D-glucan glucanohydrolase 1; Alt"| __truncated__
str(P00687_vector)
## chr [1:511] "M" "K" "F" "F" "L" "L" "L" "S" "L" "I" "G" "F" "C" "W" "A" ...
str(P00687_FASTA_str)
## chr "MKFFLLLSLIGFCWAQYDPHTQYGRTAIVHLFEWRWVDIAKECERYLAPNGFAGVQVSPPNENIVVHSPSRPWWERYQPISYKICSRSGNEDEFRDMVNRCNNVGVRIYVD"| __truncated__
align <- pairwiseAlignment(P00687_FASTA_str,
P00687_FASTA_str,
type = "global")
#Create a 2x2 panel to show different values for dotPlots settings
par(mfrow = c(2,2),
mar = c(0,0,2,1))
dotPlot(P00687_vector, P00687_vector,
wsize = 1,
nmatch = 1)
dotPlot(P00687_vector, P00687_vector,
wsize = 10,
nmatch = 1)
dotPlot(P00687_vector, P00687_vector,
wsize = 10,
nmatch = 5)
dotPlot(P00687_vector, P00687_vector,
wsize = 20,
nmatch = 5)
par(mfrow = c(1,1),
mar = c(4,4,4,4))
#Create a single large plot with the best version of the plot
dotPlot(P00687_vector, P00687_vector,
wsize = 1,
nmatch = 1)
#Protein properties compiled from databases
1. Pfham:http://pfam.xfam.org/protein/P00687 the entire gene have 2 domains that are "Alpha-amylase" and "Alpha-amylase_C"
2. DisProt: no information available
3. RepeatDB: no information available
4. UniProt sub-cellular locations: Extracellular region or secreted
5. PDB secondary structural location: no PDB entry available
#Protein feature prediction
aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
"L","K","M","F","P","S","T","W","Y","V")
alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91,
221, 249, 48, 123, 82, 122, 119, 33, 63, 167)
beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120,
177, 115, 16, 85, 127, 341, 253, 44, 110, 229)
a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
110, 112, 25, 52, 71, 126, 117, 30, 108, 123)
a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239,
339, 321, 91, 158, 188, 327, 238, 72, 130, 378)
data.frame(aa.1.1, alpha, beta, a.plus.b, a.div.b)
## aa.1.1 alpha beta a.plus.b a.div.b
## 1 A 285 203 175 361
## 2 R 53 67 78 146
## 3 N 97 139 120 183
## 4 D 163 121 111 244
## 5 C 22 75 74 63
## 6 Q 67 122 74 114
## 7 E 134 86 86 257
## 8 G 197 297 171 377
## 9 H 111 49 33 107
## 10 I 91 120 93 239
## 11 L 221 177 110 339
## 12 K 249 115 112 321
## 13 M 48 16 25 91
## 14 F 123 85 52 158
## 15 P 82 127 71 188
## 16 S 122 341 126 327
## 17 T 119 253 117 238
## 18 W 33 44 30 72
## 19 Y 63 110 108 130
## 20 V 167 229 123 378
alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)
aa.prop <- data.frame(alpha.prop,
beta.prop,
a.plus.b.prop,
a.div.b)
row.names(aa.prop) <- aa.1.1
aa.prop
## alpha.prop beta.prop a.plus.b.prop a.div.b
## A 0.116469146 0.073126801 0.09264161 0.08331410
## R 0.021659174 0.024135447 0.04129169 0.03369490
## N 0.039640376 0.050072046 0.06352567 0.04223402
## D 0.066612178 0.043587896 0.05876125 0.05631202
## C 0.008990601 0.027017291 0.03917417 0.01453958
## Q 0.027380466 0.043948127 0.03917417 0.02630972
## E 0.054760932 0.030979827 0.04552673 0.05931225
## G 0.080506743 0.106988473 0.09052409 0.08700669
## H 0.045361667 0.017651297 0.01746956 0.02469421
## I 0.037188394 0.043227666 0.04923240 0.05515809
## L 0.090314671 0.063760807 0.05823187 0.07823679
## K 0.101757254 0.041426513 0.05929063 0.07408262
## M 0.019615856 0.005763689 0.01323452 0.02100162
## F 0.050265631 0.030619597 0.02752779 0.03646434
## P 0.033510421 0.045749280 0.03758602 0.04338795
## S 0.049856968 0.122838617 0.06670196 0.07546734
## T 0.048630977 0.091138329 0.06193753 0.05492730
## W 0.013485901 0.015850144 0.01588142 0.01661666
## Y 0.025745811 0.039625360 0.05717311 0.03000231
## V 0.068246833 0.082492795 0.06511382 0.08723748
plot(aa.prop,panel = panel.smooth)
#Percent Identity Comparisons (PID)
#Chimps: NP_001103628.1
chimps_fasta <- rentrez::entrez_fetch(db = "protein",
id = "NP_001103628.1",
rettype = "fasta")
#Human: NP_000690.1
human_fasta <- rentrez::entrez_fetch(db = "protein",
id = "NP_000690.1",
rettype = "fasta")
#Fruit_Fly: NP_001286518.1
fruit_fly_fasta <- rentrez::entrez_fetch(db = "protein",
id = "NP_001286518.1",
rettype = "fasta")
#Cattle: NP_001030188.1
cattle_fasta <- rentrez::entrez_fetch(db = "protein",
id = "NP_001030188.1",
rettype = "fasta")
chimps_vector <- fasta_cleaner(chimps_fasta)
human_vector <- fasta_cleaner(human_fasta)
fruit_fly_vector <- fasta_cleaner(fruit_fly_fasta)
cattle_vector <- fasta_cleaner(cattle_fasta)
chimps_vector
## [1] "M" "K" "F" "F" "L" "L" "L" "L" "T" "I" "G" "F" "C" "W" "A" "Q" "Y" "S"
## [19] "P" "N" "T" "Q" "Q" "G" "R" "T" "S" "I" "V" "H" "L" "F" "E" "W" "R" "W"
## [37] "V" "D" "I" "A" "L" "E" "C" "E" "R" "Y" "L" "A" "P" "K" "G" "F" "G" "G"
## [55] "V" "Q" "V" "S" "P" "P" "N" "E" "N" "V" "A" "I" "Y" "N" "P" "F" "R" "P"
## [73] "W" "W" "E" "R" "Y" "Q" "P" "V" "S" "Y" "K" "L" "C" "T" "R" "S" "G" "N"
## [91] "E" "D" "E" "F" "R" "N" "M" "V" "T" "R" "C" "N" "N" "V" "G" "V" "R" "I"
## [109] "Y" "V" "D" "A" "V" "I" "N" "H" "M" "C" "G" "N" "A" "V" "S" "A" "G" "T"
## [127] "S" "S" "T" "C" "G" "S" "Y" "F" "N" "P" "G" "S" "R" "D" "F" "P" "A" "V"
## [145] "P" "Y" "S" "G" "W" "D" "F" "N" "D" "G" "K" "C" "K" "T" "G" "S" "G" "D"
## [163] "I" "E" "N" "Y" "N" "D" "A" "T" "Q" "V" "R" "D" "C" "R" "L" "S" "G" "L"
## [181] "L" "D" "L" "A" "L" "E" "K" "D" "Y" "V" "R" "S" "K" "I" "A" "E" "Y" "M"
## [199] "N" "H" "L" "I" "D" "I" "G" "V" "A" "G" "F" "R" "L" "D" "A" "S" "K" "H"
## [217] "M" "W" "P" "G" "D" "I" "K" "A" "I" "L" "D" "K" "L" "H" "N" "L" "N" "S"
## [235] "N" "W" "F" "P" "E" "G" "S" "K" "P" "F" "I" "Y" "Q" "E" "V" "I" "D" "L"
## [253] "G" "G" "E" "P" "I" "K" "S" "N" "D" "Y" "F" "G" "N" "G" "R" "V" "T" "E"
## [271] "F" "K" "Y" "G" "A" "K" "L" "G" "T" "V" "I" "R" "K" "W" "N" "G" "E" "K"
## [289] "M" "S" "Y" "L" "K" "N" "W" "G" "E" "G" "W" "G" "F" "I" "P" "S" "D" "R"
## [307] "A" "L" "V" "F" "V" "D" "N" "H" "D" "N" "Q" "R" "G" "H" "G" "A" "G" "G"
## [325] "A" "S" "I" "L" "T" "F" "W" "D" "A" "R" "L" "Y" "K" "M" "A" "V" "G" "F"
## [343] "M" "L" "A" "H" "P" "Y" "G" "F" "T" "R" "V" "M" "S" "S" "Y" "R" "W" "P"
## [361] "R" "Q" "F" "Q" "N" "G" "N" "D" "V" "N" "D" "W" "V" "G" "P" "P" "N" "N"
## [379] "N" "G" "V" "I" "K" "E" "V" "T" "I" "N" "P" "D" "T" "T" "C" "G" "N" "D"
## [397] "W" "V" "C" "E" "H" "R" "W" "R" "Q" "I" "R" "N" "M" "V" "N" "F" "R" "N"
## [415] "V" "V" "D" "G" "Q" "P" "F" "X" "N" "W" "Y" "D" "N" "G" "X" "N" "Q" "V"
## [433] "X" "F" "G" "R" "G" "N" "R" "G" "F" "I" "V" "F" "N" "N" "D" "D" "W" "S"
## [451] "F" "S" "L" "T" "L" "Q" "T" "G" "L" "P" "A" "G" "T" "Y" "C" "D" "V" "I"
## [469] "S" "G" "D" "K" "I" "N" "G" "N" "C" "T" "G" "I" "K" "I" "Y" "V" "S" "D"
## [487] "D" "G" "K" "A" "H" "F" "S" "I" "S" "N" "S" "A" "E" "D" "P" "F" "I" "A"
## [505] "I" "H" "A" "E" "S" "K" "L"
human_vector
## [1] "M" "K" "F" "F" "L" "L" "L" "F" "T" "I" "G" "F" "C" "W" "A" "Q" "Y" "S"
## [19] "P" "N" "T" "Q" "Q" "G" "R" "T" "S" "I" "V" "H" "L" "F" "E" "W" "R" "W"
## [37] "V" "D" "I" "A" "L" "E" "C" "E" "R" "Y" "L" "A" "P" "K" "G" "F" "G" "G"
## [55] "V" "Q" "V" "S" "P" "P" "N" "E" "N" "V" "A" "I" "Y" "N" "P" "F" "R" "P"
## [73] "W" "W" "E" "R" "Y" "Q" "P" "V" "S" "Y" "K" "L" "C" "T" "R" "S" "G" "N"
## [91] "E" "D" "E" "F" "R" "N" "M" "V" "T" "R" "C" "N" "N" "V" "G" "V" "R" "I"
## [109] "Y" "V" "D" "A" "V" "I" "N" "H" "M" "C" "G" "N" "A" "V" "S" "A" "G" "T"
## [127] "S" "S" "T" "C" "G" "S" "Y" "F" "N" "P" "G" "S" "R" "D" "F" "P" "A" "V"
## [145] "P" "Y" "S" "G" "W" "D" "F" "N" "D" "G" "K" "C" "K" "T" "G" "S" "G" "D"
## [163] "I" "E" "N" "Y" "N" "D" "A" "T" "Q" "V" "R" "D" "C" "R" "L" "T" "G" "L"
## [181] "L" "D" "L" "A" "L" "E" "K" "D" "Y" "V" "R" "S" "K" "I" "A" "E" "Y" "M"
## [199] "N" "H" "L" "I" "D" "I" "G" "V" "A" "G" "F" "R" "L" "D" "A" "S" "K" "H"
## [217] "M" "W" "P" "G" "D" "I" "K" "A" "I" "L" "D" "K" "L" "H" "N" "L" "N" "S"
## [235] "N" "W" "F" "P" "A" "G" "S" "K" "P" "F" "I" "Y" "Q" "E" "V" "I" "D" "L"
## [253] "G" "G" "E" "P" "I" "K" "S" "S" "D" "Y" "F" "G" "N" "G" "R" "V" "T" "E"
## [271] "F" "K" "Y" "G" "A" "K" "L" "G" "T" "V" "I" "R" "K" "W" "N" "G" "E" "K"
## [289] "M" "S" "Y" "L" "K" "N" "W" "G" "E" "G" "W" "G" "F" "V" "P" "S" "D" "R"
## [307] "A" "L" "V" "F" "V" "D" "N" "H" "D" "N" "Q" "R" "G" "H" "G" "A" "G" "G"
## [325] "A" "S" "I" "L" "T" "F" "W" "D" "A" "R" "L" "Y" "K" "M" "A" "V" "G" "F"
## [343] "M" "L" "A" "H" "P" "Y" "G" "F" "T" "R" "V" "M" "S" "S" "Y" "R" "W" "P"
## [361] "R" "Q" "F" "Q" "N" "G" "N" "D" "V" "N" "D" "W" "V" "G" "P" "P" "N" "N"
## [379] "N" "G" "V" "I" "K" "E" "V" "T" "I" "N" "P" "D" "T" "T" "C" "G" "N" "D"
## [397] "W" "V" "C" "E" "H" "R" "W" "R" "Q" "I" "R" "N" "M" "V" "I" "F" "R" "N"
## [415] "V" "V" "D" "G" "Q" "P" "F" "T" "N" "W" "Y" "D" "N" "G" "S" "N" "Q" "V"
## [433] "A" "F" "G" "R" "G" "N" "R" "G" "F" "I" "V" "F" "N" "N" "D" "D" "W" "S"
## [451] "F" "S" "L" "T" "L" "Q" "T" "G" "L" "P" "A" "G" "T" "Y" "C" "D" "V" "I"
## [469] "S" "G" "D" "K" "I" "N" "G" "N" "C" "T" "G" "I" "K" "I" "Y" "V" "S" "D"
## [487] "D" "G" "K" "A" "H" "F" "S" "I" "S" "N" "S" "A" "E" "D" "P" "F" "I" "A"
## [505] "I" "H" "A" "E" "S" "K" "L"
fruit_fly_vector
## [1] "M" "F" "L" "A" "K" "S" "I" "V" "C" "L" "A" "L" "L" "A" "V" "A" "N" "A"
## [19] "Q" "F" "D" "T" "N" "Y" "A" "S" "G" "R" "S" "G" "M" "V" "H" "L" "F" "E"
## [37] "W" "K" "W" "D" "D" "I" "A" "A" "E" "C" "E" "N" "F" "L" "G" "P" "N" "G"
## [55] "Y" "A" "G" "V" "Q" "V" "S" "P" "V" "N" "E" "N" "A" "V" "K" "D" "S" "R"
## [73] "P" "W" "W" "E" "R" "Y" "Q" "P" "I" "S" "Y" "K" "L" "E" "T" "R" "S" "G"
## [91] "N" "E" "E" "Q" "F" "A" "S" "M" "V" "K" "R" "C" "N" "A" "V" "G" "V" "R"
## [109] "T" "Y" "V" "D" "V" "V" "F" "N" "H" "M" "A" "A" "D" "G" "G" "T" "Y" "G"
## [127] "T" "G" "G" "S" "T" "A" "S" "P" "S" "S" "K" "S" "Y" "P" "G" "V" "P" "Y"
## [145] "S" "S" "L" "D" "F" "N" "P" "T" "C" "A" "I" "S" "N" "Y" "N" "D" "A" "N"
## [163] "E" "V" "R" "N" "C" "E" "L" "V" "G" "L" "R" "D" "L" "N" "Q" "G" "N" "S"
## [181] "Y" "V" "Q" "D" "K" "V" "V" "E" "F" "L" "D" "H" "L" "I" "D" "L" "G" "V"
## [199] "A" "G" "F" "R" "V" "D" "A" "A" "K" "H" "M" "W" "P" "A" "D" "L" "A" "V"
## [217] "I" "Y" "G" "R" "L" "K" "N" "L" "N" "T" "D" "H" "G" "F" "A" "S" "G" "S"
## [235] "K" "A" "Y" "I" "V" "Q" "E" "V" "I" "D" "M" "G" "G" "E" "A" "I" "S" "K"
## [253] "S" "E" "Y" "T" "G" "L" "G" "A" "I" "T" "E" "F" "R" "H" "S" "D" "S" "I"
## [271] "G" "K" "V" "F" "R" "G" "K" "D" "Q" "L" "Q" "Y" "L" "T" "N" "W" "G" "T"
## [289] "A" "W" "G" "F" "A" "A" "S" "D" "R" "S" "L" "V" "F" "V" "D" "N" "H" "D"
## [307] "N" "Q" "R" "G" "H" "G" "A" "G" "G" "A" "D" "V" "L" "T" "Y" "K" "V" "P"
## [325] "K" "Q" "Y" "K" "M" "A" "S" "A" "F" "M" "L" "A" "H" "P" "F" "G" "T" "P"
## [343] "R" "V" "M" "S" "S" "F" "S" "F" "T" "D" "T" "D" "Q" "G" "P" "P" "T" "T"
## [361] "D" "G" "H" "N" "I" "A" "S" "P" "I" "F" "N" "S" "D" "N" "S" "C" "S" "G"
## [379] "G" "W" "V" "C" "E" "H" "R" "W" "R" "Q" "I" "Y" "N" "M" "V" "A" "F" "R"
## [397] "N" "T" "V" "G" "L" "D" "E" "I" "Q" "N" "W" "W" "D" "N" "G" "S" "N" "Q"
## [415] "I" "S" "F" "S" "R" "G" "S" "R" "G" "F" "V" "A" "F" "N" "N" "D" "N" "Y"
## [433] "D" "L" "N" "S" "S" "L" "Q" "T" "G" "L" "P" "A" "G" "T" "Y" "C" "D" "V"
## [451] "I" "S" "G" "S" "K" "S" "G" "S" "S" "C" "T" "G" "K" "T" "V" "T" "V" "G"
## [469] "S" "D" "G" "R" "A" "S" "I" "Y" "I" "G" "S" "S" "E" "D" "D" "G" "V" "L"
## [487] "A" "I" "H" "V" "N" "A" "K" "L"
cattle_vector
## [1] "M" "K" "F" "F" "L" "L" "L" "S" "V" "I" "V" "F" "C" "W" "A" "Q" "Y" "A"
## [19] "P" "H" "T" "K" "T" "G" "R" "T" "S" "I" "V" "H" "L" "F" "E" "W" "R" "W"
## [37] "V" "D" "I" "A" "L" "E" "C" "E" "R" "Y" "L" "A" "P" "K" "G" "F" "G" "G"
## [55] "V" "Q" "I" "S" "P" "P" "S" "E" "N" "A" "V" "I" "T" "D" "P" "S" "R" "P"
## [73] "W" "W" "E" "R" "Y" "Q" "P" "V" "S" "Y" "K" "L" "C" "T" "R" "S" "G" "N"
## [91] "E" "S" "E" "F" "K" "D" "M" "V" "T" "R" "C" "N" "N" "V" "G" "V" "R" "I"
## [109] "Y" "V" "D" "A" "V" "I" "N" "H" "M" "T" "G" "S" "G" "V" "S" "A" "G" "T"
## [127] "S" "S" "T" "C" "G" "S" "Y" "F" "N" "P" "G" "T" "R" "D" "F" "P" "A" "V"
## [145] "P" "Y" "S" "G" "W" "D" "F" "N" "D" "E" "K" "C" "N" "T" "G" "N" "G" "E"
## [163] "I" "K" "S" "Y" "D" "V" "A" "Y" "Q" "V" "R" "D" "C" "R" "L" "V" "G" "L"
## [181] "L" "D" "L" "A" "L" "A" "K" "D" "Y" "V" "R" "S" "T" "V" "A" "E" "Y" "L"
## [199] "N" "R" "L" "I" "D" "I" "G" "V" "A" "G" "F" "R" "I" "D" "A" "S" "K" "H"
## [217] "M" "W" "P" "G" "D" "I" "K" "A" "V" "L" "D" "K" "L" "H" "N" "L" "N" "T"
## [235] "S" "W" "F" "P" "E" "G" "S" "R" "P" "F" "I" "Y" "Q" "E" "V" "I" "D" "L"
## [253] "G" "G" "E" "T" "I" "T" "S" "S" "D" "Y" "F" "G" "N" "G" "R" "V" "T" "E"
## [271] "F" "K" "Y" "G" "V" "K" "L" "G" "T" "V" "L" "R" "K" "W" "S" "G" "E" "K"
## [289] "M" "A" "Y" "L" "K" "N" "W" "G" "E" "G" "W" "G" "F" "M" "P" "S" "D" "R"
## [307] "A" "L" "V" "F" "V" "D" "N" "H" "D" "N" "Q" "R" "G" "H" "G" "A" "G" "G"
## [325] "A" "S" "I" "L" "T" "F" "W" "D" "A" "R" "L" "Y" "K" "M" "G" "V" "G" "F"
## [343] "M" "L" "A" "H" "P" "Y" "G" "F" "T" "R" "V" "M" "S" "S" "Y" "H" "W" "P"
## [361] "R" "H" "F" "E" "D" "G" "K" "D" "V" "N" "D" "W" "V" "G" "P" "P" "N" "N"
## [379] "N" "G" "V" "I" "K" "E" "V" "T" "I" "N" "P" "D" "T" "T" "C" "G" "N" "G"
## [397] "W" "V" "C" "E" "H" "R" "W" "R" "Q" "I" "R" "N" "M" "V" "I" "F" "R" "N"
## [415] "V" "V" "D" "G" "Q" "P" "F" "T" "N" "W" "W" "D" "N" "G" "S" "N" "Q" "V"
## [433] "A" "F" "G" "R" "G" "N" "K" "G" "F" "I" "V" "F" "N" "N" "D" "D" "W" "A"
## [451] "L" "S" "A" "T" "L" "Q" "T" "G" "L" "P" "P" "G" "T" "Y" "C" "D" "V" "I"
## [469] "S" "G" "D" "K" "I" "G" "D" "N" "C" "T" "V" "I" "E" "I" "N" "V" "S" "C"
## [487] "D" "G" "N" "A" "Y" "F" "S" "I" "S" "N" "S" "A" "E" "D" "P" "F" "I" "A"
## [505] "I" "H" "T" "E" "S" "K" "L"
data(package="Biostrings")
chimps_string <- paste(chimps_vector,collapse = "")
human_string <- paste(human_vector,collapse = "" )
fruit_fly_string <- paste(fruit_fly_vector,collapse = "")
cattle_string <- paste(cattle_vector,collapse = "")
chimps_string <- toupper(chimps_string)
human_string <- toupper(human_string)
fruit_fly_string <- toupper(fruit_fly_string)
cattle_string <- toupper(cattle_string)
chimps_string
## [1] "MKFFLLLLTIGFCWAQYSPNTQQGRTSIVHLFEWRWVDIALECERYLAPKGFGGVQVSPPNENVAIYNPFRPWWERYQPVSYKLCTRSGNEDEFRNMVTRCNNVGVRIYVDAVINHMCGNAVSAGTSSTCGSYFNPGSRDFPAVPYSGWDFNDGKCKTGSGDIENYNDATQVRDCRLSGLLDLALEKDYVRSKIAEYMNHLIDIGVAGFRLDASKHMWPGDIKAILDKLHNLNSNWFPEGSKPFIYQEVIDLGGEPIKSNDYFGNGRVTEFKYGAKLGTVIRKWNGEKMSYLKNWGEGWGFIPSDRALVFVDNHDNQRGHGAGGASILTFWDARLYKMAVGFMLAHPYGFTRVMSSYRWPRQFQNGNDVNDWVGPPNNNGVIKEVTINPDTTCGNDWVCEHRWRQIRNMVNFRNVVDGQPFXNWYDNGXNQVXFGRGNRGFIVFNNDDWSFSLTLQTGLPAGTYCDVISGDKINGNCTGIKIYVSDDGKAHFSISNSAEDPFIAIHAESKL"
human_string
## [1] "MKFFLLLFTIGFCWAQYSPNTQQGRTSIVHLFEWRWVDIALECERYLAPKGFGGVQVSPPNENVAIYNPFRPWWERYQPVSYKLCTRSGNEDEFRNMVTRCNNVGVRIYVDAVINHMCGNAVSAGTSSTCGSYFNPGSRDFPAVPYSGWDFNDGKCKTGSGDIENYNDATQVRDCRLTGLLDLALEKDYVRSKIAEYMNHLIDIGVAGFRLDASKHMWPGDIKAILDKLHNLNSNWFPAGSKPFIYQEVIDLGGEPIKSSDYFGNGRVTEFKYGAKLGTVIRKWNGEKMSYLKNWGEGWGFVPSDRALVFVDNHDNQRGHGAGGASILTFWDARLYKMAVGFMLAHPYGFTRVMSSYRWPRQFQNGNDVNDWVGPPNNNGVIKEVTINPDTTCGNDWVCEHRWRQIRNMVIFRNVVDGQPFTNWYDNGSNQVAFGRGNRGFIVFNNDDWSFSLTLQTGLPAGTYCDVISGDKINGNCTGIKIYVSDDGKAHFSISNSAEDPFIAIHAESKL"
fruit_fly_string
## [1] "MFLAKSIVCLALLAVANAQFDTNYASGRSGMVHLFEWKWDDIAAECENFLGPNGYAGVQVSPVNENAVKDSRPWWERYQPISYKLETRSGNEEQFASMVKRCNAVGVRTYVDVVFNHMAADGGTYGTGGSTASPSSKSYPGVPYSSLDFNPTCAISNYNDANEVRNCELVGLRDLNQGNSYVQDKVVEFLDHLIDLGVAGFRVDAAKHMWPADLAVIYGRLKNLNTDHGFASGSKAYIVQEVIDMGGEAISKSEYTGLGAITEFRHSDSIGKVFRGKDQLQYLTNWGTAWGFAASDRSLVFVDNHDNQRGHGAGGADVLTYKVPKQYKMASAFMLAHPFGTPRVMSSFSFTDTDQGPPTTDGHNIASPIFNSDNSCSGGWVCEHRWRQIYNMVAFRNTVGLDEIQNWWDNGSNQISFSRGSRGFVAFNNDNYDLNSSLQTGLPAGTYCDVISGSKSGSSCTGKTVTVGSDGRASIYIGSSEDDGVLAIHVNAKL"
cattle_string
## [1] "MKFFLLLSVIVFCWAQYAPHTKTGRTSIVHLFEWRWVDIALECERYLAPKGFGGVQISPPSENAVITDPSRPWWERYQPVSYKLCTRSGNESEFKDMVTRCNNVGVRIYVDAVINHMTGSGVSAGTSSTCGSYFNPGTRDFPAVPYSGWDFNDEKCNTGNGEIKSYDVAYQVRDCRLVGLLDLALAKDYVRSTVAEYLNRLIDIGVAGFRIDASKHMWPGDIKAVLDKLHNLNTSWFPEGSRPFIYQEVIDLGGETITSSDYFGNGRVTEFKYGVKLGTVLRKWSGEKMAYLKNWGEGWGFMPSDRALVFVDNHDNQRGHGAGGASILTFWDARLYKMGVGFMLAHPYGFTRVMSSYHWPRHFEDGKDVNDWVGPPNNNGVIKEVTINPDTTCGNGWVCEHRWRQIRNMVIFRNVVDGQPFTNWWDNGSNQVAFGRGNKGFIVFNNDDWALSATLQTGLPPGTYCDVISGDKIGDNCTVIEINVSCDGNAYFSISNSAEDPFIAIHTESKL"
data(BLOSUM50)
#CHIMPS VS OTHER
chimps_vs_human <- Biostrings::pairwiseAlignment(chimps_string,
human_string,
substitutionMatrix = BLOSUM50,
gapOpening = -8,
gapExtension = -2,
scoreOnly = FALSE)
chimps_vs_fruit_fly <- Biostrings::pairwiseAlignment(chimps_string,
fruit_fly_string,
substitutionMatrix = BLOSUM50,
gapOpening = -8,
gapExtension = -2,
scoreOnly = FALSE)
chimps_vs_cattle <- Biostrings::pairwiseAlignment(chimps_string,
cattle_string ,
substitutionMatrix = BLOSUM50,
gapOpening = -8,
gapExtension = -2,
scoreOnly = FALSE)
chimps_vs_human
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MKFFLLLLTIGFCWAQYSPNTQQGRTSIVHLFEW...TGIKIYVSDDGKAHFSISNSAEDPFIAIHAESKL
## subject: MKFFLLLFTIGFCWAQYSPNTQQGRTSIVHLFEW...TGIKIYVSDDGKAHFSISNSAEDPFIAIHAESKL
## score: 3593
chimps_vs_fruit_fly
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: M---KFFLLLLTIGFCWAQYSPNTQQGRTSIVHL...TGIKIYVSDDGKAHFSISNSAEDPFIAIHAESKL
## subject: MFLAKSIVCLALLAVANAQFDTNYASGRSGMVHL...TGKTVTVGSDGRASIYIGSSEDDGVLAIHVNAKL
## score: 1859
chimps_vs_cattle
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MKFFLLLLTIGFCWAQYSPNTQQGRTSIVHLFEW...TGIKIYVSDDGKAHFSISNSAEDPFIAIHAESKL
## subject: MKFFLLLSVIVFCWAQYAPHTKTGRTSIVHLFEW...TVIEINVSCDGNAYFSISNSAEDPFIAIHTESKL
## score: 3167
pid(chimps_vs_human)
## [1] 98.23875
pid(chimps_vs_fruit_fly)
## [1] 52.90698
pid(chimps_vs_cattle)
## [1] 85.3229
#HUMAN VS OTHER
human_vs_fruit_fly <- Biostrings::pairwiseAlignment(human_string,
fruit_fly_string,
substitutionMatrix = BLOSUM50,
gapOpening = -8,
gapExtension = -2,
scoreOnly = FALSE)
human_vs_cattle <- Biostrings::pairwiseAlignment(human_string,
cattle_string ,
substitutionMatrix = BLOSUM50,
gapOpening = -8,
gapExtension = -2,
scoreOnly = FALSE)
human_vs_fruit_fly
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: M---KFFLLLFTIGFCWAQYSPNTQQGRTSIVHL...TGIKIYVSDDGKAHFSISNSAEDPFIAIHAESKL
## subject: MFLAKSIVCLALLAVANAQFDTNYASGRSGMVHL...TGKTVTVGSDGRASIYIGSSEDDGVLAIHVNAKL
## score: 1875
human_vs_cattle
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MKFFLLLFTIGFCWAQYSPNTQQGRTSIVHLFEW...TGIKIYVSDDGKAHFSISNSAEDPFIAIHAESKL
## subject: MKFFLLLSVIVFCWAQYAPHTKTGRTSIVHLFEW...TVIEINVSCDGNAYFSISNSAEDPFIAIHTESKL
## score: 3190
pid(human_vs_fruit_fly)
## [1] 53.29457
pid(human_vs_cattle)
## [1] 86.10568
#FRUIT FLY VS OTHER
fruit_fly_vs_cattle <- Biostrings::pairwiseAlignment(fruit_fly_string,
cattle_string ,
substitutionMatrix = BLOSUM50,
gapOpening = -8,
gapExtension = -2,
scoreOnly = FALSE)
fruit_fly_vs_cattle
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MFLAKSIVCLALLAVANAQFDTNYASGRSGMVHL...TGKTVTVGSDGRASIYIGSSEDDGVLAIHVNAKL
## subject: M---KFFLLLSVIVFCWAQYAPHTKTGRTSIVHL...TVIEINVSCDGNAYFSISNSAEDPFIAIHTESKL
## score: 1855
pid(fruit_fly_vs_cattle)
## [1] 52.90698
pids <- c(1, NA, NA, NA,
98.23875, 1, NA, NA,
52.90698, 53.29457, 1, NA,
85.3229, 86.10568, 52.90698, 1)
mat <- matrix(pids, nrow = 4, byrow = T)
row.names(mat) <- c("CHIMPS","HUMANS","FRUIT_FLIES","CATTLES")
colnames(mat) <- c("CHIMPS","HUMANS","FRUIT_FLIES","CATTLES")
pander::pander(mat)
| Â | CHIMPS | HUMANS | FRUIT_FLIES | CATTLES |
|---|---|---|---|---|
| CHIMPS | 1 | NA | NA | NA |
| HUMANS | 98.24 | 1 | NA | NA |
| FRUIT_FLIES | 52.91 | 53.29 | 1 | NA |
| CATTLES | 85.32 | 86.11 | 52.91 | 1 |
#PID methods comparison
#chimps vs human
pid(chimps_vs_human, type = "PID1")
## [1] 98.23875
pid(chimps_vs_human, type = "PID2")
## [1] 98.23875
pid(chimps_vs_human, type = "PID3")
## [1] 98.23875
pid(chimps_vs_human, type = "PID4")
## [1] 98.23875
pids_comparison <- c("PID1", 98.23875, "(aligned_positions_PLUS_internal_gap_positions)",
"PID2", 98.23875, "(aligned_positions)",
"PID3", 98.23875, "(length_shorter_sequence)",
"PID4", 98.23875, "(average_length_of_the_two_sequences)")
mat <- matrix(pids_comparison, nrow = 4, byrow = T)
row.names(mat) <- c("1","2","3","4")
colnames(mat) <- c("Method","PID","denominator")
pander::pander(mat)
| Method | PID | denominator |
|---|---|---|
| PID1 | 98.23875 | (aligned_positions_PLUS_internal_gap_positions) |
| PID2 | 98.23875 | (aligned_positions) |
| PID3 | 98.23875 | (length_shorter_sequence) |
| PID4 | 98.23875 | (average_length_of_the_two_sequences) |
#Multiple sequence alignment
a1 <- entrez_fetch(db = "protein",
id = "NP_000690.1",
rettype = "fasta")
a2 <- entrez_fetch(db = "protein",
id = "NP_001103628.1",
rettype = "fasta")
a3 <- entrez_fetch(db = "protein",
id = "NP_001030188.1",
rettype = "fasta")
a4 <- entrez_fetch(db = "protein",
id = "NP_001008222.1",
rettype = "fasta")
a5 <- entrez_fetch(db = "protein",
id = "NP_001286518.1",
rettype = "fasta")
a1 <- fasta_cleaner(a1, parse = F)
a2 <- fasta_cleaner(a2, parse = F)
a3 <- fasta_cleaner(a3, parse = F)
a4 <- fasta_cleaner(a4, parse = F)
a5 <- fasta_cleaner(a5, parse = F)
TABLE <- c("NP_000690.1", "Homo_sapiens", "a1",
"NP_001103628.1", "Pan_troglodytes", "a2",
"NP_001030188.1", "Bos_taurus", "a3",
"NP_001008222.1", "Homo_sapiens", "a4",
"NP_001286518.1", "Drosophila_melanogaster", "a5" )
TABLE_matrix <- matrix(TABLE,
byrow = T,
nrow = 5)
table <- data.frame(TABLE_matrix,
stringsAsFactors = F)
names(table) <- c("accession", "name.orig","name.new")
table
## accession name.orig name.new
## 1 NP_000690.1 Homo_sapiens a1
## 2 NP_001103628.1 Pan_troglodytes a2
## 3 NP_001030188.1 Bos_taurus a3
## 4 NP_001008222.1 Homo_sapiens a4
## 5 NP_001286518.1 Drosophila_melanogaster a5
table$accession
## [1] "NP_000690.1" "NP_001103628.1" "NP_001030188.1" "NP_001008222.1"
## [5] "NP_001286518.1"
LIST <- entrez_fetch(db = "protein",
id = table$accession,
rettype = "fasta")
cat(LIST)
## >NP_000690.1 pancreatic alpha-amylase precursor [Homo sapiens]
## MKFFLLLFTIGFCWAQYSPNTQQGRTSIVHLFEWRWVDIALECERYLAPKGFGGVQVSPPNENVAIYNPF
## RPWWERYQPVSYKLCTRSGNEDEFRNMVTRCNNVGVRIYVDAVINHMCGNAVSAGTSSTCGSYFNPGSRD
## FPAVPYSGWDFNDGKCKTGSGDIENYNDATQVRDCRLTGLLDLALEKDYVRSKIAEYMNHLIDIGVAGFR
## LDASKHMWPGDIKAILDKLHNLNSNWFPAGSKPFIYQEVIDLGGEPIKSSDYFGNGRVTEFKYGAKLGTV
## IRKWNGEKMSYLKNWGEGWGFVPSDRALVFVDNHDNQRGHGAGGASILTFWDARLYKMAVGFMLAHPYGF
## TRVMSSYRWPRQFQNGNDVNDWVGPPNNNGVIKEVTINPDTTCGNDWVCEHRWRQIRNMVIFRNVVDGQP
## FTNWYDNGSNQVAFGRGNRGFIVFNNDDWSFSLTLQTGLPAGTYCDVISGDKINGNCTGIKIYVSDDGKA
## HFSISNSAEDPFIAIHAESKL
##
## >NP_001103628.1 pancreatic alpha-amylase precursor [Pan troglodytes]
## MKFFLLLLTIGFCWAQYSPNTQQGRTSIVHLFEWRWVDIALECERYLAPKGFGGVQVSPPNENVAIYNPF
## RPWWERYQPVSYKLCTRSGNEDEFRNMVTRCNNVGVRIYVDAVINHMCGNAVSAGTSSTCGSYFNPGSRD
## FPAVPYSGWDFNDGKCKTGSGDIENYNDATQVRDCRLSGLLDLALEKDYVRSKIAEYMNHLIDIGVAGFR
## LDASKHMWPGDIKAILDKLHNLNSNWFPEGSKPFIYQEVIDLGGEPIKSNDYFGNGRVTEFKYGAKLGTV
## IRKWNGEKMSYLKNWGEGWGFIPSDRALVFVDNHDNQRGHGAGGASILTFWDARLYKMAVGFMLAHPYGF
## TRVMSSYRWPRQFQNGNDVNDWVGPPNNNGVIKEVTINPDTTCGNDWVCEHRWRQIRNMVNFRNVVDGQP
## FXNWYDNGXNQVXFGRGNRGFIVFNNDDWSFSLTLQTGLPAGTYCDVISGDKINGNCTGIKIYVSDDGKA
## HFSISNSAEDPFIAIHAESKL
##
## >NP_001030188.1 alpha-amylase 2B precursor [Bos taurus]
## MKFFLLLSVIVFCWAQYAPHTKTGRTSIVHLFEWRWVDIALECERYLAPKGFGGVQISPPSENAVITDPS
## RPWWERYQPVSYKLCTRSGNESEFKDMVTRCNNVGVRIYVDAVINHMTGSGVSAGTSSTCGSYFNPGTRD
## FPAVPYSGWDFNDEKCNTGNGEIKSYDVAYQVRDCRLVGLLDLALAKDYVRSTVAEYLNRLIDIGVAGFR
## IDASKHMWPGDIKAVLDKLHNLNTSWFPEGSRPFIYQEVIDLGGETITSSDYFGNGRVTEFKYGVKLGTV
## LRKWSGEKMAYLKNWGEGWGFMPSDRALVFVDNHDNQRGHGAGGASILTFWDARLYKMGVGFMLAHPYGF
## TRVMSSYHWPRHFEDGKDVNDWVGPPNNNGVIKEVTINPDTTCGNGWVCEHRWRQIRNMVIFRNVVDGQP
## FTNWWDNGSNQVAFGRGNKGFIVFNNDDWALSATLQTGLPPGTYCDVISGDKIGDNCTVIEINVSCDGNA
## YFSISNSAEDPFIAIHTESKL
##
## >NP_001008222.1 alpha-amylase 1A precursor [Homo sapiens]
## MKLFWLLFTIGFCWAQYSSNTQQGRTSIVHLFEWRWVDIALECERYLAPKGFGGVQVSPPNENVAIHNPF
## RPWWERYQPVSYKLCTRSGNEDEFRNMVTRCNNVGVRIYVDAVINHMCGNAVSAGTSSTCGSYFNPGSRD
## FPAVPYSGWDFNDGKCKTGSGDIENYNDATQVRDCRLSGLLDLALGKDYVRSKIAEYMNHLIDIGVAGFR
## IDASKHMWPGDIKAILDKLHNLNSNWFPEGSKPFIYQEVIDLGGEPIKSSDYFGNGRVTEFKYGAKLGTV
## IRKWNGEKMSYLKNWGEGWGFMPSDRALVFVDNHDNQRGHGAGGASILTFWDARLYKMAVGFMLAHPYGF
## TRVMSSYRWPRYFENGKDVNDWVGPPNDNGVTKEVTINPDTTCGNDWVCEHRWRQIRNMVNFRNVVDGQP
## FTNWYDNGSNQVAFGRGNRGFIVFNNDDWTFSLTLQTGLPAGTYCDVISGDKINGNCTGIKIYVSDDGKA
## HFSISNSAEDPFIAIHAESKL
##
## >NP_001286518.1 amylase proximal, isoform B [Drosophila melanogaster]
## MFLAKSIVCLALLAVANAQFDTNYASGRSGMVHLFEWKWDDIAAECENFLGPNGYAGVQVSPVNENAVKD
## SRPWWERYQPISYKLETRSGNEEQFASMVKRCNAVGVRTYVDVVFNHMAADGGTYGTGGSTASPSSKSYP
## GVPYSSLDFNPTCAISNYNDANEVRNCELVGLRDLNQGNSYVQDKVVEFLDHLIDLGVAGFRVDAAKHMW
## PADLAVIYGRLKNLNTDHGFASGSKAYIVQEVIDMGGEAISKSEYTGLGAITEFRHSDSIGKVFRGKDQL
## QYLTNWGTAWGFAASDRSLVFVDNHDNQRGHGAGGADVLTYKVPKQYKMASAFMLAHPFGTPRVMSSFSF
## TDTDQGPPTTDGHNIASPIFNSDNSCSGGWVCEHRWRQIYNMVAFRNTVGLDEIQNWWDNGSNQISFSRG
## SRGFVAFNNDNYDLNSSLQTGLPAGTYCDVISGSKSGSSCTGKTVTVGSDGRASIYIGSSEDDGVLAIHV
## NAKL
entrez_fetch_list <- function(db, id, rettype, ...){
#setup list for storing output
n.seq <- length(id)
list.output <- as.list(rep(NA, n.seq))
names(list.output) <- id
# get output
for(i in 1:length(id)){
list.output[[i]] <- rentrez::entrez_fetch(db = db,
id = id[i],
rettype = rettype)
}
return(list.output)
}
list <- entrez_fetch_list(db = "protein",
id =table$accession,
rettype = "fasta")
list[[1]]
## [1] ">NP_000690.1 pancreatic alpha-amylase precursor [Homo sapiens]\nMKFFLLLFTIGFCWAQYSPNTQQGRTSIVHLFEWRWVDIALECERYLAPKGFGGVQVSPPNENVAIYNPF\nRPWWERYQPVSYKLCTRSGNEDEFRNMVTRCNNVGVRIYVDAVINHMCGNAVSAGTSSTCGSYFNPGSRD\nFPAVPYSGWDFNDGKCKTGSGDIENYNDATQVRDCRLTGLLDLALEKDYVRSKIAEYMNHLIDIGVAGFR\nLDASKHMWPGDIKAILDKLHNLNSNWFPAGSKPFIYQEVIDLGGEPIKSSDYFGNGRVTEFKYGAKLGTV\nIRKWNGEKMSYLKNWGEGWGFVPSDRALVFVDNHDNQRGHGAGGASILTFWDARLYKMAVGFMLAHPYGF\nTRVMSSYRWPRQFQNGNDVNDWVGPPNNNGVIKEVTINPDTTCGNDWVCEHRWRQIRNMVIFRNVVDGQP\nFTNWYDNGSNQVAFGRGNRGFIVFNNDDWSFSLTLQTGLPAGTYCDVISGDKINGNCTGIKIYVSDDGKA\nHFSISNSAEDPFIAIHAESKL\n\n"
list[[1]] <- fasta_cleaner(list[[1]], parse = F)
list[[2]] <- fasta_cleaner(list[[2]], parse = F)
list[[3]] <- fasta_cleaner(list[[3]], parse = F)
list[[4]] <- fasta_cleaner(list[[4]], parse = F)
list[[5]] <- fasta_cleaner(list[[5]], parse = F)
length(list)
## [1] 5
list_vector <- rep(NA, length(list))
list_vector
## [1] NA NA NA NA NA
for(i in 1:length(list_vector)){
list_vector[i] <- list[[i]]}
names(list_vector) <- names(list)
list_vector_ss <- Biostrings::AAStringSet(list_vector)
list_align <- msa(list_vector_ss,
method = "ClustalW")
## use default substitution matrix
list_align
## CLUSTAL 2.1
##
## Call:
## msa(list_vector_ss, method = "ClustalW")
##
## MsaAAMultipleAlignment with 5 rows and 516 columns
## aln names
## [1] ---MKFFLLLFTIGFCWAQYSPNTQQ...DGKAHFSISNSAEDPFIAIHAESKL NP_000690.1
## [2] ---MKFFLLLLTIGFCWAQYSPNTQQ...DGKAHFSISNSAEDPFIAIHAESKL NP_001103628.1
## [3] ---MKLFWLLFTIGFCWAQYSSNTQQ...DGKAHFSISNSAEDPFIAIHAESKL NP_001008222.1
## [4] ---MKFFLLLSVIVFCWAQYAPHTKT...DGNAYFSISNSAEDPFIAIHTESKL NP_001030188.1
## [5] MFLAKSIVCLALLAVANAQFDTNYAS...DGRASIYIGSSEDDGVLAIHVNAKL NP_001286518.1
## Con ---MKFFLLL?TIGFCWAQYSPNTQQ...DGKAHFSISNSAEDPFIAIHAESKL Consensus
class(list_align)
## [1] "MsaAAMultipleAlignment"
## attr(,"package")
## [1] "msa"
class(list_align) <- "AAMultipleAlignment"
list_align_seqinr <- msaConvert(list_align,
type = "seqinr::alignment")
compbio4all::print_msa(alignment = list_align_seqinr,
chunksize = 60)
## [1] "---MKFFLLLFTIGFCWAQYSPNTQQGRTSIVHLFEWRWVDIALECERYLAPKGFGGVQV 0"
## [1] "---MKFFLLLLTIGFCWAQYSPNTQQGRTSIVHLFEWRWVDIALECERYLAPKGFGGVQV 0"
## [1] "---MKLFWLLFTIGFCWAQYSSNTQQGRTSIVHLFEWRWVDIALECERYLAPKGFGGVQV 0"
## [1] "---MKFFLLLSVIVFCWAQYAPHTKTGRTSIVHLFEWRWVDIALECERYLAPKGFGGVQI 0"
## [1] "MFLAKSIVCLALLAVANAQFDTNYASGRSGMVHLFEWKWDDIAAECENFLGPNGYAGVQV 0"
## [1] " "
## [1] "SPPNENVAIYNPFRPWWERYQPVSYKLCTRSGNEDEFRNMVTRCNNVGVRIYVDAVINHM 0"
## [1] "SPPNENVAIYNPFRPWWERYQPVSYKLCTRSGNEDEFRNMVTRCNNVGVRIYVDAVINHM 0"
## [1] "SPPNENVAIHNPFRPWWERYQPVSYKLCTRSGNEDEFRNMVTRCNNVGVRIYVDAVINHM 0"
## [1] "SPPSENAVITDPSRPWWERYQPVSYKLCTRSGNESEFKDMVTRCNNVGVRIYVDAVINHM 0"
## [1] "SPVNENAVKDS--RPWWERYQPISYKLETRSGNEEQFASMVKRCNAVGVRTYVDVVFNHM 0"
## [1] " "
## [1] "CGNAVSAGTSSTCGSYFNPGSRDFPAVPYSGWDFNDGKCKTGSGDIENYNDATQVRDCRL 0"
## [1] "CGNAVSAGTSSTCGSYFNPGSRDFPAVPYSGWDFNDGKCKTGSGDIENYNDATQVRDCRL 0"
## [1] "CGNAVSAGTSSTCGSYFNPGSRDFPAVPYSGWDFNDGKCKTGSGDIENYNDATQVRDCRL 0"
## [1] "TGSGVSAGTSSTCGSYFNPGTRDFPAVPYSGWDFNDEKCNTGNGEIKSYDVAYQVRDCRL 0"
## [1] "AADG---GTYGTGGSTASPSSKSYPGVPYSSLDFN------PTCAISNYNDANEVRNCEL 0"
## [1] " "
## [1] "TGLLDLALEKDYVRSKIAEYMNHLIDIGVAGFRLDASKHMWPGDIKAILDKLHNLNSNW- 0"
## [1] "SGLLDLALEKDYVRSKIAEYMNHLIDIGVAGFRLDASKHMWPGDIKAILDKLHNLNSNW- 0"
## [1] "SGLLDLALGKDYVRSKIAEYMNHLIDIGVAGFRIDASKHMWPGDIKAILDKLHNLNSNW- 0"
## [1] "VGLLDLALAKDYVRSTVAEYLNRLIDIGVAGFRIDASKHMWPGDIKAVLDKLHNLNTSW- 0"
## [1] "VGLRDLNQGNSYVQDKVVEFLDHLIDLGVAGFRVDAAKHMWPADLAVIYGRLKNLNTDHG 0"
## [1] " "
## [1] "FPAGSKPFIYQEVIDLGGEPIKSSDYFGNGRVTEFKYGAKLGTVIRKWNGEKMSYLKNWG 0"
## [1] "FPEGSKPFIYQEVIDLGGEPIKSNDYFGNGRVTEFKYGAKLGTVIRKWNGEKMSYLKNWG 0"
## [1] "FPEGSKPFIYQEVIDLGGEPIKSSDYFGNGRVTEFKYGAKLGTVIRKWNGEKMSYLKNWG 0"
## [1] "FPEGSRPFIYQEVIDLGGETITSSDYFGNGRVTEFKYGVKLGTVLRKWSGEKMAYLKNWG 0"
## [1] "FASGSKAYIVQEVIDMGGEAISKSEYTGLGAITEFRHSDSIGKVFR--GKDQLQYLTNWG 0"
## [1] " "
## [1] "EGWGFVPSDRALVFVDNHDNQRGHGAGGASILTFWDARLYKMAVGFMLAHPYGFTRVMSS 0"
## [1] "EGWGFIPSDRALVFVDNHDNQRGHGAGGASILTFWDARLYKMAVGFMLAHPYGFTRVMSS 0"
## [1] "EGWGFMPSDRALVFVDNHDNQRGHGAGGASILTFWDARLYKMAVGFMLAHPYGFTRVMSS 0"
## [1] "EGWGFMPSDRALVFVDNHDNQRGHGAGGASILTFWDARLYKMGVGFMLAHPYGFTRVMSS 0"
## [1] "TAWGFAASDRSLVFVDNHDNQRGHGAGGADVLTYKVPKQYKMASAFMLAHPFGTPRVMSS 0"
## [1] " "
## [1] "YRWPRQFQNGNDVNDWVGPPNNNG-VIKEVTINPDTTCGNDWVCEHRWRQIRNMVIFRNV 0"
## [1] "YRWPRQFQNGNDVNDWVGPPNNNG-VIKEVTINPDTTCGNDWVCEHRWRQIRNMVNFRNV 0"
## [1] "YRWPRYFENGKDVNDWVGPPNDNG-VTKEVTINPDTTCGNDWVCEHRWRQIRNMVNFRNV 0"
## [1] "YHWPRHFEDGKDVNDWVGPPNNNG-VIKEVTINPDTTCGNGWVCEHRWRQIRNMVIFRNV 0"
## [1] "FSFTDTDQ---------GPPTTDGHNIASPIFNSDNSCSGGWVCEHRWRQIYNMVAFRNT 0"
## [1] " "
## [1] "VDGQPFTNWYDNGSNQVAFGRGNRGFIVFNNDDWSFSLTLQTGLPAGTYCDVISGDKING 0"
## [1] "VDGQPFXNWYDNGXNQVXFGRGNRGFIVFNNDDWSFSLTLQTGLPAGTYCDVISGDKING 0"
## [1] "VDGQPFTNWYDNGSNQVAFGRGNRGFIVFNNDDWTFSLTLQTGLPAGTYCDVISGDKING 0"
## [1] "VDGQPFTNWWDNGSNQVAFGRGNKGFIVFNNDDWALSATLQTGLPPGTYCDVISGDKIGD 0"
## [1] "VGLDEIQNWWDNGSNQISFSRGSRGFVAFNNDNYDLNSSLQTGLPAGTYCDVISGSKSGS 0"
## [1] " "
## [1] "NCTGIKIYVSDDGKAHFSISNSAEDPFIAIHAESKL 24"
## [1] "NCTGIKIYVSDDGKAHFSISNSAEDPFIAIHAESKL 24"
## [1] "NCTGIKIYVSDDGKAHFSISNSAEDPFIAIHAESKL 24"
## [1] "NCTVIEINVSCDGNAYFSISNSAEDPFIAIHTESKL 24"
## [1] "SCTGKTVTVGSDGRASIYIGSSEDDGVLAIHVNAKL 24"
## [1] " "
class(list_align) <- "AAMultipleAlignment"
ggmsa::ggmsa(list_align,
start = 1,
end = 50)
#Distance matrix
list_subset_dist <- seqinr::dist.alignment(list_align_seqinr,
matrix = "identity")
list_subset_dist
## NP_000690.1 NP_001103628.1 NP_001008222.1 NP_001030188.1
## NP_001103628.1 0.1086785
## NP_001008222.1 0.1769496 0.1718358
## NP_001030188.1 0.3727509 0.3764735 0.3753667
## NP_001286518.1 0.6676884 0.6682081 0.6692180 0.6753019
#Phylogenetic tree for all sequences
tree_subset <- nj(list_subset_dist)
#rooted
plot.phylo(tree_subset, main="Phylogenetic Tree",
use.edge.length = F)
mtext(text = "AMY1 family gene tree - rooted, no branch lenths")
#unrooted
plot.phylo(tree_subset, main="Phylogenetic Tree",
type = "unrooted",
use.edge.length = F)
mtext(text = "AMY1 family gene tree - UNrooted, no branch lengths")