function: This gene encodes a transcription factor involved in the induction of genes regulated by oxygen, which is induced as oxygen levels fall. The encoded protein contains a basic-helix-loop-helix domain protein dimerization domain as well as a domain found in proteins in signal transduction pathways which respond to oxygen levels. Mutations in this gene are associated with erythrocytosis familial type 4.
Key information used to make this script can be found here:
Refseq Gene:https://www.ncbi.nlm.nih.gov/gene/2034/ Refseq Homologene: https://www.ncbi.nlm.nih.gov/gene/2034/ortholog/?scope=89593&term=EPAS1
library(seqinr)
library(rentrez)
library(compbio4all)
library(Biostrings)
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: XVector
## Loading required package: GenomeInfoDb
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:seqinr':
##
## translate
## The following object is masked from 'package:base':
##
## strsplit
library(devtools)
## Loading required package: usethis
library(ggmsa)
## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
library(BiocManager)
## Bioconductor version '3.13' is out-of-date; the current release version '3.14'
## is available with R version '4.1'; see https://bioconductor.org/install
##
## Attaching package: 'BiocManager'
## The following object is masked from 'package:devtools':
##
## install
library(ape)
##
## Attaching package: 'ape'
## The following object is masked from 'package:Biostrings':
##
## complement
## The following objects are masked from 'package:seqinr':
##
## as.alignment, consensus
library(drawProteins)
library(msa)
##
## Attaching package: 'msa'
## The following object is masked from 'package:BiocManager':
##
## version
# Download the EPAS1 protein record of every species from the NCBI
# "protein" database as raw FASTA text. One object per species; the id is
# the protein accession listed in epas_table.

# H. sapiens (human)
hepas1 <- entrez_fetch(db = "protein", id = "NP_001421.2", rettype = "fasta")

# M. musculus (house mouse)
mepas1 <- entrez_fetch(db = "protein", id = "NP_034267.3", rettype = "fasta")

# R. norvegicus (Norway rat)
nepas1 <- entrez_fetch(db = "protein", id = "XP_038967774.1", rettype = "fasta")

# D. rerio (zebrafish)
zepas1 <- entrez_fetch(db = "protein", id = "NP_001034895.2", rettype = "fasta")

# B. taurus (cattle)
tepas1 <- entrez_fetch(db = "protein", id = "NP_777150.1", rettype = "fasta")

# G. gallus (chicken)
gepas1 <- entrez_fetch(db = "protein", id = "XP_015139104.2", rettype = "fasta")

# C. familiaris (dog)
cepas1 <- entrez_fetch(db = "protein", id = "XP_022280389.2", rettype = "fasta")

# X. tropicalis (frog)
pepas1 <- entrez_fetch(db = "protein", id = "NP_001005647.1", rettype = "fasta")

# I. punctatus (channel catfish)
IPepas1 <- entrez_fetch(db = "protein", id = "NP_001337036.1", rettype = "fasta")

# P. troglodytes (chimp)
MMepas1 <- entrez_fetch(db = "protein", id = "XP_001147219.1", rettype = "fasta")
# Clean every downloaded FASTA record with fasta_cleaner().
# parse = FALSE is used here (the later frequency analysis uses parse = TRUE
# to get a vector); nchar() on these objects reports the full sequence
# length, i.e. each one is a single string.
# FALSE is spelled out because F is an ordinary variable that can be
# reassigned, which would silently flip this argument.
hepas1  <- fasta_cleaner(hepas1,  parse = FALSE)
mepas1  <- fasta_cleaner(mepas1,  parse = FALSE)
nepas1  <- fasta_cleaner(nepas1,  parse = FALSE)
zepas1  <- fasta_cleaner(zepas1,  parse = FALSE)
tepas1  <- fasta_cleaner(tepas1,  parse = FALSE)
gepas1  <- fasta_cleaner(gepas1,  parse = FALSE)
cepas1  <- fasta_cleaner(cepas1,  parse = FALSE)
pepas1  <- fasta_cleaner(pepas1,  parse = FALSE)
IPepas1 <- fasta_cleaner(IPepas1, parse = FALSE)
MMepas1 <- fasta_cleaner(MMepas1, parse = FALSE)
Accession numbers were obtained from RefSeq, RefSeq HomoloGene, UniProt and PDB. UniProt accession numbers can be found by searching for the gene name. PDB accessions can be found by searching with a UniProt accession or a gene name, though many proteins are not in PDB. The Neanderthal genome database was searched but did not yield sequence information on this gene.
# Flat character vector of species records, five fields per record:
#   accession, scientific name, common name, short sequence label, UniProt ID
# It is reshaped into a 10 x 5 table further down.
# NOTE(review): the short labels for cattle, dog, catfish and chimp
# (cepas1, depas1, iepas1, mmepas1) differ from the R objects that hold those
# sequences (tepas1, cepas1, IPepas1, MMepas1) - confirm this is intended.
epas_table <- c(
  "NP_001421.2",    "H. sapiens",     "human",           "hepas1",  "Q99814",
  "NP_034267.3",    "M. musculus",    "house mouse",     "mepas1",  "P97481",
  "XP_038967774.1", "R. norvegicus",  "norway rat",      "nepas1",  "Q9JHS1",
  "NP_001034895.2", "D. rerio",       "zebra fish",      "zepas1",  "B3DJD1",
  "NP_777150.1",    "B. taurus",      "cattle",          "cepas1",  "A0A3Q1MIU5",
  "XP_015139104.2", "G. gallus",      "chicken",         "gepas1",  "Q9W7C6",
  "XP_022280389.2", "C. familiaris",  "dog",             "depas1",  "E2QUK0",
  "NP_001005647.1", "X. tropicalis",  "frog",            "pepas1",  "Q6GL61",
  "NP_001337036.1", "I. punctatus",   "channel catfish", "iepas1",  "K7G1Q3",
  "XP_001147219.1", "P. troglodytes", "chimp",           "mmepas1", "A0A2I3TA92"
)

# print the flat vector to check the entries
epas_table
## [1] "NP_001421.2" "H. sapiens" "human" "hepas1"
## [5] "Q99814" "NP_034267.3" "M. musculus" "house mouse"
## [9] "mepas1" "P97481" "XP_038967774.1" "R. norvegicus"
## [13] "norway rat" "nepas1" "Q9JHS1" "NP_001034895.2"
## [17] "D. rerio" "zebra fish" "zepas1" "B3DJD1"
## [21] "NP_777150.1" "B. taurus" "cattle" "cepas1"
## [25] "A0A3Q1MIU5" "XP_015139104.2" "G. gallus" "chicken"
## [29] "gepas1" "Q9W7C6" "XP_022280389.2" "C. familiaris"
## [33] "dog" "depas1" "E2QUK0" "NP_001005647.1"
## [37] "X. tropicalis" "frog" "pepas1" "Q6GL61"
## [41] "NP_001337036.1" "I. punctatus" "channel catfish" "iepas1"
## [45] "K7G1Q3" "XP_001147219.1" "P. troglodytes" "chimp"
## [49] "mmepas1" "A0A2I3TA92"
# Convert the flat vector to a matrix with one row per species.
# Every record has five fields, so the column count is fixed and the number
# of rows follows from the data (10 for the current 50-element vector).
# TRUE/FALSE are spelled out because T and F can be reassigned.
epas_table_matrix <- matrix(epas_table,
                            ncol = 5,
                            byrow = TRUE)

# Convert to a data frame, keeping all fields as character strings
epas_table <- data.frame(epas_table_matrix,
                         stringsAsFactors = FALSE)

# length() of a data frame is its number of columns
length(epas_table)
## [1] 5
# Name the five columns
names(epas_table) <- c("accession", "name.orig", "common.name", "name.new", "UniProt")

# Create simplified species names.
# The column is created explicitly as all-NA first. Assigning into a subset
# of a column that does not exist yet only works when the first match is
# row 1, and then recycles that value into every row, so an unmatched
# species would silently be labelled "Homo". With an explicit NA column an
# unmatched species stays NA and is visible in the table.
epas_table$spp <- NA_character_

# species epithet (matched literally inside name.orig) -> short label
spp_lookup <- c(sapiens     = "Homo",
                musculus    = "Mus",
                norvegicus  = "Rat",
                rerio       = "zebrafish",
                taurus      = "cattle",
                gallus      = "chicken",
                familiaris  = "dog",
                tropicalis  = "frog",
                punctatus   = "catfish",
                troglodytes = "chimp")

for (epithet in names(spp_lookup)) {
  matched_rows <- grep(epithet, epas_table$name.orig, fixed = TRUE)
  epas_table$spp[matched_rows] <- spp_lookup[[epithet]]
}

# Render the finished table
pander::pander(epas_table)
| accession | name.orig | common.name | name.new | UniProt |
|---|---|---|---|---|
| NP_001421.2 | H. sapiens | human | hepas1 | Q99814 |
| NP_034267.3 | M. musculus | house mouse | mepas1 | P97481 |
| XP_038967774.1 | R. norvegicus | norway rat | nepas1 | Q9JHS1 |
| NP_001034895.2 | D. rerio | zebra fish | zepas1 | B3DJD1 |
| NP_777150.1 | B. taurus | cattle | cepas1 | A0A3Q1MIU5 |
| XP_015139104.2 | G. gallus | chicken | gepas1 | Q9W7C6 |
| XP_022280389.2 | C. familiaris | dog | depas1 | E2QUK0 |
| NP_001005647.1 | X. tropicalis | frog | pepas1 | Q6GL61 |
| NP_001337036.1 | I. punctatus | channel catfish | iepas1 | K7G1Q3 |
| XP_001147219.1 | P. troglodytes | chimp | mmepas1 | A0A2I3TA92 |
| spp |
|---|
| Homo |
| Mus |
| Rat |
| zebrafish |
| cattle |
| chicken |
| dog |
| frog |
| catfish |
| chimp |
Multivariate statistical techniques were used to confirm the information about protein structure and location in the online database.
# Download the feature annotations for UniProt accession Q99814
# (the human entry in epas_table)
Q99814_h <- drawProteins::get_features("Q99814")
## [1] "Download has worked"
# Check what kind of object came back (a list)
is(Q99814_h)
## [1] "list" "vector" "list_OR_List" "vector_OR_Vector"
## [5] "vector_OR_factor"
# Convert the feature list into a data frame with one row per feature
my_prot_df <- drawProteins::feature_to_dataframe(Q99814_h)
# Confirm the conversion produced a data frame
is(my_prot_df)
## [1] "data.frame" "list" "oldClass" "vector"
## [5] "list_OR_List" "vector_OR_Vector" "vector_OR_factor"
# Print the feature table without its second column
my_prot_df[,-2]
## type begin end length accession entryName taxid order
## featuresTemp CHAIN 1 870 869 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.1 DOMAIN 14 67 53 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.2 DOMAIN 84 154 70 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.3 DOMAIN 230 300 70 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.4 DOMAIN 304 347 43 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.5 REGION 1 24 23 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.6 REGION 26 53 27 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.7 REGION 171 192 21 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.8 REGION 460 486 26 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.9 REGION 496 542 46 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.10 REGION 830 870 40 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.11 COMPBIAS 462 486 24 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.12 MOD_RES 405 405 0 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.13 MOD_RES 531 531 0 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.14 MOD_RES 840 840 0 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.15 MOD_RES 847 847 0 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.16 VARIANT 534 534 0 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.17 VARIANT 535 535 0 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.18 VARIANT 535 535 0 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.19 VARIANT 537 537 0 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.20 VARIANT 537 537 0 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.21 VARIANT 540 540 0 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.22 VARIANT 766 766 0 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.23 VARIANT 785 785 0 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.24 MUTAGEN 844 844 0 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.25 CONFLICT 60 60 0 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.26 CONFLICT 539 539 0 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.27 CONFLICT 601 601 0 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.28 CONFLICT 693 693 0 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.29 CONFLICT 716 716 0 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.30 CONFLICT 722 722 0 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.31 CONFLICT 765 765 0 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.32 CONFLICT 769 769 0 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.33 CONFLICT 844 844 0 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.34 CONFLICT 847 847 0 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.35 HELIX 240 242 2 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.36 STRAND 243 248 5 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.37 TURN 250 252 2 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.38 STRAND 253 257 4 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.39 HELIX 260 265 5 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.40 HELIX 269 272 3 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.41 HELIX 277 280 3 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.42 HELIX 283 285 2 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.43 HELIX 286 299 13 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.44 STRAND 300 303 3 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.45 STRAND 307 310 3 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.46 STRAND 314 327 13 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.47 TURN 329 331 2 Q99814 EPAS1_HUMAN 9606 1
## featuresTemp.48 STRAND 334 343 9 Q99814 EPAS1_HUMAN 9606 1
# Draw the human EPAS1 diagram in layers: an empty canvas sized from the
# feature table, then the chain, then the domains. Printing the object at
# the end renders the figure.
my_canvas <- drawProteins::draw_canvas(my_prot_df)
my_canvas <- drawProteins::draw_chains(my_canvas, my_prot_df, label_size = 2.5)
my_canvas <- drawProteins::draw_domains(my_canvas, my_prot_df)
my_canvas
# Same diagram for UniProt accession Q9JHS1 (the Norway rat entry in
# epas_table). Note that this one adds the REGION features, not the domains.
# my_prot_df and my_canvas are overwritten with the rat data.
Q9JHS1_n <- drawProteins::get_features("Q9JHS1")
## [1] "Download has worked"
my_prot_df <- drawProteins::feature_to_dataframe(Q9JHS1_n)

my_canvas <- drawProteins::draw_canvas(my_prot_df)
my_canvas <- drawProteins::draw_chains(my_canvas, my_prot_df, label_size = 2.5)
my_canvas <- drawProteins::draw_regions(my_canvas, my_prot_df)
my_canvas
# Fetch the human EPAS1 FASTA record again and clean it with the default
# fasta_cleaner() settings so that it can be indexed by position
h_fasta <- rentrez::entrez_fetch(db = "protein",
                                 id = "NP_001421.2",
                                 rettype = "fasta")
h_vector <- fasta_cleaner(h_fasta)

# 2 x 2 panel layout with thin margins
par(mfrow = c(2, 2),
    mar = c(0, 0, 2, 1))

# All four panels compare positions 400-600 with themselves and differ only
# in window size (wsize) and matches required per window (nmatch).

# panel 1: wsize = 1, nmatch = 1
seqinr::dotPlot(h_vector[400:600], h_vector[400:600],
                wsize = 1, nmatch = 1,
                main = "EPAS1 Defaults")

# panel 2: wsize = 10, nmatch = 1
dotPlot(h_vector[400:600], h_vector[400:600],
        wsize = 10, nmatch = 1,
        main = " EPAS1- size = 10, nmatch = 1")

# panel 3: wsize = 10, nmatch = 5
dotPlot(h_vector[400:600], h_vector[400:600],
        wsize = 10, nmatch = 5,
        main = "EPAS1 - size = 10, nmatch = 5")

# panel 4: wsize = 20, nmatch = 5
dotPlot(h_vector[400:600], h_vector[400:600],
        wsize = 20, nmatch = 5,
        main = "EPAS1 - size = 20, nmatch = 5")
Below are links to relevant information. The following were not available for this gene: 1. DisProt: no information available on presence of disorder 2. RepeatsDB: no information available
PDB secondary structural location: Crystal structure of PT2399 bound to HIF2a-B:ARNT-B complex
The EPAS1 homolog is listed in AlphaFold (https://alphafold.ebi.ac.uk/entry/Q99814).
# Summary of what each external resource reports for the protein.
# The three vectors are parallel: element i of each describes the same
# resource. "NA" is stored as text, not as a missing value.

# resource consulted
source <- c(
  "PFam",
  "DisProt",
  "RepeatsDB",
  "UniProt",
  "PDB"
)

# property looked up in that resource
property <- c(
  "major feature from Pfam - domain",
  "Presence of disorganized regions",
  "Presence of tandem repeats",
  "subcellular location",
  "Protein secondary structural classifications"
)

# what was found; the first entry is one string holding four names
# separated by line breaks
Info <- c(
  "PAS
PAS_3
HIF-1
HIF-1a_CTAD",
  "NA",
  "NA",
  "nucleus",
  "1P97- NMR structure of the C-terminal PAS domain of HIF2a"
)

properties_table <- data.frame(source, property, Info)
pander::pander(properties_table)
| source | property | Info |
|---|---|---|
| PFam | major feature from Pfam - domain | PAS PAS_3 HIF-1 HIF-1a_CTAD |
| DisProt | Presence of disorganized regions | NA |
| RepeatsDB | Presence of tandem repeats | NA |
| UniProt | subcellular location | nucleus |
| PDB | Protein secondary structural classifications | 1P97- NMR structure of the C-terminal PAS domain of HIF2a |
# enter once
aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
"L","K","M","F","P","S","T","W","Y","V")
# enter again
aa.1.2 <- c("A","R","N","D","C","Q","E","G","H","I",
"L","K","M","F","P","S","T","W","Y","V")
# are they the same length?
length(aa.1.1) == length(aa.1.2)
## [1] TRUE
# are all the values the same?
aa.1.1 == aa.1.2
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE
#length(unique(aa.1.1)) tells me how many unique values
unique(aa.1.1)
## [1] "A" "R" "N" "D" "C" "Q" "E" "G" "H" "I" "L" "K" "M" "F" "P" "S" "T" "W" "Y"
## [20] "V"
# does each vector have the same number of unique values?
length(unique(aa.1.1)) == length(unique(aa.1.2))
## [1] TRUE
# OPTIONAL: are any of the values returned by the logical check == FALSE?
any(c(aa.1.1 == aa.1.2) == FALSE)
## [1] FALSE
# alpha proteins
alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91,
221, 249, 48, 123, 82, 122, 119, 33, 63, 167)
# check against chou's total
sum(alpha) == 2447
## [1] TRUE
# beta proteins
beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120,
177, 115, 16, 85, 127, 341, 253, 44, 110, 229)
# check against chou's total
sum(beta) == 2776
## [1] TRUE
# alpha + beta
a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
110, 112, 25, 52, 71, 126, 117, 30, 108, 123)
sum(a.plus.b) == 1889
## [1] TRUE
# alpha/beta
a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239,
339, 321, 91, 158, 188, 327, 238, 72, 130, 378)
sum(a.div.b) == 4333
## [1] TRUE
data.frame(aa.1.1, alpha, beta, a.plus.b, a.div.b)
## aa.1.1 alpha beta a.plus.b a.div.b
## 1 A 285 203 175 361
## 2 R 53 67 78 146
## 3 N 97 139 120 183
## 4 D 163 121 111 244
## 5 C 22 75 74 63
## 6 Q 67 122 74 114
## 7 E 134 86 86 257
## 8 G 197 297 171 377
## 9 H 111 49 33 107
## 10 I 91 120 93 239
## 11 L 221 177 110 339
## 12 K 249 115 112 321
## 13 M 48 16 25 91
## 14 F 123 85 52 158
## 15 P 82 127 71 188
## 16 S 122 341 126 327
## 17 T 119 253 117 238
## 18 W 33 44 30 72
## 19 Y 63 110 108 130
## 20 V 167 229 123 378
# Turn the counts of each fold class into proportions that sum to 1.
# Note that a.div.b is overwritten in place: from here on it holds
# proportions, not counts, and keeps its name without a ".prop" suffix.
alpha.prop    <- alpha / sum(alpha)
beta.prop     <- beta / sum(beta)
a.plus.b.prop <- a.plus.b / sum(a.plus.b)
a.div.b       <- a.div.b / sum(a.div.b)

# Collect the four proportion vectors in one table, one row per amino acid,
# labelled with the one-letter codes
aa.prop <- data.frame(alpha.prop, beta.prop, a.plus.b.prop, a.div.b,
                      row.names = aa.1.1)

# print the table
aa.prop
## alpha.prop beta.prop a.plus.b.prop a.div.b
## A 0.116469146 0.073126801 0.09264161 0.08331410
## R 0.021659174 0.024135447 0.04129169 0.03369490
## N 0.039640376 0.050072046 0.06352567 0.04223402
## D 0.066612178 0.043587896 0.05876125 0.05631202
## C 0.008990601 0.027017291 0.03917417 0.01453958
## Q 0.027380466 0.043948127 0.03917417 0.02630972
## E 0.054760932 0.030979827 0.04552673 0.05931225
## G 0.080506743 0.106988473 0.09052409 0.08700669
## H 0.045361667 0.017651297 0.01746956 0.02469421
## I 0.037188394 0.043227666 0.04923240 0.05515809
## L 0.090314671 0.063760807 0.05823187 0.07823679
## K 0.101757254 0.041426513 0.05929063 0.07408262
## M 0.019615856 0.005763689 0.01323452 0.02100162
## F 0.050265631 0.030619597 0.02752779 0.03646434
## P 0.033510421 0.045749280 0.03758602 0.04338795
## S 0.049856968 0.122838617 0.06670196 0.07546734
## T 0.048630977 0.091138329 0.06193753 0.05492730
## W 0.013485901 0.015850144 0.01588142 0.01661666
## Y 0.025745811 0.039625360 0.05717311 0.03000231
## V 0.068246833 0.082492795 0.06511382 0.08723748
# Scatterplot matrix of the four fold-class proportion columns, with a
# smoothed trend line in each panel
plot(aa.prop,panel = panel.smooth)
# Correlation between every pair of fold-class columns
cor(aa.prop)
## alpha.prop beta.prop a.plus.b.prop a.div.b
## alpha.prop 1.0000000 0.4941143 0.6969508 0.8555289
## beta.prop 0.4941143 1.0000000 0.7977771 0.7706654
## a.plus.b.prop 0.6969508 0.7977771 1.0000000 0.8198043
## a.div.b 0.8555289 0.7706654 0.8198043 1.0000000
# Same matrix rounded to three decimals for readability
round(cor(aa.prop), 3)
## alpha.prop beta.prop a.plus.b.prop a.div.b
## alpha.prop 1.000 0.494 0.697 0.856
## beta.prop 0.494 1.000 0.798 0.771
## a.plus.b.prop 0.697 0.798 1.000 0.820
## a.div.b 0.856 0.771 0.820 1.000
# Three panels in one row: alpha proportions against each other fold class
par(mfrow = c(1,3), mar = c(4,4,1,0))
plot(alpha.prop ~ beta.prop, data = aa.prop)
plot(alpha.prop ~ a.plus.b.prop, data = aa.prop)
plot(alpha.prop ~ a.div.b, data = aa.prop)
# Back to a single panel with equal margins
par(mfrow = c(1,1), mar = c(4,4,4,4))
# Fetch the human EPAS1 FASTA record
NP_001421 <- rentrez::entrez_fetch(db = "protein",
                                   id = "NP_001421.2",
                                   rettype = "fasta")

# Clean it and split it into a vector (parse = TRUE)
NP_001421 <- compbio4all::fasta_cleaner(NP_001421, parse = TRUE)

# Count each residue and divide by the sequence length to get frequencies
NP_001421.freq.table <- table(NP_001421) / length(NP_001421)

# print the frequency table
NP_001421.freq.table
## NP_001421
## A C D E F G
## 0.060919540 0.025287356 0.050574713 0.067816092 0.043678161 0.057471264
## H I K L M N
## 0.031034483 0.031034483 0.057471264 0.100000000 0.033333333 0.037931034
## P Q R S T V
## 0.080459770 0.045977011 0.040229885 0.109195402 0.057471264 0.037931034
## W Y
## 0.006896552 0.025287356
# Convert a one-dimensional table into a plain named vector.
#
# table_x: a table (or any object whose "dimnames" attribute holds the
#          labels in its first element)
# returns: the values of table_x without the table structure, named with
#          the labels of its first dimension
table_to_vector <- function(table_x) {
  labels <- attr(table_x, "dimnames")[[1]]
  stats::setNames(as.vector(table_x), labels)
}
# Human EPAS1 residue frequencies as a plain named vector
human.aa.freq <- table_to_vector(table_x = NP_001421.freq.table)

# One-letter codes that occur in the human sequence
aa.names <- names(human.aa.freq)

# Is "U" among them? It is not one of the 20 codes in aa.1.1, so it would
# have no counterpart in the fold-class table.
any("U" == aa.names)
## [1] FALSE
# Similarity between two numeric vectors of equal length:
#   sum(x * y) / sqrt(sum(x^2) * sum(y^2))
# The vectors are not mean-centred, so this is the same quantity that
# chou_cosine() computes, not a Pearson correlation.
#
# x, y:    numeric vectors holding the same items in the same order
# returns: a single number
chou_cor <- function(x, y) {
  cross_product <- sum(x * y)
  norm_product <- sqrt((sum(x^2)) * (sum(y^2)))
  cross_product / norm_product
}
# Cosine of the angle between two numeric vectors: their dot product
# divided by the product of their Euclidean lengths.
#
# z.1, z.2: numeric vectors holding the same items in the same order
# returns:  a single number
chou_cosine <- function(z.1, z.2) {
  length.1 <- sqrt(sum(z.1^2))
  length.2 <- sqrt(sum(z.2^2))
  dot.product <- sum(z.1 * z.2)
  dot.product / (length.1 * length.2)
}
# Compare the amino acid composition of the human protein with each of the
# four fold classes.
#
# human.aa.freq is derived from table(), so its elements are in alphabetical
# order of the one-letter codes, while the rows of aa.prop follow aa.1.1
# (A, R, N, D, ...). Re-order the human frequencies by name so that element i
# of every vector refers to the same amino acid. A residue that never occurs
# in the protein has no entry in the table and is given frequency 0.
human.aa.freq.ordered <- human.aa.freq[row.names(aa.prop)]
human.aa.freq.ordered[is.na(human.aa.freq.ordered)] <- 0
names(human.aa.freq.ordered) <- row.names(aa.prop)

# Copy of aa.prop with the aligned human frequencies as an extra column, so
# the plot formulas find both variables, row-matched, in the same table
aa.prop.human <- cbind(aa.prop, human.aa.freq = human.aa.freq.ordered)

# 2 x 2 grid: each fold class against the human frequencies
par(mfrow = c(2,2), mar = c(1,4,1,1))
plot(alpha.prop ~ human.aa.freq, data = aa.prop.human)
plot(beta.prop ~ human.aa.freq, data = aa.prop.human)
plot(a.plus.b.prop ~ human.aa.freq, data = aa.prop.human)
plot(a.div.b ~ human.aa.freq, data = aa.prop.human)
par(mfrow = c(1,1), mar = c(1,1,1,1))

# Correlation-style similarity between the human protein and each fold class.
# The previous calls passed the whole aa.prop table as both arguments
# (aa.prop[ , ] has no column selected), which compares the table with itself
# and always yields 1.
corr.alpha <- chou_cor(aa.prop.human$alpha.prop,    aa.prop.human$human.aa.freq)
corr.beta  <- chou_cor(aa.prop.human$beta.prop,     aa.prop.human$human.aa.freq)
corr.apb   <- chou_cor(aa.prop.human$a.plus.b.prop, aa.prop.human$human.aa.freq)
corr.adb   <- chou_cor(aa.prop.human$a.div.b,       aa.prop.human$human.aa.freq)

# Cosine similarity between the human protein and each fold class
cos.alpha <- chou_cosine(aa.prop.human$alpha.prop,    aa.prop.human$human.aa.freq)
cos.beta  <- chou_cosine(aa.prop.human$beta.prop,     aa.prop.human$human.aa.freq)
cos.apb   <- chou_cosine(aa.prop.human$a.plus.b.prop, aa.prop.human$human.aa.freq)
cos.adb   <- chou_cosine(aa.prop.human$a.div.b,       aa.prop.human$human.aa.freq)
# Transpose so that each fold class is a row and each amino acid a column;
# dist() measures distances between rows
aa.prop.flipped <- t(aa.prop)
# Show the transposed proportions rounded to two decimals
round(aa.prop.flipped,2)
## A R N D C Q E G H I L K M
## alpha.prop 0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09 0.10 0.02
## beta.prop 0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06 0.04 0.01
## a.plus.b.prop 0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06 0.06 0.01
## a.div.b 0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08 0.07 0.02
## F P S T W Y V
## alpha.prop 0.05 0.03 0.05 0.05 0.01 0.03 0.07
## beta.prop 0.03 0.05 0.12 0.09 0.02 0.04 0.08
## a.plus.b.prop 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## a.div.b 0.04 0.04 0.08 0.05 0.02 0.03 0.09
# Euclidean distance between every pair of fold classes
dist(aa.prop.flipped, method = "euclidean")
## alpha.prop beta.prop a.plus.b.prop
## beta.prop 0.13342098
## a.plus.b.prop 0.09281824 0.08289406
## a.div.b 0.06699039 0.08659174 0.06175113
# Euclidean distance between the human protein and each fold class.
# The previous version measured every fold class against the alpha row
# (and alpha against itself, giving 0), so the human protein never entered
# the comparison.
#
# The human frequencies are put in the column order of aa.prop.flipped so
# that each position refers to the same amino acid; a residue absent from
# the protein has no entry and is given frequency 0.
human.row <- human.aa.freq[colnames(aa.prop.flipped)]
human.row[is.na(human.row)] <- 0
names(human.row) <- colnames(aa.prop.flipped)

dist.alpha <- dist(rbind(human = human.row,
                         aa.prop.flipped["alpha.prop", ]),
                   method = "euclidean")
dist.beta  <- dist(rbind(human = human.row,
                         aa.prop.flipped["beta.prop", ]),
                   method = "euclidean")
dist.apb   <- dist(rbind(human = human.row,
                         aa.prop.flipped["a.plus.b.prop", ]),
                   method = "euclidean")
dist.adb   <- dist(rbind(human = human.row,
                         aa.prop.flipped["a.div.b", ]),
                   method = "euclidean")
# Summary table: one row per fold class with its similarity and distance
# scores.

# fold types
fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")

# data, rounded to five decimals
corr.sim <- round(c(corr.alpha,corr.beta,corr.apb,corr.adb),5)
cosine.sim <- round(c(cos.alpha,cos.beta,cos.apb,cos.adb),5)
Euclidean.dist <- round(c(dist.alpha,dist.beta,dist.apb,dist.adb),5)

# summary flags, derived from the scores instead of being typed in by hand:
# "most.sim" marks the highest similarity, "min.dist" the smallest distance.
# Ties are all flagged; missing scores are never flagged.
sim.sum <- ifelse(!is.na(corr.sim) &
                    corr.sim == max(corr.sim, na.rm = TRUE),
                  "most.sim", "")
dist.sum <- ifelse(!is.na(Euclidean.dist) &
                     Euclidean.dist == min(Euclidean.dist, na.rm = TRUE),
                   "min.dist", "")

df <- data.frame(fold.type,
                 corr.sim,
                 cosine.sim,
                 Euclidean.dist,
                 sim.sum,
                 dist.sum)
pander::pander(df)
| fold.type | corr.sim | cosine.sim | Euclidean.dist | sim.sum | dist.sum |
|---|---|---|---|---|---|
| alpha | 1 | 1 | 0 | ||
| beta | 1 | 1 | 0.1334 | ||
| alpha plus beta | 1 | 1 | 0.09282 | most.sim | min.dist |
| alpha/beta | 1 | 1 | 0.06699 |
# Prepare the four sequences used in the pairwise alignments: collapse each
# cleaned sequence into a single string and convert it to upper case.
# human
homoseq_string <- toupper(paste(hepas1, collapse = ""))
# house mouse
musseq_string <- toupper(paste(mepas1, collapse = ""))
# chimp
monseq_string <- toupper(paste(MMepas1, collapse = ""))
# chicken
gseq_string <- toupper(paste(gepas1, collapse = ""))
# Align two protein strings with the settings shared by all six comparisons:
# BLOSUM62 scoring, gapOpening = -8, gapExtension = -2, and the full
# alignment (not only its score) returned.
#
# pattern, subject: the two sequences, each as a single string
# returns:          whatever Biostrings::pairwiseAlignment() returns
align_epas_pair <- function(pattern, subject) {
  Biostrings::pairwiseAlignment(pattern,
                                subject,
                                substitutionMatrix = "BLOSUM62",
                                gapOpening = -8,
                                gapExtension = -2,
                                scoreOnly = FALSE)
}

# All six pairings of human, mouse, chimp and chicken
# human vs mouse
align01.02 <- align_epas_pair(homoseq_string, musseq_string)
# human vs chimp
align01.05 <- align_epas_pair(homoseq_string, monseq_string)
# human vs chicken
align01.06 <- align_epas_pair(homoseq_string, gseq_string)
# mouse vs chimp
align02.05 <- align_epas_pair(musseq_string, monseq_string)
# mouse vs chicken
align02.06 <- align_epas_pair(musseq_string, gseq_string)
# chimp vs chicken
align05.06 <- align_epas_pair(monseq_string, gseq_string)
# Lower-triangular matrix of percent identity between the four species.
# pid() reports identity as a percentage (0-100), so a sequence compared with
# itself is 100 on the diagonal; the previous value of 1 was on a different
# scale from the rest of the table. The upper triangle is left as NA.
pids <- c(100,             NA,              NA,              NA,
          pid(align01.02), 100,             NA,              NA,
          pid(align01.05), pid(align02.05), 100,             NA,
          pid(align01.06), pid(align02.06), pid(align05.06), 100)

# Fill row by row (TRUE spelled out: T can be reassigned)
mat <- matrix(pids, nrow = 4, byrow = TRUE)

# Same species labels on rows and columns: human, mouse, chimp, chicken
row.names(mat) <- c("Homo","Mus","Pan","Gal")
colnames(mat) <- c("Homo","Mus","Pan","Gal")
pander::pander(mat)
| Homo | Mus | Pan | Gal | |
|---|---|---|---|---|
| Homo | 1 | NA | NA | NA |
| Mus | 87.89 | 1 | NA | NA |
| Pan | 99.89 | 87.77 | 1 | NA |
| Gal | 74.55 | 71.04 | 74.55 | 1 |
# Compare the four percent-identity definitions for the human vs chimp
# alignment. Each record is: method name, value, denominator used.
# The values share a vector with text, so they are stored as character
# strings. The first call names type = "PID1" explicitly to match the others.
homovschimp_table <- c(
  "PID1", pid(align01.05, type = "PID1"), "(aligned positions + internal gap positions)",
  "PID2", pid(align01.05, type = "PID2"), "(aligned positions)",
  "PID3", pid(align01.05, type = "PID3"), "(length shorter sequence)",
  "PID4", pid(align01.05, type = "PID4"), "(average length of the two sequences)"
)

# One row per method (TRUE/FALSE spelled out: T and F can be reassigned)
homovschimp_matrix <- matrix(homovschimp_table,
                             nrow = 4, byrow = TRUE)
homovschimp <- data.frame(homovschimp_matrix,
                          stringsAsFactors = FALSE)
names(homovschimp) <- c("method", "PID", "denomenator")
pander::pander(homovschimp)
| method | PID | denomenator |
|---|---|---|
| PID1 | 99.8850574712644 | (aligned positions + internal gap positions) |
| PID2 | 99.8850574712644 | (aligned positions) |
| PID3 | 99.8850574712644 | (length shorter sequence) |
| PID4 | 99.8850574712644 | (average length of the two sequences) |
# Length, in characters, of each cleaned sequence
# human
nchar(hepas1)
## [1] 870
# house mouse
nchar(mepas1)
## [1] 874
# Norway rat
nchar(nepas1)
## [1] 885
# zebrafish
nchar(zepas1)
## [1] 834
# cattle
nchar(tepas1)
## [1] 870
# chicken
nchar(gepas1)
## [1] 871
# dog
nchar(cepas1)
## [1] 869
# frog
nchar(pepas1)
## [1] 862
# channel catfish
nchar(IPepas1)
## [1] 820
# chimp
nchar(MMepas1)
## [1] 870
# Align human against mouse with pairwiseAlignment()'s default settings:
# unlike the alignments above, no substitution matrix or gap penalties are
# passed here.
# NOTE(review): the default scoring may not be suited to protein sequences - confirm.
align.epas <- Biostrings::pairwiseAlignment(hepas1, mepas1)

# print a summary of the alignment
align.epas
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MTADKEKKRSSSERRKEKSRDAARCRRSKETEVF...PELTRYDCEVNVPVLGSSTLLQGGDLLRALDQAT
## subject: MTADKEKKRSSSELRKEKSRDAARCRRSKETEVF...PELTRYDCEVNVPVPGSSTLLQGRDLLRALDQAT
## score: 2641.632
# The ten accessions collected in the species table
epas_table$accession
## [1] "NP_001421.2" "NP_034267.3" "XP_038967774.1" "NP_001034895.2"
## [5] "NP_777150.1" "XP_015139104.2" "XP_022280389.2" "NP_001005647.1"
## [9] "NP_001337036.1" "XP_001147219.1"

# Fetch all of them in a single request; the result holds every FASTA
# record, one after another
epas1 <- rentrez::entrez_fetch(db = "protein",
                               id = epas_table$accession,
                               rettype = "fasta")

# Print the raw text so the line breaks are shown as line breaks
cat(epas1)
## >NP_001421.2 endothelial PAS domain-containing protein 1 [Homo sapiens]
## MTADKEKKRSSSERRKEKSRDAARCRRSKETEVFYELAHELPLPHSVSSHLDKASIMRLAISFLRTHKLL
## SSVCSENESEAEADQQMDNLYLKALEGFIAVVTQDGDMIFLSENISKFMGLTQVELTGHSIFDFTHPCDH
## EEIRENLSLKNGSGFGKKSKDMSTERDFFMRMKCTVTNRGRTVNLKSATWKVLHCTGQVKVYNNCPPHNS
## LCGYKEPLLSCLIIMCEPIQHPSHMDIPLDSKTFLSRHSMDMKFTYCDDRITELIGYHPEELLGRSAYEF
## YHALDSENMTKSHQNLCTKGQVVSGQYRMLAKHGGYVWLETQGTVIYNPRNLQPQCIMCVNYVLSEIEKN
## DVVFSMDQTESLFKPHLMAMNSIFDSSGKGAVSEKSNFLFTKLKEEPEELAQLAPTPGDAIISLDFGNQN
## FEESSAYGKAILPPSQPWATELRSHSTQSEAGSLPAFTVPQAAAPGSTTPSATSSSSSCSTPNSPEDYYT
## SLDNDLKIEVIEKLFAMDTEAKDQCSTQTDFNELDLETLAPYIPMDGEDFQLSPICPEERLLAENPQSTP
## QHCFSAMTNIFQPLAPVAPHSPFLLDKFQQQLESKKTEPEHRPMSSIFFDAGSKASLPPCCGQASTPLSS
## MGGRSNTQWPPDPPLHFGPTKWAVGDQRTEFLGAAPLGPPVSPPHVSTFKTRSAKGFGARGPDVLSPAMV
## ALSNKLKLKRQLEYEEQAFQDLSGGDPPGGSTSHLMWKRMKNLRGGSCPLMPDKPLSANVPNDKFTQNPM
## RGLGHPLRHLPLPQPPSAISPGENSKSRFPPQCYATQYQDYSLSSAHKVSGMASRLLGPSFESYLLPELT
## RYDCEVNVPVLGSSTLLQGGDLLRALDQAT
##
## >NP_034267.3 endothelial PAS domain-containing protein 1 [Mus musculus]
## MTADKEKKRSSSELRKEKSRDAARCRRSKETEVFYELAHELPLPHSVSSHLDKASIMRLAISFLRTHKLL
## SSVCSENESEAEADQQMDNLYLKALEGFIAVVTQDGDMIFLSENISKFMGLTQVELTGHSIFDFTHPCDH
## EEIRENLTLKNGSGFGKKSKDVSTERDFFMRMKCTVTNRGRTVNLKSATWKVLHCTGQVRVYNNCPPHSS
## LCGSKEPLLSCLIIMCEPIQHPSHMDIPLDSKTFLSRHSMDMKFTYCDDRILELIGYHPEELLGRSAYEF
## YHALDSENMTKSHQNLCTKGQVVSGQYRMLAKHGGYVWLETQGTVIYNPRNLQPQCIMCVNYVLSEIEKN
## DVVFSMDQTESLFKPHLMAMNSIFDSSDDVAVTEKSNYLFTKLKEEPEELAQLAPTPGDAIISLDFGSQN
## FDEPSAYGKAILPPGQPWVSGLRSHSAQSESGSLPAFTVPQADTPGNTTPSASSSSSCSTPSSPEDYYSS
## LENPLKIEVIEKLFAMDTEPRDPGSTQTDFSELDLETLAPYIPMDGEDFQLSPICPEEPLMPESPQPTPQ
## HCFSTMTSIFQPLTPGATHGPFFLDKYPQQLESRKTESEHWPMSSIFFDAGSKGSLSPCCGQASTPLSSM
## GGRSNTQWPPDPPLHFGPTKWPVGDQSAESLGALPVGSSQLEPPSAPPHVSMFKMRSAKDFGARGPYMMS
## PAMIALSNKLKLKRQLEYEEQAFQDTSGGDPPGTSSSHLMWKRMKSLMGGTCPLMPDKTISANMAPDEFT
## QKSMRGLGQPLRHLPPPQPPSTRSSGENAKTGFPPQCYASQFQDYGPPGAQKVSGVASRLLGPSFEPYLL
## PELTRYDCEVNVPVPGSSTLLQGRDLLRALDQAT
##
## >XP_038967774.1 endothelial PAS domain-containing protein 1 isoform X1 [Rattus norvegicus]
## MTAEWRELWEIRGQAGKDWRSSSELRKEKSRDAARCRRSKETEVFYELAHELPLPHSVSSHLDKASIMRL
## AISFLRTHKLLSSVCSENESEAEADQQMDNLYLKALEGFIAVVTQDGDMIFLSENISKFMGLTQVELTGH
## SIFDFTHPCDHEEIRENLTLKTGSGFGKKNKDMSTERDFFMRMKCTVTNRGRTVNLKSATWKVLHCTGQV
## RVYNNCPPHSSLCGYKEPLLSCLIIMCEPIQHPSHMDIPLDSKTFLSRHSMDMKFTYCDDRILELVGYHP
## EELLGRSAYEFYHALDSENMTKSHQNLCTKGQVVSGQYRMLAKHGGYVWLETQGTVVYNPRNLQPQCIMC
## VNYVLSEIEKNDVVFSMDQTESLFKPHLMAMNSIFDSSDDVALSEKSNYLFTKLKEEPEDLAQLAPTPGD
## AIISLDFGSQNFDESSTYGKAILPPGQPWATELRSHSAQSESRSLPAFTVPQAGSPGNATPSATSSSSCS
## TPSSPEDYYSSLENHLKIEVIEKLFAMDTEAKDQCSTQTDFNELDLETLAPYIPMDGEDFQLSPICPEEP
## LVPESPQPNPQHCFSTMSSIFQPLTPGASQGTFFLDKYPQQLESRKTESEHWPMSTIFFDAGSKGSLPPC
## CGQASTPLSSMGGRSNTQWPPDPPLHFGPTKWSVGNQSAEPLGPLPLGTSQLEPPSTPPHVSMFKMRSAK
## DFGARGPYMMSPAMIALSNKLKLKRQLDYEEPAFQDTSGGDPPGTSSSHLMWKRMKSLMGGTCPLMPDKT
## VSASMAPDEFTQKSMRGLGQPLRHLPPSQPPSTRSPGENAKSGFPPQCYASPFQDYSPPGAQKGSGVASR
## LLGPSFEPYLLPELTRYDCEVNVPVPGSSTLLQGRDLLRALDQAT
##
## >NP_001034895.2 endothelial PAS domain-containing protein 1b [Danio rerio]
## MTAEKEKKRCSSERRKEKSRDAARCRRSKETEVFYELAHHLPLPHSISSHLDKASIMRLAISFLRTRKLV
## NSGYNTPTEMTDADRLMDSWYLKSLGGFITVVTSDGDMIFLSENINKFMGLTQVELTGHSIFDFTHPCDH
## EEIRENLSLKAGIGKKGKELSTERDFFMRMKCTVTNRGRTVNLKSASWKVLHCTGHLKVCNGCPARVLCG
## FKEPPLTCVVMMCEPIVHPSNIDTPLDSKTFLSRHSMDMKYTYCDERVTELMGYNPEDLLGRSAYEFYHA
## LDAENVTKSHQNLCTKGQAVSGQYRMLAKNGGYVWVETRGTVIYNSRNSQPQCIVCVNYVLSDVEEKSLI
## FSMDQTESLFKPHKLNGFFSPKEALGSDPADLLFTKLKEEPEDLTQLAPTPGDTIISLDFGQSQYEEHTV
## YNKVSSVAQTVSHPVHDGHRTSYSGEMAKMAATFSVPQSAPPSSATPSLSSCSTPSSPDDYYTPVDSDLK
## VELTEKLFSLDTQEAKTSRNQETDLSDLDLETLAPYIPMDGEDFQLNPICPEEPPSEIGTLGTNQQCFSN
## ITSLFQPLSSPSAAHYQPKMSSGGDKQNINGGSVESWPPVPYSRDPMQMPPYHDPASTPLSSMGGRQNLQ
## WPPDPPLPSKAGMMDPLAAKRSCQGMPANRMAPFMQRPMENFVQNYRDTSPARLALANSFKRSFSQMAMA
## ETPPTKSQQTVWKKLRHESCAVMERKSLSSSALSDKSMAHNGGMDHQHRKSQYSGNQNGQPTKHYREQFC
## NYREFNMQPSSKMDGIASRLIGPSFETYSLPELTRYDCEVNVPLQGNLHLLQGSDLLRALDQAT
##
## >NP_777150.1 endothelial PAS domain-containing protein 1 [Bos taurus]
## MTADKEKKRSSSERRKEKSRDAARCRRSKETEVFYELAHELPLPHSVSSHLDKASIMRLAISFLRTHKLL
## SSVCSENESEADADQQMDNLYLKALEGFIAVVTQDGDMIFLSENISKFMGLTQVELTGHSIFDFTHPCDH
## EEIRENLSLKNGSGFGKKSKDMSTERDFFMRMKCTVTNRGRTVNLKSATWKVLHCTGQVKVYNNCPPHSS
## LCGCKEPLLSCLIIMCEPIQHPSHMDIPLDSKTFLSRHSMDMKFTYCDDRITELVGYHPEELLGRSAYEF
## YHALDSENMTKSHQNLCTKGQVVSGQYRMLAKHGGYVWLETQGTVIYNPRNLQPQCIMCVNYVLSEIEKN
## DVVFSMDQTESLFKPHLLTMNSIFDNSGKVAVSEKSNFLFTKLKEEPEELAQLAPTAGDTIISLDFGTPN
## FEESSAYGKGILPPGQQWTGEVKSHGTHSEAGSLPAFTVPQAAALGNSTPSASSSSSCSTPSSPGDYYTS
## LDDNLKIEAIEKLFAMDTEAKDQCGTQTDFNELDLETLAPYIPMDGEDFQLSPICPEESLLPETPQSAPQ
## HCFSTMSNIFQPLAPMASHSTFLLDKYQQQLESKKTEPEQRRVSFAFFDGGSRVSLLQCCGQTYTPLSSM
## GGISNTQWPPDPPLQLGPTKWPGEDRHAEAVGAAPLGLPPATPHLAMLKKRSAKGFGPQGPDVMSPAMIA
## LSNKLKLKRQLEYEEQAFQDMSGGDPPGSGTSHLMWKRMKSLRGGGTCSLMPDKLPNANVPNDEFIQNPV
## RGRSQPLRHLSPPQPPSATSPGEPTKSGFPAQCYAPQYQDYSLPAAHKMSGMASRLLGPSFEPYLLPELT
## RYDCEVNVPVPGTSTLLQGGDLLRALDQAT
##
## >XP_015139104.2 endothelial PAS domain-containing protein 1 isoform X1 [Gallus gallus]
## MAEADHPAPSSAESSSERRKEKSRDAARCRRSKETEVFYELAHELPLPHNVSSHLDKASIMRLAISFLRT
## HKLLSSVCADNENELEADQQMDNLYLKALEGFIAVVTQDGDMIFLSENVNKYMGLTQVELTGHSIFDFTH
## PCDHEEIRENLSLKNGPGFGKKNKEMSTERDFFMRMKCTVTNRGRTVNLKSATWKVLHCTGQVKVYNTCP
## PHTLCGYKEPLLTCLIIMCEPIQHPSNIDIPLDSKTFMSRHSMDMKFTYCDDRITELIGYHPEELLGRSA
## YEFYHALDSENMTKSHQNLCTKGQVVTGQYRMLAKHGGYVWLETQGTVIYNTRNLQPQCIICVNYVLSEI
## EKNDIVFSMDQTESLFKPHLLTMSSAFENGISGRDKSDLLFTKLKEEPEELAQLAPTPGDAIISLDFELH
## PGIQKFEEAPAYTSAVLTPNKPWPVEVKSHAAQGETLTIPSFTMPQIAPGSSTPSASSNSSCSTPNSPED
## YYTSVDDDLKIEVIEKLFAMDTESKSQCNSQTDFNELDLETLAPYIPMDGEDFQLSPICQEERTLSESAQ
## NTQQSLSSMSTIFQPLASASQNQFLPEKYCPQLSNKNINPGHGSLSSVFFNNMSRSSLPPYHNQASTPLS
## SMGGRPNTQWPPDPPLEYVPSKWRLMDKYSGTLSSSPSGPPVRSPNMPIYKKRPLDGLGQRGIDINPARI
## ALSNSLKLKRQLDYEEQALQQLSGGDPSVINPPQLMWKRMKFLKGENCSLLTEKKSLSTSVLTDEFVCNS
## RGLSQPVNQLQQQQQSTCGSPGENLKAGAFSPQFYSPHYQDYTVQSAHKVSGVTSRLLGSSFEPYLLPEL
## TRYDCEVNVPVLGSSTLLQGSELLRALDQAT
##
## >XP_022280389.2 endothelial PAS domain-containing protein 1 [Canis lupus familiaris]
## MTADKEKKRSSSERRKEKSRDAARCRRSKETEVFYELAHELPLPHSVSSHLDKASIMRLAISFLRTHKLL
## SSVCSENESEAEANQQMDNLYLKALEGFIAVVTQDGDMIFLSENISKFMGLTQVELTGHSIFDFTHPCDH
## EEIRENLSLKSGPGFGKKSKDMSTERDFFMRMKCTVTNRGRTVNLKSATWKVLHCTGQVKVYNNCPPHSS
## LCSYKEPLLSCLIIMCEPIQHPSHMDIPLDSKTFLSRHSMDMKFTYCDDRITELVGYHPEELLGRSAYEF
## YHALDSENMTKSHQNLCTKGQVVSGQYRMLAKHGGYVWLETQGTVIYNPRNLQPQCIMCVNYVLSEIEKN
## DVVFSMDQTESLFKPHLMAMTSLFDSSGEVAVSDKSDYLFTKLKEEPEELAQLAPTAGDAIISLDFGSQS
## FEESSSYNSALLPPSQPWPRELRSHSTQSEDGSLPAFTVPQAAASGSATPSATSSSSSCSTPSSPGDYYT
## SLNDDLKIEVIEKLFTMDTEAKDQGSTQTDFSELDLETLAPYIPMDGEDFQLSPICPEERLLPEKPQSTP
## QHCFSTMTNIFQPLAPMASHSPFLLDKYQQQLGSKKIEPEHQPMSSIFFDGGNKVSLPPCCGQASTPLSS
## MGGRSSTQWPPDPPLHFGPTKWPVADQHTESLGPSPLGPPITSPHLSMFKKRSAKAFGPQGPDVMSPAMV
## ALSNKLKLKRQREYEEQAFQDLSGGDPPGSSTSHQVWKRMKSLRGNVNCPLIPDKLLSANIPNDEFTQNP
## MRGLGQPLRHLPPPPSVMSPGENTKSGFPPQCYAPQYQDYSLPSAHKVSGMASRLLGPSFEPYLLPELTR
## YDCEVNVPVPGSSTLLQGGDLLRALDQAT
##
## >NP_001005647.1 endothelial PAS domain-containing protein 1 [Xenopus tropicalis]
## MTAEKEKKRNSSERRKEKSRDAARCRRSKETEVFYELAHQLPLPQSISSHLDKASIMRLTISFLRTHKLL
## SSVCADRNIETASEKQLDNLYLKALEGFVAVVTQDGDMIFLSENVNKYLGLTQVELTGHSIFDFTHPCDH
## DEIKENLSMKTGVGSGKKNKDANTEHDFFMRMKCTVTNRGRTVNLKSATWKVLHCTGHVKAYNSYYPHSL
## CGYKEPVLSCLIMMCQPIQHPSNIDIPLDSKTFLSRHSMDMKFTYCDDRITELIGYHPEELLGRSAYEFY
## HALDSESMTKSHQNLCAKGQVVTGQYRMLAKHGGYLWVETQSTVIYNTRNSQPQCIVCVNYVLSEIEKND
## VVFSMDQTESLFKPHLMTMNSIFSSSVQEKSDFLFTKLKKEPEDLAQLAPTPGDEIVSLDFGSQTFDEQS
## NFNSVQTSPSKQWPLEVKNLNSQVDIINLPSFKAPQVSTGNSTPSASSNSTCSTPSSPEAYYSSLDEDLK
## IELIEKLFAMDTEAKNQCNTTQNDFNDLDLETLAPYIPMDGEDFQLNPICQEESTISDTPQKAQQNLSSI
## SSLFQPSTPSPQNQFLQQNICQTTAAKSNNGSQDPLPSVLFNNEKKALPLTPYHTGASPPLSSMGGRPNV
## QWPPDPPVSYMPNKWRFVEQYNGSLSSLPSGPPVHLHNMPLYKQRSLESFGQRGKDLHPAEIPFCNSMKR
## KRQLDYGEQGFPQLDVGDLQGESNNPLIMWKRMKALKSGSCPLVAERKSLSTSVLNDGYVCTRHRELNQP
## ISQQQQQQCVAPPRDGRKTAYSNTFFSCSYHDYNMQQTEKMKGLTSRLIAPSFEPYLLPELTRYDCEVNV
## PVLGSSTLLQGSDLLRALDQTT
##
## >NP_001337036.1 endothelial PAS domain-containing protein 1 [Ictalurus punctatus]
## MTAEKDKKRSTSERRKEKSRDAARCRRSKETEVFYELAHQLPLPHSVSSHLDKASIMRLAISFLRTCKLF
## TSGCSTSETDIDRQMDSLYLKSLEGFISVVTSDGDIIFLSENINKFMGLTQVELIGHSIFDFTHPCDHEE
## IRENLSMKTGVGKKGKDLSTERDFFMRMKCTVTSRGRTVNLKSASWKVLHCTGHLKVYNGCSTRTPCGYK
## ESPLTCVVMLCEPVPHPSNIDTPFDSKTFLSRHSMDMKFTYCDERVTQLMGYNPEDLLGRSVYEFYHALD
## SESVTRSHQNLCTKGQAVSGHYRMLAKHGGFVWVETQGTVIYSSRNSQPQCIVCVNYVLSDIEEKSTIFS
## KDQTESLLKTNMSSFFSKARSPMASETSSALFTKFKEEPEDLNHLAPTPGDGFIPLNFGHPSFEEYPVCS
## KVSPMHPPATHSVTERHNLPTMGANFSIPQAPPPSSATPSISNCSTPSSPDDYKSPVDDLKMEITEKLFA
## MDTKGKNSYSQETELSDLDLETLAPYIPMDGEDFQLNPIGQEEPLPEAVALGITEYSFSNIANFFQPLTP
## PPGAHFQPNPHSASEKQAPSNATMEPWPPIFYASHMPLAHHTNPESIPLASMGGHQSLQWPPDPPINYSS
## TKGGAIDSLVEKHSCQALQTNRMSLHNQRSMEKYGPCKAYRDVSPVRLTIPNTMKRSFSNMSMGVSSATR
## PAEMWKRMKNESCAILNRMSQSSSALTGEHMGHQHRKTQNQGNQTARGKKDYPEHCCNYTDYNMLPNTKM
## EGVASRLLGPSFEYCLPELTRYDCEVNVPLQGNLHLLQGRDLLCALDQAT
##
## >XP_001147219.1 endothelial PAS domain-containing protein 1 isoform X2 [Pan troglodytes]
## MTADKEKKRSSSERRKEKSRDAARCRRSKETEVFYELAHELPLPHSVSSHLDKASIMRLAISFLRTHKLL
## SSVCSENESEAEADQQMDNLYLKALEGFIAVVTQDGDMIFLSENISKFMGLTQVELTGHSIFDFTHPCDH
## EEIRENLSLKNGSGFGKKSKDMSTERDFFMRMKCTVTNRGRTVNLKSATWKVLHCTGQVKVYNNCPPHNS
## LCGYKEPLLSCLIIMCEPIQHPSHMDIPLDSKTFLSRHSMDMKFTYCDDRITELIGYHPEELLGRSAYEF
## YHALDSENMTKSHQNLCTKGQVVSGQYRMLAKHGGYVWLETQGTVIYNPRNLQPQCIMCVNYVLSEIEKN
## DVVFSMDQTESLFKPHLMAMNSIFDSSGKGAVSEKSNFLFTKLKEEPEELAQLAPTPGDAIISLDFGNQN
## FEESSAYGKAILPPSQPWATELRSHSTQSEAGSLPAFTVPQAAAPGSTTPSATSSSSSCSTPNSPEDYYT
## SLDNDLKIEVIEKLFAMDTEAKDQCSTQTDFNELDLETLAPYIPMDGEDFQLSPICPEERLLAENPQSTP
## QHCFSAMTNIFQPLAPVAPHSPFLLDKFQQQLESKKTEPEHRPMSSIFFDAGSKASLPPCCGQASTPLSS
## MGGRSNTQWPPDPPLHFGPTKWAVGDQRTEFLGAAPLGPPVSPPHVSTFKTRSAKGFGARGPDVLSPAMV
## ALSNKLKLKRQLEYEEQAFQDLSGGDPPGGSTSHLMWKRMKNLRGGSCPLMPDKPLSANVPNDKFTQNPM
## RGLGHPLRHLPLPQPPSAISPGENSKSRFPPQCYTTQYQDYSLSSAHKVSGMASRLLGPSFESYLLPELT
## RYDCEVNVPVLGSSTLLQGGDLLRALDQAT
# Download the FASTA record for every accession listed in epas_table from
# the Entrez "protein" database. The result is stored as a list.
epas_list <- entrez_fetch_list(
  db      = "protein",
  id      = epas_table$accession,
  rettype = "fasta"
)

# Report how many records came back.
length(epas_list)
## [1] 10
# Clean every downloaded FASTA record in place. Assigning through
# `epas_list[]` keeps the list's names and other attributes intact.
# Using lapply() instead of `for (i in 1:length(...))` also makes this safe
# for an empty list (1:0 would iterate over c(1, 0) and fail).
epas_list[] <- lapply(epas_list, fasta_cleaner, parse = FALSE)

# Flatten the list into a character vector, one sequence per element.
# vapply() enforces that each cleaned record is exactly one string, which is
# what fasta_cleaner() is expected to return when parse = FALSE.
epas_vector <- vapply(epas_list, identity, character(1), USE.NAMES = FALSE)

# Carry the record names over so each sequence stays labelled.
names(epas_vector) <- names(epas_list)
# Wrap the character vector of sequences in a Biostrings AAStringSet,
# which is the object handed to msa() on the next line.
epas_vector_ss <- Biostrings::AAStringSet(epas_vector)

# Build the multiple sequence alignment with the ClustalW method.
epas_align <- msa(epas_vector_ss, method = "ClustalW")
## use default substitution matrix

# Print the alignment object.
epas_align
## CLUSTAL 2.1
##
## Call:
## msa(epas_vector_ss, method = "ClustalW")
##
## MsaAAMultipleAlignment with 10 rows and 900 columns
## aln names
## [1] -MTADK-----------EKKRSSSE...VNVPVPGSSTLLQGRDLLRALDQAT NP_034267.3
## [2] -MTAEWRELWEIRGQAGKDWRSSSE...VNVPVPGSSTLLQGRDLLRALDQAT XP_038967774.1
## [3] -MTADK-----------EKKRSSSE...VNVPVLGSSTLLQGGDLLRALDQAT NP_001421.2
## [4] -MTADK-----------EKKRSSSE...VNVPVLGSSTLLQGGDLLRALDQAT XP_001147219.1
## [5] -MTADK-----------EKKRSSSE...VNVPVPGSSTLLQGGDLLRALDQAT XP_022280389.2
## [6] -MTADK-----------EKKRSSSE...VNVPVPGTSTLLQGGDLLRALDQAT NP_777150.1
## [7] MAEADHP--------APSSAESSSE...VNVPVLGSSTLLQGSELLRALDQAT XP_015139104.2
## [8] -MTAEK-----------EKKRNSSE...VNVPVLGSSTLLQGSDLLRALDQTT NP_001005647.1
## [9] -MTAEK-----------EKKRCSSE...VNVPLQGNLHLLQGSDLLRALDQAT NP_001034895.2
## [10] -MTAEK-----------DKKRSTSE...VNVPLQGNLHLLQGRDLLCALDQAT NP_001337036.1
## Con -MTADK-----------EKKRSSSE...VNVPV?GSSTLLQG?DLLRALDQAT Consensus
# Overwrite the class attribute of the alignment before it is converted.
# NOTE(review): presumably a workaround so that the downstream conversion and
# plotting functions accept the msa result - confirm it is still required.
class(epas_align) <- "AAMultipleAlignment"

# Convert the alignment to the "seqinr::alignment" representation.
epas_align_seqinr <- msaConvert(epas_align, type = "seqinr::alignment")

# Print the converted alignment with a chunk size of 60.
print_msa(alignment = epas_align_seqinr, chunksize = 60)
## [1] "-MTADK-----------EKKRSSSELRKEKSRDAARCRRSKETEVFYELAHELPLPHSVS 0"
## [1] "-MTAEWRELWEIRGQAGKDWRSSSELRKEKSRDAARCRRSKETEVFYELAHELPLPHSVS 0"
## [1] "-MTADK-----------EKKRSSSERRKEKSRDAARCRRSKETEVFYELAHELPLPHSVS 0"
## [1] "-MTADK-----------EKKRSSSERRKEKSRDAARCRRSKETEVFYELAHELPLPHSVS 0"
## [1] "-MTADK-----------EKKRSSSERRKEKSRDAARCRRSKETEVFYELAHELPLPHSVS 0"
## [1] "-MTADK-----------EKKRSSSERRKEKSRDAARCRRSKETEVFYELAHELPLPHSVS 0"
## [1] "MAEADHP--------APSSAESSSERRKEKSRDAARCRRSKETEVFYELAHELPLPHNVS 0"
## [1] "-MTAEK-----------EKKRNSSERRKEKSRDAARCRRSKETEVFYELAHQLPLPQSIS 0"
## [1] "-MTAEK-----------EKKRCSSERRKEKSRDAARCRRSKETEVFYELAHHLPLPHSIS 0"
## [1] "-MTAEK-----------DKKRSTSERRKEKSRDAARCRRSKETEVFYELAHQLPLPHSVS 0"
## [1] " "
## [1] "SHLDKASIMRLAISFLRTHKLLSSVCSENESEAEADQQMDNLYLKALEGFIAVVTQDGDM 0"
## [1] "SHLDKASIMRLAISFLRTHKLLSSVCSENESEAEADQQMDNLYLKALEGFIAVVTQDGDM 0"
## [1] "SHLDKASIMRLAISFLRTHKLLSSVCSENESEAEADQQMDNLYLKALEGFIAVVTQDGDM 0"
## [1] "SHLDKASIMRLAISFLRTHKLLSSVCSENESEAEADQQMDNLYLKALEGFIAVVTQDGDM 0"
## [1] "SHLDKASIMRLAISFLRTHKLLSSVCSENESEAEANQQMDNLYLKALEGFIAVVTQDGDM 0"
## [1] "SHLDKASIMRLAISFLRTHKLLSSVCSENESEADADQQMDNLYLKALEGFIAVVTQDGDM 0"
## [1] "SHLDKASIMRLAISFLRTHKLLSSVCADNENELEADQQMDNLYLKALEGFIAVVTQDGDM 0"
## [1] "SHLDKASIMRLTISFLRTHKLLSSVCADRNIETASEKQLDNLYLKALEGFVAVVTQDGDM 0"
## [1] "SHLDKASIMRLAISFLRTRKLVNSGYNTPTEMTDADRLMDSWYLKSLGGFITVVTSDGDM 0"
## [1] "SHLDKASIMRLAISFLRTCKLFTSGCSTS--ETDIDRQMDSLYLKSLEGFISVVTSDGDI 0"
## [1] " "
## [1] "IFLSENISKFMGLTQVELTGHSIFDFTHPCDHEEIRENLTLKNGSGFGKKSKDVSTERDF 0"
## [1] "IFLSENISKFMGLTQVELTGHSIFDFTHPCDHEEIRENLTLKTGSGFGKKNKDMSTERDF 0"
## [1] "IFLSENISKFMGLTQVELTGHSIFDFTHPCDHEEIRENLSLKNGSGFGKKSKDMSTERDF 0"
## [1] "IFLSENISKFMGLTQVELTGHSIFDFTHPCDHEEIRENLSLKNGSGFGKKSKDMSTERDF 0"
## [1] "IFLSENISKFMGLTQVELTGHSIFDFTHPCDHEEIRENLSLKSGPGFGKKSKDMSTERDF 0"
## [1] "IFLSENISKFMGLTQVELTGHSIFDFTHPCDHEEIRENLSLKNGSGFGKKSKDMSTERDF 0"
## [1] "IFLSENVNKYMGLTQVELTGHSIFDFTHPCDHEEIRENLSLKNGPGFGKKNKEMSTERDF 0"
## [1] "IFLSENVNKYLGLTQVELTGHSIFDFTHPCDHDEIKENLSMKTGVGSGKKNKDANTEHDF 0"
## [1] "IFLSENINKFMGLTQVELTGHSIFDFTHPCDHEEIRENLSLKAG--IGKKGKELSTERDF 0"
## [1] "IFLSENINKFMGLTQVELIGHSIFDFTHPCDHEEIRENLSMKTG--VGKKGKDLSTERDF 0"
## [1] " "
## [1] "FMRMKCTVTNRGRTVNLKSATWKVLHCTGQVRVYNNCPPHSSLCGSKEPLLSCLIIMCEP 0"
## [1] "FMRMKCTVTNRGRTVNLKSATWKVLHCTGQVRVYNNCPPHSSLCGYKEPLLSCLIIMCEP 0"
## [1] "FMRMKCTVTNRGRTVNLKSATWKVLHCTGQVKVYNNCPPHNSLCGYKEPLLSCLIIMCEP 0"
## [1] "FMRMKCTVTNRGRTVNLKSATWKVLHCTGQVKVYNNCPPHNSLCGYKEPLLSCLIIMCEP 0"
## [1] "FMRMKCTVTNRGRTVNLKSATWKVLHCTGQVKVYNNCPPHSSLCSYKEPLLSCLIIMCEP 0"
## [1] "FMRMKCTVTNRGRTVNLKSATWKVLHCTGQVKVYNNCPPHSSLCGCKEPLLSCLIIMCEP 0"
## [1] "FMRMKCTVTNRGRTVNLKSATWKVLHCTGQVKVYNTCPPHT-LCGYKEPLLTCLIIMCEP 0"
## [1] "FMRMKCTVTNRGRTVNLKSATWKVLHCTGHVKAYNSYYPHS-LCGYKEPVLSCLIMMCQP 0"
## [1] "FMRMKCTVTNRGRTVNLKSASWKVLHCTGHLKVCNGCPARV-LCGFKEPPLTCVVMMCEP 0"
## [1] "FMRMKCTVTSRGRTVNLKSASWKVLHCTGHLKVYNGCSTRT-PCGYKESPLTCVVMLCEP 0"
## [1] " "
## [1] "IQHPSHMDIPLDSKTFLSRHSMDMKFTYCDDRILELIGYHPEELLGRSAYEFYHALDSEN 0"
## [1] "IQHPSHMDIPLDSKTFLSRHSMDMKFTYCDDRILELVGYHPEELLGRSAYEFYHALDSEN 0"
## [1] "IQHPSHMDIPLDSKTFLSRHSMDMKFTYCDDRITELIGYHPEELLGRSAYEFYHALDSEN 0"
## [1] "IQHPSHMDIPLDSKTFLSRHSMDMKFTYCDDRITELIGYHPEELLGRSAYEFYHALDSEN 0"
## [1] "IQHPSHMDIPLDSKTFLSRHSMDMKFTYCDDRITELVGYHPEELLGRSAYEFYHALDSEN 0"
## [1] "IQHPSHMDIPLDSKTFLSRHSMDMKFTYCDDRITELVGYHPEELLGRSAYEFYHALDSEN 0"
## [1] "IQHPSNIDIPLDSKTFMSRHSMDMKFTYCDDRITELIGYHPEELLGRSAYEFYHALDSEN 0"
## [1] "IQHPSNIDIPLDSKTFLSRHSMDMKFTYCDDRITELIGYHPEELLGRSAYEFYHALDSES 0"
## [1] "IVHPSNIDTPLDSKTFLSRHSMDMKYTYCDERVTELMGYNPEDLLGRSAYEFYHALDAEN 0"
## [1] "VPHPSNIDTPFDSKTFLSRHSMDMKFTYCDERVTQLMGYNPEDLLGRSVYEFYHALDSES 0"
## [1] " "
## [1] "MTKSHQNLCTKGQVVSGQYRMLAKHGGYVWLETQGTVIYNPRNLQPQCIMCVNYVLSEIE 0"
## [1] "MTKSHQNLCTKGQVVSGQYRMLAKHGGYVWLETQGTVVYNPRNLQPQCIMCVNYVLSEIE 0"
## [1] "MTKSHQNLCTKGQVVSGQYRMLAKHGGYVWLETQGTVIYNPRNLQPQCIMCVNYVLSEIE 0"
## [1] "MTKSHQNLCTKGQVVSGQYRMLAKHGGYVWLETQGTVIYNPRNLQPQCIMCVNYVLSEIE 0"
## [1] "MTKSHQNLCTKGQVVSGQYRMLAKHGGYVWLETQGTVIYNPRNLQPQCIMCVNYVLSEIE 0"
## [1] "MTKSHQNLCTKGQVVSGQYRMLAKHGGYVWLETQGTVIYNPRNLQPQCIMCVNYVLSEIE 0"
## [1] "MTKSHQNLCTKGQVVTGQYRMLAKHGGYVWLETQGTVIYNTRNLQPQCIICVNYVLSEIE 0"
## [1] "MTKSHQNLCAKGQVVTGQYRMLAKHGGYLWVETQSTVIYNTRNSQPQCIVCVNYVLSEIE 0"
## [1] "VTKSHQNLCTKGQAVSGQYRMLAKNGGYVWVETRGTVIYNSRNSQPQCIVCVNYVLSDVE 0"
## [1] "VTRSHQNLCTKGQAVSGHYRMLAKHGGFVWVETQGTVIYSSRNSQPQCIVCVNYVLSDIE 0"
## [1] " "
## [1] "KNDVVFSMDQTESLFKPHLMAMNSIFDSSDDVAVTEKSNYLFTKLKEEPEELAQLAPTPG 0"
## [1] "KNDVVFSMDQTESLFKPHLMAMNSIFDSSDDVALSEKSNYLFTKLKEEPEDLAQLAPTPG 0"
## [1] "KNDVVFSMDQTESLFKPHLMAMNSIFDSSGKGAVSEKSNFLFTKLKEEPEELAQLAPTPG 0"
## [1] "KNDVVFSMDQTESLFKPHLMAMNSIFDSSGKGAVSEKSNFLFTKLKEEPEELAQLAPTPG 0"
## [1] "KNDVVFSMDQTESLFKPHLMAMTSLFDSSGEVAVSDKSDYLFTKLKEEPEELAQLAPTAG 0"
## [1] "KNDVVFSMDQTESLFKPHLLTMNSIFDNSGKVAVSEKSNFLFTKLKEEPEELAQLAPTAG 0"
## [1] "KNDIVFSMDQTESLFKPHLLTMSSAFENG--ISGRDKSDLLFTKLKEEPEELAQLAPTPG 0"
## [1] "KNDVVFSMDQTESLFKPHLMTMNSIFSSS----VQEKSDFLFTKLKKEPEDLAQLAPTPG 0"
## [1] "EKSLIFSMDQTESLFKPHKLNG---FFSPKEALGSDPADLLFTKLKEEPEDLTQLAPTPG 0"
## [1] "EKSTIFSKDQTESLLKTNMSSF---FSKARSPMASETSSALFTKFKEEPEDLNHLAPTPG 0"
## [1] " "
## [1] "DAIISLDFGS----QNFDEPSAYGKAILPPGQPWVSGLRSHSAQ---SESGSLPAFTVPQ 0"
## [1] "DAIISLDFGS----QNFDESSTYGKAILPPGQPWATELRSHSAQ---SESRSLPAFTVPQ 0"
## [1] "DAIISLDFGN----QNFEESSAYGKAILPPSQPWATELRSHSTQ---SEAGSLPAFTVPQ 0"
## [1] "DAIISLDFGN----QNFEESSAYGKAILPPSQPWATELRSHSTQ---SEAGSLPAFTVPQ 0"
## [1] "DAIISLDFGS----QSFEESSSYNSALLPPSQPWPRELRSHSTQ---SEDGSLPAFTVPQ 0"
## [1] "DTIISLDFGT----PNFEESSAYGKGILPPGQQWTGEVKSHGTH---SEAGSLPAFTVPQ 0"
## [1] "DAIISLDFELHPGIQKFEEAPAYTSAVLTPNKPWPVEVKSHAAQ---GETLTIPSFTMPQ 0"
## [1] "DEIVSLDFGS----QTFDEQSNFNSVQTSPSKQWPLEVKNLNSQ---VDIINLPSFKAPQ 0"
## [1] "DTIISLDFGQ----SQYEEHTVYNKVSSVAQTVSHPVHDGHRTSYSGEMAKMAATFSVPQ 0"
## [1] "DGFIPLNFGH----PSFEEYPVCSKVSPMHPPATHSVTERHN------LPTMGANFSIPQ 0"
## [1] " "
## [1] "ADTPGNTTPSASSSSS-CSTPSSPEDYYSSLENPLKIEVIEKLFAMDTEPRDPG-STQTD 0"
## [1] "AGSPGNATPSATSSSS-CSTPSSPEDYYSSLENHLKIEVIEKLFAMDTEAKDQC-STQTD 0"
## [1] "AAAPGSTTPSATSSSSSCSTPNSPEDYYTSLDNDLKIEVIEKLFAMDTEAKDQC-STQTD 0"
## [1] "AAAPGSTTPSATSSSSSCSTPNSPEDYYTSLDNDLKIEVIEKLFAMDTEAKDQC-STQTD 0"
## [1] "AAASGSATPSATSSSSSCSTPSSPGDYYTSLNDDLKIEVIEKLFTMDTEAKDQG-STQTD 0"
## [1] "AAALGNSTPSASSSSS-CSTPSSPGDYYTSLDDNLKIEAIEKLFAMDTEAKDQC-GTQTD 0"
## [1] "IAP-GSSTPSASSNSS-CSTPNSPEDYYTSVDDDLKIEVIEKLFAMDTESKSQC-NSQTD 0"
## [1] "VST-GNSTPSASSNST-CSTPSSPEAYYSSLDEDLKIELIEKLFAMDTEAKNQCNTTQND 0"
## [1] "SAPPSSATPSLSS----CSTPSSPDDYYTPVDSDLKVELTEKLFSLDTQEAKTSRNQETD 0"
## [1] "APPPSSATPSISN----CSTPSSPDDYKSPVD-DLKMEITEKLFAMDTK-GKNSYSQETE 0"
## [1] " "
## [1] "FSELDLETLAPYIPMDGEDFQLSPICPEEPLMPESPQPTPQHCFSTMTSIFQPLTPGATH 0"
## [1] "FNELDLETLAPYIPMDGEDFQLSPICPEEPLVPESPQPNPQHCFSTMSSIFQPLTPGASQ 0"
## [1] "FNELDLETLAPYIPMDGEDFQLSPICPEERLLAENPQSTPQHCFSAMTNIFQPLAPVAPH 0"
## [1] "FNELDLETLAPYIPMDGEDFQLSPICPEERLLAENPQSTPQHCFSAMTNIFQPLAPVAPH 0"
## [1] "FSELDLETLAPYIPMDGEDFQLSPICPEERLLPEKPQSTPQHCFSTMTNIFQPLAPMASH 0"
## [1] "FNELDLETLAPYIPMDGEDFQLSPICPEESLLPETPQSAPQHCFSTMSNIFQPLAPMASH 0"
## [1] "FNELDLETLAPYIPMDGEDFQLSPICQEERTLSESAQNT-QQSLSSMSTIFQPLAS-ASQ 0"
## [1] "FNDLDLETLAPYIPMDGEDFQLNPICQEESTISDTPQKA-QQNLSSISSLFQPSTP-SPQ 0"
## [1] "LSDLDLETLAPYIPMDGEDFQLNPICPEEPPSEIGTLGTNQQCFSNITSLFQPLSS-PSA 0"
## [1] "LSDLDLETLAPYIPMDGEDFQLNPIGQEEPLPEAVALGITEYSFSNIANFFQPLTP-PPG 0"
## [1] " "
## [1] "GPFFLDKYPQQLESRKTESEHWPMSSIFFDAGSKGS-LSPCCGQASTPLSSMGGRSNTQW 0"
## [1] "GTFFLDKYPQQLESRKTESEHWPMSTIFFDAGSKGS-LPPCCGQASTPLSSMGGRSNTQW 0"
## [1] "SPFLLDKFQQQLESKKTEPEHRPMSSIFFDAGSKAS-LPPCCGQASTPLSSMGGRSNTQW 0"
## [1] "SPFLLDKFQQQLESKKTEPEHRPMSSIFFDAGSKAS-LPPCCGQASTPLSSMGGRSNTQW 0"
## [1] "SPFLLDKYQQQLGSKKIEPEHQPMSSIFFDGGNKVS-LPPCCGQASTPLSSMGGRSSTQW 0"
## [1] "STFLLDKYQQQLESKKTEPEQRRVSFAFFDGGSRVS-LLQCCGQTYTPLSSMGGISNTQW 0"
## [1] "NQFLPEKYCPQLSNKNINPGHGSLSSVFFNNMSRSS-LPPYHNQASTPLSSMGGRPNTQW 0"
## [1] "NQFLQQNICQTTAAKSNNGSQDPLPSVLFNNEKKALPLTPYHTGASPPLSSMGGRPNVQW 0"
## [1] "AHYQPKMSSGGDKQNINGGSVESWPPVPYSRD--PMQMPPYHDPASTPLSSMGGRQNLQW 0"
## [1] "AHFQPNPHSASEKQAPSNATMEPWPPIFYAS---HMPLAHHTNPESIPLASMGGHQSLQW 0"
## [1] " "
## [1] "PPDPPLHFGPTKWPVGDQSAESLGALPVGSSQLEPPSAPPHVSMFKMRSAKDFG-ARGPY 0"
## [1] "PPDPPLHFGPTKWSVGNQSAEPLGPLPLGTSQLEPPSTPPHVSMFKMRSAKDFG-ARGPY 0"
## [1] "PPDPPLHFGPTKWAVGDQRTEFLGAAPLG-----PPVSPPHVSTFKTRSAKGFG-ARGPD 0"
## [1] "PPDPPLHFGPTKWAVGDQRTEFLGAAPLG-----PPVSPPHVSTFKTRSAKGFG-ARGPD 0"
## [1] "PPDPPLHFGPTKWPVADQHTESLGPSPLG-----PPITSPHLSMFKKRSAKAFG-PQGPD 0"
## [1] "PPDPPLQLGPTKWPGEDRHAEAVGAAPLG-----LPPATPHLAMLKKRSAKGFG-PQGPD 0"
## [1] "PPDPPLEYVPSKWRLMDKYSGTLSSSPSG-----PPVRSPNMPIYKKRPLDGLG-QRGID 0"
## [1] "PPDPPVSYMPNKWRFVEQYNGSLSSLPSG-----PPVHLHNMPLYKQRSLESFG-QRGKD 0"
## [1] "PPDPPL---PSKAGMMDPLAAKRSCQGMP---------ANRMAPFMQRPMENF--VQNYR 0"
## [1] "PPDPPINYSSTKGGAIDSLVEKHSCQALQ---------TNRMSLHNQRSMEKYGPCKAYR 0"
## [1] " "
## [1] "MMSPAMIALSNKLKLKRQLEYEEQAFQDTSGGDPPG-TSSSHLMWKRMKSLMGGT-CPLM 0"
## [1] "MMSPAMIALSNKLKLKRQLDYEEPAFQDTSGGDPPG-TSSSHLMWKRMKSLMGGT-CPLM 0"
## [1] "VLSPAMVALSNKLKLKRQLEYEEQAFQDLSGGDPPG-GSTSHLMWKRMKNLRGGS-CPLM 0"
## [1] "VLSPAMVALSNKLKLKRQLEYEEQAFQDLSGGDPPG-GSTSHLMWKRMKNLRGGS-CPLM 0"
## [1] "VMSPAMVALSNKLKLKRQREYEEQAFQDLSGGDPPG-SSTSHQVWKRMKSLRGNVNCPLI 0"
## [1] "VMSPAMIALSNKLKLKRQLEYEEQAFQDMSGGDPPG-SGTSHLMWKRMKSLRGGGTCSLM 0"
## [1] "IN-PARIALSNSLKLKRQLDYEEQALQQLSGGDPSV-INPPQLMWKRMKFLKGENCSLLT 0"
## [1] "LH-PAEIPFCNSMKRKRQLDYGEQGFPQLDVGDLQGESNNPLIMWKRMKALKSGSCPLVA 0"
## [1] "DTSPARLALANSFKR---------SFSQMAMAETPP-TKSQQTVWKKLR----HESCAVM 0"
## [1] "DVSPVRLTIPNTMKR---------SFSNMSMGVSSA-TRP-AEMWKRMK----NESCAIL 0"
## [1] " "
## [1] "PDKTISANMAPDEFTQKSMRGLGQPLRHLPPPQPPSTRSSGENAKT-GFPPQCYASQFQD 0"
## [1] "PDKTVSASMAPDEFTQKSMRGLGQPLRHLPPSQPPSTRSPGENAKS-GFPPQCYASPFQD 0"
## [1] "PDKPLSANVPNDKFTQNPMRGLGHPLRHLPLPQPPSAISPGENSKS-RFPPQCYATQYQD 0"
## [1] "PDKPLSANVPNDKFTQNPMRGLGHPLRHLPLPQPPSAISPGENSKS-RFPPQCYTTQYQD 0"
## [1] "PDKLLSANIPNDEFTQNPMRGLGQPLRHLPP--PPSVMSPGENTKS-GFPPQCYAPQYQD 0"
## [1] "PDKLPNANVPNDEFIQNPVRGRSQPLRHLSPPQPPSATSPGEPTKS-GFPAQCYAPQYQD 0"
## [1] "EKKSLSTSVLTDEFVCN-SRGLSQPVNQLQQQQQSTCGSPGENLKAGAFSPQFYSPHYQD 0"
## [1] "ERKSLSTSVLNDGYVCTRHRELNQPIS--QQQQQQCVAPPRDGRKT-AYSNTFFSCSYHD 0"
## [1] "ERKSLSSSALSD-----KSMAHNGGMDHQHRKSQYSGNQNGQPTKH--YREQFCN--YRE 0"
## [1] "NRMSQSSSALTG-----EHMG------HQHRKTQNQGNQTARGKKD--YPEHCCN--YTD 0"
## [1] " "
## [1] "YGPPGAQKVSGVASRLLGPSFEPYLLPELTRYDCEVNVPVPGSSTLLQGRDLLRALDQAT 0"
## [1] "YSPPGAQKGSGVASRLLGPSFEPYLLPELTRYDCEVNVPVPGSSTLLQGRDLLRALDQAT 0"
## [1] "YSLSSAHKVSGMASRLLGPSFESYLLPELTRYDCEVNVPVLGSSTLLQGGDLLRALDQAT 0"
## [1] "YSLSSAHKVSGMASRLLGPSFESYLLPELTRYDCEVNVPVLGSSTLLQGGDLLRALDQAT 0"
## [1] "YSLPSAHKVSGMASRLLGPSFEPYLLPELTRYDCEVNVPVPGSSTLLQGGDLLRALDQAT 0"
## [1] "YSLPAAHKMSGMASRLLGPSFEPYLLPELTRYDCEVNVPVPGTSTLLQGGDLLRALDQAT 0"
## [1] "YTVQSAHKVSGVTSRLLGSSFEPYLLPELTRYDCEVNVPVLGSSTLLQGSELLRALDQAT 0"
## [1] "YNMQQTEKMKGLTSRLIAPSFEPYLLPELTRYDCEVNVPVLGSSTLLQGSDLLRALDQTT 0"
## [1] "FNMQPSSKMDGIASRLIGPSFETYSLPELTRYDCEVNVPLQGNLHLLQGSDLLRALDQAT 0"
## [1] "YNMLPNTKMEGVASRLLGPSFE-YCLPELTRYDCEVNVPLQGNLHLLQGRDLLCALDQAT 0"
## [1] " "
# Plot the alignment restricted to positions 820 through 890.
ggmsa::ggmsa(epas_align, start = 820, end = 890)
#msaPrettyPrint(epas_align, # alignment
# file = "epas_msa.pdf", # file name
# y=c(820, 890), # range
# askForOverwrite=FALSE)
# Pairwise distances between the aligned sequences, using the "identity"
# matrix option of seqinr::dist.alignment().
epas_subset_dist <- seqinr::dist.alignment(epas_align_seqinr,
                                           matrix = "identity")

# Inspect the class of the returned object.
is(epas_subset_dist)
## [1] "dist" "oldClass"
class(epas_subset_dist)
## [1] "dist"

# epas_dist was previously recomputed with the exact same alignment and
# arguments as epas_subset_dist; reuse that result instead of running the
# distance calculation a second time. Both names remain defined for any
# later code that refers to either one.
epas_dist <- epas_subset_dist

# Rounded copy for display only.
epas_dist_rounded <- round(epas_dist, digits = 2)
epas_dist_rounded
## NP_034267.3 XP_038967774.1 NP_001421.2 XP_001147219.1
## XP_038967774.1 0.25
## NP_001421.2 0.34 0.36
## XP_001147219.1 0.34 0.36 0.03
## XP_022280389.2 0.36 0.37 0.31 0.31
## NP_777150.1 0.40 0.41 0.35 0.35
## XP_015139104.2 0.53 0.52 0.50 0.50
## NP_001005647.1 0.59 0.59 0.58 0.58
## NP_001034895.2 0.65 0.65 0.64 0.64
## NP_001337036.1 0.65 0.65 0.65 0.65
## XP_022280389.2 NP_777150.1 XP_015139104.2 NP_001005647.1
## XP_038967774.1
## NP_001421.2
## XP_001147219.1
## XP_022280389.2
## NP_777150.1 0.35
## XP_015139104.2 0.50 0.52
## NP_001005647.1 0.59 0.59 0.54
## NP_001034895.2 0.64 0.64 0.63 0.65
## NP_001337036.1 0.65 0.66 0.66 0.65
## NP_001034895.2
## XP_038967774.1
## NP_001421.2
## XP_001147219.1
## XP_022280389.2
## NP_777150.1
## XP_015139104.2
## NP_001005647.1
## NP_001034895.2
## NP_001337036.1 0.55
# Build a neighbor-joining tree from the distance matrix.
# Note - not using rounded values
tree_subset <- nj(epas_dist)

# Plot the tree, drawing branches using their edge lengths.
# TRUE is spelled out because T is an ordinary variable that can be
# reassigned, whereas TRUE is a reserved constant.
plot.phylo(tree_subset,
           main = "Phylogenetic Tree",
           use.edge.length = TRUE)

# Add a label beneath the title.
# NOTE(review): the label says "rooted", but no explicit rooting step is
# visible in this part of the script - confirm the wording is intended.
mtext(text = "EPAS1 family gene tree - rooted")