EPAS 1- endothelial PAS domain protein 1

function: This gene encodes a transcription factor involved in the induction of genes regulated by oxygen, which is induced as oxygen levels fall. The encoded protein contains a basic-helix-loop-helix domain protein dimerization domain as well as a domain found in proteins in signal transduction pathways which respond to oxygen levels. Mutations in this gene are associated with erythrocytosis familial type 4.

Resources

Key information use to make this script can be found here:

Refseq Gene:https://www.ncbi.nlm.nih.gov/gene/2034/ Refseq Homologene: https://www.ncbi.nlm.nih.gov/gene/2034/ortholog/?scope=89593&term=EPAS1

packages

library(seqinr)
library(rentrez)
library(compbio4all)
library(Biostrings)
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: XVector
## Loading required package: GenomeInfoDb
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:seqinr':
## 
##     translate
## The following object is masked from 'package:base':
## 
##     strsplit
library(devtools)
## Loading required package: usethis
library(ggmsa)
## Registered S3 methods overwritten by 'ggalt':
##   method                  from   
##   grid.draw.absoluteGrob  ggplot2
##   grobHeight.absoluteGrob ggplot2
##   grobWidth.absoluteGrob  ggplot2
##   grobX.absoluteGrob      ggplot2
##   grobY.absoluteGrob      ggplot2
library(BiocManager)
## Bioconductor version '3.13' is out-of-date; the current release version '3.14'
##   is available with R version '4.1'; see https://bioconductor.org/install
## 
## Attaching package: 'BiocManager'
## The following object is masked from 'package:devtools':
## 
##     install
library(ape)
## 
## Attaching package: 'ape'
## The following object is masked from 'package:Biostrings':
## 
##     complement
## The following objects are masked from 'package:seqinr':
## 
##     as.alignment, consensus
library(drawProteins)
library(msa)
## 
## Attaching package: 'msa'
## The following object is masked from 'package:BiocManager':
## 
##     version
clean sequences
# Human EPAS1 (H. sapiens)
hepas1 <- entrez_fetch(db = "protein", 
                          id = "NP_001421.2", 
                          rettype = "fasta")
# house mouse EPAS1 (M. musculus)
mepas1 <- entrez_fetch(db = "protein", 
                          id = "NP_034267.3", 
                          rettype = "fasta")

# norway rat EPAS1
nepas1 <- entrez_fetch(db = "protein", 
                          id = "XP_038967774.1", 
                          rettype = "fasta")


# ZebraFish EPAS1
zepas1 <- entrez_fetch(db = "protein", 
                          id = "NP_001034895.2", 
                          rettype = "fasta")
#B. taurus
tepas1 <- entrez_fetch(db = "protein", 
                          id = "NP_777150.1", 
                          rettype = "fasta")
#G. gallus
gepas1 <- entrez_fetch(db = "protein", 
                          id = "XP_015139104.2", 
                          rettype = "fasta")
#C. familiaris
cepas1 <- entrez_fetch(db = "protein", 
                          id = "XP_022280389.2", 
                         rettype = "fasta")
#X. tropicalis
pepas1 <- entrez_fetch(db = "protein", 
                       id = "NP_001005647.1", 
                          rettype = "fasta")
#I. punctatus
IPepas1 <- entrez_fetch(db = "protein", 
                          id = "NP_001337036.1", 
                          rettype = "fasta")
#P. troglodytes
MMepas1 <- entrez_fetch(db = "protein", 
                          id = "XP_001147219.1", 
                          rettype = "fasta")
hepas1  <- fasta_cleaner(hepas1,  parse = F)
mepas1 <- fasta_cleaner(mepas1, parse = F)
nepas1  <- fasta_cleaner(nepas1,  parse = F)
zepas1   <- fasta_cleaner(zepas1,   parse = F)
tepas1   <- fasta_cleaner(tepas1,   parse = F)
gepas1   <- fasta_cleaner(gepas1,   parse = F)
cepas1   <- fasta_cleaner(cepas1,   parse = F)
pepas1   <- fasta_cleaner(pepas1,   parse = F)
IPepas1   <- fasta_cleaner(IPepas1,   parse = F)
MMepas1   <- fasta_cleaner(MMepas1,   parse = F)

Accession number table

Accession numbers were obtained from RefSeq, Refseq Homlogene, UniProt and PDB. UniProt accession numbers can be found by searching for the gene name. PDB accessions can be found by searching with a UniProt accession or a gene name, though many proteins are not in PDB. The the Neanderthal genome database was searched but did not yield sequence information on.

epas_table <- c("NP_001421.2", "H. sapiens", "human", "hepas1",
                "Q99814", 
            "NP_034267.3", "M. musculus", "house mouse", "mepas1",
            "P97481",
            "XP_038967774.1", "R. norvegicus", "norway rat", "nepas1",
            "Q9JHS1",
            "NP_001034895.2", "D. rerio", "zebra fish", "zepas1",
            "B3DJD1",
            "NP_777150.1", "B. taurus", "cattle", "cepas1",
            "A0A3Q1MIU5",
            "XP_015139104.2", "G. gallus", "chicken", "gepas1",
            "Q9W7C6",
            "XP_022280389.2", "C. familiaris", "dog", "depas1",
            "E2QUK0",
            "NP_001005647.1", "X. tropicalis", "frog", "pepas1",
            "Q6GL61",
            "NP_001337036.1", "I. punctatus", "channel catfish", 
            "iepas1", "K7G1Q3", 
            "XP_001147219.1", "P. troglodytes", "chimp", 
            "mmepas1", "A0A2I3TA92")
epas_table
##  [1] "NP_001421.2"     "H. sapiens"      "human"           "hepas1"         
##  [5] "Q99814"          "NP_034267.3"     "M. musculus"     "house mouse"    
##  [9] "mepas1"          "P97481"          "XP_038967774.1"  "R. norvegicus"  
## [13] "norway rat"      "nepas1"          "Q9JHS1"          "NP_001034895.2" 
## [17] "D. rerio"        "zebra fish"      "zepas1"          "B3DJD1"         
## [21] "NP_777150.1"     "B. taurus"       "cattle"          "cepas1"         
## [25] "A0A3Q1MIU5"      "XP_015139104.2"  "G. gallus"       "chicken"        
## [29] "gepas1"          "Q9W7C6"          "XP_022280389.2"  "C. familiaris"  
## [33] "dog"             "depas1"          "E2QUK0"          "NP_001005647.1" 
## [37] "X. tropicalis"   "frog"            "pepas1"          "Q6GL61"         
## [41] "NP_001337036.1"  "I. punctatus"    "channel catfish" "iepas1"         
## [45] "K7G1Q3"          "XP_001147219.1"  "P. troglodytes"  "chimp"          
## [49] "mmepas1"         "A0A2I3TA92"
# convert to matrix
epas_table_matrix <- matrix(epas_table,
                                  byrow = T,
                                  nrow = 10)
# convert to data frame
epas_table <- data.frame(epas_table_matrix, 
                     stringsAsFactors = F)
length(epas_table)
## [1] 5
# renames columns
names(epas_table) <- c("accession", "name.orig", "common.name", "name.new", "UniProt")

# Create simplified species names
epas_table$spp [grep("sapiens",epas_table$name.orig)] <- "Homo"
epas_table$spp[grep("musculus",epas_table$name.orig)] <- "Mus"
epas_table$spp[grep("norvegicus",epas_table$name.orig)] <- "Rat"
epas_table$spp[grep("rerio",epas_table$name.orig)] <- "zebrafish"
epas_table$spp[grep("taurus",epas_table$name.orig)] <- "cattle"
epas_table$spp[grep("gallus",epas_table$name.orig)] <- "chicken"
epas_table$spp[grep("familiaris",epas_table$name.orig)] <- "dog"
epas_table$spp[grep("tropicalis",epas_table$name.orig)] <- "frog"
epas_table$spp[grep("punctatus",epas_table$name.orig)] <- "catfish"
epas_table$spp[grep("troglodytes",epas_table$name.orig)] <- "chimp"
pander::pander(epas_table)
Table continues below
accession name.orig common.name name.new UniProt
NP_001421.2 H. sapiens human hepas1 Q99814
NP_034267.3 M. musculus house mouse mepas1 P97481
XP_038967774.1 R. norvegicus norway rat nepas1 Q9JHS1
NP_001034895.2 D. rerio zebra fish zepas1 B3DJD1
NP_777150.1 B. taurus cattle cepas1 A0A3Q1MIU5
XP_015139104.2 G. gallus chicken gepas1 Q9W7C6
XP_022280389.2 C. familiaris dog depas1 E2QUK0
NP_001005647.1 X. tropicalis frog pepas1 Q6GL61
NP_001337036.1 I. punctatus channel catfish iepas1 K7G1Q3
XP_001147219.1 P. troglodytes chimp mmepas1 A0A2I3TA92
spp
Homo
Mus
Rat
zebrafish
cattle
chicken
dog
frog
catfish
chimp

protein diagram

Multivariate statistcal techniques were used to confirm the information about protein structure and location in the line database.

Q99814_h  <- drawProteins::get_features("Q99814")
## [1] "Download has worked"
is(Q99814_h)
## [1] "list"             "vector"           "list_OR_List"     "vector_OR_Vector"
## [5] "vector_OR_factor"
my_prot_df <- drawProteins::feature_to_dataframe(Q99814_h)
is(my_prot_df)
## [1] "data.frame"       "list"             "oldClass"         "vector"          
## [5] "list_OR_List"     "vector_OR_Vector" "vector_OR_factor"
my_prot_df[,-2]
##                     type begin end length accession   entryName taxid order
## featuresTemp       CHAIN     1 870    869    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.1    DOMAIN    14  67     53    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.2    DOMAIN    84 154     70    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.3    DOMAIN   230 300     70    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.4    DOMAIN   304 347     43    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.5    REGION     1  24     23    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.6    REGION    26  53     27    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.7    REGION   171 192     21    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.8    REGION   460 486     26    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.9    REGION   496 542     46    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.10   REGION   830 870     40    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.11 COMPBIAS   462 486     24    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.12  MOD_RES   405 405      0    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.13  MOD_RES   531 531      0    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.14  MOD_RES   840 840      0    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.15  MOD_RES   847 847      0    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.16  VARIANT   534 534      0    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.17  VARIANT   535 535      0    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.18  VARIANT   535 535      0    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.19  VARIANT   537 537      0    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.20  VARIANT   537 537      0    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.21  VARIANT   540 540      0    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.22  VARIANT   766 766      0    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.23  VARIANT   785 785      0    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.24  MUTAGEN   844 844      0    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.25 CONFLICT    60  60      0    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.26 CONFLICT   539 539      0    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.27 CONFLICT   601 601      0    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.28 CONFLICT   693 693      0    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.29 CONFLICT   716 716      0    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.30 CONFLICT   722 722      0    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.31 CONFLICT   765 765      0    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.32 CONFLICT   769 769      0    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.33 CONFLICT   844 844      0    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.34 CONFLICT   847 847      0    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.35    HELIX   240 242      2    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.36   STRAND   243 248      5    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.37     TURN   250 252      2    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.38   STRAND   253 257      4    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.39    HELIX   260 265      5    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.40    HELIX   269 272      3    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.41    HELIX   277 280      3    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.42    HELIX   283 285      2    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.43    HELIX   286 299     13    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.44   STRAND   300 303      3    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.45   STRAND   307 310      3    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.46   STRAND   314 327     13    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.47     TURN   329 331      2    Q99814 EPAS1_HUMAN  9606     1
## featuresTemp.48   STRAND   334 343      9    Q99814 EPAS1_HUMAN  9606     1

Domains present

my_canvas <- draw_canvas(my_prot_df)  
my_canvas <- draw_chains(my_canvas, my_prot_df, 
                         label_size = 2.5)
my_canvas <- draw_domains(my_canvas, my_prot_df)
my_canvas

Protein with LOTS of “Region” information

Q9JHS1_n <- drawProteins::get_features("Q9JHS1")
## [1] "Download has worked"
my_prot_df <- drawProteins::feature_to_dataframe(Q9JHS1_n)

my_canvas <- draw_canvas(my_prot_df)
my_canvas <- draw_chains(my_canvas, my_prot_df, label_size = 2.5)
my_canvas <- draw_regions(my_canvas, my_prot_df)
my_canvas

DotPlots

h_fasta <- rentrez::entrez_fetch(db = "protein",# add code
                        id = "NP_001421.2",
                         rettype = "fasta")

h_vector <- fasta_cleaner(h_fasta)

# set up 2 x 2 grid, make margins thinner
par(mfrow = c(2,2), 
    mar = c(0,0,2,1))

# plot 1:EPAS1  - Defaults
seqinr::dotPlot(h_vector[400:600], h_vector[400:600], 
        wsize = 1, 
        nmatch = 1, 
        main = "EPAS1 Defaults")

# plot2: EPAS1 - size = 10, nmatch = 1
dotPlot (h_vector[400:600], h_vector[400:600], 
        wsize = 10, 
        nmatch = 1, 
        main = " EPAS1- size = 10, nmatch = 1")

# plot 3: EPAS1 - size =10 , nmatch = 5
dotPlot(h_vector[400:600], h_vector[400:600],
        wsize = 10, 
        nmatch = 5 , 
        main = "EPAS1 - size = 10, nmatch = 5")

# plot 4: EPAS1 - size =20 , nmatch = 5
dotPlot(h_vector[400:600], h_vector[400:600], 
        wsize = 20,
        nmatch = 5,
        main = "EPAS1 - size = 20, nmatch = 5")

Protein properties compiled from databases

Below are links to relevant information. The folllowing were not avalible for this gene: 1. DisProt: no information available on Presence of Disorder 2. RepeatDB: no information available

PDB secondary structural location: Crystal structure of PT2399 bound to HIF2a-B:ARNT-B complex

A the EPAS1 homolog is listed in Alphafold (https://alphafold.ebi.ac.uk/entry/Q99814).

source <- c("PFam","DisProt", "RepeatsDB", "UniProt", "PDB")

property <- c("major feature from Pfam - domain", "Presence of disorganized regions", "Presence of tandem repeats", "subcellular location", "Protein secondary structural classifications")

Info <- c("PAS 
          PAS_3 
          HIF-1 
          HIF-1a_CTAD", "NA", "NA", "nucleus", "1P97- NMR structure of the C-terminal PAS domain of HIF2a")

properties_table <- data.frame(source, property, Info)

pander::pander(properties_table)
source property Info
PFam major feature from Pfam - domain PAS PAS_3 HIF-1 HIF-1a_CTAD
DisProt Presence of disorganized regions NA
RepeatsDB Presence of tandem repeats NA
UniProt subcellular location nucleus
PDB Protein secondary structural classifications 1P97- NMR structure of the C-terminal PAS domain of HIF2a

Predict protein fold: secondary structure prediction

# enter once
aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
            "L","K","M","F","P","S","T","W","Y","V")

# enter again
aa.1.2 <- c("A","R","N","D","C","Q","E","G","H","I",
            "L","K","M","F","P","S","T","W","Y","V")

# are they the same length?
length(aa.1.1) == length(aa.1.2)
## [1] TRUE
# are all the values the same?
aa.1.1 == aa.1.2
##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE
#length(unique(aa.1.1)) tells me how many unique values
unique(aa.1.1)
##  [1] "A" "R" "N" "D" "C" "Q" "E" "G" "H" "I" "L" "K" "M" "F" "P" "S" "T" "W" "Y"
## [20] "V"
# does each vector have the same number of unique values?
length(unique(aa.1.1)) == length(unique(aa.1.2))
## [1] TRUE
# OPTIONAL: are any of the values returned by the logical check == FALSE?
any(c(aa.1.1 == aa.1.2) == FALSE)
## [1] FALSE
# alpha proteins
alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91, 
           221, 249, 48, 123, 82, 122, 119, 33, 63, 167)

# check against chou's total
sum(alpha) == 2447
## [1] TRUE
# beta proteins
beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120, 
          177, 115, 16, 85, 127, 341, 253, 44, 110, 229)

# check against chou's total
sum(beta) == 2776
## [1] TRUE
# alpha + beta
a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
              110, 112, 25, 52, 71, 126, 117, 30, 108, 123)
sum(a.plus.b) == 1889
## [1] TRUE
# alpha/beta
a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239, 
             339, 321, 91, 158, 188, 327, 238, 72, 130, 378)
sum(a.div.b) == 4333
## [1] TRUE
data.frame(aa.1.1, alpha, beta, a.plus.b, a.div.b)
##    aa.1.1 alpha beta a.plus.b a.div.b
## 1       A   285  203      175     361
## 2       R    53   67       78     146
## 3       N    97  139      120     183
## 4       D   163  121      111     244
## 5       C    22   75       74      63
## 6       Q    67  122       74     114
## 7       E   134   86       86     257
## 8       G   197  297      171     377
## 9       H   111   49       33     107
## 10      I    91  120       93     239
## 11      L   221  177      110     339
## 12      K   249  115      112     321
## 13      M    48   16       25      91
## 14      F   123   85       52     158
## 15      P    82  127       71     188
## 16      S   122  341      126     327
## 17      T   119  253      117     238
## 18      W    33   44       30      72
## 19      Y    63  110      108     130
## 20      V   167  229      123     378
alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)

#dataframe
aa.prop <- data.frame(alpha.prop,
                      beta.prop,
                      a.plus.b.prop,
                      a.div.b)
#row labels
row.names(aa.prop) <- aa.1.1
aa.prop
##    alpha.prop   beta.prop a.plus.b.prop    a.div.b
## A 0.116469146 0.073126801    0.09264161 0.08331410
## R 0.021659174 0.024135447    0.04129169 0.03369490
## N 0.039640376 0.050072046    0.06352567 0.04223402
## D 0.066612178 0.043587896    0.05876125 0.05631202
## C 0.008990601 0.027017291    0.03917417 0.01453958
## Q 0.027380466 0.043948127    0.03917417 0.02630972
## E 0.054760932 0.030979827    0.04552673 0.05931225
## G 0.080506743 0.106988473    0.09052409 0.08700669
## H 0.045361667 0.017651297    0.01746956 0.02469421
## I 0.037188394 0.043227666    0.04923240 0.05515809
## L 0.090314671 0.063760807    0.05823187 0.07823679
## K 0.101757254 0.041426513    0.05929063 0.07408262
## M 0.019615856 0.005763689    0.01323452 0.02100162
## F 0.050265631 0.030619597    0.02752779 0.03646434
## P 0.033510421 0.045749280    0.03758602 0.04338795
## S 0.049856968 0.122838617    0.06670196 0.07546734
## T 0.048630977 0.091138329    0.06193753 0.05492730
## W 0.013485901 0.015850144    0.01588142 0.01661666
## Y 0.025745811 0.039625360    0.05717311 0.03000231
## V 0.068246833 0.082492795    0.06511382 0.08723748

Data Exploration

plot(aa.prop,panel = panel.smooth)

Correlation Matrix

cor(aa.prop)
##               alpha.prop beta.prop a.plus.b.prop   a.div.b
## alpha.prop     1.0000000 0.4941143     0.6969508 0.8555289
## beta.prop      0.4941143 1.0000000     0.7977771 0.7706654
## a.plus.b.prop  0.6969508 0.7977771     1.0000000 0.8198043
## a.div.b        0.8555289 0.7706654     0.8198043 1.0000000
round(cor(aa.prop), 3)
##               alpha.prop beta.prop a.plus.b.prop a.div.b
## alpha.prop         1.000     0.494         0.697   0.856
## beta.prop          0.494     1.000         0.798   0.771
## a.plus.b.prop      0.697     0.798         1.000   0.820
## a.div.b            0.856     0.771         0.820   1.000

Plots

par(mfrow = c(1,3), mar = c(4,4,1,0))
plot(alpha.prop ~ beta.prop, data = aa.prop)
plot(alpha.prop ~ a.plus.b.prop, data = aa.prop)
plot(alpha.prop ~ a.div.b, data = aa.prop)

par(mfrow = c(1,1), mar = c(4,4,4,4))
# download
NP_001421 <- rentrez::entrez_fetch(id = "NP_001421.2",
                                     db = "protein",
                                     rettype = "fasta")

# clean and turn into vector
NP_001421 <- compbio4all::fasta_cleaner(NP_001421, parse = TRUE)
NP_001421.freq.table <- table(NP_001421)/length(NP_001421)

NP_001421.freq.table
## NP_001421
##           A           C           D           E           F           G 
## 0.060919540 0.025287356 0.050574713 0.067816092 0.043678161 0.057471264 
##           H           I           K           L           M           N 
## 0.031034483 0.031034483 0.057471264 0.100000000 0.033333333 0.037931034 
##           P           Q           R           S           T           V 
## 0.080459770 0.045977011 0.040229885 0.109195402 0.057471264 0.037931034 
##           W           Y 
## 0.006896552 0.025287356
table_to_vector <- function(table_x){
  table_names <- attr(table_x, "dimnames")[[1]]
  table_vect <- as.vector(table_x)
  names(table_vect) <- table_names
  return(table_vect)
}


human.aa.freq <- table_to_vector(NP_001421.freq.table)

Checking for U

aa.names <- names(human.aa.freq)
any(aa.names == "U")
## [1] FALSE
chou_cor <- function(x,y){
  numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)
}


chou_cosine <- function(z.1, z.2){
  z.1.abs <- sqrt(sum(z.1^2))
  z.2.abs <- sqrt(sum(z.2^2))
  my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs)
  return(my.cosine)
}


par(mfrow = c(2,2), mar = c(1,4,1,1))

plot(alpha.prop ~ human.aa.freq, data = aa.prop)
plot(beta.prop ~ human.aa.freq, data = aa.prop)
plot(a.plus.b.prop  ~ human.aa.freq, data = aa.prop)
plot(a.div.b ~ human.aa.freq, data = aa.prop)

par(mfrow = c(1,1), mar = c(1,1,1,1))
#Calculate correlation between each column
corr.alpha <- chou_cor(aa.prop[ , ], aa.prop[ , ])
corr.beta  <- chou_cor(aa.prop[ , ], aa.prop[ , ])
corr.apb   <- chou_cor(aa.prop[ , ], aa.prop[ , ])
corr.adb   <- chou_cor(aa.prop[ , ], aa.prop[ , ])

#Calculate cosine similarity
cos.alpha <- chou_cosine(aa.prop[ , ], aa.prop[ , ])
cos.beta  <- chou_cosine(aa.prop[ , ], aa.prop[ , ])
cos.apb   <- chou_cosine(aa.prop[ , ], aa.prop[ , ])
cos.adb   <- chou_cosine(aa.prop[ , ], aa.prop[ , ])

aa.prop.flipped <- t(aa.prop)
round(aa.prop.flipped,2)
##                  A    R    N    D    C    Q    E    G    H    I    L    K    M
## alpha.prop    0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09 0.10 0.02
## beta.prop     0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06 0.04 0.01
## a.plus.b.prop 0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06 0.06 0.01
## a.div.b       0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08 0.07 0.02
##                  F    P    S    T    W    Y    V
## alpha.prop    0.05 0.03 0.05 0.05 0.01 0.03 0.07
## beta.prop     0.03 0.05 0.12 0.09 0.02 0.04 0.08
## a.plus.b.prop 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## a.div.b       0.04 0.04 0.08 0.05 0.02 0.03 0.09
dist(aa.prop.flipped, method = "euclidean")
##               alpha.prop  beta.prop a.plus.b.prop
## beta.prop     0.13342098                         
## a.plus.b.prop 0.09281824 0.08289406              
## a.div.b       0.06699039 0.08659174    0.06175113
dist.alpha <- dist((aa.prop.flipped[c(1, 1),]),  method = "euclidean")
dist.beta  <- dist((aa.prop.flipped[c(2, 1),]),  method = "euclidean")
dist.apb   <- dist((aa.prop.flipped[c(3,1),]),  method = "euclidean")
dist.adb  <- dist((aa.prop.flipped[c(4,1),]), method = "euclidean")
# fold types
fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")

# data
corr.sim <- round(c(corr.alpha,corr.beta,corr.apb,corr.adb),5)
cosine.sim <- round(c(cos.alpha,cos.beta,cos.apb,cos.adb),5)
Euclidean.dist <- round(c(dist.alpha,dist.beta,dist.apb,dist.adb),5)

# summary
sim.sum <- c("","","most.sim","")
dist.sum <- c("","","min.dist","")

df <- data.frame(fold.type,
           corr.sim ,
           cosine.sim ,
           Euclidean.dist ,
           sim.sum ,
           dist.sum )
pander::pander(df)
fold.type corr.sim cosine.sim Euclidean.dist sim.sum dist.sum
alpha 1 1 0
beta 1 1 0.1334
alpha plus beta 1 1 0.09282 most.sim min.dist
alpha/beta 1 1 0.06699

Percent Identity Comparisons (PID)

# convert homo_vector to an object 
homoseq_string <-paste(hepas1 ,collapse = "")    
musseq_string <-paste(mepas1 ,collapse = "") 
monseq_string <-paste(MMepas1 ,collapse = "") 
gseq_string <-paste(gepas1 ,collapse = "") 

homoseq_string   <- toupper(homoseq_string)
musseq_string <- toupper(musseq_string)
monseq_string <- toupper(monseq_string)
gseq_string <- toupper(gseq_string)

PID Table

align01.02 <- Biostrings::pairwiseAlignment(homoseq_string, 
                                              musseq_string,
                                               substitutionMatrix = "BLOSUM62", 
                                               gapOpening = -8, 
                                               gapExtension = -2, 
                                               scoreOnly = FALSE)
align01.05 <- Biostrings::pairwiseAlignment(homoseq_string, 
                                              monseq_string,
                                               substitutionMatrix = "BLOSUM62", 
                                               gapOpening = -8, 
                                               gapExtension = -2, 
                                               scoreOnly = FALSE)
align01.06 <- Biostrings::pairwiseAlignment(homoseq_string,
                                              gseq_string,
                                               substitutionMatrix = "BLOSUM62", 
                                               gapOpening = -8, 
                                               gapExtension = -2, 
                                               scoreOnly = FALSE)
align02.05 <- Biostrings::pairwiseAlignment(musseq_string,
                                              monseq_string,
                                               substitutionMatrix = "BLOSUM62", 
                                               gapOpening = -8, 
                                               gapExtension = -2, 
                                               scoreOnly = FALSE)
align02.06 <- Biostrings::pairwiseAlignment(musseq_string,
                                             gseq_string,
                                               substitutionMatrix = "BLOSUM62", 
                                               gapOpening = -8, 
                                               gapExtension = -2, 
                                               scoreOnly = FALSE)
align05.06 <- Biostrings::pairwiseAlignment(monseq_string,
                                              gseq_string,
                                               substitutionMatrix = "BLOSUM62", 
                                               gapOpening = -8, 
                                               gapExtension = -2, 
                                               scoreOnly = FALSE)
pids <- c(1,                  NA,     NA,     NA,
          pid(align01.02),          1,     NA,     NA,
          pid(align01.05), pid(align02.05),      1,     NA,
          pid(align01.06), pid(align02.06), pid(align05.06), 1)

mat <- matrix(pids, nrow = 4, byrow = T)
row.names(mat) <- c("Homo","Mus","Pan","Gal")   
colnames(mat) <- c("Homo","Mus","Pan","Gal")   
pander::pander(mat) 
  Homo Mus Pan Gal
Homo 1 NA NA NA
Mus 87.89 1 NA NA
Pan 99.89 87.77 1 NA
Gal 74.55 71.04 74.55 1

PID methods comparison, Human Vs Chimp

homovschimp_table <- c("PID1", pid(align01.05),"(aligned positions + internal gap positions)", "PID2",
pid(align01.05, type = "PID2"), "(aligned positions)", "PID3",
pid(align01.05, type = "PID3"), "(length shorter sequence)", "PID4",
pid(align01.05, type = "PID4"), "(average length of the two sequences)")

   
homovschimp_matrix <- matrix(homovschimp_table,
                      nrow = 4, byrow = T)

homovschimp <- data.frame(homovschimp_matrix, 
                     stringsAsFactors = F)
names(homovschimp) <- c("method", "PID", "denomenator")

pander::pander(homovschimp)
method PID denomenator
PID1 99.8850574712644 (aligned positions + internal gap positions)
PID2 99.8850574712644 (aligned positions)
PID3 99.8850574712644 (length shorter sequence)
PID4 99.8850574712644 (average length of the two sequences)

Multiple Sequence Alignment

nchar(hepas1)
## [1] 870
nchar(mepas1)
## [1] 874
nchar(nepas1)
## [1] 885
nchar(zepas1)
## [1] 834
nchar(tepas1)
## [1] 870
nchar(gepas1)
## [1] 871
nchar(cepas1)
## [1] 869
nchar(pepas1)
## [1] 862
nchar(IPepas1)
## [1] 820
nchar(MMepas1)
## [1] 870
align.epas <- Biostrings::pairwiseAlignment(
                  hepas1,
                  mepas1)
align.epas
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MTADKEKKRSSSERRKEKSRDAARCRRSKETEVF...PELTRYDCEVNVPVLGSSTLLQGGDLLRALDQAT
## subject: MTADKEKKRSSSELRKEKSRDAARCRRSKETEVF...PELTRYDCEVNVPVPGSSTLLQGRDLLRALDQAT
## score: 2641.632
epas_table$accession
##  [1] "NP_001421.2"    "NP_034267.3"    "XP_038967774.1" "NP_001034895.2"
##  [5] "NP_777150.1"    "XP_015139104.2" "XP_022280389.2" "NP_001005647.1"
##  [9] "NP_001337036.1" "XP_001147219.1"
epas1 <- entrez_fetch(db = "protein", 
                          id = epas_table$accession, 
                          rettype = "fasta")
cat(epas1)
## >NP_001421.2 endothelial PAS domain-containing protein 1 [Homo sapiens]
## MTADKEKKRSSSERRKEKSRDAARCRRSKETEVFYELAHELPLPHSVSSHLDKASIMRLAISFLRTHKLL
## SSVCSENESEAEADQQMDNLYLKALEGFIAVVTQDGDMIFLSENISKFMGLTQVELTGHSIFDFTHPCDH
## EEIRENLSLKNGSGFGKKSKDMSTERDFFMRMKCTVTNRGRTVNLKSATWKVLHCTGQVKVYNNCPPHNS
## LCGYKEPLLSCLIIMCEPIQHPSHMDIPLDSKTFLSRHSMDMKFTYCDDRITELIGYHPEELLGRSAYEF
## YHALDSENMTKSHQNLCTKGQVVSGQYRMLAKHGGYVWLETQGTVIYNPRNLQPQCIMCVNYVLSEIEKN
## DVVFSMDQTESLFKPHLMAMNSIFDSSGKGAVSEKSNFLFTKLKEEPEELAQLAPTPGDAIISLDFGNQN
## FEESSAYGKAILPPSQPWATELRSHSTQSEAGSLPAFTVPQAAAPGSTTPSATSSSSSCSTPNSPEDYYT
## SLDNDLKIEVIEKLFAMDTEAKDQCSTQTDFNELDLETLAPYIPMDGEDFQLSPICPEERLLAENPQSTP
## QHCFSAMTNIFQPLAPVAPHSPFLLDKFQQQLESKKTEPEHRPMSSIFFDAGSKASLPPCCGQASTPLSS
## MGGRSNTQWPPDPPLHFGPTKWAVGDQRTEFLGAAPLGPPVSPPHVSTFKTRSAKGFGARGPDVLSPAMV
## ALSNKLKLKRQLEYEEQAFQDLSGGDPPGGSTSHLMWKRMKNLRGGSCPLMPDKPLSANVPNDKFTQNPM
## RGLGHPLRHLPLPQPPSAISPGENSKSRFPPQCYATQYQDYSLSSAHKVSGMASRLLGPSFESYLLPELT
## RYDCEVNVPVLGSSTLLQGGDLLRALDQAT
## 
## >NP_034267.3 endothelial PAS domain-containing protein 1 [Mus musculus]
## MTADKEKKRSSSELRKEKSRDAARCRRSKETEVFYELAHELPLPHSVSSHLDKASIMRLAISFLRTHKLL
## SSVCSENESEAEADQQMDNLYLKALEGFIAVVTQDGDMIFLSENISKFMGLTQVELTGHSIFDFTHPCDH
## EEIRENLTLKNGSGFGKKSKDVSTERDFFMRMKCTVTNRGRTVNLKSATWKVLHCTGQVRVYNNCPPHSS
## LCGSKEPLLSCLIIMCEPIQHPSHMDIPLDSKTFLSRHSMDMKFTYCDDRILELIGYHPEELLGRSAYEF
## YHALDSENMTKSHQNLCTKGQVVSGQYRMLAKHGGYVWLETQGTVIYNPRNLQPQCIMCVNYVLSEIEKN
## DVVFSMDQTESLFKPHLMAMNSIFDSSDDVAVTEKSNYLFTKLKEEPEELAQLAPTPGDAIISLDFGSQN
## FDEPSAYGKAILPPGQPWVSGLRSHSAQSESGSLPAFTVPQADTPGNTTPSASSSSSCSTPSSPEDYYSS
## LENPLKIEVIEKLFAMDTEPRDPGSTQTDFSELDLETLAPYIPMDGEDFQLSPICPEEPLMPESPQPTPQ
## HCFSTMTSIFQPLTPGATHGPFFLDKYPQQLESRKTESEHWPMSSIFFDAGSKGSLSPCCGQASTPLSSM
## GGRSNTQWPPDPPLHFGPTKWPVGDQSAESLGALPVGSSQLEPPSAPPHVSMFKMRSAKDFGARGPYMMS
## PAMIALSNKLKLKRQLEYEEQAFQDTSGGDPPGTSSSHLMWKRMKSLMGGTCPLMPDKTISANMAPDEFT
## QKSMRGLGQPLRHLPPPQPPSTRSSGENAKTGFPPQCYASQFQDYGPPGAQKVSGVASRLLGPSFEPYLL
## PELTRYDCEVNVPVPGSSTLLQGRDLLRALDQAT
## 
## >XP_038967774.1 endothelial PAS domain-containing protein 1 isoform X1 [Rattus norvegicus]
## MTAEWRELWEIRGQAGKDWRSSSELRKEKSRDAARCRRSKETEVFYELAHELPLPHSVSSHLDKASIMRL
## AISFLRTHKLLSSVCSENESEAEADQQMDNLYLKALEGFIAVVTQDGDMIFLSENISKFMGLTQVELTGH
## SIFDFTHPCDHEEIRENLTLKTGSGFGKKNKDMSTERDFFMRMKCTVTNRGRTVNLKSATWKVLHCTGQV
## RVYNNCPPHSSLCGYKEPLLSCLIIMCEPIQHPSHMDIPLDSKTFLSRHSMDMKFTYCDDRILELVGYHP
## EELLGRSAYEFYHALDSENMTKSHQNLCTKGQVVSGQYRMLAKHGGYVWLETQGTVVYNPRNLQPQCIMC
## VNYVLSEIEKNDVVFSMDQTESLFKPHLMAMNSIFDSSDDVALSEKSNYLFTKLKEEPEDLAQLAPTPGD
## AIISLDFGSQNFDESSTYGKAILPPGQPWATELRSHSAQSESRSLPAFTVPQAGSPGNATPSATSSSSCS
## TPSSPEDYYSSLENHLKIEVIEKLFAMDTEAKDQCSTQTDFNELDLETLAPYIPMDGEDFQLSPICPEEP
## LVPESPQPNPQHCFSTMSSIFQPLTPGASQGTFFLDKYPQQLESRKTESEHWPMSTIFFDAGSKGSLPPC
## CGQASTPLSSMGGRSNTQWPPDPPLHFGPTKWSVGNQSAEPLGPLPLGTSQLEPPSTPPHVSMFKMRSAK
## DFGARGPYMMSPAMIALSNKLKLKRQLDYEEPAFQDTSGGDPPGTSSSHLMWKRMKSLMGGTCPLMPDKT
## VSASMAPDEFTQKSMRGLGQPLRHLPPSQPPSTRSPGENAKSGFPPQCYASPFQDYSPPGAQKGSGVASR
## LLGPSFEPYLLPELTRYDCEVNVPVPGSSTLLQGRDLLRALDQAT
## 
## >NP_001034895.2 endothelial PAS domain-containing protein 1b [Danio rerio]
## MTAEKEKKRCSSERRKEKSRDAARCRRSKETEVFYELAHHLPLPHSISSHLDKASIMRLAISFLRTRKLV
## NSGYNTPTEMTDADRLMDSWYLKSLGGFITVVTSDGDMIFLSENINKFMGLTQVELTGHSIFDFTHPCDH
## EEIRENLSLKAGIGKKGKELSTERDFFMRMKCTVTNRGRTVNLKSASWKVLHCTGHLKVCNGCPARVLCG
## FKEPPLTCVVMMCEPIVHPSNIDTPLDSKTFLSRHSMDMKYTYCDERVTELMGYNPEDLLGRSAYEFYHA
## LDAENVTKSHQNLCTKGQAVSGQYRMLAKNGGYVWVETRGTVIYNSRNSQPQCIVCVNYVLSDVEEKSLI
## FSMDQTESLFKPHKLNGFFSPKEALGSDPADLLFTKLKEEPEDLTQLAPTPGDTIISLDFGQSQYEEHTV
## YNKVSSVAQTVSHPVHDGHRTSYSGEMAKMAATFSVPQSAPPSSATPSLSSCSTPSSPDDYYTPVDSDLK
## VELTEKLFSLDTQEAKTSRNQETDLSDLDLETLAPYIPMDGEDFQLNPICPEEPPSEIGTLGTNQQCFSN
## ITSLFQPLSSPSAAHYQPKMSSGGDKQNINGGSVESWPPVPYSRDPMQMPPYHDPASTPLSSMGGRQNLQ
## WPPDPPLPSKAGMMDPLAAKRSCQGMPANRMAPFMQRPMENFVQNYRDTSPARLALANSFKRSFSQMAMA
## ETPPTKSQQTVWKKLRHESCAVMERKSLSSSALSDKSMAHNGGMDHQHRKSQYSGNQNGQPTKHYREQFC
## NYREFNMQPSSKMDGIASRLIGPSFETYSLPELTRYDCEVNVPLQGNLHLLQGSDLLRALDQAT
## 
## >NP_777150.1 endothelial PAS domain-containing protein 1 [Bos taurus]
## MTADKEKKRSSSERRKEKSRDAARCRRSKETEVFYELAHELPLPHSVSSHLDKASIMRLAISFLRTHKLL
## SSVCSENESEADADQQMDNLYLKALEGFIAVVTQDGDMIFLSENISKFMGLTQVELTGHSIFDFTHPCDH
## EEIRENLSLKNGSGFGKKSKDMSTERDFFMRMKCTVTNRGRTVNLKSATWKVLHCTGQVKVYNNCPPHSS
## LCGCKEPLLSCLIIMCEPIQHPSHMDIPLDSKTFLSRHSMDMKFTYCDDRITELVGYHPEELLGRSAYEF
## YHALDSENMTKSHQNLCTKGQVVSGQYRMLAKHGGYVWLETQGTVIYNPRNLQPQCIMCVNYVLSEIEKN
## DVVFSMDQTESLFKPHLLTMNSIFDNSGKVAVSEKSNFLFTKLKEEPEELAQLAPTAGDTIISLDFGTPN
## FEESSAYGKGILPPGQQWTGEVKSHGTHSEAGSLPAFTVPQAAALGNSTPSASSSSSCSTPSSPGDYYTS
## LDDNLKIEAIEKLFAMDTEAKDQCGTQTDFNELDLETLAPYIPMDGEDFQLSPICPEESLLPETPQSAPQ
## HCFSTMSNIFQPLAPMASHSTFLLDKYQQQLESKKTEPEQRRVSFAFFDGGSRVSLLQCCGQTYTPLSSM
## GGISNTQWPPDPPLQLGPTKWPGEDRHAEAVGAAPLGLPPATPHLAMLKKRSAKGFGPQGPDVMSPAMIA
## LSNKLKLKRQLEYEEQAFQDMSGGDPPGSGTSHLMWKRMKSLRGGGTCSLMPDKLPNANVPNDEFIQNPV
## RGRSQPLRHLSPPQPPSATSPGEPTKSGFPAQCYAPQYQDYSLPAAHKMSGMASRLLGPSFEPYLLPELT
## RYDCEVNVPVPGTSTLLQGGDLLRALDQAT
## 
## >XP_015139104.2 endothelial PAS domain-containing protein 1 isoform X1 [Gallus gallus]
## MAEADHPAPSSAESSSERRKEKSRDAARCRRSKETEVFYELAHELPLPHNVSSHLDKASIMRLAISFLRT
## HKLLSSVCADNENELEADQQMDNLYLKALEGFIAVVTQDGDMIFLSENVNKYMGLTQVELTGHSIFDFTH
## PCDHEEIRENLSLKNGPGFGKKNKEMSTERDFFMRMKCTVTNRGRTVNLKSATWKVLHCTGQVKVYNTCP
## PHTLCGYKEPLLTCLIIMCEPIQHPSNIDIPLDSKTFMSRHSMDMKFTYCDDRITELIGYHPEELLGRSA
## YEFYHALDSENMTKSHQNLCTKGQVVTGQYRMLAKHGGYVWLETQGTVIYNTRNLQPQCIICVNYVLSEI
## EKNDIVFSMDQTESLFKPHLLTMSSAFENGISGRDKSDLLFTKLKEEPEELAQLAPTPGDAIISLDFELH
## PGIQKFEEAPAYTSAVLTPNKPWPVEVKSHAAQGETLTIPSFTMPQIAPGSSTPSASSNSSCSTPNSPED
## YYTSVDDDLKIEVIEKLFAMDTESKSQCNSQTDFNELDLETLAPYIPMDGEDFQLSPICQEERTLSESAQ
## NTQQSLSSMSTIFQPLASASQNQFLPEKYCPQLSNKNINPGHGSLSSVFFNNMSRSSLPPYHNQASTPLS
## SMGGRPNTQWPPDPPLEYVPSKWRLMDKYSGTLSSSPSGPPVRSPNMPIYKKRPLDGLGQRGIDINPARI
## ALSNSLKLKRQLDYEEQALQQLSGGDPSVINPPQLMWKRMKFLKGENCSLLTEKKSLSTSVLTDEFVCNS
## RGLSQPVNQLQQQQQSTCGSPGENLKAGAFSPQFYSPHYQDYTVQSAHKVSGVTSRLLGSSFEPYLLPEL
## TRYDCEVNVPVLGSSTLLQGSELLRALDQAT
## 
## >XP_022280389.2 endothelial PAS domain-containing protein 1 [Canis lupus familiaris]
## MTADKEKKRSSSERRKEKSRDAARCRRSKETEVFYELAHELPLPHSVSSHLDKASIMRLAISFLRTHKLL
## SSVCSENESEAEANQQMDNLYLKALEGFIAVVTQDGDMIFLSENISKFMGLTQVELTGHSIFDFTHPCDH
## EEIRENLSLKSGPGFGKKSKDMSTERDFFMRMKCTVTNRGRTVNLKSATWKVLHCTGQVKVYNNCPPHSS
## LCSYKEPLLSCLIIMCEPIQHPSHMDIPLDSKTFLSRHSMDMKFTYCDDRITELVGYHPEELLGRSAYEF
## YHALDSENMTKSHQNLCTKGQVVSGQYRMLAKHGGYVWLETQGTVIYNPRNLQPQCIMCVNYVLSEIEKN
## DVVFSMDQTESLFKPHLMAMTSLFDSSGEVAVSDKSDYLFTKLKEEPEELAQLAPTAGDAIISLDFGSQS
## FEESSSYNSALLPPSQPWPRELRSHSTQSEDGSLPAFTVPQAAASGSATPSATSSSSSCSTPSSPGDYYT
## SLNDDLKIEVIEKLFTMDTEAKDQGSTQTDFSELDLETLAPYIPMDGEDFQLSPICPEERLLPEKPQSTP
## QHCFSTMTNIFQPLAPMASHSPFLLDKYQQQLGSKKIEPEHQPMSSIFFDGGNKVSLPPCCGQASTPLSS
## MGGRSSTQWPPDPPLHFGPTKWPVADQHTESLGPSPLGPPITSPHLSMFKKRSAKAFGPQGPDVMSPAMV
## ALSNKLKLKRQREYEEQAFQDLSGGDPPGSSTSHQVWKRMKSLRGNVNCPLIPDKLLSANIPNDEFTQNP
## MRGLGQPLRHLPPPPSVMSPGENTKSGFPPQCYAPQYQDYSLPSAHKVSGMASRLLGPSFEPYLLPELTR
## YDCEVNVPVPGSSTLLQGGDLLRALDQAT
## 
## >NP_001005647.1 endothelial PAS domain-containing protein 1 [Xenopus tropicalis]
## MTAEKEKKRNSSERRKEKSRDAARCRRSKETEVFYELAHQLPLPQSISSHLDKASIMRLTISFLRTHKLL
## SSVCADRNIETASEKQLDNLYLKALEGFVAVVTQDGDMIFLSENVNKYLGLTQVELTGHSIFDFTHPCDH
## DEIKENLSMKTGVGSGKKNKDANTEHDFFMRMKCTVTNRGRTVNLKSATWKVLHCTGHVKAYNSYYPHSL
## CGYKEPVLSCLIMMCQPIQHPSNIDIPLDSKTFLSRHSMDMKFTYCDDRITELIGYHPEELLGRSAYEFY
## HALDSESMTKSHQNLCAKGQVVTGQYRMLAKHGGYLWVETQSTVIYNTRNSQPQCIVCVNYVLSEIEKND
## VVFSMDQTESLFKPHLMTMNSIFSSSVQEKSDFLFTKLKKEPEDLAQLAPTPGDEIVSLDFGSQTFDEQS
## NFNSVQTSPSKQWPLEVKNLNSQVDIINLPSFKAPQVSTGNSTPSASSNSTCSTPSSPEAYYSSLDEDLK
## IELIEKLFAMDTEAKNQCNTTQNDFNDLDLETLAPYIPMDGEDFQLNPICQEESTISDTPQKAQQNLSSI
## SSLFQPSTPSPQNQFLQQNICQTTAAKSNNGSQDPLPSVLFNNEKKALPLTPYHTGASPPLSSMGGRPNV
## QWPPDPPVSYMPNKWRFVEQYNGSLSSLPSGPPVHLHNMPLYKQRSLESFGQRGKDLHPAEIPFCNSMKR
## KRQLDYGEQGFPQLDVGDLQGESNNPLIMWKRMKALKSGSCPLVAERKSLSTSVLNDGYVCTRHRELNQP
## ISQQQQQQCVAPPRDGRKTAYSNTFFSCSYHDYNMQQTEKMKGLTSRLIAPSFEPYLLPELTRYDCEVNV
## PVLGSSTLLQGSDLLRALDQTT
## 
## >NP_001337036.1 endothelial PAS domain-containing protein 1 [Ictalurus punctatus]
## MTAEKDKKRSTSERRKEKSRDAARCRRSKETEVFYELAHQLPLPHSVSSHLDKASIMRLAISFLRTCKLF
## TSGCSTSETDIDRQMDSLYLKSLEGFISVVTSDGDIIFLSENINKFMGLTQVELIGHSIFDFTHPCDHEE
## IRENLSMKTGVGKKGKDLSTERDFFMRMKCTVTSRGRTVNLKSASWKVLHCTGHLKVYNGCSTRTPCGYK
## ESPLTCVVMLCEPVPHPSNIDTPFDSKTFLSRHSMDMKFTYCDERVTQLMGYNPEDLLGRSVYEFYHALD
## SESVTRSHQNLCTKGQAVSGHYRMLAKHGGFVWVETQGTVIYSSRNSQPQCIVCVNYVLSDIEEKSTIFS
## KDQTESLLKTNMSSFFSKARSPMASETSSALFTKFKEEPEDLNHLAPTPGDGFIPLNFGHPSFEEYPVCS
## KVSPMHPPATHSVTERHNLPTMGANFSIPQAPPPSSATPSISNCSTPSSPDDYKSPVDDLKMEITEKLFA
## MDTKGKNSYSQETELSDLDLETLAPYIPMDGEDFQLNPIGQEEPLPEAVALGITEYSFSNIANFFQPLTP
## PPGAHFQPNPHSASEKQAPSNATMEPWPPIFYASHMPLAHHTNPESIPLASMGGHQSLQWPPDPPINYSS
## TKGGAIDSLVEKHSCQALQTNRMSLHNQRSMEKYGPCKAYRDVSPVRLTIPNTMKRSFSNMSMGVSSATR
## PAEMWKRMKNESCAILNRMSQSSSALTGEHMGHQHRKTQNQGNQTARGKKDYPEHCCNYTDYNMLPNTKM
## EGVASRLLGPSFEYCLPELTRYDCEVNVPLQGNLHLLQGRDLLCALDQAT
## 
## >XP_001147219.1 endothelial PAS domain-containing protein 1 isoform X2 [Pan troglodytes]
## MTADKEKKRSSSERRKEKSRDAARCRRSKETEVFYELAHELPLPHSVSSHLDKASIMRLAISFLRTHKLL
## SSVCSENESEAEADQQMDNLYLKALEGFIAVVTQDGDMIFLSENISKFMGLTQVELTGHSIFDFTHPCDH
## EEIRENLSLKNGSGFGKKSKDMSTERDFFMRMKCTVTNRGRTVNLKSATWKVLHCTGQVKVYNNCPPHNS
## LCGYKEPLLSCLIIMCEPIQHPSHMDIPLDSKTFLSRHSMDMKFTYCDDRITELIGYHPEELLGRSAYEF
## YHALDSENMTKSHQNLCTKGQVVSGQYRMLAKHGGYVWLETQGTVIYNPRNLQPQCIMCVNYVLSEIEKN
## DVVFSMDQTESLFKPHLMAMNSIFDSSGKGAVSEKSNFLFTKLKEEPEELAQLAPTPGDAIISLDFGNQN
## FEESSAYGKAILPPSQPWATELRSHSTQSEAGSLPAFTVPQAAAPGSTTPSATSSSSSCSTPNSPEDYYT
## SLDNDLKIEVIEKLFAMDTEAKDQCSTQTDFNELDLETLAPYIPMDGEDFQLSPICPEERLLAENPQSTP
## QHCFSAMTNIFQPLAPVAPHSPFLLDKFQQQLESKKTEPEHRPMSSIFFDAGSKASLPPCCGQASTPLSS
## MGGRSNTQWPPDPPLHFGPTKWAVGDQRTEFLGAAPLGPPVSPPHVSTFKTRSAKGFGARGPDVLSPAMV
## ALSNKLKLKRQLEYEEQAFQDLSGGDPPGGSTSHLMWKRMKNLRGGSCPLMPDKPLSANVPNDKFTQNPM
## RGLGHPLRHLPLPQPPSAISPGENSKSRFPPQCYTTQYQDYSLSSAHKVSGMASRLLGPSFESYLLPELT
## RYDCEVNVPVLGSSTLLQGGDLLRALDQAT
epas_list <- entrez_fetch_list(db = "protein", 
                          id = epas_table$accession, 
                          rettype = "fasta")
length(epas_list)
## [1] 10
for(i in 1:length(epas_list)){
  epas_list[[i]] <- fasta_cleaner(epas_list[[i]], parse = F)
}
epas_vector <- rep(NA, length(epas_list))

for(i in 1:length(epas_vector)){
  epas_vector[i] <- epas_list[[i]]
}


names(epas_vector) <- names(epas_list)
epas_vector_ss <- Biostrings::AAStringSet(epas_vector)
epas_align <- msa(epas_vector_ss,
                     method = "ClustalW")
## use default substitution matrix
epas_align
## CLUSTAL 2.1  
## 
## Call:
##    msa(epas_vector_ss, method = "ClustalW")
## 
## MsaAAMultipleAlignment with 10 rows and 900 columns
##      aln                                                   names
##  [1] -MTADK-----------EKKRSSSE...VNVPVPGSSTLLQGRDLLRALDQAT NP_034267.3
##  [2] -MTAEWRELWEIRGQAGKDWRSSSE...VNVPVPGSSTLLQGRDLLRALDQAT XP_038967774.1
##  [3] -MTADK-----------EKKRSSSE...VNVPVLGSSTLLQGGDLLRALDQAT NP_001421.2
##  [4] -MTADK-----------EKKRSSSE...VNVPVLGSSTLLQGGDLLRALDQAT XP_001147219.1
##  [5] -MTADK-----------EKKRSSSE...VNVPVPGSSTLLQGGDLLRALDQAT XP_022280389.2
##  [6] -MTADK-----------EKKRSSSE...VNVPVPGTSTLLQGGDLLRALDQAT NP_777150.1
##  [7] MAEADHP--------APSSAESSSE...VNVPVLGSSTLLQGSELLRALDQAT XP_015139104.2
##  [8] -MTAEK-----------EKKRNSSE...VNVPVLGSSTLLQGSDLLRALDQTT NP_001005647.1
##  [9] -MTAEK-----------EKKRCSSE...VNVPLQGNLHLLQGSDLLRALDQAT NP_001034895.2
## [10] -MTAEK-----------DKKRSTSE...VNVPLQGNLHLLQGRDLLCALDQAT NP_001337036.1
##  Con -MTADK-----------EKKRSSSE...VNVPV?GSSTLLQG?DLLRALDQAT Consensus
class(epas_align) <- "AAMultipleAlignment"
epas_align_seqinr <- msaConvert(epas_align, type = "seqinr::alignment")
print_msa(alignment = epas_align_seqinr, 
          chunksize = 60)
## [1] "-MTADK-----------EKKRSSSELRKEKSRDAARCRRSKETEVFYELAHELPLPHSVS 0"
## [1] "-MTAEWRELWEIRGQAGKDWRSSSELRKEKSRDAARCRRSKETEVFYELAHELPLPHSVS 0"
## [1] "-MTADK-----------EKKRSSSERRKEKSRDAARCRRSKETEVFYELAHELPLPHSVS 0"
## [1] "-MTADK-----------EKKRSSSERRKEKSRDAARCRRSKETEVFYELAHELPLPHSVS 0"
## [1] "-MTADK-----------EKKRSSSERRKEKSRDAARCRRSKETEVFYELAHELPLPHSVS 0"
## [1] "-MTADK-----------EKKRSSSERRKEKSRDAARCRRSKETEVFYELAHELPLPHSVS 0"
## [1] "MAEADHP--------APSSAESSSERRKEKSRDAARCRRSKETEVFYELAHELPLPHNVS 0"
## [1] "-MTAEK-----------EKKRNSSERRKEKSRDAARCRRSKETEVFYELAHQLPLPQSIS 0"
## [1] "-MTAEK-----------EKKRCSSERRKEKSRDAARCRRSKETEVFYELAHHLPLPHSIS 0"
## [1] "-MTAEK-----------DKKRSTSERRKEKSRDAARCRRSKETEVFYELAHQLPLPHSVS 0"
## [1] " "
## [1] "SHLDKASIMRLAISFLRTHKLLSSVCSENESEAEADQQMDNLYLKALEGFIAVVTQDGDM 0"
## [1] "SHLDKASIMRLAISFLRTHKLLSSVCSENESEAEADQQMDNLYLKALEGFIAVVTQDGDM 0"
## [1] "SHLDKASIMRLAISFLRTHKLLSSVCSENESEAEADQQMDNLYLKALEGFIAVVTQDGDM 0"
## [1] "SHLDKASIMRLAISFLRTHKLLSSVCSENESEAEADQQMDNLYLKALEGFIAVVTQDGDM 0"
## [1] "SHLDKASIMRLAISFLRTHKLLSSVCSENESEAEANQQMDNLYLKALEGFIAVVTQDGDM 0"
## [1] "SHLDKASIMRLAISFLRTHKLLSSVCSENESEADADQQMDNLYLKALEGFIAVVTQDGDM 0"
## [1] "SHLDKASIMRLAISFLRTHKLLSSVCADNENELEADQQMDNLYLKALEGFIAVVTQDGDM 0"
## [1] "SHLDKASIMRLTISFLRTHKLLSSVCADRNIETASEKQLDNLYLKALEGFVAVVTQDGDM 0"
## [1] "SHLDKASIMRLAISFLRTRKLVNSGYNTPTEMTDADRLMDSWYLKSLGGFITVVTSDGDM 0"
## [1] "SHLDKASIMRLAISFLRTCKLFTSGCSTS--ETDIDRQMDSLYLKSLEGFISVVTSDGDI 0"
## [1] " "
## [1] "IFLSENISKFMGLTQVELTGHSIFDFTHPCDHEEIRENLTLKNGSGFGKKSKDVSTERDF 0"
## [1] "IFLSENISKFMGLTQVELTGHSIFDFTHPCDHEEIRENLTLKTGSGFGKKNKDMSTERDF 0"
## [1] "IFLSENISKFMGLTQVELTGHSIFDFTHPCDHEEIRENLSLKNGSGFGKKSKDMSTERDF 0"
## [1] "IFLSENISKFMGLTQVELTGHSIFDFTHPCDHEEIRENLSLKNGSGFGKKSKDMSTERDF 0"
## [1] "IFLSENISKFMGLTQVELTGHSIFDFTHPCDHEEIRENLSLKSGPGFGKKSKDMSTERDF 0"
## [1] "IFLSENISKFMGLTQVELTGHSIFDFTHPCDHEEIRENLSLKNGSGFGKKSKDMSTERDF 0"
## [1] "IFLSENVNKYMGLTQVELTGHSIFDFTHPCDHEEIRENLSLKNGPGFGKKNKEMSTERDF 0"
## [1] "IFLSENVNKYLGLTQVELTGHSIFDFTHPCDHDEIKENLSMKTGVGSGKKNKDANTEHDF 0"
## [1] "IFLSENINKFMGLTQVELTGHSIFDFTHPCDHEEIRENLSLKAG--IGKKGKELSTERDF 0"
## [1] "IFLSENINKFMGLTQVELIGHSIFDFTHPCDHEEIRENLSMKTG--VGKKGKDLSTERDF 0"
## [1] " "
## [1] "FMRMKCTVTNRGRTVNLKSATWKVLHCTGQVRVYNNCPPHSSLCGSKEPLLSCLIIMCEP 0"
## [1] "FMRMKCTVTNRGRTVNLKSATWKVLHCTGQVRVYNNCPPHSSLCGYKEPLLSCLIIMCEP 0"
## [1] "FMRMKCTVTNRGRTVNLKSATWKVLHCTGQVKVYNNCPPHNSLCGYKEPLLSCLIIMCEP 0"
## [1] "FMRMKCTVTNRGRTVNLKSATWKVLHCTGQVKVYNNCPPHNSLCGYKEPLLSCLIIMCEP 0"
## [1] "FMRMKCTVTNRGRTVNLKSATWKVLHCTGQVKVYNNCPPHSSLCSYKEPLLSCLIIMCEP 0"
## [1] "FMRMKCTVTNRGRTVNLKSATWKVLHCTGQVKVYNNCPPHSSLCGCKEPLLSCLIIMCEP 0"
## [1] "FMRMKCTVTNRGRTVNLKSATWKVLHCTGQVKVYNTCPPHT-LCGYKEPLLTCLIIMCEP 0"
## [1] "FMRMKCTVTNRGRTVNLKSATWKVLHCTGHVKAYNSYYPHS-LCGYKEPVLSCLIMMCQP 0"
## [1] "FMRMKCTVTNRGRTVNLKSASWKVLHCTGHLKVCNGCPARV-LCGFKEPPLTCVVMMCEP 0"
## [1] "FMRMKCTVTSRGRTVNLKSASWKVLHCTGHLKVYNGCSTRT-PCGYKESPLTCVVMLCEP 0"
## [1] " "
## [1] "IQHPSHMDIPLDSKTFLSRHSMDMKFTYCDDRILELIGYHPEELLGRSAYEFYHALDSEN 0"
## [1] "IQHPSHMDIPLDSKTFLSRHSMDMKFTYCDDRILELVGYHPEELLGRSAYEFYHALDSEN 0"
## [1] "IQHPSHMDIPLDSKTFLSRHSMDMKFTYCDDRITELIGYHPEELLGRSAYEFYHALDSEN 0"
## [1] "IQHPSHMDIPLDSKTFLSRHSMDMKFTYCDDRITELIGYHPEELLGRSAYEFYHALDSEN 0"
## [1] "IQHPSHMDIPLDSKTFLSRHSMDMKFTYCDDRITELVGYHPEELLGRSAYEFYHALDSEN 0"
## [1] "IQHPSHMDIPLDSKTFLSRHSMDMKFTYCDDRITELVGYHPEELLGRSAYEFYHALDSEN 0"
## [1] "IQHPSNIDIPLDSKTFMSRHSMDMKFTYCDDRITELIGYHPEELLGRSAYEFYHALDSEN 0"
## [1] "IQHPSNIDIPLDSKTFLSRHSMDMKFTYCDDRITELIGYHPEELLGRSAYEFYHALDSES 0"
## [1] "IVHPSNIDTPLDSKTFLSRHSMDMKYTYCDERVTELMGYNPEDLLGRSAYEFYHALDAEN 0"
## [1] "VPHPSNIDTPFDSKTFLSRHSMDMKFTYCDERVTQLMGYNPEDLLGRSVYEFYHALDSES 0"
## [1] " "
## [1] "MTKSHQNLCTKGQVVSGQYRMLAKHGGYVWLETQGTVIYNPRNLQPQCIMCVNYVLSEIE 0"
## [1] "MTKSHQNLCTKGQVVSGQYRMLAKHGGYVWLETQGTVVYNPRNLQPQCIMCVNYVLSEIE 0"
## [1] "MTKSHQNLCTKGQVVSGQYRMLAKHGGYVWLETQGTVIYNPRNLQPQCIMCVNYVLSEIE 0"
## [1] "MTKSHQNLCTKGQVVSGQYRMLAKHGGYVWLETQGTVIYNPRNLQPQCIMCVNYVLSEIE 0"
## [1] "MTKSHQNLCTKGQVVSGQYRMLAKHGGYVWLETQGTVIYNPRNLQPQCIMCVNYVLSEIE 0"
## [1] "MTKSHQNLCTKGQVVSGQYRMLAKHGGYVWLETQGTVIYNPRNLQPQCIMCVNYVLSEIE 0"
## [1] "MTKSHQNLCTKGQVVTGQYRMLAKHGGYVWLETQGTVIYNTRNLQPQCIICVNYVLSEIE 0"
## [1] "MTKSHQNLCAKGQVVTGQYRMLAKHGGYLWVETQSTVIYNTRNSQPQCIVCVNYVLSEIE 0"
## [1] "VTKSHQNLCTKGQAVSGQYRMLAKNGGYVWVETRGTVIYNSRNSQPQCIVCVNYVLSDVE 0"
## [1] "VTRSHQNLCTKGQAVSGHYRMLAKHGGFVWVETQGTVIYSSRNSQPQCIVCVNYVLSDIE 0"
## [1] " "
## [1] "KNDVVFSMDQTESLFKPHLMAMNSIFDSSDDVAVTEKSNYLFTKLKEEPEELAQLAPTPG 0"
## [1] "KNDVVFSMDQTESLFKPHLMAMNSIFDSSDDVALSEKSNYLFTKLKEEPEDLAQLAPTPG 0"
## [1] "KNDVVFSMDQTESLFKPHLMAMNSIFDSSGKGAVSEKSNFLFTKLKEEPEELAQLAPTPG 0"
## [1] "KNDVVFSMDQTESLFKPHLMAMNSIFDSSGKGAVSEKSNFLFTKLKEEPEELAQLAPTPG 0"
## [1] "KNDVVFSMDQTESLFKPHLMAMTSLFDSSGEVAVSDKSDYLFTKLKEEPEELAQLAPTAG 0"
## [1] "KNDVVFSMDQTESLFKPHLLTMNSIFDNSGKVAVSEKSNFLFTKLKEEPEELAQLAPTAG 0"
## [1] "KNDIVFSMDQTESLFKPHLLTMSSAFENG--ISGRDKSDLLFTKLKEEPEELAQLAPTPG 0"
## [1] "KNDVVFSMDQTESLFKPHLMTMNSIFSSS----VQEKSDFLFTKLKKEPEDLAQLAPTPG 0"
## [1] "EKSLIFSMDQTESLFKPHKLNG---FFSPKEALGSDPADLLFTKLKEEPEDLTQLAPTPG 0"
## [1] "EKSTIFSKDQTESLLKTNMSSF---FSKARSPMASETSSALFTKFKEEPEDLNHLAPTPG 0"
## [1] " "
## [1] "DAIISLDFGS----QNFDEPSAYGKAILPPGQPWVSGLRSHSAQ---SESGSLPAFTVPQ 0"
## [1] "DAIISLDFGS----QNFDESSTYGKAILPPGQPWATELRSHSAQ---SESRSLPAFTVPQ 0"
## [1] "DAIISLDFGN----QNFEESSAYGKAILPPSQPWATELRSHSTQ---SEAGSLPAFTVPQ 0"
## [1] "DAIISLDFGN----QNFEESSAYGKAILPPSQPWATELRSHSTQ---SEAGSLPAFTVPQ 0"
## [1] "DAIISLDFGS----QSFEESSSYNSALLPPSQPWPRELRSHSTQ---SEDGSLPAFTVPQ 0"
## [1] "DTIISLDFGT----PNFEESSAYGKGILPPGQQWTGEVKSHGTH---SEAGSLPAFTVPQ 0"
## [1] "DAIISLDFELHPGIQKFEEAPAYTSAVLTPNKPWPVEVKSHAAQ---GETLTIPSFTMPQ 0"
## [1] "DEIVSLDFGS----QTFDEQSNFNSVQTSPSKQWPLEVKNLNSQ---VDIINLPSFKAPQ 0"
## [1] "DTIISLDFGQ----SQYEEHTVYNKVSSVAQTVSHPVHDGHRTSYSGEMAKMAATFSVPQ 0"
## [1] "DGFIPLNFGH----PSFEEYPVCSKVSPMHPPATHSVTERHN------LPTMGANFSIPQ 0"
## [1] " "
## [1] "ADTPGNTTPSASSSSS-CSTPSSPEDYYSSLENPLKIEVIEKLFAMDTEPRDPG-STQTD 0"
## [1] "AGSPGNATPSATSSSS-CSTPSSPEDYYSSLENHLKIEVIEKLFAMDTEAKDQC-STQTD 0"
## [1] "AAAPGSTTPSATSSSSSCSTPNSPEDYYTSLDNDLKIEVIEKLFAMDTEAKDQC-STQTD 0"
## [1] "AAAPGSTTPSATSSSSSCSTPNSPEDYYTSLDNDLKIEVIEKLFAMDTEAKDQC-STQTD 0"
## [1] "AAASGSATPSATSSSSSCSTPSSPGDYYTSLNDDLKIEVIEKLFTMDTEAKDQG-STQTD 0"
## [1] "AAALGNSTPSASSSSS-CSTPSSPGDYYTSLDDNLKIEAIEKLFAMDTEAKDQC-GTQTD 0"
## [1] "IAP-GSSTPSASSNSS-CSTPNSPEDYYTSVDDDLKIEVIEKLFAMDTESKSQC-NSQTD 0"
## [1] "VST-GNSTPSASSNST-CSTPSSPEAYYSSLDEDLKIELIEKLFAMDTEAKNQCNTTQND 0"
## [1] "SAPPSSATPSLSS----CSTPSSPDDYYTPVDSDLKVELTEKLFSLDTQEAKTSRNQETD 0"
## [1] "APPPSSATPSISN----CSTPSSPDDYKSPVD-DLKMEITEKLFAMDTK-GKNSYSQETE 0"
## [1] " "
## [1] "FSELDLETLAPYIPMDGEDFQLSPICPEEPLMPESPQPTPQHCFSTMTSIFQPLTPGATH 0"
## [1] "FNELDLETLAPYIPMDGEDFQLSPICPEEPLVPESPQPNPQHCFSTMSSIFQPLTPGASQ 0"
## [1] "FNELDLETLAPYIPMDGEDFQLSPICPEERLLAENPQSTPQHCFSAMTNIFQPLAPVAPH 0"
## [1] "FNELDLETLAPYIPMDGEDFQLSPICPEERLLAENPQSTPQHCFSAMTNIFQPLAPVAPH 0"
## [1] "FSELDLETLAPYIPMDGEDFQLSPICPEERLLPEKPQSTPQHCFSTMTNIFQPLAPMASH 0"
## [1] "FNELDLETLAPYIPMDGEDFQLSPICPEESLLPETPQSAPQHCFSTMSNIFQPLAPMASH 0"
## [1] "FNELDLETLAPYIPMDGEDFQLSPICQEERTLSESAQNT-QQSLSSMSTIFQPLAS-ASQ 0"
## [1] "FNDLDLETLAPYIPMDGEDFQLNPICQEESTISDTPQKA-QQNLSSISSLFQPSTP-SPQ 0"
## [1] "LSDLDLETLAPYIPMDGEDFQLNPICPEEPPSEIGTLGTNQQCFSNITSLFQPLSS-PSA 0"
## [1] "LSDLDLETLAPYIPMDGEDFQLNPIGQEEPLPEAVALGITEYSFSNIANFFQPLTP-PPG 0"
## [1] " "
## [1] "GPFFLDKYPQQLESRKTESEHWPMSSIFFDAGSKGS-LSPCCGQASTPLSSMGGRSNTQW 0"
## [1] "GTFFLDKYPQQLESRKTESEHWPMSTIFFDAGSKGS-LPPCCGQASTPLSSMGGRSNTQW 0"
## [1] "SPFLLDKFQQQLESKKTEPEHRPMSSIFFDAGSKAS-LPPCCGQASTPLSSMGGRSNTQW 0"
## [1] "SPFLLDKFQQQLESKKTEPEHRPMSSIFFDAGSKAS-LPPCCGQASTPLSSMGGRSNTQW 0"
## [1] "SPFLLDKYQQQLGSKKIEPEHQPMSSIFFDGGNKVS-LPPCCGQASTPLSSMGGRSSTQW 0"
## [1] "STFLLDKYQQQLESKKTEPEQRRVSFAFFDGGSRVS-LLQCCGQTYTPLSSMGGISNTQW 0"
## [1] "NQFLPEKYCPQLSNKNINPGHGSLSSVFFNNMSRSS-LPPYHNQASTPLSSMGGRPNTQW 0"
## [1] "NQFLQQNICQTTAAKSNNGSQDPLPSVLFNNEKKALPLTPYHTGASPPLSSMGGRPNVQW 0"
## [1] "AHYQPKMSSGGDKQNINGGSVESWPPVPYSRD--PMQMPPYHDPASTPLSSMGGRQNLQW 0"
## [1] "AHFQPNPHSASEKQAPSNATMEPWPPIFYAS---HMPLAHHTNPESIPLASMGGHQSLQW 0"
## [1] " "
## [1] "PPDPPLHFGPTKWPVGDQSAESLGALPVGSSQLEPPSAPPHVSMFKMRSAKDFG-ARGPY 0"
## [1] "PPDPPLHFGPTKWSVGNQSAEPLGPLPLGTSQLEPPSTPPHVSMFKMRSAKDFG-ARGPY 0"
## [1] "PPDPPLHFGPTKWAVGDQRTEFLGAAPLG-----PPVSPPHVSTFKTRSAKGFG-ARGPD 0"
## [1] "PPDPPLHFGPTKWAVGDQRTEFLGAAPLG-----PPVSPPHVSTFKTRSAKGFG-ARGPD 0"
## [1] "PPDPPLHFGPTKWPVADQHTESLGPSPLG-----PPITSPHLSMFKKRSAKAFG-PQGPD 0"
## [1] "PPDPPLQLGPTKWPGEDRHAEAVGAAPLG-----LPPATPHLAMLKKRSAKGFG-PQGPD 0"
## [1] "PPDPPLEYVPSKWRLMDKYSGTLSSSPSG-----PPVRSPNMPIYKKRPLDGLG-QRGID 0"
## [1] "PPDPPVSYMPNKWRFVEQYNGSLSSLPSG-----PPVHLHNMPLYKQRSLESFG-QRGKD 0"
## [1] "PPDPPL---PSKAGMMDPLAAKRSCQGMP---------ANRMAPFMQRPMENF--VQNYR 0"
## [1] "PPDPPINYSSTKGGAIDSLVEKHSCQALQ---------TNRMSLHNQRSMEKYGPCKAYR 0"
## [1] " "
## [1] "MMSPAMIALSNKLKLKRQLEYEEQAFQDTSGGDPPG-TSSSHLMWKRMKSLMGGT-CPLM 0"
## [1] "MMSPAMIALSNKLKLKRQLDYEEPAFQDTSGGDPPG-TSSSHLMWKRMKSLMGGT-CPLM 0"
## [1] "VLSPAMVALSNKLKLKRQLEYEEQAFQDLSGGDPPG-GSTSHLMWKRMKNLRGGS-CPLM 0"
## [1] "VLSPAMVALSNKLKLKRQLEYEEQAFQDLSGGDPPG-GSTSHLMWKRMKNLRGGS-CPLM 0"
## [1] "VMSPAMVALSNKLKLKRQREYEEQAFQDLSGGDPPG-SSTSHQVWKRMKSLRGNVNCPLI 0"
## [1] "VMSPAMIALSNKLKLKRQLEYEEQAFQDMSGGDPPG-SGTSHLMWKRMKSLRGGGTCSLM 0"
## [1] "IN-PARIALSNSLKLKRQLDYEEQALQQLSGGDPSV-INPPQLMWKRMKFLKGENCSLLT 0"
## [1] "LH-PAEIPFCNSMKRKRQLDYGEQGFPQLDVGDLQGESNNPLIMWKRMKALKSGSCPLVA 0"
## [1] "DTSPARLALANSFKR---------SFSQMAMAETPP-TKSQQTVWKKLR----HESCAVM 0"
## [1] "DVSPVRLTIPNTMKR---------SFSNMSMGVSSA-TRP-AEMWKRMK----NESCAIL 0"
## [1] " "
## [1] "PDKTISANMAPDEFTQKSMRGLGQPLRHLPPPQPPSTRSSGENAKT-GFPPQCYASQFQD 0"
## [1] "PDKTVSASMAPDEFTQKSMRGLGQPLRHLPPSQPPSTRSPGENAKS-GFPPQCYASPFQD 0"
## [1] "PDKPLSANVPNDKFTQNPMRGLGHPLRHLPLPQPPSAISPGENSKS-RFPPQCYATQYQD 0"
## [1] "PDKPLSANVPNDKFTQNPMRGLGHPLRHLPLPQPPSAISPGENSKS-RFPPQCYTTQYQD 0"
## [1] "PDKLLSANIPNDEFTQNPMRGLGQPLRHLPP--PPSVMSPGENTKS-GFPPQCYAPQYQD 0"
## [1] "PDKLPNANVPNDEFIQNPVRGRSQPLRHLSPPQPPSATSPGEPTKS-GFPAQCYAPQYQD 0"
## [1] "EKKSLSTSVLTDEFVCN-SRGLSQPVNQLQQQQQSTCGSPGENLKAGAFSPQFYSPHYQD 0"
## [1] "ERKSLSTSVLNDGYVCTRHRELNQPIS--QQQQQQCVAPPRDGRKT-AYSNTFFSCSYHD 0"
## [1] "ERKSLSSSALSD-----KSMAHNGGMDHQHRKSQYSGNQNGQPTKH--YREQFCN--YRE 0"
## [1] "NRMSQSSSALTG-----EHMG------HQHRKTQNQGNQTARGKKD--YPEHCCN--YTD 0"
## [1] " "
## [1] "YGPPGAQKVSGVASRLLGPSFEPYLLPELTRYDCEVNVPVPGSSTLLQGRDLLRALDQAT 0"
## [1] "YSPPGAQKGSGVASRLLGPSFEPYLLPELTRYDCEVNVPVPGSSTLLQGRDLLRALDQAT 0"
## [1] "YSLSSAHKVSGMASRLLGPSFESYLLPELTRYDCEVNVPVLGSSTLLQGGDLLRALDQAT 0"
## [1] "YSLSSAHKVSGMASRLLGPSFESYLLPELTRYDCEVNVPVLGSSTLLQGGDLLRALDQAT 0"
## [1] "YSLPSAHKVSGMASRLLGPSFEPYLLPELTRYDCEVNVPVPGSSTLLQGGDLLRALDQAT 0"
## [1] "YSLPAAHKMSGMASRLLGPSFEPYLLPELTRYDCEVNVPVPGTSTLLQGGDLLRALDQAT 0"
## [1] "YTVQSAHKVSGVTSRLLGSSFEPYLLPELTRYDCEVNVPVLGSSTLLQGSELLRALDQAT 0"
## [1] "YNMQQTEKMKGLTSRLIAPSFEPYLLPELTRYDCEVNVPVLGSSTLLQGSDLLRALDQTT 0"
## [1] "FNMQPSSKMDGIASRLIGPSFETYSLPELTRYDCEVNVPLQGNLHLLQGSDLLRALDQAT 0"
## [1] "YNMLPNTKMEGVASRLLGPSFE-YCLPELTRYDCEVNVPLQGNLHLLQGRDLLCALDQAT 0"
## [1] " "
ggmsa::ggmsa(epas_align,
      start = 820, 
      end = 890) 

#msaPrettyPrint(epas_align,             # alignment
#               file = "epas_msa.pdf",   # file name
#               y=c(820, 890),           # range
#               askForOverwrite=FALSE)

distance matrix

epas_subset_dist <- seqinr::dist.alignment(epas_align_seqinr, 
                                       matrix = "identity")
is(epas_subset_dist)
## [1] "dist"     "oldClass"
class(epas_subset_dist)
## [1] "dist"
epas_dist <- seqinr::dist.alignment(epas_align_seqinr, 
                                       matrix = "identity")
epas_dist_rounded <- round(epas_dist,
                              digits = 2)

distance matrix for all sequences

epas_dist_rounded
##                NP_034267.3 XP_038967774.1 NP_001421.2 XP_001147219.1
## XP_038967774.1        0.25                                          
## NP_001421.2           0.34           0.36                           
## XP_001147219.1        0.34           0.36        0.03               
## XP_022280389.2        0.36           0.37        0.31           0.31
## NP_777150.1           0.40           0.41        0.35           0.35
## XP_015139104.2        0.53           0.52        0.50           0.50
## NP_001005647.1        0.59           0.59        0.58           0.58
## NP_001034895.2        0.65           0.65        0.64           0.64
## NP_001337036.1        0.65           0.65        0.65           0.65
##                XP_022280389.2 NP_777150.1 XP_015139104.2 NP_001005647.1
## XP_038967774.1                                                         
## NP_001421.2                                                            
## XP_001147219.1                                                         
## XP_022280389.2                                                         
## NP_777150.1              0.35                                          
## XP_015139104.2           0.50        0.52                              
## NP_001005647.1           0.59        0.59           0.54               
## NP_001034895.2           0.64        0.64           0.63           0.65
## NP_001337036.1           0.65        0.66           0.66           0.65
##                NP_001034895.2
## XP_038967774.1               
## NP_001421.2                  
## XP_001147219.1               
## XP_022280389.2               
## NP_777150.1                  
## XP_015139104.2               
## NP_001005647.1               
## NP_001034895.2               
## NP_001337036.1           0.55

Phylogenetic Trees

# Note - not using rounded values
tree_subset <- nj(epas_dist)
# plot tree
plot.phylo(tree_subset, main="Phylogenetic Tree", 
            use.edge.length = T)

# add label
mtext(text = "EPAS1 family gene tree - rooted")