Introduction

This code compiles summary information about the gene EGLN1 (Egl-9 Family Hypoxia Inducible Factor 1).

Links: RefSeq page: https://www.ncbi.nlm.nih.gov/gene/54583 Homologene page: https://www.ncbi.nlm.nih.gov/homologene/56936 UniProt page: https://www.uniprot.org/uniprot/F6P6J7 PDB page: NA

To compare the evolutionary relationship between the human version of EGLN1 and its homologs in other species, this code creates pairwise and multiple sequence alignments and a phylogenetic tree.

Preparation

if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

BiocManager::install("drawProteins")
## Bioconductor version 3.13 (BiocManager 1.30.16), R 4.1.2 (2021-11-01)
## Warning: package(s) not installed when version(s) same as current; use `force = TRUE` to
##   re-install: 'drawProteins'
## Old packages: 'ade4', 'aplot', 'backports', 'brio', 'broom', 'car', 'cli',
##   'conquer', 'corrplot', 'cpp11', 'crayon', 'credentials', 'crosstalk',
##   'data.table', 'desc', 'diffobj', 'digest', 'fs', 'generics', 'gert', 'glue',
##   'Hmisc', 'hms', 'htmlTable', 'knitr', 'lifecycle', 'maps', 'Matrix',
##   'matrixStats', 'memoise', 'mime', 'nloptr', 'openxlsx', 'pillar', 'pkgbuild',
##   'pkgload', 'plotly', 'rcmdcheck', 'RcppArmadillo', 'RCurl', 'readr',
##   'remotes', 'rio', 'rlang', 'rsconnect', 'S4Vectors', 'sessioninfo', 'sp',
##   'stringi', 'testthat', 'tibble', 'tidyr', 'tzdb', 'usethis', 'viridis',
##   'vroom', 'withr', 'xfun', 'XML', 'xml2', 'yulab.utils'
# installed.packages("BiocManager")
# install.packages("drawProteins")
# install.packages("HGNChelper")
library(BiocManager)
## Bioconductor version '3.13' is out-of-date; the current release version '3.14'
##   is available with R version '4.1'; see https://bioconductor.org/install
library(drawProteins)

# github packages
library(compbio4all)
library(ggmsa)
## Registered S3 methods overwritten by 'ggalt':
##   method                  from   
##   grid.draw.absoluteGrob  ggplot2
##   grobHeight.absoluteGrob ggplot2
##   grobWidth.absoluteGrob  ggplot2
##   grobX.absoluteGrob      ggplot2
##   grobY.absoluteGrob      ggplot2
# CRAN packages
library(rentrez)
library(seqinr)
library(ape)
## 
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
## 
##     as.alignment, consensus
library(pander)

library(ggplot2)
library(msa)
## Loading required package: Biostrings
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: XVector
## Loading required package: GenomeInfoDb
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
## 
##     complement
## The following object is masked from 'package:seqinr':
## 
##     translate
## The following object is masked from 'package:base':
## 
##     strsplit
## 
## Attaching package: 'msa'
## The following object is masked from 'package:BiocManager':
## 
##     version
library(drawProteins)

# Biostrings
library(Biostrings)

library(HGNChelper)

Accession Numbers

RefSeq, Refseq Homlogene, UniProt and PDB were used to obtain accession numbers.

A protein BLAST search was carried out excluding vertebrates to determine if it occurred outside of vertebrates. The gene does not appear in non-vertebrates and so a second search was conducted to exclude mammals.

Accession Number Table

Does not occur: Outside of vertebrates Drosophila Melanogaster Neanderthal

#               RefSeq        Uniprot   PDB     sci name      common name gene name
egln1_table <- c("NP_071334", "Q9GZT9", "2Y34", "Homo sapiens", "human", "EGLN1",
                 "NP_444437.2", "Q91YE3", "NA", "Mus musculus", "house mouse", "EGLN1",
                 "NP_001002595.2", "F6P6J7", "NA", "Danio rerio", "zebrafish", "egln1",
                 "XP_525092.2", "NA", "NA", "Pan troglodytes", "chimpanzee", "EGLN1",
                 "XP_001104870.1", "A0A5F8AKN1", "NA", "Macaca mulatta", "rhesus monkey", "EGLN1",
                 "XP_546089.3", "NA", "NA", "Canis lupus familiaris", "dog", "EGLN1",
                 "XP_002728657.3", "NA", "NA", "Rattus norvegicus", "Norway rat", "EGLN1",
                 "XP_020929239.1", "NA", "NA", "Sus scrofa", "Wild boar", "EGLN1",
                 "XP_026268254.1", "NA", "NA", "Urocitellus parryii", "Arctic ground squirrel", "EGLN1",
                 "XP_032955515.1", "NA", "NA", "Rhinolophus ferrumequinum", "greater horseshoe bat", "EGLN1")

Convert vector information into a table

ncbi.protein.accession <- c("NP_071334", "NP_444437.2", "NP_001002595.2", "XP_525092.2", "XP_001104870.1", "XP_546089.3", "XP_002728657.3", "XP_020929239.1", "XP_026268254.1", "XP_032955515.1")
UniProt.id <- c("Q9GZT9", "Q91YE3", "F6P6J7", "NA", "A0A5F8AKN1", "NA", "NA", "NA", "NA", "NA")
PDB <- c("2Y34", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA")
species <- c("Homo sapiens", "Mus musculus", "Danio rerio", "Pan troglodytes", "Macaca mulatta", "Canis lupus familiaris", "Rattus norvegicus", "Sus scrofa", "Urocitellus parryii", "Rhinolophus ferrumequinum")
common.name <- c("human", "house mouse", "zebrafish", "chimpanzee", "rhesus monkey", "dog", "Norway rat", "Wild boar", "Arctic ground squirrel", "greater horseshoe bat")
gene.name <- c("EGLN1", "EGLN1", "egln1", "EGLN1", "EGLN1", "EGLN1", "EGLN1", "EGLN1", "EGLN1", "EGLN1")
egln1_table1 <- data.frame(ncbi.protein.accession = ncbi.protein.accession,
                           UniProt.id = UniProt.id,
                           PDB = PDB,
                           species = species,
                           common.name = common.name,
                           gene.name = gene.name)
egln1_table1 
##    ncbi.protein.accession UniProt.id  PDB                   species
## 1               NP_071334     Q9GZT9 2Y34              Homo sapiens
## 2             NP_444437.2     Q91YE3   NA              Mus musculus
## 3          NP_001002595.2     F6P6J7   NA               Danio rerio
## 4             XP_525092.2         NA   NA           Pan troglodytes
## 5          XP_001104870.1 A0A5F8AKN1   NA            Macaca mulatta
## 6             XP_546089.3         NA   NA    Canis lupus familiaris
## 7          XP_002728657.3         NA   NA         Rattus norvegicus
## 8          XP_020929239.1         NA   NA                Sus scrofa
## 9          XP_026268254.1         NA   NA       Urocitellus parryii
## 10         XP_032955515.1         NA   NA Rhinolophus ferrumequinum
##               common.name gene.name
## 1                   human     EGLN1
## 2             house mouse     EGLN1
## 3               zebrafish     egln1
## 4              chimpanzee     EGLN1
## 5           rhesus monkey     EGLN1
## 6                     dog     EGLN1
## 7              Norway rat     EGLN1
## 8               Wild boar     EGLN1
## 9  Arctic ground squirrel     EGLN1
## 10  greater horseshoe bat     EGLN1

Data Preparation

Download Sequences

All sequences were downloaded using a wrapper compbio4all::entrez_fetch_list() which uses rentrez::entrez_fetch() to access NCBI databases.

entrez_fetch_list <- function(db, id, rettype, ...){

  #setup list for storing output
  n.seq <- length(id)
  list.output <- as.list(rep(NA, n.seq))
  names(list.output) <- id

  # get output
  for(i in 1:length(id)){
    list.output[[i]] <- rentrez::entrez_fetch(db = db,
                                              id = id[i],
                                              rettype = rettype)
  }


  return(list.output)
}
egln1_list <- entrez_fetch_list(db = "protein", 
                          id = egln1_table1$ncbi.protein.accession, 
                          rettype = "fasta")

Initial Data Cleaning

Removing FASTA header

for(i in 1:length(egln1_list)){
  egln1_list[[i]] <- compbio4all::fasta_cleaner(egln1_list[[i]], parse = F)
}

General Protein Information

Protein Diagram

# Q9GZT9
Q9GZT9  <- drawProteins::get_features("Q9GZT9")
## [1] "Download has worked"
is(Q9GZT9)
## [1] "list"             "vector"           "list_OR_List"     "vector_OR_Vector"
## [5] "vector_OR_factor"
my_prot_df1 <- drawProteins::feature_to_dataframe(Q9GZT9)
is(my_prot_df1)
## [1] "data.frame"       "list"             "oldClass"         "vector"          
## [5] "list_OR_List"     "vector_OR_Vector" "vector_OR_factor"
# Q91YE3
Q91YE3  <- drawProteins::get_features("Q91YE3")
## [1] "Download has worked"
is(Q91YE3)
## [1] "list"             "vector"           "list_OR_List"     "vector_OR_Vector"
## [5] "vector_OR_factor"
my_prot_df2 <- drawProteins::feature_to_dataframe(Q91YE3)
is(my_prot_df2)
## [1] "data.frame"       "list"             "oldClass"         "vector"          
## [5] "list_OR_List"     "vector_OR_Vector" "vector_OR_factor"
# F6P6J7
F6P6J7  <- drawProteins::get_features("F6P6J7")
## [1] "Download has worked"
is(F6P6J7)
## [1] "list"             "vector"           "list_OR_List"     "vector_OR_Vector"
## [5] "vector_OR_factor"
my_prot_df3 <- drawProteins::feature_to_dataframe(F6P6J7)
is(my_prot_df3)
## [1] "data.frame"       "list"             "oldClass"         "vector"          
## [5] "list_OR_List"     "vector_OR_Vector" "vector_OR_factor"
# Q9GZT9
Q9GZT9  <- drawProteins::get_features("Q9GZT9")
## [1] "Download has worked"
is(Q9GZT9)
## [1] "list"             "vector"           "list_OR_List"     "vector_OR_Vector"
## [5] "vector_OR_factor"
my_prot_df4 <- drawProteins::feature_to_dataframe(Q9GZT9)
is(my_prot_df4)
## [1] "data.frame"       "list"             "oldClass"         "vector"          
## [5] "list_OR_List"     "vector_OR_Vector" "vector_OR_factor"

The information available on a protein on UniProt varies a lot depending on how much its been studied. drawProteins can extract information about the following things:

domains chains regions motifs phosphorylated sites repeats and others

Looking at a dataframe to see what is available to plot.

my_prot_df3[,-2]
##                    type begin end length accession    entryName taxid order
## featuresTemp     DOMAIN    18  55     37    F6P6J7 F6P6J7_DANRE  7955     1
## featuresTemp.1   DOMAIN   229 327     98    F6P6J7 F6P6J7_DANRE  7955     1
## featuresTemp.2   REGION    40 126     86    F6P6J7 F6P6J7_DANRE  7955     1
## featuresTemp.3   REGION   336 358     22    F6P6J7 F6P6J7_DANRE  7955     1
## featuresTemp.4 COMPBIAS    40  75     35    F6P6J7 F6P6J7_DANRE  7955     1
## featuresTemp.5 COMPBIAS    76  97     21    F6P6J7 F6P6J7_DANRE  7955     1
## featuresTemp.6 COMPBIAS   104 126     22    F6P6J7 F6P6J7_DANRE  7955     1
## 1                 CHAIN     1 358    357    F6P6J7 F6P6J7_DANRE  7955     1

Domains Present

# Q9GZT9
# my_canvas1 <- draw_canvas(my_prot_df1)  
# my_canvas1 <- draw_chains(my_canvas1, my_prot_df1, 
#                          label_size = 2.5)
#m y_canvas1 <- draw_domains(my_canvas1, my_prot_df1)
# my_canvas1

# Q91YE3
# my_canvas2 <- draw_canvas(my_prot_df2)  
# my_canvas2 <- draw_chains(my_canvas2, my_prot_df2, 
#                          label_size = 2.5)
# my_canvas2 <- draw_domains(my_canvas2, my_prot_df2)
# my_canvas2

# F6P6J7
my_canvas3 <- draw_canvas(my_prot_df3)  
my_canvas3 <- draw_chains(my_canvas3, my_prot_df3, 
                         label_size = 2.5)
my_canvas3 <- draw_domains(my_canvas3, my_prot_df3)
my_canvas3

# Q9GZT9
# my_canvas4 <- draw_canvas(my_prot_df4)  
# my_canvas4 <- draw_chains(my_canvas4, my_prot_df4, 
#                          label_size = 2.5)
# my_canvas4 <- draw_domains(my_canvas4, my_prot_df4)
# my_canvas4

Draw Dotplot

# sequence 3: NP_001002595.2
egln1_daniorerio_fasta <- rentrez::entrez_fetch(db ="protein", 
                         id = "NP_001002595.2",
                         rettype = "fasta")

# converting to a vector
egln1_daniorerio_vector   <- compbio4all::fasta_cleaner(egln1_daniorerio_fasta) 
# set up 2 x 2 grid, make margins thinner
par(mfrow = c(2,2), 
    mar = c(0,0,2,1))

# plot 1: EGLN1
dotPlot(egln1_daniorerio_vector, egln1_daniorerio_vector, 
        wsize = 30, 
        nmatch = 5, 
        main = "EGLN1 30 / 5")

# plot 2 EGLN1 - size = 30, nmatch = 10
dotPlot(egln1_daniorerio_vector, egln1_daniorerio_vector, 
        wsize = 30, 
        nmatch = 10, 
        main = "EGLN1 - size = 30, nmatch = 10")

# plot 3: size = 5, nmatch = 2
dotPlot(egln1_daniorerio_vector, egln1_daniorerio_vector, 
        wsize = 5, 
        nmatch = 2, 
        main = "EGLN1 - size = 5, nmatch = 2")

# plot 4: size = 12, nmatch = 4
dotPlot(egln1_daniorerio_vector, egln1_daniorerio_vector, 
        wsize = 12,
        nmatch = 4,
        main = "EGLN1 - size = 12 , nmatch = 4")

# reset par() 
par(mfrow = c(1,1), 
    mar = c(4,4,4,4))
dotPlot(egln1_daniorerio_vector, egln1_daniorerio_vector, 
        wsize = 30, 
        nmatch = 5, 
        main = "Danio rerio: EGLN1 30 / 5")

Protein properties compiled from databases

Below are links to relevant information.

Pfam; http://pfam.xfam.org/protein/F6P6J7; entire protein is indicated to be “F6P6J7_DANRE”. zf-MYND domain from 18 to 55; 2OG-FeII_Oxy_3 domain from 233 to 326. DisProt: no information available RepeatsDB: no information available UniProt sub-cellular locations: nucleus and cytoplasm PDB secondary structural location: no PDB or CATH entry available Alphafold; (https://alphafold.ebi.ac.uk/entry/F6P6J7). The predicted structure contains alpha helices, beta sheets, and disordered regions.

Protein feature prediction

Predict protein fold

from Chou and Zhang data

# enter once
aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
            "L","K","M","F","P","S","T","W","Y","V")

# enter again
aa.1.2 <- c("A","R","N","D","C","Q","E","G","H","I",
            "L","K","M","F","P","S","T","W","Y","V")

# checking accuracy
length(aa.1.1) == length(aa.1.2)
## [1] TRUE
length(unique(aa.1.1)) == length(unique(aa.1.2))
## [1] TRUE
# alpha proteins
alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91, 
           221, 249, 48, 123, 82, 122, 119, 33, 63, 167)

# check against chou's total
sum(alpha) == 2447
## [1] TRUE
# beta proteins
beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120, 
          177, 115, 16, 85, 127, 341, 253, 44, 110, 229)

# check against chou's total
sum(beta) == 2776
## [1] TRUE
# alpha + beta
a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
              110, 112, 25, 52, 71, 126, 117, 30, 108, 123)
sum(a.plus.b) == 1889
## [1] TRUE
# alpha/beta
a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239, 
             339, 321, 91, 158, 188, 327, 238, 72, 130, 378)
sum(a.div.b) == 4333
## [1] TRUE
data.frame(aa.1.1, alpha, beta, a.plus.b, a.div.b)
##    aa.1.1 alpha beta a.plus.b a.div.b
## 1       A   285  203      175     361
## 2       R    53   67       78     146
## 3       N    97  139      120     183
## 4       D   163  121      111     244
## 5       C    22   75       74      63
## 6       Q    67  122       74     114
## 7       E   134   86       86     257
## 8       G   197  297      171     377
## 9       H   111   49       33     107
## 10      I    91  120       93     239
## 11      L   221  177      110     339
## 12      K   249  115      112     321
## 13      M    48   16       25      91
## 14      F   123   85       52     158
## 15      P    82  127       71     188
## 16      S   122  341      126     327
## 17      T   119  253      117     238
## 18      W    33   44       30      72
## 19      Y    63  110      108     130
## 20      V   167  229      123     378
alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)

#dataframe
aa.prop <- data.frame(alpha.prop,
                      beta.prop,
                      a.plus.b.prop,
                      a.div.b)
#row labels
row.names(aa.prop) <- aa.1.1

aa_pander <- pander::pander(aa.prop)

Adding my protein data

# frequency of my protein
egln1.freq.table <- table(egln1_daniorerio_vector)/length(egln1_daniorerio_vector)

# converting table into a vector
table_to_vector <- function(table_x){
  table_names <- attr(table_x, "dimnames")[[1]]
  table_vect <- as.vector(table_x)
  names(table_vect) <- table_names
  return(table_vect)
}

egln1.daniorerio.aa.freq <- table_to_vector(egln1.freq.table)

aa.names <- names(egln1.daniorerio.aa.freq)
any(aa.names == "U")
## [1] FALSE
aa.prop$egln1.daniorerio.aa.freq <- egln1.daniorerio.aa.freq
pander::pander(aa.prop)
Table continues below
  alpha.prop beta.prop a.plus.b.prop a.div.b
A 0.1165 0.07313 0.09264 0.08331
R 0.02166 0.02414 0.04129 0.03369
N 0.03964 0.05007 0.06353 0.04223
D 0.06661 0.04359 0.05876 0.05631
C 0.008991 0.02702 0.03917 0.01454
Q 0.02738 0.04395 0.03917 0.02631
E 0.05476 0.03098 0.04553 0.05931
G 0.08051 0.107 0.09052 0.08701
H 0.04536 0.01765 0.01747 0.02469
I 0.03719 0.04323 0.04923 0.05516
L 0.09031 0.06376 0.05823 0.07824
K 0.1018 0.04143 0.05929 0.07408
M 0.01962 0.005764 0.01323 0.021
F 0.05027 0.03062 0.02753 0.03646
P 0.03351 0.04575 0.03759 0.04339
S 0.04986 0.1228 0.0667 0.07547
T 0.04863 0.09114 0.06194 0.05493
W 0.01349 0.01585 0.01588 0.01662
Y 0.02575 0.03963 0.05717 0.03
V 0.06825 0.08249 0.06511 0.08724
  egln1.daniorerio.aa.freq
A 0.04469
R 0.03631
N 0.06983
D 0.08101
C 0.03073
Q 0.07821
E 0.01955
G 0.03631
H 0.07263
I 0.05587
L 0.02235
K 0.04469
M 0.05028
F 0.04469
P 0.07821
S 0.0838
T 0.05587
W 0.04749
Y 0.01397
V 0.03352

Functions to calculate similarities

# Corrleation used in Chou adn Zhange 1992.
chou_cor <- function(x,y){
  numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)
}

# Cosine similarity used in Higgs and Attwood (2005). 
chou_cosine <- function(z.1, z.2){
  z.1.abs <- sqrt(sum(z.1^2))
  z.2.abs <- sqrt(sum(z.2^2))
  my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs)
  return(my.cosine)
}

Data exploration

par(mfrow = c(2,2), mar = c(1,4,1,1))
plot(alpha.prop ~ egln1.daniorerio.aa.freq, data = aa.prop)
plot(beta.prop ~ egln1.daniorerio.aa.freq, data = aa.prop)
plot(a.plus.b.prop  ~ egln1.daniorerio.aa.freq, data = aa.prop)
plot(a.div.b ~ egln1.daniorerio.aa.freq, data = aa.prop)

Calculating correlation between each column

par(mfrow = c(1,1), mar = c(1,1,1,1))
corr.alpha <- chou_cor(aa.prop[,5], aa.prop[,1])
corr.beta  <- chou_cor(aa.prop[,5], aa.prop[,2])
corr.apb   <- chou_cor(aa.prop[,5], aa.prop[,3])
corr.adb   <- chou_cor(aa.prop[,5], aa.prop[,4])

Calculating cosine similarity

cos.alpha <- chou_cosine(aa.prop[,5], aa.prop[,1])
cos.beta  <- chou_cosine(aa.prop[,5], aa.prop[,2])
cos.apb   <- chou_cosine(aa.prop[,5], aa.prop[,3])
cos.adb   <- chou_cosine(aa.prop[,5], aa.prop[,4])

Calculating distance

aa.prop.flipped <- t(aa.prop)
round(aa.prop.flipped,2)
##                             A    R    N    D    C    Q    E    G    H    I    L
## alpha.prop               0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09
## beta.prop                0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06
## a.plus.b.prop            0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06
## a.div.b                  0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08
## egln1.daniorerio.aa.freq 0.04 0.04 0.07 0.08 0.03 0.08 0.02 0.04 0.07 0.06 0.02
##                             K    M    F    P    S    T    W    Y    V
## alpha.prop               0.10 0.02 0.05 0.03 0.05 0.05 0.01 0.03 0.07
## beta.prop                0.04 0.01 0.03 0.05 0.12 0.09 0.02 0.04 0.08
## a.plus.b.prop            0.06 0.01 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## a.div.b                  0.07 0.02 0.04 0.04 0.08 0.05 0.02 0.03 0.09
## egln1.daniorerio.aa.freq 0.04 0.05 0.04 0.08 0.08 0.06 0.05 0.01 0.03

Generating a distance matrix

dist(aa.prop.flipped, method = "euclidean")
##                          alpha.prop  beta.prop a.plus.b.prop    a.div.b
## beta.prop                0.13342098                                    
## a.plus.b.prop            0.09281824 0.08289406                         
## a.div.b                  0.06699039 0.08659174    0.06175113           
## egln1.daniorerio.aa.freq 0.16845549 0.15500920    0.14200299 0.15021813

Individal distances

dist.alpha <- dist((aa.prop.flipped[c(1,5),]),  method = "euclidean")
dist.beta  <- dist((aa.prop.flipped[c(2,5),]),  method = "euclidean")
dist.apb   <- dist((aa.prop.flipped[c(3,5),]),  method = "euclidean")
dist.adb  <- dist((aa.prop.flipped[c(4,5),]), method = "euclidean")

Compiling the information

# fold types
fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")

# data
corr.sim <- round(c(corr.alpha,corr.beta,corr.apb,corr.adb),5)
cosine.sim <- round(c(cos.alpha,cos.beta,cos.apb,cos.adb),5)
Euclidean.dist <- round(c(dist.alpha,dist.beta,dist.apb,dist.adb),5)

# summary
sim.sum <- c("","","most.sim","")
dist.sum <- c("","","min.dist","")

df <- data.frame(fold.type,
           corr.sim ,
           cosine.sim ,
           Euclidean.dist ,
           sim.sum ,
           dist.sum )

# display output
pander::pander(df)
fold.type corr.sim cosine.sim Euclidean.dist sim.sum dist.sum
alpha 0.7754 0.7754 0.1685
beta 0.8135 0.8135 0.155
alpha plus beta 0.829 0.829 0.142 most.sim min.dist
alpha/beta 0.8125 0.8125 0.1502

Percent Identity Comparisons (PID)

Data preparation

Converting all FASTA records into entries in a single vector.

names(egln1_list)
##  [1] "NP_071334"      "NP_444437.2"    "NP_001002595.2" "XP_525092.2"   
##  [5] "XP_001104870.1" "XP_546089.3"    "XP_002728657.3" "XP_020929239.1"
##  [9] "XP_026268254.1" "XP_032955515.1"
length(egln1_list)
## [1] 10
egln1_list[1]
## $NP_071334
## [1] "MANDSGGPGGPSPSERDRQYCELCGKMENLLRCSRCRSSFYCCKEHQRQDWKKHKLVCQGSEGALGHGVGPHQHSGPAPPAAVPPPRAGAREPRKAAARRDNASGDAAKGKVKAKPPADPAAAASPCRAAAGGQGSAVAAEAEPGKEEPPARSSLFQEKANLYPPSNTPGDALSPGGGLRPNGQTKPLPALKLALEYIVPCMNKHGICVVDDFLGKETGQQIGDEVRALHDTGKFTDGQLVSQKSDSSKDIRGDKITWIEGKEPGCETIGLLMSSMDDLIRHCNGKLGSYKINGRTKAMVACYPGNGTGYVRHVDNPNGDGRCVTCIYYLNKDWDAKVSGGILRIFPEGKAQFADIEPKFDRLLFFWSDRRNPHEVQPAYATRYAITVWYFDADERARAKVKYLTGEKGVRVELNKPSDSVGKDVF"

Making each entry of the list into a vector.

# naming the vector
egln_vector <- rep(NA, length(egln1_list))

# the loop function
for(i in 1:length(egln_vector)){
  egln_vector[i] <- egln1_list[[i]]
}

names(egln_vector) <- names(egln1_list)

egln1_human_vector <- egln_vector[1]
egln1_mouse_vector <- egln_vector[2]
egln1_zfish_vector <- egln_vector[3]
egln1_chimp_vector <- egln_vector[4]

PID table

Doing pairwise alignments for humans, chimps, mice, and zebrafish

data(BLOSUM50)
globalAlignh.v.c <- pairwiseAlignment(egln1_human_vector, egln1_chimp_vector,
                                     substitutionMatrix = "BLOSUM50", 
                                     gapOpening = -2,
                                     gapExtension = -8, 
                                     scoreOnly = FALSE)
globalAlignh.v.m <- pairwiseAlignment(egln1_human_vector, egln1_mouse_vector,
                                     substitutionMatrix = "BLOSUM50", 
                                     gapOpening = -2,
                                     gapExtension = -8, 
                                     scoreOnly = FALSE)
globalAlignh.v.z <- pairwiseAlignment(egln1_human_vector, egln1_zfish_vector,
                                     substitutionMatrix = "BLOSUM50", 
                                     gapOpening = -2,
                                     gapExtension = -8, 
                                     scoreOnly = FALSE)
globalAlignsc.v.m <- pairwiseAlignment(egln1_chimp_vector, egln1_mouse_vector,
                                     substitutionMatrix = "BLOSUM50", 
                                     gapOpening = -2,
                                     gapExtension = -8, 
                                     scoreOnly = FALSE)
globalAlignc.v.z <- pairwiseAlignment(egln1_chimp_vector, egln1_zfish_vector,
                                     substitutionMatrix = "BLOSUM50", 
                                     gapOpening = -2,
                                     gapExtension = -8, 
                                     scoreOnly = FALSE)
globalAlignm.v.z <- pairwiseAlignment(egln1_mouse_vector, egln1_zfish_vector,
                                     substitutionMatrix = "BLOSUM50", 
                                     gapOpening = -2,
                                     gapExtension = -8, 
                                     scoreOnly = FALSE)
Biostrings::pid(globalAlignh.v.c)
## [1] 77.2093
Biostrings::pid(globalAlignh.v.m)
## [1] 79.76471
Biostrings::pid(globalAlignh.v.z)
## [1] 61.70213
Biostrings::pid(globalAlignsc.v.m)
## [1] 68.72038
Biostrings::pid(globalAlignc.v.z)
## [1] 55.47619
Biostrings::pid(globalAlignm.v.z)
## [1] 64.67662

Building a matrix

pids <- c(1,                      NA,                   NA,                   NA,
          pid(globalAlignh.v.c),   1,                   NA,                   NA,
          pid(globalAlignh.v.m), pid(globalAlignsc.v.m), 1,                   NA,
          pid(globalAlignh.v.z), pid(globalAlignc.v.z), pid(globalAlignm.v.z), 1)

mat <- matrix(pids, nrow = 4, byrow = T)
row.names(mat) <- c("Homo","Chimp","Mouse","Fish")   
colnames(mat) <- c("Homo","Chimp","Mouse","Fish")   
pander::pander(mat)  
  Homo Chimp Mouse Fish
Homo 1 NA NA NA
Chimp 77.21 1 NA NA
Mouse 79.76 68.72 1 NA
Fish 61.7 55.48 64.68 1

PID methods comparison

Comparing different PID methods (humans vs. chimps)

PID <- c(pid(globalAlignh.v.c, type = "PID1"), pid(globalAlignh.v.c, type = "PID2"), pid(globalAlignh.v.c, type = "PID3"), pid(globalAlignh.v.c, type = "PID4"))
method <- c("PID1", "PID2", "PID3", "PID4")
denominator <- c("(aligned positions + internal gap positions)", "(aligned positions)", "(length shorter sequence)", "(average length of the two sequences)")

comparePID_table <- data.frame(method = method,
                               PID = PID,
                               denominator = denominator)
comparePID_table
##   method      PID                                  denominator
## 1   PID1 77.20930 (aligned positions + internal gap positions)
## 2   PID2 79.42584                          (aligned positions)
## 3   PID3 78.67299                    (length shorter sequence)
## 4   PID4 78.30189        (average length of the two sequences)

Multiple Sequence Alignment

MSA data preparation

Converting our named vector to a string set

egln_vector_ss <- Biostrings::AAStringSet(egln_vector)
egln_vector_ss
## AAStringSet object of length 10:
##      width seq                                              names               
##  [1]   426 MANDSGGPGGPSPSERDRQYCEL...TGEKGVRVELNKPSDSVGKDVF NP_071334
##  [2]   400 MASDSGGPGVLSASERDRQYCEL...KYLTGEKGVRVELKPNSVSKDV NP_444437.2
##  [3]   358 MEGNSRESERLERERQFCELCGK...KYSTSSGERGVKVELNKPSEPS NP_001002595.2
##  [4]   422 MYKEKHIAHVCIWQNPQYRDKQD...TGEKGVRVELNKPSDSVSKDVF XP_525092.2
##  [5]   426 MANDSGGPGGPSPSERDRQYCEL...TGEKGVRVELNKPSDSVGKDVF XP_001104870.1
##  [6]   386 MVFQGGEGAPGSGAAQQWHPVPV...TGEKGVRVELNKPSDSISKDVL XP_546089.3
##  [7]   388 MNKHGICVVDDFLGRETGQQIGD...KYLTGEKGVRVELKPNSVSKDV XP_002728657.3
##  [8]   427 MANDSGGPGGPSPSERDRQYCEL...TGEKGVRVELNKPSDSVSKDVL XP_020929239.1
##  [9]   413 MASDSGGPGGPSASERDRQYCEL...LTGEKGVRVELNKPSDPVGKDV XP_026268254.1
## [10]   427 MANDSGGPGGPSPSERDRQYCEL...TGEKGVRVELNKPSDSISKDVL XP_032955515.1

Building Multiple Sequence Alignment (MSA)

This chunk performs the multiple sequence alignment on the dataset and puts it on the object “shrooms_align”

egln1_align <-  msa  (egln_vector_ss,
                       method = "ClustalW")
## use default substitution matrix
egln1_align
## CLUSTAL 2.1  
## 
## Call:
##    msa(egln_vector_ss, method = "ClustalW")
## 
## MsaAAMultipleAlignment with 10 rows and 439 columns
##      aln                                                   names
##  [1] -MASDSGGPGVLSASERDRQYCELC...LT--GEKGVRVELKPNS--VSKDV- NP_444437.2
##  [2] ---MNKHGICVVDDFLGRETGQQIG...LT--GEKGVRVELKPNS--VSKDV- XP_002728657.3
##  [3] -MASDSGGPGGPSASERDRQYCELC...LT--GEKGVRVELNKPSDPVGKDV- XP_026268254.1
##  [4] -MANDSGGPGGPSPSERDRQYCELC...LT--GEKGVRVELNKPSDSVGKDVF NP_071334
##  [5] -MANDSGGPGGPSPSERDRQYCELC...LT--GEKGVRVELNKPSDSVGKDVF XP_001104870.1
##  [6] MYKEKHIAHVCIWQNPQYRDKQDRN...LT--GEKGVRVELNKPSDSVSKDVF XP_525092.2
##  [7] ----------MVFQGG---------...LT--GEKGVRVELNKPSDSISKDVL XP_546089.3
##  [8] -MANDSGGPGGPSPSERDRQYCELC...LT--GEKGVRVELNKPSDSVSKDVL XP_020929239.1
##  [9] -MANDSGGPGGPSPSERDRQYCELC...LT--GEKGVRVELNKPSDSISKDVL XP_032955515.1
## [10] ----MEGNSRESERLERERQFCELC...STSSGERGVKVELNKPSEPS----- NP_001002595.2
##  Con -MA?DSGGPGGPS?SERDRQYCELC...LT--GEKGVRVELNKPSDSVSKDV? Consensus

Cleaning / Setting up an MSA

msa produces a species MSA objects

class(egln1_align)
## [1] "MsaAAMultipleAlignment"
## attr(,"package")
## [1] "msa"
is(egln1_align)
## [1] "MsaAAMultipleAlignment" "AAMultipleAlignment"    "MsaMetaData"           
## [4] "MultipleAlignment"

Changing class of alignment, converting to seqinr format, and showing output with print_msa

class(egln1_align) <- "AAMultipleAlignment"
egln1_align_seqinr <- msaConvert(egln1_align, type = "seqinr::alignment")

print_msa(alignment = egln1_align_seqinr, 
          chunksize = 60)
## [1] "-MASDSGGPGVLSASERDRQYCELCGKMENLLRCGRCRS-SFYCCKEHQRQDWKKHKLVC 0"
## [1] "---MNKHGICVVDDFLGRETGQQIGDEVRALHDTGKFTDGQLVSQKSDSSKDIRGDKITW 0"
## [1] "-MASDSGGPGGPSASERDRQYCELCGKMENLLRCGRCRS-SFYCCKEHQRQDWKKHKLVC 0"
## [1] "-MANDSGGPGGPSPSERDRQYCELCGKMENLLRCSRCRS-SFYCCKEHQRQDWKKHKLVC 0"
## [1] "-MANDSGGPGGPSPSERDRQYCELCGKMENLLRCSRCRS-SFYCCKEHQRQDWKKHKLVC 0"
## [1] "MYKEKHIAHVCIWQNPQYRDKQDRNSHLATLTGASLAHP-APAWRKSSHDYAAFAPLPAC 0"
## [1] "----------MVFQGG---------------EGAPGSGA-AQQWH----------PVPVL 0"
## [1] "-MANDSGGPGGPSPSERDRQYCELCGKMENLLRCGRCRS-SFYCSKEHQRQDWKKHKLVC 0"
## [1] "-MANDSGGPGGPSPSERDRQYCELCGKMENLLRCSRCRS-SFYCSKEHQRQDWKKHKLVC 0"
## [1] "----MEGNSRESERLERERQFCELCGKMENLMKCGRCRS-SFYCSKEHQRQDWKKHKRVC 0"
## [1] " "
## [1] "QGGE---------------APRAQPAPAQPRVAPPPGGAPGAARAGGAARRGDS-----A 0"
## [1] "IEGK---------------EPGWRPS------------APGAARAGGAARRGDSS----T 0"
## [1] "QGGDPAGPAAA-----PRAQPGPAPPAAAPPTRAGAAAGNAGARAGKAAGQRQGS----A 0"
## [1] "QGSEGALGHGVGPHQHSGPAPPAAVPPPRAGAREPRKAAARRDNASGDAAKGKVKAKPPA 0"
## [1] "QGSESALGPGVGPHQHSGPAPPVAAPPPRAGAREPRKAAARRDNASGDAAKAKAKGKPPA 0"
## [1] "GWRPPPAGVGAAALP---PRLFRRFAGPCALGVPRRRPAAGPKAPGKRPGKSTVK--PPA 0"
## [1] "AASAPPPGEAREARE---PREARKAAP----RRDSAAESAGPRAAG-DAAKAKAK--PAA 0"
## [1] "QGGEGAPAPGAGQQPQPGPAPLLRKAVPRRDRVAAVEAAGPRAAGDAAKAKAKAKP--AA 0"
## [1] "QRGDGAPGPGASQHPHPGPAPPAAAPPPRATGAKEARTAAPRRDSAAAGDAADAK----A 0"
## [1] "KEAD-----------------------------------KQQQPVGEEEEEQPSD----T 0"
## [1] " "
## [1] "AASRVPGPEDAAQARSGPGPAEPGS--------EDPPLSRSPGP----ERASLCPAGGGP 0"
## [1] "AASRVPGPEDATQAGSGPGPAEPSS--------EDPPPSRSPGP----ERASLCPAGGGP 0"
## [1] "EAARAPAPGDAAQARGPRGAAECGQ--------EEPPARRSPCP----ERASLCPAGSSP 0"
## [1] "DPAAAASPCRAAAGGQGSAVAAEAE-----PGKEEPPARSSLFQ----EKANLYPPSNTP 0"
## [1] "DPAAAASPSRAAPGGQGSAVAAEAE-----PGKEEPPARSSLFQ----EKANLYPPSNTP 0"
## [1] "DPAAAASPSRAAAGGQGSAVAAEAE-----PGKEEPPARSSLFQ----EKANLYPPSNTP 0"
## [1] "DPAAAASPPRAAAGGQGPAAAAAAA-----AAAACAAAEAEPAKEEDPEKTNLYPPSGTP 0"
## [1] "DPAAAASPPRAAPGGAGPAAAAAAA--EAEPAKEEPPARSSLYQ----EKANLYPPSNTP 0"
## [1] "KPAAAASPPRAAPGGQGPAAAAATATAEAEPAKEEPLGRSSLFQ----DKANLYPPGNTP 0"
## [1] "SKQSSTSQSNSTLTSQD-------------------------------EKMPDFIKSATG 0"
## [1] " "
## [1] "GEALSPGGGLRPNGQTKPLPALKLALEYIVPCMNKHGICVVDDFLGRETGQQIGDEVRAL 0"
## [1] "GEALSPSGGLRPNGQTKPLPALKLALEYIVPCMNKHGICVVDDFLGRETGQQIGDEVRAL 0"
## [1] "GEALSPGGGLRPNGQTKPLPALKLALEYIVPCMNKHGICVVDDFLGRETGQQIGDEVRAL 0"
## [1] "GDALSPGGGLRPNGQTKPLPALKLALEYIVPCMNKHGICVVDDFLGKETGQQIGDEVRAL 0"
## [1] "GDALSPSGGLRPNGQTKPLPALKLALEYIVPCMNKHGICVVDDFLGKETGQQIGDEVRAL 0"
## [1] "GDALSPSGGLRPNGQTKPLPALKLALEYIVPCMNKHGICVVDDFLGKETGQQIGDEVRAL 0"
## [1] "GEGLSPGGGLRPNGQTKPLPALKLALEYIVPCMNKHGICVVDDFLGKETGQQIGDEVRAL 0"
## [1] "GEGLSHGGGLRPNGQTKPLPALKLALEYIVPCMNKHGICVVDDFLGKETGQQIGDEVRAL 0"
## [1] "GEALSHGGGPRPNGQTKPLPALKLALEYIVPCMNKHGICVVDDFLGKETGQQIGEEVRAL 0"
## [1] "SDTKPTPDSSKPNGQTR-SPPQKLATDYIVPCMNKHGICVVDNFLGNEVGRSILEDVRAL 0"
## [1] " "
## [1] "HDTGKFTDGQLVSQKSDSSKDIRGDQITWIEGKEPGCETIGLLMSSMDDLIRHCSGKLGN 0"
## [1] "HDTGKFTDGQLVSQKSDSSKDIRGDKITWIEGKEPGCETIGLLMSSMDDLIRHCSGKLGN 0"
## [1] "HDTGKFTDGQLVSQKSDSSKDIRGDKITWIEGKEPGCETIGLLMSSMDDLIRHCNGKLGN 0"
## [1] "HDTGKFTDGQLVSQKSDSSKDIRGDKITWIEGKEPGCETIGLLMSSMDDLIRHCNGKLGS 0"
## [1] "HDTGKFTDGQLVSQKSDSSKDIRGDKITWIEGKEPGCETIGLLMSSMDDLIRHCNGKLGS 0"
## [1] "HDTGKFTDGQLVSQKSDSSKDIRGDKITWIEGKEPGCETIGLLMSSMDDLIRHCNGKLGS 0"
## [1] "HDTGKFTDGQLVSQKSDSSKDIRGDKITWIEGKEPGCETIGLLMSSMDDLIRHCNGKLGN 0"
## [1] "HDTGKFTDGQLVSQKSDSSKDIRGDKITWIEGKEPGCETIGLLMSSMDDLIRHCNGKLGN 0"
## [1] "HDTGKFTDGQLVSQKSDSSKDIRGDKITWIEGKEPGCETIGLLMSSMDDLIRHCNGKLGN 0"
## [1] "YLTGGFTDGQLVSQRSDSSKDIRGDKITWVEGKEPGCERIAFLMSRMDDLIRHCNGKLGN 0"
## [1] " "
## [1] "YRINGRTKAMVACYPGNGTGYVRHVDNPNGDGRCVTCIYYLNKDWDAKVSGGILRIFPEG 0"
## [1] "YRINGRTKAMVACYPGNGTGYVRHVDNPNGDGRCVTCIYYLNKDWDAKVSGGILRIFPEG 0"
## [1] "YKINGRTKAMVACYPGNGTGYVRHVDNPNGDGRCVTCIYYLNKDWDAKVSGGILRIFPEG 0"
## [1] "YKINGRTKAMVACYPGNGTGYVRHVDNPNGDGRCVTCIYYLNKDWDAKVSGGILRIFPEG 0"
## [1] "YKINGRTKAMVACYPGNGTGYVRHVDNPNGDGRCVTCIYYLNKDWDAKVSGGILRIFPEG 0"
## [1] "YKINGRTKAMVACYPGNGTGYVRHVDNPNGDGRCVTCIYYLNKDWDAKVSGGILRIFPEG 0"
## [1] "YKINGRTKAMVACYPGNGTGYVRHVDNPNGDGRCVTCIYYLNKDWDAKVSGGILRIFPEG 0"
## [1] "YKINGRTKAMVACYPGNGTGYVRHVDNPNGDGRCVTCIYYLNKDWDAKVSGGILRIFPEG 0"
## [1] "YKINGRTKAMVACYPGNGTGYVRHVDNPNGDGRCVTCIYYLNKDWDAKVSGGILRIFPEG 0"
## [1] "YRINGRTKAMVACYPGNGTGYVRHVDNPNGDGRCVTCIYYLNKDWDATEHGGLLRIFPEG 0"
## [1] " "
## [1] "KAQFADIEPKFDRLLFFWSDRRNPHEVQPAYATRYAITVWYFDADERARAKVKYLT--GE 0"
## [1] "KAQFADIEPKFDRLLFFWSDRRNPHEVQPAYATRYAITVWYFDADERARAKVKYLT--GE 0"
## [1] "KAQFADIEPKFDRLLFFWSDRRNPHEVQPAYATRYAITVWYFDADERARAKVKYLT--GE 0"
## [1] "KAQFADIEPKFDRLLFFWSDRRNPHEVQPAYATRYAITVWYFDADERARAKVKYLT--GE 0"
## [1] "KAQFADIEPKFDRLLFFWSDRRNPHEVQPAYATRYAITVWYFDADERARAKVKYLT--GE 0"
## [1] "KAQFADIEPKFDRLLFFWSDRRNPHEVQPAYATRYAITVWYFDADERARAKVKYLT--GE 0"
## [1] "KAQFADIEPKFDRLLFFWSDRRNPHEVQPAYSTRYAITVWYFDADERARAKVKYLT--GE 0"
## [1] "KAQFADIEPKFDRLLFFWSDRRNPHEVQPAYATRYAITVWYFDADERARAKVKYLT--GE 0"
## [1] "KAQFADIEPKFDRLLFFWSDRRNPHEVQPAYATRYAITVWYFDADERARAKVKYLT--GE 0"
## [1] "TAQFADIEPKFDRLLLFWSDRRNPHEVQPAYATRYAITVWYFDADERARAKEKYSTSSGE 0"
## [1] " "
## [1] "KGVRVELKPNS--VSKDV- 41"
## [1] "KGVRVELKPNS--VSKDV- 41"
## [1] "KGVRVELNKPSDPVGKDV- 41"
## [1] "KGVRVELNKPSDSVGKDVF 41"
## [1] "KGVRVELNKPSDSVGKDVF 41"
## [1] "KGVRVELNKPSDSVSKDVF 41"
## [1] "KGVRVELNKPSDSISKDVL 41"
## [1] "KGVRVELNKPSDSVSKDVL 41"
## [1] "KGVRVELNKPSDSISKDVL 41"
## [1] "RGVKVELNKPSEPS----- 41"
## [1] " "

Finished MSA

ggmsa(egln1_align,   start = 15, end = 65)

Distance matrix

Make a distance matrix

egln1_dist <- seqinr::dist.alignment(egln1_align_seqinr, 
                                       matrix = "identity")
is(egln1_dist)
## [1] "dist"     "oldClass"
class(egln1_dist)
## [1] "dist"
egln1_align_seqinr_rnd <- round(egln1_dist, 3)
egln1_align_seqinr_rnd
##                NP_444437.2 XP_002728657.3 XP_026268254.1 NP_071334
## XP_002728657.3       0.407                                        
## XP_026268254.1       0.361          0.495                         
## NP_071334            0.433          0.552          0.432          
## XP_001104870.1       0.442          0.552          0.435     0.153
## XP_525092.2          0.574          0.555          0.569     0.480
## XP_546089.3          0.518          0.513          0.516     0.472
## XP_020929239.1       0.436          0.545          0.429     0.357
## XP_032955515.1       0.447          0.557          0.443     0.358
## NP_001002595.2       0.563          0.636          0.556     0.553
##                XP_001104870.1 XP_525092.2 XP_546089.3 XP_020929239.1
## XP_002728657.3                                                      
## XP_026268254.1                                                      
## NP_071334                                                           
## XP_001104870.1                                                      
## XP_525092.2             0.480                                       
## XP_546089.3             0.469       0.449                           
## XP_020929239.1          0.350       0.512       0.452               
## XP_032955515.1          0.351       0.522       0.470          0.318
## NP_001002595.2          0.551       0.629       0.593          0.551
##                XP_032955515.1
## XP_002728657.3               
## XP_026268254.1               
## NP_071334                    
## XP_001104870.1               
## XP_525092.2                  
## XP_546089.3                  
## XP_020929239.1               
## XP_032955515.1               
## NP_001002595.2          0.551

Phylogenetic trees of sequences

Building a phylogenetic tree from the distance matrix

egln1_tree <- nj(egln1_dist)
# plot tree
plot.phylo(egln1_tree, main="Phylogenetic Tree", 
            use.edge.length = F)
# add label
mtext(text = "EGLN1 family gene tree - rooted, no branch lengths")