##Introduction

Type 3 oculocutaneous albinism (OCA3) is a rare genetic disease linked to the EMC2 Gene. This code compiles summary information about the gene EMC2 (ER membrane protein complex subunit 2).

It also generates alignments and a phylogeneitc tree to indicating the evolutionary relationship betweeen the human version of the gene and its homologs in other species.

##Resources / References Key information use to make this script can be found here:

RefSeq Gene: https://www.ncbi.nlm.nih.gov/gene/9694 RefSeq Homologene: https://www.ncbi.nlm.nih.gov/homologene/8785 BLAST: https://blast.ncbi.nlm.nih.gov/Blast.cgi PBD: https://www.rcsb.org/structure/6WW7

##Preparation Load packages:

# github packages
library(compbio4all)
library(ggmsa)
## Registered S3 methods overwritten by 'ggalt':
##   method                  from   
##   grid.draw.absoluteGrob  ggplot2
##   grobHeight.absoluteGrob ggplot2
##   grobWidth.absoluteGrob  ggplot2
##   grobX.absoluteGrob      ggplot2
##   grobY.absoluteGrob      ggplot2
# CRAN packages
library(rentrez)
library(seqinr)
library(ape)
## 
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
## 
##     as.alignment, consensus
library(pander)
library(ggplot2)

#Bioconductor packages 
library(BiocManager)
## Bioconductor version '3.13' is out-of-date; the current release version '3.14'
##   is available with R version '4.1'; see https://bioconductor.org/install
library(drawProteins)
library(msa)
## Loading required package: Biostrings
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: XVector
## Loading required package: GenomeInfoDb
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
## 
##     complement
## The following object is masked from 'package:seqinr':
## 
##     translate
## The following object is masked from 'package:base':
## 
##     strsplit
## 
## Attaching package: 'msa'
## The following object is masked from 'package:BiocManager':
## 
##     version
#BioStrings Packages
library(Biostrings)
library(HGNChelper)

##Accession Numbers

Accession numbers were obtained from RefSeq, Refseq Homlogene, UniProt and PDB. UniProt accession numbers can be found by searching for the gene name. PDB accessions can be found by searching with a UniProt accession or a gene name. The Neanderthal database was searched and yielded no results. 7 Organisim Accession information was obtained via Refseq Homolgene, 3 were obtained via a Blast search on EMC2.

##Accession Number Table

#human
#chimpanzee
#Rhesus monkey
#zebrafish
#tropical clawed frog
#chicken
#Norway rat

#BLAST
#meerkat
#Southern white rhinoceros
#Wild boar

RefSeq.id <-c("NP_055488.1","XP_001135909.1","XP_001090826.1","XP_850261.1","NP_001073796.1","NP_080012.1","NP_001107257.1","XP_029779579.1","XP_004431318.2","XP_020944679.1")
UniProt.id <-c("Q15006","H2QWK7","I0FH38","Q6TGY8","Q5M7J9","A0A1D5NVZ7","B0BNG0","NA","NA","NA")
PDB.id <-c("6WW7","NA","NA","NA","NA","NA","NA","NA","NA","NA")
Scientific.Name <-c("Homo sapiens","Pan troglodytes","Macaca mulatta","Danio rerio","Xenopus tropicalis","Gallus gallus","Rattus norvegicus","Suricata suricatta","Ceratotherium simum simum","Sus scrof")
Common.Name<-c("human","chimpanzee","Rhesus monkey","zebrafish","tropical clawed frog","chicken","Norway rat","Meerkat","Southern white rhinoceros","Wild Boar")
Gene.Name <-c("EMC2","EMC2","EMC2","EMC2","EMC2","Emc2","EMC2","EMC2","EMC2","EMC2")

EMC2.Table <- data.frame(RefSeq.id=RefSeq.id, 
                         UniProt.id=UniProt.id,
                         PDB.id=PDB.id,
                         Scientific.Name=Scientific.Name,
                         Common.Name=Common.Name,
                         Gene.Name=Gene.Name)

pander::pander(EMC2.Table)
Table continues below
RefSeq.id UniProt.id PDB.id Scientific.Name
NP_055488.1 Q15006 6WW7 Homo sapiens
XP_001135909.1 H2QWK7 NA Pan troglodytes
XP_001090826.1 I0FH38 NA Macaca mulatta
XP_850261.1 Q6TGY8 NA Danio rerio
NP_001073796.1 Q5M7J9 NA Xenopus tropicalis
NP_080012.1 A0A1D5NVZ7 NA Gallus gallus
NP_001107257.1 B0BNG0 NA Rattus norvegicus
XP_029779579.1 NA NA Suricata suricatta
XP_004431318.2 NA NA Ceratotherium simum simum
XP_020944679.1 NA NA Sus scrof
Common.Name Gene.Name
human EMC2
chimpanzee EMC2
Rhesus monkey EMC2
zebrafish EMC2
tropical clawed frog EMC2
chicken Emc2
Norway rat EMC2
Meerkat EMC2
Southern white rhinoceros EMC2
Wild Boar EMC2

##Data Prepartation

EMC2_list <- compbio4all::entrez_fetch_list(db="protein", id=RefSeq.id, rettype="fasta")
length(EMC2_list)
## [1] 10
for(i in 1:length(EMC2_list)){
  EMC2_list[[i]] <- compbio4all::fasta_cleaner(EMC2_list[[i]], parse = F)
}

##General Protein Information protein diagram via draw proteins

Q15006_json  <- drawProteins::get_features("Q15006")
## [1] "Download has worked"
my_prot_df <- drawProteins::feature_to_dataframe(Q15006_json)


#Protein Regions
my_canvas <- draw_canvas(my_prot_df)
my_canvas <- draw_chains(my_canvas, my_prot_df, label_size = 2.5)
my_canvas <- draw_regions(my_canvas, my_prot_df)
my_canvas <- draw_folding(my_canvas, my_prot_df)
my_canvas

#Repeats
my_canvas <- draw_canvas(my_prot_df)
my_canvas <- draw_chains(my_canvas, my_prot_df, label_size = 2.5)

my_canvas <- draw_repeat(my_canvas, my_prot_df)
my_canvas

Q15006_FASTA <- rentrez::entrez_fetch(id = "Q15006",
                                      db = "protein", 
                                      rettype="fasta")
Q15006_vector <- fasta_cleaner(Q15006_FASTA)
par(mfrow = c(2,2), 
    mar = c(0,0,2,1))

# plot 1: Defaults
dotPlot(Q15006_vector, Q15006_vector, 
        wsize = 1, 
        nmatch = 1, 
        main = "")

# plot 2 size = 10, nmatch = 1
dotPlot(Q15006_vector, Q15006_vector, 
        wsize = 10, 
        nmatch = 1, 
        main = "")

# plot 3: size = 10, nmatch = 5
dotPlot(Q15006_vector, Q15006_vector, 
        wsize = 10, 
        nmatch = 5, 
        main = "")

# plot 4: size = 20, nmatch = 5
dotPlot(Q15006_vector, Q15006_vector, 
        wsize = 20,
        nmatch = 5,
        main = "")

# reset par() - run this or other plots will be small!
par(mfrow = c(1,1), 
    mar = c(4,4,4,4))
dotPlot(Q15006_vector, Q15006_vector, 
        wsize = 20,
        nmatch = 5,
        main = "")

#Protein properties compiled from databases

Pfam: http://pfam.xfam.org/protein/Q15006 Nothing really interesting to plot but the page did exist Disprot: No information found RepeatsDB: No information found

Information for major Protein properties was not adundant enough to compile anything interesting for this section.

##Secondary Protein Properties Corrleation used in Chou adn Zhange 1992. Cosine similarity used in Higgs and Attwood (2005). 4 correlations investigated for protein prediction.

# enter once
aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
            "L","K","M","F","P","S","T","W","Y","V")

# enter again
aa.1.2 <- c("A","R","N","D","C","Q","E","G","H","I",
            "L","K","M","F","P","S","T","W","Y","V")
alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91, 
           221, 249, 48, 123, 82, 122, 119, 33, 63, 167)
beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120, 
          177, 115, 16, 85, 127, 341, 253, 44, 110, 229)
a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
              110, 112, 25, 52, 71, 126, 117, 30, 108, 123)
a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239, 
             339, 321, 91, 158, 188, 327, 238, 72, 130, 378)
data.frame(aa.1.1, alpha, beta, a.plus.b, a.div.b)
##    aa.1.1 alpha beta a.plus.b a.div.b
## 1       A   285  203      175     361
## 2       R    53   67       78     146
## 3       N    97  139      120     183
## 4       D   163  121      111     244
## 5       C    22   75       74      63
## 6       Q    67  122       74     114
## 7       E   134   86       86     257
## 8       G   197  297      171     377
## 9       H   111   49       33     107
## 10      I    91  120       93     239
## 11      L   221  177      110     339
## 12      K   249  115      112     321
## 13      M    48   16       25      91
## 14      F   123   85       52     158
## 15      P    82  127       71     188
## 16      S   122  341      126     327
## 17      T   119  253      117     238
## 18      W    33   44       30      72
## 19      Y    63  110      108     130
## 20      V   167  229      123     378
alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)
aa.prop <- data.frame(alpha.prop,
                      beta.prop,
                      a.plus.b.prop,
                      a.div.b)
#row labels
row.names(aa.prop) <- aa.1.1
# check against chou's total
EMC2_Human <- rentrez::entrez_fetch(id = "NP_055488.1",
                                      db = "protein", 
                                      rettype="fasta")
Human_vector <- compbio4all::fasta_cleaner(EMC2_Human)
EMC2_H.freq.table <- table(Human_vector)/length(Human_vector)
table_to_vector <- function(table_x){
  table_names <- attr(table_x, "dimnames")[[1]]
  table_vect <- as.vector(table_x)
  names(table_vect) <- table_names
  return(table_vect)
}
EMC2.human.aa.freq <- table_to_vector(EMC2_H.freq.table)
aa.prop$EMC2.human.aa.freq <- EMC2.human.aa.freq


chou_cor <- function(x,y){
  numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)
}
chou_cosine <- function(z.1, z.2){
  z.1.abs <- sqrt(sum(z.1^2))
  z.2.abs <- sqrt(sum(z.2^2))
  my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs)
  return(my.cosine)
}
par(mfrow = c(2,2), mar = c(1,4,1,1))
plot(alpha.prop ~ EMC2.human.aa.freq, data = aa.prop)
plot(beta.prop ~ EMC2.human.aa.freq, data = aa.prop)
plot(a.plus.b.prop  ~ EMC2.human.aa.freq, data = aa.prop)
plot(a.div.b ~ EMC2.human.aa.freq, data = aa.prop)

Table providing Cosine similarity, Eeclidian distance, and column correlation.

par(mfrow = c(1,1), mar = c(1,1,1,1))

corr.alpha <- chou_cor(aa.prop[,5], aa.prop[,1])
corr.beta  <- chou_cor(aa.prop[,5], aa.prop[,2])
corr.apb   <- chou_cor(aa.prop[,5], aa.prop[,3])
corr.adb   <- chou_cor(aa.prop[,5], aa.prop[,4])

cos.alpha <- chou_cosine(aa.prop[,5], aa.prop[,1])
cos.beta  <- chou_cosine(aa.prop[,5], aa.prop[,2])
cos.apb   <- chou_cosine(aa.prop[,5], aa.prop[,3])
cos.adb   <- chou_cosine(aa.prop[,5], aa.prop[,4])

aa.prop.flipped <- t(aa.prop)
round(aa.prop.flipped,2)
##                       A    R    N    D    C    Q    E    G    H    I    L    K
## alpha.prop         0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09 0.10
## beta.prop          0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06 0.04
## a.plus.b.prop      0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06 0.06
## a.div.b            0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08 0.07
## EMC2.human.aa.freq 0.10 0.01 0.05 0.10 0.03 0.04 0.02 0.05 0.07 0.09 0.04 0.05
##                       M    F    P    S    T    W    Y    V
## alpha.prop         0.02 0.05 0.03 0.05 0.05 0.01 0.03 0.07
## beta.prop          0.01 0.03 0.05 0.12 0.09 0.02 0.04 0.08
## a.plus.b.prop      0.01 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## a.div.b            0.02 0.04 0.04 0.08 0.05 0.02 0.03 0.09
## EMC2.human.aa.freq 0.01 0.06 0.07 0.05 0.03 0.03 0.02 0.06
dist(aa.prop.flipped, method = "euclidean")
##                    alpha.prop  beta.prop a.plus.b.prop    a.div.b
## beta.prop          0.13342098                                    
## a.plus.b.prop      0.09281824 0.08289406                         
## a.div.b            0.06699039 0.08659174    0.06175113           
## EMC2.human.aa.freq 0.12451288 0.16159692    0.12800446 0.12645889
dist.alpha <- dist((aa.prop.flipped[c(1,5),]),  method = "euclidean")
dist.beta  <- dist((aa.prop.flipped[c(2,5),]),  method = "euclidean")
dist.apb   <- dist((aa.prop.flipped[c(3,5),]),  method = "euclidean")
dist.adb  <- dist((aa.prop.flipped[c(4,5),]), method = "euclidean")

fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")

# data
corr.sim <- round(c(corr.alpha,corr.beta,corr.apb,corr.adb),5)
cosine.sim <- round(c(cos.alpha,cos.beta,cos.apb,cos.adb),5)
Euclidean.dist <- round(c(dist.alpha,dist.beta,dist.apb,dist.adb),5)

# summary
sim.sum <- c("","","most.sim","")
dist.sum <- c("","","min.dist","")

df <- data.frame(fold.type,
           corr.sim ,
           cosine.sim ,
           Euclidean.dist ,
           sim.sum ,
           dist.sum )

pander::pander(df)
fold.type corr.sim cosine.sim Euclidean.dist sim.sum dist.sum
alpha 0.8825 0.8825 0.1245
beta 0.8049 0.8049 0.1616
alpha plus beta 0.8693 0.8693 0.128 most.sim min.dist
alpha/beta 0.8742 0.8742 0.1265

##Percent Identity Comparisons (PID)

#Data Preperation

EMC2_Human <- rentrez::entrez_fetch(id = "NP_055488.1",
                                      db = "protein", 
                                      rettype="fasta")
Human_vector <- compbio4all::fasta_cleaner(EMC2_Human)
Human_str <- paste(Human_vector, collapse = "")
Human_str <- toupper(Human_str)

EMC2_Chimp <- rentrez::entrez_fetch(id = "XP_001135909.1",
                                      db = "protein", 
                                      rettype="fasta")
Chimp_vector <- compbio4all::fasta_cleaner(EMC2_Chimp)
Chimp_str <- paste(Chimp_vector,collapse = "")
Chimp_str <- toupper(Chimp_str)


EMC2_Zebrafish <- rentrez::entrez_fetch(id = "XP_850261.1",
                                      db = "protein", 
                                      rettype="fasta")
Zebra_vector <- compbio4all::fasta_cleaner(EMC2_Zebrafish)
Zebrafish_str <- paste(Zebra_vector,collapse = "")
Zebrafish_str <- toupper(Zebrafish_str)



EMC2_chx <- rentrez::entrez_fetch(id = "NP_080012.1",
                                      db = "protein", 
                                      rettype="fasta")
chx_vector <- compbio4all::fasta_cleaner(EMC2_chx)
chx_str <- paste(chx_vector,collapse = "")
chx_str <- toupper(chx_str)

data(BLOSUM50)
Align01 <- Biostrings::pairwiseAlignment(Human_str, Chimp_str, substitutionMatrix = BLOSUM50,
                                       gapOpening = -2, 
                                               gapExtension = -8, 
                                               scoreOnly = FALSE)
Align02 <- Biostrings::pairwiseAlignment(Human_str, Zebrafish_str, substitutionMatrix = BLOSUM50,
                                       gapOpening = -2, 
                                               gapExtension = -8, 
                                               scoreOnly = FALSE)
Align03 <- Biostrings::pairwiseAlignment(Human_str, chx_str, substitutionMatrix = BLOSUM50,
                                       gapOpening = -2, 
                                               gapExtension = -8, 
                                               scoreOnly = FALSE)

Align04 <- Biostrings::pairwiseAlignment(Chimp_str, Zebrafish_str, substitutionMatrix = BLOSUM50,
                                       gapOpening = -2, 
                                               gapExtension = -8, 
                                               scoreOnly = FALSE)
Align05 <- Biostrings::pairwiseAlignment(Chimp_str, chx_str, substitutionMatrix = BLOSUM50,
                                       gapOpening = -2, 
                                               gapExtension = -8, 
                                               scoreOnly = FALSE)

Align06 <- Biostrings::pairwiseAlignment(Zebrafish_str,chx_str, substitutionMatrix = BLOSUM50,
                                       gapOpening = -2, 
                                               gapExtension = -8, 
                                               scoreOnly = FALSE)
pids <- c(1,                  NA,     NA,     NA,
          pid(Align01),          1,     NA,     NA,
          pid(Align02), pid(Align04),      1,     NA,
          pid(Align03), pid(Align05), pid(Align06), 1)
mat <- matrix(pids, nrow = 4, byrow = T)
row.names(mat) <- c("Human","Chimpanzee","Zebrafish","Chicken")   
colnames(mat) <- c("Human","Chimpanzee","Zebrafish","Chicken")   
pander::pander(mat)  
  Human Chimpanzee Zebrafish Chicken
Human 1 NA NA NA
Chimpanzee 100 1 NA NA
Zebrafish 100 100 1 NA
Chicken 97.31 97.31 97.31 1

#PID methods comparison

pid1 <- pid(Align01)
pid2 <- pid(Align01, type="PID2")
pid3 <- pid(Align01, type="PID3")
pid4 <- pid(Align01, type="PID4")

method <- c("PID1", "PID2", "PID3", "PID4")
PID <- c(pid1, pid2, pid3, pid4)
denominator <- c("(aligned positions + internal gap positions)", "(aligned positions)", "(length shorter sequence)", "(average length of the two sequences)")

pid.df <- data.frame(method=method, PID=PID, denominator=denominator)
pander(pid.df)
method PID denominator
PID1 100 (aligned positions + internal gap positions)
PID2 100 (aligned positions)
PID3 100 (length shorter sequence)
PID4 100 (average length of the two sequences)

##Multiple sequence alignment #MSA data preparation

vec1 <- c(EMC2_list$NP_055488.1, EMC2_list$XP_001135909.1, EMC2_list$XP_001090826.1,
          EMC2_list$XP_850261.1, EMC2_list$NP_001073796.1, EMC2_list$NP_080012.1,
          EMC2_list$NP_001107257.1, EMC2_list$XP_029779579.1, EMC2_list$XP_004431318.2,
          EMC2_list$XP_020944679.1)
EMC2_ss <- Biostrings::AAStringSet(vec1)
EMC2_msa <- msa(EMC2_ss)
## use default substitution matrix
EMC2_msa_2 <- msaConvert(EMC2_msa, type="seqinr::alignment")

##Full MSA

compbio4all::print_msa(EMC2_msa_2)
## [1] "--------------------------------MAKVSELYDVTWEEMRDKMRKWREENSR 0"
## [1] "--------------------------------MAKVSELYDVTWEEMRDKMRKWREENSR 0"
## [1] "--------------------------------MAKVSELYDVTWEEMRDKMRKWREENSR 0"
## [1] "--------------------------------MAKVSELYDVTWEEMRDKMRKWREENSR 0"
## [1] "--------------------------------MAKVSELYDVTWEEMRDKMRKWREENSR 0"
## [1] "--------------------------------MAKVSELYDVTWEEMRDKMRKWREENSR 0"
## [1] "--------------------------------MAKVSELYDVTWEEMRDKMRKWREENSR 0"
## [1] "MKNRVVILRLLVAGHVTASPRPLTPLPLGFKKMAKVSELYDVTWEEMRDKMRKWREENSR 0"
## [1] "--------------------------------MAKVTERYDVTWEEMRDKMRKWREENSR 0"
## [1] "--------------------------------MAKVTERYDVTWEEMRDKMRKWREENSR 0"
## [1] " "
## [1] "NSEQIVEVGEELINEYASKLGDDIWIIYEQVMIAALDYGRDDLALFCLQELRRQFPGSHR 0"
## [1] "NSEQIVEVGEELINEYASKLGDDIWIIYEQVMIAALDYGRDDLALFCLQELRRQFPGSHR 0"
## [1] "NSEQIVEVGEELINEYASKLGDDIWIIYEQVMIAALDYGRDDLALFCLQELRRQFPGSHR 0"
## [1] "NSEQIVEVGEELISEYASKLGDDIWIIYEQVMIAALDYGRDDLALFCLQELRRQFPGSHR 0"
## [1] "NSEQIVEVGEELINEYASKLGDDIWIIYEQVMIAALDYGRDDLALFCLQELRRQFPGSHR 0"
## [1] "NSEQIVEVGEELINEYASKLGDDIWIIYEQVMIAALDYGRDDLALFCLQELRRQFPGSHR 0"
## [1] "NSEQIVEVGEELINEYASKLGDDIWIIYEQVMVAALDYGRDDLALFCLQELRRQFPGSHR 0"
## [1] "NSEQIVEVGEELINEYASKLGDDIWIIYEQVMIAALDYGRDDLALFCLQELRRQFPGSHR 0"
## [1] "NSEQIMEVGEELINDYASKLGDDIWIIYEQVMIAALDYGRDDLALFCLQELRRQFPGSHR 0"
## [1] "NSEQIMEVGEELINDYASKLGDDIWIIYEQVMIAALDYGRDDLALFCLQELRRQFPGSHR 0"
## [1] " "
## [1] "VKRLTGMRFEAMERYDDAVQLYDRILQEDPTNTAARKRKIAIRKAQGKNVEAIRELNEYL 0"
## [1] "VKRLTGMRFEAMERYDDALQLYDRILQEDPTNTAARKRKIAIRKAQGKNVEAIRELNEYL 0"
## [1] "VKRLTGMRFEAMERYDDAIQLYDRILQEDPTNTAARKRKIAIRKAQGKNVEAIRELNEYL 0"
## [1] "VKRLTGMRFEAMERYDDAIQLYDRILQEDPTNTAARKRKIAIRKAQGKNVEAIRELNEYL 0"
## [1] "VKRLTGMRFEAMERYDDAIQLYDRILQEDPTNTAARKRKIAIRKAQGKNVEAIRELNEYL 0"
## [1] "VKRLTGMRFEAMERYDDAIQLYDRILQEDPTNTAARKRKIAIRKAQGKNVEAIRELNEYL 0"
## [1] "VKRLTGMRFEAMERYDDAIQLYDRILQEDPTNTAARKRKIAIRKAQGKNVEAIRELNEYL 0"
## [1] "VKRLTGMRFEAMERYDDAIQLYDRILQEDPTNTAARKRKIAIRKAQGKNVEAIRELNEYL 0"
## [1] "VKRLTGMRFEAMERYDDAIQLYDRILQEDPTNTAARKRKIAIRKAQGKTVEAIRELNEYL 0"
## [1] "VKRLTGMRFEAMERYDDAIQLYDRILQEDPTNTAARKRKIAIRKAQGKNVEAIRELNEYL 0"
## [1] " "
## [1] "EQFVGDQEAWHELAELYINEHDYAKAAFCLEELMMTNPHNHLYCQQYAEVKYTQGGLENL 0"
## [1] "EQFVGDQEAWHELAELYINEHDYAKAAFCLEELMMTNPHNHLYCQQYAEVKYTQGGLENL 0"
## [1] "EQFVGDQEAWHELAELYINEHDYAKAAFCLEELMMTNPHNHLYCQQYAEVKYTQGGLENL 0"
## [1] "EQFVGDQEAWHELAELYINEHDYAKAAFCLEELMMTNPYNHLYCQQYAEVKYTQGGLENL 0"
## [1] "EQFVGDQEAWHELAELYINEHDYAKAAFCLEELMMTNPHNHLYCQQYAEVKYTQGGLENL 0"
## [1] "EQFVGDQEAWHELAELYINEHDYAKAAFCLEELMMTNPHNHLYCQQYAEVKYTQGGLENL 0"
## [1] "EQFVGDQEAWHELAELYINEHDYAKAAFCLEELMMTNPHNHLYCQQYAEVKYTQGGLENL 0"
## [1] "EQFVGDQEAWHELAELYINEHDYAKAAFCLEELMMTNPHNHLYCQQYAEVKYTQGGLENL 0"
## [1] "EQFVGDQEAWHELAELYINEHDYAKAAFCLEELMMTNPHNHLYCQQYAEVKYTQGGLENL 0"
## [1] "EQFVGDQEAWHELAELYINEHDYAKAAFCLEELMMTNPHNHLYCQQYAEVKYTQGGLENL 0"
## [1] " "
## [1] "ELSRKYFAQALKLNNRNMRALFGLYMSASHIASNPKASAKTKKDNMKYASWAASQINRAY 0"
## [1] "ELSRKYFAQALKLNNRNMRALFGLYMSASHIASNPKASAKTKKDNMKYASWAASQINRAY 0"
## [1] "ELSRKYFAQALKLNNRNMRALFGLYMSASHIASNPKASAKTKKDNMKYASWAASQINRAY 0"
## [1] "ELSRKYFAQALKLNNRNMRALFGLYMSASHIASNPKASAKTKKDNMKYASWAASQINKAY 0"
## [1] "ELSRKYFAQALKLNNRNMRALFGLYMSASHIASNPKASAKTKKDNMKYASWAASQINRAY 0"
## [1] "ELSRKYFAQALKLNNRNMRALFGLYMSASHIASNPKASAKTKKDNMKYASWAASQINRAY 0"
## [1] "ELSRKYFAQALKLNNRNMRALFGLYMSASHIASNPKASAKTKKDNMKYASWAASQINRAY 0"
## [1] "ELSRKYFAQALKLNNRNMRALFGLYMSASHIASNPKASAKTKKDNMKYASWAANQINRAY 0"
## [1] "ELSRKYFAQALKLNNRNMRALFGLYMSASHIASNPKASAKMKKDNIKYASWAANQINRAY 0"
## [1] "ELSRKYFAQALKLNNRNMRALFGLYMSASHIASNPKASAKMKKDNIRYAGWAANQINRAY 0"
## [1] " "
## [1] "QFAGRSKKETKYSLKAVEDMLETLQITQS 31"
## [1] "QFAGRSKKETKYSLKAVEDMLETLQITQS 31"
## [1] "QFAGRSKKETKYSLKAVEDMLETLQITQS 31"
## [1] "QFAGRSKKETKYSLKAVEDMLETLQITQS 31"
## [1] "QFAGRSKKETKYSLKAVEDMLETLQITQS 31"
## [1] "QFAGRSKKETKYSLKAVEDMLETLQITQS 31"
## [1] "QFAGRSKKETKYSLKAVEDMLETLQITQS 31"
## [1] "QFAGRSKKETKYSLKAVEDMLETLQITQS 31"
## [1] "QFAGRSKKETKYSLKAVEDMLETLQITQS 31"
## [1] "QFAGRSKKETKSSLKAVEDMLETLQITQS 31"
## [1] " "

##Distance Matrix

d <- seqinr::dist.alignment(EMC2_msa_2, "identity")
d.m <- as.matrix(d)
colnames(d.m)<-Scientific.Name
rownames(d.m)<- Scientific.Name
disp.dm <- round(d.m, 3)
disp.dm
##                           Homo sapiens Pan troglodytes Macaca mulatta
## Homo sapiens                     0.000           0.058          0.058
## Pan troglodytes                  0.058           0.000          0.058
## Macaca mulatta                   0.058           0.058          0.000
## Danio rerio                      0.116           0.116          0.101
## Xenopus tropicalis               0.058           0.058          0.000
## Gallus gallus                    0.058           0.058          0.000
## Rattus norvegicus                0.082           0.082          0.058
## Suricata suricatta               0.082           0.082          0.058
## Ceratotherium simum simum        0.174           0.174          0.164
## Sus scrof                        0.192           0.192          0.183
##                           Danio rerio Xenopus tropicalis Gallus gallus
## Homo sapiens                    0.116              0.058         0.058
## Pan troglodytes                 0.116              0.058         0.058
## Macaca mulatta                  0.101              0.000         0.000
## Danio rerio                     0.000              0.101         0.101
## Xenopus tropicalis              0.101              0.000         0.000
## Gallus gallus                   0.101              0.000         0.000
## Rattus norvegicus               0.116              0.058         0.058
## Suricata suricatta              0.116              0.058         0.058
## Ceratotherium simum simum       0.192              0.164         0.164
## Sus scrof                       0.209              0.183         0.183
##                           Rattus norvegicus Suricata suricatta
## Homo sapiens                          0.082              0.082
## Pan troglodytes                       0.082              0.082
## Macaca mulatta                        0.058              0.058
## Danio rerio                           0.116              0.116
## Xenopus tropicalis                    0.058              0.058
## Gallus gallus                         0.058              0.058
## Rattus norvegicus                     0.000              0.082
## Suricata suricatta                    0.082              0.000
## Ceratotherium simum simum             0.174              0.154
## Sus scrof                             0.192              0.174
##                           Ceratotherium simum simum Sus scrof
## Homo sapiens                                  0.174     0.192
## Pan troglodytes                               0.174     0.192
## Macaca mulatta                                0.164     0.183
## Danio rerio                                   0.192     0.209
## Xenopus tropicalis                            0.164     0.183
## Gallus gallus                                 0.164     0.183
## Rattus norvegicus                             0.174     0.192
## Suricata suricatta                            0.154     0.174
## Ceratotherium simum simum                     0.000     0.116
## Sus scrof                                     0.116     0.000

##Phylogenetic Tree

tree<- ape::nj(d.m)
plot(tree, main="EMC2 Family Gene Tree")