##Introduction
Type 3 oculocutaneous albinism (OCA3) is a rare genetic disease linked to the EMC2 Gene. This code compiles summary information about the gene EMC2 (ER membrane protein complex subunit 2).
It also generates alignments and a phylogeneitc tree to indicating the evolutionary relationship betweeen the human version of the gene and its homologs in other species.
##Resources / References Key information use to make this script can be found here:
RefSeq Gene: https://www.ncbi.nlm.nih.gov/gene/9694 RefSeq Homologene: https://www.ncbi.nlm.nih.gov/homologene/8785 BLAST: https://blast.ncbi.nlm.nih.gov/Blast.cgi PBD: https://www.rcsb.org/structure/6WW7
##Preparation Load packages:
# github packages
library(compbio4all)
library(ggmsa)
## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
# CRAN packages
library(rentrez)
library(seqinr)
library(ape)
##
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
##
## as.alignment, consensus
library(pander)
library(ggplot2)
#Bioconductor packages
library(BiocManager)
## Bioconductor version '3.13' is out-of-date; the current release version '3.14'
## is available with R version '4.1'; see https://bioconductor.org/install
library(drawProteins)
library(msa)
## Loading required package: Biostrings
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: XVector
## Loading required package: GenomeInfoDb
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
##
## complement
## The following object is masked from 'package:seqinr':
##
## translate
## The following object is masked from 'package:base':
##
## strsplit
##
## Attaching package: 'msa'
## The following object is masked from 'package:BiocManager':
##
## version
#BioStrings Packages
library(Biostrings)
library(HGNChelper)
##Accession Numbers
Accession numbers were obtained from RefSeq, Refseq Homlogene, UniProt and PDB. UniProt accession numbers can be found by searching for the gene name. PDB accessions can be found by searching with a UniProt accession or a gene name. The Neanderthal database was searched and yielded no results. 7 Organisim Accession information was obtained via Refseq Homolgene, 3 were obtained via a Blast search on EMC2.
##Accession Number Table
#human
#chimpanzee
#Rhesus monkey
#zebrafish
#tropical clawed frog
#chicken
#Norway rat
#BLAST
#meerkat
#Southern white rhinoceros
#Wild boar
RefSeq.id <-c("NP_055488.1","XP_001135909.1","XP_001090826.1","XP_850261.1","NP_001073796.1","NP_080012.1","NP_001107257.1","XP_029779579.1","XP_004431318.2","XP_020944679.1")
UniProt.id <-c("Q15006","H2QWK7","I0FH38","Q6TGY8","Q5M7J9","A0A1D5NVZ7","B0BNG0","NA","NA","NA")
PDB.id <-c("6WW7","NA","NA","NA","NA","NA","NA","NA","NA","NA")
Scientific.Name <-c("Homo sapiens","Pan troglodytes","Macaca mulatta","Danio rerio","Xenopus tropicalis","Gallus gallus","Rattus norvegicus","Suricata suricatta","Ceratotherium simum simum","Sus scrof")
Common.Name<-c("human","chimpanzee","Rhesus monkey","zebrafish","tropical clawed frog","chicken","Norway rat","Meerkat","Southern white rhinoceros","Wild Boar")
Gene.Name <-c("EMC2","EMC2","EMC2","EMC2","EMC2","Emc2","EMC2","EMC2","EMC2","EMC2")
EMC2.Table <- data.frame(RefSeq.id=RefSeq.id,
UniProt.id=UniProt.id,
PDB.id=PDB.id,
Scientific.Name=Scientific.Name,
Common.Name=Common.Name,
Gene.Name=Gene.Name)
pander::pander(EMC2.Table)
| RefSeq.id | UniProt.id | PDB.id | Scientific.Name |
|---|---|---|---|
| NP_055488.1 | Q15006 | 6WW7 | Homo sapiens |
| XP_001135909.1 | H2QWK7 | NA | Pan troglodytes |
| XP_001090826.1 | I0FH38 | NA | Macaca mulatta |
| XP_850261.1 | Q6TGY8 | NA | Danio rerio |
| NP_001073796.1 | Q5M7J9 | NA | Xenopus tropicalis |
| NP_080012.1 | A0A1D5NVZ7 | NA | Gallus gallus |
| NP_001107257.1 | B0BNG0 | NA | Rattus norvegicus |
| XP_029779579.1 | NA | NA | Suricata suricatta |
| XP_004431318.2 | NA | NA | Ceratotherium simum simum |
| XP_020944679.1 | NA | NA | Sus scrof |
| Common.Name | Gene.Name |
|---|---|
| human | EMC2 |
| chimpanzee | EMC2 |
| Rhesus monkey | EMC2 |
| zebrafish | EMC2 |
| tropical clawed frog | EMC2 |
| chicken | Emc2 |
| Norway rat | EMC2 |
| Meerkat | EMC2 |
| Southern white rhinoceros | EMC2 |
| Wild Boar | EMC2 |
##Data Prepartation
EMC2_list <- compbio4all::entrez_fetch_list(db="protein", id=RefSeq.id, rettype="fasta")
length(EMC2_list)
## [1] 10
for(i in 1:length(EMC2_list)){
EMC2_list[[i]] <- compbio4all::fasta_cleaner(EMC2_list[[i]], parse = F)
}
##General Protein Information protein diagram via draw proteins
Q15006_json <- drawProteins::get_features("Q15006")
## [1] "Download has worked"
my_prot_df <- drawProteins::feature_to_dataframe(Q15006_json)
#Protein Regions
my_canvas <- draw_canvas(my_prot_df)
my_canvas <- draw_chains(my_canvas, my_prot_df, label_size = 2.5)
my_canvas <- draw_regions(my_canvas, my_prot_df)
my_canvas <- draw_folding(my_canvas, my_prot_df)
my_canvas
#Repeats
my_canvas <- draw_canvas(my_prot_df)
my_canvas <- draw_chains(my_canvas, my_prot_df, label_size = 2.5)
my_canvas <- draw_repeat(my_canvas, my_prot_df)
my_canvas
Q15006_FASTA <- rentrez::entrez_fetch(id = "Q15006",
db = "protein",
rettype="fasta")
Q15006_vector <- fasta_cleaner(Q15006_FASTA)
par(mfrow = c(2,2),
mar = c(0,0,2,1))
# plot 1: Defaults
dotPlot(Q15006_vector, Q15006_vector,
wsize = 1,
nmatch = 1,
main = "")
# plot 2 size = 10, nmatch = 1
dotPlot(Q15006_vector, Q15006_vector,
wsize = 10,
nmatch = 1,
main = "")
# plot 3: size = 10, nmatch = 5
dotPlot(Q15006_vector, Q15006_vector,
wsize = 10,
nmatch = 5,
main = "")
# plot 4: size = 20, nmatch = 5
dotPlot(Q15006_vector, Q15006_vector,
wsize = 20,
nmatch = 5,
main = "")
# reset par() - run this or other plots will be small!
par(mfrow = c(1,1),
mar = c(4,4,4,4))
dotPlot(Q15006_vector, Q15006_vector,
wsize = 20,
nmatch = 5,
main = "")
#Protein properties compiled from databases
Pfam: http://pfam.xfam.org/protein/Q15006 Nothing really interesting to plot but the page did exist Disprot: No information found RepeatsDB: No information found
Information for major Protein properties was not adundant enough to compile anything interesting for this section.
##Secondary Protein Properties Corrleation used in Chou adn Zhange 1992. Cosine similarity used in Higgs and Attwood (2005). 4 correlations investigated for protein prediction.
# enter once
aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
"L","K","M","F","P","S","T","W","Y","V")
# enter again
aa.1.2 <- c("A","R","N","D","C","Q","E","G","H","I",
"L","K","M","F","P","S","T","W","Y","V")
alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91,
221, 249, 48, 123, 82, 122, 119, 33, 63, 167)
beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120,
177, 115, 16, 85, 127, 341, 253, 44, 110, 229)
a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
110, 112, 25, 52, 71, 126, 117, 30, 108, 123)
a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239,
339, 321, 91, 158, 188, 327, 238, 72, 130, 378)
data.frame(aa.1.1, alpha, beta, a.plus.b, a.div.b)
## aa.1.1 alpha beta a.plus.b a.div.b
## 1 A 285 203 175 361
## 2 R 53 67 78 146
## 3 N 97 139 120 183
## 4 D 163 121 111 244
## 5 C 22 75 74 63
## 6 Q 67 122 74 114
## 7 E 134 86 86 257
## 8 G 197 297 171 377
## 9 H 111 49 33 107
## 10 I 91 120 93 239
## 11 L 221 177 110 339
## 12 K 249 115 112 321
## 13 M 48 16 25 91
## 14 F 123 85 52 158
## 15 P 82 127 71 188
## 16 S 122 341 126 327
## 17 T 119 253 117 238
## 18 W 33 44 30 72
## 19 Y 63 110 108 130
## 20 V 167 229 123 378
alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)
aa.prop <- data.frame(alpha.prop,
beta.prop,
a.plus.b.prop,
a.div.b)
#row labels
row.names(aa.prop) <- aa.1.1
# check against chou's total
EMC2_Human <- rentrez::entrez_fetch(id = "NP_055488.1",
db = "protein",
rettype="fasta")
Human_vector <- compbio4all::fasta_cleaner(EMC2_Human)
EMC2_H.freq.table <- table(Human_vector)/length(Human_vector)
table_to_vector <- function(table_x){
table_names <- attr(table_x, "dimnames")[[1]]
table_vect <- as.vector(table_x)
names(table_vect) <- table_names
return(table_vect)
}
EMC2.human.aa.freq <- table_to_vector(EMC2_H.freq.table)
aa.prop$EMC2.human.aa.freq <- EMC2.human.aa.freq
chou_cor <- function(x,y){
numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)
}
chou_cosine <- function(z.1, z.2){
z.1.abs <- sqrt(sum(z.1^2))
z.2.abs <- sqrt(sum(z.2^2))
my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs)
return(my.cosine)
}
par(mfrow = c(2,2), mar = c(1,4,1,1))
plot(alpha.prop ~ EMC2.human.aa.freq, data = aa.prop)
plot(beta.prop ~ EMC2.human.aa.freq, data = aa.prop)
plot(a.plus.b.prop ~ EMC2.human.aa.freq, data = aa.prop)
plot(a.div.b ~ EMC2.human.aa.freq, data = aa.prop)
Table providing Cosine similarity, Eeclidian distance, and column correlation.
par(mfrow = c(1,1), mar = c(1,1,1,1))
corr.alpha <- chou_cor(aa.prop[,5], aa.prop[,1])
corr.beta <- chou_cor(aa.prop[,5], aa.prop[,2])
corr.apb <- chou_cor(aa.prop[,5], aa.prop[,3])
corr.adb <- chou_cor(aa.prop[,5], aa.prop[,4])
cos.alpha <- chou_cosine(aa.prop[,5], aa.prop[,1])
cos.beta <- chou_cosine(aa.prop[,5], aa.prop[,2])
cos.apb <- chou_cosine(aa.prop[,5], aa.prop[,3])
cos.adb <- chou_cosine(aa.prop[,5], aa.prop[,4])
aa.prop.flipped <- t(aa.prop)
round(aa.prop.flipped,2)
## A R N D C Q E G H I L K
## alpha.prop 0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09 0.10
## beta.prop 0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06 0.04
## a.plus.b.prop 0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06 0.06
## a.div.b 0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08 0.07
## EMC2.human.aa.freq 0.10 0.01 0.05 0.10 0.03 0.04 0.02 0.05 0.07 0.09 0.04 0.05
## M F P S T W Y V
## alpha.prop 0.02 0.05 0.03 0.05 0.05 0.01 0.03 0.07
## beta.prop 0.01 0.03 0.05 0.12 0.09 0.02 0.04 0.08
## a.plus.b.prop 0.01 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## a.div.b 0.02 0.04 0.04 0.08 0.05 0.02 0.03 0.09
## EMC2.human.aa.freq 0.01 0.06 0.07 0.05 0.03 0.03 0.02 0.06
dist(aa.prop.flipped, method = "euclidean")
## alpha.prop beta.prop a.plus.b.prop a.div.b
## beta.prop 0.13342098
## a.plus.b.prop 0.09281824 0.08289406
## a.div.b 0.06699039 0.08659174 0.06175113
## EMC2.human.aa.freq 0.12451288 0.16159692 0.12800446 0.12645889
dist.alpha <- dist((aa.prop.flipped[c(1,5),]), method = "euclidean")
dist.beta <- dist((aa.prop.flipped[c(2,5),]), method = "euclidean")
dist.apb <- dist((aa.prop.flipped[c(3,5),]), method = "euclidean")
dist.adb <- dist((aa.prop.flipped[c(4,5),]), method = "euclidean")
fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")
# data
corr.sim <- round(c(corr.alpha,corr.beta,corr.apb,corr.adb),5)
cosine.sim <- round(c(cos.alpha,cos.beta,cos.apb,cos.adb),5)
Euclidean.dist <- round(c(dist.alpha,dist.beta,dist.apb,dist.adb),5)
# summary
sim.sum <- c("","","most.sim","")
dist.sum <- c("","","min.dist","")
df <- data.frame(fold.type,
corr.sim ,
cosine.sim ,
Euclidean.dist ,
sim.sum ,
dist.sum )
pander::pander(df)
| fold.type | corr.sim | cosine.sim | Euclidean.dist | sim.sum | dist.sum |
|---|---|---|---|---|---|
| alpha | 0.8825 | 0.8825 | 0.1245 | ||
| beta | 0.8049 | 0.8049 | 0.1616 | ||
| alpha plus beta | 0.8693 | 0.8693 | 0.128 | most.sim | min.dist |
| alpha/beta | 0.8742 | 0.8742 | 0.1265 |
##Percent Identity Comparisons (PID)
#Data Preperation
EMC2_Human <- rentrez::entrez_fetch(id = "NP_055488.1",
db = "protein",
rettype="fasta")
Human_vector <- compbio4all::fasta_cleaner(EMC2_Human)
Human_str <- paste(Human_vector, collapse = "")
Human_str <- toupper(Human_str)
EMC2_Chimp <- rentrez::entrez_fetch(id = "XP_001135909.1",
db = "protein",
rettype="fasta")
Chimp_vector <- compbio4all::fasta_cleaner(EMC2_Chimp)
Chimp_str <- paste(Chimp_vector,collapse = "")
Chimp_str <- toupper(Chimp_str)
EMC2_Zebrafish <- rentrez::entrez_fetch(id = "XP_850261.1",
db = "protein",
rettype="fasta")
Zebra_vector <- compbio4all::fasta_cleaner(EMC2_Zebrafish)
Zebrafish_str <- paste(Zebra_vector,collapse = "")
Zebrafish_str <- toupper(Zebrafish_str)
EMC2_chx <- rentrez::entrez_fetch(id = "NP_080012.1",
db = "protein",
rettype="fasta")
chx_vector <- compbio4all::fasta_cleaner(EMC2_chx)
chx_str <- paste(chx_vector,collapse = "")
chx_str <- toupper(chx_str)
data(BLOSUM50)
Align01 <- Biostrings::pairwiseAlignment(Human_str, Chimp_str, substitutionMatrix = BLOSUM50,
gapOpening = -2,
gapExtension = -8,
scoreOnly = FALSE)
Align02 <- Biostrings::pairwiseAlignment(Human_str, Zebrafish_str, substitutionMatrix = BLOSUM50,
gapOpening = -2,
gapExtension = -8,
scoreOnly = FALSE)
Align03 <- Biostrings::pairwiseAlignment(Human_str, chx_str, substitutionMatrix = BLOSUM50,
gapOpening = -2,
gapExtension = -8,
scoreOnly = FALSE)
Align04 <- Biostrings::pairwiseAlignment(Chimp_str, Zebrafish_str, substitutionMatrix = BLOSUM50,
gapOpening = -2,
gapExtension = -8,
scoreOnly = FALSE)
Align05 <- Biostrings::pairwiseAlignment(Chimp_str, chx_str, substitutionMatrix = BLOSUM50,
gapOpening = -2,
gapExtension = -8,
scoreOnly = FALSE)
Align06 <- Biostrings::pairwiseAlignment(Zebrafish_str,chx_str, substitutionMatrix = BLOSUM50,
gapOpening = -2,
gapExtension = -8,
scoreOnly = FALSE)
pids <- c(1, NA, NA, NA,
pid(Align01), 1, NA, NA,
pid(Align02), pid(Align04), 1, NA,
pid(Align03), pid(Align05), pid(Align06), 1)
mat <- matrix(pids, nrow = 4, byrow = T)
row.names(mat) <- c("Human","Chimpanzee","Zebrafish","Chicken")
colnames(mat) <- c("Human","Chimpanzee","Zebrafish","Chicken")
pander::pander(mat)
| Â | Human | Chimpanzee | Zebrafish | Chicken |
|---|---|---|---|---|
| Human | 1 | NA | NA | NA |
| Chimpanzee | 100 | 1 | NA | NA |
| Zebrafish | 100 | 100 | 1 | NA |
| Chicken | 97.31 | 97.31 | 97.31 | 1 |
#PID methods comparison
pid1 <- pid(Align01)
pid2 <- pid(Align01, type="PID2")
pid3 <- pid(Align01, type="PID3")
pid4 <- pid(Align01, type="PID4")
method <- c("PID1", "PID2", "PID3", "PID4")
PID <- c(pid1, pid2, pid3, pid4)
denominator <- c("(aligned positions + internal gap positions)", "(aligned positions)", "(length shorter sequence)", "(average length of the two sequences)")
pid.df <- data.frame(method=method, PID=PID, denominator=denominator)
pander(pid.df)
| method | PID | denominator |
|---|---|---|
| PID1 | 100 | (aligned positions + internal gap positions) |
| PID2 | 100 | (aligned positions) |
| PID3 | 100 | (length shorter sequence) |
| PID4 | 100 | (average length of the two sequences) |
##Multiple sequence alignment #MSA data preparation
vec1 <- c(EMC2_list$NP_055488.1, EMC2_list$XP_001135909.1, EMC2_list$XP_001090826.1,
EMC2_list$XP_850261.1, EMC2_list$NP_001073796.1, EMC2_list$NP_080012.1,
EMC2_list$NP_001107257.1, EMC2_list$XP_029779579.1, EMC2_list$XP_004431318.2,
EMC2_list$XP_020944679.1)
EMC2_ss <- Biostrings::AAStringSet(vec1)
EMC2_msa <- msa(EMC2_ss)
## use default substitution matrix
EMC2_msa_2 <- msaConvert(EMC2_msa, type="seqinr::alignment")
##Full MSA
compbio4all::print_msa(EMC2_msa_2)
## [1] "--------------------------------MAKVSELYDVTWEEMRDKMRKWREENSR 0"
## [1] "--------------------------------MAKVSELYDVTWEEMRDKMRKWREENSR 0"
## [1] "--------------------------------MAKVSELYDVTWEEMRDKMRKWREENSR 0"
## [1] "--------------------------------MAKVSELYDVTWEEMRDKMRKWREENSR 0"
## [1] "--------------------------------MAKVSELYDVTWEEMRDKMRKWREENSR 0"
## [1] "--------------------------------MAKVSELYDVTWEEMRDKMRKWREENSR 0"
## [1] "--------------------------------MAKVSELYDVTWEEMRDKMRKWREENSR 0"
## [1] "MKNRVVILRLLVAGHVTASPRPLTPLPLGFKKMAKVSELYDVTWEEMRDKMRKWREENSR 0"
## [1] "--------------------------------MAKVTERYDVTWEEMRDKMRKWREENSR 0"
## [1] "--------------------------------MAKVTERYDVTWEEMRDKMRKWREENSR 0"
## [1] " "
## [1] "NSEQIVEVGEELINEYASKLGDDIWIIYEQVMIAALDYGRDDLALFCLQELRRQFPGSHR 0"
## [1] "NSEQIVEVGEELINEYASKLGDDIWIIYEQVMIAALDYGRDDLALFCLQELRRQFPGSHR 0"
## [1] "NSEQIVEVGEELINEYASKLGDDIWIIYEQVMIAALDYGRDDLALFCLQELRRQFPGSHR 0"
## [1] "NSEQIVEVGEELISEYASKLGDDIWIIYEQVMIAALDYGRDDLALFCLQELRRQFPGSHR 0"
## [1] "NSEQIVEVGEELINEYASKLGDDIWIIYEQVMIAALDYGRDDLALFCLQELRRQFPGSHR 0"
## [1] "NSEQIVEVGEELINEYASKLGDDIWIIYEQVMIAALDYGRDDLALFCLQELRRQFPGSHR 0"
## [1] "NSEQIVEVGEELINEYASKLGDDIWIIYEQVMVAALDYGRDDLALFCLQELRRQFPGSHR 0"
## [1] "NSEQIVEVGEELINEYASKLGDDIWIIYEQVMIAALDYGRDDLALFCLQELRRQFPGSHR 0"
## [1] "NSEQIMEVGEELINDYASKLGDDIWIIYEQVMIAALDYGRDDLALFCLQELRRQFPGSHR 0"
## [1] "NSEQIMEVGEELINDYASKLGDDIWIIYEQVMIAALDYGRDDLALFCLQELRRQFPGSHR 0"
## [1] " "
## [1] "VKRLTGMRFEAMERYDDAVQLYDRILQEDPTNTAARKRKIAIRKAQGKNVEAIRELNEYL 0"
## [1] "VKRLTGMRFEAMERYDDALQLYDRILQEDPTNTAARKRKIAIRKAQGKNVEAIRELNEYL 0"
## [1] "VKRLTGMRFEAMERYDDAIQLYDRILQEDPTNTAARKRKIAIRKAQGKNVEAIRELNEYL 0"
## [1] "VKRLTGMRFEAMERYDDAIQLYDRILQEDPTNTAARKRKIAIRKAQGKNVEAIRELNEYL 0"
## [1] "VKRLTGMRFEAMERYDDAIQLYDRILQEDPTNTAARKRKIAIRKAQGKNVEAIRELNEYL 0"
## [1] "VKRLTGMRFEAMERYDDAIQLYDRILQEDPTNTAARKRKIAIRKAQGKNVEAIRELNEYL 0"
## [1] "VKRLTGMRFEAMERYDDAIQLYDRILQEDPTNTAARKRKIAIRKAQGKNVEAIRELNEYL 0"
## [1] "VKRLTGMRFEAMERYDDAIQLYDRILQEDPTNTAARKRKIAIRKAQGKNVEAIRELNEYL 0"
## [1] "VKRLTGMRFEAMERYDDAIQLYDRILQEDPTNTAARKRKIAIRKAQGKTVEAIRELNEYL 0"
## [1] "VKRLTGMRFEAMERYDDAIQLYDRILQEDPTNTAARKRKIAIRKAQGKNVEAIRELNEYL 0"
## [1] " "
## [1] "EQFVGDQEAWHELAELYINEHDYAKAAFCLEELMMTNPHNHLYCQQYAEVKYTQGGLENL 0"
## [1] "EQFVGDQEAWHELAELYINEHDYAKAAFCLEELMMTNPHNHLYCQQYAEVKYTQGGLENL 0"
## [1] "EQFVGDQEAWHELAELYINEHDYAKAAFCLEELMMTNPHNHLYCQQYAEVKYTQGGLENL 0"
## [1] "EQFVGDQEAWHELAELYINEHDYAKAAFCLEELMMTNPYNHLYCQQYAEVKYTQGGLENL 0"
## [1] "EQFVGDQEAWHELAELYINEHDYAKAAFCLEELMMTNPHNHLYCQQYAEVKYTQGGLENL 0"
## [1] "EQFVGDQEAWHELAELYINEHDYAKAAFCLEELMMTNPHNHLYCQQYAEVKYTQGGLENL 0"
## [1] "EQFVGDQEAWHELAELYINEHDYAKAAFCLEELMMTNPHNHLYCQQYAEVKYTQGGLENL 0"
## [1] "EQFVGDQEAWHELAELYINEHDYAKAAFCLEELMMTNPHNHLYCQQYAEVKYTQGGLENL 0"
## [1] "EQFVGDQEAWHELAELYINEHDYAKAAFCLEELMMTNPHNHLYCQQYAEVKYTQGGLENL 0"
## [1] "EQFVGDQEAWHELAELYINEHDYAKAAFCLEELMMTNPHNHLYCQQYAEVKYTQGGLENL 0"
## [1] " "
## [1] "ELSRKYFAQALKLNNRNMRALFGLYMSASHIASNPKASAKTKKDNMKYASWAASQINRAY 0"
## [1] "ELSRKYFAQALKLNNRNMRALFGLYMSASHIASNPKASAKTKKDNMKYASWAASQINRAY 0"
## [1] "ELSRKYFAQALKLNNRNMRALFGLYMSASHIASNPKASAKTKKDNMKYASWAASQINRAY 0"
## [1] "ELSRKYFAQALKLNNRNMRALFGLYMSASHIASNPKASAKTKKDNMKYASWAASQINKAY 0"
## [1] "ELSRKYFAQALKLNNRNMRALFGLYMSASHIASNPKASAKTKKDNMKYASWAASQINRAY 0"
## [1] "ELSRKYFAQALKLNNRNMRALFGLYMSASHIASNPKASAKTKKDNMKYASWAASQINRAY 0"
## [1] "ELSRKYFAQALKLNNRNMRALFGLYMSASHIASNPKASAKTKKDNMKYASWAASQINRAY 0"
## [1] "ELSRKYFAQALKLNNRNMRALFGLYMSASHIASNPKASAKTKKDNMKYASWAANQINRAY 0"
## [1] "ELSRKYFAQALKLNNRNMRALFGLYMSASHIASNPKASAKMKKDNIKYASWAANQINRAY 0"
## [1] "ELSRKYFAQALKLNNRNMRALFGLYMSASHIASNPKASAKMKKDNIRYAGWAANQINRAY 0"
## [1] " "
## [1] "QFAGRSKKETKYSLKAVEDMLETLQITQS 31"
## [1] "QFAGRSKKETKYSLKAVEDMLETLQITQS 31"
## [1] "QFAGRSKKETKYSLKAVEDMLETLQITQS 31"
## [1] "QFAGRSKKETKYSLKAVEDMLETLQITQS 31"
## [1] "QFAGRSKKETKYSLKAVEDMLETLQITQS 31"
## [1] "QFAGRSKKETKYSLKAVEDMLETLQITQS 31"
## [1] "QFAGRSKKETKYSLKAVEDMLETLQITQS 31"
## [1] "QFAGRSKKETKYSLKAVEDMLETLQITQS 31"
## [1] "QFAGRSKKETKYSLKAVEDMLETLQITQS 31"
## [1] "QFAGRSKKETKSSLKAVEDMLETLQITQS 31"
## [1] " "
##Distance Matrix
d <- seqinr::dist.alignment(EMC2_msa_2, "identity")
d.m <- as.matrix(d)
colnames(d.m)<-Scientific.Name
rownames(d.m)<- Scientific.Name
disp.dm <- round(d.m, 3)
disp.dm
## Homo sapiens Pan troglodytes Macaca mulatta
## Homo sapiens 0.000 0.058 0.058
## Pan troglodytes 0.058 0.000 0.058
## Macaca mulatta 0.058 0.058 0.000
## Danio rerio 0.116 0.116 0.101
## Xenopus tropicalis 0.058 0.058 0.000
## Gallus gallus 0.058 0.058 0.000
## Rattus norvegicus 0.082 0.082 0.058
## Suricata suricatta 0.082 0.082 0.058
## Ceratotherium simum simum 0.174 0.174 0.164
## Sus scrof 0.192 0.192 0.183
## Danio rerio Xenopus tropicalis Gallus gallus
## Homo sapiens 0.116 0.058 0.058
## Pan troglodytes 0.116 0.058 0.058
## Macaca mulatta 0.101 0.000 0.000
## Danio rerio 0.000 0.101 0.101
## Xenopus tropicalis 0.101 0.000 0.000
## Gallus gallus 0.101 0.000 0.000
## Rattus norvegicus 0.116 0.058 0.058
## Suricata suricatta 0.116 0.058 0.058
## Ceratotherium simum simum 0.192 0.164 0.164
## Sus scrof 0.209 0.183 0.183
## Rattus norvegicus Suricata suricatta
## Homo sapiens 0.082 0.082
## Pan troglodytes 0.082 0.082
## Macaca mulatta 0.058 0.058
## Danio rerio 0.116 0.116
## Xenopus tropicalis 0.058 0.058
## Gallus gallus 0.058 0.058
## Rattus norvegicus 0.000 0.082
## Suricata suricatta 0.082 0.000
## Ceratotherium simum simum 0.174 0.154
## Sus scrof 0.192 0.174
## Ceratotherium simum simum Sus scrof
## Homo sapiens 0.174 0.192
## Pan troglodytes 0.174 0.192
## Macaca mulatta 0.164 0.183
## Danio rerio 0.192 0.209
## Xenopus tropicalis 0.164 0.183
## Gallus gallus 0.164 0.183
## Rattus norvegicus 0.174 0.192
## Suricata suricatta 0.154 0.174
## Ceratotherium simum simum 0.000 0.116
## Sus scrof 0.116 0.000
##Phylogenetic Tree
tree<- ape::nj(d.m)
plot(tree, main="EMC2 Family Gene Tree")