Introduction
BHLHE41(basic helix-loop-helix family member e41) is a gene that may be involved in cell differentiation and control of the circadian rhythm.
#Resources
https://www.ncbi.nlm.nih.gov/gene/79365 https://www.ncbi.nlm.nih.gov/gene/?Term=ortholog_gene_79365[group]
https://www.uniprot.org/uniprot/?query=BHLHE41&sort=score https://alphafold.ebi.ac.uk/entry/Q9C0J9 http://pfam.xfam.org/protein/Q9C0J9
https://blast.ncbi.nlm.nih.gov/Blast.cgi
# packages
library(compbio4all)
library(ggmsa)
## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
library(rentrez)
library(seqinr)
library(ape)
##
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
##
## as.alignment, consensus
library(msa)
## Loading required package: Biostrings
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: XVector
## Loading required package: GenomeInfoDb
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
##
## complement
## The following object is masked from 'package:seqinr':
##
## translate
## The following object is masked from 'package:base':
##
## strsplit
library(Biostrings)
library(BiocManager)
## Bioconductor version '3.13' is out-of-date; the current release version '3.14'
## is available with R version '4.1'; see https://bioconductor.org/install
##
## Attaching package: 'BiocManager'
## The following object is masked from 'package:msa':
##
## version
library(drawProteins)
library(ggplot2)
Accession Numbers
refseq_accession <- c("NP_110389.1","NP_077789.1","NP_001002973.1", "XP_520805.2","NP_001027504.1","XP_024847914.1", "XP_015146971.2","XP_027694693.1", "XP_039096736.1", "ENSP00000242728")
uniprot_accession <- c("Q9C0J9","Q99PV5", "Q7YQC9","H2Q5M4","Q4V788",NA, NA,NA,NA,NA)
pdb_accession <- c(NA,NA,NA,NA,NA,NA,NA,NA,NA,NA)
com_name <- c("Human", "House Mouse", "Dog", "Chimpanzee", "Tropical Clawed Frog", "Cattle", "Chicken","Common Wombat", "Hyaena hyaena", "Neanderthal")
sci_name <- c("Homo sapiens", "Mus musculus", "Canis lupus familiaris", "Pan troglodytes", "Xenopus tropicalis", "Bos taurus", "Gallus gallus","Vombatus ursinus", "Striped hyena", "Homo neanderthalensis")
BHLHE41_table <- data.frame(refseq_accession, uniprot_accession,
pdb_accession, com_name,sci_name)
pander::pander(BHLHE41_table)
Table continues below
| NP_110389.1 |
Q9C0J9 |
NA |
Human |
| NP_077789.1 |
Q99PV5 |
NA |
House Mouse |
| NP_001002973.1 |
Q7YQC9 |
NA |
Dog |
| XP_520805.2 |
H2Q5M4 |
NA |
Chimpanzee |
| NP_001027504.1 |
Q4V788 |
NA |
Tropical Clawed Frog |
| XP_024847914.1 |
NA |
NA |
Cattle |
| XP_015146971.2 |
NA |
NA |
Chicken |
| XP_027694693.1 |
NA |
NA |
Common Wombat |
| XP_039096736.1 |
NA |
NA |
Hyaena hyaena |
| ENSP00000242728 |
NA |
NA |
Neanderthal |
| Homo sapiens |
| Mus musculus |
| Canis lupus familiaris |
| Pan troglodytes |
| Xenopus tropicalis |
| Bos taurus |
| Gallus gallus |
| Vombatus ursinus |
| Striped hyena |
| Homo neanderthalensis |
Data Preparation
#
BHLHE41_list <- compbio4all::entrez_fetch_list(db = "protein",
id = BHLHE41_table$refseq_accession[1:9],
rettype = "fasta")
for(i in 1:length(BHLHE41_list)){
BHLHE41_list[[i]] <- compbio4all::fasta_cleaner(BHLHE41_list[[i]], parse = F)
}
General Protein Information
#Protein Diagram
Q9C0J9_json <- drawProteins::get_features("Q9C0J9")
## [1] "Download has worked"
my_prot_df <- drawProteins::feature_to_dataframe(Q9C0J9_json)
my_canvas <- draw_canvas(my_prot_df)
my_canvas <- draw_chains(my_canvas, my_prot_df,
label_size = 2.5)
my_canvas <- draw_domains(my_canvas, my_prot_df)
my_canvas <- draw_recept_dom(my_canvas, my_prot_df)
my_canvas <- draw_regions(my_canvas, my_prot_df)
my_canvas <- draw_folding(my_canvas, my_prot_df)
my_canvas <- draw_repeat(my_canvas, my_prot_df)
my_canvas <- draw_motif(my_canvas, my_prot_df)
my_canvas <- draw_phospho(my_canvas, my_prot_df)
my_canvas

#Dot Plot
# set up 2 x 2 grid, make margins thinner
par(mfrow = c(2,2),
mar = c(0,0,2,2))
NP_110389.1 <- compbio4all::fasta_cleaner(BHLHE41_list[1], parse = TRUE)
# plot 1: BHLHE41 - Defaults
dotPlot(NP_110389.1,
NP_110389.1,
wsize = 1,
nmatch = 1,
main = "BHLHE41 Defaults")
# plot 2 BHLHE41 - size = 5, nmatch = 1
dotPlot(NP_110389.1, NP_110389.1,
wsize = 5,
nmatch = 1,
main = "BHLHE41 - size = 5, nmatch = 1")
# plot 3: BHLHE41 - size = 10, nmatch = 5
dotPlot(NP_110389.1, NP_110389.1,
wsize = 10,
nmatch = 5,
main = "BHLHE41 - size = 10, nmatch = 5")
# plot 4: BHLHE41 - size = 20, nmatch = 5
dotPlot(NP_110389.1, NP_110389.1,
wsize = 20,
nmatch =5,
main = "BHLHE41 - size = 20, nmatch = 5")

par(mfrow = c(1,1),
mar = c(4,4,4,4))
# Best Plot: BHLHE41 - size = 10, nmatch = 5
dotPlot(NP_110389.1[250:450], NP_110389.1[250:450],
wsize = 10,
nmatch = 5,
main = "BHLHE41 - size = 10, nmatch = 5")

#Protein properties compiled from databases
Properties <- c("Major Features","Subcellular Location", "AlphaFold Secondary Structure" )
Information <- c("HLH and Hairy_orange domains along with multiple disorder and low_complexity domains","Nucleus", "Alpha helixes can be seen as well as many disordered sections")
Database <- c("Pfam","Uniprot", "AlphaFold")
prop_df <- data.frame(Properties,Information,Database)
pander::pander(prop_df)
| Major Features |
HLH and Hairy_orange domains along with multiple disorder and low_complexity domains |
Pfam |
| Subcellular Location |
Nucleus |
Uniprot |
| AlphaFold Secondary Structure |
Alpha helixes can be seen as well as many disordered sections |
AlphaFold |
Protein feature prediction
#Secondary Structure prediction
aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
"L","K","M","F","P","S","T","W","Y","V")
alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91,
221, 249, 48, 123, 82, 122, 119, 33, 63, 167)
beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120,
177, 115, 16, 85, 127, 341, 253, 44, 110, 229)
a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
110, 112, 25, 52, 71, 126, 117, 30, 108, 123)
a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239,
339, 321, 91, 158, 188, 327, 238, 72, 130, 378)
alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)
aa.prop <- data.frame(alpha.prop,
beta.prop,
a.plus.b.prop,
a.div.b)
row.names(aa.prop) <- aa.1.1
NP_110389.1 <- compbio4all::fasta_cleaner(BHLHE41_list[1], parse = TRUE)
NP_110389.1.freq.table <- table(NP_110389.1)/length(NP_110389.1)
#convert table to vector
table_to_vector <- function(table_x){
table_names <- attr(table_x, "dimnames")[[1]]
table_vect <- as.vector(table_x)
names(table_vect) <- table_names
return(table_vect)
}
BHLHE41.human.aa.freq <- table_to_vector(NP_110389.1.freq.table)
#removing Us
aa.names <- names(BHLHE41.human.aa.freq)
BHLHE41.human.aa.freq
## A C D E F G
## 0.170124481 0.018672199 0.041493776 0.060165975 0.029045643 0.080912863
## H I K L M N
## 0.031120332 0.024896266 0.053941909 0.116182573 0.008298755 0.010373444
## P Q R S T V
## 0.109958506 0.049792531 0.045643154 0.060165975 0.039419087 0.026970954
## W Y
## 0.002074689 0.020746888
aa.prop$BHLHE41.human.aa.freq <- BHLHE41.human.aa.freq
pander::pander(aa.prop)
| A |
0.1165 |
0.07313 |
0.09264 |
0.08331 |
0.1701 |
| R |
0.02166 |
0.02414 |
0.04129 |
0.03369 |
0.01867 |
| N |
0.03964 |
0.05007 |
0.06353 |
0.04223 |
0.04149 |
| D |
0.06661 |
0.04359 |
0.05876 |
0.05631 |
0.06017 |
| C |
0.008991 |
0.02702 |
0.03917 |
0.01454 |
0.02905 |
| Q |
0.02738 |
0.04395 |
0.03917 |
0.02631 |
0.08091 |
| E |
0.05476 |
0.03098 |
0.04553 |
0.05931 |
0.03112 |
| G |
0.08051 |
0.107 |
0.09052 |
0.08701 |
0.0249 |
| H |
0.04536 |
0.01765 |
0.01747 |
0.02469 |
0.05394 |
| I |
0.03719 |
0.04323 |
0.04923 |
0.05516 |
0.1162 |
| L |
0.09031 |
0.06376 |
0.05823 |
0.07824 |
0.008299 |
| K |
0.1018 |
0.04143 |
0.05929 |
0.07408 |
0.01037 |
| M |
0.01962 |
0.005764 |
0.01323 |
0.021 |
0.11 |
| F |
0.05027 |
0.03062 |
0.02753 |
0.03646 |
0.04979 |
| P |
0.03351 |
0.04575 |
0.03759 |
0.04339 |
0.04564 |
| S |
0.04986 |
0.1228 |
0.0667 |
0.07547 |
0.06017 |
| T |
0.04863 |
0.09114 |
0.06194 |
0.05493 |
0.03942 |
| W |
0.01349 |
0.01585 |
0.01588 |
0.01662 |
0.02697 |
| Y |
0.02575 |
0.03963 |
0.05717 |
0.03 |
0.002075 |
| V |
0.06825 |
0.08249 |
0.06511 |
0.08724 |
0.02075 |
chou_cor <- function(x,y){
numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)
}
chou_cosine <- function(z.1, z.2){
z.1.abs <- sqrt(sum(z.1^2))
z.2.abs <- sqrt(sum(z.2^2))
my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs)
return(my.cosine)
}
#calculate correlation
corr.alpha <- chou_cor(aa.prop[,5], aa.prop[,1])
corr.beta <- chou_cor(aa.prop[,5], aa.prop[,2])
corr.apb <- chou_cor(aa.prop[,5], aa.prop[,3])
corr.adb <- chou_cor(aa.prop[,5], aa.prop[,4])
#calculate cosine similarity
cos.alpha <- chou_cosine(aa.prop[,5], aa.prop[,1])
cos.beta <- chou_cosine(aa.prop[,5], aa.prop[,2])
cos.apb <- chou_cosine(aa.prop[,5], aa.prop[,3])
cos.adb <- chou_cosine(aa.prop[,5], aa.prop[,4])
#calculate distance
aa.prop.flipped <- t(aa.prop)
round(aa.prop.flipped,2)
## A R N D C Q E G H I L
## alpha.prop 0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09
## beta.prop 0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06
## a.plus.b.prop 0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06
## a.div.b 0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08
## BHLHE41.human.aa.freq 0.17 0.02 0.04 0.06 0.03 0.08 0.03 0.02 0.05 0.12 0.01
## K M F P S T W Y V
## alpha.prop 0.10 0.02 0.05 0.03 0.05 0.05 0.01 0.03 0.07
## beta.prop 0.04 0.01 0.03 0.05 0.12 0.09 0.02 0.04 0.08
## a.plus.b.prop 0.06 0.01 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## a.div.b 0.07 0.02 0.04 0.04 0.08 0.05 0.02 0.03 0.09
## BHLHE41.human.aa.freq 0.01 0.11 0.05 0.05 0.06 0.04 0.03 0.00 0.02
dist(aa.prop.flipped, method = "euclidean")
## alpha.prop beta.prop a.plus.b.prop a.div.b
## beta.prop 0.13342098
## a.plus.b.prop 0.09281824 0.08289406
## a.div.b 0.06699039 0.08659174 0.06175113
## BHLHE41.human.aa.freq 0.20674190 0.22748500 0.19913273 0.20743674
dist.alpha <- dist((aa.prop.flipped[c(1,5),]), method = "euclidean")
dist.beta <- dist((aa.prop.flipped[c(2,5),]), method = "euclidean")
dist.apb <- dist((aa.prop.flipped[c(3,5),]), method = "euclidean")
dist.adb <- dist((aa.prop.flipped[c(4,5),]), method = "euclidean")
fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")
# data
corr.sim <- round(c(corr.alpha,corr.beta,corr.apb,corr.adb),5)
cosine.sim <- round(c(cos.alpha,cos.beta,cos.apb,cos.adb),5)
Euclidean.dist <- round(c(dist.alpha,dist.beta,dist.apb,dist.adb),5)
# summary
sim.sum <- c("","","most.sim","")
dist.sum <- c("","","min.dist","")
df <- data.frame(fold.type,
corr.sim ,
cosine.sim ,
Euclidean.dist ,
sim.sum ,
dist.sum )
pander::pander(df)
| alpha |
0.7192 |
0.7192 |
0.2067 |
|
|
| beta |
0.6619 |
0.6619 |
0.2275 |
|
|
| alpha plus beta |
0.7319 |
0.7319 |
0.1991 |
most.sim |
min.dist |
| alpha/beta |
0.7106 |
0.7106 |
0.2074 |
|
|
Percent Identity Comparisons (PID)
#PID table
human <-compbio4all::entrez_fetch_list(db = "protein",
id = BHLHE41_table$refseq_accession[1],
rettype = "fasta")
human <- fasta_cleaner(human, parse = F)
chimp <-compbio4all::entrez_fetch_list(db = "protein",
id = BHLHE41_table$refseq_accession[4],
rettype = "fasta")
chimp <- fasta_cleaner(chimp, parse = F)
mouse <-compbio4all::entrez_fetch_list(db = "protein",
id = BHLHE41_table$refseq_accession[2],
rettype = "fasta")
mouse <- fasta_cleaner(mouse, parse = F)
dog <-compbio4all::entrez_fetch_list(db = "protein",
id = BHLHE41_table$refseq_accession[3],
rettype = "fasta")
dog <- fasta_cleaner(dog, parse = F)
align.human.vs.chimp <- pairwiseAlignment(
human,
chimp)
align.human.vs.mouse <-Biostrings::pairwiseAlignment(
human,
mouse)
align.human.vs.dog<- Biostrings::pairwiseAlignment(
human,
dog)
align.chimp.vs.mouse <-Biostrings::pairwiseAlignment(
chimp,
mouse)
align.chimp.vs.dog<- Biostrings::pairwiseAlignment(
chimp,
dog)
align.mouse.vs.dog<- Biostrings::pairwiseAlignment(
mouse,
dog)
pids <- c(100, NA, NA, NA,
pid(align.human.vs.chimp),100, NA, NA,
pid(align.human.vs.mouse), pid(align.chimp.vs.mouse), 100, NA,
pid(align.human.vs.dog), pid(align.chimp.vs.dog), pid(align.mouse.vs.dog), 100)
mat <- matrix(pids, nrow = 4, byrow = T)
row.names(mat) <- c("Homo","Pan","Mus","Canis")
colnames(mat) <- c("Homo","Pan","Mus","Canis")
pander::pander(mat)
| Homo |
100 |
NA |
NA |
NA |
| Pan |
100 |
100 |
NA |
NA |
| Mus |
72.73 |
72.73 |
100 |
NA |
| Canis |
90.47 |
90.47 |
68.09 |
100 |
#PID method comparisons (chimps and humans)
Method <- c("PID1","PID2","PID3","PID4")
PID <- c(pid(align.human.vs.chimp, type = "PID1"), pid(align.human.vs.chimp, type = "PID2"),pid(align.human.vs.chimp, type = "PID4"),pid(align.human.vs.chimp, type = "PID4"))
denominator <- c("(aligned positions + internal gap positions)","(aligned positions)", "(length shorter sequence)", "(average length of the two sequences)")
pid.df <- data.frame(Method, PID, denominator)
pander::pander(pid.df)
| PID1 |
100 |
(aligned positions + internal gap positions) |
| PID2 |
100 |
(aligned positions) |
| PID3 |
100 |
(length shorter sequence) |
| PID4 |
100 |
(average length of the two sequences) |
Multiple sequence alignment
#Build MSA
names(BHLHE41_list) <- BHLHE41_table$sci_name[1:9]
BHLHE41_vector <- rep(NA, length(BHLHE41_list))
for(i in 1:length(BHLHE41_vector)){
BHLHE41_vector[i] <- BHLHE41_list[[i]]
}
names(BHLHE41_vector) <- names(BHLHE41_list)
BHLHE41_vector_ss <- Biostrings::AAStringSet(BHLHE41_vector)
BHLHE41_align <- msa(BHLHE41_vector_ss,
method = "ClustalW")
## use default substitution matrix
BHLHE41_align_seqinr <- msaConvert(BHLHE41_align, type = "seqinr::alignment")
print_msa(alignment = BHLHE41_align_seqinr,
chunksize = 60)
## [1] "MDEGIPHLQERQLLEHRDFIGLDYSSLYMCKPKRSMKRDDTKDTYKLPHRLIEKKRRDRI 0"
## [1] "MDEGIPHLQERQLLEHRDFIGLDYSSLYMCKPKRSMKRDDTKDTYKLPHRLIEKKRRDRI 0"
## [1] "MDEGIPHLQERQLLEHRDFIGLDYSSLYMCKPKRSMKRDDSKDTYKLPHRLIEKKRRDRI 0"
## [1] "MDEGIPHLQERQLLEHRDFIGLDYSSLYMCKPKRSMKRDDSKDTYKLPHRLIEKKRRDRI 0"
## [1] "MDEGIPHLQERQLLEHRDFIGLDYPSLYMCKPKRSMKRDDSKDTYKLPHRLIEKKRRDRI 0"
## [1] "MDEGIPHLQERQLLEHRDFIGLDYSSLYMCKPKRSLKRDDTKDTYKLPHRLIEKKRRDRI 0"
## [1] "MDEGISRLQERQLMEHRDFIGLDYSSLYMCKPKRGMKRDESKETYKLPHRLIEKKRRDRI 0"
## [1] "MDEGISRLPERQLLEHRDFLGLEYPALYMCKPKRGVKRDESKETYKLPHRLIEKKRRDRI 0"
## [1] "MEEGIPSLQDRQ-LEPRDFLGMEYPHLFLCKPKRGLKRDESKETYKLPHRLIEKKRRDRI 0"
## [1] " "
## [1] "NECIAQLKDLLPEHLKLTTLGHLEKAVVLELTLKHLKALTALTEQQHQKIIALQNGERSL 0"
## [1] "NECIAQLKDLLPEHLKLTTLGHLEKAVVLELTLKHLKALTALTEQQHQKIIALQNGERSL 0"
## [1] "NECIAQLKDLLPEHLKLTTLGHLEKAVVLELTLKHLKALTALTEQQHQKIIALQNGERSL 0"
## [1] "NECIAQLKDLLPEHLKLTTLGHLEKAVVLELTLKHLKALTALTEQQHQKIIALQNGERSL 0"
## [1] "NECIAQLKDLLPEHLKLTTLGHLEKAVVLELTLKHLKALTALTEQQHQKIIALQNGERSL 0"
## [1] "NECIAQLKDLLPEHLKLTTLGHLEKAVVLELTLKHLKALTALTEQQHQKIIALQNGERSL 0"
## [1] "NECIAQLKDLLPEHLKLTTLGHLEKAVVLELTLKHLKALTALTEQQHQKIIALQNGERSM 0"
## [1] "NECIAQLKDLLPEHLKLTTLGHLEKAVVLELTLKHLKALTALTEQQHQKIVALQSGERSV 0"
## [1] "NECIAQLKDLLPEHLKLTTLGHLEKAVVLELTLKHLKGLTSLTEQQHQKIMALQNGEHAL 0"
## [1] " "
## [1] "KSPIQSDLDAFHSGFQTCAKEVLQYLSRFESWTPREPRCVQLINHLHAVATQFLPTPQLL 0"
## [1] "KSPIQSDLDAFHSGFQTCAKEVLQYLSRFESWTPREPRCVQLINHLHAVATQFLPTPQLL 0"
## [1] "KSPIQSDLDAFHSGFQTCAKEVLQYLSRFESWTPREQRCVQLINHLHAVATQFLPTPQLL 0"
## [1] "KSPIQSDLDAFHSGFQTCAKEVLQYLSRFESWTPREQRCVQLINHLHAVATQFLPTPQLL 0"
## [1] "KSPIQSDLDAFHSGFQTCAKEVLQYLARFESWTPREPRCVQLINHLHAVATQFLPTPQLL 0"
## [1] "KSPVQADLDAFHSGFQTCAKEVLQYLARFESWTPREPRCAQLVSHLHAVAT------QLL 0"
## [1] "KSPIQADLDSFHSGFQTCAKEVMQYLSRFESWTPREQRCAQLINHLHSVSTQFLPSPQLL 0"
## [1] "KSPVQADLDAFHSGFQTCAKEVLQFLSRSESWTPREQRCAQLLGHLHAVSSQFLPGPRLL 0"
## [1] "KSPIQTDLDAFHSGFQTCAKEVLQYLSRFESWTSRDQRCTQLLNHLHTVSSQILSSSQLL 0"
## [1] " "
## [1] "TQ-QVPLSKGTG-APSAAG---SAAAPC---------------------LERAGQKLEPL 0"
## [1] "TQ-QVPLSKGTG-APSAAG---SAAAPC---------------------LERAGQKLEPL 0"
## [1] "TQ-QVPLSKGAG-APSAAAPAGSAAAPC---------------------LERAGQKLEPL 0"
## [1] "TQ-QVPLSKGAG-APPAAAPAGSAAAPC---------------------LERAGQKLEPL 0"
## [1] "TQ-QVPLSKGTG-APTAAAPAGSGAAPC---------------------LERAGQKLEPL 0"
## [1] "TP-QVPSGRGSGRAPCSAG---AAAASG---------------------PER-------V 0"
## [1] "TQ-QVPVSKGNTSSSSSSSTTTTTTTSSSSSSSSSIATPPSSFLSVNASQDRTVQKLEPQ 0"
## [1] "SPPPGPLAKGPSSSSSPPAPLRAPARKP-----------------------------EGQ 0"
## [1] "LQ-SVPGGKGSSASPGRGG----QKAEN-------------------------------Q 0"
## [1] " "
## [1] "AYCVPVIQRTQPSAELAA-ENDTDTDSGYGGEAEARPDREKGKGA------GASRVTIKQ 0"
## [1] "AYCVPVIQRTQPSAELAA-ENDTDTDSGYGGEAEARPDREKGKGA------GASRVTIKQ 0"
## [1] "AHCVPVIQRTQPSAELAA-ENDTDTDSGYGGEAEARPDRDKGKGS------GAGRVTIKQ 0"
## [1] "AHCVPVIQRTQPSAELAA-ENDTDTDSGYGGEAEARPDRERGKGA------AASRVTIKQ 0"
## [1] "AHCVPVIQRTQPSSELAAAENDTDTDSGYGGEAEARPDREKGKGA------GASRVTIKQ 0"
## [1] "ARCVPVIQRTQPGTEPEH---DTDTDSGYGGEAEQ------------------GRAAVKQ 0"
## [1] "TNCVPVIQRTQPSSELAG-ENDTDTDSGYGGEGEARPDREKYHAL------GGSRVTIKQ 0"
## [1] "AHCVPVIQRTH-AAEPGA-ETDTDTDSGYGGEAEARPERGAGPAG------PLPALPVKQ 0"
## [1] "TNCVPVIQRTH-NLELNE--NDTDTDSGYGGEGEKAEGRSDGQSSSGVKSQGGNSLTIKQ 0"
## [1] " "
## [1] "EPPGEDSP-APKRMKLDSRGGGSGGGPG-------------------GGAAAAAAALLGP 0"
## [1] "EPPGEDSP-APKRMKLDSRGGGSGGGPG-------------------GGAAAAAAALLGP 0"
## [1] "EPPGEDSP-APKRMKLDSRGGGGGGGGGGLGGGGGGGLGGGGGGGLGGGAAAAAAALLGP 0"
## [1] "EPPGEDSP-APKRMKLDSRGGGGGGGGG--------------GGGLGGGAAAAAAALLGP 0"
## [1] "EPPGEDSP-APKRMRLDTRGGGGGPG---------------------GGAAAAAAALLGP 0"
## [1] "EPPGDSSP-APKRPKLEARG-----------------------------------ALLGP 0"
## [1] "EPPGEDSP-EPKRMKLDCGESSSPGG-------------------------SAAAALLGP 0"
## [1] "EPAGDEAPPAPKRPRLERGGSPPPG-----------------------------AALPGA 0"
## [1] "EPMEEPSP---KRFKPEFPGRGVTG--------------------------------PNA 0"
## [1] " "
## [1] "DPAAAAALLRPDAALLSSLVAFGGGG-GAPFPQPAA--AAAPFCLPFCFLSPSAAAAYV- 0"
## [1] "DPAAAAALLRPDAALLSSLVAFGGGG-GAPFPQPAA--AAAPFCLPFCFLSPSAAAAYV- 0"
## [1] "DPAAAAALLRPDAALLSSLVAFGGGG-GAPFAQPAA--AAAPFCLPFYFLSPSAAAAYV- 0"
## [1] "DPAAAAALLRPDAALLSSLVAFGGGG-GAPFAQPAA--AAAPFCLPFYFLSPSAAAAYV- 0"
## [1] "DPTAAAALLRPDAALLSSLVAFGGGG-GAPFAQPAA--AAAPFCLPFYFLSPSAAAAYV- 0"
## [1] "EP-----------ALLGSLVALGG---GAPFAQP----AAAPFCLPFYLLSP-SAAAYV- 0"
## [1] "NP--AAALLRPDAALLSSLVAFGGGG-GTPFGQPAAAAAAAPFCLPFYFISPSAAAAYM- 0"
## [1] "ARG-------ADAALLSSLMALGAGGGAAPFGQP-----AAPFCLPFYFISPSAAAAYV- 0"
## [1] "EP-----AVRPDAALLNSLMAFGG----APFGQQ------APFCLPFYFISPSAAAAAAY 0"
## [1] " "
## [1] "QPFLDKSGLEKYLYPA----AAAAPFPLLYPGIPAPAAAAAAAAAAAAAAAAFPCLSSVL 0"
## [1] "QPFLDKSGLEKYLYPA----AAAAPFPLLYPGIPAPAAAAAAAAAAAAAAAAFPCLSSVL 0"
## [1] "QPFLDKSGLEKYLYPA----AAAAPFPLLYPGIPAP-AAAAAAAAAAAAAAAFPCLSSVL 0"
## [1] "QPFLDKSGLEKYLYPA----AAAAPFPLLYPGIPAP-AAAAAAAAAAAAAAAFPCLSSVL 0"
## [1] "QPFLDKSGLEKYLYPA----AAAAPFPLLYPGIPAP-AAAAAAAAAAAAAAAFPCLSSVL 0"
## [1] "QPWLDKSGLDKYLYP-----AAAAPFPLLYPGIPAA--------AAAAAAAAFPCLSSVL 0"
## [1] "QPFLDKSNLEKYLYP-----AAAAPIPLLYPGIPAQ--AAAAAAAAAAAAAAFPCLSSVL 0"
## [1] "QPFLDKGGLEKYLYP-------AAPIPLLYPGIPAQ--AAAAAAAAAAAAASFPCLSSVL 0"
## [1] "MPFLDKGSLEKYLYPAAAAAAAATPIPLLYPGIPGQ------------ASGTFPCLSSVL 0"
## [1] " "
## [1] "SPPPEKAG--AAAATLLPHEVAPLGAPHP----------QHPHGRTHLPFAGPRE----P 0"
## [1] "SPPPEKAG--AAAATLLPHEVAPLGAPHP----------QHPHGRTHLPFAGPRE----P 0"
## [1] "SPPPEKAG--AAAATLLPHEVAPPGALHPPHPHPHPHPHPHPHGRTHLAFAGARE----P 0"
## [1] "SPPPEKAG--AAAATLLPHEVAPPAALHP----------LHAHGRTHLPFAGARE----P 0"
## [1] "SPPPEKAAAAAAAATLLPHEVAPPGALHP----------AHPHGRTHLPFAGARE----P 0"
## [1] "SPPPEKAG-ATAGAPFLAHEVAPPGPLRP----------QHAHSRTHLPRA--------- 0"
## [1] "SPPPEKAS--AAAPTILPPELASPVAPHQHFP-------LPLHPFAHLPFAASSREPGLS 0"
## [1] "GPAEKAAG--LSPAPHLPPPFAAAAAAVP----------------AAEPG---------- 0"
## [1] "SP--EKMA--ACSSALLPELPSPPFQTGP-----------------AMDRG--------L 0"
## [1] " "
## [1] "GNPESSAQEDPSQPGKEAP 41"
## [1] "GNPESSAQEDPSQPGKEAP 41"
## [1] "GNPESSAQEDPSQPGKEAH 41"
## [1] "GNPESSAQEDPSQPGKEAP 41"
## [1] "GNPESSAQEDPSQPAKETL 41"
## [1] "VNPES-SQEDATQPAKDAP 41"
## [1] "SDLEASSQGDSSQPGKENP 41"
## [1] "---EEAEPAAAEEPGAEGP 41"
## [1] "TEELREPLEGLLPPSKEN- 41"
## [1] " "
#Display MSA
#ggmsa::ggmsa(BHLHE41_align,
# start = 50,
# end = 150)
Distance Matrix
#
BHLHE41_dist <- seqinr::dist.alignment(BHLHE41_align_seqinr,
matrix = "identity")
BHLHE41_dist <- round(BHLHE41_dist,3)
BHLHE41_dist
## Homo sapiens Pan troglodytes Canis lupus familiaris
## Pan troglodytes 0.000
## Canis lupus familiaris 0.193 0.193
## Striped hyena 0.193 0.193 0.150
## Bos taurus 0.219 0.219 0.204
## Mus musculus 0.393 0.393 0.396
## Vombatus ursinus 0.437 0.437 0.451
## Gallus gallus 0.544 0.544 0.540
## Xenopus tropicalis 0.581 0.581 0.572
## Striped hyena Bos taurus Mus musculus Vombatus ursinus
## Pan troglodytes
## Canis lupus familiaris
## Striped hyena
## Bos taurus 0.204
## Mus musculus 0.393 0.402
## Vombatus ursinus 0.443 0.455 0.507
## Gallus gallus 0.535 0.548 0.537 0.535
## Xenopus tropicalis 0.577 0.579 0.586 0.575
## Gallus gallus
## Pan troglodytes
## Canis lupus familiaris
## Striped hyena
## Bos taurus
## Mus musculus
## Vombatus ursinus
## Gallus gallus
## Xenopus tropicalis 0.592
Phylogeny
#
tree <- nj(BHLHE41_dist)
plot.phylo(tree, main="Phylogenetic Tree",
use.edge.length = T)
# add label
mtext(text = "BHLHE41 family gene tree - rooted, with branch lenths")
