Introduction

BHLHE41(basic helix-loop-helix family member e41) is a gene that may be involved in cell differentiation and control of the circadian rhythm.

#Resources

https://www.ncbi.nlm.nih.gov/gene/79365 https://www.ncbi.nlm.nih.gov/gene/?Term=ortholog_gene_79365[group]

https://www.uniprot.org/uniprot/?query=BHLHE41&sort=score https://alphafold.ebi.ac.uk/entry/Q9C0J9 http://pfam.xfam.org/protein/Q9C0J9

https://blast.ncbi.nlm.nih.gov/Blast.cgi

# packages 
library(compbio4all)
library(ggmsa)
## Registered S3 methods overwritten by 'ggalt':
##   method                  from   
##   grid.draw.absoluteGrob  ggplot2
##   grobHeight.absoluteGrob ggplot2
##   grobWidth.absoluteGrob  ggplot2
##   grobX.absoluteGrob      ggplot2
##   grobY.absoluteGrob      ggplot2
library(rentrez)
library(seqinr)
library(ape)
## 
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
## 
##     as.alignment, consensus
library(msa)
## Loading required package: Biostrings
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: XVector
## Loading required package: GenomeInfoDb
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
## 
##     complement
## The following object is masked from 'package:seqinr':
## 
##     translate
## The following object is masked from 'package:base':
## 
##     strsplit
library(Biostrings)
library(BiocManager)
## Bioconductor version '3.13' is out-of-date; the current release version '3.14'
##   is available with R version '4.1'; see https://bioconductor.org/install
## 
## Attaching package: 'BiocManager'
## The following object is masked from 'package:msa':
## 
##     version
library(drawProteins)
library(ggplot2)

Accession Numbers

refseq_accession <- c("NP_110389.1","NP_077789.1","NP_001002973.1", "XP_520805.2","NP_001027504.1","XP_024847914.1", "XP_015146971.2","XP_027694693.1", "XP_039096736.1", "ENSP00000242728")
uniprot_accession <- c("Q9C0J9","Q99PV5", "Q7YQC9","H2Q5M4","Q4V788",NA, NA,NA,NA,NA)
pdb_accession <- c(NA,NA,NA,NA,NA,NA,NA,NA,NA,NA)
com_name <- c("Human", "House Mouse", "Dog", "Chimpanzee", "Tropical Clawed Frog", "Cattle", "Chicken","Common Wombat", "Hyaena hyaena", "Neanderthal")
sci_name <- c("Homo sapiens", "Mus musculus", "Canis lupus familiaris", "Pan troglodytes", "Xenopus tropicalis", "Bos taurus", "Gallus gallus","Vombatus ursinus", "Striped hyena", "Homo neanderthalensis")
BHLHE41_table <- data.frame(refseq_accession, uniprot_accession,
                            pdb_accession, com_name,sci_name)
pander::pander(BHLHE41_table)
Table continues below
refseq_accession uniprot_accession pdb_accession com_name
NP_110389.1 Q9C0J9 NA Human
NP_077789.1 Q99PV5 NA House Mouse
NP_001002973.1 Q7YQC9 NA Dog
XP_520805.2 H2Q5M4 NA Chimpanzee
NP_001027504.1 Q4V788 NA Tropical Clawed Frog
XP_024847914.1 NA NA Cattle
XP_015146971.2 NA NA Chicken
XP_027694693.1 NA NA Common Wombat
XP_039096736.1 NA NA Hyaena hyaena
ENSP00000242728 NA NA Neanderthal
sci_name
Homo sapiens
Mus musculus
Canis lupus familiaris
Pan troglodytes
Xenopus tropicalis
Bos taurus
Gallus gallus
Vombatus ursinus
Striped hyena
Homo neanderthalensis

Data Preparation

# 

BHLHE41_list <- compbio4all::entrez_fetch_list(db = "protein", 
                          id = BHLHE41_table$refseq_accession[1:9], 
                          rettype = "fasta")

for(i in 1:length(BHLHE41_list)){
  BHLHE41_list[[i]] <- compbio4all::fasta_cleaner(BHLHE41_list[[i]], parse = F)
}

General Protein Information

#Protein Diagram
Q9C0J9_json  <- drawProteins::get_features("Q9C0J9")
## [1] "Download has worked"
my_prot_df <- drawProteins::feature_to_dataframe(Q9C0J9_json)
my_canvas <- draw_canvas(my_prot_df)  
my_canvas <- draw_chains(my_canvas, my_prot_df, 
                         label_size = 2.5)
my_canvas <- draw_domains(my_canvas, my_prot_df)
my_canvas <- draw_recept_dom(my_canvas, my_prot_df)
my_canvas <- draw_regions(my_canvas, my_prot_df)
my_canvas <- draw_folding(my_canvas, my_prot_df)
my_canvas <- draw_repeat(my_canvas, my_prot_df)
my_canvas <- draw_motif(my_canvas, my_prot_df)
my_canvas <- draw_phospho(my_canvas, my_prot_df)

my_canvas

#Dot Plot

# set up 2 x 2 grid, make margins thinner
par(mfrow = c(2,2), 
    mar = c(0,0,2,2))

NP_110389.1 <- compbio4all::fasta_cleaner(BHLHE41_list[1], parse = TRUE)

# plot 1: BHLHE41 - Defaults
dotPlot(NP_110389.1, 
        NP_110389.1, 
        wsize = 1, 
        nmatch = 1, 
        main = "BHLHE41 Defaults")

# plot 2 BHLHE41 - size = 5, nmatch = 1
dotPlot(NP_110389.1, NP_110389.1, 
        wsize = 5, 
        nmatch = 1, 
        main = "BHLHE41 - size = 5, nmatch = 1")

# plot 3: BHLHE41 - size = 10, nmatch = 5
dotPlot(NP_110389.1, NP_110389.1, 
        wsize = 10, 
        nmatch = 5, 
        main = "BHLHE41 - size = 10, nmatch = 5")

# plot 4: BHLHE41 - size = 20, nmatch = 5
dotPlot(NP_110389.1, NP_110389.1, 
        wsize = 20,
        nmatch =5,
        main = "BHLHE41 - size = 20, nmatch = 5")

par(mfrow = c(1,1), 
    mar = c(4,4,4,4))

# Best Plot: BHLHE41 - size = 10, nmatch = 5
dotPlot(NP_110389.1[250:450], NP_110389.1[250:450], 
        wsize = 10, 
        nmatch = 5, 
        main = "BHLHE41 - size = 10, nmatch = 5")

#Protein properties compiled from databases

Properties <- c("Major Features","Subcellular Location", "AlphaFold Secondary Structure" )
Information <- c("HLH and Hairy_orange domains along with multiple disorder and low_complexity domains","Nucleus", "Alpha helixes can be seen as well as many disordered sections")
Database <- c("Pfam","Uniprot", "AlphaFold")
prop_df <- data.frame(Properties,Information,Database)
pander::pander(prop_df)
Properties Information Database
Major Features HLH and Hairy_orange domains along with multiple disorder and low_complexity domains Pfam
Subcellular Location Nucleus Uniprot
AlphaFold Secondary Structure Alpha helixes can be seen as well as many disordered sections AlphaFold

Protein feature prediction

#Secondary Structure prediction

aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
            "L","K","M","F","P","S","T","W","Y","V")
alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91, 
           221, 249, 48, 123, 82, 122, 119, 33, 63, 167)
beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120, 
          177, 115, 16, 85, 127, 341, 253, 44, 110, 229)
a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
              110, 112, 25, 52, 71, 126, 117, 30, 108, 123)
a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239, 
             339, 321, 91, 158, 188, 327, 238, 72, 130, 378)

alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)

aa.prop <- data.frame(alpha.prop,
                      beta.prop,
                      a.plus.b.prop,
                      a.div.b)

row.names(aa.prop) <- aa.1.1



NP_110389.1 <- compbio4all::fasta_cleaner(BHLHE41_list[1], parse = TRUE)
NP_110389.1.freq.table <- table(NP_110389.1)/length(NP_110389.1)

#convert table to vector
table_to_vector <- function(table_x){
  table_names <- attr(table_x, "dimnames")[[1]]
  table_vect <- as.vector(table_x)
  names(table_vect) <- table_names
  return(table_vect)
}

BHLHE41.human.aa.freq <- table_to_vector(NP_110389.1.freq.table)

#removing Us
aa.names <- names(BHLHE41.human.aa.freq)
BHLHE41.human.aa.freq
##           A           C           D           E           F           G 
## 0.170124481 0.018672199 0.041493776 0.060165975 0.029045643 0.080912863 
##           H           I           K           L           M           N 
## 0.031120332 0.024896266 0.053941909 0.116182573 0.008298755 0.010373444 
##           P           Q           R           S           T           V 
## 0.109958506 0.049792531 0.045643154 0.060165975 0.039419087 0.026970954 
##           W           Y 
## 0.002074689 0.020746888
aa.prop$BHLHE41.human.aa.freq <- BHLHE41.human.aa.freq
pander::pander(aa.prop)
  alpha.prop beta.prop a.plus.b.prop a.div.b BHLHE41.human.aa.freq
A 0.1165 0.07313 0.09264 0.08331 0.1701
R 0.02166 0.02414 0.04129 0.03369 0.01867
N 0.03964 0.05007 0.06353 0.04223 0.04149
D 0.06661 0.04359 0.05876 0.05631 0.06017
C 0.008991 0.02702 0.03917 0.01454 0.02905
Q 0.02738 0.04395 0.03917 0.02631 0.08091
E 0.05476 0.03098 0.04553 0.05931 0.03112
G 0.08051 0.107 0.09052 0.08701 0.0249
H 0.04536 0.01765 0.01747 0.02469 0.05394
I 0.03719 0.04323 0.04923 0.05516 0.1162
L 0.09031 0.06376 0.05823 0.07824 0.008299
K 0.1018 0.04143 0.05929 0.07408 0.01037
M 0.01962 0.005764 0.01323 0.021 0.11
F 0.05027 0.03062 0.02753 0.03646 0.04979
P 0.03351 0.04575 0.03759 0.04339 0.04564
S 0.04986 0.1228 0.0667 0.07547 0.06017
T 0.04863 0.09114 0.06194 0.05493 0.03942
W 0.01349 0.01585 0.01588 0.01662 0.02697
Y 0.02575 0.03963 0.05717 0.03 0.002075
V 0.06825 0.08249 0.06511 0.08724 0.02075
chou_cor <- function(x,y){
  numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)
}

chou_cosine <- function(z.1, z.2){
  z.1.abs <- sqrt(sum(z.1^2))
  z.2.abs <- sqrt(sum(z.2^2))
  my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs)
  return(my.cosine)
}


#calculate correlation
corr.alpha <- chou_cor(aa.prop[,5], aa.prop[,1])
corr.beta  <- chou_cor(aa.prop[,5], aa.prop[,2])
corr.apb   <- chou_cor(aa.prop[,5], aa.prop[,3])
corr.adb   <- chou_cor(aa.prop[,5], aa.prop[,4])

#calculate cosine similarity
cos.alpha <- chou_cosine(aa.prop[,5], aa.prop[,1])
cos.beta  <- chou_cosine(aa.prop[,5], aa.prop[,2])
cos.apb   <- chou_cosine(aa.prop[,5], aa.prop[,3])
cos.adb   <- chou_cosine(aa.prop[,5], aa.prop[,4])

#calculate distance
aa.prop.flipped <- t(aa.prop)
round(aa.prop.flipped,2)
##                          A    R    N    D    C    Q    E    G    H    I    L
## alpha.prop            0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09
## beta.prop             0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06
## a.plus.b.prop         0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06
## a.div.b               0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08
## BHLHE41.human.aa.freq 0.17 0.02 0.04 0.06 0.03 0.08 0.03 0.02 0.05 0.12 0.01
##                          K    M    F    P    S    T    W    Y    V
## alpha.prop            0.10 0.02 0.05 0.03 0.05 0.05 0.01 0.03 0.07
## beta.prop             0.04 0.01 0.03 0.05 0.12 0.09 0.02 0.04 0.08
## a.plus.b.prop         0.06 0.01 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## a.div.b               0.07 0.02 0.04 0.04 0.08 0.05 0.02 0.03 0.09
## BHLHE41.human.aa.freq 0.01 0.11 0.05 0.05 0.06 0.04 0.03 0.00 0.02
dist(aa.prop.flipped, method = "euclidean")
##                       alpha.prop  beta.prop a.plus.b.prop    a.div.b
## beta.prop             0.13342098                                    
## a.plus.b.prop         0.09281824 0.08289406                         
## a.div.b               0.06699039 0.08659174    0.06175113           
## BHLHE41.human.aa.freq 0.20674190 0.22748500    0.19913273 0.20743674
dist.alpha <- dist((aa.prop.flipped[c(1,5),]),  method = "euclidean")
dist.beta  <- dist((aa.prop.flipped[c(2,5),]),  method = "euclidean")
dist.apb   <- dist((aa.prop.flipped[c(3,5),]),  method = "euclidean")
dist.adb  <- dist((aa.prop.flipped[c(4,5),]), method = "euclidean")

fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")

# data
corr.sim <- round(c(corr.alpha,corr.beta,corr.apb,corr.adb),5)
cosine.sim <- round(c(cos.alpha,cos.beta,cos.apb,cos.adb),5)
Euclidean.dist <- round(c(dist.alpha,dist.beta,dist.apb,dist.adb),5)

# summary
sim.sum <- c("","","most.sim","")
dist.sum <- c("","","min.dist","")

df <- data.frame(fold.type,
           corr.sim ,
           cosine.sim ,
           Euclidean.dist ,
           sim.sum ,
           dist.sum )
pander::pander(df)
fold.type corr.sim cosine.sim Euclidean.dist sim.sum dist.sum
alpha 0.7192 0.7192 0.2067
beta 0.6619 0.6619 0.2275
alpha plus beta 0.7319 0.7319 0.1991 most.sim min.dist
alpha/beta 0.7106 0.7106 0.2074

Percent Identity Comparisons (PID)

#PID table
human <-compbio4all::entrez_fetch_list(db = "protein", 
                          id = BHLHE41_table$refseq_accession[1], 
                          rettype = "fasta")
human <- fasta_cleaner(human,  parse = F)

chimp <-compbio4all::entrez_fetch_list(db = "protein", 
                          id = BHLHE41_table$refseq_accession[4], 
                          rettype = "fasta")
chimp <- fasta_cleaner(chimp,  parse = F)

mouse <-compbio4all::entrez_fetch_list(db = "protein", 
                          id = BHLHE41_table$refseq_accession[2], 
                          rettype = "fasta")
mouse <- fasta_cleaner(mouse,  parse = F)
  
dog <-compbio4all::entrez_fetch_list(db = "protein", 
                          id = BHLHE41_table$refseq_accession[3], 
                          rettype = "fasta")
dog <- fasta_cleaner(dog,  parse = F)


align.human.vs.chimp <- pairwiseAlignment(
                  human,
                  chimp)
align.human.vs.mouse  <-Biostrings::pairwiseAlignment(
                  human,
                  mouse)

align.human.vs.dog<- Biostrings::pairwiseAlignment(
                  human,
                  dog)

align.chimp.vs.mouse  <-Biostrings::pairwiseAlignment(
                  chimp,
                  mouse)

align.chimp.vs.dog<- Biostrings::pairwiseAlignment(
                  chimp,
                  dog)

align.mouse.vs.dog<- Biostrings::pairwiseAlignment(
                  mouse,
                  dog)


pids <- c(100,                  NA,     NA,     NA,
          pid(align.human.vs.chimp),100,     NA,     NA,
          pid(align.human.vs.mouse), pid(align.chimp.vs.mouse),      100,     NA,
          pid(align.human.vs.dog), pid(align.chimp.vs.dog), pid(align.mouse.vs.dog), 100)

mat <- matrix(pids, nrow = 4, byrow = T)
row.names(mat) <- c("Homo","Pan","Mus","Canis")   
colnames(mat) <- c("Homo","Pan","Mus","Canis")   
pander::pander(mat)  
  Homo Pan Mus Canis
Homo 100 NA NA NA
Pan 100 100 NA NA
Mus 72.73 72.73 100 NA
Canis 90.47 90.47 68.09 100
#PID method comparisons (chimps and humans)

Method <- c("PID1","PID2","PID3","PID4")
PID <- c(pid(align.human.vs.chimp, type = "PID1"), pid(align.human.vs.chimp, type = "PID2"),pid(align.human.vs.chimp, type = "PID4"),pid(align.human.vs.chimp, type = "PID4"))
denominator <- c("(aligned positions + internal gap positions)","(aligned positions)", "(length shorter sequence)", "(average length of the two sequences)")

pid.df <- data.frame(Method, PID, denominator)

pander::pander(pid.df)
Method PID denominator
PID1 100 (aligned positions + internal gap positions)
PID2 100 (aligned positions)
PID3 100 (length shorter sequence)
PID4 100 (average length of the two sequences)

Multiple sequence alignment

#Build MSA
names(BHLHE41_list) <- BHLHE41_table$sci_name[1:9]
BHLHE41_vector <- rep(NA, length(BHLHE41_list))

for(i in 1:length(BHLHE41_vector)){
  BHLHE41_vector[i] <- BHLHE41_list[[i]]
}

names(BHLHE41_vector) <- names(BHLHE41_list)


BHLHE41_vector_ss <- Biostrings::AAStringSet(BHLHE41_vector)

BHLHE41_align <- msa(BHLHE41_vector_ss,
                     method = "ClustalW")
## use default substitution matrix
BHLHE41_align_seqinr <- msaConvert(BHLHE41_align, type = "seqinr::alignment")

print_msa(alignment = BHLHE41_align_seqinr, 
          chunksize = 60)
## [1] "MDEGIPHLQERQLLEHRDFIGLDYSSLYMCKPKRSMKRDDTKDTYKLPHRLIEKKRRDRI 0"
## [1] "MDEGIPHLQERQLLEHRDFIGLDYSSLYMCKPKRSMKRDDTKDTYKLPHRLIEKKRRDRI 0"
## [1] "MDEGIPHLQERQLLEHRDFIGLDYSSLYMCKPKRSMKRDDSKDTYKLPHRLIEKKRRDRI 0"
## [1] "MDEGIPHLQERQLLEHRDFIGLDYSSLYMCKPKRSMKRDDSKDTYKLPHRLIEKKRRDRI 0"
## [1] "MDEGIPHLQERQLLEHRDFIGLDYPSLYMCKPKRSMKRDDSKDTYKLPHRLIEKKRRDRI 0"
## [1] "MDEGIPHLQERQLLEHRDFIGLDYSSLYMCKPKRSLKRDDTKDTYKLPHRLIEKKRRDRI 0"
## [1] "MDEGISRLQERQLMEHRDFIGLDYSSLYMCKPKRGMKRDESKETYKLPHRLIEKKRRDRI 0"
## [1] "MDEGISRLPERQLLEHRDFLGLEYPALYMCKPKRGVKRDESKETYKLPHRLIEKKRRDRI 0"
## [1] "MEEGIPSLQDRQ-LEPRDFLGMEYPHLFLCKPKRGLKRDESKETYKLPHRLIEKKRRDRI 0"
## [1] " "
## [1] "NECIAQLKDLLPEHLKLTTLGHLEKAVVLELTLKHLKALTALTEQQHQKIIALQNGERSL 0"
## [1] "NECIAQLKDLLPEHLKLTTLGHLEKAVVLELTLKHLKALTALTEQQHQKIIALQNGERSL 0"
## [1] "NECIAQLKDLLPEHLKLTTLGHLEKAVVLELTLKHLKALTALTEQQHQKIIALQNGERSL 0"
## [1] "NECIAQLKDLLPEHLKLTTLGHLEKAVVLELTLKHLKALTALTEQQHQKIIALQNGERSL 0"
## [1] "NECIAQLKDLLPEHLKLTTLGHLEKAVVLELTLKHLKALTALTEQQHQKIIALQNGERSL 0"
## [1] "NECIAQLKDLLPEHLKLTTLGHLEKAVVLELTLKHLKALTALTEQQHQKIIALQNGERSL 0"
## [1] "NECIAQLKDLLPEHLKLTTLGHLEKAVVLELTLKHLKALTALTEQQHQKIIALQNGERSM 0"
## [1] "NECIAQLKDLLPEHLKLTTLGHLEKAVVLELTLKHLKALTALTEQQHQKIVALQSGERSV 0"
## [1] "NECIAQLKDLLPEHLKLTTLGHLEKAVVLELTLKHLKGLTSLTEQQHQKIMALQNGEHAL 0"
## [1] " "
## [1] "KSPIQSDLDAFHSGFQTCAKEVLQYLSRFESWTPREPRCVQLINHLHAVATQFLPTPQLL 0"
## [1] "KSPIQSDLDAFHSGFQTCAKEVLQYLSRFESWTPREPRCVQLINHLHAVATQFLPTPQLL 0"
## [1] "KSPIQSDLDAFHSGFQTCAKEVLQYLSRFESWTPREQRCVQLINHLHAVATQFLPTPQLL 0"
## [1] "KSPIQSDLDAFHSGFQTCAKEVLQYLSRFESWTPREQRCVQLINHLHAVATQFLPTPQLL 0"
## [1] "KSPIQSDLDAFHSGFQTCAKEVLQYLARFESWTPREPRCVQLINHLHAVATQFLPTPQLL 0"
## [1] "KSPVQADLDAFHSGFQTCAKEVLQYLARFESWTPREPRCAQLVSHLHAVAT------QLL 0"
## [1] "KSPIQADLDSFHSGFQTCAKEVMQYLSRFESWTPREQRCAQLINHLHSVSTQFLPSPQLL 0"
## [1] "KSPVQADLDAFHSGFQTCAKEVLQFLSRSESWTPREQRCAQLLGHLHAVSSQFLPGPRLL 0"
## [1] "KSPIQTDLDAFHSGFQTCAKEVLQYLSRFESWTSRDQRCTQLLNHLHTVSSQILSSSQLL 0"
## [1] " "
## [1] "TQ-QVPLSKGTG-APSAAG---SAAAPC---------------------LERAGQKLEPL 0"
## [1] "TQ-QVPLSKGTG-APSAAG---SAAAPC---------------------LERAGQKLEPL 0"
## [1] "TQ-QVPLSKGAG-APSAAAPAGSAAAPC---------------------LERAGQKLEPL 0"
## [1] "TQ-QVPLSKGAG-APPAAAPAGSAAAPC---------------------LERAGQKLEPL 0"
## [1] "TQ-QVPLSKGTG-APTAAAPAGSGAAPC---------------------LERAGQKLEPL 0"
## [1] "TP-QVPSGRGSGRAPCSAG---AAAASG---------------------PER-------V 0"
## [1] "TQ-QVPVSKGNTSSSSSSSTTTTTTTSSSSSSSSSIATPPSSFLSVNASQDRTVQKLEPQ 0"
## [1] "SPPPGPLAKGPSSSSSPPAPLRAPARKP-----------------------------EGQ 0"
## [1] "LQ-SVPGGKGSSASPGRGG----QKAEN-------------------------------Q 0"
## [1] " "
## [1] "AYCVPVIQRTQPSAELAA-ENDTDTDSGYGGEAEARPDREKGKGA------GASRVTIKQ 0"
## [1] "AYCVPVIQRTQPSAELAA-ENDTDTDSGYGGEAEARPDREKGKGA------GASRVTIKQ 0"
## [1] "AHCVPVIQRTQPSAELAA-ENDTDTDSGYGGEAEARPDRDKGKGS------GAGRVTIKQ 0"
## [1] "AHCVPVIQRTQPSAELAA-ENDTDTDSGYGGEAEARPDRERGKGA------AASRVTIKQ 0"
## [1] "AHCVPVIQRTQPSSELAAAENDTDTDSGYGGEAEARPDREKGKGA------GASRVTIKQ 0"
## [1] "ARCVPVIQRTQPGTEPEH---DTDTDSGYGGEAEQ------------------GRAAVKQ 0"
## [1] "TNCVPVIQRTQPSSELAG-ENDTDTDSGYGGEGEARPDREKYHAL------GGSRVTIKQ 0"
## [1] "AHCVPVIQRTH-AAEPGA-ETDTDTDSGYGGEAEARPERGAGPAG------PLPALPVKQ 0"
## [1] "TNCVPVIQRTH-NLELNE--NDTDTDSGYGGEGEKAEGRSDGQSSSGVKSQGGNSLTIKQ 0"
## [1] " "
## [1] "EPPGEDSP-APKRMKLDSRGGGSGGGPG-------------------GGAAAAAAALLGP 0"
## [1] "EPPGEDSP-APKRMKLDSRGGGSGGGPG-------------------GGAAAAAAALLGP 0"
## [1] "EPPGEDSP-APKRMKLDSRGGGGGGGGGGLGGGGGGGLGGGGGGGLGGGAAAAAAALLGP 0"
## [1] "EPPGEDSP-APKRMKLDSRGGGGGGGGG--------------GGGLGGGAAAAAAALLGP 0"
## [1] "EPPGEDSP-APKRMRLDTRGGGGGPG---------------------GGAAAAAAALLGP 0"
## [1] "EPPGDSSP-APKRPKLEARG-----------------------------------ALLGP 0"
## [1] "EPPGEDSP-EPKRMKLDCGESSSPGG-------------------------SAAAALLGP 0"
## [1] "EPAGDEAPPAPKRPRLERGGSPPPG-----------------------------AALPGA 0"
## [1] "EPMEEPSP---KRFKPEFPGRGVTG--------------------------------PNA 0"
## [1] " "
## [1] "DPAAAAALLRPDAALLSSLVAFGGGG-GAPFPQPAA--AAAPFCLPFCFLSPSAAAAYV- 0"
## [1] "DPAAAAALLRPDAALLSSLVAFGGGG-GAPFPQPAA--AAAPFCLPFCFLSPSAAAAYV- 0"
## [1] "DPAAAAALLRPDAALLSSLVAFGGGG-GAPFAQPAA--AAAPFCLPFYFLSPSAAAAYV- 0"
## [1] "DPAAAAALLRPDAALLSSLVAFGGGG-GAPFAQPAA--AAAPFCLPFYFLSPSAAAAYV- 0"
## [1] "DPTAAAALLRPDAALLSSLVAFGGGG-GAPFAQPAA--AAAPFCLPFYFLSPSAAAAYV- 0"
## [1] "EP-----------ALLGSLVALGG---GAPFAQP----AAAPFCLPFYLLSP-SAAAYV- 0"
## [1] "NP--AAALLRPDAALLSSLVAFGGGG-GTPFGQPAAAAAAAPFCLPFYFISPSAAAAYM- 0"
## [1] "ARG-------ADAALLSSLMALGAGGGAAPFGQP-----AAPFCLPFYFISPSAAAAYV- 0"
## [1] "EP-----AVRPDAALLNSLMAFGG----APFGQQ------APFCLPFYFISPSAAAAAAY 0"
## [1] " "
## [1] "QPFLDKSGLEKYLYPA----AAAAPFPLLYPGIPAPAAAAAAAAAAAAAAAAFPCLSSVL 0"
## [1] "QPFLDKSGLEKYLYPA----AAAAPFPLLYPGIPAPAAAAAAAAAAAAAAAAFPCLSSVL 0"
## [1] "QPFLDKSGLEKYLYPA----AAAAPFPLLYPGIPAP-AAAAAAAAAAAAAAAFPCLSSVL 0"
## [1] "QPFLDKSGLEKYLYPA----AAAAPFPLLYPGIPAP-AAAAAAAAAAAAAAAFPCLSSVL 0"
## [1] "QPFLDKSGLEKYLYPA----AAAAPFPLLYPGIPAP-AAAAAAAAAAAAAAAFPCLSSVL 0"
## [1] "QPWLDKSGLDKYLYP-----AAAAPFPLLYPGIPAA--------AAAAAAAAFPCLSSVL 0"
## [1] "QPFLDKSNLEKYLYP-----AAAAPIPLLYPGIPAQ--AAAAAAAAAAAAAAFPCLSSVL 0"
## [1] "QPFLDKGGLEKYLYP-------AAPIPLLYPGIPAQ--AAAAAAAAAAAAASFPCLSSVL 0"
## [1] "MPFLDKGSLEKYLYPAAAAAAAATPIPLLYPGIPGQ------------ASGTFPCLSSVL 0"
## [1] " "
## [1] "SPPPEKAG--AAAATLLPHEVAPLGAPHP----------QHPHGRTHLPFAGPRE----P 0"
## [1] "SPPPEKAG--AAAATLLPHEVAPLGAPHP----------QHPHGRTHLPFAGPRE----P 0"
## [1] "SPPPEKAG--AAAATLLPHEVAPPGALHPPHPHPHPHPHPHPHGRTHLAFAGARE----P 0"
## [1] "SPPPEKAG--AAAATLLPHEVAPPAALHP----------LHAHGRTHLPFAGARE----P 0"
## [1] "SPPPEKAAAAAAAATLLPHEVAPPGALHP----------AHPHGRTHLPFAGARE----P 0"
## [1] "SPPPEKAG-ATAGAPFLAHEVAPPGPLRP----------QHAHSRTHLPRA--------- 0"
## [1] "SPPPEKAS--AAAPTILPPELASPVAPHQHFP-------LPLHPFAHLPFAASSREPGLS 0"
## [1] "GPAEKAAG--LSPAPHLPPPFAAAAAAVP----------------AAEPG---------- 0"
## [1] "SP--EKMA--ACSSALLPELPSPPFQTGP-----------------AMDRG--------L 0"
## [1] " "
## [1] "GNPESSAQEDPSQPGKEAP 41"
## [1] "GNPESSAQEDPSQPGKEAP 41"
## [1] "GNPESSAQEDPSQPGKEAH 41"
## [1] "GNPESSAQEDPSQPGKEAP 41"
## [1] "GNPESSAQEDPSQPAKETL 41"
## [1] "VNPES-SQEDATQPAKDAP 41"
## [1] "SDLEASSQGDSSQPGKENP 41"
## [1] "---EEAEPAAAEEPGAEGP 41"
## [1] "TEELREPLEGLLPPSKEN- 41"
## [1] " "
#Display MSA

#ggmsa::ggmsa(BHLHE41_align,   
#      start = 50, 
#      end = 150) 

Distance Matrix

# 
BHLHE41_dist <- seqinr::dist.alignment(BHLHE41_align_seqinr, 
                                       matrix = "identity")
BHLHE41_dist <- round(BHLHE41_dist,3)
BHLHE41_dist
##                        Homo sapiens Pan troglodytes Canis lupus familiaris
## Pan troglodytes               0.000                                       
## Canis lupus familiaris        0.193           0.193                       
## Striped hyena                 0.193           0.193                  0.150
## Bos taurus                    0.219           0.219                  0.204
## Mus musculus                  0.393           0.393                  0.396
## Vombatus ursinus              0.437           0.437                  0.451
## Gallus gallus                 0.544           0.544                  0.540
## Xenopus tropicalis            0.581           0.581                  0.572
##                        Striped hyena Bos taurus Mus musculus Vombatus ursinus
## Pan troglodytes                                                              
## Canis lupus familiaris                                                       
## Striped hyena                                                                
## Bos taurus                     0.204                                         
## Mus musculus                   0.393      0.402                              
## Vombatus ursinus               0.443      0.455        0.507                 
## Gallus gallus                  0.535      0.548        0.537            0.535
## Xenopus tropicalis             0.577      0.579        0.586            0.575
##                        Gallus gallus
## Pan troglodytes                     
## Canis lupus familiaris              
## Striped hyena                       
## Bos taurus                          
## Mus musculus                        
## Vombatus ursinus                    
## Gallus gallus                       
## Xenopus tropicalis             0.592

Phylogeny

#
tree <- nj(BHLHE41_dist)

plot.phylo(tree, main="Phylogenetic Tree", 
            use.edge.length = T)

# add label
mtext(text = "BHLHE41 family gene tree - rooted, with branch lenths")