This code is designed to summarize information about the HESX1 gene or also known as HESX homeobox 1. The HESX1 gene encodes a homeobox protein which is a large family of transcription factors known for being a master control gene, a single protein can regulate expression of many target genes. This HESX1 gene is located in the pituitary gland and the forebrain. Mutations in this gene are associated with: septooptic dyslexia, a disorder of early brain development, HESX1-related growth hormone deficiency, and combined pituary hormone deficiency.
Resources / References:
RefSeq Gene: https://www.ncbi.nlm.nih.gov/protein/NP_003856.1
RefSeq Homologene:https://www.ncbi.nlm.nih.gov/homologene/?term=HESX1
UniProt:https://www.uniprot.org/uniprot/Q9UBX0
PFAM:https://pfam.xfam.org/protein/Q9UBX0
AlphaFold: https://alphafold.ebi.ac.uk/entry/A2T777
The Neanderthal Genome was not able to be used because the accessions for the sequences were not available.
There was no information available in the PDB.
## CRAN packages
library(rentrez) ## used to look through the NCBI's database and download genetic data, and is accessed by using entrez.
library(seqinr) ## Used to explore data analysis and data visualization for biological sequences
library(ape) ##This function uses functions to read and write data for manipulating phylogenetic tree
##
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
##
## as.alignment, consensus
library(pander) ## an R package used to Pandoc's markdown.
library(ggplot2) ##used to map variables and is a system for declaratively creating graphics
## GitHub Packages
library(compbio4all) ##package for helpgul datasets and functions to learn comp bio.
library(ggmsa) ## made for visualization and annotation of multiple sequence alignments, uses the ggplot2 package to plot MSA sequences.
## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
## ggmsa v1.1.3 Document: http://yulab-smu.top/ggmsa/
##
## If you use ggmsa in published research, please cite: DOI: 10.18129/B9.bioc.ggmsa
## BioConductor Packages
library(msa) ## the multiple sequence alignment package which contains msaClustalW().
## Loading required package: Biostrings
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Warning: package 'S4Vectors' was built under R version 4.1.2
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: XVector
## Loading required package: GenomeInfoDb
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
##
## complement
## The following object is masked from 'package:seqinr':
##
## translate
## The following object is masked from 'package:base':
##
## strsplit
library(drawProteins) ##Uses UniProt data to generate visualization of proteins.
library(Biostrings) ##Helps manipulate DNA and RNA sequences easlily.
library(HGNChelper) ##Contains functions for identifying and correcting the HGNC human gene symbols.
These Accession numbers were obtained from RefSeq, RefSeq Homogene, and Uniprot. There was no sequence information for the HESX1 gene available for the Neaderthal genome. This gene also does not appear outside of vertebrates, which was found through a BLAST search.
HGNChelper::checkGeneSymbols(x = c("HESX1"))
## Maps last updated on: Thu Oct 24 12:31:05 2019
## x Approved Suggested.Symbol
## 1 HESX1 TRUE HESX1
HESX1_table<-c("NP_001362987.1", "Q9UBX0", "NA", "Homo Sapien", "Human", "HESX1",
"NP_034550.2", "Q61658", "NA", "Mus Musculus", "Mouse", "HESX1",
"NP_001075039.1", "A2T777", "NA", "Pan troglodytes", "Chimpanzee", "HESX1",
"NP_001102576.1", "D4AEG9", "NA", "Rattus norvegicus", "Rat", "HESX1",
"NP_001016712.1", "Q91898", "NA", "Xenopus tropicalis", "Frog","HESX1",
"XP_038282473.1", "E2R1U3", "NA", "Canis lupus familiaris", "Dog", "HESX1",
"XP_020924666.1", "I3LKM5", "NA", "Sus scrofa", "Pig", "HESX1",
"XP_015148451.2", "F1NTN1", "NA", "Gallus Gallus", "Chicken", "HESX1",
"XP_001490373.2", "F7DHL2", "NA", "Equus Caballas", "Horse", "HESX1",
"XP_017922675.1", "A0A452FUH8", "NA", "Capra hircus", "Goat", "HESX1")
Convert vector information into table
NCBI.Protein.Accession<-c("NP_001362987.1","NP_034550.2","NP_001075039.1", "NP_001102576.1", "NP_001016712.1", "XP_038282473.1","XP_020924666.1","XP_015148451.2","XP_001490373.2","XP_017922675.1" )
UniProt.ID<-c("Q9UBX0", "Q61658", "A2T777","D4AEG9", "Q91898", "E2R1U3","I3LKM5","F1NTN1","F7DHL2","A0A452FUH8" )
PDB<-c("NA")
Species.Name<-c("Homo Sapien", "Mus Musculus", "Pan troglodytes","Rattus norvegicus","Xenopus tropicalis", "Canis lupus familiaris","Sus scrofa","Gallus Gallus","Equus Caballas","Capra hircus" )
Common.Name<-c("Human","Mouse","Chimpanzee","Rat","Frog","Dog","Pig","Chicken","Horse","Goat")
Gene.Name<-c("HESX1")
HESX1_table<-data.frame(NCBI.Protein.Accession=NCBI.Protein.Accession, UniProt.ID=UniProt.ID, PDB=PDB, Species.Name=Species.Name, Common.Name=Common.Name, Gene.Name=Gene.Name)
pander::pander(HESX1_table)
NCBI.Protein.Accession | UniProt.ID | PDB | Species.Name |
---|---|---|---|
NP_001362987.1 | Q9UBX0 | NA | Homo Sapien |
NP_034550.2 | Q61658 | NA | Mus Musculus |
NP_001075039.1 | A2T777 | NA | Pan troglodytes |
NP_001102576.1 | D4AEG9 | NA | Rattus norvegicus |
NP_001016712.1 | Q91898 | NA | Xenopus tropicalis |
XP_038282473.1 | E2R1U3 | NA | Canis lupus familiaris |
XP_020924666.1 | I3LKM5 | NA | Sus scrofa |
XP_015148451.2 | F1NTN1 | NA | Gallus Gallus |
XP_001490373.2 | F7DHL2 | NA | Equus Caballas |
XP_017922675.1 | A0A452FUH8 | NA | Capra hircus |
Common.Name | Gene.Name |
---|---|
Human | HESX1 |
Mouse | HESX1 |
Chimpanzee | HESX1 |
Rat | HESX1 |
Frog | HESX1 |
Dog | HESX1 |
Pig | HESX1 |
Chicken | HESX1 |
Horse | HESX1 |
Goat | HESX1 |
# Human HESX1 (H.Sapien)
hHESX1 <- entrez_fetch_list(db = "protein",
id = " NP_001362987.1",
rettype = "fasta")
# Mouse HESX1 (M.Muscula)
mHESX1 <- entrez_fetch_list(db = "protein",
id = " NP_034550.2",
rettype = "fasta")
# Chimp HESX1 (P. Troglodytes)
cHESX1 <- entrez_fetch_list(db = "protein",
id = " NP_001075039.1",
rettype = "fasta")
# Rat HESX1 (R. Norvegicus)
rHESX1 <- entrez_fetch_list(db = "protein",
id = " NP_001102576.1",
rettype = "fasta")
# Frog HESX1 (X.Tropicalis)
fHESX1 <- entrez_fetch_list(db = "protein",
id = " NP_001016712.1",
rettype = "fasta")
# Dog HESX1 (C. Lupis)
dHESX1 <- entrez_fetch_list(db = "protein",
id = " XP_038282473.1",
rettype = "fasta")
# Pig HESX1 (C. Lupis)
pHESX1 <- entrez_fetch_list(db = "protein",
id = " XP_020924666.1",
rettype = "fasta")
# Chicken HESX1 (C. Lupis)
chHESX1 <- entrez_fetch_list(db = "protein",
id = " XP_015148451.2",
rettype = "fasta")
# Horse HESX1 (C. Lupis)
hoHESX1 <- entrez_fetch_list(db = "protein",
id = " XP_001490373.2",
rettype = "fasta")
# Goat HESX1 (C. Lupis)
gHESX1 <- entrez_fetch_list(db = "protein",
id = " XP_017922675.1",
rettype = "fasta")
Number of fasta files obtained
HESX1_List<-c(hHESX1,mHESX1,cHESX1,rHESX1,fHESX1,dHESX1,pHESX1,chHESX1,hoHESX1,gHESX1)
length(HESX1_List)
## [1] 10
The first entry
HESX1_List[1]
## $` NP_001362987.1`
## [1] ">NP_001362987.1 homeobox expressed in ES cells 1 [Homo sapiens]\nMSPSLQEGAQLGENKPSTCSFSIERILGLDQKKDCVPLMKPHRPWADTCSSSGKDGNLCLHVPNPPSGIS\nFPSVVDHPMPEERASKYENYFSASERLSLKRELSWYRGRRPRTAFTQNQIEVLENVFRVNCYPGIDIRED\nLAQKLNLEEDRIQIWFQNRRAKLKRSHRESQFLMAKKNFNTNLLE\n\n"
Initial Data Cleaning Remove the Fasta Header
for(i in 1:length(HESX1_List)){
HESX1_List[[i]] <- compbio4all::fasta_cleaner(HESX1_List[[i]], parse = F)
}
hHESX1<- fasta_cleaner(hHESX1, parse = F)
mHESX1<- fasta_cleaner(mHESX1, parse = F)
cHESX1<- fasta_cleaner(cHESX1, parse = F)
rHESX1<- fasta_cleaner(rHESX1, parse = F)
fHESX1<- fasta_cleaner(fHESX1, parse = F)
dHESX1<- fasta_cleaner(dHESX1, parse = F)
pHESX1<- fasta_cleaner(dHESX1, parse = F)
chHESX1<- fasta_cleaner(dHESX1, parse = F)
hoHESX1<- fasta_cleaner(dHESX1, parse = F)
gHESX1<- fasta_cleaner(dHESX1, parse = F)
General Protein information ## Protein Diagram
Q9UBX0_human<-drawProteins::get_features("Q9UBX0")
## [1] "Download has worked"
is(Q9UBX0_human)
## [1] "list" "vector" "list_OR_List" "vector_OR_Vector"
## [5] "vector_OR_factor"
my_prot_df <- drawProteins::feature_to_dataframe(Q9UBX0_human)
is(my_prot_df)
## [1] "data.frame" "list" "oldClass" "vector"
## [5] "list_OR_List" "vector_OR_Vector" "vector_OR_factor"
my_prot_df[,-2]
## type begin end length accession entryName taxid order
## featuresTemp CHAIN 1 185 184 Q9UBX0 HESX1_HUMAN 9606 1
## featuresTemp.1 DNA_BIND 108 167 59 Q9UBX0 HESX1_HUMAN 9606 1
## featuresTemp.2 VARIANT 6 6 0 Q9UBX0 HESX1_HUMAN 9606 1
## featuresTemp.3 VARIANT 26 26 0 Q9UBX0 HESX1_HUMAN 9606 1
## featuresTemp.4 VARIANT 109 109 0 Q9UBX0 HESX1_HUMAN 9606 1
## featuresTemp.5 VARIANT 125 125 0 Q9UBX0 HESX1_HUMAN 9606 1
## featuresTemp.6 VARIANT 149 149 0 Q9UBX0 HESX1_HUMAN 9606 1
## featuresTemp.7 VARIANT 160 160 0 Q9UBX0 HESX1_HUMAN 9606 1
## featuresTemp.8 VARIANT 170 170 0 Q9UBX0 HESX1_HUMAN 9606 1
## featuresTemp.9 VARIANT 181 181 0 Q9UBX0 HESX1_HUMAN 9606 1
## featuresTemp.10 HELIX 117 127 10 Q9UBX0 HESX1_HUMAN 9606 1
## featuresTemp.11 HELIX 135 145 10 Q9UBX0 HESX1_HUMAN 9606 1
## featuresTemp.12 HELIX 149 164 15 Q9UBX0 HESX1_HUMAN 9606 1
my_prot_df <- drawProteins::feature_to_dataframe(Q9UBX0_human)
my_canvasHuman <- draw_canvas(my_prot_df)
my_canvasHuman<- draw_domains(my_canvasHuman, my_prot_df)
my_canvasHuman <- draw_chains(my_canvasHuman, my_prot_df, label_size = 2.5)
my_canvasHuman <- draw_regions(my_canvasHuman, my_prot_df)
my_canvasHuman <- draw_motif(my_canvasHuman, my_prot_df)
my_canvasHuman<- draw_phospho(my_canvasHuman, my_prot_df)
my_canvasHuman <- draw_repeat(my_canvasHuman, my_prot_df)
my_canvasHuman <- draw_recept_dom(my_canvasHuman, my_prot_df)
my_canvasHuman <- draw_folding(my_canvasHuman,my_prot_df)
my_canvasHuman
The resulting data show that drawProteins() is only able to extract a limited amount of information suggesting that the protein does not have much regional inofrmation and only contains folds.
prepare output
hHESX1 <- entrez_fetch_list(db = "protein",
id = " NP_001362987.1",
rettype = "fasta")
hHESX1_vector <- fasta_cleaner(hHESX1)
hHESX1_string <- fasta_cleaner(hHESX1_vector,
parse = F)
str(hHESX1_string)
## chr "M"
str(hHESX1_vector)
## chr [1:185] "M" "S" "P" "S" "L" "Q" "E" "G" "A" "Q" "L" "G" "E" "N" "K" ...
str(hHESX1)
## List of 1
## $ NP_001362987.1: chr ">NP_001362987.1 homeobox expressed in ES cells 1 [Homo sapiens]\nMSPSLQEGAQLGENKPSTCSFSIERILGLDQKKDCVPLMKPHRPWA"| __truncated__
# set up 2 x 2 grid
par(mfrow = c(2,2),
mar = c(0,0,2,1))
dotPlot(hHESX1_vector, hHESX1_vector,
wsize = 15,
nmatch = 2,
main = "wsize = 15, nmatch = 2")
dotPlot(hHESX1_vector, hHESX1_vector,
wsize = 10,
nmatch = 1,
main = "wsize = 10, nmatch = 1")
dotPlot(hHESX1_vector, hHESX1_vector,
wsize = 10,
nmatch = 5,
main = "wsize = 10, nmatch = 5")
dotPlot(hHESX1_vector, hHESX1_vector,
wsize = 20,
nmatch = 5,
main = "wsize = 20, nmatch = 5")
par(mfrow = c(1,1),
mar = c(4,4,4,4))
From these graphs, the first dotplot seems to have display the most information.
par(mfrow = c(1,1),
mar = c(4,4,4,4))
dotPlot(hHESX1_vector, hHESX1_vector,
wsize = 15,
nmatch = 2,
main = "wsize = 15, nmatch = 2")
##Pfam, Transmembrane region from 109 to 165.
human_HESX1.domain <-c("Homeodomain", "https://pfam.xfam.org/protein/Q9UBX0")
##DisProt: NA, no information found
##RepeatDb: NA, no information found
##UniProt subcellular information not found
##PDB secondary structural location : No PDB entry available
## The HESX1 homolog is listed in Alphafold (https://alphafold.ebi.ac.uk/entry/A2T777). The predicted structure is primarily alpha helices and disordered regions.
Source<-c("PFAM", "UniProt", "AlphaFold")
Protein.Property<-c("There is only one domain present, the Homeobox. This domain starts at 109 and ends at 165","HESX1 gene is required for the normal development of eyes, forebrain, and the pituatary gland. HESX1 is located in the Nucleus.","The protein structure model has a very high confidence, most of the structure is drawn with over 50% confidence with some alpha helices drawn with more than 90% confidence" )
link<-c("https://pfam.xfam.org/protein/Q9UBX0","https://www.uniprot.org/uniprot/Q9UBX0","https://alphafold.ebi.ac.uk/entry/A2T777" )
Source_table<-data.frame(Source=Source, Protein.Property=Protein.Property, link=link)
pander(Source_table)
Source | Protein.Property |
---|---|
PFAM | There is only one domain present, the Homeobox. This domain starts at 109 and ends at 165 |
UniProt | HESX1 gene is required for the normal development of eyes, forebrain, and the pituatary gland. HESX1 is located in the Nucleus. |
AlphaFold | The protein structure model has a very high confidence, most of the structure is drawn with over 50% confidence with some alpha helices drawn with more than 90% confidence |
link |
---|
https://pfam.xfam.org/protein/Q9UBX0 |
https://www.uniprot.org/uniprot/Q9UBX0 |
https://alphafold.ebi.ac.uk/entry/A2T777 |
aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
"L","K","M","F","P","S","T","W","Y","V")
aa.1.2 <- c("A","R","N","D","C","Q","E","G","H","I",
"L","K","M","F","P","S","T","W","Y","V")
length(aa.1.1) == length(aa.1.2)
## [1] TRUE
aa.1.1 == aa.1.2
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE
unique(aa.1.1)
## [1] "A" "R" "N" "D" "C" "Q" "E" "G" "H" "I" "L" "K" "M" "F" "P" "S" "T" "W" "Y"
## [20] "V"
length(unique(aa.1.1)) == length(unique(aa.1.2))
## [1] TRUE
any(c(aa.1.1 == aa.1.2) == FALSE)
## [1] FALSE
# alpha proteins
alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91,
221, 249, 48, 123, 82, 122, 119, 33, 63, 167)
# check against chou's total
sum(alpha) == 2447
## [1] TRUE
beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120,
177, 115, 16, 85, 127, 341, 253, 44, 110, 229)
# check against chou's total
sum(beta) == 2776
## [1] TRUE
a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
110, 112, 25, 52, 71, 126, 117, 30, 108, 123)
sum(a.plus.b) == 1889
## [1] TRUE
a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239,
339, 321, 91, 158, 188, 327, 238, 72, 130, 378)
sum(a.div.b) == 4333
## [1] TRUE
pander(data.frame(aa.1.1, alpha, beta, a.plus.b, a.div.b))
aa.1.1 | alpha | beta | a.plus.b | a.div.b |
---|---|---|---|---|
A | 285 | 203 | 175 | 361 |
R | 53 | 67 | 78 | 146 |
N | 97 | 139 | 120 | 183 |
D | 163 | 121 | 111 | 244 |
C | 22 | 75 | 74 | 63 |
Q | 67 | 122 | 74 | 114 |
E | 134 | 86 | 86 | 257 |
G | 197 | 297 | 171 | 377 |
H | 111 | 49 | 33 | 107 |
I | 91 | 120 | 93 | 239 |
L | 221 | 177 | 110 | 339 |
K | 249 | 115 | 112 | 321 |
M | 48 | 16 | 25 | 91 |
F | 123 | 85 | 52 | 158 |
P | 82 | 127 | 71 | 188 |
S | 122 | 341 | 126 | 327 |
T | 119 | 253 | 117 | 238 |
W | 33 | 44 | 30 | 72 |
Y | 63 | 110 | 108 | 130 |
V | 167 | 229 | 123 | 378 |
alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)
aa.prop <- data.frame(alpha.prop,
beta.prop,
a.plus.b.prop,
a.div.b)
#row labels
row.names(aa.prop) <- aa.1.1
pander::pander(aa.prop)
alpha.prop | beta.prop | a.plus.b.prop | a.div.b | |
---|---|---|---|---|
A | 0.1165 | 0.07313 | 0.09264 | 0.08331 |
R | 0.02166 | 0.02414 | 0.04129 | 0.03369 |
N | 0.03964 | 0.05007 | 0.06353 | 0.04223 |
D | 0.06661 | 0.04359 | 0.05876 | 0.05631 |
C | 0.008991 | 0.02702 | 0.03917 | 0.01454 |
Q | 0.02738 | 0.04395 | 0.03917 | 0.02631 |
E | 0.05476 | 0.03098 | 0.04553 | 0.05931 |
G | 0.08051 | 0.107 | 0.09052 | 0.08701 |
H | 0.04536 | 0.01765 | 0.01747 | 0.02469 |
I | 0.03719 | 0.04323 | 0.04923 | 0.05516 |
L | 0.09031 | 0.06376 | 0.05823 | 0.07824 |
K | 0.1018 | 0.04143 | 0.05929 | 0.07408 |
M | 0.01962 | 0.005764 | 0.01323 | 0.021 |
F | 0.05027 | 0.03062 | 0.02753 | 0.03646 |
P | 0.03351 | 0.04575 | 0.03759 | 0.04339 |
S | 0.04986 | 0.1228 | 0.0667 | 0.07547 |
T | 0.04863 | 0.09114 | 0.06194 | 0.05493 |
W | 0.01349 | 0.01585 | 0.01588 | 0.01662 |
Y | 0.02575 | 0.03963 | 0.05717 | 0.03 |
V | 0.06825 | 0.08249 | 0.06511 | 0.08724 |
cor(aa.prop)
## alpha.prop beta.prop a.plus.b.prop a.div.b
## alpha.prop 1.0000000 0.4941143 0.6969508 0.8555289
## beta.prop 0.4941143 1.0000000 0.7977771 0.7706654
## a.plus.b.prop 0.6969508 0.7977771 1.0000000 0.8198043
## a.div.b 0.8555289 0.7706654 0.8198043 1.0000000
round(cor(aa.prop), 3)
## alpha.prop beta.prop a.plus.b.prop a.div.b
## alpha.prop 1.000 0.494 0.697 0.856
## beta.prop 0.494 1.000 0.798 0.771
## a.plus.b.prop 0.697 0.798 1.000 0.820
## a.div.b 0.856 0.771 0.820 1.000
par(mfrow = c(1,3), mar = c(4,4,1,0))
plot(alpha.prop ~ beta.prop, data = aa.prop)
plot(alpha.prop ~ a.plus.b.prop, data = aa.prop)
plot(alpha.prop ~ a.div.b, data = aa.prop)
par(mfrow = c(1,1), mar = c(4,4,4,4))
hHESX1 <- entrez_fetch_list(db = "protein",
id = " NP_001362987.1",
rettype = "fasta")
# clean and turn into vector
hHESX1 <- compbio4all::fasta_cleaner(hHESX1, parse = TRUE)
hHESX1.freq.table <- table(hHESX1)/length(hHESX1)
table_to_vector <- function(table_x){
table_names <- attr(table_x, "dimnames")[[1]]
table_vect <- as.vector(table_x)
names(table_vect) <- table_names
return(table_vect)
}
HESX1.human.aa.freq <- table_to_vector(hHESX1.freq.table)
aa.names <- names(HESX1.human.aa.freq)
any(aa.names == "U")
## [1] FALSE
Since there was no prescence of “U” found there is no need to remove anything.
chou_cor <- function(x,y){
numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)
}
chou_cosine <- function(z.1, z.2){
z.1.abs <- sqrt(sum(z.1^2))
z.2.abs <- sqrt(sum(z.2^2))
my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs)
return(my.cosine)
}
par(mfrow = c(2,2), mar = c(1,4,1,1))
plot(alpha.prop ~ HESX1.human.aa.freq, data = aa.prop)
plot(beta.prop ~ HESX1.human.aa.freq, data = aa.prop)
plot(a.plus.b.prop ~ HESX1.human.aa.freq, data = aa.prop)
plot(a.div.b ~ HESX1.human.aa.freq, data = aa.prop)
par(mfrow = c(1,1), mar = c(1,1,1,1))
corr.alpha <- chou_cor(aa.prop[,4], aa.prop[,1])
corr.beta <- chou_cor(aa.prop[,4], aa.prop[,2])
corr.apb <- chou_cor(aa.prop[,4], aa.prop[,3])
corr.adb <- chou_cor(aa.prop[,4], aa.prop[,4])
cos.alpha <- chou_cosine(aa.prop[,4], aa.prop[,1])
cos.beta <- chou_cosine(aa.prop[,4], aa.prop[,2])
cos.apb <- chou_cosine(aa.prop[,4], aa.prop[,3])
cos.adb <- chou_cosine(aa.prop[,4], aa.prop[,4])
aa.prop.flipped <- t(aa.prop)
round(aa.prop.flipped,2)
## A R N D C Q E G H I L K M
## alpha.prop 0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09 0.10 0.02
## beta.prop 0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06 0.04 0.01
## a.plus.b.prop 0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06 0.06 0.01
## a.div.b 0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08 0.07 0.02
## F P S T W Y V
## alpha.prop 0.05 0.03 0.05 0.05 0.01 0.03 0.07
## beta.prop 0.03 0.05 0.12 0.09 0.02 0.04 0.08
## a.plus.b.prop 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## a.div.b 0.04 0.04 0.08 0.05 0.02 0.03 0.09
dist(aa.prop.flipped, method = "euclidean")
## alpha.prop beta.prop a.plus.b.prop
## beta.prop 0.13342098
## a.plus.b.prop 0.09281824 0.08289406
## a.div.b 0.06699039 0.08659174 0.06175113
dist.alpha <- dist((aa.prop.flipped[c(1,4),]), method = "euclidean")
dist.beta <- dist((aa.prop.flipped[c(2,4),]), method = "euclidean")
dist.apb <- dist((aa.prop.flipped[c(3,4),]), method = "euclidean")
dist.adb <- dist((aa.prop.flipped[c(4,4),]), method = "euclidean")
fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")
# data
corr.sim <- round(c(corr.alpha,corr.beta,corr.apb,corr.adb),5)
cosine.sim <- round(c(cos.alpha,cos.beta,cos.apb,cos.adb),5)
Euclidean.dist <- round(c(dist.alpha,dist.beta,dist.apb,dist.adb),5)
# summary
sim.sum <- c("","","most.sim","")
dist.sum <- c("","","min.dist","")
df <- data.frame(fold.type,
corr.sim ,
cosine.sim ,
Euclidean.dist ,
sim.sum ,
dist.sum )
pander::pander(df)
fold.type | corr.sim | cosine.sim | Euclidean.dist | sim.sum | dist.sum |
---|---|---|---|---|---|
alpha | 0.9658 | 0.9658 | 0.06699 | ||
beta | 0.9436 | 0.9436 | 0.08659 | ||
alpha plus beta | 0.9686 | 0.9686 | 0.06175 | most.sim | min.dist |
alpha/beta | 1 | 1 | 0 |
Data Preparation
names(HESX1_List)
## [1] " NP_001362987.1" " NP_034550.2" " NP_001075039.1" " NP_001102576.1"
## [5] " NP_001016712.1" " XP_038282473.1" " XP_020924666.1" " XP_015148451.2"
## [9] " XP_001490373.2" " XP_017922675.1"
length(HESX1_List)
## [1] 10
HESX1_List[1]
## $` NP_001362987.1`
## [1] "MSPSLQEGAQLGENKPSTCSFSIERILGLDQKKDCVPLMKPHRPWADTCSSSGKDGNLCLHVPNPPSGISFPSVVDHPMPEERASKYENYFSASERLSLKRELSWYRGRRPRTAFTQNQIEVLENVFRVNCYPGIDIREDLAQKLNLEEDRIQIWFQNRRAKLKRSHRESQFLMAKKNFNTNLLE"
HESX1_vector <- c(HESX1_List[1],HESX1_List[2],HESX1_List[3],HESX1_List[4],HESX1_List[5],HESX1_List[6],HESX1_List[7],HESX1_List[8],HESX1_List[9],HESX1_List[10])
HESX1_vector
## $` NP_001362987.1`
## [1] "MSPSLQEGAQLGENKPSTCSFSIERILGLDQKKDCVPLMKPHRPWADTCSSSGKDGNLCLHVPNPPSGISFPSVVDHPMPEERASKYENYFSASERLSLKRELSWYRGRRPRTAFTQNQIEVLENVFRVNCYPGIDIREDLAQKLNLEEDRIQIWFQNRRAKLKRSHRESQFLMAKKNFNTNLLE"
##
## $` NP_034550.2`
## [1] "MSPSLREGAQLRESKPAPCSFSIESILGLDQKKDCTTSVRPHRPWTDTCGDSEKGGNPPLHAPDLPSETSFPCPVDHPRPEERAPKYENYFSASETRSLKRELSWYRGRRPRTAFTQNQVEVLENVFRVNCYPGIDIREDLAQKLNLEEDRIQIWFQNRRAKMKRSRRESQFLMAKKPFNPDLLK"
##
## $` NP_001075039.1`
## [1] "MSPSLQEGAQLGESKPSTCSFSIERILGLDQNKDCVPLMKPHRPWADTCSSSGKDGNLCLHVPNPPSGISFPSVVDHPMPEERALKYENYFSASERLSLKRELSWYRGRRPRTAFTQNQIEVLENVFRVNCYPGIDIREDLAQKLNLEEDRIQIWFQNRRAKLKRSHRESQFLMAKKNFNTNLLE"
##
## $` NP_001102576.1`
## [1] "MSPSLREVAQLRESKPSPCSFSIESILGLDQKKDCATSVRPHRPWTDTCGDSEKDGNPRLHAPGLPSEISFPCPVDHPMPEERAPKYENYFSASETHSLKRELSWYRGRRPRTAFTQNQVEVLENVFRMNCYPGIDIREDLAQKLNLEEDRIQIWFQNRRAKLKRSRRESQFLMAKKPFNPDLLK"
##
## $` NP_001016712.1`
## [1] "MSPGLQKGSRLMENRSPPSSFSIEHILGLDKKTEVASSPIIKHHRPWIQCNSEGVDGTFWHIPVISCDLPVQVHALRRSMGEETQVRLEKCCGEEDRLTYKRELSWYRGRRPRTAFTRSQIEILENVFRVNSYPGIDIREELAGKLALDEDRIQIWFQNRRAKLKRSHRESQFLMVKDSLSSKIQE"
##
## $` XP_038282473.1`
## [1] "MAPSLQEGAQLGERKPSSCSFSIESILGLDQKKDCIPSMKPHRPWADTCGSSGKDVNLCVHIPSLPNGISLPCTVDHPMPEEKFLKYENYFSASERLSLKRELSWYRGRRPRTAFTQNQIEVLENVFRVNCYPGIDIREDLARKLNLEEDRIQIWFQNRRAKLKRSHRESQFLMAKKNFNSNLLEEIEN"
##
## $` XP_020924666.1`
## [1] "MSPSLQEGARLAESKPSPCSFSIESILGLDQKKDCVPSTKPHRPWADTCSSSGKDVNFCLHVPSLPDGISLPCNVDHSVPEEGVLKCENYFSASERLSLKRELSWYRGRRPRTAFTQNQIEMLENVFRVNCYPGIDIREDLARKLNLEEDRIQIWFQNRRAKLKRSHRESQFLMAKKSFNTDLLE"
##
## $` XP_015148451.2`
## [1] "MVIGAQFAGGHCVYSAATSKYIDMLIRCLGLEQKKDGIAAVKPHRPWMDVCTNLVLGDDSDPHLQIPVVSYENSLFHANSNLMQEEKVLNCEKYFSVTERLSFKRELSWYRGRRPRTAFTRNQIEVLENVFKMNSYPGIDIREELARKLDLEEDRIQIWFQNRRAKLKRSHRESQFLMVKNNFTSSLLE"
##
## $` XP_001490373.2`
## [1] "MTPSLQEGARLGESKPSPCSFSIESILGLDQKKDCAPSMKPHRPWVDTYGSSGKDVNLCLHVASLPNGISFPCTVDHPAPEKRVLKHENYFSASERQSLKRELSWYRGRRPRTAFTQNQIEVLENVFRVNCYPGIDIREDLAQKLNLEEDRIQIWFQNRRAKLKRSHRESQFLMAKKNFHTNLLE"
##
## $` XP_017922675.1`
## [1] "MQPDSELRRLSHGERGDSQRLKDTELHDITTPQKFSGAIGKSLEKRLLLEKPMSPMTGLDQKKDCVPSTKPHRPWADTCGSSGKEVNLCLHVPTLPSGISLPQTVDHSVPEESVWKYEDYFAPSERLSLKRELSWYRGRRPRTAFTQNQIEVLENVFRVNCYPGIDIREDLARKLNLEEDRIQIWFQNRRAKLKRSHRESQFLMAKKNFNTDLLE"
names(HESX1_vector) <- names(HESX1_List)
PID table
# Human HESX1 (H.Sapien)
hHESX1 <- entrez_fetch_list(db = "protein",
id = " NP_001362987.1",
rettype = "fasta")
# Chimp HESX1 (P. Troglodytes)
cHESX1 <- entrez_fetch_list(db = "protein",
id = " NP_001075039.1",
rettype = "fasta")
# Frog HESX1 (X.Tropicalis)
fHESX1 <- entrez_fetch_list(db = "protein",
id = " NP_001016712.1",
rettype = "fasta")
# Dog HESX1 (C. Lupis)
dHESX1 <- entrez_fetch_list(db = "protein",
id = " XP_038282473.1",
rettype = "fasta")
hHESX1<- fasta_cleaner(hHESX1, parse = F)
cHESX1<- fasta_cleaner(cHESX1, parse = F)
fHESX1<- fasta_cleaner(fHESX1, parse = F)
dHESX1<- fasta_cleaner(dHESX1, parse = F)
align.human.vs.chimp <- Biostrings:: pairwiseAlignment (
hHESX1, cHESX1)
align.human.vs.frog <- Biostrings::pairwiseAlignment(
hHESX1, fHESX1 )
align.human.vs.dog <- Biostrings::pairwiseAlignment(
hHESX1, dHESX1)
align.chimp.vs.frog <- Biostrings::pairwiseAlignment(
cHESX1, fHESX1)
align.chimp.vs.dog <- Biostrings::pairwiseAlignment(
cHESX1, dHESX1)
align.frog.vs.dog <- Biostrings::pairwiseAlignment(
fHESX1, dHESX1 )
Biostrings:: pid(align.human.vs.chimp)
## [1] 98.37838
Biostrings::pid(align.human.vs.frog)
## [1] 59.06736
Biostrings::pid(align.human.vs.dog)
## [1] 88.64865
Biostrings::pid(align.chimp.vs.frog)
## [1] 58.97436
Biostrings::pid(align.chimp.vs.dog)
## [1] 88.64865
Biostrings::pid(align.frog.vs.dog)
## [1] 58.88325
pids <- c(1, NA, NA, NA,
pid(align.human.vs.chimp), 1, NA, NA,
pid(align.human.vs.frog), pid(align.chimp.vs.frog), 1, NA,
pid(align.human.vs.dog), pid(align.chimp.vs.dog), pid(align.frog.vs.dog), 1)
mat <- matrix(pids, nrow = 4, byrow = T)
row.names(mat) <- c("Human","Chimp","Frog","Dog")
colnames(mat) <- c("Human","Chimp","Frog","Dog")
pander::pander(mat)
Human | Chimp | Frog | Dog | |
---|---|---|---|---|
Human | 1 | NA | NA | NA |
Chimp | 98.38 | 1 | NA | NA |
Frog | 59.07 | 58.97 | 1 | NA |
Dog | 88.65 | 88.65 | 58.88 | 1 |
PID comparison
pid(align.human.vs.chimp, type = "PID1")
## [1] 98.37838
pid(align.human.vs.chimp, type = "PID2")
## [1] 98.37838
pid(align.human.vs.chimp, type = "PID3")
## [1] 98.37838
pid(align.human.vs.chimp, type = "PID4")
## [1] 98.37838
Data preparation
HESX1_vector <- rep(NA, length(HESX1_List))
# RUN THE LOOP
for(i in 1:length(HESX1_vector)){
HESX1_vector[i] <- HESX1_List[[i]]
}
# NAME THE VECTOR
names(HESX1_vector) <- names(HESX1_List)
HESX1_Vector_ss <- Biostrings::AAStringSet(HESX1_vector)
# add necessary function
HESX1_align <-msa(HESX1_Vector_ss,
method = "ClustalW")
## use default substitution matrix
HESX1_align
## CLUSTAL 2.1
##
## Call:
## msa(HESX1_Vector_ss, method = "ClustalW")
##
## MsaAAMultipleAlignment with 10 rows and 224 columns
## aln names
## [1] MSPS-----LQEG-----AQLGENK...RSHRESQFLMAKKNFNTNLLE---- NP_001362987.1
## [2] MSPS-----LQEG-----AQLGESK...RSHRESQFLMAKKNFNTNLLE---- NP_001075039.1
## [3] MSPS-----LREG-----AQLRESK...RSRRESQFLMAKKPFNPDLLK---- NP_034550.2
## [4] MSPS-----LREV-----AQLRESK...RSRRESQFLMAKKPFNPDLLK---- NP_001102576.1
## [5] MAPS-----LQEG-----AQLGERK...RSHRESQFLMAKKNFNSNLLEEIEN XP_038282473.1
## [6] MTPS-----LQEG-----ARLGESK...RSHRESQFLMAKKNFHTNLLE---- XP_001490373.2
## [7] MSPS-----LQEG-----ARLAESK...RSHRESQFLMAKKSFNTDLLE---- XP_020924666.1
## [8] MQPDSELRRLSHGERGDSQRLKDTE...RSHRESQFLMAKKNFNTDLLE---- XP_017922675.1
## [9] ---------MVIG-----AQFAGGH...RSHRESQFLMVKNNFTSSLLE---- XP_015148451.2
## [10] MSPG-----LQKG-----SRLMENR...RSHRESQFLMVKDSLSSKIQE---- NP_001016712.1
## Con MSPS-----LQEG-----AQL?ESK...RSHRESQFLMAKKNFNT?LLE---- Consensus
class(HESX1_align) <- "AAMultipleAlignment"
HESX1_align_seqinr <- msaConvert(HESX1_align, type = "seqinr::alignment")
print_msa(alignment = HESX1_align_seqinr,
chunksize = 60)
## [1] "MSPS-----LQEG-----AQLGENK----PSTCSFS----------------IERILGLD 0"
## [1] "MSPS-----LQEG-----AQLGESK----PSTCSFS----------------IERILGLD 0"
## [1] "MSPS-----LREG-----AQLRESK----PAPCSFS----------------IESILGLD 0"
## [1] "MSPS-----LREV-----AQLRESK----PSPCSFS----------------IESILGLD 0"
## [1] "MAPS-----LQEG-----AQLGERK----PSSCSFS----------------IESILGLD 0"
## [1] "MTPS-----LQEG-----ARLGESK----PSPCSFS----------------IESILGLD 0"
## [1] "MSPS-----LQEG-----ARLAESK----PSPCSFS----------------IESILGLD 0"
## [1] "MQPDSELRRLSHGERGDSQRLKDTELHDITTPQKFSGAIGKSLEKRLLLEKPMSPMTGLD 0"
## [1] "---------MVIG-----AQFAGGHCVYSAATSKYID--------------MLIRCLGLE 0"
## [1] "MSPG-----LQKG-----SRLMENR----SPPSSFS----------------IEHILGLD 0"
## [1] " "
## [1] "QKKDCVPL--MKPHRPWADTCS--SSGKDGNLCLHVPNPPSGISFP-SVVDHPMPEERAS 0"
## [1] "QNKDCVPL--MKPHRPWADTCS--SSGKDGNLCLHVPNPPSGISFP-SVVDHPMPEERAL 0"
## [1] "QKKDCTTS--VRPHRPWTDTCG--DSEKGGNPPLHAPDLPSETSFP-CPVDHPRPEERAP 0"
## [1] "QKKDCATS--VRPHRPWTDTCG--DSEKDGNPRLHAPGLPSEISFP-CPVDHPMPEERAP 0"
## [1] "QKKDCIPS--MKPHRPWADTCG--SSGKDVNLCVHIPSLPNGISLP-CTVDHPMPEEKFL 0"
## [1] "QKKDCAPS--MKPHRPWVDTYG--SSGKDVNLCLHVASLPNGISFP-CTVDHPAPEKRVL 0"
## [1] "QKKDCVPS--TKPHRPWADTCS--SSGKDVNFCLHVPSLPDGISLP-CNVDHSVPEEGVL 0"
## [1] "QKKDCVPS--TKPHRPWADTCG--SSGKEVNLCLHVPTLPSGISLP-QTVDHSVPEESVW 0"
## [1] "QKKDGIAA--VKPHRPWMDVCTNLVLGDDSDPHLQIPVVSYENSLF-HANSNLMQEEKVL 0"
## [1] "KKTEVASSPIIKHHRPWIQCNS---EGVDG-TFWHIPVISCDLPVQVHALRRSMGEETQV 0"
## [1] " "
## [1] "KYENYFSASERLSLKRELSWYRGRRPRTAFTQNQIEVLENVFRVNCYPGIDIREDLAQKL 0"
## [1] "KYENYFSASERLSLKRELSWYRGRRPRTAFTQNQIEVLENVFRVNCYPGIDIREDLAQKL 0"
## [1] "KYENYFSASETRSLKRELSWYRGRRPRTAFTQNQVEVLENVFRVNCYPGIDIREDLAQKL 0"
## [1] "KYENYFSASETHSLKRELSWYRGRRPRTAFTQNQVEVLENVFRMNCYPGIDIREDLAQKL 0"
## [1] "KYENYFSASERLSLKRELSWYRGRRPRTAFTQNQIEVLENVFRVNCYPGIDIREDLARKL 0"
## [1] "KHENYFSASERQSLKRELSWYRGRRPRTAFTQNQIEVLENVFRVNCYPGIDIREDLAQKL 0"
## [1] "KCENYFSASERLSLKRELSWYRGRRPRTAFTQNQIEMLENVFRVNCYPGIDIREDLARKL 0"
## [1] "KYEDYFAPSERLSLKRELSWYRGRRPRTAFTQNQIEVLENVFRVNCYPGIDIREDLARKL 0"
## [1] "NCEKYFSVTERLSFKRELSWYRGRRPRTAFTRNQIEVLENVFKMNSYPGIDIREELARKL 0"
## [1] "RLEKCCGEEDRLTYKRELSWYRGRRPRTAFTRSQIEILENVFRVNSYPGIDIREELAGKL 0"
## [1] " "
## [1] "NLEEDRIQIWFQNRRAKLKRSHRESQFLMAKKNFNTNLLE---- 16"
## [1] "NLEEDRIQIWFQNRRAKLKRSHRESQFLMAKKNFNTNLLE---- 16"
## [1] "NLEEDRIQIWFQNRRAKMKRSRRESQFLMAKKPFNPDLLK---- 16"
## [1] "NLEEDRIQIWFQNRRAKLKRSRRESQFLMAKKPFNPDLLK---- 16"
## [1] "NLEEDRIQIWFQNRRAKLKRSHRESQFLMAKKNFNSNLLEEIEN 16"
## [1] "NLEEDRIQIWFQNRRAKLKRSHRESQFLMAKKNFHTNLLE---- 16"
## [1] "NLEEDRIQIWFQNRRAKLKRSHRESQFLMAKKSFNTDLLE---- 16"
## [1] "NLEEDRIQIWFQNRRAKLKRSHRESQFLMAKKNFNTDLLE---- 16"
## [1] "DLEEDRIQIWFQNRRAKLKRSHRESQFLMVKNNFTSSLLE---- 16"
## [1] "ALDEDRIQIWFQNRRAKLKRSHRESQFLMVKDSLSSKIQE---- 16"
## [1] " "
Based off of our output from drawProteins, the amino acids between 109,165 are what we will look at.
ggmsa::ggmsa(HESX1_align,
start = 109,
end = 165)
HESX1_subset_dist <- seqinr::dist.alignment(HESX1_align_seqinr,
matrix = "identity")
seqinr::dist.alignment(HESX1_align_seqinr)
## NP_001362987.1 NP_001075039.1 NP_034550.2 NP_001102576.1
## NP_001075039.1 0.1273429
## NP_034550.2 0.4411288 0.4411288
## NP_001102576.1 0.4223486 0.4223486 0.2438431
## XP_038282473.1 0.3369177 0.3369177 0.4591414 0.4411288
## XP_001490373.2 0.3601801 0.3525965 0.4591414 0.4411288
## XP_020924666.1 0.3676073 0.3601801 0.4591414 0.4472136
## XP_017922675.1 0.4649906 0.4707671 0.5402702 0.5402702
## XP_015148451.2 0.6263108 0.6263108 0.6648225 0.6564596
## NP_001016712.1 0.6486640 0.6570342 0.6934510 0.6855256
## XP_038282473.1 XP_001490373.2 XP_020924666.1 XP_017922675.1
## NP_001075039.1
## NP_034550.2
## NP_001102576.1
## XP_038282473.1
## XP_001490373.2 0.3287980
## XP_020924666.1 0.3448462 0.3525965
## XP_017922675.1 0.4532167 0.4707671 0.4223486
## XP_015148451.2 0.6084125 0.6522380 0.6263108 0.6581452
## NP_001016712.1 0.6528625 0.6652991 0.6401844 0.6855256
## XP_015148451.2
## NP_001075039.1
## NP_034550.2
## NP_001102576.1
## XP_038282473.1
## XP_001490373.2
## XP_020924666.1
## XP_017922675.1
## XP_015148451.2
## NP_001016712.1 0.6726916
is(HESX1_subset_dist)
## [1] "dist" "oldClass"
class(HESX1_subset_dist)
## [1] "dist"
HESX1_subset_dist_alt <- matrix(data = NA,
nrow = 10,
ncol = 10)
distances <- c(0.1273429,
0.4411288, 0.4411288,
0.4223486, 0.4223486, 0.2438431,
0.3369177, 0.3369177, 0.4591414,0.4411288,
0.3601801, 0.3525965, 0.4591414, 0.4411288, 0.3287980,
0.3676073, 0.3601801, 0.4591414, 0.4472136, 0.3448462, 0.3525965,
0.4649906, 0.4707671, 0.5402702, 0.5402702, 0.4532167, 0.4707671, 0.4223486,
0.6263108, 0.6263108, 0.6648225, 0.6564596, 0.6084125, 0.6522380, 0.6263108, 0.6581452,
0.6486640, 0.6570342, 0.6934510, 0.6855256, 0.6528625, 0.6652991, 0.6401844, 0.6855256, 0.6726916)
HESX1_subset_dist_alt[lower.tri(HESX1_subset_dist_alt)] <- distances
seqnames <- c("NP_001362987.1","NP_001075039.1","NP_034550.2", "NP_001102576.1","XP_038282473.1","XP_001490373.2","XP_020924666.1","XP_017922675.1","XP_015148451.2","NP_001016712.1")
colnames(HESX1_subset_dist_alt) <- seqnames
row.names(HESX1_subset_dist_alt) <- seqnames
HESX1_subset_dist_alt <- as.dist(HESX1_subset_dist_alt)
HESX1_subset_dist <- HESX1_subset_dist_alt
HESX1_subset_dist_rounded <- round(HESX1_subset_dist,
digits = 3)
HESX1_subset_dist_rounded
## NP_001362987.1 NP_001075039.1 NP_034550.2 NP_001102576.1
## NP_001075039.1 0.127
## NP_034550.2 0.441 0.441
## NP_001102576.1 0.441 0.360 0.459
## XP_038282473.1 0.422 0.353 0.447 0.540
## XP_001490373.2 0.422 0.459 0.345 0.453
## XP_020924666.1 0.244 0.441 0.353 0.471
## XP_017922675.1 0.337 0.329 0.465 0.422
## XP_015148451.2 0.337 0.368 0.471 0.626
## NP_001016712.1 0.459 0.360 0.540 0.626
## XP_038282473.1 XP_001490373.2 XP_020924666.1 XP_017922675.1
## NP_001075039.1
## NP_034550.2
## NP_001102576.1
## XP_038282473.1
## XP_001490373.2 0.665
## XP_020924666.1 0.656 0.658
## XP_017922675.1 0.608 0.649 0.686
## XP_015148451.2 0.652 0.657 0.653 0.640
## NP_001016712.1 0.626 0.693 0.665 0.686
## XP_015148451.2
## NP_001075039.1
## NP_034550.2
## NP_001102576.1
## XP_038282473.1
## XP_001490373.2
## XP_020924666.1
## XP_017922675.1
## XP_015148451.2
## NP_001016712.1 0.673
tree_subset<- nj(HESX1_subset_dist)
plot.phylo(tree_subset, main="Phylogenetic Tree",
use.edge.length = T)
mtext(text = "HESX1 family gene tree - rooted, no branch lengths")