This gene encodes a lysozomal protein that functions to degrade hyaluronic acid at pH below 4.
This code downloads the sequences of HYAL2 orthologs from various species and performs various analyses on them. It predicts the secondary structure of the human protein and lastly. It also creates a multiple sequence alignment and phylogenetic tree of the pulled orthologs.
Key information used to make this script can be found here:
# install required packages
# install.packages("BiocManager")
# BiocManager::install("drawProteins")
# install.packages("HGNChelper")
# github packages
library(compbio4all)
library(ggmsa)
## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
# CRAN packages
library(rentrez)
library(seqinr)
library(ape)
##
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
##
## as.alignment, consensus
library(pander)
## Warning: package 'pander' was built under R version 4.1.2
library(ggplot2)
# Bioconductor packages
library(msa)
## Loading required package: Biostrings
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: XVector
## Loading required package: GenomeInfoDb
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
##
## complement
## The following object is masked from 'package:seqinr':
##
## translate
## The following object is masked from 'package:base':
##
## strsplit
library(drawProteins)
## Biostrings
library(Biostrings)
library(HGNChelper)
## Warning: package 'HGNChelper' was built under R version 4.1.2
Accession numbers were obtained from RefSeq, Refseq Homlogene, UniProt and PDB. UniProt accession numbers can be found by searching for the gene name. PDB accessions can be found by searching with a UniProt accession or a gene name, though many proteins are not in PDB. The the Neanderthal genome database was searched but did not yield sequence information on.
A protein BLAST search (https://blast.ncbi.nlm.nih.gov/Blast.cgi?PROGRAM=blastp&PAGE_TYPE=BlastSearch&LINK_LOC=blasthome) was carried out excluding vertebrates to determine if it occurred outside of vertebreates. The gene does not appear in non-vertebrates and so a second search was conducted to exclude mammals.
HGNChelper::checkGeneSymbols(x = c("HYAL2"))
## Maps last updated on: Thu Oct 24 12:31:05 2019
## x Approved Suggested.Symbol
## 1 HYAL2 TRUE HYAL2
Download sequences for the HYAL2 gene in different species using their RefSeq accessions.
Not available: Neanderthal
# RefSeq Uniprot PDB sci name common name gene name
hyal2_table <- c("NP_003764.3", "Q12891", "NA", "Homo sapiens", "Human", "HYAL2",
"XP_001168599.1", "H2R4F6", "NA", "Pan troglodytes", "Chimpanzee", "HYAL2",
"NP_034619.2", "O35632", "NA", "Mus musculus", "House mouse", "Hyal2",
"NP_776772.1", "Q8SQG8", "NA", "Bos taurus", "Cattle", "HYAL2",
"NP_001119986.1", "B0BM11", "NA", "Xenopus tropicalis", "tropical clawed frog", "hyal2",
"XP_001101134.1", "F7H9X1", "NA", "Macaca mulatta", "Rhesus monkey", "HYAL2",
"NP_742037.2", "Q9Z2Q3", "NA", "Rattus norvegicus", "Norway rat", "Hyal2",
"XP_007088631.2", "NA", "NA", "Panthera tigris", "tiger", "HYAL2",
"XP_012493526.1", "A0A2K6GDK7", "NA", "Propithecus coquereli", "Coquerel's sifaka", "HYAL2",
"XP_015342633.1", "NA", "NA", "Marmota marmota marmota", "Alpine marmot", "Hyal2")
# convert to table
hyal2_table <- matrix(hyal2_table, ncol=6, byrow=TRUE)
# display table information and convert to data.frame
dim(hyal2_table)
## [1] 10 6
hyal2_table <- data.frame(hyal2_table)
# dataframe information
is(hyal2_table)
## [1] "data.frame" "list" "oldClass" "vector"
## [5] "list_OR_List" "vector_OR_Vector" "vector_OR_factor"
colnames(hyal2_table)
## [1] "X1" "X2" "X3" "X4" "X5" "X6"
colnames(hyal2_table) <- c("ncbi.protein.accession", "UniProt.id", "PDB", "species", "common.name", "gene.name")
# display using pander
pander::pander(hyal2_table)
| ncbi.protein.accession | UniProt.id | PDB | species |
|---|---|---|---|
| NP_003764.3 | Q12891 | NA | Homo sapiens |
| XP_001168599.1 | H2R4F6 | NA | Pan troglodytes |
| NP_034619.2 | O35632 | NA | Mus musculus |
| NP_776772.1 | Q8SQG8 | NA | Bos taurus |
| NP_001119986.1 | B0BM11 | NA | Xenopus tropicalis |
| XP_001101134.1 | F7H9X1 | NA | Macaca mulatta |
| NP_742037.2 | Q9Z2Q3 | NA | Rattus norvegicus |
| XP_007088631.2 | NA | NA | Panthera tigris |
| XP_012493526.1 | A0A2K6GDK7 | NA | Propithecus coquereli |
| XP_015342633.1 | NA | NA | Marmota marmota marmota |
| common.name | gene.name |
|---|---|
| Human | HYAL2 |
| Chimpanzee | HYAL2 |
| House mouse | Hyal2 |
| Cattle | HYAL2 |
| tropical clawed frog | hyal2 |
| Rhesus monkey | HYAL2 |
| Norway rat | Hyal2 |
| tiger | HYAL2 |
| Coquerel’s sifaka | HYAL2 |
| Alpine marmot | Hyal2 |
All sequences were downloaded using a wrapper compbio4all::entrez_fetch_list() which uses rentrez::entrez_fetch() to access NCBI databases.
# data download and preparation
accessions <- hyal2_table[,1] # first index column
hyal2_list <- entrez_fetch_list(db="protein", id=accessions, rettype="fasta")
length(hyal2_list)
## [1] 10
The first entry
hyal2_list[[1]]
## [1] ">NP_003764.3 hyaluronidase-2 precursor [Homo sapiens]\nMRAGPGPTVTLALVLAVSWAMELKPTAPPIFTGRPFVVAWDVPTQDCGPRLKVPLDLNAFDVQASPNEGF\nVNQNITIFYRDRLGLYPRFDSAGRSVHGGVPQNVSLWAHRKMLQKRVEHYIRTQESAGLAVIDWEDWRPV\nWVRNWQDKDVYRRLSRQLVASRHPDWPPDRIVKQAQYEFEFAAQQFMLETLRYVKAVRPRHLWGFYLFPD\nCYNHDYVQNWESYTGRCPDVEVARNDQLAWLWAESTALFPSVYLDETLASSRHGRNFVSFRVQEALRVAR\nTHHANHALPVYVFTRPTYSRRLTGLSEMDLISTIGESAALGAAGVILWGDAGYTTSTETCQYLKDYLTRL\nLVPYVVNVSWATQYCSRAQCHGHGRCVRRNPSASTFLHLSTNSFRLVPGHAPGEPQLRPVGELSWADIDH\nLQTHFRCQCYLGWSGEQCQWDHRQAAGGASEAWAGSHLTSLLALAALAFTWTL\n\n"
Remove FASTA header
for(i in 1:length(hyal2_list)){
hyal2_list[[i]] <- compbio4all::fasta_cleaner(hyal2_list[[i]], parse = F)
}
Draw protein diagram.
# uniprot ids
uniprot_id <- hyal2_table[1, 2]
json <- drawProteins::get_features(uniprot_id)
## [1] "Download has worked"
# draw protein diagram
prot_df <- drawProteins::feature_to_dataframe(json)
canvas <- draw_canvas(prot_df)
canvas <- draw_chains(canvas, prot_df, label_size = 2.5)
canvas <- draw_regions(canvas, prot_df)
canvas <- draw_recept_dom(canvas, prot_df)
canvas <- draw_folding(canvas, prot_df)
canvas <- draw_repeat(canvas, prot_df)
canvas <- draw_phospho(canvas, prot_df)
show(canvas)
homo_sapiens <- hyal2_table[1, 1]
homo_sapiens <- entrez_fetch(id=homo_sapiens, db="protein", rettype = "fasta")
homo_sapiens <- fasta_cleaner(homo_sapiens)
par(mfrow = c(2,2),
mar = c(0,0,4,2))
# plot 1: Defaults
dotPlot(homo_sapiens, homo_sapiens,
wsize = 1,
nmatch = 1,
xlab="homo_sapiens", ylab="homo_sapiens",
main = "Defaults")
# plot 2 size = 10, nmatch = 1
dotPlot(homo_sapiens, homo_sapiens,
wsize = 10,
nmatch = 1,
xlab="homo_sapiens", ylab="homo_sapiens",
main = "Window Size 10 #1")
# plot 3: size = 10, nmatch = 5
dotPlot(homo_sapiens, homo_sapiens,
wsize = 10,
nmatch = 5,
xlab="homo_sapiens", ylab="homo_sapiens",
main = "Window Size 10 #2")
# plot 4: size = 20, nmatch = 5
dotPlot(homo_sapiens, homo_sapiens,
wsize = 20,
nmatch = 5,
xlab="homo_sapiens", ylab="homo_sapiens",
main = "Window size 20")
# reset par() - run this or other plots will be small!
par(mfrow = c(1,1),
mar = c(4,4,4,4))
# best plot
dotPlot(homo_sapiens, homo_sapiens,
wsize = 1,
nmatch = 1,
xlab="homo_sapiens", ylab="homo_sapiens",
main = "Best plot")
sources <- c("PFAM", "Uniprot", "AlphaFold", "RepeatsDB")
properties <- c("There is a glucoside hydrolase family domain in the protein encoded by the HYAL2 gene.",
"HYAL2 econdes a protein present in the plasma membrane of the cell.",
"There was no speicfic information about the secondary structure on AlphaFold, but from the 3D model, it looks like the protein has a number of alpha helices and a few beta sheets.",
"NA")
links <- c("http://pfam.xfam.org/protein/Q12891",
"https://www.uniprot.org/uniprot/Q12891",
"https://alphafold.ebi.ac.uk/entry/Q12891",
"NA")
# convert to table
protein_table <- data.frame(sources, properties, links)
colnames(protein_table) <- c("Source", "Protein Property", "Link")
pander::pander(protein_table)
| Source | Protein Property |
|---|---|
| PFAM | There is a glucoside hydrolase family domain in the protein encoded by the HYAL2 gene. |
| Uniprot | HYAL2 econdes a protein present in the plasma membrane of the cell. |
| AlphaFold | There was no speicfic information about the secondary structure on AlphaFold, but from the 3D model, it looks like the protein has a number of alpha helices and a few beta sheets. |
| RepeatsDB | NA |
| Link |
|---|
| http://pfam.xfam.org/protein/Q12891 |
| https://www.uniprot.org/uniprot/Q12891 |
| https://alphafold.ebi.ac.uk/entry/Q12891 |
| NA |
Multivariate statistical techniques were used to confirm the information about protein structure and location in the line database.
Alphafold indicates that there are a mix of alpha helices and beta sheets. I therefore predict that machine-learning methods will indicate an a+b and a/b structure.
aa_codes <- c("A", "R", "N", "D", "C", "Q", "E", "G", "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V")
# alpha proteins
alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91,
221, 249, 48, 123, 82, 122, 119, 33, 63, 167)
sum.alpha <- 2447
# check against chou's total
sum(alpha) == sum.alpha
## [1] TRUE
# beta proteins
beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120,
177, 115, 16, 85, 127, 341, 253, 44, 110, 229)
sum.beta <- 2776
# check against chou's total
sum(beta) == sum.beta
## [1] TRUE
# alpha + beta
a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
110, 112, 25, 52, 71, 126, 117, 30, 108, 123)
sum.aplusb <- 1889
sum(a.plus.b) == sum.aplusb
## [1] TRUE
# alpha/beta
a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239,
339, 321, 91, 158, 188, 327, 238, 72, 130, 378)
sum.adivb <- 4333
sum(a.div.b) == sum.adivb
## [1] TRUE
composition_table <- data.frame(amino_acid=aa_codes, alpha=alpha, beta=beta, a_plus_b=a.plus.b, a_div_b=a.div.b)
pander::pander(composition_table)
| amino_acid | alpha | beta | a_plus_b | a_div_b |
|---|---|---|---|---|
| A | 285 | 203 | 175 | 361 |
| R | 53 | 67 | 78 | 146 |
| N | 97 | 139 | 120 | 183 |
| D | 163 | 121 | 111 | 244 |
| C | 22 | 75 | 74 | 63 |
| Q | 67 | 122 | 74 | 114 |
| E | 134 | 86 | 86 | 257 |
| G | 197 | 297 | 171 | 377 |
| H | 111 | 49 | 33 | 107 |
| I | 91 | 120 | 93 | 239 |
| L | 221 | 177 | 110 | 339 |
| K | 249 | 115 | 112 | 321 |
| M | 48 | 16 | 25 | 91 |
| F | 123 | 85 | 52 | 158 |
| P | 82 | 127 | 71 | 188 |
| S | 122 | 341 | 126 | 327 |
| T | 119 | 253 | 117 | 238 |
| W | 33 | 44 | 30 | 72 |
| Y | 63 | 110 | 108 | 130 |
| V | 167 | 229 | 123 | 378 |
aa_prop <- data.frame(alpha.prop=alpha/sum.alpha,
beta.prop=beta/sum.beta,
a_plus_b.prop=a.plus.b/sum.aplusb,
a_div_b.prop=a.div.b/sum.adivb)
round(aa_prop, 4)
## alpha.prop beta.prop a_plus_b.prop a_div_b.prop
## 1 0.1165 0.0731 0.0926 0.0833
## 2 0.0217 0.0241 0.0413 0.0337
## 3 0.0396 0.0501 0.0635 0.0422
## 4 0.0666 0.0436 0.0588 0.0563
## 5 0.0090 0.0270 0.0392 0.0145
## 6 0.0274 0.0439 0.0392 0.0263
## 7 0.0548 0.0310 0.0455 0.0593
## 8 0.0805 0.1070 0.0905 0.0870
## 9 0.0454 0.0177 0.0175 0.0247
## 10 0.0372 0.0432 0.0492 0.0552
## 11 0.0903 0.0638 0.0582 0.0782
## 12 0.1018 0.0414 0.0593 0.0741
## 13 0.0196 0.0058 0.0132 0.0210
## 14 0.0503 0.0306 0.0275 0.0365
## 15 0.0335 0.0457 0.0376 0.0434
## 16 0.0499 0.1228 0.0667 0.0755
## 17 0.0486 0.0911 0.0619 0.0549
## 18 0.0135 0.0159 0.0159 0.0166
## 19 0.0257 0.0396 0.0572 0.0300
## 20 0.0682 0.0825 0.0651 0.0872
row.names(aa_prop) <- aa_codes
pander::pander(aa_prop)
| alpha.prop | beta.prop | a_plus_b.prop | a_div_b.prop | |
|---|---|---|---|---|
| A | 0.1165 | 0.07313 | 0.09264 | 0.08331 |
| R | 0.02166 | 0.02414 | 0.04129 | 0.03369 |
| N | 0.03964 | 0.05007 | 0.06353 | 0.04223 |
| D | 0.06661 | 0.04359 | 0.05876 | 0.05631 |
| C | 0.008991 | 0.02702 | 0.03917 | 0.01454 |
| Q | 0.02738 | 0.04395 | 0.03917 | 0.02631 |
| E | 0.05476 | 0.03098 | 0.04553 | 0.05931 |
| G | 0.08051 | 0.107 | 0.09052 | 0.08701 |
| H | 0.04536 | 0.01765 | 0.01747 | 0.02469 |
| I | 0.03719 | 0.04323 | 0.04923 | 0.05516 |
| L | 0.09031 | 0.06376 | 0.05823 | 0.07824 |
| K | 0.1018 | 0.04143 | 0.05929 | 0.07408 |
| M | 0.01962 | 0.005764 | 0.01323 | 0.021 |
| F | 0.05027 | 0.03062 | 0.02753 | 0.03646 |
| P | 0.03351 | 0.04575 | 0.03759 | 0.04339 |
| S | 0.04986 | 0.1228 | 0.0667 | 0.07547 |
| T | 0.04863 | 0.09114 | 0.06194 | 0.05493 |
| W | 0.01349 | 0.01585 | 0.01588 | 0.01662 |
| Y | 0.02575 | 0.03963 | 0.05717 | 0.03 |
| V | 0.06825 | 0.08249 | 0.06511 | 0.08724 |
Determine the number of each amino acid in my protein (HYAL2).
A function to convert a table into a vector is helpful here.
table_to_vector <- function(table_x){
table_names <- attr(table_x, "dimnames")[[1]]
table_vect <- as.vector(table_x)
names(table_vect) <- table_names
return(table_vect)
}
freq_table <- table(homo_sapiens)/length(homo_sapiens)
freq_vector <- table_to_vector(freq_table)
freq_vector
## A C D E F G H
## 0.09513742 0.02114165 0.04862579 0.04228330 0.04016913 0.06131078 0.03805497
## I K L M N P Q
## 0.02114165 0.01691332 0.09936575 0.01057082 0.02959831 0.05919662 0.05073996
## R S T V W Y
## 0.08245243 0.06342495 0.05919662 0.08033827 0.04016913 0.04016913
Any “U” if present in the sequence, will be removed. U means unknown amino acid.
aa_names <- names(freq_vector)
i.U <- which(aa_names == "U")
aa_names[i.U]
## character(0)
freq_vector[i.U]
## named numeric(0)
if(length(i.U) > 0) {
homo_sapiens <- homo_sapiens[-i.U]
}
Add data on my protein to the proportion table.
aa_prop$HYAL2_human_aa_freq <- freq_vector
pander::pander(aa_prop)
| alpha.prop | beta.prop | a_plus_b.prop | a_div_b.prop | |
|---|---|---|---|---|
| A | 0.1165 | 0.07313 | 0.09264 | 0.08331 |
| R | 0.02166 | 0.02414 | 0.04129 | 0.03369 |
| N | 0.03964 | 0.05007 | 0.06353 | 0.04223 |
| D | 0.06661 | 0.04359 | 0.05876 | 0.05631 |
| C | 0.008991 | 0.02702 | 0.03917 | 0.01454 |
| Q | 0.02738 | 0.04395 | 0.03917 | 0.02631 |
| E | 0.05476 | 0.03098 | 0.04553 | 0.05931 |
| G | 0.08051 | 0.107 | 0.09052 | 0.08701 |
| H | 0.04536 | 0.01765 | 0.01747 | 0.02469 |
| I | 0.03719 | 0.04323 | 0.04923 | 0.05516 |
| L | 0.09031 | 0.06376 | 0.05823 | 0.07824 |
| K | 0.1018 | 0.04143 | 0.05929 | 0.07408 |
| M | 0.01962 | 0.005764 | 0.01323 | 0.021 |
| F | 0.05027 | 0.03062 | 0.02753 | 0.03646 |
| P | 0.03351 | 0.04575 | 0.03759 | 0.04339 |
| S | 0.04986 | 0.1228 | 0.0667 | 0.07547 |
| T | 0.04863 | 0.09114 | 0.06194 | 0.05493 |
| W | 0.01349 | 0.01585 | 0.01588 | 0.01662 |
| Y | 0.02575 | 0.03963 | 0.05717 | 0.03 |
| V | 0.06825 | 0.08249 | 0.06511 | 0.08724 |
| HYAL2_human_aa_freq | |
|---|---|
| A | 0.09514 |
| R | 0.02114 |
| N | 0.04863 |
| D | 0.04228 |
| C | 0.04017 |
| Q | 0.06131 |
| E | 0.03805 |
| G | 0.02114 |
| H | 0.01691 |
| I | 0.09937 |
| L | 0.01057 |
| K | 0.0296 |
| M | 0.0592 |
| F | 0.05074 |
| P | 0.08245 |
| S | 0.06342 |
| T | 0.0592 |
| W | 0.08034 |
| Y | 0.04017 |
| V | 0.04017 |
Two custom functions are needed: one to calculate correlates between two columns of our table, and one to calculate correlation similarities.
# Corrleation used in Chou and Zhange 1992.
chou_cor <- function(x,y){
numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)
}
# Cosine similarity used in Higgs and Attwood (2005).
chou_cosine <- function(z.1, z.2){
z.1.abs <- sqrt(sum(z.1^2))
z.2.abs <- sqrt(sum(z.2^2))
my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs)
return(my.cosine)
}
Display the data as scatterplots
par(mfrow = c(2,2), mar = c(1,4,1,1))
plot(alpha.prop ~ HYAL2_human_aa_freq, data = aa_prop)
plot(beta.prop ~ HYAL2_human_aa_freq, data = aa_prop)
plot(a_plus_b.prop ~ HYAL2_human_aa_freq, data = aa_prop)
plot(a_div_b.prop ~ HYAL2_human_aa_freq, data = aa_prop)
None of these correlations are very informative.
Correlation between each column:
cor_matrix <- round(cor(aa_prop), 3)
cor_matrix
## alpha.prop beta.prop a_plus_b.prop a_div_b.prop
## alpha.prop 1.000 0.494 0.697 0.856
## beta.prop 0.494 1.000 0.798 0.771
## a_plus_b.prop 0.697 0.798 1.000 0.820
## a_div_b.prop 0.856 0.771 0.820 1.000
## HYAL2_human_aa_freq -0.147 0.033 -0.019 -0.082
## HYAL2_human_aa_freq
## alpha.prop -0.147
## beta.prop 0.033
## a_plus_b.prop -0.019
## a_div_b.prop -0.082
## HYAL2_human_aa_freq 1.000
Calculate the correlation between each column
corr.alpha <- chou_cor(aa_prop[, 5], aa_prop[, 1])
corr.beta <- chou_cor(aa_prop[, 5], aa_prop[, 2])
corr.apb <- chou_cor(aa_prop[, 5], aa_prop[, 3])
corr.adb <- chou_cor(aa_prop[, 5], aa_prop[, 4])
Calculate the cosine similarity with each column
cos.alpha <- chou_cosine(aa_prop[, 5], aa_prop[, 1])
cos.beta <- chou_cosine(aa_prop[, 5], aa_prop[, 2])
cos.apb <- chou_cosine(aa_prop[, 5], aa_prop[, 3])
cos.adb <- chou_cosine(aa_prop[, 5], aa_prop[, 4])
Flip the distance matrix.
aa_prop_flipped <- t(aa_prop)
round(aa_prop_flipped, 2)
## A R N D C Q E G H I L K
## alpha.prop 0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09 0.10
## beta.prop 0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06 0.04
## a_plus_b.prop 0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06 0.06
## a_div_b.prop 0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08 0.07
## HYAL2_human_aa_freq 0.10 0.02 0.05 0.04 0.04 0.06 0.04 0.02 0.02 0.10 0.01 0.03
## M F P S T W Y V
## alpha.prop 0.02 0.05 0.03 0.05 0.05 0.01 0.03 0.07
## beta.prop 0.01 0.03 0.05 0.12 0.09 0.02 0.04 0.08
## a_plus_b.prop 0.01 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## a_div_b.prop 0.02 0.04 0.04 0.08 0.05 0.02 0.03 0.09
## HYAL2_human_aa_freq 0.06 0.05 0.08 0.06 0.06 0.08 0.04 0.04
dist_matrix <- dist(aa_prop_flipped, method="euclidean")
dist_matrix
## alpha.prop beta.prop a_plus_b.prop a_div_b.prop
## beta.prop 0.13342098
## a_plus_b.prop 0.09281824 0.08289406
## a_div_b.prop 0.06699039 0.08659174 0.06175113
## HYAL2_human_aa_freq 0.18171261 0.17210489 0.14724825 0.15972045
Individual distances
dist.alpha <- dist(aa_prop_flipped[c(1, 5),], method="euclidean")
dist.beta <- dist(aa_prop_flipped[c(2, 5),], method="euclidean")
dist.apb <- dist(aa_prop_flipped[c(3, 5),], method="euclidean")
dist.adb <- dist(aa_prop_flipped[c(4, 5),], method="euclidean")
# fold types
fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")
# data
corr.sim <- round(c(corr.alpha,corr.beta,corr.apb,corr.adb),5)
cosine.sim <- round(c(cos.alpha,cos.beta,cos.apb,cos.adb),5)
Euclidean.dist <- round(c(dist.alpha,dist.beta,dist.apb,dist.adb),5)
# summary
sim.sum <- c("","","most.sim","")
dist.sum <- c("","","min.dist","")
df <- data.frame(fold.type,
corr.sim ,
cosine.sim ,
Euclidean.dist ,
sim.sum ,
dist.sum )
pander::pander(df)
| fold.type | corr.sim | cosine.sim | Euclidean.dist | sim.sum | dist.sum |
|---|---|---|---|---|---|
| alpha | 0.7442 | 0.7442 | 0.1817 | ||
| beta | 0.7741 | 0.7741 | 0.1721 | ||
| alpha plus beta | 0.8215 | 0.8215 | 0.1472 | most.sim | min.dist |
| alpha/beta | 0.7936 | 0.7936 | 0.1597 |
Convert all the FASTA records into entires in a single vector.
names(hyal2_list)
## [1] "NP_003764.3" "XP_001168599.1" "NP_034619.2" "NP_776772.1"
## [5] "NP_001119986.1" "XP_001101134.1" "NP_742037.2" "XP_007088631.2"
## [9] "XP_012493526.1" "XP_015342633.1"
length(hyal2_list)
## [1] 10
Each entry is stores as a full entry with no spaces, parsing or header. For example,
hyal2_list[1]
## $NP_003764.3
## [1] "MRAGPGPTVTLALVLAVSWAMELKPTAPPIFTGRPFVVAWDVPTQDCGPRLKVPLDLNAFDVQASPNEGFVNQNITIFYRDRLGLYPRFDSAGRSVHGGVPQNVSLWAHRKMLQKRVEHYIRTQESAGLAVIDWEDWRPVWVRNWQDKDVYRRLSRQLVASRHPDWPPDRIVKQAQYEFEFAAQQFMLETLRYVKAVRPRHLWGFYLFPDCYNHDYVQNWESYTGRCPDVEVARNDQLAWLWAESTALFPSVYLDETLASSRHGRNFVSFRVQEALRVARTHHANHALPVYVFTRPTYSRRLTGLSEMDLISTIGESAALGAAGVILWGDAGYTTSTETCQYLKDYLTRLLVPYVVNVSWATQYCSRAQCHGHGRCVRRNPSASTFLHLSTNSFRLVPGHAPGEPQLRPVGELSWADIDHLQTHFRCQCYLGWSGEQCQWDHRQAAGGASEAWAGSHLTSLLALAALAFTWTL"
This is then turned into a vector
# creating a vector without sequences with NA at every position
hyal2_vector <- rep(NA, length(hyal2_list))
# populating the vector with the fasta sequences from shroom_list
for(i in 1:length(hyal2_vector)){
hyal2_vector[i] <- hyal2_list[[i]]
}
# labeling the sequences with the names from te list.
names(hyal2_vector) <- names(hyal2_list)
Pairwise Alignments
human_chimp.aln <- pairwiseAlignment(hyal2_list[[1]], hyal2_list[[2]])
human_mouse.aln <- pairwiseAlignment(hyal2_list[[1]], hyal2_list[[3]])
human_cattle.aln <- pairwiseAlignment(hyal2_list[[1]], hyal2_list[[4]])
human_chimp.aln
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MRAGPGPTVTLALVLAVSWAMELKPTAPPIFTGR...WDHRQAAGGASEAWAGSHLTSLLALAALAFTWTL
## subject: MRAGPGPTVTLALVLAVAWAMELKPTAPPIFTGR...WDHRQAAGGASEAWAGSHLTSLLALAALAFTWTL
## score: 2014.558
human_mouse.aln
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MRAGPGPTVTLALVLAVSWAMELKPTAPPIFTGR...WDHRQAAGGASEAWAGSHLTSLLALAALAFTWTL
## subject: MRAGLGPIITLALVLEVAWAGELKPTAPPIFTGR...RNYKGAAGNASRAWAGSHLTSLLGLVAVALTWTL
## score: 1139.505
human_cattle.aln
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MRAGPGPTVTLALVLAVSWAMELKPTAPPIFTGR...WDHRQAAGGASEAWAGSHLTSLLALAALAFTWTL
## subject: MWTGLGPAVTLALVLVVAWATELKPTAPPIFTGR...WDRRRAAGGASGAWAGSHLTGLLAVAVLAFT---
## score: 1293.451
# other alignemnts
chimp_mouse.aln <- pairwiseAlignment(hyal2_list[[2]], hyal2_list[[3]])
chimp_cattle.aln <- pairwiseAlignment(hyal2_list[[2]], hyal2_list[[4]])
mouse_cattle.aln <- pairwiseAlignment(hyal2_list[[3]], hyal2_list[[4]])
Biostrings::pid(human_chimp.aln)
## [1] 99.57717
Biostrings::pid(human_mouse.aln)
## [1] 82.0296
Biostrings::pid(human_cattle.aln)
## [1] 85.62368
Build matrix
# human chimp mouse cattle
pids <- c(1, NA, NA, NA,
pid(human_chimp.aln), 1, NA, NA,
pid(human_mouse.aln), pid(chimp_mouse.aln), 1, NA,
pid(human_cattle.aln), pid(chimp_cattle.aln), pid(mouse_cattle.aln), 1)
pid_matrix <- matrix(pids, nrow=4, byrow=T)
row.names(pid_matrix) <- c("Homo", "Pan", "Mus", "Bos")
colnames(pid_matrix) <- c("Homo", "Pan", "Mus", "Bos")
pander::pander(pid_matrix)
| Homo | Pan | Mus | Bos | |
|---|---|---|---|---|
| Homo | 1 | NA | NA | NA |
| Pan | 99.58 | 1 | NA | NA |
| Mus | 82.03 | 82.45 | 1 | NA |
| Bos | 85.62 | 85.84 | 78.74 | 1 |
Compare the humans vs. chimps PID using different methods
pid_1 <- Biostrings::pid(human_chimp.aln, type = "PID1")
pid_2 <- Biostrings::pid(human_chimp.aln, type = "PID2")
pid_3 <- Biostrings::pid(human_chimp.aln, type = "PID3")
pid_4 <- Biostrings::pid(human_chimp.aln, type = "PID4")
pid_vector <- c(pid_1, pid_2, pid_3, pid_4)
method <- c("PID1", "PID2", "PID3", "PID4")
den <- c("(aligned positions + internal gap positions)",
"(aligned positions)",
"(length shorter sequence)",
"(average length of the two sequences)")
pid_df <- data.frame(method, pid_vector, den)
colnames(pid_df) <- c("method", "PID", "denominator")
pander::pander(pid_df)
| method | PID | denominator |
|---|---|---|
| PID1 | 99.58 | (aligned positions + internal gap positions) |
| PID2 | 99.58 | (aligned positions) |
| PID3 | 99.58 | (length shorter sequence) |
| PID4 | 99.58 | (average length of the two sequences) |
# convert the vectors to AAStringSet
hyal2_vector_ss <- Biostrings::AAStringSet(hyal2_vector)
hyal2_msa <- msa(hyal2_vector_ss, method = "ClustalW")
## use default substitution matrix
class(hyal2_msa)
## [1] "MsaAAMultipleAlignment"
## attr(,"package")
## [1] "msa"
is(hyal2_msa)
## [1] "MsaAAMultipleAlignment" "AAMultipleAlignment" "MsaMetaData"
## [4] "MultipleAlignment"
hyal2_msa
## CLUSTAL 2.1
##
## Call:
## msa(hyal2_vector_ss, method = "ClustalW")
##
## MsaAAMultipleAlignment with 10 rows and 484 columns
## aln names
## [1] MRAGLG-PIITLALVLEVAWAGEL-...ASRAWAGSHLTSLLGLVAVALTWTL NP_034619.2
## [2] MRAGLG-PIITLALVLEVAWASEL-...ASRAWAGAHLASLLGLVAMTLTWTL NP_742037.2
## [3] MLAGLG-PTITLALVLAVAWAMEL-...ASGAWTGSHFTSLLALVAVALT--- XP_015342633.1
## [4] MRAGPG-PTVTLALVLAVSWAMEL-...ASEAWAGSHLTSLLALAALAFTWTL NP_003764.3
## [5] MRAGPG-PTVTLALVLAVAWAMEL-...ASEAWAGSHLTSLLALAALAFTWTL XP_001168599.1
## [6] MRAGPG-PAVTLALVLAVAWAMEL-...ASEAWAAFHLTSLLALAALAFTWTL XP_001101134.1
## [7] MWAGPG-PTAVLALVLAVAWAMEL-...ASGAWAGSHLASLLASTALAFICTL XP_012493526.1
## [8] MWTGLG-PAVTLALVLVVAWATEL-...ASGAWAGSHLTGLLAVAVLAFT--- NP_776772.1
## [9] MWAGLG-PAITLAVVWAAAWATQL-...ASGAWAGSHLTGPLAVAALALTWTL XP_007088631.2
## [10] MPCCLSFLPAFLWLFLGAAANGQLS...VATGPCGIVLVVSLVALILALL--- NP_001119986.1
## Con MRAGLG-P??TLALVLAVAWAMEL-...AS?AWAGSHLTSLLALAALA?TWTL Consensus
class(hyal2_msa) <- "AAMultipleAlignment"
class(hyal2_msa)
## [1] "AAMultipleAlignment"
# convert MSA to seqinr alignment format
hyal2_seqinr <- msaConvert(hyal2_msa, type = "seqinr::alignment")
print_msa(alignment = hyal2_seqinr, chunksize = 60)
## [1] "MRAGLG-PIITLALVLEVAWAGEL------KPTAPPIFTGRPFVVAWNVPTQECAPRHKV 0"
## [1] "MRAGLG-PIITLALVLEVAWASEL------KPTAPPIFTGRPFVVAWNVPTQECAPRHKV 0"
## [1] "MLAGLG-PTITLALVLAVAWAMEL------KPTVPPIFTGRPFVVAWDVPTQDCAPRHKV 0"
## [1] "MRAGPG-PTVTLALVLAVSWAMEL------KPTAPPIFTGRPFVVAWDVPTQDCGPRLKV 0"
## [1] "MRAGPG-PTVTLALVLAVAWAMEL------KPTAPPIFTGRPFVVAWDVPTQDCGPRLKV 0"
## [1] "MRAGPG-PAVTLALVLAVAWAMEL------KPTAPPIFTGRPFVVAWDVPTQDCGPRLKV 0"
## [1] "MWAGPG-PTAVLALVLAVAWAMEL------KPTAPPIFTGRPFVVAWDVPTQDCGPRLKV 0"
## [1] "MWTGLG-PAVTLALVLVVAWATEL------KPTAPPIFTGRPFVVAWDVPTQDCGPRHKM 0"
## [1] "MWAGLG-PAITLAVVWAAAWATQL------KPTAPPIFTGRPFVVAWDVPTQDCGPRLKV 0"
## [1] "MPCCLSFLPAFLWLFLGAAANGQLSDSWLNKPTFRPVFTRRPFIIAWNAPTQDCPPRFNV 0"
## [1] " "
## [1] "PLD---LRAFDVKATPNEGFFNQNITTFYYDRLGLYPRFDAAGTSVHGGVPQNGSLCAHL 0"
## [1] "PLD---LRAFDVEATPNEGFFNQNITTFYYDRLGLYPRFDAAGMSVHGGVPQNGSLCAHL 0"
## [1] "PLD---LSAFDVEASPNEGFVNQNITIFYYDRLGLYPRFDSTGTSVHGGVPQNVNLRAHL 0"
## [1] "PLD---LNAFDVQASPNEGFVNQNITIFYRDRLGLYPRFDSAGRSVHGGVPQNVSLWAHR 0"
## [1] "PLD---LNAFDVQASPNEGFVNQNITIFYRDRLGLYPRFDSAGRSVHGGVPQNVSLWAHR 0"
## [1] "PLD---LNAFDVQASPNEGFVNQNITIFYRDRLGLYPRFDSAGRSVHGGVPQNVSLWAHR 0"
## [1] "PLD---LTAFDVQASPNEGFVNQNITIFYRDRLGLYPRFDSAGRSVHGGVPQNGSLWAHL 0"
## [1] "PLDPKDMKAFDVQASPNEGFVNQNITIFYRDRLGMYPHFNSVGRSVHGGVPQNGSLWVHL 0"
## [1] "PLD---LKAFDVQASPNEGFVNQNITIFYHDRLGLYPRFSSVGRSVHGGVPQNGSLWAHL 0"
## [1] "HLD---LKLFDLNASPNEGFVDQNLTIFYKERLGMYPYYDEHLAPVAGGLPQNASLRAHL 0"
## [1] " "
## [1] "PMLKESVERYIQTQEPGGLAVIDWEEWRPVWVRNWQEKDVYRQSSRQLVASRHPDWPSDR 0"
## [1] "PMLKEAVERYIQTQEPAGLAVIDWEEWRPVWVRNWQEKDVYRQSSRQLVASRHPDWPSDR 0"
## [1] "QMLKKPVEHYIRTQEPAGLAVIDWEDWRPVWVRNWQDKDVYRQSSRQLVASRHPDWPSDR 0"
## [1] "KMLQKRVEHYIRTQESAGLAVIDWEDWRPVWVRNWQDKDVYRRLSRQLVASRHPDWPPDR 0"
## [1] "KMLQKRVEHYIRTQESAGLAVIDWEDWRPVWVRNWQDKDVYRRLSRQLVASRHPDWPPDR 0"
## [1] "KMLQKRVEHYIRTQESEGLAVIDWEDWRPVWVRNWQDKDVYRRSSRQLVASRHPDWPPDR 0"
## [1] "KMLKRRVEHYIRTQEPAGLAVIDWEDWRPVWVRNWQDKDVYRRSSRQLVASRHPDWPSDR 0"
## [1] "EMLKGHVEHYIRTQEPAGLAVIDWEDWRPVWVRNWQDKDVYRRLSRHLVAIRHPDWPPER 0"
## [1] "KMLQEHVEHYIRTQEPAGLAVIDWEDWRPVWVRNWQDKDIYRQSSRQLVAVRHPDWPSDR 0"
## [1] "DKLPEGIQKYIRSRDKDGLAVIDWEEWRPIWVRNWQNKDVYRQNSRNLVSSRHPTWPREK 0"
## [1] " "
## [1] "VMKQAQYEFEFAARQFMLNTLRYVKAVRPQHLWGFYLFPDCYNHDYVQNWESYTGRCPDV 0"
## [1] "IVKQAQYEFEFAARQFMLNTLRYVKAVRPQHLWGFYLFPDCYNHDYVQNWDSYTGRCPDV 0"
## [1] "IVKQAQYEFESNARQFMLETLRYVKAVRPRHLWGFYLFPDCYNHDYVQNWESYTGRCPDV 0"
## [1] "IVKQAQYEFEFAAQQFMLETLRYVKAVRPRHLWGFYLFPDCYNHDYVQNWESYTGRCPDV 0"
## [1] "IVKQAQYEFEFAAQQFMLETLRYVKAVRPRHLWGFYLFPDCYNHDYVQNWESYTGRCPDV 0"
## [1] "IVKQAQYEFEFAAQQFMLETLRYVKAVRPRHLWGFYLFPDCYNHDYVQNWESYTGRCPDV 0"
## [1] "VVKQAQYEFEFAARQFMLETLRYVKAVRPQHLWGFYLFPDCYNHDYVQNWESYTGRCPDV 0"
## [1] "VAKEAQYEFEFAARQFMLETLRFVKAFRPRHLWGFYLFPDCYNHDYVQNWETYTGRCPDV 0"
## [1] "VVKQAQYEFEFAARQFMLETLRFVKAVRPRHLWGFYLFPDCYNHDYVQNWETYTGRCPDV 0"
## [1] "VDKEALYEFENAAREFMTETLRHAKNYRPRQLWGFYLFPDCYNHDYVKNRDSYTGQCPDV 0"
## [1] " "
## [1] "EVARNDQLAWLWAESTALFPSVYLDETLASSVHSRNFVSFRVREALRVAHTHHANHALPV 0"
## [1] "EVARNDQLAWLWAESTALFPSVYLDETLASSKHSRNFVSFRVQEALRVAHTHHANHALPV 0"
## [1] "EVARNDQLAWLWAESTALFPSVYLDETLASSAHGRNFVSFRVQEALRVAHTHHANHALPV 0"
## [1] "EVARNDQLAWLWAESTALFPSVYLDETLASSRHGRNFVSFRVQEALRVARTHHANHALPV 0"
## [1] "EVARNDQLAWLWAESTALFPSVYLDETLASSRHGRNFVSFRVQEALRVARTHHANHALPV 0"
## [1] "EVARNDQLAWLWAESTALFPSVYLDETLASSRHGRNFVSFRVQEALRVARTHHANHALPV 0"
## [1] "EVARNDQLAWLWAESTALFPSVYLDETLASSAHGRNFVSFRVQEALRVAHTHHANHALPV 0"
## [1] "EVSRNDQLAWLWAESTALFPSVYLEETLASSTHGRNFVSFRVQEALRVADVHHANHALPV 0"
## [1] "EVSRNDQLAWLWAESTALFPSVYLDETLASSTHGRNFVSFRVQEALRVAHTHHPNHALPV 0"
## [1] "EISRNDQLSWLWEESTALYPSIYLDQILASSENGRKFVRSRVREAMRISYRHHKDYSLPV 0"
## [1] " "
## [1] "YVFTRPTYTRGLTGLSQVDLISTIGESAALGSAGVIFWGDSEDASSMETCQYLKNYLTQL 0"
## [1] "YVFTRPTYTRGLTELSQMDLISTIGESAALGSAGVIFWGDSVYASSMENCQNLKKYLTQT 0"
## [1] "YVFTRPTYTRKLTGLSEMDLISTIGESAALGAAGVILWGDSVYTSSMETCQYLKNYLTQL 0"
## [1] "YVFTRPTYSRRLTGLSEMDLISTIGESAALGAAGVILWGDAGYTTSTETCQYLKDYLTRL 0"
## [1] "YVFTRPTYSRRLTGLSEMDLISTIGESAALGAAGVILWGDAGYTTSTETCQYLKDYLTRL 0"
## [1] "YVFTRPTYSRRLTGLSEMDLISTIGESAALGAAGVILWGDAGYTTSTETCQYLKDYLTRL 0"
## [1] "YVFTRPTYSRRLTGLSEMDLISTIGESAALGAAGVILWGDAGYTTSTETCQYLKDYLTRL 0"
## [1] "YVFTRPTYSRGLTGLSEMDLISTIGESAALGAAGVILWGDAGFTTSNETCRRLKDYLTRS 0"
## [1] "YVFTRPTYSRKLTGLSEMDLISTIGESAALGAAGVILWGDAGYTTSTETCQYLKDYLTRL 0"
## [1] "FVYTRPTYIRKLDFLSQMDLISTIGESAAQGAAGVIFWGDAEYTKSKETCQMIKKYLDED 0"
## [1] " "
## [1] "LVPYIVNVSWATQYCSWTQCHGHGRCVRRNPSANTFLHLNASSFRLVPGHTPSE-PQLRP 0"
## [1] "LVPYIVNVSWATQYCSWTQCHGHGRCVRRNPSASTFLHLSPSSFRLVPGRTPSE-PQLRP 0"
## [1] "LVPYVVNVSWATQYCSRDQCNGHGRCVRRNPSANTFLHLSASSFRLVPGRAPGE-PQLRP 0"
## [1] "LVPYVVNVSWATQYCSRAQCHGHGRCVRRNPSASTFLHLSTNSFRLVPGHAPGE-PQLRP 0"
## [1] "LVPYVVNVSWATQYCSRAQCHGHGRCVRRNPSASTFLHLSTNSFRLVPGHAPGE-PQLRP 0"
## [1] "LVPYVVNVSWATQYCSRAQCHGHGRCVRRNPSASTFLHLSTNSFRLVPSHTPGE-PQLRP 0"
## [1] "LVPYMVNVSWATQYCSWAQCHGHGRCVRRNPNTNTFLHLSANSFRLVPGHAPGE-PQLRP 0"
## [1] "LVPYVVNVSWAAQYCSWAQCHGHGRCVRRDPNAHTFLHLSASSFRLVPSHAPDE-PRLRP 0"
## [1] "LVPYVVNVSWAAQYCSRAQCHGHGRCVRRDPSANTFLHLSASSFRLVPSHVPGE-PQLRP 0"
## [1] "LGHYIVNVTTAAELCSQSLCNGNGRCLRQENNTDAFLHLNPANFQIVSAPKDFQGPSLRA 0"
## [1] " "
## [1] "EGQLSEADLNYLQKHFRCQCYLGWGGEQCQRNYKGAAGNASRAWAGSHLTSLLGLVAVAL 0"
## [1] "EGELSEDDLSYLQMHFRCHCYLGWGGEQCQWNHKRAAGDASRAWAGAHLASLLGLVAMTL 0"
## [1] "EGELSRADLNYLQTHFRCQCYLGWGGEQCQWDHRRTTGGASGAWTGSHFTSLLALVAVAL 0"
## [1] "VGELSWADIDHLQTHFRCQCYLGWSGEQCQWDHRQAAGGASEAWAGSHLTSLLALAALAF 0"
## [1] "VGELSWADLDHLQTHFRCQCYLGWSGEQCQWDHRQAAGGASEAWAGSHLTSLLALAALAF 0"
## [1] "VGELSWADLDHLQTHFRCQCYLGWSGEQCQWDHRQAAGGASEAWAAFHLTSLLALAALAF 0"
## [1] "VGELSWADLDHLQTHFRCQCYLGWGGEQCQWDRRRAAGGASGAWAGSHLASLLASTALAF 0"
## [1] "EGELSWADRNHLQMHFRCQCYLGWGGEQCQWDRRRAAGGASGAWAGSHLTGLLAVAVLAF 0"
## [1] "EGELSWADLNHLQTHFRCQCYLGWGGEQCQWDHTRAVGGASGAWAGSHLTGPLAVAALAL 0"
## [1] "EGKLSAGDIATLRSQFRCQCYVDWYGDSCGIQRSTNGGAVATGPCGIVLVVSLVALILAL 0"
## [1] " "
## [1] "TWTL 56"
## [1] "TWTL 56"
## [1] "T--- 56"
## [1] "TWTL 56"
## [1] "TWTL 56"
## [1] "TWTL 56"
## [1] "ICTL 56"
## [1] "T--- 56"
## [1] "TWTL 56"
## [1] "L--- 56"
## [1] " "
ggmsa(hyal2_msa, start=400, end=450)
## Distance Matrix
hyal2_subset_dist <- seqinr::dist.alignment(hyal2_seqinr, matrix="identity")
is(hyal2_subset_dist)
## [1] "dist" "oldClass"
class(hyal2_subset_dist)
## [1] "dist"
hyal2_align_seqinr_rnd <- round(hyal2_subset_dist, 3)
hyal2_align_seqinr_rnd
## NP_034619.2 NP_742037.2 XP_015342633.1 NP_003764.3
## NP_742037.2 0.272
## XP_015342633.1 0.372 0.378
## NP_003764.3 0.424 0.424 0.345
## XP_001168599.1 0.419 0.419 0.339 0.065
## XP_001101134.1 0.421 0.421 0.354 0.138
## XP_012493526.1 0.403 0.411 0.339 0.264
## NP_776772.1 0.459 0.468 0.418 0.372
## XP_007088631.2 0.421 0.434 0.354 0.332
## NP_001119986.1 0.662 0.667 0.660 0.667
## XP_001168599.1 XP_001101134.1 XP_012493526.1 NP_776772.1
## NP_742037.2
## XP_015342633.1
## NP_003764.3
## XP_001168599.1
## XP_001101134.1 0.122
## XP_012493526.1 0.256 0.276
## NP_776772.1 0.369 0.378 0.357
## XP_007088631.2 0.325 0.325 0.319 0.333
## NP_001119986.1 0.667 0.668 0.657 0.660
## XP_007088631.2
## NP_742037.2
## XP_015342633.1
## NP_003764.3
## XP_001168599.1
## XP_001101134.1
## XP_012493526.1
## NP_776772.1
## XP_007088631.2
## NP_001119986.1 0.654
Creating phylogenetic tree using distance matrix
hyal2_tree <- ape::nj(hyal2_align_seqinr_rnd)
plot.phylo(hyal2_tree, main="Phylogenetic Tree\n", use.edge.length = F)
mtext("\n\nHYAL2 family tree - rooted, no branch lengths")