This code compiles summary information about the gene GPX1 (Glutathione Peroxidase 1). Additionally, it generates alignments and a phylogenetic tree to illustrate the evolutionary relationship between the human version of the gene and its homologs in other species.
Key information used to make this script can be found here:
Other resources consulted:
Load necessary packages: BiocManager and drawProteins from BioConductor
library(BiocManager)
## Bioconductor version '3.13' is out-of-date; the current release version '3.14'
## is available with R version '4.1'; see https://bioconductor.org/install
library(drawProteins)
Load other packages:
# GitHub Packages
library(compbio4all)
library(ggmsa)
## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
# CRAN Packages
library(rentrez)
library(seqinr)
library(ape)
##
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
##
## as.alignment, consensus
library(pander)
library(ggplot2)
# Bioconductor Packages
library(msa)
## Loading required package: Biostrings
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: XVector
## Loading required package: GenomeInfoDb
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
##
## complement
## The following object is masked from 'package:seqinr':
##
## translate
## The following object is masked from 'package:base':
##
## strsplit
##
## Attaching package: 'msa'
## The following object is masked from 'package:BiocManager':
##
## version
# Biostrings Packages
library(Biostrings)
library(HGNChelper)
Accession numbers were obtained from RefSeq, Refseq Homlogene, UniProt and PDB. UniProt accession numbers can be found by searching for the gene name. PDB accessions can be found by searching with a UniProt accession or a gene name, though many proteins are not in PDB. The the Neanderthal genome database was searched but did not yield sequence information on.
A protein BLAST search (https://blast.ncbi.nlm.nih.gov/Blast.cgi?PROGRAM=blastp&PAGE_TYPE=BlastSearch&LINK_LOC=blasthome) was carried out excluding vertebrates to determine if it occurred outside of vertebreates. The gene does not appear in non-vertebrates and so a second search was conducted to exclude mammals.
HGNChelper::checkGeneSymbols(x = c("GPX3"))
## Maps last updated on: Thu Oct 24 12:31:05 2019
## x Approved Suggested.Symbol
## 1 GPX3 TRUE GPX3
Not available: - Neanderthal
Does not occur: - Outside of Vertebrates
refseq.ID <- c("NP_000572", "NP_001080311", "NP_001152770", "NP_001108591", "NP_886521","NP_032186", "NP_110826", "NP_001264291", "NP_001081613", "NP_001028945")
uniprot.ID <- c("P07832", "G0JWA0", "NA", "NA", "Y00198", "P12372", "P04071", "NA", "NA", "NA")
pdb <- c("2K9A", "NA", "NA", "NA", "1GP1", "NA", "NA", "NA", "NA", "NA")
species <- c("Homo sapiens", "Pan troglodytes", "Macaca mulatta", "Canis lupus", "Bos taurus", "Mus musculus", "Rattus norvegicus", "Gallus gallus", "Xenopus tropicalis", "Danio rerio")
common_name <- c("Human", "Chimpanzee", "Rhesus Monkey", "Wolf", "Cattle", "Mouse", "Rat", "Junglefowl", "Frog", "Zebrafish")
gene_name <- c("GPX3", "GPX3", "GPX3", "GPX3", "GPX3", "Gpx3", "Gpx3", "GPX3", "gpx3", "gpx13")
gpx3_df <- data.frame(refseq.ID, uniprot.ID, pdb, species, common_name, gene_name)
pander::pander(gpx3_df)
| refseq.ID | uniprot.ID | pdb | species | common_name | gene_name |
|---|---|---|---|---|---|
| NP_000572 | P07832 | 2K9A | Homo sapiens | Human | GPX3 |
| NP_001080311 | G0JWA0 | NA | Pan troglodytes | Chimpanzee | GPX3 |
| NP_001152770 | NA | NA | Macaca mulatta | Rhesus Monkey | GPX3 |
| NP_001108591 | NA | NA | Canis lupus | Wolf | GPX3 |
| NP_886521 | Y00198 | 1GP1 | Bos taurus | Cattle | GPX3 |
| NP_032186 | P12372 | NA | Mus musculus | Mouse | Gpx3 |
| NP_110826 | P04071 | NA | Rattus norvegicus | Rat | Gpx3 |
| NP_001264291 | NA | NA | Gallus gallus | Junglefowl | GPX3 |
| NP_001081613 | NA | NA | Xenopus tropicalis | Frog | gpx3 |
| NP_001028945 | NA | NA | Danio rerio | Zebrafish | gpx13 |
h_sapiens_fasta <- compbio4all::entrez_fetch_list(db = "protein", id = "NP_000572", rettype = "fasta") #1
p_troglodytes_fasta <- compbio4all::entrez_fetch_list(db = "protein", id = "NP_001080311", rettype = "fasta") #2
m_mulatta_fasta <- compbio4all::entrez_fetch_list(db = "protein", id = "NP_001152770", rettype = "fasta") #3
c_lupus_fasta <- compbio4all::entrez_fetch_list(db = "protein", id = "NP_001108591", rettype = "fasta") #4
b_taurus_fasta <- compbio4all::entrez_fetch_list(db = "protein", id = "NP_886521", rettype = "fasta") #5
m_musculus_fasta <- compbio4all::entrez_fetch_list(db = "protein", id = "NP_032186", rettype = "fasta") #6
r_norvegicus_fasta <- compbio4all::entrez_fetch_list(db = "protein", id = "NP_110826", rettype = "fasta") #7
g_gallus_fasta <- compbio4all::entrez_fetch_list(db = "protein", id = "NP_001264291", rettype = "fasta") #8
x_tropicalis_fasta <- compbio4all::entrez_fetch_list(db = "protein", id = "NP_001081613", rettype = "fasta") #9
d_rerio_fasta <- compbio4all::entrez_fetch_list(db = "protein", id = "NP_001028945", rettype = "fasta") #10
gpx3_list <- list(h_sapiens_fasta, p_troglodytes_fasta, m_mulatta_fasta, c_lupus_fasta, b_taurus_fasta, m_musculus_fasta, r_norvegicus_fasta, g_gallus_fasta, x_tropicalis_fasta, d_rerio_fasta)
# Number of Fasta files obtained
length(gpx3_list)
## [1] 10
# The first entry
gpx3_list[[1]]
## $NP_000572
## [1] ">NP_000572.2 glutathione peroxidase 1 isoform 1 [Homo sapiens]\nMCAARLAAAAAAAQSVYAFSARPLAGGEPVSLGSLRGKVLLIENVASLUGTTVRDYTQMNELQRRLGPRG\nLVVLGFPCNQFGHQENAKNEEILNSLKYVRPGGGFEPNFMLFEKCEVNGAGAHPLFAFLREALPAPSDDA\nTALMTDPKLITWSPVCRNDVAWNFEKFLVGPDGVPLRRYSRRFQTIDIEPDIEALLSQGPSCA\n\n"
# Remove Fasta header
for (i in 1:length(gpx3_list)){
gpx3_list[[i]] <- compbio4all::fasta_cleaner(gpx3_list[[i]], parse = F)
}
P07203_json <- drawProteins::get_features("P07203")
## [1] "Download has worked"
my_prot_df <- drawProteins::feature_to_dataframe(P07203_json)
my_prot_df[,-2]
## type begin end length accession entryName taxid order
## featuresTemp CHAIN 1 203 202 P07203 GPX1_HUMAN 9606 1
## featuresTemp.1 ACT_SITE 49 49 0 P07203 GPX1_HUMAN 9606 1
## featuresTemp.2 SITE 49 49 0 P07203 GPX1_HUMAN 9606 1
## featuresTemp.3 NON_STD 49 49 0 P07203 GPX1_HUMAN 9606 1
## featuresTemp.4 MOD_RES 34 34 0 P07203 GPX1_HUMAN 9606 1
## featuresTemp.5 MOD_RES 88 88 0 P07203 GPX1_HUMAN 9606 1
## featuresTemp.6 MOD_RES 88 88 0 P07203 GPX1_HUMAN 9606 1
## featuresTemp.7 MOD_RES 114 114 0 P07203 GPX1_HUMAN 9606 1
## featuresTemp.8 MOD_RES 114 114 0 P07203 GPX1_HUMAN 9606 1
## featuresTemp.9 MOD_RES 148 148 0 P07203 GPX1_HUMAN 9606 1
## featuresTemp.10 MOD_RES 148 148 0 P07203 GPX1_HUMAN 9606 1
## featuresTemp.11 MOD_RES 197 197 0 P07203 GPX1_HUMAN 9606 1
## featuresTemp.12 MOD_RES 201 201 0 P07203 GPX1_HUMAN 9606 1
## featuresTemp.13 VAR_SEQ 85 98 13 P07203 GPX1_HUMAN 9606 1
## featuresTemp.14 VAR_SEQ 99 203 104 P07203 GPX1_HUMAN 9606 1
## featuresTemp.15 VARIANT 5 5 0 P07203 GPX1_HUMAN 9606 1
## featuresTemp.16 VARIANT 7 8 1 P07203 GPX1_HUMAN 9606 1
## featuresTemp.17 VARIANT 8 8 0 P07203 GPX1_HUMAN 9606 1
## featuresTemp.18 VARIANT 194 194 0 P07203 GPX1_HUMAN 9606 1
## featuresTemp.19 VARIANT 200 200 0 P07203 GPX1_HUMAN 9606 1
## featuresTemp.20 CONFLICT 93 93 0 P07203 GPX1_HUMAN 9606 1
## featuresTemp.21 HELIX 16 18 2 P07203 GPX1_HUMAN 9606 1
## featuresTemp.22 HELIX 32 35 3 P07203 GPX1_HUMAN 9606 1
## featuresTemp.23 STRAND 38 45 7 P07203 GPX1_HUMAN 9606 1
## featuresTemp.24 STRAND 47 49 2 P07203 GPX1_HUMAN 9606 1
## featuresTemp.25 HELIX 52 66 14 P07203 GPX1_HUMAN 9606 1
## featuresTemp.26 HELIX 67 69 2 P07203 GPX1_HUMAN 9606 1
## featuresTemp.27 STRAND 71 77 6 P07203 GPX1_HUMAN 9606 1
## featuresTemp.28 TURN 82 85 3 P07203 GPX1_HUMAN 9606 1
## featuresTemp.29 HELIX 89 91 2 P07203 GPX1_HUMAN 9606 1
## featuresTemp.30 HELIX 92 98 6 P07203 GPX1_HUMAN 9606 1
## featuresTemp.31 STRAND 108 112 4 P07203 GPX1_HUMAN 9606 1
## featuresTemp.32 HELIX 124 132 8 P07203 GPX1_HUMAN 9606 1
## featuresTemp.33 HELIX 147 149 2 P07203 GPX1_HUMAN 9606 1
## featuresTemp.34 STRAND 152 154 2 P07203 GPX1_HUMAN 9606 1
## featuresTemp.35 STRAND 166 169 3 P07203 GPX1_HUMAN 9606 1
## featuresTemp.36 STRAND 175 179 4 P07203 GPX1_HUMAN 9606 1
## featuresTemp.37 HELIX 185 188 3 P07203 GPX1_HUMAN 9606 1
## featuresTemp.38 HELIX 189 196 7 P07203 GPX1_HUMAN 9606 1
my_canvas <- draw_canvas(my_prot_df)
my_canvas <- draw_chains(my_canvas, my_prot_df, label_size = 2.5)
my_canvas <- draw_domains(my_canvas, my_prot_df)
my_canvas <- draw_regions(my_canvas, my_prot_df)
my_canvas <- draw_folding(my_canvas, my_prot_df)
my_canvas
gpx3_human_vector <- compbio4all::fasta_cleaner(h_sapiens_fasta)
str(gpx3_human_vector)
## chr [1:203] "M" "C" "A" "A" "R" "L" "A" "A" "A" "A" "A" "A" "A" "Q" "S" ...
# set up 2 x 2 grid
par(mfrow = c(2,2), mar = c(0, 0, 2, 1))
# plot 1: - Defaults
dotPlot(gpx3_human_vector, gpx3_human_vector ,
wsize = 1,
nmatch = 1,
main = "GPX3 Defaults")
# plot - size = 10, nmatch = 1
dotPlot(gpx3_human_vector, gpx3_human_vector,
wsize = 10,
nmatch = 1,
main = " GPX3 - size = X, nmatch = X")
# plot 3: - size = 10, nmatch = 5
dotPlot(gpx3_human_vector, gpx3_human_vector ,
wsize = 10,
nmatch = 5,
main = "GPX3 - size = 10, nmatch = 5")
# plot 4: size = 20, nmatch = 5
dotPlot(gpx3_human_vector, gpx3_human_vector,
wsize = 20,
nmatch = 5,
main = "GPX3 - size = 20, nmatch = 5")
# reset par()
par(mfrow = c(1,1),
mar = c(4,4,4,4))
# best DotPlot
dotPlot(gpx3_human_vector, gpx3_human_vector ,
wsize = 1,
nmatch = 1,
xlab = "gpx3_human_vector", ylab = "gpx3_human_vector")
This particular protein is not in
source <- c("Pfam", "Uniprot", "PDB")
protein_property <- c("There is an interesting doman 'GSHPx' from amino acid 13 to 150. There presents a low complexity region is indicated form amino acid 13 to 150.", "GPX3 It is located in the cytoplasm, and it is expressed in platelets.", "PDB results show a structure '2G9A', which acts as tje selenocysteine to glycine mutant of GPX3. GPX3 has both A and B chains, and has a sequence length of 213." )
link <- c("http://pfam.xfam.org/protein/P07203", "https://www.uniprot.org/uniprot/P07203", "https://www.rcsb.org/structure/2G9A" )
prop_df <- data.frame(source, protein_property, link)
pander::pander(prop_df)
| source | protein_property |
|---|---|
| Pfam | There is an interesting doman ‘GSHPx’ from amino acid 13 to 150. There presents a low complexity region is indicated form amino acid 13 to 150. |
| Uniprot | GPX3 It is located in the cytoplasm, and it is expressed in platelets. |
| PDB | PDB results show a structure ‘2G9A’, which acts as tje selenocysteine to glycine mutant of GPX3. GPX3 has both A and B chains, and has a sequence length of 213. |
| link |
|---|
| http://pfam.xfam.org/protein/P07203 |
| https://www.uniprot.org/uniprot/P07203 |
| https://www.rcsb.org/structure/2G9A |
Multivariate statistcal techniques were used to confirm the information about protein structure and location in the line database.
Uniprot indicates that the protein is in the cytoplasm.
aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
"L","K","M","F","P","S","T","W","Y","V")
alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91,
221, 249, 48, 123, 82, 122, 119, 33, 63, 167)
beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120,
177, 115, 16, 85, 127, 341, 253, 44, 110, 229)
a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
110, 112, 25, 52, 71, 126, 117, 30, 108, 123)
a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239,
339, 321, 91, 158, 188, 327, 238, 72, 130, 378)
pander(data.frame(aa.1.1, alpha, beta, a.plus.b, a.div.b))
| aa.1.1 | alpha | beta | a.plus.b | a.div.b |
|---|---|---|---|---|
| A | 285 | 203 | 175 | 361 |
| R | 53 | 67 | 78 | 146 |
| N | 97 | 139 | 120 | 183 |
| D | 163 | 121 | 111 | 244 |
| C | 22 | 75 | 74 | 63 |
| Q | 67 | 122 | 74 | 114 |
| E | 134 | 86 | 86 | 257 |
| G | 197 | 297 | 171 | 377 |
| H | 111 | 49 | 33 | 107 |
| I | 91 | 120 | 93 | 239 |
| L | 221 | 177 | 110 | 339 |
| K | 249 | 115 | 112 | 321 |
| M | 48 | 16 | 25 | 91 |
| F | 123 | 85 | 52 | 158 |
| P | 82 | 127 | 71 | 188 |
| S | 122 | 341 | 126 | 327 |
| T | 119 | 253 | 117 | 238 |
| W | 33 | 44 | 30 | 72 |
| Y | 63 | 110 | 108 | 130 |
| V | 167 | 229 | 123 | 378 |
# Convert to Frequencies
alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)
aa.prop <- data.frame(alpha.prop,
beta.prop,
a.plus.b.prop,
a.div.b)
#row labels
row.names(aa.prop) <- aa.1.1
pander::pander(aa.prop)
| Â | alpha.prop | beta.prop | a.plus.b.prop | a.div.b |
|---|---|---|---|---|
| A | 0.1165 | 0.07313 | 0.09264 | 0.08331 |
| R | 0.02166 | 0.02414 | 0.04129 | 0.03369 |
| N | 0.03964 | 0.05007 | 0.06353 | 0.04223 |
| D | 0.06661 | 0.04359 | 0.05876 | 0.05631 |
| C | 0.008991 | 0.02702 | 0.03917 | 0.01454 |
| Q | 0.02738 | 0.04395 | 0.03917 | 0.02631 |
| E | 0.05476 | 0.03098 | 0.04553 | 0.05931 |
| G | 0.08051 | 0.107 | 0.09052 | 0.08701 |
| H | 0.04536 | 0.01765 | 0.01747 | 0.02469 |
| I | 0.03719 | 0.04323 | 0.04923 | 0.05516 |
| L | 0.09031 | 0.06376 | 0.05823 | 0.07824 |
| K | 0.1018 | 0.04143 | 0.05929 | 0.07408 |
| M | 0.01962 | 0.005764 | 0.01323 | 0.021 |
| F | 0.05027 | 0.03062 | 0.02753 | 0.03646 |
| P | 0.03351 | 0.04575 | 0.03759 | 0.04339 |
| S | 0.04986 | 0.1228 | 0.0667 | 0.07547 |
| T | 0.04863 | 0.09114 | 0.06194 | 0.05493 |
| W | 0.01349 | 0.01585 | 0.01588 | 0.01662 |
| Y | 0.02575 | 0.03963 | 0.05717 | 0.03 |
| V | 0.06825 | 0.08249 | 0.06511 | 0.08724 |
# function to convert table into vector
table_to_vector <- function(table_x){
table_names <- attr(table_x, "dimnames")[[1]]
table_vect <- as.vector(table_x)
names(table_vect) <- table_names
return(table_vect)
}
# determine frequency of each amino acid in GPX1 human protein
gpx3_human_table <- table(gpx3_human_vector)/length(gpx3_human_vector)
gpx3.human.aa.freq <- table_to_vector(gpx3_human_table)
gpx3.human.aa.freq
## A C D E F G
## 0.118226601 0.024630542 0.039408867 0.064039409 0.054187192 0.083743842
## H I K L M N
## 0.009852217 0.029556650 0.029556650 0.113300493 0.019704433 0.049261084
## P Q R S T U
## 0.073891626 0.034482759 0.068965517 0.054187192 0.034482759 0.004926108
## V W Y
## 0.064039409 0.009852217 0.019704433
# check for presence of "U"
aa.names <- names(gpx3.human.aa.freq)
i.U <- which(aa.names == "U")
aa.names[i.U]
## [1] "U"
gpx3.human.aa.freq[i.U]
## U
## 0.004926108
# remove "U"
gpx3.human.aa.freq <- gpx3.human.aa.freq[-i.U]
# add data to frequency table
aa.prop$GPX3.human.aa.freq <- gpx3.human.aa.freq
pander::pander(aa.prop)
| Â | alpha.prop | beta.prop | a.plus.b.prop | a.div.b | GPX3.human.aa.freq |
|---|---|---|---|---|---|
| A | 0.1165 | 0.07313 | 0.09264 | 0.08331 | 0.1182 |
| R | 0.02166 | 0.02414 | 0.04129 | 0.03369 | 0.02463 |
| N | 0.03964 | 0.05007 | 0.06353 | 0.04223 | 0.03941 |
| D | 0.06661 | 0.04359 | 0.05876 | 0.05631 | 0.06404 |
| C | 0.008991 | 0.02702 | 0.03917 | 0.01454 | 0.05419 |
| Q | 0.02738 | 0.04395 | 0.03917 | 0.02631 | 0.08374 |
| E | 0.05476 | 0.03098 | 0.04553 | 0.05931 | 0.009852 |
| G | 0.08051 | 0.107 | 0.09052 | 0.08701 | 0.02956 |
| H | 0.04536 | 0.01765 | 0.01747 | 0.02469 | 0.02956 |
| I | 0.03719 | 0.04323 | 0.04923 | 0.05516 | 0.1133 |
| L | 0.09031 | 0.06376 | 0.05823 | 0.07824 | 0.0197 |
| K | 0.1018 | 0.04143 | 0.05929 | 0.07408 | 0.04926 |
| M | 0.01962 | 0.005764 | 0.01323 | 0.021 | 0.07389 |
| F | 0.05027 | 0.03062 | 0.02753 | 0.03646 | 0.03448 |
| P | 0.03351 | 0.04575 | 0.03759 | 0.04339 | 0.06897 |
| S | 0.04986 | 0.1228 | 0.0667 | 0.07547 | 0.05419 |
| T | 0.04863 | 0.09114 | 0.06194 | 0.05493 | 0.03448 |
| W | 0.01349 | 0.01585 | 0.01588 | 0.01662 | 0.06404 |
| Y | 0.02575 | 0.03963 | 0.05717 | 0.03 | 0.009852 |
| V | 0.06825 | 0.08249 | 0.06511 | 0.08724 | 0.0197 |
Functions to calculate similarities
chou_cor <- function(x,y){
numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)
}
# Cosine similarity used in Higgs and Attwood (2005).
chou_cosine <- function(z.1, z.2){
z.1.abs <- sqrt(sum(z.1^2))
z.2.abs <- sqrt(sum(z.2^2))
my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs)
return(my.cosine)
}
# column coorelation
corr.alpha <- chou_cor(aa.prop[,5], aa.prop[,1])
corr.beta <- chou_cor(aa.prop[,5], aa.prop[,2])
corr.apb <- chou_cor(aa.prop[,5], aa.prop[,3])
corr.adb <- chou_cor(aa.prop[,5], aa.prop[,4])
# cosine similarity
cos.alpha <- chou_cosine(aa.prop[,5], aa.prop[,1])
cos.beta <- chou_cosine(aa.prop[,5], aa.prop[,2])
cos.apb <- chou_cosine(aa.prop[,5], aa.prop[,3])
cos.adb <- chou_cosine(aa.prop[,5], aa.prop[,4])
# Euclidean distance
# flip dataframe
aa.prop.flipped <- t(aa.prop)
round(aa.prop.flipped,2)
## A R N D C Q E G H I L K
## alpha.prop 0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09 0.10
## beta.prop 0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06 0.04
## a.plus.b.prop 0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06 0.06
## a.div.b 0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08 0.07
## GPX3.human.aa.freq 0.12 0.02 0.04 0.06 0.05 0.08 0.01 0.03 0.03 0.11 0.02 0.05
## M F P S T W Y V
## alpha.prop 0.02 0.05 0.03 0.05 0.05 0.01 0.03 0.07
## beta.prop 0.01 0.03 0.05 0.12 0.09 0.02 0.04 0.08
## a.plus.b.prop 0.01 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## a.div.b 0.02 0.04 0.04 0.08 0.05 0.02 0.03 0.09
## GPX3.human.aa.freq 0.07 0.03 0.07 0.05 0.03 0.06 0.01 0.02
dist(aa.prop.flipped, method = "euclidean")
## alpha.prop beta.prop a.plus.b.prop a.div.b
## beta.prop 0.13342098
## a.plus.b.prop 0.09281824 0.08289406
## a.div.b 0.06699039 0.08659174 0.06175113
## GPX3.human.aa.freq 0.18295063 0.19660540 0.16377475 0.17599560
dist.alpha <- dist((aa.prop.flipped[c(1,5),]), method = "euclidean")
dist.beta <- dist((aa.prop.flipped[c(2,5),]), method = "euclidean")
dist.apb <- dist((aa.prop.flipped[c(3,5),]), method = "euclidean")
dist.adb <- dist((aa.prop.flipped[c(4,5),]), method = "euclidean")
# Put information together
# fold types
fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")
# data
corr.sim <- round(c(corr.alpha,corr.beta,corr.apb,corr.adb),5)
cosine.sim <- round(c(cos.alpha,cos.beta,cos.apb,cos.adb),5)
Euclidean.dist <- round(c(dist.alpha,dist.beta,dist.apb,dist.adb),5)
# summary
sim.sum <- c("","","most.sim","")
dist.sum <- c("","","min.dist","")
df <- data.frame(fold.type,
corr.sim ,
cosine.sim ,
Euclidean.dist ,
sim.sum ,
dist.sum )
pander::pander(df)
| fold.type | corr.sim | cosine.sim | Euclidean.dist | sim.sum | dist.sum |
|---|---|---|---|---|---|
| alpha | 0.7512 | 0.7512 | 0.183 | ||
| beta | 0.7163 | 0.7163 | 0.1966 | ||
| alpha plus beta | 0.7905 | 0.7905 | 0.1638 | most.sim | min.dist |
| alpha/beta | 0.7612 | 0.7612 | 0.176 |
names(gpx3_list)[1] <- "NP_000572"
names(gpx3_list)[2] <- "NP_001080311"
names(gpx3_list)[3] <- "NP_001152770"
names(gpx3_list)[4] <- "NP_001108591"
names(gpx3_list)[5] <- "NP_886521"
names(gpx3_list)[6] <- "NP_032186"
names(gpx3_list)[7] <- "NP_110826"
names(gpx3_list)[8] <- "NP_001264291"
names(gpx3_list)[9] <- "NP_001081613"
names(gpx3_list)[10] <- "NP_001028945"
gpx3_list[1]
## $NP_000572
## [1] "MCAARLAAAAAAAQSVYAFSARPLAGGEPVSLGSLRGKVLLIENVASLUGTTVRDYTQMNELQRRLGPRGLVVLGFPCNQFGHQENAKNEEILNSLKYVRPGGGFEPNFMLFEKCEVNGAGAHPLFAFLREALPAPSDDATALMTDPKLITWSPVCRNDVAWNFEKFLVGPDGVPLRRYSRRFQTIDIEPDIEALLSQGPSCA"
gpx3_list
## $NP_000572
## [1] "MCAARLAAAAAAAQSVYAFSARPLAGGEPVSLGSLRGKVLLIENVASLUGTTVRDYTQMNELQRRLGPRGLVVLGFPCNQFGHQENAKNEEILNSLKYVRPGGGFEPNFMLFEKCEVNGAGAHPLFAFLREALPAPSDDATALMTDPKLITWSPVCRNDVAWNFEKFLVGPDGVPLRRYSRRFQTIDIEPDIEALLSQGPSCA"
##
## $NP_001080311
## [1] "MSFNVNRSVSDQFYRYKMPRLIAKVEGKGNGIKTVIVNMVDVAKALNRPPTYPTKYFGCELGAQTQFDIKNDRFIVNGSHEANKLQDMLDGFIKKFVLCPECDNPETDLHVNPKKQTIGNSCKACGYRGMLDTKHKLCTFILKNPTENTDGNAGKKEKDKKNRKGKEKENGSAANESPVLTELNPPHVNAGKEDDDEDWGEDTTAEAQRRRMEEISDHAKNLTLSEDLEKPVEYRVNLLFDFVKKKKEEGVIDFCDKDILAEAERLDVKAMGPLVLSEVLFDDKIRDQIKKYRRHFLRFCHNNKKAQRYLLHGFECVIDMHQSHLLSKIPHILKEMYDADLLEEEVIISWAEKPSKKYVSKELAKDIRAKADPFIKWLKEAEEESSGGEEDEEEDDDDDDETIEVVYSKTATVPKVEAVKSPVDKDDDIDIDAI"
##
## $NP_001152770
## [1] "MCAARLAAAAVYAFSARPLAGGEPVSLGSLRGKVLLIENVASLUGTTVRDYTQMNELQRRLGPRGLVVLGFPCNQFGHQENAKNEEILNSLKYVRPGGGFEPNFMLFEKCEVNGAGAHPLFAFLREALPAPSDDATALMTDPKLITWSPVCRNDVAWNFEKFLVGPDGVPVRRYSRRFQTIDIEPDIEALLSQGPSSA"
##
## $NP_001108591
## [1] "MCAAALAAAAVAAAPRSVYAFSARPLAGGEPMSLGSLRGKVLLIENVASLUGTTVRDYTQMNELQRRLGPRGLVVLGFPCNQFGHQENAKNEEILNSLKYVRPGGGFEPNFTLFEKCEVNGAQAHPLFAFLRESLPAPSDDTTALMTDPKFITWSPVCRNDVAWNFEKFLVGPDGVPVRRYSRRFPTIDIEPDIEALLSQGPSCA"
##
## $NP_886521
## [1] "MSDAHMETVHVIVKGKVQGVGYRHAAVRRAHMLGVTGWVQNLPDGTVEAVVQGTADQVDHMLEWLRRGPPAAQVRELASERSFEEKRYKHFAQL"
##
## $NP_032186
## [1] "MCAARLSAAAQSTVYAFSARPLTGGEPVSLGSLRGKVLLIENVASLUGTTIRDYTEMNDLQKRLGPRGLVVLGFPCNQFGHQENGKNEEILNSLKYVRPGGGFEPNFTLFEKCEVNGEKAHPLFTFLRNALPTPSDDPTALMTDPKYIIWSPVCRNDIAWNFEKFLVGPDGVPVRRYSRRFRTIDIEPDIETLLSQQSGNS"
##
## $NP_110826
## [1] "MIKTETSKIKLINEDNLRLDGRSFNELRPIKIEAGVLNRADGSAYIEWGGNKIIVGVYGPKEAYPKHSQDIDHAVVKARYNMAAFSVDERKRPGPDRRTMEISKVISEALSSSIMIEQFPRAEIDVYIEVLQADAGTRIAGLTAATVALADAGIPMRDMVVGCTAGKVDGHIVLDLSKEEDNFGEADIPMAIMPKTGEIVLLQMDGDVTEDEFYEATSMIIEATKKISQIQRNALLNKYKIEGIEGGE"
##
## $NP_001264291
## [1] "MSLPLNPKPFLNGLTGKPVMVKLKWGMEYKGYLVSVDGYMNMQLANTEEYIDGALSGHLGEVLIRCNNVLYIRGVEEEEEDGEMRE"
##
## $NP_001081613
## [1] "MAVRLCDVASLLRSGSWAAEPWTGQVIAAMETQLSNGPTCNNTANCPNTINCSSPVESNNTEDSKTNLIVNYLPQNMTQEELKSLFGSIGEIESCKLVRDKITEGQSLGYGFVNYIDPKDAEKAINTVNGLRLQTKTIKVSYARPSSASIRDANLYVSGLPKTMTQKELEQLFSQYGRIITSRILVDQVTGVSRGVGFIRFDKRIEAEEAIKGLNGQKPPGATEPITVKFANNPSQKVNHTILSQLYQSPNRRYPGPLAQQAQRFRLDNLLNMAYGGIKSRFSPMAIDGMTSLAGINFPGHAGTGWCIFVYNLAPDADESILWQMFGPFGAVTNVKVIRDFNTNKCKGFGFVTMTNYDEAAMAIASLNGYRLGDRVLQVSFKTSKTHKA"
##
## $NP_001028945
## [1] "MYSKAYILLEREFRELKRDTRKDITAYPVSDDMMNWKAEIEGLRNSVCEGLVFYLTLEFSQEYNSVPPNVKFTTIPFHPNVDPYTGKPSIDFLDKPGKWNTNYTVLSILLDLQMLLSYPVLKNPVNLEAAQLLIRNASTYKMVIQELLPPALPKRESSLVIPEKPQDRVRVIKTISFNDYYKTWSQIATTKVAEHSRNPFVGDPHFMGQYYKWKQQDRQNQVQWESKFALSKWQTARKKIMAQEKSNHYNQVIGIYPSPSELSFEYEIEEEEEEEEEEEEEEEEEEEEEEKEEEEEEPRIYQIPQEWPEEHDSKESWEEEADHLVSWTNGLDEESLNYEYFDNVKNYENLGN"
gpx3_vector <- unlist(gpx3_list)
names(gpx3_vector) <- names(gpx3_list)
gpx3_vector[1]
## NP_000572
## "MCAARLAAAAAAAQSVYAFSARPLAGGEPVSLGSLRGKVLLIENVASLUGTTVRDYTQMNELQRRLGPRGLVVLGFPCNQFGHQENAKNEEILNSLKYVRPGGGFEPNFMLFEKCEVNGAGAHPLFAFLREALPAPSDDATALMTDPKLITWSPVCRNDVAWNFEKFLVGPDGVPLRRYSRRFQTIDIEPDIEALLSQGPSCA"
# PID of Human (1) vs Chimp (2)
align01.02 <- Biostrings::pairwiseAlignment(gpx3_vector[1], gpx3_vector[2])
pid(align01.02)
## [1] 17.27273
# PID of Human (1) vs Mouse (6)
align01.06 <- Biostrings::pairwiseAlignment(gpx3_vector[1], gpx3_vector[6])
pid(align01.06)
## [1] 85.78431
# PID of Human (1) vs Frog (9)
align01.09 <- Biostrings::pairwiseAlignment(gpx3_vector[1], gpx3_vector[9])
pid(align01.09)
## [1] 20.2046
# PID of Chimp (2) vs Mouse (6)
align02.06 <- Biostrings::pairwiseAlignment(gpx3_vector[2], gpx3_vector[6])
pid(align02.06)
## [1] 18.77934
# PID of Chimp (2) vs Frog (9)
align02.09 <- Biostrings::pairwiseAlignment(gpx3_vector[2], gpx3_vector[9])
pid(align02.09)
## [1] 21.25813
# PID of Mouse (6) vs Frog (9)
align06.09 <- Biostrings::pairwiseAlignment(gpx3_vector[6], gpx3_vector[9])
pid(align06.09)
## [1] 21.81818
pid_val <- c(1, NA, NA, NA,
pid(align01.02), 1, NA, NA,
pid(align01.06), pid(align02.06), 1, NA,
pid(align01.09), pid(align02.09), pid(align06.09), 1)
pid_mat <- matrix(pid_val, nrow = 4, byrow = T)
row.names(pid_mat) <- c("Homo","Pan","Mouse","Frog")
colnames(pid_mat) <- c("Homo","Pan","Mouse","Frog")
pander::pander(pid_mat)
| Â | Homo | Pan | Mouse | Frog |
|---|---|---|---|---|
| Homo | 1 | NA | NA | NA |
| Pan | 17.27 | 1 | NA | NA |
| Mouse | 85.78 | 18.78 | 1 | NA |
| Frog | 20.2 | 21.26 | 21.82 | 1 |
# Comparison of Human and Chimps PID with different methods
pid1 <- pid(align01.02, type = "PID1")
pid2 <- pid(align01.02, type = "PID2")
pid3 <- pid(align01.02, type = "PID3")
pid4 <- pid(align01.02, type = "PID4")
pid_method <- c("PID1", "PID2", "PID3", "PID4")
pid_value <- c(pid1, pid2, pid3, pid4)
denominator <- c("(aligned positions + internal gap positions)", "(aligned positions)", "(length shorter sequence)", "(average length of the two sequences)")
pid_df <- data.frame(pid_method, pid_value, denominator)
pander::pander(pid_df)
| pid_method | pid_value | denominator |
|---|---|---|
| PID1 | 17.27 | (aligned positions + internal gap positions) |
| PID2 | 38.78 | (aligned positions) |
| PID3 | 37.44 | (length shorter sequence) |
| PID4 | 23.86 | (average length of the two sequences) |
gpx3_vector_ss <- Biostrings::AAStringSet(gpx3_vector)
gpx3_align <- msa(gpx3_vector_ss, method = "ClustalW")
## use default substitution matrix
class(gpx3_align)
## [1] "MsaAAMultipleAlignment"
## attr(,"package")
## [1] "msa"
is(gpx3_align)
## [1] "MsaAAMultipleAlignment" "AAMultipleAlignment" "MsaMetaData"
## [4] "MultipleAlignment"
# Default output of MSA
gpx3_align
## CLUSTAL 2.1
##
## Call:
## msa(gpx3_vector_ss, method = "ClustalW")
##
## MsaAAMultipleAlignment with 10 rows and 464 columns
## aln names
## [1] -------------------------...------------------------- NP_000572
## [2] -------------------------...------------------------- NP_001152770
## [3] -------------------------...------------------------- NP_001108591
## [4] -------------------------...------------------------- NP_032186
## [5] -------------MSFNVNRSVSDQ...VDKDDDIDIDAI------------- NP_001080311
## [6] -------------------------...------------------------- NP_001264291
## [7] ---------------------MYSK...TNGLDEESLNYEYFDNVKNYENLGN NP_001028945
## [8] -------------------------...------------------------- NP_886521
## [9] MAVRLCDVASLLRSGSWAAEPWTGQ...THKA--------------------- NP_001081613
## [10] -------------------------...------------------------- NP_110826
## Con -------------------------...------------------------- Consensus
# Change class of alignment
class(gpx3_align) <- "AAMultipleAlignment"
# Convert to seqinr format
gpx3_align_seqinr <- msaConvert(gpx3_align, type = "seqinr::alignment")
# Output
compbio4all::print_msa(gpx3_align_seqinr)
## [1] "--------------------------------MCAARLAAAAAAAQ--SVYAFSARPLAG 0"
## [1] "--------------------------------MCAARLAAAA-------VYAFSARPLAG 0"
## [1] "--------------------------------MCAAALAAAAVAAAPRSVYAFSARPLAG 0"
## [1] "--------------------------------MCAARLSAAAQS----TVYAFSARPLTG 0"
## [1] "-------------MSFNVNRSVSDQFYRYKMPRLIAKVEGKGNGIKTVIVNMVDVAKALN 0"
## [1] "------------------------------------------------------------ 0"
## [1] "---------------------MYSKAYILLEREFRELKRDTRKDITAYPVSDDMMNWKAE 0"
## [1] "------------------------------------------------------------ 0"
## [1] "MAVRLCDVASLLRSGSWAAEPWTGQVIAAMETQLSNGPTCNNTANCPNTINCSSPVESNN 0"
## [1] "----------------------------------------------------MIKTETSK 0"
## [1] " "
## [1] "GEPVSLGSLRGKVLLIENVASLU--GTTVRDYTQMNELQRRLG----------------- 0"
## [1] "GEPVSLGSLRGKVLLIENVASLU--GTTVRDYTQMNELQRRLG----------------- 0"
## [1] "GEPMSLGSLRGKVLLIENVASLU--GTTVRDYTQMNELQRRLG----------------- 0"
## [1] "GEPVSLGSLRGKVLLIENVASLU--GTTIRDYTEMNDLQKRLG----------------- 0"
## [1] "RPPTYPTKYFGCELGAQTQFDIKNDRFIVNGSHEANKLQDMLDGFIKKFVLCPECDNPET 0"
## [1] "------------------------------------------------------------ 0"
## [1] "IEGLRNSVCEGLVFYLTLEFSQEYNSVPPNVKFTTIPFHPNVDP---------------- 0"
## [1] "------------------------------------------------------------ 0"
## [1] "TEDSKTNLIVNYLPQNMTQEELKSLFGSIGEIESCKLVRDKITEGQSLGYGFVNYIDPKD 0"
## [1] "IKLINEDNLRLDGRSFNELRPIKIEAGVLNRADGSAYIEWG------------------- 0"
## [1] " "
## [1] "----PRGLVVLGFPCNQFGHQ--------------------------------ENAKNEE 0"
## [1] "----PRGLVVLGFPCNQFGHQ--------------------------------ENAKNEE 0"
## [1] "----PRGLVVLGFPCNQFGHQ--------------------------------ENAKNEE 0"
## [1] "----PRGLVVLGFPCNQFGHQ--------------------------------ENGKNEE 0"
## [1] "DLHVNPKKQTIGNSCKACGYRGMLDTKHKLCTFILKNPTENTDGNAGKKEKDKKNRKGKE 0"
## [1] "----------MSLPLNPKPFL--------------------------------NGLTGKP 0"
## [1] "----YTGKPSIDFLDKPGKWN--------------------------------TNYTVLS 0"
## [1] "----------------------------------------------------MSDAHMET 0"
## [1] "AEKAINTVNGLRLQTKTIKVSYARP---------------------------SSASIRDA 0"
## [1] "-----GNKIIVGVYGPKEAYP--------------------------------KHSQDID 0"
## [1] " "
## [1] "-----------ILNSLKYVRPGGGFEPN--------FMLFEKCEVNGAGAHPLFAFLREA 0"
## [1] "-----------ILNSLKYVRPGGGFEPN--------FMLFEKCEVNGAGAHPLFAFLREA 0"
## [1] "-----------ILNSLKYVRPGGGFEPN--------FTLFEKCEVNGAQAHPLFAFLRES 0"
## [1] "-----------ILNSLKYVRPGGGFEPN--------FTLFEKCEVNGEKAHPLFTFLRNA 0"
## [1] "KENGSAANESPVLTELNPPHVNAGKEDDDEDWGEDTTAEAQRRRMEEISDHAKNLTLSED 0"
## [1] "-----------VMVKLKWGMEYKGYLVS----------------VDG--------YMNMQ 0"
## [1] "-----------ILLDLQMLLSYPVLKNP--------VNLEAAQLLIRNASTYKMVIQELL 0"
## [1] "-----------VHVIVKGKVQGVGYRHA--------------------------AVRRAH 0"
## [1] "NLYVSGLPKTMTQKELEQLFSQYGRIITSR------ILVDQVTGVSRGVGFIRFDKRIEA 0"
## [1] "------------HAVVKARYNMAAFSVD------------ERKRPGPDRRTMEISKVISE 0"
## [1] " "
## [1] "LPAPSDDATALMTDPKLITWSP----VCRNDVAWNFEKFLVGPDG--------------V 0"
## [1] "LPAPSDDATALMTDPKLITWSP----VCRNDVAWNFEKFLVGPDG--------------V 0"
## [1] "LPAPSDDTTALMTDPKFITWSP----VCRNDVAWNFEKFLVGPDG--------------V 0"
## [1] "LPTPSDDPTALMTDPKYIIWSP----VCRNDIAWNFEKFLVGPDG--------------V 0"
## [1] "LEKPVEYRVNLLFDFVKKKKEEGVIDFCDKDILAEAERLDVKAMGPLVLSEVLFDDKIRD 0"
## [1] "LANTEEYIDGALSGHLGEVLIR-----CNN-VLYIRGVEEEEEDG--------------- 0"
## [1] "PPALPKRESSLVIPEKPQDRVRVIKTISFNDYYKTWSQIATTKVAEHSRNPFVGDPHFMG 0"
## [1] "MLGVTGWVQNLPDGTVEAVVQG-----TADQVDHMLEWLRRGPPA--------------A 0"
## [1] "EEAIKGLNGQKPPGATEPITVKFANNPSQKVNHTILSQLYQSPNRRYPG-------PLAQ 0"
## [1] "ALSSSIMIEQFPRAEIDVYIEVLQADAGTRIAGLTAATVALADAG--------------I 0"
## [1] " "
## [1] "PLRRYS---RRFQ-----------------TIDIEPDIEALLSQGPSCA----------- 0"
## [1] "PVRRYS---RRFQ-----------------TIDIEPDIEALLSQGPSSA----------- 0"
## [1] "PVRRYS---RRFP-----------------TIDIEPDIEALLSQGPSCA----------- 0"
## [1] "PVRRYS---RRFR-----------------TIDIEPDIETLLSQQSGNS----------- 0"
## [1] "QIKKYR---RHFLRFCHNNKKAQRYLLHGFECVIDMHQSHLLSKIPHILKEMYDADLLEE 0"
## [1] "EMRE-------------------------------------------------------- 0"
## [1] "QYYKWKQQDR--------------------QNQVQWESKFALSKWQTARKKIMAQEKSNH 0"
## [1] "QVRELAS-----------------------ERSFEEKRYKHFAQL--------------- 0"
## [1] "QAQRFRLDNLLNMAYG------------GIKSRFSPMAIDGMTSLAGIN--FPGHAGTGW 0"
## [1] "PMRDMVVG--------------------CTAGKVDGHIVLDLSK---------EEDNFGE 0"
## [1] " "
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "E-VIISWAEKPSKKYVSKELAKDIRAKADPFIKWLKEAEEESSGGEEDEEEDDDDDDETI 0"
## [1] "------------------------------------------------------------ 0"
## [1] "YNQVIGIYPSPSELSFEYEIEEEEEEEEEEEEEEEEEEEEEEKEEEEEEPRIYQIPQEWP 0"
## [1] "------------------------------------------------------------ 0"
## [1] "CIFVYNLAPDADESILWQMFGPFGAVTNVKVIRDFNTNKCKGFGFVTMTNYDEAAMAIAS 0"
## [1] "ADIPMAIMPKTGEIVLLQMDGDVTEDEFYEATSMIIEATKKISQIQRNALLNKYKIEGIE 0"
## [1] " "
## [1] "-------------------------------------------- 16"
## [1] "-------------------------------------------- 16"
## [1] "-------------------------------------------- 16"
## [1] "-------------------------------------------- 16"
## [1] "EVVYSKTATVPKVEAVKSPVDKDDDIDIDAI------------- 16"
## [1] "-------------------------------------------- 16"
## [1] "EEHDSKESWEEEADHLVSWTNGLDEESLNYEYFDNVKNYENLGN 16"
## [1] "-------------------------------------------- 16"
## [1] "LNGYRLGDRVLQVSFKTSKTHKA--------------------- 16"
## [1] "GGE----------------------------------------- 16"
## [1] " "
ggmsa::ggmsa(gpx3_align, start = 25, end = 100)
gpx3_dist <- seqinr::dist.alignment(gpx3_align_seqinr, matrix = "identity")
is(gpx3_dist)
## [1] "dist" "oldClass"
class(gpx3_dist)
## [1] "dist"
gpx3_dist_round <- round(gpx3_dist, 3)
gpx3_dist_round
## NP_000572 NP_001152770 NP_001108591 NP_032186 NP_001080311
## NP_001152770 0.101
## NP_001108591 0.233 0.214
## NP_032186 0.374 0.349 0.381
## NP_001080311 0.896 0.893 0.897 0.897
## NP_001264291 0.915 0.915 0.915 0.921 0.928
## NP_001028945 0.943 0.942 0.944 0.948 0.937
## NP_886521 0.915 0.909 0.909 0.927 0.950
## NP_001081613 0.962 0.964 0.963 0.959 0.956
## NP_110826 0.960 0.960 0.960 0.960 0.945
## NP_001264291 NP_001028945 NP_886521 NP_001081613
## NP_001152770
## NP_001108591
## NP_032186
## NP_001080311
## NP_001264291
## NP_001028945 0.964
## NP_886521 0.928 0.984
## NP_001081613 0.964 0.970 0.928
## NP_110826 0.964 0.963 0.966 0.967
# Build tree
tree_subset <- nj(gpx3_dist)
# Plot tree
plot.phylo(tree_subset, main = "Phylogenetic Tree", use.edge.length = FALSE)
mtext(text = "GPX1 family gene tree - rooted, no branch lengths")