Code By: Deeksha Sesha
This code compiles summary information about the gene SLC24A5 (Sodium/potassium/calcium exchanger 5). SLC24A5 codes for the Sodium/potassium/calcium exchanger 5 protein, which is very important for skin pigmentation.
In addition to showing key summarizing information about SLC24A5 in humans, this code also conducts phylogenetic analyses to compare the evolutionary relationship between the gene in humans and in other species.
Resources used to obtain key information about the protein: Refseq Gene: https://www.ncbi.nlm.nih.gov/gene/283652 Refseq Homologene: https://www.ncbi.nlm.nih.gov/homologene?LinkName=gene_homologene&from_uid=283652 UniProt: https://www.uniprot.org/uniprot/Q71RS6#family_and_domains AlphaFold: https://alphafold.ebi.ac.uk/entry/Q71RS6
Note: SLC24A5 is not on PDB, included AlphaFold instead
# github packages
library(compbio4all)
library(ggmsa)
# CRAN packages
library(rentrez)
library(seqinr)
library(ape)
library(pander)
library(ggplot2)
# Bioconductor packages
library(msa)
library(drawProteins)
library(Biostrings)
library(HGNChelper)
Accession Numbers were obtained from RefSeq, RefSeq HomoloGene, UniProt, Neanderthal Genome Database, and BLAST. This protein is not in PDB.
This function confirms the validity of my gene name and any aliases.
HGNChelper::checkGeneSymbols(x = c("SLC24A5"))
## Maps last updated on: Thu Oct 24 12:31:05 2019
## x Approved Suggested.Symbol
## 1 SLC24A5 TRUE SLC24A5
# data for making data table
slc24a5_table <- c("NP_995322.1","Q71RS6","NA","Homo sapiens","Human","SLC24A5",
"NP_778199.2","Q8C261","NA","Mus musculus","Mouse","SLC24A5",
"XP_510380.2","A0A2J8NTC3","NA","Pan troglodytes","Chimpanzee","SLC24A5",
"XP_001112601.1","F6ZB51","NA","Macaca mulatta","Rhesus Monkey","SLC24A5",
"NP_001121981.2","F1MIZ1","NA","Bos taurus","Cow","SLC24A5",
"NP_001025451.1","Q49SH1","NA","Danio rerio","Zebrafish","SLC24A5",
"NP_001033586.3","B5G557","NA","Gallus gallus","Chicken","SLC24A5",
"XP_032831753.1","NA","NA","Petromyzon marinus","Sea Lamprey","SLC24A5",
"XP_036592760.1","NA","NA","Trichosurus vulpecula","Possum","SLC24A5",
"XP_040269928.1","NA","NA","Bufo bufo","Toad","SLC24A5")
Convert vector information into a table:
# convert to matrix
slc24a5_table_matrix <- matrix(slc24a5_table,
byrow = T,
nrow = 10)
# convert to data frame
slc24a5.df <- data.frame(slc24a5_table_matrix,
stringsAsFactors = F)
# naming columns
names(slc24a5.df) <- c("ncbi.protein.accession", "UniProt.id", "PDB", "species", "common.name",
"gene.name")
pander(slc24a5.df)
| ncbi.protein.accession | UniProt.id | PDB | species |
|---|---|---|---|
| NP_995322.1 | Q71RS6 | NA | Homo sapiens |
| NP_778199.2 | Q8C261 | NA | Mus musculus |
| XP_510380.2 | A0A2J8NTC3 | NA | Pan troglodytes |
| XP_001112601.1 | F6ZB51 | NA | Macaca mulatta |
| NP_001121981.2 | F1MIZ1 | NA | Bos taurus |
| NP_001025451.1 | Q49SH1 | NA | Danio rerio |
| NP_001033586.3 | B5G557 | NA | Gallus gallus |
| XP_032831753.1 | NA | NA | Petromyzon marinus |
| XP_036592760.1 | NA | NA | Trichosurus vulpecula |
| XP_040269928.1 | NA | NA | Bufo bufo |
| common.name | gene.name |
|---|---|
| Human | SLC24A5 |
| Mouse | SLC24A5 |
| Chimpanzee | SLC24A5 |
| Rhesus Monkey | SLC24A5 |
| Cow | SLC24A5 |
| Zebrafish | SLC24A5 |
| Chicken | SLC24A5 |
| Sea Lamprey | SLC24A5 |
| Possum | SLC24A5 |
| Toad | SLC24A5 |
# downloading fasta sequences
slc24a5_fasta_list <- compbio4all::entrez_fetch_list(db = "protein",
id = slc24a5.df$ncbi.protein.accession,
rettype = "fasta")
length(slc24a5_fasta_list)
## [1] 10
Cleaning Fasta Files
for(i in 1:length(slc24a5_fasta_list)){
slc24a5_fasta_list[[i]] <- compbio4all::fasta_cleaner(slc24a5_fasta_list[[i]], parse = F)
}
Note: SLC24A5 has no entries on PDB, DisProt, or RepeatsDB.
source <- c("PFAM","Uniprot","CATH","Alphafold")
protein.property <- c("There are two Na_Ca_ex, or sodium-calcium exchanger domains from AA 71-216 and AA 333-484.",
"SLC24A5 is a multi-pass membrane protein, found primarily in the Golgi Apparatus.",
"The CATH database contained 5 matching domains/regions to the human SLC24A5 FASTA file. For all 5, CATH reported the secondary structure as mainly alpha helices.",
"From analyzing the AlphaFold structure prediction of SLC24A5, it is overwhelmingly composed of alpha helices. Also, the regions with highest model confidence are all alpha helices.")
link <- c("http://pfam.xfam.org/protein/Q71RS6",
"https://www.uniprot.org/uniprot/Q71RS6",
"http://www.cathdb.info/version/v4_3_0/domain/5jdqA02",
"https://alphafold.ebi.ac.uk/entry/Q71RS6")
protein.info.df <- data.frame(source,
protein.property,
link)
colnames(protein.info.df) <- c("Source","Protein Property","Link")
pander::pander(protein.info.df)
| Source | Protein Property |
|---|---|
| PFAM | There are two Na_Ca_ex, or sodium-calcium exchanger domains from AA 71-216 and AA 333-484. |
| Uniprot | SLC24A5 is a multi-pass membrane protein, found primarily in the Golgi Apparatus. |
| CATH | The CATH database contained 5 matching domains/regions to the human SLC24A5 FASTA file. For all 5, CATH reported the secondary structure as mainly alpha helices. |
| Alphafold | From analyzing the AlphaFold structure prediction of SLC24A5, it is overwhelmingly composed of alpha helices. Also, the regions with highest model confidence are all alpha helices. |
| Link |
|---|
| http://pfam.xfam.org/protein/Q71RS6 |
| https://www.uniprot.org/uniprot/Q71RS6 |
| http://www.cathdb.info/version/v4_3_0/domain/5jdqA02 |
| https://alphafold.ebi.ac.uk/entry/Q71RS6 |
Building the Diagram
Q71RS6_json <- drawProteins::get_features("Q71RS6")
## [1] "Download has worked"
is(Q71RS6_json)
## [1] "list" "vector" "list_OR_List" "vector_OR_Vector"
## [5] "vector_OR_factor"
From the raw data the webpage is converted to a dataframe:
my_prot_df <- drawProteins::feature_to_dataframe(Q71RS6_json)
is(my_prot_df)
## [1] "data.frame" "list" "oldClass" "vector"
## [5] "list_OR_List" "vector_OR_Vector" "vector_OR_factor"
my_prot_df[,-2]
## type begin end length accession entryName taxid order
## featuresTemp SIGNAL 1 29 28 Q71RS6 NCKX5_HUMAN 9606 1
## featuresTemp.1 CHAIN 30 500 470 Q71RS6 NCKX5_HUMAN 9606 1
## featuresTemp.2 TOPO_DOM 30 66 36 Q71RS6 NCKX5_HUMAN 9606 1
## featuresTemp.3 TRANSMEM 67 87 20 Q71RS6 NCKX5_HUMAN 9606 1
## featuresTemp.4 TOPO_DOM 88 111 23 Q71RS6 NCKX5_HUMAN 9606 1
## featuresTemp.5 TRANSMEM 112 132 20 Q71RS6 NCKX5_HUMAN 9606 1
## featuresTemp.6 TOPO_DOM 133 136 3 Q71RS6 NCKX5_HUMAN 9606 1
## featuresTemp.7 TRANSMEM 137 157 20 Q71RS6 NCKX5_HUMAN 9606 1
## featuresTemp.8 TOPO_DOM 158 169 11 Q71RS6 NCKX5_HUMAN 9606 1
## featuresTemp.9 TRANSMEM 170 190 20 Q71RS6 NCKX5_HUMAN 9606 1
## featuresTemp.10 TOPO_DOM 191 195 4 Q71RS6 NCKX5_HUMAN 9606 1
## featuresTemp.11 TRANSMEM 196 216 20 Q71RS6 NCKX5_HUMAN 9606 1
## featuresTemp.12 TOPO_DOM 217 302 85 Q71RS6 NCKX5_HUMAN 9606 1
## featuresTemp.13 TRANSMEM 303 323 20 Q71RS6 NCKX5_HUMAN 9606 1
## featuresTemp.14 TOPO_DOM 324 333 9 Q71RS6 NCKX5_HUMAN 9606 1
## featuresTemp.15 TRANSMEM 334 354 20 Q71RS6 NCKX5_HUMAN 9606 1
## featuresTemp.16 TOPO_DOM 355 368 13 Q71RS6 NCKX5_HUMAN 9606 1
## featuresTemp.17 TRANSMEM 369 389 20 Q71RS6 NCKX5_HUMAN 9606 1
## featuresTemp.18 TOPO_DOM 390 399 9 Q71RS6 NCKX5_HUMAN 9606 1
## featuresTemp.19 TRANSMEM 400 420 20 Q71RS6 NCKX5_HUMAN 9606 1
## featuresTemp.20 TOPO_DOM 421 437 16 Q71RS6 NCKX5_HUMAN 9606 1
## featuresTemp.21 TRANSMEM 438 458 20 Q71RS6 NCKX5_HUMAN 9606 1
## featuresTemp.22 TOPO_DOM 459 468 9 Q71RS6 NCKX5_HUMAN 9606 1
## featuresTemp.23 TRANSMEM 469 489 20 Q71RS6 NCKX5_HUMAN 9606 1
## featuresTemp.24 TOPO_DOM 490 500 10 Q71RS6 NCKX5_HUMAN 9606 1
## featuresTemp.25 VAR_SEQ 41 101 60 Q71RS6 NCKX5_HUMAN 9606 1
## featuresTemp.26 VARIANT 111 111 0 Q71RS6 NCKX5_HUMAN 9606 1
This protein has no domains or motifs or regions information. So I elected to include the receptor domain in this summary since it was the most informative about SLC24A5 in particular.
my_canvas <- draw_canvas(my_prot_df)
my_canvas <- draw_chains(my_canvas, my_prot_df, label_size = 2.5)
my_canvas <- draw_recept_dom(my_canvas, my_prot_df)
my_canvas
Here, I will be creating a self vs. self dotplot of the human SLC24A5 gene sequence.
slc24a5_human_vector <- compbio4all::fasta_cleaner(slc24a5_fasta_list[1], parse = T)
slc24a5_human_vector
## [1] "M" "Q" "T" "K" "G" "G" "Q" "T" "W" "A" "R" "R" "A" "L" "L" "L" "G" "I"
## [19] "L" "W" "A" "T" "A" "H" "L" "P" "L" "S" "G" "T" "S" "L" "P" "Q" "R" "L"
## [37] "P" "R" "A" "T" "G" "N" "S" "T" "Q" "C" "V" "I" "S" "P" "S" "S" "E" "F"
## [55] "P" "E" "G" "F" "F" "T" "R" "Q" "E" "R" "R" "D" "G" "G" "I" "I" "I" "Y"
## [73] "F" "L" "I" "I" "V" "Y" "M" "F" "M" "A" "I" "S" "I" "V" "C" "D" "E" "Y"
## [91] "F" "L" "P" "S" "L" "E" "I" "I" "S" "E" "S" "L" "G" "L" "S" "Q" "D" "V"
## [109] "A" "G" "T" "T" "F" "M" "A" "A" "G" "S" "S" "A" "P" "E" "L" "V" "T" "A"
## [127] "F" "L" "G" "V" "F" "I" "T" "K" "G" "D" "I" "G" "I" "S" "T" "I" "L" "G"
## [145] "S" "A" "I" "Y" "N" "L" "L" "G" "I" "C" "A" "A" "C" "G" "L" "L" "S" "N"
## [163] "T" "V" "S" "T" "L" "S" "C" "W" "P" "L" "F" "R" "D" "C" "A" "A" "Y" "T"
## [181] "I" "S" "A" "A" "A" "V" "L" "G" "I" "I" "Y" "D" "N" "Q" "V" "Y" "W" "Y"
## [199] "E" "G" "A" "L" "L" "L" "L" "I" "Y" "G" "L" "Y" "V" "L" "V" "L" "C" "F"
## [217] "D" "I" "K" "I" "N" "Q" "Y" "I" "I" "K" "K" "C" "S" "P" "C" "C" "A" "C"
## [235] "L" "A" "K" "A" "M" "E" "R" "S" "E" "Q" "Q" "P" "L" "M" "G" "W" "E" "D"
## [253] "E" "G" "Q" "P" "F" "I" "R" "R" "Q" "S" "R" "T" "D" "S" "G" "I" "F" "Y"
## [271] "E" "D" "S" "G" "Y" "S" "Q" "L" "S" "I" "S" "L" "H" "G" "L" "S" "Q" "V"
## [289] "S" "E" "D" "P" "P" "S" "V" "F" "N" "M" "P" "E" "A" "D" "L" "K" "R" "I"
## [307] "F" "W" "V" "L" "S" "L" "P" "I" "I" "T" "L" "L" "F" "L" "T" "T" "P" "D"
## [325] "C" "R" "K" "K" "F" "W" "K" "N" "Y" "F" "V" "I" "T" "F" "F" "M" "S" "A"
## [343] "I" "W" "I" "S" "A" "F" "T" "Y" "I" "L" "V" "W" "M" "V" "T" "I" "T" "G"
## [361] "E" "T" "L" "E" "I" "P" "D" "T" "V" "M" "G" "L" "T" "L" "L" "A" "A" "G"
## [379] "T" "S" "I" "P" "D" "T" "I" "A" "S" "V" "L" "V" "A" "R" "K" "G" "K" "G"
## [397] "D" "M" "A" "M" "S" "N" "I" "V" "G" "S" "N" "V" "F" "D" "M" "L" "C" "L"
## [415] "G" "I" "P" "W" "F" "I" "K" "T" "A" "F" "I" "N" "G" "S" "A" "P" "A" "E"
## [433] "V" "N" "S" "R" "G" "L" "T" "Y" "I" "T" "I" "S" "L" "N" "I" "S" "I" "I"
## [451] "F" "L" "F" "L" "A" "V" "H" "F" "N" "G" "W" "K" "L" "D" "R" "K" "L" "G"
## [469] "I" "V" "C" "L" "L" "S" "Y" "L" "G" "L" "A" "T" "L" "S" "V" "L" "Y" "E"
## [487] "L" "G" "I" "I" "G" "N" "N" "K" "I" "R" "G" "C" "G" "G"
# Change parse to equal true in order to separate each letter in the sequence by a comma; need this in order to use dotPlot() function.
Here, I am plotting a dotPlot with the default arguments in the dotPlot() function where wsize, wstep, and nmatch are all equal to 1. In my modified plot, I increased the values of wsize, wstep, and nmatch. Wsize and wstep involve the spacing of the dotplot, while increasing nmatch increases the number of matches in a region needed to produce a dot.
plot_default <- seqinr::dotPlot(slc24a5_human_vector,slc24a5_human_vector)
plot_modified1 <- seqinr::dotPlot(slc24a5_human_vector,
slc24a5_human_vector,
wsize = 8,
wstep = 8,
nmatch = 3)
plot_modified2 <- seqinr::dotPlot(slc24a5_human_vector,
slc24a5_human_vector,
wsize = 10,
wstep = 10,
nmatch = 5)
plot_modified3 <- seqinr::dotPlot(slc24a5_human_vector,
slc24a5_human_vector,
wsize = 15,
wstep = 15,
nmatch = 2)
par(mfrow = c(2,2))
In my opinion, plot_modified1 was the best since it emphasized the diagonal and decreased noise without completely removing the background data.
plot_modified1 <- seqinr::dotPlot(slc24a5_human_vector,
slc24a5_human_vector,
wsize = 8,
wstep = 8,
nmatch = 3)
Alphafold indicates a primary alpha helix structure, so I anticipate my prediction methods yielding similar results.
I will use data collected by Chou and Zhang (1995) as a basis for predicting the structure of SLC24A5.
# enter once
aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
"L","K","M","F","P","S","T","W","Y","V")
# enter again
aa.1.2 <- c("A","R","N","D","C","Q","E","G","H","I",
"L","K","M","F","P","S","T","W","Y","V")
# data double entry: checking that lengths of both vectors is the same
length(aa.1.1) == length(aa.1.2)
## [1] TRUE
# data double entry: checking that the content of both vectors is the same
aa.1.1 == aa.1.2
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE
# Making sure length/number of unique values is the same
unique(aa.1.1)
## [1] "A" "R" "N" "D" "C" "Q" "E" "G" "H" "I" "L" "K" "M" "F" "P" "S" "T" "W" "Y"
## [20] "V"
length(unique(aa.1.1)) == length(unique(aa.1.2))
## [1] TRUE
Now I am making vectors with the frequency of each amino acid that Chou uses. Here, the logical check is making sure the total of the values in each vector equals the total obtained by Chou.
# alpha proteins
alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91,
221, 249, 48, 123, 82, 122, 119, 33, 63, 167)
# beta proteins
beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120,
177, 115, 16, 85, 127, 341, 253, 44, 110, 229)
# alpha + beta
a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
110, 112, 25, 52, 71, 126, 117, 30, 108, 123)
# alpha/beta
a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239,
339, 321, 91, 158, 188, 327, 238, 72, 130, 378)
# check against chou's total
sum(alpha) == 2447
## [1] TRUE
sum(beta) == 2776
## [1] TRUE
sum(a.plus.b) == 1889
## [1] TRUE
sum(a.div.b) == 4333
## [1] TRUE
Creating a table of values:
pander::pander(data.frame(aa.1.1, alpha, beta, a.plus.b, a.div.b))
| aa.1.1 | alpha | beta | a.plus.b | a.div.b |
|---|---|---|---|---|
| A | 285 | 203 | 175 | 361 |
| R | 53 | 67 | 78 | 146 |
| N | 97 | 139 | 120 | 183 |
| D | 163 | 121 | 111 | 244 |
| C | 22 | 75 | 74 | 63 |
| Q | 67 | 122 | 74 | 114 |
| E | 134 | 86 | 86 | 257 |
| G | 197 | 297 | 171 | 377 |
| H | 111 | 49 | 33 | 107 |
| I | 91 | 120 | 93 | 239 |
| L | 221 | 177 | 110 | 339 |
| K | 249 | 115 | 112 | 321 |
| M | 48 | 16 | 25 | 91 |
| F | 123 | 85 | 52 | 158 |
| P | 82 | 127 | 71 | 188 |
| S | 122 | 341 | 126 | 327 |
| T | 119 | 253 | 117 | 238 |
| W | 33 | 44 | 30 | 72 |
| Y | 63 | 110 | 108 | 130 |
| V | 167 | 229 | 123 | 378 |
Calculating the proportions of all 20 amino acids in each fold type:
alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)
# Building a dataframe from vectors
aa.prop <- data.frame(alpha.prop,
beta.prop,
a.plus.b.prop,
a.div.b,
row.names = aa.1.1)
Displaying the frequency table:
pander::pander(aa.prop)
| alpha.prop | beta.prop | a.plus.b.prop | a.div.b | |
|---|---|---|---|---|
| A | 0.1165 | 0.07313 | 0.09264 | 0.08331 |
| R | 0.02166 | 0.02414 | 0.04129 | 0.03369 |
| N | 0.03964 | 0.05007 | 0.06353 | 0.04223 |
| D | 0.06661 | 0.04359 | 0.05876 | 0.05631 |
| C | 0.008991 | 0.02702 | 0.03917 | 0.01454 |
| Q | 0.02738 | 0.04395 | 0.03917 | 0.02631 |
| E | 0.05476 | 0.03098 | 0.04553 | 0.05931 |
| G | 0.08051 | 0.107 | 0.09052 | 0.08701 |
| H | 0.04536 | 0.01765 | 0.01747 | 0.02469 |
| I | 0.03719 | 0.04323 | 0.04923 | 0.05516 |
| L | 0.09031 | 0.06376 | 0.05823 | 0.07824 |
| K | 0.1018 | 0.04143 | 0.05929 | 0.07408 |
| M | 0.01962 | 0.005764 | 0.01323 | 0.021 |
| F | 0.05027 | 0.03062 | 0.02753 | 0.03646 |
| P | 0.03351 | 0.04575 | 0.03759 | 0.04339 |
| S | 0.04986 | 0.1228 | 0.0667 | 0.07547 |
| T | 0.04863 | 0.09114 | 0.06194 | 0.05493 |
| W | 0.01349 | 0.01585 | 0.01588 | 0.01662 |
| Y | 0.02575 | 0.03963 | 0.05717 | 0.03 |
| V | 0.06825 | 0.08249 | 0.06511 | 0.08724 |
In order to predict secondary structure of my protein, I will use amino acid frequencies.
slc24a5_human_table <- table(slc24a5_human_vector)/length(slc24a5_human_vector)
Function to convert this table into a vector:
table_to_vector <- function(table_x){
table_names <- attr(table_x, "dimnames")[[1]]
table_vect <- as.vector(table_x)
names(table_vect) <- table_names
return(table_vect)
}
slc24a5.human.aa.freq <- table_to_vector(slc24a5_human_table)
slc24a5.human.aa.freq
## A C D E F G H I K L M N P
## 0.072 0.030 0.036 0.038 0.054 0.084 0.006 0.104 0.032 0.124 0.026 0.030 0.040
## Q R S T V W Y
## 0.028 0.036 0.086 0.064 0.052 0.022 0.036
Next, I will check for the presence of U, an unknown amino acid. We have to remove all instances of U in order to ensure that the analysis runs as intended.
aa.names <- names(slc24a5.human.aa.freq)
any(aa.names == "U")
## [1] FALSE
sum(slc24a5.human.aa.freq)
## [1] 1
Since the code chunk outputted false, this shows that the SLC24A5 sequence does not contain any instances of U. If it did, we would have to use which() to identify the index value(s) that U occurs in, and then remove them.
Also, as a way of checking my data, I calculated the sum of the amino acid frequency, ensuring that it was 1. It’s especially important that I obtained exactly 1 since I didn’t remove any elements.
Now that all the checking is done, I’ll add my data on SLC24A5 to the amino acid frequency table I made earlier.
aa.prop$slc24a5.human.aa.freq <- slc24a5.human.aa.freq
pander::pander(aa.prop)
| alpha.prop | beta.prop | a.plus.b.prop | a.div.b | slc24a5.human.aa.freq | |
|---|---|---|---|---|---|
| A | 0.1165 | 0.07313 | 0.09264 | 0.08331 | 0.072 |
| R | 0.02166 | 0.02414 | 0.04129 | 0.03369 | 0.03 |
| N | 0.03964 | 0.05007 | 0.06353 | 0.04223 | 0.036 |
| D | 0.06661 | 0.04359 | 0.05876 | 0.05631 | 0.038 |
| C | 0.008991 | 0.02702 | 0.03917 | 0.01454 | 0.054 |
| Q | 0.02738 | 0.04395 | 0.03917 | 0.02631 | 0.084 |
| E | 0.05476 | 0.03098 | 0.04553 | 0.05931 | 0.006 |
| G | 0.08051 | 0.107 | 0.09052 | 0.08701 | 0.104 |
| H | 0.04536 | 0.01765 | 0.01747 | 0.02469 | 0.032 |
| I | 0.03719 | 0.04323 | 0.04923 | 0.05516 | 0.124 |
| L | 0.09031 | 0.06376 | 0.05823 | 0.07824 | 0.026 |
| K | 0.1018 | 0.04143 | 0.05929 | 0.07408 | 0.03 |
| M | 0.01962 | 0.005764 | 0.01323 | 0.021 | 0.04 |
| F | 0.05027 | 0.03062 | 0.02753 | 0.03646 | 0.028 |
| P | 0.03351 | 0.04575 | 0.03759 | 0.04339 | 0.036 |
| S | 0.04986 | 0.1228 | 0.0667 | 0.07547 | 0.086 |
| T | 0.04863 | 0.09114 | 0.06194 | 0.05493 | 0.064 |
| W | 0.01349 | 0.01585 | 0.01588 | 0.01662 | 0.052 |
| Y | 0.02575 | 0.03963 | 0.05717 | 0.03 | 0.022 |
| V | 0.06825 | 0.08249 | 0.06511 | 0.08724 | 0.036 |
Two custom functions are needed: one to calculate correlations between two columns of our table, and one to calculate cosine similarities. These methods were used in Chou and Zhang (1992) and Higgs and Attwood (2005).
# Correlation used in Chou and Zhang 1992.
chou_cor <- function(x,y){
numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)
}
# Cosine similarity used in Higgs and Attwood (2005).
chou_cosine <- function(z.1, z.2){
z.1.abs <- sqrt(sum(z.1^2))
z.2.abs <- sqrt(sum(z.2^2))
my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs)
return(my.cosine)
}
Calculating correlation between SLC24A5 and the 4 groups:
corr.alpha <- chou_cor(aa.prop[,5], aa.prop[,1])
corr.beta <- chou_cor(aa.prop[,5], aa.prop[,2])
corr.apb <- chou_cor(aa.prop[,5], aa.prop[,3])
corr.adb <- chou_cor(aa.prop[,5], aa.prop[,4])
Calculating cosine similarity between SLC24A5 and the 4 groups:
cos.alpha <- chou_cosine(aa.prop[,5], aa.prop[,1])
cos.beta <- chou_cosine(aa.prop[,5], aa.prop[,2])
cos.apb <- chou_cosine(aa.prop[,5], aa.prop[,3])
cos.adb <- chou_cosine(aa.prop[,5], aa.prop[,4])
Calculate distance. Note: we need to flip the dataframe on its side using a command called t().
aa.prop.flipped <- t(aa.prop)
round(aa.prop.flipped,2)
## A R N D C Q E G H I L
## alpha.prop 0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09
## beta.prop 0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06
## a.plus.b.prop 0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06
## a.div.b 0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08
## slc24a5.human.aa.freq 0.07 0.03 0.04 0.04 0.05 0.08 0.01 0.10 0.03 0.12 0.03
## K M F P S T W Y V
## alpha.prop 0.10 0.02 0.05 0.03 0.05 0.05 0.01 0.03 0.07
## beta.prop 0.04 0.01 0.03 0.05 0.12 0.09 0.02 0.04 0.08
## a.plus.b.prop 0.06 0.01 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## a.div.b 0.07 0.02 0.04 0.04 0.08 0.05 0.02 0.03 0.09
## slc24a5.human.aa.freq 0.03 0.04 0.03 0.04 0.09 0.06 0.05 0.02 0.04
Distance Matrix:
dist(aa.prop.flipped, method = "euclidean")
## alpha.prop beta.prop a.plus.b.prop a.div.b
## beta.prop 0.13342098
## a.plus.b.prop 0.09281824 0.08289406
## a.div.b 0.06699039 0.08659174 0.06175113
## slc24a5.human.aa.freq 0.18176263 0.13661334 0.13378590 0.15039406
Getting individual distances between each column:
dist.alpha <- dist((aa.prop.flipped[c(1,5),]), method = "euclidean")
dist.beta <- dist((aa.prop.flipped[c(2,5),]), method = "euclidean")
dist.apb <- dist((aa.prop.flipped[c(3,5),]), method = "euclidean")
dist.adb <- dist((aa.prop.flipped[c(4,5),]), method = "euclidean")
Compiling distance information & displaying output:
# fold types
fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")
# data
corr.sim <- round(c(corr.alpha,corr.beta,corr.apb,corr.adb),5)
cosine.sim <- round(c(cos.alpha,cos.beta,cos.apb,cos.adb),5)
Euclidean.dist <- round(c(dist.alpha,dist.beta,dist.apb,dist.adb),5)
# summary
sim.sum <- c("","","most.sim","")
dist.sum <- c("","","min.dist","")
df <- data.frame(fold.type,
corr.sim ,
cosine.sim ,
Euclidean.dist ,
sim.sum ,
dist.sum )
# display output
pander::pander(df)
| fold.type | corr.sim | cosine.sim | Euclidean.dist | sim.sum | dist.sum |
|---|---|---|---|---|---|
| alpha | 0.7534 | 0.7534 | 0.1818 | ||
| beta | 0.8625 | 0.8625 | 0.1366 | ||
| alpha plus beta | 0.8602 | 0.8602 | 0.1338 | most.sim | min.dist |
| alpha/beta | 0.8251 | 0.8251 | 0.1504 |
Convert all FASTA records intro entries in a single vector. FASTA entries are contained in a list produced at the beginning of the script. They were cleaned to remove the header and newline characters.
names(slc24a5_fasta_list)
## [1] "NP_995322.1" "NP_778199.2" "XP_510380.2" "XP_001112601.1"
## [5] "NP_001121981.2" "NP_001025451.1" "NP_001033586.3" "XP_032831753.1"
## [9] "XP_036592760.1" "XP_040269928.1"
length(slc24a5_fasta_list)
## [1] 10
Making each entry of the list into a vector:
slc24a5.vector <- rep(NA, length(slc24a5_fasta_list))
for(i in 1:length(slc24a5_fasta_list)){
slc24a5.vector[i] <- slc24a5_fasta_list[[i]]
}
#Name the vector
names(slc24a5.vector) <- names(slc24a5_fasta_list)
Pairwise alignments for human, chimpanzee, cow, and toad:
# Human vs. chimpanzee
align.01.03 <- Biostrings::pairwiseAlignment(
slc24a5.vector[1],
slc24a5.vector[3])
# Human vs. cow
align.01.05 <- Biostrings::pairwiseAlignment(
slc24a5.vector[1],
slc24a5.vector[5])
# Human vs. toad
align.01.10 <- Biostrings::pairwiseAlignment(
slc24a5.vector[1],
slc24a5.vector[10])
# Chimpanzee vs. cow
align.03.05 <- Biostrings::pairwiseAlignment(
slc24a5.vector[3],
slc24a5.vector[5])
# Chimpanzee vs. toad
align.03.10 <- Biostrings::pairwiseAlignment(
slc24a5.vector[3],
slc24a5.vector[10])
# Cow vs. toad
align.05.10 <- Biostrings::pairwiseAlignment(
slc24a5.vector[3],
slc24a5.vector[5])
Get PID values from each comparison:
Biostrings::pid(align.01.03)
## [1] 99.4
Biostrings::pid(align.01.05)
## [1] 90.41916
Biostrings::pid(align.01.10)
## [1] 67.95367
Biostrings::pid(align.03.05)
## [1] 90.61876
Biostrings::pid(align.03.10)
## [1] 68.33977
Biostrings::pid(align.05.10)
## [1] 90.61876
Building PID Matrix:
pids <- c(1, NA, NA, NA,
pid(align.01.03), 1, NA, NA,
pid(align.01.05), pid(align.03.05), 1, NA,
pid(align.01.10), pid(align.03.10), pid(align.05.10), 1)
mat <- matrix(pids, nrow = 4, byrow = T)
row.names(mat) <- c("Homo","Pan","Cow","Toad")
colnames(mat) <- c("Homo","Pan","Cow","Toad")
pander::pander(mat)
| Homo | Pan | Cow | Toad | |
|---|---|---|---|---|
| Homo | 1 | NA | NA | NA |
| Pan | 99.4 | 1 | NA | NA |
| Cow | 90.42 | 90.62 | 1 | NA |
| Toad | 67.95 | 68.34 | 90.62 | 1 |
#Calculating the PIDs
method <- c("PID1","PID2","PID3","PID4")
pid1 <- pid(align.01.03, type = "PID1")
pid2 <- pid(align.01.03, type = "PID2")
pid3 <- pid(align.01.03, type = "PID3")
pid4 <- pid(align.01.03, type = "PID4")
PID <- c(pid1,pid2,pid3,pid4)
denominator <- c("aligned positions + internal gap positions",
"aligned positions",
"length of shorter seq",
"average length of both seq")
#Displaying the data
pid.df <- data.frame(method,PID,denominator)
pander::pander(pid.df)
| method | PID | denominator |
|---|---|---|
| PID1 | 99.4 | aligned positions + internal gap positions |
| PID2 | 99.4 | aligned positions |
| PID3 | 99.4 | length of shorter seq |
| PID4 | 99.4 | average length of both seq |
slc24a5.vector.ss <- Biostrings::AAStringSet(slc24a5.vector)
slc24a5_align <- msa(slc24a5.vector.ss,
method = "ClustalW",
substitutionMatrix = "default")
## use default substitution matrix
class(slc24a5_align)
## [1] "MsaAAMultipleAlignment"
## attr(,"package")
## [1] "msa"
is(slc24a5_align)
## [1] "MsaAAMultipleAlignment" "AAMultipleAlignment" "MsaMetaData"
## [4] "MultipleAlignment"
# msa() produces objects of type MSA!
Default output of MSA:
slc24a5_align
## CLUSTAL 2.1
##
## Call:
## msa(slc24a5.vector.ss, method = "ClustalW", substitutionMatrix = "default")
##
## MsaAAMultipleAlignment with 10 rows and 549 columns
## aln names
## [1] MQTKG--GQTWARRA--------LL...GLATLSVLYELGIIGNN-KIRGCGG NP_995322.1
## [2] MQTKG--GQTWARRA--------LL...GLATLSVLYELGIIGNN-KIRGCGG XP_510380.2
## [3] MQTKR--GHTWARRA--------LL...GLATLSVLYELGIIGNN-KIRGCGG XP_001112601.1
## [4] MQIKG--GPRWARRA--------LL...GLTTLSVLYELGIIGNN-KIRGCGD NP_001121981.2
## [5] MRTKW--GPTWTRRV--------LL...GLATLSVLYEIGIIGNN-RIRGCGV NP_778199.2
## [6] MEEGA--SRRYCRRL--------ML...VLATLSVLYELGIIGNN-KIRGCGD XP_036592760.1
## [7] MQARA--GGLWRRRS--------AL...VFTVLSILYELGIIGNN-PTRVCGN NP_001033586.3
## [8] MEAAGSKGGRRNRRASRTALATFTF...VFATLSILYELGIIGGM-PVRSCGD XP_040269928.1
## [9] MRTDVFLQRRKRRDVLLS---IIAL...LFATLSILYELGIIGNN-PIRSCRD NP_001025451.1
## [10] MDPSL--GSSAARCILG---ASSTV...VFVAVAVLHEVGVLGGDPAARVCGK XP_032831753.1
## Con MQTK?--G??WARRA--------LL...GLATLSVLYELGIIGNN-KIRGCG? Consensus
Change alignment class:
class(slc24a5_align) <- "AAMultipleAlignment"
class(slc24a5_align)
## [1] "AAMultipleAlignment"
Convert to seqinr format:
slc24a5_align_subset_seqinr <- msaConvert(slc24a5_align, type = "seqinr::alignment")
class(slc24a5_align_subset_seqinr)
## [1] "alignment"
class(slc24a5_align)
## [1] "AAMultipleAlignment"
# Output using print_msa()
compbio4all::print_msa(slc24a5_align_subset_seqinr)
## [1] "MQTKG--GQTWARRA--------LLLGILWATAHLPLSGT-----SLPQRLPR--ATGNS 0"
## [1] "MQTKG--GQTWARRA--------LLLGILWATAHLPLSGT-----SLPQRLPR--ATGNS 0"
## [1] "MQTKR--GHTWARRA--------LLLGILWATARLPVPGA-----SLPQRLPR--ATGNS 0"
## [1] "MQIKG--GPRWARRA--------LLLGLLWATAHLPPPGA-----SLARRLPR--ATGNS 0"
## [1] "MRTKW--GPTWTRRV--------LLLGIFWVSAYLPVRGV-----SLPPRLPR--ATGNS 0"
## [1] "MEEGA--SRRYCRRL--------MLLGILWATAHLPPQGA-----SHPQQLPR--DTENN 0"
## [1] "MQARA--GGLWRRRS--------ALLPLLLLALWGWKPRG-----ARAQRPPT--DTENR 0"
## [1] "MEAAGSKGGRRNRRASRTALATFTFVLLLWAAVHLVYQAASLEAVSAQRRVRR--ATENE 0"
## [1] "MRTDVFLQRRKRRDVLLS---IIALLLLIFAIVHLVFCAGLSFQGSSSARVRR--DLENA 0"
## [1] "MDPSL--GSSAARCILG---ASSTVLLLLRLDYAVGLEAGAGAHHSNHMRVRRDEETGND 0"
## [1] " "
## [1] "TQCVISPSSEFPEGFFTRQERRDGGIIIYFLIIVYMFMAISIVCDEYFLPSLEIISESLG 0"
## [1] "TQCVISPSSEFPEGFFTRQERRDGGIIIYFLIIVYMFMAISIVCDEYFLPSLEIISESLG 0"
## [1] "TQCVISPSSEFPEGFFTRQERRDGGIIIYFLIIVYMFMAISIVCDEYFLPSLEIISESLG 0"
## [1] "TQCVISPSSEFPEGFFTKQERADGGIIIYFLIILYMFMAVSIVCDEYFLPSLEIISESLG 0"
## [1] "TQCAVSPASEFPEGFFTKQESTDGGIVIYFLIILYMCMAISIVCDKYFLPSLEIISDSLG 0"
## [1] "TQCVFAPSSEFPEGFFTKQQQKDGGIIIYFLIILYMFMAISIVCDEYFLPSLEVISDCLG 0"
## [1] "TDCVP-PSSEFPEGFFTPQERKDGGIVIYFLIILYMFLAASIVCDDYFLPSLEIITECLG 0"
## [1] "TLCVFAPSSEFPKGLFTEQERKDGGIIIHFLVIVYMFLAVAIVCDEYFLPSLEVISERLG 0"
## [1] "SECVQPQSSEFPEGFFTVQERKDGGILIYFMIIFYMLLSVSIVCDEYFLPSLEVISERLG 0"
## [1] "TGCVGPASDEFPPGFFTPEEQREGGVLIYILIALYLLLAIAIVCDYYFLPSLELISDRLG 0"
## [1] " "
## [1] "LSQDVAGTTFMAAGSSAPELVTAFLGVFITKGDIGISTILGSAIYNLLGICAACGLLSNT 0"
## [1] "LSQDVAGATFMAAGSSAPELVTAFLGVFITKGDIGISTILGSAIYNLLGICAACGLLSNT 0"
## [1] "LSQDVAGATFMAVGSSAPELVTAFLGVFITKGDIGISTILGSAIYNLLGICAACGLLSNM 0"
## [1] "LSQDVAGATFMAAGSSAPELVTAFLGVFITKGDIGISTILGSAIYNLLGICAACGLLSNV 0"
## [1] "LSQDVAGATFMAAGSSAPELVTAFLGVFITKGDIGISTILGSAIYNLLGICAACGLLSNM 0"
## [1] "MSQDVAGATFMAAGSSAPELVTAFLGVFVTKGDIGISTIIGSAIYNLLGICAACGLLSNV 0"
## [1] "LSQDVAGATFMAAGSSAPELVTAFLGVFVTKGDIGVSTILGSAIYNLLGICAACGLLSSV 0"
## [1] "LSQDVAGATFMAVGSSSPELVTAFLGVFVTKGDIGVSTIIGSAVYNLLGICAACCLLS-E 0"
## [1] "LSQDVAGATFMAAGSSAPELVTAFLGVFVTKGDIGVSTIMGSAVYNLLCICAACGLLSSA 0"
## [1] "LTMDVAGATFMAAGSSAPEMVTAFLGVFVTKGDIGVSTIVGSAVYNLLGICAICCLFSPS 0"
## [1] " "
## [1] "VSTLSCWPLFRDCAAYTISAAAVLGIIYDNQVYWYEGALLLLIYGLYVLVLCFDIKINQY 0"
## [1] "VSTLSCWPLFRDCAAYTISVAAVLGIIYDNQVYWYEGALLLLIYGLYVLVLCFDIKINQY 0"
## [1] "VSTLSCWPLFRDCAAYAISVAAVLGIIYDNQIYWYEGTLLLLIYGLYVLVLCFDIKINQY 0"
## [1] "VSRLSCWPLFRDCAAYAISVAALLAIIFDNQVYWYEGTLLLLIYGLYVVVLCFDIKINQY 0"
## [1] "VSTLSCWPLFRDCAVYAVSVGAVFGIIFDNRIYWYEGAGLLLIYGLYVLLLCFDTTISRH 0"
## [1] "VSKISCWPMFRDCVVYAISIAAVLGIIFDNQVYWYEGTVLLLIYGVYILVLCFDIKINQY 0"
## [1] "VSRLSCWPLFRDCLAYTISAAAVLAMISDSRIYWYESASLLLIYGCYILVLCFDIKINRC 0"
## [1] "ICKLTCWPLFRDCVAYAVSVGAVIAITFDNKIFWYESASLLFIYGLYIVMLCFDIKINQY 0"
## [1] "VGRLSCWPLFRDCVAYAISVAAVIAIISDNRVYWYDGACLLLVYGVYVAVLCFDLRISEY 0"
## [1] "ASPLSHWPLLRDSLAYAVSVAVLIGCIHDGKVHWYEALSLVSVYAAYVVVLRFDARLHR- 0"
## [1] " "
## [1] "IIKKCSPCCACLAKAME-RSEQQP--------LMGWEDEG-------------------- 0"
## [1] "IIKKCSPCCACLAKAME-RSEQQP--------LMGWEDEG-------------------- 0"
## [1] "IIKKCSPCCTCLAKAME-RSEQQL--------LMGWEDEG-------------------- 0"
## [1] "IIKKCSPCCTCLVKAIEERSEQQL--------LMGWEEED-------------------- 0"
## [1] "VMKTCSPCCPCLARAMEERIEQQT--------LLGWEDES-------------------- 0"
## [1] "VMKKCSPCCTCFAKAIEGQAEES---------LMGWED---------------------- 0"
## [1] "LMKKLSPCCSCFTKATEQSGEQQP--------LAGWREER-------------------- 0"
## [1] "MVKKFSPCCTCIAEVLEENHDQQP--------LLGWKEES-------------------- 0"
## [1] "VMQRFSPCCWCLKPRDRDSGEQQP--------LVGWSDDS-------------------- 0"
## [1] "---AFTRGCLCLRPAGQHRRRRQPGNGPEARPLVGGGEERGDAAGEDHGGGTSRRSSTGV 0"
## [1] " "
## [1] "QPFIRRQSRTDSGIFYED-SGYSQLSISLHGLSQVSEDPPS-VFNMPEADLKRIFWVLSL 0"
## [1] "QPFIRRQSRTDSGIFYED-SGYSQLSISLHGLSQVSEDPPS-VFNIPEADLKRIFWVLSL 0"
## [1] "QPFIRRQSRTDSGIFYED-SGYSQLSISLHGLSQVSEDPPS-VFNMPEADLKRIFWVLSL 0"
## [1] "QPFIRRQSRTDSGIFHED-SGYSQLSLSLHGLSQVCEDPPS-VFNMPEADLKRIFWVLSL 0"
## [1] "QLFIRRQSRTDSGIFQED-SGYSQLSLSLHGLSQVSEDPPS-VFSMPEADLRRIFWVLSL 0"
## [1] "RPLIHRQSRTDSGIFHED-SDYSQLSTSLQGFNEVPEDPPS-VFSMPEADLKRIFWVLTL 0"
## [1] "GPLIRQQSRTDSGIFQDE-LDYSQLSTSLHGLDEISEDHPS-VFTMPEEDMKRILWVLSL 0"
## [1] "IRVVRQRSRSDSGIFQED-SDYSQLSISMHGLNEISDDPPS-VFTVPEDDFKRVLWVLSL 0"
## [1] "SLRVQRRSRNDSGIFQDD-SGYSHLSLSLHGLNEISDEHKS-VFSMPDHDLKRILWVLSL 0"
## [1] "NDFLHGRTRSDSGVILRDGSDYSQLTLSLHGLTSLEDEEPMGLLHVPQGDGRRVLWVLSL 0"
## [1] " "
## [1] "PIITLLFLTTPDCRKKFWKNYFVITFFMSAIWISAFTYILVWMVTITGETLEIPDTVMGL 0"
## [1] "PIITLLFLTTPDCRKKFWKNYFVITFFMSAIWISAFTYILVWMVTITGETLEIPDTVMGL 0"
## [1] "PIITLLFLTTPDCRTKFWKNYFVITFSMSAIWISAFTYILVWMVTITGETLEIPDTVMGL 0"
## [1] "PIIILLFLTTPDCRRKFWRKYFVITFFMSALWISAFTYILVWMVTITGETLEIPDTVMGL 0"
## [1] "PIITLLALTTPDCRRKFWKNYFVITFFMSALWISAFTYILVWMVTVTGETLGIPDTVMGL 0"
## [1] "PIVTLLFLTTPDCRRKSWKSCFMITFFMAAIWISAFTYVLVWMVTITGETLEIPDTVMGL 0"
## [1] "PITTLLYLTTPDCRRRFWRNWFMVTFFMSAAWISAITYVLVWMITIAGETLGIPESVMGL 0"
## [1] "PIITLLYLTVPDCRRRWWKKWFILTFVMSAVWISGVTYILVWMVIIVGETLSIPDTVMGL 0"
## [1] "PVSTLLFVSVPDCRRPFWKNFYMLTFLMSAVWISAFTYVLVWMVTIVGETLGIPDTVMGM 0"
## [1] "PVLLLLTLTVPDCRARSMRRFFPVTFFMAAVWISALTYLLVWMVTVTGETLGIPDTVMGL 0"
## [1] " "
## [1] "TLLAAGTSIPDTIASVLVARKGKGDMAMSNIVGSNVFDMLCLGIPWFIKTAFINGSAPAE 0"
## [1] "TLLAAGTSIPDTIASVLVARKGKGDMAMSNIVGSNVFDMLCLGIPWFIKTAFINGSAPAE 0"
## [1] "TLLAAGTSIPDTITSVLVARKGKGDMAMSNIVGSNVFDMLCLGIPWFIKTAFINGSAPIE 0"
## [1] "TLLAAGTSIPDTIASVLVARKGKGDMAMSNIVGSNVFDMLCLGVPWFIKTAFINPSAPVE 0"
## [1] "TLLAAGTSIPDTVTSVLVARKGKGDMAISNIVGSNVFDMLCLGLPWFIKTAFTNASAPIE 0"
## [1] "TVLAAGTSIPDTVASVLVARKGRGDMAMSNIIGSNVFDMLCLGAPWFIQTAFTNTSAPVK 0"
## [1] "TLLAAGTSVPDTVASVLVARKGNGDMAMSNIVGSNVFDMLCLGIPWFIKTAFINTSEPIE 0"
## [1] "TLLAAGTSIPDTVASVLVAREGKGDMAMSNIVGSNVFDMLCLGLPWFIKTVFIDRSSPVE 0"
## [1] "TLLAAGTSIPDTVASVMVAREGKSDMAMSNIVGSNVFDMLCLGLPWFIQTVFVDVGSPVE 0"
## [1] "TLLAIGTSLPDTITSLLVAREGKADMAMSNIVGSNVFDMLCLGLPWLLLTAFVSPSGPAH 0"
## [1] " "
## [1] "VNSRGLTYITISLNISIIFLFLAVHFNGWKLDRKLGIVCLLSYLGLATLSVLYELGIIGN 0"
## [1] "VNSRGLTYITISLNISIIFLFLAVHFNGWKLDRKLGIVCLLSYLGLATLSVLYELGIIGN 0"
## [1] "VNSRGLTYTTISLNVSIIFLFLAVHFNGWKLDRKLGIVCLLSYLGLATLSVLYELGIIGN 0"
## [1] "VNSRGLTYITIFLNISIVFLFLAVHLNGWKLDRKLGVVCLLLYLGLTTLSVLYELGIIGN 0"
## [1] "VNSKGLTYITISLNISILFLFLAVHFNGWKLDRKLGVVCLVLYLGLATLSVLYEIGIIGN 0"
## [1] "VNSSGLTYTVISLIFSIVFLFLAVHLNGWKLDKKLGVVCLVLYLVLATLSVLYELGIIGN 0"
## [1] "VNSSGLTYTATSLICSVVFIFLAIHLNGWKIDKKLGAICLILYLVFTVLSILYELGIIGN 0"
## [1] "VSSTGLTYTTISLLFSIIFIFVAIHVNGWKLDKKLGVICLIMYLVFATLSILYELGIIGG 0"
## [1] "VNSSGLVFMSCTLLLSIIFLFLAVHINGWKLNWKLGLVCLACYILFATLSILYELGIIGN 0"
## [1] "VSSAGLVYSASSLLISLALLLACVHAARWRLSRPLGAACLAAYLVFVAVAVLHEVGVLGG 0"
## [1] " "
## [1] "N-KIRGCGG 51"
## [1] "N-KIRGCGG 51"
## [1] "N-KIRGCGG 51"
## [1] "N-KIRGCGD 51"
## [1] "N-RIRGCGV 51"
## [1] "N-KIRGCGD 51"
## [1] "N-PTRVCGN 51"
## [1] "M-PVRSCGD 51"
## [1] "N-PIRSCRD 51"
## [1] "DPAARVCGK 51"
## [1] " "
# Run ggmsa to display part of the MSA
ggmsa::ggmsa(slc24a5_align,
start = 200,
end = 250)
slc24a5_subset_dist <- seqinr::dist.alignment(slc24a5_align_subset_seqinr,
matrix = "identity")
Check to make sure the code produced an object of class “dist.”
is(slc24a5_subset_dist)
## [1] "dist" "oldClass"
class(slc24a5_subset_dist)
## [1] "dist"
Round for Display
slc24a5_align_seqinr_rnd <- round(slc24a5_subset_dist, 3)
slc24a5_align_seqinr_rnd
## NP_995322.1 XP_510380.2 XP_001112601.1 NP_001121981.2
## XP_510380.2 0.077
## XP_001112601.1 0.205 0.200
## NP_001121981.2 0.307 0.303 0.307
## NP_778199.2 0.390 0.387 0.382 0.409
## XP_036592760.1 0.446 0.446 0.444 0.434
## NP_001033586.3 0.524 0.526 0.528 0.520
## XP_040269928.1 0.554 0.550 0.548 0.542
## NP_001025451.1 0.574 0.573 0.580 0.574
## XP_032831753.1 0.671 0.668 0.675 0.665
## NP_778199.2 XP_036592760.1 NP_001033586.3 XP_040269928.1
## XP_510380.2
## XP_001112601.1
## NP_001121981.2
## NP_778199.2
## XP_036592760.1 0.487
## NP_001033586.3 0.555 0.531
## XP_040269928.1 0.580 0.562 0.556
## NP_001025451.1 0.581 0.579 0.581 0.578
## XP_032831753.1 0.673 0.694 0.677 0.671
## NP_001025451.1
## XP_510380.2
## XP_001112601.1
## NP_001121981.2
## NP_778199.2
## XP_036592760.1
## NP_001033586.3
## XP_040269928.1
## NP_001025451.1
## XP_032831753.1 0.676
Using the distance matrix and neighbor joining algorithm:
# Note - not using rounded values
tree_subset <- nj(slc24a5_subset_dist)
# plot tree
plot.phylo(tree_subset, main="Phylogenetic Tree
",
use.edge.length = F)
# add label
mtext(text = "
SLC24A5 Family Gene tree - Rooted, No Branch Lenths")