The SLC22A5 gene provides instructions for making a protein called OCTN2 that is found in the heart, liver, muscle, kidneys, and other tissues. The gene is also known to code for a protein, solute carrier family 22 member 5. Mutations may cause systemic primary carnitine deficiency (CDSP). Sodium-ion dependent, high affinity carnitine transporter. Involved in the active cellular uptake of carnitine. Transports one sodium ion with one molecule of carnitine. Also transports organic cations such as tetraethylammonium (TEA) without the involvement of sodium. Also relative uptake activity ratio of carnitine to TEA is 11.3.
Reference Sequence Gene:https://www.ncbi.nlm.nih.gov/nuccore/1475393985
Homolog genes:https://www.ncbi.nlm.nih.gov/homologene/68295
Uniprot:https://www.uniprot.org/uniprot/O76082
PDB (Protein databank):No data available
Pfam:http://pfam.xfam.org/protein/O76082
Add the necessary calls to library() to load call packages Indicate which packages came from Bioconductor, CRAN, and GitHub
# github packages
library(compbio4all) # indicates what the package does. Includes datasets and helpful functions for learning bioinformatics and computational biology.
library(ggmsa)# indicates how to plot multiple sequence alignments using the ggplot2 package with the multiple color schemes supported. Includes the functions for plotting Multiple Sequences Alignment using ggplot2 package.
# CRAN packages
library(rentrez)# indicates how to search, discover, and download data from the NCBI's databases using their EUtils function. Using 'Entrez' in R
library(seqinr)# indicates how to use exploratory data analysis and data visualization for biological sequences (DNA and protein) data. Includes functions to retrieve and analyze biological sequences.
library(ape)# indicates how to analyze Phylogeny and Evolution. Provides functions for reading, writing, manipulating, analyzing, and simulating phylogeny trees and DNA sequences, computing DNA distances, translating into AA sequences, estimating trees with distance-based methods, and a range of methods for comparative analyses and analysis of diversification.
library(pander) # prints R object in Pandoc's markdown
library(ggplot2) #indicates how to map variables to aesthetics, what graphical primitives to use, and takes care of details.
# Bioconductor packages
library(msa) #indicates how to unify interface to the three functions msaClustalW(), msaClustalOmega(), and msaMuscle() as a wrapper function. The Multiple Sequence Alignment function
library(drawProteins) # indicates how to create the visualization of protein schematics based on the data obtained from the Uniprot Protein Database.
# BioStrings
library(Biostrings) #indicates how to manipulate big strings (like DNA or RNA sequences) easily and quickly.
#library(HGNChelper)#
These accession numbers were obtained from the NCBI and Uniprot databases. I also got the scientific names and common names from the NCBI database.
Not available: - Neanderthal Does not occur: - Outside of vertebrates
NBCI.Protein.Accession <-c(" NP_001342696.1","NP_001295051.1","XP_038537409.1","XP_004920385.1", "XP_016809250.1", "XP_004042518.1","XP_024103525.1", "NP_001039967.1", " XP_006927754.4","XP_032951789.1 ")
UniProt.id <-c("Q9Z0E8","O76082","F6UW69", "A0A6I8RD12", "A0A2I3SM66", "G3QFF2","NA", "NA","A0A2I2U2F0", "A0A671FHY2" )
PDB <- c("NA")
Scientific.Name <-c(" Mus musculus", "Homo sapiens","Canis familaris","Xenopus tropicalis", "Pan troglodytes","Gorilla gorilla","Pongo abelii", "Bos taurus", "Felis catus","Rhinolophus ferrumequinum")
Common.Name <-c("Mouse", "Human", "Dog","Tropical clawed frog","Chimpanzee","Gorilla", "Sumatran orangutan", "Cattle", "Cat", "Horseshoe Bat ")
Gene.Name <-c("SLC22A5")
SLC22A5_table <- data.frame(NBCI.Protein.Accession = NBCI.Protein.Accession, UniProt.id = UniProt.id, Scientific.Name = Scientific.Name, Common.Name = Common.Name , Gene.Name = Gene.Name )
pander(SLC22A5_table)
| NBCI.Protein.Accession | UniProt.id | Scientific.Name |
|---|---|---|
| NP_001342696.1 | Q9Z0E8 | Mus musculus |
| NP_001295051.1 | O76082 | Homo sapiens |
| XP_038537409.1 | F6UW69 | Canis familaris |
| XP_004920385.1 | A0A6I8RD12 | Xenopus tropicalis |
| XP_016809250.1 | A0A2I3SM66 | Pan troglodytes |
| XP_004042518.1 | G3QFF2 | Gorilla gorilla |
| XP_024103525.1 | NA | Pongo abelii |
| NP_001039967.1 | NA | Bos taurus |
| XP_006927754.4 | A0A2I2U2F0 | Felis catus |
| XP_032951789.1 | A0A671FHY2 | Rhinolophus ferrumequinum |
| Common.Name | Gene.Name |
|---|---|
| Mouse | SLC22A5 |
| Human | SLC22A5 |
| Dog | SLC22A5 |
| Tropical clawed frog | SLC22A5 |
| Chimpanzee | SLC22A5 |
| Gorilla | SLC22A5 |
| Sumatran orangutan | SLC22A5 |
| Cattle | SLC22A5 |
| Cat | SLC22A5 |
| Horseshoe Bat | SLC22A5 |
# Mouse SLC22A5 (M. musculus)
mSLC22A5_FASTA <- entrez_fetch_list(db = "protein",
id = " NP_001342696",
rettype = "fasta")
# Human SLC22A5 (H. sapiens)
hSLC22A5_FASTA <- entrez_fetch_list(db = "protein",
id = " NP_001295051",
rettype = "fasta")
# Dog SLC22A5 (C. lupus)
dSLC22A5_FASTA <- entrez_fetch_list(db = "protein",
id = "XP_038537409",
rettype = "fasta")
# Tropical clawed frog SLC22A5 (X. tropicalis)
fSLC22A5_FASTA <- entrez_fetch_list(db = "protein",
id = " XP_004920385 ",
rettype = "fasta")
# Chimpanzee SLC22A5 (P. troglodytes)
chSLC22A5_FASTA <- entrez_fetch_list(db = "protein",
id = "XP_016809250",
rettype = "fasta")
# Gorilla SLC22A5 (Gorilla gorilla)
gSLC22A5_FASTA <- entrez_fetch_list(db = "protein",
id = "XP_004042518",
rettype = "fasta")
# Organutan SLC22A5
oSLC22A5_FASTA <- entrez_fetch_list(db = "protein",
id = "XP_024103525",
rettype = "fasta")
# Cattle SLC22A5(B. taurus)
cattleSLC22A5_FASTA <- entrez_fetch_list(db = "protein",
id = "NP_001039967",
rettype = "fasta")
# Cat SLC22A5 (Felis catus)
catSLC22A5_FASTA <- entrez_fetch_list(db = "protein",
id = "XP_006927754",
rettype = "fasta")
# Horseshoe bat SLC22A5 (Pan)
batSLC22A5_FASTA <- entrez_fetch_list(db = "protein",
id = "XP_032951789",
rettype = "fasta")
SLC22A5_list <- c(hSLC22A5_FASTA, mSLC22A5_FASTA, gSLC22A5_FASTA, dSLC22A5_FASTA, catSLC22A5_FASTA, batSLC22A5_FASTA, chSLC22A5_FASTA, cattleSLC22A5_FASTA, fSLC22A5_FASTA, oSLC22A5_FASTA)
is(SLC22A5_list)
## [1] "list" "vector" "list_OR_List" "vector_OR_Vector"
## [5] "vector_OR_factor"
Remove FASTA header
length(SLC22A5_list)
## [1] 10
for(i in 1:length(SLC22A5_list)){
SLC22A5_list[[i]] <- fasta_cleaner(SLC22A5_list[[i]], parse = F)
}
Set up as 1 continuous string
hSLC22A5_string <- fasta_cleaner(hSLC22A5_FASTA ,
parse = F)
mSLC22A5_string <- fasta_cleaner(mSLC22A5_FASTA ,
parse = F)
gSLC22A5_string <- fasta_cleaner(gSLC22A5_FASTA ,
parse = F)
batSLC22A5_string <- fasta_cleaner(batSLC22A5_FASTA ,
parse = F)
dSLC22A5_string <- fasta_cleaner(dSLC22A5_FASTA ,
parse = F)
catSLC22A5_string <- fasta_cleaner(catSLC22A5_FASTA ,
parse = F)
chSLC22A5_string <- fasta_cleaner(chSLC22A5_FASTA ,
parse = F)
cattleSLC22A5_string <- fasta_cleaner(cattleSLC22A5_FASTA ,
parse = F)
fSLC22A5_string <- fasta_cleaner(fSLC22A5_FASTA ,
parse = F)
oSLC22A5_string <- fasta_cleaner(oSLC22A5_FASTA ,
parse = F)
SLC22A5_string <-c(hSLC22A5_string, mSLC22A5_string, gSLC22A5_string, batSLC22A5_string, dSLC22A5_string, catSLC22A5_string, chSLC22A5_string, cattleSLC22A5_string, fSLC22A5_string, oSLC22A5_string)
O76082_human <-drawProteins::get_features("O76082")
## [1] "Download has worked"
is(O76082_human)
## [1] "list" "vector" "list_OR_List" "vector_OR_Vector"
## [5] "vector_OR_factor"
#Converting raw data from the webpage to a dataframe
protein_human_df <- drawProteins::feature_to_dataframe(O76082_human)
protein_human_df[, 2]
## [1] "Solute carrier family 22 member 5"
## [2] "Cytoplasmic"
## [3] "Helical; Name=1"
## [4] "Extracellular"
## [5] "Helical; Name=2"
## [6] "Cytoplasmic"
## [7] "Helical; Name=3"
## [8] "Extracellular"
## [9] "Helical; Name=4"
## [10] "Cytoplasmic"
## [11] "Helical; Name=5"
## [12] "Extracellular"
## [13] "Helical; Name=6"
## [14] "Cytoplasmic"
## [15] "Helical; Name=7"
## [16] "Extracellular"
## [17] "Helical; Name=8"
## [18] "Cytoplasmic"
## [19] "Helical; Name=9"
## [20] "Extracellular"
## [21] "Helical; Name=10"
## [22] "Cytoplasmic"
## [23] "Helical; Name=11"
## [24] "Extracellular"
## [25] "Helical; Name=12"
## [26] "ATP"
## [27] "Disordered"
## [28] "Phosphotyrosine"
## [29] "Phosphothreonine"
## [30] "N-linked (GlcNAc...) asparagine"
## [31] "N-linked (GlcNAc...) asparagine"
## [32] "N-linked (GlcNAc...) asparagine"
## [33] "NONE"
## [34] "NONE"
## [35] "NONE"
## [36] "in CDSP"
## [37] "in CDSP; unknown pathological significance; reduces carnitine transport but the mutant retains 50% of wild-type activity; dbSNP:rs139203363"
## [38] "in CDSP; carnitine transport reduced to less than 20% of wild-type; dbSNP:rs267607052"
## [39] "in CDSP; loss of carnitine transport"
## [40] "in CDSP; carnitine transport reduced to less than 20% of wild-type; dbSNP:rs11568520"
## [41] "in CDSP; carnitine transport is reduced to less than 5% of normal; dbSNP:rs72552723"
## [42] "in CDSP; unknown pathological significance; reduces carnitine transport but the mutant retains 50% of wild-type activity; dbSNP:rs144020613"
## [43] "in CDSP; reduces carnitine transport to less than 1% of normal"
## [44] "in CDSP; carnitine transport reduced to less than 6% of wild-type; dbSNP:rs772578415"
## [45] "in CDSP; carnitine transport reduced to 1% of wild-type; dbSNP:rs72552724"
## [46] "in CDSP; carnitine transport reduced to less than 1% of wild-type; dbSNP:rs72552725"
## [47] "in CDSP; carnitine transport reduced to less than 10% of wild-type; dbSNP:rs199689597"
## [48] "in CDSP; carnitine transport reduced to less than 5% of wild-type; dbSNP:rs377767445"
## [49] "in CDSP; carnitine transport is reduced to less than 5% of normal; dbSNP:rs202088921"
## [50] "in CDSP; loss of carnitine transport"
## [51] "in CDSP; carnitine transport reduced to 2% of wild-type"
## [52] "in CDSP; carnitine transport reduced to 2% of wild-type; dbSNP:rs757711838"
## [53] "in CDSP; loss of carnitine transport; dbSNP:rs72552726"
## [54] "in CDSP; loss of carnitine transport; dbSNP:rs386134190"
## [55] "in CDSP; unknown pathological significance; reduces carnitine transport but the mutant retains 30% of wild-type activity; dbSNP:rs386134191"
## [56] "in CDSP; carnitine transport reduced to 20% of wild-type; dbSNP:rs377767450"
## [57] "in CDSP; carnitine transport reduced to less than 5% of wild-type; dbSNP:rs386134192"
## [58] "in CDSP"
## [59] "in CDSP; dbSNP:rs201082652"
## [60] "in CDSP; carnitine transport reduced to less than 20% of wild-type; dbSNP:rs748605096"
## [61] "in CDSP; may affect splicing; unknown pathological significance; reduces carnitine transport but the mutant retains 30% of wild-type activity"
## [62] "in CDSP"
## [63] "in CDSP"
## [64] "in CDSP; unknown pathological significance; reduces carnitine transport but the mutant retains more than 25% of wild-type activity; dbSNP:rs151231558"
## [65] "in CDSP; carnitine transport reduced to less than 2% of wild-type; dbSNP:rs1178584184"
## [66] "in dbSNP:rs10040427"
## [67] "in CDSP; unknown pathological significance; reduces carnitine transport but the mutant retains more than 60% of wild-type activity; dbSNP:rs386134193"
## [68] "in CDSP; loss of carnitine transport"
## [69] "in CDSP; loss of carnitine transport; dbSNP:rs121908889"
## [70] "in CDSP; loss of carnitine transport; dbSNP:rs121908890"
## [71] "in CDSP; carnitine transport reduced to less than 10% of wild-type; dbSNP:rs781721860"
## [72] "in CDSP; carnitine transport reduced to less than 20% of wild-type; dbSNP:rs145068530"
## [73] "in CDSP; unknown pathological significance; reduces carnitine transport but the mutant retains more than 40% of wild-type activity; dbSNP:rs386134196"
## [74] "in CDSP; loss of carnitine transport; dbSNP:rs386134197"
## [75] "in CDSP; loss of carnitine transport; dbSNP:rs796052033"
## [76] "in CDSP; loss of carnitine transport; dbSNP:rs386134198"
## [77] "in CDSP; loss of carnitine transport; dbSNP:rs121908888"
## [78] "in CDSP; unknown pathological significance; reduces carnitine transport but the mutant retains 30% of wild-type activity; dbSNP:rs386134199"
## [79] "in CDSP; unknown pathological significance; reduces carnitine transport but the mutant retains 30% of wild-type activity"
## [80] "in CDSP; reduces carnitine transport to less than 20% of wild-type activity; dbSNP:rs386134205"
## [81] "in CDSP; reduces carnitine transport to less than 10% of wild-type activity; dbSNP:rs185551386"
## [82] "in CDSP; reduces carnitine transport to less than 1% of wild-type activity; dbSNP:rs756650860"
## [83] "in CDSP; loss of carnitine transport; dbSNP:rs386134206"
## [84] "in CDSP; reduces carnitine transport to less than 20% of wild-type activity; dbSNP:rs114269482"
## [85] "in CDSP; dbSNP:rs1457258524"
## [86] "in CDSP; reduces carnitine transport to less than 2% of wild-type activity"
## [87] "in CDSP; loss of carnitine transport; dbSNP:rs72552728"
## [88] "in CDSP; loss of carnitine transport"
## [89] "in CDSP"
## [90] "in CDSP; unknown pathological significance; reduces carnitine transport but the mutant retains more than 30% of wild-type activity; dbSNP:rs200699819"
## [91] "in CDSP"
## [92] "in CDSP; reduces carnitine transport to less than 10% of wild-type activity; dbSNP:rs386134203"
## [93] "in CDSP; unknown pathological significance; reduces carnitine transport but the mutant retains more than 40% of wild-type activity; dbSNP:rs201262157"
## [94] "in CDSP; reduces carnitine transport to less than 5% of wild-type activity; dbSNP:rs201262157"
## [95] "in CDSP; unknown pathological significance; reduces carnitine transport but the mutant retains more than 40% of wild-type activity"
## [96] "in CDSP"
## [97] "in CDSP; reduces carnitine transport to less than 1% of wild-type activity; dbSNP:rs386134208"
## [98] "in CDSP"
## [99] "in CDSP; reduces carnitine transport to 5% of wild-type activity; dbSNP:rs386134210"
## [100] "in CDSP; loss of carnitine transport; dbSNP:rs386134211"
## [101] "in CDSP; reduces carnitine transport to less than 1% of wild-type activity; dbSNP:rs72552729"
## [102] "in CDSP; dbSNP:rs1554087707"
## [103] "in CDSP"
## [104] "in CDSP; reduces carnitine transport to less-than-1% to 3% of wild-type activity; dbSNP:rs72552730"
## [105] "in CDSP; unknown pathological significance; reduces carnitine transport but the mutant retains 70% of wild-type activity; dbSNP:rs77300588"
## [106] "in CDSP; unknown pathological significance; no effect on carnitine transport; dbSNP:rs774792831"
## [107] "in CDSP"
## [108] "in CDSP; unknown pathological significance; reduces carnitine transport but the mutant retains 60% of wild-type activity; dbSNP:rs150544263"
## [109] "in CDSP; loss of carnitine transport; dbSNP:rs68018207"
## [110] "in CDSP; reduces carnitine transport to less than 2% of wild-type activity; dbSNP:rs1385634398"
## [111] "in CDSP; loss of carnitine transport; dbSNP:rs61731073"
## [112] "in CDSP; dbSNP:rs886042092"
## [113] "in CDSP; loss of carnitine transport; dbSNP:rs386134214"
## [114] "in CDSP"
## [115] "in CDSP; reduces carnitine transport to 5% of wild-type activity"
## [116] "in CDSP; reduces carnitine transport to less than 1% of wild-type activity; dbSNP:rs144547521"
## [117] "in CDSP; carnitine transport is reduced to less than 1% of normal; dbSNP:rs121908891"
## [118] "in CDSP; reduces carnitine transport to less than 5% of wild-type activity; dbSNP:rs267607054"
## [119] "in CDSP; unknown pathological significance; no effect on carnitine transport"
## [120] "in CDSP; reduces carnitine transport to less than 1% of wild-type activity"
## [121] "in CDSP; loss of carnitine transport; dbSNP:rs72552732"
## [122] "in CDSP; requires 2 nucleotide substitutions; reduces carnitine transport to less than 20% of wild-type activity; dbSNP:rs267607053"
## [123] "in CDSP; reduces carnitine transport to less than 1% of wild-type"
## [124] "in CDSP; reduces carnitine transport to less than 1% of wild-type; dbSNP:rs72552733"
## [125] "in CDSP; loss of carnitine transport; dbSNP:rs386134218"
## [126] "in CDSP; reduces carnitine transport to less than 20% of wild-type; dbSNP:rs386134219"
## [127] "in CDSP; unknown pathological significance; reduces carnitine transport to less than 20% of wild-type; dbSNP:rs11568514"
## [128] "in CDSP; reduces carnitine transport to less than 5% of wild-type; dbSNP:rs72552734"
## [129] "in CDSP; loss of carnitine transport; dbSNP:rs1408166345"
## [130] "in CDSP; reduces carnitine transport to less than 5% of wild-type"
## [131] "in CDSP; reduces carnitine transport to less than 20% of wild-type activity; dbSNP:rs60376624"
## [132] "in CDSP; markedly reduced carnitine transport compared to the wild-type protein; less than 1% of wild-type activity; dbSNP:rs386134221"
## [133] "in CDSP; loss of carnitine transport; dbSNP:rs386134222"
## [134] "in CDSP; dbSNP:rs749282641"
## [135] "in CDSP; reduces carnitine transport to less than 2% of wild-type; dbSNP:rs386134223"
## [136] "in CDSP; loss of carnitine transport"
## [137] "in CDSP; loss of carnitine transport"
## [138] "in CDSP; loss of carnitine transport but stimulated organic cation transport; dbSNP:rs72552735"
## [139] "reduces carnitine transport but the mutant retains more than 60% of wild-type activity; dbSNP:rs11568513"
## [140] "in dbSNP:rs11568513"
## [141] "in CDSP; reduces carnitine transport to less than 10% of wild-type; dbSNP:rs377216516"
## [142] "in CDSP; unknown pathological significance; reduces carnitine transport to 40% of wild-type; dbSNP:rs28383481"
## [143] "in CDSP; reduces carnitine transport to 5% of wild-type; dbSNP:rs1157198543"
## [144] "in dbSNP:rs11568521"
## [145] "in dbSNP:rs11568524"
## [146] "reduces carnitine transport but the mutant retains more than 20% of wild-type activity; dbSNP:rs11568525"
## [147] "Loss of both carnitine and organic cation transport functionalities."
## [148] "in Ref. 8; AAH12325"
Domains present
Not available: - Neanderthal - Uniprot for Cattle and Organutan Does not occur: - Outside of vertebrates
protein_human_df <- drawProteins::feature_to_dataframe(O76082_human)
my_canvasHuman <- draw_canvas(protein_human_df)
my_canvasHuman <- draw_chains(my_canvasHuman, protein_human_df,
label_size = 2.5)
my_canvasHuman<- draw_domains(my_canvasHuman, protein_human_df)
my_canvasHuman <- draw_recept_dom(my_canvasHuman, protein_human_df)
my_canvasHuman <- draw_regions(my_canvasHuman, protein_human_df)
# my_canvasHuman <- draw_motif(my_canvasHuman, protein_human_df)
#my_canvasHuman<- draw_phospho(my_canvasHuman, protein_human_df)
#my_canvasHuman <- draw_repeat(my_canvasHuman, protein_human_df)
my_canvasHuman <- draw_folding(my_canvasHuman,protein_human_df)
my_canvasHuman
Prepare data
# set up 2 x 2 grid, make margins things
par(mfrow = c(2,2),
mar = c(0,0,2,1))
x <- fasta_cleaner(SLC22A5_list[[1]], parse = TRUE)
dotPlot(x,x,
wsize = 17,
nmatch = 3,
main = "plot 4: size = 17, nmatch = 3")
# reset par() - run this or other plots will be small!
par(mfrow = c(1,1),
mar = c(4,4,4,4))
#Pfam; Transmembrane Region from 20 to 509
human_SLC22A5.domain <-c("Sugar_tr", "http://pfam.xfam.org/protein/S22A5_HUMAN#tabview=tab0")
#DisProt : no information available
#RepeatDB: no information available
#PDB of secondary structural location: no information available
#Alphafold
#The SLC22A5 human gene is listed (http://pfam.xfam.org/protein/S22A5_HUMAN#tabview=tab4)
#The predicted structure that can be seen for the SLC22A5 gene in humans is primarily alpha helices with low complexity disordered regions from 498 to 545.
aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
"L","K","M","F","P","S","T","W","Y","V")
aa.1.2 <- c("A","R","N","D","C","Q","E","G","H","I",
"L","K","M","F","P","S","T","W","Y","V")
Make vectors with the frequency of each amino acid in the datbase of proteins of each type used by Chou. Chou’s table lists totals so I do logical checks again the total he gives and the total in my vector.
# alpha proteins
alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91,
221, 249, 48, 123, 82, 122, 119, 33, 63, 167)
# check against chou's total
sum(alpha) == 2447
## [1] TRUE
# beta proteins
beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120,
177, 115, 16, 85, 127, 341, 253, 44, 110, 229)
# check against chou's total
sum(beta) == 2776
## [1] TRUE
# alpha + beta
a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
110, 112, 25, 52, 71, 126, 117, 30, 108, 123)
sum(a.plus.b) == 1889
## [1] TRUE
# alpha/beta
a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239,
339, 321, 91, 158, 188, 327, 238, 72, 130, 378)
sum(a.div.b) == 4333
## [1] TRUE
Chou and Chang’s table 5 is approximately like this
pander(data.frame(aa.1.1, alpha, beta, a.plus.b, a.div.b))
| aa.1.1 | alpha | beta | a.plus.b | a.div.b |
|---|---|---|---|---|
| A | 285 | 203 | 175 | 361 |
| R | 53 | 67 | 78 | 146 |
| N | 97 | 139 | 120 | 183 |
| D | 163 | 121 | 111 | 244 |
| C | 22 | 75 | 74 | 63 |
| Q | 67 | 122 | 74 | 114 |
| E | 134 | 86 | 86 | 257 |
| G | 197 | 297 | 171 | 377 |
| H | 111 | 49 | 33 | 107 |
| I | 91 | 120 | 93 | 239 |
| L | 221 | 177 | 110 | 339 |
| K | 249 | 115 | 112 | 321 |
| M | 48 | 16 | 25 | 91 |
| F | 123 | 85 | 52 | 158 |
| P | 82 | 127 | 71 | 188 |
| S | 122 | 341 | 126 | 327 |
| T | 119 | 253 | 117 | 238 |
| W | 33 | 44 | 30 | 72 |
| Y | 63 | 110 | 108 | 130 |
| V | 167 | 229 | 123 | 378 |
From table 5 we calculate the frequencies of each amino acid in each fold class.
Calculate proportions for each of the four protein fold types
alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)
#Create a dataframe
#dataframe
aa.prop <- data.frame(alpha.prop,
beta.prop,
a.plus.b.prop,
a.div.b)
#row labels
row.names(aa.prop) <- aa.1.1
#Table 5 therefore becomes this
pander(aa.prop)
| alpha.prop | beta.prop | a.plus.b.prop | a.div.b | |
|---|---|---|---|---|
| A | 0.1165 | 0.07313 | 0.09264 | 0.08331 |
| R | 0.02166 | 0.02414 | 0.04129 | 0.03369 |
| N | 0.03964 | 0.05007 | 0.06353 | 0.04223 |
| D | 0.06661 | 0.04359 | 0.05876 | 0.05631 |
| C | 0.008991 | 0.02702 | 0.03917 | 0.01454 |
| Q | 0.02738 | 0.04395 | 0.03917 | 0.02631 |
| E | 0.05476 | 0.03098 | 0.04553 | 0.05931 |
| G | 0.08051 | 0.107 | 0.09052 | 0.08701 |
| H | 0.04536 | 0.01765 | 0.01747 | 0.02469 |
| I | 0.03719 | 0.04323 | 0.04923 | 0.05516 |
| L | 0.09031 | 0.06376 | 0.05823 | 0.07824 |
| K | 0.1018 | 0.04143 | 0.05929 | 0.07408 |
| M | 0.01962 | 0.005764 | 0.01323 | 0.021 |
| F | 0.05027 | 0.03062 | 0.02753 | 0.03646 |
| P | 0.03351 | 0.04575 | 0.03759 | 0.04339 |
| S | 0.04986 | 0.1228 | 0.0667 | 0.07547 |
| T | 0.04863 | 0.09114 | 0.06194 | 0.05493 |
| W | 0.01349 | 0.01585 | 0.01588 | 0.01662 |
| Y | 0.02575 | 0.03963 | 0.05717 | 0.03 |
| V | 0.06825 | 0.08249 | 0.06511 | 0.08724 |
Determine the number of each amino acid in my protein.
new <- fasta_cleaner(hSLC22A5_string,parse = T)
table(new)
## new
## A C D E F G H I K L M N P Q R S T V W Y
## 37 8 23 21 40 41 7 33 17 77 24 16 29 18 32 44 33 48 12 21
A Function to convert a table into a vector is helpful here because R is goofy about tables not being the same as vectors.
table_to_vector <- function(table_x){
table_names <- attr(table_x, "dimnames")[[1]]
table_vect <- as.vector(table_x)
names(table_vect) <- table_names
return(table_vect)
}
SLC22A5_human_table <- table(new)/length(new)
SLC22A5.human.aa.freq <- table_to_vector(SLC22A5_human_table)
SLC22A5.human.aa.freq
## A C D E F G H
## 0.06368330 0.01376936 0.03958692 0.03614458 0.06884682 0.07056799 0.01204819
## I K L M N P Q
## 0.05679862 0.02925990 0.13253012 0.04130809 0.02753873 0.04991394 0.03098107
## R S T V W Y
## 0.05507745 0.07573150 0.05679862 0.08261618 0.02065404 0.03614458
Checking the presence of “U” (unknown aa.)
aa.names <- names(SLC22A5.human.aa.freq)
i.U <- which(aa.names == "U")
aa.names[i.U]
## character(0)
SLC22A5.human.aa.freq[i.U]
## named numeric(0)
Adding data on my focal protein to the amino acid frequency table.
aa.prop$SLC22A5.human.aa.freq <- SLC22A5.human.aa.freq
pander(aa.prop)
| alpha.prop | beta.prop | a.plus.b.prop | a.div.b | SLC22A5.human.aa.freq | |
|---|---|---|---|---|---|
| A | 0.1165 | 0.07313 | 0.09264 | 0.08331 | 0.06368 |
| R | 0.02166 | 0.02414 | 0.04129 | 0.03369 | 0.01377 |
| N | 0.03964 | 0.05007 | 0.06353 | 0.04223 | 0.03959 |
| D | 0.06661 | 0.04359 | 0.05876 | 0.05631 | 0.03614 |
| C | 0.008991 | 0.02702 | 0.03917 | 0.01454 | 0.06885 |
| Q | 0.02738 | 0.04395 | 0.03917 | 0.02631 | 0.07057 |
| E | 0.05476 | 0.03098 | 0.04553 | 0.05931 | 0.01205 |
| G | 0.08051 | 0.107 | 0.09052 | 0.08701 | 0.0568 |
| H | 0.04536 | 0.01765 | 0.01747 | 0.02469 | 0.02926 |
| I | 0.03719 | 0.04323 | 0.04923 | 0.05516 | 0.1325 |
| L | 0.09031 | 0.06376 | 0.05823 | 0.07824 | 0.04131 |
| K | 0.1018 | 0.04143 | 0.05929 | 0.07408 | 0.02754 |
| M | 0.01962 | 0.005764 | 0.01323 | 0.021 | 0.04991 |
| F | 0.05027 | 0.03062 | 0.02753 | 0.03646 | 0.03098 |
| P | 0.03351 | 0.04575 | 0.03759 | 0.04339 | 0.05508 |
| S | 0.04986 | 0.1228 | 0.0667 | 0.07547 | 0.07573 |
| T | 0.04863 | 0.09114 | 0.06194 | 0.05493 | 0.0568 |
| W | 0.01349 | 0.01585 | 0.01588 | 0.01662 | 0.08262 |
| Y | 0.02575 | 0.03963 | 0.05717 | 0.03 | 0.02065 |
| V | 0.06825 | 0.08249 | 0.06511 | 0.08724 | 0.03614 |
Correlation used in Chou and Zhang 1992.
chou_cor <- function(x,y){
numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)
}
Cosine similarity used in Higgs and Attwood (2005). TODO: Checking if this is exactly the same as used in Chou’s 1994 and 1995 papers.
chou_cosine <- function(z.1, z.2){
z.1.abs <- sqrt(sum(z.1^2))
z.2.abs <- sqrt(sum(z.2^2))
my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs)
return(my.cosine)
}
Calculating the correlation between each column
corr.alpha <- chou_cor(aa.prop[,5], aa.prop[,1])
corr.beta <- chou_cor(aa.prop[,5], aa.prop[,2])
corr.apb <- chou_cor(aa.prop[,5], aa.prop[,3])
corr.adb <- chou_cor(aa.prop[,5], aa.prop[,4])
Calculating the cosine similarities
cos.alpha <- chou_cosine(aa.prop[,5], aa.prop[,1])
cos.beta <- chou_cosine(aa.prop[,5], aa.prop[,2])
cos.apb <- chou_cosine(aa.prop[,5], aa.prop[,3])
cos.adb <- chou_cosine(aa.prop[,5], aa.prop[,4])
Calculating distances NOTE: we need to flip he dataframe on its side using a command called t()
aa.prop.flipped <- t(aa.prop)
round(aa.prop.flipped,2)
## A R N D C Q E G H I L
## alpha.prop 0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09
## beta.prop 0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06
## a.plus.b.prop 0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06
## a.div.b 0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08
## SLC22A5.human.aa.freq 0.06 0.01 0.04 0.04 0.07 0.07 0.01 0.06 0.03 0.13 0.04
## K M F P S T W Y V
## alpha.prop 0.10 0.02 0.05 0.03 0.05 0.05 0.01 0.03 0.07
## beta.prop 0.04 0.01 0.03 0.05 0.12 0.09 0.02 0.04 0.08
## a.plus.b.prop 0.06 0.01 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## a.div.b 0.07 0.02 0.04 0.04 0.08 0.05 0.02 0.03 0.09
## SLC22A5.human.aa.freq 0.03 0.05 0.03 0.06 0.08 0.06 0.08 0.02 0.04
Distance matrix
dist(aa.prop.flipped, method = "euclidean")
## alpha.prop beta.prop a.plus.b.prop a.div.b
## beta.prop 0.13342098
## a.plus.b.prop 0.09281824 0.08289406
## a.div.b 0.06699039 0.08659174 0.06175113
## SLC22A5.human.aa.freq 0.19286597 0.16402704 0.15354023 0.16385351
Individual distances
dist.alpha <- dist((aa.prop.flipped[c(1,5),]), method = "euclidean")
dist.beta <- dist((aa.prop.flipped[c(2,5),]), method = "euclidean")
dist.apb <- dist((aa.prop.flipped[c(3,5),]), method = "euclidean")
dist.adb <- dist((aa.prop.flipped[c(4,5),]), method = "euclidean")
Compiling the information
# fold types
fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")
# data
corr.sim <- round(c(corr.alpha,corr.beta,corr.apb,corr.adb),5)
cosine.sim <- round(c(cos.alpha,cos.beta,cos.apb,cos.adb),5)
Euclidean.dist <- round(c(dist.alpha,dist.beta,dist.apb,dist.adb),5)
# summary
sim.sum <- c("","","most.sim","")
dist.sum <- c("","","min.dist","")
df <- data.frame(fold.type,
corr.sim ,
cosine.sim ,
Euclidean.dist ,
sim.sum ,
dist.sum )
Displaying output
pander(df)
| fold.type | corr.sim | cosine.sim | Euclidean.dist | sim.sum | dist.sum |
|---|---|---|---|---|---|
| alpha | 0.7178 | 0.7178 | 0.1929 | ||
| beta | 0.7988 | 0.7988 | 0.164 | ||
| alpha plus beta | 0.8112 | 0.8112 | 0.1535 | most.sim | min.dist |
| alpha/beta | 0.7882 | 0.7882 | 0.1638 |
Convert all FASTA records intro entries in a single vector. FASTA entries are contained in a list produced at the beginning of the script. They were cleaned to remove the header and newline characters.
SLC22A5_vector <- rep(NA, length(SLC22A5_list))
for (i in 1:length(SLC22A5_list))
{
SLC22A5_vector[i] <- SLC22A5_list[[i]]
}
names(SLC22A5_list)
## [1] " NP_001295051" " NP_001342696" "XP_004042518" "XP_038537409"
## [5] "XP_006927754" "XP_032951789" "XP_016809250" "NP_001039967"
## [9] " XP_004920385 " "XP_024103525"
length(SLC22A5_list)
## [1] 10
Naming the vector
names(SLC22A5_vector) <- names(SLC22A5_list)
align.human.vs.chimp <- Biostrings:: pairwiseAlignment (
hSLC22A5_string, chSLC22A5_string)
align.human.vs.cat <- Biostrings::pairwiseAlignment(
hSLC22A5_string, catSLC22A5_string )
align.human.vs.cattle <- Biostrings::pairwiseAlignment(
hSLC22A5_string, cattleSLC22A5_string)
align.chimp.vs.cat <- Biostrings::pairwiseAlignment(
chSLC22A5_string, catSLC22A5_string)
align.chimp.vs.cattle <- Biostrings::pairwiseAlignment(
chSLC22A5_string, cattleSLC22A5_string)
align.cat.vs.cattle <- Biostrings::pairwiseAlignment(
catSLC22A5_string, cattleSLC22A5_string )
Biostrings:: pid(align.human.vs.chimp)
## [1] 99.48365
Biostrings::pid(align.human.vs.cat)
## [1] 88.46816
Biostrings::pid(align.human.vs.cattle)
## [1] 86.74699
Biostrings::pid(align.chimp.vs.cat)
## [1] 88.12392
Biostrings::pid(align.chimp.vs.cattle)
## [1] 86.74699
Biostrings::pid(align.cat.vs.cattle)
## [1] 93.35727
Building matrix
pids <- c(1, NA, NA, NA,
pid(align.human.vs.chimp), 1, NA, NA,
pid(align.human.vs.cat), pid(align.chimp.vs.cat), 1, NA,
pid(align.human.vs.cattle), pid(align.chimp.vs.cattle), pid(align.cat.vs.cattle), 1)
mat <- matrix(pids, nrow = 4, byrow = T)
row.names(mat) <- c("Homo","Pan","Cat","Cattle")
colnames(mat) <- c("Homo","Pan","Cat","Cattle")
pander(mat)
| Homo | Pan | Cat | Cattle | |
|---|---|---|---|---|
| Homo | 1 | NA | NA | NA |
| Pan | 99.48 | 1 | NA | NA |
| Cat | 88.47 | 88.12 | 1 | NA |
| Cattle | 86.75 | 86.75 | 93.36 | 1 |
Comparing different PID methods
#A comparison chimps and human PID with different methods
pid(align.human.vs.chimp, type = "PID1")
## [1] 99.48365
pid(align.human.vs.chimp, type = "PID2")
## [1] 99.48365
pid(align.human.vs.chimp, type = "PID3")
## [1] 99.48365
pid(align.human.vs.chimp, type = "PID4")
## [1] 99.48365
#A comparsion of rate and cattle PID with different methods
pid(align.cat.vs.cattle, type = "PID1")
## [1] 93.35727
pid(align.cat.vs.cattle, type = "PID2")
## [1] 93.35727
pid(align.cat.vs.cattle, type = "PID3")
## [1] 93.35727
pid(align.cat.vs.cattle, type = "PID4")
## [1] 93.35727
SLC22A5_vector_ss <- AAStringSet(SLC22A5_vector)
# add necessary function
SLC22A5_align <-msa(SLC22A5_vector_ss,
method = "ClustalW")
## use default substitution matrix
msa produces a species MSA objects
class(SLC22A5_align)
## [1] "MsaAAMultipleAlignment"
## attr(,"package")
## [1] "msa"
is(SLC22A5_align)
## [1] "MsaAAMultipleAlignment" "AAMultipleAlignment" "MsaMetaData"
## [4] "MultipleAlignment"
Default output of MSA
msa(SLC22A5_vector_ss, method = "ClustalW")
Change class of alignment
class(SLC22A5_align) <- "AAMultipleAlignment"
Convert to seqinr format
SLC22A5_align_seqinr <- msaConvert(SLC22A5_align, type = "seqinr::alignment")
OPTIONAL: show output with print_msa
print_msa(alignment = SLC22A5_align_seqinr)
## [1] "MRDYDEVTAFLGEWGPFQRLIFFLLSASIIPNGFTGLSSVFLIATPEHRCRVPDAANLSS 0"
## [1] "MRDYDEVTAFLGEWGPFQRLIFFLLSASIIPNGFTGLSSVFLIATPEHRCRVPDAANLSS 0"
## [1] "MRDYDEVTAFLGEWGPFQRLIFFLLSASIIPNGFTGLSSVFLIATPEHRCRVPDAANLSS 0"
## [1] "MRDYDEVTAFLGEWGPFQRLIFFLLSASIIPNGFTGLSSVFLIATPEHRCRVPDAANLSS 0"
## [1] "MQDYDEVTAFLGEWGPFQRLIFFLLSASIIPNGFNGLSSVFFTATPEHHCRVPDTANLSS 0"
## [1] "MRDYDEVTAFLGEWGPFQRLIFFLLSASIIPNGFNGMSVVFLAATPEHRCRVPDTANLSS 0"
## [1] "MRDYDEVTAFLGEWGPFQRLIFFLLSASIIPNGFNGMSAVFLTGTPEHRCRVPDTANLSS 0"
## [1] "MRDYDEVTTFLGEWGPFQRLIFFLLSASIIPNGFNGMSAVFLAATPEHRCRVPDAANLSR 0"
## [1] "------------------------------------------------------------ 0"
## [1] "MPTVDDILEHIGEFHLFQKQTFFLLALLSGAFTPIYVGIVFLGFTPNHHCRSPGVAELSQ 0"
## [1] " "
## [1] "-------AWRNHTVP-LRLRDGREVPHSCRRYRLAT-IANFSALGLEPGRDVDLGQLEQE 0"
## [1] "-------AWRNHTVP-LRLRDGREVPHSCRRYRLAT-IANFSALGLEPGRDVDLGQLEQE 0"
## [1] "-------AWRNHTVP-LRLRDGREVPHSCRRYRLAT-IANFSALGLEPGRDVDLGQLEQE 0"
## [1] "-------AWLNHSVP-LRLRDGREVPHSCRRYRLAT-IANFSALGLEPGRDVDLGQLEQE 0"
## [1] "-------AWRNHSVP-MRLQDGREVPQSCRRYRLAM-MVNFSELGLEPGRDVDLEQLEQE 0"
## [1] "-------AWRNHSIP-LRLQDGREVPHSCRRYRLAA-IANFSALGLEPGRDLDLEQLEQE 0"
## [1] "-------AWRNHSVP-LRLQNGHEVPHSCRRYRLEA-ISNFSALGLEPGRDVDLEQLEQE 0"
## [1] "-------AWRNHSIP-LRLQDGREVPHSCRRYRLAA-IANFSALGLEPERDVDLEQLEQE 0"
## [1] "------------------------------------------------------------ 0"
## [1] "RCGWSPAEELNYTVPGLGSAGEVSFLSQCMRYEVDWNQSTLDCVDPLSSLAANRSHLPLS 0"
## [1] " "
## [1] "SCLDGWVFSQDVYLSTIVTEQDSGAHNAMKNRMGRKPALCLPAQWNLVCEDDWKAPLTIS 0"
## [1] "SCLDGWEFSQDVYLSTIVTEQDSGAYNAMKNRMGRKPALCLPAQWNLVCEDDWKAPLTIS 0"
## [1] "SCLDGWEFSQDVYLSTIVTEQDSGAYNAMKNRMGKKPALCLPAQWNLVCEDDWKAPLTIS 0"
## [1] "SCLDGWEFSQDIYLSTIVT------------------------EWNLVCEDDWKAPLTIS 0"
## [1] "GCLDGWEFSQDIYLSTIVT------------------------EWNLVCEDDWKAPLTIS 0"
## [1] "SCLDGWEFSQDIYLSTIVT------------------------EWNLVCEDDWKAPLTVS 0"
## [1] "SCLDGWEFSQDVYLSTIVT------------------------EWNLVCEDDWKAPLTTS 0"
## [1] "SCLDGWEFSQDVYQSTIVT------------------------EWNLVCEDDWKAPLTVS 0"
## [1] "--------------------------------------------WNLVCENDWKGPLTTS 0"
## [1] "PCEHGWVY--DTPGSSIVT------------------------EFNLVCAHSWMLDLFQS 0"
## [1] " "
## [1] "LFFVGVLLGSFISGQLSDRFGRKNVLFVTMGMQTGFSFLQIFSKNFEMFVVLFVLVGMGQ 0"
## [1] "LFFVGVLLGSFISGQLSDRFGRKNVLFVTMGMQTGFSFLQIFSKNFEMFVVLFVLVGMGQ 0"
## [1] "LFFVGVLLGSFISGQLSDRFGRKNVLFVTMGMQTGFSFLQIFSKNFEMFVVLFVLVGMGQ 0"
## [1] "LFFVGVLLGSFISGQLSDRFGRKNVLFVTMGMQTGFSFLQIFSKNFEMFVVLFVLVGMGQ 0"
## [1] "LFFVGVLMGSFISGQLSDRFGRKNVLFVTMGMQTGFSFLQIFSKNFEMFTVLFFLVGMGQ 0"
## [1] "LFFVGVLVGSFISGQLSDRFGRKNVLFVTMGMQTGFSFLQVFSKNFEMFAVLFVLVGMGQ 0"
## [1] "LFFGGVLVGSFISGQLSDRFGRKNVLFVTMGMQTGFSFLQVFSKNFEMFTVLFILVGMGQ 0"
## [1] "LFFVGVLVSSFISGQLSDRFGRKNVLFVTMGMQTGFSFLQIFSKNFEMFTVLFVLVGMGQ 0"
## [1] "LFFVGVLIGSFVSGQMSDRFGRKKVLFATMAVQTGFSIIQVFSVNWEMFTALFIIVGMGQ 0"
## [1] "LVNVGFFIGAVGIGYLADRFGRKFCLLVTILINAISGVLMAISPNYAWMLVFRFLQGLVS 0"
## [1] " "
## [1] "ISNYVAAFVLGTEILG----------------KSVRIIFSTLG----VCIFY-------- 0"
## [1] "ISNYVAAFVLGTEILG----------------KSVRIIFSTLG----VCIFY-------- 0"
## [1] "ISNYVAAFVLGTEILG----------------KSVRIIFSTLG----VCIFY-------- 0"
## [1] "ISNYVAAFVLGTEILG----------------KSVRIIFSTLG----VCIFY-------- 0"
## [1] "ISNYVAAFVLGTEILG----------------KSVRIIFSTLG----VCIFY-------- 0"
## [1] "ISNYVAAFVLGTEILG----------------KSVRIIFSTLG----VCIFY-------- 0"
## [1] "ISNYVAAFVLGTEILG----------------KSVRIIFSTLG----VCIFY-------- 0"
## [1] "ISNYVAAFVLGMALQSENRNSWQIGSYYILYVRSVHILCIWLHDAATICLLHQRLADAAA 0"
## [1] "ISNYVAAFILGAEILD----------------KSVRIIFSTLG----VCIFY-------- 0"
## [1] "KAGWLIGYILITEFVG------------LGYRRTVGICYQIAF----------------- 0"
## [1] " "
## [1] "-------AFGYMVLPLFAYF--IRDWRMLLVALTMPGVLCVALWW-----------FIPE 0"
## [1] "-------AFGYMVLPLFAYF--IRDWRMLLVALTMPGVLCVALWW-----------FIPE 0"
## [1] "-------AFGYMVLPLFAYF--IRDWRMLLVALTMPGVLCVALWW-----------FIPE 0"
## [1] "-------AFGYMVLPLFAYF--IRDWRMLLVALTMPGVLCVALWW-----------FIPE 0"
## [1] "-------AFGYMLLPLFAYF--IRDWRMLLLALTVPGVLCAALWW-----------FIPE 0"
## [1] "-------AFGYMLLPLFAYF--IRDWRMLLLALTVPGVLCAALWW-----------FIPE 0"
## [1] "-------AVGYMLLPLFAYF--IRDWRKLLLALTVPGVLCAALWW-----------FIPE 0"
## [1] "GTDATRGAVCSSLVVSFACAPGVPTGRCLLFSLHQGAATSHKSLFGLLETGSINYLFIPE 0"
## [1] "-------AIGYMLLPLFAYF--IRDWRTLLLALTIPGLFCIPLWW-----------IIPE 0"
## [1] "-------TVGLLILAGVAYA--LPNWRWLQFAVTLPNFCFLLYFW-----------CIPE 0"
## [1] " "
## [1] "SPRWLISQGRFEEAEVIIRKAAKANGIVVPSTIFDPSELQDLSSKKQQSHNILDLLRTWN 0"
## [1] "SPRWLISQGRFEEAEVIIRKAAKANGIVVPSTIFDPSELQDLSSKKQQSHNILDLLRTWN 0"
## [1] "SPRWLISQGRFEEAEVIIRKAAKANGIVVPSTIFDPSELQDLSSKKQQSHNILDLLRTWN 0"
## [1] "SPRWLISQGRFEEAEVIIRKAAKANGIVVPSTIFDPSELQDLSSKKQQSHNILDLLRTWN 0"
## [1] "SPRWLISQGRFQEAEVIIRRAAKTNGIIAPSTIFDSSELQDLSSKKQQSHSILDLIRTRN 0"
## [1] "SPRWLISQGRFEEAEVIIRRAAKINGIVAPSTIFDSSELQDLSSKKQQSHSILDLLQTRN 0"
## [1] "SPRWLISQGRFKEAEVIIHRAAKINGIVAPSTLFEPSELQDLSSQKQQSHSILDLLRSRN 0"
## [1] "SPRWLISQGRLKEAEVIIRKAAKMNGIVAPSTIFDSSELEDLSSEKQQSHSILDLLRTRN 0"
## [1] "SPRWLISQGRFQEAEDIIRKAAKKNGITPPDSIFNFTELQEQKELTHKSHTFLDLLKTRN 0"
## [1] "SPRWLISQNKNAKAMKIIKHIAKKNGKSVPVSLQSLTADEDTG--MKLNPSFLDLVRTPQ 0"
## [1] " "
## [1] "IRMVTIMSIMLWMTISVGYFGLSLDTPNLHGDIFVNCFLSAMVEVPAYVLAWLLLQYLPR 0"
## [1] "IRMVTIMSVMLWMTISVGYFGLSLDTPNLHGDIFVNCFLSAMVEVPAYVLAWLLLQYLPR 0"
## [1] "IRMVTIMSIMLWMTISVGYFGLSLDTPNLHGDIFVNCFLSAMVEVPAYVLAWLLLQYLPR 0"
## [1] "IRMVTIMSIMLWMTISVGYFGLSLDTPNLHGDIYVNCFLSAMVEVPAYVLAWLLLQYLPR 0"
## [1] "IRMITVMSIILWLTISVGYFGLSLDTPNLHGDVYLNCFLSAVVEVPAYVLAWLLLRHLPR 0"
## [1] "IRMVTVMSIILWMTISVGYFGLSLDTPNLHGDVYVNCFLSAVVEVPAYVLAWLLLQHLPR 0"
## [1] "IRMVTIMSIILWMTISVGYFGLSLDTPNLHGDVYVNCFLSAVVEVPAYILAWLLLQHLPR 0"
## [1] "IRIVTVMCIILWMTISVGYFGLSLDTPNLHGDVYVNCFLSAVVEVPAYVLAWLLLQHMPR 0"
## [1] "IRIITFLSILLWMIISVGYFGLSLNTPNLHGDPYVNCFLSAIIEVPAYVIAWLLLRSFPR 0"
## [1] "IRKHTLILMYNWFTSSVLYQGLIMHMGLAGDNIYLDFFYSALVEFPAAFIIILTIDRIGR 0"
## [1] " "
## [1] "RYSMATALFLGGSVLLFVQLVPPDLYYLATVLVMVGKFGVTAAFSMVYVYTAELYPTVVR 0"
## [1] "RYSMATALFLGGSVLLFVQLVPPDLYYLATVLVMVGKFGVTAAFSMVYVYTAELYPTVVR 0"
## [1] "RYSMATALFLGGSVLLFMQLVPPDLYYLATVLVMVGKFGVTAAFSMVYVYTAELYPTVVR 0"
## [1] "RYSMATALFLGGSVLLFVQLVPPDLYYLATVLVMVGKFGVTAAFSMVYVYTAELYPTVVR 0"
## [1] "RYSMATALFLGGSVLLFVQLVPPELYYLATVLVMVGKFGVTAAFSMVYVYTAELYPTVVR 0"
## [1] "RYSMATALFLGGSVLLFMQLVPPDLYYLATVLVMVGKFGITAAFSMVYVYTAELYPTVVR 0"
## [1] "RYSMATALFLGGSVLLFVQLVPPDLYYLATVLVMVGKFGVTAAFSMVYVYTAELYPTVVR 0"
## [1] "RYSMATALFLGGSILLFVQLVPPDLYYLATVLVMVGKFGVTAAFSMVYVYTAELYPTVVR 0"
## [1] "RYSTASTLVLGGVVLLFIQLVPQELGILSIVLVMLGKFGITSAFSMVYVYTAELYPTVVR 0"
## [1] "RYPWAVSNMVAGAACLASVFIPDDLQWLKITVACLGRMGITIAYEMVCLVNAELYPTYIR 0"
## [1] " "
## [1] "NMGVGVSSTASRLGSILSPYFVY-LGAYDRFLPYILMGSLTILTAILTLFLPESFGTPLP 0"
## [1] "NMGVGVSSTASRLGSILSPYFVY-LGAYDRFLPYILMGSLTILTAILTLFLPESFGTPLP 0"
## [1] "NMGVGVSSTASRLGSILSPYFVY-LGAYDRFLPYILMGSLTILTAILTLFLPESFGTPLP 0"
## [1] "NMGVGVSSTASRLGSILSPYFVY-LGAYDRFLPYILMGSLTILTAILTLFLPESFGTPLP 0"
## [1] "NMGVGVSSTASRLGSILSPYFVY-LGAYDRFLPYILMGSLTILTAILTLFLPETFGTPLP 0"
## [1] "NMGVGVSSTASRLGSILSPYFVY-LGAYDRFLPYILMGSLTILTAILTLFLPESFGTPLP 0"
## [1] "NMGVGVSSTASRLGSILAPYFIY-LGAYDRFLPYILMGSLTILTAILTLFLPESFGTPLP 0"
## [1] "NMGVGVSSTASRLGSILSPYFVY-LGAYDRFLPYILMGSLTILTAILTLFLPETFGTPLP 0"
## [1] "NMGVGASSMASRMGSILSPYFVY-LGAYDRFLPFILMGSLTVLIGMFTLCLPESHGMPLP 0"
## [1] "NLAVLVCSSMCDIGGIVTPFLVYRLTDIWLEFPLVVFAVVGLVAGGLVLLLPETKGKALP 0"
## [1] " "
## [1] "DTIDQMLRVKGMKHRKTPSHTRMLKDGQERPTILKSTAF 21"
## [1] "DTIDQMLRVKGMKHRKTPSHTRMLKDGQERPTILKSTAF 21"
## [1] "DTIDQMLRVKGMKHRKTPSHTRMLKDGQERPTILKSTAF 21"
## [1] "DTIDQMLRVKGMKHRKTPSRTRMLKDGQERPAILKSTAF 21"
## [1] "DTIDQMLRVKGIKYRQTPSHTRMLKDGEESPTVLKSTSF 21"
## [1] "DTIDQMLRVKGIKYRQTPSHTRMLKDGEESSKVLKSTAL 21"
## [1] "DTIDQMLRVKGIKYRQTPNHTRELKDGEENPTVLKSTAF 21"
## [1] "DTIDQMLRVKGIKYRQTPGHTRMLKDSEDSSIVLKSTAL 21"
## [1] "DTIEEMLRVKGFRYKVG---KRLKKDKDRKASVLSNTAL 21"
## [1] "ETIEDAEKMQSLGRLVQTVCH------------------ 21"
## [1] " "
Based on the output from drawProtiens, the first 50 amino acids appears to contain an interesting helical section.
ggmsa(SLC22A5_align,
start = 0,
end = 50)
Distance matrix for all sequences
SLC22A5_dist <- seqinr::dist.alignment(SLC22A5_align_seqinr,
matrix = "identity")
This produces a “dist” class object.
is(SLC22A5_dist)
## [1] "dist" "oldClass"
class(SLC22A5_dist)
## [1] "dist"
SLC22A5_dist_rounded <- round(SLC22A5_dist, 3)
SLC22A5_dist_rounded
## XP_004042518 XP_016809250 NP_001295051 XP_024103525
## XP_016809250 0.072
## NP_001295051 0.083 0.072
## XP_024103525 0.120 0.120 0.120
## NP_001039967 0.311 0.311 0.311 0.306
## XP_006927754 0.287 0.287 0.281 0.278
## XP_032951789 0.314 0.314 0.314 0.314
## XP_038537409 0.415 0.415 0.415 0.413
## XP_004920385 0.526 0.528 0.526 0.524
## NP_001342696 0.820 0.821 0.821 0.818
## NP_001039967 XP_006927754 XP_032951789 XP_038537409
## XP_016809250
## NP_001295051
## XP_024103525
## NP_001039967
## XP_006927754 0.258
## XP_032951789 0.294 0.254
## XP_038537409 0.406 0.362 0.395
## XP_004920385 0.515 0.508 0.521 0.594
## NP_001342696 0.820 0.822 0.821 0.828
## XP_004920385
## XP_016809250
## NP_001295051
## XP_024103525
## NP_001039967
## XP_006927754
## XP_032951789
## XP_038537409
## XP_004920385
## NP_001342696 0.818
nj() is simple function that takes only a single argument, a distance matrix.
# Note - not using rounded values
tree_subset <- nj(SLC22A5_dist)
# plot tree
plot.phylo(tree_subset, main="Phylogenetic Tree",
use.edge.length = F)
# add label
mtext(text = "SLC22A5 family gene tree - rooted, with no branch lengths")