The encoded protein is an organic cation transporter and plasma integral membrane protein containing eleven putative transmembrane domains as well as a nucleotide-binding site motif. Transport by this protein is partially ATP-dependent.
Resources/References -Refseq Gene:https://www.ncbi.nlm.nih.gov/nuccore/1523761482?report=graph -Refseq Homologene:https://www.ncbi.nlm.nih.gov/homologene?LinkName=gene_homologene&from_uid=6583 -Uniprot:https://www.uniprot.org/uniprot/Q9H015 -Pfam:http://pfam.xfam.org/protein/Q9H015 -Alphafold:https://alphafold.ebi.ac.uk/entry/Q9H015
#Required Packages:
#install.packages("BiocManager")
#BiocManager::install("drawProteins")
#install.packages("HGNChelper")
#Load Necessary Packages
#github
library(compbio4all)
library(ggmsa)
## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
#CRAN
library(rentrez)
library(seqinr)
library(ape)
##
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
##
## as.alignment, consensus
library(pander)
#Bioconductor
library(msa)
## Loading required package: Biostrings
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: XVector
## Loading required package: GenomeInfoDb
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
##
## complement
## The following object is masked from 'package:seqinr':
##
## translate
## The following object is masked from 'package:base':
##
## strsplit
library(drawProteins)
## Biostrings
library(Biostrings)
library(HGNChelper)
## Warning: package 'HGNChelper' was built under R version 4.1.2
##Accession Numbers
Accession numbers for SLC22A4 were obtained from Refseq, Refseq Homologene, Uniprot and PDB. Uniprot accession numbers found by searching for gene name. PDB accessions found by entering Uniprot accession number or SLC22A4 gene name
A protein BLAST search was conducted to determine if SLC22A4 is expressed outside of vertebrae and it was determined that it is not expressed in non-vertebrae. A second protein BLAST was conducted to determine if SLC22A4 is expressed outside mammals. Link to SLC22A4 Protein BLAST search: https://blast.ncbi.nlm.nih.gov/Blast.cgi
HGNChelper::checkGeneSymbols(x = c("SLC22A4"))
## Maps last updated on: Thu Oct 24 12:31:05 2019
## x Approved Suggested.Symbol
## 1 SLC22A4 TRUE SLC22A4
##Creating Accession Number Table
Creating a table with all the accession numbers from various databases for SLC22A4
No Refseq or Uniprot Accession found for H. Neanderthalus
slc22a4_table <- c("NP_003050.2", "Q9H015", "NA", "Homo sapiens", "Human", "SLC22A4",
"XP_001163062.1", "H2QRG3", "NA", "Pan troglodytes", "Chimpanzee", "SLC22A4",
"NP_062661.1", "Q9Z306", "NA", "Mus musculus",
"House mouse", "Slc22a4",
"NP_001193918.1", "E1BEU0", "NA", "Bos taurus",
"Cattle", "SLC22A4",
"XP_005626572.1", "F6UW69", "NA", "Canis lupus",
"Wolf", "SLC22A4",
"XP_017727518.1", "A0A2K6MNT9", "NA", "Rhinopithecus bieti", "Black-and-white snub-nosed monkey", "SLC22A4",
"NP_071606.1", "Q9R141", "NA", "Rattus norvegicus",
"Brown rat", " Slc22a4",
"NP_001139603.1", "A0A1D5PH68", "NA", "Gallus gallus",
"Chicken", "SLC22A4",
"XP_013820784.2", "A0A452FJ23", "NA", "Capra hircus",
"Goat", " SLC22A4",
"XP_011945580.1", "A0A2K5N3M5", "NA", "Cercocebus atys",
"Sooty mangabey", "SLC22A4")
#Creating table
slc22a4_table <- matrix(slc22a4_table, ncol=6, byrow=TRUE)
dim(slc22a4_table)
## [1] 10 6
#Converting to dataframe
slc22a4_table <- data.frame(slc22a4_table)
#Dataframe info
is(slc22a4_table)
## [1] "data.frame" "list" "oldClass" "vector"
## [5] "list_OR_List" "vector_OR_Vector" "vector_OR_factor"
#Adding column names to table
colnames(slc22a4_table)
## [1] "X1" "X2" "X3" "X4" "X5" "X6"
colnames(slc22a4_table) <- c("ncbi.protein.accession", "UniProt.id", "PDB", "species", "common.name", "gene.name")
#Displaying table using pander
pander::pander(slc22a4_table)
| ncbi.protein.accession | UniProt.id | PDB | species |
|---|---|---|---|
| NP_003050.2 | Q9H015 | NA | Homo sapiens |
| XP_001163062.1 | H2QRG3 | NA | Pan troglodytes |
| NP_062661.1 | Q9Z306 | NA | Mus musculus |
| NP_001193918.1 | E1BEU0 | NA | Bos taurus |
| XP_005626572.1 | F6UW69 | NA | Canis lupus |
| XP_017727518.1 | A0A2K6MNT9 | NA | Rhinopithecus bieti |
| NP_071606.1 | Q9R141 | NA | Rattus norvegicus |
| NP_001139603.1 | A0A1D5PH68 | NA | Gallus gallus |
| XP_013820784.2 | A0A452FJ23 | NA | Capra hircus |
| XP_011945580.1 | A0A2K5N3M5 | NA | Cercocebus atys |
| common.name | gene.name |
|---|---|
| Human | SLC22A4 |
| Chimpanzee | SLC22A4 |
| House mouse | Slc22a4 |
| Cattle | SLC22A4 |
| Wolf | SLC22A4 |
| Black-and-white snub-nosed monkey | SLC22A4 |
| Brown rat | Slc22a4 |
| Chicken | SLC22A4 |
| Goat | SLC22A4 |
| Sooty mangabey | SLC22A4 |
##Data Preparation #Downloading Protein Sequences
We are downloading the sequences using the wrapper compbio4all::entrez_fetch_list()
The wrapper uses the rentrez::entrez_fetch() function to access NCBI databases for sequence information
# Downloading and prepping sequences
slc22a4_accessions <- slc22a4_table[,1] # Protein Accession column
slc22a4_list <- entrez_fetch_list(db="protein", id=slc22a4_accessions, rettype="fasta")
#Number of FASTA files downloaded from NCBI
length(slc22a4_list)
## [1] 10
Human slc22a4 FASTA file output
slc22a4_list[[1]]
## [1] ">NP_003050.2 solute carrier family 22 member 4 [Homo sapiens]\nMRDYDEVIAFLGEWGPFQRLIFFLLSASIIPNGFNGMSVVFLAGTPEHRCRVPDAANLSSAWRNNSVPLR\nLRDGREVPHSCSRYRLATIANFSALGLEPGRDVDLGQLEQESCLDGWEFSQDVYLSTVVTEWNLVCEDNW\nKVPLTTSLFFVGVLLGSFVSGQLSDRFGRKNVLFATMAVQTGFSFLQIFSISWEMFTVLFVIVGMGQISN\nYVVAFILGTEILGKSVRIIFSTLGVCTFFAVGYMLLPLFAYFIRDWRMLLLALTVPGVLCVPLWWFIPES\nPRWLISQRRFREAEDIIQKAAKMNNIAVPAVIFDSVEELNPLKQQKAFILDLFRTRNIAIMTIMSLLLWM\nLTSVGYFALSLDAPNLHGDAYLNCFLSALIEIPAYITAWLLLRTLPRRYIIAAVLFWGGGVLLFIQLVPV\nDYYFLSIGLVMLGKFGITSAFSMLYVFTAELYPTLVRNMAVGVTSTASRVGSIIAPYFVYLGAYNRMLPY\nIVMGSLTVLIGILTLFFPESLGMTLPETLEQMQKVKWFRSGKKTRDSMETEENPKVLITAF\n\n"
##Formatting/Cleaning Fasta Files
for(i in 1:length(slc22a4_list)){
slc22a4_list[[i]] <- compbio4all::fasta_cleaner(slc22a4_list[[i]], parse = F)
}
##General Protein Information #Protein Diagram Drawing a protein diagram
uniprot_id <- slc22a4_table[1, 2]# uniprot ids
json <- drawProteins::get_features(uniprot_id)
## [1] "Download has worked"
#drawing protein diagram
prot_df <- drawProteins::feature_to_dataframe(json)
canvas <- draw_canvas(prot_df)
canvas <- draw_chains(canvas, prot_df, label_size = 2.5)
canvas <- draw_regions(canvas, prot_df)
canvas <- draw_recept_dom(canvas, prot_df)
canvas <- draw_folding(canvas, prot_df)
canvas <- draw_repeat(canvas, prot_df)
canvas <- draw_phospho(canvas, prot_df)
show(canvas)
homo_sapiens <- slc22a4_table[1, 1]
homo_sapiens <- entrez_fetch(id=homo_sapiens, db="protein", rettype = "fasta")
homo_sapiens <- fasta_cleaner(homo_sapiens)
par(mfrow = c(2,2),
mar = c(0,0,4,2))
#dotplot 1: Default
dotPlot(homo_sapiens, homo_sapiens,
wsize = 1,
nmatch = 1,
xlab="homo_sapiens", ylab="homo_sapiens",
main = "Default")
#dotplot 2: size = 10, nmatch = 1
dotPlot(homo_sapiens, homo_sapiens,
wsize = 10,
nmatch = 1,
xlab="homo_sapiens", ylab="homo_sapiens",
main = "Window Size: 10, nmatch:1")
#dotplot 3: size = 10, nmatch = 5
dotPlot(homo_sapiens, homo_sapiens,
wsize = 10,
nmatch = 5,
xlab="homo_sapiens", ylab="homo_sapiens",
main = "Window Size: 10, nmatch: 5")
#dotplot 4: size = 20, nmatch = 5
dotPlot(homo_sapiens, homo_sapiens,
wsize = 20,
nmatch = 5,
xlab="homo_sapiens", ylab="homo_sapiens",
main = "Window Size: 20, nmatch:5")
#reset par() - run this or other plots will be small!
par(mfrow = c(1,1),
mar = c(4,4,4,4))
#best version of plot
dotPlot(homo_sapiens, homo_sapiens,
wsize = 1,
nmatch = 1,
xlab="homo_sapiens", ylab="homo_sapiens",
main = "Best version of plot")
##Protein Properties Compiled from Databases
sources <- c("PFAM", "Uniprot", "AlphaFold", "RepeatsDB")
properties <- c("There is a sugar (and other) transporter family domain in the protein expressed by the slc22a4 gene.",
"SLC22A4 encodes an organic cation transporter and plasma integral protein. As such it is located within the cytoplasm of the cell",
"Alphafold database did not provide specific information on the secondary structure of the protein enconded by SLC22A4, however judging from the 3-D model there are quite a few alpha-helices present",
"NA")
links <- c("http://pfam.xfam.org/protein/Q9H015,
https://www.uniprot.org/uniprot/Q9H015#family_and_domains,
https://alphafold.ebi.ac.uk/entry/Q9H015",
"NA")
#converting protein info to table
protein_table <- data.frame(sources, properties, links)
colnames(protein_table) <- c("Source", "Protein Property", "Link")
pander::pander(protein_table)
| Source | Protein Property |
|---|---|
| PFAM | There is a sugar (and other) transporter family domain in the protein expressed by the slc22a4 gene. |
| Uniprot | SLC22A4 encodes an organic cation transporter and plasma integral protein. As such it is located within the cytoplasm of the cell |
| AlphaFold | Alphafold database did not provide specific information on the secondary structure of the protein enconded by SLC22A4, however judging from the 3-D model there are quite a few alpha-helices present |
| RepeatsDB | NA |
##Secondary Structure Predictions Using multivariate statistical methods, information regarding protein structure/location was confirmed in the line database
#Predicting Protein Folds Judging by Alphafold database prediction of SLC22A4 3-D structure, there is a presence of alpha-helices. I predict machine-learning methods will output an “alpha” protein fold class
aa_codes <- c("A", "R", "N", "D", "C", "Q", "E", "G", "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V")
# alpha proteins
alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91,
221, 249, 48, 123, 82, 122, 119, 33, 63, 167)
# check against chou's total
sum(alpha) == 2447
## [1] TRUE
# beta proteins
beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120,
177, 115, 16, 85, 127, 341, 253, 44, 110, 229)
# check against chou's total
sum(beta) == 2776
## [1] TRUE
# alpha + beta
a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
110, 112, 25, 52, 71, 126, 117, 30, 108, 123)
sum(a.plus.b) == 1889
## [1] TRUE
# alpha/beta
a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239,
339, 321, 91, 158, 188, 327, 238, 72, 130, 378)
sum(a.div.b) == 4333
## [1] TRUE
comp_table <- data.frame(aa_codes, alpha, beta, a.plus.b, a.div.b)
pander::pander(comp_table)
| aa_codes | alpha | beta | a.plus.b | a.div.b |
|---|---|---|---|---|
| A | 285 | 203 | 175 | 361 |
| R | 53 | 67 | 78 | 146 |
| N | 97 | 139 | 120 | 183 |
| D | 163 | 121 | 111 | 244 |
| C | 22 | 75 | 74 | 63 |
| Q | 67 | 122 | 74 | 114 |
| E | 134 | 86 | 86 | 257 |
| G | 197 | 297 | 171 | 377 |
| H | 111 | 49 | 33 | 107 |
| I | 91 | 120 | 93 | 239 |
| L | 221 | 177 | 110 | 339 |
| K | 249 | 115 | 112 | 321 |
| M | 48 | 16 | 25 | 91 |
| F | 123 | 85 | 52 | 158 |
| P | 82 | 127 | 71 | 188 |
| S | 122 | 341 | 126 | 327 |
| T | 119 | 253 | 117 | 238 |
| W | 33 | 44 | 30 | 72 |
| Y | 63 | 110 | 108 | 130 |
| V | 167 | 229 | 123 | 378 |
#calculate proportions for each of the four protein fold types
alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)
#dataframe
aa_prop <- data.frame(alpha.prop,
beta.prop,
a.plus.b.prop,
a.div.b)
round(aa_prop, 4)
## alpha.prop beta.prop a.plus.b.prop a.div.b
## 1 0.1165 0.0731 0.0926 0.0833
## 2 0.0217 0.0241 0.0413 0.0337
## 3 0.0396 0.0501 0.0635 0.0422
## 4 0.0666 0.0436 0.0588 0.0563
## 5 0.0090 0.0270 0.0392 0.0145
## 6 0.0274 0.0439 0.0392 0.0263
## 7 0.0548 0.0310 0.0455 0.0593
## 8 0.0805 0.1070 0.0905 0.0870
## 9 0.0454 0.0177 0.0175 0.0247
## 10 0.0372 0.0432 0.0492 0.0552
## 11 0.0903 0.0638 0.0582 0.0782
## 12 0.1018 0.0414 0.0593 0.0741
## 13 0.0196 0.0058 0.0132 0.0210
## 14 0.0503 0.0306 0.0275 0.0365
## 15 0.0335 0.0457 0.0376 0.0434
## 16 0.0499 0.1228 0.0667 0.0755
## 17 0.0486 0.0911 0.0619 0.0549
## 18 0.0135 0.0159 0.0159 0.0166
## 19 0.0257 0.0396 0.0572 0.0300
## 20 0.0682 0.0825 0.0651 0.0872
#row labels
row.names(aa_prop) <- aa_codes
#output dataframe
pander::pander(aa_prop)
| alpha.prop | beta.prop | a.plus.b.prop | a.div.b | |
|---|---|---|---|---|
| A | 0.1165 | 0.07313 | 0.09264 | 0.08331 |
| R | 0.02166 | 0.02414 | 0.04129 | 0.03369 |
| N | 0.03964 | 0.05007 | 0.06353 | 0.04223 |
| D | 0.06661 | 0.04359 | 0.05876 | 0.05631 |
| C | 0.008991 | 0.02702 | 0.03917 | 0.01454 |
| Q | 0.02738 | 0.04395 | 0.03917 | 0.02631 |
| E | 0.05476 | 0.03098 | 0.04553 | 0.05931 |
| G | 0.08051 | 0.107 | 0.09052 | 0.08701 |
| H | 0.04536 | 0.01765 | 0.01747 | 0.02469 |
| I | 0.03719 | 0.04323 | 0.04923 | 0.05516 |
| L | 0.09031 | 0.06376 | 0.05823 | 0.07824 |
| K | 0.1018 | 0.04143 | 0.05929 | 0.07408 |
| M | 0.01962 | 0.005764 | 0.01323 | 0.021 |
| F | 0.05027 | 0.03062 | 0.02753 | 0.03646 |
| P | 0.03351 | 0.04575 | 0.03759 | 0.04339 |
| S | 0.04986 | 0.1228 | 0.0667 | 0.07547 |
| T | 0.04863 | 0.09114 | 0.06194 | 0.05493 |
| W | 0.01349 | 0.01585 | 0.01588 | 0.01662 |
| Y | 0.02575 | 0.03963 | 0.05717 | 0.03 |
| V | 0.06825 | 0.08249 | 0.06511 | 0.08724 |
#Determine number of each amino acid in protein encoded by SLC22A4
#Function to convert a table into a vector
table_to_vector <- function(table_x){
table_names <- attr(table_x, "dimnames")[[1]]
table_vect <- as.vector(table_x)
names(table_vect) <- table_names
return(table_vect)
}
homo_sapiens_freq_table <- table(homo_sapiens)/length(homo_sapiens)
homo_sapiens_freq_vector <- table_to_vector(homo_sapiens_freq_table)
homo_sapiens_freq_vector
## A C D E F G
## 0.067150635 0.012704174 0.032667877 0.045372051 0.076225045 0.067150635
## H I K L M N
## 0.005444646 0.068965517 0.023593466 0.136116152 0.034482759 0.034482759
## P Q R S T V
## 0.045372051 0.027223230 0.054446461 0.070780399 0.054446461 0.085299456
## W Y
## 0.025408348 0.032667877
Removing any unknown amino acid codes, U, from vector
aa_names <- names(homo_sapiens_freq_vector)
i.U <- which(aa_names == "U")
aa_names[i.U]
## character(0)
homo_sapiens_freq_vector[i.U]
## named numeric(0)
if(length(i.U) > 0) {
homo_sapiens <- homo_sapiens[-i.U]
}
Adding frequency vector data to proportion table
aa_prop$slc22a4_human_aa_freq <- homo_sapiens_freq_vector
pander::pander(aa_prop)
| alpha.prop | beta.prop | a.plus.b.prop | a.div.b | slc22a4_human_aa_freq | |
|---|---|---|---|---|---|
| A | 0.1165 | 0.07313 | 0.09264 | 0.08331 | 0.06715 |
| R | 0.02166 | 0.02414 | 0.04129 | 0.03369 | 0.0127 |
| N | 0.03964 | 0.05007 | 0.06353 | 0.04223 | 0.03267 |
| D | 0.06661 | 0.04359 | 0.05876 | 0.05631 | 0.04537 |
| C | 0.008991 | 0.02702 | 0.03917 | 0.01454 | 0.07623 |
| Q | 0.02738 | 0.04395 | 0.03917 | 0.02631 | 0.06715 |
| E | 0.05476 | 0.03098 | 0.04553 | 0.05931 | 0.005445 |
| G | 0.08051 | 0.107 | 0.09052 | 0.08701 | 0.06897 |
| H | 0.04536 | 0.01765 | 0.01747 | 0.02469 | 0.02359 |
| I | 0.03719 | 0.04323 | 0.04923 | 0.05516 | 0.1361 |
| L | 0.09031 | 0.06376 | 0.05823 | 0.07824 | 0.03448 |
| K | 0.1018 | 0.04143 | 0.05929 | 0.07408 | 0.03448 |
| M | 0.01962 | 0.005764 | 0.01323 | 0.021 | 0.04537 |
| F | 0.05027 | 0.03062 | 0.02753 | 0.03646 | 0.02722 |
| P | 0.03351 | 0.04575 | 0.03759 | 0.04339 | 0.05445 |
| S | 0.04986 | 0.1228 | 0.0667 | 0.07547 | 0.07078 |
| T | 0.04863 | 0.09114 | 0.06194 | 0.05493 | 0.05445 |
| W | 0.01349 | 0.01585 | 0.01588 | 0.01662 | 0.0853 |
| Y | 0.02575 | 0.03963 | 0.05717 | 0.03 | 0.02541 |
| V | 0.06825 | 0.08249 | 0.06511 | 0.08724 | 0.03267 |
##Functions to calculate similarities
Correlation used in Chou and Zhange 1992
chou_cor <- function(x,y){
numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)
}
Cosine similarity used in Higgs and Attwood (2005).
chou_cosine <- function(z.1, z.2){
z.1.abs <- sqrt(sum(z.1^2))
z.2.abs <- sqrt(sum(z.2^2))
my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs)
return(my.cosine)
}
Data exploration. Display outputs as scatterplots. None of these correlations are that great.
par(mfrow = c(2,2), mar = c(1,4,1,1))
plot(alpha.prop ~ slc22a4_human_aa_freq, data = aa_prop)
plot(beta.prop ~ slc22a4_human_aa_freq, data = aa_prop)
plot(a.plus.b.prop ~ slc22a4_human_aa_freq, data = aa_prop)
plot(a.div.b ~ slc22a4_human_aa_freq, data = aa_prop)
par(mfrow = c(1,1), mar = c(1,1,1,1))
Calculate correlation between each column:
corr.alpha <- chou_cor(aa_prop[, 5], aa_prop[, 1])
corr.beta <- chou_cor(aa_prop[, 5], aa_prop[, 2])
corr.apb <- chou_cor(aa_prop[, 5], aa_prop[, 3])
corr.adb <- chou_cor(aa_prop[, 5], aa_prop[, 4])
Calculate cosine simularity between each column:
cos.alpha <- chou_cosine(aa_prop[, 5], aa_prop[, 1])
cos.beta <- chou_cosine(aa_prop[, 5], aa_prop[, 2])
cos.apb <- chou_cosine(aa_prop[, 5], aa_prop[, 3])
cos.adb <- chou_cosine(aa_prop[, 5], aa_prop[, 4])
Calculate distance. Note: we need to flip the dataframe on its side using a command called t()
aa_prop_flipped <- t(aa_prop)
round(aa_prop_flipped,2)
## A R N D C Q E G H I L
## alpha.prop 0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09
## beta.prop 0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06
## a.plus.b.prop 0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06
## a.div.b 0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08
## slc22a4_human_aa_freq 0.07 0.01 0.03 0.05 0.08 0.07 0.01 0.07 0.02 0.14 0.03
## K M F P S T W Y V
## alpha.prop 0.10 0.02 0.05 0.03 0.05 0.05 0.01 0.03 0.07
## beta.prop 0.04 0.01 0.03 0.05 0.12 0.09 0.02 0.04 0.08
## a.plus.b.prop 0.06 0.01 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## a.div.b 0.07 0.02 0.04 0.04 0.08 0.05 0.02 0.03 0.09
## slc22a4_human_aa_freq 0.03 0.05 0.03 0.05 0.07 0.05 0.09 0.03 0.03
Create distance matrix:
dist_matrix <- dist(aa_prop_flipped, method = "euclidean")
Individual Distances:
dist.alpha <- dist(aa_prop_flipped[c(1, 5),], method="euclidean")
dist.beta <- dist(aa_prop_flipped[c(2, 5),], method="euclidean")
dist.apb <- dist(aa_prop_flipped[c(3, 5),], method="euclidean")
dist.adb <- dist(aa_prop_flipped[c(4, 5),], method="euclidean")
# fold types
fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")
# data
corr.sim <- round(c(corr.alpha,corr.beta,corr.apb,corr.adb),5)
cosine.sim <- round(c(cos.alpha,cos.beta,cos.apb,cos.adb),5)
Euclidean.dist <- round(c(dist.alpha,dist.beta,dist.apb,dist.adb),5)
# summary
sim.sum <- c("","","most.sim","")
dist.sum <- c("","","min.dist","")
#create dataframe
df <- data.frame(fold.type,
corr.sim ,
cosine.sim ,
Euclidean.dist ,
sim.sum ,
dist.sum )
Display output
pander::pander(df)
| fold.type | corr.sim | cosine.sim | Euclidean.dist | sim.sum | dist.sum |
|---|---|---|---|---|---|
| alpha | 0.7151 | 0.7151 | 0.1952 | ||
| beta | 0.7911 | 0.7911 | 0.1683 | ||
| alpha plus beta | 0.8126 | 0.8126 | 0.1545 | most.sim | min.dist |
| alpha/beta | 0.7814 | 0.7814 | 0.1679 |
##Percent Identity Comparisons (PID)
#Data Prep Convert FASTA records into single vector
names(slc22a4_list)
## [1] "NP_003050.2" "XP_001163062.1" "NP_062661.1" "NP_001193918.1"
## [5] "XP_005626572.1" "XP_017727518.1" "NP_071606.1" "NP_001139603.1"
## [9] "XP_013820784.2" "XP_011945580.1"
length(slc22a4_list)
## [1] 10
Convert list into vector
#create vector w/ NAs at each homolog position
slc22a4_vector <- rep(NA, length(slc22a4_list))
#populate vector with fasta sequences from slc22a4_list
for(i in 1:length(slc22a4_vector)){
slc22a4_vector[i] <- slc22a4_list[[i]]
}
#label sequences with slc22a4 list names
names(slc22a4_vector) <- names(slc22a4_list)
##PID Table
Performing pairwise alignments comparing human sequence to chimp, mouse, and cattle respectively
human_chimp_alignment <- pairwiseAlignment(slc22a4_list[[1]], slc22a4_list[[2]])
human_mouse_alignment <- pairwiseAlignment(slc22a4_list[[1]], slc22a4_list[[3]])
human_cattle_alignment <- pairwiseAlignment(slc22a4_list[[1]], slc22a4_list[[4]])
#output alignments
human_chimp_alignment
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MRDYDEVIAFLGEWGPFQRLIFFLLSASIIPNGF...TLEQMQKVKWFRSGKKTRDSMETEENPKVLITAF
## subject: MRDYDEVIAFLGEWGPFQRLIFFLLSASIIPNGF...TLEQMQKVKWFRCGKKTRDSMETEENPKVLITAF
## score: 2318.619
human_mouse_alignment
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MRDYDEVIAFLGEWGPFQRLIFFLLSASIIPNGF...TLEQMQKVKWFRSGKKTRDSMETEENPKVLITAF
## subject: MRDYDEVIAFLGEWGPFQRLIFFLLSASIIPNGF...NLEQMQKVRGFRCGKKSTVSVDREESPKVLITAF
## score: 1488.824
human_cattle_alignment
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MRDYDEVIAFLGEWGPFQRLIFFLLSASIIPNGF...TLEQMQKVKWFRSGKKTRDSMETEENPKVLITAF
## subject: MRDYDEVTAFLGKWGPFQRLIFFLLSASIIPNGF...TLEQMQKVKGFRYGKKTKGSMEKEENSKVIITSF
## score: 1604.795
Aligning chimp, mouse, and cattle sequences to eachother
chimp_mouse_alignment <- pairwiseAlignment(slc22a4_list[[2]], slc22a4_list[[3]])
chimp_cattle_alignment <- pairwiseAlignment(slc22a4_list[[2]], slc22a4_list[[4]])
mouse_cattle_alignment <- pairwiseAlignment(slc22a4_list[[3]], slc22a4_list[[4]])
#output alignments
chimp_mouse_alignment
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MRDYDEVIAFLGEWGPFQRLIFFLLSASIIPNGF...TLEQMQKVKWFRCGKKTRDSMETEENPKVLITAF
## subject: MRDYDEVIAFLGEWGPFQRLIFFLLSASIIPNGF...NLEQMQKVRGFRCGKKSTVSVDREESPKVLITAF
## score: 1499.367
chimp_cattle_alignment
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MRDYDEVIAFLGEWGPFQRLIFFLLSASIIPNGF...TLEQMQKVKWFRCGKKTRDSMETEENPKVLITAF
## subject: MRDYDEVTAFLGKWGPFQRLIFFLLSASIIPNGF...TLEQMQKVKGFRYGKKTKGSMEKEENSKVIITSF
## score: 1594.253
mouse_cattle_alignment
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MRDYDEVIAFLGEWGPFQRLIFFLLSASIIPNGF...NLEQMQKVRGFRCGKKSTVSVDREESPKVLITAF
## subject: MRDYDEVTAFLGKWGPFQRLIFFLLSASIIPNGF...TLEQMQKVKGFRYGKKTKGSMEKEENSKVIITSF
## score: 1462.718
Calculating pid of alignments
Biostrings::pid(human_chimp_alignment)
## [1] 99.09256
Biostrings::pid(human_mouse_alignment)
## [1] 84.81013
Biostrings::pid(human_cattle_alignment)
## [1] 86.79928
Biostrings::pid(chimp_mouse_alignment)
## [1] 84.99096
Biostrings::pid(chimp_cattle_alignment)
## [1] 86.61844
Biostrings::pid(mouse_cattle_alignment)
## [1] 84.26763
Building matrix
# human chimp mouse cattle
pids <- c(1, NA, NA, NA,
pid(human_chimp_alignment), 1, NA, NA,
pid(human_mouse_alignment), pid(chimp_mouse_alignment), 1, NA,
pid(human_cattle_alignment), pid(chimp_cattle_alignment), pid(mouse_cattle_alignment), 1)
pid_matrix <- matrix(pids, nrow=4, byrow=T)
row.names(pid_matrix) <- c("Homo", "Pan", "Mus", "Bos")
colnames(pid_matrix) <- c("Homo", "Pan", "Mus", "Bos")
pander::pander(pid_matrix)
| Homo | Pan | Mus | Bos | |
|---|---|---|---|---|
| Homo | 1 | NA | NA | NA |
| Pan | 99.09 | 1 | NA | NA |
| Mus | 84.81 | 84.99 | 1 | NA |
| Bos | 86.8 | 86.62 | 84.27 | 1 |
##PID Method Comparison
pid1 <- Biostrings::pid(human_chimp_alignment, type = "PID1")
pid2 <- Biostrings::pid(human_chimp_alignment, type = "PID2")
pid3 <- Biostrings::pid(human_chimp_alignment, type = "PID3")
pid4 <- Biostrings::pid(human_chimp_alignment, type = "PID4")
pid_methods_vector <- c(pid1, pid2, pid3, pid4)
pid_method <- c("PID1", "PID2", "PID3", "PID4")
denominator <- c("(aligned positions & internal gap positions)",
"(aligned positions)",
"(length of shorter sequence)",
"(average length of sequences)")
pid_df <- data.frame(pid_method, pid_methods_vector, denominator)
colnames(pid_df) <- c("method", "PID", "denominator")
pander::pander(pid_df)
| method | PID | denominator |
|---|---|---|
| PID1 | 99.09 | (aligned positions & internal gap positions) |
| PID2 | 99.09 | (aligned positions) |
| PID3 | 99.09 | (length of shorter sequence) |
| PID4 | 99.09 | (average length of sequences) |
##Multiple Sequence Alignment (MSA) #MSA Data Prep
#convert vectors to stringset format
slc22a4_vector_ss <- Biostrings::AAStringSet(slc22a4_vector)
#Building MSA
slc22a4_msa <- msa(slc22a4_vector_ss, method = "ClustalW")
## use default substitution matrix
#Cleaning/Outputting MSA
class(slc22a4_msa)
## [1] "MsaAAMultipleAlignment"
## attr(,"package")
## [1] "msa"
is(slc22a4_msa)
## [1] "MsaAAMultipleAlignment" "AAMultipleAlignment" "MsaMetaData"
## [4] "MultipleAlignment"
slc22a4_msa
## CLUSTAL 2.1
##
## Call:
## msa(slc22a4_vector_ss, method = "ClustalW")
##
## MsaAAMultipleAlignment with 10 rows and 675 columns
## aln names
## [1] -------------------------...GFRCGKKSTVSVDREESPKVLITAF NP_062661.1
## [2] -------------------------...GFRCGKKSTVSMDREENPKVLITAF NP_071606.1
## [3] -------------------------...WFRSGKKTRDSMETEENPKVLITAF NP_003050.2
## [4] -------------------------...WFRCGKKTRDSMETEENPKVLITAF XP_001163062.1
## [5] MAEPSVEDPVRSLRAGGRAKHWAET...WFRSGKKTRDSMETEKNPKVLITAF XP_017727518.1
## [6] -------------------------...WFRSGKKTRDSMETEENPKVLITAF XP_011945580.1
## [7] -------------------------...GFRYGKKTKGSMEKEENSKVIITSF NP_001193918.1
## [8] -------------------------...GFRYGKKTKGSMEKEENPKVIITSF XP_013820784.2
## [9] -------------------------...GFRFRKRTRDSVEKEENPKVLITSF XP_005626572.1
## [10] -------------------------...CFRNGQQSTGIRNSKESPKTLITTL NP_001139603.1
## Con -------------------------...GFR?GKKTRDSME?EENPKVLITAF Consensus
class(slc22a4_msa) <- "AAMultipleAlignment"
class(slc22a4_msa)
## [1] "AAMultipleAlignment"
#converting MSA to seqinr alignment format
slc22a4_seqinr <- msaConvert(slc22a4_msa, type = "seqinr::alignment")
print_msa(alignment = slc22a4_seqinr, chunksize = 60)
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "MAEPSVEDPVRSLRAGGRAKHWAETAGSPAPSQRPPLPSAGPGMGVWSQVYSGIKLGAEL 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] " "
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "PERSSALSLFPRTGPRLRAPISNSLPVLGNVLTSLGSPQLRDTVPRTLSSPVVASFGAAV 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] " "
## [1] "--MRDYDEVIAFLGEWGPFQRLIFFLLSASIIPNGFNGMSVVFLAGTPEHRCLVPDTVNL 0"
## [1] "--MRDYDEVIAFLGDWGPFQRLIFFLLSASIIPNGFNGMSVVFLAGTPEHRCLVPHTVNL 0"
## [1] "--MRDYDEVIAFLGEWGPFQRLIFFLLSASIIPNGFNGMSVVFLAGTPEHRCRVPDAANL 0"
## [1] "--MRDYDEVIAFLGEWGPFQRLIFFLLSASIIPNGFNGMSVVFLAGTPEHRCRVPDAANL 0"
## [1] "GSMRDYDEAIAFLGEWGLFQRLIFFLLSASIIPNGFNGMSVVFLAGTPEHRCRVPDAANL 0"
## [1] "--MRDYDEAIAFLGEWGPFQRLIFFLLSASIIPNGFNGMSVVFLAGTPEHRCRVPDAANL 0"
## [1] "--MRDYDEVTAFLGKWGPFQRLIFFLLSASIIPNGFNGMSVVFLAGTPEHHCRVPDTANL 0"
## [1] "--MRDYDEVTAFLGKWGPFQRLIFFLLSASIIPNGFNGMSVVFLAGTPEHRCRVPDTANL 0"
## [1] "--MRDYDEVTTFLGEWGPFQRLIFFLLSASIIPNGFNGMSVVFLAATPEHRCRVPDAANL 0"
## [1] "--MRDYEEVTAFLGKWGRFQRLVFFLLSASIIPNGFNGMSAVFLAGTPEHRCAVPPGANL 0"
## [1] " "
## [1] "SSSWRNHSIPLETKDGRQVPQSCRRYRLATIANFSAMGLEPGQDVDLEQLEQESCLDGWE 0"
## [1] "SSAWRNHSIPLETKDGRQVPQSCRRYRLATIANFSALGLEPGLDVDLEQLEQESCLDGWE 0"
## [1] "SSAWRNNSVPLRLRDGREVPHSCSRYRLATIANFSALGLEPGRDVDLGQLEQESCLDGWE 0"
## [1] "SSAWRNNSVPLRLRDGREVPHSCSRYRLATIANFSALGLEPGRDVDLGQLEQESCLDGWE 0"
## [1] "SSAWRNNSIPLRLRDGREVLHSCRRYRLATIANFSALGLEPGRDVDLGQLEQESCLDGWE 0"
## [1] "SSAWRNNSVPLRLRDGREVLHSCRRYRLATIANFSALGLEPGRDVDLGQLEQESCLDGWE 0"
## [1] "SSAWRNHSIPVRLQDGREVPQSCRRYRLATIVNFSALGLEPGRDVNLEQLEQEGCLDGWE 0"
## [1] "SSAWRNHSIPMRLQDGREVPQSCRRYRLATIVNFSALGLEPGRDVDLEQLEQEGCLDGWE 0"
## [1] "SRAWRNHSIPLRLQDGREVPHSCRRYRLAAIANFSALGLEPGRDVDLEQLEQESCLDGWE 0"
## [1] "SEEWRNASIPLELRGGVTEPSRCRRYRLAVLANLSALGLRPGSDVQLAELEQEPCLDGWE 0"
## [1] " "
## [1] "YDKDIFLSTIVTEWNLVCEDDWKTPLTTSLFFVGVLCGSFVSGQLSDRFGRKKVLFATMA 0"
## [1] "YSKDVFLSTIVTEWNLVCEDDWKTPLTTSLFFVGVLCGSFVSGQLSDRFGRKKVLFATMA 0"
## [1] "FSQDVYLSTVVTEWNLVCEDNWKVPLTTSLFFVGVLLGSFVSGQLSDRFGRKNVLFATMA 0"
## [1] "FSQDVYLFTVVTEWNLVCEDNWKVPLTTSLFFVGVLLGSFVSGQLSDRFGRKNVLFATMA 0"
## [1] "FSQDVYWSTVVTEWNLVCEDNWKVPLTTSLFFVGVLLGSFISGQLSDRFGRKNVLFATMA 0"
## [1] "FSQDVYWSTVVTEWNLVCEDNWKVPLTTSLFFVGVLLGSFVSGQLSDRFGRKNVLFATMA 0"
## [1] "FSQDIYMSTIVTEWSLVCEEDWKTPLTTSLFFVGVLLGSFVSGQLSDRFGRKTILFATMA 0"
## [1] "FSQDIYLSTIVTEWSLVCEEDWKTPLTTSLFFVGVLLGSFVSGQLSDRFGRKTILFATMA 0"
## [1] "FSQDIYQSTIVTEWSLVCEDDWKTPLTSSLFFVGVLLGSFVSGQLSDRFGRKNILFATMA 0"
## [1] "YSRDVYRSTIVSEWNLVCEDDWKVPLTTSLFFVGVLIGSFISGQLSDRFGRKSILFLTMA 0"
## [1] " "
## [1] "VQTGFSFVQIFSTNWEMFTVLFAIVGMGQISNYVVAFILGTEILSKSVRIIFSTLGVCTF 0"
## [1] "VQTGFSFVQIFSTNWEMFTVLFAIVGMGQISNYVVAFILGTEILSKSVRILFSTLGVCTF 0"
## [1] "VQTGFSFLQIFSISWEMFTVLFVIVGMGQISNYVVAFILGTEILGKSVRIIFSTLGVCTF 0"
## [1] "VQTGFSFLQIFSISWEMFTVLFVIVGMGQISNYVVAFILGTEILGKSVRIIFSTLGVCTF 0"
## [1] "VQTGFSFLQIFSISWEMFTVLFLIVGMGQISNYVVAFILGTEILGKSVRIIFSTLGVCTF 0"
## [1] "VQTGFSFLQIFSNSWEMFTVLFLIVGMGQISNYVVAFILGTEILGKSVRIIFSTLGVCTF 0"
## [1] "VQTGFSFLQVFSINWEMFTVLFVIVGMGQISNYVVAFILGTEILGKSVRIIFSTLGVCTF 0"
## [1] "VQTGFSFLQIFSINWEMFTVLFVIVGMGQISNYVVAFILGAEILGKSVRVIFSTLGVCTF 0"
## [1] "VQTGFSFLQIFSINWEMFTVLFVIVGMGQISNYVVAFILGTEILGKSVRIIFSTLGVCTF 0"
## [1] "VQTGFSFLQIFSTSWEMFTVLFLIVGMGQISNYVVAFILGTEILGKSVRILFSTLGVCVF 0"
## [1] " "
## [1] "FAIGYMVLPLFAYFIRDWRMLLLALTLPGLFCVPLWWFIPESPRWLISQRRFAEAEQIIQ 0"
## [1] "FAIGYMVLPLFAYFIRDWRMLLLALTLPGLFCVPLWWFIPESPRWLISQRRFEEAEQIIQ 0"
## [1] "FAVGYMLLPLFAYFIRDWRMLLLALTVPGVLCVPLWWFIPESPRWLISQRRFREAEDIIQ 0"
## [1] "FAVGYMLLPLFAYFIRDWRMLLLALTVPGVLCVPLWWFIPESPRWLISQRRFREAEDIIQ 0"
## [1] "FAVGYMLLPLFAYFIRDWRMLLLALTVPGVLCVPLWWFIPESPRWLISQRRFREAEDIIQ 0"
## [1] "FAVGYMLLPLFAYFIRDWRMLLLALTVPGVLCVPLWWFIPESPRWLISQRRFREAEDIIQ 0"
## [1] "FAIGYMLLPLFAYFIRDWRMLLLALTVPGVLCVPLWWFIPESPRWLISQRRFKEAEDIIQ 0"
## [1] "FAVGYMLLPLFAYFIRDWRMLLLALTVPGLLCVPLWWFIPESPRWLLSQRRFKEAEDIIQ 0"
## [1] "FAVGYMLLPLFAYFIRDWRMLLLALTLPGVLCIPLWWFIPESPRWLISQRRFREAENIIQ 0"
## [1] "FAIGYMLLPLFAYFIRDWRMLLLALTLPGLLCVPLWWIIPESPRWLISQGRYKEAEVIIR 0"
## [1] " "
## [1] "KAAKMNSIVAPAGIFDPLELQELNSLKQQKVIILDLFRTRNIATITVMAVMLWMLTSVGY 0"
## [1] "KAAKMNGIMAPAVIFDPLELQELNSLKQQKVFILDLFKTRNIATITVMSVMLWMLTSVGY 0"
## [1] "KAAKMNNIAVPAVIFDSVEELN--PLKQQKAFILDLFRTRNIAIMTIMSLLLWMLTSVGY 0"
## [1] "KAAKMNNIAVPAVIFDSVEELN--PLKQQKAFILDLFRTRNIAIMTIMSLLLWMLTSVGY 0"
## [1] "KAAKMNNIAVPAVIFDSVEELN--PLKQQKAFILDLFRTWNIAIMTIMSLLLWMLTSVGY 0"
## [1] "KAAKMNNIAVPAVIFDSVEELN--PLKQQKAFILDLFRTWNIAIMTIMSLLLWMLTSVGY 0"
## [1] "KAAKINNVTAPVVVFDPVEQQEQLPLKQQKVFILDLFRTRNIATITIMSLLLWMLTSVGY 0"
## [1] "KAAKINNVTAPVVVFDPVEQQEQLPLKQQKVFILDLFRTRNIATITIMSLLLWMLTSVGY 0"
## [1] "KAAKMNNIAAPVVIFDPMELQELKLQSQQKVFILDLFRTRNIATITIMSLLLWMLTSVGY 0"
## [1] "KAAKINNTPAPAMLFDAAEMQDSKPQQQQKAILLDLFRSRNIATITIMSLLLWFFTSVGY 0"
## [1] " "
## [1] "FALSLNVPNLHGDVYLNCFLSGLIEVPAYFTAWLLLRTLPRRYIIAGVLFWGGGVLLLIQ 0"
## [1] "FALSLNVPNLHGDVYLNCFLSGLIEVPAYFTAWLLLRTLPRRYIIAGVLFWGGGVLLLVQ 0"
## [1] "FALSLDAPNLHGDAYLNCFLSALIEIPAYITAWLLLRTLPRRYIIAAVLFWGGGVLLFIQ 0"
## [1] "FALSLDAPNLHGDVYLNCFLSALIEIPAYITAWLLLRTLPRRYIIAAVLFWGGGVLLFIQ 0"
## [1] "FALSLDTPNLHGDAYLNCFLSALIEIPAYITAWLLLRTLPRRYIIAAVLFWGGGVLLFIQ 0"
## [1] "FALSLDTPNLHGDAYLNCFLSALIEIPAYITAWLLLRTLPRRYIIAAVLFWGG------- 0"
## [1] "FALSLNTPNLHGDPYLNCFFSALIEVPAYIAAWLLLRTLPRRYVIAGVLFFGGSVLLLIQ 0"
## [1] "FALSLNTPNLHGDPYLNCFFSALIEVPAYIAAWLLLRTLPRRYIIAGVLFFGGSVLLLIQ 0"
## [1] "FALSLNAPNLHGDAYLNCFLSALIEVPAYITAWLLLRTLPRRYIIAGVLFLGG------- 0"
## [1] "FGLSLNTPNLHGDAYINCFLSAVIEVPAYIIAWLLLRSLPRRYSISGTLVLGGGVVLFIN 0"
## [1] " "
## [1] "VVPEDYNFVSIGLVMLGKFGITSAFSMLYVFTAELYPTLVRNMAVGITSMASRVGSIIAP 0"
## [1] "VVPEDYNFVSIGLVMLGKFGVTSAFSMLYVFTAELYPTLVRNMAVGITSMASRVGSIIAP 0"
## [1] "LVPVDYYFLSIGLVMLGKFGITSAFSMLYVFTAELYPTLVRNMAVGVTSTASRVGSIIAP 0"
## [1] "LVPVDYYFLSIGLVMLGKFGITSAFSMLYVFTAELYPTLVRNMAVGVTSTASRVGSIIAP 0"
## [1] "LVPVDYYFLSIGLVMLGKFGITSAFSMLYVFTAELYPTLVRNMAVGVTSMASRVGSIIAP 0"
## [1] "----DYYFLSIGLVMLGKFGITSAFSMLYVFTAELYPTLVRNMAVGVTSMASRVGSIIAP 0"
## [1] "LVPAGYNFLSIGLAMLGKFGVTSAFAMLYVFTAELYPTLVRNMAVGVASMASRVGSIIAP 0"
## [1] "LVPAGYNFLSIGLAMLGKFGITSAFAMLYVFTAELYPTLVRNMAVGVGSMASRVGSIIAP 0"
## [1] "----DYNFLSIGLVMLGKFGITSAFSMLYVFTAELYPTLVRNMAVGVTSMASRVGSIIAP 0"
## [1] "LVPTDLNVLSVCLVMLGKFGITAAFSMLYVYNVELYPTLVRNMAVGATSTASRLGSIIAP 0"
## [1] " "
## [1] "YFVYLGAYNRLLPYILMGSLTVLIGIITLFFPESFGVTLPENLEQMQKVRGFRCGKKSTV 0"
## [1] "YFVYLGAYNRLLPYILMGSLTVLIGIITLFFPESFGVTLPENLEQMQKVRGFRCGKKSTV 0"
## [1] "YFVYLGAYNRMLPYIVMGSLTVLIGILTLFFPESLGMTLPETLEQMQKVKWFRSGKKTRD 0"
## [1] "YFVYLGAYNRMLPYIIMGSLTVLIGILTLFFPESLGITLPETLEQMQKVKWFRCGKKTRD 0"
## [1] "YFVYLGAYNRMLPYIVMGSLTVLIGILTLFFPESLGMTLPETLEQMQKVKWFRSGKKTRD 0"
## [1] "YFVYLGAYNRMLPYIVMGSLTVLIGILTLFFPESLGMTLPETLEQMQKVKWFRSGKKTRD 0"
## [1] "YFVYLGAYNRVLPYILMGSLTVLIGIITLFFPESFGRTLPETLEQMQKVKGFRYGKKTKG 0"
## [1] "YFVYLGAYNRVLPYILMGSLTVLIGIITLFFPESFGRTLPETLEQMQKVKGFRYGKKTKG 0"
## [1] "YFVYLGAYNRVLPYIVMGSLTVFIGIITLFFPESFGMTLPETLEQMQKVKGFRFRKRTRD 0"
## [1] "YFVYLGAYDRYLPYIVMGSLTVLIGILTLFLPESYGSALPESFEQMMKVKCFRNGQQSTG 0"
## [1] " "
## [1] "SVDREESPKVLITAF 45"
## [1] "SMDREENPKVLITAF 45"
## [1] "SMETEENPKVLITAF 45"
## [1] "SMETEENPKVLITAF 45"
## [1] "SMETEKNPKVLITAF 45"
## [1] "SMETEENPKVLITAF 45"
## [1] "SMEKEENSKVIITSF 45"
## [1] "SMEKEENPKVIITSF 45"
## [1] "SVEKEENPKVLITSF 45"
## [1] "IRNSKESPKTLITTL 45"
## [1] " "
##Finished MSA output msa using ggmsa function
ggmsa(slc22a4_msa, start=350, end=450)
##Distance Matrix
slc22a4_subset_dist <- seqinr::dist.alignment(slc22a4_seqinr, matrix="identity")
is(slc22a4_subset_dist)
## [1] "dist" "oldClass"
class(slc22a4_subset_dist)
## [1] "dist"
slc22a4_align_seqinr_rnd <- round(slc22a4_subset_dist, 3)
slc22a4_align_seqinr_rnd
## NP_062661.1 NP_071606.1 NP_003050.2 XP_001163062.1
## NP_071606.1 0.185
## NP_003050.2 0.393 0.386
## XP_001163062.1 0.390 0.383 0.095
## XP_017727518.1 0.402 0.395 0.148 0.176
## XP_011945580.1 0.394 0.385 0.129 0.161
## NP_001193918.1 0.397 0.387 0.361 0.364
## XP_013820784.2 0.387 0.383 0.351 0.354
## XP_005626572.1 0.370 0.370 0.310 0.322
## NP_001139603.1 0.498 0.498 0.480 0.486
## XP_017727518.1 XP_011945580.1 NP_001193918.1 XP_013820784.2
## NP_071606.1
## NP_003050.2
## XP_001163062.1
## XP_017727518.1
## XP_011945580.1 0.096
## NP_001193918.1 0.369 0.363
## XP_013820784.2 0.361 0.355 0.159
## XP_005626572.1 0.325 0.322 0.319 0.310
## NP_001139603.1 0.480 0.481 0.494 0.492
## XP_005626572.1
## NP_071606.1
## NP_003050.2
## XP_001163062.1
## XP_017727518.1
## XP_011945580.1
## NP_001193918.1
## XP_013820784.2
## XP_005626572.1
## NP_001139603.1 0.478
##Phylogenetic Tree
Use created distance matrix to create a phylogenetic tree using neighbor joining algorithm in ape package
slc22a4_tree <- ape::nj(slc22a4_align_seqinr_rnd)
#Plot Created Tree
plot.phylo(slc22a4_tree, main="SLC22A4 Phylogenetic Tree\n", use.edge.length = F)
mtext("\n\nSLC22A4 Tree - Rooted, No Branch Lengths")