SLC22A4-Solute Carrier Family 22 Member 4

The encoded protein is an organic cation transporter and plasma integral membrane protein containing eleven putative transmembrane domains as well as a nucleotide-binding site motif. Transport by this protein is partially ATP-dependent.

Resources/References -Refseq Gene:https://www.ncbi.nlm.nih.gov/nuccore/1523761482?report=graph -Refseq Homologene:https://www.ncbi.nlm.nih.gov/homologene?LinkName=gene_homologene&from_uid=6583 -Uniprot:https://www.uniprot.org/uniprot/Q9H015 -Pfam:http://pfam.xfam.org/protein/Q9H015 -Alphafold:https://alphafold.ebi.ac.uk/entry/Q9H015

#Required Packages:

#install.packages("BiocManager")
#BiocManager::install("drawProteins")
#install.packages("HGNChelper")

#Load Necessary Packages

#github
library(compbio4all)
library(ggmsa)
## Registered S3 methods overwritten by 'ggalt':
##   method                  from   
##   grid.draw.absoluteGrob  ggplot2
##   grobHeight.absoluteGrob ggplot2
##   grobWidth.absoluteGrob  ggplot2
##   grobX.absoluteGrob      ggplot2
##   grobY.absoluteGrob      ggplot2
#CRAN
library(rentrez)

library(seqinr)

library(ape)
## 
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
## 
##     as.alignment, consensus
library(pander)

#Bioconductor
library(msa)
## Loading required package: Biostrings
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
## 
##     windows
## Loading required package: XVector
## Loading required package: GenomeInfoDb
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
## 
##     complement
## The following object is masked from 'package:seqinr':
## 
##     translate
## The following object is masked from 'package:base':
## 
##     strsplit
library(drawProteins)

## Biostrings
library(Biostrings)

library(HGNChelper)
## Warning: package 'HGNChelper' was built under R version 4.1.2

##Accession Numbers

Accession numbers for SLC22A4 were obtained from Refseq, Refseq Homologene, Uniprot and PDB. Uniprot accession numbers found by searching for gene name. PDB accessions found by entering Uniprot accession number or SLC22A4 gene name

A protein BLAST search was conducted to determine if SLC22A4 is expressed outside of vertebrae and it was determined that it is not expressed in non-vertebrae. A second protein BLAST was conducted to determine if SLC22A4 is expressed outside mammals. Link to SLC22A4 Protein BLAST search: https://blast.ncbi.nlm.nih.gov/Blast.cgi

HGNChelper::checkGeneSymbols(x = c("SLC22A4"))
## Maps last updated on: Thu Oct 24 12:31:05 2019
##         x Approved Suggested.Symbol
## 1 SLC22A4     TRUE          SLC22A4

##Creating Accession Number Table

Creating a table with all the accession numbers from various databases for SLC22A4

No Refseq or Uniprot Accession found for H. Neanderthalus

slc22a4_table <- c("NP_003050.2",    "Q9H015",     "NA", "Homo sapiens",            "Human",                "SLC22A4",
                 "XP_001163062.1", "H2QRG3",     "NA", "Pan troglodytes",         "Chimpanzee",           "SLC22A4",
                 "NP_062661.1",    "Q9Z306",     "NA", "Mus musculus",            
"House mouse",          "Slc22a4",
                 "NP_001193918.1",  "E1BEU0",     "NA", "Bos taurus",          
"Cattle",               "SLC22A4",
                 "XP_005626572.1", "F6UW69",     "NA", "Canis lupus",      
"Wolf", "SLC22A4",
                 "XP_017727518.1", "A0A2K6MNT9",     "NA", "Rhinopithecus bieti",       "Black-and-white snub-nosed monkey",        "SLC22A4",
                 "NP_071606.1",    "Q9R141",     "NA", "Rattus norvegicus",       
"Brown rat",           "    Slc22a4",
                 "NP_001139603.1", "A0A1D5PH68",         "NA", "Gallus gallus",         
"Chicken",                "SLC22A4", 
                 "XP_013820784.2", "A0A452FJ23", "NA", "Capra hircus",   
"Goat",    "    SLC22A4",
                 "XP_011945580.1", "A0A2K5N3M5",         "NA", "Cercocebus atys", 
"Sooty mangabey",        "SLC22A4")

#Creating table
slc22a4_table <- matrix(slc22a4_table, ncol=6, byrow=TRUE)
dim(slc22a4_table)
## [1] 10  6
#Converting to dataframe
slc22a4_table <- data.frame(slc22a4_table)
#Dataframe info
is(slc22a4_table)
## [1] "data.frame"       "list"             "oldClass"         "vector"          
## [5] "list_OR_List"     "vector_OR_Vector" "vector_OR_factor"
#Adding column names to table
colnames(slc22a4_table)
## [1] "X1" "X2" "X3" "X4" "X5" "X6"
colnames(slc22a4_table) <- c("ncbi.protein.accession", "UniProt.id", "PDB", "species", "common.name", "gene.name")
#Displaying table using pander
pander::pander(slc22a4_table)
Table continues below
ncbi.protein.accession UniProt.id PDB species
NP_003050.2 Q9H015 NA Homo sapiens
XP_001163062.1 H2QRG3 NA Pan troglodytes
NP_062661.1 Q9Z306 NA Mus musculus
NP_001193918.1 E1BEU0 NA Bos taurus
XP_005626572.1 F6UW69 NA Canis lupus
XP_017727518.1 A0A2K6MNT9 NA Rhinopithecus bieti
NP_071606.1 Q9R141 NA Rattus norvegicus
NP_001139603.1 A0A1D5PH68 NA Gallus gallus
XP_013820784.2 A0A452FJ23 NA Capra hircus
XP_011945580.1 A0A2K5N3M5 NA Cercocebus atys
common.name gene.name
Human SLC22A4
Chimpanzee SLC22A4
House mouse Slc22a4
Cattle SLC22A4
Wolf SLC22A4
Black-and-white snub-nosed monkey SLC22A4
Brown rat Slc22a4
Chicken SLC22A4
Goat SLC22A4
Sooty mangabey SLC22A4

##Data Preparation #Downloading Protein Sequences

We are downloading the sequences using the wrapper compbio4all::entrez_fetch_list()

The wrapper uses the rentrez::entrez_fetch() function to access NCBI databases for sequence information

# Downloading and prepping sequences
slc22a4_accessions <- slc22a4_table[,1] # Protein Accession column
slc22a4_list <- entrez_fetch_list(db="protein", id=slc22a4_accessions, rettype="fasta")

#Number of FASTA files downloaded from NCBI

length(slc22a4_list)
## [1] 10

Human slc22a4 FASTA file output

slc22a4_list[[1]]
## [1] ">NP_003050.2 solute carrier family 22 member 4 [Homo sapiens]\nMRDYDEVIAFLGEWGPFQRLIFFLLSASIIPNGFNGMSVVFLAGTPEHRCRVPDAANLSSAWRNNSVPLR\nLRDGREVPHSCSRYRLATIANFSALGLEPGRDVDLGQLEQESCLDGWEFSQDVYLSTVVTEWNLVCEDNW\nKVPLTTSLFFVGVLLGSFVSGQLSDRFGRKNVLFATMAVQTGFSFLQIFSISWEMFTVLFVIVGMGQISN\nYVVAFILGTEILGKSVRIIFSTLGVCTFFAVGYMLLPLFAYFIRDWRMLLLALTVPGVLCVPLWWFIPES\nPRWLISQRRFREAEDIIQKAAKMNNIAVPAVIFDSVEELNPLKQQKAFILDLFRTRNIAIMTIMSLLLWM\nLTSVGYFALSLDAPNLHGDAYLNCFLSALIEIPAYITAWLLLRTLPRRYIIAAVLFWGGGVLLFIQLVPV\nDYYFLSIGLVMLGKFGITSAFSMLYVFTAELYPTLVRNMAVGVTSTASRVGSIIAPYFVYLGAYNRMLPY\nIVMGSLTVLIGILTLFFPESLGMTLPETLEQMQKVKWFRSGKKTRDSMETEENPKVLITAF\n\n"

##Formatting/Cleaning Fasta Files

for(i in 1:length(slc22a4_list)){
  slc22a4_list[[i]] <- compbio4all::fasta_cleaner(slc22a4_list[[i]], parse = F)
}

##General Protein Information #Protein Diagram Drawing a protein diagram

uniprot_id <- slc22a4_table[1, 2]# uniprot ids

json <- drawProteins::get_features(uniprot_id)
## [1] "Download has worked"
#drawing protein diagram
prot_df <- drawProteins::feature_to_dataframe(json)
canvas <- draw_canvas(prot_df)
canvas <- draw_chains(canvas, prot_df, label_size = 2.5)
canvas <- draw_regions(canvas, prot_df)
canvas <- draw_recept_dom(canvas, prot_df)
canvas <- draw_folding(canvas, prot_df)
canvas <- draw_repeat(canvas, prot_df)
canvas <- draw_phospho(canvas, prot_df)
show(canvas)

homo_sapiens <- slc22a4_table[1, 1]
homo_sapiens <- entrez_fetch(id=homo_sapiens, db="protein", rettype = "fasta")
homo_sapiens <- fasta_cleaner(homo_sapiens)

par(mfrow = c(2,2), 
    mar = c(0,0,4,2))

#dotplot 1: Default
dotPlot(homo_sapiens, homo_sapiens, 
        wsize = 1, 
        nmatch = 1,
        xlab="homo_sapiens", ylab="homo_sapiens",
        main = "Default")

#dotplot 2: size = 10, nmatch = 1
dotPlot(homo_sapiens, homo_sapiens, 
        wsize = 10, 
        nmatch = 1,
        xlab="homo_sapiens", ylab="homo_sapiens",
        main = "Window Size: 10, nmatch:1")

#dotplot 3: size = 10, nmatch = 5
dotPlot(homo_sapiens, homo_sapiens, 
        wsize = 10, 
        nmatch = 5,
        xlab="homo_sapiens", ylab="homo_sapiens",
        main = "Window Size: 10, nmatch: 5")

#dotplot 4: size = 20, nmatch = 5
dotPlot(homo_sapiens, homo_sapiens, 
        wsize = 20,
        nmatch = 5,
        xlab="homo_sapiens", ylab="homo_sapiens",
        main = "Window Size: 20, nmatch:5")

#reset par() - run this or other plots will be small!
par(mfrow = c(1,1), 
    mar = c(4,4,4,4))

#best version of plot
dotPlot(homo_sapiens, homo_sapiens, 
        wsize = 1, 
        nmatch = 1, 
        xlab="homo_sapiens", ylab="homo_sapiens",
        main = "Best version of plot")

##Protein Properties Compiled from Databases

sources <- c("PFAM", "Uniprot", "AlphaFold", "RepeatsDB")

properties <- c("There is a sugar (and other) transporter family domain in the protein expressed by the slc22a4 gene.",
                "SLC22A4 encodes an organic cation transporter and plasma integral protein. As such it is located within the cytoplasm of the cell",
                "Alphafold database did not provide specific information on the secondary structure of the protein enconded by SLC22A4, however judging from the 3-D model there are quite a few alpha-helices present",
                "NA")

links <- c("http://pfam.xfam.org/protein/Q9H015,
           https://www.uniprot.org/uniprot/Q9H015#family_and_domains,
           https://alphafold.ebi.ac.uk/entry/Q9H015",
           "NA")


#converting protein info to table
protein_table <- data.frame(sources, properties, links)
colnames(protein_table) <- c("Source", "Protein Property", "Link")

pander::pander(protein_table)
Table continues below
Source Protein Property
PFAM There is a sugar (and other) transporter family domain in the protein expressed by the slc22a4 gene.
Uniprot SLC22A4 encodes an organic cation transporter and plasma integral protein. As such it is located within the cytoplasm of the cell
AlphaFold Alphafold database did not provide specific information on the secondary structure of the protein enconded by SLC22A4, however judging from the 3-D model there are quite a few alpha-helices present
RepeatsDB NA
Link
http://pfam.xfam.org/protein/Q9H015, https://www.uniprot.org/uniprot/Q9H015#family_and_domains, https://alphafold.ebi.ac.uk/entry/Q9H015
NA
http://pfam.xfam.org/protein/Q9H015, https://www.uniprot.org/uniprot/Q9H015#family_and_domains, https://alphafold.ebi.ac.uk/entry/Q9H015
NA

##Secondary Structure Predictions Using multivariate statistical methods, information regarding protein structure/location was confirmed in the line database

#Predicting Protein Folds Judging by Alphafold database prediction of SLC22A4 3-D structure, there is a presence of alpha-helices. I predict machine-learning methods will output an “alpha” protein fold class

aa_codes <- c("A", "R", "N", "D", "C", "Q", "E", "G", "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V")

# alpha proteins
alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91, 
           221, 249, 48, 123, 82, 122, 119, 33, 63, 167)

# check against chou's total
sum(alpha) == 2447
## [1] TRUE
# beta proteins
beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120, 
          177, 115, 16, 85, 127, 341, 253, 44, 110, 229)

# check against chou's total
sum(beta) == 2776
## [1] TRUE
# alpha + beta
a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
              110, 112, 25, 52, 71, 126, 117, 30, 108, 123)
sum(a.plus.b) == 1889
## [1] TRUE
# alpha/beta
a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239, 
             339, 321, 91, 158, 188, 327, 238, 72, 130, 378)
sum(a.div.b) == 4333
## [1] TRUE
comp_table <- data.frame(aa_codes, alpha, beta, a.plus.b, a.div.b)

pander::pander(comp_table)
aa_codes alpha beta a.plus.b a.div.b
A 285 203 175 361
R 53 67 78 146
N 97 139 120 183
D 163 121 111 244
C 22 75 74 63
Q 67 122 74 114
E 134 86 86 257
G 197 297 171 377
H 111 49 33 107
I 91 120 93 239
L 221 177 110 339
K 249 115 112 321
M 48 16 25 91
F 123 85 52 158
P 82 127 71 188
S 122 341 126 327
T 119 253 117 238
W 33 44 30 72
Y 63 110 108 130
V 167 229 123 378
#calculate proportions for each of the four protein fold types

alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)

#dataframe
aa_prop <- data.frame(alpha.prop,
                      beta.prop,
                      a.plus.b.prop,
                      a.div.b)
round(aa_prop, 4)
##    alpha.prop beta.prop a.plus.b.prop a.div.b
## 1      0.1165    0.0731        0.0926  0.0833
## 2      0.0217    0.0241        0.0413  0.0337
## 3      0.0396    0.0501        0.0635  0.0422
## 4      0.0666    0.0436        0.0588  0.0563
## 5      0.0090    0.0270        0.0392  0.0145
## 6      0.0274    0.0439        0.0392  0.0263
## 7      0.0548    0.0310        0.0455  0.0593
## 8      0.0805    0.1070        0.0905  0.0870
## 9      0.0454    0.0177        0.0175  0.0247
## 10     0.0372    0.0432        0.0492  0.0552
## 11     0.0903    0.0638        0.0582  0.0782
## 12     0.1018    0.0414        0.0593  0.0741
## 13     0.0196    0.0058        0.0132  0.0210
## 14     0.0503    0.0306        0.0275  0.0365
## 15     0.0335    0.0457        0.0376  0.0434
## 16     0.0499    0.1228        0.0667  0.0755
## 17     0.0486    0.0911        0.0619  0.0549
## 18     0.0135    0.0159        0.0159  0.0166
## 19     0.0257    0.0396        0.0572  0.0300
## 20     0.0682    0.0825        0.0651  0.0872
#row labels
row.names(aa_prop) <- aa_codes

#output dataframe
pander::pander(aa_prop)
  alpha.prop beta.prop a.plus.b.prop a.div.b
A 0.1165 0.07313 0.09264 0.08331
R 0.02166 0.02414 0.04129 0.03369
N 0.03964 0.05007 0.06353 0.04223
D 0.06661 0.04359 0.05876 0.05631
C 0.008991 0.02702 0.03917 0.01454
Q 0.02738 0.04395 0.03917 0.02631
E 0.05476 0.03098 0.04553 0.05931
G 0.08051 0.107 0.09052 0.08701
H 0.04536 0.01765 0.01747 0.02469
I 0.03719 0.04323 0.04923 0.05516
L 0.09031 0.06376 0.05823 0.07824
K 0.1018 0.04143 0.05929 0.07408
M 0.01962 0.005764 0.01323 0.021
F 0.05027 0.03062 0.02753 0.03646
P 0.03351 0.04575 0.03759 0.04339
S 0.04986 0.1228 0.0667 0.07547
T 0.04863 0.09114 0.06194 0.05493
W 0.01349 0.01585 0.01588 0.01662
Y 0.02575 0.03963 0.05717 0.03
V 0.06825 0.08249 0.06511 0.08724

#Determine number of each amino acid in protein encoded by SLC22A4

#Function to convert a table into a vector
table_to_vector <- function(table_x){
  table_names <- attr(table_x, "dimnames")[[1]]
  table_vect <- as.vector(table_x)
  names(table_vect) <- table_names
  return(table_vect)
}

homo_sapiens_freq_table <- table(homo_sapiens)/length(homo_sapiens)
homo_sapiens_freq_vector <- table_to_vector(homo_sapiens_freq_table)

homo_sapiens_freq_vector
##           A           C           D           E           F           G 
## 0.067150635 0.012704174 0.032667877 0.045372051 0.076225045 0.067150635 
##           H           I           K           L           M           N 
## 0.005444646 0.068965517 0.023593466 0.136116152 0.034482759 0.034482759 
##           P           Q           R           S           T           V 
## 0.045372051 0.027223230 0.054446461 0.070780399 0.054446461 0.085299456 
##           W           Y 
## 0.025408348 0.032667877

Removing any unknown amino acid codes, U, from vector

aa_names <- names(homo_sapiens_freq_vector)

i.U <- which(aa_names == "U")
aa_names[i.U]
## character(0)
homo_sapiens_freq_vector[i.U]
## named numeric(0)
if(length(i.U) > 0) {
  homo_sapiens <- homo_sapiens[-i.U]
}

Adding frequency vector data to proportion table

aa_prop$slc22a4_human_aa_freq <- homo_sapiens_freq_vector
pander::pander(aa_prop)
  alpha.prop beta.prop a.plus.b.prop a.div.b slc22a4_human_aa_freq
A 0.1165 0.07313 0.09264 0.08331 0.06715
R 0.02166 0.02414 0.04129 0.03369 0.0127
N 0.03964 0.05007 0.06353 0.04223 0.03267
D 0.06661 0.04359 0.05876 0.05631 0.04537
C 0.008991 0.02702 0.03917 0.01454 0.07623
Q 0.02738 0.04395 0.03917 0.02631 0.06715
E 0.05476 0.03098 0.04553 0.05931 0.005445
G 0.08051 0.107 0.09052 0.08701 0.06897
H 0.04536 0.01765 0.01747 0.02469 0.02359
I 0.03719 0.04323 0.04923 0.05516 0.1361
L 0.09031 0.06376 0.05823 0.07824 0.03448
K 0.1018 0.04143 0.05929 0.07408 0.03448
M 0.01962 0.005764 0.01323 0.021 0.04537
F 0.05027 0.03062 0.02753 0.03646 0.02722
P 0.03351 0.04575 0.03759 0.04339 0.05445
S 0.04986 0.1228 0.0667 0.07547 0.07078
T 0.04863 0.09114 0.06194 0.05493 0.05445
W 0.01349 0.01585 0.01588 0.01662 0.0853
Y 0.02575 0.03963 0.05717 0.03 0.02541
V 0.06825 0.08249 0.06511 0.08724 0.03267

##Functions to calculate similarities

Correlation used in Chou and Zhange 1992

chou_cor <- function(x,y){
  numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)
}

Cosine similarity used in Higgs and Attwood (2005).

chou_cosine <- function(z.1, z.2){
  z.1.abs <- sqrt(sum(z.1^2))
  z.2.abs <- sqrt(sum(z.2^2))
  my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs)
  return(my.cosine)
}

Data exploration. Display outputs as scatterplots. None of these correlations are that great.

par(mfrow = c(2,2), mar = c(1,4,1,1))
plot(alpha.prop ~ slc22a4_human_aa_freq, data = aa_prop)
plot(beta.prop ~ slc22a4_human_aa_freq, data = aa_prop)
plot(a.plus.b.prop  ~ slc22a4_human_aa_freq, data = aa_prop)
plot(a.div.b ~ slc22a4_human_aa_freq, data = aa_prop)

par(mfrow = c(1,1), mar = c(1,1,1,1))

Calculate correlation between each column:

corr.alpha <- chou_cor(aa_prop[, 5], aa_prop[, 1])
corr.beta <- chou_cor(aa_prop[, 5], aa_prop[, 2])
corr.apb <- chou_cor(aa_prop[, 5], aa_prop[, 3])
corr.adb <- chou_cor(aa_prop[, 5], aa_prop[, 4])

Calculate cosine simularity between each column:

cos.alpha <- chou_cosine(aa_prop[, 5], aa_prop[, 1])
cos.beta <- chou_cosine(aa_prop[, 5], aa_prop[, 2])
cos.apb <- chou_cosine(aa_prop[, 5], aa_prop[, 3])
cos.adb <- chou_cosine(aa_prop[, 5], aa_prop[, 4])

Calculate distance. Note: we need to flip the dataframe on its side using a command called t()

aa_prop_flipped <- t(aa_prop)
round(aa_prop_flipped,2)
##                          A    R    N    D    C    Q    E    G    H    I    L
## alpha.prop            0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09
## beta.prop             0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06
## a.plus.b.prop         0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06
## a.div.b               0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08
## slc22a4_human_aa_freq 0.07 0.01 0.03 0.05 0.08 0.07 0.01 0.07 0.02 0.14 0.03
##                          K    M    F    P    S    T    W    Y    V
## alpha.prop            0.10 0.02 0.05 0.03 0.05 0.05 0.01 0.03 0.07
## beta.prop             0.04 0.01 0.03 0.05 0.12 0.09 0.02 0.04 0.08
## a.plus.b.prop         0.06 0.01 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## a.div.b               0.07 0.02 0.04 0.04 0.08 0.05 0.02 0.03 0.09
## slc22a4_human_aa_freq 0.03 0.05 0.03 0.05 0.07 0.05 0.09 0.03 0.03

Create distance matrix:

dist_matrix <- dist(aa_prop_flipped, method = "euclidean")

Individual Distances:

dist.alpha <- dist(aa_prop_flipped[c(1, 5),], method="euclidean")
dist.beta <- dist(aa_prop_flipped[c(2, 5),], method="euclidean")
dist.apb <- dist(aa_prop_flipped[c(3, 5),], method="euclidean")
dist.adb <- dist(aa_prop_flipped[c(4, 5),], method="euclidean")

# fold types
fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")

# data
corr.sim <- round(c(corr.alpha,corr.beta,corr.apb,corr.adb),5)
cosine.sim <- round(c(cos.alpha,cos.beta,cos.apb,cos.adb),5)
Euclidean.dist <- round(c(dist.alpha,dist.beta,dist.apb,dist.adb),5)

# summary
sim.sum <- c("","","most.sim","")
dist.sum <- c("","","min.dist","")

#create dataframe
df <- data.frame(fold.type,
           corr.sim ,
           cosine.sim ,
           Euclidean.dist ,
           sim.sum ,
           dist.sum )

Display output

pander::pander(df)
fold.type corr.sim cosine.sim Euclidean.dist sim.sum dist.sum
alpha 0.7151 0.7151 0.1952
beta 0.7911 0.7911 0.1683
alpha plus beta 0.8126 0.8126 0.1545 most.sim min.dist
alpha/beta 0.7814 0.7814 0.1679

##Percent Identity Comparisons (PID)

#Data Prep Convert FASTA records into single vector

names(slc22a4_list)
##  [1] "NP_003050.2"    "XP_001163062.1" "NP_062661.1"    "NP_001193918.1"
##  [5] "XP_005626572.1" "XP_017727518.1" "NP_071606.1"    "NP_001139603.1"
##  [9] "XP_013820784.2" "XP_011945580.1"
length(slc22a4_list)
## [1] 10

Convert list into vector

#create vector w/ NAs at each homolog position
slc22a4_vector <- rep(NA, length(slc22a4_list))

#populate vector with fasta sequences from slc22a4_list
for(i in 1:length(slc22a4_vector)){
  slc22a4_vector[i] <- slc22a4_list[[i]]
}

#label sequences with slc22a4 list names
names(slc22a4_vector) <- names(slc22a4_list)

##PID Table

Performing pairwise alignments comparing human sequence to chimp, mouse, and cattle respectively

human_chimp_alignment <- pairwiseAlignment(slc22a4_list[[1]], slc22a4_list[[2]])
human_mouse_alignment <- pairwiseAlignment(slc22a4_list[[1]], slc22a4_list[[3]])
human_cattle_alignment <- pairwiseAlignment(slc22a4_list[[1]], slc22a4_list[[4]])

#output alignments
human_chimp_alignment
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MRDYDEVIAFLGEWGPFQRLIFFLLSASIIPNGF...TLEQMQKVKWFRSGKKTRDSMETEENPKVLITAF
## subject: MRDYDEVIAFLGEWGPFQRLIFFLLSASIIPNGF...TLEQMQKVKWFRCGKKTRDSMETEENPKVLITAF
## score: 2318.619
human_mouse_alignment
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MRDYDEVIAFLGEWGPFQRLIFFLLSASIIPNGF...TLEQMQKVKWFRSGKKTRDSMETEENPKVLITAF
## subject: MRDYDEVIAFLGEWGPFQRLIFFLLSASIIPNGF...NLEQMQKVRGFRCGKKSTVSVDREESPKVLITAF
## score: 1488.824
human_cattle_alignment
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MRDYDEVIAFLGEWGPFQRLIFFLLSASIIPNGF...TLEQMQKVKWFRSGKKTRDSMETEENPKVLITAF
## subject: MRDYDEVTAFLGKWGPFQRLIFFLLSASIIPNGF...TLEQMQKVKGFRYGKKTKGSMEKEENSKVIITSF
## score: 1604.795

Aligning chimp, mouse, and cattle sequences to eachother

chimp_mouse_alignment <- pairwiseAlignment(slc22a4_list[[2]], slc22a4_list[[3]])
chimp_cattle_alignment <- pairwiseAlignment(slc22a4_list[[2]], slc22a4_list[[4]])
mouse_cattle_alignment <- pairwiseAlignment(slc22a4_list[[3]], slc22a4_list[[4]])

#output alignments
chimp_mouse_alignment
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MRDYDEVIAFLGEWGPFQRLIFFLLSASIIPNGF...TLEQMQKVKWFRCGKKTRDSMETEENPKVLITAF
## subject: MRDYDEVIAFLGEWGPFQRLIFFLLSASIIPNGF...NLEQMQKVRGFRCGKKSTVSVDREESPKVLITAF
## score: 1499.367
chimp_cattle_alignment
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MRDYDEVIAFLGEWGPFQRLIFFLLSASIIPNGF...TLEQMQKVKWFRCGKKTRDSMETEENPKVLITAF
## subject: MRDYDEVTAFLGKWGPFQRLIFFLLSASIIPNGF...TLEQMQKVKGFRYGKKTKGSMEKEENSKVIITSF
## score: 1594.253
mouse_cattle_alignment
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MRDYDEVIAFLGEWGPFQRLIFFLLSASIIPNGF...NLEQMQKVRGFRCGKKSTVSVDREESPKVLITAF
## subject: MRDYDEVTAFLGKWGPFQRLIFFLLSASIIPNGF...TLEQMQKVKGFRYGKKTKGSMEKEENSKVIITSF
## score: 1462.718

Calculating pid of alignments

Biostrings::pid(human_chimp_alignment)
## [1] 99.09256
Biostrings::pid(human_mouse_alignment)
## [1] 84.81013
Biostrings::pid(human_cattle_alignment)
## [1] 86.79928
Biostrings::pid(chimp_mouse_alignment)
## [1] 84.99096
Biostrings::pid(chimp_cattle_alignment)
## [1] 86.61844
Biostrings::pid(mouse_cattle_alignment)
## [1] 84.26763

Building matrix

#         human                 chimp   mouse   cattle

pids <- c(1,      NA,     NA,     NA,
          pid(human_chimp_alignment), 1, NA, NA,
          pid(human_mouse_alignment), pid(chimp_mouse_alignment), 1, NA,
        pid(human_cattle_alignment), pid(chimp_cattle_alignment), pid(mouse_cattle_alignment), 1)

pid_matrix <- matrix(pids, nrow=4, byrow=T)
row.names(pid_matrix) <- c("Homo", "Pan", "Mus", "Bos")
colnames(pid_matrix) <- c("Homo", "Pan", "Mus", "Bos")
pander::pander(pid_matrix)
  Homo Pan Mus Bos
Homo 1 NA NA NA
Pan 99.09 1 NA NA
Mus 84.81 84.99 1 NA
Bos 86.8 86.62 84.27 1

##PID Method Comparison

pid1 <- Biostrings::pid(human_chimp_alignment, type = "PID1")
pid2 <- Biostrings::pid(human_chimp_alignment, type = "PID2")
pid3 <- Biostrings::pid(human_chimp_alignment, type = "PID3")
pid4 <- Biostrings::pid(human_chimp_alignment, type = "PID4")

pid_methods_vector <- c(pid1, pid2, pid3, pid4)

pid_method <- c("PID1", "PID2", "PID3", "PID4")

denominator <- c("(aligned positions & internal gap positions)",
         "(aligned positions)",
         "(length of shorter sequence)",
         "(average length of sequences)")

pid_df <- data.frame(pid_method, pid_methods_vector, denominator)
colnames(pid_df) <- c("method", "PID", "denominator")

pander::pander(pid_df)
method PID denominator
PID1 99.09 (aligned positions & internal gap positions)
PID2 99.09 (aligned positions)
PID3 99.09 (length of shorter sequence)
PID4 99.09 (average length of sequences)

##Multiple Sequence Alignment (MSA) #MSA Data Prep

#convert vectors to stringset format
slc22a4_vector_ss <- Biostrings::AAStringSet(slc22a4_vector)

#Building MSA

slc22a4_msa <- msa(slc22a4_vector_ss, method = "ClustalW")
## use default substitution matrix

#Cleaning/Outputting MSA

class(slc22a4_msa)
## [1] "MsaAAMultipleAlignment"
## attr(,"package")
## [1] "msa"
is(slc22a4_msa)
## [1] "MsaAAMultipleAlignment" "AAMultipleAlignment"    "MsaMetaData"           
## [4] "MultipleAlignment"
slc22a4_msa
## CLUSTAL 2.1  
## 
## Call:
##    msa(slc22a4_vector_ss, method = "ClustalW")
## 
## MsaAAMultipleAlignment with 10 rows and 675 columns
##      aln                                                   names
##  [1] -------------------------...GFRCGKKSTVSVDREESPKVLITAF NP_062661.1
##  [2] -------------------------...GFRCGKKSTVSMDREENPKVLITAF NP_071606.1
##  [3] -------------------------...WFRSGKKTRDSMETEENPKVLITAF NP_003050.2
##  [4] -------------------------...WFRCGKKTRDSMETEENPKVLITAF XP_001163062.1
##  [5] MAEPSVEDPVRSLRAGGRAKHWAET...WFRSGKKTRDSMETEKNPKVLITAF XP_017727518.1
##  [6] -------------------------...WFRSGKKTRDSMETEENPKVLITAF XP_011945580.1
##  [7] -------------------------...GFRYGKKTKGSMEKEENSKVIITSF NP_001193918.1
##  [8] -------------------------...GFRYGKKTKGSMEKEENPKVIITSF XP_013820784.2
##  [9] -------------------------...GFRFRKRTRDSVEKEENPKVLITSF XP_005626572.1
## [10] -------------------------...CFRNGQQSTGIRNSKESPKTLITTL NP_001139603.1
##  Con -------------------------...GFR?GKKTRDSME?EENPKVLITAF Consensus
class(slc22a4_msa) <- "AAMultipleAlignment"
class(slc22a4_msa)
## [1] "AAMultipleAlignment"
#converting MSA to seqinr alignment format
slc22a4_seqinr <- msaConvert(slc22a4_msa, type = "seqinr::alignment")

print_msa(alignment = slc22a4_seqinr, chunksize = 60)
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "MAEPSVEDPVRSLRAGGRAKHWAETAGSPAPSQRPPLPSAGPGMGVWSQVYSGIKLGAEL 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] " "
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "PERSSALSLFPRTGPRLRAPISNSLPVLGNVLTSLGSPQLRDTVPRTLSSPVVASFGAAV 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] " "
## [1] "--MRDYDEVIAFLGEWGPFQRLIFFLLSASIIPNGFNGMSVVFLAGTPEHRCLVPDTVNL 0"
## [1] "--MRDYDEVIAFLGDWGPFQRLIFFLLSASIIPNGFNGMSVVFLAGTPEHRCLVPHTVNL 0"
## [1] "--MRDYDEVIAFLGEWGPFQRLIFFLLSASIIPNGFNGMSVVFLAGTPEHRCRVPDAANL 0"
## [1] "--MRDYDEVIAFLGEWGPFQRLIFFLLSASIIPNGFNGMSVVFLAGTPEHRCRVPDAANL 0"
## [1] "GSMRDYDEAIAFLGEWGLFQRLIFFLLSASIIPNGFNGMSVVFLAGTPEHRCRVPDAANL 0"
## [1] "--MRDYDEAIAFLGEWGPFQRLIFFLLSASIIPNGFNGMSVVFLAGTPEHRCRVPDAANL 0"
## [1] "--MRDYDEVTAFLGKWGPFQRLIFFLLSASIIPNGFNGMSVVFLAGTPEHHCRVPDTANL 0"
## [1] "--MRDYDEVTAFLGKWGPFQRLIFFLLSASIIPNGFNGMSVVFLAGTPEHRCRVPDTANL 0"
## [1] "--MRDYDEVTTFLGEWGPFQRLIFFLLSASIIPNGFNGMSVVFLAATPEHRCRVPDAANL 0"
## [1] "--MRDYEEVTAFLGKWGRFQRLVFFLLSASIIPNGFNGMSAVFLAGTPEHRCAVPPGANL 0"
## [1] " "
## [1] "SSSWRNHSIPLETKDGRQVPQSCRRYRLATIANFSAMGLEPGQDVDLEQLEQESCLDGWE 0"
## [1] "SSAWRNHSIPLETKDGRQVPQSCRRYRLATIANFSALGLEPGLDVDLEQLEQESCLDGWE 0"
## [1] "SSAWRNNSVPLRLRDGREVPHSCSRYRLATIANFSALGLEPGRDVDLGQLEQESCLDGWE 0"
## [1] "SSAWRNNSVPLRLRDGREVPHSCSRYRLATIANFSALGLEPGRDVDLGQLEQESCLDGWE 0"
## [1] "SSAWRNNSIPLRLRDGREVLHSCRRYRLATIANFSALGLEPGRDVDLGQLEQESCLDGWE 0"
## [1] "SSAWRNNSVPLRLRDGREVLHSCRRYRLATIANFSALGLEPGRDVDLGQLEQESCLDGWE 0"
## [1] "SSAWRNHSIPVRLQDGREVPQSCRRYRLATIVNFSALGLEPGRDVNLEQLEQEGCLDGWE 0"
## [1] "SSAWRNHSIPMRLQDGREVPQSCRRYRLATIVNFSALGLEPGRDVDLEQLEQEGCLDGWE 0"
## [1] "SRAWRNHSIPLRLQDGREVPHSCRRYRLAAIANFSALGLEPGRDVDLEQLEQESCLDGWE 0"
## [1] "SEEWRNASIPLELRGGVTEPSRCRRYRLAVLANLSALGLRPGSDVQLAELEQEPCLDGWE 0"
## [1] " "
## [1] "YDKDIFLSTIVTEWNLVCEDDWKTPLTTSLFFVGVLCGSFVSGQLSDRFGRKKVLFATMA 0"
## [1] "YSKDVFLSTIVTEWNLVCEDDWKTPLTTSLFFVGVLCGSFVSGQLSDRFGRKKVLFATMA 0"
## [1] "FSQDVYLSTVVTEWNLVCEDNWKVPLTTSLFFVGVLLGSFVSGQLSDRFGRKNVLFATMA 0"
## [1] "FSQDVYLFTVVTEWNLVCEDNWKVPLTTSLFFVGVLLGSFVSGQLSDRFGRKNVLFATMA 0"
## [1] "FSQDVYWSTVVTEWNLVCEDNWKVPLTTSLFFVGVLLGSFISGQLSDRFGRKNVLFATMA 0"
## [1] "FSQDVYWSTVVTEWNLVCEDNWKVPLTTSLFFVGVLLGSFVSGQLSDRFGRKNVLFATMA 0"
## [1] "FSQDIYMSTIVTEWSLVCEEDWKTPLTTSLFFVGVLLGSFVSGQLSDRFGRKTILFATMA 0"
## [1] "FSQDIYLSTIVTEWSLVCEEDWKTPLTTSLFFVGVLLGSFVSGQLSDRFGRKTILFATMA 0"
## [1] "FSQDIYQSTIVTEWSLVCEDDWKTPLTSSLFFVGVLLGSFVSGQLSDRFGRKNILFATMA 0"
## [1] "YSRDVYRSTIVSEWNLVCEDDWKVPLTTSLFFVGVLIGSFISGQLSDRFGRKSILFLTMA 0"
## [1] " "
## [1] "VQTGFSFVQIFSTNWEMFTVLFAIVGMGQISNYVVAFILGTEILSKSVRIIFSTLGVCTF 0"
## [1] "VQTGFSFVQIFSTNWEMFTVLFAIVGMGQISNYVVAFILGTEILSKSVRILFSTLGVCTF 0"
## [1] "VQTGFSFLQIFSISWEMFTVLFVIVGMGQISNYVVAFILGTEILGKSVRIIFSTLGVCTF 0"
## [1] "VQTGFSFLQIFSISWEMFTVLFVIVGMGQISNYVVAFILGTEILGKSVRIIFSTLGVCTF 0"
## [1] "VQTGFSFLQIFSISWEMFTVLFLIVGMGQISNYVVAFILGTEILGKSVRIIFSTLGVCTF 0"
## [1] "VQTGFSFLQIFSNSWEMFTVLFLIVGMGQISNYVVAFILGTEILGKSVRIIFSTLGVCTF 0"
## [1] "VQTGFSFLQVFSINWEMFTVLFVIVGMGQISNYVVAFILGTEILGKSVRIIFSTLGVCTF 0"
## [1] "VQTGFSFLQIFSINWEMFTVLFVIVGMGQISNYVVAFILGAEILGKSVRVIFSTLGVCTF 0"
## [1] "VQTGFSFLQIFSINWEMFTVLFVIVGMGQISNYVVAFILGTEILGKSVRIIFSTLGVCTF 0"
## [1] "VQTGFSFLQIFSTSWEMFTVLFLIVGMGQISNYVVAFILGTEILGKSVRILFSTLGVCVF 0"
## [1] " "
## [1] "FAIGYMVLPLFAYFIRDWRMLLLALTLPGLFCVPLWWFIPESPRWLISQRRFAEAEQIIQ 0"
## [1] "FAIGYMVLPLFAYFIRDWRMLLLALTLPGLFCVPLWWFIPESPRWLISQRRFEEAEQIIQ 0"
## [1] "FAVGYMLLPLFAYFIRDWRMLLLALTVPGVLCVPLWWFIPESPRWLISQRRFREAEDIIQ 0"
## [1] "FAVGYMLLPLFAYFIRDWRMLLLALTVPGVLCVPLWWFIPESPRWLISQRRFREAEDIIQ 0"
## [1] "FAVGYMLLPLFAYFIRDWRMLLLALTVPGVLCVPLWWFIPESPRWLISQRRFREAEDIIQ 0"
## [1] "FAVGYMLLPLFAYFIRDWRMLLLALTVPGVLCVPLWWFIPESPRWLISQRRFREAEDIIQ 0"
## [1] "FAIGYMLLPLFAYFIRDWRMLLLALTVPGVLCVPLWWFIPESPRWLISQRRFKEAEDIIQ 0"
## [1] "FAVGYMLLPLFAYFIRDWRMLLLALTVPGLLCVPLWWFIPESPRWLLSQRRFKEAEDIIQ 0"
## [1] "FAVGYMLLPLFAYFIRDWRMLLLALTLPGVLCIPLWWFIPESPRWLISQRRFREAENIIQ 0"
## [1] "FAIGYMLLPLFAYFIRDWRMLLLALTLPGLLCVPLWWIIPESPRWLISQGRYKEAEVIIR 0"
## [1] " "
## [1] "KAAKMNSIVAPAGIFDPLELQELNSLKQQKVIILDLFRTRNIATITVMAVMLWMLTSVGY 0"
## [1] "KAAKMNGIMAPAVIFDPLELQELNSLKQQKVFILDLFKTRNIATITVMSVMLWMLTSVGY 0"
## [1] "KAAKMNNIAVPAVIFDSVEELN--PLKQQKAFILDLFRTRNIAIMTIMSLLLWMLTSVGY 0"
## [1] "KAAKMNNIAVPAVIFDSVEELN--PLKQQKAFILDLFRTRNIAIMTIMSLLLWMLTSVGY 0"
## [1] "KAAKMNNIAVPAVIFDSVEELN--PLKQQKAFILDLFRTWNIAIMTIMSLLLWMLTSVGY 0"
## [1] "KAAKMNNIAVPAVIFDSVEELN--PLKQQKAFILDLFRTWNIAIMTIMSLLLWMLTSVGY 0"
## [1] "KAAKINNVTAPVVVFDPVEQQEQLPLKQQKVFILDLFRTRNIATITIMSLLLWMLTSVGY 0"
## [1] "KAAKINNVTAPVVVFDPVEQQEQLPLKQQKVFILDLFRTRNIATITIMSLLLWMLTSVGY 0"
## [1] "KAAKMNNIAAPVVIFDPMELQELKLQSQQKVFILDLFRTRNIATITIMSLLLWMLTSVGY 0"
## [1] "KAAKINNTPAPAMLFDAAEMQDSKPQQQQKAILLDLFRSRNIATITIMSLLLWFFTSVGY 0"
## [1] " "
## [1] "FALSLNVPNLHGDVYLNCFLSGLIEVPAYFTAWLLLRTLPRRYIIAGVLFWGGGVLLLIQ 0"
## [1] "FALSLNVPNLHGDVYLNCFLSGLIEVPAYFTAWLLLRTLPRRYIIAGVLFWGGGVLLLVQ 0"
## [1] "FALSLDAPNLHGDAYLNCFLSALIEIPAYITAWLLLRTLPRRYIIAAVLFWGGGVLLFIQ 0"
## [1] "FALSLDAPNLHGDVYLNCFLSALIEIPAYITAWLLLRTLPRRYIIAAVLFWGGGVLLFIQ 0"
## [1] "FALSLDTPNLHGDAYLNCFLSALIEIPAYITAWLLLRTLPRRYIIAAVLFWGGGVLLFIQ 0"
## [1] "FALSLDTPNLHGDAYLNCFLSALIEIPAYITAWLLLRTLPRRYIIAAVLFWGG------- 0"
## [1] "FALSLNTPNLHGDPYLNCFFSALIEVPAYIAAWLLLRTLPRRYVIAGVLFFGGSVLLLIQ 0"
## [1] "FALSLNTPNLHGDPYLNCFFSALIEVPAYIAAWLLLRTLPRRYIIAGVLFFGGSVLLLIQ 0"
## [1] "FALSLNAPNLHGDAYLNCFLSALIEVPAYITAWLLLRTLPRRYIIAGVLFLGG------- 0"
## [1] "FGLSLNTPNLHGDAYINCFLSAVIEVPAYIIAWLLLRSLPRRYSISGTLVLGGGVVLFIN 0"
## [1] " "
## [1] "VVPEDYNFVSIGLVMLGKFGITSAFSMLYVFTAELYPTLVRNMAVGITSMASRVGSIIAP 0"
## [1] "VVPEDYNFVSIGLVMLGKFGVTSAFSMLYVFTAELYPTLVRNMAVGITSMASRVGSIIAP 0"
## [1] "LVPVDYYFLSIGLVMLGKFGITSAFSMLYVFTAELYPTLVRNMAVGVTSTASRVGSIIAP 0"
## [1] "LVPVDYYFLSIGLVMLGKFGITSAFSMLYVFTAELYPTLVRNMAVGVTSTASRVGSIIAP 0"
## [1] "LVPVDYYFLSIGLVMLGKFGITSAFSMLYVFTAELYPTLVRNMAVGVTSMASRVGSIIAP 0"
## [1] "----DYYFLSIGLVMLGKFGITSAFSMLYVFTAELYPTLVRNMAVGVTSMASRVGSIIAP 0"
## [1] "LVPAGYNFLSIGLAMLGKFGVTSAFAMLYVFTAELYPTLVRNMAVGVASMASRVGSIIAP 0"
## [1] "LVPAGYNFLSIGLAMLGKFGITSAFAMLYVFTAELYPTLVRNMAVGVGSMASRVGSIIAP 0"
## [1] "----DYNFLSIGLVMLGKFGITSAFSMLYVFTAELYPTLVRNMAVGVTSMASRVGSIIAP 0"
## [1] "LVPTDLNVLSVCLVMLGKFGITAAFSMLYVYNVELYPTLVRNMAVGATSTASRLGSIIAP 0"
## [1] " "
## [1] "YFVYLGAYNRLLPYILMGSLTVLIGIITLFFPESFGVTLPENLEQMQKVRGFRCGKKSTV 0"
## [1] "YFVYLGAYNRLLPYILMGSLTVLIGIITLFFPESFGVTLPENLEQMQKVRGFRCGKKSTV 0"
## [1] "YFVYLGAYNRMLPYIVMGSLTVLIGILTLFFPESLGMTLPETLEQMQKVKWFRSGKKTRD 0"
## [1] "YFVYLGAYNRMLPYIIMGSLTVLIGILTLFFPESLGITLPETLEQMQKVKWFRCGKKTRD 0"
## [1] "YFVYLGAYNRMLPYIVMGSLTVLIGILTLFFPESLGMTLPETLEQMQKVKWFRSGKKTRD 0"
## [1] "YFVYLGAYNRMLPYIVMGSLTVLIGILTLFFPESLGMTLPETLEQMQKVKWFRSGKKTRD 0"
## [1] "YFVYLGAYNRVLPYILMGSLTVLIGIITLFFPESFGRTLPETLEQMQKVKGFRYGKKTKG 0"
## [1] "YFVYLGAYNRVLPYILMGSLTVLIGIITLFFPESFGRTLPETLEQMQKVKGFRYGKKTKG 0"
## [1] "YFVYLGAYNRVLPYIVMGSLTVFIGIITLFFPESFGMTLPETLEQMQKVKGFRFRKRTRD 0"
## [1] "YFVYLGAYDRYLPYIVMGSLTVLIGILTLFLPESYGSALPESFEQMMKVKCFRNGQQSTG 0"
## [1] " "
## [1] "SVDREESPKVLITAF 45"
## [1] "SMDREENPKVLITAF 45"
## [1] "SMETEENPKVLITAF 45"
## [1] "SMETEENPKVLITAF 45"
## [1] "SMETEKNPKVLITAF 45"
## [1] "SMETEENPKVLITAF 45"
## [1] "SMEKEENSKVIITSF 45"
## [1] "SMEKEENPKVIITSF 45"
## [1] "SVEKEENPKVLITSF 45"
## [1] "IRNSKESPKTLITTL 45"
## [1] " "

##Finished MSA output msa using ggmsa function

ggmsa(slc22a4_msa, start=350, end=450)

##Distance Matrix

slc22a4_subset_dist <- seqinr::dist.alignment(slc22a4_seqinr, matrix="identity")
is(slc22a4_subset_dist)
## [1] "dist"     "oldClass"
class(slc22a4_subset_dist)
## [1] "dist"
slc22a4_align_seqinr_rnd <- round(slc22a4_subset_dist, 3)
slc22a4_align_seqinr_rnd
##                NP_062661.1 NP_071606.1 NP_003050.2 XP_001163062.1
## NP_071606.1          0.185                                       
## NP_003050.2          0.393       0.386                           
## XP_001163062.1       0.390       0.383       0.095               
## XP_017727518.1       0.402       0.395       0.148          0.176
## XP_011945580.1       0.394       0.385       0.129          0.161
## NP_001193918.1       0.397       0.387       0.361          0.364
## XP_013820784.2       0.387       0.383       0.351          0.354
## XP_005626572.1       0.370       0.370       0.310          0.322
## NP_001139603.1       0.498       0.498       0.480          0.486
##                XP_017727518.1 XP_011945580.1 NP_001193918.1 XP_013820784.2
## NP_071606.1                                                               
## NP_003050.2                                                               
## XP_001163062.1                                                            
## XP_017727518.1                                                            
## XP_011945580.1          0.096                                             
## NP_001193918.1          0.369          0.363                              
## XP_013820784.2          0.361          0.355          0.159               
## XP_005626572.1          0.325          0.322          0.319          0.310
## NP_001139603.1          0.480          0.481          0.494          0.492
##                XP_005626572.1
## NP_071606.1                  
## NP_003050.2                  
## XP_001163062.1               
## XP_017727518.1               
## XP_011945580.1               
## NP_001193918.1               
## XP_013820784.2               
## XP_005626572.1               
## NP_001139603.1          0.478

##Phylogenetic Tree

Use created distance matrix to create a phylogenetic tree using neighbor joining algorithm in ape package

slc22a4_tree <- ape::nj(slc22a4_align_seqinr_rnd)

#Plot Created Tree

plot.phylo(slc22a4_tree, main="SLC22A4 Phylogenetic Tree\n", use.edge.length = F)

mtext("\n\nSLC22A4 Tree - Rooted, No Branch Lengths")