Final Portfolio

SLC22A4-Solute Carrier Family 22 Member 4

The encoded protein is an organic cation transporter and plasma integral membrane protein containing eleven putative transmembrane domains as well as a nucleotide-binding site motif. Transport by this protein is partially ATP-dependent.

Resources/References -Refseq Gene:https://www.ncbi.nlm.nih.gov/nuccore/1523761482?report=graph -Refseq Homologene:https://www.ncbi.nlm.nih.gov/homologene?LinkName=gene_homologene&from_uid=6583 -Uniprot:https://www.uniprot.org/uniprot/Q9H015 -Pfam:http://pfam.xfam.org/protein/Q9H015 -Alphafold:https://alphafold.ebi.ac.uk/entry/Q9H015

#Required Packages:

#install.packages("BiocManager")
#BiocManager::install("drawProteins")
#install.packages("HGNChelper")

#Load Necessary Packages

#github
library(compbio4all)
library(ggmsa)

## Registered S3 methods overwritten by 'ggalt':
##   method                  from   
##   grid.draw.absoluteGrob  ggplot2
##   grobHeight.absoluteGrob ggplot2
##   grobWidth.absoluteGrob  ggplot2
##   grobX.absoluteGrob      ggplot2
##   grobY.absoluteGrob      ggplot2

#CRAN
library(rentrez)

library(seqinr)

library(ape)

## 
## Attaching package: 'ape'

## The following objects are masked from 'package:seqinr':
## 
##     as.alignment, consensus

library(pander)

#Bioconductor
library(msa)

## Loading required package: Biostrings

## Loading required package: BiocGenerics

## Loading required package: parallel

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min

## Loading required package: S4Vectors

## Loading required package: stats4

## 
## Attaching package: 'S4Vectors'

## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname

## Loading required package: IRanges

## 
## Attaching package: 'IRanges'

## The following object is masked from 'package:grDevices':
## 
##     windows

## Loading required package: XVector

## Loading required package: GenomeInfoDb

## 
## Attaching package: 'Biostrings'

## The following object is masked from 'package:ape':
## 
##     complement

## The following object is masked from 'package:seqinr':
## 
##     translate

## The following object is masked from 'package:base':
## 
##     strsplit

library(drawProteins)

## Biostrings
library(Biostrings)

library(HGNChelper)

## Warning: package 'HGNChelper' was built under R version 4.1.2

##Accession Numbers

Accession numbers for SLC22A4 were obtained from Refseq, Refseq Homologene, Uniprot and PDB. Uniprot accession numbers found by searching for gene name. PDB accessions found by entering Uniprot accession number or SLC22A4 gene name

A protein BLAST search was conducted to determine if SLC22A4 is expressed outside of vertebrae and it was determined that it is not expressed in non-vertebrae. A second protein BLAST was conducted to determine if SLC22A4 is expressed outside mammals. Link to SLC22A4 Protein BLAST search: https://blast.ncbi.nlm.nih.gov/Blast.cgi

HGNChelper::checkGeneSymbols(x = c("SLC22A4"))

## Maps last updated on: Thu Oct 24 12:31:05 2019

##         x Approved Suggested.Symbol
## 1 SLC22A4     TRUE          SLC22A4

##Creating Accession Number Table

Creating a table with all the accession numbers from various databases for SLC22A4

No Refseq or Uniprot Accession found for H. Neanderthalus

slc22a4_table <- c("NP_003050.2",    "Q9H015",     "NA", "Homo sapiens",            "Human",                "SLC22A4",
                 "XP_001163062.1", "H2QRG3",     "NA", "Pan troglodytes",         "Chimpanzee",           "SLC22A4",
                 "NP_062661.1",    "Q9Z306",     "NA", "Mus musculus",            
"House mouse",          "Slc22a4",
                 "NP_001193918.1",  "E1BEU0",     "NA", "Bos taurus",          
"Cattle",               "SLC22A4",
                 "XP_005626572.1", "F6UW69",     "NA", "Canis lupus",      
"Wolf", "SLC22A4",
                 "XP_017727518.1", "A0A2K6MNT9",     "NA", "Rhinopithecus bieti",       "Black-and-white snub-nosed monkey",        "SLC22A4",
                 "NP_071606.1",    "Q9R141",     "NA", "Rattus norvegicus",       
"Brown rat",           "    Slc22a4",
                 "NP_001139603.1", "A0A1D5PH68",         "NA", "Gallus gallus",         
"Chicken",                "SLC22A4", 
                 "XP_013820784.2", "A0A452FJ23", "NA", "Capra hircus",   
"Goat",    "    SLC22A4",
                 "XP_011945580.1", "A0A2K5N3M5",         "NA", "Cercocebus atys", 
"Sooty mangabey",        "SLC22A4")

#Creating table
slc22a4_table <- matrix(slc22a4_table, ncol=6, byrow=TRUE)
dim(slc22a4_table)

## [1] 10  6

#Converting to dataframe
slc22a4_table <- data.frame(slc22a4_table)
#Dataframe info
is(slc22a4_table)

## [1] "data.frame"       "list"             "oldClass"         "vector"          
## [5] "list_OR_List"     "vector_OR_Vector" "vector_OR_factor"

#Adding column names to table
colnames(slc22a4_table)

## [1] "X1" "X2" "X3" "X4" "X5" "X6"

colnames(slc22a4_table) <- c("ncbi.protein.accession", "UniProt.id", "PDB", "species", "common.name", "gene.name")
#Displaying table using pander
pander::pander(slc22a4_table)

Table continues below
ncbi.protein.accession	UniProt.id	PDB	species
NP_003050.2	Q9H015	NA	Homo sapiens
XP_001163062.1	H2QRG3	NA	Pan troglodytes
NP_062661.1	Q9Z306	NA	Mus musculus
NP_001193918.1	E1BEU0	NA	Bos taurus
XP_005626572.1	F6UW69	NA	Canis lupus
XP_017727518.1	A0A2K6MNT9	NA	Rhinopithecus bieti
NP_071606.1	Q9R141	NA	Rattus norvegicus
NP_001139603.1	A0A1D5PH68	NA	Gallus gallus
XP_013820784.2	A0A452FJ23	NA	Capra hircus
XP_011945580.1	A0A2K5N3M5	NA	Cercocebus atys

common.name	gene.name
Human	SLC22A4
Chimpanzee	SLC22A4
House mouse	Slc22a4
Cattle	SLC22A4
Wolf	SLC22A4
Black-and-white snub-nosed monkey	SLC22A4
Brown rat	Slc22a4
Chicken	SLC22A4
Goat	SLC22A4
Sooty mangabey	SLC22A4

##Data Preparation #Downloading Protein Sequences

We are downloading the sequences using the wrapper compbio4all::entrez_fetch_list()

The wrapper uses the rentrez::entrez_fetch() function to access NCBI databases for sequence information

# Downloading and prepping sequences
slc22a4_accessions <- slc22a4_table[,1] # Protein Accession column
slc22a4_list <- entrez_fetch_list(db="protein", id=slc22a4_accessions, rettype="fasta")

#Number of FASTA files downloaded from NCBI

length(slc22a4_list)

## [1] 10

Human slc22a4 FASTA file output

slc22a4_list[[1]]

## [1] ">NP_003050.2 solute carrier family 22 member 4 [Homo sapiens]\nMRDYDEVIAFLGEWGPFQRLIFFLLSASIIPNGFNGMSVVFLAGTPEHRCRVPDAANLSSAWRNNSVPLR\nLRDGREVPHSCSRYRLATIANFSALGLEPGRDVDLGQLEQESCLDGWEFSQDVYLSTVVTEWNLVCEDNW\nKVPLTTSLFFVGVLLGSFVSGQLSDRFGRKNVLFATMAVQTGFSFLQIFSISWEMFTVLFVIVGMGQISN\nYVVAFILGTEILGKSVRIIFSTLGVCTFFAVGYMLLPLFAYFIRDWRMLLLALTVPGVLCVPLWWFIPES\nPRWLISQRRFREAEDIIQKAAKMNNIAVPAVIFDSVEELNPLKQQKAFILDLFRTRNIAIMTIMSLLLWM\nLTSVGYFALSLDAPNLHGDAYLNCFLSALIEIPAYITAWLLLRTLPRRYIIAAVLFWGGGVLLFIQLVPV\nDYYFLSIGLVMLGKFGITSAFSMLYVFTAELYPTLVRNMAVGVTSTASRVGSIIAPYFVYLGAYNRMLPY\nIVMGSLTVLIGILTLFFPESLGMTLPETLEQMQKVKWFRSGKKTRDSMETEENPKVLITAF\n\n"

##Formatting/Cleaning Fasta Files

for(i in 1:length(slc22a4_list)){
  slc22a4_list[[i]] <- compbio4all::fasta_cleaner(slc22a4_list[[i]], parse = F)
}

##General Protein Information #Protein Diagram Drawing a protein diagram

uniprot_id <- slc22a4_table[1, 2]# uniprot ids

json <- drawProteins::get_features(uniprot_id)

## [1] "Download has worked"

#drawing protein diagram
prot_df <- drawProteins::feature_to_dataframe(json)
canvas <- draw_canvas(prot_df)
canvas <- draw_chains(canvas, prot_df, label_size = 2.5)
canvas <- draw_regions(canvas, prot_df)
canvas <- draw_recept_dom(canvas, prot_df)
canvas <- draw_folding(canvas, prot_df)
canvas <- draw_repeat(canvas, prot_df)
canvas <- draw_phospho(canvas, prot_df)
show(canvas)

homo_sapiens <- slc22a4_table[1, 1]
homo_sapiens <- entrez_fetch(id=homo_sapiens, db="protein", rettype = "fasta")
homo_sapiens <- fasta_cleaner(homo_sapiens)

par(mfrow = c(2,2), 
    mar = c(0,0,4,2))

#dotplot 1: Default
dotPlot(homo_sapiens, homo_sapiens, 
        wsize = 1, 
        nmatch = 1,
        xlab="homo_sapiens", ylab="homo_sapiens",
        main = "Default")

#dotplot 2: size = 10, nmatch = 1
dotPlot(homo_sapiens, homo_sapiens, 
        wsize = 10, 
        nmatch = 1,
        xlab="homo_sapiens", ylab="homo_sapiens",
        main = "Window Size: 10, nmatch:1")

#dotplot 3: size = 10, nmatch = 5
dotPlot(homo_sapiens, homo_sapiens, 
        wsize = 10, 
        nmatch = 5,
        xlab="homo_sapiens", ylab="homo_sapiens",
        main = "Window Size: 10, nmatch: 5")

#dotplot 4: size = 20, nmatch = 5
dotPlot(homo_sapiens, homo_sapiens, 
        wsize = 20,
        nmatch = 5,
        xlab="homo_sapiens", ylab="homo_sapiens",
        main = "Window Size: 20, nmatch:5")

#reset par() - run this or other plots will be small!
par(mfrow = c(1,1), 
    mar = c(4,4,4,4))

#best version of plot
dotPlot(homo_sapiens, homo_sapiens, 
        wsize = 1, 
        nmatch = 1, 
        xlab="homo_sapiens", ylab="homo_sapiens",
        main = "Best version of plot")

##Protein Properties Compiled from Databases

sources <- c("PFAM", "Uniprot", "AlphaFold", "RepeatsDB")

properties <- c("There is a sugar (and other) transporter family domain in the protein expressed by the slc22a4 gene.",
                "SLC22A4 encodes an organic cation transporter and plasma integral protein. As such it is located within the cytoplasm of the cell",
                "Alphafold database did not provide specific information on the secondary structure of the protein enconded by SLC22A4, however judging from the 3-D model there are quite a few alpha-helices present",
                "NA")

links <- c("http://pfam.xfam.org/protein/Q9H015,
           https://www.uniprot.org/uniprot/Q9H015#family_and_domains,
           https://alphafold.ebi.ac.uk/entry/Q9H015",
           "NA")


#converting protein info to table
protein_table <- data.frame(sources, properties, links)
colnames(protein_table) <- c("Source", "Protein Property", "Link")

pander::pander(protein_table)

Table continues below
Source	Protein Property
PFAM	There is a sugar (and other) transporter family domain in the protein expressed by the slc22a4 gene.
Uniprot	SLC22A4 encodes an organic cation transporter and plasma integral protein. As such it is located within the cytoplasm of the cell
AlphaFold	Alphafold database did not provide specific information on the secondary structure of the protein enconded by SLC22A4, however judging from the 3-D model there are quite a few alpha-helices present
RepeatsDB	NA

Link
http://pfam.xfam.org/protein/Q9H015, https://www.uniprot.org/uniprot/Q9H015#family_and_domains, https://alphafold.ebi.ac.uk/entry/Q9H015
NA
http://pfam.xfam.org/protein/Q9H015, https://www.uniprot.org/uniprot/Q9H015#family_and_domains, https://alphafold.ebi.ac.uk/entry/Q9H015
NA

##Secondary Structure Predictions Using multivariate statistical methods, information regarding protein structure/location was confirmed in the line database

#Predicting Protein Folds Judging by Alphafold database prediction of SLC22A4 3-D structure, there is a presence of alpha-helices. I predict machine-learning methods will output an “alpha” protein fold class

aa_codes <- c("A", "R", "N", "D", "C", "Q", "E", "G", "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V")

# alpha proteins
alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91, 
           221, 249, 48, 123, 82, 122, 119, 33, 63, 167)

# check against chou's total
sum(alpha) == 2447

## [1] TRUE

# beta proteins
beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120, 
          177, 115, 16, 85, 127, 341, 253, 44, 110, 229)

# check against chou's total
sum(beta) == 2776

## [1] TRUE

# alpha + beta
a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
              110, 112, 25, 52, 71, 126, 117, 30, 108, 123)
sum(a.plus.b) == 1889

## [1] TRUE

# alpha/beta
a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239, 
             339, 321, 91, 158, 188, 327, 238, 72, 130, 378)
sum(a.div.b) == 4333

## [1] TRUE

comp_table <- data.frame(aa_codes, alpha, beta, a.plus.b, a.div.b)

pander::pander(comp_table)

aa_codes	alpha	beta	a.plus.b	a.div.b
A	285	203	175	361
R	53	67	78	146
N	97	139	120	183
D	163	121	111	244
C	22	75	74	63
Q	67	122	74	114
E	134	86	86	257
G	197	297	171	377
H	111	49	33	107
I	91	120	93	239
L	221	177	110	339
K	249	115	112	321
M	48	16	25	91
F	123	85	52	158
P	82	127	71	188
S	122	341	126	327
T	119	253	117	238
W	33	44	30	72
Y	63	110	108	130
V	167	229	123	378

#calculate proportions for each of the four protein fold types

alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)

#dataframe
aa_prop <- data.frame(alpha.prop,
                      beta.prop,
                      a.plus.b.prop,
                      a.div.b)
round(aa_prop, 4)

##    alpha.prop beta.prop a.plus.b.prop a.div.b
## 1      0.1165    0.0731        0.0926  0.0833
## 2      0.0217    0.0241        0.0413  0.0337
## 3      0.0396    0.0501        0.0635  0.0422
## 4      0.0666    0.0436        0.0588  0.0563
## 5      0.0090    0.0270        0.0392  0.0145
## 6      0.0274    0.0439        0.0392  0.0263
## 7      0.0548    0.0310        0.0455  0.0593
## 8      0.0805    0.1070        0.0905  0.0870
## 9      0.0454    0.0177        0.0175  0.0247
## 10     0.0372    0.0432        0.0492  0.0552
## 11     0.0903    0.0638        0.0582  0.0782
## 12     0.1018    0.0414        0.0593  0.0741
## 13     0.0196    0.0058        0.0132  0.0210
## 14     0.0503    0.0306        0.0275  0.0365
## 15     0.0335    0.0457        0.0376  0.0434
## 16     0.0499    0.1228        0.0667  0.0755
## 17     0.0486    0.0911        0.0619  0.0549
## 18     0.0135    0.0159        0.0159  0.0166
## 19     0.0257    0.0396        0.0572  0.0300
## 20     0.0682    0.0825        0.0651  0.0872

#row labels
row.names(aa_prop) <- aa_codes

#output dataframe
pander::pander(aa_prop)

	alpha.prop	beta.prop	a.plus.b.prop	a.div.b
A	0.1165	0.07313	0.09264	0.08331
R	0.02166	0.02414	0.04129	0.03369
N	0.03964	0.05007	0.06353	0.04223
D	0.06661	0.04359	0.05876	0.05631
C	0.008991	0.02702	0.03917	0.01454
Q	0.02738	0.04395	0.03917	0.02631
E	0.05476	0.03098	0.04553	0.05931
G	0.08051	0.107	0.09052	0.08701
H	0.04536	0.01765	0.01747	0.02469
I	0.03719	0.04323	0.04923	0.05516
L	0.09031	0.06376	0.05823	0.07824
K	0.1018	0.04143	0.05929	0.07408
M	0.01962	0.005764	0.01323	0.021
F	0.05027	0.03062	0.02753	0.03646
P	0.03351	0.04575	0.03759	0.04339
S	0.04986	0.1228	0.0667	0.07547
T	0.04863	0.09114	0.06194	0.05493
W	0.01349	0.01585	0.01588	0.01662
Y	0.02575	0.03963	0.05717	0.03
V	0.06825	0.08249	0.06511	0.08724

#Determine number of each amino acid in protein encoded by SLC22A4

#Function to convert a table into a vector
table_to_vector <- function(table_x){
  table_names <- attr(table_x, "dimnames")[[1]]
  table_vect <- as.vector(table_x)
  names(table_vect) <- table_names
  return(table_vect)
}

homo_sapiens_freq_table <- table(homo_sapiens)/length(homo_sapiens)
homo_sapiens_freq_vector <- table_to_vector(homo_sapiens_freq_table)

homo_sapiens_freq_vector

##           A           C           D           E           F           G 
## 0.067150635 0.012704174 0.032667877 0.045372051 0.076225045 0.067150635 
##           H           I           K           L           M           N 
## 0.005444646 0.068965517 0.023593466 0.136116152 0.034482759 0.034482759 
##           P           Q           R           S           T           V 
## 0.045372051 0.027223230 0.054446461 0.070780399 0.054446461 0.085299456 
##           W           Y 
## 0.025408348 0.032667877

Removing any unknown amino acid codes, U, from vector

aa_names <- names(homo_sapiens_freq_vector)

i.U <- which(aa_names == "U")
aa_names[i.U]

## character(0)

homo_sapiens_freq_vector[i.U]

## named numeric(0)

if(length(i.U) > 0) {
  homo_sapiens <- homo_sapiens[-i.U]
}

Adding frequency vector data to proportion table

aa_prop$slc22a4_human_aa_freq <- homo_sapiens_freq_vector
pander::pander(aa_prop)

	alpha.prop	beta.prop	a.plus.b.prop	a.div.b	slc22a4_human_aa_freq
A	0.1165	0.07313	0.09264	0.08331	0.06715
R	0.02166	0.02414	0.04129	0.03369	0.0127
N	0.03964	0.05007	0.06353	0.04223	0.03267
D	0.06661	0.04359	0.05876	0.05631	0.04537
C	0.008991	0.02702	0.03917	0.01454	0.07623
Q	0.02738	0.04395	0.03917	0.02631	0.06715
E	0.05476	0.03098	0.04553	0.05931	0.005445
G	0.08051	0.107	0.09052	0.08701	0.06897
H	0.04536	0.01765	0.01747	0.02469	0.02359
I	0.03719	0.04323	0.04923	0.05516	0.1361
L	0.09031	0.06376	0.05823	0.07824	0.03448
K	0.1018	0.04143	0.05929	0.07408	0.03448
M	0.01962	0.005764	0.01323	0.021	0.04537
F	0.05027	0.03062	0.02753	0.03646	0.02722
P	0.03351	0.04575	0.03759	0.04339	0.05445
S	0.04986	0.1228	0.0667	0.07547	0.07078
T	0.04863	0.09114	0.06194	0.05493	0.05445
W	0.01349	0.01585	0.01588	0.01662	0.0853
Y	0.02575	0.03963	0.05717	0.03	0.02541
V	0.06825	0.08249	0.06511	0.08724	0.03267

##Functions to calculate similarities

Correlation used in Chou and Zhange 1992

chou_cor <- function(x,y){
  numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)
}

Cosine similarity used in Higgs and Attwood (2005).

chou_cosine <- function(z.1, z.2){
  z.1.abs <- sqrt(sum(z.1^2))
  z.2.abs <- sqrt(sum(z.2^2))
  my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs)
  return(my.cosine)
}

Data exploration. Display outputs as scatterplots. None of these correlations are that great.

par(mfrow = c(2,2), mar = c(1,4,1,1))
plot(alpha.prop ~ slc22a4_human_aa_freq, data = aa_prop)
plot(beta.prop ~ slc22a4_human_aa_freq, data = aa_prop)
plot(a.plus.b.prop  ~ slc22a4_human_aa_freq, data = aa_prop)
plot(a.div.b ~ slc22a4_human_aa_freq, data = aa_prop)

par(mfrow = c(1,1), mar = c(1,1,1,1))

Calculate correlation between each column:

corr.alpha <- chou_cor(aa_prop[, 5], aa_prop[, 1])
corr.beta <- chou_cor(aa_prop[, 5], aa_prop[, 2])
corr.apb <- chou_cor(aa_prop[, 5], aa_prop[, 3])
corr.adb <- chou_cor(aa_prop[, 5], aa_prop[, 4])

Calculate cosine simularity between each column:

cos.alpha <- chou_cosine(aa_prop[, 5], aa_prop[, 1])
cos.beta <- chou_cosine(aa_prop[, 5], aa_prop[, 2])
cos.apb <- chou_cosine(aa_prop[, 5], aa_prop[, 3])
cos.adb <- chou_cosine(aa_prop[, 5], aa_prop[, 4])

Calculate distance. Note: we need to flip the dataframe on its side using a command called t()

aa_prop_flipped <- t(aa_prop)
round(aa_prop_flipped,2)

##                          A    R    N    D    C    Q    E    G    H    I    L
## alpha.prop            0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09
## beta.prop             0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06
## a.plus.b.prop         0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06
## a.div.b               0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08
## slc22a4_human_aa_freq 0.07 0.01 0.03 0.05 0.08 0.07 0.01 0.07 0.02 0.14 0.03
##                          K    M    F    P    S    T    W    Y    V
## alpha.prop            0.10 0.02 0.05 0.03 0.05 0.05 0.01 0.03 0.07
## beta.prop             0.04 0.01 0.03 0.05 0.12 0.09 0.02 0.04 0.08
## a.plus.b.prop         0.06 0.01 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## a.div.b               0.07 0.02 0.04 0.04 0.08 0.05 0.02 0.03 0.09
## slc22a4_human_aa_freq 0.03 0.05 0.03 0.05 0.07 0.05 0.09 0.03 0.03

Create distance matrix:

dist_matrix <- dist(aa_prop_flipped, method = "euclidean")

Individual Distances:

dist.alpha <- dist(aa_prop_flipped[c(1, 5),], method="euclidean")
dist.beta <- dist(aa_prop_flipped[c(2, 5),], method="euclidean")
dist.apb <- dist(aa_prop_flipped[c(3, 5),], method="euclidean")
dist.adb <- dist(aa_prop_flipped[c(4, 5),], method="euclidean")

# fold types
fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")

# data
corr.sim <- round(c(corr.alpha,corr.beta,corr.apb,corr.adb),5)
cosine.sim <- round(c(cos.alpha,cos.beta,cos.apb,cos.adb),5)
Euclidean.dist <- round(c(dist.alpha,dist.beta,dist.apb,dist.adb),5)

# summary
sim.sum <- c("","","most.sim","")
dist.sum <- c("","","min.dist","")

#create dataframe
df <- data.frame(fold.type,
           corr.sim ,
           cosine.sim ,
           Euclidean.dist ,
           sim.sum ,
           dist.sum )

Display output

pander::pander(df)

fold.type	corr.sim	cosine.sim	Euclidean.dist	sim.sum	dist.sum
alpha	0.7151	0.7151	0.1952
beta	0.7911	0.7911	0.1683
alpha plus beta	0.8126	0.8126	0.1545	most.sim	min.dist
alpha/beta	0.7814	0.7814	0.1679

##Percent Identity Comparisons (PID)

#Data Prep Convert FASTA records into single vector

names(slc22a4_list)

##  [1] "NP_003050.2"    "XP_001163062.1" "NP_062661.1"    "NP_001193918.1"
##  [5] "XP_005626572.1" "XP_017727518.1" "NP_071606.1"    "NP_001139603.1"
##  [9] "XP_013820784.2" "XP_011945580.1"

length(slc22a4_list)

## [1] 10

Convert list into vector

#create vector w/ NAs at each homolog position
slc22a4_vector <- rep(NA, length(slc22a4_list))

#populate vector with fasta sequences from slc22a4_list
for(i in 1:length(slc22a4_vector)){
  slc22a4_vector[i] <- slc22a4_list[[i]]
}

#label sequences with slc22a4 list names
names(slc22a4_vector) <- names(slc22a4_list)

##PID Table

Performing pairwise alignments comparing human sequence to chimp, mouse, and cattle respectively

human_chimp_alignment <- pairwiseAlignment(slc22a4_list[[1]], slc22a4_list[[2]])
human_mouse_alignment <- pairwiseAlignment(slc22a4_list[[1]], slc22a4_list[[3]])
human_cattle_alignment <- pairwiseAlignment(slc22a4_list[[1]], slc22a4_list[[4]])

#output alignments
human_chimp_alignment

## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MRDYDEVIAFLGEWGPFQRLIFFLLSASIIPNGF...TLEQMQKVKWFRSGKKTRDSMETEENPKVLITAF
## subject: MRDYDEVIAFLGEWGPFQRLIFFLLSASIIPNGF...TLEQMQKVKWFRCGKKTRDSMETEENPKVLITAF
## score: 2318.619

human_mouse_alignment

## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MRDYDEVIAFLGEWGPFQRLIFFLLSASIIPNGF...TLEQMQKVKWFRSGKKTRDSMETEENPKVLITAF
## subject: MRDYDEVIAFLGEWGPFQRLIFFLLSASIIPNGF...NLEQMQKVRGFRCGKKSTVSVDREESPKVLITAF
## score: 1488.824

human_cattle_alignment

## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MRDYDEVIAFLGEWGPFQRLIFFLLSASIIPNGF...TLEQMQKVKWFRSGKKTRDSMETEENPKVLITAF
## subject: MRDYDEVTAFLGKWGPFQRLIFFLLSASIIPNGF...TLEQMQKVKGFRYGKKTKGSMEKEENSKVIITSF
## score: 1604.795

Aligning chimp, mouse, and cattle sequences to eachother

chimp_mouse_alignment <- pairwiseAlignment(slc22a4_list[[2]], slc22a4_list[[3]])
chimp_cattle_alignment <- pairwiseAlignment(slc22a4_list[[2]], slc22a4_list[[4]])
mouse_cattle_alignment <- pairwiseAlignment(slc22a4_list[[3]], slc22a4_list[[4]])

#output alignments
chimp_mouse_alignment

## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MRDYDEVIAFLGEWGPFQRLIFFLLSASIIPNGF...TLEQMQKVKWFRCGKKTRDSMETEENPKVLITAF
## subject: MRDYDEVIAFLGEWGPFQRLIFFLLSASIIPNGF...NLEQMQKVRGFRCGKKSTVSVDREESPKVLITAF
## score: 1499.367

chimp_cattle_alignment

## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MRDYDEVIAFLGEWGPFQRLIFFLLSASIIPNGF...TLEQMQKVKWFRCGKKTRDSMETEENPKVLITAF
## subject: MRDYDEVTAFLGKWGPFQRLIFFLLSASIIPNGF...TLEQMQKVKGFRYGKKTKGSMEKEENSKVIITSF
## score: 1594.253

mouse_cattle_alignment

## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MRDYDEVIAFLGEWGPFQRLIFFLLSASIIPNGF...NLEQMQKVRGFRCGKKSTVSVDREESPKVLITAF
## subject: MRDYDEVTAFLGKWGPFQRLIFFLLSASIIPNGF...TLEQMQKVKGFRYGKKTKGSMEKEENSKVIITSF
## score: 1462.718

Calculating pid of alignments

Biostrings::pid(human_chimp_alignment)

## [1] 99.09256

Biostrings::pid(human_mouse_alignment)

## [1] 84.81013

Biostrings::pid(human_cattle_alignment)

## [1] 86.79928

Biostrings::pid(chimp_mouse_alignment)

## [1] 84.99096

Biostrings::pid(chimp_cattle_alignment)

## [1] 86.61844

Biostrings::pid(mouse_cattle_alignment)

## [1] 84.26763

Building matrix

#         human                 chimp   mouse   cattle

pids <- c(1,      NA,     NA,     NA,
          pid(human_chimp_alignment), 1, NA, NA,
          pid(human_mouse_alignment), pid(chimp_mouse_alignment), 1, NA,
        pid(human_cattle_alignment), pid(chimp_cattle_alignment), pid(mouse_cattle_alignment), 1)

pid_matrix <- matrix(pids, nrow=4, byrow=T)
row.names(pid_matrix) <- c("Homo", "Pan", "Mus", "Bos")
colnames(pid_matrix) <- c("Homo", "Pan", "Mus", "Bos")
pander::pander(pid_matrix)

	Homo	Pan	Mus	Bos
Homo	1	NA	NA	NA
Pan	99.09	1	NA	NA
Mus	84.81	84.99	1	NA
Bos	86.8	86.62	84.27	1

##PID Method Comparison

pid1 <- Biostrings::pid(human_chimp_alignment, type = "PID1")
pid2 <- Biostrings::pid(human_chimp_alignment, type = "PID2")
pid3 <- Biostrings::pid(human_chimp_alignment, type = "PID3")
pid4 <- Biostrings::pid(human_chimp_alignment, type = "PID4")

pid_methods_vector <- c(pid1, pid2, pid3, pid4)

pid_method <- c("PID1", "PID2", "PID3", "PID4")

denominator <- c("(aligned positions & internal gap positions)",
         "(aligned positions)",
         "(length of shorter sequence)",
         "(average length of sequences)")

pid_df <- data.frame(pid_method, pid_methods_vector, denominator)
colnames(pid_df) <- c("method", "PID", "denominator")

pander::pander(pid_df)

method	PID	denominator
PID1	99.09	(aligned positions & internal gap positions)
PID2	99.09	(aligned positions)
PID3	99.09	(length of shorter sequence)
PID4	99.09	(average length of sequences)

##Multiple Sequence Alignment (MSA) #MSA Data Prep

#convert vectors to stringset format
slc22a4_vector_ss <- Biostrings::AAStringSet(slc22a4_vector)

#Building MSA

slc22a4_msa <- msa(slc22a4_vector_ss, method = "ClustalW")

## use default substitution matrix

#Cleaning/Outputting MSA

class(slc22a4_msa)

## [1] "MsaAAMultipleAlignment"
## attr(,"package")
## [1] "msa"

is(slc22a4_msa)

## [1] "MsaAAMultipleAlignment" "AAMultipleAlignment"    "MsaMetaData"           
## [4] "MultipleAlignment"

slc22a4_msa

## CLUSTAL 2.1  
## 
## Call:
##    msa(slc22a4_vector_ss, method = "ClustalW")
## 
## MsaAAMultipleAlignment with 10 rows and 675 columns
##      aln                                                   names
##  [1] -------------------------...GFRCGKKSTVSVDREESPKVLITAF NP_062661.1
##  [2] -------------------------...GFRCGKKSTVSMDREENPKVLITAF NP_071606.1
##  [3] -------------------------...WFRSGKKTRDSMETEENPKVLITAF NP_003050.2
##  [4] -------------------------...WFRCGKKTRDSMETEENPKVLITAF XP_001163062.1
##  [5] MAEPSVEDPVRSLRAGGRAKHWAET...WFRSGKKTRDSMETEKNPKVLITAF XP_017727518.1
##  [6] -------------------------...WFRSGKKTRDSMETEENPKVLITAF XP_011945580.1
##  [7] -------------------------...GFRYGKKTKGSMEKEENSKVIITSF NP_001193918.1
##  [8] -------------------------...GFRYGKKTKGSMEKEENPKVIITSF XP_013820784.2
##  [9] -------------------------...GFRFRKRTRDSVEKEENPKVLITSF XP_005626572.1
## [10] -------------------------...CFRNGQQSTGIRNSKESPKTLITTL NP_001139603.1
##  Con -------------------------...GFR?GKKTRDSME?EENPKVLITAF Consensus

class(slc22a4_msa) <- "AAMultipleAlignment"
class(slc22a4_msa)

## [1] "AAMultipleAlignment"

#converting MSA to seqinr alignment format
slc22a4_seqinr <- msaConvert(slc22a4_msa, type = "seqinr::alignment")

print_msa(alignment = slc22a4_seqinr, chunksize = 60)

## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "MAEPSVEDPVRSLRAGGRAKHWAETAGSPAPSQRPPLPSAGPGMGVWSQVYSGIKLGAEL 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] " "
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "PERSSALSLFPRTGPRLRAPISNSLPVLGNVLTSLGSPQLRDTVPRTLSSPVVASFGAAV 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] " "
## [1] "--MRDYDEVIAFLGEWGPFQRLIFFLLSASIIPNGFNGMSVVFLAGTPEHRCLVPDTVNL 0"
## [1] "--MRDYDEVIAFLGDWGPFQRLIFFLLSASIIPNGFNGMSVVFLAGTPEHRCLVPHTVNL 0"
## [1] "--MRDYDEVIAFLGEWGPFQRLIFFLLSASIIPNGFNGMSVVFLAGTPEHRCRVPDAANL 0"
## [1] "--MRDYDEVIAFLGEWGPFQRLIFFLLSASIIPNGFNGMSVVFLAGTPEHRCRVPDAANL 0"
## [1] "GSMRDYDEAIAFLGEWGLFQRLIFFLLSASIIPNGFNGMSVVFLAGTPEHRCRVPDAANL 0"
## [1] "--MRDYDEAIAFLGEWGPFQRLIFFLLSASIIPNGFNGMSVVFLAGTPEHRCRVPDAANL 0"
## [1] "--MRDYDEVTAFLGKWGPFQRLIFFLLSASIIPNGFNGMSVVFLAGTPEHHCRVPDTANL 0"
## [1] "--MRDYDEVTAFLGKWGPFQRLIFFLLSASIIPNGFNGMSVVFLAGTPEHRCRVPDTANL 0"
## [1] "--MRDYDEVTTFLGEWGPFQRLIFFLLSASIIPNGFNGMSVVFLAATPEHRCRVPDAANL 0"
## [1] "--MRDYEEVTAFLGKWGRFQRLVFFLLSASIIPNGFNGMSAVFLAGTPEHRCAVPPGANL 0"
## [1] " "
## [1] "SSSWRNHSIPLETKDGRQVPQSCRRYRLATIANFSAMGLEPGQDVDLEQLEQESCLDGWE 0"
## [1] "SSAWRNHSIPLETKDGRQVPQSCRRYRLATIANFSALGLEPGLDVDLEQLEQESCLDGWE 0"
## [1] "SSAWRNNSVPLRLRDGREVPHSCSRYRLATIANFSALGLEPGRDVDLGQLEQESCLDGWE 0"
## [1] "SSAWRNNSVPLRLRDGREVPHSCSRYRLATIANFSALGLEPGRDVDLGQLEQESCLDGWE 0"
## [1] "SSAWRNNSIPLRLRDGREVLHSCRRYRLATIANFSALGLEPGRDVDLGQLEQESCLDGWE 0"
## [1] "SSAWRNNSVPLRLRDGREVLHSCRRYRLATIANFSALGLEPGRDVDLGQLEQESCLDGWE 0"
## [1] "SSAWRNHSIPVRLQDGREVPQSCRRYRLATIVNFSALGLEPGRDVNLEQLEQEGCLDGWE 0"
## [1] "SSAWRNHSIPMRLQDGREVPQSCRRYRLATIVNFSALGLEPGRDVDLEQLEQEGCLDGWE 0"
## [1] "SRAWRNHSIPLRLQDGREVPHSCRRYRLAAIANFSALGLEPGRDVDLEQLEQESCLDGWE 0"
## [1] "SEEWRNASIPLELRGGVTEPSRCRRYRLAVLANLSALGLRPGSDVQLAELEQEPCLDGWE 0"
## [1] " "
## [1] "YDKDIFLSTIVTEWNLVCEDDWKTPLTTSLFFVGVLCGSFVSGQLSDRFGRKKVLFATMA 0"
## [1] "YSKDVFLSTIVTEWNLVCEDDWKTPLTTSLFFVGVLCGSFVSGQLSDRFGRKKVLFATMA 0"
## [1] "FSQDVYLSTVVTEWNLVCEDNWKVPLTTSLFFVGVLLGSFVSGQLSDRFGRKNVLFATMA 0"
## [1] "FSQDVYLFTVVTEWNLVCEDNWKVPLTTSLFFVGVLLGSFVSGQLSDRFGRKNVLFATMA 0"
## [1] "FSQDVYWSTVVTEWNLVCEDNWKVPLTTSLFFVGVLLGSFISGQLSDRFGRKNVLFATMA 0"
## [1] "FSQDVYWSTVVTEWNLVCEDNWKVPLTTSLFFVGVLLGSFVSGQLSDRFGRKNVLFATMA 0"
## [1] "FSQDIYMSTIVTEWSLVCEEDWKTPLTTSLFFVGVLLGSFVSGQLSDRFGRKTILFATMA 0"
## [1] "FSQDIYLSTIVTEWSLVCEEDWKTPLTTSLFFVGVLLGSFVSGQLSDRFGRKTILFATMA 0"
## [1] "FSQDIYQSTIVTEWSLVCEDDWKTPLTSSLFFVGVLLGSFVSGQLSDRFGRKNILFATMA 0"
## [1] "YSRDVYRSTIVSEWNLVCEDDWKVPLTTSLFFVGVLIGSFISGQLSDRFGRKSILFLTMA 0"
## [1] " "
## [1] "VQTGFSFVQIFSTNWEMFTVLFAIVGMGQISNYVVAFILGTEILSKSVRIIFSTLGVCTF 0"
## [1] "VQTGFSFVQIFSTNWEMFTVLFAIVGMGQISNYVVAFILGTEILSKSVRILFSTLGVCTF 0"
## [1] "VQTGFSFLQIFSISWEMFTVLFVIVGMGQISNYVVAFILGTEILGKSVRIIFSTLGVCTF 0"
## [1] "VQTGFSFLQIFSISWEMFTVLFVIVGMGQISNYVVAFILGTEILGKSVRIIFSTLGVCTF 0"
## [1] "VQTGFSFLQIFSISWEMFTVLFLIVGMGQISNYVVAFILGTEILGKSVRIIFSTLGVCTF 0"
## [1] "VQTGFSFLQIFSNSWEMFTVLFLIVGMGQISNYVVAFILGTEILGKSVRIIFSTLGVCTF 0"
## [1] "VQTGFSFLQVFSINWEMFTVLFVIVGMGQISNYVVAFILGTEILGKSVRIIFSTLGVCTF 0"
## [1] "VQTGFSFLQIFSINWEMFTVLFVIVGMGQISNYVVAFILGAEILGKSVRVIFSTLGVCTF 0"
## [1] "VQTGFSFLQIFSINWEMFTVLFVIVGMGQISNYVVAFILGTEILGKSVRIIFSTLGVCTF 0"
## [1] "VQTGFSFLQIFSTSWEMFTVLFLIVGMGQISNYVVAFILGTEILGKSVRILFSTLGVCVF 0"
## [1] " "
## [1] "FAIGYMVLPLFAYFIRDWRMLLLALTLPGLFCVPLWWFIPESPRWLISQRRFAEAEQIIQ 0"
## [1] "FAIGYMVLPLFAYFIRDWRMLLLALTLPGLFCVPLWWFIPESPRWLISQRRFEEAEQIIQ 0"
## [1] "FAVGYMLLPLFAYFIRDWRMLLLALTVPGVLCVPLWWFIPESPRWLISQRRFREAEDIIQ 0"
## [1] "FAVGYMLLPLFAYFIRDWRMLLLALTVPGVLCVPLWWFIPESPRWLISQRRFREAEDIIQ 0"
## [1] "FAVGYMLLPLFAYFIRDWRMLLLALTVPGVLCVPLWWFIPESPRWLISQRRFREAEDIIQ 0"
## [1] "FAVGYMLLPLFAYFIRDWRMLLLALTVPGVLCVPLWWFIPESPRWLISQRRFREAEDIIQ 0"
## [1] "FAIGYMLLPLFAYFIRDWRMLLLALTVPGVLCVPLWWFIPESPRWLISQRRFKEAEDIIQ 0"
## [1] "FAVGYMLLPLFAYFIRDWRMLLLALTVPGLLCVPLWWFIPESPRWLLSQRRFKEAEDIIQ 0"
## [1] "FAVGYMLLPLFAYFIRDWRMLLLALTLPGVLCIPLWWFIPESPRWLISQRRFREAENIIQ 0"
## [1] "FAIGYMLLPLFAYFIRDWRMLLLALTLPGLLCVPLWWIIPESPRWLISQGRYKEAEVIIR 0"
## [1] " "
## [1] "KAAKMNSIVAPAGIFDPLELQELNSLKQQKVIILDLFRTRNIATITVMAVMLWMLTSVGY 0"
## [1] "KAAKMNGIMAPAVIFDPLELQELNSLKQQKVFILDLFKTRNIATITVMSVMLWMLTSVGY 0"
## [1] "KAAKMNNIAVPAVIFDSVEELN--PLKQQKAFILDLFRTRNIAIMTIMSLLLWMLTSVGY 0"
## [1] "KAAKMNNIAVPAVIFDSVEELN--PLKQQKAFILDLFRTRNIAIMTIMSLLLWMLTSVGY 0"
## [1] "KAAKMNNIAVPAVIFDSVEELN--PLKQQKAFILDLFRTWNIAIMTIMSLLLWMLTSVGY 0"
## [1] "KAAKMNNIAVPAVIFDSVEELN--PLKQQKAFILDLFRTWNIAIMTIMSLLLWMLTSVGY 0"
## [1] "KAAKINNVTAPVVVFDPVEQQEQLPLKQQKVFILDLFRTRNIATITIMSLLLWMLTSVGY 0"
## [1] "KAAKINNVTAPVVVFDPVEQQEQLPLKQQKVFILDLFRTRNIATITIMSLLLWMLTSVGY 0"
## [1] "KAAKMNNIAAPVVIFDPMELQELKLQSQQKVFILDLFRTRNIATITIMSLLLWMLTSVGY 0"
## [1] "KAAKINNTPAPAMLFDAAEMQDSKPQQQQKAILLDLFRSRNIATITIMSLLLWFFTSVGY 0"
## [1] " "
## [1] "FALSLNVPNLHGDVYLNCFLSGLIEVPAYFTAWLLLRTLPRRYIIAGVLFWGGGVLLLIQ 0"
## [1] "FALSLNVPNLHGDVYLNCFLSGLIEVPAYFTAWLLLRTLPRRYIIAGVLFWGGGVLLLVQ 0"
## [1] "FALSLDAPNLHGDAYLNCFLSALIEIPAYITAWLLLRTLPRRYIIAAVLFWGGGVLLFIQ 0"
## [1] "FALSLDAPNLHGDVYLNCFLSALIEIPAYITAWLLLRTLPRRYIIAAVLFWGGGVLLFIQ 0"
## [1] "FALSLDTPNLHGDAYLNCFLSALIEIPAYITAWLLLRTLPRRYIIAAVLFWGGGVLLFIQ 0"
## [1] "FALSLDTPNLHGDAYLNCFLSALIEIPAYITAWLLLRTLPRRYIIAAVLFWGG------- 0"
## [1] "FALSLNTPNLHGDPYLNCFFSALIEVPAYIAAWLLLRTLPRRYVIAGVLFFGGSVLLLIQ 0"
## [1] "FALSLNTPNLHGDPYLNCFFSALIEVPAYIAAWLLLRTLPRRYIIAGVLFFGGSVLLLIQ 0"
## [1] "FALSLNAPNLHGDAYLNCFLSALIEVPAYITAWLLLRTLPRRYIIAGVLFLGG------- 0"
## [1] "FGLSLNTPNLHGDAYINCFLSAVIEVPAYIIAWLLLRSLPRRYSISGTLVLGGGVVLFIN 0"
## [1] " "
## [1] "VVPEDYNFVSIGLVMLGKFGITSAFSMLYVFTAELYPTLVRNMAVGITSMASRVGSIIAP 0"
## [1] "VVPEDYNFVSIGLVMLGKFGVTSAFSMLYVFTAELYPTLVRNMAVGITSMASRVGSIIAP 0"
## [1] "LVPVDYYFLSIGLVMLGKFGITSAFSMLYVFTAELYPTLVRNMAVGVTSTASRVGSIIAP 0"
## [1] "LVPVDYYFLSIGLVMLGKFGITSAFSMLYVFTAELYPTLVRNMAVGVTSTASRVGSIIAP 0"
## [1] "LVPVDYYFLSIGLVMLGKFGITSAFSMLYVFTAELYPTLVRNMAVGVTSMASRVGSIIAP 0"
## [1] "----DYYFLSIGLVMLGKFGITSAFSMLYVFTAELYPTLVRNMAVGVTSMASRVGSIIAP 0"
## [1] "LVPAGYNFLSIGLAMLGKFGVTSAFAMLYVFTAELYPTLVRNMAVGVASMASRVGSIIAP 0"
## [1] "LVPAGYNFLSIGLAMLGKFGITSAFAMLYVFTAELYPTLVRNMAVGVGSMASRVGSIIAP 0"
## [1] "----DYNFLSIGLVMLGKFGITSAFSMLYVFTAELYPTLVRNMAVGVTSMASRVGSIIAP 0"
## [1] "LVPTDLNVLSVCLVMLGKFGITAAFSMLYVYNVELYPTLVRNMAVGATSTASRLGSIIAP 0"
## [1] " "
## [1] "YFVYLGAYNRLLPYILMGSLTVLIGIITLFFPESFGVTLPENLEQMQKVRGFRCGKKSTV 0"
## [1] "YFVYLGAYNRLLPYILMGSLTVLIGIITLFFPESFGVTLPENLEQMQKVRGFRCGKKSTV 0"
## [1] "YFVYLGAYNRMLPYIVMGSLTVLIGILTLFFPESLGMTLPETLEQMQKVKWFRSGKKTRD 0"
## [1] "YFVYLGAYNRMLPYIIMGSLTVLIGILTLFFPESLGITLPETLEQMQKVKWFRCGKKTRD 0"
## [1] "YFVYLGAYNRMLPYIVMGSLTVLIGILTLFFPESLGMTLPETLEQMQKVKWFRSGKKTRD 0"
## [1] "YFVYLGAYNRMLPYIVMGSLTVLIGILTLFFPESLGMTLPETLEQMQKVKWFRSGKKTRD 0"
## [1] "YFVYLGAYNRVLPYILMGSLTVLIGIITLFFPESFGRTLPETLEQMQKVKGFRYGKKTKG 0"
## [1] "YFVYLGAYNRVLPYILMGSLTVLIGIITLFFPESFGRTLPETLEQMQKVKGFRYGKKTKG 0"
## [1] "YFVYLGAYNRVLPYIVMGSLTVFIGIITLFFPESFGMTLPETLEQMQKVKGFRFRKRTRD 0"
## [1] "YFVYLGAYDRYLPYIVMGSLTVLIGILTLFLPESYGSALPESFEQMMKVKCFRNGQQSTG 0"
## [1] " "
## [1] "SVDREESPKVLITAF 45"
## [1] "SMDREENPKVLITAF 45"
## [1] "SMETEENPKVLITAF 45"
## [1] "SMETEENPKVLITAF 45"
## [1] "SMETEKNPKVLITAF 45"
## [1] "SMETEENPKVLITAF 45"
## [1] "SMEKEENSKVIITSF 45"
## [1] "SMEKEENPKVIITSF 45"
## [1] "SVEKEENPKVLITSF 45"
## [1] "IRNSKESPKTLITTL 45"
## [1] " "

##Finished MSA output msa using ggmsa function

ggmsa(slc22a4_msa, start=350, end=450)

##Distance Matrix

slc22a4_subset_dist <- seqinr::dist.alignment(slc22a4_seqinr, matrix="identity")
is(slc22a4_subset_dist)

## [1] "dist"     "oldClass"

class(slc22a4_subset_dist)

## [1] "dist"

slc22a4_align_seqinr_rnd <- round(slc22a4_subset_dist, 3)
slc22a4_align_seqinr_rnd

##                NP_062661.1 NP_071606.1 NP_003050.2 XP_001163062.1
## NP_071606.1          0.185                                       
## NP_003050.2          0.393       0.386                           
## XP_001163062.1       0.390       0.383       0.095               
## XP_017727518.1       0.402       0.395       0.148          0.176
## XP_011945580.1       0.394       0.385       0.129          0.161
## NP_001193918.1       0.397       0.387       0.361          0.364
## XP_013820784.2       0.387       0.383       0.351          0.354
## XP_005626572.1       0.370       0.370       0.310          0.322
## NP_001139603.1       0.498       0.498       0.480          0.486
##                XP_017727518.1 XP_011945580.1 NP_001193918.1 XP_013820784.2
## NP_071606.1                                                               
## NP_003050.2                                                               
## XP_001163062.1                                                            
## XP_017727518.1                                                            
## XP_011945580.1          0.096                                             
## NP_001193918.1          0.369          0.363                              
## XP_013820784.2          0.361          0.355          0.159               
## XP_005626572.1          0.325          0.322          0.319          0.310
## NP_001139603.1          0.480          0.481          0.494          0.492
##                XP_005626572.1
## NP_071606.1                  
## NP_003050.2                  
## XP_001163062.1               
## XP_017727518.1               
## XP_011945580.1               
## NP_001193918.1               
## XP_013820784.2               
## XP_005626572.1               
## NP_001139603.1          0.478

##Phylogenetic Tree

Use created distance matrix to create a phylogenetic tree using neighbor joining algorithm in ape package

slc22a4_tree <- ape::nj(slc22a4_align_seqinr_rnd)

#Plot Created Tree

plot.phylo(slc22a4_tree, main="SLC22A4 Phylogenetic Tree\n", use.edge.length = F)

mtext("\n\nSLC22A4 Tree - Rooted, No Branch Lengths")

Final Portfolio

Vignesh Senguttuvan

12/18/2021

SLC22A4-Solute Carrier Family 22 Member 4

aa_codes	alpha	beta	a.plus.b	a.div.b
A	285	203	175	361
R	53	67	78	146
N	97	139	120	183
D	163	121	111	244
C	22	75	74	63
Q	67	122	74	114
E	134	86	86	257
G	197	297	171	377
H	111	49	33	107
I	91	120	93	239
L	221	177	110	339
K	249	115	112	321
M	48	16	25	91
F	123	85	52	158
P	82	127	71	188
S	122	341	126	327
T	119	253	117	238
W	33	44	30	72
Y	63	110	108	130
V	167	229	123	378

aa_codes	alpha	beta	a.plus.b	a.div.b
A	285	203	175	361
R	53	67	78	146
N	97	139	120	183
D	163	121	111	244
C	22	75	74	63
Q	67	122	74	114
E	134	86	86	257
G	197	297	171	377
H	111	49	33	107
I	91	120	93	239
L	221	177	110	339
K	249	115	112	321
M	48	16	25	91
F	123	85	52	158
P	82	127	71	188
S	122	341	126	327
T	119	253	117	238
W	33	44	30	72
Y	63	110	108	130
V	167	229	123	378

aa_codes	alpha	beta	a.plus.b	a.div.b
A	285	203	175	361
R	53	67	78	146
N	97	139	120	183
D	163	121	111	244
C	22	75	74	63
Q	67	122	74	114
E	134	86	86	257
G	197	297	171	377
H	111	49	33	107
I	91	120	93	239
L	221	177	110	339
K	249	115	112	321
M	48	16	25	91
F	123	85	52	158
P	82	127	71	188
S	122	341	126	327
T	119	253	117	238
W	33	44	30	72
Y	63	110	108	130
V	167	229	123	378