##Introduction

This portfolio demonstrates a summary of the information related to the gene Arsenite Methyltransferase (AS3MT)

This includes alignments and phylogeny relations between the human version and other homologs to this gene.

AS3MT catalyzes the transfer of a methyl group from S-adenosyl-L-methionine (AdoMet) to trivalent arsenical and may play a role in arsenic metabolism

##Resources / References

Key information use to make this script can be found here:

Refseq Gene: https://www.ncbi.nlm.nih.gov/gene/1733

Refseq Homologene: https://www.ncbi.nlm.nih.gov/homologene/620

UniProt: https://www.uniprot.org/uniprot/Q9HC16

PDB: https://www.rcsb.org/structure/2KBO

#Preperation

Load packages

library(BiocManager)
## Bioconductor version '3.13' is out-of-date; the current release version '3.14'
##   is available with R version '4.1'; see https://bioconductor.org/install
# github packages
library(compbio4all)
library(ggmsa)
## Registered S3 methods overwritten by 'ggalt':
##   method                  from   
##   grid.draw.absoluteGrob  ggplot2
##   grobHeight.absoluteGrob ggplot2
##   grobWidth.absoluteGrob  ggplot2
##   grobX.absoluteGrob      ggplot2
##   grobY.absoluteGrob      ggplot2
# CRAN packages
library(rentrez)
library(seqinr)
library(ape)
## 
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
## 
##     as.alignment, consensus
library(pander)


library(ggplot2)

# Bioconductor packages
library(msa)
## Loading required package: Biostrings
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
## 
##     windows
## Loading required package: XVector
## Loading required package: GenomeInfoDb
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
## 
##     complement
## The following object is masked from 'package:seqinr':
## 
##     translate
## The following object is masked from 'package:base':
## 
##     strsplit
## 
## Attaching package: 'msa'
## The following object is masked from 'package:BiocManager':
## 
##     version
library(drawProteins)

## Biostrings
library(Biostrings)


library(HGNChelper)
## Warning: package 'HGNChelper' was built under R version 4.1.2

Accession Numbers

Accession numbers were obtained from RefSeq, Refseq Homlogene, UniProt and PDB. NCBI was the source of the RefSeq accession numbers which were then used further in Uniprot and PDB. There are no known protein sequences other than that in Homosapiens, the rest are predicted sequences.

Accession Number Table

RefSeq.Accession <- c("NP_065733.2", "XP_009457416.2", "XP_006527298.1", "XP_038938624.1", "XP_042192919.1", "XP_040848125.1", "XP_013810406.1")

Uniprot.Accession <- c("Q9HBK9", "NA", "NA", "NA", "NA", "NA", "NA")

PDB <- c("NA", "NA", "NA", "NA", "NA", "NA", "NA")

Scientific.Name <- c("Homo Sapien", "Pan troglodytes", "Mus musculus", "Rattus norvegicus", "Callorhinchus milii", "Ochotona curzoniae", "Apteryx mantelli")

Common.Name <- c("Human", "CHimpanzee", "Mouse", "Rat", "Shark", "Pika", "Kiwi")

Gene.Name <- c("AS3MT", "AS3MT", "As3mt", "As3mt", "as3mt", "AS3MT", "AS3MT")

accession.table <- data.frame(RefSeq.Accession, Uniprot.Accession, PDB, Scientific.Name, Common.Name, Gene.Name)

pander::pander(accession.table)
Table continues below
RefSeq.Accession Uniprot.Accession PDB Scientific.Name Common.Name
NP_065733.2 Q9HBK9 NA Homo Sapien Human
XP_009457416.2 NA NA Pan troglodytes CHimpanzee
XP_006527298.1 NA NA Mus musculus Mouse
XP_038938624.1 NA NA Rattus norvegicus Rat
XP_042192919.1 NA NA Callorhinchus milii Shark
XP_040848125.1 NA NA Ochotona curzoniae Pika
XP_013810406.1 NA NA Apteryx mantelli Kiwi
Gene.Name
AS3MT
AS3MT
As3mt
As3mt
as3mt
AS3MT
AS3MT

##Data Preparation

#Downloading Sequences

AS3MT_list <- entrez_fetch_list(db = "protein", 
                          id = RefSeq.Accession,
                          rettype = "fasta") 

Number of Files Obtained

length(AS3MT_list)
## [1] 7

First Entry

AS3MT_list[[1]]
## [1] ">NP_065733.2 arsenite methyltransferase [Homo sapiens]\nMAALRDAEIQKDVQTYYGQVLKRSADLQTNGCVTTARPVPKHIREALQNVHEEVALRYYGCGLVIPEHLE\nNCWILDLGSGSGRDCYVLSQLVGEKGHVTGIDMTKGQVEVAEKYLDYHMEKYGFQASNVTFIHGYIEKLG\nEAGIKNESHDIVVSNCVINLVPDKQQVLQEAYRVLKHGGELYFSDVYTSLELPEEIRTHKVLWGECLGGA\nLYWKELAVLAQKIGFCPPRLVTANLITIQNKELERVIGDCRFVSATFRLFKHSKTGPTKRCQVIYNGGIT\nGHEKELMFDANFTFKEGEIVEVDEETAAILKNSRFAQDFLIRPIGEKLPTSGGCSALELKDIITDPFKLA\nEESDSMKSRCVPDAAGGCCGTKKSC\n\n"

##Initializing Data

#Initial Data Cleaning

Remove FASTA Header

for(i in 1:length(AS3MT_list)){
  AS3MT_list[[i]]<-compbio4all::fasta_cleaner(AS3MT_list[[i]], parse = F)
}

##General Protein Diagram

Protein diagram

AS3MT_json <- drawProteins::get_features("Q9HBK9")
## [1] "Download has worked"
is(AS3MT_json)
## [1] "list"             "vector"           "list_OR_List"     "vector_OR_Vector"
## [5] "vector_OR_factor"
my_prot_df <- drawProteins::feature_to_dataframe(AS3MT_json)

is(my_prot_df)
## [1] "data.frame"       "list"             "oldClass"         "vector"          
## [5] "list_OR_List"     "vector_OR_Vector" "vector_OR_factor"
my_prot_df[,-2]
##                    type begin end length accession   entryName taxid order
## featuresTemp      CHAIN     1 375    374    Q9HBK9 AS3MT_HUMAN  9606     1
## featuresTemp.1  MOD_RES   335 335      0    Q9HBK9 AS3MT_HUMAN  9606     1
## featuresTemp.2  VAR_SEQ    58 153     95    Q9HBK9 AS3MT_HUMAN  9606     1
## featuresTemp.3  VARIANT   173 173      0    Q9HBK9 AS3MT_HUMAN  9606     1
## featuresTemp.4  VARIANT   287 287      0    Q9HBK9 AS3MT_HUMAN  9606     1
## featuresTemp.5  VARIANT   306 306      0    Q9HBK9 AS3MT_HUMAN  9606     1
## featuresTemp.6 CONFLICT   125 125      0    Q9HBK9 AS3MT_HUMAN  9606     1
## featuresTemp.7 CONFLICT   132 132      0    Q9HBK9 AS3MT_HUMAN  9606     1
## featuresTemp.8 CONFLICT   135 135      0    Q9HBK9 AS3MT_HUMAN  9606     1
## featuresTemp.9 CONFLICT   140 140      0    Q9HBK9 AS3MT_HUMAN  9606     1
my_prot_df <- drawProteins::feature_to_dataframe(AS3MT_json)

my_canvas <- draw_canvas(my_prot_df)
my_canvas <- draw_chains(my_canvas, my_prot_df, label_size = 2.5)

my_canvas <- draw_regions(my_canvas, my_prot_df)
my_canvas <- draw_motif(my_canvas, my_prot_df)
my_canvas <- draw_phospho(my_canvas, my_prot_df)
my_canvas <- draw_repeat(my_canvas, my_prot_df)
my_canvas <- draw_recept_dom(my_canvas, my_prot_df)
my_canvas <- draw_folding(my_canvas, my_prot_df)
my_canvas

##Dotplot

Prepping Data

AS3MT.human.vector <- fasta_cleaner(AS3MT_list[[1]])

2x2 plots

par(mfrow = c(2,2), 
    mar = c(0,0,2,1))

#Defaults
dotPlot(AS3MT.human.vector, AS3MT.human.vector, wsize = 1, nmatch = 1, main = "Defaults")

#V2
dotPlot(AS3MT.human.vector, AS3MT.human.vector, wsize = 10, nmatch = 1, main = "size = 10, nmatch = 1")

#V3
dotPlot(AS3MT.human.vector, AS3MT.human.vector, wsize = 10, nmatch = 5, main = "size = 10, nmatch = 5")

#V4
dotPlot(AS3MT.human.vector, AS3MT.human.vector, wsize = 20, nmatch = 5,main = "size = 20, nmatch = 5")

Best Plot

dotPlot(AS3MT.human.vector, AS3MT.human.vector, 
        wsize = 1,
        nmatch = 1,
        main = "size = 1, nmatch = 1")

##Protein Properteis Compiled From Databases

PDB: https://www.rcsb.org/structure/6CX6 Uniprot: https://www.UniProt.org/uniprot/Q9HBK9 Pfam: http://pfam.xfam.org/family/PF13847 Alphafold: https://alphafold.ebi.ac.uk/entry/Q9HBK9

URL <- c("PDB", "Uniprot", "Pfam", "Alphafold")

Protein.Property <- c("Classification: Transferase",
                "Location: Cytosol",
                "Domain: Methyltransferase", 
                "Structure: Alpha helicies, and beta pleated sheets")

Protein.Properties.dataframe <- data.frame(URL, Protein.Property)

pander::pander(Protein.Properties.dataframe)
URL Protein.Property
PDB Classification: Transferase
Uniprot Location: Cytosol
Pfam Domain: Methyltransferase
Alphafold Structure: Alpha helicies, and beta pleated sheets

##Protein Feature Prediction

Multivariate statistcal techniques were used to confirm the information about protein structure and location in the line database.

##Predict Protein Fold

Alphafold indicates that there are a mix of alpha helices and beta sheets. I predict that a mixture of a+b and a/b structure will be formed through the machine learning process.

Chou and Zhang (1994) Table

aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
            "L","K","M","F","P","S","T","W","Y","V")

Alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91, 
           221, 249, 48, 123, 82, 122, 119, 33, 63, 167)

Beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120, 
          177, 115, 16, 85, 127, 341, 253, 44, 110, 229)

A.Plus.B <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
              110, 112, 25, 52, 71, 126, 117, 30, 108, 123)

A.Div.B <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239, 
             339, 321, 91, 158, 188, 327, 238, 72, 130, 378)

pander(data.frame(aa.1.1, Alpha, Beta, A.Plus.B, A.Div.B))
aa.1.1 Alpha Beta A.Plus.B A.Div.B
A 285 203 175 361
R 53 67 78 146
N 97 139 120 183
D 163 121 111 244
C 22 75 74 63
Q 67 122 74 114
E 134 86 86 257
G 197 297 171 377
H 111 49 33 107
I 91 120 93 239
L 221 177 110 339
K 249 115 112 321
M 48 16 25 91
F 123 85 52 158
P 82 127 71 188
S 122 341 126 327
T 119 253 117 238
W 33 44 30 72
Y 63 110 108 130
V 167 229 123 378

Convert to Frequencies

Alpha.properties <- Alpha/sum(Alpha)

Beta.properties <- Beta/sum(Beta)

A.Plus.B.properties <- A.Plus.B/sum(A.Plus.B)

A.Div.B.properties <- A.Div.B/sum(A.Div.B)

aa.prop <- data.frame(Alpha.properties,
                      Beta.properties,
                      A.Plus.B.properties,
                      A.Div.B.properties)

row.names(aa.prop) <- aa.1.1

pander::pander(aa.prop)
Table continues below
  Alpha.properties Beta.properties A.Plus.B.properties
A 0.1165 0.07313 0.09264
R 0.02166 0.02414 0.04129
N 0.03964 0.05007 0.06353
D 0.06661 0.04359 0.05876
C 0.008991 0.02702 0.03917
Q 0.02738 0.04395 0.03917
E 0.05476 0.03098 0.04553
G 0.08051 0.107 0.09052
H 0.04536 0.01765 0.01747
I 0.03719 0.04323 0.04923
L 0.09031 0.06376 0.05823
K 0.1018 0.04143 0.05929
M 0.01962 0.005764 0.01323
F 0.05027 0.03062 0.02753
P 0.03351 0.04575 0.03759
S 0.04986 0.1228 0.0667
T 0.04863 0.09114 0.06194
W 0.01349 0.01585 0.01588
Y 0.02575 0.03963 0.05717
V 0.06825 0.08249 0.06511
  A.Div.B.properties
A 0.08331
R 0.03369
N 0.04223
D 0.05631
C 0.01454
Q 0.02631
E 0.05931
G 0.08701
H 0.02469
I 0.05516
L 0.07824
K 0.07408
M 0.021
F 0.03646
P 0.04339
S 0.07547
T 0.05493
W 0.01662
Y 0.03
V 0.08724

Determining number of each amino acid in AS3MT

AS3MT.table <- table(AS3MT.human.vector)/length(AS3MT.human.vector)

Table to vector function

table_to_vector <- function(table_x){
  table_names <- attr(table_x, "dimnames")[[1]]
  table_vect <- as.vector(table_x)
  names(table_vect) <- table_names
  return(table_vect)
}
AS3MT.human.table <- table(AS3MT.human.vector)/length(AS3MT.human.vector)
AS3MT.human.aa.freq <- table_to_vector(AS3MT.human.table)
pander::pander(AS3MT.human.aa.freq)
Table continues below
A C D E F G H I K
0.064 0.03733 0.048 0.088 0.03467 0.088 0.02933 0.06133 0.072
Table continues below
L M N P Q R S T V
0.09333 0.01333 0.032 0.032 0.04 0.04267 0.048 0.05333 0.07733
W Y
0.008 0.03733

Checking for “U” (UNknown) presence

aa.names <- names(AS3MT.human.aa.freq)
i.U <- which(aa.names == "U")
aa.names[i.U]
## character(0)
AS3MT.human.aa.freq[i.U]
## named numeric(0)

Add AS3MT data

aa.prop$AS3MT.human.aa.freq <- AS3MT.human.aa.freq
pander::pander(aa.prop)
Table continues below
  Alpha.properties Beta.properties A.Plus.B.properties
A 0.1165 0.07313 0.09264
R 0.02166 0.02414 0.04129
N 0.03964 0.05007 0.06353
D 0.06661 0.04359 0.05876
C 0.008991 0.02702 0.03917
Q 0.02738 0.04395 0.03917
E 0.05476 0.03098 0.04553
G 0.08051 0.107 0.09052
H 0.04536 0.01765 0.01747
I 0.03719 0.04323 0.04923
L 0.09031 0.06376 0.05823
K 0.1018 0.04143 0.05929
M 0.01962 0.005764 0.01323
F 0.05027 0.03062 0.02753
P 0.03351 0.04575 0.03759
S 0.04986 0.1228 0.0667
T 0.04863 0.09114 0.06194
W 0.01349 0.01585 0.01588
Y 0.02575 0.03963 0.05717
V 0.06825 0.08249 0.06511
  A.Div.B.properties AS3MT.human.aa.freq
A 0.08331 0.064
R 0.03369 0.03733
N 0.04223 0.048
D 0.05631 0.088
C 0.01454 0.03467
Q 0.02631 0.088
E 0.05931 0.02933
G 0.08701 0.06133
H 0.02469 0.072
I 0.05516 0.09333
L 0.07824 0.01333
K 0.07408 0.032
M 0.021 0.032
F 0.03646 0.04
P 0.04339 0.04267
S 0.07547 0.048
T 0.05493 0.05333
W 0.01662 0.07733
Y 0.03 0.008
V 0.08724 0.03733

##Functions to calculate similarities

chou_cor <- function(x,y){
  numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)
}

chou_cosine <- function(z.1, z.2){
  z.1.abs <- sqrt(sum(z.1^2))
  z.2.abs <- sqrt(sum(z.2^2))
  my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs)
  return(my.cosine)
}

Calculate correlation between each column

corr.alpha <- chou_cor(aa.prop[,5], aa.prop[,1])
corr.beta  <- chou_cor(aa.prop[,5], aa.prop[,2])
corr.apb   <- chou_cor(aa.prop[,5], aa.prop[,3])
corr.adb   <- chou_cor(aa.prop[,5], aa.prop[,4])

Calculate cosine similarity

cos.alpha <- chou_cosine(aa.prop[,5], aa.prop[,1])
cos.beta  <- chou_cosine(aa.prop[,5], aa.prop[,2])
cos.apb   <- chou_cosine(aa.prop[,5], aa.prop[,3])
cos.adb   <- chou_cosine(aa.prop[,5], aa.prop[,4])

Calculate distance

aa.prop.flipped <- t(aa.prop)
round(aa.prop.flipped,2) 
##                        A    R    N    D    C    Q    E    G    H    I    L    K
## Alpha.properties    0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09 0.10
## Beta.properties     0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06 0.04
## A.Plus.B.properties 0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06 0.06
## A.Div.B.properties  0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08 0.07
## AS3MT.human.aa.freq 0.06 0.04 0.05 0.09 0.03 0.09 0.03 0.06 0.07 0.09 0.01 0.03
##                        M    F    P    S    T    W    Y    V
## Alpha.properties    0.02 0.05 0.03 0.05 0.05 0.01 0.03 0.07
## Beta.properties     0.01 0.03 0.05 0.12 0.09 0.02 0.04 0.08
## A.Plus.B.properties 0.01 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## A.Div.B.properties  0.02 0.04 0.04 0.08 0.05 0.02 0.03 0.09
## AS3MT.human.aa.freq 0.03 0.04 0.04 0.05 0.05 0.08 0.01 0.04

We can get distance matrix like this

dist(aa.prop.flipped, method = "euclidean")
##                     Alpha.properties Beta.properties A.Plus.B.properties
## Beta.properties           0.13342098                                    
## A.Plus.B.properties       0.09281824      0.08289406                    
## A.Div.B.properties        0.06699039      0.08659174          0.06175113
## AS3MT.human.aa.freq       0.17100713      0.17044689          0.14509680
##                     A.Div.B.properties
## Beta.properties                       
## A.Plus.B.properties                   
## A.Div.B.properties                    
## AS3MT.human.aa.freq         0.15627830

Individual distances

dist.Alpha <- dist((aa.prop.flipped[c(1,5),]),  method = "euclidean")
dist.Beta  <- dist((aa.prop.flipped[c(2,5),]),  method = "euclidean")
dist.APB   <- dist((aa.prop.flipped[c(3,5),]),  method = "euclidean")
dist.ADB  <- dist((aa.prop.flipped[c(4,5),]), method = "euclidean")

Compile the information. Rounding makes it easier to read

# fold types
fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")

# data
corr.sim <- round(c(corr.alpha, corr.beta, corr.apb, corr.adb),5)
cosine.sim <- round(c(cos.alpha, cos.beta, cos.apb, cos.adb),5)
Euclidean.dist <- round(c(dist.Alpha, dist.Beta, dist.APB, dist.ADB),5)

# summary
sim.sum <- c("","","most.sim","")
dist.sum <- c("","","min.dist","")

df <- data.frame(fold.type,
           corr.sim ,
           cosine.sim ,
           Euclidean.dist ,
           sim.sum ,
           dist.sum )

Display Output

pander::pander(df)
fold.type corr.sim cosine.sim Euclidean.dist sim.sum dist.sum
alpha 0.7721 0.7721 0.171
beta 0.7772 0.7772 0.1704
alpha plus beta 0.8252 0.8252 0.1451 most.sim min.dist
alpha/beta 0.8009 0.8009 0.1563

##Percent Identity Coparisons (PID)

names(AS3MT_list)
## [1] "NP_065733.2"    "XP_009457416.2" "XP_006527298.1" "XP_038938624.1"
## [5] "XP_042192919.1" "XP_040848125.1" "XP_013810406.1"
length(AS3MT_list)
## [1] 7
AS3MT_list[1]
## $NP_065733.2
## [1] "MAALRDAEIQKDVQTYYGQVLKRSADLQTNGCVTTARPVPKHIREALQNVHEEVALRYYGCGLVIPEHLENCWILDLGSGSGRDCYVLSQLVGEKGHVTGIDMTKGQVEVAEKYLDYHMEKYGFQASNVTFIHGYIEKLGEAGIKNESHDIVVSNCVINLVPDKQQVLQEAYRVLKHGGELYFSDVYTSLELPEEIRTHKVLWGECLGGALYWKELAVLAQKIGFCPPRLVTANLITIQNKELERVIGDCRFVSATFRLFKHSKTGPTKRCQVIYNGGITGHEKELMFDANFTFKEGEIVEVDEETAAILKNSRFAQDFLIRPIGEKLPTSGGCSALELKDIITDPFKLAEESDSMKSRCVPDAAGGCCGTKKSC"
AS3MT_vector <- unlist(AS3MT_list)

names(AS3MT_vector) <- names(AS3MT_list)

AS3MT_vector[1]
##                                                                                                                                                                                                                                                                                                                                                                               NP_065733.2 
## "MAALRDAEIQKDVQTYYGQVLKRSADLQTNGCVTTARPVPKHIREALQNVHEEVALRYYGCGLVIPEHLENCWILDLGSGSGRDCYVLSQLVGEKGHVTGIDMTKGQVEVAEKYLDYHMEKYGFQASNVTFIHGYIEKLGEAGIKNESHDIVVSNCVINLVPDKQQVLQEAYRVLKHGGELYFSDVYTSLELPEEIRTHKVLWGECLGGALYWKELAVLAQKIGFCPPRLVTANLITIQNKELERVIGDCRFVSATFRLFKHSKTGPTKRCQVIYNGGITGHEKELMFDANFTFKEGEIVEVDEETAAILKNSRFAQDFLIRPIGEKLPTSGGCSALELKDIITDPFKLAEESDSMKSRCVPDAAGGCCGTKKSC"

##PID Table

align.HumVChimp <- Biostrings::pairwiseAlignment(AS3MT_vector[1], AS3MT_vector[2])
Biostrings::pid(align.HumVChimp)
## [1] 98.40849
align.HumVMouse <- Biostrings::pairwiseAlignment(AS3MT_vector[1], AS3MT_vector[3])
Biostrings::pid(align.HumVMouse)
## [1] 72.82609
align.HumVRat <- Biostrings::pairwiseAlignment(AS3MT_vector[1], AS3MT_vector[4])
Biostrings::pid(align.HumVRat)
## [1] 66.13333
align.ChimpVMouse <- Biostrings::pairwiseAlignment(AS3MT_vector[2], AS3MT_vector[3])
Biostrings::pid(align.ChimpVMouse)
## [1] 71.62162
align.ChimpVRat <- Biostrings::pairwiseAlignment(AS3MT_vector[2], AS3MT_vector[4])
Biostrings::pid(align.ChimpVRat)
## [1] 65.25199
align.MouseVRat <- Biostrings::pairwiseAlignment(AS3MT_vector[3], AS3MT_vector[4])
Biostrings::pid(align.MouseVRat)
## [1] 71.34986

Build Matrix

pids <- c(1,                  NA,     NA,     NA,
          pid(align.HumVChimp),          1,     NA,     NA,
          pid(align.HumVMouse), pid(align.ChimpVMouse),      1,     NA,
          pid(align.HumVRat), pid(align.ChimpVRat), pid(align.MouseVRat), 1)

mat <- matrix(pids, nrow = 4, byrow = T)
row.names(mat) <- c("Homo","Pan","Rat","Fish")   
colnames(mat) <- c("Homo","Pan","Rat","Fish")   
pander::pander(mat)  
  Homo Pan Rat Fish
Homo 1 NA NA NA
Pan 98.41 1 NA NA
Rat 72.83 71.62 1 NA
Fish 66.13 65.25 71.35 1

##PID Methods Comparison

pid1 <- pid(align.HumVChimp, type = "PID1")
pid2 <- pid(align.HumVChimp, type = "PID2")
pid3 <- pid(align.HumVChimp, type = "PID3")
pid4 <- pid(align.HumVChimp, type = "PID4")

PID.Method <- c("PID1", "PID2", "PID3", "PID4")
PID.Value <- c(pid1, pid2, pid3, pid4)
Denominator <- c("(aligned positions + internal gap positions)", "(aligned positions)", "(length shorter sequence)", "(average length of the two sequences)")

pid_df <- data.frame(PID.Method, PID.Value, Denominator)
pander::pander(pid_df)
PID.Method PID.Value Denominator
PID1 98.41 (aligned positions + internal gap positions)
PID2 98.93 (aligned positions)
PID3 98.93 (length shorter sequence)
PID4 98.67 (average length of the two sequences)

##Multiple Sequence Alignment

#MSA Data Preparation

AS3MT_vector_ss <- Biostrings::AAStringSet(AS3MT_vector)

#Build MSA

AS3MT_alignment <- msa(AS3MT_vector_ss, method = "ClustalW")
## use default substitution matrix

#Cleaning / Setting up MSA

MSA produces a species MSA objects

class(AS3MT_alignment)
## [1] "MsaAAMultipleAlignment"
## attr(,"package")
## [1] "msa"
is(AS3MT_alignment)
## [1] "MsaAAMultipleAlignment" "AAMultipleAlignment"    "MsaMetaData"           
## [4] "MultipleAlignment"

Default Output of MSA

AS3MT_alignment
## CLUSTAL 2.1  
## 
## Call:
##    msa(AS3MT_vector_ss, method = "ClustalW")
## 
## MsaAAMultipleAlignment with 7 rows and 422 columns
##     aln                                                    names
## [1] --MAALRDA-EIQKDVQTYYGQVLKR...EESDSMKSRCVPDAAGGCCGTKKSC NP_065733.2
## [2] MTLAAPRDA-EIQKDVQTYYGQVLKR...EESDSMKSRCVPDAAGGCCGTKKSC XP_009457416.2
## [3] --MAASRDADEIHKDVQNYYGNVLKT...-------GRSELETKGH-------- XP_006527298.1
## [4] --MAAPRDA-EIHKDVQNYYGNVLKT...EESDKMKPRCAPEGTGGCCGKRKSC XP_038938624.1
## [5] --MATSSDT-EILRDVQEYYGKVLKT...EQMAGVKSGCA--SAGGCCGKPKKC XP_040848125.1
## [6] -------------------------M...EIKP---SGGAPPVAGSCCVTKGCC XP_042192919.1
## [7] --MAAPAGE-QIHRAVQDYYGQELQR...EQPG---APGPACCPASACGPRGCC XP_013810406.1
## Con --MAA?RDA-EI?KDVQ?YYG?VLK?...E?????KSRC?P??AGGCCG??K?C Consensus

Change class of alignment

class(AS3MT_alignment) <- "AAMultipleAlignment"

Convert to Seqinr Format

AS3MT_seqnir <- msaConvert(AS3MT_alignment, type = "seqinr::alignment")

Print new MSA

compbio4all::print_msa(AS3MT_seqnir)
## [1] "--MAALRDA-EIQKDVQTYYGQVLKRSADLQTNGCVTTARPVPKHIREALQNVHEEVALR 0"
## [1] "MTLAAPRDA-EIQKDVQTYYGQVLKRSADLQTNGCVTTARPVPKHIREALQNVHEEVALR 0"
## [1] "--MAASRDADEIHKDVQNYYGNVLKTSADLQTNACVTRAKPVPSYIRESLQNVHEDVSSR 0"
## [1] "--MAAPRDA-EIHKDVQNYYGNVLKTSADLQTNACVTPAKGVPEYIRKSLQNVHEEVTSR 0"
## [1] "--MATSSDT-EILRDVQEYYGKVLKTSADLQANACVTPAGPMPAHVREALRNVHEEVALR 0"
## [1] "-------------------------MSADLKTNVCVTPARALPKHVRAALQDVHGEVAMR 0"
## [1] "--MAAPAGE-QIHRAVQDYYGQELQRSEDLKTNACVTLAGPLPRAAREALERVHGEVVAR 0"
## [1] " "
## [1] "YYGCGLVIPEHLENCWILDLGSGSGRDCYVLSQLVGEKGHVTGIDMTKGQVEVAEKYLDY 0"
## [1] "YYGCGLVIPEHLENCWILDLGSGSGRDCYVLSQLVGEKGHVTGIDMTKGQVEVAEKYLDY 0"
## [1] "YYGCGLTVPERLENCRILDLGSGSGRDCYVLSQLVGEKGHVTGIDMTEVQVEVAKTYLEH 0"
## [1] "YYGCGLVVPEHLENCRILDLGSGSGRDCYVLSQLVGQKGHITGIDMTKVQVEVAKAYLEY 0"
## [1] "YYGCGLVIPERLQSCWVLDLGSGSGRDCYALSQLVGDKGHVTGIDMTGEQVEMAKKYLDY 0"
## [1] "YYGCGLVVPECLENCWILDLGSGSGRDCYMLSKLVGEKGHVTGVDMTDEQIAVAQKFVQY 0"
## [1] "YYGCGLVLPECLASCRVLDLGSGSGRDCYLLSQLVGEEGHVTGIDMTQGQVEVARKHIAY 0"
## [1] " "
## [1] "HMEKYGFQASNVTFIHGYIEKLGEAGIKNESHDIVVSNCVINLVPDKQQVLQEAYRVLKH 0"
## [1] "HMEKYGFQASNVTFIHGYIEKLGGAGIKNESHDIVVSNCVINLVPDKQQVLQEAYRVLKH 0"
## [1] "HMEKFGFQAPNVTFLHGRIEKLAEAGIQSESYDIVISNCVINLVPDKQQVLQEVYRVLKH 0"
## [1] "HTEKFGFQTPNVTFLHGQIEMLAEAGIQKESYDIVISNCVINLVPDKQKVLREVYQVLKY 0"
## [1] "HMEKFGFQTPNVTFIHGYIEKLAEAGIKDESYDIVISNCVINLVPDKQQVLQEAYRVLKH 0"
## [1] "HTQKFGFSKPNVEFIQGYIEKLDDVGLKSNSYDIIISNCVVNLSPDKLSVLREAYRVLKN 0"
## [1] "HMDKFGFRAPNVDFIQGYMEKLGDAGLADESYDIVISNCVINLSPDKGAVLREAYRVLKP 0"
## [1] " "
## [1] "GGELYFSDVYTSLELPEEIRTHKVLWGECLGGALYWKELAVLAQKIGFCPPRLVTANLIT 0"
## [1] "GGELYFSDVYTSLELPEEIRTHKVLWGECLGGALYWKELAVLAQKIGFCPPRLVTANLIT 0"
## [1] "GGELYFSDVYASLEVPEDIKSHKVLWGECLGGALYWKDLAIIAQKIGFCPPRLVTADIIT 0"
## [1] "GGELYFSDVYASLEVSEDIKSHKVLW---------------------------------- 0"
## [1] "GGELYFSDVYASLELSEGVRSHRVLWGECLGGALYWKDLANFAQKIGFCPPRLVAANLVT 0"
## [1] "GGEMYFSDIYSDRELPEKMRKHKVLWGECLGGALSWEELVRIAKEVGFSTPRLTTAKLFD 0"
## [1] "GGEMYFSDIYASRRLSEAARAHRVLWGECLAGALCWRELYSIAQDVGFSPPRLVAASPIT 0"
## [1] " "
## [1] "IQNKELERVIGDCRFVS-----------------------------ATFRLFKHSKTGPT 0"
## [1] "IQNKELERVIGDCRFVS-----------------------------ATFRLFKHSKTGPT 0"
## [1] "VENKELEGVLGDCRFVS-----------------------------ATFRLFKLPKTEPA 0"
## [1] "----------GDCRFVS-----------------------------ATFRLFKLPKTEPA 0"
## [1] "VQNKDLEGVIGDCRFVS-----------------------------ATFRLFKLPKTGPT 0"
## [1] "VNNKELEEVIGDYKFVS-----------------------------ATYRLFKIPAHDSK 0"
## [1] "VGHEELESIVGDCRFVSQAKGAEMGGGKGSPTCASCWGEDGRRTSMGKCRVVGATCSATA 0"
## [1] " "
## [1] "KRCQVIYNGGITGHEKELMFDAN------------FTFKEGEIVEVDEETAAILKNSRFA 0"
## [1] "KRCQVIYNGGITGHEKELMFDAN------------FKFKEGEIVEVDEETAAILKNSRFA 0"
## [1] "ERCRVVYNGGIKGHEKELIFDAN------------FTFKEGEAVAVDEETAAVLKNSRFA 0"
## [1] "GRCQVVYNGGIMGHEKELIFDAN------------FTFKEGEAVEVDEETAAILRNSRFA 0"
## [1] "ERCQVTYRGGIPGHEKELVFDAN------------FTFKKGEALEVDEETAIILKNSRFA 0"
## [1] "EELQVIYNGSIAGCEGKLEFDAN------------YTFEEGEVMDVDEEMAMILKSSRFA 0"
## [1] "GVSEGFGPGCIPAFAPRVVTAASGGSPGCLVPSLSLCPQAGEAVDVDADTAAILRSSRFA 0"
## [1] " "
## [1] "QDFLIRPIGEKLPTSGG---CSALELKDIITDPFKLAEESDSMKSRCVPDAAGGCCGTKK 0"
## [1] "QAFLIRPIGEKLPTSGG---CSALELKDIITDPFKLAEESDSMKSRCVPDAAGGCCGTKK 0"
## [1] "PDFLFTPVDASLP---------APQ-------------------GRSELETKGH------ 0"
## [1] "HDFLFTPVEASLL---------APQTKVIIRDPFKLAEESDKMKPRCAPEGTGGCCGKRK 0"
## [1] "QDFLFEPLTEKLLTSGA---CPVLEPKAVITDPFKLAEQMAGVKSGCA--SAGGCCGKPK 0"
## [1] "DKFTMHPANKVASGEAG---GRARQPKDMIIDPFKFTEIKP---SGGAPPVAGSCCVTKG 0"
## [1] "EDFLIQASGADANADTAPPGCCRGRVKERICDPFQLLEQPG---APGPACCPASACGPRG 0"
## [1] " "
## [1] "SC 58"
## [1] "SC 58"
## [1] "-- 58"
## [1] "SC 58"
## [1] "KC 58"
## [1] "CC 58"
## [1] "CC 58"
## [1] " "

#Finished MSA

ggmsa::ggmsa(AS3MT_alignment, start = 0, end = 50) 

##Distance Matrix

AS3MT_distance <- seqinr::dist.alignment(AS3MT_seqnir, matrix = "identity")
is(AS3MT_distance)
## [1] "dist"     "oldClass"
class(AS3MT_distance)
## [1] "dist"

Round for Display

AS3MT_align_seqinr_rnd <- round(AS3MT_distance, 3)
AS3MT_align_seqinr_rnd
##                NP_065733.2 XP_009457416.2 XP_006527298.1 XP_038938624.1
## XP_009457416.2       0.115                                             
## XP_006527298.1       0.474          0.487                              
## XP_038938624.1       0.487          0.496          0.366               
## XP_040848125.1       0.466          0.477          0.494          0.513
## XP_042192919.1       0.619          0.621          0.642          0.648
## XP_013810406.1       0.674          0.676          0.673          0.680
##                XP_040848125.1 XP_042192919.1
## XP_009457416.2                              
## XP_006527298.1                              
## XP_038938624.1                              
## XP_040848125.1                              
## XP_042192919.1          0.639               
## XP_013810406.1          0.664          0.685

##Phylogenetic Trees of Sequences

Build as phylogenetic tree from distance matrix

tree <- nj(AS3MT_distance)

PLot Phylogenetic Tree

plot.phylo(tree, main = "Phylogenetic Tree\n", use.edge.length = FALSE)
mtext(text = "\nAS3MT family gene tree - rooted, no branch lengths")