##Introduction
This portfolio demonstrates a summary of the information related to the gene Arsenite Methyltransferase (AS3MT)
This includes alignments and phylogeny relations between the human version and other homologs to this gene.
AS3MT catalyzes the transfer of a methyl group from S-adenosyl-L-methionine (AdoMet) to trivalent arsenical and may play a role in arsenic metabolism
##Resources / References
Key information use to make this script can be found here:
Refseq Gene: https://www.ncbi.nlm.nih.gov/gene/1733
Refseq Homologene: https://www.ncbi.nlm.nih.gov/homologene/620
UniProt: https://www.uniprot.org/uniprot/Q9HC16
PDB: https://www.rcsb.org/structure/2KBO
#Preperation
Load packages
library(BiocManager)
## Bioconductor version '3.13' is out-of-date; the current release version '3.14'
## is available with R version '4.1'; see https://bioconductor.org/install
# github packages
library(compbio4all)
library(ggmsa)
## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
# CRAN packages
library(rentrez)
library(seqinr)
library(ape)
##
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
##
## as.alignment, consensus
library(pander)
library(ggplot2)
# Bioconductor packages
library(msa)
## Loading required package: Biostrings
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: XVector
## Loading required package: GenomeInfoDb
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
##
## complement
## The following object is masked from 'package:seqinr':
##
## translate
## The following object is masked from 'package:base':
##
## strsplit
##
## Attaching package: 'msa'
## The following object is masked from 'package:BiocManager':
##
## version
library(drawProteins)
## Biostrings
library(Biostrings)
library(HGNChelper)
## Warning: package 'HGNChelper' was built under R version 4.1.2
Accession numbers were obtained from RefSeq, Refseq Homlogene, UniProt and PDB. NCBI was the source of the RefSeq accession numbers which were then used further in Uniprot and PDB. There are no known protein sequences other than that in Homosapiens, the rest are predicted sequences.
RefSeq.Accession <- c("NP_065733.2", "XP_009457416.2", "XP_006527298.1", "XP_038938624.1", "XP_042192919.1", "XP_040848125.1", "XP_013810406.1")
Uniprot.Accession <- c("Q9HBK9", "NA", "NA", "NA", "NA", "NA", "NA")
PDB <- c("NA", "NA", "NA", "NA", "NA", "NA", "NA")
Scientific.Name <- c("Homo Sapien", "Pan troglodytes", "Mus musculus", "Rattus norvegicus", "Callorhinchus milii", "Ochotona curzoniae", "Apteryx mantelli")
Common.Name <- c("Human", "CHimpanzee", "Mouse", "Rat", "Shark", "Pika", "Kiwi")
Gene.Name <- c("AS3MT", "AS3MT", "As3mt", "As3mt", "as3mt", "AS3MT", "AS3MT")
accession.table <- data.frame(RefSeq.Accession, Uniprot.Accession, PDB, Scientific.Name, Common.Name, Gene.Name)
pander::pander(accession.table)
| RefSeq.Accession | Uniprot.Accession | PDB | Scientific.Name | Common.Name |
|---|---|---|---|---|
| NP_065733.2 | Q9HBK9 | NA | Homo Sapien | Human |
| XP_009457416.2 | NA | NA | Pan troglodytes | CHimpanzee |
| XP_006527298.1 | NA | NA | Mus musculus | Mouse |
| XP_038938624.1 | NA | NA | Rattus norvegicus | Rat |
| XP_042192919.1 | NA | NA | Callorhinchus milii | Shark |
| XP_040848125.1 | NA | NA | Ochotona curzoniae | Pika |
| XP_013810406.1 | NA | NA | Apteryx mantelli | Kiwi |
| Gene.Name |
|---|
| AS3MT |
| AS3MT |
| As3mt |
| As3mt |
| as3mt |
| AS3MT |
| AS3MT |
##Data Preparation
#Downloading Sequences
AS3MT_list <- entrez_fetch_list(db = "protein",
id = RefSeq.Accession,
rettype = "fasta")
Number of Files Obtained
length(AS3MT_list)
## [1] 7
First Entry
AS3MT_list[[1]]
## [1] ">NP_065733.2 arsenite methyltransferase [Homo sapiens]\nMAALRDAEIQKDVQTYYGQVLKRSADLQTNGCVTTARPVPKHIREALQNVHEEVALRYYGCGLVIPEHLE\nNCWILDLGSGSGRDCYVLSQLVGEKGHVTGIDMTKGQVEVAEKYLDYHMEKYGFQASNVTFIHGYIEKLG\nEAGIKNESHDIVVSNCVINLVPDKQQVLQEAYRVLKHGGELYFSDVYTSLELPEEIRTHKVLWGECLGGA\nLYWKELAVLAQKIGFCPPRLVTANLITIQNKELERVIGDCRFVSATFRLFKHSKTGPTKRCQVIYNGGIT\nGHEKELMFDANFTFKEGEIVEVDEETAAILKNSRFAQDFLIRPIGEKLPTSGGCSALELKDIITDPFKLA\nEESDSMKSRCVPDAAGGCCGTKKSC\n\n"
##Initializing Data
#Initial Data Cleaning
Remove FASTA Header
for(i in 1:length(AS3MT_list)){
AS3MT_list[[i]]<-compbio4all::fasta_cleaner(AS3MT_list[[i]], parse = F)
}
##General Protein Diagram
Protein diagram
AS3MT_json <- drawProteins::get_features("Q9HBK9")
## [1] "Download has worked"
is(AS3MT_json)
## [1] "list" "vector" "list_OR_List" "vector_OR_Vector"
## [5] "vector_OR_factor"
my_prot_df <- drawProteins::feature_to_dataframe(AS3MT_json)
is(my_prot_df)
## [1] "data.frame" "list" "oldClass" "vector"
## [5] "list_OR_List" "vector_OR_Vector" "vector_OR_factor"
my_prot_df[,-2]
## type begin end length accession entryName taxid order
## featuresTemp CHAIN 1 375 374 Q9HBK9 AS3MT_HUMAN 9606 1
## featuresTemp.1 MOD_RES 335 335 0 Q9HBK9 AS3MT_HUMAN 9606 1
## featuresTemp.2 VAR_SEQ 58 153 95 Q9HBK9 AS3MT_HUMAN 9606 1
## featuresTemp.3 VARIANT 173 173 0 Q9HBK9 AS3MT_HUMAN 9606 1
## featuresTemp.4 VARIANT 287 287 0 Q9HBK9 AS3MT_HUMAN 9606 1
## featuresTemp.5 VARIANT 306 306 0 Q9HBK9 AS3MT_HUMAN 9606 1
## featuresTemp.6 CONFLICT 125 125 0 Q9HBK9 AS3MT_HUMAN 9606 1
## featuresTemp.7 CONFLICT 132 132 0 Q9HBK9 AS3MT_HUMAN 9606 1
## featuresTemp.8 CONFLICT 135 135 0 Q9HBK9 AS3MT_HUMAN 9606 1
## featuresTemp.9 CONFLICT 140 140 0 Q9HBK9 AS3MT_HUMAN 9606 1
my_prot_df <- drawProteins::feature_to_dataframe(AS3MT_json)
my_canvas <- draw_canvas(my_prot_df)
my_canvas <- draw_chains(my_canvas, my_prot_df, label_size = 2.5)
my_canvas <- draw_regions(my_canvas, my_prot_df)
my_canvas <- draw_motif(my_canvas, my_prot_df)
my_canvas <- draw_phospho(my_canvas, my_prot_df)
my_canvas <- draw_repeat(my_canvas, my_prot_df)
my_canvas <- draw_recept_dom(my_canvas, my_prot_df)
my_canvas <- draw_folding(my_canvas, my_prot_df)
my_canvas
##Dotplot
Prepping Data
AS3MT.human.vector <- fasta_cleaner(AS3MT_list[[1]])
2x2 plots
par(mfrow = c(2,2),
mar = c(0,0,2,1))
#Defaults
dotPlot(AS3MT.human.vector, AS3MT.human.vector, wsize = 1, nmatch = 1, main = "Defaults")
#V2
dotPlot(AS3MT.human.vector, AS3MT.human.vector, wsize = 10, nmatch = 1, main = "size = 10, nmatch = 1")
#V3
dotPlot(AS3MT.human.vector, AS3MT.human.vector, wsize = 10, nmatch = 5, main = "size = 10, nmatch = 5")
#V4
dotPlot(AS3MT.human.vector, AS3MT.human.vector, wsize = 20, nmatch = 5,main = "size = 20, nmatch = 5")
Best Plot
dotPlot(AS3MT.human.vector, AS3MT.human.vector,
wsize = 1,
nmatch = 1,
main = "size = 1, nmatch = 1")
##Protein Properteis Compiled From Databases
PDB: https://www.rcsb.org/structure/6CX6 Uniprot: https://www.UniProt.org/uniprot/Q9HBK9 Pfam: http://pfam.xfam.org/family/PF13847 Alphafold: https://alphafold.ebi.ac.uk/entry/Q9HBK9
URL <- c("PDB", "Uniprot", "Pfam", "Alphafold")
Protein.Property <- c("Classification: Transferase",
"Location: Cytosol",
"Domain: Methyltransferase",
"Structure: Alpha helicies, and beta pleated sheets")
Protein.Properties.dataframe <- data.frame(URL, Protein.Property)
pander::pander(Protein.Properties.dataframe)
| URL | Protein.Property |
|---|---|
| PDB | Classification: Transferase |
| Uniprot | Location: Cytosol |
| Pfam | Domain: Methyltransferase |
| Alphafold | Structure: Alpha helicies, and beta pleated sheets |
##Protein Feature Prediction
Multivariate statistcal techniques were used to confirm the information about protein structure and location in the line database.
##Predict Protein Fold
Alphafold indicates that there are a mix of alpha helices and beta sheets. I predict that a mixture of a+b and a/b structure will be formed through the machine learning process.
Chou and Zhang (1994) Table
aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
"L","K","M","F","P","S","T","W","Y","V")
Alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91,
221, 249, 48, 123, 82, 122, 119, 33, 63, 167)
Beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120,
177, 115, 16, 85, 127, 341, 253, 44, 110, 229)
A.Plus.B <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
110, 112, 25, 52, 71, 126, 117, 30, 108, 123)
A.Div.B <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239,
339, 321, 91, 158, 188, 327, 238, 72, 130, 378)
pander(data.frame(aa.1.1, Alpha, Beta, A.Plus.B, A.Div.B))
| aa.1.1 | Alpha | Beta | A.Plus.B | A.Div.B |
|---|---|---|---|---|
| A | 285 | 203 | 175 | 361 |
| R | 53 | 67 | 78 | 146 |
| N | 97 | 139 | 120 | 183 |
| D | 163 | 121 | 111 | 244 |
| C | 22 | 75 | 74 | 63 |
| Q | 67 | 122 | 74 | 114 |
| E | 134 | 86 | 86 | 257 |
| G | 197 | 297 | 171 | 377 |
| H | 111 | 49 | 33 | 107 |
| I | 91 | 120 | 93 | 239 |
| L | 221 | 177 | 110 | 339 |
| K | 249 | 115 | 112 | 321 |
| M | 48 | 16 | 25 | 91 |
| F | 123 | 85 | 52 | 158 |
| P | 82 | 127 | 71 | 188 |
| S | 122 | 341 | 126 | 327 |
| T | 119 | 253 | 117 | 238 |
| W | 33 | 44 | 30 | 72 |
| Y | 63 | 110 | 108 | 130 |
| V | 167 | 229 | 123 | 378 |
Convert to Frequencies
Alpha.properties <- Alpha/sum(Alpha)
Beta.properties <- Beta/sum(Beta)
A.Plus.B.properties <- A.Plus.B/sum(A.Plus.B)
A.Div.B.properties <- A.Div.B/sum(A.Div.B)
aa.prop <- data.frame(Alpha.properties,
Beta.properties,
A.Plus.B.properties,
A.Div.B.properties)
row.names(aa.prop) <- aa.1.1
pander::pander(aa.prop)
| Alpha.properties | Beta.properties | A.Plus.B.properties | |
|---|---|---|---|
| A | 0.1165 | 0.07313 | 0.09264 |
| R | 0.02166 | 0.02414 | 0.04129 |
| N | 0.03964 | 0.05007 | 0.06353 |
| D | 0.06661 | 0.04359 | 0.05876 |
| C | 0.008991 | 0.02702 | 0.03917 |
| Q | 0.02738 | 0.04395 | 0.03917 |
| E | 0.05476 | 0.03098 | 0.04553 |
| G | 0.08051 | 0.107 | 0.09052 |
| H | 0.04536 | 0.01765 | 0.01747 |
| I | 0.03719 | 0.04323 | 0.04923 |
| L | 0.09031 | 0.06376 | 0.05823 |
| K | 0.1018 | 0.04143 | 0.05929 |
| M | 0.01962 | 0.005764 | 0.01323 |
| F | 0.05027 | 0.03062 | 0.02753 |
| P | 0.03351 | 0.04575 | 0.03759 |
| S | 0.04986 | 0.1228 | 0.0667 |
| T | 0.04863 | 0.09114 | 0.06194 |
| W | 0.01349 | 0.01585 | 0.01588 |
| Y | 0.02575 | 0.03963 | 0.05717 |
| V | 0.06825 | 0.08249 | 0.06511 |
| A.Div.B.properties | |
|---|---|
| A | 0.08331 |
| R | 0.03369 |
| N | 0.04223 |
| D | 0.05631 |
| C | 0.01454 |
| Q | 0.02631 |
| E | 0.05931 |
| G | 0.08701 |
| H | 0.02469 |
| I | 0.05516 |
| L | 0.07824 |
| K | 0.07408 |
| M | 0.021 |
| F | 0.03646 |
| P | 0.04339 |
| S | 0.07547 |
| T | 0.05493 |
| W | 0.01662 |
| Y | 0.03 |
| V | 0.08724 |
Determining number of each amino acid in AS3MT
AS3MT.table <- table(AS3MT.human.vector)/length(AS3MT.human.vector)
Table to vector function
table_to_vector <- function(table_x){
table_names <- attr(table_x, "dimnames")[[1]]
table_vect <- as.vector(table_x)
names(table_vect) <- table_names
return(table_vect)
}
AS3MT.human.table <- table(AS3MT.human.vector)/length(AS3MT.human.vector)
AS3MT.human.aa.freq <- table_to_vector(AS3MT.human.table)
pander::pander(AS3MT.human.aa.freq)
| A | C | D | E | F | G | H | I | K |
|---|---|---|---|---|---|---|---|---|
| 0.064 | 0.03733 | 0.048 | 0.088 | 0.03467 | 0.088 | 0.02933 | 0.06133 | 0.072 |
| L | M | N | P | Q | R | S | T | V |
|---|---|---|---|---|---|---|---|---|
| 0.09333 | 0.01333 | 0.032 | 0.032 | 0.04 | 0.04267 | 0.048 | 0.05333 | 0.07733 |
| W | Y |
|---|---|
| 0.008 | 0.03733 |
Checking for “U” (UNknown) presence
aa.names <- names(AS3MT.human.aa.freq)
i.U <- which(aa.names == "U")
aa.names[i.U]
## character(0)
AS3MT.human.aa.freq[i.U]
## named numeric(0)
Add AS3MT data
aa.prop$AS3MT.human.aa.freq <- AS3MT.human.aa.freq
pander::pander(aa.prop)
| Alpha.properties | Beta.properties | A.Plus.B.properties | |
|---|---|---|---|
| A | 0.1165 | 0.07313 | 0.09264 |
| R | 0.02166 | 0.02414 | 0.04129 |
| N | 0.03964 | 0.05007 | 0.06353 |
| D | 0.06661 | 0.04359 | 0.05876 |
| C | 0.008991 | 0.02702 | 0.03917 |
| Q | 0.02738 | 0.04395 | 0.03917 |
| E | 0.05476 | 0.03098 | 0.04553 |
| G | 0.08051 | 0.107 | 0.09052 |
| H | 0.04536 | 0.01765 | 0.01747 |
| I | 0.03719 | 0.04323 | 0.04923 |
| L | 0.09031 | 0.06376 | 0.05823 |
| K | 0.1018 | 0.04143 | 0.05929 |
| M | 0.01962 | 0.005764 | 0.01323 |
| F | 0.05027 | 0.03062 | 0.02753 |
| P | 0.03351 | 0.04575 | 0.03759 |
| S | 0.04986 | 0.1228 | 0.0667 |
| T | 0.04863 | 0.09114 | 0.06194 |
| W | 0.01349 | 0.01585 | 0.01588 |
| Y | 0.02575 | 0.03963 | 0.05717 |
| V | 0.06825 | 0.08249 | 0.06511 |
| A.Div.B.properties | AS3MT.human.aa.freq | |
|---|---|---|
| A | 0.08331 | 0.064 |
| R | 0.03369 | 0.03733 |
| N | 0.04223 | 0.048 |
| D | 0.05631 | 0.088 |
| C | 0.01454 | 0.03467 |
| Q | 0.02631 | 0.088 |
| E | 0.05931 | 0.02933 |
| G | 0.08701 | 0.06133 |
| H | 0.02469 | 0.072 |
| I | 0.05516 | 0.09333 |
| L | 0.07824 | 0.01333 |
| K | 0.07408 | 0.032 |
| M | 0.021 | 0.032 |
| F | 0.03646 | 0.04 |
| P | 0.04339 | 0.04267 |
| S | 0.07547 | 0.048 |
| T | 0.05493 | 0.05333 |
| W | 0.01662 | 0.07733 |
| Y | 0.03 | 0.008 |
| V | 0.08724 | 0.03733 |
##Functions to calculate similarities
chou_cor <- function(x,y){
numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)
}
chou_cosine <- function(z.1, z.2){
z.1.abs <- sqrt(sum(z.1^2))
z.2.abs <- sqrt(sum(z.2^2))
my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs)
return(my.cosine)
}
Calculate correlation between each column
corr.alpha <- chou_cor(aa.prop[,5], aa.prop[,1])
corr.beta <- chou_cor(aa.prop[,5], aa.prop[,2])
corr.apb <- chou_cor(aa.prop[,5], aa.prop[,3])
corr.adb <- chou_cor(aa.prop[,5], aa.prop[,4])
Calculate cosine similarity
cos.alpha <- chou_cosine(aa.prop[,5], aa.prop[,1])
cos.beta <- chou_cosine(aa.prop[,5], aa.prop[,2])
cos.apb <- chou_cosine(aa.prop[,5], aa.prop[,3])
cos.adb <- chou_cosine(aa.prop[,5], aa.prop[,4])
Calculate distance
aa.prop.flipped <- t(aa.prop)
round(aa.prop.flipped,2)
## A R N D C Q E G H I L K
## Alpha.properties 0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09 0.10
## Beta.properties 0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06 0.04
## A.Plus.B.properties 0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06 0.06
## A.Div.B.properties 0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08 0.07
## AS3MT.human.aa.freq 0.06 0.04 0.05 0.09 0.03 0.09 0.03 0.06 0.07 0.09 0.01 0.03
## M F P S T W Y V
## Alpha.properties 0.02 0.05 0.03 0.05 0.05 0.01 0.03 0.07
## Beta.properties 0.01 0.03 0.05 0.12 0.09 0.02 0.04 0.08
## A.Plus.B.properties 0.01 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## A.Div.B.properties 0.02 0.04 0.04 0.08 0.05 0.02 0.03 0.09
## AS3MT.human.aa.freq 0.03 0.04 0.04 0.05 0.05 0.08 0.01 0.04
We can get distance matrix like this
dist(aa.prop.flipped, method = "euclidean")
## Alpha.properties Beta.properties A.Plus.B.properties
## Beta.properties 0.13342098
## A.Plus.B.properties 0.09281824 0.08289406
## A.Div.B.properties 0.06699039 0.08659174 0.06175113
## AS3MT.human.aa.freq 0.17100713 0.17044689 0.14509680
## A.Div.B.properties
## Beta.properties
## A.Plus.B.properties
## A.Div.B.properties
## AS3MT.human.aa.freq 0.15627830
Individual distances
dist.Alpha <- dist((aa.prop.flipped[c(1,5),]), method = "euclidean")
dist.Beta <- dist((aa.prop.flipped[c(2,5),]), method = "euclidean")
dist.APB <- dist((aa.prop.flipped[c(3,5),]), method = "euclidean")
dist.ADB <- dist((aa.prop.flipped[c(4,5),]), method = "euclidean")
Compile the information. Rounding makes it easier to read
# fold types
fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")
# data
corr.sim <- round(c(corr.alpha, corr.beta, corr.apb, corr.adb),5)
cosine.sim <- round(c(cos.alpha, cos.beta, cos.apb, cos.adb),5)
Euclidean.dist <- round(c(dist.Alpha, dist.Beta, dist.APB, dist.ADB),5)
# summary
sim.sum <- c("","","most.sim","")
dist.sum <- c("","","min.dist","")
df <- data.frame(fold.type,
corr.sim ,
cosine.sim ,
Euclidean.dist ,
sim.sum ,
dist.sum )
Display Output
pander::pander(df)
| fold.type | corr.sim | cosine.sim | Euclidean.dist | sim.sum | dist.sum |
|---|---|---|---|---|---|
| alpha | 0.7721 | 0.7721 | 0.171 | ||
| beta | 0.7772 | 0.7772 | 0.1704 | ||
| alpha plus beta | 0.8252 | 0.8252 | 0.1451 | most.sim | min.dist |
| alpha/beta | 0.8009 | 0.8009 | 0.1563 |
##Percent Identity Coparisons (PID)
names(AS3MT_list)
## [1] "NP_065733.2" "XP_009457416.2" "XP_006527298.1" "XP_038938624.1"
## [5] "XP_042192919.1" "XP_040848125.1" "XP_013810406.1"
length(AS3MT_list)
## [1] 7
AS3MT_list[1]
## $NP_065733.2
## [1] "MAALRDAEIQKDVQTYYGQVLKRSADLQTNGCVTTARPVPKHIREALQNVHEEVALRYYGCGLVIPEHLENCWILDLGSGSGRDCYVLSQLVGEKGHVTGIDMTKGQVEVAEKYLDYHMEKYGFQASNVTFIHGYIEKLGEAGIKNESHDIVVSNCVINLVPDKQQVLQEAYRVLKHGGELYFSDVYTSLELPEEIRTHKVLWGECLGGALYWKELAVLAQKIGFCPPRLVTANLITIQNKELERVIGDCRFVSATFRLFKHSKTGPTKRCQVIYNGGITGHEKELMFDANFTFKEGEIVEVDEETAAILKNSRFAQDFLIRPIGEKLPTSGGCSALELKDIITDPFKLAEESDSMKSRCVPDAAGGCCGTKKSC"
AS3MT_vector <- unlist(AS3MT_list)
names(AS3MT_vector) <- names(AS3MT_list)
AS3MT_vector[1]
## NP_065733.2
## "MAALRDAEIQKDVQTYYGQVLKRSADLQTNGCVTTARPVPKHIREALQNVHEEVALRYYGCGLVIPEHLENCWILDLGSGSGRDCYVLSQLVGEKGHVTGIDMTKGQVEVAEKYLDYHMEKYGFQASNVTFIHGYIEKLGEAGIKNESHDIVVSNCVINLVPDKQQVLQEAYRVLKHGGELYFSDVYTSLELPEEIRTHKVLWGECLGGALYWKELAVLAQKIGFCPPRLVTANLITIQNKELERVIGDCRFVSATFRLFKHSKTGPTKRCQVIYNGGITGHEKELMFDANFTFKEGEIVEVDEETAAILKNSRFAQDFLIRPIGEKLPTSGGCSALELKDIITDPFKLAEESDSMKSRCVPDAAGGCCGTKKSC"
##PID Table
align.HumVChimp <- Biostrings::pairwiseAlignment(AS3MT_vector[1], AS3MT_vector[2])
Biostrings::pid(align.HumVChimp)
## [1] 98.40849
align.HumVMouse <- Biostrings::pairwiseAlignment(AS3MT_vector[1], AS3MT_vector[3])
Biostrings::pid(align.HumVMouse)
## [1] 72.82609
align.HumVRat <- Biostrings::pairwiseAlignment(AS3MT_vector[1], AS3MT_vector[4])
Biostrings::pid(align.HumVRat)
## [1] 66.13333
align.ChimpVMouse <- Biostrings::pairwiseAlignment(AS3MT_vector[2], AS3MT_vector[3])
Biostrings::pid(align.ChimpVMouse)
## [1] 71.62162
align.ChimpVRat <- Biostrings::pairwiseAlignment(AS3MT_vector[2], AS3MT_vector[4])
Biostrings::pid(align.ChimpVRat)
## [1] 65.25199
align.MouseVRat <- Biostrings::pairwiseAlignment(AS3MT_vector[3], AS3MT_vector[4])
Biostrings::pid(align.MouseVRat)
## [1] 71.34986
Build Matrix
pids <- c(1, NA, NA, NA,
pid(align.HumVChimp), 1, NA, NA,
pid(align.HumVMouse), pid(align.ChimpVMouse), 1, NA,
pid(align.HumVRat), pid(align.ChimpVRat), pid(align.MouseVRat), 1)
mat <- matrix(pids, nrow = 4, byrow = T)
row.names(mat) <- c("Homo","Pan","Rat","Fish")
colnames(mat) <- c("Homo","Pan","Rat","Fish")
pander::pander(mat)
| Homo | Pan | Rat | Fish | |
|---|---|---|---|---|
| Homo | 1 | NA | NA | NA |
| Pan | 98.41 | 1 | NA | NA |
| Rat | 72.83 | 71.62 | 1 | NA |
| Fish | 66.13 | 65.25 | 71.35 | 1 |
##PID Methods Comparison
pid1 <- pid(align.HumVChimp, type = "PID1")
pid2 <- pid(align.HumVChimp, type = "PID2")
pid3 <- pid(align.HumVChimp, type = "PID3")
pid4 <- pid(align.HumVChimp, type = "PID4")
PID.Method <- c("PID1", "PID2", "PID3", "PID4")
PID.Value <- c(pid1, pid2, pid3, pid4)
Denominator <- c("(aligned positions + internal gap positions)", "(aligned positions)", "(length shorter sequence)", "(average length of the two sequences)")
pid_df <- data.frame(PID.Method, PID.Value, Denominator)
pander::pander(pid_df)
| PID.Method | PID.Value | Denominator |
|---|---|---|
| PID1 | 98.41 | (aligned positions + internal gap positions) |
| PID2 | 98.93 | (aligned positions) |
| PID3 | 98.93 | (length shorter sequence) |
| PID4 | 98.67 | (average length of the two sequences) |
##Multiple Sequence Alignment
#MSA Data Preparation
AS3MT_vector_ss <- Biostrings::AAStringSet(AS3MT_vector)
#Build MSA
AS3MT_alignment <- msa(AS3MT_vector_ss, method = "ClustalW")
## use default substitution matrix
#Cleaning / Setting up MSA
MSA produces a species MSA objects
class(AS3MT_alignment)
## [1] "MsaAAMultipleAlignment"
## attr(,"package")
## [1] "msa"
is(AS3MT_alignment)
## [1] "MsaAAMultipleAlignment" "AAMultipleAlignment" "MsaMetaData"
## [4] "MultipleAlignment"
Default Output of MSA
AS3MT_alignment
## CLUSTAL 2.1
##
## Call:
## msa(AS3MT_vector_ss, method = "ClustalW")
##
## MsaAAMultipleAlignment with 7 rows and 422 columns
## aln names
## [1] --MAALRDA-EIQKDVQTYYGQVLKR...EESDSMKSRCVPDAAGGCCGTKKSC NP_065733.2
## [2] MTLAAPRDA-EIQKDVQTYYGQVLKR...EESDSMKSRCVPDAAGGCCGTKKSC XP_009457416.2
## [3] --MAASRDADEIHKDVQNYYGNVLKT...-------GRSELETKGH-------- XP_006527298.1
## [4] --MAAPRDA-EIHKDVQNYYGNVLKT...EESDKMKPRCAPEGTGGCCGKRKSC XP_038938624.1
## [5] --MATSSDT-EILRDVQEYYGKVLKT...EQMAGVKSGCA--SAGGCCGKPKKC XP_040848125.1
## [6] -------------------------M...EIKP---SGGAPPVAGSCCVTKGCC XP_042192919.1
## [7] --MAAPAGE-QIHRAVQDYYGQELQR...EQPG---APGPACCPASACGPRGCC XP_013810406.1
## Con --MAA?RDA-EI?KDVQ?YYG?VLK?...E?????KSRC?P??AGGCCG??K?C Consensus
Change class of alignment
class(AS3MT_alignment) <- "AAMultipleAlignment"
Convert to Seqinr Format
AS3MT_seqnir <- msaConvert(AS3MT_alignment, type = "seqinr::alignment")
Print new MSA
compbio4all::print_msa(AS3MT_seqnir)
## [1] "--MAALRDA-EIQKDVQTYYGQVLKRSADLQTNGCVTTARPVPKHIREALQNVHEEVALR 0"
## [1] "MTLAAPRDA-EIQKDVQTYYGQVLKRSADLQTNGCVTTARPVPKHIREALQNVHEEVALR 0"
## [1] "--MAASRDADEIHKDVQNYYGNVLKTSADLQTNACVTRAKPVPSYIRESLQNVHEDVSSR 0"
## [1] "--MAAPRDA-EIHKDVQNYYGNVLKTSADLQTNACVTPAKGVPEYIRKSLQNVHEEVTSR 0"
## [1] "--MATSSDT-EILRDVQEYYGKVLKTSADLQANACVTPAGPMPAHVREALRNVHEEVALR 0"
## [1] "-------------------------MSADLKTNVCVTPARALPKHVRAALQDVHGEVAMR 0"
## [1] "--MAAPAGE-QIHRAVQDYYGQELQRSEDLKTNACVTLAGPLPRAAREALERVHGEVVAR 0"
## [1] " "
## [1] "YYGCGLVIPEHLENCWILDLGSGSGRDCYVLSQLVGEKGHVTGIDMTKGQVEVAEKYLDY 0"
## [1] "YYGCGLVIPEHLENCWILDLGSGSGRDCYVLSQLVGEKGHVTGIDMTKGQVEVAEKYLDY 0"
## [1] "YYGCGLTVPERLENCRILDLGSGSGRDCYVLSQLVGEKGHVTGIDMTEVQVEVAKTYLEH 0"
## [1] "YYGCGLVVPEHLENCRILDLGSGSGRDCYVLSQLVGQKGHITGIDMTKVQVEVAKAYLEY 0"
## [1] "YYGCGLVIPERLQSCWVLDLGSGSGRDCYALSQLVGDKGHVTGIDMTGEQVEMAKKYLDY 0"
## [1] "YYGCGLVVPECLENCWILDLGSGSGRDCYMLSKLVGEKGHVTGVDMTDEQIAVAQKFVQY 0"
## [1] "YYGCGLVLPECLASCRVLDLGSGSGRDCYLLSQLVGEEGHVTGIDMTQGQVEVARKHIAY 0"
## [1] " "
## [1] "HMEKYGFQASNVTFIHGYIEKLGEAGIKNESHDIVVSNCVINLVPDKQQVLQEAYRVLKH 0"
## [1] "HMEKYGFQASNVTFIHGYIEKLGGAGIKNESHDIVVSNCVINLVPDKQQVLQEAYRVLKH 0"
## [1] "HMEKFGFQAPNVTFLHGRIEKLAEAGIQSESYDIVISNCVINLVPDKQQVLQEVYRVLKH 0"
## [1] "HTEKFGFQTPNVTFLHGQIEMLAEAGIQKESYDIVISNCVINLVPDKQKVLREVYQVLKY 0"
## [1] "HMEKFGFQTPNVTFIHGYIEKLAEAGIKDESYDIVISNCVINLVPDKQQVLQEAYRVLKH 0"
## [1] "HTQKFGFSKPNVEFIQGYIEKLDDVGLKSNSYDIIISNCVVNLSPDKLSVLREAYRVLKN 0"
## [1] "HMDKFGFRAPNVDFIQGYMEKLGDAGLADESYDIVISNCVINLSPDKGAVLREAYRVLKP 0"
## [1] " "
## [1] "GGELYFSDVYTSLELPEEIRTHKVLWGECLGGALYWKELAVLAQKIGFCPPRLVTANLIT 0"
## [1] "GGELYFSDVYTSLELPEEIRTHKVLWGECLGGALYWKELAVLAQKIGFCPPRLVTANLIT 0"
## [1] "GGELYFSDVYASLEVPEDIKSHKVLWGECLGGALYWKDLAIIAQKIGFCPPRLVTADIIT 0"
## [1] "GGELYFSDVYASLEVSEDIKSHKVLW---------------------------------- 0"
## [1] "GGELYFSDVYASLELSEGVRSHRVLWGECLGGALYWKDLANFAQKIGFCPPRLVAANLVT 0"
## [1] "GGEMYFSDIYSDRELPEKMRKHKVLWGECLGGALSWEELVRIAKEVGFSTPRLTTAKLFD 0"
## [1] "GGEMYFSDIYASRRLSEAARAHRVLWGECLAGALCWRELYSIAQDVGFSPPRLVAASPIT 0"
## [1] " "
## [1] "IQNKELERVIGDCRFVS-----------------------------ATFRLFKHSKTGPT 0"
## [1] "IQNKELERVIGDCRFVS-----------------------------ATFRLFKHSKTGPT 0"
## [1] "VENKELEGVLGDCRFVS-----------------------------ATFRLFKLPKTEPA 0"
## [1] "----------GDCRFVS-----------------------------ATFRLFKLPKTEPA 0"
## [1] "VQNKDLEGVIGDCRFVS-----------------------------ATFRLFKLPKTGPT 0"
## [1] "VNNKELEEVIGDYKFVS-----------------------------ATYRLFKIPAHDSK 0"
## [1] "VGHEELESIVGDCRFVSQAKGAEMGGGKGSPTCASCWGEDGRRTSMGKCRVVGATCSATA 0"
## [1] " "
## [1] "KRCQVIYNGGITGHEKELMFDAN------------FTFKEGEIVEVDEETAAILKNSRFA 0"
## [1] "KRCQVIYNGGITGHEKELMFDAN------------FKFKEGEIVEVDEETAAILKNSRFA 0"
## [1] "ERCRVVYNGGIKGHEKELIFDAN------------FTFKEGEAVAVDEETAAVLKNSRFA 0"
## [1] "GRCQVVYNGGIMGHEKELIFDAN------------FTFKEGEAVEVDEETAAILRNSRFA 0"
## [1] "ERCQVTYRGGIPGHEKELVFDAN------------FTFKKGEALEVDEETAIILKNSRFA 0"
## [1] "EELQVIYNGSIAGCEGKLEFDAN------------YTFEEGEVMDVDEEMAMILKSSRFA 0"
## [1] "GVSEGFGPGCIPAFAPRVVTAASGGSPGCLVPSLSLCPQAGEAVDVDADTAAILRSSRFA 0"
## [1] " "
## [1] "QDFLIRPIGEKLPTSGG---CSALELKDIITDPFKLAEESDSMKSRCVPDAAGGCCGTKK 0"
## [1] "QAFLIRPIGEKLPTSGG---CSALELKDIITDPFKLAEESDSMKSRCVPDAAGGCCGTKK 0"
## [1] "PDFLFTPVDASLP---------APQ-------------------GRSELETKGH------ 0"
## [1] "HDFLFTPVEASLL---------APQTKVIIRDPFKLAEESDKMKPRCAPEGTGGCCGKRK 0"
## [1] "QDFLFEPLTEKLLTSGA---CPVLEPKAVITDPFKLAEQMAGVKSGCA--SAGGCCGKPK 0"
## [1] "DKFTMHPANKVASGEAG---GRARQPKDMIIDPFKFTEIKP---SGGAPPVAGSCCVTKG 0"
## [1] "EDFLIQASGADANADTAPPGCCRGRVKERICDPFQLLEQPG---APGPACCPASACGPRG 0"
## [1] " "
## [1] "SC 58"
## [1] "SC 58"
## [1] "-- 58"
## [1] "SC 58"
## [1] "KC 58"
## [1] "CC 58"
## [1] "CC 58"
## [1] " "
#Finished MSA
ggmsa::ggmsa(AS3MT_alignment, start = 0, end = 50)
##Distance Matrix
AS3MT_distance <- seqinr::dist.alignment(AS3MT_seqnir, matrix = "identity")
is(AS3MT_distance)
## [1] "dist" "oldClass"
class(AS3MT_distance)
## [1] "dist"
Round for Display
AS3MT_align_seqinr_rnd <- round(AS3MT_distance, 3)
AS3MT_align_seqinr_rnd
## NP_065733.2 XP_009457416.2 XP_006527298.1 XP_038938624.1
## XP_009457416.2 0.115
## XP_006527298.1 0.474 0.487
## XP_038938624.1 0.487 0.496 0.366
## XP_040848125.1 0.466 0.477 0.494 0.513
## XP_042192919.1 0.619 0.621 0.642 0.648
## XP_013810406.1 0.674 0.676 0.673 0.680
## XP_040848125.1 XP_042192919.1
## XP_009457416.2
## XP_006527298.1
## XP_038938624.1
## XP_040848125.1
## XP_042192919.1 0.639
## XP_013810406.1 0.664 0.685
##Phylogenetic Trees of Sequences
Build as phylogenetic tree from distance matrix
tree <- nj(AS3MT_distance)
PLot Phylogenetic Tree
plot.phylo(tree, main = "Phylogenetic Tree\n", use.edge.length = FALSE)
mtext(text = "\nAS3MT family gene tree - rooted, no branch lengths")