This code compiles summary information about the gene PRKG1 (Protein Kinase CGMP-Dependent).
It also generates alignments and a phylogeneitc tree to indicate the evolutionary relationship betweeen the human version of the gene and its homologs in other species.
Key information use to make this script can be found here:
Refseq Gene: https://www.ncbi.nlm.nih.gov/gene/1733 Refseq Homologene: https://www.ncbi.nlm.nih.gov/homologene/620 Other resources consulted includes
Neanderthal genome: http://neandertal.ensemblgenomes.org/index.html Other interesting resources and online tools include:
REPPER: https://toolkit.tuebingen.mpg.de/jobs/7803820 Sub-cellular locations prediction: https://wolfpsort.hgc.jp/
Load necessary packages:
Download and load drawProteins from Bioconductor
library(BiocManager)
## Bioconductor version '3.13' is out-of-date; the current release version '3.14'
## is available with R version '4.1'; see https://bioconductor.org/install
#install("drawProteins")
library(drawProteins)
Load other packages
# github packages
library(compbio4all)
library(ggmsa)
## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
# CRAN packages
library(rentrez)
library(seqinr)
library(ape)
##
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
##
## as.alignment, consensus
library(pander)
library(ggplot2)
# Bioconductor packages
## msa
### The msa package is having problems on some platforms
### You can skip the msa steps if necessary. The msa output
### is used to make a distance matrix and then phylogenetics trees,
### but I provide code to build the matrix by hand so
### you can proceed even if msa doesn't work for you.
## Biostrings
library(Biostrings)
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: XVector
## Loading required package: GenomeInfoDb
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
##
## complement
## The following object is masked from 'package:seqinr':
##
## translate
## The following object is masked from 'package:base':
##
## strsplit
library(msa)
##
## Attaching package: 'msa'
## The following object is masked from 'package:BiocManager':
##
## version
library(drawProteins)
#library(HGNChelper)
Accession numbers were obtained from RefSeq, Refseq Homlogene, UniProt and PDB. UniProt accession numbers can be found by searching for the gene name. PDB accessions can be found by searching with a UniProt accession or a gene name, though many proteins are not in PDB. The the Neanderthal genome database was searched but did not yield sequence information on.
A protein BLAST search (https://blast.ncbi.nlm.nih.gov/Blast.cgi?PROGRAM=blastp&PAGE_TYPE=BlastSearch&LINK_LOC=blasthome) was carried out excluding vertebrates to determine if it occurred outside of vertebreates. The gene does not appear in non-vertebrates and so a second search was conducted to exclude mammals.
OPTIONAL: Use the function to confirm the validity of your gene name and any aliases
# 10 RefSeq or Genbank PROTEIN accession numbers for sequences for adopted gene from 10 species
# Max of 7 obtained via Homologene
# Other 3:
# Neanderthal
# Drosophila
# obtain from BLAST
# UniProt Accession Numbers
# PDB Accession numbers
# Common name for each species
# Scientific name for each species
# this is optional
#HGNChelper::checkGeneSymbols(x = c("PRKG1"))
# can make table using matrix()
# TODO: Add additional sequences to get a total of 10
prkg1_table_vector <- c("NP_006249", "Q13976", "1ZXA","Homo sapiens", "Human", "PRKG1",
"NP_776861","P00516", "3SHR", "Bos taurus" , "Cattle","PRKG1",
"NP_001013855", "P0C605","NA","Mus musculus", "Mouse" ,"PRKG1",
"NP_001099201","NA","NA","Rattus norvegicus", "Rat", "PRKG1",
"NP_957324", "Q7T2E5","NA","Danio Rerio", "Zebrafish", "PRKG1",
"NP_477488","NA","NA", "Drosophila melanogaster", "Fruit Fly", "PRKG1",
"XP_001162858", "H2Q1X0", "NA", "Pan troglodytes", "Chimpanzee", "PRKG1",
"XP_851997", "J9JHE0", "NA", "Canis Lupus Familiaris", "Dog", "PRKG1",
"XP_003641507", "NA", "NA", "Gallus gallus", "Chicken", "PRKG1",
"XP_002935474", "A0A6I8R7F3", "NA", "Xenos Tropicalis", "Frog", "PRKG1")
prkg1_matrix <- matrix( prkg1_table_vector, ncol = 6, byrow = TRUE)
?matrix
prkg1_df <- data.frame( prkg1_matrix )
colnames( prkg1_df ) <- c("ncbi.protein.accession", "UniProt.id", "PDB", "species", "common.name",
"gene.name")
# names of columns
# ncbi.protein.accession
# UniProt.id
# PDB
# species
# common.name
# gene.name
The finished table
pander::pander( prkg1_df )
| ncbi.protein.accession | UniProt.id | PDB | species |
|---|---|---|---|
| NP_006249 | Q13976 | 1ZXA | Homo sapiens |
| NP_776861 | P00516 | 3SHR | Bos taurus |
| NP_001013855 | P0C605 | NA | Mus musculus |
| NP_001099201 | NA | NA | Rattus norvegicus |
| NP_957324 | Q7T2E5 | NA | Danio Rerio |
| NP_477488 | NA | NA | Drosophila melanogaster |
| XP_001162858 | H2Q1X0 | NA | Pan troglodytes |
| XP_851997 | J9JHE0 | NA | Canis Lupus Familiaris |
| XP_003641507 | NA | NA | Gallus gallus |
| XP_002935474 | A0A6I8R7F3 | NA | Xenos Tropicalis |
| common.name | gene.name |
|---|---|
| Human | PRKG1 |
| Cattle | PRKG1 |
| Mouse | PRKG1 |
| Rat | PRKG1 |
| Zebrafish | PRKG1 |
| Fruit Fly | PRKG1 |
| Chimpanzee | PRKG1 |
| Dog | PRKG1 |
| Chicken | PRKG1 |
| Frog | PRKG1 |
All sequences were downloaded using a wrapper compbio4all::entrez_fetch_list() which uses rentrez::entrez_fetch() to access NCBI databases.
# download FASTA sequences
# entrez_fetch_list is a wrapper
# uses rentrez::entrez_fetch() to access NCBI databases
prkg1_list <- compbio4all::entrez_fetch_list( db = "protein",
id = prkg1_df$ncbi.protein.accession,
rettype = "fasta"
)
Number of FASTA files obtained
length( prkg1_list )
## [1] 10
# 10
The first entry
prkg1_list[[1]]
## [1] ">NP_006249.1 cGMP-dependent protein kinase 1 isoform 2 [Homo sapiens]\nMGTLRDLQYALQEKIEELRQRDALIDELELELDQKDELIQKLQNELDKYRSVIRPATQQAQKQSASTLQG\nEPRTKRQAISAEPTAFDIQDLSHVTLPFYPKSPQSKDLIKEAILDNDFMKNLELSQIQEIVDCMYPVEYG\nKDSCIIKEGDVGSLVYVMEDGKVEVTKEGVKLCTMGPGKVFGELAILYNCTRTATVKTLVNVKLWAIDRQ\nCFQTIMMRTGLIKHTEYMEFLKSVPTFQSLPEEILSKLADVLEETHYENGEYIIRQGARGDTFFIISKGT\nVNVTREDSPSEDPVFLRTLGKGDWFGEKALQGEDVRTANVIAAEAVTCLVIDRDSFKHLIGGLDDVSNKA\nYEDAEAKAKYEAEAAFFANLKLSDFNIIDTLGVGGFGRVELVQLKSEESKTFAMKILKKRHIVDTRQQEH\nIRSEKQIMQGAHSDFIVRLYRTFKDSKYLYMLMEACLGGELWTILRDRGSFEDSTTRFYTACVVEAFAYL\nHSKGIIYRDLKPENLILDHRGYAKLVDFGFAKKIGFGKKTWTFCGTPEYVAPEIILNKGHDISADYWSLG\nILMYELLTGSPPFSGPDPMKTYNIILRGIDMIEFPKKIAKNAANLIKKLCRDNPSERLGNLKNGVKDIQK\nHKWFEGFNWEGLRKGTLTPPIIPSVASPTDTSNFDSFPEDNDEPPPDDNSGWDIDF\n\n"
# output should be the FASTA sequence with header information and newlines still included
Remove FASTA header
# use fasta_cleaner( FASTA_seq, parse=F )
# we don't want to parse every char in the fasta file
for(i in 1:length(prkg1_list)){
prkg1_list[[i]] <- compbio4all::fasta_cleaner(prkg1_list[[i]], parse = F)
}
Specific additional cleaning steps will be as needed for particular analyses
For code see https://rpubs.com/lowbrowR/drawProtein
# First use UniProt accession to download data from UniProt
Q13976_dp <- drawProteins::get_features("Q13976")
## [1] "Download has worked"
is( Q13976_dp )
## [1] "list" "vector" "list_OR_List" "vector_OR_Vector"
## [5] "vector_OR_factor"
# convert raw data into dataframe
my_prot_df <- drawProteins::feature_to_dataframe(Q13976_dp)
is(my_prot_df)
## [1] "data.frame" "list" "oldClass" "vector"
## [5] "list_OR_List" "vector_OR_Vector" "vector_OR_factor"
my_canvas <- draw_canvas(my_prot_df)
my_canvas <- draw_chains(my_canvas, my_prot_df,
label_size = 2.5)
my_canvas <- draw_domains(my_canvas, my_prot_df)
my_canvas
Prepare Data
prkg1_list[[1]]
## [1] "MGTLRDLQYALQEKIEELRQRDALIDELELELDQKDELIQKLQNELDKYRSVIRPATQQAQKQSASTLQGEPRTKRQAISAEPTAFDIQDLSHVTLPFYPKSPQSKDLIKEAILDNDFMKNLELSQIQEIVDCMYPVEYGKDSCIIKEGDVGSLVYVMEDGKVEVTKEGVKLCTMGPGKVFGELAILYNCTRTATVKTLVNVKLWAIDRQCFQTIMMRTGLIKHTEYMEFLKSVPTFQSLPEEILSKLADVLEETHYENGEYIIRQGARGDTFFIISKGTVNVTREDSPSEDPVFLRTLGKGDWFGEKALQGEDVRTANVIAAEAVTCLVIDRDSFKHLIGGLDDVSNKAYEDAEAKAKYEAEAAFFANLKLSDFNIIDTLGVGGFGRVELVQLKSEESKTFAMKILKKRHIVDTRQQEHIRSEKQIMQGAHSDFIVRLYRTFKDSKYLYMLMEACLGGELWTILRDRGSFEDSTTRFYTACVVEAFAYLHSKGIIYRDLKPENLILDHRGYAKLVDFGFAKKIGFGKKTWTFCGTPEYVAPEIILNKGHDISADYWSLGILMYELLTGSPPFSGPDPMKTYNIILRGIDMIEFPKKIAKNAANLIKKLCRDNPSERLGNLKNGVKDIQKHKWFEGFNWEGLRKGTLTPPIIPSVASPTDTSNFDSFPEDNDEPPPDDNSGWDIDF"
prkg1_human_vector <- unlist(strsplit( prkg1_list[[1]], "" ))
seqinr::dotPlot( prkg1_human_vector, prkg1_human_vector )
# TODO:
# create 2x2 panel to show different values for dotPlots settings
# create a single large plot with best version of the plot
par(mfrow = c(2,2),
mar = c(0,0,2,1))
# plot 1: Defaults
seqinr::dotPlot(prkg1_human_vector, prkg1_human_vector,
wsize = 1,
nmatch = 1,
main = "size=1, num match=1")
# plot 2 size = 10, nmatch = 10
seqinr::dotPlot(prkg1_human_vector, prkg1_human_vector,
wsize = 10,
nmatch = 1,
main = "size = 10, nmatch = 10")
# plot 3: size = 10, nmatch = 5
seqinr::dotPlot(prkg1_human_vector, prkg1_human_vector,
wsize = 10,
nmatch = 5,
main = "size = 10, nmatch = 5")
# plot 4: size = 20, nmatch = 5
seqinr::dotPlot(prkg1_human_vector, prkg1_human_vector,
wsize = 20,
nmatch = 5,
main = "size = 20, nmatch = 5")
# reset par() - run this or other plots will be small!
par(mfrow = c(1,1),
mar = c(4,4,4,4))
seqinr::dotPlot(prkg1_human_vector, prkg1_human_vector,
wsize = 20,
nmatch = 5,
main = "PRKG1 Human dot plot")
TODO: Create table
Below are links to relevant information. 1. Pfam: Region present: cNMP binding domain, Pkinase domain 2. DisProt: NA 3. RepeatDB: NA 4. PDB secondary structural classification: partial apo (92-227) of PRKG1 contains alpha helices and beta sheets
A Oryctolagus cuniculus (Rabbit) homolog is listed in Alphafold (https://alphafold.ebi.ac.uk/entry/O77676). The predicted structure contains alpha helices and beta sheets.
Uniprot (which uses http://www.csbio.sjtu.edu) indicates that this protein is a protein kinase that acts as key mediator of the nitric oxide (NO)/cGMP signaling pathway.
NOTE: My protein does NOT contain “U”.
First, I need the data from Chou and Zhang (1994) Table 5. Code to build this table is available at https://rpubs.com/lowbrowR/843543
The table looks like this:
# enter once
aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
"L","K","M","F","P","S","T","W","Y","V")
# alpha proteins
alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91,
221, 249, 48, 123, 82, 122, 119, 33, 63, 167)
# check against chou's total
sum(alpha) == 2447
## [1] TRUE
# beta proteins
beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120,
177, 115, 16, 85, 127, 341, 253, 44, 110, 229)
# check against chou's total
sum(beta) == 2776
## [1] TRUE
# alpha + beta
a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
110, 112, 25, 52, 71, 126, 117, 30, 108, 123)
sum(a.plus.b) == 1889
## [1] TRUE
# alpha/beta
a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239,
339, 321, 91, 158, 188, 327, 238, 72, 130, 378)
sum(a.div.b) == 4333
## [1] TRUE
data.frame(aa.1.1, alpha, beta, a.plus.b, a.div.b)
## aa.1.1 alpha beta a.plus.b a.div.b
## 1 A 285 203 175 361
## 2 R 53 67 78 146
## 3 N 97 139 120 183
## 4 D 163 121 111 244
## 5 C 22 75 74 63
## 6 Q 67 122 74 114
## 7 E 134 86 86 257
## 8 G 197 297 171 377
## 9 H 111 49 33 107
## 10 I 91 120 93 239
## 11 L 221 177 110 339
## 12 K 249 115 112 321
## 13 M 48 16 25 91
## 14 F 123 85 52 158
## 15 P 82 127 71 188
## 16 S 122 341 126 327
## 17 T 119 253 117 238
## 18 W 33 44 30 72
## 19 Y 63 110 108 130
## 20 V 167 229 123 378
alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)
#dataframe
aa.prop <- data.frame(alpha.prop,
beta.prop,
a.plus.b.prop,
a.div.b)
#row labels
row.names(aa.prop) <- aa.1.1
Table 5 therefore becomes this
pander::pander(aa.prop)
| alpha.prop | beta.prop | a.plus.b.prop | a.div.b | |
|---|---|---|---|---|
| A | 0.1165 | 0.07313 | 0.09264 | 0.08331 |
| R | 0.02166 | 0.02414 | 0.04129 | 0.03369 |
| N | 0.03964 | 0.05007 | 0.06353 | 0.04223 |
| D | 0.06661 | 0.04359 | 0.05876 | 0.05631 |
| C | 0.008991 | 0.02702 | 0.03917 | 0.01454 |
| Q | 0.02738 | 0.04395 | 0.03917 | 0.02631 |
| E | 0.05476 | 0.03098 | 0.04553 | 0.05931 |
| G | 0.08051 | 0.107 | 0.09052 | 0.08701 |
| H | 0.04536 | 0.01765 | 0.01747 | 0.02469 |
| I | 0.03719 | 0.04323 | 0.04923 | 0.05516 |
| L | 0.09031 | 0.06376 | 0.05823 | 0.07824 |
| K | 0.1018 | 0.04143 | 0.05929 | 0.07408 |
| M | 0.01962 | 0.005764 | 0.01323 | 0.021 |
| F | 0.05027 | 0.03062 | 0.02753 | 0.03646 |
| P | 0.03351 | 0.04575 | 0.03759 | 0.04339 |
| S | 0.04986 | 0.1228 | 0.0667 | 0.07547 |
| T | 0.04863 | 0.09114 | 0.06194 | 0.05493 |
| W | 0.01349 | 0.01585 | 0.01588 | 0.01662 |
| Y | 0.02575 | 0.03963 | 0.05717 | 0.03 |
| V | 0.06825 | 0.08249 | 0.06511 | 0.08724 |
Determine the number of each amino acid in my protein.
A Function to convert a table into a vector is helpful here because R is goofy about tables not being the same as vectors.
table_to_vector <- function(table_x){
table_names <- attr(table_x, "dimnames")[[1]]
table_vect <- as.vector(table_x)
names(table_vect) <- table_names
return(table_vect)
}
NP_006249 <- prkg1_human_vector
PRKG1.human.aa.freq <- table(NP_006249)/length(NP_006249)
pander::pander(PRKG1.human.aa.freq)
| A | C | D | E | F | G | H | I |
|---|---|---|---|---|---|---|---|
| 0.05977 | 0.01458 | 0.07434 | 0.08017 | 0.0481 | 0.06706 | 0.01603 | 0.07289 |
| K | L | M | N | P | Q | R | S |
|---|---|---|---|---|---|---|---|
| 0.08163 | 0.09475 | 0.02187 | 0.03353 | 0.04227 | 0.03936 | 0.04519 | 0.05394 |
| T | V | W | Y |
|---|---|---|---|
| 0.05831 | 0.05102 | 0.01166 | 0.03353 |
Check for the presence of “U” (unknown aa.)
aa.names <- names(PRKG1.human.aa.freq)
aa.names
## [1] "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T" "V" "W"
## [20] "Y"
any(aa.names == "U")
## [1] FALSE
# i.U <- which(aa.names == "U")
# i.U
# aa.names[i.U]
# pander::pander(PRKG1.human.aa.freq)
# PRKG1.human.aa.freq[i.U]
# pander::pander(PRKG1.human.aa.freq)
# PRKG1.human.aa.freq <- PRKG1.human.aa.freq[-i.U]
# pander::pander(PRKG1.human.aa.freq)
Add data on my focal protein to the amino acid frequency table.
aa.prop$PRKG1.human.aa.freq <- PRKG1.human.aa.freq
pander::pander(aa.prop)
| alpha.prop | beta.prop | a.plus.b.prop | a.div.b | PRKG1.human.aa.freq | |
|---|---|---|---|---|---|
| A | 0.1165 | 0.07313 | 0.09264 | 0.08331 | 0.05977 |
| R | 0.02166 | 0.02414 | 0.04129 | 0.03369 | 0.01458 |
| N | 0.03964 | 0.05007 | 0.06353 | 0.04223 | 0.07434 |
| D | 0.06661 | 0.04359 | 0.05876 | 0.05631 | 0.08017 |
| C | 0.008991 | 0.02702 | 0.03917 | 0.01454 | 0.0481 |
| Q | 0.02738 | 0.04395 | 0.03917 | 0.02631 | 0.06706 |
| E | 0.05476 | 0.03098 | 0.04553 | 0.05931 | 0.01603 |
| G | 0.08051 | 0.107 | 0.09052 | 0.08701 | 0.07289 |
| H | 0.04536 | 0.01765 | 0.01747 | 0.02469 | 0.08163 |
| I | 0.03719 | 0.04323 | 0.04923 | 0.05516 | 0.09475 |
| L | 0.09031 | 0.06376 | 0.05823 | 0.07824 | 0.02187 |
| K | 0.1018 | 0.04143 | 0.05929 | 0.07408 | 0.03353 |
| M | 0.01962 | 0.005764 | 0.01323 | 0.021 | 0.04227 |
| F | 0.05027 | 0.03062 | 0.02753 | 0.03646 | 0.03936 |
| P | 0.03351 | 0.04575 | 0.03759 | 0.04339 | 0.04519 |
| S | 0.04986 | 0.1228 | 0.0667 | 0.07547 | 0.05394 |
| T | 0.04863 | 0.09114 | 0.06194 | 0.05493 | 0.05831 |
| W | 0.01349 | 0.01585 | 0.01588 | 0.01662 | 0.05102 |
| Y | 0.02575 | 0.03963 | 0.05717 | 0.03 | 0.01166 |
| V | 0.06825 | 0.08249 | 0.06511 | 0.08724 | 0.03353 |
Two custom functions are needed: one to calculate correlates between two columns of our table, and one to calculate correlation similarities.
# Correlation used in Chou and Zhange 1992.
chou_cor <- function(x,y){
numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)
}
# Cosine similarity used in Higgs and Attwood (2005).
chou_cosine <- function(z.1, z.2){
z.1.abs <- sqrt(sum(z.1^2))
z.2.abs <- sqrt(sum(z.2^2))
my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs)
return(my.cosine)
}
Calculate correlation between each column
corr.alpha <- chou_cor(aa.prop[,5], aa.prop[,1])
corr.beta <- chou_cor(aa.prop[,5], aa.prop[,2])
corr.apb <- chou_cor(aa.prop[,5], aa.prop[,3])
corr.adb <- chou_cor(aa.prop[,5], aa.prop[,4])
Calculate cosine similarity
cos.alpha <- chou_cosine(aa.prop[,5], aa.prop[,1])
cos.beta <- chou_cosine(aa.prop[,5], aa.prop[,2])
cos.apb <- chou_cosine(aa.prop[,5], aa.prop[,3])
cos.adb <- chou_cosine(aa.prop[,5], aa.prop[,4])
Calculate distance. Note: we need to flip the dataframe on its side using a command called t()
aa.prop.flipped <- t(aa.prop)
round(aa.prop.flipped,2)
## A R N D C Q E G H I L K
## alpha.prop 0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09 0.10
## beta.prop 0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06 0.04
## a.plus.b.prop 0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06 0.06
## a.div.b 0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08 0.07
## PRKG1.human.aa.freq 0.06 0.01 0.07 0.08 0.05 0.07 0.02 0.07 0.08 0.09 0.02 0.03
## M F P S T W Y V
## alpha.prop 0.02 0.05 0.03 0.05 0.05 0.01 0.03 0.07
## beta.prop 0.01 0.03 0.05 0.12 0.09 0.02 0.04 0.08
## a.plus.b.prop 0.01 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## a.div.b 0.02 0.04 0.04 0.08 0.05 0.02 0.03 0.09
## PRKG1.human.aa.freq 0.04 0.04 0.05 0.05 0.06 0.05 0.01 0.03
We can get distance matrix like this
dist(aa.prop.flipped, method = "euclidean")
## alpha.prop beta.prop a.plus.b.prop a.div.b
## beta.prop 0.13342098
## a.plus.b.prop 0.09281824 0.08289406
## a.div.b 0.06699039 0.08659174 0.06175113
## PRKG1.human.aa.freq 0.16424606 0.15741673 0.13455135 0.14951781
Individual distances using dist()
dist.alpha <- dist((aa.prop.flipped[c(1,5),]), method = "euclidean")
dist.beta <- dist((aa.prop.flipped[c(2,5),]), method = "euclidean")
dist.apb <- dist((aa.prop.flipped[c(3,5),]), method = "euclidean")
dist.adb <- dist((aa.prop.flipped[c(4,5),]), method = "euclidean")
Compile the information. Rounding makes it easier to read
# fold types
fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")
# data
corr.sim <- round(c(corr.alpha,corr.beta,corr.apb,corr.adb),5)
cosine.sim <- round(c(cos.alpha,cos.beta,cos.apb,cos.adb),5)
Euclidean.dist <- round(c(dist.alpha,dist.beta,dist.apb,dist.adb),5)
# summary
sim.sum <- c("","","most.sim","")
dist.sum <- c("","","min.dist","")
df <- data.frame(fold.type,
corr.sim ,
cosine.sim ,
Euclidean.dist ,
sim.sum ,
dist.sum )
Display output
pander::pander(df)
| fold.type | corr.sim | cosine.sim | Euclidean.dist | sim.sum | dist.sum |
|---|---|---|---|---|---|
| alpha | 0.7894 | 0.7894 | 0.1643 | ||
| beta | 0.8098 | 0.8098 | 0.1574 | ||
| alpha plus beta | 0.8493 | 0.8493 | 0.1346 | most.sim | min.dist |
| alpha/beta | 0.8173 | 0.8173 | 0.1495 |
# fold.type
# corr.sim
# cosine.sim
# Euclidean.dist
# sim.sum
# dist.sum
Convert all FASTA records intro entries in a single vector. FASTA entries are contained in a list produced at the beginning of the script. They were cleaned to remove the header and newline characters.
prkg1_list
## $NP_006249
## [1] "MGTLRDLQYALQEKIEELRQRDALIDELELELDQKDELIQKLQNELDKYRSVIRPATQQAQKQSASTLQGEPRTKRQAISAEPTAFDIQDLSHVTLPFYPKSPQSKDLIKEAILDNDFMKNLELSQIQEIVDCMYPVEYGKDSCIIKEGDVGSLVYVMEDGKVEVTKEGVKLCTMGPGKVFGELAILYNCTRTATVKTLVNVKLWAIDRQCFQTIMMRTGLIKHTEYMEFLKSVPTFQSLPEEILSKLADVLEETHYENGEYIIRQGARGDTFFIISKGTVNVTREDSPSEDPVFLRTLGKGDWFGEKALQGEDVRTANVIAAEAVTCLVIDRDSFKHLIGGLDDVSNKAYEDAEAKAKYEAEAAFFANLKLSDFNIIDTLGVGGFGRVELVQLKSEESKTFAMKILKKRHIVDTRQQEHIRSEKQIMQGAHSDFIVRLYRTFKDSKYLYMLMEACLGGELWTILRDRGSFEDSTTRFYTACVVEAFAYLHSKGIIYRDLKPENLILDHRGYAKLVDFGFAKKIGFGKKTWTFCGTPEYVAPEIILNKGHDISADYWSLGILMYELLTGSPPFSGPDPMKTYNIILRGIDMIEFPKKIAKNAANLIKKLCRDNPSERLGNLKNGVKDIQKHKWFEGFNWEGLRKGTLTPPIIPSVASPTDTSNFDSFPEDNDEPPPDDNSGWDIDF"
##
## $NP_776861
## [1] "MSELEEDFAKILMLKEERIKELEKRLSEKEEEIQELKRKLHKCQSVLPVPSTHIGPRTTRAQGISAEPQTYRSFHDLRQAFRKFTKSERSKDLIKEAILDNDFMKNLELSQIQEIVDCMYPVEYGKDSCIIKEGDVGSLVYVMEDGKVEVTKEGVKLCTMGPGKVFGELAILYNCTRTATVKTLVNVKLWAIDRQCFQTIMMRTGLIKHTEYMEFLKSVPTFQSLPEEILSKLADVLEETHYENGEYIIRQGARGDTFFIISKGKVNVTREDSPNEDPVFLRTLGKGDWFGEKALQGEDVRTANVIAAEAVTCLVIDRDSFKHLIGGLDDVSNKAYEDAEAKAKYEAEAAFFANLKLSDFNIIDTLGVGGFGRVELVQLKSEESKTFAMKILKKRHIVDTRQQEHIRSEKQIMQGAHSDFIVRLYRTFKDSKYLYMLMEACLGGELWTILRDRGSFEDSTTRFYTACVVEAFAYLHSKGIIYRDLKPENLILDHRGYAKLVDFGFAKKIGFGKKTWTFCGTPEYVAPEIILNKGHDISADYWSLGILMYELLTGSPPFSGPDPMKTYNIILRGIDMIEFPKKIAKNAANLIKKLCRDNPSERLGNLKNGVKDIQKHKWFEGFNWEGLRKGTLTPPIIPSVASPTDTSNFDSFPEDNDEPPPDDNSGWDIDF"
##
## $NP_001013855
## [1] "MSELEEDFAKILMLKEERIKELEKRLSEKEEEIQELKRKLHKCQSVLPVPSTHIGPRTTRAQGISAEPQTYRSFHDLRQAFRKFTKSERSKDLIKEAILDNDFMKNLELSQIQEIVDCMYPVEYGKDSCIIKEGDVGSLVYVMEDGKVEVTKEGVKLCTMGPGKVFGELAILYNCTRTATVKTLVNVKLWAIDRQCFQTIMMRTGLIKHTEYMEFLKSVPTFQSLPDEILSKLADVLEETHYENGEYIIRQGARGDTFFIISKGQVNVTREDSPSEDPVFLRTLGKGDWFGEKALQGEDVRTANVIAAEAVTCLVIDRDSFKHLIGGLDDVSNKAYEDAEAKAKYEAEAAFFANLKLSDFNIIDTLGVGGFGRVELVQLKSEESKTFAMKILKKRHIVDTRQQEHIRSEKQIMQGAHSDFIVRLYRTFKDSKYLYMLMEACLGGELWTILRDRGSFEDSTTRFYTACVVEAFAYLHSKGIIYRDLKPENLILDHRGYAKLVDFGFAKKIGFGKKTWTFCGTPEYVAPEIILNKGHDISADYWSLGILMYELLTGSPPFSGPDPMKTYNIILRGIDMIEFPKKIAKNAANLIKKLCRDNPSERLGNLKNGVKDIQKHKWFEGFNWEGLRKGTLTPPIIPSVASPTDTSNFDSFPEDSDEPPPDDNSGWDIDF"
##
## $NP_001099201
## [1] "MSELEEDFAKILMLKEERIKELEKRLSEKEEEIQELKRKLHKCQSVLPVPSTHIGPRTTRAQGISAEPQTYRSFHDLRQAFRKFTKSERSKDLIKEAILDNDFMKNLELSQIQEIVDCMYPVEYGKDSCIIKEGDVGSLVYVMEDGKVEVTKEGVKLCTMGPGKVFGELAILYNCTRTATVKTLVNVKLWAIDRQCFQTIMMRTGLIKHTEYMEFLKSVPTFQSLPDEILSKLADVLEETHYENGEYIIRQGARGDTFFIISKGKVNVTREDSPSEDPVFLRTLGKGDWFGEKALQGEDVRTANVIAAEAVTCLVIDRDSFKHLIGGLDDVSNKAYEDAEAKAKYEAEAAFFANLKLSDFNIIDTLGVGGFGRVELVQLKSEESKTFAMKILKKRHIVDTRQQEHIRSEKQIMQGAHSDFIVRLYRTFKDSKYLYMLMEACLGGELWTILRDRGSFEDSTTRFYTACVVEAFAYLHSKGIIYRDLKPENLILDHRGYAKLVDFGFAKKIGFGKKTWTFCGTPEYVAPEIILNKGHDISADYWSLGILMYELLTGSPPFSGPDPMKTYNIILRGIDMIEFPKKIAKNAANLIKKLCRDNPSERLGNLKNGVKDIQKHKWFEGFNWEGLRKGTLTPPIIPSVASPTDTSNFDSFPEDSDEPPPDDNSGWDIDF"
##
## $NP_957324
## [1] "MSDLDEDFAKILMLKEERIRDLERRLLEREDEISELKRKLHKCQSVLPSAQIGPRTHRAQGISAEPQTHQDLSNQSFRRVAKSDRSKDLIKSAILDNDFMKNLEMSQIQEIVDCMYPVDYDKNSCIIKEGDVGSLVYVMEDGKVEVTKEGLKLCTMGPGKVFGELAILYNCTRTATVRTVSSVKLWAIDRQCFQTIMMRTGLIKHAEYMELLKSVLTFRGLPEEILSKLADVLEETHYEDGNYIIRQGARGDTFFIISKGKVTMTREDCPGQEPVYLRSMGRGDSFGEKALQGEDIRTANVIAAETVTCLVIDRDSYKHLIGGLEDVSNKGCEDAEAKAKYEAENAFFSNLNLSDFNIIDTLGVGGFGRVELVQLKSDEMKTFAMKILKKRHIVDTRQQEHIRSEKLIMQEAHSDFIVRLYRTFKDSKYLYMLMEACLGGELWTILRDRGNFDDSTTRFYTACVVEAFAYLHSKGIIYRDLKPENLILDHRGYAKLVDFGFAKKIGFGKKTWTFCGTPEYVAPEIILNKGHDISADYWSLGILMYELLTGSPPFSGPDPMKTYNIILRGIDMIEFPKKITKNAANLIKKLCRDTPSERLGNLKNGVKDIQKHKWFEGFNWDGLRKGTLMPPIIPNVTSSTDTSNFDSFPEDNEDPPPDDNSGWDIDF"
##
## $NP_477488
## [1] "MQSLRISGCTPSGTGGSATPSPVGLVDPNFIVSNYVAASPQEERFIQIIQAKELKIQEMQRALQFKDNEIAELKSHLDKFQSVFPFSRGSAAGCAGTGGASGSGAGGSGGSGPGTATGATRKSGQNFQRQRALGISAEPQSESSLLLEHVSFPKYDKDERSRELIKAAILDNDFMKNLDLTQIREIVDCMYPVKYPAKNLIIKEGDVGSIVYVMEDGRVEVSREGKYLSTLSGAKVLGELAILYNCQRTATITAITECNLWAIERQCFQTIMMRTGLIRQAEYSDFLKSVPIFKDLAEDTLIKISDVLEETHYQRGDYIVRQGARGDTFFIISKGKVRVTIKQQDTQEEKFIRMLGKGDFFGEKALQGDDLRTANIICESADGVSCLVIDRETFNQLISNLDEIKHRYDDEGAMERRKINEEFRDINLTDLRVIATLGVGGFGRVELVQTNGDSSRSFALKQMKKSQIVETRQQQHIMSEKEIMGEANCQFIVKLFKTFKDKKYLYMLMESCLGGELWTILRDKGNFDDSTTRFYTACVVEAFDYLHSRNIIYRDLKPENLLLNERGYVKLVDFGFAKKLQTGRKTWTFCGTPEYVAPEVILNRGHDISADYWSLGVLMFELLTGTPPFTGSDPMRTYNIILKGIDAIEFPRNITRNASNLIKKLCRDNPAERLGYQRGGISEIQKHKWFDGFYWWGLQNCTLEPPIKPAVKSVVDTTNFDDYPPDPEGPPPDDVTGWDKDF"
##
## $XP_001162858
## [1] "MGTLRDLQYALQEKIEELRQRDALIDELELELDQKDELIQKLQNELDKYRSVIRPATQQAQKQSASTLQGEPRTKRQAISAEPTAFDIQDLSHVTLPFYPKSPQSKDLIKEAILDNDFMKNLELSQIQEIVDCMYPVEYGKDSCIIKEGDVGSLVYVMEDGKVEVTKEGVKLCTMGPGKVFGELAILYNCTRTATVKTLVNVKLWAIDRQCFQTIMMRTGLIKHTEYMEFLKSVPTFQSLPEEILSKLADVLEETHYENGEYIIRQGARGDTFFIISKGTVNVTREDSPSEDPVFLRTLGKGDWFGEKALQGEDVRTANVIAAEAVTCLVIDRDSFKHLIGGLDDVSNKAYEDAEAKAKYEAEAAFFANLKLSDFNIIDTLGVGGFGRVELVQLKSEESKTFAMKILKKRHIVDTRQQEHIRSEKQIMQGAHSDFIVRLYRTFKDSKYLYMLMEACLGGELWTILRDRGSFEDSTTRFYTACVVEAFAYLHSKGIIYRDLKPENLILDHRGYAKLVDFGFAKKIGFGKKTWTFCGTPEYVAPEIILNKGHDISADYWSLGILMYELLTGSPPFSGPDPMKTYNIILRGIDMIEFPKKIAKNAANLIKKLCRDNPSERLGNLKNGVKDIQKHKWFEGFNWEGLRKGTLTPPIIPSVASPTDTSNFDSFPEDNDEPPPDDNSGWDIDF"
##
## $XP_851997
## [1] "MSELEEDFAQVLMLKEERIKELERRLSEKEEEIQELKRKLHKCQSVLPAPSPHIGPRTTRAQGISAEPQTYRSFHDLRQAFRKFAKSERSKDLIKEAILDNDFMKNLELSQIQEIVDCMYPVEYGKDSCIIKEGDVGSLVYVMEDGKVEVTKEGVKLCTMGPGKVFGELAILYNCTRTATVKTLVNVKLWAIDRQCFQTIMMRTGLIKHTEYMEFLKSVPTFQSLPEEILSKLADVLEETHYENGEYIIRQGARGDTFFIISKGTVNVTREDSPSEDPVFLRTLGKGDWFGEKALQGEDVRTANVIAAEAVTCLVIDRDSFKHLIGGLDDVSNKAYEDAEAKAKYEAEAAFFANLKLSDFNIIDTLGVGGFGRVELVQLKSEESKTFAMKILKKRHIVDTRQQEHIRSEKQIMQGAHSDFIVRLYRTFKDSKYLYMLMEVCLGGELWTILRDRGSFEDSTTRFYTACVVEAFAYLHSKGIIYRDLKPENLILDHRGYTKLVDFGFAKKIGFGKKTWTFCGTPEYVAPEIILNKGHDISADYWSLGILMYELLTGSPPFSGPDPMKTYNIILRGIDMIEFPKKIAKNAANLIKKLCRDNPSERLGNLKNGVKDIQKHKWFEGFNWEGLRKGTLTPPIIPSVASPTDTSNFDSFPEDNDEPPPDDNSGWDIDF"
##
## $XP_003641507
## [1] "MVIVAISRIPASPSGTKAFPLFLRFFRAPLLLPPPLPAAPRPVPGDVGPAAVSGAGRAAPVPPPCERRGPTMSELEGDFTKLLLLKEERIRELERRLGEKDEEIQELRRRLHKCHSVLPAPSPHIGPRTTRAQGISAEPQTYRSFHDLRQAFHKFTKAERSKELIKEAILDNDFMKNLELSQIQEIVDCMYPVEYGKDSCIIKEGDVGSLVYVMEDGKVEVTKEGVKLCTMGPGKVFGELAILYNCTRTATVKTLVNVKLWAIDRQCFQTIMMRTGLIKHTEYMEFLKSVPTFQSLPEEILSKLADVLEETHYESGEYIIRQGARGDTFFIISKGKVNVTREDSPSEDPVFLRTLGKGDWFGEKALQWEDVRTANVIAAEAVTCLVIDRDSFKHLIGGLDDVSNKAYEDAEAKAKYEAEAAFFANLKLSDFNIIDTLGVGGFGRVELVQLKSEETKTFAMKILKKRHIVDTRQQEHIRSEKQIMQSAHSDFIVRLYRTFKDSKYLYMLMEACLGGELWTILRDRGSFEDSTTRFYTACVVEAFAYLHSKGIIYRDLKPENLILDHRGYAKLVDFGFAKKIGFGKKTWTFCGTPEYVAPEIILNKGHDISADYWSLGILMYELLTGSPPFSGPDPMKTYNIILRGIDMIEFPKKIAKNAANLIKKLCRDNPSERLGNLKNGVKDIQKHKWFEGFNWEGLRKGTLTPPIIPSVASPTDTSNFDSFPEDNDEPPPDDNSGWDIDF"
##
## $XP_002935474
## [1] "MGTLRDLQYALQEKIEELRQRDALIDELELELDQKDELIQRLQNELDKYRSVIKPATQQVHKQNPTTLGEQRTKRQAISAEPTAIDIQELSHVTLPFYPKSPQSKELIKEAILDNDFMKNLEISQIQEIVDCMYPVEYGKDSCIIKEGDVGSLVYVMEDGKVEVTKESVKLCTMGPGKVFGELAILYNCTRTATVKTLTNVKLWAIDRQCFQTIMMRTGLIKHTEYMEFLKSVPTFQSLPEEIVSKLADVLEETHYESGDYIIRQGARGDTFFIISKGKVNVTREDSPGEDPIFLRTLGKGDWFGEKALQGEDVRTANVIAAEAVTCLVIDRDSFKHLIGGLDDVSNKAYEDAEAKAKYEAEAAFFGNLKLADFNIIDTLGVGGFGRVELVQLKSDECKTFAMKILKKRHIVDTRQQEHIRSEKQIMQSAHSDFIVRLYRTFKDSKYLYMLMEACLGGELWTILRDRGSFDDSTTRFYTACVVEAFAYLHSKGIIYRDLKPENLILDHRGYAKLVDFGFAKKIGFGKKTWTFCGTPEYVAPEIILNKGHDISADYWSLGILMYELLTGSPPFSGPDPMKTYNIILRGIDMIEFPKKITKNAANLIKKLCRDNPSERLGNLKNGVKDIQKHKWFEGFNWEGLRKGTLTPPIIPSVASPTDTSNFDSFPEDNEDPPPDDNSGWDIDF"
names( prkg1_list )
## [1] "NP_006249" "NP_776861" "NP_001013855" "NP_001099201" "NP_957324"
## [6] "NP_477488" "XP_001162858" "XP_851997" "XP_003641507" "XP_002935474"
# 10 accession numbers
length( prkg1_list )
## [1] 10
# 10
Each entry is a full entry with no spaces or parsing, and no header
prkg1_list[1]
## $NP_006249
## [1] "MGTLRDLQYALQEKIEELRQRDALIDELELELDQKDELIQKLQNELDKYRSVIRPATQQAQKQSASTLQGEPRTKRQAISAEPTAFDIQDLSHVTLPFYPKSPQSKDLIKEAILDNDFMKNLELSQIQEIVDCMYPVEYGKDSCIIKEGDVGSLVYVMEDGKVEVTKEGVKLCTMGPGKVFGELAILYNCTRTATVKTLVNVKLWAIDRQCFQTIMMRTGLIKHTEYMEFLKSVPTFQSLPEEILSKLADVLEETHYENGEYIIRQGARGDTFFIISKGTVNVTREDSPSEDPVFLRTLGKGDWFGEKALQGEDVRTANVIAAEAVTCLVIDRDSFKHLIGGLDDVSNKAYEDAEAKAKYEAEAAFFANLKLSDFNIIDTLGVGGFGRVELVQLKSEESKTFAMKILKKRHIVDTRQQEHIRSEKQIMQGAHSDFIVRLYRTFKDSKYLYMLMEACLGGELWTILRDRGSFEDSTTRFYTACVVEAFAYLHSKGIIYRDLKPENLILDHRGYAKLVDFGFAKKIGFGKKTWTFCGTPEYVAPEIILNKGHDISADYWSLGILMYELLTGSPPFSGPDPMKTYNIILRGIDMIEFPKKIAKNAANLIKKLCRDNPSERLGNLKNGVKDIQKHKWFEGFNWEGLRKGTLTPPIIPSVASPTDTSNFDSFPEDNDEPPPDDNSGWDIDF"
Make each entry of the list into a vector. There are several ways to do this.
prkg1_vector <- unlist( prkg1_list )
Name the vector
names( prkg1_list )
## [1] "NP_006249" "NP_776861" "NP_001013855" "NP_001099201" "NP_957324"
## [6] "NP_477488" "XP_001162858" "XP_851997" "XP_003641507" "XP_002935474"
names( prkg1_vector ) <- names( prkg1_list )
Do pairwise alignments for humans, chimps and 2-other species.
prkg1_human <- prkg1_vector["NP_006249"]
prkg1_chimp <- prkg1_vector["XP_001162858"]
prkg1_mouse <- prkg1_vector["NP_001013855"]
prkg1_fish <- prkg1_vector["NP_957324"]
# human and chimp
alignHumanChimp <- Biostrings::pairwiseAlignment(prkg1_human, prkg1_chimp)
Biostrings::pid( alignHumanChimp )
## [1] 100
alignHumanMouse <- Biostrings::pairwiseAlignment(prkg1_human, prkg1_mouse)
Biostrings::pid( alignHumanMouse )
## [1] 90.13062
alignHumanFish <- Biostrings::pairwiseAlignment(prkg1_human, prkg1_fish)
Biostrings::pid( alignHumanFish )
## [1] 82.67831
Build matrix
alignHumanChimp <- Biostrings::pairwiseAlignment(prkg1_human, prkg1_mouse)
alignHumanMouse <- Biostrings::pairwiseAlignment(prkg1_human, prkg1_mouse)
alignHumanFish <- Biostrings::pairwiseAlignment(prkg1_human, prkg1_mouse)
alignChimpMouse <- Biostrings::pairwiseAlignment(prkg1_human, prkg1_mouse)
alignChimpFish <- Biostrings::pairwiseAlignment(prkg1_human, prkg1_mouse)
alignMouseFish <- Biostrings::pairwiseAlignment(prkg1_human, prkg1_mouse)
pids <- c(1, NA, NA, NA,
Biostrings::pid(alignHumanChimp), 1, NA, NA,
Biostrings::pid(alignHumanMouse), Biostrings::pid(alignChimpMouse), 1, NA,
Biostrings::pid(alignHumanFish), Biostrings::pid(alignChimpFish), Biostrings::pid(alignMouseFish), 1)
mat <- matrix(pids, nrow = 4, byrow = T)
row.names(mat) <- c("Homo","Pan","Rat","Fish")
colnames(mat) <- c("Homo","Pan","Rat","Fish")
pander::pander(mat)
| Homo | Pan | Rat | Fish | |
|---|---|---|---|---|
| Homo | 1 | NA | NA | NA |
| Pan | 90.13 | 1 | NA | NA |
| Rat | 90.13 | 90.13 | 1 | NA |
| Fish | 90.13 | 90.13 | 90.13 | 1 |
Compare different PID methods. I did this for Humans vs. chimps
# humans vs chimps
# pid1, pid2, pid3, pid4
PID1 <- Biostrings::pid(alignHumanChimp, type="PID1")
PID2 <- Biostrings::pid(alignHumanChimp, type="PID2")
PID3 <- Biostrings::pid(alignHumanChimp, type="PID3")
PID4 <- Biostrings::pid(alignHumanChimp, type="PID4")
method <- c("PID1", "PID2", "PID3", "PID4")
PID <- c( PID1, PID2, PID3, PID4 )
pid.comparison <- data.frame( method, PID )
pander::pander(pid.comparison)
| method | PID |
|---|---|
| PID1 | 90.13 |
| PID2 | 92.96 |
| PID3 | 92.55 |
| PID4 | 91.53 |
For use with R bioinformatics tools we need to convert our named vector to a string set using Biostrings::AAStringSet(). Note the _ss tag at the end of the object we’re assigning the output to, which designates this as a string set.
prkg1_vector_ss <- Biostrings::AAStringSet( prkg1_vector )
prkg1_align <- msa(prkg1_vector_ss,
method = "ClustalW")
## use default substitution matrix
msa produces a species MSA object
class( prkg1_align )
## [1] "MsaAAMultipleAlignment"
## attr(,"package")
## [1] "msa"
# should be msa
is( prkg1_align )
## [1] "MsaAAMultipleAlignment" "AAMultipleAlignment" "MsaMetaData"
## [4] "MultipleAlignment"
# should be MsaAAMultipleAlignment
Default output of MSA
prkg1_align
## CLUSTAL 2.1
##
## Call:
## msa(prkg1_vector_ss, method = "ClustalW")
##
## MsaAAMultipleAlignment with 10 rows and 781 columns
## aln names
## [1] -------------------------...SNFDSFPEDNDEPPPDDNSGWDIDF NP_006249
## [2] -------------------------...SNFDSFPEDNDEPPPDDNSGWDIDF XP_001162858
## [3] -------------------------...SNFDSFPEDNEDPPPDDNSGWDIDF XP_002935474
## [4] -------------------------...SNFDSFPEDNDEPPPDDNSGWDIDF XP_851997
## [5] MVIVAISRIPASPSGTKAFPLFLRF...SNFDSFPEDNDEPPPDDNSGWDIDF XP_003641507
## [6] -------------------------...SNFDSFPEDSDEPPPDDNSGWDIDF NP_001013855
## [7] -------------------------...SNFDSFPEDSDEPPPDDNSGWDIDF NP_001099201
## [8] -------------------------...SNFDSFPEDNDEPPPDDNSGWDIDF NP_776861
## [9] -------------------------...SNFDSFPEDNEDPPPDDNSGWDIDF NP_957324
## [10] -------------------------...TNFDDYPPDPEGPPPDDVTGWDKDF NP_477488
## Con -------------------------...SNFDSFPEDNDEPPPDDNSGWDIDF Consensus
Change class of alignment
# should be msa
class(prkg1_align) <- "AAMultipleAlignment"
Convert to seqinr format
# alignment
prkg1_align_seqinr <- msaConvert(prkg1_align,
type = "seqinr::alignment")
OPTIONAL: show output with print_msa
compbio4all::print_msa(prkg1_align_seqinr)
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "MVIVAISRIPASPSGTKAFPLFLRFFRAPLLLPPPLPAAPRPVPGDVGPAAVSGAGRAAP 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "--------------------------------MQSLRISGCTPSGTGGSATPSPVGLVDP 0"
## [1] " "
## [1] "--------MGTLRDLQYALQEKIEELRQRDALIDELELELDQKDELIQKLQNELDKYRSV 0"
## [1] "--------MGTLRDLQYALQEKIEELRQRDALIDELELELDQKDELIQKLQNELDKYRSV 0"
## [1] "--------MGTLRDLQYALQEKIEELRQRDALIDELELELDQKDELIQRLQNELDKYRSV 0"
## [1] "-----------MSELEEDFAQVLMLKEER---IKELERRLSEKEEEIQELKRKLHKCQSV 0"
## [1] "VPPPCERRGPTMSELEGDFTKLLLLKEER---IRELERRLGEKDEEIQELRRRLHKCHSV 0"
## [1] "-----------MSELEEDFAKILMLKEER---IKELEKRLSEKEEEIQELKRKLHKCQSV 0"
## [1] "-----------MSELEEDFAKILMLKEER---IKELEKRLSEKEEEIQELKRKLHKCQSV 0"
## [1] "-----------MSELEEDFAKILMLKEER---IKELEKRLSEKEEEIQELKRKLHKCQSV 0"
## [1] "-----------MSDLDEDFAKILMLKEER---IRDLERRLLEREDEISELKRKLHKCQSV 0"
## [1] "N--FIVSNYVAASPQEERFIQIIQAKELK---IQEMQRALQFKDNEIAELKSHLDKFQSV 0"
## [1] " "
## [1] "IRPATQQAQ-----------------KQSASTLQGEPRT--------KRQAISAEPTAFD 0"
## [1] "IRPATQQAQ-----------------KQSASTLQGEPRT--------KRQAISAEPTAFD 0"
## [1] "IKPATQQVH-----------------KQNPTTL-GEQRT--------KRQAISAEPTAID 0"
## [1] "LP--------------------------APSPHIGPRTT--------RAQGISAEPQTYR 0"
## [1] "LP--------------------------APSPHIGPRTT--------RAQGISAEPQTYR 0"
## [1] "LP--------------------------VPSTHIGPRTT--------RAQGISAEPQTYR 0"
## [1] "LP--------------------------VPSTHIGPRTT--------RAQGISAEPQTYR 0"
## [1] "LP--------------------------VPSTHIGPRTT--------RAQGISAEPQTYR 0"
## [1] "LP----------------------------SAQIGPRTH--------RAQGISAEPQTH- 0"
## [1] "FPFSRGSAAGCAGTGGASGSGAGGSGGSGPGTATGATRKSGQNFQRQRALGISAEPQSES 0"
## [1] " "
## [1] "IQDLSHVTLPFYPKSPQSKDLIKEAILDNDFMKNLELSQIQEIVDCMYPVEYGKDSCIIK 0"
## [1] "IQDLSHVTLPFYPKSPQSKDLIKEAILDNDFMKNLELSQIQEIVDCMYPVEYGKDSCIIK 0"
## [1] "IQELSHVTLPFYPKSPQSKELIKEAILDNDFMKNLEISQIQEIVDCMYPVEYGKDSCIIK 0"
## [1] "SFHDLRQAFRKFAKSERSKDLIKEAILDNDFMKNLELSQIQEIVDCMYPVEYGKDSCIIK 0"
## [1] "SFHDLRQAFHKFTKAERSKELIKEAILDNDFMKNLELSQIQEIVDCMYPVEYGKDSCIIK 0"
## [1] "SFHDLRQAFRKFTKSERSKDLIKEAILDNDFMKNLELSQIQEIVDCMYPVEYGKDSCIIK 0"
## [1] "SFHDLRQAFRKFTKSERSKDLIKEAILDNDFMKNLELSQIQEIVDCMYPVEYGKDSCIIK 0"
## [1] "SFHDLRQAFRKFTKSERSKDLIKEAILDNDFMKNLELSQIQEIVDCMYPVEYGKDSCIIK 0"
## [1] "-QDLSNQSFRRVAKSDRSKDLIKSAILDNDFMKNLEMSQIQEIVDCMYPVDYDKNSCIIK 0"
## [1] "SLLLEHVSFPKYDKDERSRELIKAAILDNDFMKNLDLTQIREIVDCMYPVKYPAKNLIIK 0"
## [1] " "
## [1] "EGDVGSLVYVMEDGKVEVTKEGVKLCTMGPGKVFGELAILYNCTRTATVKTLVNVKLWAI 0"
## [1] "EGDVGSLVYVMEDGKVEVTKEGVKLCTMGPGKVFGELAILYNCTRTATVKTLVNVKLWAI 0"
## [1] "EGDVGSLVYVMEDGKVEVTKESVKLCTMGPGKVFGELAILYNCTRTATVKTLTNVKLWAI 0"
## [1] "EGDVGSLVYVMEDGKVEVTKEGVKLCTMGPGKVFGELAILYNCTRTATVKTLVNVKLWAI 0"
## [1] "EGDVGSLVYVMEDGKVEVTKEGVKLCTMGPGKVFGELAILYNCTRTATVKTLVNVKLWAI 0"
## [1] "EGDVGSLVYVMEDGKVEVTKEGVKLCTMGPGKVFGELAILYNCTRTATVKTLVNVKLWAI 0"
## [1] "EGDVGSLVYVMEDGKVEVTKEGVKLCTMGPGKVFGELAILYNCTRTATVKTLVNVKLWAI 0"
## [1] "EGDVGSLVYVMEDGKVEVTKEGVKLCTMGPGKVFGELAILYNCTRTATVKTLVNVKLWAI 0"
## [1] "EGDVGSLVYVMEDGKVEVTKEGLKLCTMGPGKVFGELAILYNCTRTATVRTVSSVKLWAI 0"
## [1] "EGDVGSIVYVMEDGRVEVSREGKYLSTLSGAKVLGELAILYNCQRTATITAITECNLWAI 0"
## [1] " "
## [1] "DRQCFQTIMMRTGLIKHTEYMEFLKSVPTFQSLPEEILSKLADVLEETHYENGEYIIRQG 0"
## [1] "DRQCFQTIMMRTGLIKHTEYMEFLKSVPTFQSLPEEILSKLADVLEETHYENGEYIIRQG 0"
## [1] "DRQCFQTIMMRTGLIKHTEYMEFLKSVPTFQSLPEEIVSKLADVLEETHYESGDYIIRQG 0"
## [1] "DRQCFQTIMMRTGLIKHTEYMEFLKSVPTFQSLPEEILSKLADVLEETHYENGEYIIRQG 0"
## [1] "DRQCFQTIMMRTGLIKHTEYMEFLKSVPTFQSLPEEILSKLADVLEETHYESGEYIIRQG 0"
## [1] "DRQCFQTIMMRTGLIKHTEYMEFLKSVPTFQSLPDEILSKLADVLEETHYENGEYIIRQG 0"
## [1] "DRQCFQTIMMRTGLIKHTEYMEFLKSVPTFQSLPDEILSKLADVLEETHYENGEYIIRQG 0"
## [1] "DRQCFQTIMMRTGLIKHTEYMEFLKSVPTFQSLPEEILSKLADVLEETHYENGEYIIRQG 0"
## [1] "DRQCFQTIMMRTGLIKHAEYMELLKSVLTFRGLPEEILSKLADVLEETHYEDGNYIIRQG 0"
## [1] "ERQCFQTIMMRTGLIRQAEYSDFLKSVPIFKDLAEDTLIKISDVLEETHYQRGDYIVRQG 0"
## [1] " "
## [1] "ARGDTFFIISKGTVNVTREDSPSEDPVFLRTLGKGDWFGEKALQGEDVRTANVI--AAEA 0"
## [1] "ARGDTFFIISKGTVNVTREDSPSEDPVFLRTLGKGDWFGEKALQGEDVRTANVI--AAEA 0"
## [1] "ARGDTFFIISKGKVNVTREDSPGEDPIFLRTLGKGDWFGEKALQGEDVRTANVI--AAEA 0"
## [1] "ARGDTFFIISKGTVNVTREDSPSEDPVFLRTLGKGDWFGEKALQGEDVRTANVI--AAEA 0"
## [1] "ARGDTFFIISKGKVNVTREDSPSEDPVFLRTLGKGDWFGEKALQWEDVRTANVI--AAEA 0"
## [1] "ARGDTFFIISKGQVNVTREDSPSEDPVFLRTLGKGDWFGEKALQGEDVRTANVI--AAEA 0"
## [1] "ARGDTFFIISKGKVNVTREDSPSEDPVFLRTLGKGDWFGEKALQGEDVRTANVI--AAEA 0"
## [1] "ARGDTFFIISKGKVNVTREDSPNEDPVFLRTLGKGDWFGEKALQGEDVRTANVI--AAEA 0"
## [1] "ARGDTFFIISKGKVTMTREDCPGQEPVYLRSMGRGDSFGEKALQGEDIRTANVI--AAET 0"
## [1] "ARGDTFFIISKGKVRVTIKQQDTQEEKFIRMLGKGDFFGEKALQGDDLRTANIICESADG 0"
## [1] " "
## [1] "VTCLVIDRDSFKHLIGGLDDVSNKAYEDAEAKAKYEAEAAFFANLKLSDFNIIDTLGVGG 0"
## [1] "VTCLVIDRDSFKHLIGGLDDVSNKAYEDAEAKAKYEAEAAFFANLKLSDFNIIDTLGVGG 0"
## [1] "VTCLVIDRDSFKHLIGGLDDVSNKAYEDAEAKAKYEAEAAFFGNLKLADFNIIDTLGVGG 0"
## [1] "VTCLVIDRDSFKHLIGGLDDVSNKAYEDAEAKAKYEAEAAFFANLKLSDFNIIDTLGVGG 0"
## [1] "VTCLVIDRDSFKHLIGGLDDVSNKAYEDAEAKAKYEAEAAFFANLKLSDFNIIDTLGVGG 0"
## [1] "VTCLVIDRDSFKHLIGGLDDVSNKAYEDAEAKAKYEAEAAFFANLKLSDFNIIDTLGVGG 0"
## [1] "VTCLVIDRDSFKHLIGGLDDVSNKAYEDAEAKAKYEAEAAFFANLKLSDFNIIDTLGVGG 0"
## [1] "VTCLVIDRDSFKHLIGGLDDVSNKAYEDAEAKAKYEAEAAFFANLKLSDFNIIDTLGVGG 0"
## [1] "VTCLVIDRDSYKHLIGGLEDVSNKGCEDAEAKAKYEAENAFFSNLNLSDFNIIDTLGVGG 0"
## [1] "VSCLVIDRETFNQLISNLDEIKHR-YDDEGAMERRKINEEFR-DINLTDLRVIATLGVGG 0"
## [1] " "
## [1] "FGRVELVQLKSEESKTFAMKILKKRHIVDTRQQEHIRSEKQIMQGAHSDFIVRLYRTFKD 0"
## [1] "FGRVELVQLKSEESKTFAMKILKKRHIVDTRQQEHIRSEKQIMQGAHSDFIVRLYRTFKD 0"
## [1] "FGRVELVQLKSDECKTFAMKILKKRHIVDTRQQEHIRSEKQIMQSAHSDFIVRLYRTFKD 0"
## [1] "FGRVELVQLKSEESKTFAMKILKKRHIVDTRQQEHIRSEKQIMQGAHSDFIVRLYRTFKD 0"
## [1] "FGRVELVQLKSEETKTFAMKILKKRHIVDTRQQEHIRSEKQIMQSAHSDFIVRLYRTFKD 0"
## [1] "FGRVELVQLKSEESKTFAMKILKKRHIVDTRQQEHIRSEKQIMQGAHSDFIVRLYRTFKD 0"
## [1] "FGRVELVQLKSEESKTFAMKILKKRHIVDTRQQEHIRSEKQIMQGAHSDFIVRLYRTFKD 0"
## [1] "FGRVELVQLKSEESKTFAMKILKKRHIVDTRQQEHIRSEKQIMQGAHSDFIVRLYRTFKD 0"
## [1] "FGRVELVQLKSDEMKTFAMKILKKRHIVDTRQQEHIRSEKLIMQEAHSDFIVRLYRTFKD 0"
## [1] "FGRVELVQTNGDSSRSFALKQMKKSQIVETRQQQHIMSEKEIMGEANCQFIVKLFKTFKD 0"
## [1] " "
## [1] "SKYLYMLMEACLGGELWTILRDRGSFEDSTTRFYTACVVEAFAYLHSKGIIYRDLKPENL 0"
## [1] "SKYLYMLMEACLGGELWTILRDRGSFEDSTTRFYTACVVEAFAYLHSKGIIYRDLKPENL 0"
## [1] "SKYLYMLMEACLGGELWTILRDRGSFDDSTTRFYTACVVEAFAYLHSKGIIYRDLKPENL 0"
## [1] "SKYLYMLMEVCLGGELWTILRDRGSFEDSTTRFYTACVVEAFAYLHSKGIIYRDLKPENL 0"
## [1] "SKYLYMLMEACLGGELWTILRDRGSFEDSTTRFYTACVVEAFAYLHSKGIIYRDLKPENL 0"
## [1] "SKYLYMLMEACLGGELWTILRDRGSFEDSTTRFYTACVVEAFAYLHSKGIIYRDLKPENL 0"
## [1] "SKYLYMLMEACLGGELWTILRDRGSFEDSTTRFYTACVVEAFAYLHSKGIIYRDLKPENL 0"
## [1] "SKYLYMLMEACLGGELWTILRDRGSFEDSTTRFYTACVVEAFAYLHSKGIIYRDLKPENL 0"
## [1] "SKYLYMLMEACLGGELWTILRDRGNFDDSTTRFYTACVVEAFAYLHSKGIIYRDLKPENL 0"
## [1] "KKYLYMLMESCLGGELWTILRDKGNFDDSTTRFYTACVVEAFDYLHSRNIIYRDLKPENL 0"
## [1] " "
## [1] "ILDHRGYAKLVDFGFAKKIGFGKKTWTFCGTPEYVAPEIILNKGHDISADYWSLGILMYE 0"
## [1] "ILDHRGYAKLVDFGFAKKIGFGKKTWTFCGTPEYVAPEIILNKGHDISADYWSLGILMYE 0"
## [1] "ILDHRGYAKLVDFGFAKKIGFGKKTWTFCGTPEYVAPEIILNKGHDISADYWSLGILMYE 0"
## [1] "ILDHRGYTKLVDFGFAKKIGFGKKTWTFCGTPEYVAPEIILNKGHDISADYWSLGILMYE 0"
## [1] "ILDHRGYAKLVDFGFAKKIGFGKKTWTFCGTPEYVAPEIILNKGHDISADYWSLGILMYE 0"
## [1] "ILDHRGYAKLVDFGFAKKIGFGKKTWTFCGTPEYVAPEIILNKGHDISADYWSLGILMYE 0"
## [1] "ILDHRGYAKLVDFGFAKKIGFGKKTWTFCGTPEYVAPEIILNKGHDISADYWSLGILMYE 0"
## [1] "ILDHRGYAKLVDFGFAKKIGFGKKTWTFCGTPEYVAPEIILNKGHDISADYWSLGILMYE 0"
## [1] "ILDHRGYAKLVDFGFAKKIGFGKKTWTFCGTPEYVAPEIILNKGHDISADYWSLGILMYE 0"
## [1] "LLNERGYVKLVDFGFAKKLQTGRKTWTFCGTPEYVAPEVILNRGHDISADYWSLGVLMFE 0"
## [1] " "
## [1] "LLTGSPPFSGPDPMKTYNIILRGIDMIEFPKKIAKNAANLIKKLCRDNPSERLGNLKNGV 0"
## [1] "LLTGSPPFSGPDPMKTYNIILRGIDMIEFPKKIAKNAANLIKKLCRDNPSERLGNLKNGV 0"
## [1] "LLTGSPPFSGPDPMKTYNIILRGIDMIEFPKKITKNAANLIKKLCRDNPSERLGNLKNGV 0"
## [1] "LLTGSPPFSGPDPMKTYNIILRGIDMIEFPKKIAKNAANLIKKLCRDNPSERLGNLKNGV 0"
## [1] "LLTGSPPFSGPDPMKTYNIILRGIDMIEFPKKIAKNAANLIKKLCRDNPSERLGNLKNGV 0"
## [1] "LLTGSPPFSGPDPMKTYNIILRGIDMIEFPKKIAKNAANLIKKLCRDNPSERLGNLKNGV 0"
## [1] "LLTGSPPFSGPDPMKTYNIILRGIDMIEFPKKIAKNAANLIKKLCRDNPSERLGNLKNGV 0"
## [1] "LLTGSPPFSGPDPMKTYNIILRGIDMIEFPKKIAKNAANLIKKLCRDNPSERLGNLKNGV 0"
## [1] "LLTGSPPFSGPDPMKTYNIILRGIDMIEFPKKITKNAANLIKKLCRDTPSERLGNLKNGV 0"
## [1] "LLTGTPPFTGSDPMRTYNIILKGIDAIEFPRNITRNASNLIKKLCRDNPAERLGYQRGGI 0"
## [1] " "
## [1] "KDIQKHKWFEGFNWEGLRKGTLTPPIIPSVASPTDTSNFDSFPEDNDEPPPDDNSGWDID 0"
## [1] "KDIQKHKWFEGFNWEGLRKGTLTPPIIPSVASPTDTSNFDSFPEDNDEPPPDDNSGWDID 0"
## [1] "KDIQKHKWFEGFNWEGLRKGTLTPPIIPSVASPTDTSNFDSFPEDNEDPPPDDNSGWDID 0"
## [1] "KDIQKHKWFEGFNWEGLRKGTLTPPIIPSVASPTDTSNFDSFPEDNDEPPPDDNSGWDID 0"
## [1] "KDIQKHKWFEGFNWEGLRKGTLTPPIIPSVASPTDTSNFDSFPEDNDEPPPDDNSGWDID 0"
## [1] "KDIQKHKWFEGFNWEGLRKGTLTPPIIPSVASPTDTSNFDSFPEDSDEPPPDDNSGWDID 0"
## [1] "KDIQKHKWFEGFNWEGLRKGTLTPPIIPSVASPTDTSNFDSFPEDSDEPPPDDNSGWDID 0"
## [1] "KDIQKHKWFEGFNWEGLRKGTLTPPIIPSVASPTDTSNFDSFPEDNDEPPPDDNSGWDID 0"
## [1] "KDIQKHKWFEGFNWDGLRKGTLMPPIIPNVTSSTDTSNFDSFPEDNEDPPPDDNSGWDID 0"
## [1] "SEIQKHKWFDGFYWWGLQNCTLEPPIKPAVKSVVDTTNFDDYPPDPEGPPPDDVTGWDKD 0"
## [1] " "
## [1] "F 59"
## [1] "F 59"
## [1] "F 59"
## [1] "F 59"
## [1] "F 59"
## [1] "F 59"
## [1] "F 59"
## [1] "F 59"
## [1] "F 59"
## [1] "F 59"
## [1] " "
Based on the output from drawProtiens, amino acids 625-700 appear to contain an interesting AGC-Kinase C Terminal.
# key step - must have class set properly
class(prkg1_align) <- "AAMultipleAlignment"
# run ggmsa
ggmsa::ggmsa(prkg1_align,
start = 625, end = 700)
Make a distance matrix
prkg1_dist <- seqinr::dist.alignment(prkg1_align_seqinr,
matrix = "identity")
This produces a “dist” class object
is( prkg1_dist )
## [1] "dist" "oldClass"
class( prkg1_dist )
## [1] "dist"
Round for display
prkg1_align_seqinr_rnd <- round(prkg1_dist, 3)
prkg1_align_seqinr_rnd
## NP_006249 XP_001162858 XP_002935474 XP_851997 XP_003641507
## XP_001162858 0.000
## XP_002935474 0.206 0.206
## XP_851997 0.309 0.309 0.350
## XP_003641507 0.322 0.322 0.343 0.181
## NP_001013855 0.309 0.309 0.348 0.128 0.185
## NP_001099201 0.309 0.309 0.346 0.128 0.181
## NP_776861 0.306 0.306 0.341 0.122 0.177
## NP_957324 0.408 0.408 0.406 0.342 0.357
## NP_477488 0.623 0.623 0.617 0.599 0.622
## NP_001013855 NP_001099201 NP_776861 NP_957324
## XP_001162858
## XP_002935474
## XP_851997
## XP_003641507
## NP_001013855
## NP_001099201 0.039
## NP_776861 0.077 0.067
## NP_957324 0.342 0.340 0.335
## NP_477488 0.600 0.599 0.598 0.603
Build a phylogenetic tree from distance matrix
tree <- nj(prkg1_align_seqinr_rnd)
Plot the tree
# plot tree
plot.phylo(tree, main="PRKG1 Phylogenetic Tree",
type = "unrooted",
use.edge.length = F)
# add label
# mtext(text = "PRKG1 Phylogenetic Tree - UNrooted, no branch lengths")