Introduction

This code gives summary information about the gene CPT1A (carnitine palmitoyltransferase 1A) in both humans and its homologs in other species through alignments and phylogenetic trees.

References

Key information used in this code was found at these websites: Refseq Gene: https://www.ncbi.nlm.nih.gov/gene/1374 Refseq Homologene: https://www.ncbi.nlm.nih.gov/homologene/1413

Other Resources include: Neanderthal Genome: http://neandertal.ensemblgenomes.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000110090 REPPER: https://toolkit.tuebingen.mpg.de/jobs/7803820 Sub-cellular locations prediction: https://wolfpsort.hgc.jp/

Preparation

Load necessary packages

library(BiocManager) 

#only needs to be installed once 
#install("drawProteins")
library(drawProteins)

# github packages
library(compbio4all)
#library(ggmsa)

# CRAN packages
library(rentrez)
library(seqinr)
library(ape)
## 
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
## 
##     as.alignment, consensus
library(pander)

library(ggplot2)

# Bioconductor packages
#library(msa)
library(drawProteins)

## Biostrings
library(Biostrings)
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Warning: package 'S4Vectors' was built under R version 4.1.2
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
## 
##     windows
## Loading required package: XVector
## Loading required package: GenomeInfoDb
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
## 
##     complement
## The following object is masked from 'package:seqinr':
## 
##     translate
## The following object is masked from 'package:base':
## 
##     strsplit
library(msa)
## 
## Attaching package: 'msa'
## The following object is masked from 'package:BiocManager':
## 
##     version
library(ggmsa)
## Registered S3 methods overwritten by 'ggalt':
##   method                  from   
##   grid.draw.absoluteGrob  ggplot2
##   grobHeight.absoluteGrob ggplot2
##   grobWidth.absoluteGrob  ggplot2
##   grobX.absoluteGrob      ggplot2
##   grobY.absoluteGrob      ggplot2
## ggmsa v1.0.0  Document: http://yulab-smu.top/ggmsa/
## 
## If you use ggmsa in published research, please cite: DOI: 10.18129/B9.bioc.ggmsa
library(HGNChelper)
## Warning: package 'HGNChelper' was built under R version 4.1.2

Accession Numbers

The accession numbers were obtained from RefSeq, RefSeq Homologene, UniProt, and PDB. Only the human protein was found in PDB.

HGNChelper::checkGeneSymbols(x = c("CPT1A")) 
## Maps last updated on: Thu Oct 24 12:31:05 2019
##       x Approved Suggested.Symbol
## 1 CPT1A     TRUE            CPT1A

Accession Number Table

Not available in: Neanderthal Drosophila

Create vectors

ncbi.protein.accession <- c("NP_001867.2", "XP_001173853.1", "NP_038523.2", "NP_113747.2", "NP_001012916.1", "NP_001038319.1", "XP_004913491.1", "XP_006019242.1", "XP_009472179.1", "XP_021152303.1") 

Uniprot.id <- c("P50416", "K7D636", "P97742", "P32198", "Q6B842", "E7EYD4", "A9JRK5", "A0A1U7RUG7", "NA", "A0A2I0MI80")

PDB <- c("2LE3", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA")

Species <- c("Homo sapiens", "Pan troglodytes", "Mus musculus", "Rattus norvegicus", "Gallus gallus", "Danio rerio", "Xenopus tropicalis", "Alligator sinensis", "Nipponia nippon", "Columba livia")

Common.name <- c("Human", "Chimpanzee", "Mouse", "Rat", "Red junglefowl", "Zebrafish", "Western clawed frog", "Chinese alligator", "Crested ibis", "Rock Dove")

Gene.name <- c("CPT1A", "CPT1A", "CPT1A", "CPT1A", "CPT1A", "CPT1A", "LOC100490391", "CPT1A", "CPT1A", "CPT1A") 

Convert the vectors into a table

cpt1a_table <- data.frame(ncbi.protein.accession, Uniprot.id, PDB, Species, Common.name, Gene.name)

Display the finished table

pander(cpt1a_table)
Table continues below
ncbi.protein.accession Uniprot.id PDB Species
NP_001867.2 P50416 2LE3 Homo sapiens
XP_001173853.1 K7D636 NA Pan troglodytes
NP_038523.2 P97742 NA Mus musculus
NP_113747.2 P32198 NA Rattus norvegicus
NP_001012916.1 Q6B842 NA Gallus gallus
NP_001038319.1 E7EYD4 NA Danio rerio
XP_004913491.1 A9JRK5 NA Xenopus tropicalis
XP_006019242.1 A0A1U7RUG7 NA Alligator sinensis
XP_009472179.1 NA NA Nipponia nippon
XP_021152303.1 A0A2I0MI80 NA Columba livia
Common.name Gene.name
Human CPT1A
Chimpanzee CPT1A
Mouse CPT1A
Rat CPT1A
Red junglefowl CPT1A
Zebrafish CPT1A
Western clawed frog LOC100490391
Chinese alligator CPT1A
Crested ibis CPT1A
Rock Dove CPT1A

Data preparation

#Download sequences

The sequences were downloaded using a wrapper compbio4all::entrez_fetch_list() which uses accesses NCBI databases to download the information and then store it in a list.

cpt1a_list <- entrez_fetch_list(db = "protein", 
                          id = ncbi.protein.accession,
                          rettype = "fasta") 

Number of FASTA files downloaded:

length(cpt1a_list)
## [1] 10

The first entry:

cpt1a_list[[1]]
## [1] ">NP_001867.2 carnitine O-palmitoyltransferase 1, liver isoform isoform 1 [Homo sapiens]\nMAEAHQAVAFQFTVTPDGIDLRLSHEALRQIYLSGLHSWKKKFIRFKNGIITGVYPASPSSWLIVVVGVM\nTTMYAKIDPSLGIIAKINRTLETANCMSSQTKNVVSGVLFGTGLWVALIVTMRYSLKVLLSYHGWMFTEH\nGKMSRATKIWMGMVKIFSGRKPMLYSFQTSLPRLPVPAVKDTVNRYLQSVRPLMKEEDFKRMTALAQDFA\nVGLGPRLQWYLKLKSWWATNYVSDWWEEYIYLRGRGPLMVNSNYYAMDLLYILPTHIQAARAGNAIHAIL\nLYRRKLDREEIKPIRLLGSTIPLCSAQWERMFNTSRIPGEETDTIQHMRDSKHIVVYHRGRYFKVWLYHD\nGRLLKPREMEQQMQRILDNTSEPQPGEARLAALTAGDRVPWARCRQAYFGRGKNKQSLDAVEKAAFFVTL\nDETEEGYRSEDPDTSMDSYAKSLLHGRCYDRWFDKSFTFVVFKNGKMGLNAEHSWADAPIVAHLWEYVMS\nIDSLQLGYAEDGHCKGDINPNIPYPTRLQWDIPGECQEVIETSLNTANLLANDVDFHSFPFVAFGKGIIK\nKCRTSPDAFVQLALQLAHYKDMGKFCLTYEASMTRLFREGRTETVRSCTTESCDFVRAMVDPAQTVEQRL\nKLFKLASEKHQHMYRLAMTGSGIDRHLFCLYVVSKYLAVESPFLKEVLSEPWRLSTSQTPQQQVELFDLE\nNNPEYVSSGGGFGPVADDGYGVSYILVGENLINFHISSKFSCPETDSHRFGRHLKEAMTDIITLFGLSSN\nSKK\n\n"

Initial Data Cleaning

Remove FASTA header

for(i in 1:length(cpt1a_list)){
  cpt1a_list[[i]] <- compbio4all::fasta_cleaner(cpt1a_list[[i]], parse = F)
}

General Protein Information

#Protein Diagram

Use a UniProt accession number to download the data from UniProt in the form of a list

CPT1A_json  <- drawProteins::get_features("P50416")
## [1] "Download has worked"
is(CPT1A_json)
## [1] "list"             "vector"           "list_OR_List"     "vector_OR_Vector"
## [5] "vector_OR_factor"

Domains present

my_prot_df <- drawProteins::feature_to_dataframe(CPT1A_json)

my_canvas <- draw_canvas(my_prot_df)
my_canvas <- draw_chains(my_canvas, my_prot_df, label_size = 2.5)
my_canvas <- draw_recept_dom(my_canvas, my_prot_df)
my_canvas

Draw Dotplot

Prepare the data

cpt1a_human_vector <- fasta_cleaner(cpt1a_list[[1]])

Make a 2x2 panel to show different values for dotplot settings

par(mfrow = c(2,2), 
    mar = c(0,0,2,1))

# plot 1: Defaults
dotPlot(cpt1a_human_vector, cpt1a_human_vector, 
        wsize = 1, 
        nmatch = 1, 
        main = "Defaults")

# plot 2 size = 10, nmatch = 1
dotPlot(cpt1a_human_vector, cpt1a_human_vector, 
        wsize = 10, 
        nmatch = 1, 
        main = "size = 10, nmatch = 1")

# plot 3: size = 10, nmatch = 5
dotPlot(cpt1a_human_vector, cpt1a_human_vector, 
        wsize = 10, 
        nmatch = 5, 
        main = "size = 10, nmatch = 5")

# plot 4: size = 25, nmatch = 5
dotPlot(cpt1a_human_vector, cpt1a_human_vector, 
        wsize = 25,
        nmatch = 5,
        main = "size = 25, nmatch = 5")

# reset par() - run this or other plots will be small!
par(mfrow = c(1,1), 
    mar = c(4,4,4,4))

Display best version of the plot

dotPlot(cpt1a_human_vector, cpt1a_human_vector, 
        wsize = 25,
        nmatch = 5,
        main = "size = 25, nmatch = 5")

#Protein properties compiled from databases

Links to relevant information 1. Pfam: http://pfam.xfam.org/protein/P50416 2. Uniprot: https://www.uniprot.org/uniprot/P50416 3. PDB: https://www.rcsb.org/structure/2LE3 4. Alphafold: https://alphafold.ebi.ac.uk/entry/P50416

CPT1A was not found in the following databases 1. Disprot: No information available 2. RepeatsDB: No information available

Create table with properties information

Databases <- c("Pfam", "Uniprot", "PDB", "Alphafold")

Properties <- c("Domain: CPT_N, Transmembrane region 1-47", 
                "Subcellular location: Mitochondrion outer membrane", 
                "Classification: Transferase",
                "Predicted structure: mainly alpha helicies with some beta sheets")

protein.properties.df <- data.frame(Databases, Properties)

pander::pander(protein.properties.df)
Databases Properties
Pfam Domain: CPT_N, Transmembrane region 1-47
Uniprot Subcellular location: Mitochondrion outer membrane
PDB Classification: Transferase
Alphafold Predicted structure: mainly alpha helicies with some beta sheets

Protein Feature prediction

Multivariate statistcal techniques were used to confirm the information about protein structure and location in the line database.

Uniprot indicates that the protein is located in the Mitochondrion outer membrane.

Predict Protein Fold

Alphafold shows mainly alpha helicies, with some small portions of beta sheets. Therefore, I predict that the machine-learning methods will indicate alpha and alpha+beta structure.

aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
            "L","K","M","F","P","S","T","W","Y","V")

alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91, 
           221, 249, 48, 123, 82, 122, 119, 33, 63, 167)

beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120, 
          177, 115, 16, 85, 127, 341, 253, 44, 110, 229)

a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
              110, 112, 25, 52, 71, 126, 117, 30, 108,                 123)

a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107,              239, 339, 321, 91, 158, 188, 327, 238, 72,               130, 378)

pander(data.frame(aa.1.1, alpha, beta, a.plus.b,  a.div.b)) 
aa.1.1 alpha beta a.plus.b a.div.b
A 285 203 175 361
R 53 67 78 146
N 97 139 120 183
D 163 121 111 244
C 22 75 74 63
Q 67 122 74 114
E 134 86 86 257
G 197 297 171 377
H 111 49 33 107
I 91 120 93 239
L 221 177 110 339
K 249 115 112 321
M 48 16 25 91
F 123 85 52 158
P 82 127 71 188
S 122 341 126 327
T 119 253 117 238
W 33 44 30 72
Y 63 110 108 130
V 167 229 123 378

Convert to frequencies

alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)

Create dataframe

aa.prop <- data.frame(alpha.prop,
                      beta.prop,
                      a.plus.b.prop,
                      a.div.b)
row.names(aa.prop) <- aa.1.1

Display dataframe

pander(aa.prop)
  alpha.prop beta.prop a.plus.b.prop a.div.b
A 0.1165 0.07313 0.09264 0.08331
R 0.02166 0.02414 0.04129 0.03369
N 0.03964 0.05007 0.06353 0.04223
D 0.06661 0.04359 0.05876 0.05631
C 0.008991 0.02702 0.03917 0.01454
Q 0.02738 0.04395 0.03917 0.02631
E 0.05476 0.03098 0.04553 0.05931
G 0.08051 0.107 0.09052 0.08701
H 0.04536 0.01765 0.01747 0.02469
I 0.03719 0.04323 0.04923 0.05516
L 0.09031 0.06376 0.05823 0.07824
K 0.1018 0.04143 0.05929 0.07408
M 0.01962 0.005764 0.01323 0.021
F 0.05027 0.03062 0.02753 0.03646
P 0.03351 0.04575 0.03759 0.04339
S 0.04986 0.1228 0.0667 0.07547
T 0.04863 0.09114 0.06194 0.05493
W 0.01349 0.01585 0.01588 0.01662
Y 0.02575 0.03963 0.05717 0.03
V 0.06825 0.08249 0.06511 0.08724

Determine the number of each amino acid in CPT1A with a frequency table.

cpt1a.freq.table <- table(cpt1a_human_vector)/length(cpt1a_human_vector)

Convert the table into a vector and display

table_to_vector <- function(table_x){
  table_names <- attr(table_x, "dimnames")[[1]]
  table_vect <- as.vector(table_x)
  names(table_vect) <- table_names
  return(table_vect)
}

cpt1a_human_table <- table(cpt1a_human_vector)/length(cpt1a_human_vector)
cpt1a.human.aa.freq <- table_to_vector(cpt1a_human_table) 
pander(cpt1a.human.aa.freq) 
Table continues below
A C D E F G H I
0.06727 0.01552 0.04916 0.05563 0.04657 0.06339 0.02975 0.05045
Table continues below
K L M N P Q R S
0.05692 0.09702 0.03493 0.03105 0.04398 0.03752 0.0621 0.07374
T V W Y
0.05692 0.0621 0.02329 0.04269

Check for the presence of “U” (unknown amino acid)

aa.names <- names(cpt1a.human.aa.freq)
any(aa.names == "U")
## [1] FALSE
i.U <- which(aa.names == "U")
aa.names[i.U]
## character(0)
cpt1a.human.aa.freq[i.U]
## named numeric(0)

Add data on CPT1A to the amino acid frequency table

aa.prop$cpt1a.human.aa.freq <- cpt1a.human.aa.freq
pander::pander(aa.prop)
  alpha.prop beta.prop a.plus.b.prop a.div.b cpt1a.human.aa.freq
A 0.1165 0.07313 0.09264 0.08331 0.06727
R 0.02166 0.02414 0.04129 0.03369 0.01552
N 0.03964 0.05007 0.06353 0.04223 0.04916
D 0.06661 0.04359 0.05876 0.05631 0.05563
C 0.008991 0.02702 0.03917 0.01454 0.04657
Q 0.02738 0.04395 0.03917 0.02631 0.06339
E 0.05476 0.03098 0.04553 0.05931 0.02975
G 0.08051 0.107 0.09052 0.08701 0.05045
H 0.04536 0.01765 0.01747 0.02469 0.05692
I 0.03719 0.04323 0.04923 0.05516 0.09702
L 0.09031 0.06376 0.05823 0.07824 0.03493
K 0.1018 0.04143 0.05929 0.07408 0.03105
M 0.01962 0.005764 0.01323 0.021 0.04398
F 0.05027 0.03062 0.02753 0.03646 0.03752
P 0.03351 0.04575 0.03759 0.04339 0.0621
S 0.04986 0.1228 0.0667 0.07547 0.07374
T 0.04863 0.09114 0.06194 0.05493 0.05692
W 0.01349 0.01585 0.01588 0.01662 0.0621
Y 0.02575 0.03963 0.05717 0.03 0.02329
V 0.06825 0.08249 0.06511 0.08724 0.04269

Functions to calculate similarities

# Correlation used in Chou and Zhange 1992.
chou_cor <- function(x,y){
  numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)
}

# Cosine similarity used in Higgs and Attwood (2005). 
chou_cosine <- function(z.1, z.2){
  z.1.abs <- sqrt(sum(z.1^2))
  z.2.abs <- sqrt(sum(z.2^2))
  my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs)
  return(my.cosine)
}

Calculate correlation between each column

corr.alpha <- chou_cor(aa.prop[,5], aa.prop[,1])
corr.beta  <- chou_cor(aa.prop[,5], aa.prop[,2])
corr.apb   <- chou_cor(aa.prop[,5], aa.prop[,3])
corr.adb   <- chou_cor(aa.prop[,5], aa.prop[,4])

Calculate cosine similarity

cos.alpha <- chou_cosine(aa.prop[,5], aa.prop[,1])
cos.beta  <- chou_cosine(aa.prop[,5], aa.prop[,2])
cos.apb   <- chou_cosine(aa.prop[,5], aa.prop[,3])
cos.adb   <- chou_cosine(aa.prop[,5], aa.prop[,4])

Calculate distance

aa.prop.flipped <- t(aa.prop)
round(aa.prop.flipped,2) 
##                        A    R    N    D    C    Q    E    G    H    I    L    K
## alpha.prop          0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09 0.10
## beta.prop           0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06 0.04
## a.plus.b.prop       0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06 0.06
## a.div.b             0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08 0.07
## cpt1a.human.aa.freq 0.07 0.02 0.05 0.06 0.05 0.06 0.03 0.05 0.06 0.10 0.03 0.03
##                        M    F    P    S    T    W    Y    V
## alpha.prop          0.02 0.05 0.03 0.05 0.05 0.01 0.03 0.07
## beta.prop           0.01 0.03 0.05 0.12 0.09 0.02 0.04 0.08
## a.plus.b.prop       0.01 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## a.div.b             0.02 0.04 0.04 0.08 0.05 0.02 0.03 0.09
## cpt1a.human.aa.freq 0.04 0.04 0.06 0.07 0.06 0.06 0.02 0.04

Create distance matrix

dist(aa.prop.flipped, method = "euclidean")
##                     alpha.prop  beta.prop a.plus.b.prop    a.div.b
## beta.prop           0.13342098                                    
## a.plus.b.prop       0.09281824 0.08289406                         
## a.div.b             0.06699039 0.08659174    0.06175113           
## cpt1a.human.aa.freq 0.15467072 0.13760320    0.12127782 0.12942144

Individual distances using dist()

dist.alpha <- dist((aa.prop.flipped[c(1,5),]),  method = "euclidean")
dist.beta  <- dist((aa.prop.flipped[c(2,5),]),  method = "euclidean")
dist.apb   <- dist((aa.prop.flipped[c(3,5),]),  method = "euclidean")
dist.adb  <- dist((aa.prop.flipped[c(4,5),]), method = "euclidean")

Compile the information and round to make it easier to read.

# fold types
fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")

# data
corr.sim <- round(c(corr.alpha,corr.beta,corr.apb,corr.adb),5)
cosine.sim <- round(c(cos.alpha,cos.beta,cos.apb,cos.adb),5)
Euclidean.dist <- round(c(dist.alpha,dist.beta,dist.apb,dist.adb),5)

# summary
sim.sum <- c("","","most.sim","")
dist.sum <- c("","","min.dist","")

df <- data.frame(fold.type,
           corr.sim ,
           cosine.sim ,
           Euclidean.dist ,
           sim.sum ,
           dist.sum )

Display output

pander::pander(df)
fold.type corr.sim cosine.sim Euclidean.dist sim.sum dist.sum
alpha 0.8089 0.8089 0.1547
beta 0.8525 0.8525 0.1376
alpha plus beta 0.8733 0.8733 0.1213 most.sim min.dist
alpha/beta 0.859 0.859 0.1294

Percent Identity Comparisons (PID)

Data Preparation

Convert the FASTA entries into a single vector

names(cpt1a_list)
##  [1] "NP_001867.2"    "XP_001173853.1" "NP_038523.2"    "NP_113747.2"   
##  [5] "NP_001012916.1" "NP_001038319.1" "XP_004913491.1" "XP_006019242.1"
##  [9] "XP_009472179.1" "XP_021152303.1"
length(cpt1a_list)
## [1] 10

Display the first entry

cpt1a_list[1]
## $NP_001867.2
## [1] "MAEAHQAVAFQFTVTPDGIDLRLSHEALRQIYLSGLHSWKKKFIRFKNGIITGVYPASPSSWLIVVVGVMTTMYAKIDPSLGIIAKINRTLETANCMSSQTKNVVSGVLFGTGLWVALIVTMRYSLKVLLSYHGWMFTEHGKMSRATKIWMGMVKIFSGRKPMLYSFQTSLPRLPVPAVKDTVNRYLQSVRPLMKEEDFKRMTALAQDFAVGLGPRLQWYLKLKSWWATNYVSDWWEEYIYLRGRGPLMVNSNYYAMDLLYILPTHIQAARAGNAIHAILLYRRKLDREEIKPIRLLGSTIPLCSAQWERMFNTSRIPGEETDTIQHMRDSKHIVVYHRGRYFKVWLYHDGRLLKPREMEQQMQRILDNTSEPQPGEARLAALTAGDRVPWARCRQAYFGRGKNKQSLDAVEKAAFFVTLDETEEGYRSEDPDTSMDSYAKSLLHGRCYDRWFDKSFTFVVFKNGKMGLNAEHSWADAPIVAHLWEYVMSIDSLQLGYAEDGHCKGDINPNIPYPTRLQWDIPGECQEVIETSLNTANLLANDVDFHSFPFVAFGKGIIKKCRTSPDAFVQLALQLAHYKDMGKFCLTYEASMTRLFREGRTETVRSCTTESCDFVRAMVDPAQTVEQRLKLFKLASEKHQHMYRLAMTGSGIDRHLFCLYVVSKYLAVESPFLKEVLSEPWRLSTSQTPQQQVELFDLENNPEYVSSGGGFGPVADDGYGVSYILVGENLINFHISSKFSCPETDSHRFGRHLKEAMTDIITLFGLSSNSKK"

Make each list entry into a vector

cpt1a_vector <- rep(NA, length(cpt1a_list))

# creating vector
for(i in 1:length(cpt1a_vector)){
  cpt1a_vector[i] <- cpt1a_list[[i]]
}

Name the vector

names(cpt1a_vector) <- names(cpt1a_list)

PID Table

Pairwise alignments for humans, chimpanzees, mice, and horses

align01.02 <- Biostrings::pairwiseAlignment(
  cpt1a_vector[[1]], cpt1a_vector[[2]])

Biostrings::pid(align01.02)
## [1] 99.48254
align01.03 <- Biostrings::pairwiseAlignment(
  cpt1a_vector[[1]], cpt1a_vector[[3]])

Biostrings::pid(align01.03)
## [1] 86.54592
align01.08 <- Biostrings::pairwiseAlignment(
  cpt1a_vector[[1]], cpt1a_vector[[8]])

Biostrings::pid(align01.08)
## [1] 80.72445
align02.03 <- Biostrings::pairwiseAlignment(
  cpt1a_vector[[2]], cpt1a_vector[[3]])

Biostrings::pid(align01.08)
## [1] 80.72445
align02.08 <- Biostrings::pairwiseAlignment(
  cpt1a_vector[[2]], cpt1a_vector[[8]])

Biostrings::pid(align01.08)
## [1] 80.72445
align03.08 <- Biostrings::pairwiseAlignment(
  cpt1a_vector[[3]], cpt1a_vector[[8]])

Biostrings::pid(align01.08)
## [1] 80.72445

Build matrix

pids <- c(1,                  NA,     NA,     NA,
          pid(align01.02),          1,     NA,     NA,
          pid(align01.03), pid(align02.03),      1,     NA,
          pid(align01.08), pid(align02.08), pid(align03.08), 1)

mat <- matrix(pids, nrow = 4, byrow = T)
row.names(mat) <- c("Human","Chimpanzee","Mouse","Alligator")   
colnames(mat) <- c("Human","Chimpanzee","Mouse","Alligator")   
pander::pander(mat)  
  Human Chimpanzee Mouse Alligator
Human 1 NA NA NA
Chimpanzee 99.48 1 NA NA
Mouse 86.55 86.42 1 NA
Alligator 80.72 80.72 79.56 1

PID Methods Comparison

Compare different PID methods for humans and chimpanzees

Prepare the data

humancpt1a_string <-paste(cpt1a_vector[[1]],
                          collapse = "")    

# convert ulcerans_vector to an object ulceransseq_string
chimpcpt1a_string <-paste(cpt1a_vector[[2]],
                          collapse = "")
humancpt1a_string   <- toupper(humancpt1a_string)
chimpcpt1a_string <- toupper(chimpcpt1a_string)

Create pairwise alignment between human and chimpanzee

data(BLOSUM50)
globalAlignHumanChimp <- Biostrings::pairwiseAlignment(humancpt1a_string, 
                              chimpcpt1a_string,
                  substitutionMatrix = BLOSUM50, 
                                gapOpening = -8, 
                              gapExtension = -2, 
                              scoreOnly = FALSE)

Calculate PIDs with different methods and create a table of the results

PID1.results <- pid(globalAlignHumanChimp, type = "PID1")
PID2.results <- pid(globalAlignHumanChimp, type = "PID2")
PID3.results <- pid(globalAlignHumanChimp, type = "PID3")
PID4.results <- pid(globalAlignHumanChimp, type = "PID4")

methods <- c("PID1", "PID2", "PID3", "PID4")
PID <- c(PID1.results, PID2.results,
                 PID3.results, PID4.results)
denominator <- c("(aligned positions + internal gap positions)", "(aligned positions)", "(length shorter sequence)", "(average length of the two sequences)")

PID.df <- data.frame(methods, PID, denominator)
pander(PID.df)
methods PID denominator
PID1 99.48 (aligned positions + internal gap positions)
PID2 99.48 (aligned positions)
PID3 99.48 (length shorter sequence)
PID4 99.48 (average length of the two sequences)

Multiple Sequence Alignment

MSA data preparation

cpt1a_vector_ss <- Biostrings::AAStringSet(cpt1a_vector)

Building Multiple Sequence Alignment (MSA)

cpt1a_align <-msa(cpt1a_vector_ss,
                  method = "ClustalW")
## use default substitution matrix

Cleaning/Setting up an MSA

msa produces a species MSA objects

class(cpt1a_align)
## [1] "MsaAAMultipleAlignment"
## attr(,"package")
## [1] "msa"
is(cpt1a_align)
## [1] "MsaAAMultipleAlignment" "AAMultipleAlignment"    "MsaMetaData"           
## [4] "MultipleAlignment"

Default output of MSA

cpt1a_align
## CLUSTAL 2.1  
## 
## Call:
##    msa(cpt1a_vector_ss, method = "ClustalW")
## 
## MsaAAMultipleAlignment with 10 rows and 917 columns
##      aln                                                   names
##  [1] MAEAHQAVAFQFTVTPDGIDLRMSH...------------------------- XP_009472179.1
##  [2] MAEAHQAVAFQFTVTPDGIDLRMSH...IGNKEGNSPVLTSAFNMLPKSRKDR XP_021152303.1
##  [3] MAEAHQAVAFQFTVTPDGIDLRMSH...------------------------- NP_001012916.1
##  [4] MAEAHQAVAFQFTVTPDGIDLRMSH...------------------------- XP_006019242.1
##  [5] MAEAHQAVAFQFTVTPDGIDLRLSH...------------------------- NP_001867.2
##  [6] MAEAHQAVAFQFTVTPDGIDLRLSH...------------------------- XP_001173853.1
##  [7] MAEAHQAVAFQFTVTPDGIDLRLSH...------------------------- NP_038523.2
##  [8] MAEAHQAVAFQFTVTPDGIDLRLSH...------------------------- NP_113747.2
##  [9] MAEAHQAVAFQFTVTPDGIDLRLSH...------------------------- XP_004913491.1
## [10] MAEAHQAVAFQFTVGPDGIDLQLSH...------------------------- NP_001038319.1
##  Con MAEAHQAVAFQFTVTPDGIDLRLSH...------------------------- Consensus

Change class of alignment

class(cpt1a_align) <- "AAMultipleAlignment"
is(cpt1a_align)
## [1] "AAMultipleAlignment" "MultipleAlignment"
cpt1a_align
## AAMultipleAlignment with 10 rows and 917 columns
##       aln                                                   names               
##  [1] MAEAHQAVAFQFTVTPDGIDLRMSH...------------------------- XP_009472179.1
##  [2] MAEAHQAVAFQFTVTPDGIDLRMSH...IGNKEGNSPVLTSAFNMLPKSRKDR XP_021152303.1
##  [3] MAEAHQAVAFQFTVTPDGIDLRMSH...------------------------- NP_001012916.1
##  [4] MAEAHQAVAFQFTVTPDGIDLRMSH...------------------------- XP_006019242.1
##  [5] MAEAHQAVAFQFTVTPDGIDLRLSH...------------------------- NP_001867.2
##  [6] MAEAHQAVAFQFTVTPDGIDLRLSH...------------------------- XP_001173853.1
##  [7] MAEAHQAVAFQFTVTPDGIDLRLSH...------------------------- NP_038523.2
##  [8] MAEAHQAVAFQFTVTPDGIDLRLSH...------------------------- NP_113747.2
##  [9] MAEAHQAVAFQFTVTPDGIDLRLSH...------------------------- XP_004913491.1
## [10] MAEAHQAVAFQFTVGPDGIDLQLSH...------------------------- NP_001038319.1

Convert to seqinr format

cpt1a_align_seqinr <- msaConvert(cpt1a_align, type = "seqinr::alignment") 
is(cpt1a_align_seqinr)
## [1] "alignment"

Show output with print_msa

compbio4all::print_msa(cpt1a_align_seqinr)
## [1] "MAEAHQAVAFQFTVTPDGIDLRMSHEALKQIYLSGVHSWKKKFIRFKNGIITGVYPASPS 0"
## [1] "MAEAHQAVAFQFTVTPDGIDLRMSHEALKQIYLSGVHSWKKKFIRFKNGIITGVYPASPS 0"
## [1] "MAEAHQAVAFQFTVTPDGIDLRMSHEALKQIYLSGVHSWKKKFIRFKNGIITGVYPASPS 0"
## [1] "MAEAHQAVAFQFTVTPDGIDLRMSHEALKQVYLSGLHSWKKKFIRFKNGIITGVYPASPS 0"
## [1] "MAEAHQAVAFQFTVTPDGIDLRLSHEALRQIYLSGLHSWKKKFIRFKNGIITGVYPASPS 0"
## [1] "MAEAHQAVAFQFTVTPDGIDLRLSHEALRQIYLSGLHSWKKKFIRFKNGIITGVYPASPS 0"
## [1] "MAEAHQAVAFQFTVTPDGIDLRLSHEALKQICLSGLHSWKKKFIRFKNGIITGVFPASPS 0"
## [1] "MAEAHQAVAFQFTVTPDGIDLRLSHEALKQICLSGLHSWKKKFIRFKNGIITGVFPANPS 0"
## [1] "MAEAHQAVAFQFTVTPDGIDLRLSHEALRQIYLSGLHSWKKKFIRFKNGILTGVFPSSPS 0"
## [1] "MAEAHQAVAFQFTVGPDGIDLQLSHEALRQVYLSGIHSWKKKFIRFKNGILNGVYPGSAP 0"
## [1] " "
## [1] "SWLIVVVGVMST-MYAKIDPSLGIIAKINRTLDTTGYMSNQTQNIVSGILFGTGLWVALI 0"
## [1] "SWLIVVVGVMST-MYAKIDPSLGIIAKINRTLDTTGCMSNQTQNIVSGILFGTGLWVALI 0"
## [1] "SWLIVVVGVMST-MYAKIDPSLGIIAKINRTLDTTGYMSNQTQNIVSGILFGTGLWVALI 0"
## [1] "SWLIVVVGVMST-MYAKIDPSLGIISKINQTLDTTGYMSNQTQNIVSGILFGTGLWVALI 0"
## [1] "SWLIVVVGVMTT-MYAKIDPSLGIIAKINRTLETANCMSSQTKNVVSGVLFGTGLWVALI 0"
## [1] "SWLIVVVGVMTT-MYAKIDPSLGIIAKINRTLETANCMSSQTKNVVSGVLFGTGLWVALI 0"
## [1] "SWLIVVVGVISS-MHTKVDPSLGMIAKINRTLDTTGRMSSQTKNIVSGVLFGTGLWVAII 0"
## [1] "SWLIVVVGVISS-MHAKVDPSLGMIAKISRTLDTTGRMSSQTKNIVSGVLFGTGLWVAVI 0"
## [1] "SWLIVVVGVTSASMYTKVDPSFGIIEKINNALSATGYMTPQTQNIVSGVLFGTGLWVSLI 0"
## [1] "GFVLVLAGYLGRAQYLKVDPSLGLLFKLGNYVPISRYMSEQKQMLVGGVMVGTGLWIAII 0"
## [1] " "
## [1] "VTMRYSLKMLLSYHGWMFAEHGKLSAGTKFWMALVKLFSGRK-PMLYSFQTSLPRLPVPA 0"
## [1] "VTMRYSLKMLLSYHGWMFAEHGKLSAGTRVWMALVKLFSGRK-PMLYSFQTSLPRLPVPA 0"
## [1] "VTMRYSLKMLLSYHGWMFAEHGKLSAGTKLWMTLVKLFSGRK-PMLYSFQTSLPRLPVPA 0"
## [1] "ITMRYSLKMLLSYHGWMFAEHGRLSTGTKIWMSLVKLFSGCK-PMLYSFQTSLPRLPVPS 0"
## [1] "VTMRYSLKVLLSYHGWMFTEHGKMSRATKIWMGMVKIFSGRK-PMLYSFQTSLPRLPVPA 0"
## [1] "VTMRYSLKVLLSYHGWMFTEHGKMSRATKIWMGMVKIFSGRK-PMLYSFQTSLPRLPVPA 0"
## [1] "MTMRYSLKVLLSYHGWMFAEHGKMSRSTRIWMAMVKVFSGRK-PMLYSFQTSLPRLPVPA 0"
## [1] "MTMRYSLKVLLSYHGWMFAEHGKMSRSTKIWMAMVKVLSGRK-PMLYSFQTSLPRLPVPA 0"
## [1] "ATMRYSLKKLLSYHGWMFEEHGKLSASTRIWMGMVKLLSGRK-PMLYSFQTSLPRLPVPP 0"
## [1] "FGMRTVLKGLLSWHGWMFASHGRMTWKIRLWLVFVKVFSGMQTPMLYSFQNSLPHLFVPS 0"
## [1] " "
## [1] "VKDTVNRYLESVRPLMDDEEFRRMEGLAKDFAFNLGPRLQWYLKLKSWWATNYVSDWWEE 0"
## [1] "VKDTVNRYLESVRPLMDDGEFRRMEGLAKDFAFNLGPRLQWYLKLKSWWATNYVSDWWEE 0"
## [1] "VKDTVNRYLESVRPLMNDEEFKRMEGLAKDFAFNLGPRLQWYLKLKSWWATNYVSDWWEE 0"
## [1] "IKDTVTRYLESVRPLMNDVEFKRMDGLAKDFAVNLGPRLQWYLKLKSWWATNYVSDWWEE 0"
## [1] "VKDTVNRYLQSVRPLMKEEDFKRMTALAQDFAVGLGPRLQWYLKLKSWWATNYVSDWWEE 0"
## [1] "VKDTVNRYLQSVRPLMKEEDFKRMTALAQDFAVGLGPRLQWYLKLKSWWATNYVSDWWEE 0"
## [1] "VKDTVSRYLESVRPLMKEGDFQRMTALAQDFAVNLGPKLQWYLKLKSWWATNYVSDWWEE 0"
## [1] "VKDTVSRYLESVRPLMKEEDFQRMTALAQDFAVNLGPKLQWYLKLKSWWATNYVSDWWEE 0"
## [1] "VKDTVKRYLDSVKPLMDKEKFERMEGLAKDFANNLGPRLQWYLKLKSWWATNYVSDWWEE 0"
## [1] "VKETTRRYLESVQPLLSDEEHQRMQRLALDFERNLGPKLQWYLKLKSWWATNYVSDWWEE 0"
## [1] " "
## [1] "YIYLRGRGPIMVNSNYFAMDFLYLSPTTVQAARAGNVIHAILLYRKKLDRQEIKPILLMG 0"
## [1] "YVYLRGRGPIMVNSNYFAMDFLYLSPTTLQAARAGNVIHAILLYRKKLDRQEIKPILLMG 0"
## [1] "YIYLRGRGPIMVNSNYFAMDFLHLSPTTIQAARAGNIIHAILLYRKKLDRQEIKPILLMG 0"
## [1] "YIYLRGRGPIMVNSNYYAMDFLCVAPTSIQAARAGNVIHALLLYRKQLDRQEIKPILLMG 0"
## [1] "YIYLRGRGPLMVNSNYYAMDLLYILPTHIQAARAGNAIHAILLYRRKLDREEIKPIRLLG 0"
## [1] "YIYLRGRGPLMVNSNYYAMDLLYILPTHIQAARAGNAIHAILLYRRKLDREEIKPIRLLG 0"
## [1] "YIYLRGRGPIMVNSNYYAMEMLYITPTHIQAARAGNTIHAILLYRRTVDREELKPIRLLG 0"
## [1] "YIYLRGRGPLMVNSNYYAMEMLYITPTHIQAARAGNTIHAILLYRRTLDREELKPIRLLG 0"
## [1] "YIYLRGRGPIMVNSNYYAMDFLYVLPTHIQAARAGNAIHAILLYRRKLDREEIKPIQLMG 0"
## [1] "YIYLRGRGPIMVNSNYYVMDFLYAFPTNIQAARAGNVIHAIMLYRRKLDRAQIKPVFALS 0"
## [1] " "
## [1] "STVPLCSAQWERMFNTSRIPGEESDTLQHVKDSKHIVVYHKGRYFKVWLYHDGRLLKPRE 0"
## [1] "STVPLCSAQWERMFNTSRIPGEESDTLQHVKDSKHIVVYHKGRYFKVWLYHDGRLLKPRE 0"
## [1] "STVPLCSAQWERMFNTSRIPGEESDTLQHVKDSKHIVVYHKGRYFKVWLYHDGRLLKPRE 0"
## [1] "STVPLCSAQWERMFNTSRIPGEETDTLQHVKDSKHIVVYHKGRYFKVWVYHDGRLLKPRE 0"
## [1] "STIPLCSAQWERMFNTSRIPGEETDTIQHMRDSKHIVVYHRGRYFKVWLYHDGRLLKPRE 0"
## [1] "STIPLCSAQWERMFNTSRIPGEETDTSQHMRDSKHIVVYHRGRYFKVWLYHDGRLLKPRE 0"
## [1] "STIPLCSAQWERLFNTSRIPGEETDTIQHVKDSRHIVVYHRGRYFKVWLYHDGRLLRPRE 0"
## [1] "STIPLCSAQWERLFNTSRIPGEETDTIQHIKDSRHIVVYHRGRYFKVWLYHDGRLLRPRE 0"
## [1] "STVPLCSAQWERMYNTSRIPGEETDTIQHVKDSKHIVVYHKGRYFKVWLYHDGRLLKPRE 0"
## [1] "NSVPLCSAQWERLFNTSRIPGKERDSVQHVSDSRHIVVYHRGRYFKVWMFYDGRLLLPRE 0"
## [1] " "
## [1] "IEQQMQRILDDDSEPQAGEEKLAALTAGDRVPWAKARQTYFSCGKNKQSLDAVEKAAFFV 0"
## [1] "IEQQMQRILDDDSEPQTGEKKLAALTAGDRVPWARARQAYFSHGKNKQSLDAVEKAAFFV 0"
## [1] "IEQQIQRILDDDSEPQAGEEKLAALTAGDRVPWAKARQAYFSRGKNKQSLDAVEKAAFFV 0"
## [1] "IEQQMQRILDDSSKSLPGEEKLAALTAGDRIPWAKARQAYFGRGKNKHSLDAVEKAAFFV 0"
## [1] "MEQQMQRILDNTSEPQPGEARLAALTAGDRVPWARCRQAYFGRGKNKQSLDAVEKAAFFV 0"
## [1] "MEQQMQRILDNTSEPQPGEARLAALTAGDRVPWARCRQAYFGRGKNKQSLDAVEKAAFFV 0"
## [1] "LEQQMQQILDDTSEPQPGEAKLAALTAADRVPWAKCRQTYFARGKNKQSLDAVEKAAFFV 0"
## [1] "LEQQMQQILDDPSEPQPGEAKLAALTAADRVPWAKCRQTYFARGKNKQSLDAVEKAAFFV 0"
## [1] "IEHQMQKIIDDTSSPQPGEEKLAALTAGDRVPWAKARKAYFANGKNKQSMDAVEKAAFFV 0"
## [1] "IEQQMERILADTSEPQPGEETLAALTAGDRVPWACARNAYLRHGTNKKSLDSVEKAAFFV 0"
## [1] " "
## [1] "TLDDIEQGYRKEDPVMSLDAYAKSLMHGRCYDRWFDKAFTLIVFKNGRMGLNAEHSWADA 0"
## [1] "TLDDIEQGYRKEDPVRSLDAYAKSLIHGRCYDRWFDKTFTLIVFKNGRMGLNAEHSWADA 0"
## [1] "TLDDDEQGYSKEDPVSSLDAYAKSLIHGRCYDRWFDKTFTLVVFKNGKMGLNAEHSWADA 0"
## [1] "TLDESEQGYREEDPVTSLDTYAKSLLHGKCYDRWFDKTFTLVVFKNGKMGLNAEHSWADA 0"
## [1] "TLDETEEGYRSEDPDTSMDSYAKSLLHGRCYDRWFDKSFTFVVFKNGKMGLNAEHSWADA 0"
## [1] "TLDETEEGYRSEDPDTSMDSYAKSLLHGRCYDRWFDKSFTFVVFKNGKMGLNAEHSWADA 0"
## [1] "TLDESEQGYREEDPEASIDSYAKSLLHGRCFDRWFDKSITFVVFKNSKIGINAEHSWADA 0"
## [1] "TLDESEQGYREEDPEASIDSYAKSLLHGRCFDRWFDKSITFVVFKNSKIGINAEHSWADA 0"
## [1] "TLDETEQGYNKEDPVNSLDSYAKSLLHGKCYDRWFDKTMSFVVFKNGKMGMNVEHSWADA 0"
## [1] "TLDDTEQRFDQKNPVESLDRYAKSLLHGKCYDRWFDKSINLIIFKNGTMGLNAEHSWADA 0"
## [1] " "
## [1] "PIVGHLWENVMTTEYLELGYSEDGHCKGDINQNIPIPTKLQWEIPEECQEVIERSLSTAI 0"
## [1] "PIVGHLWENVMATEYLELGYSEDGHCKGDTNQNIPIPTKLQWEIPEECQEVIERSLSTAI 0"
## [1] "PIVGHLWENVMATEYLELGYLEDGHCKGDTNQNIPIPTKLQWEIPEECQDVIERSLSTAR 0"
## [1] "PIIGHLWETTMANDYLLLGYSEDGHCRGDTHQNIPFPTRLQWEIPEECQEVIEKSLSTAS 0"
## [1] "PIVAHLWEYVMSIDSLQLGYAEDGHCKGDINPNIPYPTRLQWDIPGECQEVIETSLNTAN 0"
## [1] "PIVAHLWEYVMSTDSLQLGYAEDGHCKGDTNPNIPYPTRLQWDIPGECQEVIETSLNTAN 0"
## [1] "PIVGHLWEYVMATDVFQLGYSEDGHCKGDKNPNIPKPTRLQWDIPGECQEVIETSLSSAS 0"
## [1] "PVVGHLWEYVMATDVFQLGYSEDGHCKGDTNPNIPKPTRLQWDIPGECQEVIDASLSSAS 0"
## [1] "PIVGHLWEYVMATDKMELGYNEDGHCKGDVNGNIPPPSRLQWDIPEECQNVVEESLTVAK 0"
## [1] "PIVGHLWEQVLSMDPVKLGYTEDGHCKGEPHANLPGPQRLQWNIPTECQTMITNSLSVAE 0"
## [1] " "
## [1] "ALADDVDFHSFFFDAFGKGLIKKAKTSPDAFVQLALQLAHYRDMGKFSLTYEASMTRLFR 0"
## [1] "ALADDVDFHSFFFDAFGKGLIKKAKTSPDGFVQLALQLAHYRDMGKFSLTYEASMTRLFR 0"
## [1] "ALADDVDFYSFYFDVFGKGLIKKAKTSPDAFIQLALQLAHYRDMGKFSLTYEASMTRLFR 0"
## [1] "ALAEDVDFHSFPFDTFGKGLIKKAKISPDAFVQLSLQLAHYRDMGKFCLTYEASMTRLFR 0"
## [1] "LLANDVDFHSFPFVAFGKGIIKKCRTSPDAFVQLALQLAHYKDMGKFCLTYEASMTRLFR 0"
## [1] "LLANDVDFHSFPFVAFGKGIIKKCRTSPDAFVQLALQLAHYKDMGKFCLTYEASMTRLFR 0"
## [1] "FLANDVDLHSFPFDTFGKGLIKKCRTSPDAFIQLALQLAHYKDMGKFCLTYEASMTRLFR 0"
## [1] "LLANDVDLHSFPFDSFGKGLIKKCRTSPDAFIQLALQLAHYKDMGKFCLTYEASMTRLFR 0"
## [1] "ALADDVDFHSFPFNSFGKGLIKKSRTSPDAFVQLSLQLAHYRDKEKFCLTYEASMTRLFR 0"
## [1] "ALADDVDSHIIPFSDFGKGLIKKCRTSPDAFIQLALQLAHYRDKGKFCLTYEASMTRLFR 0"
## [1] " "
## [1] "EGRTETVRSCTVESCNFVQTMEDPTESNENKLKSFRLAAAKHQHLYRLAMTGAGVDRHLF 0"
## [1] "EGRTETVRSCTVESCNFVRTMEDPTESNENKLKLFQLAAAKHQHLYRLAMTGAGIDRHLF 0"
## [1] "EGRTETVRSCTIESCNFVQTMENPSESNENKMKSFRLAATKHQHLYRLAMTGAGIDRHLF 0"
## [1] "EGRTETVRSCTTESCSFVRAMEDPAESTEKKLKLLRIAAANHQHLYRFAMTGAGVDRHLF 0"
## [1] "EGRTETVRSCTTESCDFVRAMVDPAQTVEQRLKLFKLASEKHQHMYRLAMTGSGIDRHLF 0"
## [1] "EGRTETVRSCTTESCDFVRAMVDPAQTVEQRLKLFKLASEKHQHMYRLAMTGSGIDRHLF 0"
## [1] "EGRTETVRSCTTESCNFVLAMMDPTTTAEQRFKLFKIACEKHQHLYRLAMTGAGIDRHLF 0"
## [1] "EGRTETVRSCTMESCNFVQAMMDPKSTAEQRLKLFKIACEKHQHLYRLAMTGAGIDRHLF 0"
## [1] "EGRTETVRSCTIESCDFVLAMSDPSQTNEKRLQLFKEAAEKHQQMYRLAMTGSGIDRHLF 0"
## [1] "EGRTETVRSCTTESCAFVRAMN-SNHTREQKLQLLKNAAEKHQQMYRLAMTGHGIDRHLF 0"
## [1] " "
## [1] "CLYVVSKYLAVDSPFLKEVLSEPWRLSTSQTPQQHIDLK---KNPEMLSCGGGFGPVADD 0"
## [1] "CLYVVSKYLAVDSPFLKEVLSEPWRLSTSQTPQQHIDLK---KNPEMLSCGGGFGPVADD 0"
## [1] "CLYVVSKYLSVDSPFLKEVLSEPWRLSTSQTPQQHIDLK---KNPEMLSSGGGFGPVADD 0"
## [1] "CLYVVSKYLGVDSPFLKEVLSEPWRLSTSQTPHQHIDLE---KNPECVCSGGGFGPVADD 0"
## [1] "CLYVVSKYLAVESPFLKEVLSEPWRLSTSQTPQQQVELFDLENNPEYVSSGGGFGPVADD 0"
## [1] "CLYVVSKYLAMESPFLKEVLSEPWRLSTSQTPQQQVELFDLENNPEYVSSGGGFGPVADD 0"
## [1] "CLYVVSKYLAVDSPFLKEVLSEPWRLSTSQTPQQQVELFDFEKYPDYVSCGGGFGPVADD 0"
## [1] "CLYVVSKYLAVDSPFLKEVLSEPWRLSTSQTPQQQVELFDFEKNPDYVSCGGGFGPVADD 0"
## [1] "CLYVVSKYLGVDSPFLKEVLSEPWRLSTSQTPQQQVHLFQLEKFPENVSSGGGFGPVADD 0"
## [1] "CLYVVLKYLGQDSPFLKEVLSEPWRLSTSQTPLQQGELFDLVKNPEYVTSGGGFGPVADD 0"
## [1] " "
## [1] "GYGVSYIILGENSIHFHVSSKISCSET-----------------DSHRFG--KNIQKAMV 0"
## [1] "GYGVSYIILDEDSIHFHVSSKISCSDTFENGPLQPVDSTATFHLESHMHGQVKMTSEGYC 0"
## [1] "GYGVSYIILDENSIHFHVSSKFSCSET-----------------DSHRFG--KNIQKALV 0"
## [1] "GYGVSYIIVGEKLINFHVSSKFSCPET-----------------DSHRFG--ENIRRAMI 0"
## [1] "GYGVSYILVGENLINFHISSKFSCPET-----------------DSHRFG--RHLKEAMT 0"
## [1] "GYGVSYILVGENLINFHISSKFSCPET-----------------DSHRFG--RHLKEAMT 0"
## [1] "GYGVSYIIVGENFIHFHISSKFSSPET-----------------DSHRFG--KHLRQAMM 0"
## [1] "GYGVSYIIVGENFIHFHISSKFSSPET-----------------DSHRFG--KHLRQAMM 0"
## [1] "GYGVSYIIVGENLINFHISSKFSSPET-----------------DSHRFG--KHIKQAMI 0"
## [1] "GYGVAYVIVGEKLINFHISSKRSSPET-----------------DSHRFG--NNIRRAMI 0"
## [1] " "
## [1] "DIMGLFTHSKNCTK---------------------------------------------- 0"
## [1] "DCFANGDFCKNCNCDNCCNNQLHETERLTAIKACLERNPEAFLPKIGKRKLGEIKHHHNK 0"
## [1] "DIMGLFQPTKNCTK---------------------------------------------- 0"
## [1] "DIVSLFGFGKNSTK---------------------------------------------- 0"
## [1] "DIITLFGLSSNSKK---------------------------------------------- 0"
## [1] "DIITLFGLSSNSKK---------------------------------------------- 0"
## [1] "DIITLFGLTANSKK---------------------------------------------- 0"
## [1] "DIITLFGLTINSKK---------------------------------------------- 0"
## [1] "DILALFNISTNNSNKGKK------------------------------------------ 0"
## [1] "DMLDLFQLDKKDPK---------------------------------------------- 0"
## [1] " "
## [1] "------------------------------------------------------------ 0"
## [1] "GCNCKRSGCLENYCECFEAKIVCSSICKCTGCKNYEESPDENKQLNVINYMDIGNKEGNS 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] " "
## [1] "----------------- 43"
## [1] "PVLTSAFNMLPKSRKDR 43"
## [1] "----------------- 43"
## [1] "----------------- 43"
## [1] "----------------- 43"
## [1] "----------------- 43"
## [1] "----------------- 43"
## [1] "----------------- 43"
## [1] "----------------- 43"
## [1] "----------------- 43"
## [1] " "

Finished MSA

# key step - must have class set properly
class(cpt1a_align) <- "AAMultipleAlignment"

## run ggmsa
ggmsa::ggmsa(cpt1a_align,
             start = 0, 
             end = 50) 

Distance Matrix

This produces a “dist” object class

names(cpt1a_vector_ss)
##  [1] "NP_001867.2"    "XP_001173853.1" "NP_038523.2"    "NP_113747.2"   
##  [5] "NP_001012916.1" "NP_001038319.1" "XP_004913491.1" "XP_006019242.1"
##  [9] "XP_009472179.1" "XP_021152303.1"
names.focal <- c("NP_001867.2", "XP_001173853.1", "NP_038523.2", "NP_113747.2", "NP_001012916.1", "NP_001038319.1", "XP_004913491.1", "XP_006019242.1", "XP_009472179.1", "XP_021152303.1")
cpt1a_vector_ss[names.focal]
## AAStringSet object of length 10:
##      width seq                                              names               
##  [1]   773 MAEAHQAVAFQFTVTPDGIDLRL...RHLKEAMTDIITLFGLSSNSKK NP_001867.2
##  [2]   773 MAEAHQAVAFQFTVTPDGIDLRL...RHLKEAMTDIITLFGLSSNSKK XP_001173853.1
##  [3]   773 MAEAHQAVAFQFTVTPDGIDLRL...KHLRQAMMDIITLFGLTANSKK NP_038523.2
##  [4]   773 MAEAHQAVAFQFTVTPDGIDLRL...KHLRQAMMDIITLFGLTINSKK NP_113747.2
##  [5]   770 MAEAHQAVAFQFTVTPDGIDLRM...KNIQKALVDIMGLFQPTKNCTK NP_001012916.1
##  [6]   774 MAEAHQAVAFQFTVGPDGIDLQL...NNIRRAMIDMLDLFQLDKKDPK NP_001038319.1
##  [7]   778 MAEAHQAVAFQFTVTPDGIDLRL...QAMIDILALFNISTNNSNKGKK XP_004913491.1
##  [8]   770 MAEAHQAVAFQFTVTPDGIDLRM...ENIRRAMIDIVSLFGFGKNSTK XP_006019242.1
##  [9]   770 MAEAHQAVAFQFTVTPDGIDLRM...KNIQKAMVDIMGLFTHSKNCTK XP_009472179.1
## [10]   912 MAEAHQAVAFQFTVTPDGIDLRM...KEGNSPVLTSAFNMLPKSRKDR XP_021152303.1
cpt1a_vector_ss_subset <- cpt1a_vector_ss[names.focal]
cpt1a_align_subset <- msa(cpt1a_vector_ss_subset,
                     method = "ClustalW")
## use default substitution matrix
class(cpt1a_align_subset) <- "AAMultipleAlignment"
cpt1a_align_subset_seqinr <- msaConvert(cpt1a_align_subset, type = "seqinr::alignment")
cpt1a_subset_dist <- seqinr::dist.alignment(cpt1a_align_subset_seqinr, 
                      matrix = "identity")
is(cpt1a_subset_dist)
## [1] "dist"     "oldClass"
class(cpt1a_subset_dist)
## [1] "dist"

Round for display

cpt1a_align_seqinr_rnd <- round(cpt1a_subset_dist, 3)
cpt1a_align_seqinr_rnd
##                XP_009472179.1 XP_021152303.1 NP_001012916.1 XP_006019242.1
## XP_021152303.1          0.239                                             
## NP_001012916.1          0.228          0.284                              
## XP_006019242.1          0.366          0.391          0.368               
## NP_001867.2             0.434          0.450          0.441          0.438
## XP_001173853.1          0.435          0.449          0.440          0.438
## NP_038523.2             0.443          0.459          0.450          0.450
## NP_113747.2             0.444          0.466          0.450          0.454
## XP_004913491.1          0.440          0.462          0.438          0.443
## NP_001038319.1          0.554          0.567          0.555          0.541
##                NP_001867.2 XP_001173853.1 NP_038523.2 NP_113747.2
## XP_021152303.1                                                   
## NP_001012916.1                                                   
## XP_006019242.1                                                   
## NP_001867.2                                                      
## XP_001173853.1       0.072                                       
## NP_038523.2          0.367          0.369                        
## NP_113747.2          0.367          0.367       0.176            
## XP_004913491.1       0.424          0.426       0.436       0.443
## NP_001038319.1       0.545          0.545       0.547       0.553
##                XP_004913491.1
## XP_021152303.1               
## NP_001012916.1               
## XP_006019242.1               
## NP_001867.2                  
## XP_001173853.1               
## NP_038523.2                  
## NP_113747.2                  
## XP_004913491.1               
## NP_001038319.1          0.538

Phylogenetic trees of sequences

Use the distance matrix to build a phylogenetic tree

tree_subset <- nj(cpt1a_subset_dist)

Plotting phylogenetic trees

Plot the tree

# plot tree
plot.phylo(tree_subset, main="Phylogenetic Tree", 
            use.edge.length = F)

# add label
mtext(text = "CPT1A family gene tree - rooted, no branch lenths")