This code gives summary information about the gene CPT1A (carnitine palmitoyltransferase 1A) in both humans and its homologs in other species through alignments and phylogenetic trees.
Key information used in this code was found at these websites: Refseq Gene: https://www.ncbi.nlm.nih.gov/gene/1374 Refseq Homologene: https://www.ncbi.nlm.nih.gov/homologene/1413
Other Resources include: Neanderthal Genome: http://neandertal.ensemblgenomes.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000110090 REPPER: https://toolkit.tuebingen.mpg.de/jobs/7803820 Sub-cellular locations prediction: https://wolfpsort.hgc.jp/
Load necessary packages
library(BiocManager)
#only needs to be installed once
#install("drawProteins")
library(drawProteins)
# github packages
library(compbio4all)
#library(ggmsa)
# CRAN packages
library(rentrez)
library(seqinr)
library(ape)
##
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
##
## as.alignment, consensus
library(pander)
library(ggplot2)
# Bioconductor packages
#library(msa)
library(drawProteins)
## Biostrings
library(Biostrings)
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Warning: package 'S4Vectors' was built under R version 4.1.2
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: XVector
## Loading required package: GenomeInfoDb
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
##
## complement
## The following object is masked from 'package:seqinr':
##
## translate
## The following object is masked from 'package:base':
##
## strsplit
library(msa)
##
## Attaching package: 'msa'
## The following object is masked from 'package:BiocManager':
##
## version
library(ggmsa)
## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
## ggmsa v1.0.0 Document: http://yulab-smu.top/ggmsa/
##
## If you use ggmsa in published research, please cite: DOI: 10.18129/B9.bioc.ggmsa
library(HGNChelper)
## Warning: package 'HGNChelper' was built under R version 4.1.2
The accession numbers were obtained from RefSeq, RefSeq Homologene, UniProt, and PDB. Only the human protein was found in PDB.
HGNChelper::checkGeneSymbols(x = c("CPT1A"))
## Maps last updated on: Thu Oct 24 12:31:05 2019
## x Approved Suggested.Symbol
## 1 CPT1A TRUE CPT1A
Not available in: Neanderthal Drosophila
Create vectors
ncbi.protein.accession <- c("NP_001867.2", "XP_001173853.1", "NP_038523.2", "NP_113747.2", "NP_001012916.1", "NP_001038319.1", "XP_004913491.1", "XP_006019242.1", "XP_009472179.1", "XP_021152303.1")
Uniprot.id <- c("P50416", "K7D636", "P97742", "P32198", "Q6B842", "E7EYD4", "A9JRK5", "A0A1U7RUG7", "NA", "A0A2I0MI80")
PDB <- c("2LE3", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA")
Species <- c("Homo sapiens", "Pan troglodytes", "Mus musculus", "Rattus norvegicus", "Gallus gallus", "Danio rerio", "Xenopus tropicalis", "Alligator sinensis", "Nipponia nippon", "Columba livia")
Common.name <- c("Human", "Chimpanzee", "Mouse", "Rat", "Red junglefowl", "Zebrafish", "Western clawed frog", "Chinese alligator", "Crested ibis", "Rock Dove")
Gene.name <- c("CPT1A", "CPT1A", "CPT1A", "CPT1A", "CPT1A", "CPT1A", "LOC100490391", "CPT1A", "CPT1A", "CPT1A")
Convert the vectors into a table
cpt1a_table <- data.frame(ncbi.protein.accession, Uniprot.id, PDB, Species, Common.name, Gene.name)
Display the finished table
pander(cpt1a_table)
| ncbi.protein.accession | Uniprot.id | PDB | Species |
|---|---|---|---|
| NP_001867.2 | P50416 | 2LE3 | Homo sapiens |
| XP_001173853.1 | K7D636 | NA | Pan troglodytes |
| NP_038523.2 | P97742 | NA | Mus musculus |
| NP_113747.2 | P32198 | NA | Rattus norvegicus |
| NP_001012916.1 | Q6B842 | NA | Gallus gallus |
| NP_001038319.1 | E7EYD4 | NA | Danio rerio |
| XP_004913491.1 | A9JRK5 | NA | Xenopus tropicalis |
| XP_006019242.1 | A0A1U7RUG7 | NA | Alligator sinensis |
| XP_009472179.1 | NA | NA | Nipponia nippon |
| XP_021152303.1 | A0A2I0MI80 | NA | Columba livia |
| Common.name | Gene.name |
|---|---|
| Human | CPT1A |
| Chimpanzee | CPT1A |
| Mouse | CPT1A |
| Rat | CPT1A |
| Red junglefowl | CPT1A |
| Zebrafish | CPT1A |
| Western clawed frog | LOC100490391 |
| Chinese alligator | CPT1A |
| Crested ibis | CPT1A |
| Rock Dove | CPT1A |
#Download sequences
The sequences were downloaded using a wrapper compbio4all::entrez_fetch_list() which uses accesses NCBI databases to download the information and then store it in a list.
cpt1a_list <- entrez_fetch_list(db = "protein",
id = ncbi.protein.accession,
rettype = "fasta")
Number of FASTA files downloaded:
length(cpt1a_list)
## [1] 10
The first entry:
cpt1a_list[[1]]
## [1] ">NP_001867.2 carnitine O-palmitoyltransferase 1, liver isoform isoform 1 [Homo sapiens]\nMAEAHQAVAFQFTVTPDGIDLRLSHEALRQIYLSGLHSWKKKFIRFKNGIITGVYPASPSSWLIVVVGVM\nTTMYAKIDPSLGIIAKINRTLETANCMSSQTKNVVSGVLFGTGLWVALIVTMRYSLKVLLSYHGWMFTEH\nGKMSRATKIWMGMVKIFSGRKPMLYSFQTSLPRLPVPAVKDTVNRYLQSVRPLMKEEDFKRMTALAQDFA\nVGLGPRLQWYLKLKSWWATNYVSDWWEEYIYLRGRGPLMVNSNYYAMDLLYILPTHIQAARAGNAIHAIL\nLYRRKLDREEIKPIRLLGSTIPLCSAQWERMFNTSRIPGEETDTIQHMRDSKHIVVYHRGRYFKVWLYHD\nGRLLKPREMEQQMQRILDNTSEPQPGEARLAALTAGDRVPWARCRQAYFGRGKNKQSLDAVEKAAFFVTL\nDETEEGYRSEDPDTSMDSYAKSLLHGRCYDRWFDKSFTFVVFKNGKMGLNAEHSWADAPIVAHLWEYVMS\nIDSLQLGYAEDGHCKGDINPNIPYPTRLQWDIPGECQEVIETSLNTANLLANDVDFHSFPFVAFGKGIIK\nKCRTSPDAFVQLALQLAHYKDMGKFCLTYEASMTRLFREGRTETVRSCTTESCDFVRAMVDPAQTVEQRL\nKLFKLASEKHQHMYRLAMTGSGIDRHLFCLYVVSKYLAVESPFLKEVLSEPWRLSTSQTPQQQVELFDLE\nNNPEYVSSGGGFGPVADDGYGVSYILVGENLINFHISSKFSCPETDSHRFGRHLKEAMTDIITLFGLSSN\nSKK\n\n"
Remove FASTA header
for(i in 1:length(cpt1a_list)){
cpt1a_list[[i]] <- compbio4all::fasta_cleaner(cpt1a_list[[i]], parse = F)
}
#Protein Diagram
Use a UniProt accession number to download the data from UniProt in the form of a list
CPT1A_json <- drawProteins::get_features("P50416")
## [1] "Download has worked"
is(CPT1A_json)
## [1] "list" "vector" "list_OR_List" "vector_OR_Vector"
## [5] "vector_OR_factor"
Domains present
my_prot_df <- drawProteins::feature_to_dataframe(CPT1A_json)
my_canvas <- draw_canvas(my_prot_df)
my_canvas <- draw_chains(my_canvas, my_prot_df, label_size = 2.5)
my_canvas <- draw_recept_dom(my_canvas, my_prot_df)
my_canvas
Prepare the data
cpt1a_human_vector <- fasta_cleaner(cpt1a_list[[1]])
Make a 2x2 panel to show different values for dotplot settings
par(mfrow = c(2,2),
mar = c(0,0,2,1))
# plot 1: Defaults
dotPlot(cpt1a_human_vector, cpt1a_human_vector,
wsize = 1,
nmatch = 1,
main = "Defaults")
# plot 2 size = 10, nmatch = 1
dotPlot(cpt1a_human_vector, cpt1a_human_vector,
wsize = 10,
nmatch = 1,
main = "size = 10, nmatch = 1")
# plot 3: size = 10, nmatch = 5
dotPlot(cpt1a_human_vector, cpt1a_human_vector,
wsize = 10,
nmatch = 5,
main = "size = 10, nmatch = 5")
# plot 4: size = 25, nmatch = 5
dotPlot(cpt1a_human_vector, cpt1a_human_vector,
wsize = 25,
nmatch = 5,
main = "size = 25, nmatch = 5")
# reset par() - run this or other plots will be small!
par(mfrow = c(1,1),
mar = c(4,4,4,4))
Display best version of the plot
dotPlot(cpt1a_human_vector, cpt1a_human_vector,
wsize = 25,
nmatch = 5,
main = "size = 25, nmatch = 5")
#Protein properties compiled from databases
Links to relevant information 1. Pfam: http://pfam.xfam.org/protein/P50416 2. Uniprot: https://www.uniprot.org/uniprot/P50416 3. PDB: https://www.rcsb.org/structure/2LE3 4. Alphafold: https://alphafold.ebi.ac.uk/entry/P50416
CPT1A was not found in the following databases 1. Disprot: No information available 2. RepeatsDB: No information available
Create table with properties information
Databases <- c("Pfam", "Uniprot", "PDB", "Alphafold")
Properties <- c("Domain: CPT_N, Transmembrane region 1-47",
"Subcellular location: Mitochondrion outer membrane",
"Classification: Transferase",
"Predicted structure: mainly alpha helicies with some beta sheets")
protein.properties.df <- data.frame(Databases, Properties)
pander::pander(protein.properties.df)
| Databases | Properties |
|---|---|
| Pfam | Domain: CPT_N, Transmembrane region 1-47 |
| Uniprot | Subcellular location: Mitochondrion outer membrane |
| PDB | Classification: Transferase |
| Alphafold | Predicted structure: mainly alpha helicies with some beta sheets |
Multivariate statistcal techniques were used to confirm the information about protein structure and location in the line database.
Uniprot indicates that the protein is located in the Mitochondrion outer membrane.
Alphafold shows mainly alpha helicies, with some small portions of beta sheets. Therefore, I predict that the machine-learning methods will indicate alpha and alpha+beta structure.
aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
"L","K","M","F","P","S","T","W","Y","V")
alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91,
221, 249, 48, 123, 82, 122, 119, 33, 63, 167)
beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120,
177, 115, 16, 85, 127, 341, 253, 44, 110, 229)
a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
110, 112, 25, 52, 71, 126, 117, 30, 108, 123)
a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239, 339, 321, 91, 158, 188, 327, 238, 72, 130, 378)
pander(data.frame(aa.1.1, alpha, beta, a.plus.b, a.div.b))
| aa.1.1 | alpha | beta | a.plus.b | a.div.b |
|---|---|---|---|---|
| A | 285 | 203 | 175 | 361 |
| R | 53 | 67 | 78 | 146 |
| N | 97 | 139 | 120 | 183 |
| D | 163 | 121 | 111 | 244 |
| C | 22 | 75 | 74 | 63 |
| Q | 67 | 122 | 74 | 114 |
| E | 134 | 86 | 86 | 257 |
| G | 197 | 297 | 171 | 377 |
| H | 111 | 49 | 33 | 107 |
| I | 91 | 120 | 93 | 239 |
| L | 221 | 177 | 110 | 339 |
| K | 249 | 115 | 112 | 321 |
| M | 48 | 16 | 25 | 91 |
| F | 123 | 85 | 52 | 158 |
| P | 82 | 127 | 71 | 188 |
| S | 122 | 341 | 126 | 327 |
| T | 119 | 253 | 117 | 238 |
| W | 33 | 44 | 30 | 72 |
| Y | 63 | 110 | 108 | 130 |
| V | 167 | 229 | 123 | 378 |
Convert to frequencies
alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)
Create dataframe
aa.prop <- data.frame(alpha.prop,
beta.prop,
a.plus.b.prop,
a.div.b)
row.names(aa.prop) <- aa.1.1
Display dataframe
pander(aa.prop)
| alpha.prop | beta.prop | a.plus.b.prop | a.div.b | |
|---|---|---|---|---|
| A | 0.1165 | 0.07313 | 0.09264 | 0.08331 |
| R | 0.02166 | 0.02414 | 0.04129 | 0.03369 |
| N | 0.03964 | 0.05007 | 0.06353 | 0.04223 |
| D | 0.06661 | 0.04359 | 0.05876 | 0.05631 |
| C | 0.008991 | 0.02702 | 0.03917 | 0.01454 |
| Q | 0.02738 | 0.04395 | 0.03917 | 0.02631 |
| E | 0.05476 | 0.03098 | 0.04553 | 0.05931 |
| G | 0.08051 | 0.107 | 0.09052 | 0.08701 |
| H | 0.04536 | 0.01765 | 0.01747 | 0.02469 |
| I | 0.03719 | 0.04323 | 0.04923 | 0.05516 |
| L | 0.09031 | 0.06376 | 0.05823 | 0.07824 |
| K | 0.1018 | 0.04143 | 0.05929 | 0.07408 |
| M | 0.01962 | 0.005764 | 0.01323 | 0.021 |
| F | 0.05027 | 0.03062 | 0.02753 | 0.03646 |
| P | 0.03351 | 0.04575 | 0.03759 | 0.04339 |
| S | 0.04986 | 0.1228 | 0.0667 | 0.07547 |
| T | 0.04863 | 0.09114 | 0.06194 | 0.05493 |
| W | 0.01349 | 0.01585 | 0.01588 | 0.01662 |
| Y | 0.02575 | 0.03963 | 0.05717 | 0.03 |
| V | 0.06825 | 0.08249 | 0.06511 | 0.08724 |
Determine the number of each amino acid in CPT1A with a frequency table.
cpt1a.freq.table <- table(cpt1a_human_vector)/length(cpt1a_human_vector)
Convert the table into a vector and display
table_to_vector <- function(table_x){
table_names <- attr(table_x, "dimnames")[[1]]
table_vect <- as.vector(table_x)
names(table_vect) <- table_names
return(table_vect)
}
cpt1a_human_table <- table(cpt1a_human_vector)/length(cpt1a_human_vector)
cpt1a.human.aa.freq <- table_to_vector(cpt1a_human_table)
pander(cpt1a.human.aa.freq)
| A | C | D | E | F | G | H | I |
|---|---|---|---|---|---|---|---|
| 0.06727 | 0.01552 | 0.04916 | 0.05563 | 0.04657 | 0.06339 | 0.02975 | 0.05045 |
| K | L | M | N | P | Q | R | S |
|---|---|---|---|---|---|---|---|
| 0.05692 | 0.09702 | 0.03493 | 0.03105 | 0.04398 | 0.03752 | 0.0621 | 0.07374 |
| T | V | W | Y |
|---|---|---|---|
| 0.05692 | 0.0621 | 0.02329 | 0.04269 |
Check for the presence of “U” (unknown amino acid)
aa.names <- names(cpt1a.human.aa.freq)
any(aa.names == "U")
## [1] FALSE
i.U <- which(aa.names == "U")
aa.names[i.U]
## character(0)
cpt1a.human.aa.freq[i.U]
## named numeric(0)
Add data on CPT1A to the amino acid frequency table
aa.prop$cpt1a.human.aa.freq <- cpt1a.human.aa.freq
pander::pander(aa.prop)
| alpha.prop | beta.prop | a.plus.b.prop | a.div.b | cpt1a.human.aa.freq | |
|---|---|---|---|---|---|
| A | 0.1165 | 0.07313 | 0.09264 | 0.08331 | 0.06727 |
| R | 0.02166 | 0.02414 | 0.04129 | 0.03369 | 0.01552 |
| N | 0.03964 | 0.05007 | 0.06353 | 0.04223 | 0.04916 |
| D | 0.06661 | 0.04359 | 0.05876 | 0.05631 | 0.05563 |
| C | 0.008991 | 0.02702 | 0.03917 | 0.01454 | 0.04657 |
| Q | 0.02738 | 0.04395 | 0.03917 | 0.02631 | 0.06339 |
| E | 0.05476 | 0.03098 | 0.04553 | 0.05931 | 0.02975 |
| G | 0.08051 | 0.107 | 0.09052 | 0.08701 | 0.05045 |
| H | 0.04536 | 0.01765 | 0.01747 | 0.02469 | 0.05692 |
| I | 0.03719 | 0.04323 | 0.04923 | 0.05516 | 0.09702 |
| L | 0.09031 | 0.06376 | 0.05823 | 0.07824 | 0.03493 |
| K | 0.1018 | 0.04143 | 0.05929 | 0.07408 | 0.03105 |
| M | 0.01962 | 0.005764 | 0.01323 | 0.021 | 0.04398 |
| F | 0.05027 | 0.03062 | 0.02753 | 0.03646 | 0.03752 |
| P | 0.03351 | 0.04575 | 0.03759 | 0.04339 | 0.0621 |
| S | 0.04986 | 0.1228 | 0.0667 | 0.07547 | 0.07374 |
| T | 0.04863 | 0.09114 | 0.06194 | 0.05493 | 0.05692 |
| W | 0.01349 | 0.01585 | 0.01588 | 0.01662 | 0.0621 |
| Y | 0.02575 | 0.03963 | 0.05717 | 0.03 | 0.02329 |
| V | 0.06825 | 0.08249 | 0.06511 | 0.08724 | 0.04269 |
# Correlation used in Chou and Zhange 1992.
chou_cor <- function(x,y){
numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)
}
# Cosine similarity used in Higgs and Attwood (2005).
chou_cosine <- function(z.1, z.2){
z.1.abs <- sqrt(sum(z.1^2))
z.2.abs <- sqrt(sum(z.2^2))
my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs)
return(my.cosine)
}
Calculate correlation between each column
corr.alpha <- chou_cor(aa.prop[,5], aa.prop[,1])
corr.beta <- chou_cor(aa.prop[,5], aa.prop[,2])
corr.apb <- chou_cor(aa.prop[,5], aa.prop[,3])
corr.adb <- chou_cor(aa.prop[,5], aa.prop[,4])
Calculate cosine similarity
cos.alpha <- chou_cosine(aa.prop[,5], aa.prop[,1])
cos.beta <- chou_cosine(aa.prop[,5], aa.prop[,2])
cos.apb <- chou_cosine(aa.prop[,5], aa.prop[,3])
cos.adb <- chou_cosine(aa.prop[,5], aa.prop[,4])
Calculate distance
aa.prop.flipped <- t(aa.prop)
round(aa.prop.flipped,2)
## A R N D C Q E G H I L K
## alpha.prop 0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09 0.10
## beta.prop 0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06 0.04
## a.plus.b.prop 0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06 0.06
## a.div.b 0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08 0.07
## cpt1a.human.aa.freq 0.07 0.02 0.05 0.06 0.05 0.06 0.03 0.05 0.06 0.10 0.03 0.03
## M F P S T W Y V
## alpha.prop 0.02 0.05 0.03 0.05 0.05 0.01 0.03 0.07
## beta.prop 0.01 0.03 0.05 0.12 0.09 0.02 0.04 0.08
## a.plus.b.prop 0.01 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## a.div.b 0.02 0.04 0.04 0.08 0.05 0.02 0.03 0.09
## cpt1a.human.aa.freq 0.04 0.04 0.06 0.07 0.06 0.06 0.02 0.04
Create distance matrix
dist(aa.prop.flipped, method = "euclidean")
## alpha.prop beta.prop a.plus.b.prop a.div.b
## beta.prop 0.13342098
## a.plus.b.prop 0.09281824 0.08289406
## a.div.b 0.06699039 0.08659174 0.06175113
## cpt1a.human.aa.freq 0.15467072 0.13760320 0.12127782 0.12942144
Individual distances using dist()
dist.alpha <- dist((aa.prop.flipped[c(1,5),]), method = "euclidean")
dist.beta <- dist((aa.prop.flipped[c(2,5),]), method = "euclidean")
dist.apb <- dist((aa.prop.flipped[c(3,5),]), method = "euclidean")
dist.adb <- dist((aa.prop.flipped[c(4,5),]), method = "euclidean")
Compile the information and round to make it easier to read.
# fold types
fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")
# data
corr.sim <- round(c(corr.alpha,corr.beta,corr.apb,corr.adb),5)
cosine.sim <- round(c(cos.alpha,cos.beta,cos.apb,cos.adb),5)
Euclidean.dist <- round(c(dist.alpha,dist.beta,dist.apb,dist.adb),5)
# summary
sim.sum <- c("","","most.sim","")
dist.sum <- c("","","min.dist","")
df <- data.frame(fold.type,
corr.sim ,
cosine.sim ,
Euclidean.dist ,
sim.sum ,
dist.sum )
Display output
pander::pander(df)
| fold.type | corr.sim | cosine.sim | Euclidean.dist | sim.sum | dist.sum |
|---|---|---|---|---|---|
| alpha | 0.8089 | 0.8089 | 0.1547 | ||
| beta | 0.8525 | 0.8525 | 0.1376 | ||
| alpha plus beta | 0.8733 | 0.8733 | 0.1213 | most.sim | min.dist |
| alpha/beta | 0.859 | 0.859 | 0.1294 |
Convert the FASTA entries into a single vector
names(cpt1a_list)
## [1] "NP_001867.2" "XP_001173853.1" "NP_038523.2" "NP_113747.2"
## [5] "NP_001012916.1" "NP_001038319.1" "XP_004913491.1" "XP_006019242.1"
## [9] "XP_009472179.1" "XP_021152303.1"
length(cpt1a_list)
## [1] 10
Display the first entry
cpt1a_list[1]
## $NP_001867.2
## [1] "MAEAHQAVAFQFTVTPDGIDLRLSHEALRQIYLSGLHSWKKKFIRFKNGIITGVYPASPSSWLIVVVGVMTTMYAKIDPSLGIIAKINRTLETANCMSSQTKNVVSGVLFGTGLWVALIVTMRYSLKVLLSYHGWMFTEHGKMSRATKIWMGMVKIFSGRKPMLYSFQTSLPRLPVPAVKDTVNRYLQSVRPLMKEEDFKRMTALAQDFAVGLGPRLQWYLKLKSWWATNYVSDWWEEYIYLRGRGPLMVNSNYYAMDLLYILPTHIQAARAGNAIHAILLYRRKLDREEIKPIRLLGSTIPLCSAQWERMFNTSRIPGEETDTIQHMRDSKHIVVYHRGRYFKVWLYHDGRLLKPREMEQQMQRILDNTSEPQPGEARLAALTAGDRVPWARCRQAYFGRGKNKQSLDAVEKAAFFVTLDETEEGYRSEDPDTSMDSYAKSLLHGRCYDRWFDKSFTFVVFKNGKMGLNAEHSWADAPIVAHLWEYVMSIDSLQLGYAEDGHCKGDINPNIPYPTRLQWDIPGECQEVIETSLNTANLLANDVDFHSFPFVAFGKGIIKKCRTSPDAFVQLALQLAHYKDMGKFCLTYEASMTRLFREGRTETVRSCTTESCDFVRAMVDPAQTVEQRLKLFKLASEKHQHMYRLAMTGSGIDRHLFCLYVVSKYLAVESPFLKEVLSEPWRLSTSQTPQQQVELFDLENNPEYVSSGGGFGPVADDGYGVSYILVGENLINFHISSKFSCPETDSHRFGRHLKEAMTDIITLFGLSSNSKK"
Make each list entry into a vector
cpt1a_vector <- rep(NA, length(cpt1a_list))
# creating vector
for(i in 1:length(cpt1a_vector)){
cpt1a_vector[i] <- cpt1a_list[[i]]
}
Name the vector
names(cpt1a_vector) <- names(cpt1a_list)
Pairwise alignments for humans, chimpanzees, mice, and horses
align01.02 <- Biostrings::pairwiseAlignment(
cpt1a_vector[[1]], cpt1a_vector[[2]])
Biostrings::pid(align01.02)
## [1] 99.48254
align01.03 <- Biostrings::pairwiseAlignment(
cpt1a_vector[[1]], cpt1a_vector[[3]])
Biostrings::pid(align01.03)
## [1] 86.54592
align01.08 <- Biostrings::pairwiseAlignment(
cpt1a_vector[[1]], cpt1a_vector[[8]])
Biostrings::pid(align01.08)
## [1] 80.72445
align02.03 <- Biostrings::pairwiseAlignment(
cpt1a_vector[[2]], cpt1a_vector[[3]])
Biostrings::pid(align01.08)
## [1] 80.72445
align02.08 <- Biostrings::pairwiseAlignment(
cpt1a_vector[[2]], cpt1a_vector[[8]])
Biostrings::pid(align01.08)
## [1] 80.72445
align03.08 <- Biostrings::pairwiseAlignment(
cpt1a_vector[[3]], cpt1a_vector[[8]])
Biostrings::pid(align01.08)
## [1] 80.72445
Build matrix
pids <- c(1, NA, NA, NA,
pid(align01.02), 1, NA, NA,
pid(align01.03), pid(align02.03), 1, NA,
pid(align01.08), pid(align02.08), pid(align03.08), 1)
mat <- matrix(pids, nrow = 4, byrow = T)
row.names(mat) <- c("Human","Chimpanzee","Mouse","Alligator")
colnames(mat) <- c("Human","Chimpanzee","Mouse","Alligator")
pander::pander(mat)
| Human | Chimpanzee | Mouse | Alligator | |
|---|---|---|---|---|
| Human | 1 | NA | NA | NA |
| Chimpanzee | 99.48 | 1 | NA | NA |
| Mouse | 86.55 | 86.42 | 1 | NA |
| Alligator | 80.72 | 80.72 | 79.56 | 1 |
Compare different PID methods for humans and chimpanzees
Prepare the data
humancpt1a_string <-paste(cpt1a_vector[[1]],
collapse = "")
# convert ulcerans_vector to an object ulceransseq_string
chimpcpt1a_string <-paste(cpt1a_vector[[2]],
collapse = "")
humancpt1a_string <- toupper(humancpt1a_string)
chimpcpt1a_string <- toupper(chimpcpt1a_string)
Create pairwise alignment between human and chimpanzee
data(BLOSUM50)
globalAlignHumanChimp <- Biostrings::pairwiseAlignment(humancpt1a_string,
chimpcpt1a_string,
substitutionMatrix = BLOSUM50,
gapOpening = -8,
gapExtension = -2,
scoreOnly = FALSE)
Calculate PIDs with different methods and create a table of the results
PID1.results <- pid(globalAlignHumanChimp, type = "PID1")
PID2.results <- pid(globalAlignHumanChimp, type = "PID2")
PID3.results <- pid(globalAlignHumanChimp, type = "PID3")
PID4.results <- pid(globalAlignHumanChimp, type = "PID4")
methods <- c("PID1", "PID2", "PID3", "PID4")
PID <- c(PID1.results, PID2.results,
PID3.results, PID4.results)
denominator <- c("(aligned positions + internal gap positions)", "(aligned positions)", "(length shorter sequence)", "(average length of the two sequences)")
PID.df <- data.frame(methods, PID, denominator)
pander(PID.df)
| methods | PID | denominator |
|---|---|---|
| PID1 | 99.48 | (aligned positions + internal gap positions) |
| PID2 | 99.48 | (aligned positions) |
| PID3 | 99.48 | (length shorter sequence) |
| PID4 | 99.48 | (average length of the two sequences) |
cpt1a_vector_ss <- Biostrings::AAStringSet(cpt1a_vector)
cpt1a_align <-msa(cpt1a_vector_ss,
method = "ClustalW")
## use default substitution matrix
msa produces a species MSA objects
class(cpt1a_align)
## [1] "MsaAAMultipleAlignment"
## attr(,"package")
## [1] "msa"
is(cpt1a_align)
## [1] "MsaAAMultipleAlignment" "AAMultipleAlignment" "MsaMetaData"
## [4] "MultipleAlignment"
Default output of MSA
cpt1a_align
## CLUSTAL 2.1
##
## Call:
## msa(cpt1a_vector_ss, method = "ClustalW")
##
## MsaAAMultipleAlignment with 10 rows and 917 columns
## aln names
## [1] MAEAHQAVAFQFTVTPDGIDLRMSH...------------------------- XP_009472179.1
## [2] MAEAHQAVAFQFTVTPDGIDLRMSH...IGNKEGNSPVLTSAFNMLPKSRKDR XP_021152303.1
## [3] MAEAHQAVAFQFTVTPDGIDLRMSH...------------------------- NP_001012916.1
## [4] MAEAHQAVAFQFTVTPDGIDLRMSH...------------------------- XP_006019242.1
## [5] MAEAHQAVAFQFTVTPDGIDLRLSH...------------------------- NP_001867.2
## [6] MAEAHQAVAFQFTVTPDGIDLRLSH...------------------------- XP_001173853.1
## [7] MAEAHQAVAFQFTVTPDGIDLRLSH...------------------------- NP_038523.2
## [8] MAEAHQAVAFQFTVTPDGIDLRLSH...------------------------- NP_113747.2
## [9] MAEAHQAVAFQFTVTPDGIDLRLSH...------------------------- XP_004913491.1
## [10] MAEAHQAVAFQFTVGPDGIDLQLSH...------------------------- NP_001038319.1
## Con MAEAHQAVAFQFTVTPDGIDLRLSH...------------------------- Consensus
Change class of alignment
class(cpt1a_align) <- "AAMultipleAlignment"
is(cpt1a_align)
## [1] "AAMultipleAlignment" "MultipleAlignment"
cpt1a_align
## AAMultipleAlignment with 10 rows and 917 columns
## aln names
## [1] MAEAHQAVAFQFTVTPDGIDLRMSH...------------------------- XP_009472179.1
## [2] MAEAHQAVAFQFTVTPDGIDLRMSH...IGNKEGNSPVLTSAFNMLPKSRKDR XP_021152303.1
## [3] MAEAHQAVAFQFTVTPDGIDLRMSH...------------------------- NP_001012916.1
## [4] MAEAHQAVAFQFTVTPDGIDLRMSH...------------------------- XP_006019242.1
## [5] MAEAHQAVAFQFTVTPDGIDLRLSH...------------------------- NP_001867.2
## [6] MAEAHQAVAFQFTVTPDGIDLRLSH...------------------------- XP_001173853.1
## [7] MAEAHQAVAFQFTVTPDGIDLRLSH...------------------------- NP_038523.2
## [8] MAEAHQAVAFQFTVTPDGIDLRLSH...------------------------- NP_113747.2
## [9] MAEAHQAVAFQFTVTPDGIDLRLSH...------------------------- XP_004913491.1
## [10] MAEAHQAVAFQFTVGPDGIDLQLSH...------------------------- NP_001038319.1
Convert to seqinr format
cpt1a_align_seqinr <- msaConvert(cpt1a_align, type = "seqinr::alignment")
is(cpt1a_align_seqinr)
## [1] "alignment"
Show output with print_msa
compbio4all::print_msa(cpt1a_align_seqinr)
## [1] "MAEAHQAVAFQFTVTPDGIDLRMSHEALKQIYLSGVHSWKKKFIRFKNGIITGVYPASPS 0"
## [1] "MAEAHQAVAFQFTVTPDGIDLRMSHEALKQIYLSGVHSWKKKFIRFKNGIITGVYPASPS 0"
## [1] "MAEAHQAVAFQFTVTPDGIDLRMSHEALKQIYLSGVHSWKKKFIRFKNGIITGVYPASPS 0"
## [1] "MAEAHQAVAFQFTVTPDGIDLRMSHEALKQVYLSGLHSWKKKFIRFKNGIITGVYPASPS 0"
## [1] "MAEAHQAVAFQFTVTPDGIDLRLSHEALRQIYLSGLHSWKKKFIRFKNGIITGVYPASPS 0"
## [1] "MAEAHQAVAFQFTVTPDGIDLRLSHEALRQIYLSGLHSWKKKFIRFKNGIITGVYPASPS 0"
## [1] "MAEAHQAVAFQFTVTPDGIDLRLSHEALKQICLSGLHSWKKKFIRFKNGIITGVFPASPS 0"
## [1] "MAEAHQAVAFQFTVTPDGIDLRLSHEALKQICLSGLHSWKKKFIRFKNGIITGVFPANPS 0"
## [1] "MAEAHQAVAFQFTVTPDGIDLRLSHEALRQIYLSGLHSWKKKFIRFKNGILTGVFPSSPS 0"
## [1] "MAEAHQAVAFQFTVGPDGIDLQLSHEALRQVYLSGIHSWKKKFIRFKNGILNGVYPGSAP 0"
## [1] " "
## [1] "SWLIVVVGVMST-MYAKIDPSLGIIAKINRTLDTTGYMSNQTQNIVSGILFGTGLWVALI 0"
## [1] "SWLIVVVGVMST-MYAKIDPSLGIIAKINRTLDTTGCMSNQTQNIVSGILFGTGLWVALI 0"
## [1] "SWLIVVVGVMST-MYAKIDPSLGIIAKINRTLDTTGYMSNQTQNIVSGILFGTGLWVALI 0"
## [1] "SWLIVVVGVMST-MYAKIDPSLGIISKINQTLDTTGYMSNQTQNIVSGILFGTGLWVALI 0"
## [1] "SWLIVVVGVMTT-MYAKIDPSLGIIAKINRTLETANCMSSQTKNVVSGVLFGTGLWVALI 0"
## [1] "SWLIVVVGVMTT-MYAKIDPSLGIIAKINRTLETANCMSSQTKNVVSGVLFGTGLWVALI 0"
## [1] "SWLIVVVGVISS-MHTKVDPSLGMIAKINRTLDTTGRMSSQTKNIVSGVLFGTGLWVAII 0"
## [1] "SWLIVVVGVISS-MHAKVDPSLGMIAKISRTLDTTGRMSSQTKNIVSGVLFGTGLWVAVI 0"
## [1] "SWLIVVVGVTSASMYTKVDPSFGIIEKINNALSATGYMTPQTQNIVSGVLFGTGLWVSLI 0"
## [1] "GFVLVLAGYLGRAQYLKVDPSLGLLFKLGNYVPISRYMSEQKQMLVGGVMVGTGLWIAII 0"
## [1] " "
## [1] "VTMRYSLKMLLSYHGWMFAEHGKLSAGTKFWMALVKLFSGRK-PMLYSFQTSLPRLPVPA 0"
## [1] "VTMRYSLKMLLSYHGWMFAEHGKLSAGTRVWMALVKLFSGRK-PMLYSFQTSLPRLPVPA 0"
## [1] "VTMRYSLKMLLSYHGWMFAEHGKLSAGTKLWMTLVKLFSGRK-PMLYSFQTSLPRLPVPA 0"
## [1] "ITMRYSLKMLLSYHGWMFAEHGRLSTGTKIWMSLVKLFSGCK-PMLYSFQTSLPRLPVPS 0"
## [1] "VTMRYSLKVLLSYHGWMFTEHGKMSRATKIWMGMVKIFSGRK-PMLYSFQTSLPRLPVPA 0"
## [1] "VTMRYSLKVLLSYHGWMFTEHGKMSRATKIWMGMVKIFSGRK-PMLYSFQTSLPRLPVPA 0"
## [1] "MTMRYSLKVLLSYHGWMFAEHGKMSRSTRIWMAMVKVFSGRK-PMLYSFQTSLPRLPVPA 0"
## [1] "MTMRYSLKVLLSYHGWMFAEHGKMSRSTKIWMAMVKVLSGRK-PMLYSFQTSLPRLPVPA 0"
## [1] "ATMRYSLKKLLSYHGWMFEEHGKLSASTRIWMGMVKLLSGRK-PMLYSFQTSLPRLPVPP 0"
## [1] "FGMRTVLKGLLSWHGWMFASHGRMTWKIRLWLVFVKVFSGMQTPMLYSFQNSLPHLFVPS 0"
## [1] " "
## [1] "VKDTVNRYLESVRPLMDDEEFRRMEGLAKDFAFNLGPRLQWYLKLKSWWATNYVSDWWEE 0"
## [1] "VKDTVNRYLESVRPLMDDGEFRRMEGLAKDFAFNLGPRLQWYLKLKSWWATNYVSDWWEE 0"
## [1] "VKDTVNRYLESVRPLMNDEEFKRMEGLAKDFAFNLGPRLQWYLKLKSWWATNYVSDWWEE 0"
## [1] "IKDTVTRYLESVRPLMNDVEFKRMDGLAKDFAVNLGPRLQWYLKLKSWWATNYVSDWWEE 0"
## [1] "VKDTVNRYLQSVRPLMKEEDFKRMTALAQDFAVGLGPRLQWYLKLKSWWATNYVSDWWEE 0"
## [1] "VKDTVNRYLQSVRPLMKEEDFKRMTALAQDFAVGLGPRLQWYLKLKSWWATNYVSDWWEE 0"
## [1] "VKDTVSRYLESVRPLMKEGDFQRMTALAQDFAVNLGPKLQWYLKLKSWWATNYVSDWWEE 0"
## [1] "VKDTVSRYLESVRPLMKEEDFQRMTALAQDFAVNLGPKLQWYLKLKSWWATNYVSDWWEE 0"
## [1] "VKDTVKRYLDSVKPLMDKEKFERMEGLAKDFANNLGPRLQWYLKLKSWWATNYVSDWWEE 0"
## [1] "VKETTRRYLESVQPLLSDEEHQRMQRLALDFERNLGPKLQWYLKLKSWWATNYVSDWWEE 0"
## [1] " "
## [1] "YIYLRGRGPIMVNSNYFAMDFLYLSPTTVQAARAGNVIHAILLYRKKLDRQEIKPILLMG 0"
## [1] "YVYLRGRGPIMVNSNYFAMDFLYLSPTTLQAARAGNVIHAILLYRKKLDRQEIKPILLMG 0"
## [1] "YIYLRGRGPIMVNSNYFAMDFLHLSPTTIQAARAGNIIHAILLYRKKLDRQEIKPILLMG 0"
## [1] "YIYLRGRGPIMVNSNYYAMDFLCVAPTSIQAARAGNVIHALLLYRKQLDRQEIKPILLMG 0"
## [1] "YIYLRGRGPLMVNSNYYAMDLLYILPTHIQAARAGNAIHAILLYRRKLDREEIKPIRLLG 0"
## [1] "YIYLRGRGPLMVNSNYYAMDLLYILPTHIQAARAGNAIHAILLYRRKLDREEIKPIRLLG 0"
## [1] "YIYLRGRGPIMVNSNYYAMEMLYITPTHIQAARAGNTIHAILLYRRTVDREELKPIRLLG 0"
## [1] "YIYLRGRGPLMVNSNYYAMEMLYITPTHIQAARAGNTIHAILLYRRTLDREELKPIRLLG 0"
## [1] "YIYLRGRGPIMVNSNYYAMDFLYVLPTHIQAARAGNAIHAILLYRRKLDREEIKPIQLMG 0"
## [1] "YIYLRGRGPIMVNSNYYVMDFLYAFPTNIQAARAGNVIHAIMLYRRKLDRAQIKPVFALS 0"
## [1] " "
## [1] "STVPLCSAQWERMFNTSRIPGEESDTLQHVKDSKHIVVYHKGRYFKVWLYHDGRLLKPRE 0"
## [1] "STVPLCSAQWERMFNTSRIPGEESDTLQHVKDSKHIVVYHKGRYFKVWLYHDGRLLKPRE 0"
## [1] "STVPLCSAQWERMFNTSRIPGEESDTLQHVKDSKHIVVYHKGRYFKVWLYHDGRLLKPRE 0"
## [1] "STVPLCSAQWERMFNTSRIPGEETDTLQHVKDSKHIVVYHKGRYFKVWVYHDGRLLKPRE 0"
## [1] "STIPLCSAQWERMFNTSRIPGEETDTIQHMRDSKHIVVYHRGRYFKVWLYHDGRLLKPRE 0"
## [1] "STIPLCSAQWERMFNTSRIPGEETDTSQHMRDSKHIVVYHRGRYFKVWLYHDGRLLKPRE 0"
## [1] "STIPLCSAQWERLFNTSRIPGEETDTIQHVKDSRHIVVYHRGRYFKVWLYHDGRLLRPRE 0"
## [1] "STIPLCSAQWERLFNTSRIPGEETDTIQHIKDSRHIVVYHRGRYFKVWLYHDGRLLRPRE 0"
## [1] "STVPLCSAQWERMYNTSRIPGEETDTIQHVKDSKHIVVYHKGRYFKVWLYHDGRLLKPRE 0"
## [1] "NSVPLCSAQWERLFNTSRIPGKERDSVQHVSDSRHIVVYHRGRYFKVWMFYDGRLLLPRE 0"
## [1] " "
## [1] "IEQQMQRILDDDSEPQAGEEKLAALTAGDRVPWAKARQTYFSCGKNKQSLDAVEKAAFFV 0"
## [1] "IEQQMQRILDDDSEPQTGEKKLAALTAGDRVPWARARQAYFSHGKNKQSLDAVEKAAFFV 0"
## [1] "IEQQIQRILDDDSEPQAGEEKLAALTAGDRVPWAKARQAYFSRGKNKQSLDAVEKAAFFV 0"
## [1] "IEQQMQRILDDSSKSLPGEEKLAALTAGDRIPWAKARQAYFGRGKNKHSLDAVEKAAFFV 0"
## [1] "MEQQMQRILDNTSEPQPGEARLAALTAGDRVPWARCRQAYFGRGKNKQSLDAVEKAAFFV 0"
## [1] "MEQQMQRILDNTSEPQPGEARLAALTAGDRVPWARCRQAYFGRGKNKQSLDAVEKAAFFV 0"
## [1] "LEQQMQQILDDTSEPQPGEAKLAALTAADRVPWAKCRQTYFARGKNKQSLDAVEKAAFFV 0"
## [1] "LEQQMQQILDDPSEPQPGEAKLAALTAADRVPWAKCRQTYFARGKNKQSLDAVEKAAFFV 0"
## [1] "IEHQMQKIIDDTSSPQPGEEKLAALTAGDRVPWAKARKAYFANGKNKQSMDAVEKAAFFV 0"
## [1] "IEQQMERILADTSEPQPGEETLAALTAGDRVPWACARNAYLRHGTNKKSLDSVEKAAFFV 0"
## [1] " "
## [1] "TLDDIEQGYRKEDPVMSLDAYAKSLMHGRCYDRWFDKAFTLIVFKNGRMGLNAEHSWADA 0"
## [1] "TLDDIEQGYRKEDPVRSLDAYAKSLIHGRCYDRWFDKTFTLIVFKNGRMGLNAEHSWADA 0"
## [1] "TLDDDEQGYSKEDPVSSLDAYAKSLIHGRCYDRWFDKTFTLVVFKNGKMGLNAEHSWADA 0"
## [1] "TLDESEQGYREEDPVTSLDTYAKSLLHGKCYDRWFDKTFTLVVFKNGKMGLNAEHSWADA 0"
## [1] "TLDETEEGYRSEDPDTSMDSYAKSLLHGRCYDRWFDKSFTFVVFKNGKMGLNAEHSWADA 0"
## [1] "TLDETEEGYRSEDPDTSMDSYAKSLLHGRCYDRWFDKSFTFVVFKNGKMGLNAEHSWADA 0"
## [1] "TLDESEQGYREEDPEASIDSYAKSLLHGRCFDRWFDKSITFVVFKNSKIGINAEHSWADA 0"
## [1] "TLDESEQGYREEDPEASIDSYAKSLLHGRCFDRWFDKSITFVVFKNSKIGINAEHSWADA 0"
## [1] "TLDETEQGYNKEDPVNSLDSYAKSLLHGKCYDRWFDKTMSFVVFKNGKMGMNVEHSWADA 0"
## [1] "TLDDTEQRFDQKNPVESLDRYAKSLLHGKCYDRWFDKSINLIIFKNGTMGLNAEHSWADA 0"
## [1] " "
## [1] "PIVGHLWENVMTTEYLELGYSEDGHCKGDINQNIPIPTKLQWEIPEECQEVIERSLSTAI 0"
## [1] "PIVGHLWENVMATEYLELGYSEDGHCKGDTNQNIPIPTKLQWEIPEECQEVIERSLSTAI 0"
## [1] "PIVGHLWENVMATEYLELGYLEDGHCKGDTNQNIPIPTKLQWEIPEECQDVIERSLSTAR 0"
## [1] "PIIGHLWETTMANDYLLLGYSEDGHCRGDTHQNIPFPTRLQWEIPEECQEVIEKSLSTAS 0"
## [1] "PIVAHLWEYVMSIDSLQLGYAEDGHCKGDINPNIPYPTRLQWDIPGECQEVIETSLNTAN 0"
## [1] "PIVAHLWEYVMSTDSLQLGYAEDGHCKGDTNPNIPYPTRLQWDIPGECQEVIETSLNTAN 0"
## [1] "PIVGHLWEYVMATDVFQLGYSEDGHCKGDKNPNIPKPTRLQWDIPGECQEVIETSLSSAS 0"
## [1] "PVVGHLWEYVMATDVFQLGYSEDGHCKGDTNPNIPKPTRLQWDIPGECQEVIDASLSSAS 0"
## [1] "PIVGHLWEYVMATDKMELGYNEDGHCKGDVNGNIPPPSRLQWDIPEECQNVVEESLTVAK 0"
## [1] "PIVGHLWEQVLSMDPVKLGYTEDGHCKGEPHANLPGPQRLQWNIPTECQTMITNSLSVAE 0"
## [1] " "
## [1] "ALADDVDFHSFFFDAFGKGLIKKAKTSPDAFVQLALQLAHYRDMGKFSLTYEASMTRLFR 0"
## [1] "ALADDVDFHSFFFDAFGKGLIKKAKTSPDGFVQLALQLAHYRDMGKFSLTYEASMTRLFR 0"
## [1] "ALADDVDFYSFYFDVFGKGLIKKAKTSPDAFIQLALQLAHYRDMGKFSLTYEASMTRLFR 0"
## [1] "ALAEDVDFHSFPFDTFGKGLIKKAKISPDAFVQLSLQLAHYRDMGKFCLTYEASMTRLFR 0"
## [1] "LLANDVDFHSFPFVAFGKGIIKKCRTSPDAFVQLALQLAHYKDMGKFCLTYEASMTRLFR 0"
## [1] "LLANDVDFHSFPFVAFGKGIIKKCRTSPDAFVQLALQLAHYKDMGKFCLTYEASMTRLFR 0"
## [1] "FLANDVDLHSFPFDTFGKGLIKKCRTSPDAFIQLALQLAHYKDMGKFCLTYEASMTRLFR 0"
## [1] "LLANDVDLHSFPFDSFGKGLIKKCRTSPDAFIQLALQLAHYKDMGKFCLTYEASMTRLFR 0"
## [1] "ALADDVDFHSFPFNSFGKGLIKKSRTSPDAFVQLSLQLAHYRDKEKFCLTYEASMTRLFR 0"
## [1] "ALADDVDSHIIPFSDFGKGLIKKCRTSPDAFIQLALQLAHYRDKGKFCLTYEASMTRLFR 0"
## [1] " "
## [1] "EGRTETVRSCTVESCNFVQTMEDPTESNENKLKSFRLAAAKHQHLYRLAMTGAGVDRHLF 0"
## [1] "EGRTETVRSCTVESCNFVRTMEDPTESNENKLKLFQLAAAKHQHLYRLAMTGAGIDRHLF 0"
## [1] "EGRTETVRSCTIESCNFVQTMENPSESNENKMKSFRLAATKHQHLYRLAMTGAGIDRHLF 0"
## [1] "EGRTETVRSCTTESCSFVRAMEDPAESTEKKLKLLRIAAANHQHLYRFAMTGAGVDRHLF 0"
## [1] "EGRTETVRSCTTESCDFVRAMVDPAQTVEQRLKLFKLASEKHQHMYRLAMTGSGIDRHLF 0"
## [1] "EGRTETVRSCTTESCDFVRAMVDPAQTVEQRLKLFKLASEKHQHMYRLAMTGSGIDRHLF 0"
## [1] "EGRTETVRSCTTESCNFVLAMMDPTTTAEQRFKLFKIACEKHQHLYRLAMTGAGIDRHLF 0"
## [1] "EGRTETVRSCTMESCNFVQAMMDPKSTAEQRLKLFKIACEKHQHLYRLAMTGAGIDRHLF 0"
## [1] "EGRTETVRSCTIESCDFVLAMSDPSQTNEKRLQLFKEAAEKHQQMYRLAMTGSGIDRHLF 0"
## [1] "EGRTETVRSCTTESCAFVRAMN-SNHTREQKLQLLKNAAEKHQQMYRLAMTGHGIDRHLF 0"
## [1] " "
## [1] "CLYVVSKYLAVDSPFLKEVLSEPWRLSTSQTPQQHIDLK---KNPEMLSCGGGFGPVADD 0"
## [1] "CLYVVSKYLAVDSPFLKEVLSEPWRLSTSQTPQQHIDLK---KNPEMLSCGGGFGPVADD 0"
## [1] "CLYVVSKYLSVDSPFLKEVLSEPWRLSTSQTPQQHIDLK---KNPEMLSSGGGFGPVADD 0"
## [1] "CLYVVSKYLGVDSPFLKEVLSEPWRLSTSQTPHQHIDLE---KNPECVCSGGGFGPVADD 0"
## [1] "CLYVVSKYLAVESPFLKEVLSEPWRLSTSQTPQQQVELFDLENNPEYVSSGGGFGPVADD 0"
## [1] "CLYVVSKYLAMESPFLKEVLSEPWRLSTSQTPQQQVELFDLENNPEYVSSGGGFGPVADD 0"
## [1] "CLYVVSKYLAVDSPFLKEVLSEPWRLSTSQTPQQQVELFDFEKYPDYVSCGGGFGPVADD 0"
## [1] "CLYVVSKYLAVDSPFLKEVLSEPWRLSTSQTPQQQVELFDFEKNPDYVSCGGGFGPVADD 0"
## [1] "CLYVVSKYLGVDSPFLKEVLSEPWRLSTSQTPQQQVHLFQLEKFPENVSSGGGFGPVADD 0"
## [1] "CLYVVLKYLGQDSPFLKEVLSEPWRLSTSQTPLQQGELFDLVKNPEYVTSGGGFGPVADD 0"
## [1] " "
## [1] "GYGVSYIILGENSIHFHVSSKISCSET-----------------DSHRFG--KNIQKAMV 0"
## [1] "GYGVSYIILDEDSIHFHVSSKISCSDTFENGPLQPVDSTATFHLESHMHGQVKMTSEGYC 0"
## [1] "GYGVSYIILDENSIHFHVSSKFSCSET-----------------DSHRFG--KNIQKALV 0"
## [1] "GYGVSYIIVGEKLINFHVSSKFSCPET-----------------DSHRFG--ENIRRAMI 0"
## [1] "GYGVSYILVGENLINFHISSKFSCPET-----------------DSHRFG--RHLKEAMT 0"
## [1] "GYGVSYILVGENLINFHISSKFSCPET-----------------DSHRFG--RHLKEAMT 0"
## [1] "GYGVSYIIVGENFIHFHISSKFSSPET-----------------DSHRFG--KHLRQAMM 0"
## [1] "GYGVSYIIVGENFIHFHISSKFSSPET-----------------DSHRFG--KHLRQAMM 0"
## [1] "GYGVSYIIVGENLINFHISSKFSSPET-----------------DSHRFG--KHIKQAMI 0"
## [1] "GYGVAYVIVGEKLINFHISSKRSSPET-----------------DSHRFG--NNIRRAMI 0"
## [1] " "
## [1] "DIMGLFTHSKNCTK---------------------------------------------- 0"
## [1] "DCFANGDFCKNCNCDNCCNNQLHETERLTAIKACLERNPEAFLPKIGKRKLGEIKHHHNK 0"
## [1] "DIMGLFQPTKNCTK---------------------------------------------- 0"
## [1] "DIVSLFGFGKNSTK---------------------------------------------- 0"
## [1] "DIITLFGLSSNSKK---------------------------------------------- 0"
## [1] "DIITLFGLSSNSKK---------------------------------------------- 0"
## [1] "DIITLFGLTANSKK---------------------------------------------- 0"
## [1] "DIITLFGLTINSKK---------------------------------------------- 0"
## [1] "DILALFNISTNNSNKGKK------------------------------------------ 0"
## [1] "DMLDLFQLDKKDPK---------------------------------------------- 0"
## [1] " "
## [1] "------------------------------------------------------------ 0"
## [1] "GCNCKRSGCLENYCECFEAKIVCSSICKCTGCKNYEESPDENKQLNVINYMDIGNKEGNS 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] " "
## [1] "----------------- 43"
## [1] "PVLTSAFNMLPKSRKDR 43"
## [1] "----------------- 43"
## [1] "----------------- 43"
## [1] "----------------- 43"
## [1] "----------------- 43"
## [1] "----------------- 43"
## [1] "----------------- 43"
## [1] "----------------- 43"
## [1] "----------------- 43"
## [1] " "
# key step - must have class set properly
class(cpt1a_align) <- "AAMultipleAlignment"
## run ggmsa
ggmsa::ggmsa(cpt1a_align,
start = 0,
end = 50)
This produces a “dist” object class
names(cpt1a_vector_ss)
## [1] "NP_001867.2" "XP_001173853.1" "NP_038523.2" "NP_113747.2"
## [5] "NP_001012916.1" "NP_001038319.1" "XP_004913491.1" "XP_006019242.1"
## [9] "XP_009472179.1" "XP_021152303.1"
names.focal <- c("NP_001867.2", "XP_001173853.1", "NP_038523.2", "NP_113747.2", "NP_001012916.1", "NP_001038319.1", "XP_004913491.1", "XP_006019242.1", "XP_009472179.1", "XP_021152303.1")
cpt1a_vector_ss[names.focal]
## AAStringSet object of length 10:
## width seq names
## [1] 773 MAEAHQAVAFQFTVTPDGIDLRL...RHLKEAMTDIITLFGLSSNSKK NP_001867.2
## [2] 773 MAEAHQAVAFQFTVTPDGIDLRL...RHLKEAMTDIITLFGLSSNSKK XP_001173853.1
## [3] 773 MAEAHQAVAFQFTVTPDGIDLRL...KHLRQAMMDIITLFGLTANSKK NP_038523.2
## [4] 773 MAEAHQAVAFQFTVTPDGIDLRL...KHLRQAMMDIITLFGLTINSKK NP_113747.2
## [5] 770 MAEAHQAVAFQFTVTPDGIDLRM...KNIQKALVDIMGLFQPTKNCTK NP_001012916.1
## [6] 774 MAEAHQAVAFQFTVGPDGIDLQL...NNIRRAMIDMLDLFQLDKKDPK NP_001038319.1
## [7] 778 MAEAHQAVAFQFTVTPDGIDLRL...QAMIDILALFNISTNNSNKGKK XP_004913491.1
## [8] 770 MAEAHQAVAFQFTVTPDGIDLRM...ENIRRAMIDIVSLFGFGKNSTK XP_006019242.1
## [9] 770 MAEAHQAVAFQFTVTPDGIDLRM...KNIQKAMVDIMGLFTHSKNCTK XP_009472179.1
## [10] 912 MAEAHQAVAFQFTVTPDGIDLRM...KEGNSPVLTSAFNMLPKSRKDR XP_021152303.1
cpt1a_vector_ss_subset <- cpt1a_vector_ss[names.focal]
cpt1a_align_subset <- msa(cpt1a_vector_ss_subset,
method = "ClustalW")
## use default substitution matrix
class(cpt1a_align_subset) <- "AAMultipleAlignment"
cpt1a_align_subset_seqinr <- msaConvert(cpt1a_align_subset, type = "seqinr::alignment")
cpt1a_subset_dist <- seqinr::dist.alignment(cpt1a_align_subset_seqinr,
matrix = "identity")
is(cpt1a_subset_dist)
## [1] "dist" "oldClass"
class(cpt1a_subset_dist)
## [1] "dist"
Round for display
cpt1a_align_seqinr_rnd <- round(cpt1a_subset_dist, 3)
cpt1a_align_seqinr_rnd
## XP_009472179.1 XP_021152303.1 NP_001012916.1 XP_006019242.1
## XP_021152303.1 0.239
## NP_001012916.1 0.228 0.284
## XP_006019242.1 0.366 0.391 0.368
## NP_001867.2 0.434 0.450 0.441 0.438
## XP_001173853.1 0.435 0.449 0.440 0.438
## NP_038523.2 0.443 0.459 0.450 0.450
## NP_113747.2 0.444 0.466 0.450 0.454
## XP_004913491.1 0.440 0.462 0.438 0.443
## NP_001038319.1 0.554 0.567 0.555 0.541
## NP_001867.2 XP_001173853.1 NP_038523.2 NP_113747.2
## XP_021152303.1
## NP_001012916.1
## XP_006019242.1
## NP_001867.2
## XP_001173853.1 0.072
## NP_038523.2 0.367 0.369
## NP_113747.2 0.367 0.367 0.176
## XP_004913491.1 0.424 0.426 0.436 0.443
## NP_001038319.1 0.545 0.545 0.547 0.553
## XP_004913491.1
## XP_021152303.1
## NP_001012916.1
## XP_006019242.1
## NP_001867.2
## XP_001173853.1
## NP_038523.2
## NP_113747.2
## XP_004913491.1
## NP_001038319.1 0.538
Use the distance matrix to build a phylogenetic tree
tree_subset <- nj(cpt1a_subset_dist)
Plot the tree
# plot tree
plot.phylo(tree_subset, main="Phylogenetic Tree",
use.edge.length = F)
# add label
mtext(text = "CPT1A family gene tree - rooted, no branch lenths")