This code compiles summary information about the gene TRIP4 (thyroid hormone receptor interactor 4), which encodes the activating signal cointegrator 1 protein. The protein is involved mediating interactions with transcriptional coactivators and ligand-bound nuclear receptors. An MSA and a phylogenetic tree is also generated to show the evolutionary relationship betweeen the human version of the gene and its homologs in other species.
Key information use to make this script can be found here: * Refseq Gene: https://www.ncbi.nlm.nih.gov/gene/9325 * Refseq Homologene: https://www.ncbi.nlm.nih.gov/homologene?LinkName=nuccore_homologene&from_uid=1015232364 * UniProt: https://www.uniprot.org/uniprot/Q15650 * PDB: https://www.rcsb.org/structure/2E5O
Other resources include: * Neanderthal genome: http://neandertal.ensemblgenomes.org/index.html
Load necessary packages:
Download and load drawProteins from Bioconductor
library(BiocManager) #Allow access to Bioconductor packages
## Bioconductor version '3.13' is out-of-date; the current release version '3.14'
## is available with R version '4.1'; see https://bioconductor.org/install
#install("drawProteins")
library(drawProteins)
Load other packages
# github packages
library(compbio4all)
library(ggmsa)
## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
# CRAN packages
library(rentrez)
library(seqinr)
library(ape)
##
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
##
## as.alignment, consensus
library(pander)
library(ggplot2)
# Bioconductor packages
## msa
###The msa output is used to make a distance matrix and then phylogenetics trees
library(msa)
## Loading required package: Biostrings
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: XVector
## Loading required package: GenomeInfoDb
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
##
## complement
## The following object is masked from 'package:seqinr':
##
## translate
## The following object is masked from 'package:base':
##
## strsplit
##
## Attaching package: 'msa'
## The following object is masked from 'package:BiocManager':
##
## version
library(drawProteins)
## Biostrings
library(Biostrings)
library(HGNChelper)
#Accession Number Table Accession numbers were obtained from RefSeq, Refseq Homlogene, UniProt and PDB. UniProt accession numbers can be found by searching for the gene name. PDB accessions can be found by searching with a UniProt accession or a gene name, though many proteins are not in PDB. The the Neanderthal genome database was searched but did not yield sequence information on. TRIP4.
Not available: * Neanderthal
# Refseq Uniprot PBD sci name common name gene name
trip4_table <- c("NP_057297.2", "Q15650", "2E5O", "Homo sapiens", "Human", "TRIP4",
"XP_510472.2", "K7BG15", "NA", "Pan troglodytes", "Chimpanzee", "TRIP4",
"NP_001039816.1", "Q2KIC4", "NA", "Bos taurus", "Cattle", "TRIP4",
"NP_062771.2","Q9QXN3","NA","Mus musculus","Mouse","TRIP4",
"NP_001128453.1","B5DEP5","NA","Rattus norvegicus","Norway rat","TRIP4",
"NP_001071043.2","Q08CA0","NA","Danio rerio","Zebrafish","TRIP4",
"XP_535513.2","F6UYL9","NA","Canis lupus familiaris","Dog","TRIP4",
"XP_014941048.1","A0A6J0A8Z9","NA","Acinonyx jubatus","Cheetah","TRIP4",
"XP_005685718.1","NA","NA","Capra hircus","Goat","TRIP4",
"XP_015134569.2","NA","NA","Gallus gallus","Chicken","TRIP4"
)
# convert the vector to matrix using matrix()
trip4_table_matrix <- matrix(trip4_table,
byrow = T,
nrow = 10)
# convert the matrix to a dataframe using data.frame()
trip4_table <- data.frame(trip4_table_matrix,
stringsAsFactors = F)
# name columns of dataframe using names() function
names(trip4_table) <- c("ncbi.protein.accession","UniProt.id", "PBD", "Species", "common.name","gene.name")
pander::pander(trip4_table)
| ncbi.protein.accession | UniProt.id | PBD | Species |
|---|---|---|---|
| NP_057297.2 | Q15650 | 2E5O | Homo sapiens |
| XP_510472.2 | K7BG15 | NA | Pan troglodytes |
| NP_001039816.1 | Q2KIC4 | NA | Bos taurus |
| NP_062771.2 | Q9QXN3 | NA | Mus musculus |
| NP_001128453.1 | B5DEP5 | NA | Rattus norvegicus |
| NP_001071043.2 | Q08CA0 | NA | Danio rerio |
| XP_535513.2 | F6UYL9 | NA | Canis lupus familiaris |
| XP_014941048.1 | A0A6J0A8Z9 | NA | Acinonyx jubatus |
| XP_005685718.1 | NA | NA | Capra hircus |
| XP_015134569.2 | NA | NA | Gallus gallus |
| common.name | gene.name |
|---|---|
| Human | TRIP4 |
| Chimpanzee | TRIP4 |
| Cattle | TRIP4 |
| Mouse | TRIP4 |
| Norway rat | TRIP4 |
| Zebrafish | TRIP4 |
| Dog | TRIP4 |
| Cheetah | TRIP4 |
| Goat | TRIP4 |
| Chicken | TRIP4 |
All sequences were downloaded using a wrapper compbio4all::entrez_fetch_list() which uses rentrez::entrez_fetch() to access NCBI databases.
#Download FASTA sequences
trip4_list <- compbio4all::entrez_fetch_list(db = "protein", id = trip4_table$ncbi.protein.accession, rettype = "fasta")
#Number of FASTA files obtained
length(trip4_list)
## [1] 10
# First entry (human)
trip4_human <- trip4_list[[1]]
Remove FASTA header
# This step removes the FASTA header and performs cleaning steps required for particular analyses
for(i in 1:length(trip4_list)){
trip4_list[[i]] <- compbio4all::fasta_cleaner(trip4_list[[i]], parse = F)
}
# Protein for human
# Use a UniProt accession to download data from UniProt, which produces a list
Q15650_list <- drawProteins::get_features("Q15650")
## [1] "Download has worked"
is(Q15650_list)
## [1] "list" "vector" "list_OR_List" "vector_OR_Vector"
## [5] "vector_OR_factor"
# Convert raw data from the webpage to dataframe
my_prot_df <- drawProteins::feature_to_dataframe(Q15650_list)
# View dataframe
my_prot_df[,-2]
## type begin end length accession entryName taxid order
## featuresTemp INIT_MET 1 1 0 Q15650 TRIP4_HUMAN 9606 1
## featuresTemp.1 CHAIN 2 581 579 Q15650 TRIP4_HUMAN 9606 1
## featuresTemp.2 DOMAIN 437 531 94 Q15650 TRIP4_HUMAN 9606 1
## featuresTemp.3 ZN_FING 171 187 16 Q15650 TRIP4_HUMAN 9606 1
## featuresTemp.4 REGION 97 118 21 Q15650 TRIP4_HUMAN 9606 1
## featuresTemp.5 REGION 200 300 100 Q15650 TRIP4_HUMAN 9606 1
## featuresTemp.6 REGION 300 400 100 Q15650 TRIP4_HUMAN 9606 1
## featuresTemp.7 MOD_RES 2 2 0 Q15650 TRIP4_HUMAN 9606 1
## featuresTemp.8 MOD_RES 276 276 0 Q15650 TRIP4_HUMAN 9606 1
## featuresTemp.9 MOD_RES 289 289 0 Q15650 TRIP4_HUMAN 9606 1
## featuresTemp.10 MOD_RES 341 341 0 Q15650 TRIP4_HUMAN 9606 1
## featuresTemp.11 CROSSLNK 324 324 0 Q15650 TRIP4_HUMAN 9606 1
## featuresTemp.12 CROSSLNK 325 325 0 Q15650 TRIP4_HUMAN 9606 1
## featuresTemp.13 CROSSLNK 334 334 0 Q15650 TRIP4_HUMAN 9606 1
## featuresTemp.14 CROSSLNK 367 367 0 Q15650 TRIP4_HUMAN 9606 1
## featuresTemp.15 CONFLICT 157 157 0 Q15650 TRIP4_HUMAN 9606 1
## featuresTemp.16 CONFLICT 164 164 0 Q15650 TRIP4_HUMAN 9606 1
## featuresTemp.17 CONFLICT 475 475 0 Q15650 TRIP4_HUMAN 9606 1
## featuresTemp.18 STRAND 435 439 4 Q15650 TRIP4_HUMAN 9606 1
## featuresTemp.19 HELIX 443 448 5 Q15650 TRIP4_HUMAN 9606 1
## featuresTemp.20 STRAND 454 459 5 Q15650 TRIP4_HUMAN 9606 1
## featuresTemp.21 STRAND 465 471 6 Q15650 TRIP4_HUMAN 9606 1
## featuresTemp.22 HELIX 478 492 14 Q15650 TRIP4_HUMAN 9606 1
## featuresTemp.23 STRAND 504 518 14 Q15650 TRIP4_HUMAN 9606 1
## featuresTemp.24 HELIX 521 525 4 Q15650 TRIP4_HUMAN 9606 1
## featuresTemp.25 TURN 530 532 2 Q15650 TRIP4_HUMAN 9606 1
## featuresTemp.26 STRAND 535 546 11 Q15650 TRIP4_HUMAN 9606 1
## featuresTemp.27 STRAND 557 561 4 Q15650 TRIP4_HUMAN 9606 1
## featuresTemp.28 HELIX 564 572 8 Q15650 TRIP4_HUMAN 9606 1
## featuresTemp.29 HELIX 573 575 2 Q15650 TRIP4_HUMAN 9606 1
# Key domain
my_canvas <- draw_canvas(my_prot_df)
my_canvas <- draw_chains(my_canvas, my_prot_df, label_size = 2.5)
my_canvas <- draw_domains(my_canvas, my_prot_df)
my_canvas
# Create vector and FASTA Clean the TRIP4 human protein
TRIP4_human_vector <- compbio4all::fasta_cleaner(trip4_human)
# Default dotplot
seqinr::dotPlot(TRIP4_human_vector,
TRIP4_human_vector)
# set up 2 x 2 grid and show different values for dotplot settings
par(mfrow = c(2,2),
mar = c(0,0,2,1))
# plot 1: TRIP4 - Defaults
dotPlot(TRIP4_human_vector,
TRIP4_human_vector,
wsize = 1,
wstep = 1,
nmatch = 1,
xlab = "trip4_human_vector",
ylab = "trip4_human_vector",
main = "TRIP4 Defaults")
# plot 2: TRIP4 - size = 10, nmatch = 1
dotPlot(TRIP4_human_vector,
TRIP4_human_vector,
wsize = 10 ,
wstep = 1,
nmatch = 1,
xlab = "trip4_human_vector",
ylab = "trip4_human_vector",
main = " TRIP4- size = 10, nmatch = 1")
# plot 3: TRIP4 - size = 10 , nmatch = 5
dotPlot(TRIP4_human_vector,
TRIP4_human_vector,
wsize = 10,
wstep = 1,
nmatch = 5,
xlab = "trip4_human_vector",
ylab = "trip4_human_vector",
main = "TRIP4 - size = 10 , nmatch = 5")
# plot 4: TRIP4 - size = 20, nmatch = 5
dotPlot(TRIP4_human_vector,
TRIP4_human_vector,
wsize = 20,
wstep = 1,
nmatch = 5,
xlab = "trip4_human_vector",
ylab = "trip4_human_vector",
main = "TRIP4 - size = 20, nmatch = 5")
# Best version of the dot plot
par(mfrow = c(1,1),
mar = c(4,4,4,4))
dotPlot(TRIP4_human_vector,
TRIP4_human_vector,
wsize = 20,
wstep = 1,
nmatch = 5,
xlab = "trip4_human_vector",
ylab = "trip4_human_vector",
# main = "TRIP4 Dotplot"
main = "TRIP4 - window = 20, nmatch = 5")
Note: There was no information available on the presence of disorganized regions or tandem repeats.
## Make table consisitng of protein properties from different databases
trip4_protProp <- c("Pfam","There are two interesting domains presented in this database. The first is a zf-C2HC5 domain from AA 168-216. The second is an ASCH domain from AA 437-551.","http://pfam.xfam.org/protein/TRIP4_HUMAN",
"Uniprot","TRIP4 is a protein coding gene for Activating signal cointegrator 1, found in the subcellular locations - nucleus and cytoplasm/cytosol.","https://www.uniprot.org/uniprot/Q15650",
"Alphafold"," Alphafold predicts with moderate confidence that the Activating signal cointegrator 1 protein, which is encoded by the TRIP4 gene consists mainly of alpha helices and some beta strands.","https://alphafold.ebi.ac.uk/entry/Q15650",
"Disprot", "NA", "NA",
"RepeatsDB", "NA", "NA")
# convert the vector to matrix using matrix()
trip4_protProp_matrix <- matrix(trip4_protProp,
byrow = T,
nrow = 5)
# convert the matrix to a dataframe using data.frame()
trip4_protProp <- data.frame(trip4_protProp_matrix,
stringsAsFactors = F)
# name columns of dataframe using names() function
names(trip4_protProp) <- c("Source","Protein Property", "Link")
pander::pander(trip4_protProp)
| Source | Protein Property |
|---|---|
| Pfam | There are two interesting domains presented in this database. The first is a zf-C2HC5 domain from AA 168-216. The second is an ASCH domain from AA 437-551. |
| Uniprot | TRIP4 is a protein coding gene for Activating signal cointegrator 1, found in the subcellular locations - nucleus and cytoplasm/cytosol. |
| Alphafold | Alphafold predicts with moderate confidence that the Activating signal cointegrator 1 protein, which is encoded by the TRIP4 gene consists mainly of alpha helices and some beta strands. |
| Disprot | NA |
| RepeatsDB | NA |
| Link |
|---|
| http://pfam.xfam.org/protein/TRIP4_HUMAN |
| https://www.uniprot.org/uniprot/Q15650 |
| https://alphafold.ebi.ac.uk/entry/Q15650 |
| NA |
| NA |
Multivariate statistical techniques were used to confirm the information about protein structure and location in the online database.
Uniprot (which uses http://www.csbio.sjtu.edu.cn/bioinf/Cell-PLoc-2/) indicates that the protein is found in the nucleus and cytoplasm.
#Make 2 vectors of amino acids.
aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
"L","K","M","F","P","S","T","W","Y","V")
aa.1.2 <- c("A","R","N","D","C","Q","E","G","H","I",
"L","K","M","F","P","S","T","W","Y","V")
# Use logical check to make sure vectors are same length and contain same values
length(aa.1.1) == length(aa.1.2)
## [1] TRUE
aa.1.1 == aa.1.2
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE
# Shows number of unique values present
unique(aa.1.1)
## [1] "A" "R" "N" "D" "C" "Q" "E" "G" "H" "I" "L" "K" "M" "F" "P" "S" "T" "W" "Y"
## [20] "V"
# Show both vectors have same number of unique values
length(unique(aa.1.1)) == length(unique(aa.1.2))
## [1] TRUE
# Creates a vector of each amino acid frequency in the database of proteins of each type that Chou used
# Alpha proteins
alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91,
221, 249, 48, 123, 82, 122, 119, 33, 63, 167)
# Check against Chou's total
sum(alpha) == 2447
## [1] TRUE
# Beta proteins
beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120,
177, 115, 16, 85, 127, 341, 253, 44, 110, 229)
# check against Chou's total
sum(beta) == 2776
## [1] TRUE
# Alpha and Beta
a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
110, 112, 25, 52, 71, 126, 117, 30, 108, 123)
# Check against Chou's total
sum(a.plus.b) == 1889
## [1] TRUE
# alpha/beta
a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239,
339, 321, 91, 158, 188, 327, 238, 72, 130, 378)
# Check against Chou's total
sum(a.div.b) == 4333
## [1] TRUE
# Create a dataframe of the data
chouDF <- data.frame(aa.1.1, alpha, beta, a.plus.b, a.div.b)
# Print result (dataframe)
pander::pander(chouDF)
| aa.1.1 | alpha | beta | a.plus.b | a.div.b |
|---|---|---|---|---|
| A | 285 | 203 | 175 | 361 |
| R | 53 | 67 | 78 | 146 |
| N | 97 | 139 | 120 | 183 |
| D | 163 | 121 | 111 | 244 |
| C | 22 | 75 | 74 | 63 |
| Q | 67 | 122 | 74 | 114 |
| E | 134 | 86 | 86 | 257 |
| G | 197 | 297 | 171 | 377 |
| H | 111 | 49 | 33 | 107 |
| I | 91 | 120 | 93 | 239 |
| L | 221 | 177 | 110 | 339 |
| K | 249 | 115 | 112 | 321 |
| M | 48 | 16 | 25 | 91 |
| F | 123 | 85 | 52 | 158 |
| P | 82 | 127 | 71 | 188 |
| S | 122 | 341 | 126 | 327 |
| T | 119 | 253 | 117 | 238 |
| W | 33 | 44 | 30 | 72 |
| Y | 63 | 110 | 108 | 130 |
| V | 167 | 229 | 123 | 378 |
Calculate proportions for each of the four protein fold types
alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)
Create dataframe
#dataframe
aa.prop <- data.frame(alpha.prop,
beta.prop,
a.plus.b.prop,
a.div.b)
#row labels
row.names(aa.prop) <- aa.1.1
# Print dataframe
pander(aa.prop)
| alpha.prop | beta.prop | a.plus.b.prop | a.div.b | |
|---|---|---|---|---|
| A | 0.1165 | 0.07313 | 0.09264 | 0.08331 |
| R | 0.02166 | 0.02414 | 0.04129 | 0.03369 |
| N | 0.03964 | 0.05007 | 0.06353 | 0.04223 |
| D | 0.06661 | 0.04359 | 0.05876 | 0.05631 |
| C | 0.008991 | 0.02702 | 0.03917 | 0.01454 |
| Q | 0.02738 | 0.04395 | 0.03917 | 0.02631 |
| E | 0.05476 | 0.03098 | 0.04553 | 0.05931 |
| G | 0.08051 | 0.107 | 0.09052 | 0.08701 |
| H | 0.04536 | 0.01765 | 0.01747 | 0.02469 |
| I | 0.03719 | 0.04323 | 0.04923 | 0.05516 |
| L | 0.09031 | 0.06376 | 0.05823 | 0.07824 |
| K | 0.1018 | 0.04143 | 0.05929 | 0.07408 |
| M | 0.01962 | 0.005764 | 0.01323 | 0.021 |
| F | 0.05027 | 0.03062 | 0.02753 | 0.03646 |
| P | 0.03351 | 0.04575 | 0.03759 | 0.04339 |
| S | 0.04986 | 0.1228 | 0.0667 | 0.07547 |
| T | 0.04863 | 0.09114 | 0.06194 | 0.05493 |
| W | 0.01349 | 0.01585 | 0.01588 | 0.01662 |
| Y | 0.02575 | 0.03963 | 0.05717 | 0.03 |
| V | 0.06825 | 0.08249 | 0.06511 | 0.08724 |
Determine the number of each amino acid in the protein
table(TRIP4_human_vector)
## TRIP4_human_vector
## A C D E F G H I K L M N P Q R S T V W Y
## 26 15 34 49 22 38 14 30 54 60 5 21 25 42 32 42 24 30 9 9
# Convert table to vector
table.to.vec <- function(table_x){
table_names <- attr(table_x, "dimnames")[[1]]
table_vect <- as.vector(table_x)
names(table_vect) <- table_names
return(table_vect)
}
Create a table of amino acid frequencies and convert it to a vector.
TRIP4_human_table <- table(TRIP4_human_vector)/length(TRIP4_human_vector)
TRIP4.human.aa.freq <- table.to.vec(TRIP4_human_table)
# Print result
TRIP4.human.aa.freq
## A C D E F G
## 0.044750430 0.025817556 0.058519793 0.084337349 0.037865749 0.065404475
## H I K L M N
## 0.024096386 0.051635112 0.092943201 0.103270224 0.008605852 0.036144578
## P Q R S T V
## 0.043029260 0.072289157 0.055077453 0.072289157 0.041308090 0.051635112
## W Y
## 0.015490534 0.015490534
# Check for presence of U, which is an unknown amino acid
aa.names <- names(TRIP4.human.aa.freq)
any(aa.names == "U") #Check for any amino acids that has U
## [1] FALSE
# Result returns FALSE, so no "U" is present
# Add protein to frequency table
aa.prop$TRIP4.human.aa.freq <- TRIP4.human.aa.freq
pander::pander(aa.prop)
| alpha.prop | beta.prop | a.plus.b.prop | a.div.b | TRIP4.human.aa.freq | |
|---|---|---|---|---|---|
| A | 0.1165 | 0.07313 | 0.09264 | 0.08331 | 0.04475 |
| R | 0.02166 | 0.02414 | 0.04129 | 0.03369 | 0.02582 |
| N | 0.03964 | 0.05007 | 0.06353 | 0.04223 | 0.05852 |
| D | 0.06661 | 0.04359 | 0.05876 | 0.05631 | 0.08434 |
| C | 0.008991 | 0.02702 | 0.03917 | 0.01454 | 0.03787 |
| Q | 0.02738 | 0.04395 | 0.03917 | 0.02631 | 0.0654 |
| E | 0.05476 | 0.03098 | 0.04553 | 0.05931 | 0.0241 |
| G | 0.08051 | 0.107 | 0.09052 | 0.08701 | 0.05164 |
| H | 0.04536 | 0.01765 | 0.01747 | 0.02469 | 0.09294 |
| I | 0.03719 | 0.04323 | 0.04923 | 0.05516 | 0.1033 |
| L | 0.09031 | 0.06376 | 0.05823 | 0.07824 | 0.008606 |
| K | 0.1018 | 0.04143 | 0.05929 | 0.07408 | 0.03614 |
| M | 0.01962 | 0.005764 | 0.01323 | 0.021 | 0.04303 |
| F | 0.05027 | 0.03062 | 0.02753 | 0.03646 | 0.07229 |
| P | 0.03351 | 0.04575 | 0.03759 | 0.04339 | 0.05508 |
| S | 0.04986 | 0.1228 | 0.0667 | 0.07547 | 0.07229 |
| T | 0.04863 | 0.09114 | 0.06194 | 0.05493 | 0.04131 |
| W | 0.01349 | 0.01585 | 0.01588 | 0.01662 | 0.05164 |
| Y | 0.02575 | 0.03963 | 0.05717 | 0.03 | 0.01549 |
| V | 0.06825 | 0.08249 | 0.06511 | 0.08724 | 0.01549 |
#Two custom functions: the first one calculates correlates between two columns of our table, and the second one to calculates correlation similarities
chou_cor <- function(x,y){
numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)
}
chou_cosine <- function(z.1, z.2){
z.1.abs <- sqrt(sum(z.1^2))
z.2.abs <- sqrt(sum(z.2^2))
my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs)
return(my.cosine)
}
#Calculates and displays correlation between each column
corr.alpha <- chou_cor(aa.prop[,5], aa.prop[,1])
corr.beta <- chou_cor(aa.prop[,5], aa.prop[,2])
corr.apb <- chou_cor(aa.prop[,5], aa.prop[,3])
corr.adb <- chou_cor(aa.prop[,5], aa.prop[,4])
# Display result for alpha
corr.alpha
## [1] 0.739012
# Display result for beta
corr.beta
## [1] 0.7483259
# Display result for apb
corr.apb
## [1] 0.7825123
# Display result for adb
corr.adb
## [1] 0.7662036
Calculates and displays the cosine similarity
cos.alpha <- chou_cosine(aa.prop[,5], aa.prop[,1])
cos.beta <- chou_cosine(aa.prop[,5], aa.prop[,2])
cos.apb <- chou_cosine(aa.prop[,5], aa.prop[,3])
cos.adb <- chou_cosine(aa.prop[,5], aa.prop[,4])
# Display cosine similarities results
cos.alpha
## [1] 0.739012
cos.beta
## [1] 0.7483259
cos.apb
## [1] 0.7825123
cos.adb
## [1] 0.7662036
Calculate Euclidean distance
# Euclidean distance
aa.prop.flipped <- t(aa.prop)
round(aa.prop.flipped,2)
## A R N D C Q E G H I L K
## alpha.prop 0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09 0.10
## beta.prop 0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06 0.04
## a.plus.b.prop 0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06 0.06
## a.div.b 0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08 0.07
## TRIP4.human.aa.freq 0.04 0.03 0.06 0.08 0.04 0.07 0.02 0.05 0.09 0.10 0.01 0.04
## M F P S T W Y V
## alpha.prop 0.02 0.05 0.03 0.05 0.05 0.01 0.03 0.07
## beta.prop 0.01 0.03 0.05 0.12 0.09 0.02 0.04 0.08
## a.plus.b.prop 0.01 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## a.div.b 0.02 0.04 0.04 0.08 0.05 0.02 0.03 0.09
## TRIP4.human.aa.freq 0.04 0.07 0.06 0.07 0.04 0.05 0.02 0.02
Calculate Distance Matrix
dist(aa.prop.flipped, method = "euclidean")
## alpha.prop beta.prop a.plus.b.prop a.div.b
## beta.prop 0.13342098
## a.plus.b.prop 0.09281824 0.08289406
## a.div.b 0.06699039 0.08659174 0.06175113
## TRIP4.human.aa.freq 0.18410888 0.18214298 0.16315846 0.17061508
Calculates and displays individual distances
dist.alpha <- dist((aa.prop.flipped[c(1,5),]), method = "euclidean")
dist.beta <- dist((aa.prop.flipped[c(2,5),]), method = "euclidean")
dist.apb <- dist((aa.prop.flipped[c(3,5),]), method = "euclidean")
dist.adb <- dist((aa.prop.flipped[c(4,5),]), method = "euclidean")
# Display results
dist.alpha
## alpha.prop
## TRIP4.human.aa.freq 0.1841089
dist.beta
## beta.prop
## TRIP4.human.aa.freq 0.182143
dist.apb
## a.plus.b.prop
## TRIP4.human.aa.freq 0.1631585
dist.adb
## a.div.b
## TRIP4.human.aa.freq 0.1706151
# Compile info and rounds it. More readble
fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")
corr.sim <- round(c(corr.alpha,corr.beta,corr.apb,corr.adb),5)
cosine.sim <- round(c(cos.alpha,cos.beta,cos.apb,cos.adb),5)
Euclidean.dist <- round(c(dist.alpha,dist.beta,dist.apb,dist.adb),5)
sim.sum <- c("","","most.sim","")
dist.sum <- c("","","min.dist","")
df2 <- data.frame(fold.type,
corr.sim ,
cosine.sim ,
Euclidean.dist ,
sim.sum ,
dist.sum )
# Display result
pander(df2)
| fold.type | corr.sim | cosine.sim | Euclidean.dist | sim.sum | dist.sum |
|---|---|---|---|---|---|
| alpha | 0.739 | 0.739 | 0.1841 | ||
| beta | 0.7483 | 0.7483 | 0.1821 | ||
| alpha plus beta | 0.7825 | 0.7825 | 0.1632 | most.sim | min.dist |
| alpha/beta | 0.7662 | 0.7662 | 0.1706 |
# Info about list
names(trip4_list)
## [1] "NP_057297.2" "XP_510472.2" "NP_001039816.1" "NP_062771.2"
## [5] "NP_001128453.1" "NP_001071043.2" "XP_535513.2" "XP_014941048.1"
## [9] "XP_005685718.1" "XP_015134569.2"
length(trip4_list)
## [1] 10
# First entry data
trip4_list[[1]]
## [1] "MAVAGAVSGEPLVHWCTQQLRKTFGLDVSEEIIQYVLSIESAEEIREYVTDLLQGNEGKKGQFIEELITKWQKNDQELISDPLQQCFKKDEILDGQKSGDHLKRGRKKGRNRQEVPAFTEPDTTAEVKTPFDLAKAQENSNSVKKKTKFVNLYTREGQDRLAVLLPGRHPCDCLGQKHKLINNCLICGRIVCEQEGSGPCLFCGTLVCTHEEQDILQRDSNKSQKLLKKLMSGVENSGKVDISTKDLLPHQELRIKSGLEKAIKHKDKLLEFDRTSIRRTQVIDDESDYFASDSNQWLSKLERETLQKREEELRELRHASRLSKKVTIDFAGRKILEEENSLAEYHSRLDETIQAIANGTLNQPLTKLDRSSEEPLGVLVNPNMYQSPPQWVDHTGAASQKKAFRSSGFGLEFNSFQHQLRIQDQEFQEGFDGGWCLSVHQPWASLLVRGIKRVEGRSWYTPHRGRLWIAATAKKPSPQEVSELQATYRLLRGKDVEFPNDYPSGCLLGCVDLIDCLSQKQFKEQFPDISQESDSPFVFICKNPQEMVVKFPIKGNPKIWKLDSKIHQGAKKGLMKQNKAV"
# Creates and prints empty vector
TRIP4_vector <- rep(NA, length(trip4_list))
TRIP4_vector
## [1] NA NA NA NA NA NA NA NA NA NA
# Add info to the vector using for loop
for(i in 1:length(TRIP4_vector)){
TRIP4_vector[i] <- trip4_list[[i]]
}
# Name the vector
names(TRIP4_vector) <- names(trip4_list)
# Turn sequences into vectors
humanTRIP4 <- fasta_cleaner(trip4_list[[1]], parse = F)
chimpanzeeTRIP4 <- fasta_cleaner(trip4_list[[2]], parse = F)
cattleTRIP4 <- fasta_cleaner(trip4_list[[3]], parse = F)
norwayratTRIP4 <- fasta_cleaner(trip4_list[[5]], parse = F)
Performing Global Pairwise Alignment on Sequences
# Aligning human seq vs chimp seq
align.human.vs.chimp <- Biostrings::pairwiseAlignment(
humanTRIP4,
chimpanzeeTRIP4)
# Aligning human seq vs cattle seq
align.human.vs.cattle <- Biostrings::pairwiseAlignment(
humanTRIP4,
cattleTRIP4)
# Aligning human seq vs Norway rat seq
align.human.vs.norwayrat <- Biostrings::pairwiseAlignment(
humanTRIP4,
norwayratTRIP4)
#Percent sequence alignments between species
# PID of human vs chimp
Biostrings::pid(align.human.vs.chimp)
## [1] 99.82788
# PID of human vs cattle
Biostrings::pid(align.human.vs.cattle)
## [1] 87.62887
# PID of human vs norway rat
Biostrings::pid(align.human.vs.norwayrat)
## [1] 88.64028
Creating Global pairwise alignments for a matrix
# Aligning chimp seq vs Norway rat seq
align.chimp.vs.norwayrat <- Biostrings::pairwiseAlignment(
chimpanzeeTRIP4,
norwayratTRIP4)
# Aligning chimp seq vs cattle seq
align.chimp.vs.cattle <- Biostrings::pairwiseAlignment(
chimpanzeeTRIP4,
cattleTRIP4)
# Aligning Norway rat seq vs cattle seq
align.norwayrat.vs.cattle <- Biostrings::pairwiseAlignment(
norwayratTRIP4,
cattleTRIP4)
Generating a matrix of PID
pids <- c(1, NA, NA, NA,
pid(align.human.vs.chimp), 1, NA, NA,
pid(align.human.vs.cattle), pid(align.chimp.vs.norwayrat), 1, NA,
pid(align.human.vs.norwayrat), pid(align.chimp.vs.cattle), pid(align.norwayrat.vs.cattle), 1)
mat <- matrix(pids, nrow = 4, byrow = T)
row.names(mat) <- c("Homo","Pan","Rat","Bos")
colnames(mat) <- c("Homo","Pan","Rat","Bos")
pander::pander(mat)
| Homo | Pan | Rat | Bos | |
|---|---|---|---|---|
| Homo | 1 | NA | NA | NA |
| Pan | 99.83 | 1 | NA | NA |
| Rat | 87.63 | 88.47 | 1 | NA |
| Bos | 88.64 | 87.46 | 83.68 | 1 |
Comparing different PID methods.
A comparison of chimps and human PID with different methods.
PID1 <- pid(align.human.vs.chimp, type = "PID1")
PID2 <- pid(align.human.vs.chimp, type = "PID2")
PID3 <- pid(align.human.vs.chimp, type = "PID3")
PID4 <- pid(align.human.vs.chimp, type = "PID4")
# Generates the dataframe
Method <- c("PID1", "PID2", "PID3", "PID4")
PID <- c(PID1, PID2, PID3, PID4)
Denominator <- c("(aligned positions + internal gap positions)", "(aligned positions)", "(length shorter sequence)", "(average length of the two sequences)")
# Convert vectors to dataframe
pidDF <- data.frame(Method,
PID,
Denominator)
# Print result (table)
pander::pander(pidDF)
| Method | PID | Denominator |
|---|---|---|
| PID1 | 99.83 | (aligned positions + internal gap positions) |
| PID2 | 99.83 | (aligned positions) |
| PID3 | 99.83 | (length shorter sequence) |
| PID4 | 99.83 | (average length of the two sequences) |
For use with R bioinformatics tools we need to convert our named vector to a string set using Biostrings::AAStringSet(). Note the _ss tag at the end of the object we’re assigning the output to, which designates this as a string set.
#Convert vector into stringset
TRIP4_vector_ss <- Biostrings::AAStringSet(TRIP4_vector)
# Uses default substitution matrix
TRIP4_align <- msa(TRIP4_vector_ss,
method = "ClustalW")
## use default substitution matrix
MSA produces a species MSA objects
# Info about TRIP4_align
class(TRIP4_align)
## [1] "MsaAAMultipleAlignment"
## attr(,"package")
## [1] "msa"
is(TRIP4_align)
## [1] "MsaAAMultipleAlignment" "AAMultipleAlignment" "MsaMetaData"
## [4] "MultipleAlignment"
Default output of MSA
TRIP4_align
## CLUSTAL 2.1
##
## Call:
## msa(TRIP4_vector_ss, method = "ClustalW")
##
## MsaAAMultipleAlignment with 10 rows and 590 columns
## aln names
## [1] MAVAGAVSGEPLVHWCTQQLRKTFG...PKIWKLDSKIHQGAKKGLMKQNKAV NP_057297.2
## [2] MAVAGAVSGEPLVHWCTQQLRKTFG...PKIWKLDSKIHQGAKKGLMKQNKAV XP_510472.2
## [3] MAVAGAASREQLVGWCTEQLRKSFG...PKIWKLDSKIHQGAKKGLMKQNKAG NP_001039816.1
## [4] MAVAGAASRQQLVGWCTEQLRKSFG...PKIWKLDSKIHQGAKKGLMKQNKAG XP_005685718.1
## [5] MAVAGAASREQLVGWCTQELRKTFG...PKIWKLDSKIHQGAKKGLMKQNKAV XP_535513.2
## [6] MAVAGAASREQLVGWCTQQLRKTFG...PKIWKLDSKIHQGAKKGLMKQNKAV XP_014941048.1
## [7] MAVAGAAYREPLVHWCTQQLQKTFA...PKIWKLDSKIHQGAKKGLMKQNKAV NP_062771.2
## [8] MAVAGAASGEPLVHWCTQQLRKTFA...PKIWKLDSKIHQGAKKGLMKQNKAV NP_001128453.1
## [9] MAAPG-----ALLDWCVRRLRGDFG...PKIWKLDSKIHQGAKKGLMKQKAVA XP_015134569.2
## [10] -------MSDSLLRWTVDKLKHGFG...HKIWKLESQSHQSAKKCLMQPA--- NP_001071043.2
## Con MAVAGAASRE?LV?WCTQQLRKTFG...PKIWKLDSKIHQGAKKGLMKQNKAV Consensus
Change class of alignment
class(TRIP4_align) <- "MsaAAMultipleAlignment"
#class(TRIP4_align)
Convert to seqinr format
TRIP4_align_seqinr <- msaConvert(TRIP4_align,
type = "seqinr::alignment")
Display Information
class(TRIP4_align_seqinr)
## [1] "alignment"
is(TRIP4_align_seqinr)
## [1] "alignment"
# Display output with print_msa
compbio4all::print_msa(TRIP4_align_seqinr)
## [1] "MAVAGAVSGEPLVHWCTQQLRKTFGLDVSEEIIQYVLSIESAEEIREYVTDLLQGNEGKK 0"
## [1] "MAVAGAVSGEPLVHWCTQQLRKTFGLDVSEEIIQYVLSIESAEEIREYVTDLLQGNEGKK 0"
## [1] "MAVAGAASREQLVGWCTEQLRKSFGLDVSEEIIQYVLSIESAEEIREYVTDLLQGNEGKK 0"
## [1] "MAVAGAASRQQLVGWCTEQLRKSFGLDVSEEIIQYVLSIESAEEIREYVTDLLQGNEGKK 0"
## [1] "MAVAGAASREQLVGWCTQELRKTFGLDVSEEIMQYVLSIESAEEIREYVTDLLQGNEDKK 0"
## [1] "MAVAGAASREQLVGWCTQQLRKTFGLDVSEEIMQYVLSIESAEEIREYVTDLLQGNEDKK 0"
## [1] "MAVAGAAYREPLVHWCTQQLQKTFALDVSEEIIQYVLSIENAEEIREYVTDLLQGNEGKK 0"
## [1] "MAVAGAASGEPLVHWCTQQLRKTFALDVSEEIIQYVLSIESAEEIRDYVTDLLQGNEGKK 0"
## [1] "MAAPG-----ALLDWCVRRLRGDFGLDVGEEVVRYILSITSEDEIREYVVDLLQGTEGRK 0"
## [1] "-------MSDSLLRWTVDKLKHGFGLDASDDIVQYILSIDNADEIVEYVGDLLQGTEGNK 0"
## [1] " "
## [1] "GQFIEELITKWQKNDQELISDPLQQCFKKDEILDGQKSG--DHLKRGRKKGRNRQEVPAF 0"
## [1] "GQFIEELITKWQKNDQELISDPLQQCFKKDEILDGQKSG--DHLKRGRKKGRNRQEVPAF 0"
## [1] "GQFIEELVTKWQKNDQELTSDALQQSFKKD---DGQRSG--DQLKRSRRKGRNKQEAPAF 0"
## [1] "GQFIEELVTKWQKNDQELTSDALQQSFKKD---DGQRSG--DQLKRSRRKGRNKQEAPAF 0"
## [1] "GQFIEELITKWQKNDQELSSDALPQSFKKDEMLDGQRSG--DQLKRSRRKGRNRQEVPAF 0"
## [1] "GQFIEELIAKWQKNDQELSSDALQKSFKKDEMLDGQRSG--DQLKRSRRKGRNRQEVPAF 0"
## [1] "GQFIEDLITKWQKNDQEFISDSFQQCLRKDEILDGQRSV--DQLKRSRRKGRNKQEVPAF 0"
## [1] "GQFIEDLITKWQKKDQECISDSFQQCWKKDESLDGQRSV--DQLKRSRRKGRNKQEVPAF 0"
## [1] "GRFVEELLSRWQQSSQ-SPAEPLPAYRKKDETSESPRAG--DQAKKGKRKGRNKQETPAY 0"
## [1] "QEFVDELVQRWQ-RCQTQTSDGLGGVLRKEVVMEELDTAPKDTQKKSKRKGRNKQEIVTV 0"
## [1] " "
## [1] "TEPDT-TAEVKTPFDLAKAQEN-----SNSVKKKTKFVNLYTREGQDRLAVLLPGRHPCD 0"
## [1] "TEPDT-TAEVKTPFDLAKAQEN-----SNSVKKKTKFVNLYTREGQDRLAVLLPGRHPCD 0"
## [1] "TEPDT-TTEVKTPFDLAKAQDN-----SSSLKKKTKFVSLYTKEGQDKLAVLIPGRHPCD 0"
## [1] "TEPDT-TTEVKTPFDLAKAQDN-----SSSLKKKTKFVSLYTKEGQDKLAVLIPGRHSCD 0"
## [1] "AEPDT-TKEVKTPFDLAKAQEN-----SNSLKKKTKFVSLYTKEGQDRLAVLIPGRHPCD 0"
## [1] "AEPDT-TTEVKTPFDLAKAQEN-----SSSLKKKTKFVSLYTKEGQDRLAVLIPGRHPCD 0"
## [1] "PEPDV-AVEVKTPLDLAKAQES-----NNSVKKKTRFVNLYTREGQDKLAVLLPGRHPCD 0"
## [1] "PEPDV-TVEVKTPLDLAKAQEG-----NSSVKKKTRFVSLYTREGQDKLAVLLPGRHPCD 0"
## [1] "VEPSTHVEEVKTPLDLAKAQESSTASSSNAYKKKTKYVSLYTKEGQDRLAVLIPGRHACE 0"
## [1] "SQAEP--EPVRTPIDLMKAQESS---NSSSVKKKSKFVNLYAKEGKDKLAVLLPGRQACE 0"
## [1] " "
## [1] "CLGQKHKLINNCLICGRIVCEQEGSGPCLFCGTLVCTHEEQDILQRDSNKSQKLLKKLMS 0"
## [1] "CLGQKHKLINNCLICGRIVCEQEGSGPCLFCGTLVCTHEEQDILQRDSNKSQKLLKKLMS 0"
## [1] "CLGQKHKLINNCLVCGRIVCEQEGSGPCLFCGSLVCTHEERDILQRDSNKSQKLLKKLMS 0"
## [1] "CLGQKHKLINNCLICGRIVCEQEGSGPCLFCGSLVCTHEERDILQRDSNKSQKLLKKLMS 0"
## [1] "CLGQKHKLINNCLICGRIVCEQEGSGPCFFCGTLVCTREEQDILQRDSNKSQKLLKKLMS 0"
## [1] "CLGQKHKLINNCLICGRIVCEQEGSGPCFFCGTLVCTREEQDILQRDSNKSQKLLKKLMS 0"
## [1] "CLGQKHKLINNCLVCGRIVCEQEGSGPCLFCGSLVCTNEEQDILQRDSNKSQKLLKKLMS 0"
## [1] "CLGQKHKLINNCLVCGRIVCEQEGSGPCLFCGSLVCTNEEQDILQRDSNKSQKLLKKLMS 0"
## [1] "CLGQKHKLINNCLVCGRIVCEQEGSGPCLFCGSLVCTKEEQDILQRDSNKSQKLLKKLMA 0"
## [1] "CLAQKHRLINNCLSCGRIVCEQEGSGPCLFCGSLVCTNEEQEILQRDSNKSQKLRKKLMG 0"
## [1] " "
## [1] "GVENSGKVDISTKDLLPHQELRIKSGLEKAIKHKDKLLEFDRTSIRRTQVIDDESDYFAS 0"
## [1] "GVENSGKVDISTKDLLPHQELRIKSGLEKAIKHKDKLLEFDRTSIRRTQVIDDESDYFAS 0"
## [1] "GTENSGKVDISTKDLLPHQELRFKSGLEKAIKHKDKLLEFDRTSIRRTQVIDDESDYFAS 0"
## [1] "GTENAGKVDISTKDLLPHQELRFKSGLEKAIKHKDKLLEFDRTSIRRTQVIDDESDYFAS 0"
## [1] "GTENSGKVDISTKDLLPHQELRIKSGLEKAIKHKDKLLEFDRTSIRRTQVIDDEADYFAS 0"
## [1] "GTESSGKVDISTKDLLPNQELRIKSGLEKAIKHKDKLLEYDRTSIRRTQVIDDESDYFAS 0"
## [1] "GAETSGKVDVSTKDLLPHQESRMKSGLEKAIKHKEKLLEFDRTSIRRTQVIDDESDYFAS 0"
## [1] "GAENSGKVDISTKDHPPHQESRMKSGLEKAVKHKEKLLEFDRTSIRRTQVIDDESDYFAS 0"
## [1] "GAESSGNLDAISKDLLPRQEARLKAGLEMAVKHKDKLLEFDRTSVRRTQVIDDESDYFAT 0"
## [1] "--------EGTEREYLPHQESKMKAGLEKAVKHKDKLLEFDKNSVKRTQVLDDESDYFAT 0"
## [1] " "
## [1] "DSNQWLSKLERETLQKREEELRELRHASRLSKKVTIDFAGRKILEEENSLAEYHSRLDET 0"
## [1] "DSNQWLSKLERETLQKREEELRELRHASRLSKKVTIDFAGRKILEEENSLAEYHSRLDET 0"
## [1] "DSNQWLSKIEREAIQKREEELREFRHASRLSKKITIDFAGRKIVEDEDPLAEYHSKLDEA 0"
## [1] "DSNQWLSKIEREAIQKREEELREFRHASRLSKKITIDFAGRKIVEDEDPLAEYHSKLDEA 0"
## [1] "DSNQWLSKIERETLQKREEELREFRHMSRLSKKITIDFAGRKIVEDENPLAEYHSKLDET 0"
## [1] "DSNQWLSKIERETLQKREEELREFRHMSRLSKKITIDFAGRKIMEDENPLAEYHSKLDET 0"
## [1] "DSNQWLSKVEREMLQKREEELRELRHASRLSKKVTIDFAGRKILEDENPLAEYHSRLDET 0"
## [1] "DSNQWLSKVERDMLQKREEELRELRHASRLSKKVTIDFAGRKILEDENPLAEYHSRLDET 0"
## [1] "DSNQWLSKQEREALQKREQELQELRHASRLAKKITIDFAGRQILEEDSGMAEYHSKLDET 0"
## [1] "DSNQWLSPGEREALRKREEELRELRHASRKDRKITLDFAGRRVLDEGENLSPYYQKFDET 0"
## [1] " "
## [1] "IQAIANGTLNQPLTKLDRSSEEPLGVLVNPNMYQSPPQWVDHTGAASQKKAFRSSGFGLE 0"
## [1] "IQAIANGTLNQPLTKLDRSSEEPLGVLVNPNMYQSPPQWVDHTGAASQKKAFRSSGFGLE 0"
## [1] "IHAIANGTLNQPVTKLDRSSEEPLGLLVNPNLYQSPPQWIDQMGAASQKKAFHPAGSGLE 0"
## [1] "IHAIANGTLNQPLTKLDRSSEEPLGLLVNPNLYQPPPQWIDQTGAASQKKAFHPAGSGLE 0"
## [1] "IQAIANGTLNQPLTKLDRSSDEPLGVLVNPNLYQSPPQWIDQTGAASQKKTFHSAGSELE 0"
## [1] "IQAIANGTLNQPLTKLDRSSDEPLGVLVNPNLYQSPPQWIDQTGAASQKKAFHSGGSELE 0"
## [1] "IQAIASGTLNQSLVTLDRSCEEPLGVLVNPNMYQASPQWVDNTGSTPQKKTSLSAGPRLE 0"
## [1] "IQAIANGTLNQSLVTSDRSSEEPLGVLVNPNMYQAPPQWVDNTGSAPPKKTSLSVGPRLE 0"
## [1] "VEAINCGTLSQPAASPEAKMTLSSGVLVNPHLLQPAPLWVDQTGSLPLRRATHSMGAGEE 0"
## [1] "VKAINTGSFGQKAKHSAPADRQHFRELVNPNILQSAPEWVDVASSNPSRKSSSKAASGKE 0"
## [1] " "
## [1] "FNSFQHQLRIQDQEFQEGFDGGWCLSVHQPWASLLVRGIKRVEGRSWYTPHRGRLWIAAT 0"
## [1] "FNSFQHQLRIQDQEFQEGFDGGWCLSVHQPWASLLVRGIKRVEGRSWYTPHRGRLWIAAT 0"
## [1] "FSSSRHQFRIQDQKFQEGFDGGWCLSMHQPWASLLVRGIKRVEGRNWYTPHRGRLWIAAT 0"
## [1] "FSSCRHQFRIQDQEFQEGFDGGWCLSMHQPWASLLVRGIKRVEGRSWYTPHRGRLWIAAT 0"
## [1] "FNSYRHQLRIQDQEFQEGFDGGWCLSMHQPWASLLVRGIKRVEGRNWYTPHRGRLWIAAT 0"
## [1] "LNSYRHQLRIQDQEFQEGFDGGWCLSMHQPWASLLVRGIKRVEGRSWYTPHRGRLWIAAT 0"
## [1] "PSLHQHQLRIQDQEFQEGFDGGWCLSMHQPWASLLVRGIKRVEGRSWYTPHRGRLWIAAT 0"
## [1] "LSLHPHQLRIQDQEFQEGFDGGWCLSVHQPWASLLVKGIKRVEGRSWYTPHRGRLWIAAT 0"
## [1] "FGSERNRLRIQDRELQEISDDGWCLSMHQPWASLLIRGIKRVEGRTWYTSHRGRLWIAAT 0"
## [1] "TS-ERSRLRLQDKELQEIADGGWCLSMHQPWASLLVKGIKRVEGRTWYTSHRGRLWIAAA 0"
## [1] " "
## [1] "AKKPSPQEVSELQATYRLLRGKDVEFPNDYPSGCLLGCVDLIDCLSQKQFKEQFPDISQE 0"
## [1] "AKKPSPQEVSELQATYRLLRGKDVEFPNDYPSGCLLGCVDLIDCLSQKQFKEQFPDISQE 0"
## [1] "AKRPSPQEVSELQTTYLLLRGKGVEFPSDYPSGCLLGCVDLIDCLSQKQFKDQYPDISQE 0"
## [1] "AKRPSPQEVSELQTTYLLLRGKDVEFPNDYPSGCLLGCVDLIDCLSQKQFKDQYPDISQE 0"
## [1] "AKRPSPQEVSELQTTYRVLRGKDVEFPNDYPSGCLLGCVDLIDCLSQKQFKDQYPDMSQE 0"
## [1] "AKRPSPQEVSELQTTYRLLRGKDVEFPNDYPSGCLLGCVDLIDCLSQKQFKEQYPDMSQE 0"
## [1] "GKRPSPQEVSELQATYRLLRGKDVEFPNDYPSGCLLGCVDLIDCLSQKQFQEQFPDISQE 0"
## [1] "GKRPSTQEVSELQATYRLLRGKDVEFPNDYPSGCLLGCVDLIDCLSQKQFQEQFPDISQE 0"
## [1] "AKRPSPQEISELETTYRMLLRKDVEFPNEYPSGCLLGCVDVTDCLSQEQFNEQYPDLSQE 0"
## [1] "ANRPTPQEIAEVEAMYRHLYKHEPQFPAEYPTGCLLGCVNVSDCLSQEQFREQYPQISEE 0"
## [1] " "
## [1] "-SDSPFVFICKNPQEMVVKFPIKGNPKIWKLDSKIHQGAKKGLMKQNKAV 10"
## [1] "-SDSPYVFICKNPQEMVVKFPIKGNPKIWKLDSKIHQGAKKGLMKQNKAV 10"
## [1] "ESDSPFVFICTNPQEMIVKFPIKGNPKIWKLDSKIHQGAKKGLMKQNKAG 10"
## [1] "ESDSPFVFICTNPQEMIVKFPIKGNPKIWKLDSKIHQGAKKGLMKQNKAG 10"
## [1] "-SDSPFVFICTNPQEMIVKFPIKGNPKIWKLDSKIHQGAKKGLMKQNKAV 10"
## [1] "-SDSPFVFICTNPQEMIVKFPIKGNPKIWKLDSKIHQGAKKGLMKQNKAV 10"
## [1] "-SDSSFVFICKNPQEMVVKFPIKGNPKIWKLDSKIHQGAKKGLMKQNKAV 10"
## [1] "-SDSPFVFICKNPQEMVVKFPIKGNPKIWKLDSKIHQGAKKGLMKQNKAV 10"
## [1] "-SGSPFVFICTNPQEMVVKFPIKGKPKIWKLDSKIHQGAKKGLMKQKAVA 10"
## [1] "-STSPFVFICSNPQELVVKFPMKGKHKIWKLESQSHQSAKKCLMQPA--- 10"
## [1] " "
# Display finished MSA resutlt
# Change class of the alignment
class(TRIP4_align) <- "AAMultipleAlignment"
ggmsa::ggmsa(TRIP4_align,
start = 0,
end = 50)
Make a distance matrix
This produces a “dist” class object.
# Use MSA to calculate the genetic distance
TRIP4_dist <- seqinr::dist.alignment(TRIP4_align_seqinr,
matrix = "identity")
# Information about the object
is(TRIP4_dist)
## [1] "dist" "oldClass"
class(TRIP4_dist)
## [1] "dist"
Round for display
# Round to 3 decimals
TRIP4_align_seqinr_rnd <- round(TRIP4_dist, 3)
# Display result
TRIP4_align_seqinr_rnd
## NP_057297.2 XP_510472.2 NP_001039816.1 XP_005685718.1
## XP_510472.2 0.041
## NP_001039816.1 0.343 0.346
## XP_005685718.1 0.335 0.338 0.144
## XP_535513.2 0.308 0.310 0.279 0.276
## XP_014941048.1 0.308 0.310 0.288 0.279
## NP_062771.2 0.337 0.340 0.399 0.395
## NP_001128453.1 0.337 0.340 0.397 0.392
## XP_015134569.2 0.539 0.541 0.539 0.534
## NP_001071043.2 0.639 0.641 0.634 0.634
## XP_535513.2 XP_014941048.1 NP_062771.2 NP_001128453.1
## XP_510472.2
## NP_001039816.1
## XP_005685718.1
## XP_535513.2
## XP_014941048.1 0.171
## NP_062771.2 0.385 0.387
## NP_001128453.1 0.387 0.382 0.238
## XP_015134569.2 0.532 0.532 0.553 0.558
## NP_001071043.2 0.646 0.646 0.628 0.637
## XP_015134569.2
## XP_510472.2
## NP_001039816.1
## XP_005685718.1
## XP_535513.2
## XP_014941048.1
## NP_062771.2
## NP_001128453.1
## XP_015134569.2
## NP_001071043.2 0.597
Build a phylogenetic tree from distance matrix
phyTree <- nj(TRIP4_dist)
Plotting the tree.
# plot tree
# Include title
plot.phylo(phyTree, main="Phylogenetic Tree",
use.edge.length = F)
# Adding label
mtext(text = "TRIP4 family gene tree - rooted, no branch lengths")