Introduction

This code compiles summary information about the gene TRIP4 (thyroid hormone receptor interactor 4), which encodes the activating signal cointegrator 1 protein. The protein is involved mediating interactions with transcriptional coactivators and ligand-bound nuclear receptors. An MSA and a phylogenetic tree is also generated to show the evolutionary relationship betweeen the human version of the gene and its homologs in other species.

Resources/References

Key information use to make this script can be found here: * Refseq Gene: https://www.ncbi.nlm.nih.gov/gene/9325 * Refseq Homologene: https://www.ncbi.nlm.nih.gov/homologene?LinkName=nuccore_homologene&from_uid=1015232364 * UniProt: https://www.uniprot.org/uniprot/Q15650 * PDB: https://www.rcsb.org/structure/2E5O

Other resources include: * Neanderthal genome: http://neandertal.ensemblgenomes.org/index.html

Preparation

Load necessary packages:

Download and load drawProteins from Bioconductor

library(BiocManager)  #Allow access to Bioconductor packages
## Bioconductor version '3.13' is out-of-date; the current release version '3.14'
##   is available with R version '4.1'; see https://bioconductor.org/install
#install("drawProteins")
library(drawProteins)

Load other packages

# github packages
library(compbio4all)
library(ggmsa)
## Registered S3 methods overwritten by 'ggalt':
##   method                  from   
##   grid.draw.absoluteGrob  ggplot2
##   grobHeight.absoluteGrob ggplot2
##   grobWidth.absoluteGrob  ggplot2
##   grobX.absoluteGrob      ggplot2
##   grobY.absoluteGrob      ggplot2
# CRAN packages
library(rentrez)
library(seqinr)
library(ape)
## 
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
## 
##     as.alignment, consensus
library(pander)


library(ggplot2)

# Bioconductor packages
## msa
###The msa output is used to make a distance matrix and then phylogenetics trees
library(msa)
## Loading required package: Biostrings
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: XVector
## Loading required package: GenomeInfoDb
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
## 
##     complement
## The following object is masked from 'package:seqinr':
## 
##     translate
## The following object is masked from 'package:base':
## 
##     strsplit
## 
## Attaching package: 'msa'
## The following object is masked from 'package:BiocManager':
## 
##     version
library(drawProteins)

## Biostrings
library(Biostrings)


library(HGNChelper)

#Accession Number Table Accession numbers were obtained from RefSeq, Refseq Homlogene, UniProt and PDB. UniProt accession numbers can be found by searching for the gene name. PDB accessions can be found by searching with a UniProt accession or a gene name, though many proteins are not in PDB. The the Neanderthal genome database was searched but did not yield sequence information on. TRIP4.

Not available: * Neanderthal

#                  Refseq         Uniprot  PBD      sci name   common name  gene name   
trip4_table <- c("NP_057297.2", "Q15650", "2E5O", "Homo sapiens", "Human", "TRIP4", 
                "XP_510472.2", "K7BG15", "NA", "Pan troglodytes", "Chimpanzee", "TRIP4",
                "NP_001039816.1", "Q2KIC4", "NA", "Bos taurus", "Cattle", "TRIP4",
                "NP_062771.2","Q9QXN3","NA","Mus musculus","Mouse","TRIP4",
                "NP_001128453.1","B5DEP5","NA","Rattus norvegicus","Norway rat","TRIP4",
                "NP_001071043.2","Q08CA0","NA","Danio rerio","Zebrafish","TRIP4",
                "XP_535513.2","F6UYL9","NA","Canis lupus familiaris","Dog","TRIP4",
                "XP_014941048.1","A0A6J0A8Z9","NA","Acinonyx jubatus","Cheetah","TRIP4",   
                "XP_005685718.1","NA","NA","Capra hircus","Goat","TRIP4",
                "XP_015134569.2","NA","NA","Gallus gallus","Chicken","TRIP4"
)

# convert the vector to matrix using matrix()
trip4_table_matrix <- matrix(trip4_table,
                                  byrow = T,
                                  nrow = 10)

# convert the matrix to a dataframe using data.frame()
trip4_table <- data.frame(trip4_table_matrix, 
                     stringsAsFactors = F)

# name columns of dataframe using names() function
names(trip4_table) <- c("ncbi.protein.accession","UniProt.id", "PBD", "Species", "common.name","gene.name")

pander::pander(trip4_table)
Table continues below
ncbi.protein.accession UniProt.id PBD Species
NP_057297.2 Q15650 2E5O Homo sapiens
XP_510472.2 K7BG15 NA Pan troglodytes
NP_001039816.1 Q2KIC4 NA Bos taurus
NP_062771.2 Q9QXN3 NA Mus musculus
NP_001128453.1 B5DEP5 NA Rattus norvegicus
NP_001071043.2 Q08CA0 NA Danio rerio
XP_535513.2 F6UYL9 NA Canis lupus familiaris
XP_014941048.1 A0A6J0A8Z9 NA Acinonyx jubatus
XP_005685718.1 NA NA Capra hircus
XP_015134569.2 NA NA Gallus gallus
common.name gene.name
Human TRIP4
Chimpanzee TRIP4
Cattle TRIP4
Mouse TRIP4
Norway rat TRIP4
Zebrafish TRIP4
Dog TRIP4
Cheetah TRIP4
Goat TRIP4
Chicken TRIP4

Data Preparation

Download sequences

All sequences were downloaded using a wrapper compbio4all::entrez_fetch_list() which uses rentrez::entrez_fetch() to access NCBI databases.

#Download FASTA sequences
trip4_list <- compbio4all::entrez_fetch_list(db = "protein", id = trip4_table$ncbi.protein.accession, rettype = "fasta")

#Number of FASTA files obtained
length(trip4_list)
## [1] 10
# First entry (human)
trip4_human <- trip4_list[[1]]

Initial Data Cleaning

Remove FASTA header

# This step removes the FASTA header and performs cleaning steps required for particular analyses
for(i in 1:length(trip4_list)){
  trip4_list[[i]] <- compbio4all::fasta_cleaner(trip4_list[[i]], parse = F)
}

General Protein Information

# Protein for human

# Use a UniProt accession to download data from UniProt, which produces a list
Q15650_list <- drawProteins::get_features("Q15650")
## [1] "Download has worked"
is(Q15650_list)
## [1] "list"             "vector"           "list_OR_List"     "vector_OR_Vector"
## [5] "vector_OR_factor"
# Convert raw data from the webpage to dataframe
my_prot_df <- drawProteins::feature_to_dataframe(Q15650_list)

# View dataframe
my_prot_df[,-2]
##                     type begin end length accession   entryName taxid order
## featuresTemp    INIT_MET     1   1      0    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.1     CHAIN     2 581    579    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.2    DOMAIN   437 531     94    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.3   ZN_FING   171 187     16    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.4    REGION    97 118     21    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.5    REGION   200 300    100    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.6    REGION   300 400    100    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.7   MOD_RES     2   2      0    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.8   MOD_RES   276 276      0    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.9   MOD_RES   289 289      0    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.10  MOD_RES   341 341      0    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.11 CROSSLNK   324 324      0    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.12 CROSSLNK   325 325      0    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.13 CROSSLNK   334 334      0    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.14 CROSSLNK   367 367      0    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.15 CONFLICT   157 157      0    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.16 CONFLICT   164 164      0    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.17 CONFLICT   475 475      0    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.18   STRAND   435 439      4    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.19    HELIX   443 448      5    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.20   STRAND   454 459      5    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.21   STRAND   465 471      6    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.22    HELIX   478 492     14    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.23   STRAND   504 518     14    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.24    HELIX   521 525      4    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.25     TURN   530 532      2    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.26   STRAND   535 546     11    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.27   STRAND   557 561      4    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.28    HELIX   564 572      8    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.29    HELIX   573 575      2    Q15650 TRIP4_HUMAN  9606     1

Protein diagram

# Key domain 
my_canvas <- draw_canvas(my_prot_df)  
my_canvas <- draw_chains(my_canvas, my_prot_df, label_size = 2.5)
my_canvas <- draw_domains(my_canvas, my_prot_df)
my_canvas

Dotplot

# Create vector and FASTA Clean the TRIP4 human protein
TRIP4_human_vector <- compbio4all::fasta_cleaner(trip4_human)

# Default dotplot
seqinr::dotPlot(TRIP4_human_vector,
                TRIP4_human_vector)

# set up 2 x 2 grid and show different values for dotplot settings
par(mfrow = c(2,2), 
    mar = c(0,0,2,1))

# plot 1: TRIP4 - Defaults
dotPlot(TRIP4_human_vector,
        TRIP4_human_vector,
        wsize = 1, 
        wstep = 1,
        nmatch = 1, 
        xlab = "trip4_human_vector",
        ylab = "trip4_human_vector",
        main = "TRIP4 Defaults")

# plot 2: TRIP4 - size = 10, nmatch = 1
dotPlot(TRIP4_human_vector,
        TRIP4_human_vector, 
        wsize = 10 , 
        wstep = 1,
        nmatch = 1, 
        xlab = "trip4_human_vector",
        ylab = "trip4_human_vector",
        main = " TRIP4- size = 10, nmatch = 1")

# plot 3: TRIP4 - size = 10 , nmatch = 5
dotPlot(TRIP4_human_vector,
        TRIP4_human_vector, 
        wsize = 10, 
        wstep = 1,
        nmatch = 5, 
        xlab = "trip4_human_vector",
        ylab = "trip4_human_vector",
        main = "TRIP4 - size = 10 , nmatch = 5")

# plot 4: TRIP4 - size = 20, nmatch = 5
dotPlot(TRIP4_human_vector,
        TRIP4_human_vector, 
        wsize = 20,
        wstep = 1,
        nmatch = 5,
        xlab = "trip4_human_vector",
        ylab = "trip4_human_vector",
        main = "TRIP4 - size = 20, nmatch = 5")

# Best version of the dot plot
par(mfrow = c(1,1),
    mar = c(4,4,4,4))

dotPlot(TRIP4_human_vector,
        TRIP4_human_vector,
        wsize = 20, 
        wstep = 1,
        nmatch = 5, 
        xlab = "trip4_human_vector",
        ylab = "trip4_human_vector",
       # main = "TRIP4 Dotplot"
        main = "TRIP4 - window = 20, nmatch = 5")

Protein Properties Compiled from Database

Note: There was no information available on the presence of disorganized regions or tandem repeats.

## Make table consisitng of protein properties from different databases

trip4_protProp <- c("Pfam","There are two interesting domains presented in this database. The first is a zf-C2HC5 domain from AA 168-216. The second is an ASCH domain from AA 437-551.","http://pfam.xfam.org/protein/TRIP4_HUMAN",
                          "Uniprot","TRIP4 is a protein coding gene for Activating signal cointegrator 1, found in the subcellular locations - nucleus and cytoplasm/cytosol.","https://www.uniprot.org/uniprot/Q15650",
                          "Alphafold"," Alphafold predicts with moderate confidence that the Activating signal cointegrator 1 protein, which is encoded by the TRIP4 gene consists mainly of alpha helices and some beta strands.","https://alphafold.ebi.ac.uk/entry/Q15650",
                    "Disprot", "NA", "NA",
                    "RepeatsDB", "NA", "NA")

# convert the vector to matrix using matrix()
trip4_protProp_matrix <- matrix(trip4_protProp,
                                  byrow = T,
                                  nrow = 5)

# convert the matrix to a dataframe using data.frame()
trip4_protProp <- data.frame(trip4_protProp_matrix, 
                     stringsAsFactors = F)

# name columns of dataframe using names() function
names(trip4_protProp) <- c("Source","Protein Property", "Link")

pander::pander(trip4_protProp)
Table continues below
Source Protein Property
Pfam There are two interesting domains presented in this database. The first is a zf-C2HC5 domain from AA 168-216. The second is an ASCH domain from AA 437-551.
Uniprot TRIP4 is a protein coding gene for Activating signal cointegrator 1, found in the subcellular locations - nucleus and cytoplasm/cytosol.
Alphafold Alphafold predicts with moderate confidence that the Activating signal cointegrator 1 protein, which is encoded by the TRIP4 gene consists mainly of alpha helices and some beta strands.
Disprot NA
RepeatsDB NA
Link
http://pfam.xfam.org/protein/TRIP4_HUMAN
https://www.uniprot.org/uniprot/Q15650
https://alphafold.ebi.ac.uk/entry/Q15650
NA
NA

Protein Feature Prediction

Multivariate statistical techniques were used to confirm the information about protein structure and location in the online database.

Uniprot (which uses http://www.csbio.sjtu.edu.cn/bioinf/Cell-PLoc-2/) indicates that the protein is found in the nucleus and cytoplasm.

Secondary Structure Prediction

#Make 2 vectors of amino acids. 
aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
            "L","K","M","F","P","S","T","W","Y","V")

aa.1.2 <- c("A","R","N","D","C","Q","E","G","H","I",
            "L","K","M","F","P","S","T","W","Y","V")

# Use logical check to make sure vectors are same length and contain same values
length(aa.1.1) == length(aa.1.2)
## [1] TRUE
aa.1.1 == aa.1.2
##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE
# Shows number of unique values present 
unique(aa.1.1)
##  [1] "A" "R" "N" "D" "C" "Q" "E" "G" "H" "I" "L" "K" "M" "F" "P" "S" "T" "W" "Y"
## [20] "V"
# Show both vectors have same number of unique values
length(unique(aa.1.1)) == length(unique(aa.1.2))
## [1] TRUE
# Creates a vector of each amino acid frequency in the database of proteins of each type that Chou used
# Alpha proteins
alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91, 
           221, 249, 48, 123, 82, 122, 119, 33, 63, 167)

# Check against Chou's total
sum(alpha) == 2447
## [1] TRUE
# Beta proteins
beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120, 
          177, 115, 16, 85, 127, 341, 253, 44, 110, 229)

# check against Chou's total
sum(beta) == 2776
## [1] TRUE
# Alpha and Beta
a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
              110, 112, 25, 52, 71, 126, 117, 30, 108, 123)

# Check against Chou's total
sum(a.plus.b) == 1889
## [1] TRUE
# alpha/beta
a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239, 
             339, 321, 91, 158, 188, 327, 238, 72, 130, 378)

# Check against Chou's total
sum(a.div.b) == 4333
## [1] TRUE
# Create a dataframe of the data
chouDF <- data.frame(aa.1.1, alpha, beta, a.plus.b, a.div.b)

# Print result (dataframe)
pander::pander(chouDF)
aa.1.1 alpha beta a.plus.b a.div.b
A 285 203 175 361
R 53 67 78 146
N 97 139 120 183
D 163 121 111 244
C 22 75 74 63
Q 67 122 74 114
E 134 86 86 257
G 197 297 171 377
H 111 49 33 107
I 91 120 93 239
L 221 177 110 339
K 249 115 112 321
M 48 16 25 91
F 123 85 52 158
P 82 127 71 188
S 122 341 126 327
T 119 253 117 238
W 33 44 30 72
Y 63 110 108 130
V 167 229 123 378

Calculate proportions for each of the four protein fold types

alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)

Create dataframe

#dataframe
aa.prop <- data.frame(alpha.prop,
                      beta.prop,
                      a.plus.b.prop,
                      a.div.b)
#row labels
row.names(aa.prop) <- aa.1.1
# Print dataframe
pander(aa.prop)
  alpha.prop beta.prop a.plus.b.prop a.div.b
A 0.1165 0.07313 0.09264 0.08331
R 0.02166 0.02414 0.04129 0.03369
N 0.03964 0.05007 0.06353 0.04223
D 0.06661 0.04359 0.05876 0.05631
C 0.008991 0.02702 0.03917 0.01454
Q 0.02738 0.04395 0.03917 0.02631
E 0.05476 0.03098 0.04553 0.05931
G 0.08051 0.107 0.09052 0.08701
H 0.04536 0.01765 0.01747 0.02469
I 0.03719 0.04323 0.04923 0.05516
L 0.09031 0.06376 0.05823 0.07824
K 0.1018 0.04143 0.05929 0.07408
M 0.01962 0.005764 0.01323 0.021
F 0.05027 0.03062 0.02753 0.03646
P 0.03351 0.04575 0.03759 0.04339
S 0.04986 0.1228 0.0667 0.07547
T 0.04863 0.09114 0.06194 0.05493
W 0.01349 0.01585 0.01588 0.01662
Y 0.02575 0.03963 0.05717 0.03
V 0.06825 0.08249 0.06511 0.08724

Determine the number of each amino acid in the protein

table(TRIP4_human_vector)
## TRIP4_human_vector
##  A  C  D  E  F  G  H  I  K  L  M  N  P  Q  R  S  T  V  W  Y 
## 26 15 34 49 22 38 14 30 54 60  5 21 25 42 32 42 24 30  9  9
# Convert table to vector
table.to.vec <- function(table_x){
  table_names <- attr(table_x, "dimnames")[[1]]
  table_vect <- as.vector(table_x)
  names(table_vect) <- table_names
  return(table_vect)
}

Create a table of amino acid frequencies and convert it to a vector.

TRIP4_human_table <- table(TRIP4_human_vector)/length(TRIP4_human_vector)
TRIP4.human.aa.freq <- table.to.vec(TRIP4_human_table)

# Print result
TRIP4.human.aa.freq
##           A           C           D           E           F           G 
## 0.044750430 0.025817556 0.058519793 0.084337349 0.037865749 0.065404475 
##           H           I           K           L           M           N 
## 0.024096386 0.051635112 0.092943201 0.103270224 0.008605852 0.036144578 
##           P           Q           R           S           T           V 
## 0.043029260 0.072289157 0.055077453 0.072289157 0.041308090 0.051635112 
##           W           Y 
## 0.015490534 0.015490534
# Check for presence of U, which is an unknown amino acid 
aa.names <- names(TRIP4.human.aa.freq)
any(aa.names == "U") #Check for any amino acids that has U
## [1] FALSE
# Result returns FALSE, so no "U" is present
# Add protein to frequency table
aa.prop$TRIP4.human.aa.freq <- TRIP4.human.aa.freq
pander::pander(aa.prop)
  alpha.prop beta.prop a.plus.b.prop a.div.b TRIP4.human.aa.freq
A 0.1165 0.07313 0.09264 0.08331 0.04475
R 0.02166 0.02414 0.04129 0.03369 0.02582
N 0.03964 0.05007 0.06353 0.04223 0.05852
D 0.06661 0.04359 0.05876 0.05631 0.08434
C 0.008991 0.02702 0.03917 0.01454 0.03787
Q 0.02738 0.04395 0.03917 0.02631 0.0654
E 0.05476 0.03098 0.04553 0.05931 0.0241
G 0.08051 0.107 0.09052 0.08701 0.05164
H 0.04536 0.01765 0.01747 0.02469 0.09294
I 0.03719 0.04323 0.04923 0.05516 0.1033
L 0.09031 0.06376 0.05823 0.07824 0.008606
K 0.1018 0.04143 0.05929 0.07408 0.03614
M 0.01962 0.005764 0.01323 0.021 0.04303
F 0.05027 0.03062 0.02753 0.03646 0.07229
P 0.03351 0.04575 0.03759 0.04339 0.05508
S 0.04986 0.1228 0.0667 0.07547 0.07229
T 0.04863 0.09114 0.06194 0.05493 0.04131
W 0.01349 0.01585 0.01588 0.01662 0.05164
Y 0.02575 0.03963 0.05717 0.03 0.01549
V 0.06825 0.08249 0.06511 0.08724 0.01549

Calculating Similarities with Different Functions

#Two custom functions: the first one calculates correlates between two columns of our table, and the second one to calculates correlation similarities

chou_cor <- function(x,y){
  numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)
}

chou_cosine <- function(z.1, z.2){
  z.1.abs <- sqrt(sum(z.1^2))
  z.2.abs <- sqrt(sum(z.2^2))
  my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs)
  return(my.cosine)
}
#Calculates and displays correlation between each column

corr.alpha <- chou_cor(aa.prop[,5], aa.prop[,1])
corr.beta  <- chou_cor(aa.prop[,5], aa.prop[,2])
corr.apb   <- chou_cor(aa.prop[,5], aa.prop[,3])
corr.adb   <- chou_cor(aa.prop[,5], aa.prop[,4])

# Display result for alpha
corr.alpha
## [1] 0.739012
# Display result for beta
corr.beta
## [1] 0.7483259
# Display result for apb
corr.apb
## [1] 0.7825123
# Display result for adb
corr.adb
## [1] 0.7662036

Calculates and displays the cosine similarity

cos.alpha <- chou_cosine(aa.prop[,5], aa.prop[,1])
cos.beta  <- chou_cosine(aa.prop[,5], aa.prop[,2])
cos.apb   <- chou_cosine(aa.prop[,5], aa.prop[,3])
cos.adb   <- chou_cosine(aa.prop[,5], aa.prop[,4])

# Display cosine similarities results
cos.alpha
## [1] 0.739012
cos.beta
## [1] 0.7483259
cos.apb
## [1] 0.7825123
cos.adb
## [1] 0.7662036

Calculate Euclidean distance

# Euclidean distance
aa.prop.flipped <- t(aa.prop)
round(aa.prop.flipped,2)
##                        A    R    N    D    C    Q    E    G    H    I    L    K
## alpha.prop          0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09 0.10
## beta.prop           0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06 0.04
## a.plus.b.prop       0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06 0.06
## a.div.b             0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08 0.07
## TRIP4.human.aa.freq 0.04 0.03 0.06 0.08 0.04 0.07 0.02 0.05 0.09 0.10 0.01 0.04
##                        M    F    P    S    T    W    Y    V
## alpha.prop          0.02 0.05 0.03 0.05 0.05 0.01 0.03 0.07
## beta.prop           0.01 0.03 0.05 0.12 0.09 0.02 0.04 0.08
## a.plus.b.prop       0.01 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## a.div.b             0.02 0.04 0.04 0.08 0.05 0.02 0.03 0.09
## TRIP4.human.aa.freq 0.04 0.07 0.06 0.07 0.04 0.05 0.02 0.02

Calculate Distance Matrix

dist(aa.prop.flipped, method = "euclidean")
##                     alpha.prop  beta.prop a.plus.b.prop    a.div.b
## beta.prop           0.13342098                                    
## a.plus.b.prop       0.09281824 0.08289406                         
## a.div.b             0.06699039 0.08659174    0.06175113           
## TRIP4.human.aa.freq 0.18410888 0.18214298    0.16315846 0.17061508

Calculates and displays individual distances

dist.alpha <- dist((aa.prop.flipped[c(1,5),]),  method = "euclidean")
dist.beta  <- dist((aa.prop.flipped[c(2,5),]),  method = "euclidean")
dist.apb   <- dist((aa.prop.flipped[c(3,5),]),  method = "euclidean")
dist.adb  <- dist((aa.prop.flipped[c(4,5),]), method = "euclidean")

# Display results
dist.alpha 
##                     alpha.prop
## TRIP4.human.aa.freq  0.1841089
dist.beta
##                     beta.prop
## TRIP4.human.aa.freq  0.182143
dist.apb
##                     a.plus.b.prop
## TRIP4.human.aa.freq     0.1631585
dist.adb
##                       a.div.b
## TRIP4.human.aa.freq 0.1706151
# Compile info and rounds it. More readble
fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")

corr.sim <- round(c(corr.alpha,corr.beta,corr.apb,corr.adb),5)
cosine.sim <- round(c(cos.alpha,cos.beta,cos.apb,cos.adb),5)
Euclidean.dist <- round(c(dist.alpha,dist.beta,dist.apb,dist.adb),5)

sim.sum <- c("","","most.sim","")
dist.sum <- c("","","min.dist","")

df2 <- data.frame(fold.type,
           corr.sim ,
           cosine.sim ,
           Euclidean.dist ,
           sim.sum ,
           dist.sum )

# Display result
pander(df2)
fold.type corr.sim cosine.sim Euclidean.dist sim.sum dist.sum
alpha 0.739 0.739 0.1841
beta 0.7483 0.7483 0.1821
alpha plus beta 0.7825 0.7825 0.1632 most.sim min.dist
alpha/beta 0.7662 0.7662 0.1706

Percent Identity Comparisons (PID)

Data Preparation

# Info about list
names(trip4_list)
##  [1] "NP_057297.2"    "XP_510472.2"    "NP_001039816.1" "NP_062771.2"   
##  [5] "NP_001128453.1" "NP_001071043.2" "XP_535513.2"    "XP_014941048.1"
##  [9] "XP_005685718.1" "XP_015134569.2"
length(trip4_list)
## [1] 10
# First entry data
trip4_list[[1]]
## [1] "MAVAGAVSGEPLVHWCTQQLRKTFGLDVSEEIIQYVLSIESAEEIREYVTDLLQGNEGKKGQFIEELITKWQKNDQELISDPLQQCFKKDEILDGQKSGDHLKRGRKKGRNRQEVPAFTEPDTTAEVKTPFDLAKAQENSNSVKKKTKFVNLYTREGQDRLAVLLPGRHPCDCLGQKHKLINNCLICGRIVCEQEGSGPCLFCGTLVCTHEEQDILQRDSNKSQKLLKKLMSGVENSGKVDISTKDLLPHQELRIKSGLEKAIKHKDKLLEFDRTSIRRTQVIDDESDYFASDSNQWLSKLERETLQKREEELRELRHASRLSKKVTIDFAGRKILEEENSLAEYHSRLDETIQAIANGTLNQPLTKLDRSSEEPLGVLVNPNMYQSPPQWVDHTGAASQKKAFRSSGFGLEFNSFQHQLRIQDQEFQEGFDGGWCLSVHQPWASLLVRGIKRVEGRSWYTPHRGRLWIAATAKKPSPQEVSELQATYRLLRGKDVEFPNDYPSGCLLGCVDLIDCLSQKQFKEQFPDISQESDSPFVFICKNPQEMVVKFPIKGNPKIWKLDSKIHQGAKKGLMKQNKAV"
# Creates and prints empty vector
TRIP4_vector <- rep(NA, length(trip4_list))
TRIP4_vector
##  [1] NA NA NA NA NA NA NA NA NA NA
# Add info to the vector using for loop
for(i in 1:length(TRIP4_vector)){
  TRIP4_vector[i] <- trip4_list[[i]]
}
# Name the vector
names(TRIP4_vector) <- names(trip4_list)
# Turn sequences into vectors
humanTRIP4  <- fasta_cleaner(trip4_list[[1]], parse = F)
chimpanzeeTRIP4  <- fasta_cleaner(trip4_list[[2]], parse = F)
cattleTRIP4   <- fasta_cleaner(trip4_list[[3]], parse = F)
norwayratTRIP4   <- fasta_cleaner(trip4_list[[5]], parse = F)

Performing Global Pairwise Alignment on Sequences

# Aligning human seq vs chimp seq
align.human.vs.chimp <- Biostrings::pairwiseAlignment(
                  humanTRIP4,
                  chimpanzeeTRIP4)

# Aligning human seq vs cattle seq
align.human.vs.cattle <- Biostrings::pairwiseAlignment(
                  humanTRIP4,
                  cattleTRIP4)

# Aligning human seq vs Norway rat seq
align.human.vs.norwayrat <- Biostrings::pairwiseAlignment(
                  humanTRIP4,
                  norwayratTRIP4)

PID Table

#Percent sequence alignments between species

# PID of human vs chimp
Biostrings::pid(align.human.vs.chimp)
## [1] 99.82788
# PID of human vs cattle
Biostrings::pid(align.human.vs.cattle)
## [1] 87.62887
# PID of human vs norway rat
Biostrings::pid(align.human.vs.norwayrat)
## [1] 88.64028

Creating Global pairwise alignments for a matrix

# Aligning chimp seq vs Norway rat seq
align.chimp.vs.norwayrat <- Biostrings::pairwiseAlignment(
                  chimpanzeeTRIP4,
                  norwayratTRIP4)

# Aligning chimp seq vs cattle seq
align.chimp.vs.cattle <- Biostrings::pairwiseAlignment(
                  chimpanzeeTRIP4,
                  cattleTRIP4)

# Aligning Norway rat seq vs cattle seq
align.norwayrat.vs.cattle <- Biostrings::pairwiseAlignment(
                  norwayratTRIP4,
                  cattleTRIP4)

Generating a matrix of PID

pids <- c(1,                  NA,     NA,     NA,
          pid(align.human.vs.chimp),          1,     NA,     NA,
          pid(align.human.vs.cattle), pid(align.chimp.vs.norwayrat),      1,     NA,
          pid(align.human.vs.norwayrat), pid(align.chimp.vs.cattle), pid(align.norwayrat.vs.cattle), 1)

mat <- matrix(pids, nrow = 4, byrow = T)
row.names(mat) <- c("Homo","Pan","Rat","Bos")   
colnames(mat) <- c("Homo","Pan","Rat","Bos")   
pander::pander(mat)  
  Homo Pan Rat Bos
Homo 1 NA NA NA
Pan 99.83 1 NA NA
Rat 87.63 88.47 1 NA
Bos 88.64 87.46 83.68 1

PID Methods Comparison

Comparing different PID methods.

A comparison of chimps and human PID with different methods.

PID1 <- pid(align.human.vs.chimp, type = "PID1")
PID2 <- pid(align.human.vs.chimp, type = "PID2")
PID3 <- pid(align.human.vs.chimp, type = "PID3")
PID4 <- pid(align.human.vs.chimp, type = "PID4")
# Generates the dataframe

Method <- c("PID1", "PID2", "PID3", "PID4")

PID <- c(PID1, PID2, PID3, PID4)

Denominator <- c("(aligned positions + internal gap positions)", "(aligned positions)", "(length shorter sequence)", "(average length of the two sequences)")

# Convert vectors to dataframe
pidDF <- data.frame(Method,
                    PID,
                    Denominator)

# Print result (table)
pander::pander(pidDF)
Method PID Denominator
PID1 99.83 (aligned positions + internal gap positions)
PID2 99.83 (aligned positions)
PID3 99.83 (length shorter sequence)
PID4 99.83 (average length of the two sequences)

Multiple Sequence Alignment

Data Preparation

For use with R bioinformatics tools we need to convert our named vector to a string set using Biostrings::AAStringSet(). Note the _ss tag at the end of the object we’re assigning the output to, which designates this as a string set.

#Convert vector into stringset
TRIP4_vector_ss <- Biostrings::AAStringSet(TRIP4_vector)

Building Sequence Alignement (MSA)

# Uses default substitution matrix
TRIP4_align <- msa(TRIP4_vector_ss,
                     method = "ClustalW")
## use default substitution matrix

Cleaning / setting up an MSA

MSA produces a species MSA objects

# Info about TRIP4_align
class(TRIP4_align)
## [1] "MsaAAMultipleAlignment"
## attr(,"package")
## [1] "msa"
is(TRIP4_align)
## [1] "MsaAAMultipleAlignment" "AAMultipleAlignment"    "MsaMetaData"           
## [4] "MultipleAlignment"

Default output of MSA

TRIP4_align
## CLUSTAL 2.1  
## 
## Call:
##    msa(TRIP4_vector_ss, method = "ClustalW")
## 
## MsaAAMultipleAlignment with 10 rows and 590 columns
##      aln                                                   names
##  [1] MAVAGAVSGEPLVHWCTQQLRKTFG...PKIWKLDSKIHQGAKKGLMKQNKAV NP_057297.2
##  [2] MAVAGAVSGEPLVHWCTQQLRKTFG...PKIWKLDSKIHQGAKKGLMKQNKAV XP_510472.2
##  [3] MAVAGAASREQLVGWCTEQLRKSFG...PKIWKLDSKIHQGAKKGLMKQNKAG NP_001039816.1
##  [4] MAVAGAASRQQLVGWCTEQLRKSFG...PKIWKLDSKIHQGAKKGLMKQNKAG XP_005685718.1
##  [5] MAVAGAASREQLVGWCTQELRKTFG...PKIWKLDSKIHQGAKKGLMKQNKAV XP_535513.2
##  [6] MAVAGAASREQLVGWCTQQLRKTFG...PKIWKLDSKIHQGAKKGLMKQNKAV XP_014941048.1
##  [7] MAVAGAAYREPLVHWCTQQLQKTFA...PKIWKLDSKIHQGAKKGLMKQNKAV NP_062771.2
##  [8] MAVAGAASGEPLVHWCTQQLRKTFA...PKIWKLDSKIHQGAKKGLMKQNKAV NP_001128453.1
##  [9] MAAPG-----ALLDWCVRRLRGDFG...PKIWKLDSKIHQGAKKGLMKQKAVA XP_015134569.2
## [10] -------MSDSLLRWTVDKLKHGFG...HKIWKLESQSHQSAKKCLMQPA--- NP_001071043.2
##  Con MAVAGAASRE?LV?WCTQQLRKTFG...PKIWKLDSKIHQGAKKGLMKQNKAV Consensus

Change class of alignment

class(TRIP4_align) <- "MsaAAMultipleAlignment"

#class(TRIP4_align)

Convert to seqinr format

TRIP4_align_seqinr <- msaConvert(TRIP4_align, 
                                   type = "seqinr::alignment")

Display Information

class(TRIP4_align_seqinr)
## [1] "alignment"
is(TRIP4_align_seqinr)
## [1] "alignment"
# Display output with print_msa
compbio4all::print_msa(TRIP4_align_seqinr)
## [1] "MAVAGAVSGEPLVHWCTQQLRKTFGLDVSEEIIQYVLSIESAEEIREYVTDLLQGNEGKK 0"
## [1] "MAVAGAVSGEPLVHWCTQQLRKTFGLDVSEEIIQYVLSIESAEEIREYVTDLLQGNEGKK 0"
## [1] "MAVAGAASREQLVGWCTEQLRKSFGLDVSEEIIQYVLSIESAEEIREYVTDLLQGNEGKK 0"
## [1] "MAVAGAASRQQLVGWCTEQLRKSFGLDVSEEIIQYVLSIESAEEIREYVTDLLQGNEGKK 0"
## [1] "MAVAGAASREQLVGWCTQELRKTFGLDVSEEIMQYVLSIESAEEIREYVTDLLQGNEDKK 0"
## [1] "MAVAGAASREQLVGWCTQQLRKTFGLDVSEEIMQYVLSIESAEEIREYVTDLLQGNEDKK 0"
## [1] "MAVAGAAYREPLVHWCTQQLQKTFALDVSEEIIQYVLSIENAEEIREYVTDLLQGNEGKK 0"
## [1] "MAVAGAASGEPLVHWCTQQLRKTFALDVSEEIIQYVLSIESAEEIRDYVTDLLQGNEGKK 0"
## [1] "MAAPG-----ALLDWCVRRLRGDFGLDVGEEVVRYILSITSEDEIREYVVDLLQGTEGRK 0"
## [1] "-------MSDSLLRWTVDKLKHGFGLDASDDIVQYILSIDNADEIVEYVGDLLQGTEGNK 0"
## [1] " "
## [1] "GQFIEELITKWQKNDQELISDPLQQCFKKDEILDGQKSG--DHLKRGRKKGRNRQEVPAF 0"
## [1] "GQFIEELITKWQKNDQELISDPLQQCFKKDEILDGQKSG--DHLKRGRKKGRNRQEVPAF 0"
## [1] "GQFIEELVTKWQKNDQELTSDALQQSFKKD---DGQRSG--DQLKRSRRKGRNKQEAPAF 0"
## [1] "GQFIEELVTKWQKNDQELTSDALQQSFKKD---DGQRSG--DQLKRSRRKGRNKQEAPAF 0"
## [1] "GQFIEELITKWQKNDQELSSDALPQSFKKDEMLDGQRSG--DQLKRSRRKGRNRQEVPAF 0"
## [1] "GQFIEELIAKWQKNDQELSSDALQKSFKKDEMLDGQRSG--DQLKRSRRKGRNRQEVPAF 0"
## [1] "GQFIEDLITKWQKNDQEFISDSFQQCLRKDEILDGQRSV--DQLKRSRRKGRNKQEVPAF 0"
## [1] "GQFIEDLITKWQKKDQECISDSFQQCWKKDESLDGQRSV--DQLKRSRRKGRNKQEVPAF 0"
## [1] "GRFVEELLSRWQQSSQ-SPAEPLPAYRKKDETSESPRAG--DQAKKGKRKGRNKQETPAY 0"
## [1] "QEFVDELVQRWQ-RCQTQTSDGLGGVLRKEVVMEELDTAPKDTQKKSKRKGRNKQEIVTV 0"
## [1] " "
## [1] "TEPDT-TAEVKTPFDLAKAQEN-----SNSVKKKTKFVNLYTREGQDRLAVLLPGRHPCD 0"
## [1] "TEPDT-TAEVKTPFDLAKAQEN-----SNSVKKKTKFVNLYTREGQDRLAVLLPGRHPCD 0"
## [1] "TEPDT-TTEVKTPFDLAKAQDN-----SSSLKKKTKFVSLYTKEGQDKLAVLIPGRHPCD 0"
## [1] "TEPDT-TTEVKTPFDLAKAQDN-----SSSLKKKTKFVSLYTKEGQDKLAVLIPGRHSCD 0"
## [1] "AEPDT-TKEVKTPFDLAKAQEN-----SNSLKKKTKFVSLYTKEGQDRLAVLIPGRHPCD 0"
## [1] "AEPDT-TTEVKTPFDLAKAQEN-----SSSLKKKTKFVSLYTKEGQDRLAVLIPGRHPCD 0"
## [1] "PEPDV-AVEVKTPLDLAKAQES-----NNSVKKKTRFVNLYTREGQDKLAVLLPGRHPCD 0"
## [1] "PEPDV-TVEVKTPLDLAKAQEG-----NSSVKKKTRFVSLYTREGQDKLAVLLPGRHPCD 0"
## [1] "VEPSTHVEEVKTPLDLAKAQESSTASSSNAYKKKTKYVSLYTKEGQDRLAVLIPGRHACE 0"
## [1] "SQAEP--EPVRTPIDLMKAQESS---NSSSVKKKSKFVNLYAKEGKDKLAVLLPGRQACE 0"
## [1] " "
## [1] "CLGQKHKLINNCLICGRIVCEQEGSGPCLFCGTLVCTHEEQDILQRDSNKSQKLLKKLMS 0"
## [1] "CLGQKHKLINNCLICGRIVCEQEGSGPCLFCGTLVCTHEEQDILQRDSNKSQKLLKKLMS 0"
## [1] "CLGQKHKLINNCLVCGRIVCEQEGSGPCLFCGSLVCTHEERDILQRDSNKSQKLLKKLMS 0"
## [1] "CLGQKHKLINNCLICGRIVCEQEGSGPCLFCGSLVCTHEERDILQRDSNKSQKLLKKLMS 0"
## [1] "CLGQKHKLINNCLICGRIVCEQEGSGPCFFCGTLVCTREEQDILQRDSNKSQKLLKKLMS 0"
## [1] "CLGQKHKLINNCLICGRIVCEQEGSGPCFFCGTLVCTREEQDILQRDSNKSQKLLKKLMS 0"
## [1] "CLGQKHKLINNCLVCGRIVCEQEGSGPCLFCGSLVCTNEEQDILQRDSNKSQKLLKKLMS 0"
## [1] "CLGQKHKLINNCLVCGRIVCEQEGSGPCLFCGSLVCTNEEQDILQRDSNKSQKLLKKLMS 0"
## [1] "CLGQKHKLINNCLVCGRIVCEQEGSGPCLFCGSLVCTKEEQDILQRDSNKSQKLLKKLMA 0"
## [1] "CLAQKHRLINNCLSCGRIVCEQEGSGPCLFCGSLVCTNEEQEILQRDSNKSQKLRKKLMG 0"
## [1] " "
## [1] "GVENSGKVDISTKDLLPHQELRIKSGLEKAIKHKDKLLEFDRTSIRRTQVIDDESDYFAS 0"
## [1] "GVENSGKVDISTKDLLPHQELRIKSGLEKAIKHKDKLLEFDRTSIRRTQVIDDESDYFAS 0"
## [1] "GTENSGKVDISTKDLLPHQELRFKSGLEKAIKHKDKLLEFDRTSIRRTQVIDDESDYFAS 0"
## [1] "GTENAGKVDISTKDLLPHQELRFKSGLEKAIKHKDKLLEFDRTSIRRTQVIDDESDYFAS 0"
## [1] "GTENSGKVDISTKDLLPHQELRIKSGLEKAIKHKDKLLEFDRTSIRRTQVIDDEADYFAS 0"
## [1] "GTESSGKVDISTKDLLPNQELRIKSGLEKAIKHKDKLLEYDRTSIRRTQVIDDESDYFAS 0"
## [1] "GAETSGKVDVSTKDLLPHQESRMKSGLEKAIKHKEKLLEFDRTSIRRTQVIDDESDYFAS 0"
## [1] "GAENSGKVDISTKDHPPHQESRMKSGLEKAVKHKEKLLEFDRTSIRRTQVIDDESDYFAS 0"
## [1] "GAESSGNLDAISKDLLPRQEARLKAGLEMAVKHKDKLLEFDRTSVRRTQVIDDESDYFAT 0"
## [1] "--------EGTEREYLPHQESKMKAGLEKAVKHKDKLLEFDKNSVKRTQVLDDESDYFAT 0"
## [1] " "
## [1] "DSNQWLSKLERETLQKREEELRELRHASRLSKKVTIDFAGRKILEEENSLAEYHSRLDET 0"
## [1] "DSNQWLSKLERETLQKREEELRELRHASRLSKKVTIDFAGRKILEEENSLAEYHSRLDET 0"
## [1] "DSNQWLSKIEREAIQKREEELREFRHASRLSKKITIDFAGRKIVEDEDPLAEYHSKLDEA 0"
## [1] "DSNQWLSKIEREAIQKREEELREFRHASRLSKKITIDFAGRKIVEDEDPLAEYHSKLDEA 0"
## [1] "DSNQWLSKIERETLQKREEELREFRHMSRLSKKITIDFAGRKIVEDENPLAEYHSKLDET 0"
## [1] "DSNQWLSKIERETLQKREEELREFRHMSRLSKKITIDFAGRKIMEDENPLAEYHSKLDET 0"
## [1] "DSNQWLSKVEREMLQKREEELRELRHASRLSKKVTIDFAGRKILEDENPLAEYHSRLDET 0"
## [1] "DSNQWLSKVERDMLQKREEELRELRHASRLSKKVTIDFAGRKILEDENPLAEYHSRLDET 0"
## [1] "DSNQWLSKQEREALQKREQELQELRHASRLAKKITIDFAGRQILEEDSGMAEYHSKLDET 0"
## [1] "DSNQWLSPGEREALRKREEELRELRHASRKDRKITLDFAGRRVLDEGENLSPYYQKFDET 0"
## [1] " "
## [1] "IQAIANGTLNQPLTKLDRSSEEPLGVLVNPNMYQSPPQWVDHTGAASQKKAFRSSGFGLE 0"
## [1] "IQAIANGTLNQPLTKLDRSSEEPLGVLVNPNMYQSPPQWVDHTGAASQKKAFRSSGFGLE 0"
## [1] "IHAIANGTLNQPVTKLDRSSEEPLGLLVNPNLYQSPPQWIDQMGAASQKKAFHPAGSGLE 0"
## [1] "IHAIANGTLNQPLTKLDRSSEEPLGLLVNPNLYQPPPQWIDQTGAASQKKAFHPAGSGLE 0"
## [1] "IQAIANGTLNQPLTKLDRSSDEPLGVLVNPNLYQSPPQWIDQTGAASQKKTFHSAGSELE 0"
## [1] "IQAIANGTLNQPLTKLDRSSDEPLGVLVNPNLYQSPPQWIDQTGAASQKKAFHSGGSELE 0"
## [1] "IQAIASGTLNQSLVTLDRSCEEPLGVLVNPNMYQASPQWVDNTGSTPQKKTSLSAGPRLE 0"
## [1] "IQAIANGTLNQSLVTSDRSSEEPLGVLVNPNMYQAPPQWVDNTGSAPPKKTSLSVGPRLE 0"
## [1] "VEAINCGTLSQPAASPEAKMTLSSGVLVNPHLLQPAPLWVDQTGSLPLRRATHSMGAGEE 0"
## [1] "VKAINTGSFGQKAKHSAPADRQHFRELVNPNILQSAPEWVDVASSNPSRKSSSKAASGKE 0"
## [1] " "
## [1] "FNSFQHQLRIQDQEFQEGFDGGWCLSVHQPWASLLVRGIKRVEGRSWYTPHRGRLWIAAT 0"
## [1] "FNSFQHQLRIQDQEFQEGFDGGWCLSVHQPWASLLVRGIKRVEGRSWYTPHRGRLWIAAT 0"
## [1] "FSSSRHQFRIQDQKFQEGFDGGWCLSMHQPWASLLVRGIKRVEGRNWYTPHRGRLWIAAT 0"
## [1] "FSSCRHQFRIQDQEFQEGFDGGWCLSMHQPWASLLVRGIKRVEGRSWYTPHRGRLWIAAT 0"
## [1] "FNSYRHQLRIQDQEFQEGFDGGWCLSMHQPWASLLVRGIKRVEGRNWYTPHRGRLWIAAT 0"
## [1] "LNSYRHQLRIQDQEFQEGFDGGWCLSMHQPWASLLVRGIKRVEGRSWYTPHRGRLWIAAT 0"
## [1] "PSLHQHQLRIQDQEFQEGFDGGWCLSMHQPWASLLVRGIKRVEGRSWYTPHRGRLWIAAT 0"
## [1] "LSLHPHQLRIQDQEFQEGFDGGWCLSVHQPWASLLVKGIKRVEGRSWYTPHRGRLWIAAT 0"
## [1] "FGSERNRLRIQDRELQEISDDGWCLSMHQPWASLLIRGIKRVEGRTWYTSHRGRLWIAAT 0"
## [1] "TS-ERSRLRLQDKELQEIADGGWCLSMHQPWASLLVKGIKRVEGRTWYTSHRGRLWIAAA 0"
## [1] " "
## [1] "AKKPSPQEVSELQATYRLLRGKDVEFPNDYPSGCLLGCVDLIDCLSQKQFKEQFPDISQE 0"
## [1] "AKKPSPQEVSELQATYRLLRGKDVEFPNDYPSGCLLGCVDLIDCLSQKQFKEQFPDISQE 0"
## [1] "AKRPSPQEVSELQTTYLLLRGKGVEFPSDYPSGCLLGCVDLIDCLSQKQFKDQYPDISQE 0"
## [1] "AKRPSPQEVSELQTTYLLLRGKDVEFPNDYPSGCLLGCVDLIDCLSQKQFKDQYPDISQE 0"
## [1] "AKRPSPQEVSELQTTYRVLRGKDVEFPNDYPSGCLLGCVDLIDCLSQKQFKDQYPDMSQE 0"
## [1] "AKRPSPQEVSELQTTYRLLRGKDVEFPNDYPSGCLLGCVDLIDCLSQKQFKEQYPDMSQE 0"
## [1] "GKRPSPQEVSELQATYRLLRGKDVEFPNDYPSGCLLGCVDLIDCLSQKQFQEQFPDISQE 0"
## [1] "GKRPSTQEVSELQATYRLLRGKDVEFPNDYPSGCLLGCVDLIDCLSQKQFQEQFPDISQE 0"
## [1] "AKRPSPQEISELETTYRMLLRKDVEFPNEYPSGCLLGCVDVTDCLSQEQFNEQYPDLSQE 0"
## [1] "ANRPTPQEIAEVEAMYRHLYKHEPQFPAEYPTGCLLGCVNVSDCLSQEQFREQYPQISEE 0"
## [1] " "
## [1] "-SDSPFVFICKNPQEMVVKFPIKGNPKIWKLDSKIHQGAKKGLMKQNKAV 10"
## [1] "-SDSPYVFICKNPQEMVVKFPIKGNPKIWKLDSKIHQGAKKGLMKQNKAV 10"
## [1] "ESDSPFVFICTNPQEMIVKFPIKGNPKIWKLDSKIHQGAKKGLMKQNKAG 10"
## [1] "ESDSPFVFICTNPQEMIVKFPIKGNPKIWKLDSKIHQGAKKGLMKQNKAG 10"
## [1] "-SDSPFVFICTNPQEMIVKFPIKGNPKIWKLDSKIHQGAKKGLMKQNKAV 10"
## [1] "-SDSPFVFICTNPQEMIVKFPIKGNPKIWKLDSKIHQGAKKGLMKQNKAV 10"
## [1] "-SDSSFVFICKNPQEMVVKFPIKGNPKIWKLDSKIHQGAKKGLMKQNKAV 10"
## [1] "-SDSPFVFICKNPQEMVVKFPIKGNPKIWKLDSKIHQGAKKGLMKQNKAV 10"
## [1] "-SGSPFVFICTNPQEMVVKFPIKGKPKIWKLDSKIHQGAKKGLMKQKAVA 10"
## [1] "-STSPFVFICSNPQELVVKFPMKGKHKIWKLESQSHQSAKKCLMQPA--- 10"
## [1] " "

Finished MSA

# Display finished MSA resutlt

# Change class of the alignment
class(TRIP4_align) <- "AAMultipleAlignment"

ggmsa::ggmsa(TRIP4_align,
      start = 0, 
      end = 50)

Distance Matrix

Make a distance matrix

This produces a “dist” class object.

# Use MSA to calculate the genetic distance
TRIP4_dist <- seqinr::dist.alignment(TRIP4_align_seqinr, 
                                             matrix = "identity")

# Information about the object
is(TRIP4_dist)
## [1] "dist"     "oldClass"
class(TRIP4_dist)
## [1] "dist"

Round for display

# Round to 3 decimals
TRIP4_align_seqinr_rnd <- round(TRIP4_dist, 3)

# Display result
TRIP4_align_seqinr_rnd
##                NP_057297.2 XP_510472.2 NP_001039816.1 XP_005685718.1
## XP_510472.2          0.041                                          
## NP_001039816.1       0.343       0.346                              
## XP_005685718.1       0.335       0.338          0.144               
## XP_535513.2          0.308       0.310          0.279          0.276
## XP_014941048.1       0.308       0.310          0.288          0.279
## NP_062771.2          0.337       0.340          0.399          0.395
## NP_001128453.1       0.337       0.340          0.397          0.392
## XP_015134569.2       0.539       0.541          0.539          0.534
## NP_001071043.2       0.639       0.641          0.634          0.634
##                XP_535513.2 XP_014941048.1 NP_062771.2 NP_001128453.1
## XP_510472.2                                                         
## NP_001039816.1                                                      
## XP_005685718.1                                                      
## XP_535513.2                                                         
## XP_014941048.1       0.171                                          
## NP_062771.2          0.385          0.387                           
## NP_001128453.1       0.387          0.382       0.238               
## XP_015134569.2       0.532          0.532       0.553          0.558
## NP_001071043.2       0.646          0.646       0.628          0.637
##                XP_015134569.2
## XP_510472.2                  
## NP_001039816.1               
## XP_005685718.1               
## XP_535513.2                  
## XP_014941048.1               
## NP_062771.2                  
## NP_001128453.1               
## XP_015134569.2               
## NP_001071043.2          0.597

Phylogenetic Trees of Sequences

Build a phylogenetic tree from distance matrix

phyTree <- nj(TRIP4_dist)

Plotting phylogenetic trees

Plotting the tree.

# plot tree
# Include title
plot.phylo(phyTree, main="Phylogenetic Tree", 
            use.edge.length = F)

# Adding label
mtext(text = "TRIP4 family gene tree - rooted, no branch lengths")