Introduction

This code compiles summary information about the gene TRIP4 (thyroid hormone receptor interactor 4), which encodes the activating signal cointegrator 1 protein. The protein is involved mediating interactions with transcriptional coactivators and ligand-bound nuclear receptors. An MSA and a phylogenetic tree is also generated to show the evolutionary relationship betweeen the human version of the gene and its homologs in other species.

Resources/References

Key information use to make this script can be found here: * Refseq Gene: https://www.ncbi.nlm.nih.gov/gene/9325 * Refseq Homologene: https://www.ncbi.nlm.nih.gov/homologene?LinkName=nuccore_homologene&from_uid=1015232364 * UniProt: https://www.uniprot.org/uniprot/Q15650 * PDB: https://www.rcsb.org/structure/2E5O

Other resources include: * Neanderthal genome: http://neandertal.ensemblgenomes.org/index.html

Preparation

Load necessary packages:

Download and load drawProteins from Bioconductor

library(BiocManager)  #Allow access to Bioconductor packages

## Bioconductor version '3.13' is out-of-date; the current release version '3.14'
##   is available with R version '4.1'; see https://bioconductor.org/install

#install("drawProteins")
library(drawProteins)

Load other packages

# github packages
library(compbio4all)
library(ggmsa)

## Registered S3 methods overwritten by 'ggalt':
##   method                  from   
##   grid.draw.absoluteGrob  ggplot2
##   grobHeight.absoluteGrob ggplot2
##   grobWidth.absoluteGrob  ggplot2
##   grobX.absoluteGrob      ggplot2
##   grobY.absoluteGrob      ggplot2

# CRAN packages
library(rentrez)
library(seqinr)
library(ape)

## 
## Attaching package: 'ape'

## The following objects are masked from 'package:seqinr':
## 
##     as.alignment, consensus

library(pander)


library(ggplot2)

# Bioconductor packages
## msa
###The msa output is used to make a distance matrix and then phylogenetics trees
library(msa)

## Loading required package: Biostrings

## Loading required package: BiocGenerics

## Loading required package: parallel

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min

## Loading required package: S4Vectors

## Loading required package: stats4

## 
## Attaching package: 'S4Vectors'

## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname

## Loading required package: IRanges

## Loading required package: XVector

## Loading required package: GenomeInfoDb

## 
## Attaching package: 'Biostrings'

## The following object is masked from 'package:ape':
## 
##     complement

## The following object is masked from 'package:seqinr':
## 
##     translate

## The following object is masked from 'package:base':
## 
##     strsplit

## 
## Attaching package: 'msa'

## The following object is masked from 'package:BiocManager':
## 
##     version

library(drawProteins)

## Biostrings
library(Biostrings)


library(HGNChelper)

#Accession Number Table Accession numbers were obtained from RefSeq, Refseq Homlogene, UniProt and PDB. UniProt accession numbers can be found by searching for the gene name. PDB accessions can be found by searching with a UniProt accession or a gene name, though many proteins are not in PDB. The the Neanderthal genome database was searched but did not yield sequence information on. TRIP4.

Not available: * Neanderthal

#                  Refseq         Uniprot  PBD      sci name   common name  gene name   
trip4_table <- c("NP_057297.2", "Q15650", "2E5O", "Homo sapiens", "Human", "TRIP4", 
                "XP_510472.2", "K7BG15", "NA", "Pan troglodytes", "Chimpanzee", "TRIP4",
                "NP_001039816.1", "Q2KIC4", "NA", "Bos taurus", "Cattle", "TRIP4",
                "NP_062771.2","Q9QXN3","NA","Mus musculus","Mouse","TRIP4",
                "NP_001128453.1","B5DEP5","NA","Rattus norvegicus","Norway rat","TRIP4",
                "NP_001071043.2","Q08CA0","NA","Danio rerio","Zebrafish","TRIP4",
                "XP_535513.2","F6UYL9","NA","Canis lupus familiaris","Dog","TRIP4",
                "XP_014941048.1","A0A6J0A8Z9","NA","Acinonyx jubatus","Cheetah","TRIP4",   
                "XP_005685718.1","NA","NA","Capra hircus","Goat","TRIP4",
                "XP_015134569.2","NA","NA","Gallus gallus","Chicken","TRIP4"
)

# convert the vector to matrix using matrix()
trip4_table_matrix <- matrix(trip4_table,
                                  byrow = T,
                                  nrow = 10)

# convert the matrix to a dataframe using data.frame()
trip4_table <- data.frame(trip4_table_matrix, 
                     stringsAsFactors = F)

# name columns of dataframe using names() function
names(trip4_table) <- c("ncbi.protein.accession","UniProt.id", "PBD", "Species", "common.name","gene.name")

pander::pander(trip4_table)

Table continues below
ncbi.protein.accession	UniProt.id	PBD	Species
NP_057297.2	Q15650	2E5O	Homo sapiens
XP_510472.2	K7BG15	NA	Pan troglodytes
NP_001039816.1	Q2KIC4	NA	Bos taurus
NP_062771.2	Q9QXN3	NA	Mus musculus
NP_001128453.1	B5DEP5	NA	Rattus norvegicus
NP_001071043.2	Q08CA0	NA	Danio rerio
XP_535513.2	F6UYL9	NA	Canis lupus familiaris
XP_014941048.1	A0A6J0A8Z9	NA	Acinonyx jubatus
XP_005685718.1	NA	NA	Capra hircus
XP_015134569.2	NA	NA	Gallus gallus

common.name	gene.name
Human	TRIP4
Chimpanzee	TRIP4
Cattle	TRIP4
Mouse	TRIP4
Norway rat	TRIP4
Zebrafish	TRIP4
Dog	TRIP4
Cheetah	TRIP4
Goat	TRIP4
Chicken	TRIP4

Data Preparation

Download sequences

All sequences were downloaded using a wrapper compbio4all::entrez_fetch_list() which uses rentrez::entrez_fetch() to access NCBI databases.

#Download FASTA sequences
trip4_list <- compbio4all::entrez_fetch_list(db = "protein", id = trip4_table$ncbi.protein.accession, rettype = "fasta")

#Number of FASTA files obtained
length(trip4_list)

## [1] 10

# First entry (human)
trip4_human <- trip4_list[[1]]

Initial Data Cleaning

Remove FASTA header

# This step removes the FASTA header and performs cleaning steps required for particular analyses
for(i in 1:length(trip4_list)){
  trip4_list[[i]] <- compbio4all::fasta_cleaner(trip4_list[[i]], parse = F)
}

General Protein Information

# Protein for human

# Use a UniProt accession to download data from UniProt, which produces a list
Q15650_list <- drawProteins::get_features("Q15650")

## [1] "Download has worked"

is(Q15650_list)

## [1] "list"             "vector"           "list_OR_List"     "vector_OR_Vector"
## [5] "vector_OR_factor"

# Convert raw data from the webpage to dataframe
my_prot_df <- drawProteins::feature_to_dataframe(Q15650_list)

# View dataframe
my_prot_df[,-2]

##                     type begin end length accession   entryName taxid order
## featuresTemp    INIT_MET     1   1      0    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.1     CHAIN     2 581    579    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.2    DOMAIN   437 531     94    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.3   ZN_FING   171 187     16    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.4    REGION    97 118     21    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.5    REGION   200 300    100    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.6    REGION   300 400    100    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.7   MOD_RES     2   2      0    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.8   MOD_RES   276 276      0    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.9   MOD_RES   289 289      0    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.10  MOD_RES   341 341      0    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.11 CROSSLNK   324 324      0    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.12 CROSSLNK   325 325      0    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.13 CROSSLNK   334 334      0    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.14 CROSSLNK   367 367      0    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.15 CONFLICT   157 157      0    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.16 CONFLICT   164 164      0    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.17 CONFLICT   475 475      0    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.18   STRAND   435 439      4    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.19    HELIX   443 448      5    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.20   STRAND   454 459      5    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.21   STRAND   465 471      6    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.22    HELIX   478 492     14    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.23   STRAND   504 518     14    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.24    HELIX   521 525      4    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.25     TURN   530 532      2    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.26   STRAND   535 546     11    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.27   STRAND   557 561      4    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.28    HELIX   564 572      8    Q15650 TRIP4_HUMAN  9606     1
## featuresTemp.29    HELIX   573 575      2    Q15650 TRIP4_HUMAN  9606     1

Protein diagram

# Key domain 
my_canvas <- draw_canvas(my_prot_df)  
my_canvas <- draw_chains(my_canvas, my_prot_df, label_size = 2.5)
my_canvas <- draw_domains(my_canvas, my_prot_df)
my_canvas

Dotplot

# Create vector and FASTA Clean the TRIP4 human protein
TRIP4_human_vector <- compbio4all::fasta_cleaner(trip4_human)

# Default dotplot
seqinr::dotPlot(TRIP4_human_vector,
                TRIP4_human_vector)

# set up 2 x 2 grid and show different values for dotplot settings
par(mfrow = c(2,2), 
    mar = c(0,0,2,1))

# plot 1: TRIP4 - Defaults
dotPlot(TRIP4_human_vector,
        TRIP4_human_vector,
        wsize = 1, 
        wstep = 1,
        nmatch = 1, 
        xlab = "trip4_human_vector",
        ylab = "trip4_human_vector",
        main = "TRIP4 Defaults")

# plot 2: TRIP4 - size = 10, nmatch = 1
dotPlot(TRIP4_human_vector,
        TRIP4_human_vector, 
        wsize = 10 , 
        wstep = 1,
        nmatch = 1, 
        xlab = "trip4_human_vector",
        ylab = "trip4_human_vector",
        main = " TRIP4- size = 10, nmatch = 1")

# plot 3: TRIP4 - size = 10 , nmatch = 5
dotPlot(TRIP4_human_vector,
        TRIP4_human_vector, 
        wsize = 10, 
        wstep = 1,
        nmatch = 5, 
        xlab = "trip4_human_vector",
        ylab = "trip4_human_vector",
        main = "TRIP4 - size = 10 , nmatch = 5")

# plot 4: TRIP4 - size = 20, nmatch = 5
dotPlot(TRIP4_human_vector,
        TRIP4_human_vector, 
        wsize = 20,
        wstep = 1,
        nmatch = 5,
        xlab = "trip4_human_vector",
        ylab = "trip4_human_vector",
        main = "TRIP4 - size = 20, nmatch = 5")

# Best version of the dot plot
par(mfrow = c(1,1),
    mar = c(4,4,4,4))

dotPlot(TRIP4_human_vector,
        TRIP4_human_vector,
        wsize = 20, 
        wstep = 1,
        nmatch = 5, 
        xlab = "trip4_human_vector",
        ylab = "trip4_human_vector",
       # main = "TRIP4 Dotplot"
        main = "TRIP4 - window = 20, nmatch = 5")

Protein Properties Compiled from Database

Note: There was no information available on the presence of disorganized regions or tandem repeats.

## Make table consisitng of protein properties from different databases

trip4_protProp <- c("Pfam","There are two interesting domains presented in this database. The first is a zf-C2HC5 domain from AA 168-216. The second is an ASCH domain from AA 437-551.","http://pfam.xfam.org/protein/TRIP4_HUMAN",
                          "Uniprot","TRIP4 is a protein coding gene for Activating signal cointegrator 1, found in the subcellular locations - nucleus and cytoplasm/cytosol.","https://www.uniprot.org/uniprot/Q15650",
                          "Alphafold"," Alphafold predicts with moderate confidence that the Activating signal cointegrator 1 protein, which is encoded by the TRIP4 gene consists mainly of alpha helices and some beta strands.","https://alphafold.ebi.ac.uk/entry/Q15650",
                    "Disprot", "NA", "NA",
                    "RepeatsDB", "NA", "NA")

# convert the vector to matrix using matrix()
trip4_protProp_matrix <- matrix(trip4_protProp,
                                  byrow = T,
                                  nrow = 5)

# convert the matrix to a dataframe using data.frame()
trip4_protProp <- data.frame(trip4_protProp_matrix, 
                     stringsAsFactors = F)

# name columns of dataframe using names() function
names(trip4_protProp) <- c("Source","Protein Property", "Link")

pander::pander(trip4_protProp)

Table continues below
Source	Protein Property
Pfam	There are two interesting domains presented in this database. The first is a zf-C2HC5 domain from AA 168-216. The second is an ASCH domain from AA 437-551.
Uniprot	TRIP4 is a protein coding gene for Activating signal cointegrator 1, found in the subcellular locations - nucleus and cytoplasm/cytosol.
Alphafold	Alphafold predicts with moderate confidence that the Activating signal cointegrator 1 protein, which is encoded by the TRIP4 gene consists mainly of alpha helices and some beta strands.
Disprot	NA
RepeatsDB	NA

Link
http://pfam.xfam.org/protein/TRIP4_HUMAN
https://www.uniprot.org/uniprot/Q15650
https://alphafold.ebi.ac.uk/entry/Q15650
NA
NA

Protein Feature Prediction

Multivariate statistical techniques were used to confirm the information about protein structure and location in the online database.

Uniprot (which uses http://www.csbio.sjtu.edu.cn/bioinf/Cell-PLoc-2/) indicates that the protein is found in the nucleus and cytoplasm.

Secondary Structure Prediction

#Make 2 vectors of amino acids. 
aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
            "L","K","M","F","P","S","T","W","Y","V")

aa.1.2 <- c("A","R","N","D","C","Q","E","G","H","I",
            "L","K","M","F","P","S","T","W","Y","V")

# Use logical check to make sure vectors are same length and contain same values
length(aa.1.1) == length(aa.1.2)

## [1] TRUE

aa.1.1 == aa.1.2

##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE

# Shows number of unique values present 
unique(aa.1.1)

##  [1] "A" "R" "N" "D" "C" "Q" "E" "G" "H" "I" "L" "K" "M" "F" "P" "S" "T" "W" "Y"
## [20] "V"

# Show both vectors have same number of unique values
length(unique(aa.1.1)) == length(unique(aa.1.2))

## [1] TRUE

# Creates a vector of each amino acid frequency in the database of proteins of each type that Chou used
# Alpha proteins
alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91, 
           221, 249, 48, 123, 82, 122, 119, 33, 63, 167)

# Check against Chou's total
sum(alpha) == 2447

## [1] TRUE

# Beta proteins
beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120, 
          177, 115, 16, 85, 127, 341, 253, 44, 110, 229)

# check against Chou's total
sum(beta) == 2776

## [1] TRUE

# Alpha and Beta
a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
              110, 112, 25, 52, 71, 126, 117, 30, 108, 123)

# Check against Chou's total
sum(a.plus.b) == 1889

## [1] TRUE

# alpha/beta
a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239, 
             339, 321, 91, 158, 188, 327, 238, 72, 130, 378)

# Check against Chou's total
sum(a.div.b) == 4333

## [1] TRUE

# Create a dataframe of the data
chouDF <- data.frame(aa.1.1, alpha, beta, a.plus.b, a.div.b)

# Print result (dataframe)
pander::pander(chouDF)

aa.1.1	alpha	beta	a.plus.b	a.div.b
A	285	203	175	361
R	53	67	78	146
N	97	139	120	183
D	163	121	111	244
C	22	75	74	63
Q	67	122	74	114
E	134	86	86	257
G	197	297	171	377
H	111	49	33	107
I	91	120	93	239
L	221	177	110	339
K	249	115	112	321
M	48	16	25	91
F	123	85	52	158
P	82	127	71	188
S	122	341	126	327
T	119	253	117	238
W	33	44	30	72
Y	63	110	108	130
V	167	229	123	378

Calculate proportions for each of the four protein fold types

alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)

Create dataframe

#dataframe
aa.prop <- data.frame(alpha.prop,
                      beta.prop,
                      a.plus.b.prop,
                      a.div.b)
#row labels
row.names(aa.prop) <- aa.1.1

# Print dataframe
pander(aa.prop)

	alpha.prop	beta.prop	a.plus.b.prop	a.div.b
A	0.1165	0.07313	0.09264	0.08331
R	0.02166	0.02414	0.04129	0.03369
N	0.03964	0.05007	0.06353	0.04223
D	0.06661	0.04359	0.05876	0.05631
C	0.008991	0.02702	0.03917	0.01454
Q	0.02738	0.04395	0.03917	0.02631
E	0.05476	0.03098	0.04553	0.05931
G	0.08051	0.107	0.09052	0.08701
H	0.04536	0.01765	0.01747	0.02469
I	0.03719	0.04323	0.04923	0.05516
L	0.09031	0.06376	0.05823	0.07824
K	0.1018	0.04143	0.05929	0.07408
M	0.01962	0.005764	0.01323	0.021
F	0.05027	0.03062	0.02753	0.03646
P	0.03351	0.04575	0.03759	0.04339
S	0.04986	0.1228	0.0667	0.07547
T	0.04863	0.09114	0.06194	0.05493
W	0.01349	0.01585	0.01588	0.01662
Y	0.02575	0.03963	0.05717	0.03
V	0.06825	0.08249	0.06511	0.08724

Determine the number of each amino acid in the protein

table(TRIP4_human_vector)

## TRIP4_human_vector
##  A  C  D  E  F  G  H  I  K  L  M  N  P  Q  R  S  T  V  W  Y 
## 26 15 34 49 22 38 14 30 54 60  5 21 25 42 32 42 24 30  9  9

# Convert table to vector
table.to.vec <- function(table_x){
  table_names <- attr(table_x, "dimnames")[[1]]
  table_vect <- as.vector(table_x)
  names(table_vect) <- table_names
  return(table_vect)
}

Create a table of amino acid frequencies and convert it to a vector.

TRIP4_human_table <- table(TRIP4_human_vector)/length(TRIP4_human_vector)
TRIP4.human.aa.freq <- table.to.vec(TRIP4_human_table)

# Print result
TRIP4.human.aa.freq

##           A           C           D           E           F           G 
## 0.044750430 0.025817556 0.058519793 0.084337349 0.037865749 0.065404475 
##           H           I           K           L           M           N 
## 0.024096386 0.051635112 0.092943201 0.103270224 0.008605852 0.036144578 
##           P           Q           R           S           T           V 
## 0.043029260 0.072289157 0.055077453 0.072289157 0.041308090 0.051635112 
##           W           Y 
## 0.015490534 0.015490534

# Check for presence of U, which is an unknown amino acid 
aa.names <- names(TRIP4.human.aa.freq)
any(aa.names == "U") #Check for any amino acids that has U

## [1] FALSE

# Result returns FALSE, so no "U" is present

# Add protein to frequency table
aa.prop$TRIP4.human.aa.freq <- TRIP4.human.aa.freq
pander::pander(aa.prop)

	alpha.prop	beta.prop	a.plus.b.prop	a.div.b	TRIP4.human.aa.freq
A	0.1165	0.07313	0.09264	0.08331	0.04475
R	0.02166	0.02414	0.04129	0.03369	0.02582
N	0.03964	0.05007	0.06353	0.04223	0.05852
D	0.06661	0.04359	0.05876	0.05631	0.08434
C	0.008991	0.02702	0.03917	0.01454	0.03787
Q	0.02738	0.04395	0.03917	0.02631	0.0654
E	0.05476	0.03098	0.04553	0.05931	0.0241
G	0.08051	0.107	0.09052	0.08701	0.05164
H	0.04536	0.01765	0.01747	0.02469	0.09294
I	0.03719	0.04323	0.04923	0.05516	0.1033
L	0.09031	0.06376	0.05823	0.07824	0.008606
K	0.1018	0.04143	0.05929	0.07408	0.03614
M	0.01962	0.005764	0.01323	0.021	0.04303
F	0.05027	0.03062	0.02753	0.03646	0.07229
P	0.03351	0.04575	0.03759	0.04339	0.05508
S	0.04986	0.1228	0.0667	0.07547	0.07229
T	0.04863	0.09114	0.06194	0.05493	0.04131
W	0.01349	0.01585	0.01588	0.01662	0.05164
Y	0.02575	0.03963	0.05717	0.03	0.01549
V	0.06825	0.08249	0.06511	0.08724	0.01549

Calculating Similarities with Different Functions

#Two custom functions: the first one calculates correlates between two columns of our table, and the second one to calculates correlation similarities

chou_cor <- function(x,y){
  numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)
}

chou_cosine <- function(z.1, z.2){
  z.1.abs <- sqrt(sum(z.1^2))
  z.2.abs <- sqrt(sum(z.2^2))
  my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs)
  return(my.cosine)
}

#Calculates and displays correlation between each column

corr.alpha <- chou_cor(aa.prop[,5], aa.prop[,1])
corr.beta  <- chou_cor(aa.prop[,5], aa.prop[,2])
corr.apb   <- chou_cor(aa.prop[,5], aa.prop[,3])
corr.adb   <- chou_cor(aa.prop[,5], aa.prop[,4])

# Display result for alpha
corr.alpha

## [1] 0.739012

# Display result for beta
corr.beta

## [1] 0.7483259

# Display result for apb
corr.apb

## [1] 0.7825123

# Display result for adb
corr.adb

## [1] 0.7662036

Calculates and displays the cosine similarity

cos.alpha <- chou_cosine(aa.prop[,5], aa.prop[,1])
cos.beta  <- chou_cosine(aa.prop[,5], aa.prop[,2])
cos.apb   <- chou_cosine(aa.prop[,5], aa.prop[,3])
cos.adb   <- chou_cosine(aa.prop[,5], aa.prop[,4])

# Display cosine similarities results
cos.alpha

## [1] 0.739012

cos.beta

## [1] 0.7483259

cos.apb

## [1] 0.7825123

cos.adb

## [1] 0.7662036

Calculate Euclidean distance

# Euclidean distance
aa.prop.flipped <- t(aa.prop)
round(aa.prop.flipped,2)

##                        A    R    N    D    C    Q    E    G    H    I    L    K
## alpha.prop          0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09 0.10
## beta.prop           0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06 0.04
## a.plus.b.prop       0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06 0.06
## a.div.b             0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08 0.07
## TRIP4.human.aa.freq 0.04 0.03 0.06 0.08 0.04 0.07 0.02 0.05 0.09 0.10 0.01 0.04
##                        M    F    P    S    T    W    Y    V
## alpha.prop          0.02 0.05 0.03 0.05 0.05 0.01 0.03 0.07
## beta.prop           0.01 0.03 0.05 0.12 0.09 0.02 0.04 0.08
## a.plus.b.prop       0.01 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## a.div.b             0.02 0.04 0.04 0.08 0.05 0.02 0.03 0.09
## TRIP4.human.aa.freq 0.04 0.07 0.06 0.07 0.04 0.05 0.02 0.02

Calculate Distance Matrix

dist(aa.prop.flipped, method = "euclidean")

##                     alpha.prop  beta.prop a.plus.b.prop    a.div.b
## beta.prop           0.13342098                                    
## a.plus.b.prop       0.09281824 0.08289406                         
## a.div.b             0.06699039 0.08659174    0.06175113           
## TRIP4.human.aa.freq 0.18410888 0.18214298    0.16315846 0.17061508

Calculates and displays individual distances

dist.alpha <- dist((aa.prop.flipped[c(1,5),]),  method = "euclidean")
dist.beta  <- dist((aa.prop.flipped[c(2,5),]),  method = "euclidean")
dist.apb   <- dist((aa.prop.flipped[c(3,5),]),  method = "euclidean")
dist.adb  <- dist((aa.prop.flipped[c(4,5),]), method = "euclidean")

# Display results
dist.alpha

##                     alpha.prop
## TRIP4.human.aa.freq  0.1841089

dist.beta

##                     beta.prop
## TRIP4.human.aa.freq  0.182143

dist.apb

##                     a.plus.b.prop
## TRIP4.human.aa.freq     0.1631585

dist.adb

##                       a.div.b
## TRIP4.human.aa.freq 0.1706151

# Compile info and rounds it. More readble
fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")

corr.sim <- round(c(corr.alpha,corr.beta,corr.apb,corr.adb),5)
cosine.sim <- round(c(cos.alpha,cos.beta,cos.apb,cos.adb),5)
Euclidean.dist <- round(c(dist.alpha,dist.beta,dist.apb,dist.adb),5)

sim.sum <- c("","","most.sim","")
dist.sum <- c("","","min.dist","")

df2 <- data.frame(fold.type,
           corr.sim ,
           cosine.sim ,
           Euclidean.dist ,
           sim.sum ,
           dist.sum )

# Display result
pander(df2)

fold.type	corr.sim	cosine.sim	Euclidean.dist	sim.sum	dist.sum
alpha	0.739	0.739	0.1841
beta	0.7483	0.7483	0.1821
alpha plus beta	0.7825	0.7825	0.1632	most.sim	min.dist
alpha/beta	0.7662	0.7662	0.1706

Percent Identity Comparisons (PID)

Data Preparation

# Info about list
names(trip4_list)

##  [1] "NP_057297.2"    "XP_510472.2"    "NP_001039816.1" "NP_062771.2"   
##  [5] "NP_001128453.1" "NP_001071043.2" "XP_535513.2"    "XP_014941048.1"
##  [9] "XP_005685718.1" "XP_015134569.2"

length(trip4_list)

## [1] 10

# First entry data
trip4_list[[1]]

## [1] "MAVAGAVSGEPLVHWCTQQLRKTFGLDVSEEIIQYVLSIESAEEIREYVTDLLQGNEGKKGQFIEELITKWQKNDQELISDPLQQCFKKDEILDGQKSGDHLKRGRKKGRNRQEVPAFTEPDTTAEVKTPFDLAKAQENSNSVKKKTKFVNLYTREGQDRLAVLLPGRHPCDCLGQKHKLINNCLICGRIVCEQEGSGPCLFCGTLVCTHEEQDILQRDSNKSQKLLKKLMSGVENSGKVDISTKDLLPHQELRIKSGLEKAIKHKDKLLEFDRTSIRRTQVIDDESDYFASDSNQWLSKLERETLQKREEELRELRHASRLSKKVTIDFAGRKILEEENSLAEYHSRLDETIQAIANGTLNQPLTKLDRSSEEPLGVLVNPNMYQSPPQWVDHTGAASQKKAFRSSGFGLEFNSFQHQLRIQDQEFQEGFDGGWCLSVHQPWASLLVRGIKRVEGRSWYTPHRGRLWIAATAKKPSPQEVSELQATYRLLRGKDVEFPNDYPSGCLLGCVDLIDCLSQKQFKEQFPDISQESDSPFVFICKNPQEMVVKFPIKGNPKIWKLDSKIHQGAKKGLMKQNKAV"

# Creates and prints empty vector
TRIP4_vector <- rep(NA, length(trip4_list))
TRIP4_vector

##  [1] NA NA NA NA NA NA NA NA NA NA

# Add info to the vector using for loop
for(i in 1:length(TRIP4_vector)){
  TRIP4_vector[i] <- trip4_list[[i]]
}

# Name the vector
names(TRIP4_vector) <- names(trip4_list)

# Turn sequences into vectors
humanTRIP4  <- fasta_cleaner(trip4_list[[1]], parse = F)
chimpanzeeTRIP4  <- fasta_cleaner(trip4_list[[2]], parse = F)
cattleTRIP4   <- fasta_cleaner(trip4_list[[3]], parse = F)
norwayratTRIP4   <- fasta_cleaner(trip4_list[[5]], parse = F)

Performing Global Pairwise Alignment on Sequences

# Aligning human seq vs chimp seq
align.human.vs.chimp <- Biostrings::pairwiseAlignment(
                  humanTRIP4,
                  chimpanzeeTRIP4)

# Aligning human seq vs cattle seq
align.human.vs.cattle <- Biostrings::pairwiseAlignment(
                  humanTRIP4,
                  cattleTRIP4)

# Aligning human seq vs Norway rat seq
align.human.vs.norwayrat <- Biostrings::pairwiseAlignment(
                  humanTRIP4,
                  norwayratTRIP4)

PID Table

#Percent sequence alignments between species

# PID of human vs chimp
Biostrings::pid(align.human.vs.chimp)

## [1] 99.82788

# PID of human vs cattle
Biostrings::pid(align.human.vs.cattle)

## [1] 87.62887

# PID of human vs norway rat
Biostrings::pid(align.human.vs.norwayrat)

## [1] 88.64028

Creating Global pairwise alignments for a matrix

# Aligning chimp seq vs Norway rat seq
align.chimp.vs.norwayrat <- Biostrings::pairwiseAlignment(
                  chimpanzeeTRIP4,
                  norwayratTRIP4)

# Aligning chimp seq vs cattle seq
align.chimp.vs.cattle <- Biostrings::pairwiseAlignment(
                  chimpanzeeTRIP4,
                  cattleTRIP4)

# Aligning Norway rat seq vs cattle seq
align.norwayrat.vs.cattle <- Biostrings::pairwiseAlignment(
                  norwayratTRIP4,
                  cattleTRIP4)

Generating a matrix of PID

pids <- c(1,                  NA,     NA,     NA,
          pid(align.human.vs.chimp),          1,     NA,     NA,
          pid(align.human.vs.cattle), pid(align.chimp.vs.norwayrat),      1,     NA,
          pid(align.human.vs.norwayrat), pid(align.chimp.vs.cattle), pid(align.norwayrat.vs.cattle), 1)

mat <- matrix(pids, nrow = 4, byrow = T)
row.names(mat) <- c("Homo","Pan","Rat","Bos")   
colnames(mat) <- c("Homo","Pan","Rat","Bos")   
pander::pander(mat)

	Homo	Pan	Rat	Bos
Homo	1	NA	NA	NA
Pan	99.83	1	NA	NA
Rat	87.63	88.47	1	NA
Bos	88.64	87.46	83.68	1

PID Methods Comparison

Comparing different PID methods.

A comparison of chimps and human PID with different methods.

PID1 <- pid(align.human.vs.chimp, type = "PID1")
PID2 <- pid(align.human.vs.chimp, type = "PID2")
PID3 <- pid(align.human.vs.chimp, type = "PID3")
PID4 <- pid(align.human.vs.chimp, type = "PID4")

# Generates the dataframe

Method <- c("PID1", "PID2", "PID3", "PID4")

PID <- c(PID1, PID2, PID3, PID4)

Denominator <- c("(aligned positions + internal gap positions)", "(aligned positions)", "(length shorter sequence)", "(average length of the two sequences)")

# Convert vectors to dataframe
pidDF <- data.frame(Method,
                    PID,
                    Denominator)

# Print result (table)
pander::pander(pidDF)

Method	PID	Denominator
PID1	99.83	(aligned positions + internal gap positions)
PID2	99.83	(aligned positions)
PID3	99.83	(length shorter sequence)
PID4	99.83	(average length of the two sequences)

Multiple Sequence Alignment

Data Preparation

For use with R bioinformatics tools we need to convert our named vector to a string set using Biostrings::AAStringSet(). Note the _ss tag at the end of the object we’re assigning the output to, which designates this as a string set.

#Convert vector into stringset
TRIP4_vector_ss <- Biostrings::AAStringSet(TRIP4_vector)

Building Sequence Alignement (MSA)

# Uses default substitution matrix
TRIP4_align <- msa(TRIP4_vector_ss,
                     method = "ClustalW")

## use default substitution matrix

Cleaning / setting up an MSA

MSA produces a species MSA objects

# Info about TRIP4_align
class(TRIP4_align)

## [1] "MsaAAMultipleAlignment"
## attr(,"package")
## [1] "msa"

is(TRIP4_align)

## [1] "MsaAAMultipleAlignment" "AAMultipleAlignment"    "MsaMetaData"           
## [4] "MultipleAlignment"

Default output of MSA

TRIP4_align

## CLUSTAL 2.1  
## 
## Call:
##    msa(TRIP4_vector_ss, method = "ClustalW")
## 
## MsaAAMultipleAlignment with 10 rows and 590 columns
##      aln                                                   names
##  [1] MAVAGAVSGEPLVHWCTQQLRKTFG...PKIWKLDSKIHQGAKKGLMKQNKAV NP_057297.2
##  [2] MAVAGAVSGEPLVHWCTQQLRKTFG...PKIWKLDSKIHQGAKKGLMKQNKAV XP_510472.2
##  [3] MAVAGAASREQLVGWCTEQLRKSFG...PKIWKLDSKIHQGAKKGLMKQNKAG NP_001039816.1
##  [4] MAVAGAASRQQLVGWCTEQLRKSFG...PKIWKLDSKIHQGAKKGLMKQNKAG XP_005685718.1
##  [5] MAVAGAASREQLVGWCTQELRKTFG...PKIWKLDSKIHQGAKKGLMKQNKAV XP_535513.2
##  [6] MAVAGAASREQLVGWCTQQLRKTFG...PKIWKLDSKIHQGAKKGLMKQNKAV XP_014941048.1
##  [7] MAVAGAAYREPLVHWCTQQLQKTFA...PKIWKLDSKIHQGAKKGLMKQNKAV NP_062771.2
##  [8] MAVAGAASGEPLVHWCTQQLRKTFA...PKIWKLDSKIHQGAKKGLMKQNKAV NP_001128453.1
##  [9] MAAPG-----ALLDWCVRRLRGDFG...PKIWKLDSKIHQGAKKGLMKQKAVA XP_015134569.2
## [10] -------MSDSLLRWTVDKLKHGFG...HKIWKLESQSHQSAKKCLMQPA--- NP_001071043.2
##  Con MAVAGAASRE?LV?WCTQQLRKTFG...PKIWKLDSKIHQGAKKGLMKQNKAV Consensus

Change class of alignment

class(TRIP4_align) <- "MsaAAMultipleAlignment"

#class(TRIP4_align)

Convert to seqinr format

TRIP4_align_seqinr <- msaConvert(TRIP4_align, 
                                   type = "seqinr::alignment")

Display Information

class(TRIP4_align_seqinr)

## [1] "alignment"

is(TRIP4_align_seqinr)

## [1] "alignment"

# Display output with print_msa
compbio4all::print_msa(TRIP4_align_seqinr)

## [1] "MAVAGAVSGEPLVHWCTQQLRKTFGLDVSEEIIQYVLSIESAEEIREYVTDLLQGNEGKK 0"
## [1] "MAVAGAVSGEPLVHWCTQQLRKTFGLDVSEEIIQYVLSIESAEEIREYVTDLLQGNEGKK 0"
## [1] "MAVAGAASREQLVGWCTEQLRKSFGLDVSEEIIQYVLSIESAEEIREYVTDLLQGNEGKK 0"
## [1] "MAVAGAASRQQLVGWCTEQLRKSFGLDVSEEIIQYVLSIESAEEIREYVTDLLQGNEGKK 0"
## [1] "MAVAGAASREQLVGWCTQELRKTFGLDVSEEIMQYVLSIESAEEIREYVTDLLQGNEDKK 0"
## [1] "MAVAGAASREQLVGWCTQQLRKTFGLDVSEEIMQYVLSIESAEEIREYVTDLLQGNEDKK 0"
## [1] "MAVAGAAYREPLVHWCTQQLQKTFALDVSEEIIQYVLSIENAEEIREYVTDLLQGNEGKK 0"
## [1] "MAVAGAASGEPLVHWCTQQLRKTFALDVSEEIIQYVLSIESAEEIRDYVTDLLQGNEGKK 0"
## [1] "MAAPG-----ALLDWCVRRLRGDFGLDVGEEVVRYILSITSEDEIREYVVDLLQGTEGRK 0"
## [1] "-------MSDSLLRWTVDKLKHGFGLDASDDIVQYILSIDNADEIVEYVGDLLQGTEGNK 0"
## [1] " "
## [1] "GQFIEELITKWQKNDQELISDPLQQCFKKDEILDGQKSG--DHLKRGRKKGRNRQEVPAF 0"
## [1] "GQFIEELITKWQKNDQELISDPLQQCFKKDEILDGQKSG--DHLKRGRKKGRNRQEVPAF 0"
## [1] "GQFIEELVTKWQKNDQELTSDALQQSFKKD---DGQRSG--DQLKRSRRKGRNKQEAPAF 0"
## [1] "GQFIEELVTKWQKNDQELTSDALQQSFKKD---DGQRSG--DQLKRSRRKGRNKQEAPAF 0"
## [1] "GQFIEELITKWQKNDQELSSDALPQSFKKDEMLDGQRSG--DQLKRSRRKGRNRQEVPAF 0"
## [1] "GQFIEELIAKWQKNDQELSSDALQKSFKKDEMLDGQRSG--DQLKRSRRKGRNRQEVPAF 0"
## [1] "GQFIEDLITKWQKNDQEFISDSFQQCLRKDEILDGQRSV--DQLKRSRRKGRNKQEVPAF 0"
## [1] "GQFIEDLITKWQKKDQECISDSFQQCWKKDESLDGQRSV--DQLKRSRRKGRNKQEVPAF 0"
## [1] "GRFVEELLSRWQQSSQ-SPAEPLPAYRKKDETSESPRAG--DQAKKGKRKGRNKQETPAY 0"
## [1] "QEFVDELVQRWQ-RCQTQTSDGLGGVLRKEVVMEELDTAPKDTQKKSKRKGRNKQEIVTV 0"
## [1] " "
## [1] "TEPDT-TAEVKTPFDLAKAQEN-----SNSVKKKTKFVNLYTREGQDRLAVLLPGRHPCD 0"
## [1] "TEPDT-TAEVKTPFDLAKAQEN-----SNSVKKKTKFVNLYTREGQDRLAVLLPGRHPCD 0"
## [1] "TEPDT-TTEVKTPFDLAKAQDN-----SSSLKKKTKFVSLYTKEGQDKLAVLIPGRHPCD 0"
## [1] "TEPDT-TTEVKTPFDLAKAQDN-----SSSLKKKTKFVSLYTKEGQDKLAVLIPGRHSCD 0"
## [1] "AEPDT-TKEVKTPFDLAKAQEN-----SNSLKKKTKFVSLYTKEGQDRLAVLIPGRHPCD 0"
## [1] "AEPDT-TTEVKTPFDLAKAQEN-----SSSLKKKTKFVSLYTKEGQDRLAVLIPGRHPCD 0"
## [1] "PEPDV-AVEVKTPLDLAKAQES-----NNSVKKKTRFVNLYTREGQDKLAVLLPGRHPCD 0"
## [1] "PEPDV-TVEVKTPLDLAKAQEG-----NSSVKKKTRFVSLYTREGQDKLAVLLPGRHPCD 0"
## [1] "VEPSTHVEEVKTPLDLAKAQESSTASSSNAYKKKTKYVSLYTKEGQDRLAVLIPGRHACE 0"
## [1] "SQAEP--EPVRTPIDLMKAQESS---NSSSVKKKSKFVNLYAKEGKDKLAVLLPGRQACE 0"
## [1] " "
## [1] "CLGQKHKLINNCLICGRIVCEQEGSGPCLFCGTLVCTHEEQDILQRDSNKSQKLLKKLMS 0"
## [1] "CLGQKHKLINNCLICGRIVCEQEGSGPCLFCGTLVCTHEEQDILQRDSNKSQKLLKKLMS 0"
## [1] "CLGQKHKLINNCLVCGRIVCEQEGSGPCLFCGSLVCTHEERDILQRDSNKSQKLLKKLMS 0"
## [1] "CLGQKHKLINNCLICGRIVCEQEGSGPCLFCGSLVCTHEERDILQRDSNKSQKLLKKLMS 0"
## [1] "CLGQKHKLINNCLICGRIVCEQEGSGPCFFCGTLVCTREEQDILQRDSNKSQKLLKKLMS 0"
## [1] "CLGQKHKLINNCLICGRIVCEQEGSGPCFFCGTLVCTREEQDILQRDSNKSQKLLKKLMS 0"
## [1] "CLGQKHKLINNCLVCGRIVCEQEGSGPCLFCGSLVCTNEEQDILQRDSNKSQKLLKKLMS 0"
## [1] "CLGQKHKLINNCLVCGRIVCEQEGSGPCLFCGSLVCTNEEQDILQRDSNKSQKLLKKLMS 0"
## [1] "CLGQKHKLINNCLVCGRIVCEQEGSGPCLFCGSLVCTKEEQDILQRDSNKSQKLLKKLMA 0"
## [1] "CLAQKHRLINNCLSCGRIVCEQEGSGPCLFCGSLVCTNEEQEILQRDSNKSQKLRKKLMG 0"
## [1] " "
## [1] "GVENSGKVDISTKDLLPHQELRIKSGLEKAIKHKDKLLEFDRTSIRRTQVIDDESDYFAS 0"
## [1] "GVENSGKVDISTKDLLPHQELRIKSGLEKAIKHKDKLLEFDRTSIRRTQVIDDESDYFAS 0"
## [1] "GTENSGKVDISTKDLLPHQELRFKSGLEKAIKHKDKLLEFDRTSIRRTQVIDDESDYFAS 0"
## [1] "GTENAGKVDISTKDLLPHQELRFKSGLEKAIKHKDKLLEFDRTSIRRTQVIDDESDYFAS 0"
## [1] "GTENSGKVDISTKDLLPHQELRIKSGLEKAIKHKDKLLEFDRTSIRRTQVIDDEADYFAS 0"
## [1] "GTESSGKVDISTKDLLPNQELRIKSGLEKAIKHKDKLLEYDRTSIRRTQVIDDESDYFAS 0"
## [1] "GAETSGKVDVSTKDLLPHQESRMKSGLEKAIKHKEKLLEFDRTSIRRTQVIDDESDYFAS 0"
## [1] "GAENSGKVDISTKDHPPHQESRMKSGLEKAVKHKEKLLEFDRTSIRRTQVIDDESDYFAS 0"
## [1] "GAESSGNLDAISKDLLPRQEARLKAGLEMAVKHKDKLLEFDRTSVRRTQVIDDESDYFAT 0"
## [1] "--------EGTEREYLPHQESKMKAGLEKAVKHKDKLLEFDKNSVKRTQVLDDESDYFAT 0"
## [1] " "
## [1] "DSNQWLSKLERETLQKREEELRELRHASRLSKKVTIDFAGRKILEEENSLAEYHSRLDET 0"
## [1] "DSNQWLSKLERETLQKREEELRELRHASRLSKKVTIDFAGRKILEEENSLAEYHSRLDET 0"
## [1] "DSNQWLSKIEREAIQKREEELREFRHASRLSKKITIDFAGRKIVEDEDPLAEYHSKLDEA 0"
## [1] "DSNQWLSKIEREAIQKREEELREFRHASRLSKKITIDFAGRKIVEDEDPLAEYHSKLDEA 0"
## [1] "DSNQWLSKIERETLQKREEELREFRHMSRLSKKITIDFAGRKIVEDENPLAEYHSKLDET 0"
## [1] "DSNQWLSKIERETLQKREEELREFRHMSRLSKKITIDFAGRKIMEDENPLAEYHSKLDET 0"
## [1] "DSNQWLSKVEREMLQKREEELRELRHASRLSKKVTIDFAGRKILEDENPLAEYHSRLDET 0"
## [1] "DSNQWLSKVERDMLQKREEELRELRHASRLSKKVTIDFAGRKILEDENPLAEYHSRLDET 0"
## [1] "DSNQWLSKQEREALQKREQELQELRHASRLAKKITIDFAGRQILEEDSGMAEYHSKLDET 0"
## [1] "DSNQWLSPGEREALRKREEELRELRHASRKDRKITLDFAGRRVLDEGENLSPYYQKFDET 0"
## [1] " "
## [1] "IQAIANGTLNQPLTKLDRSSEEPLGVLVNPNMYQSPPQWVDHTGAASQKKAFRSSGFGLE 0"
## [1] "IQAIANGTLNQPLTKLDRSSEEPLGVLVNPNMYQSPPQWVDHTGAASQKKAFRSSGFGLE 0"
## [1] "IHAIANGTLNQPVTKLDRSSEEPLGLLVNPNLYQSPPQWIDQMGAASQKKAFHPAGSGLE 0"
## [1] "IHAIANGTLNQPLTKLDRSSEEPLGLLVNPNLYQPPPQWIDQTGAASQKKAFHPAGSGLE 0"
## [1] "IQAIANGTLNQPLTKLDRSSDEPLGVLVNPNLYQSPPQWIDQTGAASQKKTFHSAGSELE 0"
## [1] "IQAIANGTLNQPLTKLDRSSDEPLGVLVNPNLYQSPPQWIDQTGAASQKKAFHSGGSELE 0"
## [1] "IQAIASGTLNQSLVTLDRSCEEPLGVLVNPNMYQASPQWVDNTGSTPQKKTSLSAGPRLE 0"
## [1] "IQAIANGTLNQSLVTSDRSSEEPLGVLVNPNMYQAPPQWVDNTGSAPPKKTSLSVGPRLE 0"
## [1] "VEAINCGTLSQPAASPEAKMTLSSGVLVNPHLLQPAPLWVDQTGSLPLRRATHSMGAGEE 0"
## [1] "VKAINTGSFGQKAKHSAPADRQHFRELVNPNILQSAPEWVDVASSNPSRKSSSKAASGKE 0"
## [1] " "
## [1] "FNSFQHQLRIQDQEFQEGFDGGWCLSVHQPWASLLVRGIKRVEGRSWYTPHRGRLWIAAT 0"
## [1] "FNSFQHQLRIQDQEFQEGFDGGWCLSVHQPWASLLVRGIKRVEGRSWYTPHRGRLWIAAT 0"
## [1] "FSSSRHQFRIQDQKFQEGFDGGWCLSMHQPWASLLVRGIKRVEGRNWYTPHRGRLWIAAT 0"
## [1] "FSSCRHQFRIQDQEFQEGFDGGWCLSMHQPWASLLVRGIKRVEGRSWYTPHRGRLWIAAT 0"
## [1] "FNSYRHQLRIQDQEFQEGFDGGWCLSMHQPWASLLVRGIKRVEGRNWYTPHRGRLWIAAT 0"
## [1] "LNSYRHQLRIQDQEFQEGFDGGWCLSMHQPWASLLVRGIKRVEGRSWYTPHRGRLWIAAT 0"
## [1] "PSLHQHQLRIQDQEFQEGFDGGWCLSMHQPWASLLVRGIKRVEGRSWYTPHRGRLWIAAT 0"
## [1] "LSLHPHQLRIQDQEFQEGFDGGWCLSVHQPWASLLVKGIKRVEGRSWYTPHRGRLWIAAT 0"
## [1] "FGSERNRLRIQDRELQEISDDGWCLSMHQPWASLLIRGIKRVEGRTWYTSHRGRLWIAAT 0"
## [1] "TS-ERSRLRLQDKELQEIADGGWCLSMHQPWASLLVKGIKRVEGRTWYTSHRGRLWIAAA 0"
## [1] " "
## [1] "AKKPSPQEVSELQATYRLLRGKDVEFPNDYPSGCLLGCVDLIDCLSQKQFKEQFPDISQE 0"
## [1] "AKKPSPQEVSELQATYRLLRGKDVEFPNDYPSGCLLGCVDLIDCLSQKQFKEQFPDISQE 0"
## [1] "AKRPSPQEVSELQTTYLLLRGKGVEFPSDYPSGCLLGCVDLIDCLSQKQFKDQYPDISQE 0"
## [1] "AKRPSPQEVSELQTTYLLLRGKDVEFPNDYPSGCLLGCVDLIDCLSQKQFKDQYPDISQE 0"
## [1] "AKRPSPQEVSELQTTYRVLRGKDVEFPNDYPSGCLLGCVDLIDCLSQKQFKDQYPDMSQE 0"
## [1] "AKRPSPQEVSELQTTYRLLRGKDVEFPNDYPSGCLLGCVDLIDCLSQKQFKEQYPDMSQE 0"
## [1] "GKRPSPQEVSELQATYRLLRGKDVEFPNDYPSGCLLGCVDLIDCLSQKQFQEQFPDISQE 0"
## [1] "GKRPSTQEVSELQATYRLLRGKDVEFPNDYPSGCLLGCVDLIDCLSQKQFQEQFPDISQE 0"
## [1] "AKRPSPQEISELETTYRMLLRKDVEFPNEYPSGCLLGCVDVTDCLSQEQFNEQYPDLSQE 0"
## [1] "ANRPTPQEIAEVEAMYRHLYKHEPQFPAEYPTGCLLGCVNVSDCLSQEQFREQYPQISEE 0"
## [1] " "
## [1] "-SDSPFVFICKNPQEMVVKFPIKGNPKIWKLDSKIHQGAKKGLMKQNKAV 10"
## [1] "-SDSPYVFICKNPQEMVVKFPIKGNPKIWKLDSKIHQGAKKGLMKQNKAV 10"
## [1] "ESDSPFVFICTNPQEMIVKFPIKGNPKIWKLDSKIHQGAKKGLMKQNKAG 10"
## [1] "ESDSPFVFICTNPQEMIVKFPIKGNPKIWKLDSKIHQGAKKGLMKQNKAG 10"
## [1] "-SDSPFVFICTNPQEMIVKFPIKGNPKIWKLDSKIHQGAKKGLMKQNKAV 10"
## [1] "-SDSPFVFICTNPQEMIVKFPIKGNPKIWKLDSKIHQGAKKGLMKQNKAV 10"
## [1] "-SDSSFVFICKNPQEMVVKFPIKGNPKIWKLDSKIHQGAKKGLMKQNKAV 10"
## [1] "-SDSPFVFICKNPQEMVVKFPIKGNPKIWKLDSKIHQGAKKGLMKQNKAV 10"
## [1] "-SGSPFVFICTNPQEMVVKFPIKGKPKIWKLDSKIHQGAKKGLMKQKAVA 10"
## [1] "-STSPFVFICSNPQELVVKFPMKGKHKIWKLESQSHQSAKKCLMQPA--- 10"
## [1] " "

Finished MSA

# Display finished MSA resutlt

# Change class of the alignment
class(TRIP4_align) <- "AAMultipleAlignment"

ggmsa::ggmsa(TRIP4_align,
      start = 0, 
      end = 50)

Distance Matrix

Make a distance matrix

This produces a “dist” class object.

# Use MSA to calculate the genetic distance
TRIP4_dist <- seqinr::dist.alignment(TRIP4_align_seqinr, 
                                             matrix = "identity")

# Information about the object
is(TRIP4_dist)

## [1] "dist"     "oldClass"

class(TRIP4_dist)

## [1] "dist"

Round for display

# Round to 3 decimals
TRIP4_align_seqinr_rnd <- round(TRIP4_dist, 3)

# Display result
TRIP4_align_seqinr_rnd

##                NP_057297.2 XP_510472.2 NP_001039816.1 XP_005685718.1
## XP_510472.2          0.041                                          
## NP_001039816.1       0.343       0.346                              
## XP_005685718.1       0.335       0.338          0.144               
## XP_535513.2          0.308       0.310          0.279          0.276
## XP_014941048.1       0.308       0.310          0.288          0.279
## NP_062771.2          0.337       0.340          0.399          0.395
## NP_001128453.1       0.337       0.340          0.397          0.392
## XP_015134569.2       0.539       0.541          0.539          0.534
## NP_001071043.2       0.639       0.641          0.634          0.634
##                XP_535513.2 XP_014941048.1 NP_062771.2 NP_001128453.1
## XP_510472.2                                                         
## NP_001039816.1                                                      
## XP_005685718.1                                                      
## XP_535513.2                                                         
## XP_014941048.1       0.171                                          
## NP_062771.2          0.385          0.387                           
## NP_001128453.1       0.387          0.382       0.238               
## XP_015134569.2       0.532          0.532       0.553          0.558
## NP_001071043.2       0.646          0.646       0.628          0.637
##                XP_015134569.2
## XP_510472.2                  
## NP_001039816.1               
## XP_005685718.1               
## XP_535513.2                  
## XP_014941048.1               
## NP_062771.2                  
## NP_001128453.1               
## XP_015134569.2               
## NP_001071043.2          0.597

Phylogenetic Trees of Sequences

Build a phylogenetic tree from distance matrix

phyTree <- nj(TRIP4_dist)

Plotting phylogenetic trees

Plotting the tree.

# plot tree
# Include title
plot.phylo(phyTree, main="Phylogenetic Tree", 
            use.edge.length = F)

# Adding label
mtext(text = "TRIP4 family gene tree - rooted, no branch lengths")

Final Portfolio Assignment

Edith Chan

12/17/2021

Introduction

Resources/References

Preparation

Data Preparation

Download sequences

Initial Data Cleaning

General Protein Information

Protein diagram

Dotplot

Protein Properties Compiled from Database

Protein Feature Prediction

Secondary Structure Prediction

Calculating Similarities with Different Functions

Percent Identity Comparisons (PID)

Data Preparation

PID Table

PID Methods Comparison

Multiple Sequence Alignment

Data Preparation

Building Sequence Alignement (MSA)

Cleaning / setting up an MSA

Finished MSA

Distance Matrix

Phylogenetic Trees of Sequences

Plotting phylogenetic trees

aa.1.1	alpha	beta	a.plus.b	a.div.b
A	285	203	175	361
R	53	67	78	146
N	97	139	120	183
D	163	121	111	244
C	22	75	74	63
Q	67	122	74	114
E	134	86	86	257
G	197	297	171	377
H	111	49	33	107
I	91	120	93	239
L	221	177	110	339
K	249	115	112	321
M	48	16	25	91
F	123	85	52	158
P	82	127	71	188
S	122	341	126	327
T	119	253	117	238
W	33	44	30	72
Y	63	110	108	130
V	167	229	123	378

aa.1.1	alpha	beta	a.plus.b	a.div.b
A	285	203	175	361
R	53	67	78	146
N	97	139	120	183
D	163	121	111	244
C	22	75	74	63
Q	67	122	74	114
E	134	86	86	257
G	197	297	171	377
H	111	49	33	107
I	91	120	93	239
L	221	177	110	339
K	249	115	112	321
M	48	16	25	91
F	123	85	52	158
P	82	127	71	188
S	122	341	126	327
T	119	253	117	238
W	33	44	30	72
Y	63	110	108	130
V	167	229	123	378

aa.1.1	alpha	beta	a.plus.b	a.div.b
A	285	203	175	361
R	53	67	78	146
N	97	139	120	183
D	163	121	111	244
C	22	75	74	63
Q	67	122	74	114
E	134	86	86	257
G	197	297	171	377
H	111	49	33	107
I	91	120	93	239
L	221	177	110	339
K	249	115	112	321
M	48	16	25	91
F	123	85	52	158
P	82	127	71	188
S	122	341	126	327
T	119	253	117	238
W	33	44	30	72
Y	63	110	108	130
V	167	229	123	378