Introduction

This code contains summary information about the BRINP3 gene, which encodes the BMP/retinoic acid inducible neural specific 3 protein.

It also builds an MSA and phylogenetic tree to show the evolutionary relationship between the human version of this gene and its homologs in other species.

Resources / References

Key information use to make this script can be found here:

Refseq Gene: https://www.ncbi.nlm.nih.gov/gene/339479 Refseq Homologene: https://www.ncbi.nlm.nih.gov/homologene/17786

Other resources consulted includes:

Neanderthal genome: http://neandertal.ensemblgenomes.org/index.html

Other interesting resources and online tools include:

REPPER: https://toolkit.tuebingen.mpg.de/jobs/7803820 Sub-cellular locations prediction: https://wolfpsort.hgc.jp/

Preparation

Load necessary packages:

Download and load drawProteins from Bioconductor

library(BiocManager)
library(drawProteins)

Load other packages:

# github packages
library(compbio4all)
library(ggmsa)

# CRAN packages
library(rentrez)
library(seqinr)
library(ape)
library(pander)


library(ggplot2)


library(msa)
library(drawProteins)

# Biostrings
library(Biostrings)


library(HGNChelper)

Accession numbers

Accession numbers were obtained from RefSeq, Refseq Homlogene, UniProt and PDB. UniProt accession numbers can be found by searching for the gene name. PDB accessions can be found by searching with a UniProt accession or a gene name, though many proteins are not in PDB. The Neanderthal genome database was searched but did not yield sequence information on BRINP3.

A protein BLAST search (https://blast.ncbi.nlm.nih.gov/Blast.cgi?PROGRAM=blastp&PAGE_TYPE=BlastSearch&LINK_LOC=blasthome) was carried out excluding vertebrates to determine if it occurred outside of vertebreates. The gene does not appear in non-vertebrates and so a second search was conducted to exclude mammals.

Accession number table

Not available:

Neanderthal

Does not occur:

Outside of vertebrates

#Creating table

BRINP3_table <- c("NP_001304117.1","Q76B58","NA","Homo sapiens","human","BRINP3",
"NP_775144.1","Q8K1M7","NA","Rattus norvegicus","Norway rat", "Brinp3",         "NP_001139279.1", "Q499E0","NA","Mus musculus","house mouse", "Brinp3",    
"NP_001248407.1 ","H9EQG6","NA","Macaca mulatta","Rhesus monkey","BRINP3",
"NP_001095610.1", "A7MB29", "NA", "Bos taurus","cattle","BRINP3",   
" XP_017949134.1","A0A7D9NLR3","NA","Xenopus tropicalis","tropical clawed frog", "brinp3",
"XP_016789866.1","NA","NA","Pan troglodytes","chimpanzee","BRINP3",
"XP_041276047.1","NA","NA","Onychostruthus taczanowskii","white-rumped snowfinch","LOC121344292", 
"XP_026524943.1", "A0A6J1U8X2", "NA", "Notechis scutatus",  "mainland tiger snake", "BRINP3",
"XP_015675031.1","NA","NA","Protobothrops mucrosquamatus","protobothrops mucrosquamatus","LOC107290644")
# Converts all BRINP3 table information into matrix format

BRINP3_table_matrix <- matrix(BRINP3_table,
                                  byrow = T,
                                  nrow = 10)

BRINP3_table <- data.frame(BRINP3_table_matrix, 
                     stringsAsFactors = F)

names(BRINP3_table) <- c("ncbi.protein.accession", "UniProt.id","PDB", "species", "common.name", "gene.name")
#Printing table

pander(BRINP3_table)
Table continues below
ncbi.protein.accession UniProt.id PDB species
NP_001304117.1 Q76B58 NA Homo sapiens
NP_775144.1 Q8K1M7 NA Rattus norvegicus
NP_001139279.1 Q499E0 NA Mus musculus
NP_001248407.1 H9EQG6 NA Macaca mulatta
NP_001095610.1 A7MB29 NA Bos taurus
XP_017949134.1 A0A7D9NLR3 NA Xenopus tropicalis
XP_016789866.1 NA NA Pan troglodytes
XP_041276047.1 NA NA Onychostruthus taczanowskii
XP_026524943.1 A0A6J1U8X2 NA Notechis scutatus
XP_015675031.1 NA NA Protobothrops mucrosquamatus
common.name gene.name
human BRINP3
Norway rat Brinp3
house mouse Brinp3
Rhesus monkey BRINP3
cattle BRINP3
tropical clawed frog brinp3
chimpanzee BRINP3
white-rumped snowfinch LOC121344292
mainland tiger snake BRINP3
protobothrops mucrosquamatus LOC107290644

Data Preparation

#Downloading all FASTA sequences using compbio4all::entrez_fetch_list() which uses rentrez::entrez_fetch() to access NCBI databases.

BRINP3 <- rentrez::entrez_fetch(db = "protein", 
                          id = BRINP3_table$ncbi.protein.accession, 
                          rettype = "fasta")
#This assigns data to the variable BRINP3_list

BRINP3_list <- entrez_fetch_list(db = "protein", 
                          id = BRINP3_table$ncbi.protein.accession, 
                          rettype = "fasta")
#Number of FASTA files

length(BRINP3_list)
## [1] 10
#First entry

BRINP3_list[[1]]
## [1] ">NP_001304117.1 BMP/retinoic acid-inducible neural-specific protein 3 isoform 2 [Homo sapiens]\nMPQAPSTGSSLIRDPSIAHRNTQILWTEAGRDLAQDTRYTGEESLTIFVDKRKLSKRAEGSDSTTNSSSV\nTLETLHQLAASYFIDRDSTLRRLHHIQIASTAIKVTETRTGPLGCSNYDNLDSVSSVLVQSPENKIQLQG\nLQVLLPDYLQERFVQAALSYIACNSEGEFICKENDCWCHCGPKFPECNCPSMDIQAMEENLLRITETWKA\nYNSDFEESDEFKLFMKRLPMNYFLNTSTIMHLWTMDSNFQRRYEQLENSMKQLFLKAQKIVHKLFSLSKR\nCHKQPLISLPRQRTSTYWLTRIQSFLYCNENGLLGSFSEETHSCTCPNDQVVCTAFLPCTVGDASACLTC\nAPDNRTRCGTCNTGYMLSQGLCKPEVAESTDHYIGFETDLQDLEMKYLLQKTDRRIEVHAIFISNDMRLN\nSWFDPSWRKRMLLTLKSNKYKSSLVHMILGLSLQICLTKNSTLEPVLAVYVNPFGGSHSESWFMPVNENS\nFPDWERTKLDLPLQCYNWTLTLGNKWKTFFETVHIYLRSRIKSNGPNGNESIYYEPLEFIDPSRNLGYMK\nINNIQVFGYSMHFDPEAIRDLILQLDYPYTQGSQDSALLQLLEIRDRVNKLSPPGQRRLDLFSCLLRHRL\nKLSTSEVVRIQSALQAFNAKLPNTMDYDTTKLCS\n\n"

Initial data cleaning

#This removes the FASTA header, but additional cleaning steps will be required for particular analyses

for(i in 1:length(BRINP3_list)){
  BRINP3_list[[i]] <- compbio4all::fasta_cleaner(BRINP3_list[[i]], parse = F)
}

General protein information

#Downloads from uniprot

Q76B58_BRINP3  <- drawProteins::get_features("Q76B58")
## [1] "Download has worked"
is(Q76B58_BRINP3)
## [1] "list"             "vector"           "list_OR_List"     "vector_OR_Vector"
## [5] "vector_OR_factor"
#Create dataframe

huUni_df <- drawProteins::feature_to_dataframe(Q76B58_BRINP3)
is(huUni_df)
## [1] "data.frame"       "list"             "oldClass"         "vector"          
## [5] "list_OR_List"     "vector_OR_Vector" "vector_OR_factor"
#View dataframe

huUni_df[,-2]
##                     type begin end length accession   entryName taxid order
## featuresTemp      SIGNAL     1  33     32    Q76B58 BRNP3_HUMAN  9606     1
## featuresTemp.1     CHAIN    34 766    732    Q76B58 BRNP3_HUMAN  9606     1
## featuresTemp.2    DOMAIN    74 264    190    Q76B58 BRNP3_HUMAN  9606     1
## featuresTemp.3  CARBOHYD   168 168      0    Q76B58 BRNP3_HUMAN  9606     1
## featuresTemp.4  CARBOHYD   337 337      0    Q76B58 BRNP3_HUMAN  9606     1
## featuresTemp.5  CARBOHYD   456 456      0    Q76B58 BRNP3_HUMAN  9606     1
## featuresTemp.6  CARBOHYD   562 562      0    Q76B58 BRNP3_HUMAN  9606     1
## featuresTemp.7  CARBOHYD   609 609      0    Q76B58 BRNP3_HUMAN  9606     1
## featuresTemp.8  CARBOHYD   641 641      0    Q76B58 BRNP3_HUMAN  9606     1
## featuresTemp.9   VAR_SEQ     1  40     39    Q76B58 BRNP3_HUMAN  9606     1
## featuresTemp.10  VAR_SEQ    41 142    101    Q76B58 BRNP3_HUMAN  9606     1
## featuresTemp.11 CONFLICT   422 422      0    Q76B58 BRNP3_HUMAN  9606     1
## featuresTemp.12 CONFLICT   534 534      0    Q76B58 BRNP3_HUMAN  9606     1

Protein diagram

#This protein has one domain

my_canvas <- draw_canvas(huUni_df)  
my_canvas <- draw_chains(my_canvas, huUni_df, 
                         label_size = 2.5)
my_canvas <- draw_domains(my_canvas, huUni_df)
my_canvas

Dotplot

#Creates vector

huBRINP3_vector <- compbio4all:: fasta_cleaner(BRINP3_list[[1]])
#Defult dotplot

seqinr::dotPlot(huBRINP3_vector,
                huBRINP3_vector)

#2x2 panel showing different values for dotplot settings

par(mfrow = c(2,2), 
    mar = c(0,0,2,1))

#Plot 1: BRINP3 - Defaults
dotPlot(huBRINP3_vector, 
        huBRINP3_vector, 
        wsize = 1, 
        nmatch = 1, 
        main = "BRINP3 Defaults")

#Plot 2: BRINP3 - size = 10, nmatch = 1
dotPlot(huBRINP3_vector, huBRINP3_vector, 
        wsize = 10, 
        nmatch = 1, 
        main = "BRINP3 - size = 10, nmatch = 1")

#Plot 3: BRINP3 - size = 10, nmatch = 5
dotPlot(huBRINP3_vector, huBRINP3_vector, 
        wsize = 10, 
        nmatch = 5, 
        main = "BRINP3 - size = 10, nmatch = 5")

#Plot 4: BRINP3 size = 20, nmatch = 5
dotPlot(huBRINP3_vector, huBRINP3_vector, 
        wsize = 20,
        nmatch =5,
        main = "BRINP3 - size = 20, nmatch = 5")

#Best plot using normal dotplot

par(mfrow = c(1,1), 
    mar = c(4,4,4,4))

dotPlot(huBRINP3_vector, 
        huBRINP3_vector,
        wsize = 10, 
        nmatch = 5, 
        main = "BRINP3 window = 10, nmatch = 5")

Protein properties compiled from databases

databases <- c("Pfam", "Disprot", "RepeatsDB", "Uniprot", "PDB")

info <- c("The BRINP3 gene has two domains: MACPF and BRINP", "NA", "NA", "The subcellular location is the mitochondrion", "NA")

databasedf <- data.frame(databases,
                         info)
pander(databasedf)
databases info
Pfam The BRINP3 gene has two domains: MACPF and BRINP
Disprot NA
RepeatsDB NA
Uniprot The subcellular location is the mitochondrion
PDB NA

Protein feature prediction

Multivariate statistical techniques were used to confirm the information about protein structure and location in the online database.

Uniprot (which uses http://www.csbio.sjtu.edu.cn/bioinf/Cell-PLoc-2/) indicates that the protein is a secreted: found outside of the cell membrane.

Predict protein fold

#Make 2 vectors of amino acids and check to make sure they are the same length and contain the same valiues

aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
            "L","K","M","F","P","S","T","W","Y","V")

aa.1.2 <- c("A","R","N","D","C","Q","E","G","H","I",
            "L","K","M","F","P","S","T","W","Y","V")

length(aa.1.1) == length(aa.1.2)
## [1] TRUE
aa.1.1 == aa.1.2
##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE
# Shows how many unique values are present and that they both have the same number of unique values

unique(aa.1.1)
##  [1] "A" "R" "N" "D" "C" "Q" "E" "G" "H" "I" "L" "K" "M" "F" "P" "S" "T" "W" "Y"
## [20] "V"
length(unique(aa.1.1)) == length(unique(aa.1.2))
## [1] TRUE
#Creates vector of the frequency of each amino acid in the database of proteins of each type used by Chou, and checks against Chou's total

alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91, 
           221, 249, 48, 123, 82, 122, 119, 33, 63, 167)

sum(alpha) == 2447
## [1] TRUE
#Does same thing with beta proteins

beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120, 
          177, 115, 16, 85, 127, 341, 253, 44, 110, 229)

sum(beta) == 2776
## [1] TRUE
#Does same thing with alpha plus beta proteins

a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
              110, 112, 25, 52, 71, 126, 117, 30, 108, 123)

sum(a.plus.b) == 1889
## [1] TRUE
#Does same thing with alpha divided by beta proteins

a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239, 
             339, 321, 91, 158, 188, 327, 238, 72, 130, 378)

sum(a.div.b) == 4333
## [1] TRUE
#Creates dataframe of data

czdf <- data.frame(aa.1.1, alpha, beta, a.plus.b, a.div.b)
#Prints dataframe

pander(czdf)
aa.1.1 alpha beta a.plus.b a.div.b
A 285 203 175 361
R 53 67 78 146
N 97 139 120 183
D 163 121 111 244
C 22 75 74 63
Q 67 122 74 114
E 134 86 86 257
G 197 297 171 377
H 111 49 33 107
I 91 120 93 239
L 221 177 110 339
K 249 115 112 321
M 48 16 25 91
F 123 85 52 158
P 82 127 71 188
S 122 341 126 327
T 119 253 117 238
W 33 44 30 72
Y 63 110 108 130
V 167 229 123 378
#Calculates proportions for each of the four protein fold types

alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)
#Create dataframe and row labels

aa.prop <- data.frame(alpha.prop,
                      beta.prop,
                      a.plus.b.prop,
                      a.div.b)

row.names(aa.prop) <- aa.1.1
#Prints dataframe

pander(aa.prop)
  alpha.prop beta.prop a.plus.b.prop a.div.b
A 0.1165 0.07313 0.09264 0.08331
R 0.02166 0.02414 0.04129 0.03369
N 0.03964 0.05007 0.06353 0.04223
D 0.06661 0.04359 0.05876 0.05631
C 0.008991 0.02702 0.03917 0.01454
Q 0.02738 0.04395 0.03917 0.02631
E 0.05476 0.03098 0.04553 0.05931
G 0.08051 0.107 0.09052 0.08701
H 0.04536 0.01765 0.01747 0.02469
I 0.03719 0.04323 0.04923 0.05516
L 0.09031 0.06376 0.05823 0.07824
K 0.1018 0.04143 0.05929 0.07408
M 0.01962 0.005764 0.01323 0.021
F 0.05027 0.03062 0.02753 0.03646
P 0.03351 0.04575 0.03759 0.04339
S 0.04986 0.1228 0.0667 0.07547
T 0.04863 0.09114 0.06194 0.05493
W 0.01349 0.01585 0.01588 0.01662
Y 0.02575 0.03963 0.05717 0.03
V 0.06825 0.08249 0.06511 0.08724
#Detemines number of each amino acid in protein 

table(huBRINP3_vector)
## huBRINP3_vector
##  A  C  D  E  F  G  H  I  K  L  M  N  P  Q  R  S  T  V  W  Y 
## 27 23 35 39 28 26 16 33 33 80 17 37 30 33 38 63 47 24 11 24
#A function to convert a table into a vector

table_to_vector <- function(table_x){
  table_names <- attr(table_x, "dimnames")[[1]]
  table_vect <- as.vector(table_x)
  names(table_vect) <- table_names
  return(table_vect)
}
#Creates a table of amino acid frequences, and converts this to a vector

BRINP3_human_table <- table(huBRINP3_vector)/length(huBRINP3_vector)
BRINP3.human.aa.freq <- table_to_vector(BRINP3_human_table)
BRINP3.human.aa.freq
##          A          C          D          E          F          G          H 
## 0.04066265 0.03463855 0.05271084 0.05873494 0.04216867 0.03915663 0.02409639 
##          I          K          L          M          N          P          Q 
## 0.04969880 0.04969880 0.12048193 0.02560241 0.05572289 0.04518072 0.04969880 
##          R          S          T          V          W          Y 
## 0.05722892 0.09487952 0.07078313 0.03614458 0.01656627 0.03614458
#Checks for the presence of U, an unknown amino acid

aa.names <- names(BRINP3.human.aa.freq)
any(aa.names == "U")
## [1] FALSE
#Previous test returns "False", indicating no "U" present; can add protein to frequency table

aa.prop$BRINP3.human.aa.freq <- BRINP3.human.aa.freq
pander::pander(aa.prop)
  alpha.prop beta.prop a.plus.b.prop a.div.b BRINP3.human.aa.freq
A 0.1165 0.07313 0.09264 0.08331 0.04066
R 0.02166 0.02414 0.04129 0.03369 0.03464
N 0.03964 0.05007 0.06353 0.04223 0.05271
D 0.06661 0.04359 0.05876 0.05631 0.05873
C 0.008991 0.02702 0.03917 0.01454 0.04217
Q 0.02738 0.04395 0.03917 0.02631 0.03916
E 0.05476 0.03098 0.04553 0.05931 0.0241
G 0.08051 0.107 0.09052 0.08701 0.0497
H 0.04536 0.01765 0.01747 0.02469 0.0497
I 0.03719 0.04323 0.04923 0.05516 0.1205
L 0.09031 0.06376 0.05823 0.07824 0.0256
K 0.1018 0.04143 0.05929 0.07408 0.05572
M 0.01962 0.005764 0.01323 0.021 0.04518
F 0.05027 0.03062 0.02753 0.03646 0.0497
P 0.03351 0.04575 0.03759 0.04339 0.05723
S 0.04986 0.1228 0.0667 0.07547 0.09488
T 0.04863 0.09114 0.06194 0.05493 0.07078
W 0.01349 0.01585 0.01588 0.01662 0.03614
Y 0.02575 0.03963 0.05717 0.03 0.01657
V 0.06825 0.08249 0.06511 0.08724 0.03614

Functions to calculate similarities

#Creates two custom functions: the first one calculates correlates between two columns of our table, and the second one to calculates correlation similarities

chou_cor <- function(x,y){
  numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)
}

chou_cosine <- function(z.1, z.2){
  z.1.abs <- sqrt(sum(z.1^2))
  z.2.abs <- sqrt(sum(z.2^2))
  my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs)
  return(my.cosine)
}
#Calculates and displays correlation between each column

corr.alpha <- chou_cor(aa.prop[,5], aa.prop[,1])
corr.beta  <- chou_cor(aa.prop[,5], aa.prop[,2])
corr.apb   <- chou_cor(aa.prop[,5], aa.prop[,3])
corr.adb   <- chou_cor(aa.prop[,5], aa.prop[,4])

corr.alpha
## [1] 0.7820859
corr.beta
## [1] 0.8469979
corr.apb
## [1] 0.8512793
corr.adb
## [1] 0.850818
#Calculates and displays cosine similarity

cos.alpha <- chou_cosine(aa.prop[,5], aa.prop[,1])
cos.beta  <- chou_cosine(aa.prop[,5], aa.prop[,2])
cos.apb   <- chou_cosine(aa.prop[,5], aa.prop[,3])
cos.adb   <- chou_cosine(aa.prop[,5], aa.prop[,4])

cos.alpha
## [1] 0.7820859
cos.beta
## [1] 0.8469979
cos.apb
## [1] 0.8512793
cos.adb
## [1] 0.850818
#Calculates Euclidean distance

aa.prop.flipped <- t(aa.prop)
round(aa.prop.flipped,2)
##                         A    R    N    D    C    Q    E    G    H    I    L
## alpha.prop           0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09
## beta.prop            0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06
## a.plus.b.prop        0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06
## a.div.b              0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08
## BRINP3.human.aa.freq 0.04 0.03 0.05 0.06 0.04 0.04 0.02 0.05 0.05 0.12 0.03
##                         K    M    F    P    S    T    W    Y    V
## alpha.prop           0.10 0.02 0.05 0.03 0.05 0.05 0.01 0.03 0.07
## beta.prop            0.04 0.01 0.03 0.05 0.12 0.09 0.02 0.04 0.08
## a.plus.b.prop        0.06 0.01 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## a.div.b              0.07 0.02 0.04 0.04 0.08 0.05 0.02 0.03 0.09
## BRINP3.human.aa.freq 0.06 0.05 0.05 0.06 0.09 0.07 0.04 0.02 0.04
#Calculates distance matrix

dist(aa.prop.flipped, method = "euclidean")
##                      alpha.prop  beta.prop a.plus.b.prop    a.div.b
## beta.prop            0.13342098                                    
## a.plus.b.prop        0.09281824 0.08289406                         
## a.div.b              0.06699039 0.08659174    0.06175113           
## BRINP3.human.aa.freq 0.16704125 0.14133523    0.13363945 0.13508498
#Calculates and displays individual distances

dist.alpha <- dist((aa.prop.flipped[c(1,5),]),  method = "euclidean")
dist.beta  <- dist((aa.prop.flipped[c(2,5),]),  method = "euclidean")
dist.apb   <- dist((aa.prop.flipped[c(3,5),]),  method = "euclidean")
dist.adb  <- dist((aa.prop.flipped[c(4,5),]), method = "euclidean")

dist.alpha
##                      alpha.prop
## BRINP3.human.aa.freq  0.1670413
dist.beta
##                      beta.prop
## BRINP3.human.aa.freq 0.1413352
dist.apb
##                      a.plus.b.prop
## BRINP3.human.aa.freq     0.1336395
dist.adb
##                       a.div.b
## BRINP3.human.aa.freq 0.135085
#Compiles information and rounds it, making it easier to read

fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")

corr.sim <- round(c(corr.alpha,corr.beta,corr.apb,corr.adb),5)
cosine.sim <- round(c(cos.alpha,cos.beta,cos.apb,cos.adb),5)
Euclidean.dist <- round(c(dist.alpha,dist.beta,dist.apb,dist.adb),5)

sim.sum <- c("","","most.sim","")
dist.sum <- c("","","min.dist","")

df2 <- data.frame(fold.type,
           corr.sim ,
           cosine.sim ,
           Euclidean.dist ,
           sim.sum ,
           dist.sum )
pander(df2)
fold.type corr.sim cosine.sim Euclidean.dist sim.sum dist.sum
alpha 0.7821 0.7821 0.167
beta 0.847 0.847 0.1413
alpha plus beta 0.8513 0.8513 0.1336 most.sim min.dist
alpha/beta 0.8508 0.8508 0.1351

Percent Identity Comparisons (PID)

Data Preparation

#Information about list formed at beginning of workflow

names(BRINP3_list)
##  [1] "NP_001304117.1"  "NP_775144.1"     "NP_001139279.1"  "NP_001248407.1 "
##  [5] "NP_001095610.1"  " XP_017949134.1" "XP_016789866.1"  "XP_041276047.1" 
##  [9] "XP_026524943.1"  "XP_015675031.1"
length (BRINP3_list)
## [1] 10
BRINP3_list[[1]]
## [1] "MPQAPSTGSSLIRDPSIAHRNTQILWTEAGRDLAQDTRYTGEESLTIFVDKRKLSKRAEGSDSTTNSSSVTLETLHQLAASYFIDRDSTLRRLHHIQIASTAIKVTETRTGPLGCSNYDNLDSVSSVLVQSPENKIQLQGLQVLLPDYLQERFVQAALSYIACNSEGEFICKENDCWCHCGPKFPECNCPSMDIQAMEENLLRITETWKAYNSDFEESDEFKLFMKRLPMNYFLNTSTIMHLWTMDSNFQRRYEQLENSMKQLFLKAQKIVHKLFSLSKRCHKQPLISLPRQRTSTYWLTRIQSFLYCNENGLLGSFSEETHSCTCPNDQVVCTAFLPCTVGDASACLTCAPDNRTRCGTCNTGYMLSQGLCKPEVAESTDHYIGFETDLQDLEMKYLLQKTDRRIEVHAIFISNDMRLNSWFDPSWRKRMLLTLKSNKYKSSLVHMILGLSLQICLTKNSTLEPVLAVYVNPFGGSHSESWFMPVNENSFPDWERTKLDLPLQCYNWTLTLGNKWKTFFETVHIYLRSRIKSNGPNGNESIYYEPLEFIDPSRNLGYMKINNIQVFGYSMHFDPEAIRDLILQLDYPYTQGSQDSALLQLLEIRDRVNKLSPPGQRRLDLFSCLLRHRLKLSTSEVVRIQSALQAFNAKLPNTMDYDTTKLCS"
#Creates and prints empty vector

BRINP3_vector <- rep(NA, length(BRINP3_list))
BRINP3_vector
##  [1] NA NA NA NA NA NA NA NA NA NA
#Populates the vector

for(i in 1:length(BRINP3_vector)){
  BRINP3_vector[i] <- BRINP3_list[[i]]
}
#Names the vector

names(BRINP3_vector) <- names(BRINP3_list)
#Turns sequences into vectors

humanBRINP3  <- fasta_cleaner(BRINP3_list[[1]], parse = F)
chimpanzeeBRINP3 <- fasta_cleaner(BRINP3_list[[7]], parse = F)
housemouseBRINP3  <- fasta_cleaner(BRINP3_list[[3]], parse = F)
cattleBRINP3   <- fasta_cleaner(BRINP3_list[[5]], parse = F)
#Performs global pairwise alignments on sequences

align.human.vs.chimp <- Biostrings::pairwiseAlignment(
                  humanBRINP3,
                  chimpanzeeBRINP3)
align.human.vs.housemouse <- Biostrings::pairwiseAlignment(
                  humanBRINP3,
                  housemouseBRINP3)
align.human.vs.cattle <- Biostrings::pairwiseAlignment(
                  humanBRINP3,
                  cattleBRINP3)

PID Table

#Prints percent sequence alignment   

Biostrings::pid(align.human.vs.chimp)
## [1] 99.8494
Biostrings::pid(align.human.vs.housemouse)
## [1] 82.37598
Biostrings::pid(align.human.vs.cattle)
## [1] 82.24543
#Global pairwise alignments for matrix

align.chimp.vs.housemouse <- Biostrings::pairwiseAlignment(
                  chimpanzeeBRINP3,
                  housemouseBRINP3)

align.chimp.vs.cattle <- Biostrings::pairwiseAlignment(
                  chimpanzeeBRINP3,
                  cattleBRINP3)

align.housemouse.vs.cattle <- Biostrings::pairwiseAlignment(
                  housemouseBRINP3,
                  cattleBRINP3)
#Builds a matrix of PID

pids <- c(1,                  NA,     NA,     NA,
          pid(align.human.vs.chimp),          1,     NA,     NA,
          pid(align.human.vs.housemouse), pid(align.chimp.vs.housemouse),      1,     NA,
          pid(align.human.vs.cattle), pid(align.chimp.vs.cattle), pid(align.housemouse.vs.cattle), 1)

mat <- matrix(pids, nrow = 4, byrow = T)
row.names(mat) <- c("Homo","Pan","Mus","Bos")   
colnames(mat) <- c("Homo","Pan","Mus","Bos")   
pander::pander(mat)  
  Homo Pan Mus Bos
Homo 1 NA NA NA
Pan 99.85 1 NA NA
Mus 82.38 82.51 1 NA
Bos 82.25 82.38 97.13 1

PID Methods Comparison

#Generates comparisons between humans and chimps using different methods

PID1 <- pid(align.human.vs.chimp, type = "PID1")
PID2 <- pid(align.human.vs.chimp, type = "PID2")
PID3 <- pid(align.human.vs.chimp, type = "PID3")
PID4 <- pid(align.human.vs.chimp, type = "PID4")
#Generates dataframe 

method <- c("PID1", "PID2", "PID3", "PID4")

PID <- c(PID1, PID2, PID3, PID4)

denominator <- c("(aligned positions + internal gap positions)", "(aligned positions)", "(length shorter sequence)", "(average length of the two sequences)")

PIDcompdf <- data.frame(method,
                        PID,
                        denominator)
pander(PIDcompdf)
method PID denominator
PID1 99.85 (aligned positions + internal gap positions)
PID2 99.85 (aligned positions)
PID3 99.85 (length shorter sequence)
PID4 99.85 (average length of the two sequences)

Multiple Sequence Alignment

Data preparation

#Converts vector into stringset

BRINP3_vector_ss <- Biostrings::AAStringSet(BRINP3_vector)

Building Multiple Sequence Alignment (MSA)

#Uses defult substitution matrix

BRINP3_align <- msa(BRINP3_vector_ss,
                     method = "ClustalW")
## use default substitution matrix

Cleaning / setting up an MSA

#Shows information about stringset

class(BRINP3_align)
## [1] "MsaAAMultipleAlignment"
## attr(,"package")
## [1] "msa"
is(BRINP3_align)
## [1] "MsaAAMultipleAlignment" "AAMultipleAlignment"    "MsaMetaData"           
## [4] "MultipleAlignment"
#Shows defult output of MSA

BRINP3_align
## CLUSTAL 2.1  
## 
## Call:
##    msa(BRINP3_vector_ss, method = "ClustalW")
## 
## MsaAAMultipleAlignment with 10 rows and 766 columns
##      aln                                                   names
##  [1] MIWRRRAGAELSSLMALWEWIVLSL...IQSALQAFNAKLPNTVDYDTTKLCS NP_775144.1
##  [2] MIWRRRAGAELSSLMALWEWIVLSL...IQSALQAFNAKLPNTVDYDTTKLCS NP_001139279.1
##  [3] MIWRSRAGAELFSLMALWEWIALSL...IQSALQAFNAKLPNKMDYDTTKLCS NP_001248407.1 
##  [4] MIWRSRAGAELFSLMAPWEWIALSL...IQSALQAFNAKLPNTVDYDTTKLCS NP_001095610.1
##  [5] MIWRCSAVAELLSLMALWVWIALSL...ILSALQAFNEKLPNTIEYETTKLCS  XP_017949134.1
##  [6] -------------------------...IQSALQAFNAKLPNTMDYDTTKLCS NP_001304117.1
##  [7] -------------------------...IQSALQAFNAKLPNTMDYDTTKLCS XP_016789866.1
##  [8] -------------------------...IQSALQTFNAKLPNTMDYDTTKLCS XP_026524943.1
##  [9] -------------------------...IQSALQTFNAKLPNTVDYDTTKLCS XP_015675031.1
## [10] -------------------------...IQSALQAFNAKLPNTVDYDTTKLCS XP_041276047.1
##  Con ????--?-???-????-?-??-???...IQSALQAFNAKLPNTVDYDTTKLCS Consensus
#Changes class of alignment

class(BRINP3_align) <- "AAMultipleAlignment"
#Converts to seqinr format

BRINP3_align_seqinr <- msaConvert(BRINP3_align, 
                                   type = "seqinr::alignment")
#Shows information 

class(BRINP3_align_seqinr)
## [1] "alignment"
is(BRINP3_align_seqinr)
## [1] "alignment"
#Displays output

compbio4all::print_msa(BRINP3_align_seqinr)
## [1] "MIWRRRAGAELSSLMALWEWIVLSLHCWVLAVAAVSDQHATSPFDWLLSDKGPFHRSQEY 0"
## [1] "MIWRRRAGAELSSLMALWEWIVLSLHCWVLAVAAVSDQHATSPFDWLLSDKGPFHRSQEY 0"
## [1] "MIWRSRAGAELFSLMALWEWIALSLHCWVLAVAAVSDQHATSPFDWLLSDKGPFHRSQEY 0"
## [1] "MIWRSRAGAELFSLMAPWEWIALSLHCWVLAVAAVSDQHATSPFDWLLSDKGPFHRSQEY 0"
## [1] "MIWRCSAVAELLSLMALWVWIALSLHCWIIVIDAVSDQHATSPFDWLLSDKGPFHHSQEY 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] " "
## [1] "TDFVDRSRQGFSTRYKIYREFGRWKVNNLAVERRNFLGSPLPLAPEFFRNIRLLGRRPTL 0"
## [1] "TDFVDRSRQGFSTRYKIYREFGRWKVNNLAVERRNFLGSPLPLAPEFFRNIRLLGRRPTL 0"
## [1] "TDFVDRSRQGFSTRYKIYREFGRWKVNNLAVERRNFLGSPLPLAPEFFRNIRLLGRRPTL 0"
## [1] "TDFVDRSRQGFSTRYKIYREFGRWKVNNLAVERRNFLGSPLPLAPEFFRNIRLLGRRPTL 0"
## [1] "TDFVDRSRQGFTTRYKIYREFGRWKVNNLAVERRNFIGSPLPLAPEFFRNIRHLGRRPSL 0"
## [1] "------MPQAPSTGSSLIRDP--------SIAHRN------------------------- 0"
## [1] "------MPQAPSTGSSLIRDP--------SIAHRN------------------------- 0"
## [1] "------MPQAPSTGFSLIRDP--------SIAPQN------------------------- 0"
## [1] "------MPQAPSTGFSLIRDL--------SIAPRN------------------------- 0"
## [1] "------MPQAPSTGSSLIRDP--------SIAHRN------------------------- 0"
## [1] " "
## [1] "QQITENLIKKYGTHFLLSATLGGEESLTIFVDKRKLSKRPEGSETSTNSSSVTLETLHQL 0"
## [1] "QQITENLIKKYGTHFLLSATLGGEESLTIFVDKRKLSKRSEGSETSTNSSSVTLETLHQL 0"
## [1] "QQITENLIKKYGTHFLLSATLGGEESLTIFVDKRKLSKRAEGSDSSTNSSSVTLETLHQL 0"
## [1] "QQITENLIKKYGTHFLLSATLGGEESLTIFVDKRKLSKRSEGSDPSANSSLVTLETLHQL 0"
## [1] "QQITDNIIKKYGTHFLLSATLGGEESLTIFVDKRKLSKRSEATDSSGNSSSITLETLHQL 0"
## [1] "---TQILWTEAGRDLAQDTRYTGEESLTIFVDKRKLSKRAEGSDSTTNSSSVTLETLHQL 0"
## [1] "---TQILWTEAGRDLAQDTRYTGEESLTIFVDKRKLSKRAEGSDSSTNSSSVTLETLHQL 0"
## [1] "---IQILWTEADRDLAQDTRFTGEESLTIFVDKRKLSKRTETSDPSINSTTVTLETLHQL 0"
## [1] "---TQILWTEAGRDLAQDTRFTGEESLTIFVDKRKLSKRSETSDPSINSTTVNLETLHQL 0"
## [1] "---TLILWTEVDRDLAQDTRFTGEESLTIFVDKRKLSKRSEGNDPSANSSVVTLETLHQL 0"
## [1] " "
## [1] "AASYFIDRDSTLRRLHHIQIASTAIKVTETRTGPLGCSNYDNLDSVSSVLVQSPENKIQL 0"
## [1] "AASYFIDRDSTLRRLHHIQIASTAIKVTETRTGPLGCSNYDNLDSVSSVLVQSPENKIQL 0"
## [1] "AASYFIDRDSTLRRLHHIQIASTAIKVTETRTGPLGCSNYDNLDSVSSVLVQSPENKIQL 0"
## [1] "AASYFIDRDSTLRRLHHIQIASTAIKVTETRTGPLGCSNYDNLDSVSSVLVQSPENKIQL 0"
## [1] "AASYFIDRDSTLRRLHHIQIASTAIKVTETRTGPLGCSNYDNLDSVSSVLVQSPENKIHL 0"
## [1] "AASYFIDRDSTLRRLHHIQIASTAIKVTETRTGPLGCSNYDNLDSVSSVLVQSPENKIQL 0"
## [1] "AASYFIDRDSTLRRLHHIQIASTAIKVTETRTGPLGCSNYDNLDSVSSVLVQSPENKIQL 0"
## [1] "AASYFIDRDSTLRRLHHIQIASTAIKVTETRTGPLGCSNYDNLDSVSSVLVQSPENKIQL 0"
## [1] "AASYFIDRDSTLRRLHHIQIASTAIKVTETRTGPLGCSNYDNLDSVSSVLVQSPENKIQL 0"
## [1] "AASYFIDRESTLRRLHHIQIASTAIKVTETRTGPLGCSNYDNLDSVSSVLVQSPENKIQL 0"
## [1] " "
## [1] "QGLQVLLPDYLQERFVQAALSYIACNSEGEFICKENDCWCLCGPKFPECNCPSMDIQAME 0"
## [1] "QGLQVLLPDYLQERFVQAALSYIACNSEGEFICKDNDCWCHCGPKFPECNCPSMDIQAME 0"
## [1] "QGLQVLLPDYLQERFVQAALSYIACNSEGEFICKENDCWCHCGPKFPECNCPSMDIQAME 0"
## [1] "QGLQVLLPDYLQERFVQAALSYIACNSEGEFICKDNDCWCHCGPKFPECNCPSMDIQAME 0"
## [1] "QGLQVILPEYLQENFVQAALSYIACNSEGEFICKDNDCWCRCGPKFPDCNCPYIDIQAME 0"
## [1] "QGLQVLLPDYLQERFVQAALSYIACNSEGEFICKENDCWCHCGPKFPECNCPSMDIQAME 0"
## [1] "QGLQVLLPDYLQERFVQAALSYIACNSEGEFICKENDCWCHCGPKFPECNCPSMDIQAME 0"
## [1] "QGLQFILPEYLQERFVQAALSYIACNSEGEFICKDNDCWCQCGSKFPECNCPYMDIQAME 0"
## [1] "QGLQLILPEYLQERFVQAALSYIACNSEGEFICKDNDCWCQCGSKFPECNCPYMDIQAME 0"
## [1] "QGLQVILPEYLQEHFVQAALSYIACNSEGEFICKDNDCWCQCGPRFPECNCPYMDIQAME 0"
## [1] " "
## [1] "ENLLRITETWKAYNSDFEDSDEFKFFMKRLPMNYFLNTSTIMHLWTMDSNFQRRYEQLEN 0"
## [1] "ENLLRITETWKAYNSDFEDSDEFKFFMKRLPMNYFLNTSTIMHLWTMDSNFQRRYEQLEN 0"
## [1] "ENLLRITETWKAYNSDFEESDEFKLFMKRLPMNYFLNTSTIMHLWTMDSNFQRRYEQLEI 0"
## [1] "ENLLRITETWKAYNIDFEESDEFKFFMKRLPMNYFLNTSTIMHLWTMDSNFQRRYEQLEN 0"
## [1] "ENLVRISDTWNIYHKEFEESDEFKLFMKRLPVNYFLNTSTILHLWTMDSNFQRRYEQLEN 0"
## [1] "ENLLRITETWKAYNSDFEESDEFKLFMKRLPMNYFLNTSTIMHLWTMDSNFQRRYEQLEN 0"
## [1] "ENLLRITETWKAYNSDFEESDEFKLFMKRLPMNYFLNTSTIMHLWTMDSNFQRRYEQLEN 0"
## [1] "ESLLRISETWNTYNSDFEDSDEFKFFMKRLPMNYFLNTSAIMHLWTMDSNFQHRYVQLES 0"
## [1] "ESLLRISETWNTYNSDFEDSDEFKFFMKRLPMNYFLNTSAIMHLWTMDSNFQHRYEQLES 0"
## [1] "ENLLRISETWNAYNSEFEESDEFKLFMKRLPMNYFLNTSAVMHLWTMDSNFQRRYEQLES 0"
## [1] " "
## [1] "SMKQLFLKAHRIVHKLFSLSKRCHKQPLISLPRQRTSTYWLTRIQSFLYCNENGLLGSFS 0"
## [1] "SMKQLFLKAHRIVHKLFSLSKRCHKQPLISLPRQRTSTYWLTRIQSFLYCNENGLLGSFS 0"
## [1] "SMKQLFLKAQKIVHKLFSLSKRCHKQPLISLPRQRTSTYWLTRIQSFLYCNENGLLGSFS 0"
## [1] "SMKQLFLKAQKIVHKLFSLSKRCHKQPLISLPRQRTSTYWLTRIQSFLYCNENGLLGSFS 0"
## [1] "SMKQLYPKAQKIVHKLFSLSKRCHKQPLISLPRQRSSSYWLNRIQSFLYCNENGLLGSFS 0"
## [1] "SMKQLFLKAQKIVHKLFSLSKRCHKQPLISLPRQRTSTYWLTRIQSFLYCNENGLLGSFS 0"
## [1] "SMKQLFLKAQKIVHKLFSLSKRCHKQPLISLPRQRTSTYWLTRIQSFLYCNENGLLGSFS 0"
## [1] "SMKQLFLKAQKIVHKLFSLSKRCHKQPLISLPRQRSSLYWLNRIQSFLYCNENGLLGSFS 0"
## [1] "SMKQLFLKAQKIVHKLFSLSKRCHKQPFISLPRQRSSLYWLNRIQSFLYCNENGLLGSFS 0"
## [1] "SMKQLFLKAQRTVHKLFSLSKRCHKQPLLSLPRKKSSNYWLNRIQSFLYCNENGLLGSFS 0"
## [1] " "
## [1] "EETHSCTCPNDQVVCTAFLPCTVGDASACLTCAPDNRTRCGTCNTGYMLSQGLCKPEVAE 0"
## [1] "EETHSCTCPNDQVVCTAFLPCTVGDASACLTCAPDNRTRCGTCNTGYMLSQGLCKPEVAE 0"
## [1] "EETHSCTCPNDQVVCTAFLPCTVGDASACLTCAPDNRTRCGTCNTGYMLSQGLCKPEVAE 0"
## [1] "EETHSCTCPNDQVVCTAFLPCTVGDAAACLSCAPDNRTRCGTCNTGYMLSQGFCKPEVAE 0"
## [1] "EETHTCTCPNDQVSCQTNLPCVVGDGAACQTCAQDNRTRCGNCNTGYMLSQGFCKPEVAD 0"
## [1] "EETHSCTCPNDQVVCTAFLPCTVGDASACLTCAPDNRTRCGTCNTGYMLSQGLCKPEVAE 0"
## [1] "EETHSCTCPNDQVVCTAFLPCTVGDASACLTCAPDNRTRCGTCNTGYMLSQGLCKPEVAE 0"
## [1] "EETHSCTCPNDQLACLGSLPCSVGDGSACLTCAPDNRTRCGTCNTGYMLSQGLCKPEVAD 0"
## [1] "EETHSCTCPNDQLACLGSLPCSVGEGSACLTCAPDNRTRCGTCNTGYMLSQGICKPEVAD 0"
## [1] "EETHTCTCPNDQVACHGSLPCTLGDGPACLSCAPDNRTRCGSCNAGYMLSQGLCKPEVAD 0"
## [1] " "
## [1] "STDHYIGFETDLQDLEMKYLLQKTDRRIEVHAIFISNDMRLNSWFDPSWRKRMLLTLKSN 0"
## [1] "STDHYIGFETDLQDLEMKYLLQKTDRRIEVHAIFISNDMRLNSWFDPSWRKRMLLTLKSN 0"
## [1] "STDHYIGFETDLQDLEMKYLLQKTDRRIEVHAIFISNDMRLNSWFDPSWRKRMLLTLKSN 0"
## [1] "STDHYIGFETDLQDLEMKYLLQKTDRRLEVHAIFISNDMRLNSWFDPSWRKRMLLTLKSN 0"
## [1] "STEHYIGFETDMQDTEMKYLLQKNDRRIEVHAIFISNDMRLNSWFDPSWRKRMLLTLKSN 0"
## [1] "STDHYIGFETDLQDLEMKYLLQKTDRRIEVHAIFISNDMRLNSWFDPSWRKRMLLTLKSN 0"
## [1] "STDHYIGFETDLQDLEMKYLLQKTDRRIEVHAIFISNDMRLNSWFDPSWRKRMLLTLKSN 0"
## [1] "STEHYIGFETDLQDLEMKYLLQKTDRRIEVHAIFISNDMRLNSWFDPSWRKRMLLTLKSN 0"
## [1] "STEHYIGFETDLQDLEMKYLLQKADRRIEVHAIFISNDMRLNSWFDPSWRKRMLLTLKSN 0"
## [1] "STEHYIGFETDLQDLEMKYLLQKTDRRIEVHAIFISNDMRLNSWFDPSWRKRMLLTLKSN 0"
## [1] " "
## [1] "KYKSSLVHMILGLSLQICLTKNSTLEPVLAVYINPFGGSHSESWFMPVSESSFPDWERTK 0"
## [1] "KYKSSLVHMILGLSLQICLTKNSTLEPVLAVYVNPFGGSHSESWFMPVSESSFPDWERTK 0"
## [1] "KYKSSLVHMILGLSLQICLTKNSTLEPVLAVYVNPFGGSHSESWFMPVNENSFPDWERTK 0"
## [1] "KYKSSLVHMILGLSLQICLTKNSTLEPVLAVYINPFGGSHSESWFMPVNENSFPDWERTK 0"
## [1] "KYKANMVHMILGLSLQICLTKNSTLEPVLAVYVNPFGGSHSESWFMPVNENNFPDWERTK 0"
## [1] "KYKSSLVHMILGLSLQICLTKNSTLEPVLAVYVNPFGGSHSESWFMPVNENSFPDWERTK 0"
## [1] "KYKSSLVHMILGLSLQICLTKNSTLEPVLAVYVNPFGGSHSESWFMPVNENSFPDWERTK 0"
## [1] "KYKSSLVHMILGMSLQICLTKNSTLEPVLAVYINPFGGSHSESWFMPVNENNFPDWERTK 0"
## [1] "KYKSSLVHMILGISLQICLTKNSTLEPVLAVYINPFGGSHSESWFMPVNENNFPDWERTK 0"
## [1] "KYKSSLVHMILGLSLQICLTKNSTLEPVLAVYINPFGGSHSESWFMPVNENNFPDWERTK 0"
## [1] " "
## [1] "LDLPLQCYNWTLTLGNKWKTFFETVHIYLRSRIKANGPNSNESIYYEPLEFIDPSRNLGY 0"
## [1] "LDLPLQCYNWTLTLGNKWKTFFETVHIYLRSRIKANGPNSNESIYYEPLEFIDPSRNLGY 0"
## [1] "LDLPLQCYNWTLTLGNKWKTFFETVHIYLRSRIKSNGPNGNESIYYEPLEFIDPSRNLGY 0"
## [1] "LDLPLQCYNWSLTLGNKWKTFFETVHIYLRSRIKSNGPNGNESIYYEPLEFIDPSRNLGY 0"
## [1] "LDLPLQCYNWSLTLGNKWKTFFETVHIYLRSRIKSNGPNGNESIYYEPLEFLDPSRNLGY 0"
## [1] "LDLPLQCYNWTLTLGNKWKTFFETVHIYLRSRIKSNGPNGNESIYYEPLEFIDPSRNLGY 0"
## [1] "LDLPLQCYNWTLTLGNKWKTFFETVHIYLRSRIKSNGPNGNESIYYEPLEFIDPSRNLGY 0"
## [1] "LDLPLQCYNWTLTLGNKWKTFFETVHIYLRSRIKSNGPNGNDSIYYEPLEFIDPSRNLGY 0"
## [1] "LDLPLQCYNWTLTLGNKWKTFFETVHIYLRSRIKSNGPNGNDSIYYEPLEFIDPSRNLGY 0"
## [1] "LDLPLQCYNWTLTLGNKWKTFFETVHIYLRSRIKSNGPNGNESIYYEPLEFIDPSRNLGY 0"
## [1] " "
## [1] "MKINNIQVFGYSMHFDPEAIRDLILQLDYPYTQGSQDSALLQLLEIRDRVNKLSPPGQRR 0"
## [1] "MKINNIQVFGYSMHFDPEAIRDLILQLDYPYTQGSQDSALLQLLEIRDRVNKLSPPGQRR 0"
## [1] "MKINNIQVFGYSMHFDPEAIRDLILQLDYPYTQGSQDSALLQLLEIRDRVNKLSPPGQRR 0"
## [1] "MKINNIQVFGYSMHFDPEAIRDLILQLDYPYTQGSQDSALLQLLEIRDRVNKLSPPGQRR 0"
## [1] "MKINSIQVFGYSMHFDPEAIRDLILQLDYPYTQGSQDSALLQLLEIRDRVNKLSPPGQRR 0"
## [1] "MKINNIQVFGYSMHFDPEAIRDLILQLDYPYTQGSQDSALLQLLEIRDRVNKLSPPGQRR 0"
## [1] "MKINNIQVFGYSMHFDPEAIRDLILQLDYPYTQGSQDSALLQLLEIRDRVNKLSPPGQRR 0"
## [1] "MKINSIQVFGYSMHFDPEAIRDLILQLDYPYTQGSQDSALLQLLEIRDRVNNLSPPGQRR 0"
## [1] "MKINSIQVFGYSMHFDPEAIRDLILQLDYPYTQGSQDSALLQLLEIRDRVNKLSPPGQRR 0"
## [1] "MKINNIQVFGYSMHFDPEAIRDLILQLDYPYTQGSQDSALLQLLEIRDRVNKLSPPGQRR 0"
## [1] " "
## [1] "LDLFSCLLRHRLKLSTSEVVRIQSALQAFNAKLPNTVDYDTTKLCS 14"
## [1] "LDLFSCLLRHRLKLSTSEVVRIQSALQAFNAKLPNTVDYDTTKLCS 14"
## [1] "LDLFSCLLRHRLKLSTSEVVRIQSALQAFNAKLPNKMDYDTTKLCS 14"
## [1] "LDLFSCLLRHRLKLSTSEVVRIQSALQAFNAKLPNTVDYDTTKLCS 14"
## [1] "LDLFSCLLRHRLKLSTSEVVRILSALQAFNEKLPNTIEYETTKLCS 14"
## [1] "LDLFSCLLRHRLKLSTSEVVRIQSALQAFNAKLPNTMDYDTTKLCS 14"
## [1] "LDLFSCLLRHRLKLSTSEVVRIQSALQAFNAKLPNTMDYDTTKLCS 14"
## [1] "LDLFSCLLRHRLKLSTSEVMRIQSALQTFNAKLPNTMDYDTTKLCS 14"
## [1] "LDLFSCLLRHRLKLSTSEVMRIQSALQTFNAKLPNTVDYDTTKLCS 14"
## [1] "LDLFSCLLRHRLKLSTSEVVRIQSALQAFNAKLPNTVDYDTTKLCS 14"
## [1] " "

Finished MSA

#First step very important: changing class of alignment; second step displays MSA

class(BRINP3_align) <- "AAMultipleAlignment"

ggmsa::ggmsa(BRINP3_align,
      start = 125, 
      end = 175)

Distance Matrix

#Calculates genetic distance using MSA

BRINP3_dist <- seqinr::dist.alignment(BRINP3_align_seqinr, 
                                             matrix = "identity")
#Shows information about distances

is(BRINP3_dist)
## [1] "dist"     "oldClass"
class(BRINP3_dist)
## [1] "dist"
BRINP3_align_seqinr_rnd <- round(BRINP3_dist, 3)
BRINP3_align_seqinr_rnd
##                 NP_775144.1 NP_001139279.1 NP_001248407.1  NP_001095610.1
## NP_001139279.1        0.072                                              
## NP_001248407.1        0.157          0.153                               
## NP_001095610.1        0.177          0.169           0.149               
##  XP_017949134.1       0.337          0.331           0.317          0.317
## NP_001304117.1        0.263          0.260           0.226          0.263
## XP_016789866.1        0.260          0.257           0.223          0.260
## XP_026524943.1        0.351          0.351           0.343          0.347
## XP_015675031.1        0.349          0.347           0.345          0.341
## XP_041276047.1        0.347          0.345           0.338          0.332
##                  XP_017949134.1 NP_001304117.1 XP_016789866.1 XP_026524943.1
## NP_001139279.1                                                              
## NP_001248407.1                                                              
## NP_001095610.1                                                              
##  XP_017949134.1                                                             
## NP_001304117.1            0.372                                             
## XP_016789866.1            0.370          0.039                              
## XP_026524943.1            0.388          0.274          0.272               
## XP_015675031.1            0.382          0.280          0.277          0.150
## XP_041276047.1            0.374          0.269          0.266          0.266
##                 XP_015675031.1
## NP_001139279.1                
## NP_001248407.1                
## NP_001095610.1                
##  XP_017949134.1               
## NP_001304117.1                
## XP_016789866.1                
## XP_026524943.1                
## XP_015675031.1                
## XP_041276047.1           0.269

Phylognetic tree of sequences

# Function that uses genetic distances to cluster sequences into clades

tree <- nj(BRINP3_dist)
# Plots phylogenetic tree and adds a label

plot.phylo(tree, main="Phylogenetic Tree", 
            use.edge.length = F)

mtext(text = "BRINP3 family gene tree - rooted, no branch lengths")