This code contains summary information about the BRINP3 gene, which encodes the BMP/retinoic acid inducible neural specific 3 protein.
It also builds an MSA and phylogenetic tree to show the evolutionary relationship between the human version of this gene and its homologs in other species.
Key information use to make this script can be found here:
Refseq Gene: https://www.ncbi.nlm.nih.gov/gene/339479 Refseq Homologene: https://www.ncbi.nlm.nih.gov/homologene/17786
Other resources consulted includes:
Neanderthal genome: http://neandertal.ensemblgenomes.org/index.html
Other interesting resources and online tools include:
REPPER: https://toolkit.tuebingen.mpg.de/jobs/7803820 Sub-cellular locations prediction: https://wolfpsort.hgc.jp/
Load necessary packages:
Download and load drawProteins from Bioconductor
library(BiocManager)
library(drawProteins)
Load other packages:
# github packages
library(compbio4all)
library(ggmsa)
# CRAN packages
library(rentrez)
library(seqinr)
library(ape)
library(pander)
library(ggplot2)
library(msa)
library(drawProteins)
# Biostrings
library(Biostrings)
library(HGNChelper)
Accession numbers were obtained from RefSeq, Refseq Homlogene, UniProt and PDB. UniProt accession numbers can be found by searching for the gene name. PDB accessions can be found by searching with a UniProt accession or a gene name, though many proteins are not in PDB. The Neanderthal genome database was searched but did not yield sequence information on BRINP3.
A protein BLAST search (https://blast.ncbi.nlm.nih.gov/Blast.cgi?PROGRAM=blastp&PAGE_TYPE=BlastSearch&LINK_LOC=blasthome) was carried out excluding vertebrates to determine if it occurred outside of vertebreates. The gene does not appear in non-vertebrates and so a second search was conducted to exclude mammals.
Not available:
Neanderthal
Does not occur:
Outside of vertebrates
#Creating table
BRINP3_table <- c("NP_001304117.1","Q76B58","NA","Homo sapiens","human","BRINP3",
"NP_775144.1","Q8K1M7","NA","Rattus norvegicus","Norway rat", "Brinp3", "NP_001139279.1", "Q499E0","NA","Mus musculus","house mouse", "Brinp3",
"NP_001248407.1 ","H9EQG6","NA","Macaca mulatta","Rhesus monkey","BRINP3",
"NP_001095610.1", "A7MB29", "NA", "Bos taurus","cattle","BRINP3",
" XP_017949134.1","A0A7D9NLR3","NA","Xenopus tropicalis","tropical clawed frog", "brinp3",
"XP_016789866.1","NA","NA","Pan troglodytes","chimpanzee","BRINP3",
"XP_041276047.1","NA","NA","Onychostruthus taczanowskii","white-rumped snowfinch","LOC121344292",
"XP_026524943.1", "A0A6J1U8X2", "NA", "Notechis scutatus", "mainland tiger snake", "BRINP3",
"XP_015675031.1","NA","NA","Protobothrops mucrosquamatus","protobothrops mucrosquamatus","LOC107290644")
# Converts all BRINP3 table information into matrix format
BRINP3_table_matrix <- matrix(BRINP3_table,
byrow = T,
nrow = 10)
BRINP3_table <- data.frame(BRINP3_table_matrix,
stringsAsFactors = F)
names(BRINP3_table) <- c("ncbi.protein.accession", "UniProt.id","PDB", "species", "common.name", "gene.name")
#Printing table
pander(BRINP3_table)
| ncbi.protein.accession | UniProt.id | PDB | species |
|---|---|---|---|
| NP_001304117.1 | Q76B58 | NA | Homo sapiens |
| NP_775144.1 | Q8K1M7 | NA | Rattus norvegicus |
| NP_001139279.1 | Q499E0 | NA | Mus musculus |
| NP_001248407.1 | H9EQG6 | NA | Macaca mulatta |
| NP_001095610.1 | A7MB29 | NA | Bos taurus |
| XP_017949134.1 | A0A7D9NLR3 | NA | Xenopus tropicalis |
| XP_016789866.1 | NA | NA | Pan troglodytes |
| XP_041276047.1 | NA | NA | Onychostruthus taczanowskii |
| XP_026524943.1 | A0A6J1U8X2 | NA | Notechis scutatus |
| XP_015675031.1 | NA | NA | Protobothrops mucrosquamatus |
| common.name | gene.name |
|---|---|
| human | BRINP3 |
| Norway rat | Brinp3 |
| house mouse | Brinp3 |
| Rhesus monkey | BRINP3 |
| cattle | BRINP3 |
| tropical clawed frog | brinp3 |
| chimpanzee | BRINP3 |
| white-rumped snowfinch | LOC121344292 |
| mainland tiger snake | BRINP3 |
| protobothrops mucrosquamatus | LOC107290644 |
#Downloading all FASTA sequences using compbio4all::entrez_fetch_list() which uses rentrez::entrez_fetch() to access NCBI databases.
BRINP3 <- rentrez::entrez_fetch(db = "protein",
id = BRINP3_table$ncbi.protein.accession,
rettype = "fasta")
#This assigns data to the variable BRINP3_list
BRINP3_list <- entrez_fetch_list(db = "protein",
id = BRINP3_table$ncbi.protein.accession,
rettype = "fasta")
#Number of FASTA files
length(BRINP3_list)
## [1] 10
#First entry
BRINP3_list[[1]]
## [1] ">NP_001304117.1 BMP/retinoic acid-inducible neural-specific protein 3 isoform 2 [Homo sapiens]\nMPQAPSTGSSLIRDPSIAHRNTQILWTEAGRDLAQDTRYTGEESLTIFVDKRKLSKRAEGSDSTTNSSSV\nTLETLHQLAASYFIDRDSTLRRLHHIQIASTAIKVTETRTGPLGCSNYDNLDSVSSVLVQSPENKIQLQG\nLQVLLPDYLQERFVQAALSYIACNSEGEFICKENDCWCHCGPKFPECNCPSMDIQAMEENLLRITETWKA\nYNSDFEESDEFKLFMKRLPMNYFLNTSTIMHLWTMDSNFQRRYEQLENSMKQLFLKAQKIVHKLFSLSKR\nCHKQPLISLPRQRTSTYWLTRIQSFLYCNENGLLGSFSEETHSCTCPNDQVVCTAFLPCTVGDASACLTC\nAPDNRTRCGTCNTGYMLSQGLCKPEVAESTDHYIGFETDLQDLEMKYLLQKTDRRIEVHAIFISNDMRLN\nSWFDPSWRKRMLLTLKSNKYKSSLVHMILGLSLQICLTKNSTLEPVLAVYVNPFGGSHSESWFMPVNENS\nFPDWERTKLDLPLQCYNWTLTLGNKWKTFFETVHIYLRSRIKSNGPNGNESIYYEPLEFIDPSRNLGYMK\nINNIQVFGYSMHFDPEAIRDLILQLDYPYTQGSQDSALLQLLEIRDRVNKLSPPGQRRLDLFSCLLRHRL\nKLSTSEVVRIQSALQAFNAKLPNTMDYDTTKLCS\n\n"
#This removes the FASTA header, but additional cleaning steps will be required for particular analyses
for(i in 1:length(BRINP3_list)){
BRINP3_list[[i]] <- compbio4all::fasta_cleaner(BRINP3_list[[i]], parse = F)
}
#Downloads from uniprot
Q76B58_BRINP3 <- drawProteins::get_features("Q76B58")
## [1] "Download has worked"
is(Q76B58_BRINP3)
## [1] "list" "vector" "list_OR_List" "vector_OR_Vector"
## [5] "vector_OR_factor"
#Create dataframe
huUni_df <- drawProteins::feature_to_dataframe(Q76B58_BRINP3)
is(huUni_df)
## [1] "data.frame" "list" "oldClass" "vector"
## [5] "list_OR_List" "vector_OR_Vector" "vector_OR_factor"
#View dataframe
huUni_df[,-2]
## type begin end length accession entryName taxid order
## featuresTemp SIGNAL 1 33 32 Q76B58 BRNP3_HUMAN 9606 1
## featuresTemp.1 CHAIN 34 766 732 Q76B58 BRNP3_HUMAN 9606 1
## featuresTemp.2 DOMAIN 74 264 190 Q76B58 BRNP3_HUMAN 9606 1
## featuresTemp.3 CARBOHYD 168 168 0 Q76B58 BRNP3_HUMAN 9606 1
## featuresTemp.4 CARBOHYD 337 337 0 Q76B58 BRNP3_HUMAN 9606 1
## featuresTemp.5 CARBOHYD 456 456 0 Q76B58 BRNP3_HUMAN 9606 1
## featuresTemp.6 CARBOHYD 562 562 0 Q76B58 BRNP3_HUMAN 9606 1
## featuresTemp.7 CARBOHYD 609 609 0 Q76B58 BRNP3_HUMAN 9606 1
## featuresTemp.8 CARBOHYD 641 641 0 Q76B58 BRNP3_HUMAN 9606 1
## featuresTemp.9 VAR_SEQ 1 40 39 Q76B58 BRNP3_HUMAN 9606 1
## featuresTemp.10 VAR_SEQ 41 142 101 Q76B58 BRNP3_HUMAN 9606 1
## featuresTemp.11 CONFLICT 422 422 0 Q76B58 BRNP3_HUMAN 9606 1
## featuresTemp.12 CONFLICT 534 534 0 Q76B58 BRNP3_HUMAN 9606 1
#This protein has one domain
my_canvas <- draw_canvas(huUni_df)
my_canvas <- draw_chains(my_canvas, huUni_df,
label_size = 2.5)
my_canvas <- draw_domains(my_canvas, huUni_df)
my_canvas
#Creates vector
huBRINP3_vector <- compbio4all:: fasta_cleaner(BRINP3_list[[1]])
#Defult dotplot
seqinr::dotPlot(huBRINP3_vector,
huBRINP3_vector)
#2x2 panel showing different values for dotplot settings
par(mfrow = c(2,2),
mar = c(0,0,2,1))
#Plot 1: BRINP3 - Defaults
dotPlot(huBRINP3_vector,
huBRINP3_vector,
wsize = 1,
nmatch = 1,
main = "BRINP3 Defaults")
#Plot 2: BRINP3 - size = 10, nmatch = 1
dotPlot(huBRINP3_vector, huBRINP3_vector,
wsize = 10,
nmatch = 1,
main = "BRINP3 - size = 10, nmatch = 1")
#Plot 3: BRINP3 - size = 10, nmatch = 5
dotPlot(huBRINP3_vector, huBRINP3_vector,
wsize = 10,
nmatch = 5,
main = "BRINP3 - size = 10, nmatch = 5")
#Plot 4: BRINP3 size = 20, nmatch = 5
dotPlot(huBRINP3_vector, huBRINP3_vector,
wsize = 20,
nmatch =5,
main = "BRINP3 - size = 20, nmatch = 5")
#Best plot using normal dotplot
par(mfrow = c(1,1),
mar = c(4,4,4,4))
dotPlot(huBRINP3_vector,
huBRINP3_vector,
wsize = 10,
nmatch = 5,
main = "BRINP3 window = 10, nmatch = 5")
databases <- c("Pfam", "Disprot", "RepeatsDB", "Uniprot", "PDB")
info <- c("The BRINP3 gene has two domains: MACPF and BRINP", "NA", "NA", "The subcellular location is the mitochondrion", "NA")
databasedf <- data.frame(databases,
info)
pander(databasedf)
| databases | info |
|---|---|
| Pfam | The BRINP3 gene has two domains: MACPF and BRINP |
| Disprot | NA |
| RepeatsDB | NA |
| Uniprot | The subcellular location is the mitochondrion |
| PDB | NA |
Multivariate statistical techniques were used to confirm the information about protein structure and location in the online database.
Uniprot (which uses http://www.csbio.sjtu.edu.cn/bioinf/Cell-PLoc-2/) indicates that the protein is a secreted: found outside of the cell membrane.
#Make 2 vectors of amino acids and check to make sure they are the same length and contain the same valiues
aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
"L","K","M","F","P","S","T","W","Y","V")
aa.1.2 <- c("A","R","N","D","C","Q","E","G","H","I",
"L","K","M","F","P","S","T","W","Y","V")
length(aa.1.1) == length(aa.1.2)
## [1] TRUE
aa.1.1 == aa.1.2
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE
# Shows how many unique values are present and that they both have the same number of unique values
unique(aa.1.1)
## [1] "A" "R" "N" "D" "C" "Q" "E" "G" "H" "I" "L" "K" "M" "F" "P" "S" "T" "W" "Y"
## [20] "V"
length(unique(aa.1.1)) == length(unique(aa.1.2))
## [1] TRUE
#Creates vector of the frequency of each amino acid in the database of proteins of each type used by Chou, and checks against Chou's total
alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91,
221, 249, 48, 123, 82, 122, 119, 33, 63, 167)
sum(alpha) == 2447
## [1] TRUE
#Does same thing with beta proteins
beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120,
177, 115, 16, 85, 127, 341, 253, 44, 110, 229)
sum(beta) == 2776
## [1] TRUE
#Does same thing with alpha plus beta proteins
a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
110, 112, 25, 52, 71, 126, 117, 30, 108, 123)
sum(a.plus.b) == 1889
## [1] TRUE
#Does same thing with alpha divided by beta proteins
a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239,
339, 321, 91, 158, 188, 327, 238, 72, 130, 378)
sum(a.div.b) == 4333
## [1] TRUE
#Creates dataframe of data
czdf <- data.frame(aa.1.1, alpha, beta, a.plus.b, a.div.b)
#Prints dataframe
pander(czdf)
| aa.1.1 | alpha | beta | a.plus.b | a.div.b |
|---|---|---|---|---|
| A | 285 | 203 | 175 | 361 |
| R | 53 | 67 | 78 | 146 |
| N | 97 | 139 | 120 | 183 |
| D | 163 | 121 | 111 | 244 |
| C | 22 | 75 | 74 | 63 |
| Q | 67 | 122 | 74 | 114 |
| E | 134 | 86 | 86 | 257 |
| G | 197 | 297 | 171 | 377 |
| H | 111 | 49 | 33 | 107 |
| I | 91 | 120 | 93 | 239 |
| L | 221 | 177 | 110 | 339 |
| K | 249 | 115 | 112 | 321 |
| M | 48 | 16 | 25 | 91 |
| F | 123 | 85 | 52 | 158 |
| P | 82 | 127 | 71 | 188 |
| S | 122 | 341 | 126 | 327 |
| T | 119 | 253 | 117 | 238 |
| W | 33 | 44 | 30 | 72 |
| Y | 63 | 110 | 108 | 130 |
| V | 167 | 229 | 123 | 378 |
#Calculates proportions for each of the four protein fold types
alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)
#Create dataframe and row labels
aa.prop <- data.frame(alpha.prop,
beta.prop,
a.plus.b.prop,
a.div.b)
row.names(aa.prop) <- aa.1.1
#Prints dataframe
pander(aa.prop)
| alpha.prop | beta.prop | a.plus.b.prop | a.div.b | |
|---|---|---|---|---|
| A | 0.1165 | 0.07313 | 0.09264 | 0.08331 |
| R | 0.02166 | 0.02414 | 0.04129 | 0.03369 |
| N | 0.03964 | 0.05007 | 0.06353 | 0.04223 |
| D | 0.06661 | 0.04359 | 0.05876 | 0.05631 |
| C | 0.008991 | 0.02702 | 0.03917 | 0.01454 |
| Q | 0.02738 | 0.04395 | 0.03917 | 0.02631 |
| E | 0.05476 | 0.03098 | 0.04553 | 0.05931 |
| G | 0.08051 | 0.107 | 0.09052 | 0.08701 |
| H | 0.04536 | 0.01765 | 0.01747 | 0.02469 |
| I | 0.03719 | 0.04323 | 0.04923 | 0.05516 |
| L | 0.09031 | 0.06376 | 0.05823 | 0.07824 |
| K | 0.1018 | 0.04143 | 0.05929 | 0.07408 |
| M | 0.01962 | 0.005764 | 0.01323 | 0.021 |
| F | 0.05027 | 0.03062 | 0.02753 | 0.03646 |
| P | 0.03351 | 0.04575 | 0.03759 | 0.04339 |
| S | 0.04986 | 0.1228 | 0.0667 | 0.07547 |
| T | 0.04863 | 0.09114 | 0.06194 | 0.05493 |
| W | 0.01349 | 0.01585 | 0.01588 | 0.01662 |
| Y | 0.02575 | 0.03963 | 0.05717 | 0.03 |
| V | 0.06825 | 0.08249 | 0.06511 | 0.08724 |
#Detemines number of each amino acid in protein
table(huBRINP3_vector)
## huBRINP3_vector
## A C D E F G H I K L M N P Q R S T V W Y
## 27 23 35 39 28 26 16 33 33 80 17 37 30 33 38 63 47 24 11 24
#A function to convert a table into a vector
table_to_vector <- function(table_x){
table_names <- attr(table_x, "dimnames")[[1]]
table_vect <- as.vector(table_x)
names(table_vect) <- table_names
return(table_vect)
}
#Creates a table of amino acid frequences, and converts this to a vector
BRINP3_human_table <- table(huBRINP3_vector)/length(huBRINP3_vector)
BRINP3.human.aa.freq <- table_to_vector(BRINP3_human_table)
BRINP3.human.aa.freq
## A C D E F G H
## 0.04066265 0.03463855 0.05271084 0.05873494 0.04216867 0.03915663 0.02409639
## I K L M N P Q
## 0.04969880 0.04969880 0.12048193 0.02560241 0.05572289 0.04518072 0.04969880
## R S T V W Y
## 0.05722892 0.09487952 0.07078313 0.03614458 0.01656627 0.03614458
#Checks for the presence of U, an unknown amino acid
aa.names <- names(BRINP3.human.aa.freq)
any(aa.names == "U")
## [1] FALSE
#Previous test returns "False", indicating no "U" present; can add protein to frequency table
aa.prop$BRINP3.human.aa.freq <- BRINP3.human.aa.freq
pander::pander(aa.prop)
| alpha.prop | beta.prop | a.plus.b.prop | a.div.b | BRINP3.human.aa.freq | |
|---|---|---|---|---|---|
| A | 0.1165 | 0.07313 | 0.09264 | 0.08331 | 0.04066 |
| R | 0.02166 | 0.02414 | 0.04129 | 0.03369 | 0.03464 |
| N | 0.03964 | 0.05007 | 0.06353 | 0.04223 | 0.05271 |
| D | 0.06661 | 0.04359 | 0.05876 | 0.05631 | 0.05873 |
| C | 0.008991 | 0.02702 | 0.03917 | 0.01454 | 0.04217 |
| Q | 0.02738 | 0.04395 | 0.03917 | 0.02631 | 0.03916 |
| E | 0.05476 | 0.03098 | 0.04553 | 0.05931 | 0.0241 |
| G | 0.08051 | 0.107 | 0.09052 | 0.08701 | 0.0497 |
| H | 0.04536 | 0.01765 | 0.01747 | 0.02469 | 0.0497 |
| I | 0.03719 | 0.04323 | 0.04923 | 0.05516 | 0.1205 |
| L | 0.09031 | 0.06376 | 0.05823 | 0.07824 | 0.0256 |
| K | 0.1018 | 0.04143 | 0.05929 | 0.07408 | 0.05572 |
| M | 0.01962 | 0.005764 | 0.01323 | 0.021 | 0.04518 |
| F | 0.05027 | 0.03062 | 0.02753 | 0.03646 | 0.0497 |
| P | 0.03351 | 0.04575 | 0.03759 | 0.04339 | 0.05723 |
| S | 0.04986 | 0.1228 | 0.0667 | 0.07547 | 0.09488 |
| T | 0.04863 | 0.09114 | 0.06194 | 0.05493 | 0.07078 |
| W | 0.01349 | 0.01585 | 0.01588 | 0.01662 | 0.03614 |
| Y | 0.02575 | 0.03963 | 0.05717 | 0.03 | 0.01657 |
| V | 0.06825 | 0.08249 | 0.06511 | 0.08724 | 0.03614 |
#Creates two custom functions: the first one calculates correlates between two columns of our table, and the second one to calculates correlation similarities
chou_cor <- function(x,y){
numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)
}
chou_cosine <- function(z.1, z.2){
z.1.abs <- sqrt(sum(z.1^2))
z.2.abs <- sqrt(sum(z.2^2))
my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs)
return(my.cosine)
}
#Calculates and displays correlation between each column
corr.alpha <- chou_cor(aa.prop[,5], aa.prop[,1])
corr.beta <- chou_cor(aa.prop[,5], aa.prop[,2])
corr.apb <- chou_cor(aa.prop[,5], aa.prop[,3])
corr.adb <- chou_cor(aa.prop[,5], aa.prop[,4])
corr.alpha
## [1] 0.7820859
corr.beta
## [1] 0.8469979
corr.apb
## [1] 0.8512793
corr.adb
## [1] 0.850818
#Calculates and displays cosine similarity
cos.alpha <- chou_cosine(aa.prop[,5], aa.prop[,1])
cos.beta <- chou_cosine(aa.prop[,5], aa.prop[,2])
cos.apb <- chou_cosine(aa.prop[,5], aa.prop[,3])
cos.adb <- chou_cosine(aa.prop[,5], aa.prop[,4])
cos.alpha
## [1] 0.7820859
cos.beta
## [1] 0.8469979
cos.apb
## [1] 0.8512793
cos.adb
## [1] 0.850818
#Calculates Euclidean distance
aa.prop.flipped <- t(aa.prop)
round(aa.prop.flipped,2)
## A R N D C Q E G H I L
## alpha.prop 0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09
## beta.prop 0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06
## a.plus.b.prop 0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06
## a.div.b 0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08
## BRINP3.human.aa.freq 0.04 0.03 0.05 0.06 0.04 0.04 0.02 0.05 0.05 0.12 0.03
## K M F P S T W Y V
## alpha.prop 0.10 0.02 0.05 0.03 0.05 0.05 0.01 0.03 0.07
## beta.prop 0.04 0.01 0.03 0.05 0.12 0.09 0.02 0.04 0.08
## a.plus.b.prop 0.06 0.01 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## a.div.b 0.07 0.02 0.04 0.04 0.08 0.05 0.02 0.03 0.09
## BRINP3.human.aa.freq 0.06 0.05 0.05 0.06 0.09 0.07 0.04 0.02 0.04
#Calculates distance matrix
dist(aa.prop.flipped, method = "euclidean")
## alpha.prop beta.prop a.plus.b.prop a.div.b
## beta.prop 0.13342098
## a.plus.b.prop 0.09281824 0.08289406
## a.div.b 0.06699039 0.08659174 0.06175113
## BRINP3.human.aa.freq 0.16704125 0.14133523 0.13363945 0.13508498
#Calculates and displays individual distances
dist.alpha <- dist((aa.prop.flipped[c(1,5),]), method = "euclidean")
dist.beta <- dist((aa.prop.flipped[c(2,5),]), method = "euclidean")
dist.apb <- dist((aa.prop.flipped[c(3,5),]), method = "euclidean")
dist.adb <- dist((aa.prop.flipped[c(4,5),]), method = "euclidean")
dist.alpha
## alpha.prop
## BRINP3.human.aa.freq 0.1670413
dist.beta
## beta.prop
## BRINP3.human.aa.freq 0.1413352
dist.apb
## a.plus.b.prop
## BRINP3.human.aa.freq 0.1336395
dist.adb
## a.div.b
## BRINP3.human.aa.freq 0.135085
#Compiles information and rounds it, making it easier to read
fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")
corr.sim <- round(c(corr.alpha,corr.beta,corr.apb,corr.adb),5)
cosine.sim <- round(c(cos.alpha,cos.beta,cos.apb,cos.adb),5)
Euclidean.dist <- round(c(dist.alpha,dist.beta,dist.apb,dist.adb),5)
sim.sum <- c("","","most.sim","")
dist.sum <- c("","","min.dist","")
df2 <- data.frame(fold.type,
corr.sim ,
cosine.sim ,
Euclidean.dist ,
sim.sum ,
dist.sum )
pander(df2)
| fold.type | corr.sim | cosine.sim | Euclidean.dist | sim.sum | dist.sum |
|---|---|---|---|---|---|
| alpha | 0.7821 | 0.7821 | 0.167 | ||
| beta | 0.847 | 0.847 | 0.1413 | ||
| alpha plus beta | 0.8513 | 0.8513 | 0.1336 | most.sim | min.dist |
| alpha/beta | 0.8508 | 0.8508 | 0.1351 |
#Information about list formed at beginning of workflow
names(BRINP3_list)
## [1] "NP_001304117.1" "NP_775144.1" "NP_001139279.1" "NP_001248407.1 "
## [5] "NP_001095610.1" " XP_017949134.1" "XP_016789866.1" "XP_041276047.1"
## [9] "XP_026524943.1" "XP_015675031.1"
length (BRINP3_list)
## [1] 10
BRINP3_list[[1]]
## [1] "MPQAPSTGSSLIRDPSIAHRNTQILWTEAGRDLAQDTRYTGEESLTIFVDKRKLSKRAEGSDSTTNSSSVTLETLHQLAASYFIDRDSTLRRLHHIQIASTAIKVTETRTGPLGCSNYDNLDSVSSVLVQSPENKIQLQGLQVLLPDYLQERFVQAALSYIACNSEGEFICKENDCWCHCGPKFPECNCPSMDIQAMEENLLRITETWKAYNSDFEESDEFKLFMKRLPMNYFLNTSTIMHLWTMDSNFQRRYEQLENSMKQLFLKAQKIVHKLFSLSKRCHKQPLISLPRQRTSTYWLTRIQSFLYCNENGLLGSFSEETHSCTCPNDQVVCTAFLPCTVGDASACLTCAPDNRTRCGTCNTGYMLSQGLCKPEVAESTDHYIGFETDLQDLEMKYLLQKTDRRIEVHAIFISNDMRLNSWFDPSWRKRMLLTLKSNKYKSSLVHMILGLSLQICLTKNSTLEPVLAVYVNPFGGSHSESWFMPVNENSFPDWERTKLDLPLQCYNWTLTLGNKWKTFFETVHIYLRSRIKSNGPNGNESIYYEPLEFIDPSRNLGYMKINNIQVFGYSMHFDPEAIRDLILQLDYPYTQGSQDSALLQLLEIRDRVNKLSPPGQRRLDLFSCLLRHRLKLSTSEVVRIQSALQAFNAKLPNTMDYDTTKLCS"
#Creates and prints empty vector
BRINP3_vector <- rep(NA, length(BRINP3_list))
BRINP3_vector
## [1] NA NA NA NA NA NA NA NA NA NA
#Populates the vector
for(i in 1:length(BRINP3_vector)){
BRINP3_vector[i] <- BRINP3_list[[i]]
}
#Names the vector
names(BRINP3_vector) <- names(BRINP3_list)
#Turns sequences into vectors
humanBRINP3 <- fasta_cleaner(BRINP3_list[[1]], parse = F)
chimpanzeeBRINP3 <- fasta_cleaner(BRINP3_list[[7]], parse = F)
housemouseBRINP3 <- fasta_cleaner(BRINP3_list[[3]], parse = F)
cattleBRINP3 <- fasta_cleaner(BRINP3_list[[5]], parse = F)
#Performs global pairwise alignments on sequences
align.human.vs.chimp <- Biostrings::pairwiseAlignment(
humanBRINP3,
chimpanzeeBRINP3)
align.human.vs.housemouse <- Biostrings::pairwiseAlignment(
humanBRINP3,
housemouseBRINP3)
align.human.vs.cattle <- Biostrings::pairwiseAlignment(
humanBRINP3,
cattleBRINP3)
#Prints percent sequence alignment
Biostrings::pid(align.human.vs.chimp)
## [1] 99.8494
Biostrings::pid(align.human.vs.housemouse)
## [1] 82.37598
Biostrings::pid(align.human.vs.cattle)
## [1] 82.24543
#Global pairwise alignments for matrix
align.chimp.vs.housemouse <- Biostrings::pairwiseAlignment(
chimpanzeeBRINP3,
housemouseBRINP3)
align.chimp.vs.cattle <- Biostrings::pairwiseAlignment(
chimpanzeeBRINP3,
cattleBRINP3)
align.housemouse.vs.cattle <- Biostrings::pairwiseAlignment(
housemouseBRINP3,
cattleBRINP3)
#Builds a matrix of PID
pids <- c(1, NA, NA, NA,
pid(align.human.vs.chimp), 1, NA, NA,
pid(align.human.vs.housemouse), pid(align.chimp.vs.housemouse), 1, NA,
pid(align.human.vs.cattle), pid(align.chimp.vs.cattle), pid(align.housemouse.vs.cattle), 1)
mat <- matrix(pids, nrow = 4, byrow = T)
row.names(mat) <- c("Homo","Pan","Mus","Bos")
colnames(mat) <- c("Homo","Pan","Mus","Bos")
pander::pander(mat)
| Homo | Pan | Mus | Bos | |
|---|---|---|---|---|
| Homo | 1 | NA | NA | NA |
| Pan | 99.85 | 1 | NA | NA |
| Mus | 82.38 | 82.51 | 1 | NA |
| Bos | 82.25 | 82.38 | 97.13 | 1 |
#Generates comparisons between humans and chimps using different methods
PID1 <- pid(align.human.vs.chimp, type = "PID1")
PID2 <- pid(align.human.vs.chimp, type = "PID2")
PID3 <- pid(align.human.vs.chimp, type = "PID3")
PID4 <- pid(align.human.vs.chimp, type = "PID4")
#Generates dataframe
method <- c("PID1", "PID2", "PID3", "PID4")
PID <- c(PID1, PID2, PID3, PID4)
denominator <- c("(aligned positions + internal gap positions)", "(aligned positions)", "(length shorter sequence)", "(average length of the two sequences)")
PIDcompdf <- data.frame(method,
PID,
denominator)
pander(PIDcompdf)
| method | PID | denominator |
|---|---|---|
| PID1 | 99.85 | (aligned positions + internal gap positions) |
| PID2 | 99.85 | (aligned positions) |
| PID3 | 99.85 | (length shorter sequence) |
| PID4 | 99.85 | (average length of the two sequences) |
#Converts vector into stringset
BRINP3_vector_ss <- Biostrings::AAStringSet(BRINP3_vector)
#Uses defult substitution matrix
BRINP3_align <- msa(BRINP3_vector_ss,
method = "ClustalW")
## use default substitution matrix
#Shows information about stringset
class(BRINP3_align)
## [1] "MsaAAMultipleAlignment"
## attr(,"package")
## [1] "msa"
is(BRINP3_align)
## [1] "MsaAAMultipleAlignment" "AAMultipleAlignment" "MsaMetaData"
## [4] "MultipleAlignment"
#Shows defult output of MSA
BRINP3_align
## CLUSTAL 2.1
##
## Call:
## msa(BRINP3_vector_ss, method = "ClustalW")
##
## MsaAAMultipleAlignment with 10 rows and 766 columns
## aln names
## [1] MIWRRRAGAELSSLMALWEWIVLSL...IQSALQAFNAKLPNTVDYDTTKLCS NP_775144.1
## [2] MIWRRRAGAELSSLMALWEWIVLSL...IQSALQAFNAKLPNTVDYDTTKLCS NP_001139279.1
## [3] MIWRSRAGAELFSLMALWEWIALSL...IQSALQAFNAKLPNKMDYDTTKLCS NP_001248407.1
## [4] MIWRSRAGAELFSLMAPWEWIALSL...IQSALQAFNAKLPNTVDYDTTKLCS NP_001095610.1
## [5] MIWRCSAVAELLSLMALWVWIALSL...ILSALQAFNEKLPNTIEYETTKLCS XP_017949134.1
## [6] -------------------------...IQSALQAFNAKLPNTMDYDTTKLCS NP_001304117.1
## [7] -------------------------...IQSALQAFNAKLPNTMDYDTTKLCS XP_016789866.1
## [8] -------------------------...IQSALQTFNAKLPNTMDYDTTKLCS XP_026524943.1
## [9] -------------------------...IQSALQTFNAKLPNTVDYDTTKLCS XP_015675031.1
## [10] -------------------------...IQSALQAFNAKLPNTVDYDTTKLCS XP_041276047.1
## Con ????--?-???-????-?-??-???...IQSALQAFNAKLPNTVDYDTTKLCS Consensus
#Changes class of alignment
class(BRINP3_align) <- "AAMultipleAlignment"
#Converts to seqinr format
BRINP3_align_seqinr <- msaConvert(BRINP3_align,
type = "seqinr::alignment")
#Shows information
class(BRINP3_align_seqinr)
## [1] "alignment"
is(BRINP3_align_seqinr)
## [1] "alignment"
#Displays output
compbio4all::print_msa(BRINP3_align_seqinr)
## [1] "MIWRRRAGAELSSLMALWEWIVLSLHCWVLAVAAVSDQHATSPFDWLLSDKGPFHRSQEY 0"
## [1] "MIWRRRAGAELSSLMALWEWIVLSLHCWVLAVAAVSDQHATSPFDWLLSDKGPFHRSQEY 0"
## [1] "MIWRSRAGAELFSLMALWEWIALSLHCWVLAVAAVSDQHATSPFDWLLSDKGPFHRSQEY 0"
## [1] "MIWRSRAGAELFSLMAPWEWIALSLHCWVLAVAAVSDQHATSPFDWLLSDKGPFHRSQEY 0"
## [1] "MIWRCSAVAELLSLMALWVWIALSLHCWIIVIDAVSDQHATSPFDWLLSDKGPFHHSQEY 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] "------------------------------------------------------------ 0"
## [1] " "
## [1] "TDFVDRSRQGFSTRYKIYREFGRWKVNNLAVERRNFLGSPLPLAPEFFRNIRLLGRRPTL 0"
## [1] "TDFVDRSRQGFSTRYKIYREFGRWKVNNLAVERRNFLGSPLPLAPEFFRNIRLLGRRPTL 0"
## [1] "TDFVDRSRQGFSTRYKIYREFGRWKVNNLAVERRNFLGSPLPLAPEFFRNIRLLGRRPTL 0"
## [1] "TDFVDRSRQGFSTRYKIYREFGRWKVNNLAVERRNFLGSPLPLAPEFFRNIRLLGRRPTL 0"
## [1] "TDFVDRSRQGFTTRYKIYREFGRWKVNNLAVERRNFIGSPLPLAPEFFRNIRHLGRRPSL 0"
## [1] "------MPQAPSTGSSLIRDP--------SIAHRN------------------------- 0"
## [1] "------MPQAPSTGSSLIRDP--------SIAHRN------------------------- 0"
## [1] "------MPQAPSTGFSLIRDP--------SIAPQN------------------------- 0"
## [1] "------MPQAPSTGFSLIRDL--------SIAPRN------------------------- 0"
## [1] "------MPQAPSTGSSLIRDP--------SIAHRN------------------------- 0"
## [1] " "
## [1] "QQITENLIKKYGTHFLLSATLGGEESLTIFVDKRKLSKRPEGSETSTNSSSVTLETLHQL 0"
## [1] "QQITENLIKKYGTHFLLSATLGGEESLTIFVDKRKLSKRSEGSETSTNSSSVTLETLHQL 0"
## [1] "QQITENLIKKYGTHFLLSATLGGEESLTIFVDKRKLSKRAEGSDSSTNSSSVTLETLHQL 0"
## [1] "QQITENLIKKYGTHFLLSATLGGEESLTIFVDKRKLSKRSEGSDPSANSSLVTLETLHQL 0"
## [1] "QQITDNIIKKYGTHFLLSATLGGEESLTIFVDKRKLSKRSEATDSSGNSSSITLETLHQL 0"
## [1] "---TQILWTEAGRDLAQDTRYTGEESLTIFVDKRKLSKRAEGSDSTTNSSSVTLETLHQL 0"
## [1] "---TQILWTEAGRDLAQDTRYTGEESLTIFVDKRKLSKRAEGSDSSTNSSSVTLETLHQL 0"
## [1] "---IQILWTEADRDLAQDTRFTGEESLTIFVDKRKLSKRTETSDPSINSTTVTLETLHQL 0"
## [1] "---TQILWTEAGRDLAQDTRFTGEESLTIFVDKRKLSKRSETSDPSINSTTVNLETLHQL 0"
## [1] "---TLILWTEVDRDLAQDTRFTGEESLTIFVDKRKLSKRSEGNDPSANSSVVTLETLHQL 0"
## [1] " "
## [1] "AASYFIDRDSTLRRLHHIQIASTAIKVTETRTGPLGCSNYDNLDSVSSVLVQSPENKIQL 0"
## [1] "AASYFIDRDSTLRRLHHIQIASTAIKVTETRTGPLGCSNYDNLDSVSSVLVQSPENKIQL 0"
## [1] "AASYFIDRDSTLRRLHHIQIASTAIKVTETRTGPLGCSNYDNLDSVSSVLVQSPENKIQL 0"
## [1] "AASYFIDRDSTLRRLHHIQIASTAIKVTETRTGPLGCSNYDNLDSVSSVLVQSPENKIQL 0"
## [1] "AASYFIDRDSTLRRLHHIQIASTAIKVTETRTGPLGCSNYDNLDSVSSVLVQSPENKIHL 0"
## [1] "AASYFIDRDSTLRRLHHIQIASTAIKVTETRTGPLGCSNYDNLDSVSSVLVQSPENKIQL 0"
## [1] "AASYFIDRDSTLRRLHHIQIASTAIKVTETRTGPLGCSNYDNLDSVSSVLVQSPENKIQL 0"
## [1] "AASYFIDRDSTLRRLHHIQIASTAIKVTETRTGPLGCSNYDNLDSVSSVLVQSPENKIQL 0"
## [1] "AASYFIDRDSTLRRLHHIQIASTAIKVTETRTGPLGCSNYDNLDSVSSVLVQSPENKIQL 0"
## [1] "AASYFIDRESTLRRLHHIQIASTAIKVTETRTGPLGCSNYDNLDSVSSVLVQSPENKIQL 0"
## [1] " "
## [1] "QGLQVLLPDYLQERFVQAALSYIACNSEGEFICKENDCWCLCGPKFPECNCPSMDIQAME 0"
## [1] "QGLQVLLPDYLQERFVQAALSYIACNSEGEFICKDNDCWCHCGPKFPECNCPSMDIQAME 0"
## [1] "QGLQVLLPDYLQERFVQAALSYIACNSEGEFICKENDCWCHCGPKFPECNCPSMDIQAME 0"
## [1] "QGLQVLLPDYLQERFVQAALSYIACNSEGEFICKDNDCWCHCGPKFPECNCPSMDIQAME 0"
## [1] "QGLQVILPEYLQENFVQAALSYIACNSEGEFICKDNDCWCRCGPKFPDCNCPYIDIQAME 0"
## [1] "QGLQVLLPDYLQERFVQAALSYIACNSEGEFICKENDCWCHCGPKFPECNCPSMDIQAME 0"
## [1] "QGLQVLLPDYLQERFVQAALSYIACNSEGEFICKENDCWCHCGPKFPECNCPSMDIQAME 0"
## [1] "QGLQFILPEYLQERFVQAALSYIACNSEGEFICKDNDCWCQCGSKFPECNCPYMDIQAME 0"
## [1] "QGLQLILPEYLQERFVQAALSYIACNSEGEFICKDNDCWCQCGSKFPECNCPYMDIQAME 0"
## [1] "QGLQVILPEYLQEHFVQAALSYIACNSEGEFICKDNDCWCQCGPRFPECNCPYMDIQAME 0"
## [1] " "
## [1] "ENLLRITETWKAYNSDFEDSDEFKFFMKRLPMNYFLNTSTIMHLWTMDSNFQRRYEQLEN 0"
## [1] "ENLLRITETWKAYNSDFEDSDEFKFFMKRLPMNYFLNTSTIMHLWTMDSNFQRRYEQLEN 0"
## [1] "ENLLRITETWKAYNSDFEESDEFKLFMKRLPMNYFLNTSTIMHLWTMDSNFQRRYEQLEI 0"
## [1] "ENLLRITETWKAYNIDFEESDEFKFFMKRLPMNYFLNTSTIMHLWTMDSNFQRRYEQLEN 0"
## [1] "ENLVRISDTWNIYHKEFEESDEFKLFMKRLPVNYFLNTSTILHLWTMDSNFQRRYEQLEN 0"
## [1] "ENLLRITETWKAYNSDFEESDEFKLFMKRLPMNYFLNTSTIMHLWTMDSNFQRRYEQLEN 0"
## [1] "ENLLRITETWKAYNSDFEESDEFKLFMKRLPMNYFLNTSTIMHLWTMDSNFQRRYEQLEN 0"
## [1] "ESLLRISETWNTYNSDFEDSDEFKFFMKRLPMNYFLNTSAIMHLWTMDSNFQHRYVQLES 0"
## [1] "ESLLRISETWNTYNSDFEDSDEFKFFMKRLPMNYFLNTSAIMHLWTMDSNFQHRYEQLES 0"
## [1] "ENLLRISETWNAYNSEFEESDEFKLFMKRLPMNYFLNTSAVMHLWTMDSNFQRRYEQLES 0"
## [1] " "
## [1] "SMKQLFLKAHRIVHKLFSLSKRCHKQPLISLPRQRTSTYWLTRIQSFLYCNENGLLGSFS 0"
## [1] "SMKQLFLKAHRIVHKLFSLSKRCHKQPLISLPRQRTSTYWLTRIQSFLYCNENGLLGSFS 0"
## [1] "SMKQLFLKAQKIVHKLFSLSKRCHKQPLISLPRQRTSTYWLTRIQSFLYCNENGLLGSFS 0"
## [1] "SMKQLFLKAQKIVHKLFSLSKRCHKQPLISLPRQRTSTYWLTRIQSFLYCNENGLLGSFS 0"
## [1] "SMKQLYPKAQKIVHKLFSLSKRCHKQPLISLPRQRSSSYWLNRIQSFLYCNENGLLGSFS 0"
## [1] "SMKQLFLKAQKIVHKLFSLSKRCHKQPLISLPRQRTSTYWLTRIQSFLYCNENGLLGSFS 0"
## [1] "SMKQLFLKAQKIVHKLFSLSKRCHKQPLISLPRQRTSTYWLTRIQSFLYCNENGLLGSFS 0"
## [1] "SMKQLFLKAQKIVHKLFSLSKRCHKQPLISLPRQRSSLYWLNRIQSFLYCNENGLLGSFS 0"
## [1] "SMKQLFLKAQKIVHKLFSLSKRCHKQPFISLPRQRSSLYWLNRIQSFLYCNENGLLGSFS 0"
## [1] "SMKQLFLKAQRTVHKLFSLSKRCHKQPLLSLPRKKSSNYWLNRIQSFLYCNENGLLGSFS 0"
## [1] " "
## [1] "EETHSCTCPNDQVVCTAFLPCTVGDASACLTCAPDNRTRCGTCNTGYMLSQGLCKPEVAE 0"
## [1] "EETHSCTCPNDQVVCTAFLPCTVGDASACLTCAPDNRTRCGTCNTGYMLSQGLCKPEVAE 0"
## [1] "EETHSCTCPNDQVVCTAFLPCTVGDASACLTCAPDNRTRCGTCNTGYMLSQGLCKPEVAE 0"
## [1] "EETHSCTCPNDQVVCTAFLPCTVGDAAACLSCAPDNRTRCGTCNTGYMLSQGFCKPEVAE 0"
## [1] "EETHTCTCPNDQVSCQTNLPCVVGDGAACQTCAQDNRTRCGNCNTGYMLSQGFCKPEVAD 0"
## [1] "EETHSCTCPNDQVVCTAFLPCTVGDASACLTCAPDNRTRCGTCNTGYMLSQGLCKPEVAE 0"
## [1] "EETHSCTCPNDQVVCTAFLPCTVGDASACLTCAPDNRTRCGTCNTGYMLSQGLCKPEVAE 0"
## [1] "EETHSCTCPNDQLACLGSLPCSVGDGSACLTCAPDNRTRCGTCNTGYMLSQGLCKPEVAD 0"
## [1] "EETHSCTCPNDQLACLGSLPCSVGEGSACLTCAPDNRTRCGTCNTGYMLSQGICKPEVAD 0"
## [1] "EETHTCTCPNDQVACHGSLPCTLGDGPACLSCAPDNRTRCGSCNAGYMLSQGLCKPEVAD 0"
## [1] " "
## [1] "STDHYIGFETDLQDLEMKYLLQKTDRRIEVHAIFISNDMRLNSWFDPSWRKRMLLTLKSN 0"
## [1] "STDHYIGFETDLQDLEMKYLLQKTDRRIEVHAIFISNDMRLNSWFDPSWRKRMLLTLKSN 0"
## [1] "STDHYIGFETDLQDLEMKYLLQKTDRRIEVHAIFISNDMRLNSWFDPSWRKRMLLTLKSN 0"
## [1] "STDHYIGFETDLQDLEMKYLLQKTDRRLEVHAIFISNDMRLNSWFDPSWRKRMLLTLKSN 0"
## [1] "STEHYIGFETDMQDTEMKYLLQKNDRRIEVHAIFISNDMRLNSWFDPSWRKRMLLTLKSN 0"
## [1] "STDHYIGFETDLQDLEMKYLLQKTDRRIEVHAIFISNDMRLNSWFDPSWRKRMLLTLKSN 0"
## [1] "STDHYIGFETDLQDLEMKYLLQKTDRRIEVHAIFISNDMRLNSWFDPSWRKRMLLTLKSN 0"
## [1] "STEHYIGFETDLQDLEMKYLLQKTDRRIEVHAIFISNDMRLNSWFDPSWRKRMLLTLKSN 0"
## [1] "STEHYIGFETDLQDLEMKYLLQKADRRIEVHAIFISNDMRLNSWFDPSWRKRMLLTLKSN 0"
## [1] "STEHYIGFETDLQDLEMKYLLQKTDRRIEVHAIFISNDMRLNSWFDPSWRKRMLLTLKSN 0"
## [1] " "
## [1] "KYKSSLVHMILGLSLQICLTKNSTLEPVLAVYINPFGGSHSESWFMPVSESSFPDWERTK 0"
## [1] "KYKSSLVHMILGLSLQICLTKNSTLEPVLAVYVNPFGGSHSESWFMPVSESSFPDWERTK 0"
## [1] "KYKSSLVHMILGLSLQICLTKNSTLEPVLAVYVNPFGGSHSESWFMPVNENSFPDWERTK 0"
## [1] "KYKSSLVHMILGLSLQICLTKNSTLEPVLAVYINPFGGSHSESWFMPVNENSFPDWERTK 0"
## [1] "KYKANMVHMILGLSLQICLTKNSTLEPVLAVYVNPFGGSHSESWFMPVNENNFPDWERTK 0"
## [1] "KYKSSLVHMILGLSLQICLTKNSTLEPVLAVYVNPFGGSHSESWFMPVNENSFPDWERTK 0"
## [1] "KYKSSLVHMILGLSLQICLTKNSTLEPVLAVYVNPFGGSHSESWFMPVNENSFPDWERTK 0"
## [1] "KYKSSLVHMILGMSLQICLTKNSTLEPVLAVYINPFGGSHSESWFMPVNENNFPDWERTK 0"
## [1] "KYKSSLVHMILGISLQICLTKNSTLEPVLAVYINPFGGSHSESWFMPVNENNFPDWERTK 0"
## [1] "KYKSSLVHMILGLSLQICLTKNSTLEPVLAVYINPFGGSHSESWFMPVNENNFPDWERTK 0"
## [1] " "
## [1] "LDLPLQCYNWTLTLGNKWKTFFETVHIYLRSRIKANGPNSNESIYYEPLEFIDPSRNLGY 0"
## [1] "LDLPLQCYNWTLTLGNKWKTFFETVHIYLRSRIKANGPNSNESIYYEPLEFIDPSRNLGY 0"
## [1] "LDLPLQCYNWTLTLGNKWKTFFETVHIYLRSRIKSNGPNGNESIYYEPLEFIDPSRNLGY 0"
## [1] "LDLPLQCYNWSLTLGNKWKTFFETVHIYLRSRIKSNGPNGNESIYYEPLEFIDPSRNLGY 0"
## [1] "LDLPLQCYNWSLTLGNKWKTFFETVHIYLRSRIKSNGPNGNESIYYEPLEFLDPSRNLGY 0"
## [1] "LDLPLQCYNWTLTLGNKWKTFFETVHIYLRSRIKSNGPNGNESIYYEPLEFIDPSRNLGY 0"
## [1] "LDLPLQCYNWTLTLGNKWKTFFETVHIYLRSRIKSNGPNGNESIYYEPLEFIDPSRNLGY 0"
## [1] "LDLPLQCYNWTLTLGNKWKTFFETVHIYLRSRIKSNGPNGNDSIYYEPLEFIDPSRNLGY 0"
## [1] "LDLPLQCYNWTLTLGNKWKTFFETVHIYLRSRIKSNGPNGNDSIYYEPLEFIDPSRNLGY 0"
## [1] "LDLPLQCYNWTLTLGNKWKTFFETVHIYLRSRIKSNGPNGNESIYYEPLEFIDPSRNLGY 0"
## [1] " "
## [1] "MKINNIQVFGYSMHFDPEAIRDLILQLDYPYTQGSQDSALLQLLEIRDRVNKLSPPGQRR 0"
## [1] "MKINNIQVFGYSMHFDPEAIRDLILQLDYPYTQGSQDSALLQLLEIRDRVNKLSPPGQRR 0"
## [1] "MKINNIQVFGYSMHFDPEAIRDLILQLDYPYTQGSQDSALLQLLEIRDRVNKLSPPGQRR 0"
## [1] "MKINNIQVFGYSMHFDPEAIRDLILQLDYPYTQGSQDSALLQLLEIRDRVNKLSPPGQRR 0"
## [1] "MKINSIQVFGYSMHFDPEAIRDLILQLDYPYTQGSQDSALLQLLEIRDRVNKLSPPGQRR 0"
## [1] "MKINNIQVFGYSMHFDPEAIRDLILQLDYPYTQGSQDSALLQLLEIRDRVNKLSPPGQRR 0"
## [1] "MKINNIQVFGYSMHFDPEAIRDLILQLDYPYTQGSQDSALLQLLEIRDRVNKLSPPGQRR 0"
## [1] "MKINSIQVFGYSMHFDPEAIRDLILQLDYPYTQGSQDSALLQLLEIRDRVNNLSPPGQRR 0"
## [1] "MKINSIQVFGYSMHFDPEAIRDLILQLDYPYTQGSQDSALLQLLEIRDRVNKLSPPGQRR 0"
## [1] "MKINNIQVFGYSMHFDPEAIRDLILQLDYPYTQGSQDSALLQLLEIRDRVNKLSPPGQRR 0"
## [1] " "
## [1] "LDLFSCLLRHRLKLSTSEVVRIQSALQAFNAKLPNTVDYDTTKLCS 14"
## [1] "LDLFSCLLRHRLKLSTSEVVRIQSALQAFNAKLPNTVDYDTTKLCS 14"
## [1] "LDLFSCLLRHRLKLSTSEVVRIQSALQAFNAKLPNKMDYDTTKLCS 14"
## [1] "LDLFSCLLRHRLKLSTSEVVRIQSALQAFNAKLPNTVDYDTTKLCS 14"
## [1] "LDLFSCLLRHRLKLSTSEVVRILSALQAFNEKLPNTIEYETTKLCS 14"
## [1] "LDLFSCLLRHRLKLSTSEVVRIQSALQAFNAKLPNTMDYDTTKLCS 14"
## [1] "LDLFSCLLRHRLKLSTSEVVRIQSALQAFNAKLPNTMDYDTTKLCS 14"
## [1] "LDLFSCLLRHRLKLSTSEVMRIQSALQTFNAKLPNTMDYDTTKLCS 14"
## [1] "LDLFSCLLRHRLKLSTSEVMRIQSALQTFNAKLPNTVDYDTTKLCS 14"
## [1] "LDLFSCLLRHRLKLSTSEVVRIQSALQAFNAKLPNTVDYDTTKLCS 14"
## [1] " "
#First step very important: changing class of alignment; second step displays MSA
class(BRINP3_align) <- "AAMultipleAlignment"
ggmsa::ggmsa(BRINP3_align,
start = 125,
end = 175)
#Calculates genetic distance using MSA
BRINP3_dist <- seqinr::dist.alignment(BRINP3_align_seqinr,
matrix = "identity")
#Shows information about distances
is(BRINP3_dist)
## [1] "dist" "oldClass"
class(BRINP3_dist)
## [1] "dist"
BRINP3_align_seqinr_rnd <- round(BRINP3_dist, 3)
BRINP3_align_seqinr_rnd
## NP_775144.1 NP_001139279.1 NP_001248407.1 NP_001095610.1
## NP_001139279.1 0.072
## NP_001248407.1 0.157 0.153
## NP_001095610.1 0.177 0.169 0.149
## XP_017949134.1 0.337 0.331 0.317 0.317
## NP_001304117.1 0.263 0.260 0.226 0.263
## XP_016789866.1 0.260 0.257 0.223 0.260
## XP_026524943.1 0.351 0.351 0.343 0.347
## XP_015675031.1 0.349 0.347 0.345 0.341
## XP_041276047.1 0.347 0.345 0.338 0.332
## XP_017949134.1 NP_001304117.1 XP_016789866.1 XP_026524943.1
## NP_001139279.1
## NP_001248407.1
## NP_001095610.1
## XP_017949134.1
## NP_001304117.1 0.372
## XP_016789866.1 0.370 0.039
## XP_026524943.1 0.388 0.274 0.272
## XP_015675031.1 0.382 0.280 0.277 0.150
## XP_041276047.1 0.374 0.269 0.266 0.266
## XP_015675031.1
## NP_001139279.1
## NP_001248407.1
## NP_001095610.1
## XP_017949134.1
## NP_001304117.1
## XP_016789866.1
## XP_026524943.1
## XP_015675031.1
## XP_041276047.1 0.269
# Function that uses genetic distances to cluster sequences into clades
tree <- nj(BRINP3_dist)
# Plots phylogenetic tree and adds a label
plot.phylo(tree, main="Phylogenetic Tree",
use.edge.length = F)
mtext(text = "BRINP3 family gene tree - rooted, no branch lengths")