Introduction

This code contains summary information about the tyr gene.

It also builds an MSA and phylogenetic tree to show the evolutionary relationship between the human version of this gene and its homologs in other species.

Resources / References

Key information use to make this script can be found here:

Refseq Gene: https://www.ncbi.nlm.nih.gov/gene/339479 Refseq Homologene: https://www.ncbi.nlm.nih.gov/homologene/17786

Other resources consulted includes:

Neanderthal genome: http://neandertal.ensemblgenomes.org/index.html

Other interesting resources and online tools include:

REPPER: https://toolkit.tuebingen.mpg.de/jobs/7803820 Sub-cellular locations prediction: https://wolfpsort.hgc.jp/

Preparation

Load necessary packages:

Download and load drawProteins from Bioconductor

library(BiocManager)
#install("drawProteins")
library(drawProteins)

Load other packages:

# github packages
library(compbio4all)
library(ggmsa)

# CRAN packages
library(rentrez)
library(seqinr)
library(ape)
library(pander)


library(ggplot2)


library(msa)
library(drawProteins)

# Biostrings
library(Biostrings)


library(HGNChelper)

Accession numbers

Accession numbers were obtained from RefSeq, Refseq Homlogene, UniProt and PDB. UniProt accession numbers can be found by searching for the gene name. PDB accessions can be found by searching with a UniProt accession or a gene name, though many proteins are not in PDB. The Neanderthal genome database was searched but did not yield sequence information on tyr.

A protein BLAST search (https://blast.ncbi.nlm.nih.gov/Blast.cgi?PROGRAM=blastp&PAGE_TYPE=BlastSearch&LINK_LOC=blasthome) was carried out excluding vertebrates to determine if it occurred outside of vertebreates. The gene does not appear in non-vertebrates and so a second search was conducted to exclude mammals.

Accession number table

Not available:

Neanderthal

Does not occur:

Outside of vertebrates

#Creating table

tyr_table <- c(
"NP_000363.1","P14679","NA","Homo sapiens","human","tyr",
"NP_035791.1", "P11344","NA","Mus musculus","house mouse", "tyr",
"NP_571088.3","NA","NA","Danio rerio","zebrafish", "tyr",   
"NP_001101005.1","D4A9G4","NA","Rattus norvegicus","Norway rat","tyr",
"NP_989491.1", "P55024", "NA", "Gallus gallus","chicken","tyr",   
"XP_003992691.2","NA","NA","Felis catus","domestic cat", "tyr",
"XP_001136041.2","NA","NA","Pan troglodytes","chimpanzee","tyr",
"XP_006075241.1","NA","NA","Bubalus bubalis","water buffalo","tyr", 
"XP_003468651.1", "A0A6J1U8X2", "NA", "Cavia porcellus",  "domestic guinea pig", "tyr",
"XP_001492610.4","NA","NA","Equus caballus","horse","tyr")
# Converts all tyr table information into matrix format

tyr_table_matrix <- matrix(tyr_table,
                                  byrow = T,
                                  nrow = 10)

tyr_table <- data.frame(tyr_table_matrix, 
                     stringsAsFactors = F)

names(tyr_table) <- c("ncbi.protein.accession", "UniProt.id","PDB", "species", "common.name", "gene.name")
#Printing table

pander(tyr_table)
Table continues below
ncbi.protein.accession UniProt.id PDB species
NP_000363.1 P14679 NA Homo sapiens
NP_035791.1 P11344 NA Mus musculus
NP_571088.3 NA NA Danio rerio
NP_001101005.1 D4A9G4 NA Rattus norvegicus
NP_989491.1 P55024 NA Gallus gallus
XP_003992691.2 NA NA Felis catus
XP_001136041.2 NA NA Pan troglodytes
XP_006075241.1 NA NA Bubalus bubalis
XP_003468651.1 A0A6J1U8X2 NA Cavia porcellus
XP_001492610.4 NA NA Equus caballus
common.name gene.name
human tyr
house mouse tyr
zebrafish tyr
Norway rat tyr
chicken tyr
domestic cat tyr
chimpanzee tyr
water buffalo tyr
domestic guinea pig tyr
horse tyr

Data Preparation

#Downloading all FASTA sequences using compbio4all::entrez_fetch_list() which uses rentrez::entrez_fetch() to access NCBI databases.

tyr <- rentrez::entrez_fetch(db = "protein", 
                          id = tyr_table$ncbi.protein.accession, 
                          rettype = "fasta")
#This assigns data to the variable tyr_list

tyr_list <- entrez_fetch_list(db = "protein", 
                          id = tyr_table$ncbi.protein.accession, 
                          rettype = "fasta")
#Number of FASTA files

length(tyr_list)
## [1] 10
#First entry

tyr_list[[1]]
## [1] ">NP_000363.1 tyrosinase precursor [Homo sapiens]\nMLLAVLYCLLWSFQTSAGHFPRACVSSKNLMEKECCPPWSGDRSPCGQLSGRGSCQNILLSNAPLGPQFP\nFTGVDDRESWPSVFYNRTCQCSGNFMGFNCGNCKFGFWGPNCTERRLLVRRNIFDLSAPEKDKFFAYLTL\nAKHTISSDYVIPIGTYGQMKNGSTPMFNDINIYDLFVWMHYYVSMDALLGGSEIWRDIDFAHEAPAFLPW\nHRLFLLRWEQEIQKLTGDENFTIPYWDWRDAEKCDICTDEYMGGQHPTNPNLLSPASFFSSWQIVCSRLE\nEYNSHQSLCNGTPEGPLRRNPGNHDKSRTPRLPSSADVEFCLSLTQYESGSMDKAANFSFRNTLEGFASP\nLTGIADASQSSMHNALHIYMNGTMSQVQGSANDPIFLLHHAFVDSIFEQWLRRHRPLQEVYPEANAPIGH\nNRESYMVPFIPLYRNGDFFISSKDLGYDYSYLQDSDPDSFQDYIKSYLEQASRIWSWLLGAAMVGAVLTA\nLLAGLVSLLCRHKRKQLPEEKQPLLMEKEDYHSLYQSHL\n\n"

Initial data cleaning

#This removes the FASTA header, but additional cleaning steps will be required for particular analyses

for(i in 1:length(tyr_list)){
  tyr_list[[i]] <- compbio4all::fasta_cleaner(tyr_list[[i]], parse = F)
}

General protein information

#Downloads from uniprot

P14679_tyr  <- drawProteins::get_features("P14679")
## [1] "Download has worked"
is(P14679_tyr)
## [1] "list"             "vector"           "list_OR_List"     "vector_OR_Vector"
## [5] "vector_OR_factor"
#Create dataframe

huUni_df <- drawProteins::feature_to_dataframe(P14679_tyr)
is(huUni_df)
## [1] "data.frame"       "list"             "oldClass"         "vector"          
## [5] "list_OR_List"     "vector_OR_Vector" "vector_OR_factor"
#View dataframe

huUni_df[,-2]
##                      type begin end length accession  entryName taxid order
## featuresTemp       SIGNAL     1  18     17    P14679 TYRO_HUMAN  9606     1
## featuresTemp.1      CHAIN    19 529    510    P14679 TYRO_HUMAN  9606     1
## featuresTemp.2   TOPO_DOM    19 476    457    P14679 TYRO_HUMAN  9606     1
## featuresTemp.3   TRANSMEM   477 497     20    P14679 TYRO_HUMAN  9606     1
## featuresTemp.4   TOPO_DOM   498 529     31    P14679 TYRO_HUMAN  9606     1
## featuresTemp.5     REGION   287 313     26    P14679 TYRO_HUMAN  9606     1
## featuresTemp.6      METAL   180 180      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.7      METAL   202 202      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.8      METAL   211 211      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.9      METAL   363 363      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.10     METAL   367 367      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.11     METAL   390 390      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.12  CARBOHYD    86  86      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.13  CARBOHYD   111 111      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.14  CARBOHYD   161 161      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.15  CARBOHYD   230 230      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.16  CARBOHYD   337 337      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.17  CARBOHYD   371 371      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.18   VAR_SEQ   346 377     31    P14679 TYRO_HUMAN  9606     1
## featuresTemp.19   VAR_SEQ   378 529    151    P14679 TYRO_HUMAN  9606     1
## featuresTemp.20   VARIANT    19  19      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.21   VARIANT    21  21      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.22   VARIANT    36  36      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.23   VARIANT    42  42      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.24   VARIANT    44  44      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.25   VARIANT    44  44      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.26   VARIANT    47  47      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.27   VARIANT    47  47      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.28   VARIANT    50  50      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.29   VARIANT    52  52      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.30   VARIANT    55  55      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.31   VARIANT    68  68      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.32   VARIANT    77  77      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.33   VARIANT    77  77      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.34   VARIANT    77  77      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.35   VARIANT    79  79      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.36   VARIANT    80  80      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.37   VARIANT    81  81      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.38   VARIANT    89  89      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.39   VARIANT    91  91      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.40   VARIANT    97  97      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.41   VARIANT   109 109      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.42   VARIANT   134 134      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.43   VARIANT   142 142      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.44   VARIANT   152 152      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.45   VARIANT   155 155      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.46   VARIANT   176 176      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.47   VARIANT   177 177      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.48   VARIANT   179 179      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.49   VARIANT   180 180      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.50   VARIANT   192 192      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.51   VARIANT   198 198      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.52   VARIANT   199 199      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.53   VARIANT   201 201      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.54   VARIANT   205 205      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.55   VARIANT   206 206      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.56   VARIANT   216 216      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.57   VARIANT   217 217      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.58   VARIANT   217 217      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.59   VARIANT   217 217      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.60   VARIANT   217 217      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.61   VARIANT   217 217      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.62   VARIANT   227 227      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.63   VARIANT   236 236      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.64   VARIANT   236 236      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.65   VARIANT   239 239      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.66   VARIANT   240 240      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.67   VARIANT   243 243      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.68   VARIANT   253 253      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.69   VARIANT   256 256      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.70   VARIANT   272 272      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.71   VARIANT   275 275      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.72   VARIANT   288 288      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.73   VARIANT   289 289      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.74   VARIANT   289 289      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.75   VARIANT   294 294      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.76   VARIANT   294 294      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.77   VARIANT   298 298      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.78   VARIANT   299 299      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.79   VARIANT   299 299      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.80   VARIANT   308 308      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.81   VARIANT   312 312      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.82   VARIANT   313 313      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.83   VARIANT   318 318      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.84   VARIANT   325 325      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.85   VARIANT   328 328      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.86   VARIANT   329 329      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.87   VARIANT   332 332      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.88   VARIANT   339 339      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.89   VARIANT   340 340      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.90   VARIANT   345 345      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.91   VARIANT   346 346      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.92   VARIANT   355 355      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.93   VARIANT   355 355      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.94   VARIANT   355 355      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.95   VARIANT   361 361      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.96   VARIANT   364 364      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.97   VARIANT   367 367      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.98   VARIANT   370 370      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.99   VARIANT   371 371      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.100  VARIANT   371 371      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.101  VARIANT   373 373      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.102  VARIANT   378 378      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.103  VARIANT   380 380      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.104  VARIANT   382 382      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.105  VARIANT   383 383      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.106  VARIANT   384 384      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.107  VARIANT   390 390      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.108  VARIANT   393 393      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.109  VARIANT   395 395      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.110  VARIANT   395 395      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.111  VARIANT   398 398      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.112  VARIANT   398 398      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.113  VARIANT   400 400      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.114  VARIANT   402 402      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.115  VARIANT   402 402      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.116  VARIANT   402 402      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.117  VARIANT   403 403      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.118  VARIANT   404 404      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.119  VARIANT   404 404      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.120  VARIANT   405 405      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.121  VARIANT   406 406      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.122  VARIANT   408 408      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.123  VARIANT   409 409      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.124  VARIANT   416 416      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.125  VARIANT   417 417      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.126  VARIANT   419 419      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.127  VARIANT   422 422      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.128  VARIANT   424 424      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.129  VARIANT   426 426      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.130  VARIANT   427 427      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.131  VARIANT   431 431      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.132  VARIANT   434 434      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.133  VARIANT   435 435      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.134  VARIANT   439 439      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.135  VARIANT   444 444      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.136  VARIANT   446 446      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.137  VARIANT   448 448      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.138  VARIANT   490 490      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.139 CONFLICT    42  45      3    P14679 TYRO_HUMAN  9606     1
## featuresTemp.140 CONFLICT   179 179      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.141 CONFLICT   373 378      5    P14679 TYRO_HUMAN  9606     1
## featuresTemp.142 CONFLICT   495 495      0    P14679 TYRO_HUMAN  9606     1
## featuresTemp.143 CONFLICT   520 523      3    P14679 TYRO_HUMAN  9606     1
## featuresTemp.144 CONFLICT   525 528      3    P14679 TYRO_HUMAN  9606     1

Protein diagram

#This protein has one domain

my_canvas <- draw_canvas(huUni_df)  
my_canvas <- draw_chains(my_canvas, huUni_df, 
                         label_size = 2.5)
my_canvas <- draw_domains(my_canvas, huUni_df)
my_canvas

Dotplot

#Creates vector

hutyr_vector <- compbio4all:: fasta_cleaner(tyr_list[[1]])
#Defult dotplot

seqinr::dotPlot(hutyr_vector,
                hutyr_vector)

#2x2 panel showing different values for dotplot settings

par(mfrow = c(2,2), 
    mar = c(0,0,2,1))

#Plot 1: tyr - Defaults
dotPlot(hutyr_vector, 
        hutyr_vector, 
        wsize = 1, 
        nmatch = 1, 
        main = "tyr Defaults")

#Plot 2: tyr - size = 10, nmatch = 1
dotPlot(hutyr_vector, hutyr_vector, 
        wsize = 10, 
        nmatch = 1, 
        main = "tyr - size = 10, nmatch = 1")

#Plot 3: tyr - size = 10, nmatch = 5
dotPlot(hutyr_vector, hutyr_vector, 
        wsize = 10, 
        nmatch = 5, 
        main = "tyr - size = 10, nmatch = 5")

#Plot 4: tyr size = 20, nmatch = 5
dotPlot(hutyr_vector, hutyr_vector, 
        wsize = 20,
        nmatch =5,
        main = "tyr - size = 20, nmatch = 5")

#Best plot using normal dotplot

par(mfrow = c(1,1), 
    mar = c(4,4,4,4))

dotPlot(hutyr_vector, 
        hutyr_vector,
        wsize = 10, 
        nmatch = 5, 
        main = "tyr window = 10, nmatch = 5")

Protein properties compiled from databases

databases <- c("Pfam", "Disprot", "RepeatsDB", "Uniprot", "PDB")

info <- c("The tyr gene has two domains: MACPF and BRINP", "NA", "NA", "The subcellular location is the mitochondrion", "NA")

databasedf <- data.frame(databases,
                         info)
pander(databasedf)
databases info
Pfam The tyr gene has two domains: MACPF and BRINP
Disprot NA
RepeatsDB NA
Uniprot The subcellular location is the mitochondrion
PDB NA

Protein feature prediction

Multivariate statistical techniques were used to confirm the information about protein structure and location in the online database.

Uniprot (which uses http://www.csbio.sjtu.edu.cn/bioinf/Cell-PLoc-2/) indicates that the protein is a secreted: found outside of the cell membrane.

Predict protein fold

#Make 2 vectors of amino acids and check to make sure they are the same length and contain the same valiues

aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
            "L","K","M","F","P","S","T","W","Y","V")

aa.1.2 <- c("A","R","N","D","C","Q","E","G","H","I",
            "L","K","M","F","P","S","T","W","Y","V")

length(aa.1.1) == length(aa.1.2)
## [1] TRUE
aa.1.1 == aa.1.2
##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE
# Shows how many unique values are present and that they both have the same number of unique values

unique(aa.1.1)
##  [1] "A" "R" "N" "D" "C" "Q" "E" "G" "H" "I" "L" "K" "M" "F" "P" "S" "T" "W" "Y"
## [20] "V"
length(unique(aa.1.1)) == length(unique(aa.1.2))
## [1] TRUE
#Creates vector of the frequency of each amino acid in the database of proteins of each type used by Chou, and checks against Chou's total

alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91, 
           221, 249, 48, 123, 82, 122, 119, 33, 63, 167)

sum(alpha) == 2447
## [1] TRUE
#Does same thing with beta proteins

beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120, 
          177, 115, 16, 85, 127, 341, 253, 44, 110, 229)

sum(beta) == 2776
## [1] TRUE
#Does same thing with alpha plus beta proteins

a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
              110, 112, 25, 52, 71, 126, 117, 30, 108, 123)

sum(a.plus.b) == 1889
## [1] TRUE
#Does same thing with alpha divided by beta proteins

a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239, 
             339, 321, 91, 158, 188, 327, 238, 72, 130, 378)

sum(a.div.b) == 4333
## [1] TRUE
#Creates dataframe of data

czdf <- data.frame(aa.1.1, alpha, beta, a.plus.b, a.div.b)
#Prints dataframe

pander(czdf)
aa.1.1 alpha beta a.plus.b a.div.b
A 285 203 175 361
R 53 67 78 146
N 97 139 120 183
D 163 121 111 244
C 22 75 74 63
Q 67 122 74 114
E 134 86 86 257
G 197 297 171 377
H 111 49 33 107
I 91 120 93 239
L 221 177 110 339
K 249 115 112 321
M 48 16 25 91
F 123 85 52 158
P 82 127 71 188
S 122 341 126 327
T 119 253 117 238
W 33 44 30 72
Y 63 110 108 130
V 167 229 123 378
#Calculates proportions for each of the four protein fold types

alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)
#Create dataframe and row labels

aa.prop <- data.frame(alpha.prop,
                      beta.prop,
                      a.plus.b.prop,
                      a.div.b)

row.names(aa.prop) <- aa.1.1
#Prints dataframe

pander(aa.prop)
  alpha.prop beta.prop a.plus.b.prop a.div.b
A 0.1165 0.07313 0.09264 0.08331
R 0.02166 0.02414 0.04129 0.03369
N 0.03964 0.05007 0.06353 0.04223
D 0.06661 0.04359 0.05876 0.05631
C 0.008991 0.02702 0.03917 0.01454
Q 0.02738 0.04395 0.03917 0.02631
E 0.05476 0.03098 0.04553 0.05931
G 0.08051 0.107 0.09052 0.08701
H 0.04536 0.01765 0.01747 0.02469
I 0.03719 0.04323 0.04923 0.05516
L 0.09031 0.06376 0.05823 0.07824
K 0.1018 0.04143 0.05929 0.07408
M 0.01962 0.005764 0.01323 0.021
F 0.05027 0.03062 0.02753 0.03646
P 0.03351 0.04575 0.03759 0.04339
S 0.04986 0.1228 0.0667 0.07547
T 0.04863 0.09114 0.06194 0.05493
W 0.01349 0.01585 0.01588 0.01662
Y 0.02575 0.03963 0.05717 0.03
V 0.06825 0.08249 0.06511 0.08724
#Detemines number of each amino acid in protein 

table(hutyr_vector)
## hutyr_vector
##  A  C  D  E  F  G  H  I  K  L  M  N  P  Q  R  S  T  V  W  Y 
## 30 17 30 27 31 34 17 22 17 55 15 27 33 23 27 51 19 17 14 23
#A function to convert a table into a vector

table_to_vector <- function(table_x){
  table_names <- attr(table_x, "dimnames")[[1]]
  table_vect <- as.vector(table_x)
  names(table_vect) <- table_names
  return(table_vect)
}
#Creates a table of amino acid frequences, and converts this to a vector

tyr_human_table <- table(hutyr_vector)/length(hutyr_vector)
tyr.human.aa.freq <- table_to_vector(tyr_human_table)
tyr.human.aa.freq
##          A          C          D          E          F          G          H 
## 0.05671078 0.03213611 0.05671078 0.05103970 0.05860113 0.06427221 0.03213611 
##          I          K          L          M          N          P          Q 
## 0.04158790 0.03213611 0.10396975 0.02835539 0.05103970 0.06238185 0.04347826 
##          R          S          T          V          W          Y 
## 0.05103970 0.09640832 0.03591682 0.03213611 0.02646503 0.04347826
#Checks for the presence of U, an unknown amino acid

aa.names <- names(tyr.human.aa.freq)
any(aa.names == "U")
## [1] FALSE
#Previous test returns "False", indicating no "U" present; can add protein to frequency table

aa.prop$tyr.human.aa.freq <- tyr.human.aa.freq
pander::pander(aa.prop)
  alpha.prop beta.prop a.plus.b.prop a.div.b tyr.human.aa.freq
A 0.1165 0.07313 0.09264 0.08331 0.05671
R 0.02166 0.02414 0.04129 0.03369 0.03214
N 0.03964 0.05007 0.06353 0.04223 0.05671
D 0.06661 0.04359 0.05876 0.05631 0.05104
C 0.008991 0.02702 0.03917 0.01454 0.0586
Q 0.02738 0.04395 0.03917 0.02631 0.06427
E 0.05476 0.03098 0.04553 0.05931 0.03214
G 0.08051 0.107 0.09052 0.08701 0.04159
H 0.04536 0.01765 0.01747 0.02469 0.03214
I 0.03719 0.04323 0.04923 0.05516 0.104
L 0.09031 0.06376 0.05823 0.07824 0.02836
K 0.1018 0.04143 0.05929 0.07408 0.05104
M 0.01962 0.005764 0.01323 0.021 0.06238
F 0.05027 0.03062 0.02753 0.03646 0.04348
P 0.03351 0.04575 0.03759 0.04339 0.05104
S 0.04986 0.1228 0.0667 0.07547 0.09641
T 0.04863 0.09114 0.06194 0.05493 0.03592
W 0.01349 0.01585 0.01588 0.01662 0.03214
Y 0.02575 0.03963 0.05717 0.03 0.02647
V 0.06825 0.08249 0.06511 0.08724 0.04348

Functions to calculate similarities

#Creates two custom functions: the first one calculates correlates between two columns of our table, and the second one to calculates correlation similarities

chou_cor <- function(x,y){
  numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)
}

chou_cosine <- function(z.1, z.2){
  z.1.abs <- sqrt(sum(z.1^2))
  z.2.abs <- sqrt(sum(z.2^2))
  my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs)
  return(my.cosine)
}
#Calculates and displays correlation between each column

corr.alpha <- chou_cor(aa.prop[,5], aa.prop[,1])
corr.beta  <- chou_cor(aa.prop[,5], aa.prop[,2])
corr.apb   <- chou_cor(aa.prop[,5], aa.prop[,3])
corr.adb   <- chou_cor(aa.prop[,5], aa.prop[,4])

corr.alpha
## [1] 0.7890408
corr.beta
## [1] 0.8411267
corr.apb
## [1] 0.8695129
corr.adb
## [1] 0.8551108
#Calculates and displays cosine similarity

cos.alpha <- chou_cosine(aa.prop[,5], aa.prop[,1])
cos.beta  <- chou_cosine(aa.prop[,5], aa.prop[,2])
cos.apb   <- chou_cosine(aa.prop[,5], aa.prop[,3])
cos.adb   <- chou_cosine(aa.prop[,5], aa.prop[,4])

cos.alpha
## [1] 0.7890408
cos.beta
## [1] 0.8411267
cos.apb
## [1] 0.8695129
cos.adb
## [1] 0.8551108
#Calculates Euclidean distance

aa.prop.flipped <- t(aa.prop)
round(aa.prop.flipped,2)
##                      A    R    N    D    C    Q    E    G    H    I    L    K
## alpha.prop        0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09 0.10
## beta.prop         0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06 0.04
## a.plus.b.prop     0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06 0.06
## a.div.b           0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08 0.07
## tyr.human.aa.freq 0.06 0.03 0.06 0.05 0.06 0.06 0.03 0.04 0.03 0.10 0.03 0.05
##                      M    F    P    S    T    W    Y    V
## alpha.prop        0.02 0.05 0.03 0.05 0.05 0.01 0.03 0.07
## beta.prop         0.01 0.03 0.05 0.12 0.09 0.02 0.04 0.08
## a.plus.b.prop     0.01 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## a.div.b           0.02 0.04 0.04 0.08 0.05 0.02 0.03 0.09
## tyr.human.aa.freq 0.06 0.04 0.05 0.10 0.04 0.03 0.03 0.04
#Calculates distance matrix

dist(aa.prop.flipped, method = "euclidean")
##                   alpha.prop  beta.prop a.plus.b.prop    a.div.b
## beta.prop         0.13342098                                    
## a.plus.b.prop     0.09281824 0.08289406                         
## a.div.b           0.06699039 0.08659174    0.06175113           
## tyr.human.aa.freq 0.16304038 0.14309134    0.12377392 0.13182124
#Calculates and displays individual distances

dist.alpha <- dist((aa.prop.flipped[c(1,5),]),  method = "euclidean")
dist.beta  <- dist((aa.prop.flipped[c(2,5),]),  method = "euclidean")
dist.apb   <- dist((aa.prop.flipped[c(3,5),]),  method = "euclidean")
dist.adb  <- dist((aa.prop.flipped[c(4,5),]), method = "euclidean")

dist.alpha
##                   alpha.prop
## tyr.human.aa.freq  0.1630404
dist.beta
##                   beta.prop
## tyr.human.aa.freq 0.1430913
dist.apb
##                   a.plus.b.prop
## tyr.human.aa.freq     0.1237739
dist.adb
##                     a.div.b
## tyr.human.aa.freq 0.1318212
#Compiles information and rounds it, making it easier to read

fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")

corr.sim <- round(c(corr.alpha,corr.beta,corr.apb,corr.adb),5)
cosine.sim <- round(c(cos.alpha,cos.beta,cos.apb,cos.adb),5)
Euclidean.dist <- round(c(dist.alpha,dist.beta,dist.apb,dist.adb),5)

sim.sum <- c("","","most.sim","")
dist.sum <- c("","","min.dist","")

df2 <- data.frame(fold.type,
           corr.sim ,
           cosine.sim ,
           Euclidean.dist ,
           sim.sum ,
           dist.sum )
pander(df2)
fold.type corr.sim cosine.sim Euclidean.dist sim.sum dist.sum
alpha 0.789 0.789 0.163
beta 0.8411 0.8411 0.1431
alpha plus beta 0.8695 0.8695 0.1238 most.sim min.dist
alpha/beta 0.8551 0.8551 0.1318

Percent Identity Comparisons (PID)

Data Preparation

#Information about list formed at beginning of workflow

names(tyr_list)
##  [1] "NP_000363.1"    "NP_035791.1"    "NP_571088.3"    "NP_001101005.1"
##  [5] "NP_989491.1"    "XP_003992691.2" "XP_001136041.2" "XP_006075241.1"
##  [9] "XP_003468651.1" "XP_001492610.4"
length (tyr_list)
## [1] 10
tyr_list[[1]]
## [1] "MLLAVLYCLLWSFQTSAGHFPRACVSSKNLMEKECCPPWSGDRSPCGQLSGRGSCQNILLSNAPLGPQFPFTGVDDRESWPSVFYNRTCQCSGNFMGFNCGNCKFGFWGPNCTERRLLVRRNIFDLSAPEKDKFFAYLTLAKHTISSDYVIPIGTYGQMKNGSTPMFNDINIYDLFVWMHYYVSMDALLGGSEIWRDIDFAHEAPAFLPWHRLFLLRWEQEIQKLTGDENFTIPYWDWRDAEKCDICTDEYMGGQHPTNPNLLSPASFFSSWQIVCSRLEEYNSHQSLCNGTPEGPLRRNPGNHDKSRTPRLPSSADVEFCLSLTQYESGSMDKAANFSFRNTLEGFASPLTGIADASQSSMHNALHIYMNGTMSQVQGSANDPIFLLHHAFVDSIFEQWLRRHRPLQEVYPEANAPIGHNRESYMVPFIPLYRNGDFFISSKDLGYDYSYLQDSDPDSFQDYIKSYLEQASRIWSWLLGAAMVGAVLTALLAGLVSLLCRHKRKQLPEEKQPLLMEKEDYHSLYQSHL"
#Creates and prints empty vector

tyr_vector <- rep(NA, length(tyr_list))
tyr_vector
##  [1] NA NA NA NA NA NA NA NA NA NA
#Populates the vector

for(i in 1:length(tyr_vector)){
  tyr_vector[i] <- tyr_list[[i]]
}
#Names the vector

names(tyr_vector) <- names(tyr_list)
#Turns sequences into vectors

humantyr  <- fasta_cleaner(tyr_list[[1]], parse = F)
chimpanzeetyr <- fasta_cleaner(tyr_list[[7]], parse = F)
housemousetyr  <- fasta_cleaner(tyr_list[[2]], parse = F)
buffalotyr   <- fasta_cleaner(tyr_list[[8]], parse = F)
#Performs global pairwise alignments on sequences

align.human.vs.chimp <- Biostrings::pairwiseAlignment(
                  humantyr,
                  chimpanzeetyr)
align.human.vs.housemouse <- Biostrings::pairwiseAlignment(
                  humantyr,
                  housemousetyr)
align.human.vs.buffalo <- Biostrings::pairwiseAlignment(
                  humantyr,
                  buffalotyr)

PID Table

#Prints percent sequence alignment   

Biostrings::pid(align.human.vs.chimp)
## [1] 94.37387
Biostrings::pid(align.human.vs.housemouse)
## [1] 85.74109
Biostrings::pid(align.human.vs.buffalo)
## [1] 87.16981
#Global pairwise alignments for matrix

align.chimp.vs.housemouse <- Biostrings::pairwiseAlignment(
                  chimpanzeetyr,
                  housemousetyr)

align.chimp.vs.buffalo <- Biostrings::pairwiseAlignment(
                  chimpanzeetyr,
                  buffalotyr)

align.housemouse.vs.buffalo <- Biostrings::pairwiseAlignment(
                  housemousetyr,
                  buffalotyr)
#Builds a matrix of PID

pids <- c(1,                  NA,     NA,     NA,
          pid(align.human.vs.chimp),          1,     NA,     NA,
          pid(align.human.vs.housemouse), pid(align.chimp.vs.housemouse),      1,     NA,
          pid(align.human.vs.buffalo), pid(align.chimp.vs.buffalo), pid(align.housemouse.vs.buffalo), 1)

mat <- matrix(pids, nrow = 4, byrow = T)
row.names(mat) <- c("Homo","Pan","Mus","Bos")   
colnames(mat) <- c("Homo","Pan","Mus","Bos")   
pander::pander(mat)  
  Homo Pan Mus Bos
Homo 1 NA NA NA
Pan 94.37 1 NA NA
Mus 85.74 81.23 1 NA
Bos 87.17 80.85 81.8 1

PID Methods Comparison

#Generates comparisons between humans and chimps using different methods

PID1 <- pid(align.human.vs.chimp, type = "PID1")
PID2 <- pid(align.human.vs.chimp, type = "PID2")
PID3 <- pid(align.human.vs.chimp, type = "PID3")
PID4 <- pid(align.human.vs.chimp, type = "PID4")
#Generates dataframe 

method <- c("PID1", "PID2", "PID3", "PID4")

PID <- c(PID1, PID2, PID3, PID4)

denominator <- c("(aligned positions + internal gap positions)", "(aligned positions)", "(length shorter sequence)", "(average length of the two sequences)")

PIDcompdf <- data.frame(method,
                        PID,
                        denominator)
pander(PIDcompdf)
method PID denominator
PID1 94.37 (aligned positions + internal gap positions)
PID2 98.3 (aligned positions)
PID3 98.3 (length shorter sequence)
PID4 94.55 (average length of the two sequences)

Multiple Sequence Alignment

Data preparation

#Converts vector into stringset

tyr_vector_ss <- Biostrings::AAStringSet(tyr_vector)

Building Multiple Sequence Alignment (MSA)

#Uses defult substitution matrix

tyr_align <- msa(tyr_vector_ss,
                     method = "ClustalW")
## use default substitution matrix

Cleaning / setting up an MSA

#Shows information about stringset

class(tyr_align)
## [1] "MsaAAMultipleAlignment"
## attr(,"package")
## [1] "msa"
is(tyr_align)
## [1] "MsaAAMultipleAlignment" "AAMultipleAlignment"    "MsaMetaData"           
## [4] "MultipleAlignment"
#Shows defult output of MSA

tyr_align
## CLUSTAL 2.1  
## 
## Call:
##    msa(tyr_vector_ss, method = "ClustalW")
## 
## MsaAAMultipleAlignment with 10 rows and 579 columns
##      aln                                                   names
##  [1] ----MLLAVLYCLLWSFQTSAGHFP...YQSHL-------------------- NP_000363.1
##  [2] ----MLLAALYCLLWSFQTSTGHFP...TQSNVQVPENICWYFSVKTICKIVT XP_001136041.2
##  [3] ----MLLAALYCLLWSFQTSSGHFP...YQSHL-------------------- XP_006075241.1
##  [4] MRGRMLLAALYCLLWSFQASVGQFP...YQSHL-------------------- XP_001492610.4
##  [5] ----MLLAALCCLLWSFRTSAGHFP...YQTHV-------------------- XP_003992691.2
##  [6] ----MFLAVLYCLLWSFQISDGHFP...YQSHL-------------------- NP_035791.1
##  [7] ----MFLAVLYYLLWSFQTSAGHFP...YQSHL-------------------- NP_001101005.1
##  [8] ----MLLAVLLCVLWHFQISAGHFP...HQSHL-------------------- XP_003468651.1
##  [9] ----MFLFAMGLLLVILQPSTGQFP...YQSHF-------------------- NP_989491.1
## [10] ---MSLHLLLFFFLQLFSSSLQQFP...YQTTL-------------------- NP_571088.3
##  Con ----MLLAALYCLLWSFQTS?GHFP...YQSHL-------------------- Consensus
#Changes class of alignment

class(tyr_align) <- "AAMultipleAlignment"
#Converts to seqinr format

tyr_align_seqinr <- msaConvert(tyr_align, 
                                   type = "seqinr::alignment")
#Shows information 

class(tyr_align_seqinr)
## [1] "alignment"
is(tyr_align_seqinr)
## [1] "alignment"
#Displays output

compbio4all::print_msa(tyr_align_seqinr)
## [1] "----MLLAVLYCLLWSFQTSAGHFPRACVSSKNLMEKECCPPWSGDRSPCGQLSGRGSCQ 0"
## [1] "----MLLAALYCLLWSFQTSTGHFPRACVSSKNLMEKECCPPWSGDRSPCGQLSGRGSCQ 0"
## [1] "----MLLAALYCLLWSFQTSSGHFPRACASSKSLMEKECCPPWAGDGSPCGRLSGRGSCQ 0"
## [1] "MRGRMLLAALYCLLWSFQASVGQFPRACASSKNLMEKECCPKWRGDRSPCGQLSGRGSCQ 0"
## [1] "----MLLAALCCLLWSFRTSAGHFPRACASSKSLMEKECCPAWTGDSSPCGQLSGRGACQ 0"
## [1] "----MFLAVLYCLLWSFQISDGHFPRACASSKNLLAKECCPPWMGDGSPCGQLSGRGSCQ 0"
## [1] "----MFLAVLYYLLWSFQTSAGHFPRDCASSKNLMAKECCPPWMGDGSPCGHLSGRGSCQ 0"
## [1] "----MLLAVLLCVLWHFQISAGHFPRACASSQNLIEKECCPPWIGDGSPCGQLSGRGSCQ 0"
## [1] "----MFLFAMGLLLVILQPSTGQFPRVCANTQSLLRKECCPPWDGDGTPCGERSNRGTCQ 0"
## [1] "---MSLHLLLFFFLQLFSSSLQQFPRVCTSPEVLQSKRCCPVWPGDGSVCGVQSGRGFCQ 0"
## [1] " "
## [1] "NILLSNAPLGPQFPFTGVDDRESWPSVFYNRTCQCSGNFMGFNCGNCKFGFWGPNCTERR 0"
## [1] "NILLSNAPLGPQFPFTGVDDRESWPSVFYNRTCQCSGNFMGFNCGNCKFGFWGPNCTERR 0"
## [1] "DVILSTAPLGPQFPFTGVDDRESWPSIFYNRTCQCFGNFMGFNCGSCKFGFRGPRCTERR 0"
## [1] "DVILSNAPSGPQFPFAGVDDRESWPSVFYNRTCQCFGNFMGFNCGDCKFGFGGRNCTERR 0"
## [1] "DITLSKAPLGPQYPFTGMDDREAWPSVFYNRTCQCFGNFMGFNCGNCKFGFWGPNCTEKR 0"
## [1] "DILLSSAPSGPQFPFKGVDDRESWPSVFYNRTCQCSGNFMGFNCGNCKFGFGGPNCTEKR 0"
## [1] "DILLSNAPSGPQFPFKGVDDRESWPSVFYNRTCQCSGNFMGFNCGNCKFGFGGPNCTEKR 0"
## [1] "DIFLSSAPFGPQFPFTGVDDRESWPSVFYNRTCQCSGNFMGFSCGNCKFGYWGPNCTQKR 0"
## [1] "RILLSQAPLGPQFPFSGVDDREDWPSVFYNRTCRCRGNFMGFNCGECKFGFSGQNCTERR 0"
## [1] "DVLVSDLPNGPQYPHSGVDDRERWPLVFYNQTCQCAGNYMGFDCGECKFGFFGANCAERR 0"
## [1] " "
## [1] "LLVRRNIFDLSAPEKDKFFAYLTLAKHTISSDYVIPIGTYGQMKNGSTPMFNDINIYDLF 0"
## [1] "LLVRRNIFDLSAPEKDKFFAYLTLAKHTISSDYVIPIGTYGQMKNGSTPMFNDINIYDLF 0"
## [1] "LLVRRNIFDLSVPEKNKFLAYLTLAKHTTSPDYVIPTGTYGQMNHGTTPLFNDVSVYDLF 0"
## [1] "LLVRRSIFDLSVPEKNKFLAYLTLAKHTTSPDYVIPIGTYGQMNNGSKPMFKDITIYDLF 0"
## [1] "LLVRRNIFDLSVPEKNKFLAYLTLAKHTISPDYVIPIGTYGQMNNGSTPMFNDINVYDLF 0"
## [1] "VLIRRNIFDLSVSEKNKFFSYLTLAKHTISSVYVIPTGTYGQMNNGSTPMFNDINIYDLF 0"
## [1] "LLIRRNIFDLSASEKNKFFSYLTLAKHTISSVYVIPTGTYGQMNNGSTPMFKDINIYDLF 0"
## [1] "ILVRRNIFDLSVPEKDKFLAYLTLAKHTISANYVIPTGTYGQMKNGSTPMFNDVSIYNLF 0"
## [1] "LRTRRNIFQLTISEKDKFLAYLNLAKNIPSKDYVIATGTYAQMNNGSNPMFRNINVYDLF 0"
## [1] "ESVRRNIFQLSTTERQRFISYLNLAKTTISPDYMIVTGTYAQMNNGSTPMFANISVYDLF 0"
## [1] " "
## [1] "VWMHYYVSMDALLGGS-EIWRDIDFAHEAPAFLPWHRLFLLRWEQEIQKLTGDENFTIPY 0"
## [1] "VWMHYYVSMDALLGGS-EIWRDIDFAHEAPAFLPWHRLFLLRWEQEIQKLTGDENFTIPY 0"
## [1] "VWMHYYVSRDTLLGDS-EVWRDIDFAHEAPGFLPWHRLFLLLWEQEIQKLTGDENFTIPY 0"
## [1] "VWMHYYASRDTLLGGS-EVWKNIDFAHEAPGFLPWHRVFLLLWEQEIQKLTGDENFTIPY 0"
## [1] "VWMHYYVSRDTLLGGS-EIWKDIDFAHEAPGFLPWHRLFLLLWEQEIQKLTGDENFTIPY 0"
## [1] "VWMHYYVSRDTLLGGS-EIWRDIDFAHEAPGFLPWHRLFLLLWEQEIRELTGDENFTVPY 0"
## [1] "VWMHYYVSRDTLLGGS-EIWRDIDFAHEAPGFLPWHRLFLLLWEQEMQELTGDENFTIPY 0"
## [1] "VWMHYYVSRDTLLGGS-EIWKDIDFAHEAPGFLPWHRLFLLLWEEEIRQLTGDENFTIPY 0"
## [1] "VWMHYYASRDTLLGGS-NVWRDIDFAHEAPGFLPWHRAFLLLWEREIQKITGDENFTIPY 0"
## [1] "VWMHYYVSRDALLGGPGNVWADIDFAHESAAFLPWHRVYLLFWEHEIRKLTGDFNFTIPY 0"
## [1] " "
## [1] "WDWRDAEKCDICTDEYMGGQHPTNPNLLSPASFFSSWQIVCSRLEEYNSHQSLCNGTPEG 0"
## [1] "WDWRDAEKCDICTDEYMGGQHPTNPNLLSPASFFSSWQIVCSRLEEYNSHQSLCNGTPEG 0"
## [1] "WDWRDAENCDVCTDEYMGGRNPANPNLLSPASFFSSWQIVCSRLEEYNSRQALCNGTSEG 0"
## [1] "WDWRDAENCDVCTDEYIGGRNPANPNLISPASVFSSWQIVCSRLDDYNSRQALCNGTPEG 0"
## [1] "WDWRDAKSCDICTDEYMGGHNPANPNLLSPASFFSSWQIICTRLEEYNSRQALCDGTPEG 0"
## [1] "WDWRDAENCDICTDEYLGGRHPENPNLLSPASFFSSWQIICSRSEEYNSHQVLCDGTPEG 0"
## [1] "WDWRDAENCDICTDEYLGGRHPENPNLLSPASFFSSWEIICSRSEEYNSHQVLCDGTPEG 0"
## [1] "WDWRDAENCEICTDEYMGGRHPTNPNLLSPASFFSSWQIVCSRIEEYNSRQVLCDGNPEG 0"
## [1] "WDWRDAEDCVICTDEYMGGQHPTNPNLLSPASFFSSWQVICTQSEEYNSQQALCNATSEG 0"
## [1] "WDWRDAQDCQVCTDELMGARSSLNRSLISPSSVFSSWKVICSQPESYNLREALCDGSPEG 0"
## [1] " "
## [1] "PLRRNPGNHDKSRTPRLPSSADVEFCLSLTQYESGSMDKAANFSFRNTLEGFASPLTGIA 0"
## [1] "PLRRNPGNHDKSRTPRLPSSADVEFCLSLTQYESGSMDKAANFSFRNTLEGFASPLTGIA 0"
## [1] "PLLRNPGNHDKARTPRLPSSADVQFCLSLTQYESGPMDKAANFSFRNTLEGFADPLTGIA 0"
## [1] "PLLRNPGNHDKARTPRLPSSEDVEFCLNLIHYDSGPMDKTANFSFRNTLEGFASPLTGIA 0"
## [1] "TITAQSRNHDKARTPRLPSSADVEFCLSLTQYESDSMDKAANFSFRNTLEGFASPLTGIA 0"
## [1] "PLLRNPGNHDKAKTPRLPSSADVEFCLSLTQYESGSMDRTANFSFRNTLEGFASPLTGIA 0"
## [1] "PLLRNPGNHDKAKTPRLPSSADVEFCLSLTQYESGSMDRTANFSFRNTLEGFASPLTGIA 0"
## [1] "PLLRNPGNHDKARTSRIPSSSDVELCLSLTQYDSGSMDKSANFSFRNTLEGFANPVSGIA 0"
## [1] "PILRNPGNNDKSRTPRLPSSSEVEFCLTLTQYESGSMDKMANYSFRNTLEGFADPHTAIS 0"
## [1] "PLLRNPGDHDRIRVPRLPTSADVESVLRLTDYETGQMDRRANLSFRNALEGFANPETGLA 0"
## [1] " "
## [1] "DASQSSMHNALHIYMNGTMSQVQGSANDPIFLLHHAFVDSIFEQWLRRHRPLQEVYPEAN 0"
## [1] "DASQSSMHNALHIYMNGTMSQVQGSANDPIFLLHHAFVDSIFEQWLRRHRPLQEVYPEAN 0"
## [1] "DASQSSMHNALHIYMNGTMSQVPGSANDPIFLLHHAFVDSIFEQWLRKYHPLQDVYPEAN 0"
## [1] "DAFRSSMHNGLHIYMNGTISQVQGSANDPLFLLHHAFVDSIFERWFRRYHPPQEVYPEAN 0"
## [1] "DASQSSMHNALHIYMNGTMSQVQGSANDPIFLLHHAFVDSIFEQWLRRHHPLQEVYPEAN 0"
## [1] "DPSQSSMHNALHIFMNGTMSQVQGSANDPIFLLHHAFVDSIFEQWLRRHRPLLEVYPEAN 0"
## [1] "DSSQSSMHNALHIYMNGTMSQVQGSANDPIFLLHHAFVDSIFEQWLRRHRPLLEVYPEAN 0"
## [1] "NDSQSSMHNALHIYMNGTMSQVQGSANDPIFLLHHAFVDSIFEQWLRRHRPLQEVYPEAN 0"
## [1] "NISQSGLHNALHIYMNGSMSQVQGSANDPIFILHHAFVDSIFERWLRRHRPMLEVYPAAN 0"
## [1] "VTGRSLMHNSLHVFMNGSMSSVQGSANDPIFIIHHAFIDSIFEQWLRRHQPPRTHYPTAN 0"
## [1] " "
## [1] "APIGHNRESYMVPFIPLYRNGDFFISSKDLGYDYSYLQDSDPDSFQDYIKSYLEQASRIW 0"
## [1] "APIGHNRESYMVPFIPLYRNGDFFISSKDLGYDYSYLQDSDPDSFQDYIKSYLEQASRIW 0"
## [1] "APIGHNRESYMVPFIPLYRNGDFFISSKDLGYDYSYLQDSEPDIFQDYIKPYLEQAQRIW 0"
## [1] "APIGHNRDFYMVPFIPLYRNGDFFISSRDLGYDYSYLQEPDPVFFQDYIKSYLEQASRIW 0"
## [1] "APIGHNRESYMVPFIPLYRNGDFFISSRDLGYDYSNLQDSERDIFQDYIKPFLEQASRIW 0"
## [1] "APIGHNRDSYMVPFIPLYRNGDFFITSKDLGYDYSYLQESDPGFYRNYIEPYLEQASRIW 0"
## [1] "APIGHNRESYMVPFIPLYRNGDFFISSKDLGYDYSYLQESDPGFYRNYIEPYLEQASRIW 0"
## [1] "APIGHNRDSYMVPFIPLYKNGDFFISSRDLGYDYSYLQDSDPDFFQTYFEPYLEQARQIW 0"
## [1] "APIGHNRENYMVPFIPLYRNGEFFISSRELGYDYEYLQEPALGSFQDFLIPYLKQAHQIW 0"
## [1] "APIGHNDGYFMVPFIPLYRNGDYFLSTKALGYEYAYLQDPGQRFVQEFLTPYLEQAQQIW 0"
## [1] " "
## [1] "SWLLGAAMVGAVLTALLAGLVSLLCRHKR---KQLPEEKQPLLMEK-------------- 0"
## [1] "SWLLGAAMVGAVLTALLAGLVSLLCRHKR---KQLPEEKQPLLMEKXGLPQLVSEPFIKG 0"
## [1] "PWLIGAAVVGSVLTAVLGGLTSLLCRRKR---NQLPEEKQPLLMEK-------------- 0"
## [1] "PWLLGAALLGSVLTAVLGGLTTLLCRRKR---GQLPEEKQPLLMEN-------------- 0"
## [1] "PWLIGAAVVGSVLTAVLGRLTSLLCRRKR---KQLREERQPLLMEK-------------- 0"
## [1] "PWLLGAALVGAVIAAALSGLSSRLCLQKKKKKKQPQEERQPLLMDK-------------- 0"
## [1] "PWLLGAALVGAVVATALAGLSSRLCHQKK---KQPQEERQPLLMDK-------------- 0"
## [1] "PWLLGAALVGAALAAALVGLTSLLCHHKR---KQLPEEKQPLLTEK-------------- 0"
## [1] "PWLVGAAVIGGIITAVLSGLI-LACRKKR---KGTSPEIQPLLTES-------------- 0"
## [1] "QWLVAAGILGAAVAGIIASVFAVACRRQTRRRKLSYGERQPLLNSS-------------- 0"
## [1] " "
## [1] "--------EDYHSLYQSHL-------------------- 21"
## [1] "LGNRVGPKXPDLTLTQSNVQVPENICWYFSVKTICKIVT 21"
## [1] "-------EDYHNLMYQSHL-------------------- 21"
## [1] "-------EDYHNLLYQSHL-------------------- 21"
## [1] "-------EDYHSLLYQTHV-------------------- 21"
## [1] "-------DDYHSLLYQSHL-------------------- 21"
## [1] "-------DDYHSLLYQSHL-------------------- 21"
## [1] "-------EDYSSLLHQSHL-------------------- 21"
## [1] "-------EDYNNVSYQSHF-------------------- 21"
## [1] "-------EEEGSASYQTTL-------------------- 21"
## [1] " "

Finished MSA

#First step very important: changing class of alignment; second step displays MSA

class(tyr_align) <- "AAMultipleAlignment"

ggmsa::ggmsa(tyr_align,
      start = 125, 
      end = 175)

Distance Matrix

#Calculates genetic distance using MSA

tyr_dist <- seqinr::dist.alignment(tyr_align_seqinr, 
                                             matrix = "identity")
#Shows information about distances

is(tyr_dist)
## [1] "dist"     "oldClass"
class(tyr_dist)
## [1] "dist"
tyr_align_seqinr_rnd <- round(tyr_dist, 3)
tyr_align_seqinr_rnd
##                NP_000363.1 XP_001136041.2 XP_006075241.1 XP_001492610.4
## XP_001136041.2       0.131                                             
## XP_006075241.1       0.369          0.374                              
## XP_001492610.4       0.410          0.415          0.371               
## XP_003992691.2       0.371          0.374          0.350          0.414
## NP_035791.1          0.379          0.389          0.421          0.441
## NP_001101005.1       0.366          0.379          0.419          0.441
## XP_003468651.1       0.389          0.398          0.419          0.451
## NP_989491.1          0.519          0.519          0.520          0.540
## NP_571088.3          0.626          0.630          0.628          0.630
##                XP_003992691.2 NP_035791.1 NP_001101005.1 XP_003468651.1
## XP_001136041.2                                                         
## XP_006075241.1                                                         
## XP_001492610.4                                                         
## XP_003992691.2                                                         
## NP_035791.1             0.419                                          
## NP_001101005.1          0.417       0.204                              
## XP_003468651.1          0.426       0.391          0.407               
## NP_989491.1             0.527       0.529          0.529          0.540
## NP_571088.3             0.632       0.626          0.622          0.631
##                NP_989491.1
## XP_001136041.2            
## XP_006075241.1            
## XP_001492610.4            
## XP_003992691.2            
## NP_035791.1               
## NP_001101005.1            
## XP_003468651.1            
## NP_989491.1               
## NP_571088.3          0.629

Phylognetic tree of sequences

# Function that uses genetic distances to cluster sequences into clades

tree <- nj(tyr_dist)
# Plots phylogenetic tree and adds a label

plot.phylo(tree, main="Phylogenetic Tree", 
            use.edge.length = F)

mtext(text = "tyr family gene tree - rooted, no branch lengths")