This code contains summary information about the tyr gene.
It also builds an MSA and phylogenetic tree to show the evolutionary relationship between the human version of this gene and its homologs in other species.
Key information use to make this script can be found here:
Refseq Gene: https://www.ncbi.nlm.nih.gov/gene/339479 Refseq Homologene: https://www.ncbi.nlm.nih.gov/homologene/17786
Other resources consulted includes:
Neanderthal genome: http://neandertal.ensemblgenomes.org/index.html
Other interesting resources and online tools include:
REPPER: https://toolkit.tuebingen.mpg.de/jobs/7803820 Sub-cellular locations prediction: https://wolfpsort.hgc.jp/
Load necessary packages:
Download and load drawProteins from Bioconductor
library(BiocManager)
#install("drawProteins")
library(drawProteins)
Load other packages:
# github packages
library(compbio4all)
library(ggmsa)
# CRAN packages
library(rentrez)
library(seqinr)
library(ape)
library(pander)
library(ggplot2)
library(msa)
library(drawProteins)
# Biostrings
library(Biostrings)
library(HGNChelper)
Accession numbers were obtained from RefSeq, Refseq Homlogene, UniProt and PDB. UniProt accession numbers can be found by searching for the gene name. PDB accessions can be found by searching with a UniProt accession or a gene name, though many proteins are not in PDB. The Neanderthal genome database was searched but did not yield sequence information on tyr.
A protein BLAST search (https://blast.ncbi.nlm.nih.gov/Blast.cgi?PROGRAM=blastp&PAGE_TYPE=BlastSearch&LINK_LOC=blasthome) was carried out excluding vertebrates to determine if it occurred outside of vertebreates. The gene does not appear in non-vertebrates and so a second search was conducted to exclude mammals.
Not available:
Neanderthal
Does not occur:
Outside of vertebrates
#Creating table
tyr_table <- c(
"NP_000363.1","P14679","NA","Homo sapiens","human","tyr",
"NP_035791.1", "P11344","NA","Mus musculus","house mouse", "tyr",
"NP_571088.3","NA","NA","Danio rerio","zebrafish", "tyr",
"NP_001101005.1","D4A9G4","NA","Rattus norvegicus","Norway rat","tyr",
"NP_989491.1", "P55024", "NA", "Gallus gallus","chicken","tyr",
"XP_003992691.2","NA","NA","Felis catus","domestic cat", "tyr",
"XP_001136041.2","NA","NA","Pan troglodytes","chimpanzee","tyr",
"XP_006075241.1","NA","NA","Bubalus bubalis","water buffalo","tyr",
"XP_003468651.1", "A0A6J1U8X2", "NA", "Cavia porcellus", "domestic guinea pig", "tyr",
"XP_001492610.4","NA","NA","Equus caballus","horse","tyr")
# Converts all tyr table information into matrix format
tyr_table_matrix <- matrix(tyr_table,
byrow = T,
nrow = 10)
tyr_table <- data.frame(tyr_table_matrix,
stringsAsFactors = F)
names(tyr_table) <- c("ncbi.protein.accession", "UniProt.id","PDB", "species", "common.name", "gene.name")
#Printing table
pander(tyr_table)
| ncbi.protein.accession | UniProt.id | PDB | species |
|---|---|---|---|
| NP_000363.1 | P14679 | NA | Homo sapiens |
| NP_035791.1 | P11344 | NA | Mus musculus |
| NP_571088.3 | NA | NA | Danio rerio |
| NP_001101005.1 | D4A9G4 | NA | Rattus norvegicus |
| NP_989491.1 | P55024 | NA | Gallus gallus |
| XP_003992691.2 | NA | NA | Felis catus |
| XP_001136041.2 | NA | NA | Pan troglodytes |
| XP_006075241.1 | NA | NA | Bubalus bubalis |
| XP_003468651.1 | A0A6J1U8X2 | NA | Cavia porcellus |
| XP_001492610.4 | NA | NA | Equus caballus |
| common.name | gene.name |
|---|---|
| human | tyr |
| house mouse | tyr |
| zebrafish | tyr |
| Norway rat | tyr |
| chicken | tyr |
| domestic cat | tyr |
| chimpanzee | tyr |
| water buffalo | tyr |
| domestic guinea pig | tyr |
| horse | tyr |
#Downloading all FASTA sequences using compbio4all::entrez_fetch_list() which uses rentrez::entrez_fetch() to access NCBI databases.
tyr <- rentrez::entrez_fetch(db = "protein",
id = tyr_table$ncbi.protein.accession,
rettype = "fasta")
#This assigns data to the variable tyr_list
tyr_list <- entrez_fetch_list(db = "protein",
id = tyr_table$ncbi.protein.accession,
rettype = "fasta")
#Number of FASTA files
length(tyr_list)
## [1] 10
#First entry
tyr_list[[1]]
## [1] ">NP_000363.1 tyrosinase precursor [Homo sapiens]\nMLLAVLYCLLWSFQTSAGHFPRACVSSKNLMEKECCPPWSGDRSPCGQLSGRGSCQNILLSNAPLGPQFP\nFTGVDDRESWPSVFYNRTCQCSGNFMGFNCGNCKFGFWGPNCTERRLLVRRNIFDLSAPEKDKFFAYLTL\nAKHTISSDYVIPIGTYGQMKNGSTPMFNDINIYDLFVWMHYYVSMDALLGGSEIWRDIDFAHEAPAFLPW\nHRLFLLRWEQEIQKLTGDENFTIPYWDWRDAEKCDICTDEYMGGQHPTNPNLLSPASFFSSWQIVCSRLE\nEYNSHQSLCNGTPEGPLRRNPGNHDKSRTPRLPSSADVEFCLSLTQYESGSMDKAANFSFRNTLEGFASP\nLTGIADASQSSMHNALHIYMNGTMSQVQGSANDPIFLLHHAFVDSIFEQWLRRHRPLQEVYPEANAPIGH\nNRESYMVPFIPLYRNGDFFISSKDLGYDYSYLQDSDPDSFQDYIKSYLEQASRIWSWLLGAAMVGAVLTA\nLLAGLVSLLCRHKRKQLPEEKQPLLMEKEDYHSLYQSHL\n\n"
#This removes the FASTA header, but additional cleaning steps will be required for particular analyses
for(i in 1:length(tyr_list)){
tyr_list[[i]] <- compbio4all::fasta_cleaner(tyr_list[[i]], parse = F)
}
#Downloads from uniprot
P14679_tyr <- drawProteins::get_features("P14679")
## [1] "Download has worked"
is(P14679_tyr)
## [1] "list" "vector" "list_OR_List" "vector_OR_Vector"
## [5] "vector_OR_factor"
#Create dataframe
huUni_df <- drawProteins::feature_to_dataframe(P14679_tyr)
is(huUni_df)
## [1] "data.frame" "list" "oldClass" "vector"
## [5] "list_OR_List" "vector_OR_Vector" "vector_OR_factor"
#View dataframe
huUni_df[,-2]
## type begin end length accession entryName taxid order
## featuresTemp SIGNAL 1 18 17 P14679 TYRO_HUMAN 9606 1
## featuresTemp.1 CHAIN 19 529 510 P14679 TYRO_HUMAN 9606 1
## featuresTemp.2 TOPO_DOM 19 476 457 P14679 TYRO_HUMAN 9606 1
## featuresTemp.3 TRANSMEM 477 497 20 P14679 TYRO_HUMAN 9606 1
## featuresTemp.4 TOPO_DOM 498 529 31 P14679 TYRO_HUMAN 9606 1
## featuresTemp.5 REGION 287 313 26 P14679 TYRO_HUMAN 9606 1
## featuresTemp.6 METAL 180 180 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.7 METAL 202 202 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.8 METAL 211 211 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.9 METAL 363 363 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.10 METAL 367 367 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.11 METAL 390 390 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.12 CARBOHYD 86 86 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.13 CARBOHYD 111 111 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.14 CARBOHYD 161 161 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.15 CARBOHYD 230 230 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.16 CARBOHYD 337 337 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.17 CARBOHYD 371 371 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.18 VAR_SEQ 346 377 31 P14679 TYRO_HUMAN 9606 1
## featuresTemp.19 VAR_SEQ 378 529 151 P14679 TYRO_HUMAN 9606 1
## featuresTemp.20 VARIANT 19 19 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.21 VARIANT 21 21 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.22 VARIANT 36 36 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.23 VARIANT 42 42 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.24 VARIANT 44 44 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.25 VARIANT 44 44 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.26 VARIANT 47 47 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.27 VARIANT 47 47 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.28 VARIANT 50 50 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.29 VARIANT 52 52 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.30 VARIANT 55 55 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.31 VARIANT 68 68 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.32 VARIANT 77 77 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.33 VARIANT 77 77 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.34 VARIANT 77 77 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.35 VARIANT 79 79 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.36 VARIANT 80 80 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.37 VARIANT 81 81 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.38 VARIANT 89 89 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.39 VARIANT 91 91 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.40 VARIANT 97 97 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.41 VARIANT 109 109 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.42 VARIANT 134 134 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.43 VARIANT 142 142 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.44 VARIANT 152 152 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.45 VARIANT 155 155 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.46 VARIANT 176 176 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.47 VARIANT 177 177 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.48 VARIANT 179 179 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.49 VARIANT 180 180 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.50 VARIANT 192 192 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.51 VARIANT 198 198 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.52 VARIANT 199 199 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.53 VARIANT 201 201 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.54 VARIANT 205 205 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.55 VARIANT 206 206 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.56 VARIANT 216 216 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.57 VARIANT 217 217 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.58 VARIANT 217 217 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.59 VARIANT 217 217 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.60 VARIANT 217 217 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.61 VARIANT 217 217 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.62 VARIANT 227 227 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.63 VARIANT 236 236 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.64 VARIANT 236 236 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.65 VARIANT 239 239 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.66 VARIANT 240 240 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.67 VARIANT 243 243 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.68 VARIANT 253 253 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.69 VARIANT 256 256 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.70 VARIANT 272 272 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.71 VARIANT 275 275 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.72 VARIANT 288 288 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.73 VARIANT 289 289 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.74 VARIANT 289 289 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.75 VARIANT 294 294 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.76 VARIANT 294 294 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.77 VARIANT 298 298 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.78 VARIANT 299 299 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.79 VARIANT 299 299 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.80 VARIANT 308 308 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.81 VARIANT 312 312 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.82 VARIANT 313 313 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.83 VARIANT 318 318 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.84 VARIANT 325 325 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.85 VARIANT 328 328 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.86 VARIANT 329 329 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.87 VARIANT 332 332 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.88 VARIANT 339 339 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.89 VARIANT 340 340 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.90 VARIANT 345 345 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.91 VARIANT 346 346 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.92 VARIANT 355 355 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.93 VARIANT 355 355 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.94 VARIANT 355 355 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.95 VARIANT 361 361 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.96 VARIANT 364 364 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.97 VARIANT 367 367 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.98 VARIANT 370 370 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.99 VARIANT 371 371 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.100 VARIANT 371 371 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.101 VARIANT 373 373 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.102 VARIANT 378 378 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.103 VARIANT 380 380 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.104 VARIANT 382 382 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.105 VARIANT 383 383 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.106 VARIANT 384 384 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.107 VARIANT 390 390 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.108 VARIANT 393 393 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.109 VARIANT 395 395 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.110 VARIANT 395 395 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.111 VARIANT 398 398 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.112 VARIANT 398 398 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.113 VARIANT 400 400 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.114 VARIANT 402 402 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.115 VARIANT 402 402 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.116 VARIANT 402 402 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.117 VARIANT 403 403 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.118 VARIANT 404 404 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.119 VARIANT 404 404 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.120 VARIANT 405 405 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.121 VARIANT 406 406 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.122 VARIANT 408 408 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.123 VARIANT 409 409 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.124 VARIANT 416 416 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.125 VARIANT 417 417 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.126 VARIANT 419 419 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.127 VARIANT 422 422 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.128 VARIANT 424 424 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.129 VARIANT 426 426 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.130 VARIANT 427 427 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.131 VARIANT 431 431 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.132 VARIANT 434 434 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.133 VARIANT 435 435 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.134 VARIANT 439 439 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.135 VARIANT 444 444 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.136 VARIANT 446 446 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.137 VARIANT 448 448 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.138 VARIANT 490 490 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.139 CONFLICT 42 45 3 P14679 TYRO_HUMAN 9606 1
## featuresTemp.140 CONFLICT 179 179 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.141 CONFLICT 373 378 5 P14679 TYRO_HUMAN 9606 1
## featuresTemp.142 CONFLICT 495 495 0 P14679 TYRO_HUMAN 9606 1
## featuresTemp.143 CONFLICT 520 523 3 P14679 TYRO_HUMAN 9606 1
## featuresTemp.144 CONFLICT 525 528 3 P14679 TYRO_HUMAN 9606 1
#This protein has one domain
my_canvas <- draw_canvas(huUni_df)
my_canvas <- draw_chains(my_canvas, huUni_df,
label_size = 2.5)
my_canvas <- draw_domains(my_canvas, huUni_df)
my_canvas
#Creates vector
hutyr_vector <- compbio4all:: fasta_cleaner(tyr_list[[1]])
#Defult dotplot
seqinr::dotPlot(hutyr_vector,
hutyr_vector)
#2x2 panel showing different values for dotplot settings
par(mfrow = c(2,2),
mar = c(0,0,2,1))
#Plot 1: tyr - Defaults
dotPlot(hutyr_vector,
hutyr_vector,
wsize = 1,
nmatch = 1,
main = "tyr Defaults")
#Plot 2: tyr - size = 10, nmatch = 1
dotPlot(hutyr_vector, hutyr_vector,
wsize = 10,
nmatch = 1,
main = "tyr - size = 10, nmatch = 1")
#Plot 3: tyr - size = 10, nmatch = 5
dotPlot(hutyr_vector, hutyr_vector,
wsize = 10,
nmatch = 5,
main = "tyr - size = 10, nmatch = 5")
#Plot 4: tyr size = 20, nmatch = 5
dotPlot(hutyr_vector, hutyr_vector,
wsize = 20,
nmatch =5,
main = "tyr - size = 20, nmatch = 5")
#Best plot using normal dotplot
par(mfrow = c(1,1),
mar = c(4,4,4,4))
dotPlot(hutyr_vector,
hutyr_vector,
wsize = 10,
nmatch = 5,
main = "tyr window = 10, nmatch = 5")
databases <- c("Pfam", "Disprot", "RepeatsDB", "Uniprot", "PDB")
info <- c("The tyr gene has two domains: MACPF and BRINP", "NA", "NA", "The subcellular location is the mitochondrion", "NA")
databasedf <- data.frame(databases,
info)
pander(databasedf)
| databases | info |
|---|---|
| Pfam | The tyr gene has two domains: MACPF and BRINP |
| Disprot | NA |
| RepeatsDB | NA |
| Uniprot | The subcellular location is the mitochondrion |
| PDB | NA |
Multivariate statistical techniques were used to confirm the information about protein structure and location in the online database.
Uniprot (which uses http://www.csbio.sjtu.edu.cn/bioinf/Cell-PLoc-2/) indicates that the protein is a secreted: found outside of the cell membrane.
#Make 2 vectors of amino acids and check to make sure they are the same length and contain the same valiues
aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
"L","K","M","F","P","S","T","W","Y","V")
aa.1.2 <- c("A","R","N","D","C","Q","E","G","H","I",
"L","K","M","F","P","S","T","W","Y","V")
length(aa.1.1) == length(aa.1.2)
## [1] TRUE
aa.1.1 == aa.1.2
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE
# Shows how many unique values are present and that they both have the same number of unique values
unique(aa.1.1)
## [1] "A" "R" "N" "D" "C" "Q" "E" "G" "H" "I" "L" "K" "M" "F" "P" "S" "T" "W" "Y"
## [20] "V"
length(unique(aa.1.1)) == length(unique(aa.1.2))
## [1] TRUE
#Creates vector of the frequency of each amino acid in the database of proteins of each type used by Chou, and checks against Chou's total
alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91,
221, 249, 48, 123, 82, 122, 119, 33, 63, 167)
sum(alpha) == 2447
## [1] TRUE
#Does same thing with beta proteins
beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120,
177, 115, 16, 85, 127, 341, 253, 44, 110, 229)
sum(beta) == 2776
## [1] TRUE
#Does same thing with alpha plus beta proteins
a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
110, 112, 25, 52, 71, 126, 117, 30, 108, 123)
sum(a.plus.b) == 1889
## [1] TRUE
#Does same thing with alpha divided by beta proteins
a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239,
339, 321, 91, 158, 188, 327, 238, 72, 130, 378)
sum(a.div.b) == 4333
## [1] TRUE
#Creates dataframe of data
czdf <- data.frame(aa.1.1, alpha, beta, a.plus.b, a.div.b)
#Prints dataframe
pander(czdf)
| aa.1.1 | alpha | beta | a.plus.b | a.div.b |
|---|---|---|---|---|
| A | 285 | 203 | 175 | 361 |
| R | 53 | 67 | 78 | 146 |
| N | 97 | 139 | 120 | 183 |
| D | 163 | 121 | 111 | 244 |
| C | 22 | 75 | 74 | 63 |
| Q | 67 | 122 | 74 | 114 |
| E | 134 | 86 | 86 | 257 |
| G | 197 | 297 | 171 | 377 |
| H | 111 | 49 | 33 | 107 |
| I | 91 | 120 | 93 | 239 |
| L | 221 | 177 | 110 | 339 |
| K | 249 | 115 | 112 | 321 |
| M | 48 | 16 | 25 | 91 |
| F | 123 | 85 | 52 | 158 |
| P | 82 | 127 | 71 | 188 |
| S | 122 | 341 | 126 | 327 |
| T | 119 | 253 | 117 | 238 |
| W | 33 | 44 | 30 | 72 |
| Y | 63 | 110 | 108 | 130 |
| V | 167 | 229 | 123 | 378 |
#Calculates proportions for each of the four protein fold types
alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)
#Create dataframe and row labels
aa.prop <- data.frame(alpha.prop,
beta.prop,
a.plus.b.prop,
a.div.b)
row.names(aa.prop) <- aa.1.1
#Prints dataframe
pander(aa.prop)
| Â | alpha.prop | beta.prop | a.plus.b.prop | a.div.b |
|---|---|---|---|---|
| A | 0.1165 | 0.07313 | 0.09264 | 0.08331 |
| R | 0.02166 | 0.02414 | 0.04129 | 0.03369 |
| N | 0.03964 | 0.05007 | 0.06353 | 0.04223 |
| D | 0.06661 | 0.04359 | 0.05876 | 0.05631 |
| C | 0.008991 | 0.02702 | 0.03917 | 0.01454 |
| Q | 0.02738 | 0.04395 | 0.03917 | 0.02631 |
| E | 0.05476 | 0.03098 | 0.04553 | 0.05931 |
| G | 0.08051 | 0.107 | 0.09052 | 0.08701 |
| H | 0.04536 | 0.01765 | 0.01747 | 0.02469 |
| I | 0.03719 | 0.04323 | 0.04923 | 0.05516 |
| L | 0.09031 | 0.06376 | 0.05823 | 0.07824 |
| K | 0.1018 | 0.04143 | 0.05929 | 0.07408 |
| M | 0.01962 | 0.005764 | 0.01323 | 0.021 |
| F | 0.05027 | 0.03062 | 0.02753 | 0.03646 |
| P | 0.03351 | 0.04575 | 0.03759 | 0.04339 |
| S | 0.04986 | 0.1228 | 0.0667 | 0.07547 |
| T | 0.04863 | 0.09114 | 0.06194 | 0.05493 |
| W | 0.01349 | 0.01585 | 0.01588 | 0.01662 |
| Y | 0.02575 | 0.03963 | 0.05717 | 0.03 |
| V | 0.06825 | 0.08249 | 0.06511 | 0.08724 |
#Detemines number of each amino acid in protein
table(hutyr_vector)
## hutyr_vector
## A C D E F G H I K L M N P Q R S T V W Y
## 30 17 30 27 31 34 17 22 17 55 15 27 33 23 27 51 19 17 14 23
#A function to convert a table into a vector
table_to_vector <- function(table_x){
table_names <- attr(table_x, "dimnames")[[1]]
table_vect <- as.vector(table_x)
names(table_vect) <- table_names
return(table_vect)
}
#Creates a table of amino acid frequences, and converts this to a vector
tyr_human_table <- table(hutyr_vector)/length(hutyr_vector)
tyr.human.aa.freq <- table_to_vector(tyr_human_table)
tyr.human.aa.freq
## A C D E F G H
## 0.05671078 0.03213611 0.05671078 0.05103970 0.05860113 0.06427221 0.03213611
## I K L M N P Q
## 0.04158790 0.03213611 0.10396975 0.02835539 0.05103970 0.06238185 0.04347826
## R S T V W Y
## 0.05103970 0.09640832 0.03591682 0.03213611 0.02646503 0.04347826
#Checks for the presence of U, an unknown amino acid
aa.names <- names(tyr.human.aa.freq)
any(aa.names == "U")
## [1] FALSE
#Previous test returns "False", indicating no "U" present; can add protein to frequency table
aa.prop$tyr.human.aa.freq <- tyr.human.aa.freq
pander::pander(aa.prop)
| Â | alpha.prop | beta.prop | a.plus.b.prop | a.div.b | tyr.human.aa.freq |
|---|---|---|---|---|---|
| A | 0.1165 | 0.07313 | 0.09264 | 0.08331 | 0.05671 |
| R | 0.02166 | 0.02414 | 0.04129 | 0.03369 | 0.03214 |
| N | 0.03964 | 0.05007 | 0.06353 | 0.04223 | 0.05671 |
| D | 0.06661 | 0.04359 | 0.05876 | 0.05631 | 0.05104 |
| C | 0.008991 | 0.02702 | 0.03917 | 0.01454 | 0.0586 |
| Q | 0.02738 | 0.04395 | 0.03917 | 0.02631 | 0.06427 |
| E | 0.05476 | 0.03098 | 0.04553 | 0.05931 | 0.03214 |
| G | 0.08051 | 0.107 | 0.09052 | 0.08701 | 0.04159 |
| H | 0.04536 | 0.01765 | 0.01747 | 0.02469 | 0.03214 |
| I | 0.03719 | 0.04323 | 0.04923 | 0.05516 | 0.104 |
| L | 0.09031 | 0.06376 | 0.05823 | 0.07824 | 0.02836 |
| K | 0.1018 | 0.04143 | 0.05929 | 0.07408 | 0.05104 |
| M | 0.01962 | 0.005764 | 0.01323 | 0.021 | 0.06238 |
| F | 0.05027 | 0.03062 | 0.02753 | 0.03646 | 0.04348 |
| P | 0.03351 | 0.04575 | 0.03759 | 0.04339 | 0.05104 |
| S | 0.04986 | 0.1228 | 0.0667 | 0.07547 | 0.09641 |
| T | 0.04863 | 0.09114 | 0.06194 | 0.05493 | 0.03592 |
| W | 0.01349 | 0.01585 | 0.01588 | 0.01662 | 0.03214 |
| Y | 0.02575 | 0.03963 | 0.05717 | 0.03 | 0.02647 |
| V | 0.06825 | 0.08249 | 0.06511 | 0.08724 | 0.04348 |
#Creates two custom functions: the first one calculates correlates between two columns of our table, and the second one to calculates correlation similarities
chou_cor <- function(x,y){
numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)
}
chou_cosine <- function(z.1, z.2){
z.1.abs <- sqrt(sum(z.1^2))
z.2.abs <- sqrt(sum(z.2^2))
my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs)
return(my.cosine)
}
#Calculates and displays correlation between each column
corr.alpha <- chou_cor(aa.prop[,5], aa.prop[,1])
corr.beta <- chou_cor(aa.prop[,5], aa.prop[,2])
corr.apb <- chou_cor(aa.prop[,5], aa.prop[,3])
corr.adb <- chou_cor(aa.prop[,5], aa.prop[,4])
corr.alpha
## [1] 0.7890408
corr.beta
## [1] 0.8411267
corr.apb
## [1] 0.8695129
corr.adb
## [1] 0.8551108
#Calculates and displays cosine similarity
cos.alpha <- chou_cosine(aa.prop[,5], aa.prop[,1])
cos.beta <- chou_cosine(aa.prop[,5], aa.prop[,2])
cos.apb <- chou_cosine(aa.prop[,5], aa.prop[,3])
cos.adb <- chou_cosine(aa.prop[,5], aa.prop[,4])
cos.alpha
## [1] 0.7890408
cos.beta
## [1] 0.8411267
cos.apb
## [1] 0.8695129
cos.adb
## [1] 0.8551108
#Calculates Euclidean distance
aa.prop.flipped <- t(aa.prop)
round(aa.prop.flipped,2)
## A R N D C Q E G H I L K
## alpha.prop 0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09 0.10
## beta.prop 0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06 0.04
## a.plus.b.prop 0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06 0.06
## a.div.b 0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08 0.07
## tyr.human.aa.freq 0.06 0.03 0.06 0.05 0.06 0.06 0.03 0.04 0.03 0.10 0.03 0.05
## M F P S T W Y V
## alpha.prop 0.02 0.05 0.03 0.05 0.05 0.01 0.03 0.07
## beta.prop 0.01 0.03 0.05 0.12 0.09 0.02 0.04 0.08
## a.plus.b.prop 0.01 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## a.div.b 0.02 0.04 0.04 0.08 0.05 0.02 0.03 0.09
## tyr.human.aa.freq 0.06 0.04 0.05 0.10 0.04 0.03 0.03 0.04
#Calculates distance matrix
dist(aa.prop.flipped, method = "euclidean")
## alpha.prop beta.prop a.plus.b.prop a.div.b
## beta.prop 0.13342098
## a.plus.b.prop 0.09281824 0.08289406
## a.div.b 0.06699039 0.08659174 0.06175113
## tyr.human.aa.freq 0.16304038 0.14309134 0.12377392 0.13182124
#Calculates and displays individual distances
dist.alpha <- dist((aa.prop.flipped[c(1,5),]), method = "euclidean")
dist.beta <- dist((aa.prop.flipped[c(2,5),]), method = "euclidean")
dist.apb <- dist((aa.prop.flipped[c(3,5),]), method = "euclidean")
dist.adb <- dist((aa.prop.flipped[c(4,5),]), method = "euclidean")
dist.alpha
## alpha.prop
## tyr.human.aa.freq 0.1630404
dist.beta
## beta.prop
## tyr.human.aa.freq 0.1430913
dist.apb
## a.plus.b.prop
## tyr.human.aa.freq 0.1237739
dist.adb
## a.div.b
## tyr.human.aa.freq 0.1318212
#Compiles information and rounds it, making it easier to read
fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")
corr.sim <- round(c(corr.alpha,corr.beta,corr.apb,corr.adb),5)
cosine.sim <- round(c(cos.alpha,cos.beta,cos.apb,cos.adb),5)
Euclidean.dist <- round(c(dist.alpha,dist.beta,dist.apb,dist.adb),5)
sim.sum <- c("","","most.sim","")
dist.sum <- c("","","min.dist","")
df2 <- data.frame(fold.type,
corr.sim ,
cosine.sim ,
Euclidean.dist ,
sim.sum ,
dist.sum )
pander(df2)
| fold.type | corr.sim | cosine.sim | Euclidean.dist | sim.sum | dist.sum |
|---|---|---|---|---|---|
| alpha | 0.789 | 0.789 | 0.163 | ||
| beta | 0.8411 | 0.8411 | 0.1431 | ||
| alpha plus beta | 0.8695 | 0.8695 | 0.1238 | most.sim | min.dist |
| alpha/beta | 0.8551 | 0.8551 | 0.1318 |
#Information about list formed at beginning of workflow
names(tyr_list)
## [1] "NP_000363.1" "NP_035791.1" "NP_571088.3" "NP_001101005.1"
## [5] "NP_989491.1" "XP_003992691.2" "XP_001136041.2" "XP_006075241.1"
## [9] "XP_003468651.1" "XP_001492610.4"
length (tyr_list)
## [1] 10
tyr_list[[1]]
## [1] "MLLAVLYCLLWSFQTSAGHFPRACVSSKNLMEKECCPPWSGDRSPCGQLSGRGSCQNILLSNAPLGPQFPFTGVDDRESWPSVFYNRTCQCSGNFMGFNCGNCKFGFWGPNCTERRLLVRRNIFDLSAPEKDKFFAYLTLAKHTISSDYVIPIGTYGQMKNGSTPMFNDINIYDLFVWMHYYVSMDALLGGSEIWRDIDFAHEAPAFLPWHRLFLLRWEQEIQKLTGDENFTIPYWDWRDAEKCDICTDEYMGGQHPTNPNLLSPASFFSSWQIVCSRLEEYNSHQSLCNGTPEGPLRRNPGNHDKSRTPRLPSSADVEFCLSLTQYESGSMDKAANFSFRNTLEGFASPLTGIADASQSSMHNALHIYMNGTMSQVQGSANDPIFLLHHAFVDSIFEQWLRRHRPLQEVYPEANAPIGHNRESYMVPFIPLYRNGDFFISSKDLGYDYSYLQDSDPDSFQDYIKSYLEQASRIWSWLLGAAMVGAVLTALLAGLVSLLCRHKRKQLPEEKQPLLMEKEDYHSLYQSHL"
#Creates and prints empty vector
tyr_vector <- rep(NA, length(tyr_list))
tyr_vector
## [1] NA NA NA NA NA NA NA NA NA NA
#Populates the vector
for(i in 1:length(tyr_vector)){
tyr_vector[i] <- tyr_list[[i]]
}
#Names the vector
names(tyr_vector) <- names(tyr_list)
#Turns sequences into vectors
humantyr <- fasta_cleaner(tyr_list[[1]], parse = F)
chimpanzeetyr <- fasta_cleaner(tyr_list[[7]], parse = F)
housemousetyr <- fasta_cleaner(tyr_list[[2]], parse = F)
buffalotyr <- fasta_cleaner(tyr_list[[8]], parse = F)
#Performs global pairwise alignments on sequences
align.human.vs.chimp <- Biostrings::pairwiseAlignment(
humantyr,
chimpanzeetyr)
align.human.vs.housemouse <- Biostrings::pairwiseAlignment(
humantyr,
housemousetyr)
align.human.vs.buffalo <- Biostrings::pairwiseAlignment(
humantyr,
buffalotyr)
#Prints percent sequence alignment
Biostrings::pid(align.human.vs.chimp)
## [1] 94.37387
Biostrings::pid(align.human.vs.housemouse)
## [1] 85.74109
Biostrings::pid(align.human.vs.buffalo)
## [1] 87.16981
#Global pairwise alignments for matrix
align.chimp.vs.housemouse <- Biostrings::pairwiseAlignment(
chimpanzeetyr,
housemousetyr)
align.chimp.vs.buffalo <- Biostrings::pairwiseAlignment(
chimpanzeetyr,
buffalotyr)
align.housemouse.vs.buffalo <- Biostrings::pairwiseAlignment(
housemousetyr,
buffalotyr)
#Builds a matrix of PID
pids <- c(1, NA, NA, NA,
pid(align.human.vs.chimp), 1, NA, NA,
pid(align.human.vs.housemouse), pid(align.chimp.vs.housemouse), 1, NA,
pid(align.human.vs.buffalo), pid(align.chimp.vs.buffalo), pid(align.housemouse.vs.buffalo), 1)
mat <- matrix(pids, nrow = 4, byrow = T)
row.names(mat) <- c("Homo","Pan","Mus","Bos")
colnames(mat) <- c("Homo","Pan","Mus","Bos")
pander::pander(mat)
| Â | Homo | Pan | Mus | Bos |
|---|---|---|---|---|
| Homo | 1 | NA | NA | NA |
| Pan | 94.37 | 1 | NA | NA |
| Mus | 85.74 | 81.23 | 1 | NA |
| Bos | 87.17 | 80.85 | 81.8 | 1 |
#Generates comparisons between humans and chimps using different methods
PID1 <- pid(align.human.vs.chimp, type = "PID1")
PID2 <- pid(align.human.vs.chimp, type = "PID2")
PID3 <- pid(align.human.vs.chimp, type = "PID3")
PID4 <- pid(align.human.vs.chimp, type = "PID4")
#Generates dataframe
method <- c("PID1", "PID2", "PID3", "PID4")
PID <- c(PID1, PID2, PID3, PID4)
denominator <- c("(aligned positions + internal gap positions)", "(aligned positions)", "(length shorter sequence)", "(average length of the two sequences)")
PIDcompdf <- data.frame(method,
PID,
denominator)
pander(PIDcompdf)
| method | PID | denominator |
|---|---|---|
| PID1 | 94.37 | (aligned positions + internal gap positions) |
| PID2 | 98.3 | (aligned positions) |
| PID3 | 98.3 | (length shorter sequence) |
| PID4 | 94.55 | (average length of the two sequences) |
#Converts vector into stringset
tyr_vector_ss <- Biostrings::AAStringSet(tyr_vector)
#Uses defult substitution matrix
tyr_align <- msa(tyr_vector_ss,
method = "ClustalW")
## use default substitution matrix
#Shows information about stringset
class(tyr_align)
## [1] "MsaAAMultipleAlignment"
## attr(,"package")
## [1] "msa"
is(tyr_align)
## [1] "MsaAAMultipleAlignment" "AAMultipleAlignment" "MsaMetaData"
## [4] "MultipleAlignment"
#Shows defult output of MSA
tyr_align
## CLUSTAL 2.1
##
## Call:
## msa(tyr_vector_ss, method = "ClustalW")
##
## MsaAAMultipleAlignment with 10 rows and 579 columns
## aln names
## [1] ----MLLAVLYCLLWSFQTSAGHFP...YQSHL-------------------- NP_000363.1
## [2] ----MLLAALYCLLWSFQTSTGHFP...TQSNVQVPENICWYFSVKTICKIVT XP_001136041.2
## [3] ----MLLAALYCLLWSFQTSSGHFP...YQSHL-------------------- XP_006075241.1
## [4] MRGRMLLAALYCLLWSFQASVGQFP...YQSHL-------------------- XP_001492610.4
## [5] ----MLLAALCCLLWSFRTSAGHFP...YQTHV-------------------- XP_003992691.2
## [6] ----MFLAVLYCLLWSFQISDGHFP...YQSHL-------------------- NP_035791.1
## [7] ----MFLAVLYYLLWSFQTSAGHFP...YQSHL-------------------- NP_001101005.1
## [8] ----MLLAVLLCVLWHFQISAGHFP...HQSHL-------------------- XP_003468651.1
## [9] ----MFLFAMGLLLVILQPSTGQFP...YQSHF-------------------- NP_989491.1
## [10] ---MSLHLLLFFFLQLFSSSLQQFP...YQTTL-------------------- NP_571088.3
## Con ----MLLAALYCLLWSFQTS?GHFP...YQSHL-------------------- Consensus
#Changes class of alignment
class(tyr_align) <- "AAMultipleAlignment"
#Converts to seqinr format
tyr_align_seqinr <- msaConvert(tyr_align,
type = "seqinr::alignment")
#Shows information
class(tyr_align_seqinr)
## [1] "alignment"
is(tyr_align_seqinr)
## [1] "alignment"
#Displays output
compbio4all::print_msa(tyr_align_seqinr)
## [1] "----MLLAVLYCLLWSFQTSAGHFPRACVSSKNLMEKECCPPWSGDRSPCGQLSGRGSCQ 0"
## [1] "----MLLAALYCLLWSFQTSTGHFPRACVSSKNLMEKECCPPWSGDRSPCGQLSGRGSCQ 0"
## [1] "----MLLAALYCLLWSFQTSSGHFPRACASSKSLMEKECCPPWAGDGSPCGRLSGRGSCQ 0"
## [1] "MRGRMLLAALYCLLWSFQASVGQFPRACASSKNLMEKECCPKWRGDRSPCGQLSGRGSCQ 0"
## [1] "----MLLAALCCLLWSFRTSAGHFPRACASSKSLMEKECCPAWTGDSSPCGQLSGRGACQ 0"
## [1] "----MFLAVLYCLLWSFQISDGHFPRACASSKNLLAKECCPPWMGDGSPCGQLSGRGSCQ 0"
## [1] "----MFLAVLYYLLWSFQTSAGHFPRDCASSKNLMAKECCPPWMGDGSPCGHLSGRGSCQ 0"
## [1] "----MLLAVLLCVLWHFQISAGHFPRACASSQNLIEKECCPPWIGDGSPCGQLSGRGSCQ 0"
## [1] "----MFLFAMGLLLVILQPSTGQFPRVCANTQSLLRKECCPPWDGDGTPCGERSNRGTCQ 0"
## [1] "---MSLHLLLFFFLQLFSSSLQQFPRVCTSPEVLQSKRCCPVWPGDGSVCGVQSGRGFCQ 0"
## [1] " "
## [1] "NILLSNAPLGPQFPFTGVDDRESWPSVFYNRTCQCSGNFMGFNCGNCKFGFWGPNCTERR 0"
## [1] "NILLSNAPLGPQFPFTGVDDRESWPSVFYNRTCQCSGNFMGFNCGNCKFGFWGPNCTERR 0"
## [1] "DVILSTAPLGPQFPFTGVDDRESWPSIFYNRTCQCFGNFMGFNCGSCKFGFRGPRCTERR 0"
## [1] "DVILSNAPSGPQFPFAGVDDRESWPSVFYNRTCQCFGNFMGFNCGDCKFGFGGRNCTERR 0"
## [1] "DITLSKAPLGPQYPFTGMDDREAWPSVFYNRTCQCFGNFMGFNCGNCKFGFWGPNCTEKR 0"
## [1] "DILLSSAPSGPQFPFKGVDDRESWPSVFYNRTCQCSGNFMGFNCGNCKFGFGGPNCTEKR 0"
## [1] "DILLSNAPSGPQFPFKGVDDRESWPSVFYNRTCQCSGNFMGFNCGNCKFGFGGPNCTEKR 0"
## [1] "DIFLSSAPFGPQFPFTGVDDRESWPSVFYNRTCQCSGNFMGFSCGNCKFGYWGPNCTQKR 0"
## [1] "RILLSQAPLGPQFPFSGVDDREDWPSVFYNRTCRCRGNFMGFNCGECKFGFSGQNCTERR 0"
## [1] "DVLVSDLPNGPQYPHSGVDDRERWPLVFYNQTCQCAGNYMGFDCGECKFGFFGANCAERR 0"
## [1] " "
## [1] "LLVRRNIFDLSAPEKDKFFAYLTLAKHTISSDYVIPIGTYGQMKNGSTPMFNDINIYDLF 0"
## [1] "LLVRRNIFDLSAPEKDKFFAYLTLAKHTISSDYVIPIGTYGQMKNGSTPMFNDINIYDLF 0"
## [1] "LLVRRNIFDLSVPEKNKFLAYLTLAKHTTSPDYVIPTGTYGQMNHGTTPLFNDVSVYDLF 0"
## [1] "LLVRRSIFDLSVPEKNKFLAYLTLAKHTTSPDYVIPIGTYGQMNNGSKPMFKDITIYDLF 0"
## [1] "LLVRRNIFDLSVPEKNKFLAYLTLAKHTISPDYVIPIGTYGQMNNGSTPMFNDINVYDLF 0"
## [1] "VLIRRNIFDLSVSEKNKFFSYLTLAKHTISSVYVIPTGTYGQMNNGSTPMFNDINIYDLF 0"
## [1] "LLIRRNIFDLSASEKNKFFSYLTLAKHTISSVYVIPTGTYGQMNNGSTPMFKDINIYDLF 0"
## [1] "ILVRRNIFDLSVPEKDKFLAYLTLAKHTISANYVIPTGTYGQMKNGSTPMFNDVSIYNLF 0"
## [1] "LRTRRNIFQLTISEKDKFLAYLNLAKNIPSKDYVIATGTYAQMNNGSNPMFRNINVYDLF 0"
## [1] "ESVRRNIFQLSTTERQRFISYLNLAKTTISPDYMIVTGTYAQMNNGSTPMFANISVYDLF 0"
## [1] " "
## [1] "VWMHYYVSMDALLGGS-EIWRDIDFAHEAPAFLPWHRLFLLRWEQEIQKLTGDENFTIPY 0"
## [1] "VWMHYYVSMDALLGGS-EIWRDIDFAHEAPAFLPWHRLFLLRWEQEIQKLTGDENFTIPY 0"
## [1] "VWMHYYVSRDTLLGDS-EVWRDIDFAHEAPGFLPWHRLFLLLWEQEIQKLTGDENFTIPY 0"
## [1] "VWMHYYASRDTLLGGS-EVWKNIDFAHEAPGFLPWHRVFLLLWEQEIQKLTGDENFTIPY 0"
## [1] "VWMHYYVSRDTLLGGS-EIWKDIDFAHEAPGFLPWHRLFLLLWEQEIQKLTGDENFTIPY 0"
## [1] "VWMHYYVSRDTLLGGS-EIWRDIDFAHEAPGFLPWHRLFLLLWEQEIRELTGDENFTVPY 0"
## [1] "VWMHYYVSRDTLLGGS-EIWRDIDFAHEAPGFLPWHRLFLLLWEQEMQELTGDENFTIPY 0"
## [1] "VWMHYYVSRDTLLGGS-EIWKDIDFAHEAPGFLPWHRLFLLLWEEEIRQLTGDENFTIPY 0"
## [1] "VWMHYYASRDTLLGGS-NVWRDIDFAHEAPGFLPWHRAFLLLWEREIQKITGDENFTIPY 0"
## [1] "VWMHYYVSRDALLGGPGNVWADIDFAHESAAFLPWHRVYLLFWEHEIRKLTGDFNFTIPY 0"
## [1] " "
## [1] "WDWRDAEKCDICTDEYMGGQHPTNPNLLSPASFFSSWQIVCSRLEEYNSHQSLCNGTPEG 0"
## [1] "WDWRDAEKCDICTDEYMGGQHPTNPNLLSPASFFSSWQIVCSRLEEYNSHQSLCNGTPEG 0"
## [1] "WDWRDAENCDVCTDEYMGGRNPANPNLLSPASFFSSWQIVCSRLEEYNSRQALCNGTSEG 0"
## [1] "WDWRDAENCDVCTDEYIGGRNPANPNLISPASVFSSWQIVCSRLDDYNSRQALCNGTPEG 0"
## [1] "WDWRDAKSCDICTDEYMGGHNPANPNLLSPASFFSSWQIICTRLEEYNSRQALCDGTPEG 0"
## [1] "WDWRDAENCDICTDEYLGGRHPENPNLLSPASFFSSWQIICSRSEEYNSHQVLCDGTPEG 0"
## [1] "WDWRDAENCDICTDEYLGGRHPENPNLLSPASFFSSWEIICSRSEEYNSHQVLCDGTPEG 0"
## [1] "WDWRDAENCEICTDEYMGGRHPTNPNLLSPASFFSSWQIVCSRIEEYNSRQVLCDGNPEG 0"
## [1] "WDWRDAEDCVICTDEYMGGQHPTNPNLLSPASFFSSWQVICTQSEEYNSQQALCNATSEG 0"
## [1] "WDWRDAQDCQVCTDELMGARSSLNRSLISPSSVFSSWKVICSQPESYNLREALCDGSPEG 0"
## [1] " "
## [1] "PLRRNPGNHDKSRTPRLPSSADVEFCLSLTQYESGSMDKAANFSFRNTLEGFASPLTGIA 0"
## [1] "PLRRNPGNHDKSRTPRLPSSADVEFCLSLTQYESGSMDKAANFSFRNTLEGFASPLTGIA 0"
## [1] "PLLRNPGNHDKARTPRLPSSADVQFCLSLTQYESGPMDKAANFSFRNTLEGFADPLTGIA 0"
## [1] "PLLRNPGNHDKARTPRLPSSEDVEFCLNLIHYDSGPMDKTANFSFRNTLEGFASPLTGIA 0"
## [1] "TITAQSRNHDKARTPRLPSSADVEFCLSLTQYESDSMDKAANFSFRNTLEGFASPLTGIA 0"
## [1] "PLLRNPGNHDKAKTPRLPSSADVEFCLSLTQYESGSMDRTANFSFRNTLEGFASPLTGIA 0"
## [1] "PLLRNPGNHDKAKTPRLPSSADVEFCLSLTQYESGSMDRTANFSFRNTLEGFASPLTGIA 0"
## [1] "PLLRNPGNHDKARTSRIPSSSDVELCLSLTQYDSGSMDKSANFSFRNTLEGFANPVSGIA 0"
## [1] "PILRNPGNNDKSRTPRLPSSSEVEFCLTLTQYESGSMDKMANYSFRNTLEGFADPHTAIS 0"
## [1] "PLLRNPGDHDRIRVPRLPTSADVESVLRLTDYETGQMDRRANLSFRNALEGFANPETGLA 0"
## [1] " "
## [1] "DASQSSMHNALHIYMNGTMSQVQGSANDPIFLLHHAFVDSIFEQWLRRHRPLQEVYPEAN 0"
## [1] "DASQSSMHNALHIYMNGTMSQVQGSANDPIFLLHHAFVDSIFEQWLRRHRPLQEVYPEAN 0"
## [1] "DASQSSMHNALHIYMNGTMSQVPGSANDPIFLLHHAFVDSIFEQWLRKYHPLQDVYPEAN 0"
## [1] "DAFRSSMHNGLHIYMNGTISQVQGSANDPLFLLHHAFVDSIFERWFRRYHPPQEVYPEAN 0"
## [1] "DASQSSMHNALHIYMNGTMSQVQGSANDPIFLLHHAFVDSIFEQWLRRHHPLQEVYPEAN 0"
## [1] "DPSQSSMHNALHIFMNGTMSQVQGSANDPIFLLHHAFVDSIFEQWLRRHRPLLEVYPEAN 0"
## [1] "DSSQSSMHNALHIYMNGTMSQVQGSANDPIFLLHHAFVDSIFEQWLRRHRPLLEVYPEAN 0"
## [1] "NDSQSSMHNALHIYMNGTMSQVQGSANDPIFLLHHAFVDSIFEQWLRRHRPLQEVYPEAN 0"
## [1] "NISQSGLHNALHIYMNGSMSQVQGSANDPIFILHHAFVDSIFERWLRRHRPMLEVYPAAN 0"
## [1] "VTGRSLMHNSLHVFMNGSMSSVQGSANDPIFIIHHAFIDSIFEQWLRRHQPPRTHYPTAN 0"
## [1] " "
## [1] "APIGHNRESYMVPFIPLYRNGDFFISSKDLGYDYSYLQDSDPDSFQDYIKSYLEQASRIW 0"
## [1] "APIGHNRESYMVPFIPLYRNGDFFISSKDLGYDYSYLQDSDPDSFQDYIKSYLEQASRIW 0"
## [1] "APIGHNRESYMVPFIPLYRNGDFFISSKDLGYDYSYLQDSEPDIFQDYIKPYLEQAQRIW 0"
## [1] "APIGHNRDFYMVPFIPLYRNGDFFISSRDLGYDYSYLQEPDPVFFQDYIKSYLEQASRIW 0"
## [1] "APIGHNRESYMVPFIPLYRNGDFFISSRDLGYDYSNLQDSERDIFQDYIKPFLEQASRIW 0"
## [1] "APIGHNRDSYMVPFIPLYRNGDFFITSKDLGYDYSYLQESDPGFYRNYIEPYLEQASRIW 0"
## [1] "APIGHNRESYMVPFIPLYRNGDFFISSKDLGYDYSYLQESDPGFYRNYIEPYLEQASRIW 0"
## [1] "APIGHNRDSYMVPFIPLYKNGDFFISSRDLGYDYSYLQDSDPDFFQTYFEPYLEQARQIW 0"
## [1] "APIGHNRENYMVPFIPLYRNGEFFISSRELGYDYEYLQEPALGSFQDFLIPYLKQAHQIW 0"
## [1] "APIGHNDGYFMVPFIPLYRNGDYFLSTKALGYEYAYLQDPGQRFVQEFLTPYLEQAQQIW 0"
## [1] " "
## [1] "SWLLGAAMVGAVLTALLAGLVSLLCRHKR---KQLPEEKQPLLMEK-------------- 0"
## [1] "SWLLGAAMVGAVLTALLAGLVSLLCRHKR---KQLPEEKQPLLMEKXGLPQLVSEPFIKG 0"
## [1] "PWLIGAAVVGSVLTAVLGGLTSLLCRRKR---NQLPEEKQPLLMEK-------------- 0"
## [1] "PWLLGAALLGSVLTAVLGGLTTLLCRRKR---GQLPEEKQPLLMEN-------------- 0"
## [1] "PWLIGAAVVGSVLTAVLGRLTSLLCRRKR---KQLREERQPLLMEK-------------- 0"
## [1] "PWLLGAALVGAVIAAALSGLSSRLCLQKKKKKKQPQEERQPLLMDK-------------- 0"
## [1] "PWLLGAALVGAVVATALAGLSSRLCHQKK---KQPQEERQPLLMDK-------------- 0"
## [1] "PWLLGAALVGAALAAALVGLTSLLCHHKR---KQLPEEKQPLLTEK-------------- 0"
## [1] "PWLVGAAVIGGIITAVLSGLI-LACRKKR---KGTSPEIQPLLTES-------------- 0"
## [1] "QWLVAAGILGAAVAGIIASVFAVACRRQTRRRKLSYGERQPLLNSS-------------- 0"
## [1] " "
## [1] "--------EDYHSLYQSHL-------------------- 21"
## [1] "LGNRVGPKXPDLTLTQSNVQVPENICWYFSVKTICKIVT 21"
## [1] "-------EDYHNLMYQSHL-------------------- 21"
## [1] "-------EDYHNLLYQSHL-------------------- 21"
## [1] "-------EDYHSLLYQTHV-------------------- 21"
## [1] "-------DDYHSLLYQSHL-------------------- 21"
## [1] "-------DDYHSLLYQSHL-------------------- 21"
## [1] "-------EDYSSLLHQSHL-------------------- 21"
## [1] "-------EDYNNVSYQSHF-------------------- 21"
## [1] "-------EEEGSASYQTTL-------------------- 21"
## [1] " "
#First step very important: changing class of alignment; second step displays MSA
class(tyr_align) <- "AAMultipleAlignment"
ggmsa::ggmsa(tyr_align,
start = 125,
end = 175)
#Calculates genetic distance using MSA
tyr_dist <- seqinr::dist.alignment(tyr_align_seqinr,
matrix = "identity")
#Shows information about distances
is(tyr_dist)
## [1] "dist" "oldClass"
class(tyr_dist)
## [1] "dist"
tyr_align_seqinr_rnd <- round(tyr_dist, 3)
tyr_align_seqinr_rnd
## NP_000363.1 XP_001136041.2 XP_006075241.1 XP_001492610.4
## XP_001136041.2 0.131
## XP_006075241.1 0.369 0.374
## XP_001492610.4 0.410 0.415 0.371
## XP_003992691.2 0.371 0.374 0.350 0.414
## NP_035791.1 0.379 0.389 0.421 0.441
## NP_001101005.1 0.366 0.379 0.419 0.441
## XP_003468651.1 0.389 0.398 0.419 0.451
## NP_989491.1 0.519 0.519 0.520 0.540
## NP_571088.3 0.626 0.630 0.628 0.630
## XP_003992691.2 NP_035791.1 NP_001101005.1 XP_003468651.1
## XP_001136041.2
## XP_006075241.1
## XP_001492610.4
## XP_003992691.2
## NP_035791.1 0.419
## NP_001101005.1 0.417 0.204
## XP_003468651.1 0.426 0.391 0.407
## NP_989491.1 0.527 0.529 0.529 0.540
## NP_571088.3 0.632 0.626 0.622 0.631
## NP_989491.1
## XP_001136041.2
## XP_006075241.1
## XP_001492610.4
## XP_003992691.2
## NP_035791.1
## NP_001101005.1
## XP_003468651.1
## NP_989491.1
## NP_571088.3 0.629
# Function that uses genetic distances to cluster sequences into clades
tree <- nj(tyr_dist)
# Plots phylogenetic tree and adds a label
plot.phylo(tree, main="Phylogenetic Tree",
use.edge.length = F)
mtext(text = "tyr family gene tree - rooted, no branch lengths")