Introduction: The TLR1 gene is apart of the toll-like receptor family of genes. These genes encode proteins that are involved in the immune response. These proteins recognize molecular patterns of pathogens, so they have a pretty significant imapct on immunology.
Here is the link to the NCBI information on TLR1: https://www.ncbi.nlm.nih.gov/gene/7096
Here is the link to the homolog information for TLR1: https://www.ncbi.nlm.nih.gov/homologene/20694
Here is a link to the Uniprot page for TLR1: https://www.uniprot.org/uniprot/?query=TLR1&sort=score
#Install new packages
#install("drawProteins")
#Load into library
library(BiocManager)
## Bioconductor version '3.13' is out-of-date; the current release version '3.14'
## is available with R version '4.1'; see https://bioconductor.org/install
library(drawProteins)
library(ggpubr)
## Loading required package: ggplot2
library(ggplot2)
#github packages
library(compbio4all)
library(ggmsa)
## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
library(msa)
## Loading required package: Biostrings
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: XVector
## Loading required package: GenomeInfoDb
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
##
## strsplit
##
## Attaching package: 'msa'
## The following object is masked from 'package:BiocManager':
##
## version
#CRAN packages
library(rentrez)
library(seqinr)
##
## Attaching package: 'seqinr'
## The following object is masked from 'package:Biostrings':
##
## translate
library(ape)
##
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
##
## as.alignment, consensus
## The following object is masked from 'package:Biostrings':
##
## complement
## The following object is masked from 'package:ggpubr':
##
## rotate
library(pander)
#make vectors for data frame
common_name<- c("Human", "Chimpanzee", "Rhesus Macaque", "Wolf", "House Mouse", "Brown Rat", "Frog", "Zebrafish", "Rainbow Trout", "Atlantic Salmon")
scientific_name<- c("H.sapiens", "P.troglodytes", "M.mulatta", "C.lupus", "M.musculus", "R.norvegicus", "X.tropicalis", "Danio Rerio", "Oncorhynchus mykiss", "Salmo Salar")
refseq<- c("NP_003254.2", "NP_001123937.1", " NP_001123937.1", "NP_001139615.1", "NP_001263374.1", "NP_001165591.1", "XP_002938702.2", "NP_001124065.1 ", "NP_001159573.1", "XP_014055275.1")
uniprot<- c("Q15399", "NA", "F7AIC3", "F1PIK0", "F1PIK0", "E9PTL5", "NA", "NA", "NA", "NA")
pdb<- c("6NIH", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA")
#make dataframe
df1<- data.frame (refseq, uniprot, pdb, scientific_name, common_name)
pander::pander(df1)
| refseq | uniprot | pdb | scientific_name | common_name |
|---|---|---|---|---|
| NP_003254.2 | Q15399 | 6NIH | H.sapiens | Human |
| NP_001123937.1 | NA | NA | P.troglodytes | Chimpanzee |
| NP_001123937.1 | F7AIC3 | NA | M.mulatta | Rhesus Macaque |
| NP_001139615.1 | F1PIK0 | NA | C.lupus | Wolf |
| NP_001263374.1 | F1PIK0 | NA | M.musculus | House Mouse |
| NP_001165591.1 | E9PTL5 | NA | R.norvegicus | Brown Rat |
| XP_002938702.2 | NA | NA | X.tropicalis | Frog |
| NP_001124065.1 | NA | NA | Danio Rerio | Zebrafish |
| NP_001159573.1 | NA | NA | Oncorhynchus mykiss | Rainbow Trout |
| XP_014055275.1 | NA | NA | Salmo Salar | Atlantic Salmon |
HumanTLR1fasta <- entrez_fetch(db = "protein",
id = "NP_003254.2",
rettype = "fasta")
ChimpTLR1fasta <- entrez_fetch(db= "protein",
id= "NP_001123937.1",
rettype = "fasta")
MacaqueTLR1fasta <- entrez_fetch(db= "protein",
id= "NP_001123937.1",
rettype = "fasta")
WolfTLR1fasta <- entrez_fetch(db= "protein",
id= "NP_001139615.1",
rettype = "fasta")
MouseTLR1fasta <- entrez_fetch(db= "protein",
id= "NP_001263374.1",
rettype = "fasta")
RatTLR1fasta <- entrez_fetch(db= "protein",
id= "NP_001165591.1",
rettype = "fasta")
FrogTLR1fasta <- entrez_fetch(db= "protein",
id= "XP_002938702.2",
rettype = "fasta")
ZebrafishTLR1fasta <- entrez_fetch(db= "protein",
id= "NP_001124065.1",
rettype = "fasta")
TroutTLR1fasta <- entrez_fetch(db= "protein",
id= "NP_001159573.1",
rettype = "fasta")
SalmonTLR1fasta <- entrez_fetch(db= "protein",
id= "XP_014055275.1",
rettype = "fasta")
fasta_list<- list(HumanTLR1fasta, ChimpTLR1fasta, MacaqueTLR1fasta, WolfTLR1fasta, MouseTLR1fasta, RatTLR1fasta, FrogTLR1fasta, ZebrafishTLR1fasta, TroutTLR1fasta, SalmonTLR1fasta)
length(fasta_list)
## [1] 10
fasta_list[[1]]
## [1] ">NP_003254.2 toll-like receptor 1 precursor [Homo sapiens]\nMTSIFHFAIIFMLILQIRIQLSEESEFLVDRSKNGLIHVPKDLSQKTTILNISQNYISELWTSDILSLSK\nLRILIISHNRIQYLDISVFKFNQELEYLDLSHNKLVKISCHPTVNLKHLDLSFNAFDALPICKEFGNMSQ\nLKFLGLSTTHLEKSSVLPIAHLNISKVLLVLGETYGEKEDPEGLQDFNTESLHIVFPTNKEFHFILDVSV\nKTVANLELSNIKCVLEDNKCSYFLSILAKLQTNPKLSNLTLNNIETTWNSFIRILQLVWHTTVWYFSISN\nVKLQGQLDFRDFDYSGTSLKALSIHQVVSDVFGFPQSYIYEIFSNMNIKNFTVSGTRMVHMLCPSKISPF\nLHLDFSNNLLTDTVFENCGHLTELETLILQMNQLKELSKIAEMTTQMKSLQQLDISQNSVSYDEKKGDCS\nWTKSLLSLNMSSNILTDTIFRCLPPRIKVLDLHSNKIKSIPKQVVKLEALQELNVAFNSLTDLPGCGSFS\nSLSVLIIDHNSVSHPSADFFQSCQKMRSIKAGDNPFQCTCELGEFVKNIDQVSSEVLEGWPDSYKCDYPE\nSYRGTLLKDFHMSELSCNITLLIVTIVATMLVLAVTVTSLCSYLDLPWYLRMVCQWTQTRRRARNIPLEE\nLQRNLQFHAFISYSGHDSFWVKNELLPNLEKEGMQICLHERNFVPGKSIVENIITCIEKSYKSIFVLSPN\nFVQSEWCHYELYFAHHNLFHEGSNSLILILLEPIPQYSIPSSYHKLKSLMARRTYLEWPKEKSKRGLFWA\nNLRAAINIKLTEQAKK\n\n"
for(i in 1:length(fasta_list)){
fasta_list[[i]] <- compbio4all::fasta_cleaner(fasta_list[[i]], parse = F)
}
#Building a protein diagram
Q15399_protdiag <- drawProteins::get_features("Q15399")
## [1] "Download has worked"
is(Q15399_protdiag)
## [1] "list" "vector" "list_OR_List" "vector_OR_Vector"
## [5] "vector_OR_factor"
#Raw data converted into dataframe
my_prot_df <- drawProteins::feature_to_dataframe(Q15399_protdiag)
is(my_prot_df)
## [1] "data.frame" "list" "oldClass" "vector"
## [5] "list_OR_List" "vector_OR_Vector" "vector_OR_factor"
my_prot_df[,-2]
## type begin end length accession entryName taxid order
## featuresTemp SIGNAL 1 24 23 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.1 CHAIN 25 786 761 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.2 TOPO_DOM 25 580 555 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.3 TRANSMEM 581 601 20 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.4 TOPO_DOM 602 786 184 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.5 REPEAT 54 77 23 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.6 REPEAT 78 101 23 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.7 REPEAT 102 125 23 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.8 REPEAT 126 150 24 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.9 REPEAT 151 175 24 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.10 REPEAT 176 199 23 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.11 REPEAT 200 223 23 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.12 REPEAT 224 250 26 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.13 REPEAT 251 278 27 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.14 REPEAT 279 308 29 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.15 REPEAT 309 337 28 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.16 REPEAT 338 361 23 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.17 REPEAT 362 388 26 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.18 REPEAT 389 414 25 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.19 REPEAT 415 437 22 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.20 REPEAT 438 457 19 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.21 REPEAT 458 478 20 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.22 REPEAT 479 500 21 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.23 REPEAT 501 524 23 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.24 DOMAIN 525 579 54 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.25 DOMAIN 635 776 141 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.26 REGION 313 316 3 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.27 ACT_SITE 710 710 0 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.28 CARBOHYD 51 51 0 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.29 CARBOHYD 137 137 0 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.30 CARBOHYD 163 163 0 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.31 CARBOHYD 330 330 0 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.32 CARBOHYD 429 429 0 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.33 CARBOHYD 578 578 0 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.34 DISULFID 110 132 22 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.35 DISULFID 223 230 7 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.36 DISULFID 343 368 25 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.37 DISULFID 419 442 23 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.38 VARIANT 44 44 0 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.39 VARIANT 75 75 0 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.40 VARIANT 80 80 0 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.41 VARIANT 118 118 0 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.42 VARIANT 248 248 0 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.43 VARIANT 305 305 0 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.44 VARIANT 315 315 0 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.45 VARIANT 352 352 0 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.46 VARIANT 460 460 0 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.47 VARIANT 542 542 0 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.48 VARIANT 554 554 0 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.49 VARIANT 587 587 0 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.50 VARIANT 602 602 0 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.51 VARIANT 631 631 0 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.52 VARIANT 651 651 0 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.53 VARIANT 674 674 0 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.54 VARIANT 720 720 0 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.55 VARIANT 733 733 0 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.56 CONFLICT 182 182 0 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.57 CONFLICT 228 228 0 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.58 CONFLICT 276 276 0 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.59 STRAND 28 30 2 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.60 STRAND 48 51 3 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.61 HELIX 62 65 3 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.62 STRAND 73 75 2 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.63 STRAND 83 85 2 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.64 HELIX 86 89 3 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.65 STRAND 97 99 2 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.66 STRAND 107 109 2 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.67 STRAND 117 120 3 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.68 HELIX 133 137 4 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.69 STRAND 143 150 7 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.70 HELIX 153 159 6 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.71 STRAND 164 171 7 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.72 TURN 173 178 5 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.73 HELIX 181 184 3 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.74 STRAND 189 195 6 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.75 STRAND 198 200 2 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.76 STRAND 214 218 4 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.77 STRAND 221 223 2 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.78 TURN 227 230 3 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.79 HELIX 231 238 7 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.80 HELIX 239 242 3 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.81 STRAND 248 257 9 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.82 HELIX 258 269 11 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.83 STRAND 274 285 11 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.84 STRAND 301 309 8 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.85 HELIX 317 324 7 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.86 STRAND 329 336 7 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.87 STRAND 352 354 2 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.88 TURN 362 367 5 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.89 STRAND 376 378 2 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.90 HELIX 387 394 7 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.91 STRAND 402 404 2 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.92 HELIX 414 416 2 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.93 STRAND 427 429 2 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.94 HELIX 437 441 4 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.95 STRAND 449 451 2 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.96 HELIX 462 466 4 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.97 STRAND 472 474 2 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.98 TURN 487 491 4 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.99 HELIX 504 516 12 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.100 TURN 517 520 3 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.101 STRAND 521 523 2 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.102 TURN 529 531 2 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.103 HELIX 536 538 2 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.104 STRAND 631 633 2 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.105 STRAND 637 642 5 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.106 HELIX 645 647 2 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.107 HELIX 648 653 5 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.108 HELIX 655 660 5 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.109 TURN 661 663 2 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.110 TURN 669 672 3 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.111 HELIX 679 689 10 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.112 STRAND 690 698 8 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.113 HELIX 699 704 5 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.114 HELIX 707 712 5 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.115 STRAND 724 732 8 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.116 HELIX 736 738 2 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.117 HELIX 744 751 7 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.118 HELIX 762 764 2 Q15399 TLR1_HUMAN 9606 1
## featuresTemp.119 HELIX 767 777 10 Q15399 TLR1_HUMAN 9606 1
#Plot the info
my_canvas <- draw_canvas(my_prot_df)
my_canvas <- draw_chains(my_canvas, my_prot_df,
label_size = 2.5)
my_canvas <- draw_domains(my_canvas, my_prot_df)
my_canvas
#Draw Dotplot
HumanTLR1vector <- compbio4all::fasta_cleaner(HumanTLR1fasta)
seqinr::dotPlot(HumanTLR1vector, HumanTLR1vector)
#grid with different dotplot settings
par(mfrow = c(2,2),
mar = c(0,0,2,1))
seqinr::dotPlot(HumanTLR1vector,HumanTLR1vector ,
wsize = 12,
nmatch = 4)
seqinr::dotPlot(HumanTLR1vector, HumanTLR1vector ,
wsize = 3,
nmatch = 2)
#Best dotplot
seqinr::dotPlot(HumanTLR1vector, HumanTLR1vector ,
wsize = 3,
nmatch = 2)
pfam<- c("LRR_8", "LRRCT", "TIR", "https://nam12.safelinks.protection.outlook.com/?url=http%3A%2F%2Fpfam.xfam.org%2Fprotein%2FQ15399&data=04%7C01%7CMRK96%40pitt.edu%7C894c824c8a3f4f86564f08d9c015918d%7C9ef9f489e0a04eeb87cc3a526112fd0d%7C1%7C0%7C637752021124217536%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&sdata=0u59JB4gZdlGur0lyU2GbWkgFYCVgQI5%2F0TL2nXf2%2BE%3D&reserved=0")
disprot<- c("NA", "NA", "NA", "NA")
repeats<- c("NA", "NA", "NA", "NA")
subcellularloc<- c("golgi apparatus", "plasma membrane", "NA", "NA")
secondarypdb<- c("leucine rich repeat", "NA", "NA", "NA")
protein.prop.df<- data.frame(pfam, disprot, repeats, subcellularloc, secondarypdb)
pander::pander (protein.prop.df)
| disprot | repeats | subcellularloc | secondarypdb |
|---|---|---|---|
| NA | NA | golgi apparatus | leucine rich repeat |
| NA | NA | plasma membrane | NA |
| NA | NA | NA | NA |
| NA | NA | NA | NA |
Here is the link to the alphafold for TLR1: https://alphafold.ebi.ac.uk/entry/Q15399 From the diagram, you can see many alpha helices as well as beta sheets. They also have a consered cytoplasmic domain.
#Amino acid frequency
# enter once
aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
"L","K","M","F","P","S","T","W","Y","V")
# enter again
aa.1.2 <- c("A","R","N","D","C","Q","E","G","H","I",
"L","K","M","F","P","S","T","W","Y","V")
# are they the same length?
length(aa.1.1) == length(aa.1.2)
## [1] TRUE
# are all the values the same?
aa.1.1 == aa.1.2
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE
#length(unique(aa.1.1)) tells me how many unique values
unique(aa.1.1)
## [1] "A" "R" "N" "D" "C" "Q" "E" "G" "H" "I" "L" "K" "M" "F" "P" "S" "T" "W" "Y"
## [20] "V"
# does each vector have the same number of unique values?
length(unique(aa.1.1)) == length(unique(aa.1.2))
## [1] TRUE
# alpha proteins
alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91,
221, 249, 48, 123, 82, 122, 119, 33, 63, 167)
# check against chou's total
sum(alpha) == 2447
## [1] TRUE
# beta proteins
beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120,
177, 115, 16, 85, 127, 341, 253, 44, 110, 229)
# check against chou's total
sum(beta) == 2776
## [1] TRUE
# alpha + beta
a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
110, 112, 25, 52, 71, 126, 117, 30, 108, 123)
sum(a.plus.b) == 1889
## [1] TRUE
# alpha/beta
a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239,
339, 321, 91, 158, 188, 327, 238, 72, 130, 378)
sum(a.div.b) == 4333
## [1] TRUE
#Making data table
data.frame(aa.1.1, alpha, beta, a.plus.b, a.div.b)
## aa.1.1 alpha beta a.plus.b a.div.b
## 1 A 285 203 175 361
## 2 R 53 67 78 146
## 3 N 97 139 120 183
## 4 D 163 121 111 244
## 5 C 22 75 74 63
## 6 Q 67 122 74 114
## 7 E 134 86 86 257
## 8 G 197 297 171 377
## 9 H 111 49 33 107
## 10 I 91 120 93 239
## 11 L 221 177 110 339
## 12 K 249 115 112 321
## 13 M 48 16 25 91
## 14 F 123 85 52 158
## 15 P 82 127 71 188
## 16 S 122 341 126 327
## 17 T 119 253 117 238
## 18 W 33 44 30 72
## 19 Y 63 110 108 130
## 20 V 167 229 123 378
#Calculate proportions
alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)
#Make data frame of proportions
aa.prop <- data.frame(alpha.prop,
beta.prop,
a.plus.b.prop,
a.div.b)
#row labels
row.names(aa.prop) <- aa.1.1
aa.prop
## alpha.prop beta.prop a.plus.b.prop a.div.b
## A 0.116469146 0.073126801 0.09264161 0.08331410
## R 0.021659174 0.024135447 0.04129169 0.03369490
## N 0.039640376 0.050072046 0.06352567 0.04223402
## D 0.066612178 0.043587896 0.05876125 0.05631202
## C 0.008990601 0.027017291 0.03917417 0.01453958
## Q 0.027380466 0.043948127 0.03917417 0.02630972
## E 0.054760932 0.030979827 0.04552673 0.05931225
## G 0.080506743 0.106988473 0.09052409 0.08700669
## H 0.045361667 0.017651297 0.01746956 0.02469421
## I 0.037188394 0.043227666 0.04923240 0.05515809
## L 0.090314671 0.063760807 0.05823187 0.07823679
## K 0.101757254 0.041426513 0.05929063 0.07408262
## M 0.019615856 0.005763689 0.01323452 0.02100162
## F 0.050265631 0.030619597 0.02752779 0.03646434
## P 0.033510421 0.045749280 0.03758602 0.04338795
## S 0.049856968 0.122838617 0.06670196 0.07546734
## T 0.048630977 0.091138329 0.06193753 0.05492730
## W 0.013485901 0.015850144 0.01588142 0.01661666
## Y 0.025745811 0.039625360 0.05717311 0.03000231
## V 0.068246833 0.082492795 0.06511382 0.08723748
#Make scatterplot matrix
plot(aa.prop,panel = panel.smooth)
#Make correlation matrix
cor(aa.prop)
## alpha.prop beta.prop a.plus.b.prop a.div.b
## alpha.prop 1.0000000 0.4941143 0.6969508 0.8555289
## beta.prop 0.4941143 1.0000000 0.7977771 0.7706654
## a.plus.b.prop 0.6969508 0.7977771 1.0000000 0.8198043
## a.div.b 0.8555289 0.7706654 0.8198043 1.0000000
#Round to 3 decimals
round(cor(aa.prop), 3)
## alpha.prop beta.prop a.plus.b.prop a.div.b
## alpha.prop 1.000 0.494 0.697 0.856
## beta.prop 0.494 1.000 0.798 0.771
## a.plus.b.prop 0.697 0.798 1.000 0.820
## a.div.b 0.856 0.771 0.820 1.000
#Do individual plots
par(mfrow = c(1,3), mar = c(4,4,1,0))
plot(alpha.prop ~ beta.prop, data = aa.prop)
plot(alpha.prop ~ a.plus.b.prop, data = aa.prop)
plot(alpha.prop ~ a.div.b, data = aa.prop)
par(mfrow = c(1,1), mar = c(4,4,4,4))
#Download TLR1 Gene
NP_003254.2 <- rentrez::entrez_fetch(id = "NP_003254.2",
db = "protein",
rettype = "fasta")
#Clean and turn into vector
NP_003254.2 <- compbio4all::fasta_cleaner(NP_003254.2, parse = TRUE)
#amino acid frequency for TLR1 gene
NP_003254.2.freq.table <- table(NP_003254.2)/length(NP_003254.2)
#convert table to vector
table_to_vector <- function(table_x){
table_names <- attr(table_x, "dimnames")[[1]]
table_vect <- as.vector(table_x)
names(table_vect) <- table_names
return(table_vect)}
TLR1.human.aa.freq <- table_to_vector(NP_003254.2.freq.table)
#Check for U
aa.names <- names(TLR1.human.aa.freq)
any(aa.names == "U")
## [1] FALSE
#Functions used in Chou's Paper to calculate similarities
#Correlation
chou_cor <- function(x,y){
numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)
}
#Cosine Similarity
chou_cosine <- function(z.1, z.2){
z.1.abs <- sqrt(sum(z.1^2))
z.2.abs <- sqrt(sum(z.2^2))
my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs)
return(my.cosine)
}
#Data Exploration
par(mfrow = c(2,2), mar = c(1,4,1,1))
plot(alpha.prop ~ TLR1.human.aa.freq, data = aa.prop)
plot(beta.prop ~ TLR1.human.aa.freq, data = aa.prop)
plot(a.plus.b.prop ~ TLR1.human.aa.freq, data = aa.prop)
plot(a.div.b ~ TLR1.human.aa.freq, data = aa.prop)
par(mfrow = c(1,1), mar = c(1,1,1,1))
#Calculate Correlations between each columns
corr.alpha <- chou_cor(aa.prop[,4], aa.prop[,1])
corr.beta <- chou_cor(aa.prop[,4], aa.prop[,2])
corr.apb <- chou_cor(aa.prop[,4], aa.prop[,3])
corr.adb <- chou_cor(aa.prop[,4], aa.prop[,4])
#Calculate cosine similarity
cos.alpha <- chou_cosine(aa.prop[,4], aa.prop[,1])
cos.beta <- chou_cosine(aa.prop[,4], aa.prop[,2])
cos.apb <- chou_cosine(aa.prop[,4], aa.prop[,3])
cos.adb <- chou_cosine(aa.prop[,4], aa.prop[,4])
#Calculate distance
aa.prop.flipped <- t(aa.prop)
round(aa.prop.flipped,2)
## A R N D C Q E G H I L K M
## alpha.prop 0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09 0.10 0.02
## beta.prop 0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06 0.04 0.01
## a.plus.b.prop 0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06 0.06 0.01
## a.div.b 0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08 0.07 0.02
## F P S T W Y V
## alpha.prop 0.05 0.03 0.05 0.05 0.01 0.03 0.07
## beta.prop 0.03 0.05 0.12 0.09 0.02 0.04 0.08
## a.plus.b.prop 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## a.div.b 0.04 0.04 0.08 0.05 0.02 0.03 0.09
#get distance matrix
dist(aa.prop.flipped, method = "euclidean")
## alpha.prop beta.prop a.plus.b.prop
## beta.prop 0.13342098
## a.plus.b.prop 0.09281824 0.08289406
## a.div.b 0.06699039 0.08659174 0.06175113
#Individual distances
dist.alpha <- dist((aa.prop.flipped[c(1,4),]), method = "euclidean")
dist.beta <- dist((aa.prop.flipped[c(2,4),]), method = "euclidean")
dist.apb <- dist((aa.prop.flipped[c(3,4),]), method = "euclidean")
dist.adb <- dist((aa.prop.flipped[c(4,4),]), method = "euclidean")
#Compile the info
# fold types
fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")
# data
corr.sim <- round(c(corr.alpha,corr.beta,corr.apb,corr.adb),5)
cosine.sim <- round(c(cos.alpha,cos.beta,cos.apb,cos.adb),5)
Euclidean.dist <- round(c(dist.alpha,dist.beta,dist.apb,dist.adb),5)
# summary
sim.sum <- c("","","most.sim","")
dist.sum <- c("","","min.dist","")
df <- data.frame(fold.type,
corr.sim ,
cosine.sim ,
Euclidean.dist ,
sim.sum ,
dist.sum )
#Display Output
pander::pander(df)
| fold.type | corr.sim | cosine.sim | Euclidean.dist | sim.sum | dist.sum |
|---|---|---|---|---|---|
| alpha | 0.9658 | 0.9658 | 0.06699 | ||
| beta | 0.9436 | 0.9436 | 0.08659 | ||
| alpha plus beta | 0.9686 | 0.9686 | 0.06175 | most.sim | min.dist |
| alpha/beta | 1 | 1 | 0 |
#Data Prep
names(fasta_list)
## NULL
length(fasta_list)
## [1] 10
fasta_list[1]
## [[1]]
## [1] "MTSIFHFAIIFMLILQIRIQLSEESEFLVDRSKNGLIHVPKDLSQKTTILNISQNYISELWTSDILSLSKLRILIISHNRIQYLDISVFKFNQELEYLDLSHNKLVKISCHPTVNLKHLDLSFNAFDALPICKEFGNMSQLKFLGLSTTHLEKSSVLPIAHLNISKVLLVLGETYGEKEDPEGLQDFNTESLHIVFPTNKEFHFILDVSVKTVANLELSNIKCVLEDNKCSYFLSILAKLQTNPKLSNLTLNNIETTWNSFIRILQLVWHTTVWYFSISNVKLQGQLDFRDFDYSGTSLKALSIHQVVSDVFGFPQSYIYEIFSNMNIKNFTVSGTRMVHMLCPSKISPFLHLDFSNNLLTDTVFENCGHLTELETLILQMNQLKELSKIAEMTTQMKSLQQLDISQNSVSYDEKKGDCSWTKSLLSLNMSSNILTDTIFRCLPPRIKVLDLHSNKIKSIPKQVVKLEALQELNVAFNSLTDLPGCGSFSSLSVLIIDHNSVSHPSADFFQSCQKMRSIKAGDNPFQCTCELGEFVKNIDQVSSEVLEGWPDSYKCDYPESYRGTLLKDFHMSELSCNITLLIVTIVATMLVLAVTVTSLCSYLDLPWYLRMVCQWTQTRRRARNIPLEELQRNLQFHAFISYSGHDSFWVKNELLPNLEKEGMQICLHERNFVPGKSIVENIITCIEKSYKSIFVLSPNFVQSEWCHYELYFAHHNLFHEGSNSLILILLEPIPQYSIPSSYHKLKSLMARRTYLEWPKEKSKRGLFWANLRAAINIKLTEQAKK"
fasta_vector<- unlist(fasta_list)
names(fasta_vector) <- names(fasta_list)
#Pairwise Alignments
align.human.chimp <- Biostrings::pairwiseAlignment(
HumanTLR1fasta,
ChimpTLR1fasta)
align.human.wolf<-
Biostrings::pairwiseAlignment(
HumanTLR1fasta,
WolfTLR1fasta)
align.human.frog<- Biostrings::pairwiseAlignment(
HumanTLR1fasta,
FrogTLR1fasta)
align.chimp.wolf<- Biostrings::pairwiseAlignment(
ChimpTLR1fasta,
WolfTLR1fasta)
align.chimp.frog<- Biostrings::pairwiseAlignment(
ChimpTLR1fasta,
FrogTLR1fasta)
align.wolf.frog<- Biostrings::pairwiseAlignment(
WolfTLR1fasta,
FrogTLR1fasta)
#PID
Biostrings::pid(align.human.chimp)
## [1] 96.18056
Biostrings::pid(align.human.wolf)
## [1] 76.61017
Biostrings::pid(align.human.frog)
## [1] 43.987
Biostrings::pid(align.chimp.wolf)
## [1] 76.63657
Biostrings::pid(align.chimp.frog)
## [1] 44.08602
Biostrings::pid(align.wolf.frog)
## [1] 45.69328
#Build Matrix
pids <- c(1, NA, NA, NA,
pid(align.human.chimp), 1, NA, NA,
pid(align.human.wolf), pid(align.chimp.wolf), 1, NA,
pid(align.human.frog), pid(align.chimp.frog), pid(align.wolf.frog), 1)
mat <- matrix(pids, nrow = 4, byrow = T)
row.names(mat) <- c("Homo","Pan","Wolf","Frog")
colnames(mat) <- c("Homo","Pan","Wolf","Frog")
pander::pander(mat)
| Â | Homo | Pan | Wolf | Frog |
|---|---|---|---|---|
| Homo | 1 | NA | NA | NA |
| Pan | 96.18 | 1 | NA | NA |
| Wolf | 76.61 | 76.64 | 1 | NA |
| Frog | 43.99 | 44.09 | 45.69 | 1 |
#PID methods comparison
PID1<- Biostrings::pid(align.human.chimp, type = "PID1")
PID2<- Biostrings::pid(align.human.chimp, type = "PID2")
PID3<- Biostrings::pid(align.human.chimp, type = "PID3")
PID4<- Biostrings::pid(align.human.chimp, type = "PID4")
PID1
## [1] 96.18056
PID2
## [1] 96.85315
PID3
## [1] 96.85315
PID4
## [1] 96.51568
#Make vectors for data table
method<- c("PID1", "PID2", "PID3", "PID4")
PID<- c(96.18, 96.85, 96.85, 96.52)
denominator<- c("(aligned positions + internal gap positions)", "(aligned positions)", "(length shorter sequence)", "(average length of the two sequences)")
#make table
PIDtable<- data.frame(method, PID, denominator)
pander::pander(PIDtable)
| method | PID | denominator |
|---|---|---|
| PID1 | 96.18 | (aligned positions + internal gap positions) |
| PID2 | 96.85 | (aligned positions) |
| PID3 | 96.85 | (length shorter sequence) |
| PID4 | 96.52 | (average length of the two sequences) |
#MSA
names(fasta_vector) <- c("Human", "Chimp", "Rhesus Macaque", "Wolf", "House Mouse", "Brown Rat", "Frog", "Zebrafish", "Rainbow Trout", "Atlantic Salmon")
TLR1_vector_ss <- Biostrings::AAStringSet(fasta_vector)
#Build MSA
TLR1_align <- msa(TLR1_vector_ss, method = "ClustalW")
## use default substitution matrix
TLR1_align
## CLUSTAL 2.1
##
## Call:
## msa(TLR1_vector_ss, method = "ClustalW")
##
## MsaAAMultipleAlignment with 10 rows and 821 columns
## aln names
## [1] MTKPNSLIFYCIIVLGLTLMKIQLS...ANLRASINVKLVNQAEGTCYTQQ-- House Mouse
## [2] MTKTQSTIFYCIVVLGLILIKIQLS...ANLRASINVKLVNQAEATCYTQQ-- Brown Rat
## [3] ---MPSIFHFAIIFMLILQIRIQLS...ANLKAAINIKLTEQAKK-------- Chimp
## [4] ---MPSIFHFAIIFMLILQIRIQLS...ANLKAAINIKLTEQAKK-------- Rhesus Macaque
## [5] ---MTSIFHFAIIFMLILQIRIQLS...ANLRAAINIKLTEQAKK-------- Human
## [6] MKTNPSIFQFAIIFILILEIRIQLS...ANLRASINIKLREQAKK-------- Wolf
## [7] -------------MLSIYILVQSES...ANLREAIHVNLSIKEEDMADPEVRT Frog
## [8] MRAVAVTLWAVAALMGVHQSTPSSS...ANLRAALQADLPTAPVRDDVDE--- Rainbow Trout
## [9] MRAVAVTLWAVAALMGVHQCTPSSS...ANLRAALQADLPTAPVRDEMDE--- Atlantic Salmon
## [10] MKPSSG-WWLVSVYLTCFHPSLIP-...ANLRAALQAELPNTPDREEE----- Zebrafish
## Con M????S?????II?M?I?QI?IQLS...ANLRAAIN?KL??QA?????----- Consensus
#Cleaning up
class(TLR1_align)
## [1] "MsaAAMultipleAlignment"
## attr(,"package")
## [1] "msa"
is(TLR1_align)
## [1] "MsaAAMultipleAlignment" "AAMultipleAlignment" "MsaMetaData"
## [4] "MultipleAlignment"
#Change class of assingment and change in seqinr format
class(TLR1_align) <- "AAMultipleAlignment"
TLR1_align_seqinr <- msaConvert(TLR1_align, type = "seqinr::alignment")
is(TLR1_align)
## [1] "AAMultipleAlignment" "MultipleAlignment"
#Show output
# compbio4all::print_msa(alignment = TLR1_align_seqinr)
#display msa as R plot
ggmsa::ggmsa(TLR1_align,
start = 400,
end = 500)
#Distance Matrix
TLR1_dist <- seqinr::dist.alignment(TLR1_align_seqinr,
matrix = "identity")
is(TLR1_dist)
## [1] "dist" "oldClass"
class(TLR1_dist)
## [1] "dist"
#round
TLR1_align_seqinr_rnd <- round(TLR1_dist, 3)
TLR1_align_seqinr_rnd
## House Mouse Brown Rat Chimp Rhesus Macaque Human Wolf Frog
## Brown Rat 0.355
## Chimp 0.516 0.523
## Rhesus Macaque 0.516 0.523 0.000
## Human 0.512 0.522 0.129 0.129
## Wolf 0.530 0.535 0.454 0.454 0.451
## Frog 0.755 0.760 0.750 0.750 0.749 0.744
## Rainbow Trout 0.800 0.804 0.805 0.805 0.806 0.808 0.807
## Atlantic Salmon 0.794 0.797 0.799 0.799 0.800 0.801 0.802
## Zebrafish 0.801 0.801 0.804 0.804 0.804 0.805 0.816
## Rainbow Trout Atlantic Salmon
## Brown Rat
## Chimp
## Rhesus Macaque
## Human
## Wolf
## Frog
## Rainbow Trout
## Atlantic Salmon 0.236
## Zebrafish 0.707 0.701
#Phylogenetic Tree
tree <- nj(TLR1_dist)
#Plot tree
plot.phylo (tree, main="Phylogenetic Tree",
use.edge.length = F)
mtext(text = "TLR1 family gene tree - unrooted, no branch lengths")