###Introduction This code complies summary information about the gene LRRC19 (Leucine-Rich Repeat Containing Protein 19) It also generates alignments and a phylogeneitc tree to indicating the evolutionary relationship betweeen the human version of the gene and its homologs in other species.
###Resources / References Key information use to make this script can be found here:
http://pfam.xfam.org/family/PF15176#tabview=tab0 https://www.uniprot.org/uniprot/Q9H756 https://alphafold.ebi.ac.uk/entry/Q9H756
###Preparation Load necessary packages:
Download and load drawProteins from Bioconductor
library(BiocManager)
## Bioconductor version '3.13' is out-of-date; the current release version '3.14'
## is available with R version '4.1'; see https://bioconductor.org/install
library(drawProteins)
Load other packages
#github packages
library(compbio4all)
library(ggmsa)
## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
#CRAN packages
library(rentrez)
library(seqinr)
library(ape)
##
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
##
## as.alignment, consensus
library(pander)
library(ggplot2)
#Bioconductor packages
library(msa)
## Loading required package: Biostrings
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: XVector
## Loading required package: GenomeInfoDb
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
##
## complement
## The following object is masked from 'package:seqinr':
##
## translate
## The following object is masked from 'package:base':
##
## strsplit
##
## Attaching package: 'msa'
## The following object is masked from 'package:BiocManager':
##
## version
library(drawProteins)
#Biostrings
library(Biostrings)
###Accession numbers #Not available: Neanderthal, Drosophila
lrrc19_table <- c("NP_075052.1", "Q9H756", "NA","Homo sapiens","Human", "LRCC19",
"XP_001154402.1", "NA", "NA","Pan troglodytes", "Chimpanzee", "LRCC19",
"XP_001105486.1", "NA", "NA","Macaca mulatta", "Rhesus macaque", "LRCC19",
"XP_003431607.1", "NA","NA","Canis lupus familiaris","Dog", "LRCC19",
"NP_001178196.1", "NA","NA", "Bos taurus", "Cattle", "LRCC19",
"NP_780514.1", "Q8BZT5","NA","Mus musculus", "House Mouse", "LRCC19",
"NP_001102883.1","NA", "NA","Rattus norvegicus", "Brown Rat", "LRCC19",
"XP_027640383.1", "NA","NA","Falco peregrinus anatum", "Peregrine Falcon", "LRCC19",
"XP_003911679.1", "NA","NA","Papio anubis","Olive baboon", "LRCC19",
"XP_023059658.1","NA", "NA","Piliocolobus tephrosceles", "Uganda red colobus", "LRCC19")
lrrc19_matrix <- matrix(lrrc19_table, byrow = T, nrow = 10)
lrrc19_table <- data.frame(lrrc19_matrix, stringsAsFactors = F)
names(lrrc19_table) <- c("RefSeq", "Uniprot", "PDB", "Sci Name", "Common Name", "Gene Name")
lrrc19_table
## RefSeq Uniprot PDB Sci Name Common Name
## 1 NP_075052.1 Q9H756 NA Homo sapiens Human
## 2 XP_001154402.1 NA NA Pan troglodytes Chimpanzee
## 3 XP_001105486.1 NA NA Macaca mulatta Rhesus macaque
## 4 XP_003431607.1 NA NA Canis lupus familiaris Dog
## 5 NP_001178196.1 NA NA Bos taurus Cattle
## 6 NP_780514.1 Q8BZT5 NA Mus musculus House Mouse
## 7 NP_001102883.1 NA NA Rattus norvegicus Brown Rat
## 8 XP_027640383.1 NA NA Falco peregrinus anatum Peregrine Falcon
## 9 XP_003911679.1 NA NA Papio anubis Olive baboon
## 10 XP_023059658.1 NA NA Piliocolobus tephrosceles Uganda red colobus
## Gene Name
## 1 LRCC19
## 2 LRCC19
## 3 LRCC19
## 4 LRCC19
## 5 LRCC19
## 6 LRCC19
## 7 LRCC19
## 8 LRCC19
## 9 LRCC19
## 10 LRCC19
###Data Preparation ##Download Sequences #All sequences were downloaded using a wrapper compbio4all::entrez_fetch_list() which uses rentrez::entrez_fetch() to access NCBI databases.
#Obtaining all of the sequences
lrrc19_list <- entrez_fetch_list(db = "protein",
id = lrrc19_table$RefSeq,
rettype = "fasta")
length(lrrc19_list)
## [1] 10
lrrc19_list[1]
## $NP_075052.1
## [1] ">NP_075052.1 leucine-rich repeat-containing protein 19 precursor [Homo sapiens]\nMKVTGITILFWPLSMILLSDKIQSSKREVQCNFTEKNYTLIPADIKKDVTILDLSYNQITLNGTDTRVLQ\nTYFLLTELYLIENKVTILHNNGFGNLSSLEILNICRNSIYVIQQGAFLGLNKLKQLYLCQNKIEQLNADV\nFVPLRSLKLLNLQGNLISYLDVPPLFHLELITLYGNLWNCSCSLFNLQNWLNTSNVTLENENITMCSYPN\nSLQSYNIKTVPHKAECHSKFPSSVTEDLYIHFQPISNSIFNSSSNNLTRNSEHEPLGKSWAFLVGVVVTV\nLTTSLLIFIAIKCPIWYNILLSYNHHRLEEHEAETYEDGFTGNPSSLSQIPETNSEETTVIFEQLHSFVV\nDDDGFIEDKYIDIHELCEEN\n\n"
for(i in 1:length(lrrc19_list)){
lrrc19_list[[i]] <- compbio4all::fasta_cleaner(lrrc19_list[[i]], parse = F)
}
lrrc19_list[1]
## $NP_075052.1
## [1] "MKVTGITILFWPLSMILLSDKIQSSKREVQCNFTEKNYTLIPADIKKDVTILDLSYNQITLNGTDTRVLQTYFLLTELYLIENKVTILHNNGFGNLSSLEILNICRNSIYVIQQGAFLGLNKLKQLYLCQNKIEQLNADVFVPLRSLKLLNLQGNLISYLDVPPLFHLELITLYGNLWNCSCSLFNLQNWLNTSNVTLENENITMCSYPNSLQSYNIKTVPHKAECHSKFPSSVTEDLYIHFQPISNSIFNSSSNNLTRNSEHEPLGKSWAFLVGVVVTVLTTSLLIFIAIKCPIWYNILLSYNHHRLEEHEAETYEDGFTGNPSSLSQIPETNSEETTVIFEQLHSFVVDDDGFIEDKYIDIHELCEEN"
###General Protein Information
##Protein Diagram
Q9H756 <- drawProteins::get_features("Q9H756")
## [1] "Download has worked"
## [1] "Download has worked"
my_prot_df <- drawProteins::feature_to_dataframe(Q9H756)
my_canvas <- draw_canvas(my_prot_df)
my_canvas <- draw_chains(my_canvas, my_prot_df, label_size = 2.5)
my_canvas <- draw_repeat(my_canvas, my_prot_df)
my_canvas
###Draw Dotplot ##Prepare Data
lrrc19_human_FASTA <- rentrez::entrez_fetch(id ="NP_075052.1" ,
db = "protein",
rettype="fasta")
lrrc19_human_vector <- fasta_cleaner(lrrc19_human_FASTA)
lrrc19_human_vector
## [1] "M" "K" "V" "T" "G" "I" "T" "I" "L" "F" "W" "P" "L" "S" "M" "I" "L" "L"
## [19] "S" "D" "K" "I" "Q" "S" "S" "K" "R" "E" "V" "Q" "C" "N" "F" "T" "E" "K"
## [37] "N" "Y" "T" "L" "I" "P" "A" "D" "I" "K" "K" "D" "V" "T" "I" "L" "D" "L"
## [55] "S" "Y" "N" "Q" "I" "T" "L" "N" "G" "T" "D" "T" "R" "V" "L" "Q" "T" "Y"
## [73] "F" "L" "L" "T" "E" "L" "Y" "L" "I" "E" "N" "K" "V" "T" "I" "L" "H" "N"
## [91] "N" "G" "F" "G" "N" "L" "S" "S" "L" "E" "I" "L" "N" "I" "C" "R" "N" "S"
## [109] "I" "Y" "V" "I" "Q" "Q" "G" "A" "F" "L" "G" "L" "N" "K" "L" "K" "Q" "L"
## [127] "Y" "L" "C" "Q" "N" "K" "I" "E" "Q" "L" "N" "A" "D" "V" "F" "V" "P" "L"
## [145] "R" "S" "L" "K" "L" "L" "N" "L" "Q" "G" "N" "L" "I" "S" "Y" "L" "D" "V"
## [163] "P" "P" "L" "F" "H" "L" "E" "L" "I" "T" "L" "Y" "G" "N" "L" "W" "N" "C"
## [181] "S" "C" "S" "L" "F" "N" "L" "Q" "N" "W" "L" "N" "T" "S" "N" "V" "T" "L"
## [199] "E" "N" "E" "N" "I" "T" "M" "C" "S" "Y" "P" "N" "S" "L" "Q" "S" "Y" "N"
## [217] "I" "K" "T" "V" "P" "H" "K" "A" "E" "C" "H" "S" "K" "F" "P" "S" "S" "V"
## [235] "T" "E" "D" "L" "Y" "I" "H" "F" "Q" "P" "I" "S" "N" "S" "I" "F" "N" "S"
## [253] "S" "S" "N" "N" "L" "T" "R" "N" "S" "E" "H" "E" "P" "L" "G" "K" "S" "W"
## [271] "A" "F" "L" "V" "G" "V" "V" "V" "T" "V" "L" "T" "T" "S" "L" "L" "I" "F"
## [289] "I" "A" "I" "K" "C" "P" "I" "W" "Y" "N" "I" "L" "L" "S" "Y" "N" "H" "H"
## [307] "R" "L" "E" "E" "H" "E" "A" "E" "T" "Y" "E" "D" "G" "F" "T" "G" "N" "P"
## [325] "S" "S" "L" "S" "Q" "I" "P" "E" "T" "N" "S" "E" "E" "T" "T" "V" "I" "F"
## [343] "E" "Q" "L" "H" "S" "F" "V" "V" "D" "D" "D" "G" "F" "I" "E" "D" "K" "Y"
## [361] "I" "D" "I" "H" "E" "L" "C" "E" "E" "N"
par(mfrow = c(2,2),
mar = c(0,0,2,1))
# plot 1: Defaults
dotPlot(lrrc19_human_vector, lrrc19_human_vector,
wsize = ,
nmatch = ,
main = "Defaults")
# plot 2 size = 10, nmatch = 1
dotPlot(lrrc19_human_vector, lrrc19_human_vector,
wsize =10 ,
nmatch =1 ,
main = "size=10 nmatch=1")
# plot 3: size = 10, nmatch = 5
dotPlot(lrrc19_human_vector, lrrc19_human_vector,
wsize = 10,
nmatch =5 ,
main = "size=10 nmatch=5")
# plot 4: size = 20, nmatch = 5
dotPlot(lrrc19_human_vector, lrrc19_human_vector,
wsize = 20,
nmatch =5 ,
main = "size=20 nmatch=5")
#Best Plot
par(mfrow = c(1,1),
mar = c(4,4,4,4))
dotPlot(lrrc19_human_vector, lrrc19_human_vector,
wsize = 20,
nmatch =5 ,
main = "size=20 nmatch=5")
###Protein properties compiled from databases
proteinprop_table <- c("PFAM", "LRR19-TM is the single-span transmembrane region of LRRC19, a leucine-rich repeat protein family. LRRC19 functions as a transmembrane receptor inducing pro-inflammatory cytokines. This suggests its role in innate immunity [1]. This family of proteins is found in eukaryotes.", "http://pfam.xfam.org/family/PF15176#tabview=tab0",
"Uniprot", "LRRC19 is a pathogen-recognition receptor that is found in the gut and and kidney.", "https://www.uniprot.org/uniprot/Q9H756",
"Alphafold", "Alphafold predicts with high confidence that the protein has a mixture of beta plated sheets and alphahelices", "https://alphafold.ebi.ac.uk/entry/Q9H756")
prop_matrix <- matrix(proteinprop_table, byrow = T, nrow = 3)
proteinprop_table <- data.frame(prop_matrix, stringsAsFactors = F)
names(proteinprop_table) <- c("Source", "Protein Property", "Link")
proteinprop_table
## Source
## 1 PFAM
## 2 Uniprot
## 3 Alphafold
## Protein Property
## 1 LRR19-TM is the single-span transmembrane region of LRRC19, a leucine-rich repeat protein family. LRRC19 functions as a transmembrane receptor inducing pro-inflammatory cytokines. This suggests its role in innate immunity [1]. This family of proteins is found in eukaryotes.
## 2 LRRC19 is a pathogen-recognition receptor that is found in the gut and and kidney.
## 3 Alphafold predicts with high confidence that the protein has a mixture of beta plated sheets and alphahelices
## Link
## 1 http://pfam.xfam.org/family/PF15176#tabview=tab0
## 2 https://www.uniprot.org/uniprot/Q9H756
## 3 https://alphafold.ebi.ac.uk/entry/Q9H756
###Protein feature prediction
aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
"L","K","M","F","P","S","T","W","Y","V")
alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91,
221, 249, 48, 123, 82, 122, 119, 33, 63, 167)
beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120,
177, 115, 16, 85, 127, 341, 253, 44, 110, 229)
a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
110, 112, 25, 52, 71, 126, 117, 30, 108, 123)
a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239,
339, 321, 91, 158, 188, 327, 238, 72, 130, 378)
data.frame(aa.1.1, alpha, beta, a.plus.b, a.div.b)
## aa.1.1 alpha beta a.plus.b a.div.b
## 1 A 285 203 175 361
## 2 R 53 67 78 146
## 3 N 97 139 120 183
## 4 D 163 121 111 244
## 5 C 22 75 74 63
## 6 Q 67 122 74 114
## 7 E 134 86 86 257
## 8 G 197 297 171 377
## 9 H 111 49 33 107
## 10 I 91 120 93 239
## 11 L 221 177 110 339
## 12 K 249 115 112 321
## 13 M 48 16 25 91
## 14 F 123 85 52 158
## 15 P 82 127 71 188
## 16 S 122 341 126 327
## 17 T 119 253 117 238
## 18 W 33 44 30 72
## 19 Y 63 110 108 130
## 20 V 167 229 123 378
#Convert to frequencies
alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)
#dataframe
aa.prop <- data.frame(alpha.prop,
beta.prop,
a.plus.b.prop,
a.div.b)
#row labels
row.names(aa.prop) <- aa.1.1
#Table 5 is now this
aa.prop
## alpha.prop beta.prop a.plus.b.prop a.div.b
## A 0.116469146 0.073126801 0.09264161 0.08331410
## R 0.021659174 0.024135447 0.04129169 0.03369490
## N 0.039640376 0.050072046 0.06352567 0.04223402
## D 0.066612178 0.043587896 0.05876125 0.05631202
## C 0.008990601 0.027017291 0.03917417 0.01453958
## Q 0.027380466 0.043948127 0.03917417 0.02630972
## E 0.054760932 0.030979827 0.04552673 0.05931225
## G 0.080506743 0.106988473 0.09052409 0.08700669
## H 0.045361667 0.017651297 0.01746956 0.02469421
## I 0.037188394 0.043227666 0.04923240 0.05515809
## L 0.090314671 0.063760807 0.05823187 0.07823679
## K 0.101757254 0.041426513 0.05929063 0.07408262
## M 0.019615856 0.005763689 0.01323452 0.02100162
## F 0.050265631 0.030619597 0.02752779 0.03646434
## P 0.033510421 0.045749280 0.03758602 0.04338795
## S 0.049856968 0.122838617 0.06670196 0.07546734
## T 0.048630977 0.091138329 0.06193753 0.05492730
## W 0.013485901 0.015850144 0.01588142 0.01661666
## Y 0.025745811 0.039625360 0.05717311 0.03000231
## V 0.068246833 0.082492795 0.06511382 0.08723748
##Download Focal Gene
#obtained from before
lrrc19_human_vector
## [1] "M" "K" "V" "T" "G" "I" "T" "I" "L" "F" "W" "P" "L" "S" "M" "I" "L" "L"
## [19] "S" "D" "K" "I" "Q" "S" "S" "K" "R" "E" "V" "Q" "C" "N" "F" "T" "E" "K"
## [37] "N" "Y" "T" "L" "I" "P" "A" "D" "I" "K" "K" "D" "V" "T" "I" "L" "D" "L"
## [55] "S" "Y" "N" "Q" "I" "T" "L" "N" "G" "T" "D" "T" "R" "V" "L" "Q" "T" "Y"
## [73] "F" "L" "L" "T" "E" "L" "Y" "L" "I" "E" "N" "K" "V" "T" "I" "L" "H" "N"
## [91] "N" "G" "F" "G" "N" "L" "S" "S" "L" "E" "I" "L" "N" "I" "C" "R" "N" "S"
## [109] "I" "Y" "V" "I" "Q" "Q" "G" "A" "F" "L" "G" "L" "N" "K" "L" "K" "Q" "L"
## [127] "Y" "L" "C" "Q" "N" "K" "I" "E" "Q" "L" "N" "A" "D" "V" "F" "V" "P" "L"
## [145] "R" "S" "L" "K" "L" "L" "N" "L" "Q" "G" "N" "L" "I" "S" "Y" "L" "D" "V"
## [163] "P" "P" "L" "F" "H" "L" "E" "L" "I" "T" "L" "Y" "G" "N" "L" "W" "N" "C"
## [181] "S" "C" "S" "L" "F" "N" "L" "Q" "N" "W" "L" "N" "T" "S" "N" "V" "T" "L"
## [199] "E" "N" "E" "N" "I" "T" "M" "C" "S" "Y" "P" "N" "S" "L" "Q" "S" "Y" "N"
## [217] "I" "K" "T" "V" "P" "H" "K" "A" "E" "C" "H" "S" "K" "F" "P" "S" "S" "V"
## [235] "T" "E" "D" "L" "Y" "I" "H" "F" "Q" "P" "I" "S" "N" "S" "I" "F" "N" "S"
## [253] "S" "S" "N" "N" "L" "T" "R" "N" "S" "E" "H" "E" "P" "L" "G" "K" "S" "W"
## [271] "A" "F" "L" "V" "G" "V" "V" "V" "T" "V" "L" "T" "T" "S" "L" "L" "I" "F"
## [289] "I" "A" "I" "K" "C" "P" "I" "W" "Y" "N" "I" "L" "L" "S" "Y" "N" "H" "H"
## [307] "R" "L" "E" "E" "H" "E" "A" "E" "T" "Y" "E" "D" "G" "F" "T" "G" "N" "P"
## [325] "S" "S" "L" "S" "Q" "I" "P" "E" "T" "N" "S" "E" "E" "T" "T" "V" "I" "F"
## [343] "E" "Q" "L" "H" "S" "F" "V" "V" "D" "D" "D" "G" "F" "I" "E" "D" "K" "Y"
## [361] "I" "D" "I" "H" "E" "L" "C" "E" "E" "N"
#Determine aa freq for lrrc19
lrrc19.freq.table <-table(lrrc19_human_vector)/length(lrrc19_human_vector)
#Convert table into vector function
table_to_vector <- function(table_x){
table_names <- attr(table_x, "dimnames")[[1]]
table_vect <- as.vector(table_x)
names(table_vect) <- table_names
return(table_vect)
}
lrrc19.human.aa.freq <- table_to_vector(lrrc19.freq.table)
lrrc19.human.aa.freq
## A C D E F G
## 0.018918919 0.024324324 0.037837838 0.070270270 0.045945946 0.035135135
## H I K L M N
## 0.029729730 0.086486486 0.045945946 0.143243243 0.008108108 0.094594595
## P Q R S T V
## 0.035135135 0.040540541 0.016216216 0.089189189 0.070270270 0.054054054
## W Y
## 0.013513514 0.040540541
#Check for presence of U, an unknown amino acid
aa.names <- names(lrrc19.human.aa.freq)
any(aa.names == "U")
## [1] FALSE
#Add data on lrrc19 to amino acid frequency table
aa.prop$lrrc19.human.aa.freq <- lrrc19.human.aa.freq
pander::pander(aa.prop)
| alpha.prop | beta.prop | a.plus.b.prop | a.div.b | lrrc19.human.aa.freq | |
|---|---|---|---|---|---|
| A | 0.1165 | 0.07313 | 0.09264 | 0.08331 | 0.01892 |
| R | 0.02166 | 0.02414 | 0.04129 | 0.03369 | 0.02432 |
| N | 0.03964 | 0.05007 | 0.06353 | 0.04223 | 0.03784 |
| D | 0.06661 | 0.04359 | 0.05876 | 0.05631 | 0.07027 |
| C | 0.008991 | 0.02702 | 0.03917 | 0.01454 | 0.04595 |
| Q | 0.02738 | 0.04395 | 0.03917 | 0.02631 | 0.03514 |
| E | 0.05476 | 0.03098 | 0.04553 | 0.05931 | 0.02973 |
| G | 0.08051 | 0.107 | 0.09052 | 0.08701 | 0.08649 |
| H | 0.04536 | 0.01765 | 0.01747 | 0.02469 | 0.04595 |
| I | 0.03719 | 0.04323 | 0.04923 | 0.05516 | 0.1432 |
| L | 0.09031 | 0.06376 | 0.05823 | 0.07824 | 0.008108 |
| K | 0.1018 | 0.04143 | 0.05929 | 0.07408 | 0.09459 |
| M | 0.01962 | 0.005764 | 0.01323 | 0.021 | 0.03514 |
| F | 0.05027 | 0.03062 | 0.02753 | 0.03646 | 0.04054 |
| P | 0.03351 | 0.04575 | 0.03759 | 0.04339 | 0.01622 |
| S | 0.04986 | 0.1228 | 0.0667 | 0.07547 | 0.08919 |
| T | 0.04863 | 0.09114 | 0.06194 | 0.05493 | 0.07027 |
| W | 0.01349 | 0.01585 | 0.01588 | 0.01662 | 0.05405 |
| Y | 0.02575 | 0.03963 | 0.05717 | 0.03 | 0.01351 |
| V | 0.06825 | 0.08249 | 0.06511 | 0.08724 | 0.04054 |
##Functions to calculate similarities Two custom functions needed: one to calculate correlations and one to calculate cosine similarity
# Corrleation used in Chou adn Zhange 1992.
chou_cor <- function(x,y){
numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)
}
# Cosine similarity used in Higgs and Attwood (2005).
chou_cosine <- function(z.1, z.2){
z.1.abs <- sqrt(sum(z.1^2))
z.2.abs <- sqrt(sum(z.2^2))
my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs)
return(my.cosine)
}
#Calculate correlation between each column
corr.alpha <- chou_cor(aa.prop[,5], aa.prop[,1])
corr.beta <- chou_cor(aa.prop[,5], aa.prop[,2])
corr.apb <- chou_cor(aa.prop[,5], aa.prop[,3])
corr.adb <- chou_cor(aa.prop[,5], aa.prop[,4])
#Calculate cosin similarity
cos.alpha <- chou_cosine(aa.prop[,5], aa.prop[,1])
cos.beta <- chou_cosine(aa.prop[,5], aa.prop[,2])
cos.apb <- chou_cosine(aa.prop[,5], aa.prop[,3])
cos.adb <- chou_cosine(aa.prop[,5], aa.prop[,4])
#Calculate distance. (need to flip dataframe on its side with t())
aa.prop.flipped <- t(aa.prop)
round(aa.prop.flipped,2)
## A R N D C Q E G H I L
## alpha.prop 0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09
## beta.prop 0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06
## a.plus.b.prop 0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06
## a.div.b 0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08
## lrrc19.human.aa.freq 0.02 0.02 0.04 0.07 0.05 0.04 0.03 0.09 0.05 0.14 0.01
## K M F P S T W Y V
## alpha.prop 0.10 0.02 0.05 0.03 0.05 0.05 0.01 0.03 0.07
## beta.prop 0.04 0.01 0.03 0.05 0.12 0.09 0.02 0.04 0.08
## a.plus.b.prop 0.06 0.01 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## a.div.b 0.07 0.02 0.04 0.04 0.08 0.05 0.02 0.03 0.09
## lrrc19.human.aa.freq 0.09 0.04 0.04 0.02 0.09 0.07 0.05 0.01 0.04
#Distance Matrix
dist(aa.prop.flipped, method = "euclidean")
## alpha.prop beta.prop a.plus.b.prop a.div.b
## beta.prop 0.13342098
## a.plus.b.prop 0.09281824 0.08289406
## a.div.b 0.06699039 0.08659174 0.06175113
## lrrc19.human.aa.freq 0.18680998 0.16937594 0.16093128 0.15865109
#Individual Distances
dist.alpha <- dist((aa.prop.flipped[c(1,5),]), method = "euclidean")
dist.beta <- dist((aa.prop.flipped[c(2,5),]), method = "euclidean")
dist.apb <- dist((aa.prop.flipped[c(3,5),]), method = "euclidean")
dist.adb <- dist((aa.prop.flipped[c(4,5),]), method = "euclidean")
#Compiling the information
# fold types
fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")
# data
corr.sim <- round(c(corr.alpha,corr.beta,corr.apb,corr.adb),5)
cosine.sim <- round(c(cos.alpha,cos.beta,cos.apb,cos.adb),5)
Euclidean.dist <- round(c(dist.alpha,dist.beta,dist.apb,dist.adb),5)
# summary
sim.sum <- c("","","most.sim","")
dist.sum <- c("","","min.dist","")
df <- data.frame(fold.type,
corr.sim ,
cosine.sim ,
Euclidean.dist ,
sim.sum ,
dist.sum )
#Display Output
pander::pander(df)
| fold.type | corr.sim | cosine.sim | Euclidean.dist | sim.sum | dist.sum |
|---|---|---|---|---|---|
| alpha | 0.7478 | 0.7478 | 0.1868 | ||
| beta | 0.7951 | 0.7951 | 0.1694 | ||
| alpha plus beta | 0.8053 | 0.8053 | 0.1609 | most.sim | min.dist |
| alpha/beta | 0.8129 | 0.8129 | 0.1587 |
###Percent Identity Comparison (PID) ##Data preparation
#Humans, Chimps, Rhesus Macaque, Dog
## sequence 1: Humans
human_fasta <- rentrez::entrez_fetch(db = "protein",
id = "NP_075052.1",
rettype = "fasta")
## sequence 2: Chimps
chimp_fasta <- rentrez::entrez_fetch(db = "protein",
id = "XP_001154402.1",
rettype = "fasta")
## sequence 3: Rhesus Macaque
rhesus_fasta <- rentrez::entrez_fetch(db = "protein",
id = "XP_001105486.1 ",
rettype = "fasta")
## sequence 4: Dog
dog_fasta <- rentrez::entrez_fetch(db = "protein",
id = "XP_003431607.1 ",
rettype = "fasta")
# clean
human_vector <- fasta_cleaner(human_fasta)
chimp_vector <- fasta_cleaner(chimp_fasta)
rhesus_vector <- fasta_cleaner(rhesus_fasta)
dog_vector <- fasta_cleaner(dog_fasta)
human_string <-paste(human_vector,collapse = "")
chimp_string <-paste(chimp_vector,collapse = "")
rhesus_string <-paste(rhesus_vector,collapse = "")
dog_string <-paste(dog_vector,collapse = "")
human_string <- toupper(human_string)
chimp_string <- toupper(chimp_string)
rhesus_string <- toupper(rhesus_string)
dog_string <- toupper(dog_string)
##Alignments
data("BLOSUM50")
globalAlignHumanChimp <- Biostrings::pairwiseAlignment(human_string,
chimp_string,
substitutionMatrix = BLOSUM50,
gapOpening = -8,
gapExtension = -2,
scoreOnly = FALSE)
globalAlignHumanRhesus <- Biostrings::pairwiseAlignment(human_string,
rhesus_string,
substitutionMatrix = BLOSUM50,
gapOpening = -8,
gapExtension = -2,
scoreOnly = FALSE)
globalAlignHumanDog <- Biostrings::pairwiseAlignment(human_string,
dog_string,
substitutionMatrix = BLOSUM50,
gapOpening = -8,
gapExtension = -2,
scoreOnly = FALSE)
globalAlignChimpRhesus <- Biostrings::pairwiseAlignment(chimp_string,
rhesus_string,
substitutionMatrix = BLOSUM50,
gapOpening = -8,
gapExtension = -2,
scoreOnly = FALSE)
globalAlignChimpDog <- Biostrings::pairwiseAlignment(chimp_string,
dog_string,
substitutionMatrix = BLOSUM50,
gapOpening = -8,
gapExtension = -2,
scoreOnly = FALSE)
globalAlignRhesusDog <- Biostrings::pairwiseAlignment(rhesus_string,
dog_string,
substitutionMatrix = BLOSUM50,
gapOpening = -8,
gapExtension = -2,
scoreOnly = FALSE)
##Making matrix
pids <- c(1, NA, NA, NA,
pid(globalAlignHumanChimp), 1, NA, NA,
pid(globalAlignHumanRhesus), pid(globalAlignChimpRhesus), 1, NA,
pid(globalAlignHumanDog), pid(globalAlignChimpDog), pid(globalAlignRhesusDog), 1)
mat <- matrix(pids, nrow = 4, byrow = T)
row.names(mat) <- c("Homo","Chimp","Rhesus","Dog")
colnames(mat) <- c("Homo","Chimp","Rhesus","Dog")
pander::pander(mat)
| Homo | Chimp | Rhesus | Dog | |
|---|---|---|---|---|
| Homo | 1 | NA | NA | NA |
| Chimp | 99.19 | 1 | NA | NA |
| Rhesus | 95.14 | 95.41 | 1 | NA |
| Dog | 79.19 | 79.19 | 80.54 | 1 |
###PID Methods Comparison #Obtaining PIDS (Human Vs Chimp)
pid(globalAlignHumanChimp, type = "PID1")
## [1] 99.18919
pid(globalAlignHumanChimp, type = "PID2")
## [1] 99.18919
pid(globalAlignHumanChimp, type = "PID3")
## [1] 99.18919
pid(globalAlignHumanChimp, type = "PID4")
## [1] 99.18919
PIDcomparison_table <- c("PID1", "99.18919", "aligned positions + internal gap positions",
"PID2", "99.18919", "aligned positions",
"PID3", "99.18919", "lenght shorter sequence",
"PID4", "99.18919", "average length of the two sequences")
PID_matrix <- matrix(PIDcomparison_table, byrow = T, nrow = 4)
PIDcomparison_table <- data.frame(PID_matrix, stringsAsFactors = F)
names(PIDcomparison_table) <- c("Method", "PID", "Denominator")
PIDcomparison_table
## Method PID Denominator
## 1 PID1 99.18919 aligned positions + internal gap positions
## 2 PID2 99.18919 aligned positions
## 3 PID3 99.18919 lenght shorter sequence
## 4 PID4 99.18919 average length of the two sequences
###Multiple Sequence Alignment ##MSA data preparation
lrrc19_vector <- rep(NA, length(lrrc19_list))
lrrc19_vector
## [1] NA NA NA NA NA NA NA NA NA NA
# looping
for(i in 1:length(lrrc19_vector)){
lrrc19_vector[i] <- lrrc19_list[[i]]
}
# naming
names(lrrc19_vector) <- names(lrrc19_list)
lrrc19_vector_ss <- Biostrings::AAStringSet(lrrc19_vector)
lrrc19_vector_ss
## AAStringSet object of length 10:
## width seq names
## [1] 370 MKVTGITILFWPLSMILLSDKIQ...VVDDDGFIEDKYIDIHELCEEN NP_075052.1
## [2] 370 MKVTGITILFWPLSMILLSDKIQ...VVDDDGFIEDKYIDIHELREEN XP_001154402.1
## [3] 370 MKVTGITILFWPLSMVLLSDKIQ...VVDDDGFIEDKYIDIHELREEN XP_001105486.1
## [4] 369 MKATCITILFWPLFMLLLSEERQ...VVDDDGFIEDKYIDTHELHEEN XP_003431607.1
## [5] 370 MKTMSITILFWLLSMLLLSDKSQ...VVDDDGFIEDKYIDTHELREEN NP_001178196.1
## [6] 364 MKVTRFMFWLFSMLLPSVKSQAS...VVDDDGFIEDRYIDINEVHEEK NP_780514.1
## [7] 359 MKVTRFMFWLFSMLLSDKSQASE...VVDDDGFIEDRYIDINEVHEEK NP_001102883.1
## [8] 380 MKLSWFAIWAGILFLNPVTADCN...EDGCTEDKDIDSYSTEKSSSLK XP_027640383.1
## [9] 370 MKVTGITILFWPLSMVLLSDKIQ...VVDDDGFIEDKYIDIHELREEN XP_003911679.1
## [10] 370 MKVTGITILFWPLSMVLSSDKIQ...MVDDDGFIEDKYIDIHELREEN XP_023059658.1
##Building Multiple Sequence Alignment (MSA)
lrrc19_align <- msa(lrrc19_vector_ss,
method = "ClustalW")
## use default substitution matrix
lrrc19_align
## CLUSTAL 2.1
##
## Call:
## msa(lrrc19_vector_ss, method = "ClustalW")
##
## MsaAAMultipleAlignment with 10 rows and 380 columns
## aln names
## [1] MKVT--RFMFWLFSMLLPSVKSQAS...VVDDDGFIEDRYIDINEVHEEK--- NP_780514.1
## [2] MKVT--RFMFWLFSMLL-SDKSQAS...VVDDDGFIEDRYIDINEVHEEK--- NP_001102883.1
## [3] MKATCITILFWPLFMLLLSEERQSS...VVDDDGFIEDKYIDTHELHEEN--- XP_003431607.1
## [4] MKTMSITILFWLLSMLLLSDKSQTS...VVDDDGFIEDKYIDTHELREEN--- NP_001178196.1
## [5] MKVTGITILFWPLSMILLSDKIQSS...VVDDDGFIEDKYIDIHELCEEN--- NP_075052.1
## [6] MKVTGITILFWPLSMILLSDKIQSS...VVDDDGFIEDKYIDIHELREEN--- XP_001154402.1
## [7] MKVTGITILFWPLSMVLLSDKIQSS...VVDDDGFIEDKYIDIHELREEN--- XP_001105486.1
## [8] MKVTGITILFWPLSMVLLSDKIQSS...VVDDDGFIEDKYIDIHELREEN--- XP_003911679.1
## [9] MKVTGITILFWPLSMVLSSDKIQSS...MVDDDGFIEDKYIDIHELREEN--- XP_023059658.1
## [10] MKLSWFAIWAGILFLNPVTADCNIT...VSGEDGCTEDKDIDSYSTEKSSSLK XP_027640383.1
## Con MKVTGITILFWPLSM?LLSDKIQSS...VVDDDGFIEDKYIDIHELREEN--- Consensus
#Get MSA ready for viewing
class(lrrc19_align) <- "AAMultipleAlignment"
# This is converting the format of the msa to another
lrrc19_align_seqinr <- msaConvert(lrrc19_align, type = "seqinr::alignment")
##MSA
#ggmsa::ggmsa(lrrc19_align,
# start = 2030,
# end = 2100,)
###Distance Matrix
lrrc19_subset_dist <- seqinr::dist.alignment(lrrc19_align_seqinr,
matrix = "identity")
is(lrrc19_subset_dist)
## [1] "dist" "oldClass"
class(lrrc19_subset_dist)
## [1] "dist"
lrrc19_subset_dist_rounded <- round(lrrc19_subset_dist,
digits = 3)
lrrc19_subset_dist_rounded
## NP_780514.1 NP_001102883.1 XP_003431607.1 NP_001178196.1
## NP_001102883.1 0.358
## XP_003431607.1 0.560 0.536
## NP_001178196.1 0.560 0.538 0.454
## NP_075052.1 0.537 0.522 0.454 0.444
## XP_001154402.1 0.540 0.525 0.454 0.441
## XP_001105486.1 0.542 0.522 0.439 0.444
## XP_003911679.1 0.542 0.522 0.439 0.444
## XP_023059658.1 0.557 0.536 0.451 0.468
## XP_027640383.1 0.770 0.772 0.763 0.775
## NP_075052.1 XP_001154402.1 XP_001105486.1 XP_003911679.1
## NP_001102883.1
## XP_003431607.1
## NP_001178196.1
## NP_075052.1
## XP_001154402.1 0.090
## XP_001105486.1 0.221 0.214
## XP_003911679.1 0.221 0.214 0.000
## XP_023059658.1 0.285 0.280 0.187 0.187
## XP_027640383.1 0.766 0.764 0.764 0.764
## XP_023059658.1
## NP_001102883.1
## XP_003431607.1
## NP_001178196.1
## NP_075052.1
## XP_001154402.1
## XP_001105486.1
## XP_003911679.1
## XP_023059658.1
## XP_027640383.1 0.771
###Phylogenetic Tree
tree_subset <- nj(lrrc19_subset_dist_rounded)
# plot tree
plot.phylo(tree_subset, main="Phylogenetic Tree",
use.edge.length = F)
# add label
mtext(text = "LRRC19 family gene tree - rooted, no branch lengths")