Final portfolio assignment: LRRC19- Leucine-Rich Repeat-Containing Protein 19

###Introduction This code complies summary information about the gene LRRC19 (Leucine-Rich Repeat Containing Protein 19) It also generates alignments and a phylogeneitc tree to indicating the evolutionary relationship betweeen the human version of the gene and its homologs in other species.

###Resources / References Key information use to make this script can be found here:

http://pfam.xfam.org/family/PF15176#tabview=tab0 https://www.uniprot.org/uniprot/Q9H756 https://alphafold.ebi.ac.uk/entry/Q9H756

###Preparation Load necessary packages:

Download and load drawProteins from Bioconductor

library(BiocManager)

## Bioconductor version '3.13' is out-of-date; the current release version '3.14'
##   is available with R version '4.1'; see https://bioconductor.org/install

library(drawProteins)

Load other packages

#github packages
library(compbio4all)
library(ggmsa)

## Registered S3 methods overwritten by 'ggalt':
##   method                  from   
##   grid.draw.absoluteGrob  ggplot2
##   grobHeight.absoluteGrob ggplot2
##   grobWidth.absoluteGrob  ggplot2
##   grobX.absoluteGrob      ggplot2
##   grobY.absoluteGrob      ggplot2

#CRAN packages
library(rentrez)
library(seqinr)
library(ape)

## 
## Attaching package: 'ape'

## The following objects are masked from 'package:seqinr':
## 
##     as.alignment, consensus

library(pander)
library(ggplot2)

#Bioconductor packages
library(msa)

## Loading required package: Biostrings

## Loading required package: BiocGenerics

## Loading required package: parallel

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min

## Loading required package: S4Vectors

## Loading required package: stats4

## 
## Attaching package: 'S4Vectors'

## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname

## Loading required package: IRanges

## 
## Attaching package: 'IRanges'

## The following object is masked from 'package:grDevices':
## 
##     windows

## Loading required package: XVector

## Loading required package: GenomeInfoDb

## 
## Attaching package: 'Biostrings'

## The following object is masked from 'package:ape':
## 
##     complement

## The following object is masked from 'package:seqinr':
## 
##     translate

## The following object is masked from 'package:base':
## 
##     strsplit

## 
## Attaching package: 'msa'

## The following object is masked from 'package:BiocManager':
## 
##     version

library(drawProteins)

#Biostrings
library(Biostrings)

###Accession numbers #Not available: Neanderthal, Drosophila

lrrc19_table <- c("NP_075052.1", "Q9H756", "NA","Homo sapiens","Human", "LRCC19", 
                           "XP_001154402.1", "NA", "NA","Pan troglodytes", "Chimpanzee", "LRCC19",  
                           "XP_001105486.1", "NA", "NA","Macaca mulatta", "Rhesus macaque", "LRCC19", 
                           "XP_003431607.1", "NA","NA","Canis lupus familiaris","Dog", "LRCC19",    
                           "NP_001178196.1", "NA","NA", "Bos taurus", "Cattle", "LRCC19",   
                           "NP_780514.1", "Q8BZT5","NA","Mus musculus", "House Mouse", "LRCC19",    
                           "NP_001102883.1","NA", "NA","Rattus norvegicus", "Brown Rat", "LRCC19", 
                           "XP_027640383.1", "NA","NA","Falco peregrinus anatum", "Peregrine Falcon", "LRCC19", 
                           "XP_003911679.1", "NA","NA","Papio anubis","Olive baboon", "LRCC19", 
                           "XP_023059658.1","NA", "NA","Piliocolobus tephrosceles", "Uganda red colobus", "LRCC19")

lrrc19_matrix <- matrix(lrrc19_table, byrow = T, nrow = 10)

lrrc19_table <- data.frame(lrrc19_matrix, stringsAsFactors = F)

names(lrrc19_table) <- c("RefSeq", "Uniprot", "PDB", "Sci Name", "Common Name", "Gene Name")

lrrc19_table

##            RefSeq Uniprot PDB                  Sci Name        Common Name
## 1     NP_075052.1  Q9H756  NA              Homo sapiens              Human
## 2  XP_001154402.1      NA  NA           Pan troglodytes         Chimpanzee
## 3  XP_001105486.1      NA  NA            Macaca mulatta     Rhesus macaque
## 4  XP_003431607.1      NA  NA    Canis lupus familiaris                Dog
## 5  NP_001178196.1      NA  NA                Bos taurus             Cattle
## 6     NP_780514.1  Q8BZT5  NA              Mus musculus        House Mouse
## 7  NP_001102883.1      NA  NA         Rattus norvegicus          Brown Rat
## 8  XP_027640383.1      NA  NA   Falco peregrinus anatum   Peregrine Falcon
## 9  XP_003911679.1      NA  NA              Papio anubis       Olive baboon
## 10 XP_023059658.1      NA  NA Piliocolobus tephrosceles Uganda red colobus
##    Gene Name
## 1     LRCC19
## 2     LRCC19
## 3     LRCC19
## 4     LRCC19
## 5     LRCC19
## 6     LRCC19
## 7     LRCC19
## 8     LRCC19
## 9     LRCC19
## 10    LRCC19

###Data Preparation ##Download Sequences #All sequences were downloaded using a wrapper compbio4all::entrez_fetch_list() which uses rentrez::entrez_fetch() to access NCBI databases.

#Obtaining all of the sequences
lrrc19_list <- entrez_fetch_list(db = "protein", 
                          id = lrrc19_table$RefSeq, 
                          rettype = "fasta")

length(lrrc19_list)

## [1] 10

lrrc19_list[1]

## $NP_075052.1
## [1] ">NP_075052.1 leucine-rich repeat-containing protein 19 precursor [Homo sapiens]\nMKVTGITILFWPLSMILLSDKIQSSKREVQCNFTEKNYTLIPADIKKDVTILDLSYNQITLNGTDTRVLQ\nTYFLLTELYLIENKVTILHNNGFGNLSSLEILNICRNSIYVIQQGAFLGLNKLKQLYLCQNKIEQLNADV\nFVPLRSLKLLNLQGNLISYLDVPPLFHLELITLYGNLWNCSCSLFNLQNWLNTSNVTLENENITMCSYPN\nSLQSYNIKTVPHKAECHSKFPSSVTEDLYIHFQPISNSIFNSSSNNLTRNSEHEPLGKSWAFLVGVVVTV\nLTTSLLIFIAIKCPIWYNILLSYNHHRLEEHEAETYEDGFTGNPSSLSQIPETNSEETTVIFEQLHSFVV\nDDDGFIEDKYIDIHELCEEN\n\n"

for(i in 1:length(lrrc19_list)){
  lrrc19_list[[i]] <- compbio4all::fasta_cleaner(lrrc19_list[[i]], parse = F)
}

lrrc19_list[1]

## $NP_075052.1
## [1] "MKVTGITILFWPLSMILLSDKIQSSKREVQCNFTEKNYTLIPADIKKDVTILDLSYNQITLNGTDTRVLQTYFLLTELYLIENKVTILHNNGFGNLSSLEILNICRNSIYVIQQGAFLGLNKLKQLYLCQNKIEQLNADVFVPLRSLKLLNLQGNLISYLDVPPLFHLELITLYGNLWNCSCSLFNLQNWLNTSNVTLENENITMCSYPNSLQSYNIKTVPHKAECHSKFPSSVTEDLYIHFQPISNSIFNSSSNNLTRNSEHEPLGKSWAFLVGVVVTVLTTSLLIFIAIKCPIWYNILLSYNHHRLEEHEAETYEDGFTGNPSSLSQIPETNSEETTVIFEQLHSFVVDDDGFIEDKYIDIHELCEEN"

###General Protein Information

##Protein Diagram

Q9H756 <- drawProteins::get_features("Q9H756")

## [1] "Download has worked"

## [1] "Download has worked"
my_prot_df <- drawProteins::feature_to_dataframe(Q9H756)

my_canvas <- draw_canvas(my_prot_df)
my_canvas <- draw_chains(my_canvas, my_prot_df, label_size = 2.5)

my_canvas <- draw_repeat(my_canvas, my_prot_df)
my_canvas

###Draw Dotplot ##Prepare Data

lrrc19_human_FASTA <- rentrez::entrez_fetch(id ="NP_075052.1"  ,
                                      db = "protein", 
                                      rettype="fasta")
lrrc19_human_vector <- fasta_cleaner(lrrc19_human_FASTA)

lrrc19_human_vector

##   [1] "M" "K" "V" "T" "G" "I" "T" "I" "L" "F" "W" "P" "L" "S" "M" "I" "L" "L"
##  [19] "S" "D" "K" "I" "Q" "S" "S" "K" "R" "E" "V" "Q" "C" "N" "F" "T" "E" "K"
##  [37] "N" "Y" "T" "L" "I" "P" "A" "D" "I" "K" "K" "D" "V" "T" "I" "L" "D" "L"
##  [55] "S" "Y" "N" "Q" "I" "T" "L" "N" "G" "T" "D" "T" "R" "V" "L" "Q" "T" "Y"
##  [73] "F" "L" "L" "T" "E" "L" "Y" "L" "I" "E" "N" "K" "V" "T" "I" "L" "H" "N"
##  [91] "N" "G" "F" "G" "N" "L" "S" "S" "L" "E" "I" "L" "N" "I" "C" "R" "N" "S"
## [109] "I" "Y" "V" "I" "Q" "Q" "G" "A" "F" "L" "G" "L" "N" "K" "L" "K" "Q" "L"
## [127] "Y" "L" "C" "Q" "N" "K" "I" "E" "Q" "L" "N" "A" "D" "V" "F" "V" "P" "L"
## [145] "R" "S" "L" "K" "L" "L" "N" "L" "Q" "G" "N" "L" "I" "S" "Y" "L" "D" "V"
## [163] "P" "P" "L" "F" "H" "L" "E" "L" "I" "T" "L" "Y" "G" "N" "L" "W" "N" "C"
## [181] "S" "C" "S" "L" "F" "N" "L" "Q" "N" "W" "L" "N" "T" "S" "N" "V" "T" "L"
## [199] "E" "N" "E" "N" "I" "T" "M" "C" "S" "Y" "P" "N" "S" "L" "Q" "S" "Y" "N"
## [217] "I" "K" "T" "V" "P" "H" "K" "A" "E" "C" "H" "S" "K" "F" "P" "S" "S" "V"
## [235] "T" "E" "D" "L" "Y" "I" "H" "F" "Q" "P" "I" "S" "N" "S" "I" "F" "N" "S"
## [253] "S" "S" "N" "N" "L" "T" "R" "N" "S" "E" "H" "E" "P" "L" "G" "K" "S" "W"
## [271] "A" "F" "L" "V" "G" "V" "V" "V" "T" "V" "L" "T" "T" "S" "L" "L" "I" "F"
## [289] "I" "A" "I" "K" "C" "P" "I" "W" "Y" "N" "I" "L" "L" "S" "Y" "N" "H" "H"
## [307] "R" "L" "E" "E" "H" "E" "A" "E" "T" "Y" "E" "D" "G" "F" "T" "G" "N" "P"
## [325] "S" "S" "L" "S" "Q" "I" "P" "E" "T" "N" "S" "E" "E" "T" "T" "V" "I" "F"
## [343] "E" "Q" "L" "H" "S" "F" "V" "V" "D" "D" "D" "G" "F" "I" "E" "D" "K" "Y"
## [361] "I" "D" "I" "H" "E" "L" "C" "E" "E" "N"

par(mfrow = c(2,2), 
    mar = c(0,0,2,1))

# plot 1: Defaults
dotPlot(lrrc19_human_vector, lrrc19_human_vector, 
        wsize = , 
        nmatch = , 
        main = "Defaults")

# plot 2 size = 10, nmatch = 1
dotPlot(lrrc19_human_vector, lrrc19_human_vector, 
        wsize =10 , 
        nmatch =1 , 
        main = "size=10 nmatch=1")

# plot 3: size = 10, nmatch = 5
dotPlot(lrrc19_human_vector, lrrc19_human_vector, 
        wsize = 10, 
        nmatch =5 , 
        main = "size=10 nmatch=5")

# plot 4: size = 20, nmatch = 5
dotPlot(lrrc19_human_vector, lrrc19_human_vector, 
        wsize = 20,
        nmatch =5 ,
        main = "size=20 nmatch=5")

#Best Plot

par(mfrow = c(1,1), 
    mar = c(4,4,4,4))


dotPlot(lrrc19_human_vector, lrrc19_human_vector, 
        wsize = 20, 
        nmatch =5 , 
        main = "size=20 nmatch=5")

###Protein properties compiled from databases

proteinprop_table <- c("PFAM", "LRR19-TM is the single-span transmembrane region of LRRC19, a leucine-rich repeat protein family. LRRC19 functions as a transmembrane receptor inducing pro-inflammatory cytokines. This suggests its role in innate immunity [1]. This family of proteins is found in eukaryotes.", "http://pfam.xfam.org/family/PF15176#tabview=tab0", 
                       "Uniprot", "LRRC19 is a pathogen-recognition receptor that is found in the gut and and kidney.", "https://www.uniprot.org/uniprot/Q9H756", 
                       "Alphafold", "Alphafold predicts with high confidence that the protein has a mixture of beta plated sheets and alphahelices", "https://alphafold.ebi.ac.uk/entry/Q9H756")

prop_matrix <- matrix(proteinprop_table, byrow = T, nrow = 3)

proteinprop_table <- data.frame(prop_matrix, stringsAsFactors = F)

names(proteinprop_table) <- c("Source", "Protein Property", "Link")

proteinprop_table

##      Source
## 1      PFAM
## 2   Uniprot
## 3 Alphafold
##                                                                                                                                                                                                                                                                     Protein Property
## 1 LRR19-TM is the single-span transmembrane region of LRRC19, a leucine-rich repeat protein family. LRRC19 functions as a transmembrane receptor inducing pro-inflammatory cytokines. This suggests its role in innate immunity [1]. This family of proteins is found in eukaryotes.
## 2                                                                                                                                                                                                 LRRC19 is a pathogen-recognition receptor that is found in the gut and and kidney.
## 3                                                                                                                                                                      Alphafold predicts with high confidence that the protein has a mixture of beta plated sheets and alphahelices
##                                               Link
## 1 http://pfam.xfam.org/family/PF15176#tabview=tab0
## 2           https://www.uniprot.org/uniprot/Q9H756
## 3         https://alphafold.ebi.ac.uk/entry/Q9H756

###Protein feature prediction

aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
            "L","K","M","F","P","S","T","W","Y","V")

alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91, 
           221, 249, 48, 123, 82, 122, 119, 33, 63, 167)
beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120, 
          177, 115, 16, 85, 127, 341, 253, 44, 110, 229)
a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
              110, 112, 25, 52, 71, 126, 117, 30, 108, 123)
a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239, 
             339, 321, 91, 158, 188, 327, 238, 72, 130, 378)
data.frame(aa.1.1, alpha, beta, a.plus.b, a.div.b)

##    aa.1.1 alpha beta a.plus.b a.div.b
## 1       A   285  203      175     361
## 2       R    53   67       78     146
## 3       N    97  139      120     183
## 4       D   163  121      111     244
## 5       C    22   75       74      63
## 6       Q    67  122       74     114
## 7       E   134   86       86     257
## 8       G   197  297      171     377
## 9       H   111   49       33     107
## 10      I    91  120       93     239
## 11      L   221  177      110     339
## 12      K   249  115      112     321
## 13      M    48   16       25      91
## 14      F   123   85       52     158
## 15      P    82  127       71     188
## 16      S   122  341      126     327
## 17      T   119  253      117     238
## 18      W    33   44       30      72
## 19      Y    63  110      108     130
## 20      V   167  229      123     378

#Convert to frequencies

alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)

#dataframe
aa.prop <- data.frame(alpha.prop,
                      beta.prop,
                      a.plus.b.prop,
                      a.div.b)
#row labels
row.names(aa.prop) <- aa.1.1

#Table 5 is now this

aa.prop

##    alpha.prop   beta.prop a.plus.b.prop    a.div.b
## A 0.116469146 0.073126801    0.09264161 0.08331410
## R 0.021659174 0.024135447    0.04129169 0.03369490
## N 0.039640376 0.050072046    0.06352567 0.04223402
## D 0.066612178 0.043587896    0.05876125 0.05631202
## C 0.008990601 0.027017291    0.03917417 0.01453958
## Q 0.027380466 0.043948127    0.03917417 0.02630972
## E 0.054760932 0.030979827    0.04552673 0.05931225
## G 0.080506743 0.106988473    0.09052409 0.08700669
## H 0.045361667 0.017651297    0.01746956 0.02469421
## I 0.037188394 0.043227666    0.04923240 0.05515809
## L 0.090314671 0.063760807    0.05823187 0.07823679
## K 0.101757254 0.041426513    0.05929063 0.07408262
## M 0.019615856 0.005763689    0.01323452 0.02100162
## F 0.050265631 0.030619597    0.02752779 0.03646434
## P 0.033510421 0.045749280    0.03758602 0.04338795
## S 0.049856968 0.122838617    0.06670196 0.07546734
## T 0.048630977 0.091138329    0.06193753 0.05492730
## W 0.013485901 0.015850144    0.01588142 0.01661666
## Y 0.025745811 0.039625360    0.05717311 0.03000231
## V 0.068246833 0.082492795    0.06511382 0.08723748

##Download Focal Gene

#obtained from before
lrrc19_human_vector

##   [1] "M" "K" "V" "T" "G" "I" "T" "I" "L" "F" "W" "P" "L" "S" "M" "I" "L" "L"
##  [19] "S" "D" "K" "I" "Q" "S" "S" "K" "R" "E" "V" "Q" "C" "N" "F" "T" "E" "K"
##  [37] "N" "Y" "T" "L" "I" "P" "A" "D" "I" "K" "K" "D" "V" "T" "I" "L" "D" "L"
##  [55] "S" "Y" "N" "Q" "I" "T" "L" "N" "G" "T" "D" "T" "R" "V" "L" "Q" "T" "Y"
##  [73] "F" "L" "L" "T" "E" "L" "Y" "L" "I" "E" "N" "K" "V" "T" "I" "L" "H" "N"
##  [91] "N" "G" "F" "G" "N" "L" "S" "S" "L" "E" "I" "L" "N" "I" "C" "R" "N" "S"
## [109] "I" "Y" "V" "I" "Q" "Q" "G" "A" "F" "L" "G" "L" "N" "K" "L" "K" "Q" "L"
## [127] "Y" "L" "C" "Q" "N" "K" "I" "E" "Q" "L" "N" "A" "D" "V" "F" "V" "P" "L"
## [145] "R" "S" "L" "K" "L" "L" "N" "L" "Q" "G" "N" "L" "I" "S" "Y" "L" "D" "V"
## [163] "P" "P" "L" "F" "H" "L" "E" "L" "I" "T" "L" "Y" "G" "N" "L" "W" "N" "C"
## [181] "S" "C" "S" "L" "F" "N" "L" "Q" "N" "W" "L" "N" "T" "S" "N" "V" "T" "L"
## [199] "E" "N" "E" "N" "I" "T" "M" "C" "S" "Y" "P" "N" "S" "L" "Q" "S" "Y" "N"
## [217] "I" "K" "T" "V" "P" "H" "K" "A" "E" "C" "H" "S" "K" "F" "P" "S" "S" "V"
## [235] "T" "E" "D" "L" "Y" "I" "H" "F" "Q" "P" "I" "S" "N" "S" "I" "F" "N" "S"
## [253] "S" "S" "N" "N" "L" "T" "R" "N" "S" "E" "H" "E" "P" "L" "G" "K" "S" "W"
## [271] "A" "F" "L" "V" "G" "V" "V" "V" "T" "V" "L" "T" "T" "S" "L" "L" "I" "F"
## [289] "I" "A" "I" "K" "C" "P" "I" "W" "Y" "N" "I" "L" "L" "S" "Y" "N" "H" "H"
## [307] "R" "L" "E" "E" "H" "E" "A" "E" "T" "Y" "E" "D" "G" "F" "T" "G" "N" "P"
## [325] "S" "S" "L" "S" "Q" "I" "P" "E" "T" "N" "S" "E" "E" "T" "T" "V" "I" "F"
## [343] "E" "Q" "L" "H" "S" "F" "V" "V" "D" "D" "D" "G" "F" "I" "E" "D" "K" "Y"
## [361] "I" "D" "I" "H" "E" "L" "C" "E" "E" "N"

#Determine aa freq for lrrc19
lrrc19.freq.table <-table(lrrc19_human_vector)/length(lrrc19_human_vector)

#Convert table into vector function

table_to_vector <- function(table_x){
  table_names <- attr(table_x, "dimnames")[[1]]
  table_vect <- as.vector(table_x)
  names(table_vect) <- table_names
  return(table_vect)
}

lrrc19.human.aa.freq <- table_to_vector(lrrc19.freq.table)
lrrc19.human.aa.freq

##           A           C           D           E           F           G 
## 0.018918919 0.024324324 0.037837838 0.070270270 0.045945946 0.035135135 
##           H           I           K           L           M           N 
## 0.029729730 0.086486486 0.045945946 0.143243243 0.008108108 0.094594595 
##           P           Q           R           S           T           V 
## 0.035135135 0.040540541 0.016216216 0.089189189 0.070270270 0.054054054 
##           W           Y 
## 0.013513514 0.040540541

#Check for presence of U, an unknown amino acid

aa.names <- names(lrrc19.human.aa.freq)
any(aa.names == "U")

## [1] FALSE

#Add data on lrrc19 to amino acid frequency table

aa.prop$lrrc19.human.aa.freq <- lrrc19.human.aa.freq
pander::pander(aa.prop)

	alpha.prop	beta.prop	a.plus.b.prop	a.div.b	lrrc19.human.aa.freq
A	0.1165	0.07313	0.09264	0.08331	0.01892
R	0.02166	0.02414	0.04129	0.03369	0.02432
N	0.03964	0.05007	0.06353	0.04223	0.03784
D	0.06661	0.04359	0.05876	0.05631	0.07027
C	0.008991	0.02702	0.03917	0.01454	0.04595
Q	0.02738	0.04395	0.03917	0.02631	0.03514
E	0.05476	0.03098	0.04553	0.05931	0.02973
G	0.08051	0.107	0.09052	0.08701	0.08649
H	0.04536	0.01765	0.01747	0.02469	0.04595
I	0.03719	0.04323	0.04923	0.05516	0.1432
L	0.09031	0.06376	0.05823	0.07824	0.008108
K	0.1018	0.04143	0.05929	0.07408	0.09459
M	0.01962	0.005764	0.01323	0.021	0.03514
F	0.05027	0.03062	0.02753	0.03646	0.04054
P	0.03351	0.04575	0.03759	0.04339	0.01622
S	0.04986	0.1228	0.0667	0.07547	0.08919
T	0.04863	0.09114	0.06194	0.05493	0.07027
W	0.01349	0.01585	0.01588	0.01662	0.05405
Y	0.02575	0.03963	0.05717	0.03	0.01351
V	0.06825	0.08249	0.06511	0.08724	0.04054

##Functions to calculate similarities Two custom functions needed: one to calculate correlations and one to calculate cosine similarity

# Corrleation used in Chou adn Zhange 1992.
chou_cor <- function(x,y){
  numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)
}

# Cosine similarity used in Higgs and Attwood (2005). 
chou_cosine <- function(z.1, z.2){
  z.1.abs <- sqrt(sum(z.1^2))
  z.2.abs <- sqrt(sum(z.2^2))
  my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs)
  return(my.cosine)
}

#Calculate correlation between each column

corr.alpha <- chou_cor(aa.prop[,5], aa.prop[,1])
corr.beta  <- chou_cor(aa.prop[,5], aa.prop[,2])
corr.apb   <- chou_cor(aa.prop[,5], aa.prop[,3])
corr.adb   <- chou_cor(aa.prop[,5], aa.prop[,4])

#Calculate cosin similarity

cos.alpha <- chou_cosine(aa.prop[,5], aa.prop[,1])
cos.beta  <- chou_cosine(aa.prop[,5], aa.prop[,2])
cos.apb   <- chou_cosine(aa.prop[,5], aa.prop[,3])
cos.adb   <- chou_cosine(aa.prop[,5], aa.prop[,4])

#Calculate distance. (need to flip dataframe on its side with t())

aa.prop.flipped <- t(aa.prop)
round(aa.prop.flipped,2)

##                         A    R    N    D    C    Q    E    G    H    I    L
## alpha.prop           0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09
## beta.prop            0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06
## a.plus.b.prop        0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06
## a.div.b              0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08
## lrrc19.human.aa.freq 0.02 0.02 0.04 0.07 0.05 0.04 0.03 0.09 0.05 0.14 0.01
##                         K    M    F    P    S    T    W    Y    V
## alpha.prop           0.10 0.02 0.05 0.03 0.05 0.05 0.01 0.03 0.07
## beta.prop            0.04 0.01 0.03 0.05 0.12 0.09 0.02 0.04 0.08
## a.plus.b.prop        0.06 0.01 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## a.div.b              0.07 0.02 0.04 0.04 0.08 0.05 0.02 0.03 0.09
## lrrc19.human.aa.freq 0.09 0.04 0.04 0.02 0.09 0.07 0.05 0.01 0.04

#Distance Matrix

dist(aa.prop.flipped, method = "euclidean")

##                      alpha.prop  beta.prop a.plus.b.prop    a.div.b
## beta.prop            0.13342098                                    
## a.plus.b.prop        0.09281824 0.08289406                         
## a.div.b              0.06699039 0.08659174    0.06175113           
## lrrc19.human.aa.freq 0.18680998 0.16937594    0.16093128 0.15865109

#Individual Distances

dist.alpha <- dist((aa.prop.flipped[c(1,5),]),  method = "euclidean")
dist.beta  <- dist((aa.prop.flipped[c(2,5),]),  method = "euclidean")
dist.apb   <- dist((aa.prop.flipped[c(3,5),]),  method = "euclidean")
dist.adb  <- dist((aa.prop.flipped[c(4,5),]), method = "euclidean")

#Compiling the information

# fold types
fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")

# data
corr.sim <- round(c(corr.alpha,corr.beta,corr.apb,corr.adb),5)
cosine.sim <- round(c(cos.alpha,cos.beta,cos.apb,cos.adb),5)
Euclidean.dist <- round(c(dist.alpha,dist.beta,dist.apb,dist.adb),5)

# summary
sim.sum <- c("","","most.sim","")
dist.sum <- c("","","min.dist","")

df <- data.frame(fold.type,
           corr.sim ,
           cosine.sim ,
           Euclidean.dist ,
           sim.sum ,
           dist.sum )

#Display Output

pander::pander(df)

fold.type	corr.sim	cosine.sim	Euclidean.dist	sim.sum	dist.sum
alpha	0.7478	0.7478	0.1868
beta	0.7951	0.7951	0.1694
alpha plus beta	0.8053	0.8053	0.1609	most.sim	min.dist
alpha/beta	0.8129	0.8129	0.1587

###Percent Identity Comparison (PID) ##Data preparation

#Humans, Chimps, Rhesus Macaque, Dog

## sequence 1: Humans
human_fasta <- rentrez::entrez_fetch(db = "protein",
                        id = "NP_075052.1",
                         rettype = "fasta")
## sequence 2: Chimps
chimp_fasta <- rentrez::entrez_fetch(db = "protein",
                         id = "XP_001154402.1",
                         rettype = "fasta")
## sequence 3: Rhesus Macaque
rhesus_fasta <- rentrez::entrez_fetch(db = "protein",
                         id = "XP_001105486.1   ",
                         rettype = "fasta")
## sequence 4: Dog
dog_fasta <- rentrez::entrez_fetch(db = "protein",
                         id = "XP_003431607.1   ",
                         rettype = "fasta")


# clean
human_vector   <- fasta_cleaner(human_fasta)
chimp_vector <- fasta_cleaner(chimp_fasta)
rhesus_vector <- fasta_cleaner(rhesus_fasta)
dog_vector <- fasta_cleaner(dog_fasta)

human_string <-paste(human_vector,collapse = "")    

chimp_string <-paste(chimp_vector,collapse = "") 

rhesus_string <-paste(rhesus_vector,collapse = "")    

dog_string <-paste(dog_vector,collapse = "") 

human_string   <- toupper(human_string)
chimp_string <- toupper(chimp_string)
rhesus_string   <- toupper(rhesus_string)
dog_string <- toupper(dog_string)

##Alignments

data("BLOSUM50")
globalAlignHumanChimp <- Biostrings::pairwiseAlignment(human_string, 
                                               chimp_string,
                                               substitutionMatrix = BLOSUM50, 
                                               gapOpening = -8, 
                                               gapExtension = -2, 
                                               scoreOnly = FALSE)

globalAlignHumanRhesus <- Biostrings::pairwiseAlignment(human_string, 
                                               rhesus_string,
                                               substitutionMatrix = BLOSUM50, 
                                               gapOpening = -8, 
                                               gapExtension = -2, 
                                               scoreOnly = FALSE)

globalAlignHumanDog <- Biostrings::pairwiseAlignment(human_string, 
                                               dog_string,
                                               substitutionMatrix = BLOSUM50, 
                                               gapOpening = -8, 
                                               gapExtension = -2, 
                                               scoreOnly = FALSE)

globalAlignChimpRhesus <- Biostrings::pairwiseAlignment(chimp_string, 
                                               rhesus_string,
                                               substitutionMatrix = BLOSUM50, 
                                               gapOpening = -8, 
                                               gapExtension = -2, 
                                               scoreOnly = FALSE)

globalAlignChimpDog <- Biostrings::pairwiseAlignment(chimp_string, 
                                               dog_string,
                                               substitutionMatrix = BLOSUM50, 
                                               gapOpening = -8, 
                                               gapExtension = -2, 
                                               scoreOnly = FALSE)
globalAlignRhesusDog <- Biostrings::pairwiseAlignment(rhesus_string, 
                                               dog_string,
                                               substitutionMatrix = BLOSUM50, 
                                               gapOpening = -8, 
                                               gapExtension = -2, 
                                               scoreOnly = FALSE)

##Making matrix

pids <- c(1,                  NA,     NA,     NA,
          pid(globalAlignHumanChimp),          1,     NA,     NA,
          pid(globalAlignHumanRhesus), pid(globalAlignChimpRhesus),      1,     NA,
          pid(globalAlignHumanDog), pid(globalAlignChimpDog), pid(globalAlignRhesusDog), 1)

mat <- matrix(pids, nrow = 4, byrow = T)
row.names(mat) <- c("Homo","Chimp","Rhesus","Dog")   
colnames(mat) <- c("Homo","Chimp","Rhesus","Dog")   
pander::pander(mat)

	Homo	Chimp	Rhesus	Dog
Homo	1	NA	NA	NA
Chimp	99.19	1	NA	NA
Rhesus	95.14	95.41	1	NA
Dog	79.19	79.19	80.54	1

###PID Methods Comparison #Obtaining PIDS (Human Vs Chimp)

pid(globalAlignHumanChimp, type = "PID1")

## [1] 99.18919

pid(globalAlignHumanChimp, type = "PID2")

## [1] 99.18919

pid(globalAlignHumanChimp, type = "PID3")

## [1] 99.18919

pid(globalAlignHumanChimp, type = "PID4")

## [1] 99.18919

PIDcomparison_table <- c("PID1", "99.18919", "aligned positions + internal gap positions", 
                        "PID2", "99.18919", "aligned positions", 
                        "PID3", "99.18919", "lenght shorter sequence",
                        "PID4", "99.18919", "average length of the two sequences")

PID_matrix <- matrix(PIDcomparison_table, byrow = T, nrow = 4)

PIDcomparison_table <- data.frame(PID_matrix, stringsAsFactors = F)

names(PIDcomparison_table) <- c("Method", "PID", "Denominator")

PIDcomparison_table

##   Method      PID                                Denominator
## 1   PID1 99.18919 aligned positions + internal gap positions
## 2   PID2 99.18919                          aligned positions
## 3   PID3 99.18919                    lenght shorter sequence
## 4   PID4 99.18919        average length of the two sequences

###Multiple Sequence Alignment ##MSA data preparation

lrrc19_vector <- rep(NA, length(lrrc19_list))

lrrc19_vector

##  [1] NA NA NA NA NA NA NA NA NA NA

# looping
for(i in 1:length(lrrc19_vector)){
  lrrc19_vector[i] <- lrrc19_list[[i]]
}

#  naming
names(lrrc19_vector) <- names(lrrc19_list)


lrrc19_vector_ss <- Biostrings::AAStringSet(lrrc19_vector)
lrrc19_vector_ss

## AAStringSet object of length 10:
##      width seq                                              names               
##  [1]   370 MKVTGITILFWPLSMILLSDKIQ...VVDDDGFIEDKYIDIHELCEEN NP_075052.1
##  [2]   370 MKVTGITILFWPLSMILLSDKIQ...VVDDDGFIEDKYIDIHELREEN XP_001154402.1
##  [3]   370 MKVTGITILFWPLSMVLLSDKIQ...VVDDDGFIEDKYIDIHELREEN XP_001105486.1
##  [4]   369 MKATCITILFWPLFMLLLSEERQ...VVDDDGFIEDKYIDTHELHEEN XP_003431607.1
##  [5]   370 MKTMSITILFWLLSMLLLSDKSQ...VVDDDGFIEDKYIDTHELREEN NP_001178196.1
##  [6]   364 MKVTRFMFWLFSMLLPSVKSQAS...VVDDDGFIEDRYIDINEVHEEK NP_780514.1
##  [7]   359 MKVTRFMFWLFSMLLSDKSQASE...VVDDDGFIEDRYIDINEVHEEK NP_001102883.1
##  [8]   380 MKLSWFAIWAGILFLNPVTADCN...EDGCTEDKDIDSYSTEKSSSLK XP_027640383.1
##  [9]   370 MKVTGITILFWPLSMVLLSDKIQ...VVDDDGFIEDKYIDIHELREEN XP_003911679.1
## [10]   370 MKVTGITILFWPLSMVLSSDKIQ...MVDDDGFIEDKYIDIHELREEN XP_023059658.1

##Building Multiple Sequence Alignment (MSA)

lrrc19_align <- msa(lrrc19_vector_ss,
                     method = "ClustalW")

## use default substitution matrix

lrrc19_align

## CLUSTAL 2.1  
## 
## Call:
##    msa(lrrc19_vector_ss, method = "ClustalW")
## 
## MsaAAMultipleAlignment with 10 rows and 380 columns
##      aln                                                   names
##  [1] MKVT--RFMFWLFSMLLPSVKSQAS...VVDDDGFIEDRYIDINEVHEEK--- NP_780514.1
##  [2] MKVT--RFMFWLFSMLL-SDKSQAS...VVDDDGFIEDRYIDINEVHEEK--- NP_001102883.1
##  [3] MKATCITILFWPLFMLLLSEERQSS...VVDDDGFIEDKYIDTHELHEEN--- XP_003431607.1
##  [4] MKTMSITILFWLLSMLLLSDKSQTS...VVDDDGFIEDKYIDTHELREEN--- NP_001178196.1
##  [5] MKVTGITILFWPLSMILLSDKIQSS...VVDDDGFIEDKYIDIHELCEEN--- NP_075052.1
##  [6] MKVTGITILFWPLSMILLSDKIQSS...VVDDDGFIEDKYIDIHELREEN--- XP_001154402.1
##  [7] MKVTGITILFWPLSMVLLSDKIQSS...VVDDDGFIEDKYIDIHELREEN--- XP_001105486.1
##  [8] MKVTGITILFWPLSMVLLSDKIQSS...VVDDDGFIEDKYIDIHELREEN--- XP_003911679.1
##  [9] MKVTGITILFWPLSMVLSSDKIQSS...MVDDDGFIEDKYIDIHELREEN--- XP_023059658.1
## [10] MKLSWFAIWAGILFLNPVTADCNIT...VSGEDGCTEDKDIDSYSTEKSSSLK XP_027640383.1
##  Con MKVTGITILFWPLSM?LLSDKIQSS...VVDDDGFIEDKYIDIHELREEN--- Consensus

#Get MSA ready for viewing

class(lrrc19_align) <- "AAMultipleAlignment"

# This is converting the format of the msa to another
lrrc19_align_seqinr <- msaConvert(lrrc19_align, type = "seqinr::alignment")

##MSA

#ggmsa::ggmsa(lrrc19_align, 
     # start = 2030, 
     # end = 2100,)

###Distance Matrix

lrrc19_subset_dist <- seqinr::dist.alignment(lrrc19_align_seqinr, 
                                       matrix = "identity")

is(lrrc19_subset_dist)

## [1] "dist"     "oldClass"

class(lrrc19_subset_dist)

## [1] "dist"

lrrc19_subset_dist_rounded <- round(lrrc19_subset_dist,
                              digits = 3)

lrrc19_subset_dist_rounded

##                NP_780514.1 NP_001102883.1 XP_003431607.1 NP_001178196.1
## NP_001102883.1       0.358                                             
## XP_003431607.1       0.560          0.536                              
## NP_001178196.1       0.560          0.538          0.454               
## NP_075052.1          0.537          0.522          0.454          0.444
## XP_001154402.1       0.540          0.525          0.454          0.441
## XP_001105486.1       0.542          0.522          0.439          0.444
## XP_003911679.1       0.542          0.522          0.439          0.444
## XP_023059658.1       0.557          0.536          0.451          0.468
## XP_027640383.1       0.770          0.772          0.763          0.775
##                NP_075052.1 XP_001154402.1 XP_001105486.1 XP_003911679.1
## NP_001102883.1                                                         
## XP_003431607.1                                                         
## NP_001178196.1                                                         
## NP_075052.1                                                            
## XP_001154402.1       0.090                                             
## XP_001105486.1       0.221          0.214                              
## XP_003911679.1       0.221          0.214          0.000               
## XP_023059658.1       0.285          0.280          0.187          0.187
## XP_027640383.1       0.766          0.764          0.764          0.764
##                XP_023059658.1
## NP_001102883.1               
## XP_003431607.1               
## NP_001178196.1               
## NP_075052.1                  
## XP_001154402.1               
## XP_001105486.1               
## XP_003911679.1               
## XP_023059658.1               
## XP_027640383.1          0.771

###Phylogenetic Tree

tree_subset <- nj(lrrc19_subset_dist_rounded)

# plot tree
plot.phylo(tree_subset, main="Phylogenetic Tree", 
           use.edge.length = F)

# add label
mtext(text = "LRRC19 family gene tree - rooted, no branch lengths")

Final portfolio assignment: LRRC19- Leucine-Rich Repeat-Containing Protein 19

Evan Gu

12/18/2021