Introduction

LAG3 (Lymphocyte activation gene 3 protein) functions as an inhibitory receptor on antigen activated T-cells. It delivers inhibitory signals when bound to ligands.

Packages

#install.packages("rentrez")
#install.packages("ape")
#install.packages("seqinr")

#install.packages("BiocManager")
##BiocManager::install("msa")
#BiocManager::install("Biostrings")
#BiocManager::install("drawProteins")
#BiocManager::install("ggplot2")

#library(devtools)
#devtools::install_github("brouwern/combio4all")
#devtools::install_github("YuLab-SMU/ggmsa")
library(compbio4all)
library(ggmsa)
## Registered S3 methods overwritten by 'ggalt':
##   method                  from   
##   grid.draw.absoluteGrob  ggplot2
##   grobHeight.absoluteGrob ggplot2
##   grobWidth.absoluteGrob  ggplot2
##   grobX.absoluteGrob      ggplot2
##   grobY.absoluteGrob      ggplot2
library(rentrez)
library(seqinr)
library(ape)
## 
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
## 
##     as.alignment, consensus
library(pander)
library(ggplot2)
library(msa)
## Loading required package: Biostrings
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: XVector
## Loading required package: GenomeInfoDb
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
## 
##     complement
## The following object is masked from 'package:seqinr':
## 
##     translate
## The following object is masked from 'package:base':
## 
##     strsplit
library(drawProteins)
library(Biostrings)

Acession numbers

Accession Table

common_name <- c('human', 'chimp', 'bonobo', 'gorilla', 'oranutan', 'mouse', 'tree frog', 'chicken', 'mink', 'pigeon')
scientific_name <- c('Homo sapien', 'Pan troglodytes', 'Pan paniscus', 'Gorilla gorilla', 'Pongo abelii', 'Mus musculus', 'Xenopus tropicalis', 'Gallus gallus', 'Neogale vison', 'Columba livia')
refseq <- c('NP_002277.4', 'XP_508966.3', 'XP_003820331.2', 'XP_004052634.1', 'XP_002822880.2', 'NP_032505.1', 'XP_004915830.2', 'XP_024998797', 'XP_044082922.1', 'XP_005506340.2')
uniprot <- c('P18627', 'A0A2J8JEL7', 'A0A2R8ZS97', 'G3SDW3.1', 'A0A2J8TBE9', 'Q61790', 'UPI00064CE453', 'UPI001AE9E5F3', NA, 'A0A2I0ML95' )
PDB <- c(NA, NA, NA, NA, NA, '6WKM', NA, NA, NA, NA)
gene <- c("LAG3", "LAG3", "LAG3", "LAG3", "LAG3", "LAG3", "LAG3", "LAG3", "LAG3", "LAG3")
LAG3_table <- data.frame(common_name, scientific_name, refseq, uniprot, PDB, gene)
is(LAG3_table)
## [1] "data.frame"       "list"             "oldClass"         "vector"          
## [5] "list_OR_List"     "vector_OR_Vector" "vector_OR_factor"
pander::pander(LAG3_table)
common_name scientific_name refseq uniprot PDB gene
human Homo sapien NP_002277.4 P18627 NA LAG3
chimp Pan troglodytes XP_508966.3 A0A2J8JEL7 NA LAG3
bonobo Pan paniscus XP_003820331.2 A0A2R8ZS97 NA LAG3
gorilla Gorilla gorilla XP_004052634.1 G3SDW3.1 NA LAG3
oranutan Pongo abelii XP_002822880.2 A0A2J8TBE9 NA LAG3
mouse Mus musculus NP_032505.1 Q61790 6WKM LAG3
tree frog Xenopus tropicalis XP_004915830.2 UPI00064CE453 NA LAG3
chicken Gallus gallus XP_024998797 UPI001AE9E5F3 NA LAG3
mink Neogale vison XP_044082922.1 NA NA LAG3
pigeon Columba livia XP_005506340.2 A0A2I0ML95 NA LAG3

Data preperation

LAG3_list <- entrez_fetch_list(db = "protein", 
                          id = LAG3_table$refseq, 
                          rettype = "fasta")
length(LAG3_list)
## [1] 10
for(i in 1:length(LAG3_list)){
  LAG3_list[[i]] <- fasta_cleaner(LAG3_list[[i]], parse = FALSE)
}
LAG3_vector <- rep(NA, length(LAG3_list))

for(i in 1:length(LAG3_vector)){
  LAG3_vector[i] <- LAG3_list[[i]]
}

names(LAG3_vector) <- names(LAG3_list)
LAG3_vector_ss <- Biostrings::AAStringSet(LAG3_vector)

Protein Information

human_protein <- drawProteins::get_features(LAG3_table$uniprot[1])
## [1] "Download has worked"
hum_protein_df <- drawProteins::feature_to_dataframe(human_protein)
## Warning in drawProteins::extract_feat_acc(features_in_lists_of_six[[i]]): NAs
## introduced by coercion

Protein diagram

is(human_protein)
## [1] "list"             "vector"           "list_OR_List"     "vector_OR_Vector"
## [5] "vector_OR_factor"
is(hum_protein_df)
## [1] "data.frame"       "list"             "oldClass"         "vector"          
## [5] "list_OR_List"     "vector_OR_Vector" "vector_OR_factor"
hum_protein_df[,-2]
##                     type begin end length accession  entryName taxid order
## featuresTemp      SIGNAL     1  22     21    P18627 LAG3_HUMAN  9606     1
## featuresTemp.1     CHAIN    23 525    502    P18627 LAG3_HUMAN  9606     1
## featuresTemp.2     CHAIN    23  NA     NA    P18627 LAG3_HUMAN  9606     1
## featuresTemp.3  TOPO_DOM    23 450    427    P18627 LAG3_HUMAN  9606     1
## featuresTemp.4  TRANSMEM   451 471     20    P18627 LAG3_HUMAN  9606     1
## featuresTemp.5  TOPO_DOM   472 525     53    P18627 LAG3_HUMAN  9606     1
## featuresTemp.6    DOMAIN    37 167    130    P18627 LAG3_HUMAN  9606     1
## featuresTemp.7    DOMAIN   168 252     84    P18627 LAG3_HUMAN  9606     1
## featuresTemp.8    DOMAIN   265 343     78    P18627 LAG3_HUMAN  9606     1
## featuresTemp.9    DOMAIN   348 419     71    P18627 LAG3_HUMAN  9606     1
## featuresTemp.10   REGION    37 252    215    P18627 LAG3_HUMAN  9606     1
## featuresTemp.11   REGION    62  97     35    P18627 LAG3_HUMAN  9606     1
## featuresTemp.12   REGION   429 450     21    P18627 LAG3_HUMAN  9606     1
## featuresTemp.13   REGION   487 525     38    P18627 LAG3_HUMAN  9606     1
## featuresTemp.14   REGION   501 524     23    P18627 LAG3_HUMAN  9606     1
## featuresTemp.15    MOTIF   498 503      5    P18627 LAG3_HUMAN  9606     1
## featuresTemp.16 COMPBIAS    72  91     19    P18627 LAG3_HUMAN  9606     1
## featuresTemp.17 COMPBIAS   504 525     21    P18627 LAG3_HUMAN  9606     1
## featuresTemp.18 CARBOHYD   188 188      0    P18627 LAG3_HUMAN  9606     1
## featuresTemp.19 CARBOHYD   250 250      0    P18627 LAG3_HUMAN  9606     1
## featuresTemp.20 CARBOHYD   256 256      0    P18627 LAG3_HUMAN  9606     1
## featuresTemp.21 CARBOHYD   343 343      0    P18627 LAG3_HUMAN  9606     1
## featuresTemp.22 DISULFID    44 160    116    P18627 LAG3_HUMAN  9606     1
## featuresTemp.23 DISULFID   189 241     52    P18627 LAG3_HUMAN  9606     1
## featuresTemp.24 DISULFID   282 333     51    P18627 LAG3_HUMAN  9606     1
## featuresTemp.25 DISULFID   369 412     43    P18627 LAG3_HUMAN  9606     1
## featuresTemp.26  VAR_SEQ   353 360      7    P18627 LAG3_HUMAN  9606     1
## featuresTemp.27  VAR_SEQ   361 525    164    P18627 LAG3_HUMAN  9606     1
## featuresTemp.28  VARIANT   455 455      0    P18627 LAG3_HUMAN  9606     1
## featuresTemp.29  MUTAGEN    35  35      0    P18627 LAG3_HUMAN  9606     1
## featuresTemp.30  MUTAGEN    52  52      0    P18627 LAG3_HUMAN  9606     1
## featuresTemp.31  MUTAGEN    78  78      0    P18627 LAG3_HUMAN  9606     1
## featuresTemp.32  MUTAGEN    78  78      0    P18627 LAG3_HUMAN  9606     1
## featuresTemp.33  MUTAGEN    85  85      0    P18627 LAG3_HUMAN  9606     1
## featuresTemp.34  MUTAGEN    95  95      0    P18627 LAG3_HUMAN  9606     1
## featuresTemp.35  MUTAGEN    97  97      0    P18627 LAG3_HUMAN  9606     1
## featuresTemp.36  MUTAGEN    98  98      0    P18627 LAG3_HUMAN  9606     1
## featuresTemp.37  MUTAGEN    99  99      0    P18627 LAG3_HUMAN  9606     1
## featuresTemp.38  MUTAGEN   110 110      0    P18627 LAG3_HUMAN  9606     1
## featuresTemp.39  MUTAGEN   125 125      0    P18627 LAG3_HUMAN  9606     1
## featuresTemp.40  MUTAGEN   129 129      0    P18627 LAG3_HUMAN  9606     1
## featuresTemp.41  MUTAGEN   131 131      0    P18627 LAG3_HUMAN  9606     1
## featuresTemp.42  MUTAGEN   137 137      0    P18627 LAG3_HUMAN  9606     1
## featuresTemp.43  MUTAGEN   155 156      1    P18627 LAG3_HUMAN  9606     1
## featuresTemp.44  MUTAGEN   247 247      0    P18627 LAG3_HUMAN  9606     1
canvas <- draw_canvas(hum_protein_df)
canvas <- draw_chains(canvas, hum_protein_df, 
                         label_size = 2.5)
canvas <- draw_domains(canvas, hum_protein_df, label_size = 1.5)
canvas
## Warning: Removed 1 rows containing missing values (geom_rect).

Dot Plot

LAG3_chr <- entrez_fetch_list(db = "protein", 
                          id = LAG3_table$refseq, 
                          rettype = "fasta")
length(LAG3_chr)
## [1] 10
LAG3_chr[1]
## $NP_002277.4
## [1] ">NP_002277.4 lymphocyte activation gene 3 protein precursor [Homo sapiens]\nMWEAQFLGLLFLQPLWVAPVKPLQPGAEVPVVWAQEGAPAQLPCSPTIPLQDLSLLRRAGVTWQHQPDSG\nPPAAAPGHPLAPGPHPAAPSSWGPRPRRYTVLSVGPGGLRSGRLPLQPRVQLDERGRQRGDFSLWLRPAR\nRADAGEYRAAVHLRDRALSCRLRLRLGQASMTASPPGSLRASDWVILNCSFSRPDRPASVHWFRNRGQGR\nVPVRESPHHHLAESFLFLPQVSPMDSGPWGCILTYRDGFNVSIMYNLTVLGLEPPTPLTVYAGAGSRVGL\nPCRLPAGVGTRSFLTAKWTPPGGGPDLLVTGDNGDFTLRLEDVSQAQAGTYTCHIHLQEQQLNATVTLAI\nITVTPKSFGSPGSLGKLLCEVTPVSGQERFVWSSLDTPSQRSFSGPWLEAQEAQLLSQPWQCQLYQGERL\nLGAAVYFTELSSPGAQRSGRAPGALPAGHLLLFLILGVLSLLLLVTGAFGFHLWRRQWRPRRFSALEQGI\nHPPQAQSKIEELEQEPEPEPEPEPEPEPEPEPEQL\n\n"
for(i in 1:length(LAG3_chr)){
  LAG3_chr[[i]] <- fasta_cleaner(LAG3_chr[[i]], parse = TRUE)
}

LAG3_chr[1]
## $NP_002277.4
##   [1] "M" "W" "E" "A" "Q" "F" "L" "G" "L" "L" "F" "L" "Q" "P" "L" "W" "V" "A"
##  [19] "P" "V" "K" "P" "L" "Q" "P" "G" "A" "E" "V" "P" "V" "V" "W" "A" "Q" "E"
##  [37] "G" "A" "P" "A" "Q" "L" "P" "C" "S" "P" "T" "I" "P" "L" "Q" "D" "L" "S"
##  [55] "L" "L" "R" "R" "A" "G" "V" "T" "W" "Q" "H" "Q" "P" "D" "S" "G" "P" "P"
##  [73] "A" "A" "A" "P" "G" "H" "P" "L" "A" "P" "G" "P" "H" "P" "A" "A" "P" "S"
##  [91] "S" "W" "G" "P" "R" "P" "R" "R" "Y" "T" "V" "L" "S" "V" "G" "P" "G" "G"
## [109] "L" "R" "S" "G" "R" "L" "P" "L" "Q" "P" "R" "V" "Q" "L" "D" "E" "R" "G"
## [127] "R" "Q" "R" "G" "D" "F" "S" "L" "W" "L" "R" "P" "A" "R" "R" "A" "D" "A"
## [145] "G" "E" "Y" "R" "A" "A" "V" "H" "L" "R" "D" "R" "A" "L" "S" "C" "R" "L"
## [163] "R" "L" "R" "L" "G" "Q" "A" "S" "M" "T" "A" "S" "P" "P" "G" "S" "L" "R"
## [181] "A" "S" "D" "W" "V" "I" "L" "N" "C" "S" "F" "S" "R" "P" "D" "R" "P" "A"
## [199] "S" "V" "H" "W" "F" "R" "N" "R" "G" "Q" "G" "R" "V" "P" "V" "R" "E" "S"
## [217] "P" "H" "H" "H" "L" "A" "E" "S" "F" "L" "F" "L" "P" "Q" "V" "S" "P" "M"
## [235] "D" "S" "G" "P" "W" "G" "C" "I" "L" "T" "Y" "R" "D" "G" "F" "N" "V" "S"
## [253] "I" "M" "Y" "N" "L" "T" "V" "L" "G" "L" "E" "P" "P" "T" "P" "L" "T" "V"
## [271] "Y" "A" "G" "A" "G" "S" "R" "V" "G" "L" "P" "C" "R" "L" "P" "A" "G" "V"
## [289] "G" "T" "R" "S" "F" "L" "T" "A" "K" "W" "T" "P" "P" "G" "G" "G" "P" "D"
## [307] "L" "L" "V" "T" "G" "D" "N" "G" "D" "F" "T" "L" "R" "L" "E" "D" "V" "S"
## [325] "Q" "A" "Q" "A" "G" "T" "Y" "T" "C" "H" "I" "H" "L" "Q" "E" "Q" "Q" "L"
## [343] "N" "A" "T" "V" "T" "L" "A" "I" "I" "T" "V" "T" "P" "K" "S" "F" "G" "S"
## [361] "P" "G" "S" "L" "G" "K" "L" "L" "C" "E" "V" "T" "P" "V" "S" "G" "Q" "E"
## [379] "R" "F" "V" "W" "S" "S" "L" "D" "T" "P" "S" "Q" "R" "S" "F" "S" "G" "P"
## [397] "W" "L" "E" "A" "Q" "E" "A" "Q" "L" "L" "S" "Q" "P" "W" "Q" "C" "Q" "L"
## [415] "Y" "Q" "G" "E" "R" "L" "L" "G" "A" "A" "V" "Y" "F" "T" "E" "L" "S" "S"
## [433] "P" "G" "A" "Q" "R" "S" "G" "R" "A" "P" "G" "A" "L" "P" "A" "G" "H" "L"
## [451] "L" "L" "F" "L" "I" "L" "G" "V" "L" "S" "L" "L" "L" "L" "V" "T" "G" "A"
## [469] "F" "G" "F" "H" "L" "W" "R" "R" "Q" "W" "R" "P" "R" "R" "F" "S" "A" "L"
## [487] "E" "Q" "G" "I" "H" "P" "P" "Q" "A" "Q" "S" "K" "I" "E" "E" "L" "E" "Q"
## [505] "E" "P" "E" "P" "E" "P" "E" "P" "E" "P" "E" "P" "E" "P" "E" "P" "E" "P"
## [523] "E" "Q" "L"
LAG3_human_chr <- unlist(LAG3_chr[1])
# set up 2 x 2 grid, make margins thinner
par(mfrow = c(2,2), 
    mar = c(0,0,2,1))

# plot 1: Shroom - Defaults
dotPlot(LAG3_human_chr, 
        LAG3_human_chr, 
        wsize = 1, 
        nmatch = 1, 
        main = "Shroom Defaults")

# plot 2 Shroom - size = 10, nmatch = 1
dotPlot(LAG3_human_chr, 
        LAG3_human_chr, 
        wsize = 10, 
        nmatch = 1, 
        main = "Shroom - size = 10, nmatch = 1")

# plot 3: Shroom - size = 10, nmatch = 5
dotPlot(LAG3_human_chr, 
        LAG3_human_chr, 
        wsize = 10, 
        nmatch = 5, 
        main = "Shroom - size = 10, nmatch = 5")

# plot 4: size = 20, nmatch = 5
dotPlot(LAG3_human_chr, 
        LAG3_human_chr, 
        wsize = 20,
        nmatch =7,
        main = "Shroom - size = 20, nmatch = 7")

# reset par() - run this or other plots will be small!
par(mfrow = c(1,1), 
    mar = c(4,4,4,4))
dotPlot(LAG3_human_chr, 
        LAG3_human_chr, 
        wsize = 20,
        nmatch =7,
        main = "Shroom - size = 20, nmatch = 7")

Protein properties compiled from databases

Human LAG3 protein information

Pfam: sig_p domain from 1-17, and transmembrane domain from 450-474. There are other regions of low complexity throughout protein. http://pfam.xfam.org/protein/P18627 DisProt: No Information available RepeatDB: No Information available UniProt sub-cellular locations: Plasma Membrane PDB secondary structural location: No Information available Alphafold: https://alphafold.ebi.ac.uk/entry/P18627

ALphafold’s predicted structure for human LAG3 has a mixture of beta-pleated sheets and alpha helices, but PDB’s structure for mouse LAG3 has only beta-pleated sheets.

Protein feature Prediction

Secondary structure prediction

aa.1 <- c("A","R","N","D","C","Q","E","G","H","I",
            "L","K","M","F","P","S","T","W","Y","V")
alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91, 
           221, 249, 48, 123, 82, 122, 119, 33, 63, 167)
beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120, 
          177, 115, 16, 85, 127, 341, 253, 44, 110, 229)
a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
              110, 112, 25, 52, 71, 126, 117, 30, 108, 123)
a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239, 
             339, 321, 91, 158, 188, 327, 238, 72, 130, 378)
alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)

aa.prop <- data.frame(alpha.prop,
                      beta.prop,
                      a.plus.b.prop,
                      a.div.b)
#row labels
row.names(aa.prop) <- aa.1
pander(aa.prop)
  alpha.prop beta.prop a.plus.b.prop a.div.b
A 0.1165 0.07313 0.09264 0.08331
R 0.02166 0.02414 0.04129 0.03369
N 0.03964 0.05007 0.06353 0.04223
D 0.06661 0.04359 0.05876 0.05631
C 0.008991 0.02702 0.03917 0.01454
Q 0.02738 0.04395 0.03917 0.02631
E 0.05476 0.03098 0.04553 0.05931
G 0.08051 0.107 0.09052 0.08701
H 0.04536 0.01765 0.01747 0.02469
I 0.03719 0.04323 0.04923 0.05516
L 0.09031 0.06376 0.05823 0.07824
K 0.1018 0.04143 0.05929 0.07408
M 0.01962 0.005764 0.01323 0.021
F 0.05027 0.03062 0.02753 0.03646
P 0.03351 0.04575 0.03759 0.04339
S 0.04986 0.1228 0.0667 0.07547
T 0.04863 0.09114 0.06194 0.05493
W 0.01349 0.01585 0.01588 0.01662
Y 0.02575 0.03963 0.05717 0.03
V 0.06825 0.08249 0.06511 0.08724
human.freq.table <- table(LAG3_human_chr)/length(LAG3_human_chr)
table_to_vector <- function(table_x){
  table_names <- attr(table_x, "dimnames")[[1]]
  table_vect <- as.vector(table_x)
  names(table_vect) <- table_names
  return(table_vect)
}
human.aa.freq <- table_to_vector(human.freq.table)
aa.names <- names(human.aa.freq)
any(aa.names == "U")
## [1] FALSE
aa.prop$human.aa.freq <- human.aa.freq
chou_cor <- function(x,y){
  numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)
}
chou_cosine <- function(z.1, z.2){
  z.1.abs <- sqrt(sum(z.1^2))
  z.2.abs <- sqrt(sum(z.2^2))
  my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs)
  return(my.cosine)
}
par(mfrow = c(2,2), mar = c(1,4,1,1))
plot(alpha.prop ~ human.aa.freq, data = aa.prop)
plot(beta.prop ~ human.aa.freq, data = aa.prop)
plot(a.plus.b.prop  ~ human.aa.freq, data = aa.prop)
plot(a.div.b ~ human.aa.freq, data = aa.prop)

par(mfrow = c(1,1), mar = c(1,1,1,1))
corr.alpha <- chou_cor(aa.prop[,5], aa.prop[,1])
corr.beta  <- chou_cor(aa.prop[,5], aa.prop[,2])
corr.apb   <- chou_cor(aa.prop[,5], aa.prop[,3])
corr.adb   <- chou_cor(aa.prop[,5], aa.prop[,4])
cos.alpha <- chou_cosine(aa.prop[,5], aa.prop[,1])
cos.beta  <- chou_cosine(aa.prop[,5], aa.prop[,2])
cos.apb   <- chou_cosine(aa.prop[,5], aa.prop[,3])
cos.adb   <- chou_cosine(aa.prop[,5], aa.prop[,4])
aa.prop.flipped <- t(aa.prop)
round(aa.prop.flipped,2)
##                  A    R    N    D    C    Q    E    G    H    I    L    K    M
## alpha.prop    0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09 0.10 0.02
## beta.prop     0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06 0.04 0.01
## a.plus.b.prop 0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06 0.06 0.01
## a.div.b       0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08 0.07 0.02
## human.aa.freq 0.08 0.02 0.03 0.06 0.03 0.09 0.02 0.02 0.01 0.13 0.01 0.01 0.12
##                  F    P    S    T    W    Y    V
## alpha.prop    0.05 0.03 0.05 0.05 0.01 0.03 0.07
## beta.prop     0.03 0.05 0.12 0.09 0.02 0.04 0.08
## a.plus.b.prop 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## a.div.b       0.04 0.04 0.08 0.05 0.02 0.03 0.09
## human.aa.freq 0.06 0.08 0.08 0.04 0.06 0.03 0.02
dist.alpha <- dist((aa.prop.flipped[c(1,5),]),  method = "euclidean")
dist.beta  <- dist((aa.prop.flipped[c(2,5),]),  method = "euclidean")
dist.apb   <- dist((aa.prop.flipped[c(3,5),]),  method = "euclidean")
dist.adb  <- dist((aa.prop.flipped[c(4,5),]), method = "euclidean")
fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")

corr.sim <- round(c(corr.alpha,corr.beta,corr.apb,corr.adb),5)
cosine.sim <- round(c(cos.alpha,cos.beta,cos.apb,cos.adb),5)
Euclidean.dist <- round(c(dist.alpha,dist.beta,dist.apb,dist.adb),5)

# summary
sim.sum <- c("","","most.sim","")
dist.sum <- c("","","min.dist","")

df <- data.frame(fold.type,
           corr.sim ,
           cosine.sim ,
           Euclidean.dist ,
           sim.sum ,
           dist.sum )
pander(df)
fold.type corr.sim cosine.sim Euclidean.dist sim.sum dist.sum
alpha 0.6233 0.6233 0.2327
beta 0.6629 0.6629 0.2214
alpha plus beta 0.6953 0.6953 0.2052 most.sim min.dist
alpha/beta 0.6779 0.6779 0.2121

This prediction agrees with Alphafold in predicting a structure with both alpha helices and beta-pleated sheets.

Percent Identity

align.1.2 <- Biostrings::pairwiseAlignment(LAG3_vector[1], LAG3_vector[2])
align.1.3 <- Biostrings::pairwiseAlignment(LAG3_vector[1], LAG3_vector[3])
align.1.4 <- Biostrings::pairwiseAlignment(LAG3_vector[1], LAG3_vector[4])
align.2.3 <- Biostrings::pairwiseAlignment(LAG3_vector[2], LAG3_vector[3])
align.2.4 <- Biostrings::pairwiseAlignment(LAG3_vector[2], LAG3_vector[4])
align.3.4 <- Biostrings::pairwiseAlignment(LAG3_vector[3], LAG3_vector[4])
pids <- c(1,                  NA,     NA,     NA,
          pid(align.1.2),          1,     NA,     NA,
          pid(align.1.3), pid(align.2.3),      1,     NA,
          pid(align.1.4), pid(align.2.4), pid(align.3.4), 1)

mat <- matrix(pids, nrow = 4, byrow = T)
row.names(mat) <- c("Human","Chimp","Bonobo","Gorilla")   
colnames(mat) <- c("Human","Chimp","Bonobo","Gorilla")  
pander::pander(mat)  
  Human Chimp Bonobo Gorilla
Human 1 NA NA NA
Chimp 98.1 1 NA NA
Bonobo 98.11 99.43 1 NA
Gorilla 97.36 98.12 98.68 1
PID_types <- c( "PID1", pid(align.1.2, type = "PID1"), "(aligned positions + internal gap positions)", 
                "PID2", pid(align.1.2, type = "PID2"), "(aligned positions)", 
                "PID3", pid(align.1.2, type = "PID3"), "(length shorter sequence)", 
                "PID4", pid(align.1.2, type = "PID4"), "(average length of the two sequences)")
PID_types_mat <- matrix(PID_types, nrow = 4, byrow = T)
colnames(PID_types_mat) <- c("Method","PID","denominator")  
pander::pander(PID_types_mat)  
Method PID denominator
PID1 98.1024667931689 (aligned positions + internal gap positions)
PID2 98.4761904761905 (aligned positions)
PID3 98.4761904761905 (length shorter sequence)
PID4 98.2889733840304 (average length of the two sequences)

Multiple Sequence Alignment

names.focal <- c('NP_002277.4', 'XP_508966.3', 'XP_003820331.2', 'XP_004052634.1')
LAG3_vector_ss_subset <- LAG3_vector_ss[names.focal]
LAG3_align_subset <-msa(LAG3_vector_ss_subset,
                     method = "ClustalW")
## use default substitution matrix
LAG3_align_subset
## CLUSTAL 2.1  
## 
## Call:
##    msa(LAG3_vector_ss_subset, method = "ClustalW")
## 
## MsaAAMultipleAlignment with 4 rows and 531 columns
##     aln                                                    names
## [1] MWEAQFLGLLFLQPLWVAPVKPLQPG...ELEPEPEPE--LGPEPEPEPE--QL XP_508966.3
## [2] MWEAQFLGLLFLQPLWVAPVKPLQPG...ELEPEPEPE--LGPEPEPEPEPEQL XP_003820331.2
## [3] MWEAQFLGLLFLQPLWVAPVKPLQPG...EPEPEPEPEPELGPEPEPEPEPEQL XP_004052634.1
## [4] MWEAQFLGLLFLQPLWVAPVKPLQPG...EPEPEPEPE----PEPEPEPE--QL NP_002277.4
## Con MWEAQFLGLLFLQPLWVAPVKPLQPG...E?EPEPEPE--LGPEPEPEPE??QL Consensus
class(LAG3_align_subset) <- "AAMultipleAlignment"

LAG3_align_seqinr_subset <- msaConvert(LAG3_align_subset, type = "seqinr::alignment")
ggmsa::ggmsa(LAG3_align_subset,
      start = 250, 
      end = 350) 

Distance Matrix

LAG3_align <-msa(LAG3_vector_ss,
                     method = "ClustalW")
## use default substitution matrix
class(LAG3_align) <- "AAMultipleAlignment"
LAG3_align_seqinr <- msaConvert(LAG3_align, type = "seqinr::alignment")

LAG3_dist <- seqinr::dist.alignment(LAG3_align_seqinr, 
                                       matrix = "identity")
LAG3_dist_rounded <- round(LAG3_dist,
                              digits = 3)
LAG3_dist_rounded
##                XP_508966.3 XP_003820331.2 XP_004052634.1 NP_002277.4
## XP_003820331.2       0.044                                          
## XP_004052634.1       0.107          0.097                           
## NP_002277.4          0.123          0.115          0.123            
## XP_002822880.2       0.180          0.174          0.189       0.180
## XP_044082922.1       0.451          0.452          0.456       0.450
## NP_032505.1          0.548          0.546          0.546       0.544
## XP_024998797         0.834          0.832          0.834       0.834
## XP_005506340.2       0.836          0.835          0.836       0.836
## XP_004915830.2       0.868          0.870          0.868       0.870
##                XP_002822880.2 XP_044082922.1 NP_032505.1 XP_024998797
## XP_003820331.2                                                       
## XP_004052634.1                                                       
## NP_002277.4                                                          
## XP_002822880.2                                                       
## XP_044082922.1          0.445                                        
## NP_032505.1             0.546          0.572                         
## XP_024998797            0.831          0.832       0.836             
## XP_005506340.2          0.835          0.843       0.840        0.493
## XP_004915830.2          0.868          0.871       0.871        0.845
##                XP_005506340.2
## XP_003820331.2               
## XP_004052634.1               
## NP_002277.4                  
## XP_002822880.2               
## XP_044082922.1               
## NP_032505.1                  
## XP_024998797                 
## XP_005506340.2               
## XP_004915830.2          0.840

Phylogeny

tree <- nj(LAG3_dist)

plot.phylo(tree, main="Phylogenetic Tree", 
            use.edge.length = T)

# add label
mtext(text = "LAG3 gene homolog tree - rooted, with branch lenths")

```