INTRODUCTION

TODO: Brief statement in of the gene and its full name and in your own words what is done in your script

EXAMPLE: This code compiles summary information about the gene DIO1 (iodothyronine deiodinase 1). It also generates alignments and a phylogeneitc tree to indicating the evolutionary relationship betweeen the human version of the gene and its homologs in other species.

resources / references

Key information use to make this script can be found here: Refseq Gene: https://www.ncbi.nlm.nih.gov/gene/1733 Refseq Homologene: https://www.ncbi.nlm.nih.gov/homologene/620

Other resources consulted includes

setup

Install any needed packages.

library(BiocManager)
## Bioconductor version '3.13' is out-of-date; the current release version '3.14'
##   is available with R version '4.1'; see https://bioconductor.org/install
BiocManager::install("drawProteins")
## Bioconductor version 3.13 (BiocManager 1.30.16), R 4.1.2 (2021-11-01)
## Warning: package(s) not installed when version(s) same as current; use `force = TRUE` to
##   re-install: 'drawProteins'
## Installation paths not writeable, unable to update packages
##   path: C:/Program Files/R/R-4.1.2/library
##   packages:
##     Matrix
## Old packages: 'digest', 'glue', 'rlang', 'tibble', 'XML'
library(drawProteins)
#install.packages('HGNChelper')
#install.packages("cli")

load the necessary packages

#library(devtools)

# github packages
library(compbio4all)
library(ggmsa)
## Registered S3 methods overwritten by 'ggalt':
##   method                  from   
##   grid.draw.absoluteGrob  ggplot2
##   grobHeight.absoluteGrob ggplot2
##   grobWidth.absoluteGrob  ggplot2
##   grobX.absoluteGrob      ggplot2
##   grobY.absoluteGrob      ggplot2
# CRAN packages
library(rentrez)
library(seqinr)
library(ape)
## 
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
## 
##     as.alignment, consensus
library(pander)

library(ggplot2)

# Bioconductor packages
## msa
library(msa)
## Loading required package: Biostrings
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
## 
##     windows
## Loading required package: XVector
## Loading required package: GenomeInfoDb
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:ape':
## 
##     complement
## The following object is masked from 'package:seqinr':
## 
##     translate
## The following object is masked from 'package:base':
## 
##     strsplit
## 
## Attaching package: 'msa'
## The following object is masked from 'package:BiocManager':
## 
##     version
library(drawProteins)

## Biostrings
library(Biostrings)



#library(HGNChelper)

ACCESSION NUMBERS

acession number table

#Getting all the info
acc_num <- c("NP_001123361","XP_016780936.1","NP_034154.2","NP_001271395.1","NP_001012684.1","NP_001017161.1","NP_571630.2","XP_004626354.1","XP_025731052.1","XP_004746206.1")

unip_acc <- c("P40126","A0A2I3SUR0","P29812","F1PYI1","F1MC37","F7EEJ9","F1QRR2","A0A6P3EM25","A0A3Q7PLZ6","M3YKA6")

pdb_acc <- c("NA","NA","NA","NA","NA","NA","NA","NA","NA","NA")

sci_name <- c("Homo sapiens","Pan troglodytes","Mus musculus","Canis lupus","Bos taurus","Xenopus tropicalis","Danio rerio","Octodon               degus","Callorhinus ursinus","Mustela putorius furo")

com_name <- c("Human","Chimpanzee","Mouse","Dog","Cattle","Frog","Fish","Common degu","Northern fur seal","Ferret")

gene_name <- c("DCT","DCT","DCT","DCT","DCT","DCT","DCT","DCT","DCT","DCT")

convert vector information into a table

dct_df <- data.frame(acc_num, unip_acc, pdb_acc, sci_name, com_name, gene_name)

the finished table

pander::pander(dct_df)
Table continues below
acc_num unip_acc pdb_acc sci_name
NP_001123361 P40126 NA Homo sapiens
XP_016780936.1 A0A2I3SUR0 NA Pan troglodytes
NP_034154.2 P29812 NA Mus musculus
NP_001271395.1 F1PYI1 NA Canis lupus
NP_001012684.1 F1MC37 NA Bos taurus
NP_001017161.1 F7EEJ9 NA Xenopus tropicalis
NP_571630.2 F1QRR2 NA Danio rerio
XP_004626354.1 A0A6P3EM25 NA Octodon degus
XP_025731052.1 A0A3Q7PLZ6 NA Callorhinus ursinus
XP_004746206.1 M3YKA6 NA Mustela putorius furo
com_name gene_name
Human DCT
Chimpanzee DCT
Mouse DCT
Dog DCT
Cattle DCT
Frog DCT
Fish DCT
Common degu DCT
Northern fur seal DCT
Ferret DCT

DATA PREPARATION

download sequences

#dowload each FASTA sequence individually
human_dct <- rentrez::entrez_fetch(db = "protein", 
                          id = "NP_001123361", 
                          rettype = "fasta")
chimp_dct <- rentrez::entrez_fetch(db = "protein", 
                          id = "XP_016780936.1", 
                          rettype = "fasta")
mouse_dct <- rentrez::entrez_fetch(db = "protein", 
                          id = "NP_034154.2", 
                          rettype = "fasta")
dog_dct <- rentrez::entrez_fetch(db = "protein", 
                          id = "NP_001271395.1", 
                          rettype = "fasta")
cattle_dct <- rentrez::entrez_fetch(db = "protein", 
                          id = "NP_001012684.1", 
                          rettype = "fasta")
frog_dct <- rentrez::entrez_fetch(db = "protein", 
                          id = "NP_001017161.1", 
                          rettype = "fasta")
fish_dct <- rentrez::entrez_fetch(db = "protein", 
                          id = "NP_571630.2", 
                          rettype = "fasta")
degu_dct <- rentrez::entrez_fetch(db = "protein", 
                          id = "XP_004626354.1", 
                          rettype = "fasta")
seal_dct <- rentrez::entrez_fetch(db = "protein", 
                          id = "XP_025731052.1", 
                          rettype = "fasta")
ferret_dct <- rentrez::entrez_fetch(db = "protein", 
                          id = "XP_004746206.1", 
                          rettype = "fasta")

#put it all into a list
dct_list <- list(human_dct, chimp_dct, mouse_dct, dog_dct, cattle_dct, frog_dct, fish_dct, degu_dct, seal_dct, ferret_dct)

#check length of list to make sure there are 10
length(dct_list)
## [1] 10
#quick check to see the first entry
dct_list[[1]]
## [1] ">NP_001123361.1 L-dopachrome tautomerase isoform 2 precursor [Homo sapiens]\nMSPLWWGFLLSCLGCKILPGAQGQFPRVCMTVDSLVNKECCPRLGAESANVCGSQQGRGQCTEVRADTRP\nWSGPYILRNQDDRELWPRKFFHRTCKCTGNFAGYNCGDCKFGWTGPNCERKKPPVIRQNIHSLSPQEREQ\nFLGALDLAKKRVHPDYVITTQHWLGLLGPNGTQPQFANCSVYDFFVWLHYYSVRDTLLGPGRPYRAIDFS\nHQGPAFVTWHRYHLLCLERDLQRLIGNESFALPYWNFATGRNECDVCTDQLFGAARPDDPTLISRNSRFS\nSWETVCDSLDDYNHLVTLCNGTYEGLLRRNQMGRNSMKLPTLKDIRDCLSLQKFDNPPFFQNSTFSFRNA\nLEGFDKADGTLDSQVMSLHNLVHSFLNGTNALPHSAANDPIFVVISNRLLYNATTNILEHVRKEKATKEL\nPSLHVLVLHSFTDAIFDEWMKRFNPPADAWPQELAPIGHNRMYNMVPFFPPVTNEELFLTSDQLGYSYAI\nDLPVSVEETPGWPTTLLVVMGTLVALVGLFVLLAFLQYRRLRKGYTPLMETHLSSKRYTEEA\n\n"
names(dct_list)
## NULL

initial data cleaning

#remove FASTA header
for(i in 1:length(dct_list)){
  dct_list[[i]] <- compbio4all::fasta_cleaner(dct_list[[i]], parse = F)
}
#test
dct_list[[1]]
## [1] "MSPLWWGFLLSCLGCKILPGAQGQFPRVCMTVDSLVNKECCPRLGAESANVCGSQQGRGQCTEVRADTRPWSGPYILRNQDDRELWPRKFFHRTCKCTGNFAGYNCGDCKFGWTGPNCERKKPPVIRQNIHSLSPQEREQFLGALDLAKKRVHPDYVITTQHWLGLLGPNGTQPQFANCSVYDFFVWLHYYSVRDTLLGPGRPYRAIDFSHQGPAFVTWHRYHLLCLERDLQRLIGNESFALPYWNFATGRNECDVCTDQLFGAARPDDPTLISRNSRFSSWETVCDSLDDYNHLVTLCNGTYEGLLRRNQMGRNSMKLPTLKDIRDCLSLQKFDNPPFFQNSTFSFRNALEGFDKADGTLDSQVMSLHNLVHSFLNGTNALPHSAANDPIFVVISNRLLYNATTNILEHVRKEKATKELPSLHVLVLHSFTDAIFDEWMKRFNPPADAWPQELAPIGHNRMYNMVPFFPPVTNEELFLTSDQLGYSYAIDLPVSVEETPGWPTTLLVVMGTLVALVGLFVLLAFLQYRRLRKGYTPLMETHLSSKRYTEEA"

GENERAL PROTEIN INFORMATION

setup of protein diagram

P40126_json <- drawProteins::get_features("P40126")
## [1] "Download has worked"
my_prot_df <- drawProteins::feature_to_dataframe(P40126_json)
is(my_prot_df)
## [1] "data.frame"       "list"             "oldClass"         "vector"          
## [5] "list_OR_List"     "vector_OR_Vector" "vector_OR_factor"

present diagram

my_canvas <- draw_canvas(my_prot_df)
my_canvas <- draw_chains(my_canvas, my_prot_df, label_size = 2.5)
my_canvas <- draw_recept_dom(my_canvas, my_prot_df)
my_canvas

fasta_cleaner <- function(fasta_object, parse = TRUE){

         #header <- gsub("\\|","",header)
         #header <- gsub("\\+","",header)
         #header <- gsub("\\=","",header)
         #header <- gsub("\\:","",header)
         #header <- gsub("\\;","",header)

         #fasta_object <- gsub("\\|","",fasta_object)
         #fasta_object <- gsub("\\+","",fasta_object)
         #fasta_object <- gsub("\\=","",fasta_object)
         #fasta_object <- gsub("\\:","",fasta_object)
         #fasta_object <- gsub("\\;","",fasta_object)

         #fasta_object <- gsub(header, "", fasta_object)

        fasta_object <- sub("^(>)(.*?)(\\n)(.*)(\\n\\n)","\\4",fasta_object)
        fasta_object <- gsub("\n", "", fasta_object)

         if(parse == TRUE){
         fasta_object <- stringr::str_split(fasta_object,
                                   pattern = "",
                                   simplify = FALSE)
         }

         return(fasta_object[[1]])
 }

dotplot

dct_human_vector <- fasta_cleaner(human_dct)


# set up 2 x 2 grid
par(mfrow = c(2,2), 
    mar = c(0,0,2,1))

# plot
dotPlot(dct_human_vector, dct_human_vector, 
        wsize = 1, 
        nmatch = 1, 
        main = "Human DCT")

protein properties compiled from databases

program <- c("Pfam", "DisProt", "RepeatsDB", "Subcellular Location", "Protein Secondary Structural Classifications")
key_protein_properties <- c("Tyrosinase is a named domain that's present", "NA", "NA", "The subcellular location is the melanosome", "NA")
entry_url <- c("http://pfam.xfam.org/protein/P40126", "https://www.disprot.org/browse?free_text=P40126&from_homepage=true", "https://repeatsdb.bio.unipd.it/protein/P40126", "https://www.uniprot.org/uniprot/P40126#subcellular_location", "No PDB accession")

properties.df <- data.frame(program, key_protein_properties, entry_url)

pander::pander(properties.df)
Table continues below
program key_protein_properties
Pfam Tyrosinase is a named domain that’s present
DisProt NA
RepeatsDB NA
Subcellular Location The subcellular location is the melanosome
Protein Secondary Structural Classifications NA
entry_url
http://pfam.xfam.org/protein/P40126
https://www.disprot.org/browse?free_text=P40126&from_homepage=true
https://repeatsdb.bio.unipd.it/protein/P40126
https://www.uniprot.org/uniprot/P40126#subcellular_location
No PDB accession

PROTEIN FEATURE PREDICTION

secondary structure prediction

# enter once
aa.1.1 <- c("A","R","N","D","C","Q","E","G","H","I",
            "L","K","M","F","P","S","T","W","Y","V")

# enter again
aa.1.2 <- c("A","R","N","D","C","Q","E","G","H","I",
            "L","K","M","F","P","S","T","W","Y","V")

# are they the same length?
length(aa.1.1) == length(aa.1.2)
## [1] TRUE

make vectors with the frequency of each amino acid in the databse of proteins of each type used by Chou

# alpha proteins
alpha <- c(285, 53, 97, 163, 22, 67, 134, 197, 111, 91, 
           221, 249, 48, 123, 82, 122, 119, 33, 63, 167)

# check against chou's total
sum(alpha) == 2447
## [1] TRUE
# beta proteins
beta <- c(203, 67, 139, 121, 75, 122, 86, 297, 49, 120, 
          177, 115, 16, 85, 127, 341, 253, 44, 110, 229)

# check against chou's total
sum(beta) == 2776
## [1] TRUE
# alpha + beta
a.plus.b <- c(175, 78, 120, 111, 74, 74, 86, 171, 33, 93,
              110, 112, 25, 52, 71, 126, 117, 30, 108, 123)
sum(a.plus.b) == 1889
## [1] TRUE
# alpha/beta
a.div.b <- c(361, 146, 183, 244, 63, 114, 257, 377, 107, 239, 
             339, 321, 91, 158, 188, 327, 238, 72, 130, 378)
sum(a.div.b) == 4333
## [1] TRUE

build table

data.frame(aa.1.1, alpha, beta, a.plus.b, a.div.b)
##    aa.1.1 alpha beta a.plus.b a.div.b
## 1       A   285  203      175     361
## 2       R    53   67       78     146
## 3       N    97  139      120     183
## 4       D   163  121      111     244
## 5       C    22   75       74      63
## 6       Q    67  122       74     114
## 7       E   134   86       86     257
## 8       G   197  297      171     377
## 9       H   111   49       33     107
## 10      I    91  120       93     239
## 11      L   221  177      110     339
## 12      K   249  115      112     321
## 13      M    48   16       25      91
## 14      F   123   85       52     158
## 15      P    82  127       71     188
## 16      S   122  341      126     327
## 17      T   119  253      117     238
## 18      W    33   44       30      72
## 19      Y    63  110      108     130
## 20      V   167  229      123     378
pander(data.frame(aa.1.1, alpha, beta, a.plus.b, a.div.b))
aa.1.1 alpha beta a.plus.b a.div.b
A 285 203 175 361
R 53 67 78 146
N 97 139 120 183
D 163 121 111 244
C 22 75 74 63
Q 67 122 74 114
E 134 86 86 257
G 197 297 171 377
H 111 49 33 107
I 91 120 93 239
L 221 177 110 339
K 249 115 112 321
M 48 16 25 91
F 123 85 52 158
P 82 127 71 188
S 122 341 126 327
T 119 253 117 238
W 33 44 30 72
Y 63 110 108 130
V 167 229 123 378

calculate proportions for each of the four protein fold types

alpha.prop <- alpha/sum(alpha)
beta.prop <- beta/sum(beta)
a.plus.b.prop <- a.plus.b/sum(a.plus.b)
a.div.b <- a.div.b/sum(a.div.b)

create another data frame

#dataframe
aa.prop <- data.frame(alpha.prop,
                      beta.prop,
                      a.plus.b.prop,
                      a.div.b)
#row labels
row.names(aa.prop) <- aa.1.1

aa.prop
##    alpha.prop   beta.prop a.plus.b.prop    a.div.b
## A 0.116469146 0.073126801    0.09264161 0.08331410
## R 0.021659174 0.024135447    0.04129169 0.03369490
## N 0.039640376 0.050072046    0.06352567 0.04223402
## D 0.066612178 0.043587896    0.05876125 0.05631202
## C 0.008990601 0.027017291    0.03917417 0.01453958
## Q 0.027380466 0.043948127    0.03917417 0.02630972
## E 0.054760932 0.030979827    0.04552673 0.05931225
## G 0.080506743 0.106988473    0.09052409 0.08700669
## H 0.045361667 0.017651297    0.01746956 0.02469421
## I 0.037188394 0.043227666    0.04923240 0.05515809
## L 0.090314671 0.063760807    0.05823187 0.07823679
## K 0.101757254 0.041426513    0.05929063 0.07408262
## M 0.019615856 0.005763689    0.01323452 0.02100162
## F 0.050265631 0.030619597    0.02752779 0.03646434
## P 0.033510421 0.045749280    0.03758602 0.04338795
## S 0.049856968 0.122838617    0.06670196 0.07546734
## T 0.048630977 0.091138329    0.06193753 0.05492730
## W 0.013485901 0.015850144    0.01588142 0.01661666
## Y 0.025745811 0.039625360    0.05717311 0.03000231
## V 0.068246833 0.082492795    0.06511382 0.08723748

determine the number of each amino acid in my protein

table_to_vector <- function(table_x){
  table_names <- attr(table_x, "dimnames")[[1]]
  table_vect <- as.vector(table_x)
  names(table_vect) <- table_names
  return(table_vect)
}

#build new table with specific frequencies of DCT
dct_human_table <- table(dct_human_vector)/length(dct_human_vector)
DCT.human.aa.freq <- table_to_vector(dct_human_table)
DCT.human.aa.freq
##          A          C          D          E          F          G          H 
## 0.05253623 0.03442029 0.05434783 0.04528986 0.05797101 0.06521739 0.03079710 
##          I          K          L          M          N          P          Q 
## 0.02717391 0.03442029 0.11775362 0.01811594 0.05615942 0.06702899 0.03985507 
##          R          S          T          V          W          Y 
## 0.06521739 0.05978261 0.06159420 0.05615942 0.02355072 0.03260870

check for presence of “U”

aa.names <- names(DCT.human.aa.freq)
any(aa.names == "U")
## [1] FALSE
#i.U <- which(aa.names == "U")
#aa.names[i.U]

#DCT.human.aa.freq[i.U]

add data on my focal protein to the amino acid frequency table.

aa.prop$DCT.human.aa.freq <- DCT.human.aa.freq
pander::pander(aa.prop)
  alpha.prop beta.prop a.plus.b.prop a.div.b DCT.human.aa.freq
A 0.1165 0.07313 0.09264 0.08331 0.05254
R 0.02166 0.02414 0.04129 0.03369 0.03442
N 0.03964 0.05007 0.06353 0.04223 0.05435
D 0.06661 0.04359 0.05876 0.05631 0.04529
C 0.008991 0.02702 0.03917 0.01454 0.05797
Q 0.02738 0.04395 0.03917 0.02631 0.06522
E 0.05476 0.03098 0.04553 0.05931 0.0308
G 0.08051 0.107 0.09052 0.08701 0.02717
H 0.04536 0.01765 0.01747 0.02469 0.03442
I 0.03719 0.04323 0.04923 0.05516 0.1178
L 0.09031 0.06376 0.05823 0.07824 0.01812
K 0.1018 0.04143 0.05929 0.07408 0.05616
M 0.01962 0.005764 0.01323 0.021 0.06703
F 0.05027 0.03062 0.02753 0.03646 0.03986
P 0.03351 0.04575 0.03759 0.04339 0.06522
S 0.04986 0.1228 0.0667 0.07547 0.05978
T 0.04863 0.09114 0.06194 0.05493 0.06159
W 0.01349 0.01585 0.01588 0.01662 0.05616
Y 0.02575 0.03963 0.05717 0.03 0.02355
V 0.06825 0.08249 0.06511 0.08724 0.03261

functions to calculate similarity

# correlation
chou_cor <- function(x,y){
  numerator <- sum(x*y)
denominator <- sqrt((sum(x^2))*(sum(y^2)))
result <- numerator/denominator
return(result)
}

# cosine similarity
chou_cosine <- function(z.1, z.2){
  z.1.abs <- sqrt(sum(z.1^2))
  z.2.abs <- sqrt(sum(z.2^2))
  my.cosine <- sum(z.1*z.2)/(z.1.abs*z.2.abs)
  return(my.cosine)
}

data exploration

par(mfrow = c(2,2), mar = c(1,4,1,1))
plot(alpha.prop ~ DCT.human.aa.freq, data = aa.prop)
plot(beta.prop ~ DCT.human.aa.freq, data = aa.prop)
plot(a.plus.b.prop  ~ DCT.human.aa.freq, data = aa.prop)
plot(a.div.b ~ DCT.human.aa.freq, data = aa.prop)

calculate correlation between each column and cosine similarity and distance

#correlation
corr.alpha <- chou_cor(aa.prop[,5], aa.prop[,1])
corr.beta  <- chou_cor(aa.prop[,5], aa.prop[,2])
corr.apb   <- chou_cor(aa.prop[,5], aa.prop[,3])
corr.adb   <- chou_cor(aa.prop[,5], aa.prop[,4])

#cosine sim
cos.alpha <- chou_cosine(aa.prop[,5], aa.prop[,1])
cos.beta  <- chou_cosine(aa.prop[,5], aa.prop[,2])
cos.apb   <- chou_cosine(aa.prop[,5], aa.prop[,3])
cos.adb   <- chou_cosine(aa.prop[,5], aa.prop[,4])

#distance
aa.prop.flipped <- t(aa.prop)
round(aa.prop.flipped,2)
##                      A    R    N    D    C    Q    E    G    H    I    L    K
## alpha.prop        0.12 0.02 0.04 0.07 0.01 0.03 0.05 0.08 0.05 0.04 0.09 0.10
## beta.prop         0.07 0.02 0.05 0.04 0.03 0.04 0.03 0.11 0.02 0.04 0.06 0.04
## a.plus.b.prop     0.09 0.04 0.06 0.06 0.04 0.04 0.05 0.09 0.02 0.05 0.06 0.06
## a.div.b           0.08 0.03 0.04 0.06 0.01 0.03 0.06 0.09 0.02 0.06 0.08 0.07
## DCT.human.aa.freq 0.05 0.03 0.05 0.05 0.06 0.07 0.03 0.03 0.03 0.12 0.02 0.06
##                      M    F    P    S    T    W    Y    V
## alpha.prop        0.02 0.05 0.03 0.05 0.05 0.01 0.03 0.07
## beta.prop         0.01 0.03 0.05 0.12 0.09 0.02 0.04 0.08
## a.plus.b.prop     0.01 0.03 0.04 0.07 0.06 0.02 0.06 0.07
## a.div.b           0.02 0.04 0.04 0.08 0.05 0.02 0.03 0.09
## DCT.human.aa.freq 0.07 0.04 0.07 0.06 0.06 0.06 0.02 0.03

distance matrix

dist(aa.prop.flipped, method = "euclidean")
##                   alpha.prop  beta.prop a.plus.b.prop    a.div.b
## beta.prop         0.13342098                                    
## a.plus.b.prop     0.09281824 0.08289406                         
## a.div.b           0.06699039 0.08659174    0.06175113           
## DCT.human.aa.freq 0.18109963 0.17293345    0.14646291 0.15634679

individual distances

dist.alpha <- dist((aa.prop.flipped[c(1,5),]),  method = "euclidean")
dist.beta  <- dist((aa.prop.flipped[c(2,5),]),  method = "euclidean")
dist.apb   <- dist((aa.prop.flipped[c(3,5),]),  method = "euclidean")
dist.adb  <- dist((aa.prop.flipped[c(4,5),]), method = "euclidean")

compile the info

# fold types
fold.type <- c("alpha","beta","alpha plus beta", "alpha/beta")

# data
corr.sim <- round(c(corr.alpha,corr.beta,corr.apb,corr.adb),5)
cosine.sim <- round(c(cos.alpha,cos.beta,cos.apb,cos.adb),5)
Euclidean.dist <- round(c(dist.alpha,dist.beta,dist.apb,dist.adb),5)

# summary
sim.sum <- c("","","most.sim","")
dist.sum <- c("","","min.dist","")

df <- data.frame(fold.type,
           corr.sim ,
           cosine.sim ,
           Euclidean.dist ,
           sim.sum ,
           dist.sum )
# display
pander::pander(df)
fold.type corr.sim cosine.sim Euclidean.dist sim.sum dist.sum
alpha 0.7409 0.7409 0.1811
beta 0.7678 0.7678 0.1729
alpha plus beta 0.8188 0.8188 0.1465 most.sim min.dist
alpha/beta 0.7976 0.7976 0.1563

PERCENT IDENTITY COMPARISONS (PID)

PID table

#converting the FASTA files in the original list into a single vector
dct_vector <- unlist(dct_list)
cat(dct_vector)
## MSPLWWGFLLSCLGCKILPGAQGQFPRVCMTVDSLVNKECCPRLGAESANVCGSQQGRGQCTEVRADTRPWSGPYILRNQDDRELWPRKFFHRTCKCTGNFAGYNCGDCKFGWTGPNCERKKPPVIRQNIHSLSPQEREQFLGALDLAKKRVHPDYVITTQHWLGLLGPNGTQPQFANCSVYDFFVWLHYYSVRDTLLGPGRPYRAIDFSHQGPAFVTWHRYHLLCLERDLQRLIGNESFALPYWNFATGRNECDVCTDQLFGAARPDDPTLISRNSRFSSWETVCDSLDDYNHLVTLCNGTYEGLLRRNQMGRNSMKLPTLKDIRDCLSLQKFDNPPFFQNSTFSFRNALEGFDKADGTLDSQVMSLHNLVHSFLNGTNALPHSAANDPIFVVISNRLLYNATTNILEHVRKEKATKELPSLHVLVLHSFTDAIFDEWMKRFNPPADAWPQELAPIGHNRMYNMVPFFPPVTNEELFLTSDQLGYSYAIDLPVSVEETPGWPTTLLVVMGTLVALVGLFVLLAFLQYRRLRKGYTPLMETHLSSKRYTEEA MSPLWWGFLLSCLGCKILPGAQAQFPRVCMTVDSLVNKECCPRLGAESANVCGSQQGRGQCTEVRADTRPWSGPYILRNQDDRELWPRKFFHRTCKCTGNFAGYNCGDCKFGWTGPNCERKKPPVIRQNIHSLSPQEREQFLGALDLAKKSVHPDYVITTQHWLGLLGPNGTQPQFANCSVYDFFVWLHYYSVRDTLLGPGRPYRAIDFSHQGPAFVTWHRYHLLCLERDLQRLIGNESFALPYWNFATGRNECDVCTDQLFGEARPDDPTLISRNSRFSSWETVCDSLDDYNHRVTLCNGTYEGLLRRNQMGRNSMKLPTLKDIRDCLSLQKFDNPPFFQNSTFSFRNALEGFDKADGTLDSQVMSLHNLVHSFLNGTNALPHSAANDPIYVVISNRLLYNATTNILEHVRKEKATKELPSLHVLVLHSFTDAIFDEWMKRFNPPADAWPQELAPIGHNRMYNMVPFFPPVTNEELFLTSDQLGYSYAIDLPVSVEETPGWPTTLLVVMGMLVALVGLFVLLAFLQYRRLRKGYTPLMETHLSSKRYTEEA MGLVGWGLLLGCLGCGILLRARAQFPRVCMTLDGVLNKECCPPLGPEATNICGFLEGRGQCAEVQTDTRPWSGPYILRNQDDREQWPRKFFNRTCKCTGNFAGYNCGGCKFGWTGPDCNRKKPAILRRNIHSLTAQEREQFLGALDLAKKSIHPDYVITTQHWLGLLGPNGTQPQIANCSVYDFFVWLHYYSVRDTLLGPGRPYKAIDFSHQGPAFVTWHRYHLLWLERELQRLTGNESFALPYWNFATGKNECDVCTDELLGAARQDDPTLISRNSRFSTWEIVCDSLDDYNRRVTLCNGTYEGLLRRNKVGRNNEKLPTLKNVQDCLSLQKFDSPPFFQNSTFSFRNALEGFDKADGTLDSQVMNLHNLAHSFLNGTNALPHSAANDPVFVVLHSFTDAIFDEWLKRNNPSTDAWPQELAPIGHNRMYNMVPFFPPVTNEELFLTAEQLGYNYAVDLSEEEAPVWSTTLSVVIGILGAFVLLLGLLAFLQYRRLRKGYAPLMETGLSSKRYTEEA MQAQELKSGRGEGQDRIREWKARGSSPADKAMSPLGWGLVLCCLGGGLLPGAWAQFPRVCMTVDNLMAKECCPPLGLEPANICGSQEGRGQCMEVQTDARPWSGPYVLRNQDDREGWPRKFFHRTCQCTGHYGGYNCGVCKFGWTGHDCNQRKARVIRKNIHSLTPEEREQFLEALDLAKNTTHPDYVITTQHWLGLLGPNGTQPQIANCSIYDFFVWLHYYSVRDTLLGPGRPYKAIDFSHQGPAFVTWHRYHLLWLERDLQQLIGNESFALPYWNFATGRNDCDVCTDQLLGAARQDDPTLISENSRFSNWEIVCDSLDDYNRRVTLCNGTYEGLLRRNQVGLNSEKLPTLKDIQDCLSLKKFDSPPFFQNSTFSFRNALEGFDKANGILDSQVMSLHNLVHSFLNGTSALPHSAANDPVFVVLHSFTDAIFDEWMKRSNPSVDAWPQELAPIGHNRMYNMVPFFPPVTNEELFLTADQLGYSYAIDLPVEETPAWTTTLLAVMGMLVALVGVFVVLFFLQYRRLRKGYTPLMETHLSDRKYTEEA MSPLGWGLLLGCLGCALPSGARAQFPRVCMTVGSLQAKECCPPLGADPANVCGSREGRGQCAEVQTDTRPWSGPYVLRNQDDRERWPRKFFDRTCRCTGNFAGYNCGNCRFGWTGPKCDQKKPLVVRRDVHSLTPQEREQFLDALDLAKYTPHPDYVITTQHWLGLLGPNGTQPQIANCSIYDFFVWLHYYSVRDTLLGPGRPYKAIDFSHQGPAFVTWHRYHLLWLERDLQRLTGNESFALPYWNFATGRNECDVCTDQLLGAARQDDPTLISQNSRFSSWEIVCDSLNDYNRRVTLCNGTYEGLLKRNQMGRNSEKLPTLKDIQNCLSLKKFDSPPFFQNSTLSFRNALEGFGKADGTLDSQVMNFHNLVHSFLNGTSALPHSAANDPVFVVLHSFTDAIFDEWMKRFNPPVDAWPRELAPIGHNRMYNMVPFFPPVTNEELFLTADQLGYSYAIDLPVEETPDWTTVLSVVTGMLVVLVVLFALLLFLQYRRLRKGYTPLVETQLSNKRYTEEA MECKGVRAVLLLACCLQAVRAQFPRPCVTVEALTSKRCCPALGSDLNNVCGSLEGRGQCTPVVVDERPWSGPYTLRNVDDRERWPLKFFNSICRCTGNFAGFNCGECKFGWTGPTCEQRKPPVVRKNIHSLTAQERTQFLDALDQAKNTIHPDYVIATQHWLSILGPNGTEPQVANTSIYNYFVWLHYYSVRDTLLGPGRPFTAIDFSHQGPAFVTWHRYHLLLLERDLQRMTGNESFALPYWNFATGRNECDVCTDELFGAPRLDDPNLISAGSRFSRWGIVCNSLNDYNRLVTLCNGTNEGFLQRNPLGGAEGRLPSMEDVQKCLSLNEFDNPPFFRNSTFSFRNALEGFDEPDGTLNSTAMSLHNLVHSFLNGTSAQSHSSANDPIFVVLHSFTDAIFDEWMKRFQPSDVAWPQELAPIGHNRMYNMVPFFPPVTNEELFLPAEQLGYVYSIDLPASIEDTRAVAVLVGSTIGGILLAVLLLLLLLVLFMQRKRQQGFEPLMNATFTNKRYTEDA MKRFVLMLCCFGCMYLSEAQFPRVCCTVEGIVSKECCPALGSDPANVCGLLAGRGNCSDVHVDSKPWGGPYSLRNVDDRERWPTKFFNRTCRCFGNFAGYNCGECKFGWTGLNCDQRKAPVLRKDIHSLSPEELQEFLNALDLSKTTVHPDYMIASQHWLGLLGPNGTEPQFSNISIYDFFVWQHYYSVRDTLLGPGRPFKAIDFSHKGPAFITWHRYHLLSLERELQKLTGNENFAVPYWNFATGQSECDVCTDSLLGGRDPSDPSRISPRSRFARWGVVCDSLDEYNRLVTLCNGTNEGSLRRGAAMSTNVTLPTMNDVRSCLNLRDFDSPPYFTNSTFSFRNALEGYDKPDGELDTSVSNLHNLVHQFLNGTSALSHSAANDPIFLVLHAFTDAIFDEWMRRFRENETFPDEMAPIGHNRHYNMVPFFPPVTNEEIYIASEQLGYSYSIDLEELDNRPAMFLLGSTLGGIFLGLLLLLIVLVLYIQQRRKREFEPLLNAEFTNTKYSGEA MRLLPWGLLFCCLGCRIQPGVWAQFPRICMTLDSLRSKECCPALGAEPGNVCGSRDGRGQCSEVQADTRPWSGPYVLRNQDDREQWPRKFFNRTCRCTGNFAGYNCGDCWFGWTGPNCDQRKPLVVRQNIHSLTPQEREQFLGALDLAKKTTHPDYVITTQHWLGLLGPNGTQPQITNCSIYDFFVWLHYYSVRDTLLGPGRPFKAIDFSHQGPAFVTWHRYHLLWLERDLQRLTGNESFALPYWNFATGRNECDVCTDQLLGAARPDDPTLISQNSRFSSWEIVCDSLDDYNRRVTLCNGTYEGLLRRHQVGRNSETLPTLKNIQDCLSLQKFDNAPFFQNSTFSFRNALEGFDKADGTLDSQVVSLHNLVHSFLNGTSAFPHSAANDPVFVVLHSFTDGIFDEWMKRHKPAADAWPEELAPIGHNRMYNMVPFFPPVTNEELFLTTDHLGYSYAIDWPVTVEETPTWSAVLSVVVSTLAAVVGLFVLLIFLQYRRLRRGYAPLIETHLIRKKYTEEAEGAQTVP MSPLGWGLVLSCLGCGILPGAHAQFPRVCMTVDSLMTKECCPSLGPEPDNICGSQEGRGQCMEVQADTRPWSGPYVLRNQDDREGWPRKFFHRTCRCTGHFAGYNCGVCKFGWTGHDCNQRKAPVIRKNIHSLTAQEREQFLDALDLAKNTTHPNYVITTQHWLGLLGPNGTQPQIANCSIYDFFVWLHYYSVRDTLLGPGRPYKAIDFSHQGPAFVTWHRYHLLWLERDLQQLIDNESFALPYWNFATGRNDCDVCTDQLLGAARQDDPTLISQNSRFSSWEIICDSLDDYNRRVTLCNGTYEGLLRRNQVGLNSEKLPTLKDIQDCLSLKKFDSPPFFQNSTFSFRNALEGFDKANGILDSQVMSLHNLVHSFLNGTSALPHSAANDPVFVVLHSFTDAIFDEWMKRSNPSVDAWPQELAPIGHNRMYNMVPFFPPVTNEEFFLTADQLGYSYAIDLPAEETPGWTTTFLVVAGMLGALAGVFVVLFFLQYRRLRKGYTPLVETHLSNKKYTEEA MCPLGWGLLLSCLGCALLPGAHAQFPRVCMTVDSLMSKECCPSLGPEPDNICGSQEGRGQCTEVQADTRPWSGPYVLRNQDDREGWPRKFFHRTCRCTGHFAGYNCGGCKFGWTGHDCSQRKAPVTRKNLHSLTAQEREQFLDALDLAKNTTHPDYVITTQHWLGLLGPNGTQPQIANCSIYDFFVWLHYYSVRDTLLGPGRPYKAIDFSHQGPAFVTWHRYHLLWLERDLQQLIGNESFALPYWNFATGRNDCDVCTDQLLGAARQDDPTLISQNSRFSNWEIVCDSLDDYNRRVTLCNGTYEGLLRRNQVGLNSAKLPTLKDIQDCLSLKKFDSPPFFQNSTFSFRNALEGFDKANGILDSQVMSLHNLVHSFLNGTSALPHSAANDPIFVVLHSFTDAIFDEWMKRSNPSLDAWPQELAPIGHNRMYNMVPFFPPVTNEEFFLTADQLGYSYAINLPAEETPGWTPTLLVVMGMLVALAGVFVVLFFLQYRRLRKGYTPLAETHLNNKKYTEEA
dct_vector
##  [1] "MSPLWWGFLLSCLGCKILPGAQGQFPRVCMTVDSLVNKECCPRLGAESANVCGSQQGRGQCTEVRADTRPWSGPYILRNQDDRELWPRKFFHRTCKCTGNFAGYNCGDCKFGWTGPNCERKKPPVIRQNIHSLSPQEREQFLGALDLAKKRVHPDYVITTQHWLGLLGPNGTQPQFANCSVYDFFVWLHYYSVRDTLLGPGRPYRAIDFSHQGPAFVTWHRYHLLCLERDLQRLIGNESFALPYWNFATGRNECDVCTDQLFGAARPDDPTLISRNSRFSSWETVCDSLDDYNHLVTLCNGTYEGLLRRNQMGRNSMKLPTLKDIRDCLSLQKFDNPPFFQNSTFSFRNALEGFDKADGTLDSQVMSLHNLVHSFLNGTNALPHSAANDPIFVVISNRLLYNATTNILEHVRKEKATKELPSLHVLVLHSFTDAIFDEWMKRFNPPADAWPQELAPIGHNRMYNMVPFFPPVTNEELFLTSDQLGYSYAIDLPVSVEETPGWPTTLLVVMGTLVALVGLFVLLAFLQYRRLRKGYTPLMETHLSSKRYTEEA"
##  [2] "MSPLWWGFLLSCLGCKILPGAQAQFPRVCMTVDSLVNKECCPRLGAESANVCGSQQGRGQCTEVRADTRPWSGPYILRNQDDRELWPRKFFHRTCKCTGNFAGYNCGDCKFGWTGPNCERKKPPVIRQNIHSLSPQEREQFLGALDLAKKSVHPDYVITTQHWLGLLGPNGTQPQFANCSVYDFFVWLHYYSVRDTLLGPGRPYRAIDFSHQGPAFVTWHRYHLLCLERDLQRLIGNESFALPYWNFATGRNECDVCTDQLFGEARPDDPTLISRNSRFSSWETVCDSLDDYNHRVTLCNGTYEGLLRRNQMGRNSMKLPTLKDIRDCLSLQKFDNPPFFQNSTFSFRNALEGFDKADGTLDSQVMSLHNLVHSFLNGTNALPHSAANDPIYVVISNRLLYNATTNILEHVRKEKATKELPSLHVLVLHSFTDAIFDEWMKRFNPPADAWPQELAPIGHNRMYNMVPFFPPVTNEELFLTSDQLGYSYAIDLPVSVEETPGWPTTLLVVMGMLVALVGLFVLLAFLQYRRLRKGYTPLMETHLSSKRYTEEA"
##  [3] "MGLVGWGLLLGCLGCGILLRARAQFPRVCMTLDGVLNKECCPPLGPEATNICGFLEGRGQCAEVQTDTRPWSGPYILRNQDDREQWPRKFFNRTCKCTGNFAGYNCGGCKFGWTGPDCNRKKPAILRRNIHSLTAQEREQFLGALDLAKKSIHPDYVITTQHWLGLLGPNGTQPQIANCSVYDFFVWLHYYSVRDTLLGPGRPYKAIDFSHQGPAFVTWHRYHLLWLERELQRLTGNESFALPYWNFATGKNECDVCTDELLGAARQDDPTLISRNSRFSTWEIVCDSLDDYNRRVTLCNGTYEGLLRRNKVGRNNEKLPTLKNVQDCLSLQKFDSPPFFQNSTFSFRNALEGFDKADGTLDSQVMNLHNLAHSFLNGTNALPHSAANDPVFVVLHSFTDAIFDEWLKRNNPSTDAWPQELAPIGHNRMYNMVPFFPPVTNEELFLTAEQLGYNYAVDLSEEEAPVWSTTLSVVIGILGAFVLLLGLLAFLQYRRLRKGYAPLMETGLSSKRYTEEA"                                   
##  [4] "MQAQELKSGRGEGQDRIREWKARGSSPADKAMSPLGWGLVLCCLGGGLLPGAWAQFPRVCMTVDNLMAKECCPPLGLEPANICGSQEGRGQCMEVQTDARPWSGPYVLRNQDDREGWPRKFFHRTCQCTGHYGGYNCGVCKFGWTGHDCNQRKARVIRKNIHSLTPEEREQFLEALDLAKNTTHPDYVITTQHWLGLLGPNGTQPQIANCSIYDFFVWLHYYSVRDTLLGPGRPYKAIDFSHQGPAFVTWHRYHLLWLERDLQQLIGNESFALPYWNFATGRNDCDVCTDQLLGAARQDDPTLISENSRFSNWEIVCDSLDDYNRRVTLCNGTYEGLLRRNQVGLNSEKLPTLKDIQDCLSLKKFDSPPFFQNSTFSFRNALEGFDKANGILDSQVMSLHNLVHSFLNGTSALPHSAANDPVFVVLHSFTDAIFDEWMKRSNPSVDAWPQELAPIGHNRMYNMVPFFPPVTNEELFLTADQLGYSYAIDLPVEETPAWTTTLLAVMGMLVALVGVFVVLFFLQYRRLRKGYTPLMETHLSDRKYTEEA"    
##  [5] "MSPLGWGLLLGCLGCALPSGARAQFPRVCMTVGSLQAKECCPPLGADPANVCGSREGRGQCAEVQTDTRPWSGPYVLRNQDDRERWPRKFFDRTCRCTGNFAGYNCGNCRFGWTGPKCDQKKPLVVRRDVHSLTPQEREQFLDALDLAKYTPHPDYVITTQHWLGLLGPNGTQPQIANCSIYDFFVWLHYYSVRDTLLGPGRPYKAIDFSHQGPAFVTWHRYHLLWLERDLQRLTGNESFALPYWNFATGRNECDVCTDQLLGAARQDDPTLISQNSRFSSWEIVCDSLNDYNRRVTLCNGTYEGLLKRNQMGRNSEKLPTLKDIQNCLSLKKFDSPPFFQNSTLSFRNALEGFGKADGTLDSQVMNFHNLVHSFLNGTSALPHSAANDPVFVVLHSFTDAIFDEWMKRFNPPVDAWPRELAPIGHNRMYNMVPFFPPVTNEELFLTADQLGYSYAIDLPVEETPDWTTVLSVVTGMLVVLVVLFALLLFLQYRRLRKGYTPLVETQLSNKRYTEEA"                                   
##  [6] "MECKGVRAVLLLACCLQAVRAQFPRPCVTVEALTSKRCCPALGSDLNNVCGSLEGRGQCTPVVVDERPWSGPYTLRNVDDRERWPLKFFNSICRCTGNFAGFNCGECKFGWTGPTCEQRKPPVVRKNIHSLTAQERTQFLDALDQAKNTIHPDYVIATQHWLSILGPNGTEPQVANTSIYNYFVWLHYYSVRDTLLGPGRPFTAIDFSHQGPAFVTWHRYHLLLLERDLQRMTGNESFALPYWNFATGRNECDVCTDELFGAPRLDDPNLISAGSRFSRWGIVCNSLNDYNRLVTLCNGTNEGFLQRNPLGGAEGRLPSMEDVQKCLSLNEFDNPPFFRNSTFSFRNALEGFDEPDGTLNSTAMSLHNLVHSFLNGTSAQSHSSANDPIFVVLHSFTDAIFDEWMKRFQPSDVAWPQELAPIGHNRMYNMVPFFPPVTNEELFLPAEQLGYVYSIDLPASIEDTRAVAVLVGSTIGGILLAVLLLLLLLVLFMQRKRQQGFEPLMNATFTNKRYTEDA"                                  
##  [7] "MKRFVLMLCCFGCMYLSEAQFPRVCCTVEGIVSKECCPALGSDPANVCGLLAGRGNCSDVHVDSKPWGGPYSLRNVDDRERWPTKFFNRTCRCFGNFAGYNCGECKFGWTGLNCDQRKAPVLRKDIHSLSPEELQEFLNALDLSKTTVHPDYMIASQHWLGLLGPNGTEPQFSNISIYDFFVWQHYYSVRDTLLGPGRPFKAIDFSHKGPAFITWHRYHLLSLERELQKLTGNENFAVPYWNFATGQSECDVCTDSLLGGRDPSDPSRISPRSRFARWGVVCDSLDEYNRLVTLCNGTNEGSLRRGAAMSTNVTLPTMNDVRSCLNLRDFDSPPYFTNSTFSFRNALEGYDKPDGELDTSVSNLHNLVHQFLNGTSALSHSAANDPIFLVLHAFTDAIFDEWMRRFRENETFPDEMAPIGHNRHYNMVPFFPPVTNEEIYIASEQLGYSYSIDLEELDNRPAMFLLGSTLGGIFLGLLLLLIVLVLYIQQRRKREFEPLLNAEFTNTKYSGEA"                                       
##  [8] "MRLLPWGLLFCCLGCRIQPGVWAQFPRICMTLDSLRSKECCPALGAEPGNVCGSRDGRGQCSEVQADTRPWSGPYVLRNQDDREQWPRKFFNRTCRCTGNFAGYNCGDCWFGWTGPNCDQRKPLVVRQNIHSLTPQEREQFLGALDLAKKTTHPDYVITTQHWLGLLGPNGTQPQITNCSIYDFFVWLHYYSVRDTLLGPGRPFKAIDFSHQGPAFVTWHRYHLLWLERDLQRLTGNESFALPYWNFATGRNECDVCTDQLLGAARPDDPTLISQNSRFSSWEIVCDSLDDYNRRVTLCNGTYEGLLRRHQVGRNSETLPTLKNIQDCLSLQKFDNAPFFQNSTFSFRNALEGFDKADGTLDSQVVSLHNLVHSFLNGTSAFPHSAANDPVFVVLHSFTDGIFDEWMKRHKPAADAWPEELAPIGHNRMYNMVPFFPPVTNEELFLTTDHLGYSYAIDWPVTVEETPTWSAVLSVVVSTLAAVVGLFVLLIFLQYRRLRRGYAPLIETHLIRKKYTEEAEGAQTVP"                          
##  [9] "MSPLGWGLVLSCLGCGILPGAHAQFPRVCMTVDSLMTKECCPSLGPEPDNICGSQEGRGQCMEVQADTRPWSGPYVLRNQDDREGWPRKFFHRTCRCTGHFAGYNCGVCKFGWTGHDCNQRKAPVIRKNIHSLTAQEREQFLDALDLAKNTTHPNYVITTQHWLGLLGPNGTQPQIANCSIYDFFVWLHYYSVRDTLLGPGRPYKAIDFSHQGPAFVTWHRYHLLWLERDLQQLIDNESFALPYWNFATGRNDCDVCTDQLLGAARQDDPTLISQNSRFSSWEIICDSLDDYNRRVTLCNGTYEGLLRRNQVGLNSEKLPTLKDIQDCLSLKKFDSPPFFQNSTFSFRNALEGFDKANGILDSQVMSLHNLVHSFLNGTSALPHSAANDPVFVVLHSFTDAIFDEWMKRSNPSVDAWPQELAPIGHNRMYNMVPFFPPVTNEEFFLTADQLGYSYAIDLPAEETPGWTTTFLVVAGMLGALAGVFVVLFFLQYRRLRKGYTPLVETHLSNKKYTEEA"                                   
## [10] "MCPLGWGLLLSCLGCALLPGAHAQFPRVCMTVDSLMSKECCPSLGPEPDNICGSQEGRGQCTEVQADTRPWSGPYVLRNQDDREGWPRKFFHRTCRCTGHFAGYNCGGCKFGWTGHDCSQRKAPVTRKNLHSLTAQEREQFLDALDLAKNTTHPDYVITTQHWLGLLGPNGTQPQIANCSIYDFFVWLHYYSVRDTLLGPGRPYKAIDFSHQGPAFVTWHRYHLLWLERDLQQLIGNESFALPYWNFATGRNDCDVCTDQLLGAARQDDPTLISQNSRFSNWEIVCDSLDDYNRRVTLCNGTYEGLLRRNQVGLNSAKLPTLKDIQDCLSLKKFDSPPFFQNSTFSFRNALEGFDKANGILDSQVMSLHNLVHSFLNGTSALPHSAANDPIFVVLHSFTDAIFDEWMKRSNPSLDAWPQELAPIGHNRMYNMVPFFPPVTNEEFFLTADQLGYSYAINLPAEETPGWTPTLLVVMGMLVALAGVFVVLFFLQYRRLRKGYTPLAETHLNNKKYTEEA"
names(dct_vector) <- names(dct_list)
length(dct_vector)
## [1] 10

pairwise alignments

align01.02 <- pairwiseAlignment(dct_vector[1], dct_vector[2])
align01.03 <- pairwiseAlignment(dct_vector[1], dct_vector[3])
align01.04 <- pairwiseAlignment(dct_vector[1], dct_vector[4])

Biostrings::pid(align01.02)
## [1] 98.91304
Biostrings::pid(align01.03)
## [1] 78.26087
Biostrings::pid(align01.04)
## [1] 75.12864
align02.03 <- pairwiseAlignment(dct_vector[2], dct_vector[3])
align02.04 <- pairwiseAlignment(dct_vector[2], dct_vector[4])
align03.04 <- pairwiseAlignment(dct_vector[3], dct_vector[4])

pids <- c(1,                  NA,     NA,     NA,
          pid(align01.02),          1,     NA,     NA,
          pid(align01.03), pid(align02.03),      1,     NA,
          pid(align01.04), pid(align02.04), pid(align03.04), 1)

mat <- matrix(pids, nrow = 4, byrow = T)
row.names(mat) <- c("Homo","Pan","Mouse","Dog")   
colnames(mat) <- c("Homo","Pan","Mouse","Dog")   
pander::pander(mat)  
  Homo Pan Mouse Dog
Homo 1 NA NA NA
Pan 98.91 1 NA NA
Mouse 78.26 78.44 1 NA
Dog 75.13 75.3 77.74 1

PID methods comparison

type1 <- Biostrings::pid(align01.02, type="PID1")
type2 <- Biostrings::pid(align01.02, type="PID2")
type3 <- Biostrings::pid(align01.02, type="PID3")
type4 <- Biostrings::pid(align01.02, type="PID4")

method <- c("PID1","PID2","PID3","PID4")
PID_ <- c(type1, type2, type3, type4)
denominator <- c("(aligned positions + internal gap positions)", "(aligned positions)", "(length shorter sequence)", "(average length of the two sequences)")

methods.df <- data.frame(method, PID_, denominator)
pander::pander(methods.df)
method PID_ denominator
PID1 98.91 (aligned positions + internal gap positions)
PID2 98.91 (aligned positions)
PID3 98.91 (length shorter sequence)
PID4 98.91 (average length of the two sequences)

MULTIPLE SEQUENCE ALIGNMENT

build MSA

dct_vector_ss <- Biostrings::AAStringSet(dct_vector)
dct_align <- msa(dct_vector_ss,
                     method = "ClustalW")
## use default substitution matrix
dct_align
## CLUSTAL 2.1  
## 
## Call:
##    msa(dct_vector_ss, method = "ClustalW")
## 
## MsaAAMultipleAlignment with 10 rows and 591 columns
##      aln 
##  [1] -------------------------------MSPLW...FLQYRRLRKGYTPLMETHLSSKRYTEEA-------
##  [2] -------------------------------MSPLW...FLQYRRLRKGYTPLMETHLSSKRYTEEA-------
##  [3] -------------------------------MGLVG...FLQYRRLRKGYAPLMETGLSSKRYTEEA-------
##  [4] -------------------------------MSPLG...FLQYRRLRKGYTPLVETHLSNKKYTEEA-------
##  [5] -------------------------------MCPLG...FLQYRRLRKGYTPLAETHLNNKKYTEEA-------
##  [6] MQAQELKSGRGEGQDRIREWKARGSSPADKAMSPLG...FLQYRRLRKGYTPLMETHLSDRKYTEEA-------
##  [7] -------------------------------MSPLG...FLQYRRLRKGYTPLVETQLSNKRYTEEA-------
##  [8] -------------------------------MRLLP...FLQYRRLRRGYAPLIETHLIRKKYTEEAEGAQTVP
##  [9] -------------------------------MECKG...LFMQRKRQQGFEPLMNATFTNKRYTEDA-------
## [10] ---------------------------------MKR...LYIQQRRKREFEPLLNAEFTNTKYSGEA-------
##  Con -------------------------------MSPLG...FLQYRRLRKGYTPLMETHLSNK?YTEEA-------

Change class of alignment

class(dct_align) <- "AAMultipleAlignment"
dct_align_seqinr <- msaConvert(dct_align, type = "seqinr::alignment")
#print_msa(alignment = shrooms_align_seqinr, chunksize = 60)

finished MSA based on the output from drawProtiens

#ggmsa::ggmsa(dct_align, start = NULL, end = 510) 

DISTANCE MATRIX

distance matrix for all sequences

dct_dist <- seqinr::dist.alignment(dct_align_seqinr, 
                                       matrix = "identity")

dct_dist_rounded <- round(dct_dist,
                              digits = 3)

dct_dist_rounded
##        1     2     3     4     5     6     7     8     9
## 2  0.104                                                
## 3  0.408 0.405                                          
## 4  0.388 0.386 0.415                                    
## 5  0.386 0.383 0.425 0.215                              
## 6  0.393 0.391 0.420 0.256 0.278                        
## 7  0.393 0.391 0.408 0.376 0.378 0.376                  
## 8  0.402 0.405 0.431 0.410 0.410 0.417 0.396            
## 9  0.558 0.561 0.554 0.559 0.554 0.566 0.550 0.565      
## 10 0.611 0.612 0.613 0.616 0.609 0.612 0.608 0.611 0.575

PHYLOGENY

phylogenetic tree for all sequences

tree <- nj(dct_dist)

# plot tree
plot.phylo (tree, main="Phylogenetic Tree", 
            use.edge.length = F)

# add label
mtext(text = "DCT family gene tree - rooted, no branch lenths")