A complete bioinformatics workflow in R

By: Nathan L. Brouwer

“Worked example: Building a phylogeny in R”

Introduction

Phylogenies are used in biology to view how proteins have diverged along evolutionary lines. THis allows us to find relationships from a common ancestoral sequence.

Vocab

Clade Multiple Sequence Alignment Pairwise Alignment Shroom Raw Fasta File Accession Number Distance Matrix Model Based Phylogenies Codechunk Reproducible Workflow NCBI

Key functions

rentrez::entrez_fetch compbio4all::entrez_fetch_list compbio4all::fasta_cleaner

Software Preliminaires

Load packages into memory

# github packages

library(compbio4all)

# CRAN packages

library(devtools)
library(rentrez)
library(seqinr)
library(ape)

# Bioconductor packages

library(msa)
library(Biostrings)

Comparing macromolecular sequences

A multiple sequence alignment compares multiple sequences of DNA, RNA, or proteins usually from comparable sequences or proteins.

rentrez::entrez_fetch

# Human shroom 3 (H. sapiens)
hShroom3 <- entrez_fetch(db = "protein", 
                          id = "NP_065910", 
                          rettype = "fasta")

cat() is used here to respect the command and will givce the sequence in a more reader friendly format.

cat(hShroom3)
## >NP_065910.3 protein Shroom3 [Homo sapiens]
## MMRTTEDFHKPSATLNSNTATKGRYIYLEAFLEGGAPWGFTLKGGLEHGEPLIISKVEEGGKADTLSSKL
## QAGDEVVHINEVTLSSSRKEAVSLVKGSYKTLRLVVRRDVCTDPGHADTGASNFVSPEHLTSGPQHRKAA
## WSGGVKLRLKHRRSEPAGRPHSWHTTKSGEKQPDASMMQISQGMIGPPWHQSYHSSSSTSDLSNYDHAYL
## RRSPDQCSSQGSMESLEPSGAYPPCHLSPAKSTGSIDQLSHFHNKRDSAYSSFSTSSSILEYPHPGISGR
## ERSGSMDNTSARGGLLEGMRQADIRYVKTVYDTRRGVSAEYEVNSSALLLQGREARASANGQGYDKWSNI
## PRGKGVPPPSWSQQCPSSLETATDNLPPKVGAPLPPARSDSYAAFRHRERPSSWSSLDQKRLCRPQANSL
## GSLKSPFIEEQLHTVLEKSPENSPPVKPKHNYTQKAQPGQPLLPTSIYPVPSLEPHFAQVPQPSVSSNGM
## LYPALAKESGYIAPQGACNKMATIDENGNQNGSGRPGFAFCQPLEHDLLSPVEKKPEATAKYVPSKVHFC
## SVPENEEDASLKRHLTPPQGNSPHSNERKSTHSNKPSSHPHSLKCPQAQAWQAGEDKRSSRLSEPWEGDF
## QEDHNANLWRRLEREGLGQSLSGNFGKTKSAFSSLQNIPESLRRHSSLELGRGTQEGYPGGRPTCAVNTK
## AEDPGRKAAPDLGSHLDRQVSYPRPEGRTGASASFNSTDPSPEEPPAPSHPHTSSLGRRGPGPGSASALQ
## GFQYGKPHCSVLEKVSKFEQREQGSQRPSVGGSGFGHNYRPHRTVSTSSTSGNDFEETKAHIRFSESAEP
## LGNGEQHFKNGELKLEEASRQPCGQQLSGGASDSGRGPQRPDARLLRSQSTFQLSSEPEREPEWRDRPGS
## PESPLLDAPFSRAYRNSIKDAQSRVLGATSFRRRDLELGAPVASRSWRPRPSSAHVGLRSPEASASASPH
## TPRERHSVTPAEGDLARPVPPAARRGARRRLTPEQKKRSYSEPEKMNEVGIVEEAEPAPLGPQRNGMRFP
## ESSVADRRRLFERDGKACSTLSLSGPELKQFQQSALADYIQRKTGKRPTSAAGCSLQEPGPLRERAQSAY
## LQPGPAALEGSGLASASSLSSLREPSLQPRREATLLPATVAETQQAPRDRSSSFAGGRRLGERRRGDLLS
## GANGGTRGTQRGDETPREPSSWGARAGKSMSAEDLLERSDVLAGPVHVRSRSSPATADKRQDVLLGQDSG
## FGLVKDPCYLAGPGSRSLSCSERGQEEMLPLFHHLTPRWGGSGCKAIGDSSVPSECPGTLDHQRQASRTP
## CPRPPLAGTQGLVTDTRAAPLTPIGTPLPSAIPSGYCSQDGQTGRQPLPPYTPAMMHRSNGHTLTQPPGP
## RGCEGDGPEHGVEEGTRKRVSLPQWPPPSRAKWAHAAREDSLPEESSAPDFANLKHYQKQQSLPSLCSTS
## DPDTPLGAPSTPGRISLRISESVLRDSPPPHEDYEDEVFVRDPHPKATSSPTFEPLPPPPPPPPSQETPV
## YSMDDFPPPPPHTVCEAQLDSEDPEGPRPSFNKLSKVTIARERHMPGAAHVVGSQTLASRLQTSIKGSEA
## ESTPPSFMSVHAQLAGSLGGQPAPIQTQSLSHDPVSGTQGLEKKVSPDPQKSSEDIRTEALAKEIVHQDK
## SLADILDPDSRLKTTMDLMEGLFPRDVNLLKENSVKRKAIQRTVSSSGCEGKRNEDKEAVSMLVNCPAYY
## SVSAPKAELLNKIKEMPAEVNEEEEQADVNEKKAELIGSLTHKLETLQEAKGSLLTDIKLNNALGEEVEA
## LISELCKPNEFDKYRMFIGDLDKVVNLLLSLSGRLARVENVLSGLGEDASNEERSSLYEKRKILAGQHED
## ARELKENLDRRERVVLGILANYLSEEQLQDYQHFVKMKSTLLIEQRKLDDKIKLGQEQVKCLLESLPSDF
## IPKAGALALPPNLTSEPIPAGGCTFSGIFPTLTSPL

This chunk is assigning the objects with the relevant data via the accession ID

# Mouse shroom 3a (M. musculus)
mShroom3a <- entrez_fetch(db = "protein", 
                          id = "AAF13269", 
                          rettype = "fasta")

# Human shroom 2 (H. sapiens)
hShroom2 <- entrez_fetch(db = "protein", 
                          id = "CAA58534", 
                          rettype = "fasta")


# Sea-urchin shroom
sShroom <- entrez_fetch(db = "protein", 
                          id = "XP_783573", 
                          rettype = "fasta")

nchar() gives the number of characters in each of the sequences which will help in keeping track of how the fasta file is cleaned up and show how the sequences compare in raw number of characters thus far.

nchar(hShroom3)
## [1] 2070
nchar(mShroom3a)
## [1] 2083
nchar(sShroom)
## [1] 1758
nchar(hShroom2)
## [1] 1673

Prepping macromolecular sequences

Fasta cleaner is summoned here which ultimately will clean the data of the unnessary pieces such as the new line commands and headers

fasta_cleaner
## function (fasta_object, parse = TRUE) 
## {
##     fasta_object <- sub("^(>)(.*?)(\\n)(.*)(\\n\\n)", "\\4", 
##         fasta_object)
##     fasta_object <- gsub("\n", "", fasta_object)
##     if (parse == TRUE) {
##         fasta_object <- stringr::str_split(fasta_object, pattern = "", 
##             simplify = FALSE)
##     }
##     return(fasta_object[[1]])
## }
## <bytecode: 0x000000002bcef670>
## <environment: namespace:compbio4all>

By manually copying and pasting the function code into R, you can use the function without downloading the whole package.

fasta_cleaner <- function(fasta_object, parse = TRUE){

  fasta_object <- sub("^(>)(.*?)(\\n)(.*)(\\n\\n)","\\4",fasta_object)
  fasta_object <- gsub("\n", "", fasta_object)

  if(parse == TRUE){
    fasta_object <- stringr::str_split(fasta_object,
                                       pattern = "",
                                       simplify = FALSE)
  }

  return(fasta_object[[1]])
}

This codechunk assigns the cleaned fasta files to the character vectors objects.

hShroom3  <- fasta_cleaner(hShroom3,  parse = F)
mShroom3a <- fasta_cleaner(mShroom3a, parse = F)
hShroom2  <- fasta_cleaner(hShroom2,  parse = F)
sShroom   <- fasta_cleaner(sShroom,   parse = F)
hShroom3
## [1] "MMRTTEDFHKPSATLNSNTATKGRYIYLEAFLEGGAPWGFTLKGGLEHGEPLIISKVEEGGKADTLSSKLQAGDEVVHINEVTLSSSRKEAVSLVKGSYKTLRLVVRRDVCTDPGHADTGASNFVSPEHLTSGPQHRKAAWSGGVKLRLKHRRSEPAGRPHSWHTTKSGEKQPDASMMQISQGMIGPPWHQSYHSSSSTSDLSNYDHAYLRRSPDQCSSQGSMESLEPSGAYPPCHLSPAKSTGSIDQLSHFHNKRDSAYSSFSTSSSILEYPHPGISGRERSGSMDNTSARGGLLEGMRQADIRYVKTVYDTRRGVSAEYEVNSSALLLQGREARASANGQGYDKWSNIPRGKGVPPPSWSQQCPSSLETATDNLPPKVGAPLPPARSDSYAAFRHRERPSSWSSLDQKRLCRPQANSLGSLKSPFIEEQLHTVLEKSPENSPPVKPKHNYTQKAQPGQPLLPTSIYPVPSLEPHFAQVPQPSVSSNGMLYPALAKESGYIAPQGACNKMATIDENGNQNGSGRPGFAFCQPLEHDLLSPVEKKPEATAKYVPSKVHFCSVPENEEDASLKRHLTPPQGNSPHSNERKSTHSNKPSSHPHSLKCPQAQAWQAGEDKRSSRLSEPWEGDFQEDHNANLWRRLEREGLGQSLSGNFGKTKSAFSSLQNIPESLRRHSSLELGRGTQEGYPGGRPTCAVNTKAEDPGRKAAPDLGSHLDRQVSYPRPEGRTGASASFNSTDPSPEEPPAPSHPHTSSLGRRGPGPGSASALQGFQYGKPHCSVLEKVSKFEQREQGSQRPSVGGSGFGHNYRPHRTVSTSSTSGNDFEETKAHIRFSESAEPLGNGEQHFKNGELKLEEASRQPCGQQLSGGASDSGRGPQRPDARLLRSQSTFQLSSEPEREPEWRDRPGSPESPLLDAPFSRAYRNSIKDAQSRVLGATSFRRRDLELGAPVASRSWRPRPSSAHVGLRSPEASASASPHTPRERHSVTPAEGDLARPVPPAARRGARRRLTPEQKKRSYSEPEKMNEVGIVEEAEPAPLGPQRNGMRFPESSVADRRRLFERDGKACSTLSLSGPELKQFQQSALADYIQRKTGKRPTSAAGCSLQEPGPLRERAQSAYLQPGPAALEGSGLASASSLSSLREPSLQPRREATLLPATVAETQQAPRDRSSSFAGGRRLGERRRGDLLSGANGGTRGTQRGDETPREPSSWGARAGKSMSAEDLLERSDVLAGPVHVRSRSSPATADKRQDVLLGQDSGFGLVKDPCYLAGPGSRSLSCSERGQEEMLPLFHHLTPRWGGSGCKAIGDSSVPSECPGTLDHQRQASRTPCPRPPLAGTQGLVTDTRAAPLTPIGTPLPSAIPSGYCSQDGQTGRQPLPPYTPAMMHRSNGHTLTQPPGPRGCEGDGPEHGVEEGTRKRVSLPQWPPPSRAKWAHAAREDSLPEESSAPDFANLKHYQKQQSLPSLCSTSDPDTPLGAPSTPGRISLRISESVLRDSPPPHEDYEDEVFVRDPHPKATSSPTFEPLPPPPPPPPSQETPVYSMDDFPPPPPHTVCEAQLDSEDPEGPRPSFNKLSKVTIARERHMPGAAHVVGSQTLASRLQTSIKGSEAESTPPSFMSVHAQLAGSLGGQPAPIQTQSLSHDPVSGTQGLEKKVSPDPQKSSEDIRTEALAKEIVHQDKSLADILDPDSRLKTTMDLMEGLFPRDVNLLKENSVKRKAIQRTVSSSGCEGKRNEDKEAVSMLVNCPAYYSVSAPKAELLNKIKEMPAEVNEEEEQADVNEKKAELIGSLTHKLETLQEAKGSLLTDIKLNNALGEEVEALISELCKPNEFDKYRMFIGDLDKVVNLLLSLSGRLARVENVLSGLGEDASNEERSSLYEKRKILAGQHEDARELKENLDRRERVVLGILANYLSEEQLQDYQHFVKMKSTLLIEQRKLDDKIKLGQEQVKCLLESLPSDFIPKAGALALPPNLTSEPIPAGGCTFSGIFPTLTSPL"

aligning sequences

This begins the alignment process to comparatively stack the sequences against each other and lines them up for the beginning of a consensus sequence.

align.h3.vs.m3a <- Biostrings:: pairwiseAlignment          (
                  hShroom3,
                  mShroom3a)

This object shows a snip of the pairwise alignment, as well as a score which can be used to compare the similarities of different pairs of sequences.

align.h3.vs.m3a
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MMRTTEDFHKPSATLN-SNTATKGRYIYLEAFLE...KAGALALPPNLTSEPIPAGGCTFSGIFPTLTSPL
## subject: MK-TPENLEEPSATPNPSRTPTE-RFVYLEALLE...KAGAISLPPALTGHATPGGTSVFGGVFPTLTSPL
## score: 2189.934

The pid command in this code chunk gives a percent identity score to the pairwise alignment to tell how similar the sequences are in terms of percentage similar.

Biostrings::  pid  (align.h3.vs.m3a)
## [1] 70.56511

Conducting a pairwise alignment between human shroom 3 and human shroom 2

align.h3.vs.h2 <- Biostrings::pairwiseAlignment(
                  hShroom3,
                  hShroom2)

The score function will give a score to how similar the sequences aa they are aligned in the previous codechunk which allows for comparisons between different alignments.

score(align.h3.vs.h2)
## [1] -5673.853

The pid is the percent identity score which gives the percent similarity of the aligned sequences. This score allows for a better comparison between alignments as the raw score is more technial and more difficult to compare.

Biostrings::pid(align.h3.vs.h2)
## [1] 33.83277

The shroom family of genes

the table provides the accession numbers, species names, and the shortened name for each object so that an multiple sequence alignment can be conducted

shroom_table <- c("CAA78718" , "X. laevis Apx" ,         "xShroom1",
            "NP_597713" , "H. sapiens APXL2" ,     "hShroom1",
            "CAA58534" , "H. sapiens APXL",        "hShroom2",
            "ABD19518" , "M. musculus Apxl" ,      "mShroom2",
            "AAF13269" , "M. musculus ShroomL" ,   "mShroom3a",
            "AAF13270" , "M. musculus ShroomS" ,   "mShroom3b",
            "NP_065910", "H. sapiens Shroom" ,     "hShroom3",
            "ABD59319" , "X. laevis Shroom-like",  "xShroom3",
            "NP_065768", "H. sapiens KIAA1202" ,   "hShroom4a",
            "AAK95579" , "H. sapiens SHAP-A" ,     "hShroom4b",
            #"DQ435686" , "M. musculus KIAA1202" ,  "mShroom4",
            "ABA81834" , "D. melanogaster Shroom", "dmShroom",
            "EAA12598" , "A. gambiae Shroom",      "agShroom",
            "XP_392427" , "A. mellifera Shroom" ,  "amShroom",
            "XP_783573" , "S. purpuratus Shroom" , "spShroom") #sea urchin

This next chunk will make the data frame and table to ultimately allow for the pairwise alignment and further the msa later in the project

# convert to table
shroom_table_matrix <- matrix(shroom_table,
                                  byrow = T,
                                  nrow = 14)
# convert to data fram
shroom_table <- data.frame(shroom_table_matrix, 
                     stringsAsFactors = F)

# titles of columns
names(shroom_table) <- c("accession", "name.orig","name.new")

# Create simplified species names
shroom_table$spp <- "Homo"
shroom_table$spp[grep("laevis",shroom_table$name.orig)] <- "Xenopus"
shroom_table$spp[grep("musculus",shroom_table$name.orig)] <- "Mus"
shroom_table$spp[grep("melanogaster",shroom_table$name.orig)] <- "Drosophila"
shroom_table$spp[grep("gambiae",shroom_table$name.orig)] <- "mosquito"
shroom_table$spp[grep("mellifera",shroom_table$name.orig)] <- "bee"
shroom_table$spp[grep("purpuratus",shroom_table$name.orig)] <- "sea urchin"

This next chunk shows the table produced in the previous chunk

shroom_table
##    accession              name.orig  name.new        spp
## 1   CAA78718          X. laevis Apx  xShroom1    Xenopus
## 2  NP_597713       H. sapiens APXL2  hShroom1       Homo
## 3   CAA58534        H. sapiens APXL  hShroom2       Homo
## 4   ABD19518       M. musculus Apxl  mShroom2        Mus
## 5   AAF13269    M. musculus ShroomL mShroom3a        Mus
## 6   AAF13270    M. musculus ShroomS mShroom3b        Mus
## 7  NP_065910      H. sapiens Shroom  hShroom3       Homo
## 8   ABD59319  X. laevis Shroom-like  xShroom3    Xenopus
## 9  NP_065768    H. sapiens KIAA1202 hShroom4a       Homo
## 10  AAK95579      H. sapiens SHAP-A hShroom4b       Homo
## 11  ABA81834 D. melanogaster Shroom  dmShroom Drosophila
## 12  EAA12598      A. gambiae Shroom  agShroom   mosquito
## 13 XP_392427    A. mellifera Shroom  amShroom        bee
## 14 XP_783573   S. purpuratus Shroom  spShroom sea urchin

aligning multiple sequences

This next chunk gives the accession number column of the table

shroom_table$accession
##  [1] "CAA78718"  "NP_597713" "CAA58534"  "ABD19518"  "AAF13269"  "AAF13270" 
##  [7] "NP_065910" "ABD59319"  "NP_065768" "AAK95579"  "ABA81834"  "EAA12598" 
## [13] "XP_392427" "XP_783573"

This assigns the accession numbers and data to the shrooms object

# add necessary function
shrooms <- rentrez::entrez_fetch     (db = "protein", 
                          id = shroom_table$accession, 
                          rettype = "fasta")

This categorizes the previous chunk’s object

cat(shrooms)

This will put the cleaned data from the list and table into a list to allow for later msa.

shrooms_list <- entrez_fetch_list(db = "protein", 
                          id = shroom_table$accession, 
                          rettype = "fasta")

The next chunk is a “for” loop which wil run the same code as many times as needed on each dataset

for(i in 1:length(shrooms_list)){
  shrooms_list[[i]] <- fasta_cleaner(shrooms_list[[i]], parse = F)
}

this chunk will add names to the vector

# naming the vector
shrooms_vector <- rep(NA, length(shrooms_list))

# the loop function
for(i in 1:length(shrooms_vector)){
  shrooms_vector[i] <- shrooms_list[[i]]
}

#  changes the name of the list to the vector
names(shrooms_vector) <- names(shrooms_list)

This turns the vector into a string set which is another way to format the data for annotation and further analysis.

shrooms_vector_ss <- Biostrings:: AAStringSet (shrooms_vector)

MSA

This section will complete the alignment of the shroom family of genes. An MSA will compare multiple sequences against each other to eventually create a consensus sequence and further more a phylogeny tree

Building an XXXXXXXX (MSA)

This chunk performs the multiple sequence alignment on the dataset and puts it on the object “shrooms_align”

# add necessary function
shrooms_align <-  msa  (shrooms_vector_ss,
                       method = "ClustalW")
## use default substitution matrix

Viewing an MSA

This will allow us to visualize the msa that we created in the preceding program

Viewing an MSA in R

The next chunk will show us the msa in a direct output format in R

shrooms_align
## CLUSTAL 2.1  
## 
## Call:
##    msa(shrooms_vector_ss, method = "ClustalW")
## 
## MsaAAMultipleAlignment with 14 rows and 2252 columns
##      aln                                                   names
##  [1] -------------------------...------------------------- NP_065768
##  [2] -------------------------...------------------------- AAK95579
##  [3] -------------------------...SVFGGVFPTLTSPL----------- AAF13269
##  [4] -------------------------...SVFGGVFPTLTSPL----------- AAF13270
##  [5] -------------------------...CTFSGIFPTLTSPL----------- NP_065910
##  [6] -------------------------...NKS--LPPPLTSSL----------- ABD59319
##  [7] -------------------------...------------------------- CAA58534
##  [8] -------------------------...------------------------- ABD19518
##  [9] -------------------------...LT----------------------- NP_597713
## [10] -------------------------...------------------------- CAA78718
## [11] -------------------------...------------------------- EAA12598
## [12] -------------------------...------------------------- ABA81834
## [13] MTELQPSPPGYRVQDEAPGPPSCPP...------------------------- XP_392427
## [14] -------------------------...AATSSSSNGIGGPEQLNSNATSSYC XP_783573
##  Con -------------------------...------------------------- Consensus

This will tell us the class of the shrooms alignment and then force the AAMultipleAlignment class onto the object.

# WHAT IS THE LINE BELOW DOING? (its tricky - do your best)
class(shrooms_align) <- "AAMultipleAlignment"

# WHAT IS THE LINE BELOW DOING? This is simpler
shrooms_align_seqinr <- msaConvert(shrooms_align, type = "seqinr::alignment")

This will print the alignment from the msa but limits the size so that it can be more easily viewed

print_msa(alignment = shrooms_align_seqinr, 
          chunksize = 60)

Displaying an MSA as an R plot

This displays the results as a plot in the R IDE which makes it more reader friendly

## add necessary function
#ggmsa:: ggmsa  plot (shrooms_align,   start = 2000, end = 2100) 

Saving an MSA as PDF

This saves the msa as a PDF from 2000 to 2100 which gives a summary view of the msa comparison. Add the package the function is coming from using :: notation This may not work for everyone. If its not working you can comment it out.

install.packages("tinytex")
install_tinytex()
tinytex::msaPrettyPrint(shrooms_align,             # alignment
               file = "shroom_msa.pdf",   # file name
               y=c(2000, 2100),           # range
               askForOverwrite=FALSE)

This will show us where R studio is writing the save files to in the directory.

getwd()
## [1] "C:/Users/natas/Desktop/coding"

A subset of sequences

To make things easier we’ll move forward with just a subset of sequences:

  • XP_392427: amShroom (bee shroom)
  • EAA12598: agShroom (mosquito shroom)
  • ABA81834: dmShroom (Drosophila shroom)
  • XP_783573: spShroom (sea urchin shroom)
  • CAA78718: xShroom1 (frog shroom)

Our main working object shrooms_vector_ss has the names of our genes listed

names(shrooms_vector_ss)
##  [1] "CAA78718"  "NP_597713" "CAA58534"  "ABD19518"  "AAF13269"  "AAF13270" 
##  [7] "NP_065910" "ABD59319"  "NP_065768" "AAK95579"  "ABA81834"  "EAA12598" 
## [13] "XP_392427" "XP_783573"

We can select the ones we want to focus on be first making a vector of the names

names.focal <- c("XP_392427","EAA12598","ABA81834","XP_783573","CAA78718")

We can use this vector and bracket notation to select the what we want from shrooms_vector_ss:

shrooms_vector_ss[names.focal]
## AAStringSet object of length 5:
##     width seq                                               names               
## [1]  2126 MTELQPSPPGYRVQDEAPGPPSC...GREIQDKVKLGEEQLAALREAID XP_392427
## [2]   674 IPFSSSPKNRSNSKASYLPRQPR...ADKIKLGEEQLAALKDTLVQSEC EAA12598
## [3]  1576 MKMRNHKENGNGSEMGESTKSLA...AVRIKGSEEQLSSLSDALVQSDC ABA81834
## [4]  1661 MMKDAMYPTTTSTTSSSVNPLPK...TSSSSNGIGGPEQLNSNATSSYC XP_783573
## [5]  1420 MSAFGNTIERWNIKSTGVIAGLG...KNLEEKIKVYEEQFESIHNSLPP CAA78718

Let’s assign the subset of sequences to a new object called shrooms_vector_ss_subset.

shrooms_vector_ss_subset <- shrooms_vector_ss[names.focal]

Let’s make another MSA with just this subset. If msa isn’t working for you you can comment this out.

shrooms_align_subset <- msa(shrooms_vector_ss_subset,
                     method = "ClustalW")
## use default substitution matrix

To view it using ggmsa we need to do those annoying conversions again.

class(shrooms_align_subset) <- "AAMultipleAlignment"
shrooms_align_subset_seqinr <- msaConvert(shrooms_align_subset, type = "seqinr::alignment")

THen we can plot it

#ggmsa::ggmsa(shrooms_align_subset,   # shrooms_align, NOT shrooms_align_seqinr
      #start = 2030, 
      #end = 2100) 

We can save our new smaller MSA like this.

msaPrettyPrint(shrooms_align_subset,             # alignment
               file = "shroom_msa_subset.pdf",   # file name
               y=c(2030, 2100),           # range
               askForOverwrite=FALSE)

Genetic distances of sequence in subset

While an MSA is a good way to examine a sequence its hard to assess all of the information visually. A phylogenetic tree allows you to summarize patterns in an MSA. The fastest way to make phylogenetic trees to is first summarize an MSA using a genetic distance matrix. The more amino acids that are identical to each other, the smaller the genetic distance is between them and the less evolution has occurred.

We usually work in terms of difference or genetic distance (a.k.a. evolutionary distance), though often we also talk in terms of similarity or identity.

Calculating genetic distance from an MSA is done using the seqinr::dist.alignment() function.

shrooms_subset_dist <- seqinr::dist.alignment(shrooms_align_subset_seqinr, 
                                       matrix = "identity")

This produces a “dist” class object.

is(shrooms_subset_dist)
## [1] "dist"     "oldClass"
class(shrooms_subset_dist)
## [1] "dist"

If you’ve been having trouble with the MSA software, the data necessary to build the distance matrix directly in R is in this code chunk (you can ignore the details).

shrooms_subset_dist_alt <- matrix(data = NA,
                              nrow = 5, 
                              ncol = 5)
distances <- c(0.8260049, 
               0.8478722, 0.9000568, 
               0.9244596, 0.9435187, 0.9372139, 
               0.9238779, 0.9370038, 0.9323225,0.9413209)
shrooms_subset_dist_alt[lower.tri(shrooms_subset_dist_alt)] <- distances
seqnames <- c("EAA12598","ABA81834","XP_392427", "XP_783573","CAA78718")
colnames(shrooms_subset_dist_alt) <- seqnames
row.names(shrooms_subset_dist_alt)  <- seqnames
shrooms_subset_dist_alt <- as.dist(shrooms_subset_dist_alt)
shrooms_subset_dist <- shrooms_subset_dist_alt

We’ve made a matrix using dist.alignment(); let’s round it off so its easier to look at using the round() function.

shrooms_subset_dist_rounded <- round(shrooms_subset_dist,
                              digits = 3)

If we want to look at it we can type

shrooms_subset_dist_rounded
##           EAA12598 ABA81834 XP_392427 XP_783573
## ABA81834     0.826                             
## XP_392427    0.848    0.944                    
## XP_783573    0.900    0.937     0.937          
## CAA78718     0.924    0.924     0.932     0.941

Not that we have 5 sequence, but the matrix is 4 x 4. This is because redundant information is dropped, including distances from one sequence to itself. This makes it so that the first column is EAA12598, but the first row is ABA81834.

Phylognetic trees of subset sequences (finally!)

We got our sequences, built a multiple sequence alignment, and calculated the genetic distance between sequences. Now we are - finally - ready to build a phylogenetic tree.

First, we let R figure out the structure of the tree. There are MANY ways to build phylogenetic trees. We’ll use a common one used for exploring sequences called neighbor joining algorithm via the function nj(). Neighbor joining uses genetic distances to cluster sequences into clades.

nj() is simple function that takes only a single argument, a distance matrix.

# Note - not using rounded values
tree_subset <- nj(shrooms_subset_dist)

Plotting phylogenetic trees

Now we’ll make a quick plot of our tree using plot() (and add a little label using an important function called mtext()).

# plot tree
plot.phylo(tree_subset, main="Phylogenetic Tree", 
            type = "unrooted", 
            use.edge.length = F)
# add label
mtext(text = "Shroom family gene tree - UNrooted, no branch lengths")

This is an unrooted tree with no outgroup defined. For the sake of plotting we’ve also ignored the evolutionary distance between the sequences, so the branch lengths don’t have meaning.

To make a rooted tree we remove type = "unrooted. In the case of neighbor joining, the algorithm tries to figure out the outgroup on its own.

# plot tree
plot.phylo(tree_subset, main="Phylogenetic Tree", 
            use.edge.length = F)
# add label
mtext(text = "Shroom family gene tree - rooted, no branch lenths")

We can include information about branch length by setting use.edge.length = ... to T.

# plot tree
plot.phylo(tree_subset, main="Phylogenetic Tree", 
            use.edge.length = T)
# add label
mtext(text = "Shroom family gene tree - rooted, with branch lenths")

Now the length of the branches indicates the evolutionary distance between sequences and correlate to the distances reported in our distance matrix. The branches are all very long, indicating that these genes have been evolving independently for many millions of years.

An important note: the vertical lines on the tree have no meaning, only the horizontal ones.

Because the branch lengths are all so long I find this tree a bit hard to view when its rooted. Let’s make it unrooted again.

# plot tree
plot.phylo(tree_subset, main="Phylogenetic Tree", 
           type = "unrooted",
            use.edge.length = T)
# add label
mtext(text = "Shroom family gene tree - rooted, with branch lenths")

Now you can see that the ABA and EAA sequences form a clade, and that the distance between them is somewhat smaller than the distance between other sequences. If we go back to our original distance matrix, we can see that the smallest genetic distance is between ABA and EAA at 0.826.

shrooms_subset_dist_rounded
##           EAA12598 ABA81834 XP_392427 XP_783573
## ABA81834     0.826                             
## XP_392427    0.848    0.944                    
## XP_783573    0.900    0.937     0.937          
## CAA78718     0.924    0.924     0.932     0.941

We can confirm that this is the minimum using the min() function.

min(shrooms_subset_dist_rounded)
## [1] 0.826