A complete bioinformatics workflow in R

By: Nathan L. Brouwer

“Worked example: Building a phylogeny in R”

Introduction

#Phylogenies can be used to show when genes or species diverged due to evolution,which genes or species are more similar or more different to each other, what a common ancestor is for 2 genes or species, and which genes or species are in a clave. 

Vocab

#MSA
#Pairwise Alignement
#Distance Matrix
#PID
#Accession Number
#Fasta File
#Reproducable workflow
#Phylogenetic tree
#Clave
#Consensus Sequence
#Sequence Logo

Key functions

#rentrez::entrez_fetch
#Biostrings::pid
#Biostrings::pairwiseAlignment
#compbio4all::entrez_fetch_list
#ggmsa:ggmsa

Software Preliminaires

Load packages into memory

# github packages
library(devtools)
#devtools::install_github("brouwern/compbio4all")
library(compbio4all)
#devtools::install_github("YuLab-SMU/ggmsa")
library(ggmsa)


# CRAN packages
#install.packages("CRAN")
#install.packages("rentrez")
library(rentrez)
#install.packages("seqinr")
library(seqinr)
#install.packages("ape")
library(ape)


# Bioconductor packages
#install.packages("BiocManager")
library(BiocManager)
#BiocManager::install("Biostrings")
library(Biostrings)
#BiocManager::install("msa")
library(msa)

Downloading macromolecular sequences

#The human shroom 3 sequence is being downloaded from NCBI and put into the object hShroom3

# Human shroom 3 (H. sapiens)
hShroom3 <- rentrez::entrez_fetch(db = "protein", 
                          id = "NP_065910", 
                          rettype = "fasta")
cat(hShroom3) # Respecting new line character (\n)
## >NP_065910.3 protein Shroom3 [Homo sapiens]
## MMRTTEDFHKPSATLNSNTATKGRYIYLEAFLEGGAPWGFTLKGGLEHGEPLIISKVEEGGKADTLSSKL
## QAGDEVVHINEVTLSSSRKEAVSLVKGSYKTLRLVVRRDVCTDPGHADTGASNFVSPEHLTSGPQHRKAA
## WSGGVKLRLKHRRSEPAGRPHSWHTTKSGEKQPDASMMQISQGMIGPPWHQSYHSSSSTSDLSNYDHAYL
## RRSPDQCSSQGSMESLEPSGAYPPCHLSPAKSTGSIDQLSHFHNKRDSAYSSFSTSSSILEYPHPGISGR
## ERSGSMDNTSARGGLLEGMRQADIRYVKTVYDTRRGVSAEYEVNSSALLLQGREARASANGQGYDKWSNI
## PRGKGVPPPSWSQQCPSSLETATDNLPPKVGAPLPPARSDSYAAFRHRERPSSWSSLDQKRLCRPQANSL
## GSLKSPFIEEQLHTVLEKSPENSPPVKPKHNYTQKAQPGQPLLPTSIYPVPSLEPHFAQVPQPSVSSNGM
## LYPALAKESGYIAPQGACNKMATIDENGNQNGSGRPGFAFCQPLEHDLLSPVEKKPEATAKYVPSKVHFC
## SVPENEEDASLKRHLTPPQGNSPHSNERKSTHSNKPSSHPHSLKCPQAQAWQAGEDKRSSRLSEPWEGDF
## QEDHNANLWRRLEREGLGQSLSGNFGKTKSAFSSLQNIPESLRRHSSLELGRGTQEGYPGGRPTCAVNTK
## AEDPGRKAAPDLGSHLDRQVSYPRPEGRTGASASFNSTDPSPEEPPAPSHPHTSSLGRRGPGPGSASALQ
## GFQYGKPHCSVLEKVSKFEQREQGSQRPSVGGSGFGHNYRPHRTVSTSSTSGNDFEETKAHIRFSESAEP
## LGNGEQHFKNGELKLEEASRQPCGQQLSGGASDSGRGPQRPDARLLRSQSTFQLSSEPEREPEWRDRPGS
## PESPLLDAPFSRAYRNSIKDAQSRVLGATSFRRRDLELGAPVASRSWRPRPSSAHVGLRSPEASASASPH
## TPRERHSVTPAEGDLARPVPPAARRGARRRLTPEQKKRSYSEPEKMNEVGIVEEAEPAPLGPQRNGMRFP
## ESSVADRRRLFERDGKACSTLSLSGPELKQFQQSALADYIQRKTGKRPTSAAGCSLQEPGPLRERAQSAY
## LQPGPAALEGSGLASASSLSSLREPSLQPRREATLLPATVAETQQAPRDRSSSFAGGRRLGERRRGDLLS
## GANGGTRGTQRGDETPREPSSWGARAGKSMSAEDLLERSDVLAGPVHVRSRSSPATADKRQDVLLGQDSG
## FGLVKDPCYLAGPGSRSLSCSERGQEEMLPLFHHLTPRWGGSGCKAIGDSSVPSECPGTLDHQRQASRTP
## CPRPPLAGTQGLVTDTRAAPLTPIGTPLPSAIPSGYCSQDGQTGRQPLPPYTPAMMHRSNGHTLTQPPGP
## RGCEGDGPEHGVEEGTRKRVSLPQWPPPSRAKWAHAAREDSLPEESSAPDFANLKHYQKQQSLPSLCSTS
## DPDTPLGAPSTPGRISLRISESVLRDSPPPHEDYEDEVFVRDPHPKATSSPTFEPLPPPPPPPPSQETPV
## YSMDDFPPPPPHTVCEAQLDSEDPEGPRPSFNKLSKVTIARERHMPGAAHVVGSQTLASRLQTSIKGSEA
## ESTPPSFMSVHAQLAGSLGGQPAPIQTQSLSHDPVSGTQGLEKKVSPDPQKSSEDIRTEALAKEIVHQDK
## SLADILDPDSRLKTTMDLMEGLFPRDVNLLKENSVKRKAIQRTVSSSGCEGKRNEDKEAVSMLVNCPAYY
## SVSAPKAELLNKIKEMPAEVNEEEEQADVNEKKAELIGSLTHKLETLQEAKGSLLTDIKLNNALGEEVEA
## LISELCKPNEFDKYRMFIGDLDKVVNLLLSLSGRLARVENVLSGLGEDASNEERSSLYEKRKILAGQHED
## ARELKENLDRRERVVLGILANYLSEEQLQDYQHFVKMKSTLLIEQRKLDDKIKLGQEQVKCLLESLPSDF
## IPKAGALALPPNLTSEPIPAGGCTFSGIFPTLTSPL
# This code chunk is downloading fasta data into its appropriate variable

# Mouse shroom 3a (M. musculus)
mShroom3a <- entrez_fetch(db = "protein", 
                          id = "AAF13269", 
                          rettype = "fasta")

# Human shroom 2 (H. sapiens)
hShroom2 <- entrez_fetch(db = "protein", 
                          id = "CAA58534", 
                          rettype = "fasta")


# Sea-urchin shroom
sShroom <- entrez_fetch(db = "protein", 
                          id = "XP_783573", 
                          rettype = "fasta")
#this code chunk is finding number of characters in each FASTA file
nchar(hShroom3)
## [1] 2070
nchar(mShroom3a)
## [1] 2083
nchar(sShroom)
## [1] 1758
nchar(hShroom2)
## [1] 1673

Prepping macromolecular sequences

fasta_cleaner #cleaning FASTA file of unneeded information
## function (fasta_object, parse = TRUE) 
## {
##     fasta_object <- sub("^(>)(.*?)(\\n)(.*)(\\n\\n)", "\\4", 
##         fasta_object)
##     fasta_object <- gsub("\n", "", fasta_object)
##     if (parse == TRUE) {
##         fasta_object <- stringr::str_split(fasta_object, pattern = "", 
##             simplify = FALSE)
##     }
##     return(fasta_object[[1]])
## }
## <bytecode: 0x7fb4cb5248a8>
## <environment: namespace:compbio4all>
#if you can't download compbio4all, you can just write the function in your code and all of it's features. 
fasta_cleaner <- function(fasta_object, parse = TRUE){

  fasta_object <- sub("^(>)(.*?)(\\n)(.*)(\\n\\n)","\\4",fasta_object)
  fasta_object <- gsub("\n", "", fasta_object)

  if(parse == TRUE){
    fasta_object <- stringr::str_split(fasta_object,
                                       pattern = "",
                                       simplify = FALSE)
  }

  return(fasta_object[[1]])
}
#this code chunk is cleaning the fasta files
hShroom3  <- fasta_cleaner(hShroom3,  parse = F)
mShroom3a <- fasta_cleaner(mShroom3a, parse = F)
hShroom2  <- fasta_cleaner(hShroom2,  parse = F)
sShroom   <- fasta_cleaner(sShroom,   parse = F)
hShroom3
## [1] "MMRTTEDFHKPSATLNSNTATKGRYIYLEAFLEGGAPWGFTLKGGLEHGEPLIISKVEEGGKADTLSSKLQAGDEVVHINEVTLSSSRKEAVSLVKGSYKTLRLVVRRDVCTDPGHADTGASNFVSPEHLTSGPQHRKAAWSGGVKLRLKHRRSEPAGRPHSWHTTKSGEKQPDASMMQISQGMIGPPWHQSYHSSSSTSDLSNYDHAYLRRSPDQCSSQGSMESLEPSGAYPPCHLSPAKSTGSIDQLSHFHNKRDSAYSSFSTSSSILEYPHPGISGRERSGSMDNTSARGGLLEGMRQADIRYVKTVYDTRRGVSAEYEVNSSALLLQGREARASANGQGYDKWSNIPRGKGVPPPSWSQQCPSSLETATDNLPPKVGAPLPPARSDSYAAFRHRERPSSWSSLDQKRLCRPQANSLGSLKSPFIEEQLHTVLEKSPENSPPVKPKHNYTQKAQPGQPLLPTSIYPVPSLEPHFAQVPQPSVSSNGMLYPALAKESGYIAPQGACNKMATIDENGNQNGSGRPGFAFCQPLEHDLLSPVEKKPEATAKYVPSKVHFCSVPENEEDASLKRHLTPPQGNSPHSNERKSTHSNKPSSHPHSLKCPQAQAWQAGEDKRSSRLSEPWEGDFQEDHNANLWRRLEREGLGQSLSGNFGKTKSAFSSLQNIPESLRRHSSLELGRGTQEGYPGGRPTCAVNTKAEDPGRKAAPDLGSHLDRQVSYPRPEGRTGASASFNSTDPSPEEPPAPSHPHTSSLGRRGPGPGSASALQGFQYGKPHCSVLEKVSKFEQREQGSQRPSVGGSGFGHNYRPHRTVSTSSTSGNDFEETKAHIRFSESAEPLGNGEQHFKNGELKLEEASRQPCGQQLSGGASDSGRGPQRPDARLLRSQSTFQLSSEPEREPEWRDRPGSPESPLLDAPFSRAYRNSIKDAQSRVLGATSFRRRDLELGAPVASRSWRPRPSSAHVGLRSPEASASASPHTPRERHSVTPAEGDLARPVPPAARRGARRRLTPEQKKRSYSEPEKMNEVGIVEEAEPAPLGPQRNGMRFPESSVADRRRLFERDGKACSTLSLSGPELKQFQQSALADYIQRKTGKRPTSAAGCSLQEPGPLRERAQSAYLQPGPAALEGSGLASASSLSSLREPSLQPRREATLLPATVAETQQAPRDRSSSFAGGRRLGERRRGDLLSGANGGTRGTQRGDETPREPSSWGARAGKSMSAEDLLERSDVLAGPVHVRSRSSPATADKRQDVLLGQDSGFGLVKDPCYLAGPGSRSLSCSERGQEEMLPLFHHLTPRWGGSGCKAIGDSSVPSECPGTLDHQRQASRTPCPRPPLAGTQGLVTDTRAAPLTPIGTPLPSAIPSGYCSQDGQTGRQPLPPYTPAMMHRSNGHTLTQPPGPRGCEGDGPEHGVEEGTRKRVSLPQWPPPSRAKWAHAAREDSLPEESSAPDFANLKHYQKQQSLPSLCSTSDPDTPLGAPSTPGRISLRISESVLRDSPPPHEDYEDEVFVRDPHPKATSSPTFEPLPPPPPPPPSQETPVYSMDDFPPPPPHTVCEAQLDSEDPEGPRPSFNKLSKVTIARERHMPGAAHVVGSQTLASRLQTSIKGSEAESTPPSFMSVHAQLAGSLGGQPAPIQTQSLSHDPVSGTQGLEKKVSPDPQKSSEDIRTEALAKEIVHQDKSLADILDPDSRLKTTMDLMEGLFPRDVNLLKENSVKRKAIQRTVSSSGCEGKRNEDKEAVSMLVNCPAYYSVSAPKAELLNKIKEMPAEVNEEEEQADVNEKKAELIGSLTHKLETLQEAKGSLLTDIKLNNALGEEVEALISELCKPNEFDKYRMFIGDLDKVVNLLLSLSGRLARVENVLSGLGEDASNEERSSLYEKRKILAGQHEDARELKENLDRRERVVLGILANYLSEEQLQDYQHFVKMKSTLLIEQRKLDDKIKLGQEQVKCLLESLPSDFIPKAGALALPPNLTSEPIPAGGCTFSGIFPTLTSPL"

##Doing pairwise alignment of sequences

#global pairwise alignment, lining up human shroom 3 and mouse shroom 3 genes
align.h3.vs.m3a <- Biostrings::pairwiseAlignment(
                  hShroom3,
                  mShroom3a)
align.h3.vs.m3a #This object  shows the pairwise alignment of human shroom 3 and mouse shroom 3. The dashes shown represent indels.
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MMRTTEDFHKPSATLN-SNTATKGRYIYLEAFLE...KAGALALPPNLTSEPIPAGGCTFSGIFPTLTSPL
## subject: MK-TPENLEEPSATPNPSRTPTE-RFVYLEALLE...KAGAISLPPALTGHATPGGTSVFGGVFPTLTSPL
## score: 2189.934
Biostrings::pid(align.h3.vs.m3a) #this shows percent identity by counting the amount of identical amino acids between 2 comparison and skipping indels. 
## [1] 70.56511
align.h3.vs.h2 <- Biostrings::pairwiseAlignment(
                  hShroom3,
                  hShroom2)
#this is aligning the human shroom 3 gene to the human shroom 2 gene
score(align.h3.vs.h2) #This is showing the score of how well the genes line up and includes indels. It is lower than the comparison of human shroom 3 and mouse shroom 3 since there are different functions between shroom 2 and shroom 3. 
## [1] -5673.853
#score accomodates for indels whereas pid does not
Biostrings::pid(align.h3.vs.h2)
## [1] 33.83277

The shroom family of genes

#this table shows the accession number and which species and gene it is associated with allowing the data to be organized
shroom_table <- c("CAA78718" , "X. laevis Apx" ,         "xShroom1",
            "NP_597713" , "H. sapiens APXL2" ,     "hShroom1",
            "CAA58534" , "H. sapiens APXL",        "hShroom2",
            "ABD19518" , "M. musculus Apxl" ,      "mShroom2",
            "AAF13269" , "M. musculus ShroomL" ,   "mShroom3a",
            "AAF13270" , "M. musculus ShroomS" ,   "mShroom3b",
            "NP_065910", "H. sapiens Shroom" ,     "hShroom3",
            "ABD59319" , "X. laevis Shroom-like",  "xShroom3",
            "NP_065768", "H. sapiens KIAA1202" ,   "hShroom4a",
            "AAK95579" , "H. sapiens SHAP-A" ,     "hShroom4b",
            #"DQ435686" , "M. musculus KIAA1202" ,  "mShroom4",
            "ABA81834" , "D. melanogaster Shroom", "dmShroom",
            "EAA12598" , "A. gambiae Shroom",      "agShroom",
            "XP_392427" , "A. mellifera Shroom" ,  "amShroom",
            "XP_783573" , "S. purpuratus Shroom" , "spShroom") #sea urchin
#This code chunk will create a data frame with names for each column and simplified species names.
# convert to matrix
shroom_table_matrix <- matrix(shroom_table,
                                  byrow = T,
                                  nrow = 14)
# convert to data fram
shroom_table <- data.frame(shroom_table_matrix, 
                     stringsAsFactors = F)

# naming columns
names(shroom_table) <- c("accession", "name.orig","name.new")

# Create simplified species names
shroom_table$spp <- "Homo"
shroom_table$spp[grep("laevis",shroom_table$name.orig)] <- "Xenopus"
shroom_table$spp[grep("musculus",shroom_table$name.orig)] <- "Mus"
shroom_table$spp[grep("melanogaster",shroom_table$name.orig)] <- "Drosophila"
shroom_table$spp[grep("gambiae",shroom_table$name.orig)] <- "mosquito"
shroom_table$spp[grep("mellifera",shroom_table$name.orig)] <- "bee"
shroom_table$spp[grep("purpuratus",shroom_table$name.orig)] <- "sea urchin"
shroom_table #This is showing the data frame we created. 
##    accession              name.orig  name.new        spp
## 1   CAA78718          X. laevis Apx  xShroom1    Xenopus
## 2  NP_597713       H. sapiens APXL2  hShroom1       Homo
## 3   CAA58534        H. sapiens APXL  hShroom2       Homo
## 4   ABD19518       M. musculus Apxl  mShroom2        Mus
## 5   AAF13269    M. musculus ShroomL mShroom3a        Mus
## 6   AAF13270    M. musculus ShroomS mShroom3b        Mus
## 7  NP_065910      H. sapiens Shroom  hShroom3       Homo
## 8   ABD59319  X. laevis Shroom-like  xShroom3    Xenopus
## 9  NP_065768    H. sapiens KIAA1202 hShroom4a       Homo
## 10  AAK95579      H. sapiens SHAP-A hShroom4b       Homo
## 11  ABA81834 D. melanogaster Shroom  dmShroom Drosophila
## 12  EAA12598      A. gambiae Shroom  agShroom   mosquito
## 13 XP_392427    A. mellifera Shroom  amShroom        bee
## 14 XP_783573   S. purpuratus Shroom  spShroom sea urchin

Downloading multiple sequences

shroom_table$accession #the $ allows us to access all the values from the column accession from the data frame shroom_table
##  [1] "CAA78718"  "NP_597713" "CAA58534"  "ABD19518"  "AAF13269"  "AAF13270" 
##  [7] "NP_065910" "ABD59319"  "NP_065768" "AAK95579"  "ABA81834"  "EAA12598" 
## [13] "XP_392427" "XP_783573"
#this chunk is downloading  the fasta files for each gene in the table
shrooms <-rentrez::entrez_fetch(db = "protein", 
                    id = shroom_table$accession, 
                    rettype = "fasta")
cat(shrooms) #enforces new line character
#This is creating a list of all the sequences . It is different from previous code chunks since it is creating a list object instead of a character object.
shrooms_list <- compbio4all::entrez_fetch_list(db = "protein", 
                          id = shroom_table$accession, 
                          rettype = "fasta")

is(shrooms_list)
## [1] "list"             "vector"           "list_OR_List"     "vector_OR_Vector"
## [5] "vector_OR_factor"
length(shrooms_list)
## [1] 14
nchar(shrooms_list)
##  CAA78718 NP_597713  CAA58534  ABD19518  AAF13269  AAF13270 NP_065910  ABD59319 
##      1486       915      1673      1543      2083      1895      2070      1864 
## NP_065768  AAK95579  ABA81834  EAA12598 XP_392427 XP_783573 
##      1560       778      1647       750      2230      1758
length(shrooms_list) #this shows how many elements are in the list to make sure every gene is accounted for.
## [1] 14
#this is cleaning up each sequence in the list
for(i in 1:length(shrooms_list)){
  shrooms_list[[i]] <- fasta_cleaner(shrooms_list[[i]], parse = F)
}
#this code chunk is taking each sequence from the list and putting it into a vector
# make a vector to store output
shrooms_vector <- rep(NA, length(shrooms_list))

# put each sequence in the vector
for(i in 1:length(shrooms_vector)){
  shrooms_vector[i] <- shrooms_list[[i]]
}

#  name the vector
names(shrooms_vector) <- names(shrooms_list)
#this is converting the vector to a string set
# add necessary function
shrooms_vector_ss <- Biostrings::AAStringSet(shrooms_vector)

MSA

#This section will build a multiple sequence alignment of the different shroom genes. It will allow us to compare the genes and see how they are related, allowing us to later create a phylogenetic tree.

Building a Multiple Sequence Alignment (MSA)

#this is creating an object to store the msa 
shrooms_align <- msa(shrooms_vector_ss,
                     method = "ClustalW")
## use default substitution matrix

Viewing an MSA

#this will allow us to visualize the msa

Viewing an MSA in R

shrooms_align #this output shows the msa with many dashes in an unusable manner
## CLUSTAL 2.1  
## 
## Call:
##    msa(shrooms_vector_ss, method = "ClustalW")
## 
## MsaAAMultipleAlignment with 14 rows and 2252 columns
##      aln                                                   names
##  [1] -------------------------...------------------------- NP_065768
##  [2] -------------------------...------------------------- AAK95579
##  [3] -------------------------...SVFGGVFPTLTSPL----------- AAF13269
##  [4] -------------------------...SVFGGVFPTLTSPL----------- AAF13270
##  [5] -------------------------...CTFSGIFPTLTSPL----------- NP_065910
##  [6] -------------------------...NKS--LPPPLTSSL----------- ABD59319
##  [7] -------------------------...------------------------- CAA58534
##  [8] -------------------------...------------------------- ABD19518
##  [9] -------------------------...LT----------------------- NP_597713
## [10] -------------------------...------------------------- CAA78718
## [11] -------------------------...------------------------- EAA12598
## [12] -------------------------...------------------------- ABA81834
## [13] MTELQPSPPGYRVQDEAPGPPSCPP...------------------------- XP_392427
## [14] -------------------------...AATSSSSNGIGGPEQLNSNATSSYC XP_783573
##  Con -------------------------...------------------------- Consensus
#this chunk is doing behind-the-scenes changes to allow print_msa() to be used

#this is assigning schrooms_align to the class AAMultipleAlignment
class(shrooms_align) <- "AAMultipleAlignment"

# putting shrooms_align into a format defined by the bioinformatics package seqinr
shrooms_align_seqinr <- msaConvert(shrooms_align, type = "seqinr::alignment")
#this shows the amino acids of each gene with many gaps
print_msa(alignment = shrooms_align_seqinr, 
          chunksize = 60)

Displaying an MSA as an R plot

#this output just shows the amino acids where there is the most overlap unlike the others which show the amino acids of the entire gene
ggmsa:: ggmsa(shrooms_align,   # shrooms_align, NOT shrooms_align_seqinr
      start = 2000, 
      end = 2100) 

Saving an MSA as PDF

#this chunk is saving the msa to a pdf
#msa::msaPrettyPrint(shrooms_align,             # alignment
#              file = "shroom_msa.pdf",   # file name
#               y=c(2000, 2100),           # range
#               askForOverwrite=FALSE)
getwd() #shows path to get to working directory
## [1] "/Users/vennilaram/Downloads"