Assignment: Your assignment is to use your notes from class - along with help from classmates, UTAs, and me - to turn this script into a fleshed-out description of what is going on.

This is a substantial project - we’ll work on it in steps over the rest of the unit.

We are currently focused on the overall process and will cover the details over the rest of this unit.

Your first assignment is to get this script to run from top to bottom by adding all of the missing R commands. Once you have done that, you can knit it into an HTML file and upload it to RPubs. (Note - you’ll need to add the YAML header!)

Your second assignment, which will be posted later, is to answer all the TODO and other prompts to add information. You can start on this, but you don’t have to do this on your first time through the code.

Delete all the prompts like TODO() as you compete them. Use RStudio’s search function to see if you’ve missed any - there are a LOT!

A complete bioinformatics workflow in R

By: Nathan L. Brouwer

“Worked example: Building a phylogeny in R”

Introduction

Describe how phylogeneies can be used in biology (readings will be assigned)

Vocab

MSA node clade Pairwise Alignment Percent Identity Indel Sequence Entrez NCBI Conserved Sequence

Key functions

rentrez::entrez_fetch cat compbio4all::fasta_cleaner Biostrings::pid Biostrings::pairwiseAlignment

Software Preliminaires

Load packages into memory

# github packages
library(compbio4all)


# CRAN packages
library(rentrez)
library(seqinr)
library(ape)
library(ggplot2)

# Bioconductor packages
library(msa)
library(Biostrings)

Downloading and saving macromolecular sequences for further analysis

# Human shroom 3 (H. sapiens)
hShroom3 <- rentrez::entrez_fetch(db = "protein", 
                          id = "NP_065910", 
                          rettype = "fasta")

cat prints in a way that respects the new line symbols

cat(hShroom3)

## >NP_065910.3 protein Shroom3 [Homo sapiens]
## MMRTTEDFHKPSATLNSNTATKGRYIYLEAFLEGGAPWGFTLKGGLEHGEPLIISKVEEGGKADTLSSKL
## QAGDEVVHINEVTLSSSRKEAVSLVKGSYKTLRLVVRRDVCTDPGHADTGASNFVSPEHLTSGPQHRKAA
## WSGGVKLRLKHRRSEPAGRPHSWHTTKSGEKQPDASMMQISQGMIGPPWHQSYHSSSSTSDLSNYDHAYL
## RRSPDQCSSQGSMESLEPSGAYPPCHLSPAKSTGSIDQLSHFHNKRDSAYSSFSTSSSILEYPHPGISGR
## ERSGSMDNTSARGGLLEGMRQADIRYVKTVYDTRRGVSAEYEVNSSALLLQGREARASANGQGYDKWSNI
## PRGKGVPPPSWSQQCPSSLETATDNLPPKVGAPLPPARSDSYAAFRHRERPSSWSSLDQKRLCRPQANSL
## GSLKSPFIEEQLHTVLEKSPENSPPVKPKHNYTQKAQPGQPLLPTSIYPVPSLEPHFAQVPQPSVSSNGM
## LYPALAKESGYIAPQGACNKMATIDENGNQNGSGRPGFAFCQPLEHDLLSPVEKKPEATAKYVPSKVHFC
## SVPENEEDASLKRHLTPPQGNSPHSNERKSTHSNKPSSHPHSLKCPQAQAWQAGEDKRSSRLSEPWEGDF
## QEDHNANLWRRLEREGLGQSLSGNFGKTKSAFSSLQNIPESLRRHSSLELGRGTQEGYPGGRPTCAVNTK
## AEDPGRKAAPDLGSHLDRQVSYPRPEGRTGASASFNSTDPSPEEPPAPSHPHTSSLGRRGPGPGSASALQ
## GFQYGKPHCSVLEKVSKFEQREQGSQRPSVGGSGFGHNYRPHRTVSTSSTSGNDFEETKAHIRFSESAEP
## LGNGEQHFKNGELKLEEASRQPCGQQLSGGASDSGRGPQRPDARLLRSQSTFQLSSEPEREPEWRDRPGS
## PESPLLDAPFSRAYRNSIKDAQSRVLGATSFRRRDLELGAPVASRSWRPRPSSAHVGLRSPEASASASPH
## TPRERHSVTPAEGDLARPVPPAARRGARRRLTPEQKKRSYSEPEKMNEVGIVEEAEPAPLGPQRNGMRFP
## ESSVADRRRLFERDGKACSTLSLSGPELKQFQQSALADYIQRKTGKRPTSAAGCSLQEPGPLRERAQSAY
## LQPGPAALEGSGLASASSLSSLREPSLQPRREATLLPATVAETQQAPRDRSSSFAGGRRLGERRRGDLLS
## GANGGTRGTQRGDETPREPSSWGARAGKSMSAEDLLERSDVLAGPVHVRSRSSPATADKRQDVLLGQDSG
## FGLVKDPCYLAGPGSRSLSCSERGQEEMLPLFHHLTPRWGGSGCKAIGDSSVPSECPGTLDHQRQASRTP
## CPRPPLAGTQGLVTDTRAAPLTPIGTPLPSAIPSGYCSQDGQTGRQPLPPYTPAMMHRSNGHTLTQPPGP
## RGCEGDGPEHGVEEGTRKRVSLPQWPPPSRAKWAHAAREDSLPEESSAPDFANLKHYQKQQSLPSLCSTS
## DPDTPLGAPSTPGRISLRISESVLRDSPPPHEDYEDEVFVRDPHPKATSSPTFEPLPPPPPPPPSQETPV
## YSMDDFPPPPPHTVCEAQLDSEDPEGPRPSFNKLSKVTIARERHMPGAAHVVGSQTLASRLQTSIKGSEA
## ESTPPSFMSVHAQLAGSLGGQPAPIQTQSLSHDPVSGTQGLEKKVSPDPQKSSEDIRTEALAKEIVHQDK
## SLADILDPDSRLKTTMDLMEGLFPRDVNLLKENSVKRKAIQRTVSSSGCEGKRNEDKEAVSMLVNCPAYY
## SVSAPKAELLNKIKEMPAEVNEEEEQADVNEKKAELIGSLTHKLETLQEAKGSLLTDIKLNNALGEEVEA
## LISELCKPNEFDKYRMFIGDLDKVVNLLLSLSGRLARVENVLSGLGEDASNEERSSLYEKRKILAGQHED
## ARELKENLDRRERVVLGILANYLSEEQLQDYQHFVKMKSTLLIEQRKLDDKIKLGQEQVKCLLESLPSDF
## IPKAGALALPPNLTSEPIPAGGCTFSGIFPTLTSPL

This chunk gets protein sequences from mice, humans, and sea urchins and saves them to objects.

# Mouse shroom 3a (M. musculus)
mShroom3a <- rentrez::entrez_fetch(db = "protein", 
                          id = "AAF13269", 
                          rettype = "fasta")

# Human shroom 2 (H. sapiens)
hShroom2 <- rentrez::entrez_fetch(db = "protein", 
                          id = "CAA58534", 
                          rettype = "fasta")


# Sea-urchin shroom
sShroom <- rentrez::entrez_fetch(db = "protein", 
                          id = "XP_783573", 
                          rettype = "fasta")

This outputs the length of the sequence (including the new line and other characters)

nchar(hShroom3)

## [1] 2070

nchar(mShroom3a)

## [1] 2083

nchar(sShroom)

## [1] 1758

nchar(hShroom2)

## [1] 1673

Prepping macromolecular sequences

This function cleans the data for analysis by removing extraneous characters leaving only the sequence

fasta_cleaner

## function (fasta_object, parse = TRUE) 
## {
##     fasta_object <- sub("^(>)(.*?)(\\n)(.*)(\\n\\n)", "\\4", 
##         fasta_object)
##     fasta_object <- gsub("\n", "", fasta_object)
##     if (parse == TRUE) {
##         fasta_object <- stringr::str_split(fasta_object, pattern = "", 
##             simplify = FALSE)
##     }
##     return(fasta_object[[1]])
## }
## <bytecode: 0x00000000281ea9b8>
## <environment: namespace:compbio4all>

You can copy and paste this chunk in Rmarkdown or into the console to load this manually into your R session

fasta_cleaner <- function(fasta_object, parse = TRUE){

  fasta_object <- sub("^(>)(.*?)(\\n)(.*)(\\n\\n)","\\4",fasta_object)
  fasta_object <- gsub("\n", "", fasta_object)

  if(parse == TRUE){
    fasta_object <- stringr::str_split(fasta_object,
                                       pattern = "",
                                       simplify = FALSE)
  }

  return(fasta_object[[1]])
}

This code just cleans the existing sequences we have saved, to make it fit for analysis.

hShroom3  <- fasta_cleaner(hShroom3,  parse = F)
mShroom3a <- fasta_cleaner(mShroom3a, parse = F)
hShroom2  <- fasta_cleaner(hShroom2,  parse = F)
sShroom   <- fasta_cleaner(sShroom,   parse = F)

hShroom3

## [1] "MMRTTEDFHKPSATLNSNTATKGRYIYLEAFLEGGAPWGFTLKGGLEHGEPLIISKVEEGGKADTLSSKLQAGDEVVHINEVTLSSSRKEAVSLVKGSYKTLRLVVRRDVCTDPGHADTGASNFVSPEHLTSGPQHRKAAWSGGVKLRLKHRRSEPAGRPHSWHTTKSGEKQPDASMMQISQGMIGPPWHQSYHSSSSTSDLSNYDHAYLRRSPDQCSSQGSMESLEPSGAYPPCHLSPAKSTGSIDQLSHFHNKRDSAYSSFSTSSSILEYPHPGISGRERSGSMDNTSARGGLLEGMRQADIRYVKTVYDTRRGVSAEYEVNSSALLLQGREARASANGQGYDKWSNIPRGKGVPPPSWSQQCPSSLETATDNLPPKVGAPLPPARSDSYAAFRHRERPSSWSSLDQKRLCRPQANSLGSLKSPFIEEQLHTVLEKSPENSPPVKPKHNYTQKAQPGQPLLPTSIYPVPSLEPHFAQVPQPSVSSNGMLYPALAKESGYIAPQGACNKMATIDENGNQNGSGRPGFAFCQPLEHDLLSPVEKKPEATAKYVPSKVHFCSVPENEEDASLKRHLTPPQGNSPHSNERKSTHSNKPSSHPHSLKCPQAQAWQAGEDKRSSRLSEPWEGDFQEDHNANLWRRLEREGLGQSLSGNFGKTKSAFSSLQNIPESLRRHSSLELGRGTQEGYPGGRPTCAVNTKAEDPGRKAAPDLGSHLDRQVSYPRPEGRTGASASFNSTDPSPEEPPAPSHPHTSSLGRRGPGPGSASALQGFQYGKPHCSVLEKVSKFEQREQGSQRPSVGGSGFGHNYRPHRTVSTSSTSGNDFEETKAHIRFSESAEPLGNGEQHFKNGELKLEEASRQPCGQQLSGGASDSGRGPQRPDARLLRSQSTFQLSSEPEREPEWRDRPGSPESPLLDAPFSRAYRNSIKDAQSRVLGATSFRRRDLELGAPVASRSWRPRPSSAHVGLRSPEASASASPHTPRERHSVTPAEGDLARPVPPAARRGARRRLTPEQKKRSYSEPEKMNEVGIVEEAEPAPLGPQRNGMRFPESSVADRRRLFERDGKACSTLSLSGPELKQFQQSALADYIQRKTGKRPTSAAGCSLQEPGPLRERAQSAYLQPGPAALEGSGLASASSLSSLREPSLQPRREATLLPATVAETQQAPRDRSSSFAGGRRLGERRRGDLLSGANGGTRGTQRGDETPREPSSWGARAGKSMSAEDLLERSDVLAGPVHVRSRSSPATADKRQDVLLGQDSGFGLVKDPCYLAGPGSRSLSCSERGQEEMLPLFHHLTPRWGGSGCKAIGDSSVPSECPGTLDHQRQASRTPCPRPPLAGTQGLVTDTRAAPLTPIGTPLPSAIPSGYCSQDGQTGRQPLPPYTPAMMHRSNGHTLTQPPGPRGCEGDGPEHGVEEGTRKRVSLPQWPPPSRAKWAHAAREDSLPEESSAPDFANLKHYQKQQSLPSLCSTSDPDTPLGAPSTPGRISLRISESVLRDSPPPHEDYEDEVFVRDPHPKATSSPTFEPLPPPPPPPPSQETPVYSMDDFPPPPPHTVCEAQLDSEDPEGPRPSFNKLSKVTIARERHMPGAAHVVGSQTLASRLQTSIKGSEAESTPPSFMSVHAQLAGSLGGQPAPIQTQSLSHDPVSGTQGLEKKVSPDPQKSSEDIRTEALAKEIVHQDKSLADILDPDSRLKTTMDLMEGLFPRDVNLLKENSVKRKAIQRTVSSSGCEGKRNEDKEAVSMLVNCPAYYSVSAPKAELLNKIKEMPAEVNEEEEQADVNEKKAELIGSLTHKLETLQEAKGSLLTDIKLNNALGEEVEALISELCKPNEFDKYRMFIGDLDKVVNLLLSLSGRLARVENVLSGLGEDASNEERSSLYEKRKILAGQHEDARELKENLDRRERVVLGILANYLSEEQLQDYQHFVKMKSTLLIEQRKLDDKIKLGQEQVKCLLESLPSDFIPKAGALALPPNLTSEPIPAGGCTFSGIFPTLTSPL"

Pairwise Aligning sequences

This code construsts a pairwise alignment between Human shroom3 and Mouse shroom3

# add necessary function
align.h3.vs.m3a <- Biostrings::pairwiseAlignment(
                  hShroom3,
                  mShroom3a)

This shows the start and end of the alignment, compairing the sequences directly, and showing a score for how similar they are.

align.h3.vs.m3a

## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MMRTTEDFHKPSATLN-SNTATKGRYIYLEAFLE...KAGALALPPNLTSEPIPAGGCTFSGIFPTLTSPL
## subject: MK-TPENLEEPSATPNPSRTPTE-RFVYLEALLE...KAGAISLPPALTGHATPGGTSVFGGVFPTLTSPL
## score: 2189.934

This shows the percent identity, which is how much they have in common.

# add necessary function
Biostrings::pid(align.h3.vs.m3a)

## [1] 70.56511

This is comparing Human shroom 3 to Human shroom 2.

align.h3.vs.h2 <- Biostrings::pairwiseAlignment(
                  hShroom3,
                  hShroom2)

This prints the score of the alignment, which is less that the previous one, because human shroom 3 and 2 have different functions while mouse shroom 3 has the same or similar function to human shroom 3.

score(align.h3.vs.h2)

## [1] -5673.853

The score is a numerical number for how similar they are, while the pid function outputs a percentage, which is easier to conceptualize.

Biostrings::pid(align.h3.vs.h2)

## [1] 33.83277

The shroom family of genes

This table shows various shroom accession numbers and the species so they can be easily accessed later.

shroom_table <- c("CAA78718" , "X. laevis Apx" ,         "xShroom1",
            "NP_597713" , "H. sapiens APXL2" ,     "hShroom1",
            "CAA58534" , "H. sapiens APXL",        "hShroom2",
            "ABD19518" , "M. musculus Apxl" ,      "mShroom2",
            "AAF13269" , "M. musculus ShroomL" ,   "mShroom3a",
            "AAF13270" , "M. musculus ShroomS" ,   "mShroom3b",
            "NP_065910", "H. sapiens Shroom" ,     "hShroom3",
            "ABD59319" , "X. laevis Shroom-like",  "xShroom3",
            "NP_065768", "H. sapiens KIAA1202" ,   "hShroom4a",
            "AAK95579" , "H. sapiens SHAP-A" ,     "hShroom4b",
            #"DQ435686" , "M. musculus KIAA1202" ,  "mShroom4",
            "ABA81834" , "D. melanogaster Shroom", "dmShroom",
            "EAA12598" , "A. gambiae Shroom",      "agShroom",
            "XP_392427" , "A. mellifera Shroom" ,  "amShroom",
            "XP_783573" , "S. purpuratus Shroom" , "spShroom") #sea urchin

This creates a matrix from the table, and then a data frame from the matrix, and lastly names each column and row

# convert to matrix
shroom_table_matrix <- matrix(shroom_table,
                                  byrow = T,
                                  nrow = 14)
# convert to data frame
shroom_table <- data.frame(shroom_table_matrix, 
                     stringsAsFactors = F)

# Naming the columns
names(shroom_table) <- c("accession", "name.orig","name.new")

# Create simplified species names
shroom_table$spp <- "Homo"
shroom_table$spp[grep("laevis",shroom_table$name.orig)] <- "Xenopus"
shroom_table$spp[grep("musculus",shroom_table$name.orig)] <- "Mus"
shroom_table$spp[grep("melanogaster",shroom_table$name.orig)] <- "Drosophila"
shroom_table$spp[grep("gambiae",shroom_table$name.orig)] <- "mosquito"
shroom_table$spp[grep("mellifera",shroom_table$name.orig)] <- "bee"
shroom_table$spp[grep("purpuratus",shroom_table$name.orig)] <- "sea urchin"

This prints the table

shroom_table

##    accession              name.orig  name.new        spp
## 1   CAA78718          X. laevis Apx  xShroom1    Xenopus
## 2  NP_597713       H. sapiens APXL2  hShroom1       Homo
## 3   CAA58534        H. sapiens APXL  hShroom2       Homo
## 4   ABD19518       M. musculus Apxl  mShroom2        Mus
## 5   AAF13269    M. musculus ShroomL mShroom3a        Mus
## 6   AAF13270    M. musculus ShroomS mShroom3b        Mus
## 7  NP_065910      H. sapiens Shroom  hShroom3       Homo
## 8   ABD59319  X. laevis Shroom-like  xShroom3    Xenopus
## 9  NP_065768    H. sapiens KIAA1202 hShroom4a       Homo
## 10  AAK95579      H. sapiens SHAP-A hShroom4b       Homo
## 11  ABA81834 D. melanogaster Shroom  dmShroom Drosophila
## 12  EAA12598      A. gambiae Shroom  agShroom   mosquito
## 13 XP_392427    A. mellifera Shroom  amShroom        bee
## 14 XP_783573   S. purpuratus Shroom  spShroom sea urchin

XXXXXing multiple sequences

The “$” prints the accession numbers, which we can use to access and download the sequence

shroom_table$accession

##  [1] "CAA78718"  "NP_597713" "CAA58534"  "ABD19518"  "AAF13269"  "AAF13270" 
##  [7] "NP_065910" "ABD59319"  "NP_065768" "AAK95579"  "ABA81834"  "EAA12598" 
## [13] "XP_392427" "XP_783573"

This downloads each of the seuqences into one object called shrooms

shrooms <-rentrez::entrez_fetch(db = "protein", 
                          id = shroom_table$accession, 
                          rettype = "fasta")

This prints shrooms in a neater manner

cat(shrooms)

This downloads all the sequneces we want as a list, which we can actually access individual parts separately and use unlike the previous object where it was combined together.

shrooms_list <- entrez_fetch_list(db = "protein", 
                          id = shroom_table$accession, 
                          rettype = "fasta")

is(shrooms_list)

## [1] "list"             "vector"           "list_OR_List"     "vector_OR_Vector"
## [5] "vector_OR_factor"

length(shrooms_list)

## [1] 14

nchar(shrooms_list)

##  CAA78718 NP_597713  CAA58534  ABD19518  AAF13269  AAF13270 NP_065910  ABD59319 
##      1486       915      1673      1543      2083      1895      2070      1864 
## NP_065768  AAK95579  ABA81834  EAA12598 XP_392427 XP_783573 
##      1560       778      1647       750      2230      1758

We are getting the length of the list to check if it had worked

length(shrooms_list)

## [1] 14

This cleans up all the data in the list

for(i in 1:length(shrooms_list)){
  shrooms_list[[i]] <- fasta_cleaner(shrooms_list[[i]], parse = F)
}

This grabs the accession numbers from the list and stores them in a vector and names each one.

# This creates a vector with the length of the list of shroom
shrooms_vector <- rep(NA, length(shrooms_list))

# This gets the accession numbers of the shroom genes and stores them in the vector
for(i in 1:length(shrooms_vector)){
  shrooms_vector[i] <- shrooms_list[[i]]
}

#This sets the names for what we have created
names(shrooms_vector) <- names(shrooms_list)

This is getting the data into a more usable form for manipulation

shrooms_vector_ss <- Biostrings::AAStringSet        (shrooms_vector)

MSA

This section will create an msa (Multiple Sequence Alignment) which compares the sequences and aligns it in the configuration that creates the highest score (match = 1, mismatch = 0, indel = -1) We can then analyze it to later build a phylogenetic tree.

Building an Multiple Sequence Alignment (MSA)

This stores the msa in a object called shrooms_align

shrooms_align <-msa(shrooms_vector_ss,
                     method = "ClustalW")

## use default substitution matrix

Viewing an MSA

This section will visualize the msa we have constructed.

Viewing an MSA in R

This displays the start and end of the msa

shrooms_align

## CLUSTAL 2.1  
## 
## Call:
##    msa(shrooms_vector_ss, method = "ClustalW")
## 
## MsaAAMultipleAlignment with 14 rows and 2252 columns
##      aln                                                   names
##  [1] -------------------------...------------------------- NP_065768
##  [2] -------------------------...------------------------- AAK95579
##  [3] -------------------------...SVFGGVFPTLTSPL----------- AAF13269
##  [4] -------------------------...SVFGGVFPTLTSPL----------- AAF13270
##  [5] -------------------------...CTFSGIFPTLTSPL----------- NP_065910
##  [6] -------------------------...NKS--LPPPLTSSL----------- ABD59319
##  [7] -------------------------...------------------------- CAA58534
##  [8] -------------------------...------------------------- ABD19518
##  [9] -------------------------...LT----------------------- NP_597713
## [10] -------------------------...------------------------- CAA78718
## [11] -------------------------...------------------------- EAA12598
## [12] -------------------------...------------------------- ABA81834
## [13] MTELQPSPPGYRVQDEAPGPPSCPP...------------------------- XP_392427
## [14] -------------------------...AATSSSSNGIGGPEQLNSNATSSYC XP_783573
##  Con -------------------------...------------------------- Consensus

This is prepping our data for printing and saving as a pdf

# This sets the alignment's class, an attribute of the object, as "AAMutlipleAlignment" 
class(shrooms_align) <- "AAMultipleAlignment"

# this creates a new object which it stores the converted msa in, which is more prep for the data
shrooms_align_seqinr <- msaConvert(shrooms_align, type = "seqinr::alignment")

This prints out the entire msa

print_msa(alignment = shrooms_align_seqinr, 
          chunksize = 60)

Displaying an MSA

This graphs the MSA in a more easily viewable version

#ggmsa::ggmsa(shrooms_align,   # shrooms_align, NOT shrooms_align_seqinr
#      start = 2000, 
#      end = 2100)

Saving an MSA as PDF

This command prints the better visualized msa and saves it as a pdf file

This may not work for everyone. If its not working you can comment it out.

#msaPrettyPrint(shrooms_align,             # alignment
#               file = "shroom_msa.pdf",   # file name
#               y=c(2000, 2100),           # range
#               askForOverwrite=FALSE)

This shows where R is running and working, and so it is where the pdf will save

getwd()

## [1] "C:/Users/David Reed Hall/Desktop/School/Current Classes/Computational Biology/Portfolio"