Code By: Nathan L. Brouwer
In bioinformatics, a phylogeny of species, proteins, or genes can be built to show how
Make a list of at least 10 vocab terms that are important (don’t have to define)
rentrez::entrez_fetch() compbio4all::entrez_fetch_list() compbio4all::print_msa() (Coghlan 2011) Biostrings::AAStringSet() msa::msa()
Add the necessary calls to library() to load call packages Indicate which packages cam from Bioconducotr, CRAN, and GitHub
# github packages
library(compbio4all)
# CRAN packages
library(rentrez)
library(seqinr)
library(ape)
# Bioconductor packages
library(msa)
library(Biostrings)
We are exploring sequences, and we will be using the entrez_fetch() function to download them; this function accesses gene/protein sequences from the NCBI database. This function is from the rentrez package, which stands for "R-Entrez rentrez::entrez_fetch
# Human shroom 3 (H. sapiens)
hShroom3 <- entrez_fetch(db = "protein",
id = "NP_065910",
rettype = "fasta")
TODO:cat() is doing some formatting by enforcing line breaks
cat(hShroom3)
## >NP_065910.3 protein Shroom3 [Homo sapiens]
## MMRTTEDFHKPSATLNSNTATKGRYIYLEAFLEGGAPWGFTLKGGLEHGEPLIISKVEEGGKADTLSSKL
## QAGDEVVHINEVTLSSSRKEAVSLVKGSYKTLRLVVRRDVCTDPGHADTGASNFVSPEHLTSGPQHRKAA
## WSGGVKLRLKHRRSEPAGRPHSWHTTKSGEKQPDASMMQISQGMIGPPWHQSYHSSSSTSDLSNYDHAYL
## RRSPDQCSSQGSMESLEPSGAYPPCHLSPAKSTGSIDQLSHFHNKRDSAYSSFSTSSSILEYPHPGISGR
## ERSGSMDNTSARGGLLEGMRQADIRYVKTVYDTRRGVSAEYEVNSSALLLQGREARASANGQGYDKWSNI
## PRGKGVPPPSWSQQCPSSLETATDNLPPKVGAPLPPARSDSYAAFRHRERPSSWSSLDQKRLCRPQANSL
## GSLKSPFIEEQLHTVLEKSPENSPPVKPKHNYTQKAQPGQPLLPTSIYPVPSLEPHFAQVPQPSVSSNGM
## LYPALAKESGYIAPQGACNKMATIDENGNQNGSGRPGFAFCQPLEHDLLSPVEKKPEATAKYVPSKVHFC
## SVPENEEDASLKRHLTPPQGNSPHSNERKSTHSNKPSSHPHSLKCPQAQAWQAGEDKRSSRLSEPWEGDF
## QEDHNANLWRRLEREGLGQSLSGNFGKTKSAFSSLQNIPESLRRHSSLELGRGTQEGYPGGRPTCAVNTK
## AEDPGRKAAPDLGSHLDRQVSYPRPEGRTGASASFNSTDPSPEEPPAPSHPHTSSLGRRGPGPGSASALQ
## GFQYGKPHCSVLEKVSKFEQREQGSQRPSVGGSGFGHNYRPHRTVSTSSTSGNDFEETKAHIRFSESAEP
## LGNGEQHFKNGELKLEEASRQPCGQQLSGGASDSGRGPQRPDARLLRSQSTFQLSSEPEREPEWRDRPGS
## PESPLLDAPFSRAYRNSIKDAQSRVLGATSFRRRDLELGAPVASRSWRPRPSSAHVGLRSPEASASASPH
## TPRERHSVTPAEGDLARPVPPAARRGARRRLTPEQKKRSYSEPEKMNEVGIVEEAEPAPLGPQRNGMRFP
## ESSVADRRRLFERDGKACSTLSLSGPELKQFQQSALADYIQRKTGKRPTSAAGCSLQEPGPLRERAQSAY
## LQPGPAALEGSGLASASSLSSLREPSLQPRREATLLPATVAETQQAPRDRSSSFAGGRRLGERRRGDLLS
## GANGGTRGTQRGDETPREPSSWGARAGKSMSAEDLLERSDVLAGPVHVRSRSSPATADKRQDVLLGQDSG
## FGLVKDPCYLAGPGSRSLSCSERGQEEMLPLFHHLTPRWGGSGCKAIGDSSVPSECPGTLDHQRQASRTP
## CPRPPLAGTQGLVTDTRAAPLTPIGTPLPSAIPSGYCSQDGQTGRQPLPPYTPAMMHRSNGHTLTQPPGP
## RGCEGDGPEHGVEEGTRKRVSLPQWPPPSRAKWAHAAREDSLPEESSAPDFANLKHYQKQQSLPSLCSTS
## DPDTPLGAPSTPGRISLRISESVLRDSPPPHEDYEDEVFVRDPHPKATSSPTFEPLPPPPPPPPSQETPV
## YSMDDFPPPPPHTVCEAQLDSEDPEGPRPSFNKLSKVTIARERHMPGAAHVVGSQTLASRLQTSIKGSEA
## ESTPPSFMSVHAQLAGSLGGQPAPIQTQSLSHDPVSGTQGLEKKVSPDPQKSSEDIRTEALAKEIVHQDK
## SLADILDPDSRLKTTMDLMEGLFPRDVNLLKENSVKRKAIQRTVSSSGCEGKRNEDKEAVSMLVNCPAYY
## SVSAPKAELLNKIKEMPAEVNEEEEQADVNEKKAELIGSLTHKLETLQEAKGSLLTDIKLNNALGEEVEA
## LISELCKPNEFDKYRMFIGDLDKVVNLLLSLSGRLARVENVLSGLGEDASNEERSSLYEKRKILAGQHED
## ARELKENLDRRERVVLGILANYLSEEQLQDYQHFVKMKSTLLIEQRKLDDKIKLGQEQVKCLLESLPSDF
## IPKAGALALPPNLTSEPIPAGGCTFSGIFPTLTSPL
TODO: This code chunk is automatically downloading sequence data from the NCBI database for various shroom proteins. Each of the shroom proteins has a unique id/accession number.
# Mouse shroom 3a (M. musculus)
mShroom3a <- entrez_fetch(db = "protein",
id = "AAF13269",
rettype = "fasta")
# Human shroom 2 (H. sapiens)
hShroom2 <- entrez_fetch(db = "protein",
id = "CAA58534",
rettype = "fasta")
# Sea-urchin shroom
sShroom <- entrez_fetch(db = "protein",
id = "XP_783573",
rettype = "fasta")
TODO: This code chuck helps us know how long each sequence is, by letting us know how many characters are in each object(each sequence).
nchar(hShroom3)
## [1] 2070
nchar(mShroom3a)
## [1] 2083
nchar(sShroom)
## [1] 1758
nchar(hShroom2)
## [1] 1673
TODO: This function “cleans” the data from the sequences by removing the parts that are not sequence information. This function essentially makes the data usable.
fasta_cleaner
## function (fasta_object, parse = TRUE)
## {
## fasta_object <- sub("^(>)(.*?)(\\n)(.*)(\\n\\n)", "\\4",
## fasta_object)
## fasta_object <- gsub("\n", "", fasta_object)
## if (parse == TRUE) {
## fasta_object <- stringr::str_split(fasta_object, pattern = "",
## simplify = FALSE)
## }
## return(fasta_object[[1]])
## }
## <bytecode: 0x00000000232bad10>
## <environment: namespace:compbio4all>
TODO: If you can’t download compbio4all, run the code chunk below to access the fasta_cleaner() function.
fasta_cleaner <- function(fasta_object, parse = TRUE){
fasta_object <- sub("^(>)(.*?)(\\n)(.*)(\\n\\n)","\\4",fasta_object)
fasta_object <- gsub("\n", "", fasta_object)
if(parse == TRUE){
fasta_object <- stringr::str_split(fasta_object,
pattern = "",
simplify = FALSE)
}
return(fasta_object[[1]])
}
TODO: This code chunk is cleaning each of the sequences.
hShroom3 <- fasta_cleaner(hShroom3, parse = F)
mShroom3a <- fasta_cleaner(mShroom3a, parse = F)
hShroom2 <- fasta_cleaner(hShroom2, parse = F)
sShroom <- fasta_cleaner(sShroom, parse = F)
hShroom3
## [1] "MMRTTEDFHKPSATLNSNTATKGRYIYLEAFLEGGAPWGFTLKGGLEHGEPLIISKVEEGGKADTLSSKLQAGDEVVHINEVTLSSSRKEAVSLVKGSYKTLRLVVRRDVCTDPGHADTGASNFVSPEHLTSGPQHRKAAWSGGVKLRLKHRRSEPAGRPHSWHTTKSGEKQPDASMMQISQGMIGPPWHQSYHSSSSTSDLSNYDHAYLRRSPDQCSSQGSMESLEPSGAYPPCHLSPAKSTGSIDQLSHFHNKRDSAYSSFSTSSSILEYPHPGISGRERSGSMDNTSARGGLLEGMRQADIRYVKTVYDTRRGVSAEYEVNSSALLLQGREARASANGQGYDKWSNIPRGKGVPPPSWSQQCPSSLETATDNLPPKVGAPLPPARSDSYAAFRHRERPSSWSSLDQKRLCRPQANSLGSLKSPFIEEQLHTVLEKSPENSPPVKPKHNYTQKAQPGQPLLPTSIYPVPSLEPHFAQVPQPSVSSNGMLYPALAKESGYIAPQGACNKMATIDENGNQNGSGRPGFAFCQPLEHDLLSPVEKKPEATAKYVPSKVHFCSVPENEEDASLKRHLTPPQGNSPHSNERKSTHSNKPSSHPHSLKCPQAQAWQAGEDKRSSRLSEPWEGDFQEDHNANLWRRLEREGLGQSLSGNFGKTKSAFSSLQNIPESLRRHSSLELGRGTQEGYPGGRPTCAVNTKAEDPGRKAAPDLGSHLDRQVSYPRPEGRTGASASFNSTDPSPEEPPAPSHPHTSSLGRRGPGPGSASALQGFQYGKPHCSVLEKVSKFEQREQGSQRPSVGGSGFGHNYRPHRTVSTSSTSGNDFEETKAHIRFSESAEPLGNGEQHFKNGELKLEEASRQPCGQQLSGGASDSGRGPQRPDARLLRSQSTFQLSSEPEREPEWRDRPGSPESPLLDAPFSRAYRNSIKDAQSRVLGATSFRRRDLELGAPVASRSWRPRPSSAHVGLRSPEASASASPHTPRERHSVTPAEGDLARPVPPAARRGARRRLTPEQKKRSYSEPEKMNEVGIVEEAEPAPLGPQRNGMRFPESSVADRRRLFERDGKACSTLSLSGPELKQFQQSALADYIQRKTGKRPTSAAGCSLQEPGPLRERAQSAYLQPGPAALEGSGLASASSLSSLREPSLQPRREATLLPATVAETQQAPRDRSSSFAGGRRLGERRRGDLLSGANGGTRGTQRGDETPREPSSWGARAGKSMSAEDLLERSDVLAGPVHVRSRSSPATADKRQDVLLGQDSGFGLVKDPCYLAGPGSRSLSCSERGQEEMLPLFHHLTPRWGGSGCKAIGDSSVPSECPGTLDHQRQASRTPCPRPPLAGTQGLVTDTRAAPLTPIGTPLPSAIPSGYCSQDGQTGRQPLPPYTPAMMHRSNGHTLTQPPGPRGCEGDGPEHGVEEGTRKRVSLPQWPPPSRAKWAHAAREDSLPEESSAPDFANLKHYQKQQSLPSLCSTSDPDTPLGAPSTPGRISLRISESVLRDSPPPHEDYEDEVFVRDPHPKATSSPTFEPLPPPPPPPPSQETPVYSMDDFPPPPPHTVCEAQLDSEDPEGPRPSFNKLSKVTIARERHMPGAAHVVGSQTLASRLQTSIKGSEAESTPPSFMSVHAQLAGSLGGQPAPIQTQSLSHDPVSGTQGLEKKVSPDPQKSSEDIRTEALAKEIVHQDKSLADILDPDSRLKTTMDLMEGLFPRDVNLLKENSVKRKAIQRTVSSSGCEGKRNEDKEAVSMLVNCPAYYSVSAPKAELLNKIKEMPAEVNEEEEQADVNEKKAELIGSLTHKLETLQEAKGSLLTDIKLNNALGEEVEALISELCKPNEFDKYRMFIGDLDKVVNLLLSLSGRLARVENVLSGLGEDASNEERSSLYEKRKILAGQHEDARELKENLDRRERVVLGILANYLSEEQLQDYQHFVKMKSTLLIEQRKLDDKIKLGQEQVKCLLESLPSDFIPKAGALALPPNLTSEPIPAGGCTFSGIFPTLTSPL"
TODO: This code chunk is aligning human versus mouse shroom using the global alignment function pairwiseAlignment():
# add necessary function
align.h3.vs.m3a <-Biostrings::pairwiseAlignment(
hShroom3,
mShroom3a)
TODO: This shows you part of the human versus mouse shroom alignment. The dashes show indels.
align.h3.vs.m3a
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MMRTTEDFHKPSATLN-SNTATKGRYIYLEAFLE...KAGALALPPNLTSEPIPAGGCTFSGIFPTLTSPL
## subject: MK-TPENLEEPSATPNPSRTPTE-RFVYLEALLE...KAGAISLPPALTGHATPGGTSVFGGVFPTLTSPL
## score: 2189.934
TODO: pid(), the percent sequence identity, shows how similar the the two sequences are as a percentage.
# add necessary function
Biostrings::pid(align.h3.vs.m3a)
## [1] 70.56511
TODO: The previous code chunks aligned human shroom 3 with mouse shroom 3. This code chunk aligns human shroom 3 with human shroom 2.
align.h3.vs.h2 <- Biostrings::pairwiseAlignment(
hShroom3,
hShroom2)
TODO: This scores how similar h3 is to h2, but its hard to interpret.
score(align.h3.vs.h2)
## [1] -5673.853
TODO: pid() gives you a % to show how similar two sequnces are, wheras score() gives you a number, which can hard to interpret. Also,indels contribute negatively to an alignment score, but aren’t used in the most common calculations for PID
Biostrings::pid(align.h3.vs.h2)
## [1] 33.83277
TODO: This table is from a published paper and has the accession numbers of 15 shroom genes.
shroom_table <- c("CAA78718" , "X. laevis Apx" , "xShroom1",
"NP_597713" , "H. sapiens APXL2" , "hShroom1",
"CAA58534" , "H. sapiens APXL", "hShroom2",
"ABD19518" , "M. musculus Apxl" , "mShroom2",
"AAF13269" , "M. musculus ShroomL" , "mShroom3a",
"AAF13270" , "M. musculus ShroomS" , "mShroom3b",
"NP_065910", "H. sapiens Shroom" , "hShroom3",
"ABD59319" , "X. laevis Shroom-like", "xShroom3",
"NP_065768", "H. sapiens KIAA1202" , "hShroom4a",
"AAK95579" , "H. sapiens SHAP-A" , "hShroom4b",
#"DQ435686" , "M. musculus KIAA1202" , "mShroom4",
"ABA81834" , "D. melanogaster Shroom", "dmShroom",
"EAA12598" , "A. gambiae Shroom", "agShroom",
"XP_392427" , "A. mellifera Shroom" , "amShroom",
"XP_783573" , "S. purpuratus Shroom" , "spShroom") #sea urchin
TODO: This is converting data into the most usable format.
# convert the vector to a matrix using matrix() function
shroom_table_matrix <- matrix(shroom_table,
byrow = T,
nrow = 14)
# convert matrix to a datarframe using data.frame() function
shroom_table <- data.frame(shroom_table_matrix,
stringsAsFactors = F)
# naming dataframe columns using names() function
names(shroom_table) <- c("accession", "name.orig","name.new")
# Create simplified species names
shroom_table$spp <- "Homo"
shroom_table$spp[grep("laevis",shroom_table$name.orig)] <- "Xenopus"
shroom_table$spp[grep("musculus",shroom_table$name.orig)] <- "Mus"
shroom_table$spp[grep("melanogaster",shroom_table$name.orig)] <- "Drosophila"
shroom_table$spp[grep("gambiae",shroom_table$name.orig)] <- "mosquito"
shroom_table$spp[grep("mellifera",shroom_table$name.orig)] <- "bee"
shroom_table$spp[grep("purpuratus",shroom_table$name.orig)] <- "sea urchin"
TODO: This gives us a finished shroom table, with accession numbers, the original name, new names, and the species a shroom protein is from.
shroom_table
## accession name.orig name.new spp
## 1 CAA78718 X. laevis Apx xShroom1 Xenopus
## 2 NP_597713 H. sapiens APXL2 hShroom1 Homo
## 3 CAA58534 H. sapiens APXL hShroom2 Homo
## 4 ABD19518 M. musculus Apxl mShroom2 Mus
## 5 AAF13269 M. musculus ShroomL mShroom3a Mus
## 6 AAF13270 M. musculus ShroomS mShroom3b Mus
## 7 NP_065910 H. sapiens Shroom hShroom3 Homo
## 8 ABD59319 X. laevis Shroom-like xShroom3 Xenopus
## 9 NP_065768 H. sapiens KIAA1202 hShroom4a Homo
## 10 AAK95579 H. sapiens SHAP-A hShroom4b Homo
## 11 ABA81834 D. melanogaster Shroom dmShroom Drosophila
## 12 EAA12598 A. gambiae Shroom agShroom mosquito
## 13 XP_392427 A. mellifera Shroom amShroom bee
## 14 XP_783573 S. purpuratus Shroom spShroom sea urchin
TODO: The $ allows us to download mutliple sequences at the same time.
shroom_table$accession
## [1] "CAA78718" "NP_597713" "CAA58534" "ABD19518" "AAF13269" "AAF13270"
## [7] "NP_065910" "ABD59319" "NP_065768" "AAK95579" "ABA81834" "EAA12598"
## [13] "XP_392427" "XP_783573"
TODO: This chunk gives a whole set of accession numbers to the entrez_fetch() function, which is a vectorized function, which can handle a vector of inputs.
# add necessary function
shrooms <- rentrez::entrez_fetch(db = "protein",
id = shroom_table$accession,
rettype = "fasta")
TODO: This allows us the see each shroom protein sequence
cat(shrooms)
TODO: This will download all of the shroom sequences again and format the sequences into an R data format called a list. This makes the sequence data usable.
shrooms_list <- entrez_fetch_list(db = "protein",
id = shroom_table$accession,
rettype = "fasta")
TODO: This shows that we have an R list with 14 elements in it.
length(shrooms_list)
## [1] 14
TODO: This is cleaning all 14 of the shroom sequences. This for() loop allows us to do it in a more efficient way.
for(i in 1:length(shrooms_list)){
shrooms_list[[i]] <- fasta_cleaner(shrooms_list[[i]], parse = F)
}
TODO: In this code chunk we are creating an empty, the for() loop allows us to put each element of the shrooms_list into the vector we created, and then we name the vector using the names() function.
# making the empty vector
shrooms_vector <- rep(NA, length(shrooms_list))
# run the loop
for(i in 1:length(shrooms_vector)){
shrooms_vector[i] <- shrooms_list[[i]]
}
# name the vectors
names(shrooms_vector) <- names(shrooms_list)
TODO: This converts our vector into a string set, a form of data that helps organize sequence data.
# add necessary function
shrooms_vector_ss <- Biostrings::AAStringSet(shrooms_vector)
Multiple sequence alignments allow us to align more than 2 sequences.This section will align all of the sequences, and later on the mutliple sequence alignment we created will be used to create phylogenetic trees.
TODO: This sequence uses msa() package to use ClustalW, a mutliple sequence alignment algorithm, which will create an MSA
# add necessary function
library(msa)
shrooms_align <-msa(shrooms_vector_ss,
method = "ClustalW")
## use default substitution matrix
TODO: This section will create a graphic that allows us to visualize the MSA we built
TODO: A portion of the MSA is shown below, this area has lots of indels.
shrooms_align
## CLUSTAL 2.1
##
## Call:
## msa(shrooms_vector_ss, method = "ClustalW")
##
## MsaAAMultipleAlignment with 14 rows and 2252 columns
## aln names
## [1] -------------------------...------------------------- NP_065768
## [2] -------------------------...------------------------- AAK95579
## [3] -------------------------...SVFGGVFPTLTSPL----------- AAF13269
## [4] -------------------------...SVFGGVFPTLTSPL----------- AAF13270
## [5] -------------------------...CTFSGIFPTLTSPL----------- NP_065910
## [6] -------------------------...NKS--LPPPLTSSL----------- ABD59319
## [7] -------------------------...------------------------- CAA58534
## [8] -------------------------...------------------------- ABD19518
## [9] -------------------------...LT----------------------- NP_597713
## [10] -------------------------...------------------------- CAA78718
## [11] -------------------------...------------------------- EAA12598
## [12] -------------------------...------------------------- ABA81834
## [13] MTELQPSPPGYRVQDEAPGPPSCPP...------------------------- XP_392427
## [14] -------------------------...AATSSSSNGIGGPEQLNSNATSSYC XP_783573
## Con -------------------------...------------------------- Consensus
TODO: This chunk makes tweaks to the msa visualization to make it work with other functions.
# This helps shrooms_align work better with other functions
class(shrooms_align) <- "AAMultipleAlignment"
# This helps the object work with functions in the seqinr package
shrooms_align_seqinr <- msaConvert(shrooms_align, type = "seqinr::alignment")
TODO: This chunk displays the msa, but on a small monitor this isn’t really useful.
print_msa(alignment = shrooms_align_seqinr,
chunksize = 60)
TODO: This shows part of the MSA in a pretty, color-coded plot.
## add necessary function
library(ggmsa)
## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
ggmsa::ggmsa(shrooms_align, # shrooms_align, NOT shrooms_align_seqinr
start = 2000,
end = 2100)
TODO: This chunk is saving the MSA plot as a PDF file. Add the package the function is coming from using :: notation This may not work for everyone. If its not working you can comment it out.
install.packages("tinytex")
tinytex::msaPrettyPrint()
msaPrettyPrint(shrooms_align, # alignment
file = "shroom_msa.pdf", # file name
y=c(2000, 2100), # range
askForOverwrite=FALSE)
TODO: This lets you know where R is saving things
getwd()
## [1] "C:/Users/saumi/Documents/comp bio"
To make things easier we’ll move forward with just a subset of sequences:
Our main working object shrooms_vector_ss has the names of our genes listed
names(shrooms_vector_ss)
## [1] "CAA78718" "NP_597713" "CAA58534" "ABD19518" "AAF13269" "AAF13270"
## [7] "NP_065910" "ABD59319" "NP_065768" "AAK95579" "ABA81834" "EAA12598"
## [13] "XP_392427" "XP_783573"
We can select the ones we want to focus on be first making a vector of the names
names.focal <- c("XP_392427","EAA12598","ABA81834","XP_783573","CAA78718")
We can use this vector and bracket notation to select the what we want from shrooms_vector_ss:
shrooms_vector_ss[names.focal]
## AAStringSet object of length 5:
## width seq names
## [1] 2126 MTELQPSPPGYRVQDEAPGPPSC...GREIQDKVKLGEEQLAALREAID XP_392427
## [2] 674 IPFSSSPKNRSNSKASYLPRQPR...ADKIKLGEEQLAALKDTLVQSEC EAA12598
## [3] 1576 MKMRNHKENGNGSEMGESTKSLA...AVRIKGSEEQLSSLSDALVQSDC ABA81834
## [4] 1661 MMKDAMYPTTTSTTSSSVNPLPK...TSSSSNGIGGPEQLNSNATSSYC XP_783573
## [5] 1420 MSAFGNTIERWNIKSTGVIAGLG...KNLEEKIKVYEEQFESIHNSLPP CAA78718
Let’s assign the subset of sequences to a new object called shrooms_vector_ss_subset.
shrooms_vector_ss_subset <- shrooms_vector_ss[names.focal]
Let’s make another MSA with just this subset. If msa isn’t working for you you can comment this out.
shrooms_align_subset <- msa(shrooms_vector_ss_subset,
method = "ClustalW")
## use default substitution matrix
To view it using ggmsa we need to do those annoying conversions again.
class(shrooms_align_subset) <- "AAMultipleAlignment"
shrooms_align_subset_seqinr <- msaConvert(shrooms_align_subset, type = "seqinr::alignment")
THen we can plot it
ggmsa::ggmsa(shrooms_align_subset, # shrooms_align, NOT shrooms_align_seqinr
start = 2030,
end = 2100)
We can save our new smaller MSA like this.
msaPrettyPrint(shrooms_align_subset, # alignment
file = "shroom_msa_subset.pdf", # file name
y=c(2030, 2100), # range
askForOverwrite=FALSE)
While an MSA is a good way to examine a sequence its hard to assess all of the information visually. A phylogenetic tree allows you to summarize patterns in an MSA. The fastest way to make phylogenetic trees to is first summarize an MSA using a genetic distance matrix. The more amino acids that are identical to each other, the smaller the genetic distance is between them and the less evolution has occurred.
We usually work in terms of difference or genetic distance (a.k.a. evolutionary distance), though often we also talk in terms of similarity or identity.
Calculating genetic distance from an MSA is done using the seqinr::dist.alignment() function.
shrooms_subset_dist <- seqinr::dist.alignment(shrooms_align_subset_seqinr,
matrix = "identity")
This produces a “dist” class object.
is(shrooms_subset_dist)
## [1] "dist" "oldClass"
class(shrooms_subset_dist)
## [1] "dist"
If you’ve been having trouble with the MSA software, the data necessary to build the distance matrix directly in R is in this code chunk (you can ignore the details).
shrooms_subset_dist_alt <- matrix(data = NA,
nrow = 5,
ncol = 5)
distances <- c(0.8260049,
0.8478722, 0.9000568,
0.9244596, 0.9435187, 0.9372139,
0.9238779, 0.9370038, 0.9323225,0.9413209)
shrooms_subset_dist_alt[lower.tri(shrooms_subset_dist_alt)] <- distances
seqnames <- c("EAA12598","ABA81834","XP_392427", "XP_783573","CAA78718")
colnames(shrooms_subset_dist_alt) <- seqnames
row.names(shrooms_subset_dist_alt) <- seqnames
shrooms_subset_dist_alt <- as.dist(shrooms_subset_dist_alt)
shrooms_subset_dist <- shrooms_subset_dist_alt
We’ve made a matrix using dist.alignment(); let’s round it off so its easier to look at using the round() function.
shrooms_subset_dist_rounded <- round(shrooms_subset_dist,
digits = 3)
If we want to look at it we can type
shrooms_subset_dist_rounded
## EAA12598 ABA81834 XP_392427 XP_783573
## ABA81834 0.826
## XP_392427 0.848 0.944
## XP_783573 0.900 0.937 0.937
## CAA78718 0.924 0.924 0.932 0.941
Not that we have 5 sequence, but the matrix is 4 x 4. This is because redundant information is dropped, including distances from one sequence to itself. This makes it so that the first column is EAA12598, but the first row is ABA81834.
We got our sequences, built a multiple sequence alignment, and calculated the genetic distance between sequences. Now we are - finally - ready to build a phylogenetic tree.
First, we let R figure out the structure of the tree. There are MANY ways to build phylogenetic trees. We’ll use a common one used for exploring sequences called neighbor joining algorithm via the function nj(). Neighbor joining uses genetic distances to cluster sequences into clades.
nj() is simple function that takes only a single argument, a distance matrix.
# Note - not using rounded values
install.packages("ape")
## Warning: package 'ape' is in use and will not be installed
library(ape)
tree_subset <- nj(shrooms_subset_dist)
Now we’ll make a quick plot of our tree using plot() (and add a little label using an important function called mtext()).
# plot tree
plot.phylo(tree_subset, main="Phylogenetic Tree",
type = "unrooted",
use.edge.length = F)
# add label
mtext(text = "Shroom family gene tree - UNrooted, no branch lengths")
This is an unrooted tree with no outgroup defined. For the sake of plotting we’ve also ignored the evolutionary distance between the sequences, so the branch lengths don’t have meaning.
To make a rooted tree we remove type = "unrooted. In the case of neighbor joining, the algorithm tries to figure out the outgroup on its own.
# plot tree
plot.phylo(tree_subset, main="Phylogenetic Tree",
use.edge.length = F)
# add label
mtext(text = "Shroom family gene tree - rooted, no branch lenths")
We can include information about branch length by setting use.edge.length = ... to T.
# plot tree
plot.phylo(tree_subset, main="Phylogenetic Tree",
use.edge.length = T)
# add label
mtext(text = "Shroom family gene tree - rooted, with branch lenths")
Now the length of the branches indicates the evolutionary distance between sequences and correlate to the distances reported in our distance matrix. The branches are all very long, indicating that these genes have been evolving independently for many millions of years.
An important note: the vertical lines on the tree have no meaning, only the horizontal ones.
Because the branch lengths are all so long I find this tree a bit hard to view when its rooted. Let’s make it unrooted again.
# plot tree
plot.phylo(tree_subset, main="Phylogenetic Tree",
type = "unrooted",
use.edge.length = T)
# add label
mtext(text = "Shroom family gene tree - unrooted, with branch lenths")
Now you can see that the ABA and EAA sequences form a clade, and that the distance between them is somewhat smaller than the distance between other sequences. If we go back to our original distance matrix, we can see that the smallest genetic distance is between ABA and EAA at 0.826.
shrooms_subset_dist_rounded
## EAA12598 ABA81834 XP_392427 XP_783573
## ABA81834 0.826
## XP_392427 0.848 0.944
## XP_783573 0.900 0.937 0.937
## CAA78718 0.924 0.924 0.932 0.941
We can confirm that this is the minimum using the min() function.
min(shrooms_subset_dist_rounded)
## [1] 0.826