By: Danielle Douse
In Biology, phylogenies is used to show the common ancestry amongst multiple biological species according to their differences and similarities in physical and genetic characteristics. Furthermore, phylogenies can be used to classify the evolutonary relationshops among homologous genes represeted in the genomes of fiverent species.
Consensus sequence Node Multiple Sequence Alignment (msa) Pairwise Sequence Alignment (psa) Taxa Clade/Cladogram Homology
Make a list of at least 5 key functions Put in the format of package::function Biostrings::pid() Biostrings::pairwiseAlignment() rentrez::entrez_fetch() ggmsa::ggmsa() score()
library(compbio4all) library(ggmsa)
library(rentrez) library(seqinr) library(ape)
library(BiocManager) library(msa) library(Biostrings)
# github packages
library(compbio4all)
library(ggmsa)
# CRAN packages
library(rentrez)
library(seqinr)
library(ape)
# Bioconductor packages
library(BiocManager)
library(msa)
library(Biostrings)
The Human shrom 3 (h. sapiens) protein information is being retrienved from the NCBI database using entrez_fetch fuction within the rentrez package from CRAN.
# Human shroom 3 (H. sapiens)
hShroom3 <- rentrez::entrez_fetch(db = "protein",
id = "NP_065910",
rettype = "fasta")
cat() respects the new line character present in the output of the previous function. It begins a new line for the data thus aiding in aligmment in the long run.
cat(hShroom3)
## >NP_065910.3 protein Shroom3 [Homo sapiens]
## MMRTTEDFHKPSATLNSNTATKGRYIYLEAFLEGGAPWGFTLKGGLEHGEPLIISKVEEGGKADTLSSKL
## QAGDEVVHINEVTLSSSRKEAVSLVKGSYKTLRLVVRRDVCTDPGHADTGASNFVSPEHLTSGPQHRKAA
## WSGGVKLRLKHRRSEPAGRPHSWHTTKSGEKQPDASMMQISQGMIGPPWHQSYHSSSSTSDLSNYDHAYL
## RRSPDQCSSQGSMESLEPSGAYPPCHLSPAKSTGSIDQLSHFHNKRDSAYSSFSTSSSILEYPHPGISGR
## ERSGSMDNTSARGGLLEGMRQADIRYVKTVYDTRRGVSAEYEVNSSALLLQGREARASANGQGYDKWSNI
## PRGKGVPPPSWSQQCPSSLETATDNLPPKVGAPLPPARSDSYAAFRHRERPSSWSSLDQKRLCRPQANSL
## GSLKSPFIEEQLHTVLEKSPENSPPVKPKHNYTQKAQPGQPLLPTSIYPVPSLEPHFAQVPQPSVSSNGM
## LYPALAKESGYIAPQGACNKMATIDENGNQNGSGRPGFAFCQPLEHDLLSPVEKKPEATAKYVPSKVHFC
## SVPENEEDASLKRHLTPPQGNSPHSNERKSTHSNKPSSHPHSLKCPQAQAWQAGEDKRSSRLSEPWEGDF
## QEDHNANLWRRLEREGLGQSLSGNFGKTKSAFSSLQNIPESLRRHSSLELGRGTQEGYPGGRPTCAVNTK
## AEDPGRKAAPDLGSHLDRQVSYPRPEGRTGASASFNSTDPSPEEPPAPSHPHTSSLGRRGPGPGSASALQ
## GFQYGKPHCSVLEKVSKFEQREQGSQRPSVGGSGFGHNYRPHRTVSTSSTSGNDFEETKAHIRFSESAEP
## LGNGEQHFKNGELKLEEASRQPCGQQLSGGASDSGRGPQRPDARLLRSQSTFQLSSEPEREPEWRDRPGS
## PESPLLDAPFSRAYRNSIKDAQSRVLGATSFRRRDLELGAPVASRSWRPRPSSAHVGLRSPEASASASPH
## TPRERHSVTPAEGDLARPVPPAARRGARRRLTPEQKKRSYSEPEKMNEVGIVEEAEPAPLGPQRNGMRFP
## ESSVADRRRLFERDGKACSTLSLSGPELKQFQQSALADYIQRKTGKRPTSAAGCSLQEPGPLRERAQSAY
## LQPGPAALEGSGLASASSLSSLREPSLQPRREATLLPATVAETQQAPRDRSSSFAGGRRLGERRRGDLLS
## GANGGTRGTQRGDETPREPSSWGARAGKSMSAEDLLERSDVLAGPVHVRSRSSPATADKRQDVLLGQDSG
## FGLVKDPCYLAGPGSRSLSCSERGQEEMLPLFHHLTPRWGGSGCKAIGDSSVPSECPGTLDHQRQASRTP
## CPRPPLAGTQGLVTDTRAAPLTPIGTPLPSAIPSGYCSQDGQTGRQPLPPYTPAMMHRSNGHTLTQPPGP
## RGCEGDGPEHGVEEGTRKRVSLPQWPPPSRAKWAHAAREDSLPEESSAPDFANLKHYQKQQSLPSLCSTS
## DPDTPLGAPSTPGRISLRISESVLRDSPPPHEDYEDEVFVRDPHPKATSSPTFEPLPPPPPPPPSQETPV
## YSMDDFPPPPPHTVCEAQLDSEDPEGPRPSFNKLSKVTIARERHMPGAAHVVGSQTLASRLQTSIKGSEA
## ESTPPSFMSVHAQLAGSLGGQPAPIQTQSLSHDPVSGTQGLEKKVSPDPQKSSEDIRTEALAKEIVHQDK
## SLADILDPDSRLKTTMDLMEGLFPRDVNLLKENSVKRKAIQRTVSSSGCEGKRNEDKEAVSMLVNCPAYY
## SVSAPKAELLNKIKEMPAEVNEEEEQADVNEKKAELIGSLTHKLETLQEAKGSLLTDIKLNNALGEEVEA
## LISELCKPNEFDKYRMFIGDLDKVVNLLLSLSGRLARVENVLSGLGEDASNEERSSLYEKRKILAGQHED
## ARELKENLDRRERVVLGILANYLSEEQLQDYQHFVKMKSTLLIEQRKLDDKIKLGQEQVKCLLESLPSDF
## IPKAGALALPPNLTSEPIPAGGCTFSGIFPTLTSPL
The protein information of Human shroom 2, Mouse shroom 3a and Sea-urchin shroom is being retrienved from the NCBI database using entrez_fetch fuction within the rentrez package from CRAN.
# Mouse shroom 3a (M. musculus)
mShroom3a <- entrez_fetch(db = "protein",
id = "AAF13269",
rettype = "fasta")
# Human shroom 2 (H. sapiens)
hShroom2 <- entrez_fetch(db = "protein",
id = "CAA58534",
rettype = "fasta")
# Sea-urchin shroom
sShroom <- entrez_fetch(db = "protein",
id = "XP_783573",
rettype = "fasta")
nchar is outputting the number of characters- amino acids- that are in each shroom’s sequence.
nchar(hShroom3)
## [1] 2070
nchar(mShroom3a)
## [1] 2083
nchar(sShroom)
## [1] 1758
nchar(hShroom2)
## [1] 1673
TThe fasta_cleaner puts fasta files in the appropriate format for data alignment and phylogenetic trees.
fasta_cleaner
## function (fasta_object, parse = TRUE)
## {
## fasta_object <- sub("^(>)(.*?)(\\n)(.*)(\\n\\n)", "\\4",
## fasta_object)
## fasta_object <- gsub("\n", "", fasta_object)
## if (parse == TRUE) {
## fasta_object <- stringr::str_split(fasta_object, pattern = "",
## simplify = FALSE)
## }
## return(fasta_object[[1]])
## }
## <bytecode: 0x7fc08683d4f8>
## <environment: namespace:compbio4all>
Add the function by downloading the compbio4all package from gitbub. Then, use the library() function to call the combio4all package into the current session.
fasta_cleaner <- function(fasta_object, parse = TRUE){
fasta_object <- sub("^(>)(.*?)(\\n)(.*)(\\n\\n)","\\4",fasta_object)
fasta_object <- gsub("\n", "", fasta_object)
if(parse == TRUE){
fasta_object <- stringr::str_split(fasta_object,
pattern = "",
simplify = FALSE)
}
return(fasta_object[[1]])
}
This code chuck takes all four shroom sequences and srubs them clean to the correct for alignment
hShroom3 <- fasta_cleaner(hShroom3, parse = F)
mShroom3a <- fasta_cleaner(mShroom3a, parse = F)
hShroom2 <- fasta_cleaner(hShroom2, parse = F)
sShroom <- fasta_cleaner(sShroom, parse = F)
hShroom3
## [1] "MMRTTEDFHKPSATLNSNTATKGRYIYLEAFLEGGAPWGFTLKGGLEHGEPLIISKVEEGGKADTLSSKLQAGDEVVHINEVTLSSSRKEAVSLVKGSYKTLRLVVRRDVCTDPGHADTGASNFVSPEHLTSGPQHRKAAWSGGVKLRLKHRRSEPAGRPHSWHTTKSGEKQPDASMMQISQGMIGPPWHQSYHSSSSTSDLSNYDHAYLRRSPDQCSSQGSMESLEPSGAYPPCHLSPAKSTGSIDQLSHFHNKRDSAYSSFSTSSSILEYPHPGISGRERSGSMDNTSARGGLLEGMRQADIRYVKTVYDTRRGVSAEYEVNSSALLLQGREARASANGQGYDKWSNIPRGKGVPPPSWSQQCPSSLETATDNLPPKVGAPLPPARSDSYAAFRHRERPSSWSSLDQKRLCRPQANSLGSLKSPFIEEQLHTVLEKSPENSPPVKPKHNYTQKAQPGQPLLPTSIYPVPSLEPHFAQVPQPSVSSNGMLYPALAKESGYIAPQGACNKMATIDENGNQNGSGRPGFAFCQPLEHDLLSPVEKKPEATAKYVPSKVHFCSVPENEEDASLKRHLTPPQGNSPHSNERKSTHSNKPSSHPHSLKCPQAQAWQAGEDKRSSRLSEPWEGDFQEDHNANLWRRLEREGLGQSLSGNFGKTKSAFSSLQNIPESLRRHSSLELGRGTQEGYPGGRPTCAVNTKAEDPGRKAAPDLGSHLDRQVSYPRPEGRTGASASFNSTDPSPEEPPAPSHPHTSSLGRRGPGPGSASALQGFQYGKPHCSVLEKVSKFEQREQGSQRPSVGGSGFGHNYRPHRTVSTSSTSGNDFEETKAHIRFSESAEPLGNGEQHFKNGELKLEEASRQPCGQQLSGGASDSGRGPQRPDARLLRSQSTFQLSSEPEREPEWRDRPGSPESPLLDAPFSRAYRNSIKDAQSRVLGATSFRRRDLELGAPVASRSWRPRPSSAHVGLRSPEASASASPHTPRERHSVTPAEGDLARPVPPAARRGARRRLTPEQKKRSYSEPEKMNEVGIVEEAEPAPLGPQRNGMRFPESSVADRRRLFERDGKACSTLSLSGPELKQFQQSALADYIQRKTGKRPTSAAGCSLQEPGPLRERAQSAYLQPGPAALEGSGLASASSLSSLREPSLQPRREATLLPATVAETQQAPRDRSSSFAGGRRLGERRRGDLLSGANGGTRGTQRGDETPREPSSWGARAGKSMSAEDLLERSDVLAGPVHVRSRSSPATADKRQDVLLGQDSGFGLVKDPCYLAGPGSRSLSCSERGQEEMLPLFHHLTPRWGGSGCKAIGDSSVPSECPGTLDHQRQASRTPCPRPPLAGTQGLVTDTRAAPLTPIGTPLPSAIPSGYCSQDGQTGRQPLPPYTPAMMHRSNGHTLTQPPGPRGCEGDGPEHGVEEGTRKRVSLPQWPPPSRAKWAHAAREDSLPEESSAPDFANLKHYQKQQSLPSLCSTSDPDTPLGAPSTPGRISLRISESVLRDSPPPHEDYEDEVFVRDPHPKATSSPTFEPLPPPPPPPPSQETPVYSMDDFPPPPPHTVCEAQLDSEDPEGPRPSFNKLSKVTIARERHMPGAAHVVGSQTLASRLQTSIKGSEAESTPPSFMSVHAQLAGSLGGQPAPIQTQSLSHDPVSGTQGLEKKVSPDPQKSSEDIRTEALAKEIVHQDKSLADILDPDSRLKTTMDLMEGLFPRDVNLLKENSVKRKAIQRTVSSSGCEGKRNEDKEAVSMLVNCPAYYSVSAPKAELLNKIKEMPAEVNEEEEQADVNEKKAELIGSLTHKLETLQEAKGSLLTDIKLNNALGEEVEALISELCKPNEFDKYRMFIGDLDKVVNLLLSLSGRLARVENVLSGLGEDASNEERSSLYEKRKILAGQHEDARELKENLDRRERVVLGILANYLSEEQLQDYQHFVKMKSTLLIEQRKLDDKIKLGQEQVKCLLESLPSDFIPKAGALALPPNLTSEPIPAGGCTFSGIFPTLTSPL"
This code chunk aligns the protein sequences of the different shrooms proteins based on similar characteristics.
# add necessary function
align.h3.vs.m3a <- Biostrings::pairwiseAlignment (
hShroom3,
mShroom3a)
This shows the actual output of the pairwise alinment.
align.h3.vs.m3a
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MMRTTEDFHKPSATLN-SNTATKGRYIYLEAFLE...KAGALALPPNLTSEPIPAGGCTFSGIFPTLTSPL
## subject: MK-TPENLEEPSATPNPSRTPTE-RFVYLEALLE...KAGAISLPPALTGHATPGGTSVFGGVFPTLTSPL
## score: 2189.934
This shows the percent identity of the alignment. It is the simpliest way to compare the identity of the proetins.
# add necessary function
Biostrings::pid(align.h3.vs.m3a)
## [1] 70.56511
This assigns the value- the percent identity- from the last function to a variable named align.h3.vs.h2.
align.h3.vs.h2 <- Biostrings::pairwiseAlignment(
hShroom3,
hShroom2)
A mathematical summary of the quality of the alignment is being calculated by the score function using the percent identity results.
score(align.h3.vs.h2)
## [1] -5673.853
pid() is the simpliest way of comparing how identical two sequences are by dividing the number os similarities by the number of cases and multiplying by 100. The score() is a mathematical summary of the qualitiy of the alignment
Biostrings::pid(align.h3.vs.h2)
## [1] 33.83277
The point of this table is show the different types, and thus the size and variablity of the different shroom genes that make up the shroom family, It also shows the accession numbers, various species in which shrooms are present, and their scientific name.
shroom_table <- c("CAA78718" , "X. laevis Apx" , "xShroom1",
"NP_597713" , "H. sapiens APXL2" , "hShroom1",
"CAA58534" , "H. sapiens APXL", "hShroom2",
"ABD19518" , "M. musculus Apxl" , "mShroom2",
"AAF13269" , "M. musculus ShroomL" , "mShroom3a",
"AAF13270" , "M. musculus ShroomS" , "mShroom3b",
"NP_065910", "H. sapiens Shroom" , "hShroom3",
"ABD59319" , "X. laevis Shroom-like", "xShroom3",
"NP_065768", "H. sapiens KIAA1202" , "hShroom4a",
"AAK95579" , "H. sapiens SHAP-A" , "hShroom4b",
#"DQ435686" , "M. musculus KIAA1202" , "mShroom4",
"ABA81834" , "D. melanogaster Shroom", "dmShroom",
"EAA12598" , "A. gambiae Shroom", "agShroom",
"XP_392427" , "A. mellifera Shroom" , "amShroom",
"XP_783573" , "S. purpuratus Shroom" , "spShroom") #sea urchin
This code chuck takes data that is in a single, complex, long vector and combining it into something that is easier to work with.
# convert to XXXXXXXXXC
shroom_table_matrix <- matrix(shroom_table,
byrow = T,
nrow = 14)
# convert to XXXXXXXXXC
shroom_table <- data.frame(shroom_table_matrix,
stringsAsFactors = F)
# XXXXXXXXXC columns
names(shroom_table) <- c("accession", "name.orig","name.new")
# Create simplified species names
shroom_table$spp <- "Homo"
shroom_table$spp[grep("laevis",shroom_table$name.orig)] <- "Xenopus"
shroom_table$spp[grep("musculus",shroom_table$name.orig)] <- "Mus"
shroom_table$spp[grep("melanogaster",shroom_table$name.orig)] <- "Drosophila"
shroom_table$spp[grep("gambiae",shroom_table$name.orig)] <- "mosquito"
shroom_table$spp[grep("mellifera",shroom_table$name.orig)] <- "bee"
shroom_table$spp[grep("purpuratus",shroom_table$name.orig)] <- "sea urchin"
This lays the previous vector out into a simplier and more manageable data table in preparation for a multiple sequence alignment.
shroom_table
## accession name.orig name.new spp
## 1 CAA78718 X. laevis Apx xShroom1 Xenopus
## 2 NP_597713 H. sapiens APXL2 hShroom1 Homo
## 3 CAA58534 H. sapiens APXL hShroom2 Homo
## 4 ABD19518 M. musculus Apxl mShroom2 Mus
## 5 AAF13269 M. musculus ShroomL mShroom3a Mus
## 6 AAF13270 M. musculus ShroomS mShroom3b Mus
## 7 NP_065910 H. sapiens Shroom hShroom3 Homo
## 8 ABD59319 X. laevis Shroom-like xShroom3 Xenopus
## 9 NP_065768 H. sapiens KIAA1202 hShroom4a Homo
## 10 AAK95579 H. sapiens SHAP-A hShroom4b Homo
## 11 ABA81834 D. melanogaster Shroom dmShroom Drosophila
## 12 EAA12598 A. gambiae Shroom agShroom mosquito
## 13 XP_392427 A. mellifera Shroom amShroom bee
## 14 XP_783573 S. purpuratus Shroom spShroom sea urchin
The $ allows you to extract or replace elements from an object.
shroom_table$accession
## [1] "CAA78718" "NP_597713" "CAA58534" "ABD19518" "AAF13269" "AAF13270"
## [7] "NP_065910" "ABD59319" "NP_065768" "AAK95579" "ABA81834" "EAA12598"
## [13] "XP_392427" "XP_783573"
The accession id of the shroom genes is being extracted from the above shroom_table and is being retrieved by the entrez_fetch function using the rentrez package from CRAN.
# add necessary function
shrooms <-rentrez::entrez_fetch (db = "protein",
id = shroom_table$accession,
rettype = "fasta")
cat() respects the new line character present in the output of the previous function. It begins a new line for the data where the previous output indicated the should have been a new line.
cat(shrooms)
This code chunk obtains all 14 of the accession number in the form of a fasta file.
shrooms_list <- entrez_fetch_list(db = "protein",
id = shroom_table$accession,
rettype = "fasta")
This tells the type of the data, the length of the data and the number of characteristics of the shrooms_list.
is(shrooms_list)
## [1] "list" "vector" "list_OR_List" "vector_OR_Vector"
## [5] "vector_OR_factor"
length(shrooms_list)
## [1] 14
nchar(shrooms_list)
## CAA78718 NP_597713 CAA58534 ABD19518 AAF13269 AAF13270 NP_065910 ABD59319
## 1486 915 1673 1543 2083 1895 2070 1864
## NP_065768 AAK95579 ABA81834 EAA12598 XP_392427 XP_783573
## 1560 778 1647 750 2230 1758
The for() loop allows you to repeat a task. In this case, it allows you to keep cleaning until all 14 sequences are clean.
for(i in 1:length(shrooms_list)){
shrooms_list[[i]] <- fasta_cleaner(shrooms_list[[i]], parse = F)
}
This continues to clean the data.
# XXXXXXXXCX
shrooms_vector <- rep(NA, length(shrooms_list))
# XXXXXXXXCX
for(i in 1:length(shrooms_vector)){
shrooms_vector[i] <- shrooms_list[[i]]
}
# XXXXXXXXCX
names(shrooms_vector) <- names(shrooms_list)
This converts the vector to a string set in the final cleaning step of the data.
# add necessary function
shrooms_vector_ss <- Biostrings:: AAStringSet (shrooms_vector)
Multiple sequence alignment is the alignment of 3 or more sequences in which amino acids or nucleotides of identical charateristics are located in the same column.
This chunk builds the multiple sequence alignment using ClustalW
# add necessary function
shrooms_align <- msa (shrooms_vector_ss,
method = "ClustalW")
## use default substitution matrix
This will show the output of your msa.
This shows a rough output of the msa in which many deletion or insertions are present.
shrooms_align
## CLUSTAL 2.1
##
## Call:
## msa(shrooms_vector_ss, method = "ClustalW")
##
## MsaAAMultipleAlignment with 14 rows and 2252 columns
## aln names
## [1] -------------------------...------------------------- NP_065768
## [2] -------------------------...------------------------- AAK95579
## [3] -------------------------...SVFGGVFPTLTSPL----------- AAF13269
## [4] -------------------------...SVFGGVFPTLTSPL----------- AAF13270
## [5] -------------------------...CTFSGIFPTLTSPL----------- NP_065910
## [6] -------------------------...NKS--LPPPLTSSL----------- ABD59319
## [7] -------------------------...------------------------- CAA58534
## [8] -------------------------...------------------------- ABD19518
## [9] -------------------------...LT----------------------- NP_597713
## [10] -------------------------...------------------------- CAA78718
## [11] -------------------------...------------------------- EAA12598
## [12] -------------------------...------------------------- ABA81834
## [13] MTELQPSPPGYRVQDEAPGPPSCPP...------------------------- XP_392427
## [14] -------------------------...AATSSSSNGIGGPEQLNSNATSSYC XP_783573
## Con -------------------------...------------------------- Consensus
This is smoothing out the roughness of previous output. It continues to align the amino acid sequences of the shroom genes.
# WHAT IS THE LINE BELOW DOING? (its tricky - do your best)
class(shrooms_align) <- "AAMultipleAlignment"
# WHAT IS THE LINE BELOW DOING? This is simpler
shrooms_align_seqinr <- msaConvert(shrooms_align, type = "seqinr::alignment")
This is another alignment of the previous alignment which is bringing it closer to the finished msa product.
print_msa(alignment = shrooms_align_seqinr,
chunksize = 60)
This shows the msa in from the 2000th amino residue to the 2100 amino residue which appears to be a region where similar amino acids are present and mostly identical.
## add necessary function
ggmsa::ggmsa (shrooms_align, # shrooms_align, NOT shrooms_align_seqinr
start = 2000,
end = 2100)
Save file as a pdf.
#msaPrettyPrint(shrooms_align, # alignment
file = "shroom_msa.pdf", # file name
y=c(2000, 2100), # range
askForOverwrite=FALSE)
Tells where R has saved the document.
getwd()
## [1] "/Users/danielledouse/Documents/Fall 2021/Computational Biology/Portfolio Assignment 1"