# Preliminaries
## NOTE: only run install.packages() if you have to
## Packages
### devtools
#### devtools
#install.packages("devtools")
library(devtools)
## Loading required package: usethis
#### compbio4all
#devtools::install_github("brouwern/compbio4all")
library(compbio4all)
#### ggmsa
#devtools::install_github("YuLab-SMU/ggmsa")
library(ggmsa)
## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
library(msa)
## Loading required package: Biostrings
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: XVector
## Loading required package: GenomeInfoDb
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
##
## strsplit
### Bioconductor / BiocManager
#install.packages("BiocManager")
library(BiocManager)
##
## Attaching package: 'BiocManager'
## The following object is masked from 'package:msa':
##
## version
## The following object is masked from 'package:devtools':
##
## install
#### Biostrings
#BiocManager::install("Biostrings")
#BiocManager::install("msa")
## R bioinformatics software
#install.packages("rentrez")
library(rentrez)
#install.packages("seqinr")
library(seqinr)
##
## Attaching package: 'seqinr'
## The following object is masked from 'package:Biostrings':
##
## translate
#install.packages("ape")
library(ape)
##
## Attaching package: 'ape'
## The following objects are masked from 'package:seqinr':
##
## as.alignment, consensus
## The following object is masked from 'package:Biostrings':
##
## complement
library(rentrez)
# Human shroom 3 (H. sapiens)
# Download the protein record NP_065910 as one FASTA-formatted string
# (a ">" header line followed by the newline-wrapped sequence).
hShroom3 <- entrez_fetch(
  db = "protein",
  id = "NP_065910",
  rettype = "fasta"
)
# cat() renders the embedded "\n" characters as real line breaks.
cat(hShroom3)
## >NP_065910.3 protein Shroom3 [Homo sapiens]
## MMRTTEDFHKPSATLNSNTATKGRYIYLEAFLEGGAPWGFTLKGGLEHGEPLIISKVEEGGKADTLSSKL
## QAGDEVVHINEVTLSSSRKEAVSLVKGSYKTLRLVVRRDVCTDPGHADTGASNFVSPEHLTSGPQHRKAA
## WSGGVKLRLKHRRSEPAGRPHSWHTTKSGEKQPDASMMQISQGMIGPPWHQSYHSSSSTSDLSNYDHAYL
## RRSPDQCSSQGSMESLEPSGAYPPCHLSPAKSTGSIDQLSHFHNKRDSAYSSFSTSSSILEYPHPGISGR
## ERSGSMDNTSARGGLLEGMRQADIRYVKTVYDTRRGVSAEYEVNSSALLLQGREARASANGQGYDKWSNI
## PRGKGVPPPSWSQQCPSSLETATDNLPPKVGAPLPPARSDSYAAFRHRERPSSWSSLDQKRLCRPQANSL
## GSLKSPFIEEQLHTVLEKSPENSPPVKPKHNYTQKAQPGQPLLPTSIYPVPSLEPHFAQVPQPSVSSNGM
## LYPALAKESGYIAPQGACNKMATIDENGNQNGSGRPGFAFCQPLEHDLLSPVEKKPEATAKYVPSKVHFC
## SVPENEEDASLKRHLTPPQGNSPHSNERKSTHSNKPSSHPHSLKCPQAQAWQAGEDKRSSRLSEPWEGDF
## QEDHNANLWRRLEREGLGQSLSGNFGKTKSAFSSLQNIPESLRRHSSLELGRGTQEGYPGGRPTCAVNTK
## AEDPGRKAAPDLGSHLDRQVSYPRPEGRTGASASFNSTDPSPEEPPAPSHPHTSSLGRRGPGPGSASALQ
## GFQYGKPHCSVLEKVSKFEQREQGSQRPSVGGSGFGHNYRPHRTVSTSSTSGNDFEETKAHIRFSESAEP
## LGNGEQHFKNGELKLEEASRQPCGQQLSGGASDSGRGPQRPDARLLRSQSTFQLSSEPEREPEWRDRPGS
## PESPLLDAPFSRAYRNSIKDAQSRVLGATSFRRRDLELGAPVASRSWRPRPSSAHVGLRSPEASASASPH
## TPRERHSVTPAEGDLARPVPPAARRGARRRLTPEQKKRSYSEPEKMNEVGIVEEAEPAPLGPQRNGMRFP
## ESSVADRRRLFERDGKACSTLSLSGPELKQFQQSALADYIQRKTGKRPTSAAGCSLQEPGPLRERAQSAY
## LQPGPAALEGSGLASASSLSSLREPSLQPRREATLLPATVAETQQAPRDRSSSFAGGRRLGERRRGDLLS
## GANGGTRGTQRGDETPREPSSWGARAGKSMSAEDLLERSDVLAGPVHVRSRSSPATADKRQDVLLGQDSG
## FGLVKDPCYLAGPGSRSLSCSERGQEEMLPLFHHLTPRWGGSGCKAIGDSSVPSECPGTLDHQRQASRTP
## CPRPPLAGTQGLVTDTRAAPLTPIGTPLPSAIPSGYCSQDGQTGRQPLPPYTPAMMHRSNGHTLTQPPGP
## RGCEGDGPEHGVEEGTRKRVSLPQWPPPSRAKWAHAAREDSLPEESSAPDFANLKHYQKQQSLPSLCSTS
## DPDTPLGAPSTPGRISLRISESVLRDSPPPHEDYEDEVFVRDPHPKATSSPTFEPLPPPPPPPPSQETPV
## YSMDDFPPPPPHTVCEAQLDSEDPEGPRPSFNKLSKVTIARERHMPGAAHVVGSQTLASRLQTSIKGSEA
## ESTPPSFMSVHAQLAGSLGGQPAPIQTQSLSHDPVSGTQGLEKKVSPDPQKSSEDIRTEALAKEIVHQDK
## SLADILDPDSRLKTTMDLMEGLFPRDVNLLKENSVKRKAIQRTVSSSGCEGKRNEDKEAVSMLVNCPAYY
## SVSAPKAELLNKIKEMPAEVNEEEEQADVNEKKAELIGSLTHKLETLQEAKGSLLTDIKLNNALGEEVEA
## LISELCKPNEFDKYRMFIGDLDKVVNLLLSLSGRLARVENVLSGLGEDASNEERSSLYEKRKILAGQHED
## ARELKENLDRRERVVLGILANYLSEEQLQDYQHFVKMKSTLLIEQRKLDDKIKLGQEQVKCLLESLPSDF
## IPKAGALALPPNLTSEPIPAGGCTFSGIFPTLTSPL
# Fetch the three sequences that human Shroom3 is compared against.
# Each call returns the record as a single FASTA-formatted string.

# Mouse shroom 3a (M. musculus)
mShroom3a <- entrez_fetch(db = "protein", id = "AAF13269", rettype = "fasta")

# Human shroom 2 (H. sapiens)
hShroom2 <- entrez_fetch(db = "protein", id = "CAA58534", rettype = "fasta")

# Sea-urchin shroom
sShroom <- entrez_fetch(db = "protein", id = "XP_783573", rettype = "fasta")

# Size of each raw download. The strings have not been cleaned yet, so the
# counts include the header line and the newline characters.
nchar(hShroom3)
## [1] 2070
nchar(mShroom3a)
## [1] 2083
nchar(sShroom)
## [1] 1758
nchar(hShroom2)
## [1] 1673
#' Clean a single FASTA record
#'
#' Removes the header line (the first line, starting with ">") and every
#' line break from a FASTA-formatted string.
#'
#' @param fasta_object Character string holding one FASTA record. Only the
#'   first element of the cleaned result is returned.
#' @param parse If `TRUE` (the default), split the cleaned sequence into a
#'   character vector with one residue per element; if `FALSE`, return the
#'   sequence as one string.
#' @return A character vector: single characters when `parse` is `TRUE`,
#'   otherwise a length-one string.
fasta_cleaner <- function(fasta_object, parse = TRUE) {
  # Drop everything up to and including the first newline. The header is
  # removed whether or not the record ends in a blank line, so it can never
  # be pasted onto the front of the sequence.
  fasta_object <- sub("^>[^\n]*\n", "", fasta_object)
  fasta_object <- gsub("\n", "", fasta_object, fixed = TRUE)
  if (isTRUE(parse)) {
    fasta_object <- stringr::str_split(fasta_object, pattern = "",
                                       simplify = FALSE)
  }
  fasta_object[[1]]
}
fasta_cleaner
## function(fasta_object, parse = TRUE) {
##   # Drop everything up to and including the first newline. The header is
##   # removed whether or not the record ends in a blank line, so it can never
##   # be pasted onto the front of the sequence.
##   fasta_object <- sub("^>[^\n]*\n", "", fasta_object)
##   fasta_object <- gsub("\n", "", fasta_object, fixed = TRUE)
##   if (isTRUE(parse)) {
##     fasta_object <- stringr::str_split(fasta_object, pattern = "",
##                                        simplify = FALSE)
##   }
##   fasta_object[[1]]
## }
# Strip the FASTA header and line breaks from each download. parse = FALSE
# keeps every sequence as one string rather than a vector of single residues.
hShroom3 <- fasta_cleaner(hShroom3, parse = FALSE)
mShroom3a <- fasta_cleaner(mShroom3a, parse = FALSE)
hShroom2 <- fasta_cleaner(hShroom2, parse = FALSE)
sShroom <- fasta_cleaner(sShroom, parse = FALSE)
# Show the cleaned human Shroom3 sequence.
hShroom3
## [1] "MMRTTEDFHKPSATLNSNTATKGRYIYLEAFLEGGAPWGFTLKGGLEHGEPLIISKVEEGGKADTLSSKLQAGDEVVHINEVTLSSSRKEAVSLVKGSYKTLRLVVRRDVCTDPGHADTGASNFVSPEHLTSGPQHRKAAWSGGVKLRLKHRRSEPAGRPHSWHTTKSGEKQPDASMMQISQGMIGPPWHQSYHSSSSTSDLSNYDHAYLRRSPDQCSSQGSMESLEPSGAYPPCHLSPAKSTGSIDQLSHFHNKRDSAYSSFSTSSSILEYPHPGISGRERSGSMDNTSARGGLLEGMRQADIRYVKTVYDTRRGVSAEYEVNSSALLLQGREARASANGQGYDKWSNIPRGKGVPPPSWSQQCPSSLETATDNLPPKVGAPLPPARSDSYAAFRHRERPSSWSSLDQKRLCRPQANSLGSLKSPFIEEQLHTVLEKSPENSPPVKPKHNYTQKAQPGQPLLPTSIYPVPSLEPHFAQVPQPSVSSNGMLYPALAKESGYIAPQGACNKMATIDENGNQNGSGRPGFAFCQPLEHDLLSPVEKKPEATAKYVPSKVHFCSVPENEEDASLKRHLTPPQGNSPHSNERKSTHSNKPSSHPHSLKCPQAQAWQAGEDKRSSRLSEPWEGDFQEDHNANLWRRLEREGLGQSLSGNFGKTKSAFSSLQNIPESLRRHSSLELGRGTQEGYPGGRPTCAVNTKAEDPGRKAAPDLGSHLDRQVSYPRPEGRTGASASFNSTDPSPEEPPAPSHPHTSSLGRRGPGPGSASALQGFQYGKPHCSVLEKVSKFEQREQGSQRPSVGGSGFGHNYRPHRTVSTSSTSGNDFEETKAHIRFSESAEPLGNGEQHFKNGELKLEEASRQPCGQQLSGGASDSGRGPQRPDARLLRSQSTFQLSSEPEREPEWRDRPGSPESPLLDAPFSRAYRNSIKDAQSRVLGATSFRRRDLELGAPVASRSWRPRPSSAHVGLRSPEASASASPHTPRERHSVTPAEGDLARPVPPAARRGARRRLTPEQKKRSYSEPEKMNEVGIVEEAEPAPLGPQRNGMRFPESSVADRRRLFERDGKACSTLSLSGPELKQFQQSALADYIQRKTGKRPTSAAGCSLQEPGPLRERAQSAYLQPGPAALEGSGLASASSLSSLREPSLQPRREATLLPATVAETQQAPRDRSSSFAGGRRLGERRRGDLLSGANGGTRGTQRGDETPREPSSWGARAGKSMSAEDLLERSDVLAGPVHVRSRSSPATADKRQDVLLGQDSGFGLVKDPCYLAGPGSRSLSCSERGQEEMLPLFHHLTPRWGGSGCKAIGDSSVPSECPGTLDHQRQASRTPCPRPPLAGTQGLVTDTRAAPLTPIGTPLPSAIPSGYCSQDGQTGRQPLPPYTPAMMHRSNGHTLTQPPGPRGCEGDGPEHGVEEGTRKRVSLPQWPPPSRAKWAHAAREDSLPEESSAPDFANLKHYQKQQSLPSLCSTSDPDTPLGAPSTPGRISLRISESVLRDSPPPHEDYEDEVFVRDPHPKATSSPTFEPLPPPPPPPPSQETPVYSMDDFPPPPPHTVCEAQLDSEDPEGPRPSFNKLSKVTIARERHMPGAAHVVGSQTLASRLQTSIKGSEAESTPPSFMSVHAQLAGSLGGQPAPIQTQSLSHDPVSGTQGLEKKVSPDPQKSSEDIRTEALAKEIVHQDKSLADILDPDSRLKTTMDLMEGLFPRDVNLLKENSVKRKAIQRTVSSSGCEGKRNEDKEAVSMLVNCPAYYSVSAPKAELLNKIKEMPAEVNEEEEQADVNEKKAELIGSLTHKLETLQEAKGSLLTDIKLNNALGEEVEALISELCKPNEFDKYRMFIGDLDKVVNLLLSLSGRLARVENVLSGLGEDASNEERSSLYEKRKILAGQHEDARELKENLDRRERVVLGILANYLSEEQLQDYQHFVKMKSTLLIEQRKLDDKIKLGQEQVKCLLESLPSDFIPKAGALALPPNLTSEPIPAGGCTFSGIFPTL
## TSPL"
# Pairwise alignment of human Shroom3 against mouse Shroom3a.
# NOTE(review): newer Bioconductor releases appear to house
# pairwiseAlignment() and pid() in the pwalign package - confirm against the
# installed Biostrings version.
align.h3.vs.m3a <- Biostrings::pairwiseAlignment(pattern = hShroom3,
                                                 subject = mShroom3a)
align.h3.vs.m3a
## Global PairwiseAlignmentsSingleSubject (1 of 1)
## pattern: MMRTTEDFHKPSATLN-SNTATKGRYIYLEAFLE...KAGALALPPNLTSEPIPAGGCTFSGIFPTLTSPL
## subject: MK-TPENLEEPSATPNPSRTPTE-RFVYLEALLE...KAGAISLPPALTGHATPGGTSVFGGVFPTLTSPL
## score: 2189.934

# Percent sequence identity of that alignment.
Biostrings::pid(align.h3.vs.m3a)
## [1] 70.56511

# Same comparison for human Shroom3 against human Shroom2.
align.h3.vs.h2 <- Biostrings::pairwiseAlignment(pattern = hShroom3,
                                                subject = hShroom2)

# score() is called unqualified, so BiocGenerics is attached first.
library(BiocGenerics)
score(align.h3.vs.h2)
## [1] -5673.853
Biostrings::pid(align.h3.vs.h2)
## [1] 33.83277
# Lookup table of Shroom-family proteins, entered row by row as
# accession / original name / new name.
shroom_table <- c(
  "CAA78718",  "X. laevis Apx",          "xShroom1",
  "NP_597713", "H. sapiens APXL2",       "hShroom1",
  "CAA58534",  "H. sapiens APXL",        "hShroom2",
  "ABD19518",  "M. musculus Apxl",       "mShroom2",
  "AAF13269",  "M. musculus ShroomL",    "mShroom3a",
  "AAF13270",  "M. musculus ShroomS",    "mShroom3b",
  "NP_065910", "H. sapiens Shroom",      "hShroom3",
  "ABD59319",  "X. laevis Shroom-like",  "xShroom3",
  "NP_065768", "H. sapiens KIAA1202",    "hShroom4a",
  "AAK95579",  "H. sapiens SHAP-A",      "hShroom4b",
  # "DQ435686", "M. musculus KIAA1202",  "mShroom4",
  "ABA81834",  "D. melanogaster Shroom", "dmShroom",
  "EAA12598",  "A. gambiae Shroom",      "agShroom",
  "XP_392427", "A. mellifera Shroom",    "amShroom",
  "XP_783573", "S. purpuratus Shroom",   "spShroom" # sea urchin
)

# Every protein contributes exactly three fields; fail loudly on a typo
# instead of letting matrix() recycle values.
stopifnot(length(shroom_table) %% 3 == 0)

# convert to matrix: fix the number of columns and let the number of rows
# follow from the data, so adding or re-enabling a row needs no other change
shroom_table_matrix <- matrix(shroom_table,
                              byrow = TRUE,
                              ncol = 3)

# convert to data frame
shroom_table <- data.frame(shroom_table_matrix,
                           stringsAsFactors = FALSE)

# name columns
names(shroom_table) <- c("accession", "name.orig", "name.new")

# Create simplified species names: start from "Homo" and overwrite every row
# whose original name mentions another species.
shroom_table$spp <- "Homo"
shroom_table$spp[grepl("laevis", shroom_table$name.orig, fixed = TRUE)] <- "Xenopus"
shroom_table$spp[grepl("musculus", shroom_table$name.orig, fixed = TRUE)] <- "Mus"
shroom_table$spp[grepl("melanogaster", shroom_table$name.orig, fixed = TRUE)] <- "Drosophila"
shroom_table$spp[grepl("gambiae", shroom_table$name.orig, fixed = TRUE)] <- "mosquito"
shroom_table$spp[grepl("mellifera", shroom_table$name.orig, fixed = TRUE)] <- "bee"
shroom_table$spp[grepl("purpuratus", shroom_table$name.orig, fixed = TRUE)] <- "sea urchin"
shroom_table
## accession name.orig name.new spp
## 1 CAA78718 X. laevis Apx xShroom1 Xenopus
## 2 NP_597713 H. sapiens APXL2 hShroom1 Homo
## 3 CAA58534 H. sapiens APXL hShroom2 Homo
## 4 ABD19518 M. musculus Apxl mShroom2 Mus
## 5 AAF13269 M. musculus ShroomL mShroom3a Mus
## 6 AAF13270 M. musculus ShroomS mShroom3b Mus
## 7 NP_065910 H. sapiens Shroom hShroom3 Homo
## 8 ABD59319 X. laevis Shroom-like xShroom3 Xenopus
## 9 NP_065768 H. sapiens KIAA1202 hShroom4a Homo
## 10 AAK95579 H. sapiens SHAP-A hShroom4b Homo
## 11 ABA81834 D. melanogaster Shroom dmShroom Drosophila
## 12 EAA12598 A. gambiae Shroom agShroom mosquito
## 13 XP_392427 A. mellifera Shroom amShroom bee
## 14 XP_783573 S. purpuratus Shroom spShroom sea urchin
# Accession numbers that drive the bulk download.
shroom_table$accession
## [1] "CAA78718" "NP_597713" "CAA58534" "ABD19518" "AAF13269" "AAF13270"
## [7] "NP_065910" "ABD59319" "NP_065768" "AAK95579" "ABA81834" "EAA12598"
## [13] "XP_392427" "XP_783573"

# Request every accession in a single entrez_fetch() call and display the
# FASTA text that comes back.
shrooms <- entrez_fetch(db = "protein", id = shroom_table[["accession"]], rettype = "fasta")
cat(shrooms)

# Download the same accessions again as a list with one entry per accession.
shrooms_list <- entrez_fetch_list(db = "protein", id = shroom_table[["accession"]], rettype = "fasta")
length(shrooms_list)
## [1] 14
# Strip the header and line breaks from every downloaded record, keeping each
# sequence as one string. lapply() preserves the list names and, unlike a
# 1:length() loop, does nothing harmful when the list is empty.
shrooms_list <- lapply(shrooms_list, fasta_cleaner, parse = FALSE)

# Flatten the list into a named character vector. vapply() enforces that
# every entry is exactly one string and carries the list names across.
shrooms_vector <- vapply(shrooms_list, identity, character(1))

# Convert to an AAStringSet for the multiple sequence alignment.
shrooms_vector_ss <- Biostrings::AAStringSet(shrooms_vector)

# Multiple sequence alignment with ClustalW.
shrooms_align <- msa(shrooms_vector_ss,
                     method = "ClustalW")
## use default substitution matrix
shrooms_align
## CLUSTAL 2.1
##
## Call:
## msa(shrooms_vector_ss, method = "ClustalW")
##
## MsaAAMultipleAlignment with 14 rows and 2252 columns
## aln names
## [1] -------------------------...------------------------- NP_065768
## [2] -------------------------...------------------------- AAK95579
## [3] -------------------------...SVFGGVFPTLTSPL----------- AAF13269
## [4] -------------------------...SVFGGVFPTLTSPL----------- AAF13270
## [5] -------------------------...CTFSGIFPTLTSPL----------- NP_065910
## [6] -------------------------...NKS--LPPPLTSSL----------- ABD59319
## [7] -------------------------...------------------------- CAA58534
## [8] -------------------------...------------------------- ABD19518
## [9] -------------------------...LT----------------------- NP_597713
## [10] -------------------------...------------------------- CAA78718
## [11] -------------------------...------------------------- EAA12598
## [12] -------------------------...------------------------- ABA81834
## [13] MTELQPSPPGYRVQDEAPGPPSCPP...------------------------- XP_392427
## [14] -------------------------...AATSSSSNGIGGPEQLNSNATSSYC XP_783573
## Con -------------------------...------------------------- Consensus
# Relabel the msa result with a single class name before handing it on.
# NOTE(review): presumably needed so that ggmsa accepts the object - confirm.
class(shrooms_align) <- "AAMultipleAlignment"

# seqinr-format copy of the alignment, used by print_msa() below.
shrooms_align_seqinr <- msaConvert(shrooms_align, type = "seqinr::alignment")

# Text rendering of the full alignment, 60 columns per chunk.
print_msa(alignment = shrooms_align_seqinr, chunksize = 60)

# Alignment columns shown in the graphical views.
msa_window <- c(2000, 2100)

# shrooms_align, NOT shrooms_align_seqinr
ggmsa::ggmsa(shrooms_align, start = msa_window[1], end = msa_window[2])

# Write the same window to shroom_msa.pdf without prompting before overwriting.
msaPrettyPrint(shrooms_align,
               file = "shroom_msa.pdf",
               y = msa_window,
               askForOverwrite = FALSE)

# Show the current working directory.
getwd()
## [1] "/Users/dengletian/Desktop"
# Pairwise distances between the aligned sequences, based on identity.
# NOTE(review): seqinr documents these values as square roots of the raw
# pairwise distance - confirm before interpreting them as percentages.
shrooms_dist <- seqinr::dist.alignment(shrooms_align_seqinr, matrix = "identity")

# Rounded copy for display only; the unrounded distances are kept in
# shrooms_dist.
shrooms_dist_rounded <- round(shrooms_dist, digits = 3)
shrooms_dist_rounded
## NP_065768 AAK95579 AAF13269 AAF13270 NP_065910 ABD59319 CAA58534
## AAK95579 0.000
## AAF13269 0.884 0.917
## AAF13270 0.897 0.917 0.000
## NP_065910 0.878 0.912 0.533 0.536
## ABD59319 0.893 0.921 0.783 0.783 0.782
## CAA58534 0.872 0.908 0.838 0.849 0.840 0.864
## ABD19518 0.866 0.912 0.834 0.846 0.838 0.855 0.548
## NP_597713 0.916 0.939 0.903 0.903 0.902 0.904 0.896
## CAA78718 0.925 0.955 0.896 0.895 0.893 0.893 0.898
## EAA12598 0.914 0.947 0.899 0.899 0.902 0.897 0.891
## ABA81834 0.938 0.943 0.935 0.934 0.936 0.940 0.935
## XP_392427 0.936 0.963 0.935 0.934 0.938 0.941 0.938
## XP_783573 0.940 0.958 0.942 0.939 0.942 0.935 0.942
## ABD19518 NP_597713 CAA78718 EAA12598 ABA81834 XP_392427
## AAK95579
## AAF13269
## AAF13270
## NP_065910
## ABD59319
## CAA58534
## ABD19518
## NP_597713 0.900
## CAA78718 0.891 0.919
## EAA12598 0.896 0.920 0.922
## ABA81834 0.935 0.932 0.946 0.882
## XP_392427 0.934 0.927 0.947 0.878 0.923
## XP_783573 0.946 0.947 0.941 0.925 0.954 0.943
# Build a neighbor-joining tree from the pairwise distance matrix.
# NOTE(review): nj() is documented as returning an unrooted tree; the
# "rooted" labels below describe the drawing layout only - confirm wording.
tree <- nj(shrooms_dist)

# plot tree: unrooted layout, branch lengths ignored
plot(tree,
     main = "Phylogenetic Tree",
     type = "unrooted",
     use.edge.length = FALSE)
# add label
mtext(text = "Shroom family gene tree - unrooted, no branch lengths")

# plot tree: default layout, branch lengths ignored
plot(tree,
     main = "Phylogenetic Tree",
     use.edge.length = FALSE)
# add label
mtext(text = "Shroom family gene tree - rooted, no branch lengths")

# plot tree: default layout, branches drawn to length
plot(tree,
     main = "Phylogenetic Tree",
     use.edge.length = TRUE)
# add label
mtext(text = "Shroom family gene tree - rooted, with branch lengths")
# Final tree, annotated with brackets that group the tips into Shroom
# subfamilies.
plot(tree, main = "Phylogenetic Tree")
mtext(text = "Shroom family gene tree")

# Horizontal positions of the inner (x) and outer (x2) bracket columns, in the
# plot's x coordinates.
x <- 0.551
x2 <- 0.6

# Inner brackets: one vertical bar per subfamily, spanning the listed y range.
# NOTE(review): the table holds 14 sequences, so y = 15 lies above the topmost
# tip - check the bracket positions against the rendered plot.
segments(x0 = x, y0 = c(1, 5, 7, 10, 14),
         x1 = x, y1 = c(4, 6, 9, 13, 15),
         lwd = 2)

# Subfamily labels, left-aligned just to the right of the inner brackets.
text(x = x * 1.01,
     y = c(2.5, 5.5, 8, 12, 14.5),
     labels = c("Shrm 3", "Shrm 2", "Shrm 1", "Shrm ?", "Shrm 4"),
     adj = 0)

# Outer brackets: three wider groupings of the subfamilies.
segments(x0 = x2, y0 = c(1, 7, 10),
         x1 = x2, y1 = c(6, 9, 15),
         lwd = 2)