John: jmh09r@my.fsu.edu
A Molecular Reappraisal of the Systematics of the Leaf-Eared Mice Phyllotis and their Relatives
Scott J. Steppan, O. Ramirez, Jenner Banbury, Dorothée Huchon,
Víctor Pacheco, Laura I. Walker and Angel E. Spotorno
In The Quintessential Naturalist (2007)
Published by University of California Press
DOI: http://dx.doi.org/10.1525/california/9780520098596.003.0023
date()
## [1] "Tue Jan 10 22:13:17 2017"
library(knitr)
opts_knit$set(verbose = FALSE)
setwd("D:/Phyllotis/GeneTree")
This package is used to search and retrieve DNA sequences from NCBI
library(rentrez)
Below are a few notes on data contributers.
63099401: https://www.ncbi.nlm.nih.gov/popset/?term=63099401 Steppan et al, 2007
15788195: https://www.ncbi.nlm.nih.gov/popset/15788195 Salazar-Bravo et al, 2001
942553126: https://www.ncbi.nlm.nih.gov/popset/942553126 Rengifo and Pacheco, 2015
58701036: https://www.ncbi.nlm.nih.gov/popset/58701036 Unpublished, Palma et al, 2004
37964369: https://www.ncbi.nlm.nih.gov/popset/37964369 Unpublished, Palma et al, 2003
542215193 https://www.ncbi.nlm.nih.gov/popset/?term=542215193 Unpublished Jayat et al, 2013
AF1… Seq: https://www.ncbi.nlm.nih.gov/popset/?term=19548933 Kuch et al, 2002
AY033… Seq: https://www.ncbi.nlm.nih.gov/popset/?term=20372349: Salazar-Bravo et al, 2002 U868… Seq: https://www.ncbi.nlm.nih.gov/nuccore/U86816 Steppan, 1998 U035… Seq: https://www.ncbi.nlm.nih.gov/nuccore/U03544 Smith and Patton, 1999
Here we linkup to GenBank and pull the requested sequences and then combine them as “cytb”.
ACC = entrez_fetch(db = "nucleotide",
id = c("AY033177", "U86816", "U86817", "U86818", "U86819", "U86820", "U86830",
"U86821", "U86823", "U86824", "U86826", "U86827", "U86828", "U86829",
"AF108691", "AF108692", "AF159287", "AF159288",
"AF159289", "AF159290", "AF159291", "AF159293", "AF159294",
"AY033176", "AY033189", "AY033190", "AF484208", "AF484209", "AF484210",
"AF484211", "AF484212", "AF484213", "AF484214", "AY041188", "AY275128",
"U03543", "U03544", "U03545", "U86832", "U86833", "U86834", "U86835",
"U86831"),
rettype="fasta")
PopSet = entrez_fetch(db="popset", id = c("63099401", "942553126",
"58701036", "37964369",
"542215193"),
rettype="fasta")
cytb = paste(PopSet, ACC, sep=" ")
Always good to save a copy.
write(cytb, "./cytb.fasta")
Using Muscle to align sequences.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ape)
library(phangorn)
tf = tempfile()
write(cytb, tf)
cytb0 = read.dna(tf, format = "fasta")
cytb_aligned = muscle(cytb0, exec = "C:/Windows/muscle")
Looking for major gaps, etc…
Looks like AY956709, AY956710, and AY956711 (2x P. gerbilus and a P. osilae) may be problematic.
cytb_aligned2 = cytb_aligned
#Pulling the Accession Numbers to use as lables
Lab.df = as.data.frame(attr(cytb_aligned2, "dimnames")[[1]])
Lab.df$ACC = stringr::word(attr(cytb_aligned2, "dimnames")[[1]])
Lab.df$ACC = tm::removeWords(Lab.df$ACC, ".1")
attr(cytb_aligned2, "dimnames")[[1]] = Lab.df$ACC
image(cytb_aligned2, show.labels = TRUE, cex = 0.5)
Plotting a simple Neighbor-Joining Tree to have a look at the data. Most of the code below is to clean-up labels. C. sorellus is specified as the root.
BaseTree = nj(dist.dna(cytb_aligned))
Lab.df = as.data.frame(BaseTree$tip.label)
names(Lab.df) = "Lab"
Lab.df = Lab.df %>%
mutate(ACC = stringr::word(BaseTree$tip.label, 1),
Gen = stringr::word(BaseTree$tip.label, 2),
Sp = stringr::word(BaseTree$tip.label, 3),
sSp = stringr::word(BaseTree$tip.label, 4),
Epi = paste(Gen, Sp, sSp, ACC, sep = " "))
uglywords = c("isolate", "cytochrome", "speciment-voucher", "mitohondrion",
"voucher", "specimen-", ".1", ".2", "SJS-2005")
Lab.df$Labels = tm::removeWords(Lab.df$Epi, uglywords)
Lab.df$Labels = gsub("Phyllotis", "P.", Lab.df$Labels)
Lab.df$Labels = gsub("xanthopygus", "x.", Lab.df$Labels)
Lab.df$Labels = gsub("Calomys", "C.", Lab.df$Labels)
Lab.df$Labels = gsub("Eligmodontia", "E.", Lab.df$Labels)
Lab.df$Labels = gsub("Tapecomys", "T.", Lab.df$Labels)
Lab.df$Labels = gsub("Graomys", "G.", Lab.df$Labels)
Lab.df$Labels = gsub("Auliscomys", "A.", Lab.df$Labels)
BaseTree$tip.label = Lab.df$Labels
plot(root(BaseTree, outgroup = 109 ), cex = 0.5) #root = "C. sorellus U03543"
Unweighted Pair Group Method with Arithmetic Mean hierarchical clustering. First converting to “phyDat” format.
cytb1 = as.phyDat(cytb_aligned)
dm = dist.ml(cytb1)
treeUPGMA = upgma(dm)
parsimony(treeUPGMA, cytb1)
## [1] 3058
treeUPGMA$tip.label = Lab.df$Labels
plot(treeUPGMA, cex = 0.5)
Improving parsimony. Re-conversion to “phyDat” is needed as labels were changed for previous plot.
cytb1 = as.phyDat(cytb_aligned)
dm = dist.ml(cytb1)
treeUPGMA = upgma(dm)
parsimony(treeUPGMA, cytb1)
## [1] 3058
treePars = optim.parsimony(treeUPGMA, cytb1)
## Final p-score 3033 after 10 nni operations
treeRatchet = pratchet(cytb1, trace = 0)
parsimony(c(treePars, treeRatchet), cytb1)
## [1] 3033 3025
BestTree = treeRatchet
BestTree$tip.label = Lab.df$Labels
plot(root(BestTree, outgroup = 109), cex = 0.5)
Nucleotide substitution models
mt = modelTest(cytb1)
## negative edges length changed to 0!
## [1] "JC+I"
## [1] "JC+G"
## [1] "JC+G+I"
## [1] "F81+I"
## [1] "F81+G"
## [1] "F81+G+I"
## [1] "K80+I"
## [1] "K80+G"
## [1] "K80+G+I"
## [1] "HKY+I"
## [1] "HKY+G"
## [1] "HKY+G+I"
## [1] "SYM+I"
## [1] "SYM+G"
## [1] "SYM+G+I"
## [1] "GTR+I"
## [1] "GTR+G"
## [1] "GTR+G+I"
env = attr(mt, "env")
ls(envir=env)
## [1] "data" "F81" "F81+G" "F81+G+I"
## [5] "F81+I" "GTR" "GTR+G" "GTR+G+I"
## [9] "GTR+I" "HKY" "HKY+G" "HKY+G+I"
## [13] "HKY+I" "JC" "JC+G" "JC+G+I"
## [17] "JC+I" "K80" "K80+G" "K80+G+I"
## [21] "K80+I" "SYM" "SYM+G" "SYM+G+I"
## [25] "SYM+I" "tree_F81" "tree_F81+G" "tree_F81+G+I"
## [29] "tree_F81+I" "tree_GTR" "tree_GTR+G" "tree_GTR+G+I"
## [33] "tree_GTR+I" "tree_HKY" "tree_HKY+G" "tree_HKY+G+I"
## [37] "tree_HKY+I" "tree_JC" "tree_JC+G" "tree_JC+G+I"
## [41] "tree_JC+I" "tree_K80" "tree_K80+G" "tree_K80+G+I"
## [45] "tree_K80+I" "tree_SYM" "tree_SYM+G" "tree_SYM+G+I"
## [49] "tree_SYM+I"
mt2 = mt #copy
arrange(mt2, BIC) #Model summary , sorted by BIC
## Model df logLik AIC AICw AICc AICcw
## 1 HKY+G+I 235 -15176.07 30822.14 3.213968e-01 30940.89 8.250645e-01
## 2 GTR+G+I 239 -15171.32 30820.64 6.786032e-01 30944.00 1.749355e-01
## 3 GTR+G 238 -15195.50 30867.00 5.812492e-11 30989.20 2.675310e-11
## 4 HKY+G 234 -15217.73 30903.46 7.061216e-19 31021.08 3.193118e-18
## 5 SYM+G+I 236 -15375.67 31223.34 2.437297e-88 31343.24 3.540035e-88
## 6 SYM+G 235 -15419.79 31309.57 4.592586e-107 31428.33 1.178972e-106
## 7 K80+G+I 232 -15625.89 31715.78 2.854780e-195 31831.16 3.965803e-194
## 8 K80+G 231 -15648.16 31758.32 1.648520e-204 31872.59 3.993887e-203
## 9 GTR+I 238 -15699.56 31875.12 7.161687e-230 31997.31 3.296302e-230
## 10 HKY+I 234 -15714.65 31897.31 1.088154e-234 32014.93 4.920688e-234
## 11 SYM+I 235 -15804.53 32079.06 3.716997e-274 32197.82 9.541983e-274
## 12 K80+I 231 -16114.72 32691.44 0.000000e+00 32805.71 0.000000e+00
## 13 F81+G+I 234 -16445.58 33359.17 0.000000e+00 33476.79 0.000000e+00
## 14 F81+G 233 -16462.19 33390.39 0.000000e+00 33506.89 0.000000e+00
## 15 JC+G+I 231 -16925.16 34312.32 0.000000e+00 34426.59 0.000000e+00
## 16 JC+G 230 -16934.26 34328.51 0.000000e+00 34441.67 0.000000e+00
## 17 F81+I 233 -16969.28 34404.56 0.000000e+00 34521.06 0.000000e+00
## 18 GTR 237 -17297.55 35069.09 0.000000e+00 35190.14 0.000000e+00
## 19 JC+I 230 -17351.76 35163.52 0.000000e+00 35276.69 0.000000e+00
## 20 SYM 234 -17398.04 35264.07 0.000000e+00 35381.70 0.000000e+00
## 21 HKY 233 -17456.11 35378.22 0.000000e+00 35494.72 0.000000e+00
## 22 K80 230 -17773.83 36007.66 0.000000e+00 36120.82 0.000000e+00
## 23 F81 232 -18584.22 37632.44 0.000000e+00 37747.83 0.000000e+00
## 24 JC 229 -18955.32 38368.63 0.000000e+00 38480.70 0.000000e+00
## BIC
## 1 32012.36
## 2 32031.12
## 3 32072.42
## 4 32088.61
## 5 32418.62
## 6 32499.79
## 7 32890.80
## 8 32928.28
## 9 33080.53
## 10 33082.46
## 11 33269.28
## 12 33861.40
## 13 34544.32
## 14 34570.48
## 15 35482.28
## 16 35493.41
## 17 35584.65
## 18 36269.44
## 19 36328.42
## 20 36449.23
## 21 36558.30
## 22 37172.56
## 23 38807.47
## 24 39528.46
(fit = eval(get("GTR+G+I", env), env)) #select model, save as "fit"
##
## loglikelihood: -15171.32
##
## unconstrained loglikelihood: -6887.017
## Proportion of invariant sites: 0.4446713
## Discrete gamma model
## Number of rate categories: 4
## Shape parameter: 0.840493
##
## Rate matrix:
## a c g t
## a 0.0000000 0.7232852 12.106105 0.8316141
## c 0.7232852 0.0000000 1.019805 8.7251400
## g 12.1061050 1.0198050 0.000000 1.0000000
## t 0.8316141 8.7251400 1.000000 0.0000000
##
## Base frequencies:
## 0.3460411 0.3237148 0.06714555 0.2630986
Maximum Likelihood Tree
#Optimise model parameters without fitting edges
fit1 = optim.pml(fit,
optNni = FALSE, optBf = TRUE,
optQ = TRUE, optInv = TRUE,
optGamma = TRUE, optEdge = FALSE,
optRate = TRUE,
control = pml.control(epsilon = 1e-08,
maxit = 10, trace = 0))
#Fix substitution model and fit tree
fit2 = optim.pml(fit1,
optNni = TRUE, optBf = FALSE,
optQ = FALSE, optInv = FALSE,
optGamma = FALSE, optEdge = TRUE,
control = pml.control(epsilon = 1e-08,
maxit = 10, trace = 0))
#Fine tune
fit3 = optim.pml(fit2,
optNni = TRUE, optBf = TRUE,
optQ = TRUE, optInv = TRUE,
optGamma = TRUE, optEdge = TRUE,
optRate = FALSE,
control = pml.control(epsilon = 1e-08,
maxit = 10, trace = 0))
fit4 = fit3$tree
#fit4$tip.label
fit4 = root(fit4, 101, resolve.root = TRUE) # reroots the tree
The red arrow indicates the “Problematic” sequences identified above.
Lab.df = as.data.frame(fit4$tip.label)
names(Lab.df) = "Lab"
Lab.df = Lab.df %>%
mutate(ACC = stringr::word(fit4$tip.label, 1),
Gen = stringr::word(fit4$tip.label, 2),
Sp = stringr::word(fit4$tip.label, 3),
sSp = stringr::word(fit4$tip.label, 4),
Epi = paste(Gen, Sp, sSp, ACC, sep = " "))
uglywords = c("isolate", "cytochrome", "speciment-voucher", "mitohondrion", "MSB",
"voucher", "specimen-", ".1", ".2", "SJS-2005", "FMNH", "UNM")
Lab.df$Labels = tm::removeWords(Lab.df$Epi, uglywords)
Lab.df$Labels = gsub("Phyllotis", "P.", Lab.df$Labels)
Lab.df$Labels = gsub("xanthopygus", "xan.", Lab.df$Labels)
Lab.df$Labels = gsub("Calomys", "C.", Lab.df$Labels)
Lab.df$Labels = gsub("Eligmodontia", "E.", Lab.df$Labels)
Lab.df$Labels = gsub("Tapecomys", "T.", Lab.df$Labels)
Lab.df$Labels = gsub("Graomys", "G.", Lab.df$Labels)
Lab.df$Labels = gsub("Auliscomys", "A.", Lab.df$Labels)
fit4$tip.label = Lab.df$Labels
library(phytools)
## Loading required package: maps
plot(fit4,
cex = 0.5,
edge.color = "black",
tip.color = "black",
use.edge.length = TRUE,
label.offset = 0.001,
show.node.label = TRUE)
add.arrow(fit4, "P. osilae osilae AY956711", col = "red", offset = -14)
add.scale.bar()
title("Optimized Parsimony \n(GTR+G+I)")
set.seed(1976)
bs = bootstrap.pml(fit3,
bs = 100,
optNni = TRUE,
control = pml.control(trace = 0))
Lab.df = as.data.frame(attr(bs, "TipLabel"))
names(Lab.df) = "Lab"
Lab.df = Lab.df %>%
mutate(ACC = stringr::word(attr(bs, "TipLabel"), 1),
Gen = stringr::word(attr(bs, "TipLabel"), 2),
Sp = stringr::word(attr(bs, "TipLabel"), 3),
sSp = stringr::word(attr(bs, "TipLabel"), 4),
Epi = paste(Gen, Sp, sSp, ACC, sep = " "))
uglywords = c("isolate", "cytochrome", "speciment-voucher", "mitohondrion", "MSB",
"voucher", "specimen-", ".1", ".2", "SJS-2005", "FMNH", "UNM")
Lab.df$Labels = tm::removeWords(Lab.df$Epi, uglywords)
Lab.df$Labels = gsub("Phyllotis", "P.", Lab.df$Labels)
Lab.df$Labels = gsub("xanthopygus", "xan.", Lab.df$Labels)
Lab.df$Labels = gsub("Calomys", "C.", Lab.df$Labels)
Lab.df$Labels = gsub("Eligmodontia", "E.", Lab.df$Labels)
Lab.df$Labels = gsub("Tapecomys", "T.", Lab.df$Labels)
Lab.df$Labels = gsub("Graomys", "G.", Lab.df$Labels)
Lab.df$Labels = gsub("Auliscomys", "A.", Lab.df$Labels)
bs2 = bs #Copy
attr(bs2, "TipLabel") = as.character(Lab.df$Labels)
Optimized and bootstrapped.
plotBS(midpoint(fit4), bs2,
type="p", cex=0.4,
bs.adj = c(1.25, 1.25),
bs.col = "black")
add.scale.bar()
title("Maximum Parsimony")
sessionInfo()
## R version 3.3.2 (2016-10-31)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 8.1 x64 (build 9600)
##
## locale:
## [1] LC_COLLATE=English_United States.1252
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] phytools_0.5-64 maps_3.1.1 phangorn_2.1.1 ape_4.0
## [5] dplyr_0.5.0 rentrez_1.0.4 knitr_1.15.1
##
## loaded via a namespace (and not attached):
## [1] NLP_0.1-9 Rcpp_0.12.8
## [3] tools_3.3.2 digest_0.6.10
## [5] jsonlite_1.1 evaluate_0.10
## [7] tibble_1.2 nlme_3.1-128
## [9] lattice_0.20-34 Matrix_1.2-7.1
## [11] fastmatch_1.0-4 igraph_1.0.1
## [13] DBI_0.5-1 curl_2.3
## [15] parallel_3.3.2 expm_0.999-0
## [17] mvtnorm_1.0-5 coda_0.19-1
## [19] httr_1.2.1 stringr_1.1.0
## [21] combinat_0.0-8 scatterplot3d_0.3-37
## [23] rprojroot_1.1 grid_3.3.2
## [25] R6_2.2.0 plotrix_3.6-3
## [27] survival_2.40-1 XML_3.98-1.5
## [29] rmarkdown_1.2 animation_2.4
## [31] magrittr_1.5 splines_3.3.2
## [33] MASS_7.3-45 backports_1.0.4
## [35] htmltools_0.3.5 mnormt_1.5-5
## [37] assertthat_0.1 numDeriv_2016.8-1
## [39] quadprog_1.5-5 stringi_1.1.2
## [41] lazyeval_0.2.0 msm_1.6.4
## [43] slam_0.1-40 tm_0.6-2
## [45] clusterGeneration_1.3.4