set.seed(98109) # Fred Hutch zip code
library(pacman)
p_load(phytools, castor, caper)
pcm_demo
Settings
Between 60 and 100 tips
<- 60
lower_bound_tips <- 100 upper_bound_tips
Birth rate and death rate for the tree
<- 0.28
birth.rate <- 0.14 death.rate
Helpers
<- function(phy) {
count_tips return( length(phy$tip.label) )
}
<- function(transition_model, n.states = 2) {
generateQ <- castor::get_random_mk_transition_matrix(
Q Nstates = n.states,
rate_model = transition_model
)colnames(Q) <- c("A","B")
rownames(Q) <- c("A","B")
return( Q )
}# Presetting the colors that will be used for each trait
<- setNames(c("#FFC0CB", "#4682B4"), c("A","B"))
traitCols
<- function(fitMk_obj) {
rates_list_to_mat <- fitMk_obj$rates
rates if (length(rates) == 1) {
<- rates[1]
rate_B_to_A <- rates[1]
rate_A_to_B else {
} <- rates[1]
rate_B_to_A <- rates[2]
rate_A_to_B
}
matrix(
c(-rate_A_to_B, rate_A_to_B,
-rate_B_to_A),
rate_B_to_A, nrow=2, byrow = TRUE
)
}
<- function(x) {
write_matex2 <- "\\begin{bmatrix}"
begin <- "\\end{bmatrix}"
end <-
X apply(x, 1, function(x) {
paste(
paste(x, collapse = "&"),
"\\\\"
)
})paste(c(begin, X, end), collapse = "")
# https://stackoverflow.com/a/54088015 }
Simulate tree
<- 0
ntips <- 0
retry_counter while (lower_bound_tips) {
<- pbtree(b = birth.rate, d = death.rate, n = lower_bound_tips - 10)
tt <- count_tips(tt)
ntips if (ntips >= lower_bound_tips && ntips <= upper_bound_tips) {
cat(paste("Took", retry_counter, "retries before success"))
break
else {
} <- retry_counter + 1
retry_counter
} }
Took 3 retries before success
plot(tt, show.tip.label=FALSE); title(paste("Total tips:", count_tips(tt)))
Discrete Trait Simulation
Model Simulation
Equal Rates (ER) model
\(q_{A \rightarrow B} = q_{B \rightarrow A}\)
<- generateQ("ER")
er.Q <- rTraitDisc(tt, root.value = 1, model = er.Q)
er_trait <- summary(make.simmap(tt, er_trait, nsim=10, model = "ER", pi = "equal")) # root prior for *i* in *N* states **Pr**(state i) = 1/*N* er_simmap
make.simmap is sampling character histories conditioned on
the transition matrix
Q =
A B
A -0.3321349 0.3321349
B 0.3321349 -0.3321349
(estimated using likelihood);
and (mean) root node prior probabilities
pi =
A B
0.5 0.5
Done.
plot(er_simmap, colors = traitCols, ftype = "off")
table(er_trait)
er_trait
A B
43 56
\[ Q_{ER} = \begin{bmatrix}-0.98&0.98 \\0.98&-0.98 \\\end{bmatrix} \]
All Rates Different (ARD) model
\(q_{A \rightarrow B} \neq q_{B \rightarrow A}\)
<- generateQ("ARD")
ard.Q <- rTraitDisc(tt, root.value = 1, model = ard.Q)
ard_trait <- summary(make.simmap(tt, ard_trait, nsim=10, model = "ARD")) ard_simmap
make.simmap is sampling character histories conditioned on
the transition matrix
Q =
A B
A -0.01817035 0.01817035
B 0.30482590 -0.30482590
(estimated using likelihood);
and (mean) root node prior probabilities
pi =
A B
0.5 0.5
Done.
plot(ard_simmap, colors = traitCols, ftype = "off")
table(ard_trait)
ard_trait
A B
89 10
\[ Q_{ARD} = \begin{bmatrix}-0.04&0.04 \\0.31&-0.31 \\\end{bmatrix} \]
Inference
Estimating the \(Q\) matrix
Equal Rates Trait
<- fitMk(tt, er_trait, model = "ER")
fitER <- fitMk(tt, er_trait, model = "ARD")
fitARD <- -2 * (fitER$logLik - fitARD$logLik)
teststat pchisq(teststat, df = 1, lower.tail = FALSE)
[1] 0.4892939
There’s no significant difference between the equal rates model and all rates different model, so fitting the simmap
earlier using an equal rates model was appropriate.
\[ Q_{ER} = \begin{bmatrix}-0.98&0.98 \\0.98&-0.98 \\\end{bmatrix}\\ \] \[ \hat{Q}_{ER} = \begin{bmatrix}-0.33&0.33 \\0.33&-0.33 \\\end{bmatrix} \] (The estimated rates \(\hat{Q}\) are 3 times smaller than their true values in \(Q\). I’m not sure if this will matter a lot for analyses. If we can differentiate ER from ARD reliably, then we can at least comment on the rate model most appropriate for the data. However, we should probably not comment on the actual estimated rates. Maybe we can do it better*??
- for our use case
Let’s check the ARD trait…
All Rates Different Trait
<- fitMk(tt, ard_trait, model = "ER")
fitER <- fitMk(tt, ard_trait, model = "ARD")
fitARD <- -2 * (fitER$logLik - fitARD$logLik)
teststat pchisq(teststat, df = 1, lower.tail = FALSE)
[1] 0.0001920527
\[ Q_{ARD} = \begin{bmatrix}-0.04&0.04 \\0.31&-0.31 \\\end{bmatrix} \] \[ \hat{Q}_{ARD} = \begin{bmatrix}-0.02&0.02 \\0.3&-0.3 \\\end{bmatrix} \] (The estimate for rates between states is not too bad, especially relative to the ER model. From prior experience, this is not necessarily reliable. It would be useful to see what downstream stats look like when simulated on a phylogeny, but that’s getting beyond the scope of this.)
There is a significant difference between the ER model and ARD, so generating a simmap
with an ARD model prior was, still, appropriate.
We knew all of this beforehand because we generated the data, but I want to demonstrate that its possible to fit all of this, and estimate the transition rates betwen states in our data.
Phylogenetic Signal using \(D\) statistic (Fritz & Purvis 2010)
Fritz & Purvis 2010 – https://www.jstor.org/stable/40864204?seq=1 For binary traits; best used when N \(\ge\) 50 species
Phylogenetic signal of ER trait
<- data.frame(tip = names(er_trait), state = unname(er_trait))
er_df <- comparative.data(tt, er_df, names.col = 'tip')
er_compdat phylo.d(er_compdat, tt, binvar=state)
Calculation of D statistic for the phylogenetic structure of a binary variable
Data : er_df
Binary variable : state
Counts of states: A = 43
B = 56
Phylogeny : tt
Number of permutations : 1000
Estimated D : 0.765063
Probability of E(D) resulting from no (random) phylogenetic structure : 0.042
Probability of E(D) resulting from Brownian phylogenetic structure : 0
Phylogenetic signal of ARD trait
<- data.frame(tip = names(ard_trait), state = unname(ard_trait))
ard_df <- comparative.data(tt, ard_df, names.col = 'tip')
ard_compdat phylo.d(ard_compdat, tt, binvar=state)
Calculation of D statistic for the phylogenetic structure of a binary variable
Data : ard_df
Binary variable : state
Counts of states: A = 89
B = 10
Phylogeny : tt
Number of permutations : 1000
Estimated D : -0.03363157
Probability of E(D) resulting from no (random) phylogenetic structure : 0
Probability of E(D) resulting from Brownian phylogenetic structure : 0.555
Potential phylodynamics use case
Testing the phylodynamic signal of host characteristics, like zip code median income. We could possibly test if SARS-CoV-2 has a heterogenous transmission structure or if transmission chains are highly “bunched.” In the case of zip code median income, high signal would mean that we see transmission chains stay within income ranges. Low signal would mean that we would see frequently transmissions from high income to low income and low to high.
We could also use Q matricies to estimate if there is bias in transmission rates between vaccinated and unvaccinated individuals (maybe not the actual rates though). If an ARD model was the most appropriate, we might be able to see if individuals in either group are responsible for increased transmission. The fitMk model doesn’t allow for sampled ancestors – I wonder if we could make a model potentially better fit for epidemics.
Continuous trait evolution
Simulation
Brownian Motion ~ Neutral Evolution
Random walk trait evolution, where \(\sigma^2_{t=T} \propto T\), where \(T\) is time since the root, and \(\mu_{t=T} = \mu\).
BM with \(\mu_0 = 0, \sigma^2 = 0.25\)
<- fastBM(tt, a = 0, sig2 = 0.25)
bm_trait phenogram(tt, bm_trait, ftype="off")
Ornstein-Uhlenbeck ~ Adaptive Evolution
“Rubber banded” BM. Imagine a random walk of a point, but it’s held by a rubber band around some fixed point (\(\theta\)). The further away the stick goes from the fixed point, the harder it is pulled. The closer the stick object gets, the weaker the pull. \(\alpha\) represents the strength that rubber band effect. Extremely high \(\alpha\) pulls the random walk towards \(\alpha\) quickly (almost immediately), extremely low \(\alpha\) resembles BM.
OU with \(\mu_0 = 0, \sigma^2 = 0.25, \textcolor{red}{\theta = 10}, \alpha = 0.05\)
<- fastBM(tt, a = 0, sig2 = 0.25, theta = 10, alpha = 0.05)
ou_trait_1 phenogram(tt, ou_trait_1, ftype="off")
abline(h = 10, col = "red", lwd = 2)
OU with \(\mu_0 = 0, \sigma^2 = 0.25, \textcolor{red}{\theta = -20}, \alpha = 0.1\)
<- fastBM(tt, a = 0, sig2 = 0.25, theta = -20, alpha = 0.1)
ou_trait_2 phenogram(tt, ou_trait_2, ftype="off")
abline(h = -20, col = "red", lwd = 2)