and the R code
sim.extinct.truncdim <- function(brts,M){
brts = no.na(brts)
R = M - (length(brts)-1) # maximum possible number of extinctions
Ne = sample((0:R),1)
ms = runif(Ne,min=0,max=brts[1])
me = runif(Ne,min=ms,max=brts[1])
to = c(rep(1,Ne),rep(0,Ne))
df = data.frame(bt = c(ms,me),bte=c(me,rep(Inf,Ne)),to=to)
df = rbind(df,data.frame(bt=brts,to=rep(2,length(brts)),bte=rep(Inf,length(brts))))
df = df[order(df$bt),]
df$t.ext = df$bte-df$bt
return(df)
}
The sampling probability would be then
\[ g(x|y) = \frac{1}{R+1}\left(\frac{1}{T} \right)^{N_e} \prod_{i=1}^{N_e} \frac{1}{T-T_i} \prod_{j=1}^{2N_e} \frac{1}{n_j} \] or
\[ log(g(x|y)) = -log(R+1)-N_elog(T)-\sum_{i=1}^{N_e}log(T-T_i)-\sum_{j=1}^{2N_e}log(n_{j}) \] that is representated on the following code
g_samp <- function(df,M){
last = dim(df)[1]
ct = df[last,1] # crown time T
Ng = sum(df$to==2) - 1 # number of observed speciations (or given data)
R = M - Ng # number of allowed extinct speciations
subdf = df[df$to==1,] # missing speciations
Ne = dim(subdf)[1] # number of extinctions
Ts = ct-subdf$bt # extinction times
tempto = df$to
to = df$to[-last]
to[to==2] = 1
n = c(2,2+cumsum(to)+cumsum(to-1)) # number of species at every waiting time
n = n[tempto==1 | tempto==0] # number of species when missing speciations and extinctions happened
if(R==0){
log_dens = 0
}else{
log_dens = - log(R+1) - Ne*log(ct) - sum(log(Ts)) - sum(log(n))
}
return(log_dens)
}
On the case when only one speciation is allowed:
\[\begin{equation} g(x|y) = \begin{cases} 1 & \text{if } R=0 \\ & \text{if } R>0 \\ 0.5 & \text{if not missing species}\\ \frac{1}{2}\frac{1}{T}\frac{1}{T-T_1}\frac{1}{2}\frac{1}{3} & \text{if one missing species at time } T_1 \end{cases} \end{equation}\]g1 <- function(ta,to,CT){
ans<-NULL
n<-nrow(ta)
if (is.vector(to)){
to<-matrix(rep(to,each=n),ncol=2)
}
for (i in 1:n){
if (!is.na(to[i,2])){
ans[i] = 1
} else {
if (ta[i,1]==CT){
ans[i]=0.5
} else {
ans[i] = 0.5*dunif(ta[i,1],min=0,max = CT)*dunif(ta[i,2],min=ta[i,1],max = CT)/6
}
}
}
return(ans)
}
we can check if they match
set.seed(3)
n.sim = 5
lambda = 1
mu = 0.3
CT = 2
M = 1
x<-f.sim(n.sim,lambda = lambda, mu=mu, CT=CT,M = M)
x = rm_ext(x)
x
## $brts
## [,1] [,2] [,3]
## [1,] 0.47419869 0.7010891 2
## [2,] 0.87832436 0.9539585 2
## [3,] 0.02971179 0.2054646 2
## [4,] 0.54544109 2.0000000 NA
##
## $to
## [,1] [,2] [,3]
## [1,] 1 0 2
## [2,] 1 0 2
## [3,] 1 0 2
## [4,] 1 2 NA
##
## $t.ext
## [,1] [,2] [,3]
## [1,] 0.7010891 Inf Inf
## [2,] 0.9539585 Inf Inf
## [3,] 0.2054646 Inf Inf
## [4,] Inf Inf NA
tm = x$brts[x$to[,1]==1,]
tm
## [,1] [,2] [,3]
## [1,] 0.47419869 0.7010891 2
## [2,] 0.87832436 0.9539585 2
## [3,] 0.02971179 0.2054646 2
## [4,] 0.54544109 2.0000000 NA
x.obs<-observe(x)
x.obs
## [,1] [,2]
## [1,] 2.0000000 NA
## [2,] 2.0000000 NA
## [3,] 2.0000000 NA
## [4,] 0.5454411 2
x.ext<-g.sim(x.obs,M=M)
x.ext
## $brts
## [,1] [,2] [,3]
## [1,] 1.1208491 1.785228 2
## [2,] 2.0000000 NA NA
## [3,] 2.0000000 NA NA
## [4,] 0.5454411 2.000000 NA
##
## $to
## [,1] [,2] [,3]
## [1,] 1 0 2
## [2,] 2 NA NA
## [3,] 2 NA NA
## [4,] 2 2 NA
##
## $t.ext
## [,1] [,2] [,3]
## [1,] 0.6643785 Inf Inf
## [2,] Inf NA NA
## [3,] Inf NA NA
## [4,] Inf Inf NA
exp(g(x.ext,M))
## [1] 0.04739422 0.50000000 0.50000000 1.00000000
g1(x.ext$brts,x.obs,CT)
## [1] 0.04739422 0.50000000 0.50000000 1.00000000
time = proc.time()
CT=2
lambda<-2.1
n.sim<-40
mu <- seq(0.001,2,length=n.sim)
x.obs<-c(CT,NA)
n.gsim <- 1000000
f.hat<-NULL
f.true<-NULL
f.true2<-NULL
f.hat2 <- NULL
ws<-NULL
M<-1
x.ext<-g.sim(x.obs,n=n.gsim,M=M)
gs<-g(x.ext,M = M)
for (i in 1:n.sim){
x.ext<-g.sim(x.obs,n=n.gsim,M=M) #We could do this outside loop: Would introduce bias
gs<-g(x.ext,M = M) #but reduce variance and speed up calculations
fs <- f.joint(x.ext,lambda=lambda,mu=mu[i])
ws[[i]] <- exp(fs-gs)
f.hat[i] <- mean(ws[[i]])
f.true[i]<- f.marg(x.obs,lambda,mu[i])
f.true2[i] <- exp(DDD::dd_loglik(pars1=c(lambda,mu[i],9e+99),pars2=pars2,brts=x.obs,missnumspec = 0,))
}
plot(mu,f.true,type="l")
#plot(mu,f.hat,type="l")
points(mu,f.hat)
points(f.true2)
get.time(time)
## [1] 33674.64
For the case when only 1 speciation is allowed, we have that the marginal distribution of the observed tree \(y\) is
\[\begin{equation} f(y|\theta) = \begin{cases} \lambda e^{-2\lambda t_1} e^{-3\mu (T- t_1)} & \text{if } brts(y) = (t_1, T) \\ 1 + \frac{e^{3\mu T}}{3\mu-2\lambda}(1-e^{(3\mu-2\lambda)T})\lambda & \text{if } brts(y) = T \end{cases} \end{equation}\]To extend this marginal function to the case of \(M\) allowed speciations we, for now, focuss on \(brts(y) = T\).
To calculate a marginal function we first need to set all possible topology combinations
get.topologies <- function(M){
TO = matrix(nrow=2*M,ncol=1)
TO[1,1] = 1
for(i in 2:(2*M)){
comb = ncol(TO)
for(j in 1:comb){
ns = sum(no.na(TO[,j]))
ne = sum(1-no.na(TO[,j]))
if(ns < M & ns > ne){ #extinction or speciation
TO[i,j] = 1
TO = cbind(TO,matrix(TO[,j],ncol=1))
TO[i,ncol(TO)] = 0
}
if(ns == M & ns > ne){ #extinction
TO[i,j] = 0
}
if(ns < M & ns == ne){ #speciation
TO[i,j] = 1
}
}
}
return(TO)
}
dim=NULL
ti = NULL
TO = list()
for(i in 1:11){
time = proc.time()
to = get.topologies(i)
TO[[i]] = to
dim[i] = ncol(get.topologies(i))
ti[i] = get.time(time)
}
qplot(1:11,dim)
qplot(1:11,ti)