The M-dimension truncation problem

and the R code

sim.extinct.truncdim <- function(brts,M){
  brts = no.na(brts)
  R = M - (length(brts)-1) # maximum possible number of extinctions
  Ne = sample((0:R),1)
  ms = runif(Ne,min=0,max=brts[1])
  me = runif(Ne,min=ms,max=brts[1])
  to = c(rep(1,Ne),rep(0,Ne))
  df = data.frame(bt = c(ms,me),bte=c(me,rep(Inf,Ne)),to=to)
  df = rbind(df,data.frame(bt=brts,to=rep(2,length(brts)),bte=rep(Inf,length(brts))))
  df = df[order(df$bt),]
  df$t.ext = df$bte-df$bt
  return(df)
}

Data augmentation density

The sampling probability would be then

\[ g(x|y) = \frac{1}{R+1}\left(\frac{1}{T} \right)^{N_e} \prod_{i=1}^{N_e} \frac{1}{T-T_i} \prod_{j=1}^{2N_e} \frac{1}{n_j} \] or

\[ log(g(x|y)) = -log(R+1)-N_elog(T)-\sum_{i=1}^{N_e}log(T-T_i)-\sum_{j=1}^{2N_e}log(n_{j}) \] that is representated on the following code

g_samp <- function(df,M){
  last = dim(df)[1]
  ct = df[last,1] # crown time T
  Ng = sum(df$to==2) - 1 # number of observed speciations (or given data)
  R = M - Ng # number of allowed extinct speciations
  subdf = df[df$to==1,] # missing speciations
  Ne = dim(subdf)[1] # number of extinctions
  Ts = ct-subdf$bt # extinction times
  tempto = df$to 
  to = df$to[-last]
  to[to==2] = 1
  n = c(2,2+cumsum(to)+cumsum(to-1)) # number of species at every waiting time
  n = n[tempto==1 | tempto==0] # number of species when missing speciations and extinctions happened
  if(R==0){
    log_dens = 0
  }else{
    log_dens = - log(R+1) - Ne*log(ct) - sum(log(Ts)) - sum(log(n))
  }
  return(log_dens)
}

Example: M=1

On the case when only one speciation is allowed:

\[\begin{equation} g(x|y) = \begin{cases} 1 & \text{if } R=0 \\ & \text{if } R>0 \\ 0.5 & \text{if not missing species}\\ \frac{1}{2}\frac{1}{T}\frac{1}{T-T_1}\frac{1}{2}\frac{1}{3} & \text{if one missing species at time } T_1 \end{cases} \end{equation}\]
g1 <- function(ta,to,CT){
  ans<-NULL
  n<-nrow(ta)
  if (is.vector(to)){
    to<-matrix(rep(to,each=n),ncol=2)
  }
  for (i in 1:n){
    if (!is.na(to[i,2])){
      ans[i] = 1
    } else {
      if (ta[i,1]==CT){
        ans[i]=0.5
      } else {
        ans[i] = 0.5*dunif(ta[i,1],min=0,max = CT)*dunif(ta[i,2],min=ta[i,1],max = CT)/6
      }
    }
  }
  return(ans)
}

we can check if they match

set.seed(3)
n.sim = 5
lambda = 1
mu = 0.3
CT = 2
M = 1
x<-f.sim(n.sim,lambda = lambda, mu=mu, CT=CT,M = M)
x = rm_ext(x)
x
## $brts
##            [,1]      [,2] [,3]
## [1,] 0.47419869 0.7010891    2
## [2,] 0.87832436 0.9539585    2
## [3,] 0.02971179 0.2054646    2
## [4,] 0.54544109 2.0000000   NA
## 
## $to
##      [,1] [,2] [,3]
## [1,]    1    0    2
## [2,]    1    0    2
## [3,]    1    0    2
## [4,]    1    2   NA
## 
## $t.ext
##           [,1] [,2] [,3]
## [1,] 0.7010891  Inf  Inf
## [2,] 0.9539585  Inf  Inf
## [3,] 0.2054646  Inf  Inf
## [4,]       Inf  Inf   NA
tm = x$brts[x$to[,1]==1,]
tm
##            [,1]      [,2] [,3]
## [1,] 0.47419869 0.7010891    2
## [2,] 0.87832436 0.9539585    2
## [3,] 0.02971179 0.2054646    2
## [4,] 0.54544109 2.0000000   NA
x.obs<-observe(x)
x.obs
##           [,1] [,2]
## [1,] 2.0000000   NA
## [2,] 2.0000000   NA
## [3,] 2.0000000   NA
## [4,] 0.5454411    2
x.ext<-g.sim(x.obs,M=M)
x.ext
## $brts
##           [,1]     [,2] [,3]
## [1,] 1.1208491 1.785228    2
## [2,] 2.0000000       NA   NA
## [3,] 2.0000000       NA   NA
## [4,] 0.5454411 2.000000   NA
## 
## $to
##      [,1] [,2] [,3]
## [1,]    1    0    2
## [2,]    2   NA   NA
## [3,]    2   NA   NA
## [4,]    2    2   NA
## 
## $t.ext
##           [,1] [,2] [,3]
## [1,] 0.6643785  Inf  Inf
## [2,]       Inf   NA   NA
## [3,]       Inf   NA   NA
## [4,]       Inf  Inf   NA
exp(g(x.ext,M))
## [1] 0.04739422 0.50000000 0.50000000 1.00000000
g1(x.ext$brts,x.obs,CT)
## [1] 0.04739422 0.50000000 0.50000000 1.00000000
Check if the approximation works varying mu
time = proc.time()
CT=2
lambda<-2.1
n.sim<-40
mu <- seq(0.001,2,length=n.sim)
x.obs<-c(CT,NA)
n.gsim <- 1000000
f.hat<-NULL
f.true<-NULL
f.true2<-NULL
f.hat2 <- NULL
ws<-NULL
M<-1
x.ext<-g.sim(x.obs,n=n.gsim,M=M)
gs<-g(x.ext,M = M)
for (i in 1:n.sim){
  x.ext<-g.sim(x.obs,n=n.gsim,M=M)  #We could do this outside loop: Would introduce bias
  gs<-g(x.ext,M = M)                  #but reduce variance and speed up calculations
  fs <- f.joint(x.ext,lambda=lambda,mu=mu[i])
  ws[[i]] <- exp(fs-gs)
  f.hat[i] <- mean(ws[[i]])
  f.true[i]<- f.marg(x.obs,lambda,mu[i])
  f.true2[i] <- exp(DDD::dd_loglik(pars1=c(lambda,mu[i],9e+99),pars2=pars2,brts=x.obs,missnumspec = 0,))
}
plot(mu,f.true,type="l")
#plot(mu,f.hat,type="l")
points(mu,f.hat)
points(f.true2)

get.time(time)
## [1] 33674.64

Marginal distribution of observed trees

For the case when only 1 speciation is allowed, we have that the marginal distribution of the observed tree \(y\) is

\[\begin{equation} f(y|\theta) = \begin{cases} \lambda e^{-2\lambda t_1} e^{-3\mu (T- t_1)} & \text{if } brts(y) = (t_1, T) \\ 1 + \frac{e^{3\mu T}}{3\mu-2\lambda}(1-e^{(3\mu-2\lambda)T})\lambda & \text{if } brts(y) = T \end{cases} \end{equation}\]

To extend this marginal function to the case of \(M\) allowed speciations we, for now, focuss on \(brts(y) = T\).

To calculate a marginal function we first need to set all possible topology combinations

get.topologies <- function(M){
  TO = matrix(nrow=2*M,ncol=1)
  TO[1,1] = 1
  for(i in 2:(2*M)){
    comb = ncol(TO)
    for(j in 1:comb){
      ns = sum(no.na(TO[,j]))
      ne = sum(1-no.na(TO[,j]))
      if(ns < M & ns > ne){ #extinction or speciation
        TO[i,j] = 1
        TO = cbind(TO,matrix(TO[,j],ncol=1))
        TO[i,ncol(TO)] = 0
      }
      if(ns == M & ns > ne){ #extinction
        TO[i,j] = 0
      }
      if(ns < M & ns == ne){ #speciation
        TO[i,j] = 1
      }
    }
  }
  return(TO)
}


dim=NULL
ti = NULL
TO = list()
for(i in 1:11){
  time = proc.time()
  to = get.topologies(i)
  TO[[i]] = to
  dim[i] = ncol(get.topologies(i))
  ti[i] = get.time(time)
}
qplot(1:11,dim)

qplot(1:11,ti)