4Jan2019

Calulating probabilities of trees spaces via counting topologies and uniform sampling

Following previous notation we refer to \(\mathcal{X}_k(y)\) to the set of complete trees that are in agreement with the observed tree \(y\) and has \(k\) missing species. So,

\[ \mathcal{X}(y) = \cup_{k=0}^\infty \mathcal{X}_k(y)\]

Moreover, we know that

\[ \mathcal{X}_k(y) = \mathcal{T}_k \times \mathbb{R}^{2*d} \]

where \(\mathcal{T}_k\) is the set of possible sequences of missing events, the size of \(\mathcal{T}_k\) set is given by the catalan number

\[ c_k = \frac{1}{k+1}\binom{2k}{k} \]

Counting topologies and Uniform sampling

To calculate the probability that the resconstructed tree \(y\) has \(k\) missing species we follow the uniform algorithm

Calculate \(\mathcal{T}_k\), count all elements on it.
sample \(s\) uniformly from \(\mathcal{T}_k\)
Sample \(2k\) uniform branching times \(u_1,...,u_{2k}\) in \([0,T]\), sort them and then assign them speciation or extinction status according to \(s\)

We then consider the probability of sampling a tree \(x_i\) on this squeme which is

\[ g_k(x_i|y) = \frac{1}{c_n}\left(\frac{1}{T}\right)^{2d} (2d)! \]

Base on that, we can approximate the probability that the reconstructed tree \(y\) has \(k\) missing species

\[ f(x \in \mathcal{X}_k|y) \approx \frac{1}{m}\sum_{x_i \sim g_k} \frac{f(x_i,y)}{g_k(x_i|y)} = \frac{c_nT^{2d}}{(2d)!m}\sum_{x_i \sim g_k} f(x_i,y)\]

We can test that:

get.k.tops <- function(d,obt,nsim=10){
  #print(paste("Getting topologies for", d," missing species"))
  top = get.topologies(d)
  sumofllik = vector(mode="numeric",length=ncol(top))
  #print(paste("Drawing uniform trees for ",ncol(top),"topologies"))
  for(j in 1:ncol(top)){
    topi = top[,j]
    topi = replicate(nsim,topi,simplify = F)
    ntree = lapply(topi,sim.unif.tree.df,obt=obt)  # simulate unif random tree given topology
    nlliks = lapply(ntree,weight.unif,pars=pars,d=d,ct=max(obt))
    sumofllik[j] = Reduce("+", nlliks)
  }
  sumofllik = sum(sumofllik)
  return(sumofllik/(ncol(top)*nsim))
}

weight.unif <- function(tree,pars,d,ct){
  lik.tree(pars,tree,initspec = 1,topology = F)*((catalan(d)*(ct*(2*d)))/(factorial(2*d)))
}


sim.unif.mtree <- function(top,obt){
  nmbt = length(top) #length of missing bt
  mbt = sort(runif(nmbt,min=0,max=max(obt)))
  df = data.frame(bt=c(mbt,obt),to=c(top,rep(2,length(obt))))
  df = df[order(df$bt),]
  return(df)
}

sim.unif.tree.df <- function(top,obt){
  df = df2tree2(sim.unif.mtree(top,obt))
  return(df)
}

time = proc.time()
pars = c(0.2,0.1)
### for comparison with rampal method 
obt = 1
pars = c(pars,Inf)
dim = 7
int.d = vector(mode="numeric",length = 9)
print(paste("for 0 missing species the probability is",exp(-nllik.tree(pars,tree=list(wt=diff(c(0,obt)),to=rep(1,length(obt)-1)),initspec = 1,topology = F))))

## [1] "for 0 missing species the probability is 0.740818220681718"

###################

next think is to check probabilities of \(\mathcal{X}_1,\mathcal{X}_2,...,\mathcal{X}_M\)

for(i in 1:dim){
  int.d[i] = get.k.tops(nsim=100,obt=obt,d = i)
}

sum(int.d)

## [1] 0.01015096

int.d

## [1] 1.008924e-02 6.157530e-05 1.440317e-07 1.814700e-10 1.427348e-13
## [6] 7.772776e-17 3.090966e-20 0.000000e+00 0.000000e+00

get.time(time)

## [1] 16.944

dd_loglik_rhsEA2009 <- function(t,x,pars)
{
  la <- pars[1]
  mu <- pars[2]
  nx <- sqrt(length(x))
  dim(x) <- c(nx,nx)
  xx <- matrix(0,nx+2,nx+2)
  xx[2:(nx+1),2:(nx+1)] <- x
  nl <- t(array(0:(nx - 1), dim = c(nx,nx)))
  dx <- mu * (nl + 1) * xx[1:nx,3:(nx+2)] + la * (nl - 1) * xx[2:(nx+1),1:(nx)] - (la + mu) * nl * xx[2:(nx+1),2:(nx+1)]
  dim(dx) <- c(nx^2,1)
  return(list(dx))
}

dd_loglik_EA2009 <- function(pars,lx,age)
{
  p <- array(0,dim = c(lx,lx))
  p[1,2] <- 1
  dim(p) <- c(lx^2,1)
  methode <- 'ode45'
  rtol <- 1E-10
  atol <- 1E-10
  y <- ode(p,c(0,age),dd_loglik_rhsEA2009,parms = pars,rtol = rtol,atol = atol,method = methode)[2,2:(lx^2 + 1)]
  dim(y) <- c(lx,lx)
  return(y)
}

lx <- 9
age <- 1
pars <- c(0.2,0.1)
pNENL <- dd_loglik_EA2009(pars = pars,lx = lx,age = age)
pNENL

##               [,1]         [,2]         [,3]         [,4]         [,5]
##  [1,] 0.000000e+00 7.408182e-01 1.280044e-01 2.211760e-02 3.821653e-03
##  [2,] 8.639393e-02 1.343950e-02 3.057232e-03 6.552600e-04 1.351663e-04
##  [3,] 4.961060e-04 1.609181e-04 4.633716e-05 1.193801e-05 2.868076e-06
##  [4,] 3.508038e-06 1.628988e-06 5.727376e-07 1.728352e-07 4.742042e-08
##  [5,] 2.518233e-08 1.512068e-08 6.303184e-09 2.182868e-09 6.734557e-10
##  [6,] 1.810672e-10 1.332543e-10 6.436058e-11 2.516934e-11 8.114057e-12
##  [7,] 1.302228e-12 1.134702e-12 6.236493e-13 2.566033e-13 8.008002e-14
##  [8,] 9.366570e-15 9.430019e-15 5.489566e-15 2.203077e-15 6.394020e-16
##  [9,] 6.737713e-17 7.260250e-17 4.148291e-17 1.558860e-17 4.154739e-18
##               [,6]         [,7]         [,8]         [,9]
##  [1,] 6.603353e-04 1.140979e-04 1.971473e-05 3.406463e-06
##  [2,] 2.714698e-05 5.345858e-06 1.036908e-06 1.450355e-07
##  [3,] 6.564956e-07 1.450035e-07 2.707409e-08 3.177024e-09
##  [4,] 1.218039e-08 2.724257e-09 4.600150e-10 4.644674e-11
##  [5,] 1.788531e-10 3.753164e-11 5.642514e-12 4.995628e-13
##  [6,] 2.062174e-12 3.934412e-13 5.278509e-14 4.158588e-15
##  [7,] 1.875478e-14 3.233062e-15 3.900663e-16 2.766767e-17
##  [8,] 1.365462e-16 2.134309e-17 2.335248e-18 1.505494e-19
##  [9,] 8.098929e-19 1.155020e-19 1.154933e-20 6.819864e-22

pNL <- colSums(pNENL)
ENL <- (0:(lx -1)) %*% pNL
pNL

## [1] 8.689357e-02 7.544203e-01 1.311085e-01 2.278498e-02 3.959736e-03
## [6] 6.881511e-04 1.195915e-04 2.077918e-05 3.554723e-06

print(sum(pNENL))

## [1] 0.9999992

pNENL[,1]

## [1] 0.000000e+00 8.639393e-02 4.961060e-04 3.508038e-06 2.518233e-08
## [6] 1.810672e-10 1.302228e-12 9.366570e-15 6.737713e-17

sum(pNENL[,1])

## [1] 0.08689357

comparing

int.d

## [1] 1.008924e-02 6.157530e-05 1.440317e-07 1.814700e-10 1.427348e-13
## [6] 7.772776e-17 3.090966e-20 0.000000e+00 0.000000e+00

pNENL[,2]

## [1] 7.408182e-01 1.343950e-02 1.609181e-04 1.628988e-06 1.512068e-08
## [6] 1.332543e-10 1.134702e-12 9.430019e-15 7.260250e-17

Checking \(\mathcal{X}_1(y)\)

Xd.mc <- function(d,obt,nsim=10,pars){
  #print(paste("Getting topologies for", d," missing species"))
  top = get.topologies(d)
  sumofllik = vector(mode="numeric",length=ncol(top))
  #print(paste("Drawing uniform trees for ",ncol(top),"topologies"))
  for(j in 1:ncol(top)){
    topi = top[,j]
    topi = replicate(nsim,topi,simplify = F)
    ntree = lapply(topi,sim.unif.tree.df,obt=obt)  # simulate unif random tree given topology
    nlliks = lapply(ntree,weight.unif,pars=pars,d=d,ct=max(obt))
    sumofllik[j] = Reduce("+", nlliks)
  }
  sumofllik = sum(sumofllik)
  return(sumofllik/(ncol(top)*nsim))
}


weight.unif <- function(tree,pars,d,ct){
  lik.tree(pars,tree,initspec = 1,topology = F)*((catalan(d)*(ct*(2*d)))/(factorial(2*d)))
}


sim.unif.mtree <- function(top,obt){
  nmbt = length(top) #length of missing bt
  mbt = sort(runif(nmbt,min=0,max=max(obt)))
  df = data.frame(bt=c(mbt,obt),to=c(top,rep(2,length(obt))))
  df = df[order(df$bt),]
  return(df)
}

sim.unif.tree.df <- function(top,obt){
  df = df2tree2(sim.unif.mtree(top,obt))
  return(df)
}

pars = c(pars,Inf)
Xd.mc(d=1,obt = 1,nsim = 10,pars = pars)

## [1] 0.01011338

Xd.mc(d=1,obt = 1,nsim = 10,pars = pars)

## [1] 0.01006089

Xd.mc(d=1,obt = 1,nsim = 10,pars = pars)

## [1] 0.009924435

Xd.mc(d=1,obt = 1,nsim = 100,pars = pars)

## [1] 0.009987545

Xd.mc(d=1,obt = 1,nsim = 1000,pars = pars)

## [1] 0.009990221

Xd.mc(d=1,obt = 1,nsim = 1000,pars = pars)

## [1] 0.009889434