First we create the functions we are going to use

#simulate segments, the output are waiting times, topology and observed waiting times
sim.seg <- function(lambda,p,n=1){
  to = NULL
  t = NULL
  obs = NULL
  for(i in 1:n){
    to1=0
    while(to1 == 0){
      to1 = rbinom(1,1,p)
      to = c(to,to1)
      t = c(t,rexp(1,lambda))
    }
    obs = c(obs,sum(t)-sum(obs))
  }
  return(list(t=t,to=to,obs=obs))
}

# simulate missing data given the observed segment
rec.seg <- function(obs,lambda,p){
  to=NULL
  t=NULL
  for(i in 1:length(obs)){
    Dt = obs[i]
    n = rpois(1,(1-p)*lambda*Dt)
    it = sort(runif(n,min=0,max=Dt)) #inter-arrival times
    t = c(t,diff(c(0,it,Dt)))
    to = c(to,rep(0,times=n),1)
  }
  return(list(t=t,to=to))
}

#calculate the mle for a set of segments
mle.ss <- function(S){
  m = length(S)
  n = vector(mode='numeric',length = m)
  d = vector(mode='numeric',length = m)
  dt = vector(mode='numeric',length = m)
  for(i in 1:m){
    rec = S[[i]]
    d[i] = length(rec$t)
    n[i] = sum(rec$to)
    dt[i] = sum(rec$t)
  }
  lambda.mle = sum(d)/sum(dt)
  p.mle = sum(n)/sum(d)
  return(list(lambda.mle=lambda.mle,p.mle=p.mle))
}

# monte-carlo sampling of size m
sim.srs <- function(Dt,lambda,p,m){ 
  Rec = vector(mode = 'list',length = m)
  d = vector(mode='numeric',length = m)
  dt = vector(mode='numeric',length = m)
  for(j in 1:m){
    rec=rec.seg(Dt,lambda,p)
    Rec[[j]] = rec
  }
  return(Rec)
}

After that we can do some examples

Exercises 1. a random segment

lambda = 0.8
p = 0.5
#create random segment
seg = sim.seg(lambda = 0.8,p = 0.5)
#segment
seg 
## $t
## [1] 0.3044863
## 
## $to
## [1] 1
## 
## $obs
## [1] 0.3044863
#observed
Dt = seg$obs
m=10000
# monte-carlo sampling
R = sim.srs(Dt,lambda,p,m)
# mle
mle.ss(R)
## $lambda.mle
## [1] 3.690478
## 
## $p.mle
## [1] 0.8899172
# now with more segments
seg = sim.seg(lambda = 0.8,p = 0.5,n=25)
Dt=seg$obs
srs = sim.srs(Dt,lambda,p,m)
mle.ss(srs)
## $lambda.mle
## [1] 0.8661497
## 
## $p.mle
## [1] 0.5379375

Exercises 2. How good is the last iteration of MCEM

To perform this expetiment we run the following rutine l times

  1. simulate tree and get observed data
  2. MC sampling for the missing part
  3. estimate MLE and store
time = proc.time()
l=100
L=vector(mode = 'numeric',length=l)
P=vector(mode = 'numeric',length=l)
lambda=0.8
p=0.8
m=100
for(k in 1:l){
  seg = sim.seg(lambda = lambda, p = p,n=40)
  Dt = seg$obs
  srs = sim.srs(Dt,lambda,p,m)
  mle=mle.ss(srs)
  L[k] = mle$lambda.mle
  P[k] = mle$p.mle
}
print(proc.time()-time)
##    user  system elapsed 
##  18.804   0.004  18.805
summary(L)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.6213  0.7267  0.8026  0.8142  0.8880  1.1734
summary(P)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.7361  0.7787  0.8046  0.7991  0.8195  0.8598

Quite accurate estimations, now let´s observe another with a lot of missing data

time = proc.time()
l=100
L=vector(mode = 'numeric',length=l)
P=vector(mode = 'numeric',length=l)
lambda=0.8
p=0.01
m=100
for(k in 1:l){
  seg = sim.seg(lambda = lambda, p = p,n=40)
  Dt = seg$obs
  srs = sim.srs(Dt,lambda,p,m)
  mle=mle.ss(srs)
  L[k] = mle$lambda.mle
  P[k] = mle$p.mle
}
print(proc.time()-time)
##    user  system elapsed 
##  42.408   0.024  42.434
summary(L)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.7958  0.7988  0.8003  0.8004  0.8016  0.8080
summary(P)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.006423 0.009104 0.010308 0.010391 0.011778 0.016563

In general they are not biased and accurate. We can change the mc sampling size \(m\), the length of the tree \(n\) or the parameters, but they are more or less working fine. Now let´s see how the MCEM works.

MCEM

mcem.light <- function(Dt,lambda,p,nit=100){
  m=2
  L=vector(mode = 'numeric',length=l)
  P=vector(mode = 'numeric',length=l)
  for(k in 1:l){
  srs = sim.srs(Dt,lambda,p,m)
  mle=mle.ss(srs)
  L[k] = lambda = mle$lambda.mle
  P[k] = p = mle$p.mle
  m=m+1
  }
  return(list(L=L,P=P))
}

From now onwards I perform several MCEM routines to observe how is working with different input parameters.

time = proc.time()
l=300
lambda=0.8
p=0.5
seg = sim.seg(lambda = lambda, p = p,n=50)
Dt = seg$obs
mcem = mcem.light(Dt,lambda,p,m)
print(proc.time()-time)
##    user  system elapsed 
## 104.456   0.012 104.465
qplot(1:l,mcem$L) + geom_hline(yintercept = lambda)

qplot(1:l,mcem$P) +  geom_hline(yintercept = p)

time = proc.time()
l=300
lambda=0.8
p=0.5
#seg = sim.seg(lambda = 0.8, p = 0.5,n=50)
Dt = seg$obs
mcem = mcem.light(Dt,lambda,p,m)
print(proc.time()-time)
##    user  system elapsed 
## 107.852   0.004 107.853
qplot(1:l,mcem$L) + geom_hline(yintercept = lambda)

qplot(1:l,mcem$P) +  geom_hline(yintercept = p)

time = proc.time()
l=300
lambda=0.8
p=0.5
#seg = sim.seg(lambda = 0.8, p = 0.5,n=50)
Dt = seg$obs
mcem = mcem.light(Dt,lambda,p,m)
print(proc.time()-time)
##    user  system elapsed 
## 108.536   0.000 108.531
qplot(1:l,mcem$L) + geom_hline(yintercept = lambda)

qplot(1:l,mcem$P) +  geom_hline(yintercept = p)

time = proc.time()
l=300
lambda=0.8
p=0.5
#seg = sim.seg(lambda = 0.8, p = 0.5,n=50)
Dt = seg$obs
mcem = mcem.light(Dt,lambda,p,m)
print(proc.time()-time)
##    user  system elapsed 
## 107.212   0.004 107.218
qplot(1:l,mcem$L) + geom_hline(yintercept = lambda)

qplot(1:l,mcem$P) +  geom_hline(yintercept = p)

time = proc.time()
l=300
lambda=0.8
p=0.5
#seg = sim.seg(lambda = 0.8, p = 0.5,n=50)
Dt = seg$obs
mcem = mcem.light(Dt,lambda,p,m)
print(proc.time()-time)
##    user  system elapsed 
## 107.140   0.000 107.136
qplot(1:l,mcem$L) + geom_hline(yintercept = lambda)

qplot(1:l,mcem$P) +  geom_hline(yintercept = p)

time = proc.time()
l=300
lambda=0.8
p=0.1
seg = sim.seg(lambda = lambda, p = p,n=50)
Dt = seg$obs
mcem = mcem.light(Dt,lambda,p,m)
print(proc.time()-time)
##    user  system elapsed 
## 122.004   0.000 121.995
qplot(1:l,mcem$L) + geom_hline(yintercept = lambda)

qplot(1:l,mcem$P) +  geom_hline(yintercept = p)

time = proc.time()
l=300
lambda=0.8
p=0.1
#seg = sim.seg(lambda = 0.8, p = 0.5,n=50)
Dt = seg$obs
mcem = mcem.light(Dt,lambda,p,m)
print(proc.time()-time)
##    user  system elapsed 
## 122.104   0.008 122.113
qplot(1:l,mcem$L) + geom_hline(yintercept = lambda)

qplot(1:l,mcem$P) +  geom_hline(yintercept = p)

time = proc.time()
l=300
lambda=0.8
p=0.9
seg = sim.seg(lambda = lambda, p = p,n=50)
Dt = seg$obs
mcem = mcem.light(Dt,lambda,p,m)
print(proc.time()-time)
##    user  system elapsed 
## 103.856   0.000 103.855
qplot(1:l,mcem$L) + geom_hline(yintercept = lambda)

qplot(1:l,mcem$P) +  geom_hline(yintercept = p)

time = proc.time()
l=300
lambda=0.8
p=0.5
#seg = sim.seg(lambda = 0.8, p = 0.5,n=50)
Dt = seg$obs
mcem = mcem.light(Dt,lambda,p,m)
print(proc.time()-time)
##    user  system elapsed 
## 105.464   0.000 105.465
qplot(1:l,mcem$L) + geom_hline(yintercept = lambda)

qplot(1:l,mcem$P) +  geom_hline(yintercept = p)

time = proc.time()
l=300
lambda=0.8
p=0.99
seg = sim.seg(lambda = 0.8, p = 0.5,n=50)
Dt = seg$obs
mcem = mcem.light(Dt,lambda,p,m)
print(proc.time()-time)
##    user  system elapsed 
## 102.188   0.000 102.189
qplot(1:l,mcem$L) + geom_hline(yintercept = lambda)

qplot(1:l,mcem$P) +  geom_hline(yintercept = p)

m=100
for(k in 1:l){
  srs = sim.srs(Dt,lambda,p,m)
  mle=mle.ss(srs)
  L[k] = lambda = mle$lambda.mle
  P[k] = p = mle$p.mle
}
print(proc.time()-time)
##    user  system elapsed 
##  49.056   0.024  49.092
qplot(1:length(P),L) + geom_hline(yintercept = 0.8)

qplot(1:length(P),P) +  geom_hline(yintercept = 0.5)

P*L
##   [1] 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474
##   [8] 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474
##  [15] 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474
##  [22] 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474
##  [29] 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474
##  [36] 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474
##  [43] 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474
##  [50] 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474
##  [57] 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474
##  [64] 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474
##  [71] 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474
##  [78] 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474
##  [85] 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474
##  [92] 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474
##  [99] 0.3568474 0.3568474
length(Dt)/sum(Dt)
## [1] 0.3568474
m=10
lambda=1.2
p=0.1
for(k in 1:l){
  srs = sim.srs(Dt,lambda,p,m)
  mle=mle.ss(srs)
  L[k] = lambda = mle$lambda.mle
  P[k] = p = mle$p.mle
}
print(proc.time()-time)
##    user  system elapsed 
##   2.320   0.076   2.433
qplot(1:length(P),L) + geom_hline(yintercept = 0.8)

qplot(1:length(P),P) +  geom_hline(yintercept = 0.5)

P*L
##   [1] 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474
##   [8] 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474
##  [15] 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474
##  [22] 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474
##  [29] 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474
##  [36] 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474
##  [43] 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474
##  [50] 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474
##  [57] 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474
##  [64] 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474
##  [71] 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474
##  [78] 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474
##  [85] 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474
##  [92] 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474 0.3568474
##  [99] 0.3568474 0.3568474
length(Dt)/sum(Dt)
## [1] 0.3568474
time = proc.time()
l=100
L=vector(mode = 'numeric',length=l)
P=vector(mode = 'numeric',length=l)
lambda=0.8
p=0.5
m=10
seg = sim.seg(lambda = 0.8, p = 0.5,n=100)
Dt = seg$obs
for(k in 1:l){
  srs = sim.srs(Dt,lambda,p,m)
  mle=mle.ss(srs)
  L[k] = lambda = mle$lambda.mle
  P[k] = p = mle$p.mle
}
print(proc.time()-time)
##    user  system elapsed 
##   5.160   0.000   5.168
qplot(1:length(P),L) + geom_hline(yintercept = 0.8)

qplot(1:length(P),P) +  geom_hline(yintercept = 0.5)

qplot(1:length(P),P*L) +  geom_hline(yintercept = 0.5*0.8)

time = proc.time()
l=100
L=vector(mode = 'numeric',length=l)
P=vector(mode = 'numeric',length=l)
lambda=0.8
p=0.5
m=10
seg = sim.seg(lambda = 0.8, p = 0.5,n=100)
Dt = seg$obs
for(k in 1:l){
  srs = sim.srs(Dt,lambda,p,m)
  mle=mle.ss(srs)
  L[k] = lambda = mle$lambda.mle
  P[k] = p = mle$p.mle
}
print(proc.time()-time)
##    user  system elapsed 
##   5.792   0.000   5.805
qplot(1:length(P),L) + geom_hline(yintercept = 0.8)

qplot(1:length(P),P) +  geom_hline(yintercept = 0.5)

qplot(1:length(P),P*L) +  geom_hline(yintercept = 0.5*0.8)

time = proc.time()
l=100
L=vector(mode = 'numeric',length=l)
P=vector(mode = 'numeric',length=l)
lambda=0.8
p=0.5
m=100
seg = sim.seg(lambda = 0.8, p = 0.5,n=100)
Dt = seg$obs
for(k in 1:l){
  srs = sim.srs(Dt,lambda,p,m)
  mle=mle.ss(srs)
  L[k] = lambda = mle$lambda.mle
  P[k] = p = mle$p.mle
}
print(proc.time()-time)
##    user  system elapsed 
##  48.676   0.008  48.728
qplot(1:length(P),L) + geom_hline(yintercept = 0.8)

qplot(1:length(P),P) +  geom_hline(yintercept = 0.5)

qplot(1:length(P),P*L) +  geom_hline(yintercept = 0.5*0.8)

time = proc.time()
l=1000
L=vector(mode = 'numeric',length=(l+1))
P=vector(mode = 'numeric',length=(l+1))
L[1] = lambda=0.8
P[1] = p = 0.5
m=10
seg = sim.seg(lambda = 0.8, p = 0.1,n=100)
Dt = seg$obs
for(k in 1:l){
  srs = sim.srs(Dt,lambda,p,m)
  mle=mle.ss(srs)
  L[k+1] = lambda = mle$lambda.mle
  P[k+1] = p = mle$p.mle
}
print(proc.time()-time)
##    user  system elapsed 
##  53.200   0.000  53.271
qplot(1:length(P),L) + geom_hline(yintercept = 0.8)

qplot(1:length(P),P) +  geom_hline(yintercept = 0.1)

qplot(1:length(P),P*L) +  geom_hline(yintercept = 0.1*0.8)

qplot(1:(length(P)-1),diff(L)) 

time = proc.time()
l=100
L=vector(mode = 'numeric',length=(l+1))
P=vector(mode = 'numeric',length=(l+1))
L[1] = lambda=0.8
P[1] = p = 0.5
m=100
seg = sim.seg(lambda = 0.8, p = 0.1,n=100)
Dt = seg$obs
for(k in 1:l){
  srs = sim.srs(Dt,lambda,p,m)
  mle=mle.ss(srs)
  L[k+1] = lambda = mle$lambda.mle
  P[k+1] = p = mle$p.mle
}
print(proc.time()-time)
##    user  system elapsed 
##  55.272   0.004  55.303
qplot(1:length(P),L) + geom_hline(yintercept = 0.8)

qplot(1:length(P),P) +  geom_hline(yintercept = 0.1)

qplot(1:length(P),P*L) +  geom_hline(yintercept = 0.1*0.8)

qplot(1:(length(P)-1),diff(L)) 

NHPP

# NHPP
nhpp.missing <- function(pars,ct=15){
  lambda_0 = pars[1]
  mu_0 = pars[2]
  t_max     <- ct
  t         <- 0
  s         <- 0
  lambda <- function(t) lambda_0*(1-exp(-mu_0*(ct-t)))
  Lambda <- function(tupper) integrate(f = lambda, lower = 0, upper = tupper)$value
  Lambda_inv <- function(s){
    v <- seq(0,t_max+3, length.out = 1000)
    min(v[Vectorize(Lambda)(v)>=s])
  }
  X <- numeric(0)
  while(t <= t_max){
    u <- rexp(1)
    s <- s + u
    if(s > Lambda(ct)) break
    t <- Lambda_inv(s)
    X <- c( X, t)
  }
return(X)
}