I started this example to help a student implement an epidemic model where we wanted the dynamics (Euler steps) to occur on a faster time scale than (i.e., more frequently than) the observations. I ended up doing this by coding missing data/intermediate time steps as negative values and telling Stan to skip the predicted-observed comparison for those cases. I think this is actually a fairly straightforward way to deal with missing data; there is various discussion of how to model missing data in Stan, and I haven’t spent long enough with it to decide whether my approach is reasonable or not.
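The convention is simple enough to sketch in a couple of lines (hypothetical values; Stan data can’t contain NA, so an out-of-range sentinel stands in for “missing”):

## recode NA observations as -1 so the Stan model can skip them
## (the real observations are strictly positive)
obs <- c(1.2, NA, 2.3, NA, 3.1)
ifelse(is.na(obs), -1, obs)  ## 1.2 -1.0 2.3 -1.0 3.1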

In the process I had fun writing some “tidying” methods (i.e., for the broom package) for rstan and rjags models.

library("R2jags")
library("rstan")
options(mc.cores = parallel::detectCores())
rstan_options(auto_write = TRUE)
library("emdbook")  ## for perturb.params(): maybe unnecessary?
library("lattice")
library("plyr")
library("ggplot2"); theme_set(theme_bw())

## if necessary get broom package for tidying
## (BMB's development version has tidiers for 'rjags', 'rstan' objects)
if (FALSE) {
  library("devtools")
  install_github("bbolker/broom")
}
library("broom")

Simulation function:

simlogistdata <- function(seed=1001,
                          r=1, K=10, n0=1, t0=1, tot.t=10, dt=0.5,
                          sd.proc=1, sd.obs=1) {
    if (!is.null(seed)) set.seed(seed)
    tvec <- seq(1, tot.t, by=dt)
    n <- length(tvec)
    y <- numeric(n)     ## trajectory with process noise
    ydet <- numeric(n)  ## deterministic trajectory (computed but not returned)
    y[1] <- n0
    ydet[1] <- n0
    e.proc <- rnorm(n, mean=0, sd=sd.proc)
    e.obs <- rnorm(n, mean=0, sd=sd.obs)
    for (i in 2:n) {
        ydet[i] <- ydet[i-1]+r*ydet[i-1]*(1-ydet[i-1]/K)*dt
        y[i] <- y[i-1]+(r*y[i-1]*(1-y[i-1]/K))*dt  ## Euler step
        y[i] <- y[i] + e.proc[i-1]
        ## note we have to be careful here if we are doing anything
        ## non-trivial with dt; nothing is done to ensure that the
        ## variance of e.proc scales appropriately with dt ...
    }
    y.procobs <- y + e.obs  ## add observation noise
    return(cbind(tvec, y.procobs))
}
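As the comment in the loop warns, if we ever varied dt seriously the process-noise standard deviation should scale with \(\sqrt{dt}\) (as in an Euler–Maruyama scheme); a minimal sketch of that fix (my addition, not used below):

## hypothetical variant: make the process-noise *variance* scale
## with the step size dt (Euler-Maruyama-style)
e.proc <- rnorm(n, mean=0, sd=sd.proc*sqrt(dt))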
## now sim data ...
set.seed(102)
s1 <- simlogistdata(tot.t=10,dt=0.2)
plot(y.procobs~tvec,data=s1)

JAGS preliminaries. First the model definition:

model {
  ## initial values
  t[1] <- n0
  o[1] ~ dnorm(t[1],tau.obs)
  ## step through observations
  for (i in 2:N) {
     v[i] <- t[i-1]+r*t[i-1]*(1-t[i-1]/K)*dt
     t[i] ~ dnorm(v[i],tau.proc)
     o[i] ~ dnorm(t[i],tau.obs)
  }
  ## aux variables
  sd_proc <- 1/sqrt(tau.proc)
  sd_obs <- 1/sqrt(tau.obs)
  ## priors
  r ~ dunif(0.1,maxr)
  K ~ dgamma(0.005,0.005)
  tau.obs ~ dgamma(0.005,0.005)
  tau.proc ~ dgamma(0.005,0.005)
  n0 ~ dgamma(1,n0rate)
}
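The jags() call below reads the model from a file, so the block above has to be on disk; one way to get it there (jags_model_str is a hypothetical string holding the model text):

## write the model definition to the file that jags() will read;
## jags_model_str is assumed to hold the model block as a string
writeLines(jags_model_str, con="logist.bug")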
parameters <- c("r","K","sd_obs","sd_proc","n0")
inits <- perturb.params(
    list(n0=1,r=0.2,K=10,tau.obs=1,tau.proc=1),
    alt=list(r=c(0.1,0.4),tau.obs=3,tau.proc=3))

jagsdata <- list(o=s1[,2],dt=0.2,N=nrow(s1),n0rate=1,
                 maxr=5)

Run JAGS:

j1 <- jags(data=jagsdata,
           inits, param=parameters,
           model="logist.bug",
           n.chains=length(inits), n.iter=15000,
           progress.bar="none",  ## quiet ...
           n.burnin=5000,n.thin=37)
## module glm loaded

Diagnose (trace plots):

j1bugs <- as.mcmc.bugs(j1$BUGSoutput)
xyplot(j1bugs)

Not sure why the \(K\) chain here is so horrendous … I suspect that there’s a very flat region of parameter space, so that once the chain gets out there it can wander around a lot (i.e., the posterior for \(K=100\) isn’t that different from the posterior for \(K=20\)), and the prior isn’t strong enough to stop that.
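One way to test that hypothesis would be to tighten the prior on \(K\); here is a minimal moment-matching sketch (the target mean and standard deviation are hypothetical choices of mine):

## hypothetical: choose gamma parameters for K with mean 10, sd 5
## (for a gamma, mean = shape/rate and variance = shape/rate^2)
m <- 10; s <- 5
shape <- (m/s)^2   ## = 4
rate  <- shape/m   ## = 0.4
c(shape=shape, rate=rate)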

Stan

Now with Stan.

data {
    int<lower=0> N;
    real<lower=0> dt;
    real o[N];
}
parameters {
    real t[N];
    real<lower=0> sd_obs;
    real<lower=0> sd_proc;
    real<lower=0,upper=1000> K;
    real<lower=0> r;
    real<lower=0> n0;
}
model {
    real v[N];

    // initial values
    // can't set t[1] <- n0 as in JAGS
    // make n0 be the value *before* t[1] (not quite the same model)
    t[1] ~ normal(n0,sd_proc);
    o[1] ~ normal(t[1],sd_obs);
    // step through observations
    for (i in 2:N) {
        v[i] <- t[i-1]+r*t[i-1]*(1-t[i-1]/K)*dt; // Euler step
        t[i] ~ normal(v[i],sd_proc);
        if (o[i]>0) {  // missing data coded as negative
            o[i] ~ normal(t[i],sd_obs);
        }
    }
    // priors
    // (works quite badly with implicit/flat priors!)
    r ~ uniform(0.1,10);
    K ~ gamma(0.005,0.005);
    sd_obs ~ gamma(0.005,0.005);
    sd_proc ~ gamma(0.005,0.005);
    n0 ~ gamma(1,0.1);

}
// once this is working, try with missing values ...
## all default options: runs
## (the Stan program above is saved as "logist.stan"; note that this
##  reuses the name s1, previously the simulated-data matrix)
s1 <- stan(file="logist.stan",
           data=jagsdata[1:3],  ## just N, dt, o
           seed=12345)

(suppressing/ignoring warnings about “divergent transitions after warmup” [what could possibly go wrong … ?])
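If we did want to chase those warnings down, the standard first step (sketched here, not actually run) is to raise the target acceptance rate so Stan takes smaller steps:

## re-run with a higher target acceptance rate (adapt_delta);
## smaller step sizes often remove divergent transitions
s1b <- stan(file="logist.stan", data=jagsdata[1:3],
            seed=12345, control=list(adapt_delta=0.99))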

pars2 <- c("r","K","sd_obs","sd_proc","n0")
pairs(s1,pars=pars2,gap=0)

traceplot(s1,pars=pars2)

## sd_proc looks a bit ugly, but oh well ...

Now set increasing fractions of the data to be negative (flagged to be ignored in the Stan model):

redoStan <- function(sampN) {
    jagsdataNew <- jagsdata
    N <- jagsdata$N
    ## keep every sampN-th observation; flag the rest as missing (-1)
    jagsdataNew$o[(1:N) %% sampN != 1] <- -1
    stan(file="logist.stan",
         data=jagsdataNew[1:3],
         seed=12345+sampN,
         open_progress=FALSE,
         verbose=FALSE)
}
stanres <- lapply(2:10,redoStan)
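As a quick sanity check on the masking pattern (illustration only, not needed for the fits), for sampN=3 only every third observation survives:

## indices of the observations retained when sampN=3: 1, 4, 7, ...
which((1:jagsdata$N) %% 3 == 1)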

The results make sense:

allres <- c(list(full=s1,jags=j1),setNames(stanres,2:10))
res <- ldply(allres,
             tidy,pars=pars2,conf.int=TRUE,
             .id="data")
true_vec <- c(r=1,K=10,n0=1,sd_proc=1,sd_obs=1)
true_df <- data.frame(term=names(true_vec),v=true_vec)
## restrict upper CI to avoid blowing
##  up the plot range ... only binding for some K 
## would like to use an upper bound on the axis
## plus oob="squish" but that doesn't play nicely with
## facets -- can't set limits facet-wise ...
res[,"conf.high"] <- pmin(res[,"conf.high"],20)
ggplot(res,aes(data,estimate))+
    geom_pointrange(aes(ymin=conf.low,ymax=conf.high,
                        colour=as.numeric(data)))+
    facet_wrap(~term,scales="free")+
    geom_hline(data=true_df,aes(yintercept=v),colour="red")

(I’m ignoring the fact here that the models aren’t performing particularly well.) We could look at the trace plots, or be lazy and just look at the maximum \(\hat{R}\) (Gelman–Rubin) statistics, which are supposed to be \(<1.2\) as a rule of thumb:

round(sapply(allres[-2],  ## drop the JAGS fit (different summary() structure)
       function(x) max(summary(x)$summary[,"Rhat"])),2)
## full    2    3    4    5    6    7    8    9   10 
## 1.05 1.14 2.62 1.08 1.20 1.22 2.51 1.33 1.36 1.33
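A similar one-liner (a sketch, not run here) would check the minimum effective sample sizes of the Stan fits, which tell a complementary story to \(\hat{R}\):

## minimum effective sample size across parameters, per Stan fit
round(sapply(allres[-2],
       function(x) min(summary(x)$summary[,"n_eff"])))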

To do/FIXME