I started this example to help a student implement an epidemic model where we wanted the dynamics (Euler steps) to occur on a faster time scale, i.e. more frequently, than the observations. I ended up doing this by coding missing data/intermediate time steps as negative values and telling Stan to skip the predicted-observed comparison for those cases. I think this is actually a fairly straightforward way to deal with missing data (there is various discussion of how to model missing data in Stan; I haven't spent long enough with it to decide whether my approach is reasonable or not).
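For example, here is a toy sketch of the encoding (my illustration; the real version appears in the Stan model below). Since the observations are population sizes, and hence positive, a negative value is an unambiguous flag:

## toy illustration of the missing-data encoding:
## Stan data can't contain NA, so recode missing values as -1
o <- c(1.2, NA, 3.4, NA, 5.6)
o[is.na(o)] <- -1
o
## [1]  1.2 -1.0  3.4 -1.0  5.6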
In the process I had fun writing some “tidying” methods (i.e., for the broom package) for rstan and rjags models.
library("R2jags")
library("rstan")
options(mc.cores = parallel::detectCores())
rstan_options(auto_write = TRUE)
library("emdbook") ## for perturb.params(): maybe unnecessary?
library("lattice")
library("plyr")
library("ggplot2"); theme_set(theme_bw())
## if necessary get broom package for tidying
## (BMB's development version has tidiers for 'rjags', 'rstan' objects)
if (FALSE) {
library("devtools")
install_github("bbolker/broom")
}
library("broom")
Simulation function:
simlogistdata <- function(seed=1001,
                          r=1, K=10, n0=1, t0=1, tot.t=10, dt=0.5,
                          sd.proc=1, sd.obs=1) {
    if (!is.null(seed)) set.seed(seed)
    tvec <- seq(t0, tot.t, by=dt)
    n <- length(tvec)
    y <- numeric(n)
    ydet <- numeric(n)
    y[1] <- n0
    ydet[1] <- n0
    e.proc <- rnorm(n, mean=0, sd=sd.proc)  ## process noise
    e.obs <- rnorm(n, mean=0, sd=sd.obs)    ## observation noise
    for (i in 2:n) {
        ## deterministic and stochastic Euler steps of the logistic model
        ydet[i] <- ydet[i-1]+r*ydet[i-1]*(1-ydet[i-1]/K)*dt
        y[i] <- y[i-1]+(r*y[i-1]*(1-y[i-1]/K))*dt
        y[i] <- y[i] + e.proc[i-1]
        ## note we have to be careful here if we are doing anything
        ## non-trivial with dt; nothing is done to ensure that
        ## variance of e.proc scales appropriately with dt ...
    }
    y.procobs <- y+e.obs  ## add observation error
    return(cbind(tvec, y.procobs))
}
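As the comment in the loop says, the process-noise standard deviation isn't adjusted for the step size. If dt were being varied seriously, the noise variance should accumulate linearly in time, i.e. the standard deviation should scale as sqrt(dt); a minimal sketch of the fix (my suggestion, not part of the function above):

## sketch: make the process-noise *variance* proportional to dt
## (this would replace the e.proc line inside simlogistdata)
e.proc <- rnorm(n, mean=0, sd=sd.proc*sqrt(dt))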
## now sim data ...
set.seed(102)
s1 <- simlogistdata(tot.t=10,dt=0.2)
plot(y.procobs~tvec,data=s1)
JAGS preliminaries. The model (saved as logist.bug):
model {
## initial values
t[1] <- n0
o[1] ~ dnorm(t[1],tau.obs)
## step through observations
for (i in 2:N) {
v[i] <- t[i-1]+r*t[i-1]*(1-t[i-1]/K)*dt
t[i] ~ dnorm(v[i],tau.proc)
o[i] ~ dnorm(t[i],tau.obs)
}
## aux variables
sd_proc <- 1/sqrt(tau.proc)
sd_obs <- 1/sqrt(tau.obs)
## priors
r ~ dunif(0.1,maxr)
K ~ dgamma(0.005,0.005)
tau.obs ~ dgamma(0.005,0.005)
tau.proc ~ dgamma(0.005,0.005)
n0 ~ dgamma(1,n0rate)
}
parameters <- c("r","K","sd_obs","sd_proc","n0")
inits <- perturb.params(
list(n0=1,r=0.2,K=10,tau.obs=1,tau.proc=1),
alt=list(r=c(0.1,0.4),tau.obs=3,tau.proc=3))
jagsdata <- list(o=s1[,2],dt=0.2,N=nrow(s1),n0rate=1,
maxr=5)
Run JAGS:
j1 <- jags(data=jagsdata,
inits, param=parameters,
model="logist.bug",
n.chains=length(inits), n.iter=15000,
progress.bar="none", ## quiet ...
n.burnin=5000,n.thin=37)
## module glm loaded
Diagnose (trace plots):
j1bugs <- as.mcmc.bugs(j1$BUGS)
xyplot(j1bugs)
I'm not sure why the \(K\) chain here is so horrendous. I suspect there's a very flat region of parameter space: once a chain gets out there it can wander around a lot (i.e. the posterior density at \(K=100\) isn't that different from the posterior density at \(K=20\)), and the prior isn't strong enough to pull it back. But I'm not certain.
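One way to quantify the mixing problem (assuming, as I believe, that as.mcmc.bugs returns a coda mcmc.list) is to look at the Gelman-Rubin diagnostic and the effective sample sizes directly:

library("coda")
## potential scale reduction factors: K should stand out
## if its chains really aren't mixing
gelman.diag(j1bugs)
effectiveSize(j1bugs)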
Now with Stan. The model (saved as logist.stan):
data {
int<lower=0> N;
real<lower=0> dt;
real o[N];
}
parameters {
real t[N];
real<lower=0> sd_obs;
real<lower=0> sd_proc;
real<lower=0,upper=1000> K;
real<lower=0> r;
real<lower=0> n0;
}
model {
real v[N];
// initial values
// can't set t[1] <- n0 as in JAGS
// make n0 be the value *before* t[1] (not quite the same model)
t[1] ~ normal(n0,sd_proc);
o[1] ~ normal(t[1],sd_obs);
// step through observations
for (i in 2:N) {
v[i] = t[i-1]+r*t[i-1]*(1-t[i-1]/K)*dt; // Euler step
t[i] ~ normal(v[i],sd_proc);
if (o[i]>0) { // missing data coded as negative
o[i] ~ normal(t[i],sd_obs);
}
}
// priors
// (works quite badly with implicit/flat priors!)
r ~ uniform(0.1,10);
K ~ gamma(0.005,0.005);
sd_obs ~ gamma(0.005,0.005);
sd_proc ~ gamma(0.005,0.005);
n0 ~ gamma(1,0.1);
}
// once this is working, try with missing values ...
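A common alternative to the sentinel coding (discussed in the Stan missing-data threads) is to pass only the observed values plus the indices of the latent time steps they belong to. I haven't implemented it here; a hypothetical sketch of the data preparation in R (the names N_obs, i_obs, o_obs are mine, and the Stan data and model blocks would have to change to match):

## hypothetical index-based encoding (not used below):
## pass observed values and their time-step indices only
obs_idx <- which(jagsdata$o > 0)
standata_alt <- list(N=jagsdata$N, dt=jagsdata$dt,
                     N_obs=length(obs_idx),
                     i_obs=obs_idx,
                     o_obs=jagsdata$o[obs_idx])
## matching likelihood line: o_obs[j] ~ normal(t[i_obs[j]], sd_obs)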
## all default options: runs
stan1 <- stan(file="logist.stan",  ## named to avoid clobbering the data object s1
              data=jagsdata[1:3],
              seed=12345)
(suppressing/ignoring warnings about “divergent transitions after warmup” [what could possibly go wrong … ?])
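If we weren't ignoring them, a standard first response to divergent-transition warnings is to tighten the sampler settings via rstan's control argument; a sketch (the specific values are guesses):

## re-run with a higher target acceptance rate and a deeper
## maximum tree depth, which often removes divergences
stan1b <- stan(file="logist.stan", data=jagsdata[1:3], seed=12345,
               control=list(adapt_delta=0.99, max_treedepth=15))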
pars2 <- c("r","K","sd_obs","sd_proc","n0")
pairs(stan1, pars=pars2, gap=0)
traceplot(stan1, pars=pars2)
## sd_proc looks a bit ugly, but oh well ...
Now set increasing fractions of the data to be negative (flagged to be ignored in the Stan model):
redoStan <- function(sampN) {
    jagsdataNew <- jagsdata
    N <- jagsdata$N
    ## keep every sampN-th observation (1, sampN+1, 2*sampN+1, ...);
    ## flag the rest as missing with a negative sentinel
    jagsdataNew$o[(1:N) %% sampN != 1] <- -1
    stan(file="logist.stan",
         data=jagsdataNew[1:3],
         seed=12345+sampN,
         open_progress=FALSE,
         verbose=FALSE)
}
stanres <- lapply(2:10,redoStan)
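For reference, the fraction of time steps left observed at each level is roughly 1/sampN:

## fraction of observations retained for each subsampling level
sapply(2:10, function(s) mean((1:jagsdata$N) %% s == 1))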
The results make sense:
allres <- c(list(full=stan1,jags=j1),setNames(stanres,2:10))
res <- ldply(allres,
tidy,pars=pars2,conf.int=TRUE,
.id="data")
true_vec <- c(r=1,K=10,n0=1,sd_proc=1,sd_obs=1)
true_df <- data.frame(term=names(true_vec),v=true_vec)
## restrict upper CI to avoid blowing
## up the plot range ... only binding for some K
## would like to use an upper bound on the axis
## plus oob="squish" but that doesn't play nicely with
## facets -- can't set limits facet-wise ...
res[,"conf.high"] <- pmin(res[,"conf.high"],20)
ggplot(res,aes(data,estimate))+
geom_pointrange(aes(ymin=conf.low,ymax=conf.high,
colour=as.numeric(data)))+
facet_wrap(~term,scales="free")+
geom_hline(data=true_df,aes(yintercept=v),colour="red")
(I'm ignoring the fact here that the models aren't performing particularly well: we could look at the trace plots, or be lazy and just look at the maximum \(\hat R\) (Gelman-Rubin) statistics, which as a rule of thumb are supposed to be < 1.2.)
round(sapply(allres[-2],
function(x) max(summary(x)$summary[,"Rhat"])),2)
## full 2 3 4 5 6 7 8 9 10
## 1.05 1.14 2.62 1.08 1.20 1.22 2.51 1.33 1.36 1.33
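The JAGS fit is excluded from the sapply above because it stores its summary differently; assuming the usual R2jags structure, its worst \(\hat R\) can be pulled out separately:

## R2jags keeps its summary (including Rhat) in BUGSoutput$summary
round(max(j1$BUGSoutput$summary[,"Rhat"]), 2)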
To do:

- jags.parallel() working?
- emdbook dependency?
- coord_flip doesn't play nicely with facets, so we need an explicitly horizontal range (e.g. see here for discussion of orientation control for geoms)
- can knitr and rmarkdown share a cache? it's a pain …