This section contains the setup and the utility code used throughout.
Libraries used:
library(data.table)
library(rstan)
library(tictoc)
library(ggplot2)
# Cache compiled models to disk so they are only recompiled when the
# Stan file changes.
rstan_options(auto_write = TRUE)
# Optimise the generated C++ for the local CPU.
Sys.setenv(LOCAL_CPPFLAGS = '-march=native')
Compiled code (the model source is shown later):
mod2 <- rstan::stan_model("../stan/logit_not_so_simple.stan", verbose = FALSE)
I was asked why there wasn't an apparent performance gain when using multiple cores in Stan. The answer relates to the start-up and shutdown costs associated with parallelism: it may not be worth it if you have a relatively simple model and are only running a few chains.
By default, parallelism in rstan happens across chains; the task of sampling each chain is dedicated to a single core. So, if you are only using 4 chains when sampling, then using more than 4 cores isn't going to make any difference. It is also worth noting that you can compile Stan programs with multi-threading support, and there are other approaches to parallelization; see the Stan documentation for details.
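In other words, a sensible number of cores is bounded by the number of chains. A minimal sketch of that rule of thumb (the variable names are illustrative):
# Cap the requested cores at the number of chains and at what the
# machine actually has available.
n_chains <- 4
n_cores <- min(n_chains, parallel::detectCores())
n_cores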
To demonstrate a reduction in sampling time under multicore support, consider the following simulated scenario.
Data generation.
# Cross join giving all combinations of two grouping factors and a
# three-level treatment.
d <- CJ(
  a = 1:10,
  b = 1:10,
  t = 1:3
)
# Group-level effects and treatment effects.
ba <- rnorm(10, sd = 0.25)
bb <- rnorm(10, sd = 0.5)
bt <- c(0, 0.5, 1)
# Linear predictor and binary outcome.
d[, eta := 0 + ba[a] + bb[b] + bt[t]]
d[, y := rbinom(.N, 1, plogis(eta))]
# Data in the format expected by the Stan model.
ld <- list(N = nrow(d), y = d$y, ga = d$a, gb = d$b, t = d$t)
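A quick sanity check on the simulated data: the first few rows and the overall event rate (the positive treatment effects should push the latter a little above 0.5):
head(d)
mean(d$y)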
Model (the source of mod2 compiled earlier):
data {
  int<lower=1> N;
  int<lower=0> y[N];
  int ga[N];
  int gb[N];
  int t[N];
}
parameters {
  real a;
  // treatment effects for levels 2 and 3 (level 1 is the reference)
  vector[2] b_raw;
  // standardised group effects (non-centered parameterization)
  vector[10] za;
  vector[10] zb;
  real<lower=0> sa;
  real<lower=0> sb;
}
transformed parameters {
  vector[3] b;
  b[1] = 0;
  b[2:3] = b_raw;
}
model {
  vector[N] eta;
  target += normal_lpdf(a | 0, 5);
  // b[1] is fixed at zero, so its term here only adds a constant
  target += normal_lpdf(b | 0, 1);
  target += normal_lpdf(za | 0, 1);
  target += normal_lpdf(zb | 0, 1);
  target += exponential_lpdf(sa | 1);
  target += exponential_lpdf(sb | 1);
  for(i in 1:N){
    eta[i] = a + za[ga[i]]*sa + zb[gb[i]]*sb + b[t[i]];
  }
  y ~ bernoulli_logit(eta);
}
generated quantities {
  vector[N] p;
  for(i in 1:N){
    p[i] = inv_logit(a + za[ga[i]]*sa + zb[gb[i]]*sb + b[t[i]]);
  }
}
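The group terms use a non-centered parameterization: multiplying standard normal draws za by the scale sa is distributionally the same as drawing the group effects directly from normal(0, sa), but it typically samples better in hierarchical models. A quick R illustration of the equivalence (simulation only, not part of the fit):
# Scaling standard normal draws by s matches drawing with sd = s.
z <- rnorm(1e5)
s <- 0.25
c(sd(z * s), sd(rnorm(1e5, sd = s)))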
Fit using a single core.
tic()
fit2a <- sampling(mod2, data = ld,
                  cores = 1, refresh = 0, chains = 10, iter = 2000,
                  warmup = 500, show_messages = FALSE)
## Warning: There were 2 divergent transitions after warmup. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
## to find out why this is a problem and how to eliminate them.
## Warning: Examine the pairs() plot to diagnose sampling problems
toc()
## 15.461 sec elapsed
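A couple of divergent transitions after warmup can often be eliminated by raising the target acceptance rate from its default of 0.8, at the cost of smaller step sizes and slower sampling. A sketch (fit2a_strict is an illustrative name):
# Re-run with a stricter target acceptance probability.
fit2a_strict <- sampling(mod2, data = ld,
                         cores = 1, refresh = 0, chains = 10, iter = 2000,
                         warmup = 500, show_messages = FALSE,
                         control = list(adapt_delta = 0.95))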
Fit using multiple cores.
tic()
fit2b <- sampling(mod2, data = ld,
                  cores = 10, refresh = 0, chains = 10, iter = 2000,
                  warmup = 500, show_messages = FALSE)
## Warning: There were 1 divergent transitions after warmup. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
## to find out why this is a problem and how to eliminate them.
## Warning: Examine the pairs() plot to diagnose sampling problems
toc()
## 6.553 sec elapsed
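Based on the elapsed times above, ten cores give roughly a 2.4x speedup rather than 10x; each core still samples a full chain, elapsed time is bounded by the slowest chain, and there are the start-up and shutdown costs noted earlier.
# Ratio of single-core to ten-core elapsed time.
15.461 / 6.553
## [1] 2.359377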
Extract the posterior draws. Each fit retains 10 chains x (2000 - 500) iterations = 15000 post-warmup draws.
pp2a <- data.table(as.matrix(fit2a, pars = c("a", "b", "sa", "sb")))
pp2b <- data.table(as.matrix(fit2b, pars = c("a", "b", "sa", "sb")))
nrow(pp2a)
nrow(pp2b)
head(
  cbind(
    colMeans(pp2a),
    colMeans(pp2b)
  ), 10)
## [1] 15000
## [1] 15000
## [,1] [,2]
## a 0.01276324 0.01164181
## b[1] 0.00000000 0.00000000
## b[2] 0.52800918 0.52606895
## b[3] 0.79671241 0.79609335
## sa 0.19104203 0.18985558
## sb 0.47232450 0.46915816
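The two fits agree closely, as expected; they target the same posterior and differ only in how the chains were scheduled. For completeness, a quick convergence check on one of the fits (a sketch using rstan's summary method):
# Largest split-Rhat across the sampled parameters (b[1] is a constant,
# so b_raw is used in its place); values near 1 indicate good mixing.
max(summary(fit2a, pars = c("a", "b_raw", "sa", "sb"))$summary[, "Rhat"])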
Plot the parameter estimates.
dfig1a <- melt(pp2a, measure.vars = names(pp2a))
dfig1a <- dfig1a[, .(mu = mean(value),
                     lb = quantile(value, probs = 0.025),
                     ub = quantile(value, probs = 0.975)), keyby = variable]
X11()
ggplot(dfig1a, aes(x = mu, y = variable)) +
  geom_point() +
  geom_linerange(aes(xmin = lb, xmax = ub))
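If this is being rendered into a document rather than run interactively, the X11() call can be dropped and the figure saved instead, e.g.:
# Save the most recent ggplot to disk (file name is arbitrary).
ggsave("fig1a.png", width = 6, height = 4)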