This section contains the setup and the various utility functions used throughout.
Libraries used:
library(data.table)
library(coadministr)
library(ggplot2)
library(grid)
library(gridExtra)
library(ggrepel)
library(rjags)
library(runjags)
library(rstan)
Compiled code (the models are shown later):
rstan::rstan_options(auto_write = TRUE)
Sys.setenv(LOCAL_CPPFLAGS = '-march=native')
options(mc.cores = 1)
tg_stan <- new.env()
tg_stan$model1 <- rstan::stan_model(file = "stan/logistic_001.stan",
                                    auto_write = TRUE)
tg_stan$model2 <- rstan::stan_model(file = "stan/logistic_002.stan",
                                    auto_write = TRUE)
tg_stan$model3 <- rstan::stan_model(file = "stan/logistic_003.stan",
                                    auto_write = TRUE)
Data simulation:
getdat <- function(nexpt = 9,
u_mu = 0, u_sd = 1,
n_trials = c(3, 10, 20),
outlier = TRUE){
  # Trials are of varying size; index into n_trials so that a
  # length-one vector is handled correctly by sample()
  ntrials <- n_trials[sample(1:length(n_trials),
                             replace = TRUE,
                             size = nexpt)]
# Draw the log odds of success from a normal dist
eta <- rnorm(nexpt, u_mu, u_sd)
theta_tru <- plogis(eta)
# Simulate the number of successes in each cluster
y <- rbinom(nexpt, ntrials, theta_tru)
d1 <- data.table(
expt = 1:nexpt,
y = y,
n = ntrials,
theta = y/ntrials,
theta_tru = theta_tru)
  # Likelihood for the pooled estimate (used below to scale the
  # per-experiment trial counts for plotting)
  pp <- seq(0, 1, len = 1000)
  yy <- dbinom(sum(d1$y), sum(d1$n), pp)
# made up outlier
if(outlier){
d1[nexpt, `:=`(y=min(n_trials),
n=min(n_trials),
theta=1,
theta_tru=0.99)]
}
# Scale num trials to fit on plot
ff <- max(d1$n) / (max(yy)/3)
d1[, nscaled := n/ff]
d1[, `:=`(a = y + 1, b = n - y + 1)]
  # For a, b > 1 the posterior mode is as follows. When a uniform
  # (noninformative) prior is used, the posterior mode equals the MLE.
d1[, post_mode := (a - 1)/ (a + b - 2)]
d1
}
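As a quick aside, the posterior-mode identity used in getdat() is easy to verify. Below is a minimal sketch with made-up values: under a uniform Beta(1, 1) prior the posterior is Beta(y + 1, n - y + 1), and its mode equals the MLE \(y/n\).
# Sketch with made-up values: a uniform prior implies the posterior
# mode equals the MLE
y <- 7; n <- 12
a <- y + 1; b <- n - y + 1
c(post_mode = (a - 1) / (a + b - 2), mle = y / n)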
Default colours for factors in ggplot:
# Get the default colours for factors in ggplot.
gg_color_hue <- function(n) {
hues = seq(15, 375, length = n + 1)
hcl(h = hues, l = 65, c = 100)[1:n]
}
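For example, plot1() below asks for the first two hues, which it uses for the segments and points:
gg_color_hue(2)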
Extract and optionally summarise posterior draws:
process_samples1 <- function(s, desc = "", summarise = FALSE){
dd <- subset(s, select = grep("^theta", names(s)))
dfig <- melt(dd, measure.vars = names(dd),
variable.name = "expt")
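  # Parse the experiment index from names like "theta[3]": drop the
  # brackets, then keep the characters after the "theta" prefix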
dfig[, expt := sub('[', "", expt, fixed = TRUE)]
dfig[, expt := sub(']', "", expt, fixed = TRUE)]
dfig[, expt := substr(expt, 6,
nchar(as.character(expt)))]
dfig[, expt := as.integer(expt)]
if(summarise == TRUE){
dfig <- dfig[,
.(pooling = desc,
lb = quantile(value, 0.025),
ub = quantile(value, 0.975),
mean = mean(value),
median = median(value)),
keyby = .(expt)]
} else {
dfig[, pooling := desc]
}
dfig
}
Binomial likelihood for completely pooled estimate:
plot1 <- function(d, addlabels = TRUE){
# Likelihood for pooled data
pp <- seq(0, 1, len = 1000)
yy <- dbinom(sum(d$y), sum(d$n), pp)
dfig1 <- data.table(x = pp, y = yy)
# idx 1 is red, 2 is blue
cols = gg_color_hue(2)
p <- ggplot(dfig1, aes(x = x, y = y)) +
geom_line() +
geom_segment(data = d,
aes(x = theta, xend = theta,
y = 0, yend = nscaled),
col = cols[1]) +
geom_point(data = d,
aes(x = post_mode, y = nscaled),
col = cols[2]) +
geom_point(data = d[, .(x = sum(y)/sum(n),
y = max(dfig1$y))],
aes(x = x, y = y),
col = cols[2]) +
scale_x_continuous(name = "Probability", limits = c(0, 1)) +
scale_y_continuous(name = "Density") +
theme_bw()
if(addlabels){
p <- p + geom_label_repel(data = d,
aes(x = post_mode, y = nscaled,
label = paste0("Expt ", expt)),
box.padding = 1,
max.overlaps = Inf)
}
p
}
Credible intervals for probability of success in each experiment:
plot2 <- function(dfig){
p <- ggplot(dfig) +
geom_linerange(aes(xmin = lb, xmax = ub,
y = factor(expt))) +
geom_point(aes(x = mean, y = factor(expt)),
size = 2) + # default shape circle
geom_point(data = dfig[, head(.SD, 1), by = expt],
aes(x = theta_tru, y = factor(expt)),
shape = 2, size = 2) + # triangle
geom_point(data = dfig[, head(.SD, 1), by = expt],
aes(x = mle, y = factor(expt)),
shape = 3, size = 5) + # cross
scale_x_continuous(name = "Probability", limits = c(0, 1)) +
scale_y_discrete(name = "Experiment") +
theme_bw()
p
}
Credible intervals for probability of success in each experiment (paired summary):
plot3 <- function(dfig, cent = NULL, xpos = 0){
p <- ggplot(dfig) +
geom_linerange(aes(xmin = lb, xmax = ub,
y = factor(expt),
group = pooling, col = pooling),
position = position_dodge(0.5)) +
geom_point(aes(x = mean, y = factor(expt),
group = pooling, col = pooling),
position = position_dodge(0.5),
size = 2) + # default shape circle
geom_point(data = dfig[, head(.SD, 1), by = expt],
aes(x = theta_tru, y = factor(expt)),
shape = 2, size = 2) + # triangle
geom_point(data = dfig[, head(.SD, 1), by = expt],
aes(x = mle, y = factor(expt)),
shape = 3, size = 5) + # cross
geom_text_repel(data = dfig[, head(.SD, 1), by = expt],
aes(x = xpos, y = factor(expt),
label = paste0("n = ", n)),
box.padding = 1,
max.overlaps = Inf,
segment.colour = NA,
size = 3) +
scale_x_continuous(name = "Probability", limits = c(0, 1)) +
scale_y_discrete(name = "Experiment") +
scale_color_discrete("Pooling") +
theme_bw() +
theme(legend.position = "bottom")
if(!is.null(cent)){
p <- p + geom_vline(xintercept = cent)
}
p
}
Compare absolute error:
plot4 <- function(dfig, ylab = "Absolute error"){
p <- ggplot(dfig, aes(x = expt, y = err,
group = factor(pooling),
col = factor(pooling)))+
geom_point(position = position_dodge(width = 0.2)) +
geom_vline(aes(xintercept = expt), lwd = 0.1) +
scale_x_continuous(name = "Experiment", breaks = unique(dfig$expt)) +
scale_y_continuous(name = ylab) +
scale_color_discrete("Pooling") +
theme_bw() +
theme(legend.position = "bottom",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank())
p
}
Variation in probability of success:
plot5 <- function(post, xlim = c(-4, 4)){
p1 <- ggplot(post, aes(x = u_sd)) +
geom_density() +
scale_x_continuous(name = "Standard deviation") +
scale_y_continuous(name = "Density") +
theme_bw()
# https://clayford.github.io/dwir/dwr_07_regular_expressions.html
dd <- subset(post, select = grep("[0-9]+", names(post)))
dd <- melt(dd, measure.vars = names(dd),
variable.name = "expt")
dd[, expt := substr(expt, 2,
nchar(as.character(expt)))]
p2 <- ggplot(dd, aes(x = value, group = factor(expt)))+
geom_density() +
scale_x_continuous(name = "Log-odds success") +
scale_y_continuous(name = "Density") +
theme_bw()
# sample from the population
nsampl <- 30
idx <- sort(sample(1:nrow(post), size = nsampl))
xx <- seq(xlim[1], xlim[2], len = 100)
dd <- data.table(do.call(cbind, lapply(1:nsampl, function(i){
dnorm(xx, post$eta[idx[i]], post$u_sd[idx[i]])
})))
dd <- melt(dd, measure.vars = names(dd),
variable.name = "expt")
dd[, expt := substr(expt, 2,
nchar(as.character(expt)))]
dd[, x:= rep(xx, length= nrow(dd))]
p3 <- ggplot(dd, aes(x = x, y = value, group = factor(expt)))+
geom_line(alpha = 0.2) +
scale_x_continuous(name = "Log-odds success", limits = xlim) +
scale_y_continuous(name = "Density") +
theme_bw()
dd <- data.table(eta = rnorm(length(post$eta), post$eta, post$u_sd))
dd[, theta := plogis(eta)]
p4 <- ggplot(dd, aes(x = theta))+
geom_density() +
scale_x_continuous(name = "Probability success") +
scale_y_continuous(name = "Density") +
theme_bw()
p1 <- ggplotGrob(p1)
p2 <- ggplotGrob(p2)
p3 <- ggplotGrob(p3)
p4 <- ggplotGrob(p4)
grid::grid.newpage()
vp <- viewport(x = 0.0, y = 0.5,
width = 0.5, height = 0.5,
just = c("left", "bottom"))
pushViewport(vp)
grid.draw(p1)
popViewport()
vp <- viewport(x = 0.5, y = 0.5,
width = 0.5, height = 0.5,
just = c("left", "bottom"))
pushViewport(vp)
grid.draw(p2)
popViewport()
vp <- viewport(x = 0.0, y = 0.0,
width = 0.5, height = 0.5,
just = c("left", "bottom"))
pushViewport(vp)
grid.draw(p3)
popViewport()
vp <- viewport(x = 0.5, y = 0.0,
width = 0.5, height = 0.5,
just = c("left", "bottom"))
pushViewport(vp)
grid.draw(p4)
popViewport()
}
Density kernels for posterior parameter summary:
plot6 <- function(dfig){
p <- ggplot(dfig, aes(x = value,
group = factor(expt),
col = factor(expt)))+
geom_density() +
theme_bw() +
scale_x_continuous(name = "Probability of success") +
scale_y_continuous(name = "Density") +
scale_color_discrete("Experiment") +
facet_wrap(~pooling, ncol = 1) +
theme(legend.position = "bottom")
p
}
Suppose we have data from \(K\) Binomial experiments that test the same drug in a variety of settings. We are given the number of successes and the number of trials for each. We do not know the probability of success \(\theta_k\), nor do we know whether \(\theta_k = \theta_j\), \(\forall k \ne j\). We suspect that the drug might perform similarly well across the experiments (i.e. that the \(\theta_k\) are similar), but we are a long way from certain.
Simulate results from \(10\) experiments, each with the same probability of success \(\theta = 0.5\) and the same number of trials, \(n = 12\).
set.seed(6)
d1 <- getdat(nexpt = 10,
u_mu = qlogis(0.5),
u_sd = 0,
n_trials = c(12),
outlier = FALSE)
print(d1)
## expt y n theta theta_tru nscaled a b post_mode
## 1: 1 7 12 0.5833333 0.5 0.02422687 8 6 0.5833333
## 2: 2 8 12 0.6666667 0.5 0.02422687 9 5 0.6666667
## 3: 3 4 12 0.3333333 0.5 0.02422687 5 9 0.3333333
## 4: 4 5 12 0.4166667 0.5 0.02422687 6 8 0.4166667
## 5: 5 7 12 0.5833333 0.5 0.02422687 8 6 0.5833333
## 6: 6 5 12 0.4166667 0.5 0.02422687 6 8 0.4166667
## 7: 7 6 12 0.5000000 0.5 0.02422687 7 7 0.5000000
## 8: 8 7 12 0.5833333 0.5 0.02422687 8 6 0.5833333
## 9: 9 4 12 0.3333333 0.5 0.02422687 5 9 0.3333333
## 10: 10 7 12 0.5833333 0.5 0.02422687 8 6 0.5833333
Some of the experiments have the same number of successes, and so the estimates \(\hat{\theta}_k\) coincide for a number of the experiments.
Plot the likelihood based on the aggregated numbers of successes and trials. The location of the maximum is the pooled estimate \(\hat{\theta}_k = \hat{\theta}\), referred to as the maximum likelihood estimate (MLE). The vertical lines show the proportion of successes observed in each experiment, with the height of each proportional to the number of trials.
print(plot1(d1))
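As a quick sanity check (a sketch, not an essential part of the workflow), the location of the likelihood maximum can be found numerically and compared with the closed-form pooled estimate \(\sum_k y_k / \sum_k n_k\):
# Numerically locate the maximum of the pooled likelihood
opt <- optimize(function(p) dbinom(sum(d1$y), sum(d1$n), p),
                interval = c(0, 1), maximum = TRUE)
c(numeric = opt$maximum, closed_form = d1[, sum(y) / sum(n)])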
Now, simulate results from \(10\) experiments, with simulation parameters such that each experiment has a different probability of success and the number of trials also varies. For each experiment, the log odds of success arises from a normal distribution with mean \(0 \implies \bar{\theta} = 0.5\), where \(\bar{\theta}\) denotes the mean probability of success, and standard deviation \(\sigma = 1.35\); each experiment has either \(n = 12\) or \(n = 24\) trials.
set.seed(7)
d2 <- getdat(nexpt = 10,
u_mu = qlogis(0.5),
u_sd = 1.35,
n_trials = c(12, 24),
outlier = FALSE)
print(d2)
## expt y n theta theta_tru nscaled a b post_mode
## 1: 1 6 24 0.2500000 0.2177498 0.02032754 7 19 0.2500000
## 2: 2 10 12 0.8333333 0.7330178 0.01016377 11 3 0.8333333
## 3: 3 4 12 0.3333333 0.4606094 0.01016377 5 9 0.3333333
## 4: 4 15 24 0.6250000 0.5513404 0.02032754 16 10 0.6250000
## 5: 5 12 12 1.0000000 0.9505684 0.01016377 13 1 1.0000000
## 6: 6 12 24 0.5000000 0.6182038 0.02032754 13 13 0.5000000
## 7: 7 12 12 1.0000000 0.9750986 0.01016377 13 1 1.0000000
## 8: 8 22 24 0.9166667 0.9560585 0.02032754 23 3 0.9166667
## 9: 9 12 24 0.5000000 0.6076459 0.02032754 13 13 0.5000000
## 10: 10 23 24 0.9583333 0.9282218 0.02032754 24 2 0.9583333
Clearly, there is much more variation in the results obtained from this set of experiments. Moreover, while the mean of the distribution for the probability of success was \(\bar{\theta} = 0.5\) in the simulation parameters, the pooled estimate is \(\hat{\theta} =\) 128 \(/\) 192 \(=\) 0.67.
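These figures come straight from the aggregated data:
# Pooled counts and estimate for the second dataset
d2[, .(Y = sum(y), N = sum(n), theta_hat = sum(y) / sum(n))]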
print(plot1(d2))
For this data, the pooled estimate for \(\hat{\theta}\) does not align well with the simulation parameter we used to generate the data. However, this kind of variation is to be expected. Additionally, if we were able to repeat the 10 experiments over and over again, we would see that the long-run average of the pooled estimate would converge to the mean of the normal distribution. In other words, the long-run average of the pooled estimate \(\hat{\theta}\) would align with the \(\bar{\theta}\) parameter that was used to simulate the data, as shown below:
set.seed(5678)
runexpts <- function(){
d3 <- getdat(nexpt = 10,
u_mu = qlogis(0.5),
u_sd = 2,
n_trials = c(12),
outlier = FALSE)
d3[, .(Y = sum(y), N = sum(n))][,Y/N]
}
sim1 <- replicate(1000, runexpts())
mean(sim1)
## [1] 0.5005917
Create a third data set, simulating results from \(10\) experiments, with simulation parameters such that each experiment has a different probability of success and the number of trials also varies. The only change in the simulation parameters relative to the second data set is that I have reduced the standard deviation to \(0.55\).
set.seed(91)
d3 <- getdat(nexpt = 10,
u_mu = qlogis(0.5),
u_sd = 0.55,
n_trials = c(12, 24),
outlier = FALSE)
print(d3)
## expt y n theta theta_tru nscaled a b post_mode
## 1: 1 10 12 0.8333333 0.7536549 0.009907476 11 3 0.8333333
## 2: 2 14 24 0.5833333 0.5298989 0.019814951 15 11 0.5833333
## 3: 3 5 12 0.4166667 0.4566232 0.009907476 6 8 0.4166667
## 4: 4 12 24 0.5000000 0.4099313 0.019814951 13 13 0.5000000
## 5: 5 12 24 0.5000000 0.4944143 0.019814951 13 13 0.5000000
## 6: 6 6 24 0.2500000 0.3846958 0.019814951 7 19 0.2500000
## 7: 7 5 12 0.4166667 0.3847549 0.009907476 6 8 0.4166667
## 8: 8 4 12 0.3333333 0.4843517 0.009907476 5 9 0.3333333
## 9: 9 7 12 0.5833333 0.5704142 0.009907476 8 6 0.5833333
## 10: 10 11 24 0.4583333 0.3709352 0.019814951 12 14 0.4583333
Below is a plot of the likelihood based on the aggregated successes and trials along with the proportion of successes in each experiment.
print(plot1(d3))
First note that for the first data set, the point estimate for the probability of success \(\hat{\theta}\) happened to coincide with the simulation parameters used to generate the data, and the individual trial results were similar. However, in the second and third datasets, the pooled point estimates were offset from the simulation parameters for the probability of success and the individual experiments had varying results. Different situations warrant differing approaches to the analysis, but we rarely have the convenience of knowing what the true data generation process is.
The easiest place to start is to assume that the variation in \(\theta_k\) across all \(K\) experiments is not important, and then estimate a common value for \(\theta_k = \theta\) based on the aggregated results. One possible model we could use is a logistic model, a GLM:
\[ \begin{aligned} p(y | n, \theta) &\sim \mathsf{Binomial}(n, \theta) \\ \theta &= \text{logit}^{-1}(\eta) \\ p(\eta) &\sim \mathsf{Normal}(0, 10) \end{aligned} \]
although we could equally use a Beta-Binomial conjugate prior model
\[ \begin{aligned} p(y | n, \theta) &\sim \mathsf{Binomial}(n, \theta) \\ p(\theta) &\sim \mathsf{Beta}(a, b) \end{aligned} \]
where \(y\) and \(n\) now denote the sum of the successes and trials across the individual experiments. This will give us higher precision, but it might not reflect the variation in the data adequately. For example, the following takes draws from the common posterior estimate for \(\theta\).
sampl1 <- data.table(do.call(cbind, lapply(1:(nrow(d2)), function(i){
rbeta(1000, d2[, sum(y)] + 1, d2[, sum(n) - sum(y)] + 1)
})))
names(sampl1) <- paste0("theta[", 1:nrow(d2),"]")
The figure below shows the 95% credible intervals from these draws with sample means (circle), MLE (cross) and the simulation parameters (triangle) also shown. It is clear that using the pooled estimate to characterise the individual \(\theta_k\) is unsatisfactory.
dfig <- process_samples1(sampl1, desc = "none", summarise = TRUE)
dfig <- merge(dfig, d2[, .(expt, y, n, mle = theta, theta_tru)], by = "expt")
print(plot2(dfig))
Figure: Credible intervals for estimated probability of success using a pooled approach
Note that there is really no need to sample from the posterior under the beta-binomial as the posterior distribution is known exactly, but simulation is a generic technique that can be used in most cases.
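For example, the exact 95% credible interval under the pooled Beta posterior (assuming the same uniform prior as above) is a one-liner:
# Exact interval, no simulation required
qbeta(c(0.025, 0.975), d2[, sum(y)] + 1, d2[, sum(n) - sum(y)] + 1)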
The pooled estimate gives relatively little insight into the results for the individual experiments, and while it might be acceptable for uniform data, it would not be ideal for modeling more variable results. As an alternative, we could estimate a separate \(\theta_k\) for each experiment. Additionally, if we can assume that the experiments were run under very different conditions, then we might make a simplifying assumption of independence. While using the aggregated data to form an estimate for \(\theta\) is referred to as a pooled estimate, here we do not pool any of the data sources and rely solely on the individual data from each experiment. This means we have less information informing each \(\hat{\theta}_k\), but it should allow us to better characterise the individual results. The approach might also give us insight into the simulation parameters, which represent the underlying truth (that would ordinarily be hidden from us).
A simple model for this analysis is as follows:
\[ \begin{aligned} p(y_k|n_k, \theta_k) &\sim \mathsf{Binomial}(n_k, \theta_k) \\ \mathsf{logit}(\theta_k) &= a_k \\ p(a_k) &\sim \mathsf{Normal}(0, \sigma) \end{aligned} \]
in which we would use an informative prior on the log odds of success \(a_k\) to regularise our parameter estimates. This can be implemented in JAGS as follows (note: JAGS parameterises the normal distribution with a precision, the reciprocal of the variance, rather than a standard deviation or variance).
m1 <- "
model {
## likelihood
for (i in 1:N){
y[i] ~ dbin(theta[i], n[i])
logit(theta[i]) <- a[i]
# Vague-ish prior
a[i] ~ dnorm(0, tau)
}
tau <- 1/(eta_sd*eta_sd)
}"
Build the model and sample from the posterior:
# No pooling - independent estimates for each cluster
inits <- function (){
list (a = rnorm(d2[, .N], 0, 3))
}
mm1 <- run.jags(m1,
n.chains = 3,
inits = inits,
data = list(y = d2[, y],
n = d2[, n],
N = d2[, .N],
eta_sd = 10
),
monitor = c("theta"),
adapt = 1000,
burnin = 1000, # 10000
sample = 10000, # 100000
thin = 10)
## Compiling rjags model...
## Calling the simulation using the rjags method...
## Adapting the model for 1000 iterations...
## Burning in the model for 1000 iterations...
## Running the model for 100000 iterations...
## Simulation complete
## Calculating summary statistics...
## Calculating the Gelman-Rubin statistic for 10 variables....
sampl2 <- data.table(as.matrix(mm1$mcmc))
We could then view the evolution of the posterior samples via plot(mm1), which would let us assess convergence and autocorrelation, among other diagnostics, to get a sense of whether there were any issues. However, for the purposes of the current work, I will skip this step.
A Stan implementation of the model is shown below:
data{
int N;
int y[N];
int n[N];
real eta_sd;
}
parameters{
real eta[N];
}
model{
target += normal_lpdf(eta | 0, eta_sd);
target += binomial_logit_lpmf(y | n, eta);
}
generated quantities{
real theta[N] = inv_logit(eta);
}
Stan tends to make noise if it notices that the sampling process could be going off the rails; JAGS does not. An example of this happened while writing this document: I had originally specified the hyperparameter eta_sd = 100 in the data passed to the JAGS model. While JAGS dutifully (and silently) sampled from the posterior distribution, the resulting posterior draws for \(\theta\) in experiments 5, 7, 8 and 10 were highly concentrated around one. In contrast, Stan was very unhappy, reporting many divergent transitions to indicate that the sampling process was not exploring the posterior well. In this instance, I decided to make the prior more informative, which improved the sampling process somewhat and remedied the problem with divergent transitions. However, addressing divergent transitions is usually context dependent and is probably better discussed more fully in a separate post.
ld = list(y = d2[, y],
n = d2[, n],
N = d2[, .N],
eta_sd = 10)
l1 <- rstan::sampling(tg_stan$model1,
data = ld,
chains = 1,
thin = 1,
iter = 2400,
warmup = 500,
refresh = 0)
sampl3 <- rstan::extract(l1)
sampl3 <- data.table(sampl3$theta)
names(sampl3) <- names(sampl2)
The following plots show the 95% credible intervals, sample means (circle), MLE (cross) and the simulation parameters (triangle) from each of the JAGS and Stan models, which give quite similar results. Note that the posterior means are not expected to align with the simulation parameters because we are looking at a single dataset from the universe of possible datasets. However, what is clear is that the posterior means are quite similar to the MLE, suggesting that a Bayesian and a frequentist analysis would reach similar conclusions and that the model reflects the data as intended, at least superficially.
dfig <- process_samples1(sampl2, desc = "none", summarise = TRUE)
dfig <- merge(dfig, d2[, .(expt, y, n, mle = theta, theta_tru)], by = "expt")
print(plot2(dfig))
Figure: Credible intervals from JAGS posterior samples (no pooling)
dfig <- process_samples1(sampl3, desc = "none", summarise = TRUE)
dfig <- merge(dfig, d2[, .(expt, y, n, mle = theta, theta_tru)], by = "expt")
print(plot2(dfig))
Figure: Credible intervals from Stan posterior samples (no pooling)
Note: sample means (circle), MLE (cross) and the simulation parameters (triangle).
Additionally, it is worth noting that we could have run separate analyses for each experiment with a beta-binomial conjugate prior as discussed previously. This takes \(Y_k \sim \mathsf{Binomial}(n_k, \theta_k)\) and \(\theta_k \sim \mathsf{Beta}(a, b)\) where \(a\) and \(b\) are some fixed parameters. Again, the advantage of the beta-binomial is that the posterior is available in closed form
\[ \begin{aligned} p(\theta|y) &\propto p(y|\theta)p(\theta) \\ &\propto \theta^y (1-\theta)^{n-y} \theta^{a-1} (1-\theta)^{b-1} \\ &= \theta^{a + y - 1}(1 - \theta)^{b + n - y - 1} \end{aligned} \]
and therefore \(\theta|y \sim \mathsf{Beta}(a + y, b + n - y)\). Under a beta-binomial setting we could assume a somewhat informative prior as was done above in the JAGS model, but we could have also assumed a non-informative \(\mathsf{Beta}(1, 1)\) uniform prior. In the latter approach, the posterior modes align with the MLE.
For completeness, here is the Beta-Binomial implementation. Obviously, there are no sampling issues and we could have computed the 95% credible intervals exactly rather than drawing from the posterior.
sampl4 <- data.table(do.call(cbind, lapply(1:(nrow(d2)), function(i){
rbeta(1000, d2[i, y] + 1, d2[i, n-y] + 1)
})))
names(sampl4) <- names(sampl1)
For the Beta-Binomial analysis, the posterior modes (rather than the means) align with the MLE, which can be verified directly from the simulated data:
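# With the uniform prior, the posterior mode equals the MLE (quick check)
d2[, all.equal(post_mode, theta)]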
dfig <- process_samples1(sampl4, desc = "none", summarise = TRUE)
dfig <- merge(dfig, d2[, .(expt, y, n, mle = theta, theta_tru)], by = "expt")
print(plot2(dfig))
Figure: Credible intervals derived from samples from a Beta-Binomial conjugate prior (no pooling)
Note: sample means (circle), MLE (cross) and the simulation parameters (triangle).
While the assumption of independence is simplifying, further information can be extracted by using hierarchical models. In fact, some researchers, including Richard McElreath, posit that hierarchical modeling should be the default approach to regression. There are a number of advantages to using these models, but the results obtained will depend on your particular situation, as shown below.
As far as we are concerned, the basic change to the model is to treat the hyperparameters as unknowns that are to be learned from the data. A revised model is shown below, which includes an overall estimate for the probability of success via \(\alpha\) and then derives estimates for the probability of success in each experiment as an offset from this overall mean. This approach takes advantage of the equivalence of \(X \sim \mathsf{Normal}(\mu, \sigma)\) and \(X = \mu + \sigma Z\) where \(Z \sim \mathsf{Normal}(0, 1)\), and is known as a non-centred parameterisation. In some settings a non-centred parameterisation is computationally beneficial for sampling from the posterior.
\[ \begin{aligned} Y_k &\sim \mathsf{Binomial}(n_k, \theta_k) \\ \mathsf{logit}(\theta_k) &= \alpha + (u_k \times u_{\sigma}) \\ \alpha &\sim \mathsf{Normal}(0, 10) \\ u_k &\sim \mathsf{Normal}(0, 1) \\ u_{\sigma} &\sim \mathsf{Normal}^+ (0, 3) \\ \end{aligned} \]
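As a quick illustration of this equivalence, here is a minimal sketch (illustrative values only):
# Non-centred parameterisation: mu + sigma * Z has the same
# distribution as Normal(mu, sigma)
set.seed(1)
mu <- 0.3; sigma <- 1.2
x_centred    <- rnorm(1e5, mu, sigma)
x_noncentred <- mu + sigma * rnorm(1e5, 0, 1)
rbind(centred    = c(mean = mean(x_centred), sd = sd(x_centred)),
      noncentred = c(mean = mean(x_noncentred), sd = sd(x_noncentred)))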
Exchangeability is a property of the data that is discussed in relation to hierarchical modeling. Informally, the idea of exchangeability is that reshuffling the data indices will not alter how you think about the problem. For example, if learning that \(\theta_k = 0.4\) would lead you to think that it was more likely to be a parameter estimate for some particular experiments rather than others, then the sequence is not exchangeable. On the other hand, if learning \(\theta_k = 0.4\) tells you nothing about which \(k\) it relates to, then the sequence is exchangeable.
To implement this model in Stan, we introduce the additional structures in a fairly intuitive way:
data{
int N;
int y[N];
int n[N];
}
parameters{
real eta;
real u[N];
real<lower = 0> u_sd;
}
model{
target += normal_lpdf(eta | 0, 10);
target += normal_lpdf(u | 0, 1);
target += normal_lpdf(u_sd | 0, 3);
for(i in 1:N){
target += binomial_logit_lpmf(y[i] | n[i], eta + u[i] * u_sd);
}
}
generated quantities{
real theta[N];
for(i in 1:N){
theta[i] = inv_logit(eta + u[i] * u_sd);
}
}
Fit the model as shown previously:
ld = list(y = d2[, y],
n = d2[, n],
N = d2[, .N])
l2 <- rstan::sampling(tg_stan$model2,
data = ld,
chains = 1,
thin = 1,
iter = 5000,
warmup = 1000,
refresh = 0)
sampl5 <- rstan::extract(l2)
sampl5 <- data.table(sampl5$theta)
names(sampl5) <- names(sampl2)
Below, the credible intervals and means for \(\hat{\theta_k}\) from the partial-pooling model are compared with the results from the no-pooling model.
dfig <- rbind(
process_samples1(sampl5, desc = "partial", summarise = TRUE),
process_samples1(sampl3, desc = "none", summarise = TRUE))
dfig <- merge(dfig, d2[, .(expt, y, n, mle = theta, theta_tru)], by = "expt")
post <- rstan::extract(l2, pars = c("eta"))
print(plot3(dfig, cent = plogis(mean(post$eta))))
Figure: Credible intervals from Stan posterior samples (no pooling versus partial pooling)
Note: sample means (circle), MLE (cross) and the simulation parameters (triangle). Vertical line is grand mean from hierarchical model.
The comparison suggests that there is little benefit in using the partially pooled model here. Additionally, if we look at the absolute error \(\mathsf{abs}(\hat{\theta_k} - \theta_k)\), we see that the error is lower for the partial pooling model about half of the time.
dfig[, err := abs(mean - theta_tru)]
print(plot4(dfig, ylab = "Absolute error"))
Figure: Absolute error in the estimated probability of success by experiment (no pooling versus partial pooling)
However, for the particular dataset that we simulated, the total absolute error (and hence also the mean absolute error, since each model is evaluated over the same 10 experiments) is still lower when the hierarchical model is used:
dfig[, .(mae = sum(abs(mean - theta_tru))), keyby = pooling]  # summed over experiments
## pooling mae
## 1: none 0.6687414
## 2: partial 0.6042891
We can get some insight into why this might be by looking at the posterior distribution for the other parameters.
post <- rstan::extract(l2, pars = c("eta", "u", "u_sd"))
post <- data.table(cbind(post$eta, post$u, post$u_sd))
names(post) <- c("eta", paste0("u", 1:max(d2$expt)), "u_sd")
# plot5 renders directly to the device via grid, so no print() is needed
plot5(post, xlim = c(-10, 10))
Figure: Left to right, top to bottom: Density kernels for standard deviation, log-odds of success in the experiments observed, draws from the population estimates for log-odds of success and probability of success averaged over thousands of newly simulated experiments
The top left plot shows the posterior for the standard deviation (the population variation) of the log-odds of success across the experiments, which is quite large due to the variation found in our sample. The median value for the standard deviation is around 2. If the log-odds of success were centred around zero and we added (or subtracted) two standard deviations this large, then we would not be surprised to see probabilities of success at the limits of the probability range. Therefore, because the model sees that the data are highly variable, extreme values for \(\theta_k\) are left alone. We see relatively little shrinkage in the parameter estimates, and so the results from no pooling and partial pooling are similar.
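A quick check of that intuition:
# Log-odds of 0 plus or minus two standard deviations of ~2 map to
# probabilities near the boundaries
plogis(c(-4, 4))   # roughly 0.018 and 0.982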
Now, consider the third set of data that we generated, see Heterogeneous scenario 2 where we used a standard deviation of \(0.55\) to simulate the data. Fit both the no-pooling and partial pooling models to this dataset:
ld = list(y = d3[, y],
n = d3[, n],
N = d3[, .N],
eta_sd = 10)
l3 <- rstan::sampling(tg_stan$model1,
data = ld,
chains = 1,
thin = 1,
iter = 5000,
warmup = 1000,
refresh = 0)
l4 <- rstan::sampling(tg_stan$model2,
data = ld,
chains = 1,
thin = 1,
iter = 5000,
warmup = 1000,
refresh = 0)
sampl6 <- rstan::extract(l3)
sampl6 <- data.table(sampl6$theta)
names(sampl6) <- names(sampl2)
sampl7 <- rstan::extract(l4)
sampl7 <- data.table(sampl7$theta)
names(sampl7) <- names(sampl2)
Summarise as previously:
dfig <- rbind(
process_samples1(sampl7, desc = "partial", summarise = TRUE),
process_samples1(sampl6, desc = "none", summarise = TRUE))
dfig <- merge(dfig, d3[, .(expt, y, n, mle = theta, theta_tru)], by = "expt")
post <- rstan::extract(l4, pars = c("eta"))
print(plot3(dfig, cent = plogis(mean(post$eta))))
Figure: Credible intervals from Stan posterior samples (no pooling versus partial pooling)
Note: sample means (circle), MLE (cross) and the simulation parameters (triangle). Vertical line is grand mean from hierarchical model.
For this data we see much stronger shrinkage towards the population mean and also narrower credible intervals. The model has inferred that the results from the population of experiments are similar and is therefore more skeptical of abnormally high and low values for \(\hat{\theta_k}\). This is especially the case when the result comes from an experiment with a small number of trials. If we now look at the four-plot summary associated with the current result, we see that the kernel density for the standard deviation parameter is narrower than for the previous dataset. Additionally, the overall probability of success is concentrated around \(0.5\), although this particular dataset was also concentrated around that value.
post <- rstan::extract(l4, pars = c("eta", "u", "u_sd"))
post <- data.table(cbind(post$eta, post$u, post$u_sd))
names(post) <- c("eta", paste0("u", 1:max(d2$expt)), "u_sd")
# plot5 renders directly to the device via grid, so no print() is needed
plot5(post, xlim = c(-1, 1))
Figure: Left to right, top to bottom: Density kernels for standard deviation, log-odds of success in the experiments observed, draws from the population estimates for log-odds of success and probability of success averaged over thousands of newly simulated experiments
Another way to model this problem is to consider the population variation to be distributed according to a beta distribution rather than the normal distribution on the log-odds. The specification of such a model is as follows:
\[ \begin{aligned} Y_k &\sim \mathsf{Binomial}(n_k, \theta_k) \\ \theta_k &\sim \mathsf{Beta}(\alpha, \beta) \\ \alpha &= \eta \times \mu \\ \beta &= \eta \times (1-\mu) \\ \mu &\sim \mathsf{Beta}(a, b) \\ \eta &\sim \mathsf{Exponential}(\lambda) \\ \end{aligned} \]
which can be implemented in Stan:
data {
int<lower=0> N; // data
int<lower=0> n[N];
int<lower=0> y[N];
real<lower=0> a; // prior
real<lower=0> b;
real<lower=0> lambda;
}
parameters {
real<lower=0,upper=1> mu;
real<lower=0> eta;
real<lower=0,upper=1> theta[N];
}
transformed parameters {
real<lower=0> alpha;
real<lower=0> beta;
alpha = eta * mu ;
beta = eta * (1-mu);
}
model {
target += beta_lpdf(mu | a, b);
target += exponential_lpdf(eta | lambda);
target += beta_lpdf(theta | alpha, beta);
target += binomial_lpmf(y | n, theta);
}
In this specification all the machinations are on the probability scale; there are no transforms between log-odds and probability. The beta distribution is parameterised with \(\alpha\) and \(\beta\), which correspond to the prior successes and failures respectively. I have reparameterised \(\alpha\) and \(\beta\) on a somewhat more convenient scale, namely the prior mean and prior sample size, where:
\[ \begin{aligned} \mu &= \frac{\alpha}{\alpha + \beta} \\ \eta &= \alpha + \beta \end{aligned} \]
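The mapping between the two parameterisations is easy to sanity-check with made-up values:
# Round trip between (alpha, beta) and (mu, eta); values are made up
alpha <- 6; beta <- 4
mu  <- alpha / (alpha + beta)   # prior mean
eta <- alpha + beta             # prior "sample size"
c(alpha = eta * mu, beta = eta * (1 - mu))   # recovers 6 and 4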
If we know nothing about \(\mu\) a priori then we can just set \(a, b = 1\), which gives an uninformative (standard uniform) prior. Selecting the prior for \(\eta\) is a little contrived in this setting, as \(\eta\) refers to our prior knowledge of the number of trials involved in the experiments. However, if we know very little then we can set a small value, say \(\lambda = 0.05\) (or lower), which concentrates the prior mass below 20 (the mean of an \(\mathsf{Exponential}(0.05)\)) while still permitting values as large as 100.
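A quick look at the implied prior quantiles supports this reading:
# Median and 95th percentile of the Exponential(0.05) prior on eta
qexp(c(0.5, 0.95), rate = 0.05)              # roughly 14 and 60
pexp(100, rate = 0.05, lower.tail = FALSE)   # P(eta > 100), roughly 0.007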
ld = list(y = d2[, y],
n = d2[, n],
N = d2[, .N],
a = 1,
b = 1,
lambda = 0.05)
l5 <- rstan::sampling(tg_stan$model3,
data = ld,
chains = 1,
thin = 1,
iter = 5000,
warmup = 1000,
refresh = 0)
sampl8 <- rstan::extract(l5)
sampl8 <- data.table(sampl8$theta)
names(sampl8) <- names(sampl2)
The plot below shows the 95% credible intervals using the latest Stan model and compares the result to the credible intervals derived from the logistic model that was used in the Model 1 section.
dfig <- rbind(
process_samples1(sampl8, desc = "beta", summarise = TRUE),
process_samples1(sampl5, desc = "logistic", summarise = TRUE))
dfig <- merge(dfig, d2[, .(expt, y, n, mle = theta, theta_tru)], by = "expt")
post <- rstan::extract(l5, pars = c("mu"))
print(plot3(dfig, cent = mean(post$mu)))
Figure: Credible intervals from Stan posterior samples (beta versus logistic model)
Note: sample means (circle), MLE (cross) and the simulation parameters (triangle). Vertical line is grand mean from beta hierarchical model.
Superficially at least, the results seem similar; however, the beta model does appear to shrink the estimates slightly more than the logistic model for this particular dataset. An additional view of the actual posterior distributions for the \(\theta_k\) parameters is shown below, which again suggests that the two approaches lead to very similar conclusions.
dfig <- rbind(
process_samples1(sampl8, desc = "beta"),
process_samples1(sampl5, desc = "logistic"))
print(plot6(dfig))
Figure: Posterior for probability of success (beta versus logistic model)
For completeness, I repeated the analysis based on the third dataset to see if there were any material differences between the hierarchical beta-binomial and logistic models.
ld = list(y = d3[, y],
n = d3[, n],
N = d3[, .N],
a = 1,
b = 1,
lambda = 0.05)
l6 <- rstan::sampling(tg_stan$model3,
data = ld,
chains = 1,
thin = 1,
iter = 5000,
warmup = 1000,
refresh = 0)
sampl9 <- rstan::extract(l6)
sampl9 <- data.table(sampl9$theta)
names(sampl9) <- names(sampl2)
The plot below shows the 95% credible intervals using the latest Stan model and compares the result to the credible intervals derived from the logistic model that was used in the Model 1 section. Again, the results appear to be more or less equivalent; however, for a couple of the experiments, the credible intervals from the beta model are narrower than those obtained from the logistic model.
dfig <- rbind(
process_samples1(sampl9, desc = "beta", summarise = TRUE),
process_samples1(sampl7, desc = "logistic", summarise = TRUE))
dfig <- merge(dfig, d3[, .(expt, y, n, mle = theta, theta_tru)], by = "expt")
post <- rstan::extract(l6, pars = c("mu"))
print(plot3(dfig, cent = mean(post$mu)))
Figure: Credible intervals from Stan posterior samples (beta versus logistic model)
Note: sample means (circle), MLE (cross) and the simulation parameters (triangle). Vertical line is grand mean from beta hierarchical model.
This was an introductory overview of using hierarchical models. We saw three takes on analysing a simulated dataset: a pooled approach, no pooling and partial pooling. Two common modeling approaches were used for the hierarchical model: logistic and beta-binomial. There was little additional overhead associated with implementing and fitting the hierarchical models in this simple setting, which is commonly found to be the case. In contrast to the no-pooling approach, the hierarchical model learned and responded to the variability in the data. For data that were highly variable, the hierarchical model produced results very similar to the independent (no-pooling) model. However, for data with little variation, the hierarchical model was skeptical about extreme results and shrank them towards the overall mean. The logistic and the beta-binomial implementations produced broadly similar results.