Illustration of MCMC concepts

Mixing and sampling efficiency of MCMC algorithms

Random-walk Metropolis sampler

rand_walk_metropolis <- function(eval_logp, init, n_iter, prop_sd, seed = NA) {
  
  if (!is.na(seed)) { set.seed(seed) }
  
  n_dim <- length(init)
  samples <- matrix(NA, nrow = n_dim, ncol = n_iter)
  logp_samples <- vector("numeric", n_iter)
  n_accepted <- 0L
  x <- init
  logp <- eval_logp(x)
  
  for (i in 1:n_iter) {
    x_prop <- x + prop_sd * rnorm(n_dim)
    logp_prop <- eval_logp(x_prop)
    accepted <- logp_prop - logp > log(runif(1))
    if (accepted) {
      x <- x_prop
      logp <- logp_prop
    }
    samples[, i] <- x
    logp_samples[i] <- logp
    n_accepted <- n_accepted + accepted
  }
  
  rownames(samples) <- paste0("param_", 1:n_dim)
  return(list(
    samples = samples, 
    logp = logp_samples,
    accept_rate = n_accepted / n_iter
  ))
}

Sample from 16-dimensional i.i.d. Gaussian

d <- 16
target_logp <- function(x) { - 0.5 * sum(x^2) }

# Pick theoretically optimal (for sufficiently large `d`) proposal sd
prop_sd <- 2.38 / sqrt(d) 

init <- rep(0, d)
n_iter <- 10^5
rwm_output <- rand_walk_metropolis(
  target_logp, init, n_iter, prop_sd, seed = 1918
)

Examine the traceplot, auto-correlation, and ESS along the first coordinate

coord_samples <- rwm_output$samples[1, ]
trace_plot(coord_samples[1:1000])

auto_cor_plot(coord_samples, lag.max = 100)

ess_est <- list(
  ar_proc = coda::effectiveSize(coord_samples)[[1]],
  batch_means = mcmcse::ess(coord_samples, method = "bm"),
  init_pos_seq = n_iter / mcmc::initseq(coord_samples)$var.pos
)
print(ess_est)

## $ar_proc
## [1] 2026.507
## 
## $batch_means
## [1] 1884.35
## 
## $init_pos_seq
## [1] 1937.267

mcmc_iter_per_ess <- lapply(ess_est, function (ess) n_iter / ess)
print(mcmc_iter_per_ess)

## $ar_proc
## [1] 49.346
## 
## $batch_means
## [1] 53.06869
## 
## $init_pos_seq
## [1] 51.61911

Check that the dist of MCMC samples coincides with the target

Compound symmetric Gaussian target

Consider a Gaussian target with covariance \(\Sigma_{ij} = \rho\) for all \(i \neq j\).

d <- 16
rho <- 0.9
Sigma <- (1 - rho) * diag(d) + rho * array(1., dim = c(d, d))
# Analytically invert `Sigma`
Prec <- 1 / (1 - rho) * (
  diag(d) - rho / (1 - rho + rho * d) * array(1., dim = c(d, d))
) 

cs_target_logp <- function(x) { - 0.5 * t(x) %*% Prec %*% x }

# Approximate, based on the compound symmetry structure, the optimal proposal sd
smaller_target_sd <- sqrt(min(eigen(Sigma)$values)) # only two unique eigenvalues
prop_sd <- smaller_target_sd * 2.38 / sqrt(d) 

init <- rep(0, d)
n_iter <- 10^5
rwm_output <- rand_walk_metropolis(
  cs_target_logp, init, n_iter, prop_sd, seed=1
)

As before, examine the traceplot, auto-correlation, and ESS

coord_samples <- rwm_output$samples[1, ]
trace_plot(coord_samples, thin = 100L)

auto_cor_plot(coord_samples, lag.max = 10000)

ess_est <- list(
  ar_proc = coda::effectiveSize(coord_samples)[[1]],
  batch_means = mcmcse::ess(coord_samples, method = "bm"),
  init_pos_seq = n_iter / mcmc::initseq(coord_samples)$var.pos
)
print(ess_est)

## $ar_proc
## [1] 73.92896
## 
## $batch_means
## [1] 47.3955
## 
## $init_pos_seq
## [1] 24.41273

mcmc_iter_per_ess <- lapply(ess_est, function (ess) n_iter / ess)
print(mcmc_iter_per_ess)

## $ar_proc
## [1] 1352.65
## 
## $batch_means
## [1] 2109.905
## 
## $init_pos_seq
## [1] 4096.223

How does the dist of MCMC samples compare to the actual target?

Don’t easily trust the output when the mixing is this bad

rwm_output <- rand_walk_metropolis(
  cs_target_logp, init, n_iter, prop_sd, seed=615
)
coord_samples <- rwm_output$samples[1, ]
trace_plot(coord_samples, thin = 100L)

auto_cor_plot(coord_samples, lag.max = 10000)

ess_est <- list(
  ar_proc = coda::effectiveSize(coord_samples)[[1]],
  batch_means = mcmcse::ess(coord_samples, method = "bm"),
  init_pos_seq = n_iter / mcmc::initseq(coord_samples)$var.pos
)
print(ess_est)

## $ar_proc
## [1] 38.80584
## 
## $batch_means
## [1] 36.57794
## 
## $init_pos_seq
## [1] 5.265344

hist_against_truth(coord_samples, true_density = dnorm)

Convergence of MCMC algorithms

Start away from stationarity on the compound symmetric target

set.seed(410)
init <- rnorm(d)
init[c(1, 2)] <- 3.5

n_iter <- 10^4
rwm_output <- rand_walk_metropolis(
  cs_target_logp, init, n_iter, prop_sd, seed=21205
)

Examine the chain’s trajectory in the first two dimensions

index <- c(1, 2)
target_density <- function(xy) {
  mvtnorm::dmvnorm(xy, sigma = Sigma[index, index])
}
n_iter_to_plot <- 1000L
plot_trajectory(rwm_output$samples, index, n_iter_to_plot, target_density)

Assess convergence via the log target density

trace_plot(
  rwm_output$logp[1:500],
  ylab = "Log density"
)

Visually check if the convergence in log density coincides with that in parameter values

n_iter_to_plot <- 200L
plot_trajectory(rwm_output$samples, index, n_iter_to_plot, target_density)

Also check that, by zooming in a bit, the log density actually stabilized

trace_plot(
  rwm_output$logp[200:1200],
  ylab = "Log density"
)

Illustration of MCMC concepts

By Akihiko Nishimura for “Bayesian Methods II” (Biostat 140.763) at Johns Hopkins

Mixing and sampling efficiency of MCMC algorithms

Random-walk Metropolis sampler

Sample from 16-dimensional i.i.d. Gaussian

Examine the traceplot, auto-correlation, and ESS along the first coordinate

Check that the dist of MCMC samples coincides with the target

Compound symmetric Gaussian target

As before, examine the traceplot, auto-correlation, and ESS

How does the dist of MCMC samples compare to the actual target?

Don’t easily trust the output when the mixing is this bad

Convergence of MCMC algorithms

Start away from stationarity on the compound symmetric target

Examine the chain’s trajectory in the first two dimensions

Assess convergence via the log target density

Visually check if the convergence in log density coincides with that in parameter values

Also check that, by zooming in a bit, the log density actually stabilized