============================================================================================================
Part I: matrix data structure, ‘R2jags’, & ‘bridgesampling’, https://rpubs.com/sherloconan/1018077 (unreliable estimates)

Part II: ragged data structure, ‘R2jags’, & ‘bridgesampling’, https://rpubs.com/sherloconan/1021087

Part III: matrix data structure, ‘rstan’, & ‘bridgesampling’, https://rpubs.com/sherloconan/1021097

Part IV: ragged data structure, ‘rstan’, & ‘bridgesampling’, https://rpubs.com/sherloconan/1022575

Part V: modeling fixed effects, https://rpubs.com/sherloconan/1070217

1. Getting Started

 

In Loftus and Masson (1994), to-be-recalled 20-word lists were presented at a rate of 1, 2, or 5 sec per word. Suppose that this hypothetical experiment is run as a balanced one-way between-subjects design with three conditions.

dat <- matrix(c(10,6,11,22,16,15,1,12,9,8,
                13,8,14,23,18,17,1,15,12,9,
                13,8,14,25,20,17,4,17,12,12), ncol=3,
              dimnames=list(NULL, paste0("Level",1:3))) #wide data format
# dat <- dat * 100

df <- data.frame("Level"=factor(rep(paste0("Level",1:3), each=10)),
                 "Response"=c(dat)) #long data format

 

Alternatively, test a simulated data set.

m1 <- 100; sd <- 20; n <- 24 #masking 'sd' with a numeric is safe: calls like sd(dat) below still find stats::sd
m2 <- m1 - pwr::pwr.t.test(n=n, power=.8)$d * sd #83.47737
set.seed(277)
dat <- matrix(c(rnorm(n, m1, sd),
                rnorm(n, m2, sd)), ncol=2,
              dimnames=list(NULL, paste0("Level",1:2)))
# dat <- dat * 100

df <- data.frame("Level"=factor(rep(paste0("Level",1:2), each=n)),
                 "Response"=c(dat))
## The Bayes factors will be computed for the simulated data (in the second chunk).
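
As a sanity check, the simulated mean difference is constructed so that a two-sample t test with n = 24 per group has 80% power; a minimal verification using the values from the chunk above:

d <- (m1 - m2) / sd              #effect size implied by the simulated means (~0.83)
pwr::pwr.t.test(n=n, d=d)$power  #~0.8 by construction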

 

Method: Constructing the Jeffreys-Zellner-Siow (JZS) Bayes factor.

\[\begin{align*} \tag{1} \mathcal{M}_1:\ Y_{ij}&=\mu+\sigma_\epsilon d_i+\epsilon_{ij}\\ \text{versus}\quad\mathcal{M}_0:\ Y_{ij}&=\mu+\epsilon_{ij},\qquad\epsilon_{ij}\overset{\text{i.i.d.}}{\sim}\mathcal{N}(0,\sigma_\epsilon^2) \text{ for } i=1,\dotsb,a;\ j=1,\dotsb,n. \end{align*}\]

\[\begin{equation} \tag{2} \pi(\mu,\sigma_\epsilon^2)\propto 1/\ \sigma_\epsilon^{2} \end{equation}\]

\[\begin{equation} \tag{3} d_i\mid g\overset{\text{i.i.d.}}{\sim}\mathcal{N}(0,g) \end{equation}\]

\[\begin{equation} \tag{4} g\sim\text{Scale-inv-}\chi^2(1,h^2) \end{equation}\]
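
The Bayes factor in favor of \(\mathcal{M}_1\) is the ratio of marginal likelihoods, each obtained by integrating the likelihood over the prior:

\[\begin{equation} \tag{5} \textit{BF}_{10}=\frac{p(\text{Data}\ \mid\ \mathcal{M}_1)}{p(\text{Data}\ \mid\ \mathcal{M}_0)},\qquad p(\text{Data}\mid\mathcal{M}_k)=\int p(\text{Data}\mid\theta_k,\mathcal{M}_k)\ \pi(\theta_k\mid\mathcal{M}_k)\ d\theta_k. \end{equation}\]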

(JZS_BF <- anovaBF(Response~Level, data=df, whichRandom="Level", progress=F))
## Bayes factor analysis
## --------------
## [1] Level : 15.57637 ±0%
## 
## Against denominator:
##   Intercept only 
## ---
## Bayes factor type: BFlinearModel, JZS

Note: Random effects are assumed (whichRandom="Level"); omitting this argument would treat Level as a fixed effect. See the discussion here.
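
For contrast, the fixed-effects version (the anovaBF default, covered in Part V) simply omits that argument:

# anovaBF(Response~Level, data=df, progress=F) #Level as a fixed effect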

 

Markov chain Monte Carlo (MCMC) diagnostics: trace plots of iteration number versus sampled value for each variable in the chain.

set.seed(277)
draws <- posterior(JZS_BF, iterations=1e5, progress=F)
summary(draws)
## 
## Iterations = 1:1e+05
## Thinning interval = 1 
## Number of chains = 1 
## Sample size per chain = 1e+05 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##                 Mean     SD Naive SE Time-series SE
## mu            95.263  40.55   0.1282         0.1206
## Level-Level1   9.402  40.56   0.1283         0.1205
## Level-Level2  -9.648  40.55   0.1282         0.1206
## sig2         468.706 101.48   0.3209         0.3415
## g_Level        8.382 396.02   1.2523         1.6672
## 
## 2. Quantiles for each variable:
## 
##                  2.5%      25%     50%     75%  97.5%
## mu            40.0456  84.4686  95.087 105.916 149.69
## Level-Level1 -44.7282  -1.2284   9.296  20.156  65.37
## Level-Level2 -64.3941 -20.1608  -9.278   1.291  45.60
## sig2         309.8889 397.2583 454.970 525.217 705.63
## g_Level        0.1857   0.5092   1.031   2.485  27.46
plot(draws, cex.lab=1.5, cex.axis=1.5, cex.main=1.5, cex.sub=1.5)
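
Beyond trace plots, effective sample sizes offer another quick diagnostic; since draws behaves as a coda mcmc object, the following sketch applies:

# coda::effectiveSize(draws) #effective sample size for each variable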

   

2A. Full Stan Model

 

\[\begin{align*} \tag{1} \mathcal{M}_1:\ Y_{ij}&=\mu+\sigma_\epsilon d_i+\epsilon_{ij}\\ \text{versus}\quad\mathcal{M}_0:\ Y_{ij}&=\mu+\epsilon_{ij},\qquad\epsilon_{ij}\overset{\text{i.i.d.}}{\sim}\mathcal{N}(0,\sigma_\epsilon^2) \text{ for } i=1,\dotsb,a;\ j=1,\dotsb,n. \end{align*}\]

\[\begin{equation} \tag{2} \pi(\mu,\sigma_\epsilon^2)\propto 1/\ \sigma_\epsilon^{2} \end{equation}\]

\[\begin{equation} \tag{3} d_i\mid g\overset{\text{i.i.d.}}{\sim}\mathcal{N}(0,g) \end{equation}\]

\[\begin{equation} \tag{4} g\sim\text{Scale-inv-}\chi^2(1,h^2) \end{equation}\]

 

Specify the full model \(\mathcal{M}_1\) in (1). Compile and fit the model in Stan.

Note that to compute the (log) marginal likelihood for a Stan model, we need to specify the model in a particular way. Instead of using “~” signs to specify distributions, we directly increment the target with the (log) density functions. The reason is that the “~” notation drops constant terms that are not needed for sampling from the posterior, whereas for computing the marginal likelihood these constants must be retained. For instance, instead of writing y ~ normal(mu, sigma); we need to write target += normal_lpdf(y | mu, sigma); (Gronau, 2021). See also the discussion regarding the implicit uniform prior when computing the log marginal likelihood.
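
To see what is at stake: for a normal likelihood with unknown mu and sigma, the dropped constant is \(-\frac{1}{2}\log(2\pi)\) per observation. A quick R check with illustrative values:

x <- 1.3; mu <- 0; sigma <- 2                #illustrative values
full <- dnorm(x, mu, sigma, log=TRUE)        #what normal_lpdf() keeps
prop <- -0.5*((x - mu)/sigma)^2 - log(sigma) #what 'y ~ normal(mu, sigma)' effectively targets
full - prop                                  #-0.5*log(2*pi), constant in (mu, sigma)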

stancodeM1 <- "
data {
  int<lower=1> N;        // number of subjects in the balanced group
  int<lower=2> C;        // number of conditions
  // vector[C] Y[N];     // responses [old array syntax, removed in recent Stan versions]
  array[N] vector[C] Y;  // responses [Stan 2.26+ syntax for array declarations]
  real<lower=0> ht;      // scale parameter of the standardized treatment effects
  real lwr;              // lower bound for the grand mean
  real upr;              // upper bound for the grand mean
}

parameters {
  real<lower=lwr, upper=upr> mu;     // grand mean
  real<lower=0> sigma;               // standard deviation of the error
  real<lower=0> gt;                  // variance of the standardized treatment effects
  vector[C] t;                       // standardized treatment effects
}

model {
  target += uniform_lpdf(mu | lwr, upr);
  target += scaled_inv_chi_square_lpdf(gt | 1, ht); // Rouder et al. (2012)
  target += -log(sigma);                            // Jeffreys prior
  target += normal_lpdf(t | 0, sqrt(gt));
  for (i in 1:N) {
    target += normal_lpdf(Y[i] | mu + sigma * t, sigma);
  }
}"

stanmodelM1 <- stan_model(model_code=stancodeM1)
stanfitM1 <- sampling(stanmodelM1, data=list("N"=nrow(dat), "C"=ncol(dat), "Y"=dat, "ht"=1,
                                             "lwr"=mean(dat)-6*sd(dat),
                                             "upr"=mean(dat)+6*sd(dat)),
                      iter=50000, warmup=10000, chains=4, seed=277, refresh=0)

rstan::summary(stanfitM1)[[1]] %>% kable(digits=4) %>% kable_classic(full_width=F)
            mean  se_mean       sd       2.5%        25%        50%        75%      97.5%     n_eff   Rhat
mu       95.0844   0.1697  24.0371    43.8462    84.6695    95.2221   105.7901   144.9831  20059.20 1.0002
sigma    21.5113   0.0087   2.2848    17.6000    19.9049    21.3020    22.9081    26.5530  68271.30 1.0000
gt        3.6528   0.1138  30.2947     0.1831     0.5027     1.0101     2.3984    20.9844  70831.87 1.0000
t[1]      0.4502   0.0080   1.1231    -1.8446    -0.0544     0.4301     0.9343     2.8369  19943.46 1.0002
t[2]     -0.4441   0.0080   1.1234    -2.7907    -0.9434    -0.4373     0.0496     1.9089  19925.32 1.0002
lp__   -221.3084   0.0113   1.8739  -225.9730  -222.2747  -220.9189  -219.9243  -218.8357  27739.84 1.0000
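
Note: The All-in-One R Script below uses an equivalent parameterization of \(\mathcal{M}_1\): eta = sigma * sqrt(gt) is declared as a transformed parameter, the unstandardized effects are sampled via target += normal_lpdf(t | 0, eta), and the likelihood becomes normal_lpdf(Y[i] | mu + t, sigma). Since the script's t equals sigma times the standardized t used here, both specifications define the same model and hence the same marginal likelihood.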

   

2B. Null Stan Model

 

\[\begin{align*} \tag{1} \mathcal{M}_1:\ Y_{ij}&=\mu+\sigma_\epsilon d_i+\epsilon_{ij}\\ \text{versus}\quad\mathcal{M}_0:\ Y_{ij}&=\mu+\epsilon_{ij},\qquad\epsilon_{ij}\overset{\text{i.i.d.}}{\sim}\mathcal{N}(0,\sigma_\epsilon^2) \text{ for } i=1,\dotsb,a;\ j=1,\dotsb,n. \end{align*}\]

\[\begin{equation} \tag{2} \pi(\mu,\sigma_\epsilon^2)\propto 1/\ \sigma_\epsilon^{2} \end{equation}\]

\[\begin{equation} \tag{3} d_i\mid g\overset{\text{i.i.d.}}{\sim}\mathcal{N}(0,g) \end{equation}\]

\[\begin{equation} \tag{4} g\sim\text{Scale-inv-}\chi^2(1,h^2) \end{equation}\]

 

Specify the null model \(\mathcal{M}_0\) in (1). Compile and fit the model in Stan.

stancodeM0 <- "
data {
  int<lower=1> N;        // total number of subjects
  vector[N] Y;           // responses
  real lwr;              // lower bound for the grand mean
  real upr;              // upper bound for the grand mean
}

parameters {
  real<lower=lwr, upper=upr> mu;     // grand mean
  real<lower=0> sigma;               // standard deviation of the error
}

model {
  target += uniform_lpdf(mu | lwr, upr);
  target += -log(sigma);             // Jeffreys prior
  target += normal_lpdf(Y | mu, sigma);
}"

stanmodelM0 <- stan_model(model_code=stancodeM0)
stanfitM0 <- sampling(stanmodelM0, data=list("N"=length(dat), "Y"=c(dat),
                                             "lwr"=mean(dat)-6*sd(dat),
                                             "upr"=mean(dat)+6*sd(dat)),
                      iter=50000, warmup=10000, chains=4, seed=277, refresh=0)

rstan::summary(stanfitM0)[[1]] %>% kable(digits=4) %>% kable_classic(full_width=F)
           mean  se_mean      sd       2.5%        25%        50%        75%      97.5%      n_eff  Rhat
mu      95.1550   0.0096  3.4545    88.3366    92.8554    95.1628    97.4477   101.9427  129408.41     1
sigma   23.7295   0.0073  2.5147    19.4183    21.9589    23.5224    25.2621    29.2509  118611.49     1
lp__  -221.2673   0.0038  1.0285  -224.0292  -221.6629  -220.9494  -220.5351  -220.2686   71402.53     1

   

3. Bayes Factor via Bridge Sampling

 

Compute log marginal likelihoods for full and null models via bridge sampling. Then, \(\textit{BF}_{10}=\frac{p(\text{Data}\ \mid\ \mathcal{M}_1)}{p(\text{Data}\ \mid\ \mathcal{M}_0)}\).

M1 <- bridge_sampler(stanfitM1, silent=T)
summary(M1)
## 
## Bridge sampling log marginal likelihood estimate 
## (method = "normal", repetitions = 1):
## 
##  -220.9802
## 
## Error Measures:
## 
##  Relative Mean-Squared Error: 1.294006e-05
##  Coefficient of Variation: 0.00359723
##  Percentage Error: 0%
## 
## Note:
## All error measures are approximate.
M0 <- bridge_sampler(stanfitM0, silent=T)
summary(M0)
## 
## Bridge sampling log marginal likelihood estimate 
## (method = "normal", repetitions = 1):
## 
##  -223.7123
## 
## Error Measures:
## 
##  Relative Mean-Squared Error: 8.649591e-08
##  Coefficient of Variation: 0.0002941019
##  Percentage Error: 0%
## 
## Note:
## All error measures are approximate.
bf(M1, M0)
## Estimated Bayes factor in favor of M1 over M0: 15.36527
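
Equivalently, the Bayes factor follows directly from the two stored log marginal likelihoods:

exp(M1$logml - M0$logml)
## [1] 15.36527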

 

Check the Monte Carlo error of estimating the Bayes factor.

num <- 50; BF <- PerErr.M1 <- PerErr.M0 <- numeric(num)
system.time(
for (i in 1:num) { #roughly 42 min of run time
  result <- BFBS(seed=i) #defined in All-in-One R Script
  BF[i] <- result$bf
  PerErr.M1[i] <- result$pererr.M1
  PerErr.M0[i] <- result$pererr.M0
})


range(PerErr.M1); range(PerErr.M0)
## [1] 0.002901795 0.004325503
## [1] 0.0002903806 0.0003130767

hist(BF, prob=T, col="white", yaxt="n", ylab="",
     xlab="Bayes factor estimates", main="")
lines(density(BF), col="red", lwd=2)
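
The location and spread of the 50 estimates can also be summarized numerically, e.g.:

# mean(BF); sd(BF); range(BF) #Monte Carlo variability of the Bayes factor estimates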

   

All-in-One R Script

 

library(rstan)
# devtools::install_github("quentingronau/bridgesampling@master")
library(bridgesampling)
library(BayesFactor)

run.this=F #if TRUE, test the hypothetical data; if FALSE, test the simulated data
if (run.this) {
  dat <- matrix(c(10,6,11,22,16,15,1,12,9,8,
                  13,8,14,23,18,17,1,15,12,9,
                  13,8,14,25,20,17,4,17,12,12), ncol=3,
                dimnames=list(NULL, paste0("Level",1:3))) #wide data format
  
  df <- data.frame("Level"=factor(rep(paste0("Level",1:3), each=10)),
                   "Response"=c(dat)) #long data format
} else {
  m1 <- 100; sd <- 20; n <- 24
  m2 <- m1 - pwr::pwr.t.test(n=n, power=.8)$d * sd #83.47737
  set.seed(277)
  dat <- matrix(c(rnorm(n, m1, sd),
                  rnorm(n, m2, sd)), ncol=2,
                dimnames=list(NULL, paste0("Level",1:2)))
  
  df <- data.frame("Level"=factor(rep(paste0("Level",1:2), each=n)),
                   "Response"=c(dat))
}

(JZS_BF <- anovaBF(Response~Level, data=df, whichRandom="Level", progress=F)) #random effects, as in Section 1
# set.seed(277)
# draws <- posterior(JZS_BF, iterations=1e5, progress=F)
# summary(draws)
# plot(draws, cex.lab=1.5, cex.axis=1.5, cex.main=1.5, cex.sub=1.5)

{
  stancodeM1 <- "
  data {
    int<lower=1> N;        // number of subjects in the balanced group
    int<lower=2> C;        // number of conditions
    array[N] vector[C] Y;  // responses (Stan 2.26+ array syntax)
    real<lower=0> ht;      // scale parameter of the standardized treatment effects
    real lwr;              // lower bound for the grand mean
    real upr;              // upper bound for the grand mean
  }

  parameters {
    real<lower=lwr, upper=upr> mu;     // grand mean
    real<lower=0> sigma;               // standard deviation of the error
    real<lower=0> gt;                  // variance of the standardized treatment effects
    vector[C] t;                       // treatment effects
  }

  transformed parameters {
    real<lower=0> eta;     // standard deviation of the treatment effects
    eta = sigma * sqrt(gt);
  }

  model {
    target += uniform_lpdf(mu | lwr, upr);
    target += scaled_inv_chi_square_lpdf(gt | 1, ht); // Rouder et al. (2012)
    target += -log(sigma);                            // Jeffreys prior
    target += normal_lpdf(t | 0, eta);
    for (i in 1:N) {
      target += normal_lpdf(Y[i] | mu + t, sigma);
    }
  }"
  
  stanmodelM1 <- stan_model(model_code=stancodeM1)
  
  stancodeM0 <- "
  data {
    int<lower=1> N;        // total number of subjects
    vector[N] Y;           // responses
    real lwr;              // lower bound for the grand mean
    real upr;              // upper bound for the grand mean
  }

  parameters {
    real<lower=lwr, upper=upr> mu;     // grand mean
    real<lower=0> sigma;               // standard deviation of the error
  }

  model {
    target += uniform_lpdf(mu | lwr, upr);
    target += -log(sigma);             // Jeffreys prior
    target += normal_lpdf(Y | mu, sigma);
  }"
  
  stanmodelM0 <- stan_model(model_code=stancodeM0)
} #Stan models


BFBS <- function(seed=277, diagnostics=F) {
  stanfitM0 <- sampling(stanmodelM0, data=list("N"=length(dat), "Y"=c(dat),
                                               "lwr"=mean(dat)-6*sd(dat),
                                               "upr"=mean(dat)+6*sd(dat)),
                        iter=50000, warmup=10000, chains=4,
                        seed=seed, refresh=0)
  stanfitM1 <- sampling(stanmodelM1, data=list("N"=nrow(dat), "C"=ncol(dat), "Y"=dat, "ht"=1,
                                               "lwr"=mean(dat)-6*sd(dat), "upr"=mean(dat)+6*sd(dat)),
                        iter=50000, warmup=10000, chains=4,
                        seed=seed, refresh=0)
  
  set.seed(seed)
  M0 <- bridge_sampler(stanfitM0, silent=T)
  
  set.seed(seed)
  M1 <- bridge_sampler(stanfitM1, silent=T)
  
  if (diagnostics) {
    list("bf"=exp(M1$logml - M0$logml),
         "pererr.M1"=error_measures(M1)$cv,
         "pererr.M0"=error_measures(M0)$cv,
         "stan.M1"=stanfitM1, "stan.M0"=stanfitM0)
  } else {
    list("bf"=exp(M1$logml - M0$logml),
         "pererr.M1"=error_measures(M1)$cv,
         "pererr.M0"=error_measures(M0)$cv)
  }
}

BFBS()


# MCMC <- BFBS(diagnostics=T)
# rstan::summary(MCMC$stan.M1)[[1]]
# rstan::summary(MCMC$stan.M0)[[1]]