Introduction

Assume that we have data sampled from a multivariate normal.

b <- c(1, 3, 2, 2)
s <- c(0.4, 1, 0.8, 2)
# correlations 
Rho <- rbind(
  c( 1.0,  0.4, -0.3, -0.1),
  c( 0.4,  1.0,  0.7,  0.3),
  c(-0.3,  0.7,  1.0,  0.1),
  c(-0.1,  0.3,  0.1,  1.0)
)
S <- diag(s) %*% Rho %*% diag(s)
# S needs to be positive definite
# For a positive definite matrix, the eigenvalues should be positive.
stopifnot(all(eigen(S)$values>0))
N_obs <- 200
set.seed(5) # used to replicate example
y <- mvtnorm::rmvnorm( N_obs , b , S ) 

We model the data as

\[ \begin{aligned} \mathbf{y} &\sim MVNormal(\mathbf{\mu}, \mathbf{\Sigma}) \\ \mathbf{\mu} &\sim t-dist(3, 0, 1) \\ \mathbf{\Sigma} &= \begin{pmatrix} \sigma_{\mu_{1}} & 0 & 0 & 0 \\ 0 & \sigma_{\mu_{2}} & 0 & 0 \\ 0 & 0 & \sigma_{\mu_{3}} & 0 \\ 0 & 0 & 0 & \sigma_{\mu_{4}} \\ \end{pmatrix} R \begin{pmatrix} \sigma_{\mu_{1}} & 0 & 0 & 0 \\ 0 & \sigma_{\mu_{2}} & 0 & 0 \\ 0 & 0 & \sigma_{\mu_{3}} & 0 \\ 0 & 0 & 0 & \sigma_{\mu_{4}} \\ \end{pmatrix} \\ R &\sim LKJcorr(2) \\ \sigma &\sim t-dist(3, 0, 1)^+ \end{aligned} \]

Prior on the correlation matrix

In stan, this can be implemented with a prior on the correlation matrix:

m2a <- cmdstanr::cmdstan_model(paste0(path_pref, "/stan/mvn_02a.stan"))
m2a
## data {
##   int N;
##   array[N] vector[4] y;
## }
## parameters {
##   // group means
##   vector[4] b;
##   corr_matrix[4] Rho;
##   vector<lower=0>[4] s;
## }
## transformed parameters{
## }
## model {
##   target += lkj_corr_lpdf(Rho | 2);
##   // for no particular reason
##   target += student_t_lpdf(b | 3, 0, 1);
##   target += student_t_lpdf(s | 3, 0, 1) - student_t_lccdf(0 | 3, 0, 1);
##   target += multi_normal_lpdf( y | b , quad_form_diag(Rho , s) );
## }
## generated quantities{
## }

Fit the model:

set.seed(2)
lsd <- list(
  N = nrow(y),
  y = y
)

f2a <- m2a$sample(lsd,
    iter_warmup = 1000,
    iter_sampling = 2000,
    parallel_chains = 3,
    chains = 3,
    refresh = 0, show_exceptions = F)
## Running MCMC with 3 parallel chains...
## 
## Chain 3 finished in 5.3 seconds.
## Chain 2 finished in 5.6 seconds.
## Chain 1 finished in 5.9 seconds.
## 
## All 3 chains finished successfully.
## Mean chain execution time: 5.6 seconds.
## Total execution time: 6.0 seconds.

and summarise the parameters:

rbind(b, s)
f2a$summary(variables = c("b", "s"))
##   [,1] [,2] [,3] [,4]
## b  1.0    3  2.0    2
## s  0.4    1  0.8    2
## # A tibble: 8 × 10
##   variable  mean median     sd    mad    q5   q95  rhat ess_bulk ess_tail
##   <chr>    <num>  <num>  <num>  <num> <num> <num> <num>    <num>    <num>
## 1 b[1]     0.941  0.941 0.0292 0.0293 0.893 0.989  1.00    6061.    4442.
## 2 b[2]     3.03   3.03  0.0717 0.0703 2.91  3.15   1.00    4860.    4219.
## 3 b[3]     2.10   2.10  0.0564 0.0556 2.00  2.19   1.00    5441.    4012.
## 4 b[4]     2.16   2.16  0.134  0.131  1.94  2.38   1.00    6766.    4355.
## 5 s[1]     0.411  0.410 0.0206 0.0203 0.379 0.446  1.00    6368.    4394.
## 6 s[2]     1.02   1.02  0.0508 0.0495 0.940 1.11   1.00    4176.    4058.
## 7 s[3]     0.802  0.800 0.0400 0.0398 0.739 0.872  1.00    4508.    3626.
## 8 s[4]     1.87   1.87  0.0937 0.0929 1.73  2.04   1.00    5493.    4377.
Rho[1,]
f2a$summary(variables = c("Rho[1,2]","Rho[1,3]","Rho[1,4]"))
## [1]  1.0  0.4 -0.3 -0.1
## # A tibble: 3 × 10
##   variable    mean  median     sd    mad      q5    q95  rhat ess_bulk ess_tail
##   <chr>      <num>   <num>  <num>  <num>   <num>  <num> <num>    <num>    <num>
## 1 Rho[1,2]  0.450   0.452  0.0554 0.0547  0.356   0.538  1.00    4371.    4036.
## 2 Rho[1,3] -0.271  -0.272  0.0653 0.0650 -0.377  -0.161  1.00    4577.    4197.
## 3 Rho[1,4]  0.0205  0.0211 0.0691 0.0698 -0.0934  0.134  1.00    6911.    4406.
Rho[2,]
f2a$summary(variables = c("Rho[2,3]","Rho[2,4]"))
## [1] 0.4 1.0 0.7 0.3
## # A tibble: 2 × 10
##   variable  mean median     sd    mad    q5   q95  rhat ess_bulk ess_tail
##   <chr>    <num>  <num>  <num>  <num> <num> <num> <num>    <num>    <num>
## 1 Rho[2,3] 0.680  0.682 0.0375 0.0378 0.617 0.739  1.00    5228.    4519.
## 2 Rho[2,4] 0.330  0.331 0.0623 0.0623 0.223 0.428  1.00    6533.    4296.
Rho[3,]
f2a$summary(variables = c("Rho[3,4]"))
## [1] -0.3  0.7  1.0  0.1
## # A tibble: 1 × 10
##   variable   mean median     sd    mad      q5   q95  rhat ess_bulk ess_tail
##   <chr>     <num>  <num>  <num>  <num>   <num> <num> <num>    <num>    <num>
## 1 Rho[3,4] 0.0575 0.0578 0.0697 0.0689 -0.0597 0.172  1.00    6304.    4448.

Prior on the Cholesky factor

The other way to implement this is via a prior on the Cholesky factor, which is normally a bit quicker:

m2b <- cmdstanr::cmdstan_model(paste0(path_pref, "/stan/mvn_02b.stan"))
m2b
## data {
##   int N;
##   array[N] vector[4] y;
## }
## parameters {
##   // group means
##   vector[4] b;
##   cholesky_factor_corr[4] LRho; 
##   vector<lower=0>[4] s;
## }
## transformed parameters{
## }
## model {
##   target += lkj_corr_cholesky_lpdf(LRho | 2);
##   // for no particular reason
##   target += student_t_lpdf(b | 3, 0, 1);
##   target += student_t_lpdf(s | 3, 0, 1) - student_t_lccdf(0 | 3, 0, 1);
##   target += multi_normal_cholesky_lpdf( y | b , diag_pre_multiply(s, LRho));
## }
## generated quantities{
##   matrix[4,4] Rho;
##   Rho = multiply_lower_tri_self_transpose(LRho);
## }
set.seed(2)
lsd <- list(
  N = nrow(y),
  y = y
)

f2b <- m2b$sample(lsd,
    iter_warmup = 1000,
    iter_sampling = 2000,
    parallel_chains = 3,
    chains = 3,
    refresh = 0, show_exceptions = F)
## Running MCMC with 3 parallel chains...
## 
## Chain 3 finished in 4.2 seconds.
## Chain 2 finished in 4.4 seconds.
## Chain 1 finished in 4.6 seconds.
## 
## All 3 chains finished successfully.
## Mean chain execution time: 4.4 seconds.
## Total execution time: 4.7 seconds.
rbind(b, s)
f2b$summary(variables = c("b", "s"))
##   [,1] [,2] [,3] [,4]
## b  1.0    3  2.0    2
## s  0.4    1  0.8    2
## # A tibble: 8 × 10
##   variable  mean median     sd    mad    q5   q95  rhat ess_bulk ess_tail
##   <chr>    <num>  <num>  <num>  <num> <num> <num> <num>    <num>    <num>
## 1 b[1]     0.941  0.941 0.0288 0.0286 0.893 0.989  1.00    6923.    3561.
## 2 b[2]     3.03   3.03  0.0727 0.0725 2.91  3.15   1.00    5741.    3773.
## 3 b[3]     2.10   2.10  0.0572 0.0579 2.00  2.19   1.00    6209.    4307.
## 4 b[4]     2.17   2.17  0.130  0.128  1.95  2.38   1.00    7716.    4243.
## 5 s[1]     0.411  0.410 0.0214 0.0213 0.378 0.448  1.00    7104.    4337.
## 6 s[2]     1.02   1.02  0.0508 0.0502 0.940 1.11   1.00    4517.    4000.
## 7 s[3]     0.802  0.800 0.0409 0.0406 0.738 0.872  1.00    4296.    4171.
## 8 s[4]     1.88   1.87  0.0936 0.0914 1.73  2.04   1.00    5879.    4799.
Rho[1,]
f2b$summary(variables = c("Rho[1,2]","Rho[1,3]","Rho[1,4]"))
## [1]  1.0  0.4 -0.3 -0.1
## # A tibble: 3 × 10
##   variable    mean  median     sd    mad      q5    q95  rhat ess_bulk ess_tail
##   <chr>      <num>   <num>  <num>  <num>   <num>  <num> <num>    <num>    <num>
## 1 Rho[1,2]  0.449   0.451  0.0556 0.0549  0.353   0.538  1.00    4319.    3994.
## 2 Rho[1,3] -0.272  -0.274  0.0644 0.0648 -0.376  -0.165  1.00    4864.    4213.
## 3 Rho[1,4]  0.0189  0.0183 0.0680 0.0688 -0.0925  0.129  1.00    6734.    4387.
Rho[2,]
f2b$summary(variables = c("Rho[2,3]","Rho[2,4]"))
## [1] 0.4 1.0 0.7 0.3
## # A tibble: 2 × 10
##   variable  mean median     sd    mad    q5   q95  rhat ess_bulk ess_tail
##   <chr>    <num>  <num>  <num>  <num> <num> <num> <num>    <num>    <num>
## 1 Rho[2,3] 0.680  0.682 0.0384 0.0374 0.613 0.740  1.00    5711.    4023.
## 2 Rho[2,4] 0.330  0.331 0.0611 0.0625 0.229 0.429  1.00    6987.    4743.
Rho[3,]
f2b$summary(variables = c("Rho[3,4]"))
## [1] -0.3  0.7  1.0  0.1
## # A tibble: 1 × 10
##   variable   mean median     sd    mad      q5   q95  rhat ess_bulk ess_tail
##   <chr>     <num>  <num>  <num>  <num>   <num> <num> <num>    <num>    <num>
## 1 Rho[3,4] 0.0580 0.0574 0.0687 0.0696 -0.0545 0.171  1.00    5818.    4571.

Linear regression as MVN

You can always re-write a linear regression as a multivariate normal distribution.

\[ \begin{aligned} \mathbf{y} &\sim MVNormal(\mathbf{\mu}, \mathbf{\Sigma}) \\ \mu_i &\sim \alpha + \beta x_i \\ \mathbf{\Sigma} &= I \sigma^2 \\ \alpha &\sim Normal(0, 1) \\ \beta &\sim Normal(0, 1) \\ \sigma &\sim t-dist(3, 0, 1)^+ \end{aligned} \]

where \(I\) denotes the identity matrix. One reason to formulate the regression like this is because the natural next step is to plug in something other than the identity matrix, e.g. a density kernel as used in a gaussian process.