05/03/2018

Conditionality and Interaction

  • Observations are conditional on how they get into the sample.

  • Posterior distributions and the likelihood are conditional on the data.

  • All model-based inference is conditional on the model.

  • A deeper form of conditionality comes from interaction.

  • Interaction allows one part of your model to be conditional on a further aspect of the data: hierarchical structures are essentially interaction models.

  • The coefficients in \(\beta^TX\) can be conditional on a cluster, e.g. person, coalition, city, galaxy, etc.

Classic Interactions

Larger populations will both develop and sustain more complex technologies. (Observations from plays in one video game.)

culture = c("N","R","I","B","J","F","G","C","U","A")    # culture labels
pop = c(7, 7.3, 8.2, 8.5, 8.9, 9, 9.1, 9.4, 9.8, 12.5)  # population size (x1)
contact = c(0, 0, 0, 1, 1, 1, 1, 0, 1, 0)               # contact indicator (x2)
cvi = factor(c(1,1,1,1,2,2,2,3,3,3))                    # group/category factor
tech = c(13, 22, 24, 43, 33, 19, 40, 28, 55, 71)        # technology level (y)
d = data.frame(culture,pop,contact,tech,cvi)


Classic Interactions

linear.model = lm(tech ~ pop + pop*contact - 1)
summary(linear.model)$coefficients
##               Estimate  Std. Error    t value     Pr(>|t|)
## pop           3.861255   0.7016904  5.5027897 0.0009038633
## contact     -81.852018 137.1312140 -0.5968883 0.5693759487
## pop:contact   9.367445  15.1357481  0.6188954 0.5555825431
  • We are fitting this model \[\begin{align} y_i &= \beta_1 x_{1,i} + \tilde{\beta} x_{2,i} + \varepsilon_i, \mbox{ and } \varepsilon_i \sim \mathcal{N}(0,1), \\ \tilde{\beta} &= \beta_{2} + \beta_{3} x_{1,i} .\end{align}\]

  • \(\tilde{\beta}\) defines the linear interaction between \(x_1\) and \(x_2\): \[y_i = \beta_1 x_{1,i} + \beta_{2} x_{2,i} + \beta_{3} x_{1,i}x_{2,i} + \varepsilon_i.\]

  • Writing the slope as \(\tilde{\beta}\) makes the conditionality explicit: the effect of \(x_2\) depends on the level of \(x_1\).
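
  • A quick check of this expansion, using the data frame d defined earlier (a sketch, not from the original slide):

model.matrix(~ pop + pop*contact - 1, data = d)  # columns: pop, contact, pop:contact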

GLM Interactions

  • \(Y\) is an integer count. If \(Y \sim \mbox{Poi}(\eta)\), then \(\eta\) is the expected technology level, produced by the population \(X_{1}\), the contact \(X_{2}\), and the interaction effect \(X_{1}X_{2}\); on the log link these effects are multiplicative (see the check after the fit below).

  • Consider a GLM model: \[\begin{align} y_i & \sim \mbox{Poi}(\theta_i), \\ \log(\theta_i) = \eta_i &= \beta_1 x_{1,i} + \beta_{2} x_{2,i} + \beta_{3} x_{1,i}x_{2,i}.\end{align}\]

GLM.model = glm(tech~ pop +pop*contact - 1, family=poisson)
summary(GLM.model)$coefficients
##                Estimate Std. Error    z value  Pr(>|z|)
## pop          0.35195648 0.00753518 46.7084387 0.0000000
## contact      0.59524097 1.51087661  0.3939706 0.6936027
## pop:contact -0.01728949 0.16560853 -0.1043997 0.9168521
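
  • Because the link is the log, each coefficient acts multiplicatively on the expected count; a quick check (a sketch, using the glm fit above):

exp(coef(GLM.model))  # multiplicative effect on eta per unit change in each term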

GLM Interactions

library(R2jags)
d.GLM = list(n=length(pop), x1=pop, x2=contact, y=tech)
cat("model
  { # priors
    beta1 ~ dnorm(0,1)  # JAGS: dnorm(mean, precision)
    beta2 ~ dnorm(0,1)
    beta3 ~ dnorm(0,1)
    # likelihood
    for (i in 1:n){
      y[i] ~ dpois(pi[i])
      log(pi[i]) <- beta1*x1[i] + beta2*x2[i] + beta3*x1[i]*x2[i]}
  }", file="model_GLM.bug")

params = c("beta1","beta2","beta3")
GLM.model = jags(data=d.GLM, model.file="model_GLM.bug", 
                    parameters.to.save=params, n.iter=2000, 
                    n.burnin=1000, n.chains=3)
## Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 10
##    Unobserved stochastic nodes: 3
##    Total graph size: 82
## 
## Initializing model

GLM Interactions

plot(GLM.model)

GLM Interactions
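
  • The extraction call is not shown on the slide; one plausible way to obtain these posterior means from R2jags (an assumption, the original call may differ):

t(GLM.model$BUGSoutput$summary[, "mean"])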

##          beta1     beta2     beta3 deviance
## [1,] 0.2695306 -2.690066 0.3256664 286.1846

Random Effects

  • The fixed and random effects: \[ \beta_{0,i} \sim \mathcal{N}(\mu,\sigma^2_i),\] with \(\sigma_i = \sigma\) for all \(i\).

  • The random effect is then simplified to \(\beta_{0,i} \sim \mathcal{N}(\mu,\sigma^2)\). A small modification of the previous code gives the new model:

beta0 ~ dnorm(0,100) # priors
[...]
log(pi[i]) <- beta0 + beta1*x1[i] + beta2*x2[i] + beta3*x1[i]*x2[i]} # Link
## Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 10
##    Unobserved stochastic nodes: 4
##    Total graph size: 85
## 
## Initializing model

Bayesian Method for Interaction

library(R2jags)
d.GLM = list(n=length(pop), x1=pop, x2=contact, y=tech)
cat("model
  { # priors
    beta0 ~ dnorm(0,100)
    beta1 ~ dnorm(0,1)
    beta2 ~ dnorm(0,1)
    beta3 ~ dnorm(0,1)
    # likelihood
    for (i in 1:n){
      y[i] ~ dpois(pi[i])
      log(pi[i]) <- beta0 + beta1*x1[i] + beta2*x2[i] + beta3*x1[i]*x2[i]}
  }", file="model_GLM.bug")

params = c("beta0","beta1","beta2","beta3")
GLM.model = jags(data=d.GLM, model.file="model_GLM.bug", 
                    parameters.to.save=params, n.iter=2000, 
                    n.burnin=1000, n.chains=3)

Bayesian Method for Interaction

##           beta0     beta1      beta2     beta3 deviance
## [1,] 0.03147633 0.3049344 -0.8235241 0.1236619 151.1499

Classic Interactions

linear.model = lm(tech~cvi-1+pop, data=d)
summary(linear.model)$coefficients
##       Estimate Std. Error   t value   Pr(>|t|)
## cvi1 -67.17158  29.617638 -2.267959 0.06384642
## cvi2 -76.95194  34.388985 -2.237692 0.06655802
## cvi3 -75.01889  40.214613 -1.865463 0.11137707
## pop   11.95762   3.765343  3.175706 0.01917975
  • We are fitting this model \[y_i = \alpha_1 z_{1,i} + \alpha_2 z_{2,i} + \alpha_3 z_{3,i} +\beta x_i +\varepsilon_i \mbox{ and } \varepsilon_i \sim \mathcal{N}(0,1),\] where \(z_{k,i}\) indicates that observation \(i\) belongs to group \(k\).

  • The factor codes the group membership of each unit, adding dummy variables alongside \(\beta x\).

Classic Interactions

  • The model can be written as \[y_i = \alpha_{j[i]} + \beta x_i +\varepsilon_i \mbox{ and } \varepsilon_i \sim \mathcal{N}(0,1),\, j[i]=\mbox{individual } i \mbox{ in group }j.\]

  • \(j[10] = 3\) means the \(10\)th observation in the data belongs to group \(3\).

model.matrix(~cvi-1+pop)
##    cvi1 cvi2 cvi3  pop
## 1     1    0    0  7.0
## 2     1    0    0  7.3
## 3     1    0    0  8.2
## 4     1    0    0  8.5
## 5     0    1    0  8.9
## 6     0    1    0  9.0
## 7     0    1    0  9.1
## 8     0    0    1  9.4
## 9     0    0    1  9.8
## 10    0    0    1 12.5
## attr(,"assign")
## [1] 1 1 1 2
## attr(,"contrasts")
## attr(,"contrasts")$cvi
## [1] "contr.treatment"

Classic Interactions

  • Indicator variables as fixed effects. To put \(J\) categories into a regression, choose one of the categories as a baseline and include the other \(J-1\) indicators as dummies.
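
  • For example, with R's default treatment contrasts the first level is the baseline (a quick check, not from the original slide):

model.matrix(~ cvi + pop)  # only cvi2 and cvi3 enter as dummies; cvi1 is the baseline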

Classic Interactions

  • With grouped data, a regression that includes indicators for groups (like the factor(…)) is a varying intercept model \[y_i = \alpha_{j[i]} + \beta x_i +\varepsilon_i, \, j[i]=\mbox{individual } i \mbox{ in group }j.\]

  • Similarly, one can have a varying slope and constant intercept model: \[y_i = \alpha + \beta_{j[i]} x_i +\varepsilon_i.\] The varying slopes are interactions between the continuous explanatory variable \(x\) and the group indicators.

  • Or a varying intercept and varying slope model: \[y_i = \alpha_{j[i]} + \beta_{j[i]} x_i +\varepsilon_i.\] All three variants are sketched in lm() syntax below.
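
  • A minimal sketch of the three specifications (using the data frame d from earlier; the object names are illustrative):

m.vi  = lm(tech ~ cvi - 1 + pop, data = d)  # varying intercept, common slope
m.vs  = lm(tech ~ cvi:pop, data = d)        # common intercept, varying slopes
m.vis = lm(tech ~ cvi*pop - 1, data = d)    # varying intercept and varying slope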

Classic Interactions

  • The model matrix for a varying intercept and varying slope model:
model.matrix(~cvi*pop-1)
##    cvi1 cvi2 cvi3  pop cvi2:pop cvi3:pop
## 1     1    0    0  7.0      0.0      0.0
## 2     1    0    0  7.3      0.0      0.0
## 3     1    0    0  8.2      0.0      0.0
## 4     1    0    0  8.5      0.0      0.0
## 5     0    1    0  8.9      8.9      0.0
## 6     0    1    0  9.0      9.0      0.0
## 7     0    1    0  9.1      9.1      0.0
## 8     0    0    1  9.4      0.0      9.4
## 9     0    0    1  9.8      0.0      9.8
## 10    0    0    1 12.5      0.0     12.5
## attr(,"assign")
## [1] 1 1 1 2 3 3
## attr(,"contrasts")
## attr(,"contrasts")$cvi
## [1] "contr.treatment"

Classic Interactions

  • It is challenging to estimate all the \(\alpha_{j[i]}\) or \(\beta_{j[i]}\), especially when some \(x\) are only available at the group level.

  • Usually, the estimation is divided into two steps.

  • 1st-step: a regression with the varying coefficients.

  • 2nd-step: a regression for the coefficients themselves.

  • The two-step analysis can run into problems when sample sizes are small, or when there are interactions between individual- and group-level \(x\); a minimal sketch of the two-step approach follows.
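
  • A minimal two-step sketch (assuming the data frame d from earlier; the group-level predictor pop.bar is illustrative):

# 1st step: a separate regression within each group, keeping the intercepts
alpha.hat = sapply(split(d, d$cvi),
                   function(g) coef(lm(tech ~ pop, data = g))["(Intercept)"])
# 2nd step: regress the estimated intercepts on a group-level predictor
pop.bar = tapply(d$pop, d$cvi, mean)  # illustrative group-level x
lm(alpha.hat ~ pop.bar)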

Classic Interactions

lin = lm(tech~cvi-1+pop); 
coef(lin)
##      cvi1      cvi2      cvi3       pop 
## -67.17158 -76.95194 -75.01889  11.95762
lin2 = lm(tech~cvi*pop-1)
coef(lin2)
##       cvi1       cvi2       cvi3        pop   cvi2:pop   cvi3:pop 
##  -93.02941 -284.33333  -64.67702   15.29412   19.70588   -4.31522

Hierarchical Structure

  • In the classic regression \(\{\alpha_{1},\alpha_{2},\alpha_{3}\}\) are completely independent. (Otherwise there is a collinearity problem.)

  • Consider \(\alpha_{k}\) as a sample from a larger collection of effects, most of which remain hidden from the observed data.

  • Hierarchical models describe a coupled set of ordinary models that are only "conditionally" independent of each other.

  • It is unnecessary to pick one category as a baseline, since all the categories are interdependent at the group level (the latent layer).

  • The varying coefficients \(\alpha_{j[i]}\) or \(\beta_{j[i]}\) with hierarchical structure are the random effects, a term that refers to the randomness in the probability model for the group-level coefficients.

Random Effects

  • Hierarchical varying intercept: \[y_i \sim \mathcal{N} (\alpha_{j[i]} + \beta x_i, \, \sigma_{y}^{2}),\quad \alpha_j \sim \mathcal{N}(\mu_\alpha , \sigma^{2}_\alpha) \mbox{ for }j=1,\dots,J.\]

  • Hierarchical modeling is a regression that includes a categorical input variable representing group membership, where the \(J\) coefficients are themselves given by another group-level model.

  • If \(\sigma_\alpha \rightarrow 0\), the \(\alpha_j\) converge to a constant and there is no difference between random and fixed effects.

  • The group-level model is estimated simultaneously with the data-level regression of \(y\).

Random Effects (Intercept)

library(lme4); lmm=lmer(tech ~ pop + (1|cvi)); lmm
## Linear mixed model fit by REML ['lmerMod']
## Formula: tech ~ pop + (1 | cvi)
## REML criterion at convergence: 64.2238
## Random effects:
##  Groups   Name        Std.Dev.
##  cvi      (Intercept) 0.000   
##  Residual             9.596   
## Number of obs: 10, groups:  cvi, 3
## Fixed Effects:
## (Intercept)          pop  
##      -55.90        10.11

Random Effects (Intercept)

fixef(lmm)
## (Intercept)         pop 
##   -55.90284    10.11180
ranef(lmm)  # all zero: the group SD is estimated as 0, so the intercepts pool completely
## $cvi
##   (Intercept)
## 1           0
## 2           0
## 3           0

Fixed and Random Effects

  • A different way to write the model \[\begin{align} \beta^T\mathbf{x}_i &= \beta_{0,j[i]} + \beta_1 x_{1,i} \\ \beta_{0,j} &= \mu + \mu_j + \sigma_j \upsilon \ \mbox{ and } \upsilon \sim \mathcal{N}(0,1).\end{align}\]

  • \(\mu\) defines the fixed effect.

  • \(\mu_j,\, \sigma_j \upsilon\) define the random effect.

Random Effects (Intercept and Slope)

library(lme4); lmm2=lmer(tech ~ (1 + pop|cvi))
ranef(lmm2)
## $cvi
##   (Intercept)        pop
## 1  -70.858257  8.2464027
## 2    4.738717 -0.5514865
## 3  -82.280248  9.5756810
coef(lmm2) # adjusted by the fixed effects
## $cvi
##          pop (Intercept)
## 1  8.2464027   -38.88224
## 2 -0.5514865    36.71474
## 3  9.5756810   -50.30423
## 
## attr(,"class")
## [1] "coef.mer"

Fixed and Random Effects

  • A different way to write the model \[\begin{align} \beta^T\mathbf{x}_i &= \beta_{0,j[i]} + \beta_{1,j[i]} x_{1,i} \\ \left(\begin{array}{c} \beta_{0,j}\\ \beta_{1,j} \end{array}\right) &\sim\mathcal{N}\left(\left(\begin{array}{c} \mu_{0}\\ \mu_{1} \end{array}\right),\left(\begin{array}{cc} \sigma_{1}^{2} & \rho\sigma_{1}\sigma_{2}\\ \rho\sigma_{1}\sigma_{2} & \sigma_{2}^{2} \end{array}\right)\right) .\end{align}\]

  • \(\rho\) defines the correlation between two random effects caused by \(\beta_{0,j}\) and \(\beta_{1,j}\).

  • A compact representation \[y_i \sim \mathcal{N}(\beta_{j[i]}^T\mathbf{x}_i, \sigma^2_{y}),\, \beta_j \sim \mathcal{N}(u_j \mu, \Sigma_{\beta}),\] where \(\mathbf{x}_i\) is a \(K\)-vector, \(\mathbf{B}=(\beta_1,\dots,\beta_J)^{T}\) is a \(J\times K\) matrix, \(\mathbf{u}\) is a \(J\)-vector, \(\mu\) is a \(K\)-vector, and \(\Sigma_{\beta}\) is a \(K\times K\) matrix.

Hierarchical Structure and Random Effects

GLM with Random Effect

  • Consider a GLM model: \[\begin{align} y_i & \sim \mbox{Poi}(\theta_i), \\ \log (\theta_i) = \eta_i &= \beta_{0,j[i]} + \beta_1 x_{1,i} + \beta_{2} x_{2,i} + \beta_{3} x_{1,i}x_{2,i}, \\ \beta_{0,j} & \sim \mathcal{N}(\mu_j,\sigma^{2}_j) \end{align}\]
GLM.model.ran = glmer(tech~ pop +pop*contact + (1|cvi) - 1, family=poisson)
coef(GLM.model.ran)  # only the intercept varies by group; the other effects are common
## $cvi
##   (Intercept)       pop  contact pop:contact
## 1  0.29522858 0.3470813 -0.55242   0.1181225
## 2 -0.17239744 0.3470813 -0.55242   0.1181225
## 3 -0.02065854 0.3470813 -0.55242   0.1181225
## 
## attr(,"class")
## [1] "coef.mer"

Bayesian Random Effect (Intercept)

d.ranef = list(n=length(pop), J=length(levels(cvi)), x=pop, cvi=as.numeric(cvi), y=tech)
cat("model
  { # priors
    tau.y <- pow(sigma.y, -2)
    sigma.y ~ dunif(0,10)
    for (j in 1:J){
      beta0[j] ~ dnorm(mu.0, tau.0)  
    }
    mu.0 ~ dnorm(0, 0.01)
    tau.0 <- pow(sigma.0, -2)
    sigma.0 ~ dunif(0,10)
    beta1 ~ dnorm(0,0.01)
    # likelihood
    for (i in 1:n){
      y[i] ~ dnorm(mu.y[i], tau.y)
      mu.y[i] <- beta0[cvi[i]] + beta1*x[i]}
  }", file="model_BayesRE.bug")
## Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 10
##    Unobserved stochastic nodes: 7
##    Total graph size: 72
## 
## Initializing model

Bayesian Random Effect (Intercept)

## Inference for Bugs model at "model_BayesRE.bug", fit using jags,
##  3 chains, each with 2000 iterations (first 1000 discarded)
##  n.sims = 3000 iterations saved
##          mu.vect sd.vect   2.5%    25%    50%   75% 97.5% Rhat n.eff
## beta0[1]  -12.34    8.78 -29.18 -18.25 -12.25 -6.64  4.82 1.02   100
## beta0[2]  -12.63    9.43 -31.51 -18.85 -12.52 -6.76  7.75 1.02   110
## beta0[3]   -7.47   10.32 -27.21 -13.95  -7.76 -0.87 13.55 1.02   110
## beta1       5.20    1.03   3.17   4.55   5.20  5.90  7.26 1.02   120
## mu.0       -9.93    8.65 -26.36 -15.38  -9.95 -4.35  8.11 1.03    94
## sigma.0     4.86    2.85   0.30   2.41   4.84  7.26  9.74 1.01   470
## sigma.y     8.92    0.82   7.03   8.41   9.09  9.58  9.96 1.00  1500
## deviance   77.86    2.86  73.52  75.82  77.41 79.46 84.50 1.01   260
## 
## For each parameter, n.eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor (at convergence, Rhat=1).
## 
## DIC info (using the rule, pD = var(deviance)/2)
## pD = 4.1 and DIC = 81.9
## DIC is an estimate of expected predictive error (lower deviance is better).

Bayesian Random Effect (Intercept and Slope)

[...]
    tau.y <- pow(sigma.y, -2)
    sigma.y ~ dunif(0,10)
    for (j in 1:J){
      beta0[j] ~ dnorm(mu.0[j], tau.0) 
      beta1[j] ~ dnorm(mu.1[j], tau.1)
      mu.0[j] <- mu0
      mu.1[j] <- mu1
    }
    mu0 ~ dnorm(0, 0.01)
    mu1 ~ dnorm(0, 0.01)
    tau.0 <- pow(sigma.0,-2)
    tau.1 <- pow(sigma.1,-2)
    sigma.0 ~ dunif(0,10)
    sigma.1 ~ dunif(0,10)

Bayesian Random Effect (Intercept and Slope)

    # likelihood
    for (i in 1:n){
      y[i] ~ dnorm(mu.y[i], tau.y)
      mu.y[i] <- beta0[cvi[i]] + beta1[cvi[i]]*x[i]}
  }", file="model_BayesRE.bug")
  params = c("beta0","beta1","mu0","mu1", "sigma.0","sigma.1","sigma.y")
## Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 10
##    Unobserved stochastic nodes: 11
##    Total graph size: 80
## 
## Initializing model

Intercept and Slope

## Inference for Bugs model at "model_BayesRE.bug", fit using jags,
##  3 chains, each with 2000 iterations (first 1000 discarded)
##  n.sims = 3000 iterations saved
##          mu.vect sd.vect   2.5%    25%   50%   75% 97.5% Rhat n.eff
## beta0[1]   -8.67   10.03 -29.29 -15.15 -8.36 -1.85 10.67 1.02   140
## beta0[2]   -8.30   10.40 -28.53 -15.37 -8.10 -1.02 11.47 1.01   160
## beta0[3]   -6.85   10.27 -26.34 -14.02 -6.79  0.15 13.80 1.01   190
## beta1[1]    4.57    1.33   1.99   3.68  4.60  5.42  7.17 1.01   160
## beta1[2]    4.45    1.23   2.10   3.59  4.49  5.23  6.87 1.01   160
## beta1[3]    5.45    1.08   3.30   4.71  5.40  6.22  7.58 1.01   150
## mu0        -7.21    9.10 -24.34 -13.48 -7.14 -0.73 10.45 1.01   140
## mu1         4.72    1.73   1.16   3.81  4.76  5.67  8.00 1.01   220
## sigma.0     4.64    2.83   0.32   2.11  4.44  6.96  9.67 1.01   600
## sigma.1     1.80    1.73   0.09   0.62  1.24  2.40  6.85 1.02   300
## sigma.y     8.91    0.83   6.89   8.43  9.09  9.57  9.96 1.01   350
## deviance   77.39    2.84  73.30  75.38 76.91 78.88 84.11 1.00   510
## 
## For each parameter, n.eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor (at convergence, Rhat=1).
## 
## DIC info (using the rule, pD = var(deviance)/2)
## pD = 4.0 and DIC = 81.4
## DIC is an estimate of expected predictive error (lower deviance is better).

Bayesian Random Effect (Correlation \(\rho\))

[...]
    for (j in 1:J){
      beta0[j] <- B[j,1]
      beta1[j] <- B[j,2]
      B[j,1:2] ~ dmnorm (B.hat[j,] , Tau.B[,])
      B.hat[j,1] <- mu0
      B.hat[j,2] <- mu1}
    mu0 ~ dnorm(0, 0.01)
    mu1 ~ dnorm(0, 0.01)    
    Tau.B[1:2,1:2] <- inverse(Sigma.B[,])
    Sigma.B[1,1] <- pow(sigma.0, 2)
    Sigma.B[2,2] <- pow(sigma.1, 2)
    sigma.0 ~ dunif(0,10)
    sigma.1 ~ dunif(0,10)
    Sigma.B[1,2] <- rho*sigma.0*sigma.1
    Sigma.B[2,1] <- Sigma.B[1,2]
    rho ~ dunif(-1,1)
[...]
params = c("beta0","beta1","mu0","mu1", "sigma.0","sigma.1","sigma.y", "rho")
## Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 10
##    Unobserved stochastic nodes: 9
##    Total graph size: 93
## 
## Initializing model

Bayesian Random Effect (Correlation \(\rho\))

## Inference for Bugs model at "model_BayesRE.bug", fit using jags,
##  3 chains, each with 2000 iterations (first 1000 discarded)
##  n.sims = 3000 iterations saved
##          mu.vect sd.vect   2.5%    25%   50%   75% 97.5% Rhat n.eff
## beta0[1]   -8.52    9.41 -28.12 -14.33 -8.54 -2.40  9.85 1.01   560
## beta0[2]   -8.09    9.86 -28.18 -14.09 -8.00 -1.86 10.91 1.01   430
## beta0[3]   -7.35    9.70 -27.40 -13.50 -7.36 -1.08 11.56 1.01  1600
## beta1[1]    4.54    1.26   2.04   3.72  4.56  5.33  7.12 1.01   480
## beta1[2]    4.41    1.19   2.19   3.63  4.40  5.19  6.76 1.01   630
## beta1[3]    5.46    1.05   3.41   4.75  5.49  6.13  7.56 1.00  2600
## mu0        -7.16    8.38 -24.30 -13.09 -7.19 -1.50  9.16 1.01   690
## mu1         4.72    1.79   0.96   3.83  4.73  5.65  8.35 1.02   500
## rho        -0.12    0.57  -0.97  -0.61 -0.18  0.33  0.95 1.00   910
## sigma.0     4.93    2.76   0.52   2.49  4.78  7.25  9.69 1.03   140
## sigma.1     2.06    1.90   0.11   0.73  1.42  2.77  7.23 1.01   300
## sigma.y     8.90    0.84   6.89   8.40  9.06  9.58  9.96 1.00  3000
## deviance   77.43    2.94  73.48  75.34 76.87 78.90 84.66 1.01   390
## 
## For each parameter, n.eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor (at convergence, Rhat=1).
## 
## DIC info (using the rule, pD = var(deviance)/2)
## pD = 4.3 and DIC = 81.7
## DIC is an estimate of expected predictive error (lower deviance is better).

Random and Fixed Effect in Panels

  • \(Y_{ij}\) is the technology level of country \(i\) in category \(j\).

  • Random effects: \[Y_{ij} = \mu + \beta_{0,j} + I_{ij},\] where \(\mu\) is a constant, \(\beta_{0,j}\) are the random effects from the categories, and \(I_{ij}\) are the individual random effects.

  • Fixed effects from \(X_{1,ij}, X_{2,ij}\): \[Y_{ij}= \mu + \beta_{0,j} +\beta_{1}X_{1,ij}+\beta_{2}X_{2,ij} +I_{ij},\] where \(\beta_1\) and \(\beta_2\) capture differences amongst all individual countries. The model has mixed effects; a hedged lme4 sketch follows.
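
  • A minimal lme4 sketch of this mixed-effects specification, assuming a hypothetical panel data frame panel with columns y, x1, x2, category, and country:

library(lme4)
# fixed slopes beta1, beta2; random intercepts for category (beta0_j) and country (I_ij)
mix = lmer(y ~ x1 + x2 + (1 | category) + (1 | country), data = panel)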

Bayesian Method with Multivariate Priors

  • When more than two coefficients vary (e.g. one intercept and two slopes), it is difficult to model the group-level correlations directly.

  • Scaled inverse-Wishart model: use a full matrix notation to allow for an arbitrary number of coefficients that vary by group: \[y_i \sim \mathcal{N}(\beta_{j[i]}^T\mathbf{x}_i, \sigma^2_{y}),\, \beta_j \sim \mathcal{N}(u_j \mu, \Sigma_{\beta}).\]

  • \(\Sigma_\beta \sim \mbox{inv-Wishart}_{K+1} (\mathbf{S})\), where \(K\) is the number of coefficients that vary by group and \(\mathbf{S}\) is the scale matrix; if \(\mathbf{S}=\mathbf{I}\), the \(K\times K\) identity matrix, the Wishart distribution is an unscaled one.

Multivariate Priors

  • The unscaled Wishart: the model implies a uniform distribution on the correlation parameters.

  • It is common to decompose \(\Sigma_\beta\): \[\Sigma_\beta = \mbox{Diag}(\xi) Q \mbox{Diag}(\xi),\, Q\sim \mbox{inv Wishart}_{K+1} (\mathbf{I})\] where \(\xi = (\xi_1 \dots, \xi_K)\) is a vector of scale parameters.

  • So \(\sigma_k^2 =\Sigma_{kk} =\xi_k^{2}Q_{kk}\), \(\Sigma_{kl} = \xi_k \xi_l Q_{kl}\), \[ \rho_{kl} = \frac{\Sigma_{kl}}{\sigma_k \sigma_l}.\]
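
  • A quick numeric check of this decomposition, with hypothetical values for \(\xi\) and \(Q\):

xi = c(2, 0.5); Q = matrix(c(1, 0.3, 0.3, 1), 2, 2)  # illustrative values
Sigma = diag(xi) %*% Q %*% diag(xi)                  # Sigma = Diag(xi) Q Diag(xi)
Sigma[1,2] / sqrt(Sigma[1,1] * Sigma[2,2])           # rho_12 = Q_12 = 0.3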

Multivariate Priors

mul.d.ranef = list(n=length(pop), J=length(levels(cvi)), 
                   X=as.matrix(data.frame(rep(1,10),pop,contact)), 
                   K=3, df=4, cvi=as.numeric(cvi), y=tech, W = diag(3))
cat("model
{ # priors
  tau.y <- pow(sigma.y, -2)
  sigma.y ~ dunif(0,10)
  for (j in 1:J){
      for (k in 1:K){
        B[j,k] <- xi[k]*B.raw[j,k]
      }
      B.raw[j,1:K] ~ dmnorm(mu.raw[],Tau.B.raw[,])
  }

Multivariate Priors

  for (k in 1:K){
      mu[k] <- xi[k]*mu.raw[k]
      mu.raw[k] ~ dnorm(0,0.01)
      xi[k] ~ dunif(0,10)
  }
  Tau.B.raw[1:K,1:K] ~ dwish(W[,], df)
  Sigma.B.raw[1:K,1:K] <- inverse(Tau.B.raw[,])
  for (k in 1:K){
      for (k.prime in 1:K){
          rho.B[k,k.prime] <- Sigma.B.raw[k,k.prime]/
            sqrt(Sigma.B.raw[k,k]*Sigma.B.raw[k.prime,k.prime]) 
      }
      sigma.B[k] <- abs(xi[k])*sqrt(Sigma.B.raw[k,k])
  }

Multivariate Priors

      # likelihood
      for (i in 1:n){
        y[i] ~ dnorm(mu.y[i], tau.y)
        mu.y[i] <- inprod(B[cvi[i],], X[i,])
      }
}", file="model_Mul.BayesRE.bug")
params = c("B","mu.y","mu", "rho.B","sigma.B","sigma.y")
Mul.BayesRE.model = jags(data=mul.d.ranef, model.file="model_Mul.BayesRE.bug", 
                    parameters.to.save=params, n.iter=3000, 
                    n.burnin=2000, n.chains=3)
## Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 10
##    Unobserved stochastic nodes: 11
##    Total graph size: 183
## 
## Initializing model

Bayesian Method with Multivariate Priors