This example goes through the basics of using JAGS (https://sourceforge.net/projects/mcmc-jags/files/JAGS/3.x/) by way of the rjags package to estimate simple linear and generalized linear models. You must install both the JAGS program and the rjags package for this to work.
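If you do not have them yet, JAGS itself is installed from the link above, and the R side comes from CRAN; a minimal sketch:

#rjags (and its dependency coda) come from CRAN;
#the JAGS program must be installed separately from the link above
install.packages("rjags")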

We will use the BRFSS data for the state of Texas for our example, with BMI as a continuous outcome and obesity status (BMI >= 30) as a dichotomous outcome.

First we load our data and recode some variables:

library(rjags)
## Loading required package: coda
## Loading required package: lattice
## Linked to JAGS 3.3.0
## Loaded modules: basemod,bugs
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(car)
load("~/Google Drive/dem7903_App_Hier/data/brfss_11.Rdata")
nams<-names(brfss_11)
newnames<-gsub("_", "", nams)
names(brfss_11)<-tolower(newnames)
#construct the full county FIPS code from the state and county codes
brfss_11$statefip<-sprintf("%02d", brfss_11$state )
brfss_11$cofip<-sprintf("%03d", brfss_11$cnty )
brfss_11$cofips<-paste(brfss_11$statefip, brfss_11$cofip, sep="")
#obesity indicator: BMI >= 30 (bmi5 stores BMI*100)
brfss_11$obese<-ifelse(brfss_11$bmi5/100 >=30, 1,0)
#race/ethnicity dummy variables
brfss_11$black<-recode(brfss_11$racegr2, recodes="2=1; 9=NA; else=0", as.factor.result=F)
brfss_11$white<-recode(brfss_11$racegr2, recodes="1=1; 9=NA; else=0", as.factor.result=F)
brfss_11$other<-recode(brfss_11$racegr2, recodes="3:4=1; 9=NA; else=0", as.factor.result=F)
brfss_11$hispanic<-recode(brfss_11$racegr2, recodes="5=1; 9=NA; else=0", as.factor.result=F)
#education level
brfss_11$lths<-recode(brfss_11$educa, recodes="1:3=1;9=NA; else=0", as.factor.result=F)
brfss_11$coll<-recode(brfss_11$educa, recodes="5:6=1;9=NA; else=0", as.factor.result=F)
#standardized (z-scored) age
brfss_11$agez<-scale(brfss_11$age, center=T, scale=T)

Next, I use the filter() function from the dplyr library to select the observations from Texas that have complete data on our analysis variables.

brf<-tbl_df(brfss_11)
tx<-as.data.frame(filter(brf, state=="48", is.na(obese)==F, is.na(black)==F, is.na(lths)==F))
#assign each county a sequential index 1..k; this works because the rows are grouped by county
nwncos<-table(tx$cofips)
tx$conum<-rep(1:length(unique(tx$cofips)), nwncos[nwncos!=0])
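As a quick sanity check on that index construction, each county FIPS code should map to exactly one index value; a small sketch:

#each cofips value should pair with exactly one conum value
head(unique(tx[, c("cofips", "conum")]))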

Linear Regression Example

Here is a simple linear regression model for BMI using a single predictor variable. There are lots of ways to do this, but I like doing it this way: I write my model code as a big string, then feed it to JAGS.

#Here is a simple linear regression model for bmi with 1 predictor
model1<-"
model{

#Likelihood
  for( i in 1:n)
    {
      bmi[i]~dnorm(mu[i], tau)
      mu[i]<-b0+b1*black[i]
    }

#priors
b0~dnorm(0,.01)
b1~dnorm(0,.01)
#JAGS parameterizes the normal by its precision, so tau = 1/sd^2
tau<-pow(sd, -2)
sd~dunif(0,100)
#Bayesian "p-values" for the regression coefficient using the step() function
#step() is an indicator function that evaluates to 1 if its argument is >= 0, 0 otherwise
#so p1 is the posterior probability that b1 exceeds 1, and p2 is its complement
p1<-step(b1-1)
p2<-1-p1

}
"

Next, we have to make a data list for JAGS, which contains everything we are passing to JAGS as data. We include variables for later models too; JAGS will warn about any that a given model does not use, which is harmless.

dat<-list(bmi=tx$bmi5/100, obese=tx$obese, black=tx$black, hisp=tx$hispanic, lths=tx$lths, coll=tx$coll, age=tx$agez, n=length(tx$obese),cofips=tx$conum, ncos=length(unique(tx$cofips)))

#quick summary
lapply(dat, summary)
## $bmi
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    12.6    23.4    26.6    27.6    30.5    82.3 
## 
## $obese
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   0.273   1.000   1.000 
## 
## $black
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.0915  0.0000  1.0000 
## 
## $hisp
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   0.171   0.000   1.000 
## 
## $lths
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     0.0     0.0     0.1     0.0     1.0 
## 
## $coll
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.000   0.681   1.000   1.000 
## 
## $age
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -2.620  -0.562   0.162   0.113   0.830   2.500 
## 
## $n
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    6990    6990    6990    6990    6990    6990 
## 
## $cofips
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    3.00    4.00    4.76    7.00    9.00 
## 
## $ncos
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       9       9       9       9       9       9

To use JAGS, we have to create a jags.model object, which combines the text representation of our model, our data, and some other parameters controlling the MCMC run.

mod<-jags.model(file=textConnection(model1), data=dat, n.chains=2, n.adapt=1000)
## Warning: Unused variable "obese" in data
## Warning: Unused variable "hisp" in data
## Warning: Unused variable "lths" in data
## Warning: Unused variable "coll" in data
## Warning: Unused variable "age" in data
## Warning: Unused variable "cofips" in data
## Warning: Unused variable "ncos" in data
## Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
##    Graph Size: 14000
## 
## Initializing model
#next, we update the model; this is the "burn-in" period
update(mod, 1000)
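As an aside, jags.model() generates initial values and RNG seeds itself by default; for exact reproducibility we could supply them explicitly, one list per chain, with names matching the model's stochastic nodes. A sketch (which would replace the jags.model() call above):

#explicit initial values and RNG seeds, one list per chain
inits<-list(list(b0=0, b1=0, sd=1, .RNG.name="base::Mersenne-Twister", .RNG.seed=11),
            list(b0=1, b1=1, sd=5, .RNG.name="base::Mersenne-Twister", .RNG.seed=22))
#mod<-jags.model(file=textConnection(model1), data=dat, inits=inits, n.chains=2, n.adapt=1000)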

If we only want point summaries (posterior means) of our parameters, we can use jags.samples().

jags.samples(model= mod,variable.names=c("b0", "b1", "p1", "p2", "sd"), n.iter=1000 )
## $b0
## mcarray:
## [1] 27.31
## 
## Marginalizing over: iteration(1000),chain(2) 
## 
## $b1
## mcarray:
## [1] 2.795
## 
## Marginalizing over: iteration(1000),chain(2) 
## 
## $p1
## mcarray:
## [1] 1
## 
## Marginalizing over: iteration(1000),chain(2) 
## 
## $p2
## mcarray:
## [1] 0
## 
## Marginalizing over: iteration(1000),chain(2) 
## 
## $sd
## mcarray:
## [1] 5.987
## 
## Marginalizing over: iteration(1000),chain(2)

We can check how we did in comparison to a model fit via least squares in lm():

summary(lm(dat$bmi~dat$black))
## 
## Call:
## lm(formula = dat$bmi ~ dat$black)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -14.76  -4.06  -0.92   2.95  54.97 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  27.3124     0.0751   363.5   <2e-16 ***
## dat$black     2.7969     0.2483    11.3   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.99 on 6989 degrees of freedom
## Multiple R-squared:  0.0178, Adjusted R-squared:  0.0177 
## F-statistic:  127 on 1 and 6989 DF,  p-value: <2e-16

The parameters all have extremely similar estimates, and the Bayesian and frequentist “p-values” are also in agreement.

Next, we examine a few other elements of the model, including the posterior densities of the parameters. First, we must collect samples of each parameter using the coda.samples() function.

#collect 1000 samples of the betas and the residual sd
samps<-coda.samples(mod, variable.names=c("b0", "b1", "sd"), n.iter=1000)

#Numerical summary of each parameter:
summary(samps)
## 
## Iterations = 3001:4000
## Thinning interval = 1 
## Number of chains = 2 
## Sample size per chain = 1000 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##     Mean     SD Naive SE Time-series SE
## b0 27.31 0.0765  0.00171        0.00191
## b1  2.79 0.2443  0.00546        0.00608
## sd  5.99 0.0525  0.00117        0.00153
## 
## 2. Quantiles for each variable:
## 
##     2.5%   25%   50%   75% 97.5%
## b0 27.16 27.26 27.31 27.36 27.46
## b1  2.30  2.63  2.79  2.94  3.26
## sd  5.89  5.95  5.99  6.02  6.09
#Plot a density of each parameter:
densityplot(samps)

[Figure: posterior density plots for b0, b1, and sd]

#traceplot of the markov chains:
traceplot(samps)

[Figure: traceplots of the Markov chains for b0, b1, and sd]

#autocorrelation plot of each parameter:
autocorr.plot(samps)

[Figure: autocorrelation plots for b0, b1, and sd, one panel per chain]

autocorr.diag(samps)
##               b0        b1        sd
## Lag 0   1.000000  1.000000  1.000000
## Lag 1   0.112180  0.083442  0.267975
## Lag 5  -0.016596  0.002981 -0.030055
## Lag 10 -0.027657  0.018931 -0.009944
## Lag 50  0.001773 -0.013727  0.004047
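#with autocorrelation this low, the effective sample size should be close to
#the nominal 2,000 draws; coda's effectiveSize() reports it
effectiveSize(samps)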
#Examine convergence of the Markov chains using the Brooks-Gelman-Rubin diagnostic
gelman.plot(samps)

[Figure: Gelman-Rubin shrink factor plots for b0, b1, and sd]

gelman.diag(samps)
## Potential scale reduction factors:
## 
##    Point est. Upper C.I.
## b0          1       1.00
## b1          1       1.00
## sd          1       1.01
## 
## Multivariate psrf
## 
## 1
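We can also reproduce the step()-based posterior probabilities in the model directly from the coda samples. A minimal sketch, pooling the draws from both chains (b1.draws is a name introduced here for illustration):

#pool the posterior draws of b1 from both chains
b1.draws<-as.matrix(samps)[,"b1"]
mean(b1.draws>1)   #Pr(b1 > 1), analogous to p1 in the model
mean(b1.draws<=1)  #its complement, analogous to p2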

Logistic Regression Example

Next, we consider a similar simple logistic regression model:

model2<-"
model{

#Likelihood
  for( i in 1:n)
    {
      obese[i]~dbern(p[i])
      logit(p[i])<-b0+b1*black[i]
    }

#priors
b0~dnorm(0,.01)
b1~dnorm(0,.01)
#posterior probability that b1 exceeds 1, and its complement
p1<-step(b1-1)
p2<-1-step(b1-1)

}
"

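#load the glm module, which provides more efficient samplers for generalized linear models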
load.module("glm")
## module glm loaded
mod2<-jags.model(file=textConnection(model2), data=dat, n.chains=2, n.adapt=1000)
## Warning: Unused variable "bmi" in data
## Warning: Unused variable "hisp" in data
## Warning: Unused variable "lths" in data
## Warning: Unused variable "coll" in data
## Warning: Unused variable "age" in data
## Warning: Unused variable "cofips" in data
## Warning: Unused variable "ncos" in data
## Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
##    Graph Size: 13997
## 
## Initializing model
update(mod2, 1000)
jags.samples(mod2,variable.names=c("b0", "b1", "p1", "p2"), n.iter=1000 )
## $b0
## mcarray:
## [1] -1.055
## 
## Marginalizing over: iteration(1000),chain(2) 
## 
## $b1
## mcarray:
## [1] 0.7516
## 
## Marginalizing over: iteration(1000),chain(2) 
## 
## $p1
## mcarray:
## [1] 0.002
## 
## Marginalizing over: iteration(1000),chain(2) 
## 
## $p2
## mcarray:
## [1] 0.998
## 
## Marginalizing over: iteration(1000),chain(2)

Compare this to a logistic regression fit by maximum likelihood with glm():

summary(glm(dat$obese~dat$black, family=binomial))
## 
## Call:
## glm(formula = dat$obese ~ dat$black, family = binomial)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.052  -0.773  -0.773   1.308   1.645  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.0552     0.0287  -36.80   <2e-16 ***
## dat$black     0.7529     0.0849    8.86   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 8203.3  on 6990  degrees of freedom
## Residual deviance: 8128.1  on 6989  degrees of freedom
## AIC: 8132
## 
## Number of Fisher Scoring iterations: 4

Again we see close correspondence. Now we can look at the posterior densities and other aspects of this model:

#collect 1000 samples of the betas
samps2<-coda.samples(mod2, variable.names=c("b0", "b1"), n.iter=1000)

#Numerical summary of each parameter:
summary(samps2)
## 
## Iterations = 2001:3000
## Thinning interval = 1 
## Number of chains = 2 
## Sample size per chain = 1000 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##      Mean     SD Naive SE Time-series SE
## b0 -1.053 0.0277 0.000619       0.000779
## b1  0.749 0.0838 0.001873       0.002017
## 
## 2. Quantiles for each variable:
## 
##      2.5%    25%    50%    75%  97.5%
## b0 -1.109 -1.072 -1.052 -1.034 -0.999
## b1  0.585  0.693  0.749  0.805  0.915
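#a derived summary: the posterior of the odds ratio exp(b1) for black vs. non-black respondents
quantile(exp(as.matrix(samps2)[,"b1"]), probs=c(.025,.5,.975))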
#Plot a density of each parameter:
densityplot(samps2)

[Figure: posterior density plots for b0 and b1]

#traceplot of the markov chains:
traceplot(samps2)

[Figure: traceplots of the Markov chains for b0 and b1]

#autocorrelation plot of each parameter:
autocorr.plot(samps2)

[Figure: autocorrelation plots for b0 and b1, one panel per chain]

autocorr.diag(samps2)
##               b0        b1
## Lag 0   1.000000  1.000000
## Lag 1   0.225732  0.138546
## Lag 5   0.042471 -0.021416
## Lag 10  0.009694 -0.015765
## Lag 50 -0.011985  0.001446
#Examine convergence of the Markov chains using the Brooks-Gelman-Rubin diagnostic
gelman.plot(samps2)

[Figure: Gelman-Rubin shrink factor plots for b0 and b1]

gelman.diag(samps2)
## Potential scale reduction factors:
## 
##    Point est. Upper C.I.
## b0          1       1.01
## b1          1       1.00
## 
## Multivariate psrf
## 
## 1
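Since the data list already carries the remaining covariates (hisp, lths, coll, age), extending the logistic model is just a matter of adding linear-predictor terms and priors. A sketch, not run here, with coefficient names b2 through b5 introduced for illustration (note that scale() returns a one-column matrix, so age should be passed to the data list as as.numeric(tx$agez)):

model3<-"
model{
  for(i in 1:n){
    obese[i]~dbern(p[i])
    logit(p[i])<-b0+b1*black[i]+b2*hisp[i]+b3*lths[i]+b4*coll[i]+b5*age[i]
  }
#priors
  b0~dnorm(0,.01)
  b1~dnorm(0,.01)
  b2~dnorm(0,.01)
  b3~dnorm(0,.01)
  b4~dnorm(0,.01)
  b5~dnorm(0,.01)
}
"
#mod3<-jags.model(file=textConnection(model3), data=dat, n.chains=2, n.adapt=1000)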