This example will go through the basics of using JAGS (https://sourceforge.net/projects/mcmc-jags/files/JAGS/3.x/) by way of the rjags library, for estimation of simple linear and generalized linear models. You must install both JAGS and rjags for this to work.
We will use the BRFSS data for the state of Texas for our example, with BMI as a continuous outcome and obesity status (BMI >= 30) as a dichotomous outcome.
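Both only need to be installed once: JAGS itself comes from the link above, and rjags from CRAN. A quick sketch (not run here):
#install the R interface to JAGS; the JAGS program itself is installed separately
install.packages("rjags")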
First we load our data and recode some variables:
library(rjags)
## Loading required package: coda
## Loading required package: lattice
## Linked to JAGS 3.3.0
## Loaded modules: basemod,bugs
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(car)
load("~/Google Drive/dem7903_App_Hier/data/brfss_11.Rdata")
nams<-names(brfss_11)
newnames<-gsub("_", "", nams)
names(brfss_11)<-tolower(newnames)
brfss_11$statefip<-sprintf("%02d", brfss_11$state )
brfss_11$cofip<-sprintf("%03d", brfss_11$cnty )
brfss_11$cofips<-paste(brfss_11$statefip, brfss_11$cofip, sep="")
brfss_11$obese<-ifelse(brfss_11$bmi5/100 >=30, 1,0)
brfss_11$black<-recode(brfss_11$racegr2, recodes="2=1; 9=NA; else=0", as.factor.result=F)
brfss_11$white<-recode(brfss_11$racegr2, recodes="1=1; 9=NA; else=0", as.factor.result=F)
brfss_11$other<-recode(brfss_11$racegr2, recodes="3:4=1; 9=NA; else=0", as.factor.result=F)
brfss_11$hispanic<-recode(brfss_11$racegr2, recodes="5=1; 9=NA; else=0", as.factor.result=F)
#education level
brfss_11$lths<-recode(brfss_11$educa, recodes="1:3=1;9=NA; else=0", as.factor.result=F)
brfss_11$coll<-recode(brfss_11$educa, recodes="5:6=1;9=NA; else=0", as.factor.result=F)
brfss_11$agez<-scale(brfss_11$age, center=T, scale=T)
Next, I use the filter function from the dplyr library to select the observations from Texas that have non-missing values for the outcome and predictors.
brf<-tbl_df(brfss_11)
tx<-as.data.frame(filter(brf, state=="48", is.na(obese)==F, is.na(black)==F, is.na(lths)==F))
nwncos<-table(tx$cofips)
tx$conum<-rep(1:length(unique(tx$cofips)), nwncos[nwncos!=0])
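The last two lines build a numeric county index (conum) by counting rows within each county. An equivalent way to get the same index, which does not assume the rows are already grouped by cofips, is to use the factor codes; a sketch:
#county index from factor codes; matches conum above when the data are grouped by cofips
tx$conum<-as.numeric(factor(tx$cofips))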
Here is a simple linear regression model for BMI using a single predictor variable. There are loads of ways to do this, but I like doing it this way: I write my model code as a big string, then feed it to JAGS.
#Here is a simple linear regression model for bmi with 1 predictor
model1<-"
model{
  #Likelihood
  for(i in 1:n)
  {
    bmi[i]~dnorm(mu[i], tau)
    mu[i]<-b0+b1*black[i]
  }
  #priors
  b0~dnorm(0,.01)
  b1~dnorm(0,.01)
  tau<-pow(sd, -2)
  sd~dunif(0,100)
  #Bayesian 'p-values' for the regression coefficient using the step() function
  #step() is an indicator function that evaluates to 1 if its argument is >= 0, and 0 otherwise,
  #so p1 is the posterior probability that b1 is greater than 1
  p1<-step(b1-1)
  p2<-1-p1
}
"
Next, we have to make a data list for JAGS, which contains everything we are reading into JAGS as data. Note that this list also includes variables used only in later models, so jags.model() will warn about unused variables below.
dat<-list(bmi=tx$bmi5/100, obese=tx$obese, black=tx$black, hisp=tx$hispanic, lths=tx$lths, coll=tx$coll, age=tx$agez, n=length(tx$obese),cofips=tx$conum, ncos=length(unique(tx$cofips)))
#quick summary
lapply(dat, summary)
## $bmi
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12.6 23.4 26.6 27.6 30.5 82.3
##
## $obese
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.273 1.000 1.000
##
## $black
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.0915 0.0000 1.0000
##
## $hisp
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.171 0.000 1.000
##
## $lths
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 0.0 0.0 0.1 0.0 1.0
##
## $coll
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.000 0.681 1.000 1.000
##
## $age
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.620 -0.562 0.162 0.113 0.830 2.500
##
## $n
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6990 6990 6990 6990 6990 6990
##
## $cofips
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 3.00 4.00 4.76 7.00 9.00
##
## $ncos
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 9 9 9 9 9 9
To use JAGS, we have to create a jags.model object, which contains the text representation of our model, our data, and some other parameters for the MCMC run (the number of chains and the number of adaptation iterations).
mod<-jags.model(file=textConnection(model1), data=dat, n.chains=2, n.adapt=1000)
## Warning: Unused variable "obese" in data
## Warning: Unused variable "hisp" in data
## Warning: Unused variable "lths" in data
## Warning: Unused variable "coll" in data
## Warning: Unused variable "age" in data
## Warning: Unused variable "cofips" in data
## Warning: Unused variable "ncos" in data
## Compiling model graph
## Resolving undeclared variables
## Allocating nodes
## Graph Size: 14000
##
## Initializing model
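As an aside, jags.model() also accepts an inits argument (one list per chain) if you want particular starting values or reproducible RNG seeds. A minimal sketch, assigned to a hypothetical mod.seeded object so it does not disturb the fit above:
#optional: explicit starting values and RNG seeds, one list per chain (illustration only)
inits1<-list(list(b0=0, b1=0, sd=1, .RNG.name="base::Mersenne-Twister", .RNG.seed=1),
             list(b0=25, b1=1, sd=5, .RNG.name="base::Mersenne-Twister", .RNG.seed=2))
mod.seeded<-jags.model(file=textConnection(model1), data=dat, inits=inits1, n.chains=2, n.adapt=1000)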
#next, we update the model; this is the "burn-in" period
update(mod, 1000)
If we only want to see summaries of our parameters, we can use jags.samples():
jags.samples(model= mod,variable.names=c("b0", "b1", "p1", "p2", "sd"), n.iter=1000 )
## $b0
## mcarray:
## [1] 27.31
##
## Marginalizing over: iteration(1000),chain(2)
##
## $b1
## mcarray:
## [1] 2.795
##
## Marginalizing over: iteration(1000),chain(2)
##
## $p1
## mcarray:
## [1] 1
##
## Marginalizing over: iteration(1000),chain(2)
##
## $p2
## mcarray:
## [1] 0
##
## Marginalizing over: iteration(1000),chain(2)
##
## $sd
## mcarray:
## [1] 5.987
##
## Marginalizing over: iteration(1000),chain(2)
We can check how we did in comparison to a model fit via least squares in lm():
summary(lm(dat$bmi~dat$black))
##
## Call:
## lm(formula = dat$bmi ~ dat$black)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.76 -4.06 -0.92 2.95 54.97
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.3124 0.0751 363.5 <2e-16 ***
## dat$black 2.7969 0.2483 11.3 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.99 on 6989 degrees of freedom
## Multiple R-squared: 0.0178, Adjusted R-squared: 0.0177
## F-statistic: 127 on 1 and 6989 DF, p-value: <2e-16
The parameters all have extremely similar estimates, and the “p-values” are also in agreement.
Next, we examine a few other elements of the model, including the posterior densities of the parameters. First, we must collect samples of each parameter using the coda.samples() function.
#collect 1000 samples of the betas and the residual sd
samps<-coda.samples(mod, variable.names=c("b0", "b1", "sd"), n.iter=1000)
#Numerical summary of each parameter:
summary(samps)
##
## Iterations = 3001:4000
## Thinning interval = 1
## Number of chains = 2
## Sample size per chain = 1000
##
## 1. Empirical mean and standard deviation for each variable,
## plus standard error of the mean:
##
## Mean SD Naive SE Time-series SE
## b0 27.31 0.0765 0.00171 0.00191
## b1 2.79 0.2443 0.00546 0.00608
## sd 5.99 0.0525 0.00117 0.00153
##
## 2. Quantiles for each variable:
##
## 2.5% 25% 50% 75% 97.5%
## b0 27.16 27.26 27.31 27.36 27.46
## b1 2.30 2.63 2.79 2.94 3.26
## sd 5.89 5.95 5.99 6.02 6.09
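The Bayesian "p-value" p1 from the jags.samples() call above can also be recovered directly from these coda samples; a quick sketch:
#proportion of posterior draws in which b1 exceeds 1; should roughly match p1 above
mean(as.matrix(samps)[, "b1"] > 1)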
#Plot a density of each parameter:
densityplot(samps)
#traceplot of the markov chains:
traceplot(samps)
#autocorrelation plot of each parameter:
autocorr.plot(samps)
autocorr.diag(samps)
## b0 b1 sd
## Lag 0 1.000000 1.000000 1.000000
## Lag 1 0.112180 0.083442 0.267975
## Lag 5 -0.016596 0.002981 -0.030055
## Lag 10 -0.027657 0.018931 -0.009944
## Lag 50 0.001773 -0.013727 0.004047
#Examine convergence of the Markov chains using the Brooks-Gelman-Rubin diagnostic
gelman.plot(samps)
gelman.diag(samps)
## Potential scale reduction factors:
##
## Point est. Upper C.I.
## b0 1 1.00
## b1 1 1.00
## sd 1 1.01
##
## Multivariate psrf
##
## 1
So our model looks good: the densities don't look too out of whack, our traceplots show good mixing in the chains, the autocorrelation statistics suggest minimal autocorrelation in the Markov chains (see the effective sample size check below), and the Brooks-Gelman-Rubin diagnostics show that, numerically, there is little to no variation between the chains.
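As a quick numerical companion to the autocorrelation check, coda's effectiveSize() reports the effective number of independent draws behind each posterior summary (here, out of the 2 x 1000 retained iterations); a sketch:
#effective sample size per parameter; values near 2000 indicate little autocorrelation
effectiveSize(samps)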
Next, we consider a similar simple logistic regression model:
model2<-"
model{
  #Likelihood
  for(i in 1:n)
  {
    obese[i]~dbern(p[i])
    logit(p[i])<-b0+b1*black[i]
  }
  #priors
  b0~dnorm(0,.01)
  b1~dnorm(0,.01)
  p1<-step(b1-1)
  p2<-1-step(b1-1)
}
"
load.module("glm")
## module glm loaded
mod2<-jags.model(file=textConnection(model2), data=dat, n.chains=2, n.adapt=1000)
## Warning: Unused variable "bmi" in data
## Warning: Unused variable "hisp" in data
## Warning: Unused variable "lths" in data
## Warning: Unused variable "coll" in data
## Warning: Unused variable "age" in data
## Warning: Unused variable "cofips" in data
## Warning: Unused variable "ncos" in data
## Compiling model graph
## Resolving undeclared variables
## Allocating nodes
## Graph Size: 13997
##
## Initializing model
update(mod2, 1000)
jags.samples(mod2,variable.names=c("b0", "b1", "p1", "p2"), n.iter=1000 )
## $b0
## mcarray:
## [1] -1.055
##
## Marginalizing over: iteration(1000),chain(2)
##
## $b1
## mcarray:
## [1] 0.7516
##
## Marginalizing over: iteration(1000),chain(2)
##
## $p1
## mcarray:
## [1] 0.002
##
## Marginalizing over: iteration(1000),chain(2)
##
## $p2
## mcarray:
## [1] 0.998
##
## Marginalizing over: iteration(1000),chain(2)
Compare this to a logistic regression from a glm() fit:
summary(glm(dat$obese~dat$black, family=binomial))
##
## Call:
## glm(formula = dat$obese ~ dat$black, family = binomial)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.052 -0.773 -0.773 1.308 1.645
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.0552 0.0287 -36.80 <2e-16 ***
## dat$black 0.7529 0.0849 8.86 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 8203.3 on 6990 degrees of freedom
## Residual deviance: 8128.1 on 6989 degrees of freedom
## AIC: 8132
##
## Number of Fisher Scoring iterations: 4
Again, we see correspondence. Now we can again look at the posterior densities and other aspects of our models:
#collect 1000 samples of the betas
samps2<-coda.samples(mod2, variable.names=c("b0", "b1"), n.iter=1000)
#Numerical summary of each parameter:
summary(samps2)
##
## Iterations = 2001:3000
## Thinning interval = 1
## Number of chains = 2
## Sample size per chain = 1000
##
## 1. Empirical mean and standard deviation for each variable,
## plus standard error of the mean:
##
## Mean SD Naive SE Time-series SE
## b0 -1.053 0.0277 0.000619 0.000779
## b1 0.749 0.0838 0.001873 0.002017
##
## 2. Quantiles for each variable:
##
## 2.5% 25% 50% 75% 97.5%
## b0 -1.109 -1.072 -1.052 -1.034 -0.999
## b1 0.585 0.693 0.749 0.805 0.915
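Because b1 is on the log-odds scale, it can be easier to read as an odds ratio. Here is a minimal sketch using the collected samples, comparing the odds of obesity for black versus non-black respondents:
#posterior median and 95% interval for the odds ratio exp(b1)
or<-exp(as.matrix(samps2)[, "b1"])
quantile(or, probs=c(0.025, 0.5, 0.975))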
#Plot a density of each parameter:
densityplot(samps2)
#traceplot of the markov chains:
traceplot(samps2)
#autocorrelation plot of each parameter:
autocorr.plot(samps2)
autocorr.diag(samps2)
## b0 b1
## Lag 0 1.000000 1.000000
## Lag 1 0.225732 0.138546
## Lag 5 0.042471 -0.021416
## Lag 10 0.009694 -0.015765
## Lag 50 -0.011985 0.001446
#Examine convergence of the Markov chains using the Brooks-Gelman-Rubin diagnostic
gelman.plot(samps2)
gelman.diag(samps2)
## Potential scale reduction factors:
##
## Point est. Upper C.I.
## b0 1 1.01
## b1 1 1.00
##
## Multivariate psrf
##
## 1
So, again, our model looks good: the densities don't look too out of whack, our traceplots show good mixing in the chains, the autocorrelation statistics suggest minimal autocorrelation in the Markov chains, and the Brooks-Gelman-Rubin diagnostics show that, numerically, there is little to no variation between the chains.