This example will go through the basics of using Stan by way of the brms library, for estimation of simple linear and generalized linear models. You must install brms first, using install.packages("brms").

We will use the ECLS-K 2011 data for our example, and use height for age/sex z-score as a continuous outcome, and short for age status outcome (height for age z < -1) as a dichotomous outcome.

There is an overview on using brms for fitting various models. You can find these here. The package rstanarm is very similar and better documented, and can be found here, with numerous tutorials on that page.

Both packages serve as front-ends to the Stan library for MCMC. People new to Stan can often be put off by its syntax and model construction. Both of these packages allow us to use R syntax that we are accustomed to from functions like glm() and lmer() to fit models. We can also see the Stan code from the model that is generated, so we can learn how Stan works inside.

Libraries

library (car)
## Loading required package: carData
library(brms)
## Loading required package: Rcpp
## Loading required package: ggplot2
## Loading 'brms' package (version 2.4.0). Useful instructions
## can be found by typing help('brms'). A more detailed introduction
## to the package is available through vignette('brms_overview').
## Run theme_set(theme_default()) to use the default bayesplot theme.
library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
library(rstan)
## Loading required package: StanHeaders
## rstan (Version 2.17.3, GitRev: 2e1f913d3ca3)
## For execution on a local, multicore CPU with excess RAM we recommend calling
## options(mc.cores = parallel::detectCores()).
## To avoid recompilation of unchanged Stan programs, we recommend calling
## rstan_options(auto_write = TRUE)
library(haven)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Load data and recode some variables:

eclsk11<-read_sas("~/Google Drive/classes/dem7473/data/childk4p.sas7bdat")
names(eclsk11)<-tolower(names(eclsk11))
#get out only the variables I'm going to use for this example

myvars<-c("childid", "x_chsex_r", "x1locale", "x_raceth_r", "x2povty", "x12par1ed_i","p1curmar", "x1htotal",
          "x1mscalk4", "x2mscalk4", "x3mscalk4", "x4mscalk4","x5mscalk4","x6mscalk4","x7mscalk4","x8mscalk4",
          "p1hscale", "p2hscale", "p4hscale","p6hscale","p7hscale","p8hscale",
           "x12sesl","x4sesl_i", "p2parct1", "p2parct2", "s1_id", "p2safepl","x2krceth","p1o2near", "x_distpov" ,
          "w8cf8p_80","w8cf8p_8psu","w8cf8p_8str","x2fsstat2","x4fsstat2", "x1height","x2height","x3height","x4height","x5height","x6height","x7height","x8height",
        "x1kage_r","x2kage_r","x3age","x4age", "x5age", "x6age", "x7age", "x8age", "w1c0","w1c0psu","w1c0str", "w1p0", "w4c4p_20", "w4cf4p_2str", "w4cf4p_2psu")


#subset the data
eclsk.sub<-eclsk11[,myvars]

rm(eclsk11); gc()
##           used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells 2125473 113.6    4342178  231.9   4342178  231.9
## Vcells 4668534  35.7  683028732 5211.1 783895735 5980.7

Longitudinal variables

#First we recode some Child characteristics
#Child's sex: recode as male =1
eclsk.sub$male<-Recode(eclsk.sub$x_chsex_r, recodes="1=1; 2=0; -9=NA")

#Recode race with white, non Hispanic as reference using dummy vars
eclsk.sub$hisp<-Recode(eclsk.sub$x_raceth_r, recodes="3:4=1;-9=NA; else=0")
eclsk.sub$black<-Recode(eclsk.sub$x_raceth_r, recodes="2=1;-9=NA; else=0")
eclsk.sub$asian<-Recode(eclsk.sub$x_raceth_r, recodes="5=1;-9=NA; else=0")
eclsk.sub$nahn<-Recode(eclsk.sub$x_raceth_r, recodes="6:7=1;-9=NA; else=0")
eclsk.sub$other<-Recode(eclsk.sub$x_raceth_r, recodes="8=1;-9=NA; else=0")


#Longitudinal variables
#recode our outcomes, the  first is the child's math standardized test score  in Kindergarten
eclsk.sub$math1<-ifelse(eclsk.sub$x1mscalk4<0, NA, eclsk.sub$x1mscalk4)
eclsk.sub$math2<-ifelse(eclsk.sub$x2mscalk4<0, NA, eclsk.sub$x2mscalk4)
eclsk.sub$math4<-ifelse(eclsk.sub$x4mscalk4<0, NA, eclsk.sub$x4mscalk4)
eclsk.sub$math6<-ifelse(eclsk.sub$x6mscalk4<0, NA, eclsk.sub$x6mscalk4)
eclsk.sub$math7<-ifelse(eclsk.sub$x7mscalk4<0, NA, eclsk.sub$x7mscalk4)
eclsk.sub$math8<-ifelse(eclsk.sub$x8mscalk4<0, NA, eclsk.sub$x8mscalk4)

#Second outcome is child's height for age, continuous outcome
eclsk.sub$height1<-ifelse(eclsk.sub$x1height<=-7, NA, eclsk.sub$x1height)
eclsk.sub$height2<-ifelse(eclsk.sub$x2height<=-7, NA, eclsk.sub$x2height)
eclsk.sub$height4<-ifelse(eclsk.sub$x4height<=-7, NA, eclsk.sub$x4height)
eclsk.sub$height6<-ifelse(eclsk.sub$x6height<=-7, NA, eclsk.sub$x6height)
eclsk.sub$height7<-ifelse(eclsk.sub$x7height<=-7, NA, eclsk.sub$x7height)
eclsk.sub$height8<-ifelse(eclsk.sub$x8height<=-7, NA, eclsk.sub$x8height)

#Age at each wave
eclsk.sub$age_yrs1<-ifelse(eclsk.sub$x1kage_r<0, NA, eclsk.sub$x1kage_r/12)
eclsk.sub$age_yrs2<-ifelse(eclsk.sub$x2kage_r<0, NA, eclsk.sub$x2kage_r/12)
#eclsk.sub$age_yrs3<-ifelse(eclsk.sub$x3age<0, NA, eclsk.sub$x3age/12)
eclsk.sub$age_yrs4<-ifelse(eclsk.sub$x4age<0, NA, eclsk.sub$x4age/12)
eclsk.sub$age_yrs6<-ifelse(eclsk.sub$x6age<0, NA, eclsk.sub$x6age/12)
eclsk.sub$age_yrs7<-ifelse(eclsk.sub$x7age<0, NA, eclsk.sub$x7age/12)
eclsk.sub$age_yrs8<-ifelse(eclsk.sub$x8age<0, NA, eclsk.sub$x8age/12)



eclsk.sub<- eclsk.sub[is.na(eclsk.sub$age_yrs1)==F, ]

#Height for age z score standardized by sex and age
eclsk.sub$height_z1<-ave(eclsk.sub$height1, as.factor(paste(round(eclsk.sub$age_yrs1, 1.5), eclsk.sub$male)), FUN=scale)
eclsk.sub$height_z2<-ave(eclsk.sub$height2, as.factor(paste(round(eclsk.sub$age_yrs2, 1.5), eclsk.sub$male)), FUN=scale)
#eclsk.sub$height_z3<-ave(eclsk.sub$height3, as.factor(paste(round(eclsk.sub$age_yrs3, 1.5), eclsk.sub$male)), FUN=scale)
eclsk.sub$height_z4<-ave(eclsk.sub$height4, as.factor(paste(round(eclsk.sub$age_yrs4, 1.5), eclsk.sub$male)), FUN=scale)
eclsk.sub$height_z6<-ave(eclsk.sub$height6, as.factor(paste(round(eclsk.sub$age_yrs6, 1.5), eclsk.sub$male)), FUN=scale)
eclsk.sub$height_z7<-ave(eclsk.sub$height7, as.factor(paste(round(eclsk.sub$age_yrs7, 1.5), eclsk.sub$male)), FUN=scale)
eclsk.sub$height_z8<-ave(eclsk.sub$height8, as.factor(paste(round(eclsk.sub$age_yrs8, 1.5), eclsk.sub$male)), FUN=scale)



#Child health assessment Excellent to poor , ordinal outcome
eclsk.sub$chhealth1<-ifelse(eclsk.sub$p1hscale<0, NA, eclsk.sub$p1hscale)
eclsk.sub$chhealth2<-ifelse(eclsk.sub$p2hscale<0, NA, eclsk.sub$p2hscale)
eclsk.sub$chhealth4<-ifelse(eclsk.sub$p4hscale<0, NA, eclsk.sub$p4hscale)
eclsk.sub$chhealth6<-ifelse(eclsk.sub$p6hscale<0, NA, eclsk.sub$p6hscale)
eclsk.sub$chhealth7<-ifelse(eclsk.sub$p7hscale<0, NA, eclsk.sub$p7hscale)
eclsk.sub$chhealth8<-ifelse(eclsk.sub$p8hscale<0, NA, eclsk.sub$p8hscale)

#SES
eclsk.sub$hhses1<-ifelse(eclsk.sub$x12sesl==-9, NA, scale(eclsk.sub$x12sesl))
eclsk.sub$hhses4<-ifelse(eclsk.sub$x4sesl_i==-9, NA, scale(eclsk.sub$x4sesl_i))


#Household food insecurity, dichotomous outcome
#This outcome is only present at two waves
eclsk.sub$foodinsec1<-Recode(eclsk.sub$x2fsstat2, recodes="2:3=1; 1=0; else=NA")
eclsk.sub$foodinsec2<-Recode(eclsk.sub$x4fsstat2, recodes="2:3=1; 1=0; else=NA")

Non time varying variables

#First we recode some Child characteristics
#Child's sex: recode as male =1
eclsk.sub$male<-Recode(eclsk.sub$x_chsex_r, recodes="1=1; 2=0; -9=NA")

#Recode race with white, non Hispanic as reference using dummy vars
eclsk.sub$hisp<-Recode(eclsk.sub$x_raceth_r, recodes="3:4=1;-9=NA; else=0")
eclsk.sub$black<-Recode(eclsk.sub$x_raceth_r, recodes="2=1;-9=NA; else=0")
eclsk.sub$asian<-Recode(eclsk.sub$x_raceth_r, recodes="5=1;-9=NA; else=0")
eclsk.sub$nahn<-Recode(eclsk.sub$x_raceth_r, recodes="6:7=1;-9=NA; else=0")
eclsk.sub$other<-Recode(eclsk.sub$x_raceth_r, recodes="8=1;-9=NA; else=0")


#Then we recode some parent/mother characteristics
#Mother's education, recode as 2 dummys with HS = reference
eclsk.sub$lths<-Recode(eclsk.sub$x12par1ed_i, recodes = "0:2=1; 3:8=0; else = NA")
eclsk.sub$gths<-Recode(eclsk.sub$x12par1ed_i, recodes = "1:3=0; 4:8=1; else =NA") 

#marital status, recode as 2 dummys, ref= married
eclsk.sub$single<-Recode(eclsk.sub$p1curmar, recodes="4=1; -7:-9=NA; else=0")
eclsk.sub$notmar<-Recode(eclsk.sub$p1curmar, recodes="2:3=1; -7:-9=NA; else=0")


#Then we do some household level variables

#Urban school location = 1
eclsk.sub$urban<-Recode(eclsk.sub$x1locale, recodes = "1:3=1; 4=0; -1:-9=NA")

#poverty level in poverty = 1
eclsk.sub$pov<-Recode(eclsk.sub$x2povty , recodes ="1:2=1; 3=0; -9=NA")

#Household size
eclsk.sub$hhsize<-eclsk.sub$x1htotal

#school % minority student body
eclsk.sub$minorsch<-ifelse(eclsk.sub$x2krceth <0, NA, eclsk.sub$x2krceth/10)

#Unsafe neighborhood
eclsk.sub$unsafe<-Recode(eclsk.sub$p2safepl , recodes = "1:2='unsafe'; 3='safe'; else=NA", as.factor = T)

#school district poverty
eclsk.sub$dist_pov<-ifelse(eclsk.sub$x_distpov==-9, NA, scale(eclsk.sub$x_distpov))


#mean centered weights for wave 1
eclsk.sub$wts1<-eclsk.sub$w4c4p_20/mean(eclsk.sub$w4c4p_20, na.rm=T)
#eclsk.sub$wtsp1<-eclsk.sub$w1p0/mean(eclsk.sub$w1p0, na.rm=T)

Linear Regression Example

Here is a simple linear regression model for bmi using a single predictor variable. If we were to write the full model specification, it would be:

\(y \sim N(X' \beta, \sigma^2 )\)

In terms of priors, we need them for each \(\beta\) and for \(\sigma^2\). It is common to use a Normal prior for \(\beta\) parameters, since they can be both positive or negative, but the null hypothesis is that they are 0. For example a high variance prior can be contrasted with a low variance prior. Here’s a plot of this:

Normal distributions

So, a Normal prior with a \(\sigma\) of 10, is pretty vague, but a \(\sigma\) of 100, is really vague.

Student - t distributions

Often the Student - t distribution, or it’s truncated version , the half-Student t distribution will be use to model regression coefficients or variances:

#Student - t
x <- seq(-9, 9, length=100)
hx <- dstudent_t(x, df=4, sigma = 1)

degf <- c(2, 2,5, 5)
sigs<-c(1,5,1,5)
colors <- c("black", "red", "red", "blue", "blue", "green")
ltys<-c(1,2,1,2,1)
labels <- c("df=4, sd=1", "df=2, sd=1", "df=2, sd=5", "df=5, sd=1", "df=5, sd=5")

plot(x, hx, type="l", lty=1, xlab="x value",
  ylab="Density", main="Comparison of Student -t Distributions")

for (i in 1:4){ 
  lines(x, dstudent_t(x,sigma = sigs[i],degf[i]), lwd=2, col=colors[i], lty=ltys[i])
}

legend("topright", inset=.05, title="Distributions",
  labels, lwd=2,  col=colors, lty=c(1,1,2,1,2,1))

plot(x, hx, type="l", lty=1, xlab="x value", xlim=c(0, 9),
  ylab="Density", main="Comparison of half Student -t Distributions")

for (i in 1:4){ 
  lines(x, dstudent_t(x,sigma = sigs[i],degf[i]), lwd=2, col=colors[i], lty=ltys[i])
}

legend("topright", inset=.05, title="Distributions",
  labels, lwd=2,  col=colors, lty=c(1,1,2,1,2,1))

Likewise, for a variance parameter, we have a few options. In the past, inverse Gamma priors were used for variances, but Gelman (2006) suggests using a half-Cauchy prior. Here are some comparisons:

#Cauchy's
x <- seq(.0000001, 10, length=100)
hx <- dcauchy(x, scale=1)

degf <- c(.1, .5,1, 10)
colors <- c("red", "blue", "darkgreen", "gold", "black")
labels <- c("scale=.1", "scale=.5", "scale=1", "scale=10")

plot(x, hx, type="l", lty=2, xlab="x value", ylim=c(0, .5),
  ylab="Density", main="Comparison of Half-Cauchy Distributions")

for (i in 1:4){ 
  lines(x, dcauchy(x,0,degf[i]), lwd=2, col=colors[i])
}

legend("topright", inset=.05, title="Distributions",
  labels, lwd=2, lty=c(1, 1, 1, 1, 2), col=colors)

#Gamma's

x <- seq(0.0000001, 10, length=100)
hx <- densigamma(x, alpha = 1, beta = 1 )

degf <- c(.5, .1 ,.01, .001)
colors <- c("red", "blue", "darkgreen", "gold", "black")
labels <- c("b=.5", "b=.1", "b=.01", "b=.001")

plot(x, hx, type="l", lty=2, xlab="x value", ylim=c(0, 1),
  ylab="Density", main=c("Comparison of Inverse Gamma","Distributions with a = .5"))

for (i in 1:4){ 
  lines(x, densigamma(x,.5,degf[i]), lwd=2, col=colors[i])
}

legend("topright", inset=.05, title="Distributions",
  labels, lwd=2, lty=c(1, 1, 1, 1, 2), col=colors)

So, we see there is quite a variety of prior densities that we can get here. The big question is: “Does it matter much?”. Below I will specify a simple multiple regression model, again, with a Normal likelihood and various prior definitions.

\(y \sim N(X' \beta, \sigma )\)

There a loads of ways to do this, but I like doing it this way.

Here’s the linear model fit via ML

#Clean the data

eclsk.sub<-eclsk.sub%>%
  select(height_z2,chhealth1,foodinsec1,hhsize,hhses1,wts1, male, chhealth2)%>%
  filter(complete.cases(.))
fit.lm<- glm(height_z2~chhealth1+foodinsec1+hhsize+hhses1, data=eclsk.sub, weights=wts1)
summary(fit.lm)
## Warning in summary.glm(fit.lm): observations with zero weight not used for
## calculating dispersion
## 
## Call:
## glm(formula = height_z2 ~ chhealth1 + foodinsec1 + hhsize + hhses1, 
##     data = eclsk.sub, weights = wts1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -6.1677  -0.6031   0.0000   0.5801   5.6269  
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.253385   0.042525   5.958 2.64e-09 ***
## chhealth1   -0.046592   0.013388  -3.480 0.000504 ***
## foodinsec1  -0.079490   0.032565  -2.441 0.014667 *  
## hhsize      -0.034126   0.007784  -4.384 1.18e-05 ***
## hhses1       0.031048   0.013785   2.252 0.024322 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1.228013)
## 
##     Null deviance: 11042  on 8938  degrees of freedom
## Residual deviance: 10971  on 8934  degrees of freedom
## AIC: Inf
## 
## Number of Fisher Scoring iterations: 2
confint(fit.lm)
## Waiting for profiling to be done...
## Warning in summary.glm(fitted): observations with zero weight not used for
## calculating dispersion
##                    2.5 %      97.5 %
## (Intercept)  0.170037424  0.33673245
## chhealth1   -0.072832416 -0.02035119
## foodinsec1  -0.143316104 -0.01566397
## hhsize      -0.049382809 -0.01886873
## hhses1       0.004030881  0.05806565

Here we use Stan to do a bayesian model

Using default priors. brms uses improper, flat priors for the regression effects, and half student-t priors for the Intercept and residual variance.

This would specify the model as:

\[y_i \sim \beta_0+ \sum_k \beta_k x_{ik} +e_i\] The intercept has prior:

\[\beta_0 \sim \text{Student -t}(\text{df} =3, \mu=0, \sigma=1 )\] The \(\beta\)’s have priors:

\[\beta_k \sim \text{Unif} (- \infty , \infty)\]

And the residual variance has prior:

\[\sigma_e \sim \text{Student -t}(\text{df} =3, \mu=0, \sigma=1 )\]

options(mc.cores = parallel::detectCores())

fit.1<-brm(height_z2|weights(wts1)~chhealth1+foodinsec1+hhsize+hhses1, data=eclsk.sub, family=gaussian(),
              warmup=1000, #burnin for 500 interations for each chain = 1000 burnin
              iter=5000, chains=2, #2*5000 = 10000 - 1000 burnin = 9000 total iterations
              cores=2,seed = 1115, #number of computer cores, 1 per chain is good.
            refresh = 0
              )
## Compiling the C++ model
## Start sampling

Now I print the summaries of the model fit by likelihood and the Bayesian model:

summary(fit.lm)
## Warning in summary.glm(fit.lm): observations with zero weight not used for
## calculating dispersion
## 
## Call:
## glm(formula = height_z2 ~ chhealth1 + foodinsec1 + hhsize + hhses1, 
##     data = eclsk.sub, weights = wts1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -6.1677  -0.6031   0.0000   0.5801   5.6269  
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.253385   0.042525   5.958 2.64e-09 ***
## chhealth1   -0.046592   0.013388  -3.480 0.000504 ***
## foodinsec1  -0.079490   0.032565  -2.441 0.014667 *  
## hhsize      -0.034126   0.007784  -4.384 1.18e-05 ***
## hhses1       0.031048   0.013785   2.252 0.024322 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1.228013)
## 
##     Null deviance: 11042  on 8938  degrees of freedom
## Residual deviance: 10971  on 8934  degrees of freedom
## AIC: Inf
## 
## Number of Fisher Scoring iterations: 2
print(summary(fit.1), digits=3)
##  Family: gaussian 
##   Links: mu = identity; sigma = identity 
## Formula: height_z2 | weights(wts1) ~ chhealth1 + foodinsec1 + hhsize + hhses1 
##    Data: eclsk.sub (Number of observations: 10063) 
## Samples: 2 chains, each with iter = 5000; warmup = 1000; thin = 1;
##          total post-warmup samples = 8000
## 
## Population-Level Effects: 
##            Estimate Est.Error l-95% CI u-95% CI Eff.Sample  Rhat
## Intercept     0.254     0.038    0.180    0.329       8000 1.000
## chhealth1    -0.047     0.012   -0.070   -0.024       8000 1.000
## foodinsec1   -0.079     0.029   -0.138   -0.021       8000 1.000
## hhsize       -0.034     0.007   -0.048   -0.021       8000 1.000
## hhses1        0.031     0.012    0.007    0.055       8000 1.000
## 
## Family Specific Parameters: 
##       Estimate Est.Error l-95% CI u-95% CI Eff.Sample  Rhat
## sigma    0.974     0.006    0.962    0.987       8000 1.000
## 
## Samples were drawn using sampling(NUTS). For each parameter, Eff.Sample 
## is a crude measure of effective sample size, and Rhat is the potential 
## scale reduction factor on split chains (at convergence, Rhat = 1).

We see great similarity between the two models. Although the Bayesian model shows more significant effects

Plotting results

plot(fit.1, N=2)

Specifying new priors for the analysis

Here, we change the default priors for the model parameters to be Gaussian for the \(\beta\)’s and Inverse Gamma for the residual variances

prior1<-c(set_prior("normal(0,1)", class="b"),#prior for the beta's
         set_prior("normal(0,1)", class="Intercept"), #prior for the intercept
         set_prior("inv_gamma(.5,.5)", class="sigma"))#prior for the residual std. deviation

fit.1b<-brm(height_z2|weights(wts1)~chhealth1+foodinsec1+hhsize+hhses1, data=eclsk.sub, family=gaussian(),
              prior = prior1, 
              warmup=1000, #burnin for 1000 interations for each chain = 1000 burnin
              iter=5000, chains=2, #2*5000 = 10000 - 2000 burnin = 8000 total iterations
              cores=2,seed = 1115, #number of computer cores, 1 per chain is good.
              save_model="fit_lm_gamma_stan.txt", refresh = 0
            )
## Compiling the C++ model
## Start sampling

Here we check for sensitivity of the model estimates to the different priors. First I summarize the two analyses using the posterior mean, sd’s and credible intervals, then plot the densities of two of the model parameters.

In this analysis, the results are very robust to alternative prior choices.

Here I write a custom summary function that takes the sampled output from the models, and computes the posterior mean, posterior standard deviation (like the s.e(\(\beta\))) and the Bayesian credible interval for each parameter.

The Bayesian credible interval is what people think the confidence interval is, but they’re wrong. People believe that a frequentist confidence interval says something about the uncertainty in the parameter, but it really speaks to the repeat-ability of the observation. The Bayesian credible interval concerns the uncertainty in the parameter, which is what we want.

The BCI is found by taking the \(\alpha/2\) and \(1-\alpha\2\) values of the posterior density, where \(\alpha\) is the level of certainty you desire. So for a 95% credible interval, we would find the 2.5% and 97.5% values of the posterior.

out1<-as.data.frame(fit.1)
out2<-as.data.frame(fit.1b)

mysum<-function(x){
  mu=mean(x,na.rm=T)
  sdev=sd(x,na.rm=T)
  lci=quantile(x, p=.025,na.rm=T)
  uci=quantile(x, p=.975,na.rm=T)
list(mu=mu, sd=sdev, lci=lci, uci=uci)
  }

sapply(out1, mysum )
##     b_Intercept b_chhealth1 b_foodinsec1 b_hhsize    b_hhses1   
## mu  0.2538347   -0.04670047 -0.07936257  -0.03418134 0.03099305 
## sd  0.03781811  0.01172413  0.029399     0.00693087  0.0122303  
## lci 0.1797688   -0.06995061 -0.1384198   -0.04779784 0.007454393
## uci 0.3289076   -0.02400458 -0.02123371  -0.02058464 0.05514832 
##     sigma       lp__     
## mu  0.9740066   -16119.38
## sd  0.006386527 1.762603 
## lci 0.9615856   -16123.7 
## uci 0.9865148   -16116.94
sapply(out2, mysum)
##     b_Intercept b_chhealth1 b_foodinsec1 b_hhsize    b_hhses1   
## mu  0.2530128   -0.04656529 -0.07944534  -0.03404831 0.03092866 
## sd  0.03721325  0.01162452  0.02867235   0.006824437 0.01210782 
## lci 0.1802561   -0.06982933 -0.1358529   -0.04761781 0.006724002
## uci 0.3251884   -0.02334205 -0.02357385  -0.02059293 0.05449759 
##     sigma       lp__     
## mu  0.9741448   -16119.4 
## sd  0.006564318 1.75416  
## lci 0.9613761   -16123.7 
## uci 0.9872675   -16116.99

Densities for parameter in each model

If the models are invariant to the prior, the two lines should be very close to each other.

ggplot()+geom_density(data=out1, aes(b_chhealth1), col="red", alpha=.5, lwd=1.5)+geom_density(data=out2, aes(b_chhealth1), col="green", alpha=.5, lwd=1.5)

ggplot()+geom_density(data=out1, aes(sigma), col="red", alpha=.5, lwd=1.5)+geom_density(data=out2, aes(sigma), col="green", alpha=.5, lwd=1.5)

Bayesian P values.

In Bayesian models, we don’t get a p-value, we get the coverage interval, or Bayesian credible interval which gives us the values of the posterior density that are the lower and upper p percentiles of the posterior.

We can also calculate Bayesian p-values which give us the probability a value is greater or less than a specified value. So, if we observe a negative \(\beta\) parameter, but we want to know the probability that the \(\beta\) is really less than 0, we can calculate \(Pr(\theta < 0)\). This equates to finding the area o under the curve between 0 and the lower bound of the density. Since we have lots of samples from the posterior of the parameter, we can find this via Monte Carlo integration by taking the mean of the indicator function:

\[Pr(\theta < 0) = \frac{1}{n_{sims}} \sum I(\theta < \hat{\theta})\]

Here I write a function to calculate Bayesian p-values based on if the posterior mean of the parameter is positive or negative.

bayespval<-function(x){
  if(mean(x)<0) {
    pval=mean(x<0)
  }
  else{
    pval=mean(x>0)
  }
list(pval=pval)
}
sapply(out1, bayespval)
## $b_Intercept.pval
## [1] 1
## 
## $b_chhealth1.pval
## [1] 1
## 
## $b_foodinsec1.pval
## [1] 0.996625
## 
## $b_hhsize.pval
## [1] 1
## 
## $b_hhses1.pval
## [1] 0.994625
## 
## $sigma.pval
## [1] 1
## 
## $lp__.pval
## [1] 1

Weird priors - affect the posterior

Now I try to break the model by putting very strong priors on the regression effects.

#bayesian model half cauchy prior on residual variance
prior_weird<-c(set_prior("normal(10,.001)", class="b"),#prior for the beta's
         set_prior("normal(0,1)", class="Intercept"), #prior for the intercept
         set_prior("cauchy(0,5)", class="sigma"))#prior for the residual std. deviation


fit.2<-brm(height_z2|weights(wts1)~chhealth1+foodinsec1+hhsize+hhses1, data=eclsk.sub, family=gaussian(),
              prior = prior_weird, 
              warmup=1000, #burnin for 500 interations for each chain = 1000 burnin
              iter=5000, chains=2, #2*5000 = 10000 - 1000 burnin = 9000 total iterations
              cores=2,seed = 1115, #number of computer cores, 1 per chain is good.
              save_model="fit_lm_weird_stan.txt", #so we can see the stan code
            refresh = 0)
## Compiling the C++ model
## Start sampling
print(summary(fit.2), digits=3)
##  Family: gaussian 
##   Links: mu = identity; sigma = identity 
## Formula: height_z2 | weights(wts1) ~ chhealth1 + foodinsec1 + hhsize + hhses1 
##    Data: eclsk.sub (Number of observations: 10063) 
## Samples: 2 chains, each with iter = 5000; warmup = 1000; thin = 1;
##          total post-warmup samples = 8000
## 
## Population-Level Effects: 
##            Estimate Est.Error l-95% CI u-95% CI Eff.Sample  Rhat
## Intercept   -63.548     0.155  -63.857  -63.242       3544 1.001
## chhealth1    10.000     0.001    9.998   10.002       8000 1.000
## foodinsec1   10.000     0.001    9.998   10.002       8000 1.000
## hhsize        9.999     0.001    9.997   10.001       8000 1.000
## hhses1       10.000     0.001    9.998   10.002       8000 1.000
## 
## Family Specific Parameters: 
##       Estimate Est.Error l-95% CI u-95% CI Eff.Sample  Rhat
## sigma   16.613     0.108   16.399   16.826       4381 1.000
## 
## Samples were drawn using sampling(NUTS). For each parameter, Eff.Sample 
## is a crude measure of effective sample size, and Rhat is the potential 
## scale reduction factor on split chains (at convergence, Rhat = 1).
plot(fit.2,N=2)

out1<-as.data.frame(fit.1)
out2<-as.data.frame(fit.2)


sapply(out1, mysum )
##     b_Intercept b_chhealth1 b_foodinsec1 b_hhsize    b_hhses1   
## mu  0.2538347   -0.04670047 -0.07936257  -0.03418134 0.03099305 
## sd  0.03781811  0.01172413  0.029399     0.00693087  0.0122303  
## lci 0.1797688   -0.06995061 -0.1384198   -0.04779784 0.007454393
## uci 0.3289076   -0.02400458 -0.02123371  -0.02058464 0.05514832 
##     sigma       lp__     
## mu  0.9740066   -16119.38
## sd  0.006386527 1.762603 
## lci 0.9615856   -16123.7 
## uci 0.9865148   -16116.94
sapply(out2, mysum)
##     b_Intercept b_chhealth1 b_foodinsec1 b_hhsize    b_hhses1    sigma    
## mu  -63.54847   9.999758    9.999952     9.999267    9.999857    16.61255 
## sd  0.1553188   0.001003751 0.0009984208 0.001013839 0.001015203 0.1080021
## lci -63.85675   9.997781    9.997977     9.99728     9.997822    16.3987  
## uci -63.24245   10.00175    10.00193     10.00123    10.00181    16.82598 
##     lp__     
## mu  -48913.9 
## sd  1.747187 
## lci -48918.17
## uci -48911.48
ggplot()+geom_density(data=out1, aes(b_chhealth1), col="red", alpha=.5, lwd=1.5)+geom_density(data=out2, aes(b_chhealth1), col="green", alpha=.5, lwd=1.5)

ggplot()+geom_density(data=out1, aes(sigma), col="red", alpha=.5, lwd=1.5)+geom_density(data=out2, aes(sigma), col="green", alpha=.5, lwd=1.5)

Which certainly worked to break the model results!

Zero priors

If we assign a prior with a zero density at some point in the distribution, the posterior will also have 0 density.

prior_weird<-c(set_prior("uniform(0,1)", lb = 0, ub = 1, class="b"),#prior for the beta's
         set_prior("normal(0,1)", class="Intercept"), #prior for the intercept
         set_prior("cauchy(0,5)", class="sigma"))#prior for the residual std. deviation


fit.2<-brm(height_z2|weights(wts1)~chhealth1+foodinsec1+hhsize+hhses1, data=eclsk.sub, family=gaussian(),
              prior = prior_weird, 
              warmup=1000, #burnin for 500 interations for each chain = 1000 burnin
              iter=5000, chains=2, #2*5000 = 10000 - 1000 burnin = 9000 total iterations
              cores=2,seed = 1115, #number of computer cores, 1 per chain is good.
              save_model="fit_lm_weird_stan.txt", #so we can see the stan code
            refresh = 0)
## Compiling the C++ model
## Start sampling
out1<-as.data.frame(fit.1)
out2<-as.data.frame(fit.2)


sapply(out1, mysum )
##     b_Intercept b_chhealth1 b_foodinsec1 b_hhsize    b_hhses1   
## mu  0.2538347   -0.04670047 -0.07936257  -0.03418134 0.03099305 
## sd  0.03781811  0.01172413  0.029399     0.00693087  0.0122303  
## lci 0.1797688   -0.06995061 -0.1384198   -0.04779784 0.007454393
## uci 0.3289076   -0.02400458 -0.02123371  -0.02058464 0.05514832 
##     sigma       lp__     
## mu  0.9740066   -16119.38
## sd  0.006386527 1.762603 
## lci 0.9615856   -16123.7 
## uci 0.9865148   -16116.94
sapply(out2, mysum)
##     b_Intercept b_chhealth1 b_foodinsec1 b_hhsize     b_hhses1  
## mu  0.001293229 0.002562749 0.007530319  0.001241044  0.056101  
## sd  0.01147551  0.002507515 0.007013844  0.001210164  0.01168974
## lci -0.0228679  7.11159e-05 0.0002436454 3.303197e-05 0.0332066 
## uci 0.02277094  0.009532895 0.0258853    0.004363519  0.07911399
##     sigma       lp__     
## mu  0.976339    -16165.82
## sd  0.006392862 1.850657 
## lci 0.964033    -16170.24
## uci 0.9887433   -16163.23
ggplot()+geom_density(data=out1, aes(b_chhealth1), col="red", alpha=.5, lwd=1.5)+geom_density(data=out2, aes(b_chhealth1), col="green", alpha=.5, lwd=1.5)

ggplot()+geom_density(data=out1, aes(sigma), col="red", alpha=.5, lwd=1.5)+geom_density(data=out2, aes(sigma), col="green", alpha=.5, lwd=1.5)

Lesson learned - be very careful with 0 priors!

Alternative Lilkelihoods

In the Bayesian context, it’s easier to fit models that assume other types of distributions for our outcome. For example, we can use a Student-t distribution with \(\nu\) degrees of freedom for a continuous outcome, instead of a normal distribution. This is often called a type of “robust regression” because the tails of the t distribution are fatter than those of a normal, so the variance in the residuals can be greater. In the case below, our model would be specified as:

\[y \sim t(X' \beta, \sigma^2 , \nu)\]

and priors:

\(\beta \sim N(0, 1)\)

\(\sigma \sim half cauchy(5)\)

\(\nu \sim gamma(1, 1.5)\)

#Student's T regression model
prior<-c(set_prior("normal(0,1)", class="b"),#prior for the beta's
         set_prior("cauchy(0,5)", class="sigma"),#prior for the residual std. deviation
         set_prior("gamma(1,1.5)", class="nu")) #prior for the t distribution d.f.

fit.lm.b.t<-brm(height_z2|weights(wts1)~chhealth1+foodinsec1+hhsize+hhses1, data=eclsk.sub, family=student(),
              prior = prior, 
              warmup=1000, 
              iter=5000, chains=2, 
              cores=2,seed = 1115,
              save_model="fit_lm_b_t_stan.txt", refresh=0)
## Compiling the C++ model
## Start sampling
print(summary(fit.lm.b.t), digits=3)
##  Family: student 
##   Links: mu = identity; sigma = identity; nu = identity 
## Formula: height_z2 | weights(wts1) ~ chhealth1 + foodinsec1 + hhsize + hhses1 
##    Data: eclsk.sub (Number of observations: 10063) 
## Samples: 2 chains, each with iter = 5000; warmup = 1000; thin = 1;
##          total post-warmup samples = 8000
## 
## Population-Level Effects: 
##            Estimate Est.Error l-95% CI u-95% CI Eff.Sample  Rhat
## Intercept     0.255     0.037    0.181    0.328       8000 1.000
## chhealth1    -0.047     0.012   -0.071   -0.023       8000 1.000
## foodinsec1   -0.081     0.028   -0.137   -0.025       8000 1.000
## hhsize       -0.035     0.007   -0.048   -0.022       8000 1.000
## hhses1        0.032     0.012    0.008    0.057       8000 1.000
## 
## Family Specific Parameters: 
##       Estimate Est.Error l-95% CI u-95% CI Eff.Sample  Rhat
## sigma    0.945     0.007    0.931    0.959       8000 1.000
## nu      27.719     2.917   22.520   33.794       8000 1.000
## 
## Samples were drawn using sampling(NUTS). For each parameter, Eff.Sample 
## is a crude measure of effective sample size, and Rhat is the potential 
## scale reduction factor on split chains (at convergence, Rhat = 1).
plot(fit.lm.b.t, N=2)

Comparing Bayesian models

For comparing Bayesian models, we do not reply on traditional hypothesis testing procedure, such as the F test or likelihood ratio test. Typically some form of information criteria is used. Since the inventors of Stan also have put a lot of thought into these ideas, we use the “Watanabe-Akaike Information Criterion” or WAIC (also here ) to compare models. The Leave one out (LOO) method is also commonly used to compare the predictive capacity of models.

Gelman, Hwang and Vehtari (2014) describe the relative strengths and weaknesses of these approaches.

If you have nested models, it is appropriate to compare them using one of these approaches. WAIC is concerned with overall model fit, while LOO is more concerned with predictive capacity of a model.

WAIC

The WAIC is a recently developed alternative to the DIC that uses the log of the posterior predictive density, meaning that the log of the total density evaluated at the estimates of the parameters, and penalizes it using a penalty term, to represent the model complexity. As with AIC, smaller values of WAIC indicate better fitting models. According to Gelman and colleagues 2014 pg 174, the “WAIC has the desirable property of averaging over the posterior distribution rather than conditioning on a point estimate”

We can compare multiple models (fit to the same data) using the WAIC() function in brms:

WAIC1<-WAIC(fit.1)
WAIC2<-WAIC(fit.lm.b.t)
compare_ic(WAIC1, WAIC2, ic=c("waic"))
##                        WAIC     SE
## fit.1              32237.95 312.41
## fit.lm.b.t         32307.82 314.26
## fit.1 - fit.lm.b.t   -69.87   7.56

Which certainly looks like the Normal model is preferred here, because of the lower WAIC score. We can also evaluate the model fit to the sensitivity of the prior from the normal model:

WAIC(fit.1, fit.1b, compare = T)
##                    WAIC     SE
## fit.1          32237.95 312.41
## fit.1b         32237.63 312.37
## fit.1 - fit.1b     0.32   0.10

Which looks like the model fit is not affected by the choice of prior in this case. This is what we want to see!!

Leave one out cross-validation

Another technique for judging relative model fit is the predictive capacity of the model. The LOO method uses the posterior predictive distribution for the model. The posterior predictive distribution is the posterior distribution of new data, given the parameters we have obtained from our model. Typically, we first get samples of our parameters:

\(\theta_i \sim p(\theta|\text{data})\) gives us the posterior for each parameter, then we can plug these in and use the parameters to generate new samples: \(\text{data}_i' \sim p(\text{data}|\theta_i)\) where \(\text{data}_i'\) are predicted values from the model. NOTE this is also how Bayesian models can be used for forecasting and prediction in general.

By doing this a large number of times, you can generate the posterior predictive distribution.

LOO cross validation does this by generating the posterior predictive distribution for each point, without that point in the analysis. This quantity is then summed to generate a measure of overall model fit: \(\text{lppd}_{loo-cv} = \sum_i \text{log} \left (\frac{1}{S} \sum_s ^S p(y_i|\theta^{is}) \right)\)

Another NOTE THIS CAN TAKE A VERY LONG TIME!!! AND CAN TAKE LOTS OF MEMORY TOO!!!

loo(fit.1 , fit.lm.b.t)
## Warning: Found 1124 observations with a pareto_k > 0.7 in model 'fit.1'.
## With this many problematic observations, it may be more appropriate to use
## 'kfold' with argument 'K = 10' to perform 10-fold cross-validation rather
## than LOO.
## Warning: Found 1124 observations with a pareto_k > 0.7 in model
## 'fit.lm.b.t'. With this many problematic observations, it may be more
## appropriate to use 'kfold' with argument 'K = 10' to perform 10-fold cross-
## validation rather than LOO.
##                       LOOIC     SE
## fit.1              32237.96 312.41
## fit.lm.b.t         32307.83 314.26
## fit.1 - fit.lm.b.t   -69.87   7.56

Logistic Regression Example

Next, we consider a similar simple logistic regression model:

#here's the model fit via ML
eclsk.sub$badhealth2<-ifelse(eclsk.sub$chhealth2 >2, 1,0)
fit.glm<-glm(badhealth2~male+foodinsec1+hhsize, data=eclsk.sub, weights=wts1, family="binomial")
summary(fit.glm)$coef
##                Estimate Std. Error     z value      Pr(>|z|)
## (Intercept) -2.42260454 0.10225503 -23.6917895 4.381597e-124
## male         0.02051837 0.05633256   0.3642365  7.156814e-01
## foodinsec1   0.74032821 0.07178620  10.3129607  6.157462e-25
## hhsize       0.07831882 0.02001326   3.9133473  9.102548e-05
confint(fit.glm)
##                   2.5 %     97.5 %
## (Intercept) -2.62306447 -2.2221668
## male        -0.08985597  0.1310012
## foodinsec1   0.59845689  0.8799411
## hhsize       0.03882818  0.1172970

Bayesian Logistic Regression model

prior<-c(set_prior("normal(0,1)", class="b"))
#no residual sigma for the binomial
fit.glm.b<-brm(badhealth2|weights(wts1)~male+foodinsec1+hhsize, data=eclsk.sub, family=bernoulli(),
              prior = prior, 
              warmup=500, 
              iter=1000, chains=2, 
              cores=2,seed = 1115,
              save_model="fit_glm_b_stan.txt", refresh=0)
## Compiling the C++ model
## Start sampling
print(summary(fit.glm.b), digits=3)
##  Family: bernoulli 
##   Links: mu = logit 
## Formula: badhealth2 | weights(wts1) ~ male + foodinsec1 + hhsize 
##    Data: eclsk.sub (Number of observations: 10063) 
## Samples: 2 chains, each with iter = 1000; warmup = 500; thin = 1;
##          total post-warmup samples = 1000
## 
## Population-Level Effects: 
##            Estimate Est.Error l-95% CI u-95% CI Eff.Sample  Rhat
## Intercept    -2.423     0.099   -2.616   -2.236       1000 0.999
## male          0.021     0.057   -0.092    0.130       1000 1.000
## foodinsec1    0.736     0.071    0.602    0.874       1000 1.000
## hhsize        0.078     0.020    0.039    0.115       1000 0.999
## 
## Samples were drawn using sampling(NUTS). For each parameter, Eff.Sample 
## is a crude measure of effective sample size, and Rhat is the potential 
## scale reduction factor on split chains (at convergence, Rhat = 1).
plot(fit.glm.b, N=2)

References

Gelman, Andrew, Jessica Hwang, and Aki Vehtari. 2014. “Understanding Predictive Information Criteria for Bayesian Models.” Statistics and Computing 24(6):997-1016.