This example goes through the basics of using Stan by way of the brms library, for estimating simple linear and generalized linear models. You must install brms first, using install.packages("brms").
We will use the ECLS-K 2011 data for our example, with the height-for-age/sex z-score as a continuous outcome and short-for-age status (height-for-age z < -1) as a dichotomous outcome.
There is an overview of using brms for fitting various models in the package vignettes; see vignette('brms_overview'). The package rstanarm is very similar and better documented, with numerous tutorials on its website.
Both packages serve as front ends to the Stan library for MCMC. People new to Stan can often be put off by its syntax and model construction. Both packages let us use the R syntax we are accustomed to from functions like glm() and lmer() to fit models, and we can also inspect the Stan code generated for each model, so we can learn how Stan works under the hood.
library(car)
## Loading required package: carData
library(brms)
## Loading required package: Rcpp
## Loading required package: ggplot2
## Loading 'brms' package (version 2.4.0). Useful instructions
## can be found by typing help('brms'). A more detailed introduction
## to the package is available through vignette('brms_overview').
## Run theme_set(theme_default()) to use the default bayesplot theme.
library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
library(rstan)
## Loading required package: StanHeaders
## rstan (Version 2.17.3, GitRev: 2e1f913d3ca3)
## For execution on a local, multicore CPU with excess RAM we recommend calling
## options(mc.cores = parallel::detectCores()).
## To avoid recompilation of unchanged Stan programs, we recommend calling
## rstan_options(auto_write = TRUE)
library(haven)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
eclsk11<-read_sas("~/Google Drive/classes/dem7473/data/childk4p.sas7bdat")
names(eclsk11)<-tolower(names(eclsk11))
#get out only the variables I'm going to use for this example
myvars<-c("childid", "x_chsex_r", "x1locale", "x_raceth_r", "x2povty", "x12par1ed_i","p1curmar", "x1htotal",
"x1mscalk4", "x2mscalk4", "x3mscalk4", "x4mscalk4","x5mscalk4","x6mscalk4","x7mscalk4","x8mscalk4",
"p1hscale", "p2hscale", "p4hscale","p6hscale","p7hscale","p8hscale",
"x12sesl","x4sesl_i", "p2parct1", "p2parct2", "s1_id", "p2safepl","x2krceth","p1o2near", "x_distpov" ,
"w8cf8p_80","w8cf8p_8psu","w8cf8p_8str","x2fsstat2","x4fsstat2", "x1height","x2height","x3height","x4height","x5height","x6height","x7height","x8height",
"x1kage_r","x2kage_r","x3age","x4age", "x5age", "x6age", "x7age", "x8age", "w1c0","w1c0psu","w1c0str", "w1p0", "w4c4p_20", "w4cf4p_2str", "w4cf4p_2psu")
#subset the data
eclsk.sub<-eclsk11[,myvars]
rm(eclsk11); gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2125400 113.6 4344315 232.1 4344315 232.1
## Vcells 4667714 35.7 683027849 5211.1 783895299 5980.7
#First we recode some Child characteristics
#Child's sex: recode as male =1
eclsk.sub$male<-Recode(eclsk.sub$x_chsex_r, recodes="1=1; 2=0; -9=NA")
#Recode race with white, non Hispanic as reference using dummy vars
eclsk.sub$hisp<-Recode(eclsk.sub$x_raceth_r, recodes="3:4=1;-9=NA; else=0")
eclsk.sub$black<-Recode(eclsk.sub$x_raceth_r, recodes="2=1;-9=NA; else=0")
eclsk.sub$asian<-Recode(eclsk.sub$x_raceth_r, recodes="5=1;-9=NA; else=0")
eclsk.sub$nahn<-Recode(eclsk.sub$x_raceth_r, recodes="6:7=1;-9=NA; else=0")
eclsk.sub$other<-Recode(eclsk.sub$x_raceth_r, recodes="8=1;-9=NA; else=0")
#Longitudinal variables
#recode our outcomes, the first is the child's math standardized test score in Kindergarten
eclsk.sub$math1<-ifelse(eclsk.sub$x1mscalk4<0, NA, eclsk.sub$x1mscalk4)
eclsk.sub$math2<-ifelse(eclsk.sub$x2mscalk4<0, NA, eclsk.sub$x2mscalk4)
eclsk.sub$math4<-ifelse(eclsk.sub$x4mscalk4<0, NA, eclsk.sub$x4mscalk4)
eclsk.sub$math6<-ifelse(eclsk.sub$x6mscalk4<0, NA, eclsk.sub$x6mscalk4)
eclsk.sub$math7<-ifelse(eclsk.sub$x7mscalk4<0, NA, eclsk.sub$x7mscalk4)
eclsk.sub$math8<-ifelse(eclsk.sub$x8mscalk4<0, NA, eclsk.sub$x8mscalk4)
#Second outcome is child's height for age, continuous outcome
eclsk.sub$height1<-ifelse(eclsk.sub$x1height<=-7, NA, eclsk.sub$x1height)
eclsk.sub$height2<-ifelse(eclsk.sub$x2height<=-7, NA, eclsk.sub$x2height)
eclsk.sub$height4<-ifelse(eclsk.sub$x4height<=-7, NA, eclsk.sub$x4height)
eclsk.sub$height6<-ifelse(eclsk.sub$x6height<=-7, NA, eclsk.sub$x6height)
eclsk.sub$height7<-ifelse(eclsk.sub$x7height<=-7, NA, eclsk.sub$x7height)
eclsk.sub$height8<-ifelse(eclsk.sub$x8height<=-7, NA, eclsk.sub$x8height)
#Age at each wave
eclsk.sub$age_yrs1<-ifelse(eclsk.sub$x1kage_r<0, NA, eclsk.sub$x1kage_r/12)
eclsk.sub$age_yrs2<-ifelse(eclsk.sub$x2kage_r<0, NA, eclsk.sub$x2kage_r/12)
#eclsk.sub$age_yrs3<-ifelse(eclsk.sub$x3age<0, NA, eclsk.sub$x3age/12)
eclsk.sub$age_yrs4<-ifelse(eclsk.sub$x4age<0, NA, eclsk.sub$x4age/12)
eclsk.sub$age_yrs6<-ifelse(eclsk.sub$x6age<0, NA, eclsk.sub$x6age/12)
eclsk.sub$age_yrs7<-ifelse(eclsk.sub$x7age<0, NA, eclsk.sub$x7age/12)
eclsk.sub$age_yrs8<-ifelse(eclsk.sub$x8age<0, NA, eclsk.sub$x8age/12)
eclsk.sub<- eclsk.sub[is.na(eclsk.sub$age_yrs1)==F, ]
#Height for age z score standardized by sex and age
eclsk.sub$height_z1<-ave(eclsk.sub$height1, as.factor(paste(round(eclsk.sub$age_yrs1, 1.5), eclsk.sub$male)), FUN=scale)
eclsk.sub$height_z2<-ave(eclsk.sub$height2, as.factor(paste(round(eclsk.sub$age_yrs2, 1.5), eclsk.sub$male)), FUN=scale)
#eclsk.sub$height_z3<-ave(eclsk.sub$height3, as.factor(paste(round(eclsk.sub$age_yrs3, 1.5), eclsk.sub$male)), FUN=scale)
eclsk.sub$height_z4<-ave(eclsk.sub$height4, as.factor(paste(round(eclsk.sub$age_yrs4, 1.5), eclsk.sub$male)), FUN=scale)
eclsk.sub$height_z6<-ave(eclsk.sub$height6, as.factor(paste(round(eclsk.sub$age_yrs6, 1.5), eclsk.sub$male)), FUN=scale)
eclsk.sub$height_z7<-ave(eclsk.sub$height7, as.factor(paste(round(eclsk.sub$age_yrs7, 1.5), eclsk.sub$male)), FUN=scale)
eclsk.sub$height_z8<-ave(eclsk.sub$height8, as.factor(paste(round(eclsk.sub$age_yrs8, 1.5), eclsk.sub$male)), FUN=scale)
#Child health assessment (Excellent to Poor), ordinal outcome
eclsk.sub$chhealth1<-ifelse(eclsk.sub$p1hscale<0, NA, eclsk.sub$p1hscale)
eclsk.sub$chhealth2<-ifelse(eclsk.sub$p2hscale<0, NA, eclsk.sub$p2hscale)
eclsk.sub$chhealth4<-ifelse(eclsk.sub$p4hscale<0, NA, eclsk.sub$p4hscale)
eclsk.sub$chhealth6<-ifelse(eclsk.sub$p6hscale<0, NA, eclsk.sub$p6hscale)
eclsk.sub$chhealth7<-ifelse(eclsk.sub$p7hscale<0, NA, eclsk.sub$p7hscale)
eclsk.sub$chhealth8<-ifelse(eclsk.sub$p8hscale<0, NA, eclsk.sub$p8hscale)
#SES
eclsk.sub$hhses1<-ifelse(eclsk.sub$x12sesl==-9, NA, scale(eclsk.sub$x12sesl))
eclsk.sub$hhses4<-ifelse(eclsk.sub$x4sesl_i==-9, NA, scale(eclsk.sub$x4sesl_i))
#Household food insecurity, dichotomous outcome
#This outcome is only present at two waves
eclsk.sub$foodinsec1<-Recode(eclsk.sub$x2fsstat2, recodes="2:3=1; 1=0; else=NA")
eclsk.sub$foodinsec2<-Recode(eclsk.sub$x4fsstat2, recodes="2:3=1; 1=0; else=NA")
#Then we recode some parent/mother characteristics
#Mother's education, recode as 2 dummies with HS = reference
eclsk.sub$lths<-Recode(eclsk.sub$x12par1ed_i, recodes = "0:2=1; 3:8=0; else = NA")
eclsk.sub$gths<-Recode(eclsk.sub$x12par1ed_i, recodes = "1:3=0; 4:8=1; else =NA")
#marital status, recode as 2 dummies, ref = married
eclsk.sub$single<-Recode(eclsk.sub$p1curmar, recodes="4=1; -7:-9=NA; else=0")
eclsk.sub$notmar<-Recode(eclsk.sub$p1curmar, recodes="2:3=1; -7:-9=NA; else=0")
#Then we do some household level variables
#Urban school location = 1
eclsk.sub$urban<-Recode(eclsk.sub$x1locale, recodes = "1:3=1; 4=0; -1:-9=NA")
#poverty level in poverty = 1
eclsk.sub$pov<-Recode(eclsk.sub$x2povty , recodes ="1:2=1; 3=0; -9=NA")
#Household size
eclsk.sub$hhsize<-eclsk.sub$x1htotal
#school % minority student body
eclsk.sub$minorsch<-ifelse(eclsk.sub$x2krceth <0, NA, eclsk.sub$x2krceth/10)
#Unsafe neighborhood
eclsk.sub$unsafe<-Recode(eclsk.sub$p2safepl , recodes = "1:2='unsafe'; 3='safe'; else=NA", as.factor = T)
#school district poverty
eclsk.sub$dist_pov<-ifelse(eclsk.sub$x_distpov==-9, NA, scale(eclsk.sub$x_distpov))
#weights scaled to have a mean of 1
eclsk.sub$wts1<-eclsk.sub$w4c4p_20/mean(eclsk.sub$w4c4p_20, na.rm=T)
#eclsk.sub$wtsp1<-eclsk.sub$w1p0/mean(eclsk.sub$w1p0, na.rm=T)
Here is a simple linear regression model for the height-for-age z-score. If we were to write the full model specification, it would be:
\(y \sim N(X' \beta, \sigma^2 )\)
In terms of priors, we need one for each \(\beta\) and one for \(\sigma^2\). It is common to use a Normal prior for the \(\beta\) parameters, since they can be either positive or negative, and the null hypothesis is that they are 0. For example, a high-variance prior can be contrasted with a low-variance prior. Here’s a plot of this:
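A minimal sketch of such a comparison (an assumption about the intended figure: overlaying Normal densities with increasing scale):
#compare Normal priors with increasing scale: vaguer as sigma grows
x <- seq(-50, 50, length=500)
plot(x, dnorm(x, 0, 1), type="l", lwd=2, xlab="x value",
ylab="Density", main="Comparison of Normal Prior Distributions")
lines(x, dnorm(x, 0, 10), lwd=2, col="red")
lines(x, dnorm(x, 0, 100), lwd=2, col="blue")
legend("topright", c("sd=1", "sd=10", "sd=100"), lwd=2,
col=c("black", "red", "blue"))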
So a Normal prior with a \(\sigma\) of 10 is pretty vague, and a \(\sigma\) of 100 is really vague.
Often the Student-t distribution, or its truncated version, the half Student-t distribution, will be used to model regression coefficients or variances:
#Student-t
x <- seq(-9, 9, length=100)
hx <- dstudent_t(x, df=4, sigma=1)
degf <- c(2, 2, 5, 5)
sigs <- c(1, 5, 1, 5)
colors <- c("black", "red", "red", "blue", "blue")
ltys <- c(1, 1, 2, 1, 2)
labels <- c("df=4, sd=1", "df=2, sd=1", "df=2, sd=5", "df=5, sd=1", "df=5, sd=5")
plot(x, hx, type="l", lty=1, xlab="x value",
ylab="Density", main="Comparison of Student-t Distributions")
for (i in 1:4){
lines(x, dstudent_t(x, df=degf[i], sigma=sigs[i]), lwd=2, col=colors[i+1], lty=ltys[i+1])
}
legend("topright", inset=.05, title="Distributions",
labels, lwd=2, col=colors, lty=ltys)
plot(x, hx, type="l", lty=1, xlab="x value", xlim=c(0, 9),
ylab="Density", main="Comparison of half Student-t Distributions")
for (i in 1:4){
lines(x, dstudent_t(x, df=degf[i], sigma=sigs[i]), lwd=2, col=colors[i+1], lty=ltys[i+1])
}
legend("topright", inset=.05, title="Distributions",
labels, lwd=2, col=colors, lty=ltys)
Likewise, for a variance parameter, we have a few options. In the past, inverse Gamma priors were used for variances, but Gelman (2006) suggests using a half-Cauchy prior. Here are some comparisons:
#Half-Cauchy
x <- seq(.0000001, 10, length=100)
scales <- c(.1, .5, 1, 10)
colors <- c("red", "blue", "darkgreen", "gold")
labels <- c("scale=.1", "scale=.5", "scale=1", "scale=10")
#plotting the positive half of the Cauchy; the half-Cauchy density is 2x this, same shape
plot(x, dcauchy(x, 0, scales[1]), type="n", xlab="x value", ylim=c(0, .5),
ylab="Density", main="Comparison of Half-Cauchy Distributions")
for (i in 1:4){
lines(x, dcauchy(x, 0, scales[i]), lwd=2, col=colors[i])
}
legend("topright", inset=.05, title="Distributions",
labels, lwd=2, col=colors)
#Inverse Gamma
x <- seq(0.0000001, 10, length=100)
betas <- c(.5, .1, .01, .001)
colors <- c("red", "blue", "darkgreen", "gold")
labels <- c("b=.5", "b=.1", "b=.01", "b=.001")
plot(x, densigamma(x, .5, betas[1]), type="n", xlab="x value", ylim=c(0, 1),
ylab="Density", main=c("Comparison of Inverse Gamma","Distributions with a = .5"))
for (i in 1:4){
lines(x, densigamma(x, .5, betas[i]), lwd=2, col=colors[i])
}
legend("topright", inset=.05, title="Distributions",
labels, lwd=2, col=colors)
So we see there is quite a variety of prior densities to choose from. The big question is: “Does it matter much?” Below I specify a simple multiple regression model, again with a Normal likelihood, under various prior definitions.
\(y \sim N(X' \beta, \sigma^2 )\)
There are loads of ways to do this, but I like doing it this way.
#Clean the data
eclsk.sub<-eclsk.sub%>%
select(height_z2,chhealth1,foodinsec1,hhsize,hhses1,wts1, male, chhealth2)%>%
filter(complete.cases(.))
fit.lm<- glm(height_z2~chhealth1+foodinsec1+hhsize+hhses1, data=eclsk.sub, weights=wts1)
summary(fit.lm)
## Warning in summary.glm(fit.lm): observations with zero weight not used for
## calculating dispersion
##
## Call:
## glm(formula = height_z2 ~ chhealth1 + foodinsec1 + hhsize + hhses1,
## data = eclsk.sub, weights = wts1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6.1677 -0.6031 0.0000 0.5801 5.6269
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.253385 0.042525 5.958 2.64e-09 ***
## chhealth1 -0.046592 0.013388 -3.480 0.000504 ***
## foodinsec1 -0.079490 0.032565 -2.441 0.014667 *
## hhsize -0.034126 0.007784 -4.384 1.18e-05 ***
## hhses1 0.031048 0.013785 2.252 0.024322 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 1.228013)
##
## Null deviance: 11042 on 8938 degrees of freedom
## Residual deviance: 10971 on 8934 degrees of freedom
## AIC: Inf
##
## Number of Fisher Scoring iterations: 2
confint(fit.lm)
## Waiting for profiling to be done...
## Warning in summary.glm(fitted): observations with zero weight not used for
## calculating dispersion
## 2.5 % 97.5 %
## (Intercept) 0.170037424 0.33673245
## chhealth1 -0.072832416 -0.02035119
## foodinsec1 -0.143316104 -0.01566397
## hhsize -0.049382809 -0.01886873
## hhses1 0.004030881 0.05806565
First we use the default priors. brms uses improper flat priors for the regression effects, a Student-t prior for the intercept, and a half Student-t prior for the residual standard deviation.
This specifies the model as:
\[y_i = \beta_0 + \sum_k \beta_k x_{ik} + e_i, \qquad e_i \sim N(0, \sigma^2)\] The intercept has prior:
\[\beta_0 \sim \text{Student-t}(\text{df}=3, \mu=0, \sigma=1)\] The \(\beta\)’s have priors:
\[\beta_k \sim \text{Unif} (- \infty , \infty)\]
And the residual variance has prior:
\[\sigma_e \sim \text{half Student-t}(\text{df}=3, \mu=0, \sigma=1)\]
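Before fitting, we can ask brms what the default priors for a given model would be, using get_prior():
#inspect the default priors brms would use for this model
get_prior(height_z2|weights(wts1)~chhealth1+foodinsec1+hhsize+hhses1,
          data=eclsk.sub, family=gaussian())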
options(mc.cores = parallel::detectCores())
fit.1<-brm(height_z2|weights(wts1)~chhealth1+foodinsec1+hhsize+hhses1, data=eclsk.sub, family=gaussian(),
           warmup=1000, #warmup (burn-in) of 1000 iterations per chain
           iter=5000, chains=2, #2 chains * (5000 - 1000) = 8000 post-warmup samples
cores=2,seed = 1115, #number of computer cores, 1 per chain is good.
refresh = 0
)
## Compiling the C++ model
## Start sampling
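As mentioned above, we can inspect the Stan program that brms generated for this model:
#print the underlying Stan code for the fitted model
stancode(fit.1)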
Now I print the summaries of the model fit by maximum likelihood and the model fit by Bayesian methods:
summary(fit.lm)
## Warning in summary.glm(fit.lm): observations with zero weight not used for
## calculating dispersion
##
## Call:
## glm(formula = height_z2 ~ chhealth1 + foodinsec1 + hhsize + hhses1,
## data = eclsk.sub, weights = wts1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6.1677 -0.6031 0.0000 0.5801 5.6269
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.253385 0.042525 5.958 2.64e-09 ***
## chhealth1 -0.046592 0.013388 -3.480 0.000504 ***
## foodinsec1 -0.079490 0.032565 -2.441 0.014667 *
## hhsize -0.034126 0.007784 -4.384 1.18e-05 ***
## hhses1 0.031048 0.013785 2.252 0.024322 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 1.228013)
##
## Null deviance: 11042 on 8938 degrees of freedom
## Residual deviance: 10971 on 8934 degrees of freedom
## AIC: Inf
##
## Number of Fisher Scoring iterations: 2
print(summary(fit.1), digits=3)
## Family: gaussian
## Links: mu = identity; sigma = identity
## Formula: height_z2 | weights(wts1) ~ chhealth1 + foodinsec1 + hhsize + hhses1
## Data: eclsk.sub (Number of observations: 10063)
## Samples: 2 chains, each with iter = 5000; warmup = 1000; thin = 1;
## total post-warmup samples = 8000
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## Intercept 0.254 0.038 0.180 0.329 8000 1.000
## chhealth1 -0.047 0.012 -0.070 -0.024 8000 1.000
## foodinsec1 -0.079 0.029 -0.138 -0.021 8000 1.000
## hhsize -0.034 0.007 -0.048 -0.021 8000 1.000
## hhses1 0.031 0.012 0.007 0.055 8000 1.000
##
## Family Specific Parameters:
## Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## sigma 0.974 0.006 0.962 0.987 8000 1.000
##
## Samples were drawn using sampling(NUTS). For each parameter, Eff.Sample
## is a crude measure of effective sample size, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
We see great similarity between the two models, although the Bayesian credible intervals are slightly narrower than the corresponding confidence intervals.
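For a quick side-by-side look at the point estimates, one option (a sketch; the first column of fixef() holds the posterior means) is:
#compare the ML coefficients to the Bayesian posterior means
round(cbind(ML=coef(fit.lm), Bayes=fixef(fit.1)[,1]), 3)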
plot(fit.1, N=2)
Here, we change the default priors for the model parameters, using Gaussian priors for the \(\beta\)’s and an inverse Gamma prior for the residual standard deviation.
prior1<-c(set_prior("normal(0,1)", class="b"),#prior for the beta's
set_prior("normal(0,1)", class="Intercept"), #prior for the intercept
set_prior("inv_gamma(.5,.5)", class="sigma"))#prior for the residual std. deviation
fit.1b<-brm(height_z2|weights(wts1)~chhealth1+foodinsec1+hhsize+hhses1, data=eclsk.sub, family=gaussian(),
prior = prior1,
            warmup=1000, #warmup (burn-in) of 1000 iterations per chain
            iter=5000, chains=2, #2 chains * (5000 - 1000) = 8000 post-warmup samples
cores=2,seed = 1115, #number of computer cores, 1 per chain is good.
save_model="fit_lm_gamma_stan.txt", refresh = 0
)
## Compiling the C++ model
## Start sampling
Here we check for sensitivity of the model estimates to the different priors. First I summarize the two analyses using the posterior mean, sd’s and credible intervals, then plot the densities of two of the model parameters.
In this analysis, the results are very robust to alternative prior choices.
Here I write a custom summary function that takes the sampled output from the models and computes the posterior mean, the posterior standard deviation (analogous to the s.e.(\(\beta\))), and the Bayesian credible interval for each parameter.
The Bayesian credible interval is what people think the confidence interval is, but they’re wrong. People believe that a frequentist confidence interval says something about the uncertainty in the parameter, but it really speaks to the repeatability of the procedure over repeated samples. The Bayesian credible interval concerns the uncertainty in the parameter itself, which is what we want.
The BCI is found by taking the \(\alpha/2\) and \(1-\alpha/2\) quantiles of the posterior density, where \(1-\alpha\) is the level of credibility you desire. So for a 95% credible interval, we would find the 2.5% and 97.5% quantiles of the posterior.
out1<-as.data.frame(fit.1)
out2<-as.data.frame(fit.1b)
mysum<-function(x){
mu=mean(x,na.rm=T)
sdev=sd(x,na.rm=T)
lci=quantile(x, p=.025,na.rm=T)
uci=quantile(x, p=.975,na.rm=T)
list(mu=mu, sd=sdev, lci=lci, uci=uci)
}
sapply(out1, mysum )
## b_Intercept b_chhealth1 b_foodinsec1 b_hhsize b_hhses1
## mu 0.2538347 -0.04670047 -0.07936257 -0.03418134 0.03099305
## sd 0.03781811 0.01172413 0.029399 0.00693087 0.0122303
## lci 0.1797688 -0.06995061 -0.1384198 -0.04779784 0.007454393
## uci 0.3289076 -0.02400458 -0.02123371 -0.02058464 0.05514832
## sigma lp__
## mu 0.9740066 -16119.38
## sd 0.006386527 1.762603
## lci 0.9615856 -16123.7
## uci 0.9865148 -16116.94
sapply(out2, mysum)
## b_Intercept b_chhealth1 b_foodinsec1 b_hhsize b_hhses1
## mu 0.2530128 -0.04656529 -0.07944534 -0.03404831 0.03092866
## sd 0.03721325 0.01162452 0.02867235 0.006824437 0.01210782
## lci 0.1802561 -0.06982933 -0.1358529 -0.04761781 0.006724002
## uci 0.3251884 -0.02334205 -0.02357385 -0.02059293 0.05449759
## sigma lp__
## mu 0.9741448 -16119.4
## sd 0.006564318 1.75416
## lci 0.9613761 -16123.7
## uci 0.9872675 -16116.99
If the models are invariant to the prior, the two lines should be very close to each other.
ggplot()+geom_density(data=out1, aes(b_chhealth1), col="red", alpha=.5, lwd=1.5)+geom_density(data=out2, aes(b_chhealth1), col="green", alpha=.5, lwd=1.5)
ggplot()+geom_density(data=out1, aes(sigma), col="red", alpha=.5, lwd=1.5)+geom_density(data=out2, aes(sigma), col="green", alpha=.5, lwd=1.5)
In Bayesian models we don’t get a p-value; we get the coverage interval, or Bayesian credible interval, which gives the values of the posterior density at the chosen lower and upper percentiles.
We can also calculate Bayesian p-values, which give the probability that a parameter is greater or less than a specified value. So if we observe a negative \(\beta\) parameter, but we want to know the probability that the \(\beta\) is really less than 0, we can calculate \(Pr(\beta < 0)\). This equates to finding the area under the posterior density curve below 0. Since we have lots of samples from the posterior of the parameter, we can find this via Monte Carlo integration, taking the mean of an indicator function over the \(n_{sims}\) posterior draws \(\theta^{(i)}\):
\[Pr(\theta < 0) = \frac{1}{n_{sims}} \sum_{i=1}^{n_{sims}} I(\theta^{(i)} < 0)\]
Here I write a function to calculate Bayesian p-values, based on whether the posterior mean of the parameter is positive or negative.
bayespval<-function(x){
#probability mass on the same side of 0 as the posterior mean;
#values near 1 indicate a consistently signed effect
if(mean(x)<0) {
pval=mean(x<0)
}
else{
pval=mean(x>0)
}
list(pval=pval)
}
sapply(out1, bayespval)
## $b_Intercept.pval
## [1] 1
##
## $b_chhealth1.pval
## [1] 1
##
## $b_foodinsec1.pval
## [1] 0.996625
##
## $b_hhsize.pval
## [1] 1
##
## $b_hhses1.pval
## [1] 0.994625
##
## $sigma.pval
## [1] 1
##
## $lp__.pval
## [1] 1
Now I try to break the model by putting very strong priors on the regression effects.
#bayesian model half cauchy prior on residual variance
prior_weird<-c(set_prior("normal(10,.001)", class="b"),#prior for the beta's
set_prior("normal(0,1)", class="Intercept"), #prior for the intercept
set_prior("cauchy(0,5)", class="sigma"))#prior for the residual std. deviation
fit.2<-brm(height_z2|weights(wts1)~chhealth1+foodinsec1+hhsize+hhses1, data=eclsk.sub, family=gaussian(),
prior = prior_weird,
           warmup=1000, #warmup (burn-in) of 1000 iterations per chain
           iter=5000, chains=2, #2 chains * (5000 - 1000) = 8000 post-warmup samples
cores=2,seed = 1115, #number of computer cores, 1 per chain is good.
save_model="fit_lm_weird_stan.txt", #so we can see the stan code
refresh = 0)
## Compiling the C++ model
## Start sampling
print(summary(fit.2), digits=3)
## Family: gaussian
## Links: mu = identity; sigma = identity
## Formula: height_z2 | weights(wts1) ~ chhealth1 + foodinsec1 + hhsize + hhses1
## Data: eclsk.sub (Number of observations: 10063)
## Samples: 2 chains, each with iter = 5000; warmup = 1000; thin = 1;
## total post-warmup samples = 8000
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## Intercept -63.548 0.155 -63.857 -63.242 3544 1.001
## chhealth1 10.000 0.001 9.998 10.002 8000 1.000
## foodinsec1 10.000 0.001 9.998 10.002 8000 1.000
## hhsize 9.999 0.001 9.997 10.001 8000 1.000
## hhses1 10.000 0.001 9.998 10.002 8000 1.000
##
## Family Specific Parameters:
## Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## sigma 16.613 0.108 16.399 16.826 4381 1.000
##
## Samples were drawn using sampling(NUTS). For each parameter, Eff.Sample
## is a crude measure of effective sample size, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
plot(fit.2,N=2)
out1<-as.data.frame(fit.1)
out2<-as.data.frame(fit.2)
sapply(out1, mysum )
## b_Intercept b_chhealth1 b_foodinsec1 b_hhsize b_hhses1
## mu 0.2538347 -0.04670047 -0.07936257 -0.03418134 0.03099305
## sd 0.03781811 0.01172413 0.029399 0.00693087 0.0122303
## lci 0.1797688 -0.06995061 -0.1384198 -0.04779784 0.007454393
## uci 0.3289076 -0.02400458 -0.02123371 -0.02058464 0.05514832
## sigma lp__
## mu 0.9740066 -16119.38
## sd 0.006386527 1.762603
## lci 0.9615856 -16123.7
## uci 0.9865148 -16116.94
sapply(out2, mysum)
## b_Intercept b_chhealth1 b_foodinsec1 b_hhsize b_hhses1 sigma
## mu -63.54847 9.999758 9.999952 9.999267 9.999857 16.61255
## sd 0.1553188 0.001003751 0.0009984208 0.001013839 0.001015203 0.1080021
## lci -63.85675 9.997781 9.997977 9.99728 9.997822 16.3987
## uci -63.24245 10.00175 10.00193 10.00123 10.00181 16.82598
## lp__
## mu -48913.9
## sd 1.747187
## lci -48918.17
## uci -48911.48
ggplot()+geom_density(data=out1, aes(b_chhealth1), col="red", alpha=.5, lwd=1.5)+geom_density(data=out2, aes(b_chhealth1), col="green", alpha=.5, lwd=1.5)
ggplot()+geom_density(data=out1, aes(sigma), col="red", alpha=.5, lwd=1.5)+geom_density(data=out2, aes(sigma), col="green", alpha=.5, lwd=1.5)
That certainly worked to break the model results!
In the Bayesian context, it’s easier to fit models that assume other types of distributions for our outcome. For example, we can use a Student-t distribution with \(\nu\) degrees of freedom for a continuous outcome, instead of a normal distribution. This is often called a type of “robust regression” because the tails of the t distribution are fatter than those of a normal, so the variance in the residuals can be greater. In the case below, our model would be specified as:
\[y \sim t(X' \beta, \sigma^2 , \nu)\]
and priors:
\(\beta \sim N(0, 1)\)
\(\sigma \sim \text{half-Cauchy}(0, 5)\)
\(\nu \sim \text{Gamma}(1, 1.5)\)
#Student's T regression model
prior<-c(set_prior("normal(0,1)", class="b"),#prior for the beta's
set_prior("cauchy(0,5)", class="sigma"),#prior for the residual std. deviation
set_prior("gamma(1,1.5)", class="nu")) #prior for the t distribution d.f.
fit.lm.b.t<-brm(height_z2|weights(wts1)~chhealth1+foodinsec1+hhsize+hhses1, data=eclsk.sub, family=student(),
prior = prior,
warmup=1000,
iter=5000, chains=2,
cores=2,seed = 1115,
save_model="fit_lm_b_t_stan.txt", refresh=0)
## Compiling the C++ model
## Start sampling
print(summary(fit.lm.b.t), digits=3)
## Family: student
## Links: mu = identity; sigma = identity; nu = identity
## Formula: height_z2 | weights(wts1) ~ chhealth1 + foodinsec1 + hhsize + hhses1
## Data: eclsk.sub (Number of observations: 10063)
## Samples: 2 chains, each with iter = 5000; warmup = 1000; thin = 1;
## total post-warmup samples = 8000
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## Intercept 0.255 0.037 0.181 0.328 8000 1.000
## chhealth1 -0.047 0.012 -0.071 -0.023 8000 1.000
## foodinsec1 -0.081 0.028 -0.137 -0.025 8000 1.000
## hhsize -0.035 0.007 -0.048 -0.022 8000 1.000
## hhses1 0.032 0.012 0.008 0.057 8000 1.000
##
## Family Specific Parameters:
## Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## sigma 0.945 0.007 0.931 0.959 8000 1.000
## nu 27.719 2.917 22.520 33.794 8000 1.000
##
## Samples were drawn using sampling(NUTS). For each parameter, Eff.Sample
## is a crude measure of effective sample size, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
plot(fit.lm.b.t, N=2)
For comparing Bayesian models, we do not rely on traditional hypothesis testing procedures, such as the F test or likelihood ratio test. Typically some form of information criterion is used. Since the developers of Stan have put a lot of thought into these ideas, we use the “Watanabe-Akaike Information Criterion”, or WAIC, to compare models.
The WAIC is a more recently developed alternative to the DIC that uses the log pointwise posterior predictive density, averaged over the posterior draws rather than evaluated at a point estimate of the parameters, and penalizes it with a term representing model complexity. As with the AIC, smaller values of WAIC indicate better-fitting models. According to Gelman and colleagues (2014, p. 174), the “WAIC has the desirable property of averaging over the posterior distribution rather than conditioning on a point estimate”.
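Concretely, letting \(\theta^{(s)}\), \(s = 1, \dots, S\), denote the posterior draws, the WAIC is computed as:
\[\text{lppd} = \sum_{i=1}^n \log \left( \frac{1}{S} \sum_{s=1}^S p(y_i \mid \theta^{(s)}) \right), \qquad p_{\text{WAIC}} = \sum_{i=1}^n \text{Var}_s \left[ \log p(y_i \mid \theta^{(s)}) \right]\]
\[\text{WAIC} = -2 \left( \text{lppd} - p_{\text{WAIC}} \right)\]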
We can compare multiple models (fit to the same data) using the WAIC() function in brms:
WAIC1<-WAIC(fit.1)
WAIC2<-WAIC(fit.lm.b.t)
compare_ic(WAIC1, WAIC2, ic=c("waic"))
## WAIC SE
## fit.1 32237.95 312.41
## fit.lm.b.t 32307.82 314.26
## fit.1 - fit.lm.b.t -69.87 7.56
It certainly looks like the Normal model is preferred here, because of its lower WAIC score. We can also evaluate the sensitivity of the model fit to the choice of prior in the normal model:
WAIC(fit.1, fit.1b, compare = T)
## WAIC SE
## fit.1 32237.95 312.41
## fit.1b 32237.63 312.37
## fit.1 - fit.1b 0.32 0.10
It looks like the model fit is not affected by the choice of prior in this case. This is what we want to see!
Next, we consider a similar simple logistic regression model:
#here's the model fit via ML
eclsk.sub$badhealth2<-ifelse(eclsk.sub$chhealth2 >2, 1,0)
fit.glm<-glm(badhealth2~male+foodinsec1+hhsize, data=eclsk.sub, weights=wts1, family="binomial")
summary(fit.glm)$coef
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.42260454 0.10225503 -23.6917895 4.381597e-124
## male 0.02051837 0.05633256 0.3642365 7.156814e-01
## foodinsec1 0.74032821 0.07178620 10.3129607 6.157462e-25
## hhsize 0.07831882 0.02001326 3.9133473 9.102548e-05
confint(fit.glm)
## 2.5 % 97.5 %
## (Intercept) -2.62306447 -2.2221668
## male -0.08985597 0.1310012
## foodinsec1 0.59845689 0.8799411
## hhsize 0.03882818 0.1172970
prior<-c(set_prior("normal(0,1)", class="b"))
#no residual sigma for the binomial
fit.glm.b<-brm(badhealth2|weights(wts1)~male+foodinsec1+hhsize, data=eclsk.sub, family=bernoulli(),
prior = prior,
warmup=500,
iter=1000, chains=2,
cores=2,seed = 1115,
save_model="fit_glm_b_stan.txt", refresh=0)
## Compiling the C++ model
## Start sampling
print(summary(fit.glm.b), digits=3)
## Family: bernoulli
## Links: mu = logit
## Formula: badhealth2 | weights(wts1) ~ male + foodinsec1 + hhsize
## Data: eclsk.sub (Number of observations: 10063)
## Samples: 2 chains, each with iter = 1000; warmup = 500; thin = 1;
## total post-warmup samples = 1000
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## Intercept -2.423 0.099 -2.616 -2.236 1000 0.999
## male 0.021 0.057 -0.092 0.130 1000 1.000
## foodinsec1 0.736 0.071 0.602 0.874 1000 1.000
## hhsize 0.078 0.020 0.039 0.115 1000 0.999
##
## Samples were drawn using sampling(NUTS). For each parameter, Eff.Sample
## is a crude measure of effective sample size, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
plot(fit.glm.b, N=2)
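Since the logistic model is on the log-odds scale, we can also summarize odds ratios directly from the posterior draws, reusing the mysum() function from above (a sketch; brms names the draws of the regression effects with a b_ prefix):
#posterior summaries of the odds ratios: exponentiate each draw, then summarize
out.glm<-as.data.frame(fit.glm.b)
sapply(exp(out.glm[, c("b_male", "b_foodinsec1", "b_hhsize")]), mysum)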