Example of using survey design weights Bayesian regression models for survey data

This is an example of trying to use survey design elements (weights and stratum information) in a bayesian model using brm() from the brms package, which calls Stan.

library(lme4)

## Loading required package: Matrix

library(car)

#load brfss from github
library(RCurl)

## Loading required package: bitops

url<-"https://raw.githubusercontent.com/coreysparks/data/master/brfss_11.csv"
brfss_11<-getURL(url)
brfss_11<-read.csv(textConnection(brfss_11), header=T, dec=",")
nams<-names(brfss_11)
newnames<-gsub("_", "", nams)
names(brfss_11)<-tolower(newnames)

brfss_11$statefip<-sprintf("%02d", brfss_11$state )
brfss_11$cofip<-sprintf("%03d", brfss_11$cnty )
brfss_11$cofips<-paste(brfss_11$statefip, brfss_11$cofip, sep="")


brfss_11$weight<- as.numeric(as.character(brfss_11$cntywt))
#just for brevity, I just select TX respondents with non missing weights
brfss_11<-brfss_11[is.na(brfss_11$cntywt)==F,]
#insurance
brfss_11$ins<-ifelse(brfss_11$hlthpln1==1,1,0)
#smoking currently
brfss_11$smoke<-recode(brfss_11$smoker3, recodes="1:2=1; 3:4=0; else=NA")
#low physical activity
brfss_11$lowact<-recode(brfss_11$pacat, recodes="1:2=0; 3:4=1; else=NA")
#high blood pressure
brfss_11$hbp<-recode(brfss_11$bphigh4, recodes="1=1; 2:4=0; else=NA")
#high cholesterol
brfss_11$hc<-recode(brfss_11$toldhi2, recodes="1=1; 2=0; else=NA")
#bmi
brfss_11$bmi<-ifelse(is.na(brfss_11$bmi5)==T, NA, brfss_11$bmi5/100)
#poor or fair health
brfss_11$badhealth<-ifelse(brfss_11$genhlth %in% c(4,5),1,0)

#race
brfss_11$black<-recode(brfss_11$racegr2, recodes="2=1; 9=NA; else=0", as.factor.result=T)
brfss_11$white<-recode(brfss_11$racegr2, recodes="1=1; 9=NA; else=0", as.factor.result=T)
brfss_11$other<-recode(brfss_11$racegr2, recodes="3:4=1; 9=NA; else=0", as.factor.result=T)
brfss_11$hispanic<-recode(brfss_11$racegr2, recodes="5=1; 9=NA; else=0", as.factor.result=T)

#have a personal doctor?
brfss_11$doc<-recode(brfss_11$persdoc2, recodes="1:2=1; 3=0; else=NA", as.factor.result=F)

#needed care in last year but couldn't get it because of cost
brfss_11$medacc<-recode(brfss_11$medcost, recodes="1=1;2=0;else=NA")

#education level
brfss_11$lths<-recode(brfss_11$educa, recodes="1:3=1;9=NA; else=0", as.factor.result=F)
brfss_11$coll<-recode(brfss_11$educa, recodes="5:6=1;9=NA; else=0", as.factor.result=F)

#marital status
brfss_11$marst<-recode(brfss_11$marital, recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA", as.factor.result=T)
brfss_11$marst<-relevel(brfss_11$marst, ref='married')

#income
brfss_11$inc<-as.factor(ifelse(brfss_11$incomg==9, NA, brfss_11$incomg))

#Age cut into intervals
brfss_11$agec<-cut(brfss_11$age, breaks=c(0,24,39,59,79,99))
brfss_11$bmiz<-scale(brfss_11$bmi, center=T, scale=T)
brfss_11$agez<-scale(brfss_11$age, center=T, scale=T)
brfss_11<- brfss_11[complete.cases(brfss_11[, c("bmiz","agez","lths","coll","black")]),]
names(brfss_11)

##   [1] "x"         "state"     "precall"   "fmonth"    "intvid"   
##   [6] "seqno"     "psu"       "ctelenum"  "cellfon"   "pvtresid" 
##  [11] "genhlth"   "physhlth"  "menthlth"  "poorhlth"  "hlthpln1" 
##  [16] "persdoc2"  "medcost"   "checkup1"  "bphigh4"   "bpmeds"   
##  [21] "bloodcho"  "cholchk"   "toldhi2"   "cvdinfr4"  "cvdcrhd4" 
##  [26] "cvdstrk3"  "asthma3"   "asthnow"   "chcscncr"  "chcocncr" 
##  [31] "chccopd"   "havarth3"  "addepev2"  "chckidny"  "chcvison" 
##  [36] "diabete3"  "smoke100"  "smokday2"  "stopsmk2"  "lastsmk2" 
##  [41] "usenow3"   "age"       "hispanc2"  "mrace"     "orace2"   
##  [46] "veteran3"  "marital"   "children"  "educa"     "employ"   
##  [51] "income2"   "weight2"   "height3"   "numhhol2"  "numphon2" 
##  [56] "cpdemo1"   "cpdemo2"   "cpdemo3"   "cpdemo4"   "renthom1" 
##  [61] "sex"       "pregnant"  "fruitju1"  "fruit1"    "fvbeans"  
##  [66] "fvgreen"   "fvorang"   "vegetab1"  "exerany2"  "exract01" 
##  [71] "exeroft1"  "exerhmm1"  "exract02"  "exeroft2"  "exerhmm2" 
##  [76] "strength"  "qlactlm2"  "useequip"  "lmtjoin3"  "arthdis2" 
##  [81] "arthsocl"  "joinpain"  "seatbelt"  "flushot5"  "flshtmy2" 
##  [86] "imfvplac"  "pneuvac3"  "alcday5"   "avedrnk2"  "drnk3ge5" 
##  [91] "maxdrnks"  "hivtst6"   "hivtstd3"  "hivrisk3"  "ststr"    
##  [96] "impage"    "imprace"   "impnph"    "rfhlth"    "hcvu651"  
## [101] "rfhype5"   "cholchk.1" "rfchol"    "ltasth1"   "casthm1"  
## [106] "asthms1"   "drdxar1"   "smoker3"   "rfsmok3"   "mraceorg" 
## [111] "mraceasc"  "prace"     "mrace.1"   "race2"     "raceg2"   
## [116] "racegr2"   "raceg"     "cnrace"    "cnracec"   "ageg5yr"  
## [121] "age65yr"   "ageg"      "htin4"     "htm4"      "wtkg3"    
## [126] "bmi5"      "bmi5cat"   "rfbmi5"    "chldcnt"   "educag"   
## [131] "incomg"    "ftjuda1"   "frutda1"   "beanday"   "grenday"  
## [136] "orngday"   "vegeda1"   "misfrtn"   "misvegn"   "frtresp"  
## [141] "vegresp"   "frutsum"   "vegesum"   "frt16"     "veg23"    
## [146] "fruitex"   "vegetex"   "totinda"   "metval1"   "metval2"  
## [151] "maxvo2"    "fc60"      "actint1"   "actint2"   "padur1"   
## [156] "padur2"    "pafreq1"   "pafreq2"   "minact1"   "minact2"  
## [161] "strfreq"   "pamiss"    "pamin1"    "pamin2"    "pamin"    
## [166] "pavigm1"   "pavigm2"   "pavigmn"   "pacat"     "paindex"  
## [171] "pa150r1"   "pa300r1"   "pa3002l"   "pastrng"   "parec"    
## [176] "pastaer"   "rfseat2"   "rfseat3"   "flshot5"   "pneumo2"  
## [181] "drnkany5"  "drocdy3"   "rfbing5"   "drnkdy4"   "drnkmo4"  
## [186] "rfdrhv4"   "rfdrmn4"   "rfdrwm4"   "aidtst3"   "cntywt"   
## [191] "cnty"      "endofrec"  "badhealth" "black"     "white"    
## [196] "other"     "hispanic"  "ins"       "inc"       "smoke"    
## [201] "hivrisk"   "lowact"    "doc"       "medacc"    "educ"     
## [206] "marst"     "agec"      "bmi"       "statefip"  "cofip"    
## [211] "cofips"    "weight"    "hbp"       "hc"        "lths"     
## [216] "coll"      "bmiz"      "agez"

brfss_11$cntywt<-as.numeric(as.character(brfss_11$cntywt))
brfss_11$sampleid<-paste(brfss_11$state, brfss_11$ststr)
#within each sampling unit, sum the weights
wts<-tapply(brfss_11$cntywt,brfss_11$sampleid,sum)
#make a data frame from this
wts<-data.frame(id=names(unlist(wts)), wt=unlist(wts))
#get the unique sampling location ids'
t1<-as.data.frame(table(brfss_11$sampleid))
#put all of this into a data set
wts2<-data.frame(ids=wts$id, sumwt=wts$wt, jn=t1$Freq)
#merge all of this back to the original data file
brfss_11<-merge(brfss_11, wts2, by.x="sampleid", by.y="ids", all.x=T)
#In the new data set, multiply the original weight by the fraction of the
#sampling unit total population each person represents
brfss_11$swts<-brfss_11$cntywt*(brfss_11$jn/brfss_11$sumwt)
hist(brfss_11$swts)

head(brfss_11[, c()])

## data frame with 0 columns and 6 rows

Here I fit the regression model with no weights

fit.nowt<-lm(bmiz~agez+lths+coll+black+hispanic+other, brfss_11)
summary(fit.nowt)

## 
## Call:
## lm(formula = bmiz ~ agez + lths + coll + black + hispanic + other, 
##     data = brfss_11)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.4488 -0.6608 -0.1545  0.4853  8.7891 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.02004    0.02734  -0.733 0.463540    
## agez        -0.02061    0.01262  -1.634 0.102326    
## lths         0.11269    0.04632   2.433 0.015016 *  
## coll        -0.07435    0.02938  -2.531 0.011395 *  
## black1       0.46980    0.04210  11.159  < 2e-16 ***
## hispanic1    0.17652    0.03526   5.007 5.67e-07 ***
## other1      -0.18332    0.05202  -3.524 0.000427 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9859 on 6984 degrees of freedom
## Multiple R-squared:  0.0318, Adjusted R-squared:  0.03097 
## F-statistic: 38.23 on 6 and 6984 DF,  p-value: < 2.2e-16

Here I fit the model using the stratum as a random effect to mimic the sample design in lmer

fit.mix.samp<-lmer(bmiz~agez+lths+coll+black+hispanic+other+(1|ststr), brfss_11,weights=swts)
summary(fit.mix.samp, corr=F)

## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## bmiz ~ agez + lths + coll + black + hispanic + other + (1 | ststr)
##    Data: brfss_11
## Weights: swts
## 
## REML criterion at convergence: 21932.1
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.3637 -0.5621 -0.1409  0.3967 12.7626 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  ststr    (Intercept) 0.02445  0.1564  
##  Residual             1.01692  1.0084  
## Number of obs: 6991, groups:  ststr, 29
## 
## Fixed effects:
##              Estimate Std. Error t value
## (Intercept) -0.001924   0.045337  -0.042
## agez         0.046296   0.013499   3.430
## lths         0.042484   0.043169   0.984
## coll        -0.028033   0.029556  -0.948
## black1       0.486322   0.038770  12.544
## hispanic1    0.236246   0.033293   7.096
## other1      -0.249114   0.050323  -4.950

What if we acually use the sample design information, what is our answer?

library(survey)

## Loading required package: grid

## 
## Attaching package: 'survey'

## The following object is masked from 'package:graphics':
## 
##     dotchart

options(survey.lonely.psu = "adjust")
des<- svydesign(ids=~1, strata=~ststr, weights=~swts, data=brfss_11)
fit.surv<-svyglm(bmiz~agez+lths+coll+black+hispanic+other, des, family=gaussian)
summary(fit.surv)

## 
## Call:
## svyglm(formula = bmiz ~ agez + lths + coll + black + hispanic + 
##     other, des, family = gaussian)
## 
## Survey design:
## svydesign(ids = ~1, strata = ~ststr, weights = ~swts, data = brfss_11)
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.004601   0.036639  -0.126 0.900060    
## agez         0.058279   0.017258   3.377 0.000737 ***
## lths         0.037957   0.070371   0.539 0.589636    
## coll        -0.048082   0.039978  -1.203 0.229126    
## black1       0.450114   0.056282   7.998 1.48e-15 ***
## hispanic1    0.222639   0.048433   4.597 4.36e-06 ***
## other1      -0.276598   0.051328  -5.389 7.32e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1.027136)
## 
## Number of Fisher Scoring iterations: 2

The standard errors are much larger when using the svydesign, but why? It’s because the standard errors aren’t being calculated the same way between the two methods! That’s the big issue here.

If we use stan to do this, via rstanarm, we get:

library(brms)

## Loading required package: rstan

## Loading required package: ggplot2

## rstan (Version 2.9.0-3, packaged: 2016-02-11 15:54:41 UTC, GitRev: 05c3d0058b6a)

## For execution on a local, multicore CPU with excess RAM we recommend calling
## rstan_options(auto_write = TRUE)
## options(mc.cores = parallel::detectCores())

## Loading 'brms' package (version 0.8.0). Useful instructions 
## can be found by typing help('brms'). A more detailed introduction 
## to the package is available through vignette('brms').

brfss_11$wts<-brfss_11$weight/mean(brfss_11$weight)
brfss_11$strata<-factor(brfss_11$ststr)
fit.stan<-brm(bmiz|weights(swts)~agez+lths+coll+black+hispanic+other+(1|strata), data=brfss_11,
                                 save.model="fit_glm_stan.txt",chains=2,  cores=2,seed=1115)

## Warning: Argument 'save.model' is deprecated. Please use argument
## 'save_model' instead.

## Compiling the C++ model

print(fit.stan, digits=3)

##  Family: gaussian (identity) 
## Formula: bmiz | weights(swts) ~ agez + lths + coll + black + hispanic + other + (1 | strata) 
##    Data: brfss_11 (Number of observations: 6991) 
## Samples: 2 chains, each with iter = 2000; warmup = 1000; thin = 1; 
##          total post-warmup samples = 2000
##    WAIC: Not computed
##  
## Random Effects: 
## ~strata (Number of levels: 29) 
##               Estimate Est.Error l-95% CI u-95% CI Eff.Sample  Rhat
## sd(Intercept)    0.172     0.043    0.099    0.269        357 1.001
## 
## Fixed Effects: 
##           Estimate Est.Error l-95% CI u-95% CI Eff.Sample  Rhat
## Intercept   -0.004     0.048   -0.106    0.082        550 1.003
## agez         0.046     0.014    0.020    0.072       2000 0.999
## lths         0.040     0.043   -0.043    0.122       2000 1.000
## coll        -0.028     0.029   -0.084    0.030       2000 1.001
## black1       0.487     0.038    0.410    0.562       2000 1.001
## hispanic1    0.238     0.033    0.172    0.302       2000 1.001
## other1      -0.251     0.051   -0.355   -0.155       2000 0.999
## 
## Family Specific Parameters: 
##             Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## sigma(bmiz)    1.008     0.009    0.992    1.025       2000    1
## 
## Samples were drawn using sampling(NUTS). For each parameter, Eff.Sample 
## is a crude measure of effective sample size, and Rhat is the potential 
## scale reduction factor on split chains (at convergence, Rhat = 1).

If we consult the beta’s they are almost identical between fit.stan, fit.surv and fit.mix.samp, but the standard errors in fit.surv are larger, and hence the test’s/significance levels are lower. While there is no substantive difference in interpretation of the models, in this case, there easily could be.