In this example, I introduce how to fit the multi-level model using the lme4 package. This example continues from the first example and considers the linear case of the model with random slopes and the use of a higher-level predictor. The example is extended to a binary outcome to illustrate the logistic Generalized Linear Mixed Model (GLMM).
This example shows how to: * Examine variation between groups using fixed effects * Fit the basic random intercept model : +\(y_{ij} = \mu + u_{j} + e_{ij}\) with \(u_j \sim N(0, \sigma^2)\) Where the intercepts (\(u_j\)) for each group vary randomly around the overall mean (\(\mu\))
*I also illustrate how to include group-level covariates and how to fit the random slopes model and the model for cross level interactions
The example merges data from the 2011 CDC Behavioral Risk Factor Surveillance System (BRFSS) SMART county data. Link and the 2010 American Community Survey 5-year estimates at the county level. More details on these data are found below.
library(lme4)
library(arm)
library(lmerTest)
library(car)
library(MASS)
library(ggplot2)
library(sjPlot)
#load brfss
load("~/Google Drive/dem7283/data/brfss_11.Rdata")
nams<-names(brfss_11)
newnames<-gsub("_", "", nams)
names(brfss_11)<-tolower(newnames)
brfss_11$statefip<-sprintf("%02d", brfss_11$state )
brfss_11$cofip<-sprintf("%03d", brfss_11$cnty )
brfss_11$cofips<-paste(brfss_11$statefip, brfss_11$cofip, sep="")
#just for brevity, I just select TX respondents with non missing weights
brfss_11<-brfss_11[is.na(brfss_11$cntywt)==F,]
#insurance
brfss_11$ins<-ifelse(brfss_11$hlthpln1==1,1,0)
#smoking currently
brfss_11$smoke<-recode(brfss_11$smoker3, recodes="1:2=1; 3:4=0; else=NA")
#low physical activity
brfss_11$lowact<-recode(brfss_11$pacat, recodes="1:2=0; 3:4=1; else=NA")
#high blood pressure
brfss_11$hbp<-recode(brfss_11$bphigh4, recodes="1=1; 2:4=0; else=NA")
#high cholesterol
brfss_11$hc<-recode(brfss_11$toldhi2, recodes="1=1; 2=0; else=NA")
#bmi
brfss_11$bmi<-ifelse(is.na(brfss_11$bmi5)==T, NA, brfss_11$bmi5/100)
#poor or fair health
brfss_11$badhealth<-ifelse(brfss_11$genhlth %in% c(4,5),1,0)
#race
brfss_11$black<-recode(brfss_11$racegr2, recodes="2=1; 9=NA; else=0", as.factor.result=T)
brfss_11$white<-recode(brfss_11$racegr2, recodes="1=1; 9=NA; else=0", as.factor.result=T)
brfss_11$other<-recode(brfss_11$racegr2, recodes="3:4=1; 9=NA; else=0", as.factor.result=T)
brfss_11$hispanic<-recode(brfss_11$racegr2, recodes="5=1; 9=NA; else=0", as.factor.result=T)
#have a personal doctor?
brfss_11$doc<-recode(brfss_11$persdoc2, recodes="1:2=1; 3=0; else=NA", as.factor.result=F)
#needed care in last year but couldn't get it because of cost
brfss_11$medacc<-recode(brfss_11$medcost, recodes="1=1;2=0;else=NA")
#education level
brfss_11$lths<-recode(brfss_11$educa, recodes="1:3=1;9=NA; else=0", as.factor.result=F)
brfss_11$coll<-recode(brfss_11$educa, recodes="5:6=1;9=NA; else=0", as.factor.result=F)
#employment
brfss_11$employ<-recode(brfss_11$employ, recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA", as.factor.result=T)
brfss_11$employ<-relevel(brfss_11$employ, ref='Employed')
#marital status
brfss_11$marst<-recode(brfss_11$marital, recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA", as.factor.result=T)
brfss_11$marst<-relevel(brfss_11$marst, ref='married')
#income
brfss_11$inc<-as.factor(ifelse(brfss_11$incomg==9, NA, brfss_11$incomg))
#Age cut into intervals
brfss_11$agec<-cut(brfss_11$age, breaks=c(0,24,39,59,79,99))
We will often be interested in factors at both the individual AND contextual levels. To illustrate this, I will use data from the American Community Survey measured at the county level. Specifically, I use the DP3 table, which provides economic characteristics of places, from the 2010 5 year ACS Link.
#I will also add in some Census variables from the ACS 2010 5 Year estimates
#load in ACS data from Factfinder
acsecon<-read.csv("~/Google Drive/dem7283/data/aff_download/ACS_10_5YR_DP03_with_ann.csv")
acsecon$povrate<-acsecon[, "HC03_VC156"]
acsecon$unemployed<-acsecon[, "HC03_VC13"]
acsecon$cofips<-substr(acsecon$GEO.id, 10,14)
acsecon$povz<-scale(acsecon$povrate, center=T, scale=T)
acsecon$unempz<-scale(acsecon$unemployed, center=T, scale=T)
acsecon<-acsecon[, c("cofips", "povrate","povz", "unemployed","unempz")]
head(acsecon)
## cofips povrate povz unemployed unempz
## 1 01001 7.5 -0.6960015 6.2 -0.396270981
## 2 01003 9.1 -0.4069846 6.6 -0.277120527
## 3 01005 19.9 1.5438794 9.6 0.616507876
## 4 01007 9.4 -0.3527939 9.1 0.467569809
## 5 01009 10.0 -0.2444126 7.5 -0.009032006
## 6 01011 22.6 2.0315954 11.2 1.093109691
joindata<-merge(brfss_11, acsecon, by="cofips", all.x=T, all.y=F)
## Warning in merge.data.frame(brfss_11, acsecon, by = "cofips", all.x = T, :
## column names 'cholchk', 'mrace' are duplicated in the result
#and merge the data back to the kids data
joindata$bmiz<-scale(joindata$bmi, center=T, scale=T)
joindata$agez<-scale(joindata$age, center=T, scale=T)
To specify a random intercept model for counties, we add, a model term that is (1|HIGHER LEVEL VARIABLE), which tells R to fit only a random intercept for each county, in our case it will be (1|cofips)
fit.mix<-lmer(bmiz~agez+lths+coll+black+hispanic+other+(1|cofips), data=joindata)
#do a test for the random effect
rand(fit.mix)
## Analysis of Random effects Table:
## Chi.sq Chi.DF p.value
## cofips 1603 1 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
display(fit.mix, detail=T)
## lme4::lmer(formula = bmiz ~ agez + lths + coll + black + hispanic +
## other + (1 | cofips), data = joindata)
## coef.est coef.se t value
## (Intercept) 0.03 0.01 3.73
## agez 0.01 0.00 3.86
## lths 0.06 0.01 6.86
## coll -0.11 0.00 -21.61
## black1 0.41 0.01 50.99
## hispanic1 0.17 0.01 19.88
## other1 -0.03 0.01 -3.32
##
## Error terms:
## Groups Name Std.Dev.
## cofips (Intercept) 0.10
## Residual 0.99
## ---
## number of obs: 226744, groups: cofips, 224
## AIC = 637546, DIC = 637411.7
## deviance = 637470.0
Now, I fit the same model above, but this time I include a predictor at the county level, the poverty rate.
#Now I estimate the multilevel model including the effects for the
#county level variables
fit.mix2<-lmer(bmiz~agez+lths+coll+black+hispanic+other+povz+(1|cofips), data=joindata)
display(fit.mix2, detail=T)
## lme4::lmer(formula = bmiz ~ agez + lths + coll + black + hispanic +
## other + povz + (1 | cofips), data = joindata)
## coef.est coef.se t value
## (Intercept) 0.04 0.01 4.20
## agez 0.01 0.00 3.84
## lths 0.06 0.01 6.84
## coll -0.11 0.00 -21.60
## black1 0.41 0.01 50.75
## hispanic1 0.17 0.01 19.80
## other1 -0.03 0.01 -3.35
## povz 0.02 0.01 1.90
##
## Error terms:
## Groups Name Std.Dev.
## cofips (Intercept) 0.10
## Residual 0.99
## ---
## number of obs: 226744, groups: cofips, 224
## AIC = 637552, DIC = 637400.8
## deviance = 637466.4
rand(fit.mix2)
## Analysis of Random effects Table:
## Chi.sq Chi.DF p.value
## cofips 1586 1 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ICC2<-VarCorr(fit.mix2)$cofips[1]/( VarCorr(fit.mix2)$cofips[1]+attr(VarCorr(fit.mix2), "sc"))
ICC2
## [1] 0.009905632
#compare the random intercept ad multilevel model with a LRT
anova(fit.mix, fit.mix2)
## refitting model(s) with ML (instead of REML)
## Data: joindata
## Models:
## object: bmiz ~ agez + lths + coll + black + hispanic + other + (1 | cofips)
## ..1: bmiz ~ agez + lths + coll + black + hispanic + other + povz +
## ..1: (1 | cofips)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## object 9 637488 637581 -318735 637470
## ..1 10 637486 637590 -318733 637466 3.6169 1 0.0572 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#There is slight improvement, p=.0572 becuase the poverty variable was almost significant, t=1.90
The ICC goes down, which points to the fact that we are controlling away some of the between-county variance by including the county level predictor. The improvement in model fit is almost “significant” at the 5% level.
This effectively allows one or more of the individual level variables to have different effects in each of the groups.
#To do a random slope model, I do:
fit.mix3<-lmer(bmiz~agez+lths+coll+black+hispanic+other+(black|cofips), joindata)
display(fit.mix3, digits=3)
## lme4::lmer(formula = bmiz ~ agez + lths + coll + black + hispanic +
## other + (black | cofips), data = joindata)
## coef.est coef.se
## (Intercept) 0.029 0.008
## agez 0.008 0.002
## lths 0.061 0.009
## coll -0.104 0.005
## black1 0.375 0.013
## hispanic1 0.175 0.009
## other1 -0.029 0.009
##
## Error terms:
## Groups Name Std.Dev. Corr
## cofips (Intercept) 0.105
## black1 0.113 -0.392
## Residual 0.985
## ---
## number of obs: 226744, groups: cofips, 224
## AIC = 637448, DIC = 637311.5
## deviance = 637368.7
rand(fit.mix3,)
## Analysis of Random effects Table:
## Chi.sq Chi.DF p.value
## black:cofips 102 2 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#compare the first model to the one with a random slope with a likelihood ratio test
anova(fit.mix, fit.mix3)
## refitting model(s) with ML (instead of REML)
## Data: joindata
## Models:
## object: bmiz ~ agez + lths + coll + black + hispanic + other + (1 | cofips)
## ..1: bmiz ~ agez + lths + coll + black + hispanic + other + (black |
## ..1: cofips)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## object 9 637488 637581 -318735 637470
## ..1 11 637391 637504 -318684 637369 101.27 2 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#the random slope model fits better
#plot random slopes and intercepts
rancoefs2<-ranef(fit.mix3)
plot(NULL, ylim=c(-.5, .5), xlim=c(-1,1),ylab="Intercept", xlab="Age")
title (main="Random Slope and Intercept Model")
for (i in 1: dim(rancoefs2$cofips)[1]){
abline(a=fixef(fit.mix3)[1]+rancoefs2$cofips[[1]][i], b=fixef(fit.mix3)[2]+rancoefs2$cofips[[2]][i], col=2)
}
So, here we see a significant variance among counties in the effects of the ‘black’ variable. in addition to the output from rand(), the anova() also shows the model improves overall compared to the model with only a random intercept.
Here, I show the model for a cross-level interaction. This model fits an interaction term between (at least) one individual level variable and a group level variable. This allows you to ask very informative questions regarding individuals within specific contexts. As a good rule, always z-score your higher level predictors BEFORE MERGINGE TO THE INDIVIDUAL DATA in order to ease interpretation of these effects. In this example, I as the reserach question, “Do Racial/ethnic minority groups, livinging in high poverty areas face an added disadvantage in BMI, compared to whites?”
#Cross-level interaction model:
fit.mix4<-lmer(bmiz~agez+lths+coll+black+hispanic+other+black*povz+hispanic*povz+other*povz+(1|cofips), joindata)
display(fit.mix4,detail=T, digits=3)
## lme4::lmer(formula = bmiz ~ agez + lths + coll + black + hispanic +
## other + black * povz + hispanic * povz + other * povz + (1 |
## cofips), data = joindata)
## coef.est coef.se t value
## (Intercept) 0.028 0.009 2.994
## agez 0.008 0.002 3.724
## lths 0.060 0.009 6.662
## coll -0.105 0.005 -21.515
## black1 0.415 0.008 51.577
## hispanic1 0.192 0.009 20.603
## other1 0.016 0.011 1.495
## povz -0.002 0.011 -0.176
## black1:povz 0.075 0.011 6.775
## hispanic1:povz 0.066 0.013 5.098
## other1:povz 0.108 0.013 8.048
##
## Error terms:
## Groups Name Std.Dev.
## cofips (Intercept) 0.100
## Residual 0.986
## ---
## number of obs: 226744, groups: cofips, 224
## AIC = 637464, DIC = 637265.8
## deviance = 637352.2
rand(fit.mix4)
## Analysis of Random effects Table:
## Chi.sq Chi.DF p.value
## cofips 1607 1 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(fit.mix, fit.mix4)
## refitting model(s) with ML (instead of REML)
## Data: joindata
## Models:
## object: bmiz ~ agez + lths + coll + black + hispanic + other + (1 | cofips)
## ..1: bmiz ~ agez + lths + coll + black + hispanic + other + black *
## ..1: povz + hispanic * povz + other * povz + (1 | cofips)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## object 9 637488 637581 -318735 637470
## ..1 13 637378 637513 -318676 637352 117.8 4 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
This basically says that racial/ethnic minorities in counties with higher than average poverty rates (povz==1 vs povz==0) have higher BMI’s than whites
Just like when we did ordinary logistic regression, vs. least squares, we use a slightly different function glm() vs. lm(). Same goes here, we will use glmer() vs lmer(). This example will fit a multilevel logistic regression model.
NOTE GLMM’s can take much longer to estimate than LMM’s, don’t be surprised if your model takes a long time to run. Also don’t be surprised if you see warning messages like Warning message: In checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model failed to converge with max|grad| = somenumber (tol = 0.001)
#Just for quickness, i subset to TX & CA only
joindatatx<-joindata[joindata$state%in%c(6,48),]
#Here I fit a binomial model for healthcare access
fit.mix.bin<-glmer(doc~agez+lths+coll+black+hispanic+other+(1|cofips), family=binomial, joindatatx)
display(fit.mix.bin, detail=T)
## glmer(formula = doc ~ agez + lths + coll + black + hispanic +
## other + (1 | cofips), data = joindatatx, family = binomial)
## coef.est coef.se z value Pr(>|z|)
## (Intercept) 1.80 0.06 29.53 0.00
## agez 0.90 0.02 36.97 0.00
## lths -0.52 0.07 -7.37 0.00
## coll 0.42 0.05 7.79 0.00
## black1 -0.21 0.08 -2.53 0.01
## hispanic1 -0.58 0.05 -10.64 0.00
## other1 -0.18 0.08 -2.36 0.02
##
## Error terms:
## Groups Name Std.Dev.
## cofips (Intercept) 0.12
## Residual 1.00
## ---
## number of obs: 18355, groups: cofips, 18
## AIC = 14516, DIC = 14500
## deviance = 14500.0
#odds ratios
exp(fixef(fit.mix.bin)[-1])
## agez lths coll black1 hispanic1 other1
## 2.4535692 0.5929751 1.5235652 0.8068959 0.5620259 0.8336371
sjp.glmer(fit.mix.bin, facet.grid = FALSE,type="ri.pc")
So, we see that older people are more likely to have a regular doctor, as are those with a college education. On the other hand, those with low levels of education, and all of the racial/ethnic minorities are less likely to have a regular doctor. Unfortunately, there is no easy way test for random effects in GLMM’s in R.
fit.mix.bin2<-glmer(doc~agez+lths+coll+black+hispanic+other+(black+hispanic+other+1|cofips), family=binomial, joindatatx)
display(fit.mix.bin2, detail=T)
## glmer(formula = doc ~ agez + lths + coll + black + hispanic +
## other + (black + hispanic + other + 1 | cofips), data = joindatatx,
## family = binomial)
## coef.est coef.se z value Pr(>|z|)
## (Intercept) 1.81 0.06 28.32 0.00
## agez 0.90 0.02 36.97 0.00
## lths -0.52 0.07 -7.36 0.00
## coll 0.42 0.05 7.72 0.00
## black1 -0.23 0.09 -2.42 0.02
## hispanic1 -0.60 0.08 -8.05 0.00
## other1 -0.16 0.10 -1.55 0.12
##
## Error terms:
## Groups Name Std.Dev. Corr
## cofips (Intercept) 0.14
## black1 0.15 -0.83
## hispanic1 0.19 -0.77 0.99
## other1 0.23 0.68 -0.16 -0.05
## Residual 1.00
## ---
## number of obs: 18355, groups: cofips, 18
## AIC = 14521.1, DIC = 14487.1
## deviance = 14487.1
exp(fixef(fit.mix.bin2))
## (Intercept) agez lths coll black1 hispanic1
## 6.0802099 2.4562621 0.5920707 1.5191720 0.7956108 0.5462362
## other1
## 0.8550496
sjp.glmer(fit.mix.bin2, facet.grid = FALSE,type="ri.pc")
So, we see that there are pretty sizeable variances in all of the three race/ethnicity variables, suggesting that across the counties in the data, there is heterogeneity in the means of the BMI for these groups. i.e. it’s not always the case that minority groups face a disadvantage in terms of BMI in all counties in these two states.
We can likewise examine a cross-level interaction model like we did above:
fit.mix.bin3<-glmer(doc~agez+lths+coll+(black+hispanic+other)*povz+(1|cofips), family=binomial, joindatatx)
display(fit.mix.bin3, detail=T)
## glmer(formula = doc ~ agez + lths + coll + (black + hispanic +
## other) * povz + (1 | cofips), data = joindatatx, family = binomial)
## coef.est coef.se z value Pr(>|z|)
## (Intercept) 1.77 0.06 31.80 0.00
## agez 0.90 0.02 37.10 0.00
## lths -0.52 0.07 -7.41 0.00
## coll 0.41 0.05 7.62 0.00
## black1 -0.22 0.09 -2.53 0.01
## hispanic1 -0.59 0.05 -10.71 0.00
## other1 -0.26 0.08 -3.08 0.00
## povz -0.19 0.07 -2.75 0.01
## black1:povz -0.11 0.19 -0.57 0.57
## hispanic1:povz 0.02 0.11 0.21 0.83
## other1:povz -0.28 0.15 -1.81 0.07
##
## Error terms:
## Groups Name Std.Dev.
## cofips (Intercept) 0.06
## Residual 1.00
## ---
## number of obs: 18355, groups: cofips, 18
## AIC = 14509.1, DIC = 14485.1
## deviance = 14485.1
exp(fixef(fit.mix.bin3))
## (Intercept) agez lths coll black1
## 5.8430338 2.4610329 0.5916629 1.5087900 0.8055873
## hispanic1 other1 povz black1:povz hispanic1:povz
## 0.5567145 0.7701871 0.8241363 0.8973432 1.0225781
## other1:povz
## 0.7557557
So, we see that, in these two states at least, blacks and hispanics don’t face an added disparity against having a regular doctor, but other race/ethnicities do (at least this interaction term is marginally significant in the model)