Multi-level Models 2

In this example, I introduce how to fit the multi-level model using the lme4 package. This example continues from the first example and considers the linear case of the model with random slopes and the use of a higher-level predictor. The example is extended to a binary outcome to illustrate the logistic Generalized Linear Mixed Model (GLMM).

This example shows how to: * Examine variation between groups using fixed effects * Fit the basic random intercept model : +$y_{ij} = \mu + u_{j} + e_{ij}$ with $u_j \sim N(0, \sigma^2)$ Where the intercepts ($u_j$) for each group vary randomly around the overall mean ($\mu$)

*I also illustrate how to include group-level covariates and how to fit the random slopes model and the model for cross level interactions

The example merges data from the 2011 CDC Behavioral Risk Factor Surveillance System (BRFSS) SMART county data. Link and the 2010 American Community Survey 5-year estimates at the county level. More details on these data are found below.

library(lme4)
library(arm)
library(lmerTest)
library(car)
library(MASS)
library(ggplot2)
library(sjPlot)

#load brfss
load("~/Google Drive/dem7283/data/brfss_11.Rdata")
nams<-names(brfss_11)
newnames<-gsub("_", "", nams)
names(brfss_11)<-tolower(newnames)

brfss_11$statefip<-sprintf("%02d", brfss_11$state )
brfss_11$cofip<-sprintf("%03d", brfss_11$cnty )
brfss_11$cofips<-paste(brfss_11$statefip, brfss_11$cofip, sep="")

#just for brevity, I just select TX respondents with non missing weights
brfss_11<-brfss_11[is.na(brfss_11$cntywt)==F,]
#insurance
brfss_11$ins<-ifelse(brfss_11$hlthpln1==1,1,0)
#smoking currently
brfss_11$smoke<-recode(brfss_11$smoker3, recodes="1:2=1; 3:4=0; else=NA")
#low physical activity
brfss_11$lowact<-recode(brfss_11$pacat, recodes="1:2=0; 3:4=1; else=NA")
#high blood pressure
brfss_11$hbp<-recode(brfss_11$bphigh4, recodes="1=1; 2:4=0; else=NA")
#high cholesterol
brfss_11$hc<-recode(brfss_11$toldhi2, recodes="1=1; 2=0; else=NA")
#bmi
brfss_11$bmi<-ifelse(is.na(brfss_11$bmi5)==T, NA, brfss_11$bmi5/100)
#poor or fair health
brfss_11$badhealth<-ifelse(brfss_11$genhlth %in% c(4,5),1,0)

#race
brfss_11$black<-recode(brfss_11$racegr2, recodes="2=1; 9=NA; else=0", as.factor.result=T)
brfss_11$white<-recode(brfss_11$racegr2, recodes="1=1; 9=NA; else=0", as.factor.result=T)
brfss_11$other<-recode(brfss_11$racegr2, recodes="3:4=1; 9=NA; else=0", as.factor.result=T)
brfss_11$hispanic<-recode(brfss_11$racegr2, recodes="5=1; 9=NA; else=0", as.factor.result=T)

#have a personal doctor?
brfss_11$doc<-recode(brfss_11$persdoc2, recodes="1:2=1; 3=0; else=NA", as.factor.result=F)

#needed care in last year but couldn't get it because of cost
brfss_11$medacc<-recode(brfss_11$medcost, recodes="1=1;2=0;else=NA")

#education level
brfss_11$lths<-recode(brfss_11$educa, recodes="1:3=1;9=NA; else=0", as.factor.result=F)
brfss_11$coll<-recode(brfss_11$educa, recodes="5:6=1;9=NA; else=0", as.factor.result=F)

#employment
brfss_11$employ<-recode(brfss_11$employ, recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA", as.factor.result=T)
brfss_11$employ<-relevel(brfss_11$employ, ref='Employed')

#marital status
brfss_11$marst<-recode(brfss_11$marital, recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA", as.factor.result=T)
brfss_11$marst<-relevel(brfss_11$marst, ref='married')

#income
brfss_11$inc<-as.factor(ifelse(brfss_11$incomg==9, NA, brfss_11$incomg))

#Age cut into intervals
brfss_11$agec<-cut(brfss_11$age, breaks=c(0,24,39,59,79,99))

Higher level predictors

We will often be interested in factors at both the individual AND contextual levels. To illustrate this, I will use data from the American Community Survey measured at the county level. Specifically, I use the DP3 table, which provides economic characteristics of places, from the 2010 5 year ACS Link.

#I will also add in some Census variables from the ACS 2010 5 Year estimates
#load in ACS data from Factfinder
acsecon<-read.csv("~/Google Drive/dem7283/data/aff_download/ACS_10_5YR_DP03_with_ann.csv")
acsecon$povrate<-acsecon[, "HC03_VC156"]
acsecon$unemployed<-acsecon[, "HC03_VC13"]
acsecon$cofips<-substr(acsecon$GEO.id, 10,14)
acsecon$povz<-scale(acsecon$povrate, center=T, scale=T)
acsecon$unempz<-scale(acsecon$unemployed, center=T, scale=T)
acsecon<-acsecon[, c("cofips", "povrate","povz", "unemployed","unempz")]

head(acsecon)

##   cofips povrate       povz unemployed       unempz
## 1  01001     7.5 -0.6960015        6.2 -0.396270981
## 2  01003     9.1 -0.4069846        6.6 -0.277120527
## 3  01005    19.9  1.5438794        9.6  0.616507876
## 4  01007     9.4 -0.3527939        9.1  0.467569809
## 5  01009    10.0 -0.2444126        7.5 -0.009032006
## 6  01011    22.6  2.0315954       11.2  1.093109691

joindata<-merge(brfss_11, acsecon, by="cofips", all.x=T, all.y=F)

## Warning in merge.data.frame(brfss_11, acsecon, by = "cofips", all.x = T, :
## column names 'cholchk', 'mrace' are duplicated in the result

#and merge the data back to the kids data
joindata$bmiz<-scale(joindata$bmi, center=T, scale=T)
joindata$agez<-scale(joindata$age, center=T, scale=T)

Fitting the basic multi-level model - Random intercepts

The random intercept model assumes you have:
j groups (j=1 to J)
i individuals within the j groups (i=1 to $n_j$)
for each individual in the j groups you have measured $y_{ij}$ and $x_{ij}$ and potentially for each group j, we have may have measured $z_j$ which is a covariate measured at the group level For example, you do a survey on health and you measure:
y = the health status of each individual
x= SES, race, etc of each individual
j = the county each individual lives in, and
z = the poverty rate or median income in the county

To specify a random intercept model for counties, we add, a model term that is (1|HIGHER LEVEL VARIABLE), which tells R to fit only a random intercept for each county, in our case it will be (1|cofips)

fit.mix<-lmer(bmiz~agez+lths+coll+black+hispanic+other+(1|cofips), data=joindata)
#do a test for the random effect
rand(fit.mix)

## Analysis of Random effects Table:
##        Chi.sq Chi.DF p.value    
## cofips   1603      1  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

display(fit.mix, detail=T)

## lme4::lmer(formula = bmiz ~ agez + lths + coll + black + hispanic + 
##     other + (1 | cofips), data = joindata)
##             coef.est coef.se t value
## (Intercept)   0.03     0.01    3.73 
## agez          0.01     0.00    3.86 
## lths          0.06     0.01    6.86 
## coll         -0.11     0.00  -21.61 
## black1        0.41     0.01   50.99 
## hispanic1     0.17     0.01   19.88 
## other1       -0.03     0.01   -3.32 
## 
## Error terms:
##  Groups   Name        Std.Dev.
##  cofips   (Intercept) 0.10    
##  Residual             0.99    
## ---
## number of obs: 226744, groups: cofips, 224
## AIC = 637546, DIC = 637411.7
## deviance = 637470.0

Multilevel model with group-level predictor

Now, I fit the same model above, but this time I include a predictor at the county level, the poverty rate.

#Now I estimate the multilevel model including the effects for the
#county level variables
fit.mix2<-lmer(bmiz~agez+lths+coll+black+hispanic+other+povz+(1|cofips), data=joindata)
display(fit.mix2, detail=T)

## lme4::lmer(formula = bmiz ~ agez + lths + coll + black + hispanic + 
##     other + povz + (1 | cofips), data = joindata)
##             coef.est coef.se t value
## (Intercept)   0.04     0.01    4.20 
## agez          0.01     0.00    3.84 
## lths          0.06     0.01    6.84 
## coll         -0.11     0.00  -21.60 
## black1        0.41     0.01   50.75 
## hispanic1     0.17     0.01   19.80 
## other1       -0.03     0.01   -3.35 
## povz          0.02     0.01    1.90 
## 
## Error terms:
##  Groups   Name        Std.Dev.
##  cofips   (Intercept) 0.10    
##  Residual             0.99    
## ---
## number of obs: 226744, groups: cofips, 224
## AIC = 637552, DIC = 637400.8
## deviance = 637466.4

rand(fit.mix2)

## Analysis of Random effects Table:
##        Chi.sq Chi.DF p.value    
## cofips   1586      1  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

ICC2<-VarCorr(fit.mix2)$cofips[1]/( VarCorr(fit.mix2)$cofips[1]+attr(VarCorr(fit.mix2), "sc"))
ICC2

## [1] 0.009905632

#compare the random intercept ad multilevel model with a LRT
anova(fit.mix, fit.mix2)

## refitting model(s) with ML (instead of REML)

## Data: joindata
## Models:
## object: bmiz ~ agez + lths + coll + black + hispanic + other + (1 | cofips)
## ..1: bmiz ~ agez + lths + coll + black + hispanic + other + povz + 
## ..1:     (1 | cofips)
##        Df    AIC    BIC  logLik deviance  Chisq Chi Df Pr(>Chisq)  
## object  9 637488 637581 -318735   637470                           
## ..1    10 637486 637590 -318733   637466 3.6169      1     0.0572 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#There is slight improvement, p=.0572 becuase the poverty variable was almost significant, t=1.90

The ICC goes down, which points to the fact that we are controlling away some of the between-county variance by including the county level predictor. The improvement in model fit is almost “significant” at the 5% level.

Random slope models

We expand the random effect model to include group-varying slopes by: \[ \begin{aligned} \ E( Y) = \beta_{0j}+ \beta_{ j} x\\ \beta_{0j} = \beta_0 + u_{1j}\\ \beta_{j} = \beta + u_{2j}\\ \end{aligned}\\ \text{Where } \beta_0 \text{ is the average intercept, and } u_{1j } \text{ is }\\ \text{the group-specific deviation in the intercept}\\ \text{And where } \beta \text{ is the average slope, and } u_{2j } \text{ is}\\ \text{the group-specific deviation in the average slope}\\ \]

This effectively allows one or more of the individual level variables to have different effects in each of the groups.

#To do a random slope model, I do:
fit.mix3<-lmer(bmiz~agez+lths+coll+black+hispanic+other+(black|cofips), joindata)
display(fit.mix3, digits=3)

## lme4::lmer(formula = bmiz ~ agez + lths + coll + black + hispanic + 
##     other + (black | cofips), data = joindata)
##             coef.est coef.se
## (Intercept)  0.029    0.008 
## agez         0.008    0.002 
## lths         0.061    0.009 
## coll        -0.104    0.005 
## black1       0.375    0.013 
## hispanic1    0.175    0.009 
## other1      -0.029    0.009 
## 
## Error terms:
##  Groups   Name        Std.Dev. Corr   
##  cofips   (Intercept) 0.105           
##           black1      0.113    -0.392 
##  Residual             0.985           
## ---
## number of obs: 226744, groups: cofips, 224
## AIC = 637448, DIC = 637311.5
## deviance = 637368.7

rand(fit.mix3,)

## Analysis of Random effects Table:
##              Chi.sq Chi.DF p.value    
## black:cofips    102      2  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#compare the first model to the one with a random slope with a likelihood ratio test
anova(fit.mix, fit.mix3)

## refitting model(s) with ML (instead of REML)

## Data: joindata
## Models:
## object: bmiz ~ agez + lths + coll + black + hispanic + other + (1 | cofips)
## ..1: bmiz ~ agez + lths + coll + black + hispanic + other + (black | 
## ..1:     cofips)
##        Df    AIC    BIC  logLik deviance  Chisq Chi Df Pr(>Chisq)    
## object  9 637488 637581 -318735   637470                             
## ..1    11 637391 637504 -318684   637369 101.27      2  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#the random slope model fits better

#plot random slopes and intercepts
rancoefs2<-ranef(fit.mix3)

plot(NULL, ylim=c(-.5, .5), xlim=c(-1,1),ylab="Intercept", xlab="Age")
title (main="Random Slope and Intercept Model")

for (i in 1: dim(rancoefs2$cofips)[1]){
  
  abline(a=fixef(fit.mix3)[1]+rancoefs2$cofips[[1]][i], b=fixef(fit.mix3)[2]+rancoefs2$cofips[[2]][i], col=2)
}

So, here we see a significant variance among counties in the effects of the ‘black’ variable. in addition to the output from rand(), the anova() also shows the model improves overall compared to the model with only a random intercept.

Cross level interaction effect

Here, I show the model for a cross-level interaction. This model fits an interaction term between (at least) one individual level variable and a group level variable. This allows you to ask very informative questions regarding individuals within specific contexts. As a good rule, always z-score your higher level predictors BEFORE MERGINGE TO THE INDIVIDUAL DATA in order to ease interpretation of these effects. In this example, I as the reserach question, “Do Racial/ethnic minority groups, livinging in high poverty areas face an added disadvantage in BMI, compared to whites?”

#Cross-level interaction model:
fit.mix4<-lmer(bmiz~agez+lths+coll+black+hispanic+other+black*povz+hispanic*povz+other*povz+(1|cofips), joindata)
display(fit.mix4,detail=T, digits=3)

## lme4::lmer(formula = bmiz ~ agez + lths + coll + black + hispanic + 
##     other + black * povz + hispanic * povz + other * povz + (1 | 
##     cofips), data = joindata)
##                coef.est coef.se t value
## (Intercept)      0.028    0.009   2.994
## agez             0.008    0.002   3.724
## lths             0.060    0.009   6.662
## coll            -0.105    0.005 -21.515
## black1           0.415    0.008  51.577
## hispanic1        0.192    0.009  20.603
## other1           0.016    0.011   1.495
## povz            -0.002    0.011  -0.176
## black1:povz      0.075    0.011   6.775
## hispanic1:povz   0.066    0.013   5.098
## other1:povz      0.108    0.013   8.048
## 
## Error terms:
##  Groups   Name        Std.Dev.
##  cofips   (Intercept) 0.100   
##  Residual             0.986   
## ---
## number of obs: 226744, groups: cofips, 224
## AIC = 637464, DIC = 637265.8
## deviance = 637352.2

rand(fit.mix4)

## Analysis of Random effects Table:
##        Chi.sq Chi.DF p.value    
## cofips   1607      1  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

anova(fit.mix, fit.mix4)

## refitting model(s) with ML (instead of REML)

## Data: joindata
## Models:
## object: bmiz ~ agez + lths + coll + black + hispanic + other + (1 | cofips)
## ..1: bmiz ~ agez + lths + coll + black + hispanic + other + black * 
## ..1:     povz + hispanic * povz + other * povz + (1 | cofips)
##        Df    AIC    BIC  logLik deviance Chisq Chi Df Pr(>Chisq)    
## object  9 637488 637581 -318735   637470                            
## ..1    13 637378 637513 -318676   637352 117.8      4  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

This basically says that racial/ethnic minorities in counties with higher than average poverty rates (povz==1 vs povz==0) have higher BMI’s than whites

Logistic Generalized Linear Mixed Model (GLMM)

Just like when we did ordinary logistic regression, vs. least squares, we use a slightly different function glm() vs. lm(). Same goes here, we will use glmer() vs lmer(). This example will fit a multilevel logistic regression model.

NOTE GLMM’s can take much longer to estimate than LMM’s, don’t be surprised if your model takes a long time to run. Also don’t be surprised if you see warning messages like Warning message: In checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model failed to converge with max|grad| = somenumber (tol = 0.001)

#Just for quickness, i subset to TX & CA only
joindatatx<-joindata[joindata$state%in%c(6,48),]
#Here I fit a binomial model for healthcare access
fit.mix.bin<-glmer(doc~agez+lths+coll+black+hispanic+other+(1|cofips), family=binomial, joindatatx)
display(fit.mix.bin, detail=T)

## glmer(formula = doc ~ agez + lths + coll + black + hispanic + 
##     other + (1 | cofips), data = joindatatx, family = binomial)
##             coef.est coef.se z value Pr(>|z|)
## (Intercept)   1.80     0.06   29.53    0.00  
## agez          0.90     0.02   36.97    0.00  
## lths         -0.52     0.07   -7.37    0.00  
## coll          0.42     0.05    7.79    0.00  
## black1       -0.21     0.08   -2.53    0.01  
## hispanic1    -0.58     0.05  -10.64    0.00  
## other1       -0.18     0.08   -2.36    0.02  
## 
## Error terms:
##  Groups   Name        Std.Dev.
##  cofips   (Intercept) 0.12    
##  Residual             1.00    
## ---
## number of obs: 18355, groups: cofips, 18
## AIC = 14516, DIC = 14500
## deviance = 14500.0

#odds ratios
exp(fixef(fit.mix.bin)[-1])

##      agez      lths      coll    black1 hispanic1    other1 
## 2.4535692 0.5929751 1.5235652 0.8068959 0.5620259 0.8336371

sjp.glmer(fit.mix.bin, facet.grid = FALSE,type="ri.pc")

So, we see that older people are more likely to have a regular doctor, as are those with a college education. On the other hand, those with low levels of education, and all of the racial/ethnic minorities are less likely to have a regular doctor. Unfortunately, there is no easy way test for random effects in GLMM’s in R.

Random slopes GLMM

fit.mix.bin2<-glmer(doc~agez+lths+coll+black+hispanic+other+(black+hispanic+other+1|cofips), family=binomial, joindatatx)
display(fit.mix.bin2, detail=T)

## glmer(formula = doc ~ agez + lths + coll + black + hispanic + 
##     other + (black + hispanic + other + 1 | cofips), data = joindatatx, 
##     family = binomial)
##             coef.est coef.se z value Pr(>|z|)
## (Intercept)  1.81     0.06   28.32    0.00   
## agez         0.90     0.02   36.97    0.00   
## lths        -0.52     0.07   -7.36    0.00   
## coll         0.42     0.05    7.72    0.00   
## black1      -0.23     0.09   -2.42    0.02   
## hispanic1   -0.60     0.08   -8.05    0.00   
## other1      -0.16     0.10   -1.55    0.12   
## 
## Error terms:
##  Groups   Name        Std.Dev. Corr              
##  cofips   (Intercept) 0.14                       
##           black1      0.15     -0.83             
##           hispanic1   0.19     -0.77  0.99       
##           other1      0.23      0.68 -0.16 -0.05 
##  Residual             1.00                       
## ---
## number of obs: 18355, groups: cofips, 18
## AIC = 14521.1, DIC = 14487.1
## deviance = 14487.1

exp(fixef(fit.mix.bin2))

## (Intercept)        agez        lths        coll      black1   hispanic1 
##   6.0802099   2.4562621   0.5920707   1.5191720   0.7956108   0.5462362 
##      other1 
##   0.8550496

sjp.glmer(fit.mix.bin2, facet.grid = FALSE,type="ri.pc")

So, we see that there are pretty sizeable variances in all of the three race/ethnicity variables, suggesting that across the counties in the data, there is heterogeneity in the means of the BMI for these groups. i.e. it’s not always the case that minority groups face a disadvantage in terms of BMI in all counties in these two states.

Cross-level GLMM

We can likewise examine a cross-level interaction model like we did above:

fit.mix.bin3<-glmer(doc~agez+lths+coll+(black+hispanic+other)*povz+(1|cofips), family=binomial, joindatatx)
display(fit.mix.bin3, detail=T)

## glmer(formula = doc ~ agez + lths + coll + (black + hispanic + 
##     other) * povz + (1 | cofips), data = joindatatx, family = binomial)
##                coef.est coef.se z value Pr(>|z|)
## (Intercept)      1.77     0.06   31.80    0.00  
## agez             0.90     0.02   37.10    0.00  
## lths            -0.52     0.07   -7.41    0.00  
## coll             0.41     0.05    7.62    0.00  
## black1          -0.22     0.09   -2.53    0.01  
## hispanic1       -0.59     0.05  -10.71    0.00  
## other1          -0.26     0.08   -3.08    0.00  
## povz            -0.19     0.07   -2.75    0.01  
## black1:povz     -0.11     0.19   -0.57    0.57  
## hispanic1:povz   0.02     0.11    0.21    0.83  
## other1:povz     -0.28     0.15   -1.81    0.07  
## 
## Error terms:
##  Groups   Name        Std.Dev.
##  cofips   (Intercept) 0.06    
##  Residual             1.00    
## ---
## number of obs: 18355, groups: cofips, 18
## AIC = 14509.1, DIC = 14485.1
## deviance = 14485.1

exp(fixef(fit.mix.bin3))

##    (Intercept)           agez           lths           coll         black1 
##      5.8430338      2.4610329      0.5916629      1.5087900      0.8055873 
##      hispanic1         other1           povz    black1:povz hispanic1:povz 
##      0.5567145      0.7701871      0.8241363      0.8973432      1.0225781 
##    other1:povz 
##      0.7557557

So, we see that, in these two states at least, blacks and hispanics don’t face an added disparity against having a regular doctor, but other race/ethnicities do (at least this interaction term is marginally significant in the model)