DEM 7473 Week 2: Basic Hierarchical Models

In this example, I will perform some basic recodes for the ECLS-K 2011 data. The outcome of interest here is the child’s kindergarten math score. I will illustrate how to fit the basic multilevel model and compare models with a likelihood ratio test. I also show how to extract the variance components of the models and form the intra class correlation coefficient.

First we load our data:

library(haven)
library (car)
library(lmtest)
library(arm)
library(lme4)

eclsk11<-read_sas("~/Google Drive/classes/dem7473/data/childk4p.sas7bdat")
names(eclsk11)<-tolower(names(eclsk11))

#get out only the variables I'm going to use for this example

myvars<-c("x_chsex_r", "x1locale", "x_raceth_r", "x2povty", "x12par1ed_i","p1curmar", "x1htotal", "x1mscalk4",  "s1_id", "p2safepl","x2krceth" )
#subset the data
eclsk.sub<-eclsk11[,myvars]
rm(eclsk11); gc()

##           used (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells 1833303 98.0    3614974  193.1   3614974  193.1
## Vcells 3409088 26.1  681453050 5199.1 783637535 5978.7

Next, I do some recoding of variables using a mixture of the ifelse() function and the recode () function.

#recode our outcomes, the  first is the child's math standardized test score  in Kindergarten
eclsk.sub$math<-ifelse(eclsk.sub$x1mscalk4<0, NA, eclsk.sub$x1mscalk4)

#First we recode some Child characteristics
#Child's sex: recode as male =1
eclsk.sub$male<-Recode(eclsk.sub$x_chsex_r, recodes="1=1; 2=0; -9=NA")

#Recode race with white, non Hispanic as reference using dummy vars
eclsk.sub$hisp<-recode (eclsk.sub$x_raceth_r, recodes="3:4=1;-9=NA; else=0")
eclsk.sub$black<-recode (eclsk.sub$x_raceth_r, recodes="2=1;-9=NA; else=0")
eclsk.sub$asian<-recode (eclsk.sub$x_raceth_r, recodes="5=1;-9=NA; else=0")
eclsk.sub$nahn<-recode (eclsk.sub$x_raceth_r, recodes="6:7=1;-9=NA; else=0")
eclsk.sub$other<-recode (eclsk.sub$x_raceth_r, recodes="8=1;-9=NA; else=0")


#Then we recode some parent/mother characteristics
#Mother's education, recode as 2 dummys with HS = reference
eclsk.sub$lths<-Recode(eclsk.sub$x12par1ed_i, recodes = "0:2=1; 3:8=0; else = NA")
eclsk.sub$gths<-Recode(eclsk.sub$x12par1ed_i, recodes = "1:3=0; 4:8=1; else =NA") 

#marital status, recode as 2 dummys, ref= married
eclsk.sub$single<-Recode(eclsk.sub$p1curmar, recodes="4=1; -7:-9=NA; else=0")
eclsk.sub$notmar<-Recode(eclsk.sub$p1curmar, recodes="2:3=1; -7:-9=NA; else=0")


#Then we do some household level variables
#Urban school location = 1
eclsk.sub$urban<-Recode(eclsk.sub$x1locale, recodes = "1:3=1; 4=0; -1:-9=NA")

#poverty level in poverty = 1
eclsk.sub$pov<-Recode(eclsk.sub$x2povty , recodes ="1:2=1; 3=0; -9=NA")

#Household size
eclsk.sub$hhsize<-eclsk.sub$x1htotal

#school % minority student body
eclsk.sub$minorsch<-ifelse(eclsk.sub$x2krceth <0, NA, eclsk.sub$x2krceth/10)

#Unsafe neighborhood
eclsk.sub$unsafe<-Recode(eclsk.sub$p2safepl , recodes = "1:2='unsafe'; 3='safe'; else=NA", as.factor = T)

#Show the first few lines of the data
head(eclsk.sub)

## # A tibble: 6 x 27
##   x_chsex_r x1locale x_raceth_r x2povty x12par1ed_i p1curmar x1htotal
##       <dbl>    <dbl>      <dbl>   <dbl>       <dbl>    <dbl>    <dbl>
## 1         1        4          1       2           5        1        6
## 2         2        2          1       3           5        1        4
## 3         1       NA          3       3           8       NA       NA
## 4         1        4          1       3           5        1        4
## 5         2        2          3       2           4        1        5
## 6         2        4          1       2           3        3        4
## # ... with 20 more variables: x1mscalk4 <dbl>, s1_id <chr>,
## #   p2safepl <dbl>, x2krceth <dbl>, math <dbl>, male <dbl>, hisp <dbl>,
## #   black <dbl>, asian <dbl>, nahn <dbl>, other <dbl>, lths <dbl>,
## #   gths <dbl>, single <dbl>, notmar <dbl>, urban <dbl>, pov <dbl>,
## #   hhsize <dbl>, minorsch <dbl>, unsafe <fct>

First, I want to test for variation in my outcome across the schools, this is ALWAYS THE FIRST STEP!!!!! If there is no variation across your groups, why do a hierarchical model??

fit0<-lm(math~factor(s1_id), data=eclsk.sub)
#Global F test for equality of means across the schools 
anova(fit0)

## Analysis of Variance Table
## 
## Response: math
##                  Df  Sum Sq Mean Sq F value    Pr(>F)    
## factor(s1_id)   857  543174  633.81  5.9427 < 2.2e-16 ***
## Residuals     14737 1571749  106.65                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Yes! The means are not equal

My logic of fitting a hierarchical model here is that I have 860 schools in the data. How would I possibly choose one of them as a reference group? Indeed, I don’t have a complete sampling of schools, so a fixed effects model is not really appropriate.

ANOVA for GLM

If our outcome were not assumed to be normally distributed, we can still do the ANOVA type of test. Here is an example for a binomial outcome:

fit_b<-glm(pov~factor(s1_id), family = binomial, data=eclsk.sub)
anova(fit_b, test="Chisq")

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: pov
## 
## Terms added sequentially (first to last)
## 
## 
##                Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                           13526      18725              
## factor(s1_id) 857   4897.2     12669      13827 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Next, I proceed to fit the random intercept model and examine it There are lots of ways to fit hierarchical models in R, but I like the lme4 library because the methods it implements are the more robust type of methods for fitting these models more on this later

This first model will only fit the means of the schools, I’m doing this to get a “null” model fit, so I have a basis to compare my other models to.

fit1<-lmer(math~1+(1|s1_id), data = eclsk.sub, REML = T, subset=complete.cases(eclsk.sub))
summary(fit1)

## Linear mixed model fit by REML ['lmerMod']
## Formula: math ~ 1 + (1 | s1_id)
##    Data: eclsk.sub
##  Subset: complete.cases(eclsk.sub)
## 
## REML criterion at convergence: 79507.3
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.3340 -0.6773 -0.0905  0.5853  9.0507 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  s1_id    (Intercept)  29.89    5.467  
##  Residual             110.34   10.504  
## Number of obs: 10384, groups:  s1_id, 852
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  35.4857     0.2183   162.5

The next model will fit a model that considers the individual and family level variables

fit2<-lmer(math~male+hisp+black+asian+nahn+other+lths+gths+single+notmar+urban+pov+hhsize+(1|s1_id), data=eclsk.sub, REML=T, subset=complete.cases(eclsk.sub))

summary(fit2)

## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## math ~ male + hisp + black + asian + nahn + other + lths + gths +  
##     single + notmar + urban + pov + hhsize + (1 | s1_id)
##    Data: eclsk.sub
##  Subset: complete.cases(eclsk.sub)
## 
## REML criterion at convergence: 78200.2
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.9523 -0.6815 -0.0839  0.5994  9.3871 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  s1_id    (Intercept)  12.07    3.474  
##  Residual             101.74   10.087  
## Number of obs: 10384, groups:  s1_id, 852
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept) 37.73442    0.50070  75.363
## male         0.70105    0.20332   3.448
## hisp        -3.43521    0.31963 -10.747
## black       -3.26747    0.40553  -8.057
## asian        3.22769    0.45766   7.053
## nahn        -1.81644    1.00131  -1.814
## other        0.76010    0.47502   1.600
## lths        -2.17445    0.40275  -5.399
## gths         3.22927    0.28012  11.528
## single      -2.24546    0.33997  -6.605
## notmar      -2.31633    0.34742  -6.667
## urban       -0.11312    0.12154  -0.931
## pov         -3.26669    0.25794 -12.664
## hhsize      -0.31699    0.08183  -3.874

## 
## Correlation matrix not shown by default, as p = 14 > 12.
## Use print(x, correlation=TRUE)  or
##     vcov(x)        if you need it

We then compare these two fits using a Likelihood ratio test

anova(fit1, fit2)

## refitting model(s) with ML (instead of REML)

## Data: eclsk.sub
## Subset: complete.cases(eclsk.sub)
## Models:
## fit1: math ~ 1 + (1 | s1_id)
## fit2: math ~ male + hisp + black + asian + nahn + other + lths + gths + 
## fit2:     single + notmar + urban + pov + hhsize + (1 | s1_id)
##      Df   AIC   BIC logLik deviance  Chisq Chi Df Pr(>Chisq)    
## fit1  3 79512 79534 -39753    79506                             
## fit2 16 78222 78338 -39095    78190 1315.7     13  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The second model is a better fit than the first, so it’s not just the school in and of itself that matters

Finally, I extract the ICCs from each fit, to do this, we use the cryptic VarCorr() function that works with fits from lmer()

Here is the ICC from fit 1

VarCorr(fit1)$s1_id[1]/((attr(VarCorr(fit1), "sc")^2)+VarCorr(fit1)$s1_id[1])

## [1] 0.213129

#the "sc" attribute is the residual variance, who knew?

Or a faster way:

library(sjstats)
icc(fit1)

## 
## Linear mixed model
## 
## Family : gaussian (identity)
## Formula: math ~ 1 + (1 | s1_id)
## 
##   ICC (s1_id): 0.2131

And model 2

VarCorr(fit2)$s1_id[1]/((attr(VarCorr(fit2), "sc")^2)+VarCorr(fit2)$s1_id[1])

## [1] 0.1060206

icc(fit2)

## 
## Linear mixed model
## 
## Family : gaussian (identity)
## Formula: math ~ male + hisp + black + asian + nahn + other + lths + gths + single + notmar + urban + pov + hhsize + (1 | s1_id)
## 
##   ICC (s1_id): 0.1060

Basically what this shows us is that we have reduced the variability between the schools by controlling for children’s characteristics

binary outcome model

Note this will take much longer to fit!!

we use glmer() instead of lmer() when we have a count outcome or binary outcome, but our logic is otherwise the same:

fit_b<-glmer(pov~1+(1|s1_id), data = eclsk.sub,family=binomial, subset=complete.cases(eclsk.sub))
summary(fit_b)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: pov ~ 1 + (1 | s1_id)
##    Data: eclsk.sub
##  Subset: complete.cases(eclsk.sub)
## 
##      AIC      BIC   logLik deviance df.resid 
##  12174.5  12189.0  -6085.3  12170.5    10382 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.0837 -0.6445 -0.3343  0.6525  3.0269 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  s1_id  (Intercept) 2.276    1.509   
## Number of obs: 10384, groups:  s1_id, 852
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.23600    0.05829  -4.049 5.15e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

icc(fit_b)

## 
## Generalized linear mixed model
## 
## Family : binomial (logit)
## Formula: pov ~ 1 + (1 | s1_id)
## 
##   ICC (s1_id): 0.4089

or 2.276/(2.276+(pi^2)/3)

Let’s look at the means of the school math scores:

require(lattice)

## Loading required package: lattice

schmeans<-ranef(fit1, condVar=T)
schmeans$s1_id$`(Intercept)`<-schmeans$s1_id$`(Intercept)`+fixef(fit1)[1]
dotplot(schmeans, main=T, ylab="Schools")

## $s1_id

This shows the estimate for each school, and the associated variance in the estimate.

Further examination of estimated values at the group level

These models are good for estimating group means better than traditional methods this follows the examples in chapter 12 of Gelman and Hill. Basically what we hope to see here is that, when samples are small, the fixed effect model has higher variance in the estimates for schools, but in the multilevel model, the variances are smaller because, we are pooling information across schools when estimating means, instead of estimating them one at a time.

#complete pooling, this model fits the grand mean
eclsk.sub<-eclsk.sub[complete.cases(eclsk.sub),]
fit.cp<-lm(math~factor(s1_id), data = eclsk.sub, subset=complete.cases(eclsk.sub))
#display(fit.cp)


#No pooling i.e. fixed effects regression, this model fits separate means for each county
lm.unpooled<-lm(math~factor(s1_id)-1, data = eclsk.sub, subset=complete.cases(eclsk.sub))
#display(lm.unpooled)

#partial pooling, this model fits the population mean and county deviations
fit0<-lmer(math~1+(1|s1_id),data = eclsk.sub, subset=complete.cases(eclsk.sub))
#display(fit0)


#Plot the means of the schools
J<-length(unique(eclsk.sub$s1_id))
ns<-as.numeric(table(eclsk.sub$s1_id))
sample.size <- as.numeric(table(eclsk.sub$s1_id))
sample.size.jittered <- sample.size*exp (runif (J, -.1, .1))

par (mar=c(5,5,4,2)+.1)
plot (sample.size.jittered, coef(lm.unpooled), cex.lab=1.2, cex.axis=1.2,
      xlab="sample size in School j", ylab=expression (paste
                                                       ("est. intercept, ", alpha[j], "   (no pooling)")),
      pch=20, log="x", ylim=c(10,70), yaxt="n", xaxt="n")
axis (1, quantile(ns), cex.axis=1.1)
axis (2, seq(10, 70), cex.axis=1.1)
for (j in 1:J){
  lines (rep(sample.size.jittered[j],2),
         coef(lm.unpooled)[j] + c(-1,1)*se.coef(lm.unpooled)[j], lwd=.5)
}
abline (coef(fit.cp)[1], 0, lwd=.5)
title(main="Estimates of School Means from the Fixed Effect Model")

#plot MLM estimates of county means + se's
par (mar=c(5,5,4,2)+.1)
a.hat.M1 <- coef(fit0)$s1_id[,1]
a.se.M1 <- se.coef(fit0)$s1_id

plot (sample.size.jittered, t(a.hat.M1), cex.lab=1.2, cex.axis=1.1,
      xlab="sample size in School j",
      ylab=expression (paste("est. intercept, ", alpha[j], "(multilevel model)")),
      pch=20, log="x", ylim=c(10,70), yaxt="n", xaxt="n")
axis (1, quantile(ns), cex.axis=1.1)
axis (2, seq(10,70), cex.axis=1.1)
for (j in 1:length(unique(eclsk.sub$s1_id))){
  lines (rep(sample.size.jittered[j],2),
         as.vector(a.hat.M1[j]) + c(-1,1)*a.se.M1[j], lwd=.5, col="gray10")
}
abline (coef(fit.cp)[1], 0, lwd=.5)
title(main="Estimates of School Means from the Hierarchical Model")

DEM 7473 Week 2: Basic Hierarchical Models - Random Intercepts

Corey S. Sparks, Ph.D.

ANOVA for GLM

binary outcome model

Note this will take much longer to fit!!

Further examination of estimated values at the group level