In this example, I will perform some basic recodes for the ECLS-K 2011 data. The outcome of interest here is the child’s kindergarten math score. I will illustrate how to fit the basic multilevel model and compare models with a likelihood ratio test. I also show how to extract the variance components of the models and form the intra class correlation coefficient.
First we load our data:
library(haven)
library (car)
library(lmtest)
library(arm)
library(lme4)
eclsk11<-read_sas("~/Google Drive/classes/dem7473/data/childk4p.sas7bdat")
names(eclsk11)<-tolower(names(eclsk11))
#get out only the variables I'm going to use for this example
myvars<-c("x_chsex_r", "x1locale", "x_raceth_r", "x2povty", "x12par1ed_i","p1curmar", "x1htotal", "x1mscalk4", "s1_id", "p2safepl","x2krceth" )
#subset the data
eclsk.sub<-eclsk11[,myvars]
rm(eclsk11); gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 1833303 98.0 3614974 193.1 3614974 193.1
## Vcells 3409088 26.1 681453050 5199.1 783637535 5978.7
Next, I do some recoding of variables using a mixture of the ifelse() function and the recode () function.
#recode our outcomes, the first is the child's math standardized test score in Kindergarten
eclsk.sub$math<-ifelse(eclsk.sub$x1mscalk4<0, NA, eclsk.sub$x1mscalk4)
#First we recode some Child characteristics
#Child's sex: recode as male =1
eclsk.sub$male<-Recode(eclsk.sub$x_chsex_r, recodes="1=1; 2=0; -9=NA")
#Recode race with white, non Hispanic as reference using dummy vars
eclsk.sub$hisp<-recode (eclsk.sub$x_raceth_r, recodes="3:4=1;-9=NA; else=0")
eclsk.sub$black<-recode (eclsk.sub$x_raceth_r, recodes="2=1;-9=NA; else=0")
eclsk.sub$asian<-recode (eclsk.sub$x_raceth_r, recodes="5=1;-9=NA; else=0")
eclsk.sub$nahn<-recode (eclsk.sub$x_raceth_r, recodes="6:7=1;-9=NA; else=0")
eclsk.sub$other<-recode (eclsk.sub$x_raceth_r, recodes="8=1;-9=NA; else=0")
#Then we recode some parent/mother characteristics
#Mother's education, recode as 2 dummys with HS = reference
eclsk.sub$lths<-Recode(eclsk.sub$x12par1ed_i, recodes = "0:2=1; 3:8=0; else = NA")
eclsk.sub$gths<-Recode(eclsk.sub$x12par1ed_i, recodes = "1:3=0; 4:8=1; else =NA")
#marital status, recode as 2 dummys, ref= married
eclsk.sub$single<-Recode(eclsk.sub$p1curmar, recodes="4=1; -7:-9=NA; else=0")
eclsk.sub$notmar<-Recode(eclsk.sub$p1curmar, recodes="2:3=1; -7:-9=NA; else=0")
#Then we do some household level variables
#Urban school location = 1
eclsk.sub$urban<-Recode(eclsk.sub$x1locale, recodes = "1:3=1; 4=0; -1:-9=NA")
#poverty level in poverty = 1
eclsk.sub$pov<-Recode(eclsk.sub$x2povty , recodes ="1:2=1; 3=0; -9=NA")
#Household size
eclsk.sub$hhsize<-eclsk.sub$x1htotal
#school % minority student body
eclsk.sub$minorsch<-ifelse(eclsk.sub$x2krceth <0, NA, eclsk.sub$x2krceth/10)
#Unsafe neighborhood
eclsk.sub$unsafe<-Recode(eclsk.sub$p2safepl , recodes = "1:2='unsafe'; 3='safe'; else=NA", as.factor = T)
#Show the first few lines of the data
head(eclsk.sub)
## # A tibble: 6 x 27
## x_chsex_r x1locale x_raceth_r x2povty x12par1ed_i p1curmar x1htotal
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 4 1 2 5 1 6
## 2 2 2 1 3 5 1 4
## 3 1 NA 3 3 8 NA NA
## 4 1 4 1 3 5 1 4
## 5 2 2 3 2 4 1 5
## 6 2 4 1 2 3 3 4
## # ... with 20 more variables: x1mscalk4 <dbl>, s1_id <chr>,
## # p2safepl <dbl>, x2krceth <dbl>, math <dbl>, male <dbl>, hisp <dbl>,
## # black <dbl>, asian <dbl>, nahn <dbl>, other <dbl>, lths <dbl>,
## # gths <dbl>, single <dbl>, notmar <dbl>, urban <dbl>, pov <dbl>,
## # hhsize <dbl>, minorsch <dbl>, unsafe <fct>
First, I want to test for variation in my outcome across the schools, this is ALWAYS THE FIRST STEP!!!!! If there is no variation across your groups, why do a hierarchical model??
fit0<-lm(math~factor(s1_id), data=eclsk.sub)
#Global F test for equality of means across the schools
anova(fit0)
## Analysis of Variance Table
##
## Response: math
## Df Sum Sq Mean Sq F value Pr(>F)
## factor(s1_id) 857 543174 633.81 5.9427 < 2.2e-16 ***
## Residuals 14737 1571749 106.65
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Yes! The means are not equal
My logic of fitting a hierarchical model here is that I have 860 schools in the data. How would I possibly choose one of them as a reference group? Indeed, I don’t have a complete sampling of schools, so a fixed effects model is not really appropriate.
If our outcome were not assumed to be normally distributed, we can still do the ANOVA type of test. Here is an example for a binomial outcome:
fit_b<-glm(pov~factor(s1_id), family = binomial, data=eclsk.sub)
anova(fit_b, test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: pov
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 13526 18725
## factor(s1_id) 857 4897.2 12669 13827 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Next, I proceed to fit the random intercept model and examine it There are lots of ways to fit hierarchical models in R, but I like the lme4 library because the methods it implements are the more robust type of methods for fitting these models more on this later
This first model will only fit the means of the schools, I’m doing this to get a “null” model fit, so I have a basis to compare my other models to.
fit1<-lmer(math~1+(1|s1_id), data = eclsk.sub, REML = T, subset=complete.cases(eclsk.sub))
summary(fit1)
## Linear mixed model fit by REML ['lmerMod']
## Formula: math ~ 1 + (1 | s1_id)
## Data: eclsk.sub
## Subset: complete.cases(eclsk.sub)
##
## REML criterion at convergence: 79507.3
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.3340 -0.6773 -0.0905 0.5853 9.0507
##
## Random effects:
## Groups Name Variance Std.Dev.
## s1_id (Intercept) 29.89 5.467
## Residual 110.34 10.504
## Number of obs: 10384, groups: s1_id, 852
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 35.4857 0.2183 162.5
The next model will fit a model that considers the individual and family level variables
fit2<-lmer(math~male+hisp+black+asian+nahn+other+lths+gths+single+notmar+urban+pov+hhsize+(1|s1_id), data=eclsk.sub, REML=T, subset=complete.cases(eclsk.sub))
summary(fit2)
## Linear mixed model fit by REML ['lmerMod']
## Formula:
## math ~ male + hisp + black + asian + nahn + other + lths + gths +
## single + notmar + urban + pov + hhsize + (1 | s1_id)
## Data: eclsk.sub
## Subset: complete.cases(eclsk.sub)
##
## REML criterion at convergence: 78200.2
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.9523 -0.6815 -0.0839 0.5994 9.3871
##
## Random effects:
## Groups Name Variance Std.Dev.
## s1_id (Intercept) 12.07 3.474
## Residual 101.74 10.087
## Number of obs: 10384, groups: s1_id, 852
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 37.73442 0.50070 75.363
## male 0.70105 0.20332 3.448
## hisp -3.43521 0.31963 -10.747
## black -3.26747 0.40553 -8.057
## asian 3.22769 0.45766 7.053
## nahn -1.81644 1.00131 -1.814
## other 0.76010 0.47502 1.600
## lths -2.17445 0.40275 -5.399
## gths 3.22927 0.28012 11.528
## single -2.24546 0.33997 -6.605
## notmar -2.31633 0.34742 -6.667
## urban -0.11312 0.12154 -0.931
## pov -3.26669 0.25794 -12.664
## hhsize -0.31699 0.08183 -3.874
##
## Correlation matrix not shown by default, as p = 14 > 12.
## Use print(x, correlation=TRUE) or
## vcov(x) if you need it
We then compare these two fits using a Likelihood ratio test
anova(fit1, fit2)
## refitting model(s) with ML (instead of REML)
## Data: eclsk.sub
## Subset: complete.cases(eclsk.sub)
## Models:
## fit1: math ~ 1 + (1 | s1_id)
## fit2: math ~ male + hisp + black + asian + nahn + other + lths + gths +
## fit2: single + notmar + urban + pov + hhsize + (1 | s1_id)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## fit1 3 79512 79534 -39753 79506
## fit2 16 78222 78338 -39095 78190 1315.7 13 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The second model is a better fit than the first, so it’s not just the school in and of itself that matters
Finally, I extract the ICCs from each fit, to do this, we use the cryptic VarCorr() function that works with fits from lmer()
Here is the ICC from fit 1
VarCorr(fit1)$s1_id[1]/((attr(VarCorr(fit1), "sc")^2)+VarCorr(fit1)$s1_id[1])
## [1] 0.213129
#the "sc" attribute is the residual variance, who knew?
Or a faster way:
library(sjstats)
icc(fit1)
##
## Linear mixed model
##
## Family : gaussian (identity)
## Formula: math ~ 1 + (1 | s1_id)
##
## ICC (s1_id): 0.2131
And model 2
VarCorr(fit2)$s1_id[1]/((attr(VarCorr(fit2), "sc")^2)+VarCorr(fit2)$s1_id[1])
## [1] 0.1060206
icc(fit2)
##
## Linear mixed model
##
## Family : gaussian (identity)
## Formula: math ~ male + hisp + black + asian + nahn + other + lths + gths + single + notmar + urban + pov + hhsize + (1 | s1_id)
##
## ICC (s1_id): 0.1060
Basically what this shows us is that we have reduced the variability between the schools by controlling for children’s characteristics
we use glmer() instead of lmer() when we have a count outcome or binary outcome, but our logic is otherwise the same:
fit_b<-glmer(pov~1+(1|s1_id), data = eclsk.sub,family=binomial, subset=complete.cases(eclsk.sub))
summary(fit_b)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: pov ~ 1 + (1 | s1_id)
## Data: eclsk.sub
## Subset: complete.cases(eclsk.sub)
##
## AIC BIC logLik deviance df.resid
## 12174.5 12189.0 -6085.3 12170.5 10382
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.0837 -0.6445 -0.3343 0.6525 3.0269
##
## Random effects:
## Groups Name Variance Std.Dev.
## s1_id (Intercept) 2.276 1.509
## Number of obs: 10384, groups: s1_id, 852
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.23600 0.05829 -4.049 5.15e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
icc(fit_b)
##
## Generalized linear mixed model
##
## Family : binomial (logit)
## Formula: pov ~ 1 + (1 | s1_id)
##
## ICC (s1_id): 0.4089
or 2.276/(2.276+(pi^2)/3)
Let’s look at the means of the school math scores:
require(lattice)
## Loading required package: lattice
schmeans<-ranef(fit1, condVar=T)
schmeans$s1_id$`(Intercept)`<-schmeans$s1_id$`(Intercept)`+fixef(fit1)[1]
dotplot(schmeans, main=T, ylab="Schools")
## $s1_id
This shows the estimate for each school, and the associated variance in the estimate.
These models are good for estimating group means better than traditional methods this follows the examples in chapter 12 of Gelman and Hill. Basically what we hope to see here is that, when samples are small, the fixed effect model has higher variance in the estimates for schools, but in the multilevel model, the variances are smaller because, we are pooling information across schools when estimating means, instead of estimating them one at a time.
#complete pooling, this model fits the grand mean
eclsk.sub<-eclsk.sub[complete.cases(eclsk.sub),]
fit.cp<-lm(math~factor(s1_id), data = eclsk.sub, subset=complete.cases(eclsk.sub))
#display(fit.cp)
#No pooling i.e. fixed effects regression, this model fits separate means for each county
lm.unpooled<-lm(math~factor(s1_id)-1, data = eclsk.sub, subset=complete.cases(eclsk.sub))
#display(lm.unpooled)
#partial pooling, this model fits the population mean and county deviations
fit0<-lmer(math~1+(1|s1_id),data = eclsk.sub, subset=complete.cases(eclsk.sub))
#display(fit0)
#Plot the means of the schools
J<-length(unique(eclsk.sub$s1_id))
ns<-as.numeric(table(eclsk.sub$s1_id))
sample.size <- as.numeric(table(eclsk.sub$s1_id))
sample.size.jittered <- sample.size*exp (runif (J, -.1, .1))
par (mar=c(5,5,4,2)+.1)
plot (sample.size.jittered, coef(lm.unpooled), cex.lab=1.2, cex.axis=1.2,
xlab="sample size in School j", ylab=expression (paste
("est. intercept, ", alpha[j], " (no pooling)")),
pch=20, log="x", ylim=c(10,70), yaxt="n", xaxt="n")
axis (1, quantile(ns), cex.axis=1.1)
axis (2, seq(10, 70), cex.axis=1.1)
for (j in 1:J){
lines (rep(sample.size.jittered[j],2),
coef(lm.unpooled)[j] + c(-1,1)*se.coef(lm.unpooled)[j], lwd=.5)
}
abline (coef(fit.cp)[1], 0, lwd=.5)
title(main="Estimates of School Means from the Fixed Effect Model")
#plot MLM estimates of county means + se's
par (mar=c(5,5,4,2)+.1)
a.hat.M1 <- coef(fit0)$s1_id[,1]
a.se.M1 <- se.coef(fit0)$s1_id
plot (sample.size.jittered, t(a.hat.M1), cex.lab=1.2, cex.axis=1.1,
xlab="sample size in School j",
ylab=expression (paste("est. intercept, ", alpha[j], "(multilevel model)")),
pch=20, log="x", ylim=c(10,70), yaxt="n", xaxt="n")
axis (1, quantile(ns), cex.axis=1.1)
axis (2, seq(10,70), cex.axis=1.1)
for (j in 1:length(unique(eclsk.sub$s1_id))){
lines (rep(sample.size.jittered[j],2),
as.vector(a.hat.M1[j]) + c(-1,1)*a.se.M1[j], lwd=.5, col="gray10")
}
abline (coef(fit.cp)[1], 0, lwd=.5)
title(main="Estimates of School Means from the Hierarchical Model")