In this example, we will use hierarchical models to do some longitudinal modeling of data from the ECLS-K 2011. Specifically, we will model changes in a student’s standardized math score from fall kindergarten to spring, 1st grade.
Longitudinal data are collected in waves, representing the sample at different time points For example, currently, the ECLS-K 2011 has eight waves, intending to capture the children as they progress through school, and will eventually have more. We really want to measure similar items at each time point so we can understand what makes people change over their lives. When it comes to data we need to introduce a different data structure to accommodate this.
In this example, I will focus on the use of the linear mixed model for a continuous outcome and a binomial model for a binary outcome.
We start simple then move into growth curve models. This follows the presentation in Singer and Willett (2003) Chapters 3-6.
library (car)
## Loading required package: carData
library(geepack)
library(MuMIn)
library(haven)
library(lme4)
## Loading required package: Matrix
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
First we load our data
eclsk11<-read_sas("~/Google Drive/classes/dem7473/data/childk4p.sas7bdat")
names(eclsk11)<-tolower(names(eclsk11))
myvars<-c("childid", "x_chsex_r", "x1locale", "x_raceth_r", "x2povty", "x12par1ed_i","p1curmar", "x1htotal",
"x1mscalk4", "x2mscalk4", "x3mscalk4", "x4mscalk4","x5mscalk4","x6mscalk4","x7mscalk4","x8mscalk4",
"p1hscale", "p2hscale", "p4hscale","p6hscale","p7hscale","p8hscale",
"x12sesl","x4sesl_i", "p2parct1", "p2parct2", "s1_id", "p2safepl","x2krceth","p1o2near", "x_distpov" ,
"w8cf8p_80","w8cf8p_8psu","w8cf8p_8str", "x1height","x2height","x3height","x4height","x5height","x6height","x7height","x8height",
"x1kage_r","x2kage_r","x3age","x4age", "x5age", "x6age", "x7age", "x8age")
#subset the data
eclsk.sub<-eclsk11[,myvars]
rm(eclsk11); gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 1864793 99.6 3644527 194.7 3644527 194.7
## Vcells 4173583 31.9 681453772 5199.1 783653138 5978.8
Next, I do some recoding of variables. I have to make the repeated measures of each of my longitudinal variables.
#First we recode some Child characteristics
#Child's sex: recode as male =1
eclsk.sub$male<-Recode(eclsk.sub$x_chsex_r, recodes="1=1; 2=0; -9=NA")
#Recode race with white, non Hispanic as reference using dummy vars
eclsk.sub$hisp<-Recode(eclsk.sub$x_raceth_r, recodes="3:4=1;-9=NA; else=0")
eclsk.sub$black<-Recode(eclsk.sub$x_raceth_r, recodes="2=1;-9=NA; else=0")
eclsk.sub$asian<-Recode(eclsk.sub$x_raceth_r, recodes="5=1;-9=NA; else=0")
eclsk.sub$nahn<-Recode(eclsk.sub$x_raceth_r, recodes="6:7=1;-9=NA; else=0")
eclsk.sub$other<-Recode(eclsk.sub$x_raceth_r, recodes="8=1;-9=NA; else=0")
#Longitudinal variables
#recode our outcomes, the first is the child's math standardized test score in Kindergarten
eclsk.sub$math1<-ifelse(eclsk.sub$x1mscalk4<0, NA, eclsk.sub$x1mscalk4)
eclsk.sub$math2<-ifelse(eclsk.sub$x2mscalk4<0, NA, eclsk.sub$x2mscalk4)
eclsk.sub$math4<-ifelse(eclsk.sub$x4mscalk4<0, NA, eclsk.sub$x4mscalk4)
eclsk.sub$math6<-ifelse(eclsk.sub$x6mscalk4<0, NA, eclsk.sub$x6mscalk4)
eclsk.sub$math7<-ifelse(eclsk.sub$x7mscalk4<0, NA, eclsk.sub$x7mscalk4)
eclsk.sub$math8<-ifelse(eclsk.sub$x8mscalk4<0, NA, eclsk.sub$x8mscalk4)
#Second outcome is child's height for age, continuous outcome
eclsk.sub$height1<-ifelse(eclsk.sub$x1height<=-7, NA, eclsk.sub$x1height)
eclsk.sub$height2<-ifelse(eclsk.sub$x2height<=-7, NA, eclsk.sub$x2height)
eclsk.sub$height4<-ifelse(eclsk.sub$x4height<=-7, NA, eclsk.sub$x4height)
eclsk.sub$height6<-ifelse(eclsk.sub$x6height<=-7, NA, eclsk.sub$x6height)
eclsk.sub$height7<-ifelse(eclsk.sub$x7height<=-7, NA, eclsk.sub$x7height)
eclsk.sub$height8<-ifelse(eclsk.sub$x8height<=-7, NA, eclsk.sub$x8height)
#Age at each wave
eclsk.sub$age_yrs1<-ifelse(eclsk.sub$x1kage_r<0, NA, eclsk.sub$x1kage_r/12)
eclsk.sub$age_yrs2<-ifelse(eclsk.sub$x2kage_r<0, NA, eclsk.sub$x2kage_r/12)
#eclsk.sub$age_yrs3<-ifelse(eclsk.sub$x3age<0, NA, eclsk.sub$x3age/12)
eclsk.sub$age_yrs4<-ifelse(eclsk.sub$x4age<0, NA, eclsk.sub$x4age/12)
eclsk.sub$age_yrs6<-ifelse(eclsk.sub$x6age<0, NA, eclsk.sub$x6age/12)
eclsk.sub$age_yrs7<-ifelse(eclsk.sub$x7age<0, NA, eclsk.sub$x7age/12)
eclsk.sub$age_yrs8<-ifelse(eclsk.sub$x8age<0, NA, eclsk.sub$x8age/12)
eclsk.sub<- eclsk.sub[is.na(eclsk.sub$age_yrs1)==F, ]
#Height for age z score standardized by sex and age
eclsk.sub$height_z1<-ave(eclsk.sub$height1, as.factor(paste(round(eclsk.sub$age_yrs1, 1.5), eclsk.sub$male)), FUN=scale)
eclsk.sub$height_z2<-ave(eclsk.sub$height2, as.factor(paste(round(eclsk.sub$age_yrs2, 1.5), eclsk.sub$male)), FUN=scale)
#eclsk.sub$height_z3<-ave(eclsk.sub$height3, as.factor(paste(round(eclsk.sub$age_yrs3, 1.5), eclsk.sub$male)), FUN=scale)
eclsk.sub$height_z4<-ave(eclsk.sub$height4, as.factor(paste(round(eclsk.sub$age_yrs4, 1.5), eclsk.sub$male)), FUN=scale)
eclsk.sub$height_z6<-ave(eclsk.sub$height6, as.factor(paste(round(eclsk.sub$age_yrs6, 1.5), eclsk.sub$male)), FUN=scale)
eclsk.sub$height_z7<-ave(eclsk.sub$height7, as.factor(paste(round(eclsk.sub$age_yrs7, 1.5), eclsk.sub$male)), FUN=scale)
eclsk.sub$height_z8<-ave(eclsk.sub$height8, as.factor(paste(round(eclsk.sub$age_yrs8, 1.5), eclsk.sub$male)), FUN=scale)
#Child health assessment Excellent to poor , ordinal outcome
eclsk.sub$chhealth1<-ifelse(eclsk.sub$p1hscale<0, NA, eclsk.sub$p1hscale)
eclsk.sub$chhealth2<-ifelse(eclsk.sub$p2hscale<0, NA, eclsk.sub$p2hscale)
eclsk.sub$chhealth4<-ifelse(eclsk.sub$p4hscale<0, NA, eclsk.sub$p4hscale)
eclsk.sub$chhealth6<-ifelse(eclsk.sub$p6hscale<0, NA, eclsk.sub$p6hscale)
eclsk.sub$chhealth7<-ifelse(eclsk.sub$p7hscale<0, NA, eclsk.sub$p7hscale)
eclsk.sub$chhealth8<-ifelse(eclsk.sub$p8hscale<0, NA, eclsk.sub$p8hscale)
#SES
eclsk.sub$hhses1<-ifelse(eclsk.sub$x12sesl==-9, NA, scale(eclsk.sub$x12sesl))
eclsk.sub$hhses4<-ifelse(eclsk.sub$x4sesl_i==-9, NA, scale(eclsk.sub$x4sesl_i))
To analyze data longitudinally, we must reshape the data from its current “wide” format, where each repeated measure is a column, into the “long” format, where there is a single column for a particular variable, and we account for the repeated measurements of each person. In this case, I’m going to use three waves of data, so each child can contribute up to three lines to the data.
e.long<-reshape(as.data.frame(eclsk.sub), idvar="childid", varying=list(math = c("math1", "math2", "math4", "math6", "math7", "math8"),
age = c("age_yrs1", "age_yrs2", "age_yrs4","age_yrs6","age_yrs7","age_yrs8"),
height= c("height_z1", "height_z2", "height_z4","height_z6","height_z7","height_z8"),
health=c("chhealth1", "chhealth2", "chhealth4","chhealth6","chhealth7","chhealth8"),
ses = c("hhses1", "hhses1", "hhses4","hhses4","hhses4","hhses4")),
v.names = c("math", "age", "height", "health", "ses"),
timevar="wave",times=c(1,2,4,6,7,8),direction="long",
drop = names(eclsk.sub)[c(2:26, 28:31, 35:50)])
e.long<-e.long[order(e.long$childid, e.long$wave),]
head(e.long[c("childid" ,"wave", "age", "height", "health", "ses")], n=18)
#get non missing cases
e.long.comp<-e.long%>%
select(childid,wave, s1_id,w8cf8p_80,w8cf8p_8psu,w8cf8p_8str,math, male, hisp, black, asian,nahn, other,age, health, height, ses )%>%
filter(complete.cases(math, black, age,height, health, ses, w8cf8p_80,w8cf8p_8psu))%>%
mutate(badhealth= ifelse(health>2, 1,0), agez=as.numeric(scale(age)), agez2=as.numeric(scale(age^2)), psu=w8cf8p_8psu, strata=w8cf8p_8str, weight= w8cf8p_80/mean(w8cf8p_80))
The basic form of the longitudinal model is:
\[y_{it} = \beta_{0i}+\sum_k \beta_k x_{ik}+e_{it}\] \[\beta_{0i} = \beta_0 +u_i\]
\[u_i \sim Normal(0, \sigma_u)\] Which is just a random intercept model for each individual person’s life. We can see examples of this in the following plots of the first few students in the ECLS-K 2011, which highlight the individual level trajectories and starting points that each child has. We can see that the individual level variation is quite dramatic in some of the outcomes.
library(ggplot2)
ggplot(e.long.comp[1:52,], aes(y=math,x= age, color=childid))+geom_point()+geom_smooth(method = "lm")+ggtitle("Math Score over Age by child")
ggplot(e.long.comp[1:52,], aes(y=math, x=age, color=childid))+geom_point()+geom_smooth(method = "lm")+facet_wrap(~childid)+ggtitle("Math Score over Age by child")
ggplot(e.long.comp[1:52,], aes(y=height,x= age, color=childid))+geom_point()+geom_smooth(method = "lm")+ggtitle("Height over Age by child")
ggplot(e.long.comp[1:52,], aes(y=height,x= age, color=childid))+geom_point()+geom_smooth(method = "lm")+facet_wrap(~childid)+ggtitle("Height over Age by child")
In addition to the random intercept model, we can also include terms that allow for individual level variation in trajectory of change. Adding a random slope for age allows each child to change at a different rate, \(\gamma_i\):
\[y_{it} = \beta_{0i}+\sum_k \beta_k x_{ik}+\gamma_i*Age_{it}+e_{it}\] \[\beta_{0i} = \beta_0 +u_{1i}\] \[\gamma_{i} = \gamma +u_{2i}\]
\[u \sim \text{MVN}(0, \Sigma_u)\]
fit.2<-lmer(scale(math)~agez+male+black+hisp+asian+nahn+other+(1|childid)+(1|strata/psu),weights = weight, data=e.long.comp)
summary(fit.2)
## Linear mixed model fit by REML ['lmerMod']
## Formula:
## scale(math) ~ agez + male + black + hisp + asian + nahn + other +
## (1 | childid) + (1 | strata/psu)
## Data: e.long.comp
## Weights: weight
##
## REML criterion at convergence: 14224.2
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.6955 -0.5425 0.0048 0.5628 4.6492
##
## Random effects:
## Groups Name Variance Std.Dev.
## childid (Intercept) 0.136698 0.36973
## psu:strata (Intercept) 0.003873 0.06224
## strata (Intercept) 0.005230 0.07232
## Residual 0.085891 0.29307
## Number of obs: 13281, groups: childid, 2345; psu:strata, 123; strata, 28
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.100948 0.023287 4.335
## agez 0.874439 0.002604 335.854
## male 0.032892 0.016846 1.952
## black -0.354049 0.034106 -10.381
## hisp -0.247260 0.023403 -10.566
## asian 0.121723 0.039199 3.105
## nahn -0.104625 0.069332 -1.509
## other -0.044849 0.042340 -1.059
##
## Correlation of Fixed Effects:
## (Intr) agez male black hisp asian nahn
## agez 0.000
## male -0.372 -0.006
## black -0.232 0.005 0.020
## hisp -0.422 0.004 -0.001 0.228
## asian -0.229 0.001 0.041 0.112 0.222
## nahn -0.100 0.000 0.029 0.046 0.093 0.049
## other -0.162 0.001 -0.032 0.112 0.185 0.118 0.070
fit.3<-lmer(scale(math)~agez+male+black+hisp+asian+nahn+other+(agez|childid)+(1|strata/psu),weights = weight, data=e.long.comp)
summary(fit.3)
## Linear mixed model fit by REML ['lmerMod']
## Formula:
## scale(math) ~ agez + male + black + hisp + asian + nahn + other +
## (agez | childid) + (1 | strata/psu)
## Data: e.long.comp
## Weights: weight
##
## REML criterion at convergence: 13809.4
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.3029 -0.5657 -0.0089 0.5568 4.8518
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## childid (Intercept) 0.137990 0.37147
## agez 0.005719 0.07562 0.64
## psu:strata (Intercept) 0.004276 0.06539
## strata (Intercept) 0.005480 0.07403
## Residual 0.079115 0.28127
## Number of obs: 13281, groups: childid, 2345; psu:strata, 123; strata, 28
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.115993 0.023138 5.013
## agez 0.876848 0.003109 282.079
## male -0.001088 0.015961 -0.068
## black -0.280332 0.031717 -8.838
## hisp -0.239736 0.022529 -10.641
## asian 0.098467 0.037737 2.609
## nahn -0.088216 0.065331 -1.350
## other -0.042970 0.040386 -1.064
##
## Correlation of Fixed Effects:
## (Intr) agez male black hisp asian nahn
## agez 0.111
## male -0.352 -0.011
## black -0.219 0.039 0.016
## hisp -0.407 -0.021 -0.004 0.226
## asian -0.220 -0.016 0.039 0.110 0.218
## nahn -0.096 -0.001 0.030 0.044 0.092 0.048
## other -0.155 -0.010 -0.034 0.113 0.183 0.117 0.072
anova(fit.3, fit.2)
## refitting model(s) with ML (instead of REML)
fit.4<-lmer(scale(math)~agez+male+black+hisp+asian+nahn+other+agez2+(agez+agez2|childid)+(1|strata/psu),weights = weight, data=e.long.comp)
summary(fit.4)
## Linear mixed model fit by REML ['lmerMod']
## Formula:
## scale(math) ~ agez + male + black + hisp + asian + nahn + other +
## agez2 + (agez + agez2 | childid) + (1 | strata/psu)
## Data: e.long.comp
## Weights: weight
##
## REML criterion at convergence: 6060.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.9514 -0.4609 0.0074 0.4674 4.9284
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## childid (Intercept) 0.154889 0.39356
## agez 0.863226 0.92910 0.69
## agez2 0.790166 0.88891 -0.70 -0.99
## psu:strata (Intercept) 0.004044 0.06359
## strata (Intercept) 0.005503 0.07418
## Residual 0.029693 0.17232
## Number of obs: 13281, groups: childid, 2345; psu:strata, 123; strata, 28
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.05839 0.02265 2.578
## agez 2.96397 0.02825 104.914
## male 0.01902 0.01512 1.258
## black -0.23412 0.03034 -7.716
## hisp -0.18659 0.02139 -8.722
## asian 0.11744 0.03549 3.309
## nahn -0.03470 0.06152 -0.564
## other -0.01724 0.03808 -0.453
## agez2 -2.09450 0.02762 -75.826
##
## Correlation of Fixed Effects:
## (Intr) agez male black hisp asian nahn other
## agez 0.164
## male -0.342 -0.008
## black -0.208 0.035 0.017
## hisp -0.398 -0.027 -0.003 0.225
## asian -0.220 -0.024 0.042 0.110 0.224
## nahn -0.094 -0.001 0.030 0.045 0.093 0.049
## other -0.153 -0.011 -0.033 0.112 0.185 0.119 0.073
## agez2 -0.162 -0.994 0.006 -0.034 0.029 0.024 0.001 0.012
anova(fit.4, fit.3)
## refitting model(s) with ML (instead of REML)
fit.5<-lmer(scale(math)~agez*(male+black+hisp+asian+nahn+other)+(agez|childid)+(1|strata/psu),weights = weight, data=e.long.comp)
summary(fit.5)
## Linear mixed model fit by REML ['lmerMod']
## Formula:
## scale(math) ~ agez * (male + black + hisp + asian + nahn + other) +
## (agez | childid) + (1 | strata/psu)
## Data: e.long.comp
## Weights: weight
##
## REML criterion at convergence: 13757.9
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.3356 -0.5662 -0.0092 0.5548 4.8971
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## childid (Intercept) 0.137496 0.37080
## agez 0.004886 0.06990 0.66
## psu:strata (Intercept) 0.004273 0.06537
## strata (Intercept) 0.005452 0.07384
## Residual 0.079095 0.28124
## Number of obs: 13281, groups: childid, 2345; psu:strata, 123; strata, 28
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.1068890 0.0234607 4.556
## agez 0.8652905 0.0052601 164.502
## male 0.0326502 0.0167954 1.944
## black -0.3569151 0.0340234 -10.490
## hisp -0.2525845 0.0231976 -10.888
## asian 0.1164710 0.0388830 2.995
## nahn -0.0897392 0.0688898 -1.303
## other -0.0455110 0.0421553 -1.080
## agez:male 0.0406356 0.0060440 6.723
## agez:black -0.0654812 0.0101132 -6.475
## agez:hisp -0.0158942 0.0072445 -2.194
## agez:asian 0.0318312 0.0144761 2.199
## agez:nahn 0.0004592 0.0210528 0.022
## agez:other -0.0016056 0.0158740 -0.101
##
## Correlation matrix not shown by default, as p = 14 > 12.
## Use print(x, correlation=TRUE) or
## vcov(x) if you need it
model.sel( fit.2, fit.3, fit.4, fit.5)
We can likewise do a longitudinal analysis for binomial or count outcomes. These rely on GLMM’s to do the lifting. Here is an example of analyzing individual level poor health over age for the children in the ECLS-K 2011
binomial_smooth <- function(...) {
geom_smooth(method = "glm", method.args = list(family = "binomial"), ...)
}
ggplot(e.long.comp[1:52,], aes(y=badhealth, x=age))+geom_point()+binomial_smooth()+facet_wrap(~childid)+ggtitle("Poor Health over Age by child")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
fit1.b<-glmer(badhealth~agez+male+black+hisp+asian+nahn+other+agez2+(1|childid)+(1|strata),weights = weight, data=e.long.comp, family = binomial, control = glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=2e5)))
## Warning in eval(family$initialize, rho): non-integer #successes in a
## binomial glm!
summary(fit1.b)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: badhealth ~ agez + male + black + hisp + asian + nahn + other +
## agez2 + (1 | childid) + (1 | strata)
## Data: e.long.comp
## Weights: weight
## Control:
## glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## AIC BIC logLik deviance df.resid
## 8007.0 8089.5 -3992.5 7985.0 13270
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.7525 -0.2492 -0.1667 -0.1092 5.0053
##
## Random effects:
## Groups Name Variance Std.Dev.
## childid (Intercept) 2.707 1.645
## strata (Intercept) 0.000 0.000
## Number of obs: 13281, groups: childid, 2345; strata, 28
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.6388 0.1211 -30.051 < 2e-16 ***
## agez 0.7347 0.3559 2.064 0.03897 *
## male 0.3267 0.1127 2.899 0.00375 **
## black 0.9419 0.1943 4.848 1.25e-06 ***
## hisp 1.7327 0.1307 13.257 < 2e-16 ***
## asian 1.0871 0.2599 4.183 2.88e-05 ***
## nahn 0.7810 0.3964 1.970 0.04881 *
## other 0.3046 0.3024 1.007 0.31374
## agez2 -0.6054 0.3531 -1.715 0.08639 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) agez male black hisp asian nahn other
## agez -0.024
## male -0.527 0.004
## black -0.347 0.002 0.015
## hisp -0.564 0.007 0.000 0.298
## asian -0.276 -0.002 0.050 0.145 0.221
## nahn -0.171 0.000 0.020 0.095 0.143 0.071
## other -0.178 0.001 -0.034 0.121 0.181 0.089 0.059
## agez2 0.020 -0.996 -0.004 0.000 -0.004 0.003 0.000 -0.001
fit2.b<-glmer(badhealth~agez+male+black+hisp+asian+nahn+other+agez2+(agez|childid)+(1|strata),weights = weight, data=e.long.comp, family = binomial, control = glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=2e5)))
## Warning in eval(family$initialize, rho): non-integer #successes in a
## binomial glm!
summary(fit2.b)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: badhealth ~ agez + male + black + hisp + asian + nahn + other +
## agez2 + (agez | childid) + (1 | strata)
## Data: e.long.comp
## Weights: weight
## Control:
## glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
##
## AIC BIC logLik deviance df.resid
## 7864.3 7961.7 -3919.1 7838.3 13268
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.9592 -0.2219 -0.1434 -0.0913 4.4059
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## childid (Intercept) 3.2760 1.8100
## agez 0.6319 0.7949 0.02
## strata (Intercept) 0.0000 0.0000
## Number of obs: 13281, groups: childid, 2345; strata, 28
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.0062 0.1373 -29.170 < 2e-16 ***
## agez 2.2733 0.4488 5.065 4.09e-07 ***
## male 0.3398 0.1210 2.809 0.00497 **
## black 0.9909 0.2118 4.679 2.88e-06 ***
## hisp 1.8464 0.1407 13.126 < 2e-16 ***
## asian 1.1280 0.2760 4.088 4.36e-05 ***
## nahn 0.8106 0.4287 1.891 0.05863 .
## other 0.3238 0.3186 1.016 0.30948
## agez2 -2.1606 0.4409 -4.900 9.59e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) agez male black hisp asian nahn other
## agez -0.149
## male -0.505 0.019
## black -0.331 0.021 0.023
## hisp -0.553 0.027 0.001 0.286
## asian -0.266 0.004 0.050 0.142 0.220
## nahn -0.164 0.009 0.024 0.095 0.138 0.070
## other -0.169 0.004 -0.035 0.118 0.180 0.090 0.058
## agez2 0.138 -0.987 -0.010 -0.009 -0.026 -0.003 -0.002 -0.004
anova(fit1.b, fit2.b)
Up until now, we have used (G)LMM’s to analyze data that were “clustered”
The next topic will introduce a modeling strategy that allows us to consider clustered data, but in a different fashion
GLMM’s are commonly referred to as conditional models, because the model coefficients “\(\beta\)’s” are condition on the random effects in the model.
Likewise, the mean if conditional on the random effects. This is another way of saying that the mean for a given covariate pattern is conditional on the group that the particular person is in.
\(\mu_{ij}^c = E(Y_{ij} | u_j) = X_{ij}\beta + u_j\)
In contrast, Generalzed Estimating Equations are referred to as marginal models because they only estimate the overall mean.
\(\mu_{ij} = X_{ij}\beta\)
Lee and Nelder, 2004 provide a very good description of how these two methods compare to one another
A basic form of the model would be:
\(Y_{ij} = \beta_0 + \sum_k X_{ijk} \beta_k + CORR + error\)
Ordinary models will tend to over estimate the standard errors for the \(\beta\)’s for time varying predictors in a model with repeated observations, because these models do not account for the correlation within clusters observations over time.
Likewise, the standard errors of time invariant predictors will be under estimated
Given the mean function for the model and a specified correlation function, the model parameters may be estimated by finding the solution for:
\[U(\beta) = \sum_i ^n \frac{\delta \mu_{ij}}{ \delta \beta_k} V_i^{-1} (Y_{ij} - \mu(\beta))\]
Which gives estimates of the \(\beta\)’s for the linear mean function.
For three time points per person, the ordinary regression model correlation in residuals within clusters/persons over time can be thought of as the matrix:
\[\begin{bmatrix} \sigma^2 & 0 & 0 \\ 0 & \sigma^2 &0 \\ 0 & 0 & \sigma^2 \end{bmatrix}\]
which assumed the variances are constant and the residuals are independent over time
But in a GEE, the model include the actual correlation between measurements over time:
\[\begin{bmatrix} \sigma_1 ^2 & a & c \\ a & \sigma_2 ^2 &b \\ b & c & \sigma_3 ^2 \end{bmatrix}\]
Which allows the variances over time to be different, as well as correlations between times to be present.
\[\begin{bmatrix} 1 & 0 & 0 \\ 0 & 1 &0 \\ 0 & 0 & 1 \end{bmatrix}\]
\[\begin{bmatrix} 1 & \rho & \rho \\ \rho & 1 &\rho \\ \rho &\rho & 1 \end{bmatrix}\]
\[\begin{bmatrix} 1 & \rho & \rho^2 \\ \rho & 1 &\rho\\ \rho^2 & \rho & 1 \end{bmatrix}\]
\[\begin{bmatrix} 1 & \rho_1 & \rho_2 \\ \rho_1 & 1 &\rho_3 \\ \rho_2 & \rho_3& 1 \end{bmatrix}\]
fit.1<-glm(scale(math)~agez+agez2+factor(wave)+male+black+hisp+asian+nahn+other+ses, data=e.long.comp, weights=w8cf8p_80/mean(w8cf8p_80))
summary(fit.1)
##
## Call:
## glm(formula = scale(math) ~ agez + agez2 + factor(wave) + male +
## black + hisp + asian + nahn + other + ses, data = e.long.comp,
## weights = w8cf8p_80/mean(w8cf8p_80))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.98248 -0.21480 0.01056 0.22237 1.87471
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.881877 0.024131 -36.546 < 2e-16 ***
## agez 0.940583 0.080024 11.754 < 2e-16 ***
## agez2 -0.647273 0.078924 -8.201 2.60e-16 ***
## factor(wave)2 0.297253 0.014702 20.218 < 2e-16 ***
## factor(wave)4 0.853834 0.023541 36.270 < 2e-16 ***
## factor(wave)6 1.202288 0.031437 38.245 < 2e-16 ***
## factor(wave)7 1.473849 0.038141 38.642 < 2e-16 ***
## factor(wave)8 1.620513 0.046617 34.762 < 2e-16 ***
## male 0.053364 0.007144 7.470 8.55e-14 ***
## black -0.311969 0.011278 -27.662 < 2e-16 ***
## hisp -0.156561 0.009517 -16.451 < 2e-16 ***
## asian 0.081354 0.017689 4.599 4.28e-06 ***
## nahn -0.067882 0.023669 -2.868 0.00414 **
## other -0.089433 0.019601 -4.563 5.10e-06 ***
## ses 0.153098 0.004482 34.159 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.167762)
##
## Null deviance: 13091.5 on 13280 degrees of freedom
## Residual deviance: 2225.5 on 13266 degrees of freedom
## AIC: 18207
##
## Number of Fisher Scoring iterations: 2
#Get residuals and put them in a data frame
e.long.comp$resid<- residuals(fit.1)
resid<- data.frame(r1 = e.long.comp$resid[e.long$wave==1], r2 = e.long.comp$resid[e.long$wave==2], r4 = e.long.comp$resid[e.long$wave==4])
Here is our actual correlation matrix in the residuals between waves:
cor(resid, use="pairwise.complete")
## r1 r2 r4
## r1 1.0000000 0.6963999 0.5294997
## r2 0.6963999 1.0000000 0.6998337
## r4 0.5294997 0.6998337 1.0000000
This is certainly not independence, and looks more like an AR(1), because the correlation decreases as the difference between wave number increases.
Now we fit a GEE:
#Model with independent correlation, meaning ZERO correlation
fit.1b<-geeglm(scale(math)~scale(age)+male+black+hisp+asian+nahn+other+ses,id=childid , wave = wave, corstr ="independence",
data=e.long.comp, weights=w8cf8p_80/mean(w8cf8p_80))
summary(fit.1b)
##
## Call:
## geeglm(formula = scale(math) ~ scale(age) + male + black + hisp +
## asian + nahn + other + ses, data = e.long.comp, weights = w8cf8p_80/mean(w8cf8p_80),
## id = childid, waves = wave, corstr = "independence")
##
## Coefficients:
## Estimate Std.err Wald Pr(>|W|)
## (Intercept) 0.025406 0.017224 2.176 0.14019
## scale(age) 0.855391 0.004036 44918.647 < 2e-16 ***
## male 0.035205 0.019698 3.194 0.07389 .
## black -0.297930 0.035274 71.337 < 2e-16 ***
## hisp -0.129440 0.026508 23.844 1.04e-06 ***
## asian 0.127519 0.039338 10.508 0.00119 **
## nahn -0.088015 0.078716 1.250 0.26351
## other -0.082174 0.051367 2.559 0.10966
## ses 0.150448 0.012200 152.071 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Estimated Scale Parameters:
## Estimate Std.err
## (Intercept) 0.2155 0.005723
##
## Correlation: Structure = independenceNumber of clusters: 2345 Maximum cluster size: 6
#Model with Exchangeable correlation:
fit.2<-geeglm(scale(math)~scale(age)+male+black+hisp+asian+nahn+other+ses,id=childid , wave = wave, corstr ="exchangeable",
data=e.long.comp, weights=w8cf8p_80/mean(w8cf8p_80))
summary(fit.2)
##
## Call:
## geeglm(formula = scale(math) ~ scale(age) + male + black + hisp +
## asian + nahn + other + ses, data = e.long.comp, weights = w8cf8p_80/mean(w8cf8p_80),
## id = childid, waves = wave, corstr = "exchangeable")
##
## Coefficients:
## Estimate Std.err Wald Pr(>|W|)
## (Intercept) 0.03054 0.01733 3.11 0.07802 .
## scale(age) 0.87711 0.00335 68349.70 < 2e-16 ***
## male 0.03465 0.01988 3.04 0.08135 .
## black -0.31817 0.03527 81.37 < 2e-16 ***
## hisp -0.15858 0.02615 36.78 1.3e-09 ***
## asian 0.14793 0.03927 14.19 0.00017 ***
## nahn -0.12045 0.08185 2.17 0.14113
## other -0.07537 0.05185 2.11 0.14609
## ses 0.11363 0.01262 81.01 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Estimated Scale Parameters:
## Estimate Std.err
## (Intercept) 0.217 0.00578
##
## Correlation: Structure = exchangeable Link = identity
##
## Estimated Correlation Parameters:
## Estimate Std.err
## alpha 0.561 0.0201
## Number of clusters: 2345 Maximum cluster size: 6
The second model shows the exchangeable correlation to be 0.561, which is not very different from our measured correlations above 1, 0.696, 0.529, 0.696, 1, 0.7, 0.529, 0.7, 1
Now we examine the other correlation types:
fit.3<-geeglm(scale(math)~scale(age)+male+black+hisp+asian+nahn+other+ses, id=childid , wave = wave, corstr ="ar1", data=e.long.comp, weights=w8cf8p_80/mean(w8cf8p_80))
summary(fit.3)
##
## Call:
## geeglm(formula = scale(math) ~ scale(age) + male + black + hisp +
## asian + nahn + other + ses, data = e.long.comp, weights = w8cf8p_80/mean(w8cf8p_80),
## id = childid, waves = wave, corstr = "ar1")
##
## Coefficients:
## Estimate Std.err Wald Pr(>|W|)
## (Intercept) -0.09177 0.01640 31.32 2.2e-08 ***
## scale(age) 0.84794 0.00331 65494.98 < 2e-16 ***
## male 0.03848 0.01865 4.26 0.03905 *
## black -0.28557 0.03319 74.02 < 2e-16 ***
## hisp -0.14373 0.02463 34.07 5.3e-09 ***
## asian 0.14904 0.03929 14.39 0.00015 ***
## nahn -0.09752 0.06862 2.02 0.15527
## other -0.06954 0.04695 2.19 0.13853
## ses 0.10747 0.01216 78.07 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Estimated Scale Parameters:
## Estimate Std.err
## (Intercept) 0.23 0.00578
##
## Correlation: Structure = ar1 Link = identity
##
## Estimated Correlation Parameters:
## Estimate Std.err
## alpha 0.779 0.0121
## Number of clusters: 2345 Maximum cluster size: 6
Since GEE’s aren’t fit via maximum likelihood, they aren’t comparable in terms of AIC or likelihood ratio tests. However, Pan, 2001 describe an information criterion using a Quasi-likelihood formulation. This can be used to compare models with alternative correlation structures, with the lowest QIC representing the best fitting model.
library(MESS) #need to install
## Loading required package: geeM
##
## Attaching package: 'MESS'
## The following object is masked from 'package:MuMIn':
##
## QIC
QIC(fit.1b) #independence
## QIC QICu Quasi Lik CIC params QICC
## 3081.3 3001.1 -1491.6 49.1 9.0 3081.3
QIC(fit.2) #exchangeable
## QIC QICu Quasi Lik CIC params QICC
## 3036.2 3023.0 -1502.5 15.6 9.0 3036.3
QIC(fit.3) #ar1
## QIC QICu Quasi Lik CIC params QICC
## 3221.2 3214.9 -1598.4 12.1 9.0 3221.2
So, it looks like the exchangeable correlation structure is certainly better than independence or AR(1).
Here we use the GEE for a binomial outcome.
#basic linear model
fitb.1<-geeglm(I(health>2)~age+male+black+hisp+asian+nahn+other+ses, waves = wave,id=childid ,corstr ="independence", family=binomial, data=e.long.comp, weights=w8cf8p_80/mean(w8cf8p_80))
## Warning in eval(family$initialize): non-integer #successes in a binomial
## glm!
summary(fitb.1)
##
## Call:
## geeglm(formula = I(health > 2) ~ age + male + black + hisp +
## asian + nahn + other + ses, family = binomial, data = e.long.comp,
## weights = w8cf8p_80/mean(w8cf8p_80), id = childid, waves = wave,
## corstr = "independence")
##
## Coefficients:
## Estimate Std.err Wald Pr(>|W|)
## (Intercept) -2.6215 0.1986 174.26 < 2e-16 ***
## age 0.0391 0.0198 3.92 0.048 *
## male 0.1975 0.1038 3.62 0.057 .
## black 0.4119 0.1860 4.90 0.027 *
## hisp 0.6885 0.1266 29.58 5.4e-08 ***
## asian 1.1035 0.2634 17.56 2.8e-05 ***
## nahn 0.4342 0.3460 1.57 0.210
## other 0.2643 0.2594 1.04 0.308
## ses -0.5524 0.0641 74.37 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Estimated Scale Parameters:
## Estimate Std.err
## (Intercept) 1 0.129
##
## Correlation: Structure = independenceNumber of clusters: 2345 Maximum cluster size: 6
fitb.2<-geeglm(I(health>2)~age+male+black+hisp+asian+nahn+other+ses, waves = wave,id=childid, family=binomial, data=e.long.comp, corstr="exchangeable", weights=w8cf8p_80/mean(w8cf8p_80))
## Warning in eval(family$initialize): non-integer #successes in a binomial
## glm!
summary(fitb.2)
##
## Call:
## geeglm(formula = I(health > 2) ~ age + male + black + hisp +
## asian + nahn + other + ses, family = binomial, data = e.long.comp,
## weights = w8cf8p_80/mean(w8cf8p_80), id = childid, waves = wave,
## corstr = "exchangeable")
##
## Coefficients:
## Estimate Std.err Wald Pr(>|W|)
## (Intercept) -2.6730 0.1910 195.81 < 2e-16 ***
## age 0.0449 0.0189 5.66 0.017 *
## male 0.2090 0.1030 4.12 0.042 *
## black 0.4449 0.1828 5.92 0.015 *
## hisp 0.7417 0.1249 35.24 2.9e-09 ***
## asian 1.0626 0.2562 17.20 3.4e-05 ***
## nahn 0.4521 0.3695 1.50 0.221
## other 0.2322 0.2671 0.76 0.385
## ses -0.4837 0.0650 55.32 1.0e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Estimated Scale Parameters:
## Estimate Std.err
## (Intercept) 0.991 0.116
##
## Correlation: Structure = exchangeable Link = identity
##
## Estimated Correlation Parameters:
## Estimate Std.err
## alpha 0.309 0.0615
## Number of clusters: 2345 Maximum cluster size: 6
fitb.3<-geeglm(I(health>2)~age+male+black+hisp+asian+nahn+other+ses, waves = wave,id=childid, family=binomial, data=e.long.comp, corstr="ar1", weights=w8cf8p_80/mean(w8cf8p_80))
## Warning in eval(family$initialize): non-integer #successes in a binomial
## glm!
summary(fitb.3)
##
## Call:
## geeglm(formula = I(health > 2) ~ age + male + black + hisp +
## asian + nahn + other + ses, family = binomial, data = e.long.comp,
## weights = w8cf8p_80/mean(w8cf8p_80), id = childid, waves = wave,
## corstr = "ar1")
##
## Coefficients:
## Estimate Std.err Wald Pr(>|W|)
## (Intercept) -2.6930 0.1972 186.47 < 2e-16 ***
## age 0.0395 0.0196 4.06 0.0440 *
## male 0.2441 0.1042 5.49 0.0191 *
## black 0.4861 0.1841 6.98 0.0083 **
## hisp 0.7661 0.1244 37.96 7.2e-10 ***
## asian 1.1146 0.2755 16.37 5.2e-05 ***
## nahn 0.3832 0.3502 1.20 0.2739
## other 0.3203 0.2638 1.47 0.2247
## ses -0.5551 0.0649 73.14 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Estimated Scale Parameters:
## Estimate Std.err
## (Intercept) 1.02 0.143
##
## Correlation: Structure = ar1 Link = identity
##
## Estimated Correlation Parameters:
## Estimate Std.err
## alpha 0.523 0.076
## Number of clusters: 2345 Maximum cluster size: 6
QIC(fitb.1)
## QIC QICu Quasi Lik CIC params QICC
## 10525 10471 -5226 36 9 10525
QIC(fitb.2)
## QIC QICu Quasi Lik CIC params QICC
## 10493.7 10478.1 -5230.0 16.8 9.0 10493.8
QIC(fitb.3)
## QIC QICu Quasi Lik CIC params QICC
## 10481.9 10466.9 -5224.4 16.5 9.0 10481.9