In this example, we will use hierarchical models to do some longitudinal modeling of data from the ECLS-K 2011. Specifically, we will model changes in a student’s standardized math score from fall kindergarten to spring, 1st grade.

Longitudinal data are collected in waves, representing the sample at different time points For example, currently, the ECLS-K 2011 has eight waves, intending to capture the children as they progress through school, and will eventually have more. We really want to measure similar items at each time point so we can understand what makes people change over their lives. When it comes to data we need to introduce a different data structure to accommodate this.

In this example, I will focus on the use of the linear mixed model for a continuous outcome and a binomial model for a binary outcome.

We start simple then move into growth curve models. This follows the presentation in Singer and Willett (2003) Chapters 3-6.

Libraries

library (car)

## Loading required package: carData

library(geepack)
library(MuMIn)
library(haven)
library(lme4)

## Loading required package: Matrix

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:car':
## 
##     recode

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Data and recodes

First we load our data

eclsk11<-read_sas("~/Google Drive/classes/dem7473/data/childk4p.sas7bdat")
names(eclsk11)<-tolower(names(eclsk11))


myvars<-c("childid", "x_chsex_r", "x1locale", "x_raceth_r", "x2povty", "x12par1ed_i","p1curmar", "x1htotal",
          "x1mscalk4", "x2mscalk4", "x3mscalk4", "x4mscalk4","x5mscalk4","x6mscalk4","x7mscalk4","x8mscalk4",
          "p1hscale", "p2hscale", "p4hscale","p6hscale","p7hscale","p8hscale",
           "x12sesl","x4sesl_i", "p2parct1", "p2parct2", "s1_id", "p2safepl","x2krceth","p1o2near", "x_distpov" ,
          "w8cf8p_80","w8cf8p_8psu","w8cf8p_8str", "x1height","x2height","x3height","x4height","x5height","x6height","x7height","x8height",
        "x1kage_r","x2kage_r","x3age","x4age", "x5age", "x6age", "x7age", "x8age")

#subset the data
eclsk.sub<-eclsk11[,myvars]

rm(eclsk11); gc()

##           used (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells 1864793 99.6    3644527  194.7   3644527  194.7
## Vcells 4173583 31.9  681453772 5199.1 783653138 5978.8

Next, I do some recoding of variables. I have to make the repeated measures of each of my longitudinal variables.

#First we recode some Child characteristics
#Child's sex: recode as male =1
eclsk.sub$male<-Recode(eclsk.sub$x_chsex_r, recodes="1=1; 2=0; -9=NA")

#Recode race with white, non Hispanic as reference using dummy vars
eclsk.sub$hisp<-Recode(eclsk.sub$x_raceth_r, recodes="3:4=1;-9=NA; else=0")
eclsk.sub$black<-Recode(eclsk.sub$x_raceth_r, recodes="2=1;-9=NA; else=0")
eclsk.sub$asian<-Recode(eclsk.sub$x_raceth_r, recodes="5=1;-9=NA; else=0")
eclsk.sub$nahn<-Recode(eclsk.sub$x_raceth_r, recodes="6:7=1;-9=NA; else=0")
eclsk.sub$other<-Recode(eclsk.sub$x_raceth_r, recodes="8=1;-9=NA; else=0")


#Longitudinal variables
#recode our outcomes, the  first is the child's math standardized test score  in Kindergarten
eclsk.sub$math1<-ifelse(eclsk.sub$x1mscalk4<0, NA, eclsk.sub$x1mscalk4)
eclsk.sub$math2<-ifelse(eclsk.sub$x2mscalk4<0, NA, eclsk.sub$x2mscalk4)
eclsk.sub$math4<-ifelse(eclsk.sub$x4mscalk4<0, NA, eclsk.sub$x4mscalk4)
eclsk.sub$math6<-ifelse(eclsk.sub$x6mscalk4<0, NA, eclsk.sub$x6mscalk4)
eclsk.sub$math7<-ifelse(eclsk.sub$x7mscalk4<0, NA, eclsk.sub$x7mscalk4)
eclsk.sub$math8<-ifelse(eclsk.sub$x8mscalk4<0, NA, eclsk.sub$x8mscalk4)

#Second outcome is child's height for age, continuous outcome
eclsk.sub$height1<-ifelse(eclsk.sub$x1height<=-7, NA, eclsk.sub$x1height)
eclsk.sub$height2<-ifelse(eclsk.sub$x2height<=-7, NA, eclsk.sub$x2height)
eclsk.sub$height4<-ifelse(eclsk.sub$x4height<=-7, NA, eclsk.sub$x4height)
eclsk.sub$height6<-ifelse(eclsk.sub$x6height<=-7, NA, eclsk.sub$x6height)
eclsk.sub$height7<-ifelse(eclsk.sub$x7height<=-7, NA, eclsk.sub$x7height)
eclsk.sub$height8<-ifelse(eclsk.sub$x8height<=-7, NA, eclsk.sub$x8height)

#Age at each wave
eclsk.sub$age_yrs1<-ifelse(eclsk.sub$x1kage_r<0, NA, eclsk.sub$x1kage_r/12)
eclsk.sub$age_yrs2<-ifelse(eclsk.sub$x2kage_r<0, NA, eclsk.sub$x2kage_r/12)
#eclsk.sub$age_yrs3<-ifelse(eclsk.sub$x3age<0, NA, eclsk.sub$x3age/12)
eclsk.sub$age_yrs4<-ifelse(eclsk.sub$x4age<0, NA, eclsk.sub$x4age/12)
eclsk.sub$age_yrs6<-ifelse(eclsk.sub$x6age<0, NA, eclsk.sub$x6age/12)
eclsk.sub$age_yrs7<-ifelse(eclsk.sub$x7age<0, NA, eclsk.sub$x7age/12)
eclsk.sub$age_yrs8<-ifelse(eclsk.sub$x8age<0, NA, eclsk.sub$x8age/12)



eclsk.sub<- eclsk.sub[is.na(eclsk.sub$age_yrs1)==F, ]

#Height for age z score standardized by sex and age
eclsk.sub$height_z1<-ave(eclsk.sub$height1, as.factor(paste(round(eclsk.sub$age_yrs1, 1.5), eclsk.sub$male)), FUN=scale)
eclsk.sub$height_z2<-ave(eclsk.sub$height2, as.factor(paste(round(eclsk.sub$age_yrs2, 1.5), eclsk.sub$male)), FUN=scale)
#eclsk.sub$height_z3<-ave(eclsk.sub$height3, as.factor(paste(round(eclsk.sub$age_yrs3, 1.5), eclsk.sub$male)), FUN=scale)
eclsk.sub$height_z4<-ave(eclsk.sub$height4, as.factor(paste(round(eclsk.sub$age_yrs4, 1.5), eclsk.sub$male)), FUN=scale)
eclsk.sub$height_z6<-ave(eclsk.sub$height6, as.factor(paste(round(eclsk.sub$age_yrs6, 1.5), eclsk.sub$male)), FUN=scale)
eclsk.sub$height_z7<-ave(eclsk.sub$height7, as.factor(paste(round(eclsk.sub$age_yrs7, 1.5), eclsk.sub$male)), FUN=scale)
eclsk.sub$height_z8<-ave(eclsk.sub$height8, as.factor(paste(round(eclsk.sub$age_yrs8, 1.5), eclsk.sub$male)), FUN=scale)



#Child health assessment Excellent to poor , ordinal outcome
eclsk.sub$chhealth1<-ifelse(eclsk.sub$p1hscale<0, NA, eclsk.sub$p1hscale)
eclsk.sub$chhealth2<-ifelse(eclsk.sub$p2hscale<0, NA, eclsk.sub$p2hscale)
eclsk.sub$chhealth4<-ifelse(eclsk.sub$p4hscale<0, NA, eclsk.sub$p4hscale)
eclsk.sub$chhealth6<-ifelse(eclsk.sub$p6hscale<0, NA, eclsk.sub$p6hscale)
eclsk.sub$chhealth7<-ifelse(eclsk.sub$p7hscale<0, NA, eclsk.sub$p7hscale)
eclsk.sub$chhealth8<-ifelse(eclsk.sub$p8hscale<0, NA, eclsk.sub$p8hscale)

#SES
eclsk.sub$hhses1<-ifelse(eclsk.sub$x12sesl==-9, NA, scale(eclsk.sub$x12sesl))
eclsk.sub$hhses4<-ifelse(eclsk.sub$x4sesl_i==-9, NA, scale(eclsk.sub$x4sesl_i))

Reshaping data into longitudinal format

To analyze data longitudinally, we must reshape the data from its current “wide” format, where each repeated measure is a column, into the “long” format, where there is a single column for a particular variable, and we account for the repeated measurements of each person. In this case, I’m going to use three waves of data, so each child can contribute up to three lines to the data.

e.long<-reshape(as.data.frame(eclsk.sub), idvar="childid", varying=list(math = c("math1", "math2",  "math4", "math6", "math7", "math8"),
                                         age = c("age_yrs1", "age_yrs2",  "age_yrs4","age_yrs6","age_yrs7","age_yrs8"),
                                         height= c("height_z1", "height_z2", "height_z4","height_z6","height_z7","height_z8"),
                                         health=c("chhealth1", "chhealth2", "chhealth4","chhealth6","chhealth7","chhealth8"), 
                                         ses = c("hhses1", "hhses1", "hhses4","hhses4","hhses4","hhses4")),
                                        v.names = c("math", "age", "height", "health", "ses"),
                                         timevar="wave",times=c(1,2,4,6,7,8),direction="long",  
                                          drop = names(eclsk.sub)[c(2:26, 28:31, 35:50)])

e.long<-e.long[order(e.long$childid, e.long$wave),]
head(e.long[c("childid" ,"wave", "age", "height", "health", "ses")], n=18)

Filtering and mutating

#get non missing cases
e.long.comp<-e.long%>%
  select(childid,wave, s1_id,w8cf8p_80,w8cf8p_8psu,w8cf8p_8str,math, male, hisp, black, asian,nahn, other,age, health, height, ses )%>%
  filter(complete.cases(math, black, age,height, health, ses, w8cf8p_80,w8cf8p_8psu))%>%
  mutate(badhealth= ifelse(health>2, 1,0), agez=as.numeric(scale(age)), agez2=as.numeric(scale(age^2)), psu=w8cf8p_8psu, strata=w8cf8p_8str, weight= w8cf8p_80/mean(w8cf8p_80))

Hierarchical model for individual change - growth curves using (G)LMM’s

The basic form of the longitudinal model is:

\[y_{it} = \beta_{0i}+\sum_k \beta_k x_{ik}+e_{it}\] \[\beta_{0i} = \beta_0 +u_i\]

\[u_i \sim Normal(0, \sigma_u)\] Which is just a random intercept model for each individual person’s life. We can see examples of this in the following plots of the first few students in the ECLS-K 2011, which highlight the individual level trajectories and starting points that each child has. We can see that the individual level variation is quite dramatic in some of the outcomes.

library(ggplot2)
ggplot(e.long.comp[1:52,], aes(y=math,x= age, color=childid))+geom_point()+geom_smooth(method = "lm")+ggtitle("Math Score over Age by child")

ggplot(e.long.comp[1:52,], aes(y=math, x=age, color=childid))+geom_point()+geom_smooth(method = "lm")+facet_wrap(~childid)+ggtitle("Math Score over Age by child")

ggplot(e.long.comp[1:52,], aes(y=height,x= age, color=childid))+geom_point()+geom_smooth(method = "lm")+ggtitle("Height over Age by child")

ggplot(e.long.comp[1:52,], aes(y=height,x= age, color=childid))+geom_point()+geom_smooth(method = "lm")+facet_wrap(~childid)+ggtitle("Height over Age by child")

GLMM for individual growth

In addition to the random intercept model, we can also include terms that allow for individual level variation in trajectory of change. Adding a random slope for age allows each child to change at a different rate, \(\gamma_i\):

\[y_{it} = \beta_{0i}+\sum_k \beta_k x_{ik}+\gamma_i*Age_{it}+e_{it}\] \[\beta_{0i} = \beta_0 +u_{1i}\] \[\gamma_{i} = \gamma +u_{2i}\]

\[u \sim \text{MVN}(0, \Sigma_u)\]

Random intercept for each person

fit.2<-lmer(scale(math)~agez+male+black+hisp+asian+nahn+other+(1|childid)+(1|strata/psu),weights = weight, data=e.long.comp)
summary(fit.2)

## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## scale(math) ~ agez + male + black + hisp + asian + nahn + other +  
##     (1 | childid) + (1 | strata/psu)
##    Data: e.long.comp
## Weights: weight
## 
## REML criterion at convergence: 14224.2
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.6955 -0.5425  0.0048  0.5628  4.6492 
## 
## Random effects:
##  Groups     Name        Variance Std.Dev.
##  childid    (Intercept) 0.136698 0.36973 
##  psu:strata (Intercept) 0.003873 0.06224 
##  strata     (Intercept) 0.005230 0.07232 
##  Residual               0.085891 0.29307 
## Number of obs: 13281, groups:  childid, 2345; psu:strata, 123; strata, 28
## 
## Fixed effects:
##              Estimate Std. Error t value
## (Intercept)  0.100948   0.023287   4.335
## agez         0.874439   0.002604 335.854
## male         0.032892   0.016846   1.952
## black       -0.354049   0.034106 -10.381
## hisp        -0.247260   0.023403 -10.566
## asian        0.121723   0.039199   3.105
## nahn        -0.104625   0.069332  -1.509
## other       -0.044849   0.042340  -1.059
## 
## Correlation of Fixed Effects:
##       (Intr) agez   male   black  hisp   asian  nahn  
## agez   0.000                                          
## male  -0.372 -0.006                                   
## black -0.232  0.005  0.020                            
## hisp  -0.422  0.004 -0.001  0.228                     
## asian -0.229  0.001  0.041  0.112  0.222              
## nahn  -0.100  0.000  0.029  0.046  0.093  0.049       
## other -0.162  0.001 -0.032  0.112  0.185  0.118  0.070

individual trajectory model with random slope for time for each person

fit.3<-lmer(scale(math)~agez+male+black+hisp+asian+nahn+other+(agez|childid)+(1|strata/psu),weights = weight, data=e.long.comp)
summary(fit.3)

## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## scale(math) ~ agez + male + black + hisp + asian + nahn + other +  
##     (agez | childid) + (1 | strata/psu)
##    Data: e.long.comp
## Weights: weight
## 
## REML criterion at convergence: 13809.4
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.3029 -0.5657 -0.0089  0.5568  4.8518 
## 
## Random effects:
##  Groups     Name        Variance Std.Dev. Corr
##  childid    (Intercept) 0.137990 0.37147      
##             agez        0.005719 0.07562  0.64
##  psu:strata (Intercept) 0.004276 0.06539      
##  strata     (Intercept) 0.005480 0.07403      
##  Residual               0.079115 0.28127      
## Number of obs: 13281, groups:  childid, 2345; psu:strata, 123; strata, 28
## 
## Fixed effects:
##              Estimate Std. Error t value
## (Intercept)  0.115993   0.023138   5.013
## agez         0.876848   0.003109 282.079
## male        -0.001088   0.015961  -0.068
## black       -0.280332   0.031717  -8.838
## hisp        -0.239736   0.022529 -10.641
## asian        0.098467   0.037737   2.609
## nahn        -0.088216   0.065331  -1.350
## other       -0.042970   0.040386  -1.064
## 
## Correlation of Fixed Effects:
##       (Intr) agez   male   black  hisp   asian  nahn  
## agez   0.111                                          
## male  -0.352 -0.011                                   
## black -0.219  0.039  0.016                            
## hisp  -0.407 -0.021 -0.004  0.226                     
## asian -0.220 -0.016  0.039  0.110  0.218              
## nahn  -0.096 -0.001  0.030  0.044  0.092  0.048       
## other -0.155 -0.010 -0.034  0.113  0.183  0.117  0.072

anova(fit.3, fit.2)

## refitting model(s) with ML (instead of REML)

curvilinear trajectory model with random nonlinear time

fit.4<-lmer(scale(math)~agez+male+black+hisp+asian+nahn+other+agez2+(agez+agez2|childid)+(1|strata/psu),weights = weight, data=e.long.comp)
summary(fit.4)

## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## scale(math) ~ agez + male + black + hisp + asian + nahn + other +  
##     agez2 + (agez + agez2 | childid) + (1 | strata/psu)
##    Data: e.long.comp
## Weights: weight
## 
## REML criterion at convergence: 6060.6
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.9514 -0.4609  0.0074  0.4674  4.9284 
## 
## Random effects:
##  Groups     Name        Variance Std.Dev. Corr       
##  childid    (Intercept) 0.154889 0.39356             
##             agez        0.863226 0.92910   0.69      
##             agez2       0.790166 0.88891  -0.70 -0.99
##  psu:strata (Intercept) 0.004044 0.06359             
##  strata     (Intercept) 0.005503 0.07418             
##  Residual               0.029693 0.17232             
## Number of obs: 13281, groups:  childid, 2345; psu:strata, 123; strata, 28
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  0.05839    0.02265   2.578
## agez         2.96397    0.02825 104.914
## male         0.01902    0.01512   1.258
## black       -0.23412    0.03034  -7.716
## hisp        -0.18659    0.02139  -8.722
## asian        0.11744    0.03549   3.309
## nahn        -0.03470    0.06152  -0.564
## other       -0.01724    0.03808  -0.453
## agez2       -2.09450    0.02762 -75.826
## 
## Correlation of Fixed Effects:
##       (Intr) agez   male   black  hisp   asian  nahn   other 
## agez   0.164                                                 
## male  -0.342 -0.008                                          
## black -0.208  0.035  0.017                                   
## hisp  -0.398 -0.027 -0.003  0.225                            
## asian -0.220 -0.024  0.042  0.110  0.224                     
## nahn  -0.094 -0.001  0.030  0.045  0.093  0.049              
## other -0.153 -0.011 -0.033  0.112  0.185  0.119  0.073       
## agez2 -0.162 -0.994  0.006 -0.034  0.029  0.024  0.001  0.012

anova(fit.4, fit.3)

## refitting model(s) with ML (instead of REML)

individual trajectory model with fixed effects for race and different population trajectories for each race

fit.5<-lmer(scale(math)~agez*(male+black+hisp+asian+nahn+other)+(agez|childid)+(1|strata/psu),weights = weight, data=e.long.comp)
summary(fit.5)

## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## scale(math) ~ agez * (male + black + hisp + asian + nahn + other) +  
##     (agez | childid) + (1 | strata/psu)
##    Data: e.long.comp
## Weights: weight
## 
## REML criterion at convergence: 13757.9
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.3356 -0.5662 -0.0092  0.5548  4.8971 
## 
## Random effects:
##  Groups     Name        Variance Std.Dev. Corr
##  childid    (Intercept) 0.137496 0.37080      
##             agez        0.004886 0.06990  0.66
##  psu:strata (Intercept) 0.004273 0.06537      
##  strata     (Intercept) 0.005452 0.07384      
##  Residual               0.079095 0.28124      
## Number of obs: 13281, groups:  childid, 2345; psu:strata, 123; strata, 28
## 
## Fixed effects:
##               Estimate Std. Error t value
## (Intercept)  0.1068890  0.0234607   4.556
## agez         0.8652905  0.0052601 164.502
## male         0.0326502  0.0167954   1.944
## black       -0.3569151  0.0340234 -10.490
## hisp        -0.2525845  0.0231976 -10.888
## asian        0.1164710  0.0388830   2.995
## nahn        -0.0897392  0.0688898  -1.303
## other       -0.0455110  0.0421553  -1.080
## agez:male    0.0406356  0.0060440   6.723
## agez:black  -0.0654812  0.0101132  -6.475
## agez:hisp   -0.0158942  0.0072445  -2.194
## agez:asian   0.0318312  0.0144761   2.199
## agez:nahn    0.0004592  0.0210528   0.022
## agez:other  -0.0016056  0.0158740  -0.101

## 
## Correlation matrix not shown by default, as p = 14 > 12.
## Use print(x, correlation=TRUE)  or
##   vcov(x)     if you need it

Model comparisons

model.sel( fit.2, fit.3, fit.4, fit.5)

GLMM for change in a binary variable

We can likewise do a longitudinal analysis for binomial or count outcomes. These rely on GLMM’s to do the lifting. Here is an example of analyzing individual level poor health over age for the children in the ECLS-K 2011

binomial_smooth <- function(...) {
  geom_smooth(method = "glm", method.args = list(family = "binomial"), ...)
}
ggplot(e.long.comp[1:52,], aes(y=badhealth, x=age))+geom_point()+binomial_smooth()+facet_wrap(~childid)+ggtitle("Poor Health over Age by child")

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

fit1.b<-glmer(badhealth~agez+male+black+hisp+asian+nahn+other+agez2+(1|childid)+(1|strata),weights = weight, data=e.long.comp, family = binomial, control = glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=2e5)))

## Warning in eval(family$initialize, rho): non-integer #successes in a
## binomial glm!

summary(fit1.b)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: badhealth ~ agez + male + black + hisp + asian + nahn + other +  
##     agez2 + (1 | childid) + (1 | strata)
##    Data: e.long.comp
## Weights: weight
## Control: 
## glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
##      AIC      BIC   logLik deviance df.resid 
##   8007.0   8089.5  -3992.5   7985.0    13270 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.7525 -0.2492 -0.1667 -0.1092  5.0053 
## 
## Random effects:
##  Groups  Name        Variance Std.Dev.
##  childid (Intercept) 2.707    1.645   
##  strata  (Intercept) 0.000    0.000   
## Number of obs: 13281, groups:  childid, 2345; strata, 28
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -3.6388     0.1211 -30.051  < 2e-16 ***
## agez          0.7347     0.3559   2.064  0.03897 *  
## male          0.3267     0.1127   2.899  0.00375 ** 
## black         0.9419     0.1943   4.848 1.25e-06 ***
## hisp          1.7327     0.1307  13.257  < 2e-16 ***
## asian         1.0871     0.2599   4.183 2.88e-05 ***
## nahn          0.7810     0.3964   1.970  0.04881 *  
## other         0.3046     0.3024   1.007  0.31374    
## agez2        -0.6054     0.3531  -1.715  0.08639 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##       (Intr) agez   male   black  hisp   asian  nahn   other 
## agez  -0.024                                                 
## male  -0.527  0.004                                          
## black -0.347  0.002  0.015                                   
## hisp  -0.564  0.007  0.000  0.298                            
## asian -0.276 -0.002  0.050  0.145  0.221                     
## nahn  -0.171  0.000  0.020  0.095  0.143  0.071              
## other -0.178  0.001 -0.034  0.121  0.181  0.089  0.059       
## agez2  0.020 -0.996 -0.004  0.000 -0.004  0.003  0.000 -0.001

fit2.b<-glmer(badhealth~agez+male+black+hisp+asian+nahn+other+agez2+(agez|childid)+(1|strata),weights = weight, data=e.long.comp, family = binomial, control = glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=2e5)))

## Warning in eval(family$initialize, rho): non-integer #successes in a
## binomial glm!

summary(fit2.b)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: badhealth ~ agez + male + black + hisp + asian + nahn + other +  
##     agez2 + (agez | childid) + (1 | strata)
##    Data: e.long.comp
## Weights: weight
## Control: 
## glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05))
## 
##      AIC      BIC   logLik deviance df.resid 
##   7864.3   7961.7  -3919.1   7838.3    13268 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.9592 -0.2219 -0.1434 -0.0913  4.4059 
## 
## Random effects:
##  Groups  Name        Variance Std.Dev. Corr
##  childid (Intercept) 3.2760   1.8100       
##          agez        0.6319   0.7949   0.02
##  strata  (Intercept) 0.0000   0.0000       
## Number of obs: 13281, groups:  childid, 2345; strata, 28
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -4.0062     0.1373 -29.170  < 2e-16 ***
## agez          2.2733     0.4488   5.065 4.09e-07 ***
## male          0.3398     0.1210   2.809  0.00497 ** 
## black         0.9909     0.2118   4.679 2.88e-06 ***
## hisp          1.8464     0.1407  13.126  < 2e-16 ***
## asian         1.1280     0.2760   4.088 4.36e-05 ***
## nahn          0.8106     0.4287   1.891  0.05863 .  
## other         0.3238     0.3186   1.016  0.30948    
## agez2        -2.1606     0.4409  -4.900 9.59e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##       (Intr) agez   male   black  hisp   asian  nahn   other 
## agez  -0.149                                                 
## male  -0.505  0.019                                          
## black -0.331  0.021  0.023                                   
## hisp  -0.553  0.027  0.001  0.286                            
## asian -0.266  0.004  0.050  0.142  0.220                     
## nahn  -0.164  0.009  0.024  0.095  0.138  0.070              
## other -0.169  0.004 -0.035  0.118  0.180  0.090  0.058       
## agez2  0.138 -0.987 -0.010 -0.009 -0.026 -0.003 -0.002 -0.004

anova(fit1.b, fit2.b)

Generalized estimating equations for longitudinal data

Introduction to GEE’s

Up until now, we have used (G)LMM’s to analyze data that were “clustered”

Persons within neighborhoods
Survey data in general - stratified sampling

The next topic will introduce a modeling strategy that allows us to consider clustered data, but in a different fashion

GLMMS’s

GLMM’s are commonly referred to as conditional models, because the model coefficients “\(\beta\)’s” are condition on the random effects in the model.

Likewise, the mean if conditional on the random effects. This is another way of saying that the mean for a given covariate pattern is conditional on the group that the particular person is in.

\(\mu_{ij}^c = E(Y_{ij} | u_j) = X_{ij}\beta + u_j\)

GLMMS’s and GEE’s

In contrast, Generalzed Estimating Equations are referred to as marginal models because they only estimate the overall mean.

\(\mu_{ij} = X_{ij}\beta\)

Lee and Nelder, 2004 provide a very good description of how these two methods compare to one another

Generalized Estimating Equations

Typically first attributed to Liang and Zeger, 1986
GEE’s are regression models
Interested in modeling the mean response, while treating correlation within person/cluster as a nuisance
NOT based on maximum likelihood
Does not need a fully specified joint distribution, only the marginal distribution (mean) of the outcome
Models can be for any distribution for the outcome

GEE’s

For longitudinal data, we assume we have \(y_{ij}\) as our outcome on person i at time j. This could just as easily be persons within other types of clusters, like counties or sampling units.
Also have \(X_{ij}\), the matrix of predictors
Specify the link function between \(y_{ij}\) and \(X_{ij}\) as in a GLM, via a link function
Focus is on the linear predictor of the link function - the mean
NOT INTERESTED in variance components ONLY regression coefficients

GEE’s

Covariance structure
We also may wish to model how observations are related to one another via some type of correlation structure between waves
This directly implies that observations are NOT INDEPENDENT, and that’s fine
Observations between clusters are independent
Errors are correlated
No assumption of common variance (homoskedsasticity)

GEE’s - Model form

A basic form of the model would be:

\(Y_{ij} = \beta_0 + \sum_k X_{ijk} \beta_k + CORR + error\)

Ordinary models will tend to over estimate the standard errors for the \(\beta\)’s for time varying predictors in a model with repeated observations, because these models do not account for the correlation within clusters observations over time.

Likewise, the standard errors of time invariant predictors will be under estimated

GEE’s - Model estimation

Given the mean function for the model and a specified correlation function, the model parameters may be estimated by finding the solution for:

\[U(\beta) = \sum_i ^n \frac{\delta \mu_{ij}}{ \delta \beta_k} V_i^{-1} (Y_{ij} - \mu(\beta))\]

Which gives estimates of the \(\beta\)’s for the linear mean function.

GEE’s - Model estimation

First, a naive linear regression analysis is carried out, assuming the observations within subjects are independent.
Then, residuals are calculated from the naive model (observed-predicted) and a working correlation matrix is estimated from these residuals.
Then the regression coefficients are refit, correcting for the correlation. (Iterative process)
The within-subject correlation structure is treated as a nuisance variable (i.e. as a covariate)

GEE’s - Correlation Structure

For three time points per person, the ordinary regression model correlation in residuals within clusters/persons over time can be thought of as the matrix:

\[\begin{bmatrix} \sigma^2 & 0 & 0 \\ 0 & \sigma^2 &0 \\ 0 & 0 & \sigma^2 \end{bmatrix}\]

which assumed the variances are constant and the residuals are independent over time

GEE’s - Correlation Structure

But in a GEE, the model include the actual correlation between measurements over time:

\[\begin{bmatrix} \sigma_1 ^2 & a & c \\ a & \sigma_2 ^2 &b \\ b & c & \sigma_3 ^2 \end{bmatrix}\]

Which allows the variances over time to be different, as well as correlations between times to be present.

GEE’s - Correlation Structure

Several types of correlation/covariance are commonly used in GEE’s
When we fit a GEE, we have to assume a certain type of correlation for the repeated measures. These are typically:
Independence - same as OLS
Exchangeable/compound symmetry (simplest)
Autoregressive
Unstructured (most complicated)

GEE’s - Correlation Structure - Independent

\[\begin{bmatrix} 1 & 0 & 0 \\ 0 & 1 &0 \\ 0 & 0 & 1 \end{bmatrix}\]

GEE’s - Correlation Structure - Exchangeable

\[\begin{bmatrix} 1 & \rho & \rho \\ \rho & 1 &\rho \\ \rho &\rho & 1 \end{bmatrix}\]

GEE’s - Correlation Structure - AR(1)

\[\begin{bmatrix} 1 & \rho & \rho^2 \\ \rho & 1 &\rho\\ \rho^2 & \rho & 1 \end{bmatrix}\]

GEE’s - Correlation Structure - Unstructured

\[\begin{bmatrix} 1 & \rho_1 & \rho_2 \\ \rho_1 & 1 &\rho_3 \\ \rho_2 & \rho_3& 1 \end{bmatrix}\]

Basic linear model

fit.1<-glm(scale(math)~agez+agez2+factor(wave)+male+black+hisp+asian+nahn+other+ses, data=e.long.comp, weights=w8cf8p_80/mean(w8cf8p_80))
summary(fit.1)

## 
## Call:
## glm(formula = scale(math) ~ agez + agez2 + factor(wave) + male + 
##     black + hisp + asian + nahn + other + ses, data = e.long.comp, 
##     weights = w8cf8p_80/mean(w8cf8p_80))
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.98248  -0.21480   0.01056   0.22237   1.87471  
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -0.881877   0.024131 -36.546  < 2e-16 ***
## agez           0.940583   0.080024  11.754  < 2e-16 ***
## agez2         -0.647273   0.078924  -8.201 2.60e-16 ***
## factor(wave)2  0.297253   0.014702  20.218  < 2e-16 ***
## factor(wave)4  0.853834   0.023541  36.270  < 2e-16 ***
## factor(wave)6  1.202288   0.031437  38.245  < 2e-16 ***
## factor(wave)7  1.473849   0.038141  38.642  < 2e-16 ***
## factor(wave)8  1.620513   0.046617  34.762  < 2e-16 ***
## male           0.053364   0.007144   7.470 8.55e-14 ***
## black         -0.311969   0.011278 -27.662  < 2e-16 ***
## hisp          -0.156561   0.009517 -16.451  < 2e-16 ***
## asian          0.081354   0.017689   4.599 4.28e-06 ***
## nahn          -0.067882   0.023669  -2.868  0.00414 ** 
## other         -0.089433   0.019601  -4.563 5.10e-06 ***
## ses            0.153098   0.004482  34.159  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.167762)
## 
##     Null deviance: 13091.5  on 13280  degrees of freedom
## Residual deviance:  2225.5  on 13266  degrees of freedom
## AIC: 18207
## 
## Number of Fisher Scoring iterations: 2

#Get residuals and put them in a data frame
e.long.comp$resid<- residuals(fit.1)
resid<- data.frame(r1 = e.long.comp$resid[e.long$wave==1], r2 = e.long.comp$resid[e.long$wave==2], r4 = e.long.comp$resid[e.long$wave==4])

Here is our actual correlation matrix in the residuals between waves:

cor(resid, use="pairwise.complete")

##           r1        r2        r4
## r1 1.0000000 0.6963999 0.5294997
## r2 0.6963999 1.0000000 0.6998337
## r4 0.5294997 0.6998337 1.0000000

This is certainly not independence, and looks more like an AR(1), because the correlation decreases as the difference between wave number increases.

Now we fit a GEE:

#Model with independent correlation, meaning ZERO correlation
fit.1b<-geeglm(scale(math)~scale(age)+male+black+hisp+asian+nahn+other+ses,id=childid , wave = wave, corstr ="independence", 
               data=e.long.comp, weights=w8cf8p_80/mean(w8cf8p_80))
  
summary(fit.1b)

## 
## Call:
## geeglm(formula = scale(math) ~ scale(age) + male + black + hisp + 
##     asian + nahn + other + ses, data = e.long.comp, weights = w8cf8p_80/mean(w8cf8p_80), 
##     id = childid, waves = wave, corstr = "independence")
## 
##  Coefficients:
##              Estimate   Std.err      Wald Pr(>|W|)    
## (Intercept)  0.025406  0.017224     2.176  0.14019    
## scale(age)   0.855391  0.004036 44918.647  < 2e-16 ***
## male         0.035205  0.019698     3.194  0.07389 .  
## black       -0.297930  0.035274    71.337  < 2e-16 ***
## hisp        -0.129440  0.026508    23.844 1.04e-06 ***
## asian        0.127519  0.039338    10.508  0.00119 ** 
## nahn        -0.088015  0.078716     1.250  0.26351    
## other       -0.082174  0.051367     2.559  0.10966    
## ses          0.150448  0.012200   152.071  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Estimated Scale Parameters:
##             Estimate  Std.err
## (Intercept)   0.2155 0.005723
## 
## Correlation: Structure = independenceNumber of clusters:   2345   Maximum cluster size: 6

#Model with Exchangeable correlation:
fit.2<-geeglm(scale(math)~scale(age)+male+black+hisp+asian+nahn+other+ses,id=childid , wave = wave, corstr ="exchangeable", 
               data=e.long.comp, weights=w8cf8p_80/mean(w8cf8p_80))
  
summary(fit.2)

## 
## Call:
## geeglm(formula = scale(math) ~ scale(age) + male + black + hisp + 
##     asian + nahn + other + ses, data = e.long.comp, weights = w8cf8p_80/mean(w8cf8p_80), 
##     id = childid, waves = wave, corstr = "exchangeable")
## 
##  Coefficients:
##             Estimate  Std.err     Wald Pr(>|W|)    
## (Intercept)  0.03054  0.01733     3.11  0.07802 .  
## scale(age)   0.87711  0.00335 68349.70  < 2e-16 ***
## male         0.03465  0.01988     3.04  0.08135 .  
## black       -0.31817  0.03527    81.37  < 2e-16 ***
## hisp        -0.15858  0.02615    36.78  1.3e-09 ***
## asian        0.14793  0.03927    14.19  0.00017 ***
## nahn        -0.12045  0.08185     2.17  0.14113    
## other       -0.07537  0.05185     2.11  0.14609    
## ses          0.11363  0.01262    81.01  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Estimated Scale Parameters:
##             Estimate Std.err
## (Intercept)    0.217 0.00578
## 
## Correlation: Structure = exchangeable  Link = identity 
## 
## Estimated Correlation Parameters:
##       Estimate Std.err
## alpha    0.561  0.0201
## Number of clusters:   2345   Maximum cluster size: 6

The second model shows the exchangeable correlation to be 0.561, which is not very different from our measured correlations above 1, 0.696, 0.529, 0.696, 1, 0.7, 0.529, 0.7, 1

Now we examine the other correlation types:

fit.3<-geeglm(scale(math)~scale(age)+male+black+hisp+asian+nahn+other+ses, id=childid , wave = wave, corstr ="ar1",  data=e.long.comp, weights=w8cf8p_80/mean(w8cf8p_80))
  
summary(fit.3)

## 
## Call:
## geeglm(formula = scale(math) ~ scale(age) + male + black + hisp + 
##     asian + nahn + other + ses, data = e.long.comp, weights = w8cf8p_80/mean(w8cf8p_80), 
##     id = childid, waves = wave, corstr = "ar1")
## 
##  Coefficients:
##             Estimate  Std.err     Wald Pr(>|W|)    
## (Intercept) -0.09177  0.01640    31.32  2.2e-08 ***
## scale(age)   0.84794  0.00331 65494.98  < 2e-16 ***
## male         0.03848  0.01865     4.26  0.03905 *  
## black       -0.28557  0.03319    74.02  < 2e-16 ***
## hisp        -0.14373  0.02463    34.07  5.3e-09 ***
## asian        0.14904  0.03929    14.39  0.00015 ***
## nahn        -0.09752  0.06862     2.02  0.15527    
## other       -0.06954  0.04695     2.19  0.13853    
## ses          0.10747  0.01216    78.07  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Estimated Scale Parameters:
##             Estimate Std.err
## (Intercept)     0.23 0.00578
## 
## Correlation: Structure = ar1  Link = identity 
## 
## Estimated Correlation Parameters:
##       Estimate Std.err
## alpha    0.779  0.0121
## Number of clusters:   2345   Maximum cluster size: 6

Since GEE’s aren’t fit via maximum likelihood, they aren’t comparable in terms of AIC or likelihood ratio tests. However, Pan, 2001 describe an information criterion using a Quasi-likelihood formulation. This can be used to compare models with alternative correlation structures, with the lowest QIC representing the best fitting model.

library(MESS) #need to install

## Loading required package: geeM

## 
## Attaching package: 'MESS'

## The following object is masked from 'package:MuMIn':
## 
##     QIC

QIC(fit.1b) #independence

##       QIC      QICu Quasi Lik       CIC    params      QICC 
##    3081.3    3001.1   -1491.6      49.1       9.0    3081.3

QIC(fit.2) #exchangeable

##       QIC      QICu Quasi Lik       CIC    params      QICC 
##    3036.2    3023.0   -1502.5      15.6       9.0    3036.3

QIC(fit.3) #ar1

##       QIC      QICu Quasi Lik       CIC    params      QICC 
##    3221.2    3214.9   -1598.4      12.1       9.0    3221.2

So, it looks like the exchangeable correlation structure is certainly better than independence or AR(1).

Binary response longitudinal model

Here we use the GEE for a binomial outcome.

#basic linear model
fitb.1<-geeglm(I(health>2)~age+male+black+hisp+asian+nahn+other+ses, waves = wave,id=childid ,corstr ="independence", family=binomial, data=e.long.comp, weights=w8cf8p_80/mean(w8cf8p_80))

## Warning in eval(family$initialize): non-integer #successes in a binomial
## glm!

summary(fitb.1)

## 
## Call:
## geeglm(formula = I(health > 2) ~ age + male + black + hisp + 
##     asian + nahn + other + ses, family = binomial, data = e.long.comp, 
##     weights = w8cf8p_80/mean(w8cf8p_80), id = childid, waves = wave, 
##     corstr = "independence")
## 
##  Coefficients:
##             Estimate Std.err   Wald Pr(>|W|)    
## (Intercept)  -2.6215  0.1986 174.26  < 2e-16 ***
## age           0.0391  0.0198   3.92    0.048 *  
## male          0.1975  0.1038   3.62    0.057 .  
## black         0.4119  0.1860   4.90    0.027 *  
## hisp          0.6885  0.1266  29.58  5.4e-08 ***
## asian         1.1035  0.2634  17.56  2.8e-05 ***
## nahn          0.4342  0.3460   1.57    0.210    
## other         0.2643  0.2594   1.04    0.308    
## ses          -0.5524  0.0641  74.37  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Estimated Scale Parameters:
##             Estimate Std.err
## (Intercept)        1   0.129
## 
## Correlation: Structure = independenceNumber of clusters:   2345   Maximum cluster size: 6

fitb.2<-geeglm(I(health>2)~age+male+black+hisp+asian+nahn+other+ses, waves = wave,id=childid, family=binomial, data=e.long.comp, corstr="exchangeable", weights=w8cf8p_80/mean(w8cf8p_80))

## Warning in eval(family$initialize): non-integer #successes in a binomial
## glm!

summary(fitb.2)

## 
## Call:
## geeglm(formula = I(health > 2) ~ age + male + black + hisp + 
##     asian + nahn + other + ses, family = binomial, data = e.long.comp, 
##     weights = w8cf8p_80/mean(w8cf8p_80), id = childid, waves = wave, 
##     corstr = "exchangeable")
## 
##  Coefficients:
##             Estimate Std.err   Wald Pr(>|W|)    
## (Intercept)  -2.6730  0.1910 195.81  < 2e-16 ***
## age           0.0449  0.0189   5.66    0.017 *  
## male          0.2090  0.1030   4.12    0.042 *  
## black         0.4449  0.1828   5.92    0.015 *  
## hisp          0.7417  0.1249  35.24  2.9e-09 ***
## asian         1.0626  0.2562  17.20  3.4e-05 ***
## nahn          0.4521  0.3695   1.50    0.221    
## other         0.2322  0.2671   0.76    0.385    
## ses          -0.4837  0.0650  55.32  1.0e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Estimated Scale Parameters:
##             Estimate Std.err
## (Intercept)    0.991   0.116
## 
## Correlation: Structure = exchangeable  Link = identity 
## 
## Estimated Correlation Parameters:
##       Estimate Std.err
## alpha    0.309  0.0615
## Number of clusters:   2345   Maximum cluster size: 6

fitb.3<-geeglm(I(health>2)~age+male+black+hisp+asian+nahn+other+ses, waves = wave,id=childid, family=binomial, data=e.long.comp, corstr="ar1", weights=w8cf8p_80/mean(w8cf8p_80))

## Warning in eval(family$initialize): non-integer #successes in a binomial
## glm!

summary(fitb.3)

## 
## Call:
## geeglm(formula = I(health > 2) ~ age + male + black + hisp + 
##     asian + nahn + other + ses, family = binomial, data = e.long.comp, 
##     weights = w8cf8p_80/mean(w8cf8p_80), id = childid, waves = wave, 
##     corstr = "ar1")
## 
##  Coefficients:
##             Estimate Std.err   Wald Pr(>|W|)    
## (Intercept)  -2.6930  0.1972 186.47  < 2e-16 ***
## age           0.0395  0.0196   4.06   0.0440 *  
## male          0.2441  0.1042   5.49   0.0191 *  
## black         0.4861  0.1841   6.98   0.0083 ** 
## hisp          0.7661  0.1244  37.96  7.2e-10 ***
## asian         1.1146  0.2755  16.37  5.2e-05 ***
## nahn          0.3832  0.3502   1.20   0.2739    
## other         0.3203  0.2638   1.47   0.2247    
## ses          -0.5551  0.0649  73.14  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Estimated Scale Parameters:
##             Estimate Std.err
## (Intercept)     1.02   0.143
## 
## Correlation: Structure = ar1  Link = identity 
## 
## Estimated Correlation Parameters:
##       Estimate Std.err
## alpha    0.523   0.076
## Number of clusters:   2345   Maximum cluster size: 6

QIC(fitb.1)

##       QIC      QICu Quasi Lik       CIC    params      QICC 
##     10525     10471     -5226        36         9     10525

QIC(fitb.2)

##       QIC      QICu Quasi Lik       CIC    params      QICC 
##   10493.7   10478.1   -5230.0      16.8       9.0   10493.8

QIC(fitb.3)

##       QIC      QICu Quasi Lik       CIC    params      QICC 
##   10481.9   10466.9   -5224.4      16.5       9.0   10481.9

DEM 7473 - Week 6: Longitudinal Models for Change

Corey S. Sparks, PhD

October 8, 2018

Libraries

Data and recodes

Reshaping data into longitudinal format

Filtering and mutating

Hierarchical model for individual change - growth curves using (G)LMM’s

GLMM for individual growth

Random intercept for each person

individual trajectory model with random slope for time for each person

curvilinear trajectory model with random nonlinear time

individual trajectory model with fixed effects for race and different population trajectories for each race

Model comparisons

GLMM for change in a binary variable

Generalized estimating equations for longitudinal data

Introduction to GEE’s

GLMMS’s

GLMMS’s and GEE’s

Generalized Estimating Equations

GEE’s

GEE’s

GEE’s - Model form

GEE’s - Model estimation

GEE’s - Model estimation

GEE’s - Correlation Structure

GEE’s - Correlation Structure

GEE’s - Correlation Structure

GEE’s - Correlation Structure - Independent

GEE’s - Correlation Structure - Exchangeable

GEE’s - Correlation Structure - AR(1)

GEE’s - Correlation Structure - Unstructured

Basic linear model

Binary response longitudinal model