In this example, we will use hierarchical models to do some longitudinal modeling of data from the ECLS-K 2011. Specifically, we will model changes in a student’s standardized math score from fall kindergarten to spring, 1st grade.

Longitudinal data are collected in waves, representing the sample at different time points For example, currently, the ECLS-K 2011 has four waves, intending to capture the children as they progress through school, and will eventuall have more. We really want to measure “similar” items at each time point so we can understand what makes people change over their lives. When it comes to data we need to introduce a different data structure to accomodate this.

In this example, I will focus on the use of the linear mixed model for a continuous outcome

We start simple then move into growth curve models. This follows the presentation in Singer and Willett (2003) Chapters 3-6.

Data and recodes

First we load our data

load("~/Google Drive/dem7903_App_Hier/data/eclsk_11.Rdata")
names(eclsk11)<-tolower(names(eclsk11))
library (car)
library(arm)
library(lmerTest)
#get out only the variables I'm going to use for this example

myvars<-c("childid", "x_chsex_r", "x1locale", "x_raceth_r", "x2povty", "x12par1ed_i","p1curmar", "x1htotal", "x1mscalk1", "x2mscalk1", "x3mscalk1", "x4mscalk1", "p1hscale", "p2hscale", "p4hscale","x2fsstat2","x4fsstat2", "x12sesl","x4sesl_i", "p2parct1", "p2parct2", "s1_id", "p2safepl","x2krceth","p1o2near", "x_distpov" , "w1c0", "w1p0", "w2p0", "w1c0str", "w1p0str",
        "w4c4p_40","w4c4p_4str", "w4c4p_4psu",  "w1c0psu", "w1p0psu", "x1height","x2height","x3height","x4height", "x1kage_r","x2kage_r","x3age","x4age")

#subset the data
eclsk.sub<-eclsk11[,myvars]

rm(eclsk11); gc()
##           used (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells 1665255 89.0    2637877  140.9   2164898  115.7
## Vcells 2968870 22.7  219933776 1678.0 182884454 1395.3

Next, I do some recoding of variables. I have to make the repeated measures of each of my longitudinal variables.

#Longitudinal variables
#recode our outcomes, the  first is the child's math standardized test score  in Kindergarten
eclsk.sub$math1<-ifelse(eclsk.sub$x1mscalk1<0, NA, eclsk.sub$x1mscalk1)
eclsk.sub$math2<-ifelse(eclsk.sub$x2mscalk1<0, NA, eclsk.sub$x2mscalk1)
#eclsk.sub$math3<-ifelse(eclsk.sub$x3mscalk1<0, NA, eclsk.sub$x3mscalk1)
eclsk.sub$math4<-ifelse(eclsk.sub$x4mscalk1<0, NA, eclsk.sub$x4mscalk1)

#Second outcome is child's height for age, continuous outcome
eclsk.sub$height1<-ifelse(eclsk.sub$x1height<=-7, NA, eclsk.sub$x1height)
eclsk.sub$height2<-ifelse(eclsk.sub$x2height<=-7, NA, eclsk.sub$x2height)
#eclsk.sub$height3<-ifelse(eclsk.sub$x3height<=-7, NA, eclsk.sub$x3height)
eclsk.sub$height4<-ifelse(eclsk.sub$x4height<=-7, NA, eclsk.sub$x4height)

#Age at each wave
eclsk.sub$age_yrs1<-ifelse(eclsk.sub$x1kage_r<0, NA, eclsk.sub$x1kage_r/12)
eclsk.sub$age_yrs2<-ifelse(eclsk.sub$x2kage_r<0, NA, eclsk.sub$x2kage_r/12)
#eclsk.sub$age_yrs3<-ifelse(eclsk.sub$x3age<0, NA, eclsk.sub$x3age/12)
eclsk.sub$age_yrs4<-ifelse(eclsk.sub$x4age<0, NA, eclsk.sub$x4age/12)

eclsk.sub<- eclsk.sub[is.na(eclsk.sub$age_yrs1)==F, ]

#Height for age z score standardized by sex and age
eclsk.sub$height_z1<-ave(eclsk.sub$height1, as.factor(paste(round(eclsk.sub$age_yrs1, 1.5), eclsk.sub$male)), FUN=scale)
eclsk.sub$height_z2<-ave(eclsk.sub$height2, as.factor(paste(round(eclsk.sub$age_yrs2, 1.5), eclsk.sub$male)), FUN=scale)
#eclsk.sub$height_z3<-ave(eclsk.sub$height3, as.factor(paste(round(eclsk.sub$age_yrs3, 1.5), eclsk.sub$male)), FUN=scale)
eclsk.sub$height_z4<-ave(eclsk.sub$height4, as.factor(paste(round(eclsk.sub$age_yrs4, 1.5), eclsk.sub$male)), FUN=scale)


#Household food insecurity, dichotomous outcome
#This outcome is only present at two waves
eclsk.sub$foodinsec1<-recode(eclsk.sub$x2fsstat2, recodes="2:3=1; 1=0; else=NA")
eclsk.sub$foodinsec2<-recode(eclsk.sub$x4fsstat2, recodes="2:3=1; 1=0; else=NA")


#Child health assessment Excellent to poor , ordinal outcome
eclsk.sub$chhealth1<-ifelse(eclsk.sub$p1hscale<0, NA, eclsk.sub$p1hscale)
eclsk.sub$chhealth2<-ifelse(eclsk.sub$p2hscale<0, NA, eclsk.sub$p2hscale)
eclsk.sub$chhealth4<-ifelse(eclsk.sub$p4hscale<0, NA, eclsk.sub$p4hscale)

#SES
eclsk.sub$hhses1<-ifelse(eclsk.sub$x12sesl==-9, NA, scale(eclsk.sub$x12sesl))
eclsk.sub$hhses4<-ifelse(eclsk.sub$x4sesl_i==-9, NA, scale(eclsk.sub$x4sesl_i))

That does it for our time varying variables. We now code some variables from the kindergarten wave to use as predictors:

#Non time varying variables
#First we recode some Child characteristics
#Child's sex: recode as male =1
eclsk.sub$male<-recode(eclsk.sub$x_chsex_r, recodes="1=1; 2=0; -9=NA")

#Recode race with white, non Hispanic as reference using dummy vars
eclsk.sub$hisp<-recode (eclsk.sub$x_raceth_r, recodes="3:4=1;-9=NA; else=0")
eclsk.sub$black<-recode (eclsk.sub$x_raceth_r, recodes="2=1;-9=NA; else=0")
eclsk.sub$asian<-recode (eclsk.sub$x_raceth_r, recodes="5=1;-9=NA; else=0")
eclsk.sub$nahn<-recode (eclsk.sub$x_raceth_r, recodes="6:7=1;-9=NA; else=0")
eclsk.sub$other<-recode (eclsk.sub$x_raceth_r, recodes="8=1;-9=NA; else=0")


#Then we recode some parent/mother characteristics
#Mother's education, recode as 2 dummys with HS = reference
eclsk.sub$lths<-recode(eclsk.sub$x12par1ed_i, recodes = "0:2=1; 3:8=0; else = NA")
eclsk.sub$gths<-recode(eclsk.sub$x12par1ed_i, recodes = "1:3=0; 4:8=1; else =NA") 

#marital status, recode as 2 dummys, ref= married
eclsk.sub$single<-recode(eclsk.sub$p1curmar, recodes="4=1; -7:-9=NA; else=0")
eclsk.sub$notmar<-recode(eclsk.sub$p1curmar, recodes="2:3=1; -7:-9=NA; else=0")


#Then we do some household level variables

#Urban school location = 1
eclsk.sub$urban<-recode(eclsk.sub$x1locale, recodes = "1:3=1; 4=0; -1:-9=NA")

#poverty level in poverty = 1
eclsk.sub$pov<-recode(eclsk.sub$x2povty , recodes ="1:2=1; 3=0; -9=NA")

#Household size
eclsk.sub$hhsize<-eclsk.sub$x1htotal

#school % minority student body
eclsk.sub$minorsch<-ifelse(eclsk.sub$x2krceth <0, NA, eclsk.sub$x2krceth/10)

#Unsafe neighborhood
eclsk.sub$unsafe<-recode(eclsk.sub$p2safepl , recodes = "1:2='unsafe'; 3='safe'; else=NA", as.factor.result = T)

#school district poverty
eclsk.sub$dist_pov<-ifelse(eclsk.sub$x_distpov==-9, NA, scale(eclsk.sub$x_distpov))

Reshaping data into longitudinal format

To analyze data longitudinally, we must reshape the data fron its current “wide” format, where each repeated measure is a column, into the “long” format, where there is a single column for a particular variable, and we account for the repeated measurements of each person. In this case, I’m going to use three waves of data, so each child can contribute up to three lines to the data.

This is just a little example using the first 10 kids, just to illustrate this process.

test<-eclsk.sub[1:10,]
#look at the data in "wide" format
head(test)
##    childid x_chsex_r x1locale x_raceth_r x2povty x12par1ed_i p1curmar
## 1 10000001         1        4          1       2           5        1
## 2 10000002         2        2          1       3           5        1
## 4 10000004         1        4          1       3           5        1
## 5 10000005         2        2          3       2           4        1
## 6 10000006         2        4          1       2           3        3
## 7 10000007         2        3          1       1           4        1
##   x1htotal x1mscalk1 x2mscalk1 x3mscalk1 x4mscalk1 p1hscale p2hscale
## 1        6   44.9695   61.5056        NA   79.5010        1        2
## 2        4   30.1857   50.0694        NA   71.0421        1        1
## 4        4   13.8410   31.0148        NA   53.9906        1        1
## 5        5   27.6587   45.5457        NA        NA        1        1
## 6        4   22.2918   27.8530        NA   42.6581        1        2
## 7        5   26.1178   36.7276        NA   59.3014        2        2
##   p4hscale x2fsstat2 x4fsstat2 x12sesl x4sesl_i p2parct1 p2parct2 s1_id
## 1        3         1         1   -0.13    -0.07        1        1  1861
## 2        1         1         1    0.27     0.23        1        1  2113
## 4       -9         1        -9   -0.26    -0.29        1        1  2070
## 5       NA         1        NA    0.28       NA        1        3  1079
## 6        1         1         1   -0.69    -0.71        1       -1  1210
## 7       -9         1        -9   -0.58    -0.60        1        1  1282
##   p2safepl x2krceth p1o2near x_distpov      w1c0     w1p0     w2p0 w1c0str
## 1        2     8.00        2        45 188.09105 211.4320 215.1901      70
## 2        3    30.00       -1         7 237.88083 274.5976 327.8694      45
## 4        2    18.00       -1        17  96.36698 103.5068 111.8929      51
## 5        3    90.48        3        -1 178.54929 237.3439 267.5328      30
## 6        3     2.00       -1        11 497.90703 525.4855 636.1168      65
## 7        3     3.00        0         9 447.97361 543.0198 584.5155      65
##   w1p0str w4c4p_40 w4c4p_4str w4c4p_4psu w1c0psu w1p0psu x1height x2height
## 1      70 322.0055         69          3       3       3    44.50    45.75
## 2      45 474.8417         43          2       2       2    45.63    47.00
## 4      51 171.4766         49          2       2       2    46.00    48.00
## 5      30   0.0000         NA         NA       3       3    40.80    42.00
## 6      65 957.6496         64          2       2       2    46.00    47.00
## 7      65 535.6196         64          1       1       1    45.25    46.00
##   x3height x4height x1kage_r x2kage_r x3age x4age   math1   math2   math4
## 1       NA    48.00    72.39    79.76    NA 91.66 44.9695 61.5056 79.5010
## 2       NA    49.50    71.41    77.52    NA 89.56 30.1857 50.0694 71.0421
## 4       NA    50.00    72.39    78.84    NA 89.23 13.8410 31.0148 53.9906
## 5       NA       NA    59.51    65.06    NA    NA 27.6587 45.5457      NA
## 6       NA    49.75    75.78    82.62    NA 94.82 22.2918 27.8530 42.6581
## 7       NA    48.00    63.95    69.90    NA 80.98 26.1178 36.7276 59.3014
##   height1 height2 height4 age_yrs1 age_yrs2 age_yrs4   height_z1
## 1   44.50   45.75   48.00 6.032500 6.646667 7.638333 -0.74750120
## 2   45.63   47.00   49.50 5.950833 6.460000 7.463333  0.01182955
## 4   46.00   48.00   50.00 6.032500 6.570000 7.435833  0.03840209
## 5   40.80   42.00      NA 4.959167 5.421667       NA -1.45613917
## 6   46.00   47.00   49.75 6.315000 6.885000 7.901667 -0.62911889
## 7   45.25   46.00   48.00 5.329167 5.825000 6.748333  0.49810428
##    height_z2   height_z4 foodinsec1 foodinsec2 chhealth1 chhealth2
## 1 -0.7144538 -0.69549851          0          0         1         2
## 2  0.1659170  0.01627224          0          0         1         1
## 4  0.4210688  0.41691644          0         NA         1         1
## 5 -1.0816114          NA          0         NA         1         1
## 6 -0.2994689 -0.52126128          0          0         1         2
## 7  0.3141864  0.16071039          0         NA         2         2
##   chhealth4      hhses1      hhses4 male hisp black asian nahn other lths
## 1         3 -0.05343635 -0.04820708    1    0     0     0    0     0    0
## 2         1  0.36092641  0.32296225    0    0     0     0    0     0    0
## 4        NA -0.18810425 -0.32039792    1    0     0     0    0     0    0
## 5        NA  0.37128548          NA    0    1     0     0    0     0    0
## 6         1 -0.63354422 -0.84003498    0    0     0     0    0     0    0
## 7        NA -0.51959446 -0.70393956    0    0     0     0    0     0    0
##   gths single notmar urban pov hhsize minorsch unsafe    dist_pov
## 1    1      0      0     0   1      6    0.800 unsafe  2.22748174
## 2    1      0      0     1   0      4    3.000   safe -0.67176163
## 4    1      0      0     0   0      4    1.800 unsafe  0.09119715
## 5    1      0      0     1   1      5    9.048   safe -1.28212866
## 6    0      0      1     0   1      4    0.200   safe -0.36657812
## 7    1      0      0     1   1      5    0.300   safe -0.51916987
e.longt<-reshape(test, idvar="childid", varying=list(math = c("math1", "math2",  "math4"),
                                         age = c("age_yrs1", "age_yrs2",  "age_yrs4"),
                                         height= c("height_z1", "height_z2", "height_z4"),
                                         health=c("chhealth1", "chhealth2", "chhealth4"), 
                                         ses = c("hhses1", "hhses1", "hhses4")),
                                          v.names = c("math", "age", "height", "health"),
                                         timevar="wave",
                                         times=c(1,2,4),direction="long",  
                                          drop = names(test)[c(2:20, 22:25)])

#here we look at the new data set, in the "long" format

head(e.longt[order(e.longt$childid, e.longt$wave),c(1, 40:44)], n=20)
##             childid minorsch unsafe    dist_pov wave    math
## 10000001.1 10000001    0.800 unsafe  2.22748174    1 44.9695
## 10000001.2 10000001    0.800 unsafe  2.22748174    2 61.5056
## 10000001.4 10000001    0.800 unsafe  2.22748174    4 79.5010
## 10000002.1 10000002    3.000   safe -0.67176163    1 30.1857
## 10000002.2 10000002    3.000   safe -0.67176163    2 50.0694
## 10000002.4 10000002    3.000   safe -0.67176163    4 71.0421
## 10000004.1 10000004    1.800 unsafe  0.09119715    1 13.8410
## 10000004.2 10000004    1.800 unsafe  0.09119715    2 31.0148
## 10000004.4 10000004    1.800 unsafe  0.09119715    4 53.9906
## 10000005.1 10000005    9.048   safe -1.28212866    1 27.6587
## 10000005.2 10000005    9.048   safe -1.28212866    2 45.5457
## 10000005.4 10000005    9.048   safe -1.28212866    4      NA
## 10000006.1 10000006    0.200   safe -0.36657812    1 22.2918
## 10000006.2 10000006    0.200   safe -0.36657812    2 27.8530
## 10000006.4 10000006    0.200   safe -0.36657812    4 42.6581
## 10000007.1 10000007    0.300   safe -0.51916987    1 26.1178
## 10000007.2 10000007    0.300   safe -0.51916987    2 36.7276
## 10000007.4 10000007    0.300   safe -0.51916987    4 59.3014
## 10000008.1 10000008    2.600   safe  0.39638067    1 30.9004
## 10000008.2 10000008    2.600   safe  0.39638067    2 54.5561

So, we can see the children that are repeated three times, but some children are missing for some variables at some waves, which is common in longitudinal surveys.

To illustrate these data, we can plot the children’s math scores and other variables over time to see trends within child. Some kids are missing at all times, others don’t have complete data for all times, but you can see the trends within child.

require(lattice)
xyplot(math~age|childid, data=e.longt,
       panel=function(x,y){
       panel.xyplot(x,y)
     panel.lmline(x,y,)}, main="Math Scores over time for each child", xlab= "Age", ylab="Math Score")

xyplot(height~age|childid, data=e.longt,
       panel=function(x,y){
       panel.xyplot(x,y)
     panel.lmline(x,y,)}, main="Height over time for each child", xlab= "Age", ylab="Height")