In this example, we will use Bayesian hierarchical models to do some longitudinal modeling of data from the ECLS-K. Specifically, we will model changes in a student’s standardized math score from kindergarten to 8th grade. We will use JAGS to illustrate the use of random slope models /growth curve models for individual change
First we load our data
load("~/Google Drive/dem7903_App_Hier/data/eclsk.Rdata")
names(eclsk)<-tolower(names(eclsk))
library (car)
library(rjags)
## Loading required package: coda
## Loading required package: lattice
## Linked to JAGS 3.4.0
## Loaded modules: basemod,bugs
#get out only the variables I'm going to use for this example
myvars<-c( "childid","gender", "race", "r1_kage","r2_kage", "r4age", "r5age", "r6age", "r7age","c1r4mtsc", "c4r4mtsc", "c5r4mtsc", "c6r4mtsc", "c7r4mtsc", "w1povrty","w1povrty","w3povrty", "w5povrty", "w8povrty","wkmomed", "s2_id", "p2fsstat","p5fsstat","p6fsstat","p7fsstat", "cregion")
eclsk<-eclsk[,myvars]
#recode outcome, math score at each of the 4 waves
eclsk$math1<-ifelse(eclsk$c1r4mtsc==-9,NA, eclsk$c1r4mtsc)
eclsk$math2<-ifelse(eclsk$c4r4mtsc==-9,NA, eclsk$c4r4mtsc)
eclsk$math3<-ifelse(eclsk$c5r4mtsc==-9,NA, eclsk$c5r4mtsc)
eclsk$math4<-ifelse(eclsk$c6r4mtsc==-9,NA, eclsk$c6r4mtsc)
eclsk$math5<-ifelse(eclsk$c7r4mtsc==-9,NA, eclsk$c7r4mtsc)
eclsk$age1<-ifelse(eclsk$r1_kage==-9, NA, eclsk$r1_kage/12)
eclsk$age2<-ifelse(eclsk$r4age==-9, NA, eclsk$r4age/12)
#for the later waves, the NCES group the ages into ranges of months, so 1= <105 months, 2=105 to 108 months. So, I fix the age at the midpoint of the interval they give, and make it into years by dividing by 12
eclsk$age3<-recode(eclsk$r5age,recodes="1=105; 2=107; 3=109; 4=112; 5=115; 6=117; -9=NA")/12
eclsk$age4<-recode(eclsk$r6age,recodes="1=118; 2=129; 3=135; 4=141; 5=155; -9=NA")/12
eclsk$age5<-recode(eclsk$r7age,recodes="1=155; 2=166; 3=172; 4=178; 5=192; -9=NA")/12
eclsk$pov1<-ifelse(eclsk$w1povrty==1,1,0)
eclsk$pov2<-ifelse(eclsk$w1povrty==1,1,0)
eclsk$pov3<-ifelse(eclsk$w3povrty==1,1,0)
eclsk$pov4<-ifelse(eclsk$w5povrty==1,1,0)
eclsk$pov5<-ifelse(eclsk$w5povrty==1,1,0)
#Recode race with white, non Hispanic as reference using dummy vars
eclsk$hisp<-recode (eclsk$race, recodes="3:4=1;-9=NA; else=0")
eclsk$black<-recode (eclsk$race, recodes="2=1;-9=NA; else=0")
eclsk$asian<-recode (eclsk$race, recodes="5=1;-9=NA; else=0")
eclsk$nahn<-recode (eclsk$race, recodes="6:7=1;-9=NA; else=0")
eclsk$other<-recode (eclsk$race, recodes="8=1;-9=NA; else=0")
eclsk$male<-recode(eclsk$gender, recodes="1=1; 2=0; -9=NA")
eclsk$mlths<-recode(eclsk$wkmomed, recodes = "1:2=1; 3:9=0; else = NA")
eclsk$mgths<-recode(eclsk$wkmomed, recodes = "1:3=0; 4:9=1; else =NA")
To analyze data longitudinally, we need to reshape the data from the current “wide” format (repeated measures in columns) to a “long” format (repeated observations in rows). The reshape()
function allows us to do this easily. It allows us to specify our repeated measures, time varying covariates as well as time-constant covariates.
This takes a long time to run with the full sample, so I just subset to children from the south census region (cregion==3)
eclsk<-subset(eclsk, cregion==3)
e.long<-reshape(eclsk, idvar="childid", varying=list(math = c("math1", "math2", "math3", "math4","math5"),
age = c("age1", "age2", "age3", "age4", "age5"),
pov= c("pov1", "pov2", "pov3", "pov4", "pov5")),
times=1:5,direction="long",
drop = names(eclsk)[c(4:19,22:25) ],)
e.long<-e.long[order(e.long$childid, e.long$time),]
head(e.long, n=20)
## childid gender race wkmomed s2_id cregion hisp black asian
## 0536001C.1 0536001C 1 2 5 <NA> 3 0 1 0
## 0536001C.2 0536001C 1 2 5 <NA> 3 0 1 0
## 0536001C.3 0536001C 1 2 5 <NA> 3 0 1 0
## 0536001C.4 0536001C 1 2 5 <NA> 3 0 1 0
## 0536001C.5 0536001C 1 2 5 <NA> 3 0 1 0
## 0536002C.1 0536002C 2 1 2 0536 3 0 0 0
## 0536002C.2 0536002C 2 1 2 0536 3 0 0 0
## 0536002C.3 0536002C 2 1 2 0536 3 0 0 0
## 0536002C.4 0536002C 2 1 2 0536 3 0 0 0
## 0536002C.5 0536002C 2 1 2 0536 3 0 0 0
## 0536003C.1 0536003C 1 1 5 0536 3 0 0 0
## 0536003C.2 0536003C 1 1 5 0536 3 0 0 0
## 0536003C.3 0536003C 1 1 5 0536 3 0 0 0
## 0536003C.4 0536003C 1 1 5 0536 3 0 0 0
## 0536003C.5 0536003C 1 1 5 0536 3 0 0 0
## 0536004C.1 0536004C 2 1 3 0536 3 0 0 0
## 0536004C.2 0536004C 2 1 3 0536 3 0 0 0
## 0536004C.3 0536004C 2 1 3 0536 3 0 0 0
## 0536004C.4 0536004C 2 1 3 0536 3 0 0 0
## 0536004C.5 0536004C 2 1 3 0536 3 0 0 0
## nahn other male mlths mgths time math1 age1 pov1
## 0536001C.1 0 0 1 0 1 1 28.422 5.464167 1
## 0536001C.2 0 0 1 0 1 2 NA NA 1
## 0536001C.3 0 0 1 0 1 3 NA NA NA
## 0536001C.4 0 0 1 0 1 4 NA NA NA
## 0536001C.5 0 0 1 0 1 5 NA NA NA
## 0536002C.1 0 0 0 1 0 1 38.180 5.600000 1
## 0536002C.2 0 0 0 1 0 2 26.281 7.180833 1
## 0536002C.3 0 0 0 1 0 3 30.951 9.083333 0
## 0536002C.4 0 0 0 1 0 4 NA NA NA
## 0536002C.5 0 0 0 1 0 5 NA NA NA
## 0536003C.1 0 0 1 0 1 1 34.458 5.822500 0
## 0536003C.2 0 0 1 0 1 2 39.385 7.316667 0
## 0536003C.3 0 0 1 0 1 3 47.576 9.333333 0
## 0536003C.4 0 0 1 0 1 4 48.059 11.250000 0
## 0536003C.5 0 0 1 0 1 5 NA NA 0
## 0536004C.1 0 0 0 0 0 1 36.472 5.358333 0
## 0536004C.2 0 0 0 0 0 2 NA NA 0
## 0536004C.3 0 0 0 0 0 3 NA NA NA
## 0536004C.4 0 0 0 0 0 4 NA NA NA
## 0536004C.5 0 0 0 0 0 5 NA NA NA
The first model is a simple random intercept model for each child’s mean for the math score, with a population trajectory for age, not child specific
model1<-"
model{
#Likelihood
for( i in 1:n)
{
math[i]~dnorm(mu[i], tau)
mu[i]<-b[1]+b[2]*black[i]+b[3]*hisp[i]+b[4]*asian[i]+b[5]*other[i]+b[6]*lths[i]+b[7]*gths[i]+b[8]*pov[i]+b[9]*age[i]+u[childnum[i]]
}
#priors
#Prior for random intercept
for (j in 1:nkids)
{
u[j] ~ dnorm(0, tauu)
}
#regression effects, MVN prior
b[1:9]~dmnorm(meanb[], prec.beta[,])
for(j in 1:9){
meanb[j]~dnorm(0, .0001)
}
prec.beta[1:9,1:9]~dwish(Obeta[,], 9)
for(j in 1:9){ for(k in 1:9){ Obeta[j,k] <- equals(j,k)*.1 } }
tau<-pow(sd, -2)
sd~dunif(0,100)
tauu<-pow(sdu, -2)
sdu~dunif(0,100)
}
"
e.long<-subset(e.long, subset = is.na(math1)==F&is.na(black)==F&is.na(male)==F&is.na(pov1)==F&is.na(mlths)==F&is.na(age1)==F)
nkids<-table(e.long$childid)
head(nkids) #Number of people within counties
##
## 0536001C 0536002C 0536003C 0536004C 0536005C 0536006C
## 1 3 4 1 3 2
e.long$childnum<-rep(1:length(unique(e.long$childid)), nkids)
dat<-list(math=as.numeric(scale(e.long$math1, center=T, scale=T)), sex=e.long$male, black=e.long$black, hisp=e.long$hisp, other=e.long$other, asian=e.long$asian,pov=e.long$pov1,lths=e.long$mlths,gths=e.long$mgths,age=e.long$age1, childnum=e.long$childnum,n=length(e.long$math1), nkids=length(unique(e.long$childid)))
lapply(dat , summary)
## $math
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -5.17600 -0.63890 0.05798 0.00000 0.67650 3.68900
##
## $sex
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 1.0000 0.5162 1.0000 1.0000
##
## $black
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2276 0.0000 1.0000
##
## $hisp
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1483 0.0000 1.0000
##
## $other
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.02149 0.00000 1.00000
##
## $asian
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.02532 0.00000 1.00000
##
## $pov
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2281 0.0000 1.0000
##
## $lths
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.147 0.000 1.000
##
## $gths
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 1.0000 0.5262 1.0000 1.0000
##
## $age
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.500 6.761 8.750 8.950 11.250 16.000
##
## $childnum
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 1356 2675 2687 4036 5354
##
## $n
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 19080 19080 19080 19080 19080 19080
##
## $nkids
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5354 5354 5354 5354 5354 5354
#Set some initial values
init.rng1<-list(".RNG.seed" = 1234, ".RNG.name" = "base::Mersenne-Twister", sd=.1, sdu=.1)
init.rng2<-list(".RNG.seed" = 5678, ".RNG.name" = "base::Mersenne-Twister", sd=.5, sdu=.5)
#Initialize the model
mod1<-jags.model(file=textConnection(model1), data=dat, n.chains=2,inits =list(init.rng1, init.rng2) )
## Warning in jags.model(file = textConnection(model1), data = dat, n.chains
## = 2, : Unused variable "sex" in data
## Compiling model graph
## Resolving undeclared variables
## Allocating nodes
## Graph Size: 216412
##
## Initializing model
#burn in
update(mod1, 5000)
#collect samples of the parameters
samps1<-coda.samples(mod1, variable.names=c( "b", "sd", "sdu"), n.iter=5000, n.thin=1)
#Effective sample size for each parameter
effectiveSize(samps1)
## b[1] b[2] b[3] b[4] b[5] b[6] b[7]
## 882.7850 720.6274 597.1324 584.2845 576.8167 590.1810 634.0036
## b[8] b[9] sd sdu
## 1926.7616 7152.3877 3930.8941 3587.5588
#Numerical summary of each parameter:
summary(samps1, quantiles = c(.025, .05, .95, .975))
##
## Iterations = 6001:11000
## Thinning interval = 1
## Number of chains = 2
## Sample size per chain = 5000
##
## 1. Empirical mean and standard deviation for each variable,
## plus standard error of the mean:
##
## Mean SD Naive SE Time-series SE
## b[1] 0.035865 0.024942 2.494e-04 8.412e-04
## b[2] -0.593525 0.028035 2.803e-04 1.044e-03
## b[3] -0.152945 0.032954 3.295e-04 1.348e-03
## b[4] 0.020740 0.071589 7.159e-04 2.964e-03
## b[5] -0.176288 0.078025 7.802e-04 3.245e-03
## b[6] -0.309745 0.035888 3.589e-04 1.477e-03
## b[7] 0.410148 0.024353 2.435e-04 9.680e-04
## b[8] -0.093710 0.017976 1.798e-04 4.098e-04
## b[9] -0.005022 0.001436 1.436e-05 1.698e-05
## sd 0.517576 0.003083 3.083e-05 4.917e-05
## sdu 0.757076 0.008551 8.551e-05 1.431e-04
##
## 2. Quantiles for each variable:
##
## 2.5% 5% 95% 97.5%
## b[1] -0.012567 -0.005553 0.076729 0.084336
## b[2] -0.648325 -0.639522 -0.546979 -0.538484
## b[3] -0.215880 -0.206543 -0.098854 -0.088803
## b[4] -0.115255 -0.094319 0.139368 0.161513
## b[5] -0.325417 -0.303909 -0.046833 -0.023834
## b[6] -0.379075 -0.369218 -0.249675 -0.238929
## b[7] 0.361830 0.369295 0.450098 0.456452
## b[8] -0.128510 -0.122623 -0.063876 -0.057577
## b[9] -0.007829 -0.007365 -0.002661 -0.002171
## sd 0.511537 0.512429 0.522653 0.523574
## sdu 0.740708 0.743258 0.771640 0.774012
#GBR diagnostic
gelman.diag(samps1, multivariate = F)
## Potential scale reduction factors:
##
## Point est. Upper C.I.
## b[1] 1.00 1.00
## b[2] 1.01 1.03
## b[3] 1.00 1.01
## b[4] 1.00 1.00
## b[5] 1.01 1.03
## b[6] 1.00 1.00
## b[7] 1.00 1.00
## b[8] 1.00 1.00
## b[9] 1.00 1.00
## sd 1.00 1.00
## sdu 1.00 1.00
#DIC
dic.samples(mod1, n.iter = 1000,type = "pD")
## Mean deviance: 29014
## penalty 4643
## Penalized deviance: 33657
The second model is for a child-specific linear trajectory (random slope) and mean (random intercept)
model2<-"
model{
#Likelihood
for( i in 1:n)
{
math[i]~dnorm(mu[i], tau)
mu[i]<-b[1]*black[i]+b[2]*hisp[i]+b[3]*asian[i]+b[4]*other[i]+b[5]*lths[i]+b[6]*gths[i]+b[7]*pov[i]+u[childnum[i],1]*age[i]+u[childnum[i],2]
}
#priors
#MVN Prior for random intercepts and slopes
for (j in 1:nkids)
{
u[j, 1:2] ~ dmnorm(meanu[], prec.Sigma[,])
}
meanu[1]~dnorm(0, .001)
meanu[2]~dnorm(0, .001)
prec.Sigma[1:2, 1:2] ~ dwish(Omega[,], 2)
Sigma[1:2, 1:2]<-inverse(prec.Sigma[,])
rho12<-Sigma[1,2]/ sqrt(Sigma[1,1]* Sigma[2,2])
#Set some initial values for the covariance matrix
for (j in 1:2){ for (k in 1:2){ Omega[j,k] <-equals(j,k)*.1 } }
#regression effects, MVN prior
b[1:7]~dmnorm(meanb[], prec.beta[,])
for(j in 1:7){
meanb[j]~dnorm(0, .0001)
}
prec.beta[1:7,1:7]~dwish(Obeta[,], 7)
for(j in 1:7){ for(k in 1:7){ Obeta[j,k] <- equals(j,k)*.1 } }
#prior on residual precision
tau<-pow(sd, -2)
sd~dunif(0,100)
}
"
#Set some initial values
init.rng1<-list(".RNG.seed" = 1234, ".RNG.name" = "base::Mersenne-Twister", sd=.1)
init.rng2<-list(".RNG.seed" = 5678, ".RNG.name" = "base::Mersenne-Twister", sd=.5)
#initialize the model
mod2<-jags.model(file=textConnection(model2), data=dat, n.chains=2,inits =list(init.rng1, init.rng2) )
## Warning in jags.model(file = textConnection(model2), data = dat, n.chains
## = 2, : Unused variable "sex" in data
## Compiling model graph
## Resolving undeclared variables
## Allocating nodes
## Graph Size: 245122
##
## Initializing model
#burn in
update(mod2, 5000)
#collect samples of the parameters
samps2<-coda.samples(mod2, variable.names=c( "b", "sd", "Sigma", "rho12"), n.iter=5000, n.thin=1)
#effective sample size for each parameter
effectiveSize(samps2)
## Sigma[1,1] Sigma[2,1] Sigma[1,2] Sigma[2,2] b[1] b[2]
## 689.7690 862.1431 862.1431 1188.0520 472.0621 405.7380
## b[3] b[4] b[5] b[6] b[7] rho12
## 496.4664 462.1116 374.9937 213.7298 1260.9138 1114.6249
## sd
## 2064.9716
#Numerical summary of each parameter:
summary(samps2, quantiles = c(.025, .05, .95, .975))
##
## Iterations = 6001:11000
## Thinning interval = 1
## Number of chains = 2
## Sample size per chain = 5000
##
## 1. Empirical mean and standard deviation for each variable,
## plus standard error of the mean:
##
## Mean SD Naive SE Time-series SE
## Sigma[1,1] 0.00579 0.0003092 3.092e-06 1.176e-05
## Sigma[2,1] -0.05194 0.0031380 3.138e-05 1.066e-04
## Sigma[1,2] -0.05194 0.0031380 3.138e-05 1.066e-04
## Sigma[2,2] 1.04687 0.0377018 3.770e-04 1.101e-03
## b[1] -0.59219 0.0279609 2.796e-04 1.285e-03
## b[2] -0.15372 0.0337498 3.375e-04 1.687e-03
## b[3] 0.03416 0.0705059 7.051e-04 3.199e-03
## b[4] -0.16535 0.0776084 7.761e-04 3.616e-03
## b[5] -0.30248 0.0367289 3.673e-04 1.898e-03
## b[6] 0.40717 0.0248319 2.483e-04 1.726e-03
## b[7] -0.10730 0.0189664 1.897e-04 5.345e-04
## rho12 -0.66673 0.0154052 1.541e-04 4.633e-04
## sd 0.46588 0.0032848 3.285e-05 7.321e-05
##
## 2. Quantiles for each variable:
##
## 2.5% 5% 95% 97.5%
## Sigma[1,1] 0.005212 0.005299 0.006318 0.006424
## Sigma[2,1] -0.058309 -0.057170 -0.046899 -0.045954
## Sigma[1,2] -0.058309 -0.057170 -0.046899 -0.045954
## Sigma[2,2] 0.974739 0.985370 1.109618 1.122984
## b[1] -0.646077 -0.638113 -0.546435 -0.537822
## b[2] -0.219594 -0.208227 -0.097651 -0.086651
## b[3] -0.106093 -0.083803 0.148285 0.170299
## b[4] -0.315406 -0.291578 -0.037502 -0.012409
## b[5] -0.373772 -0.363276 -0.242731 -0.230909
## b[6] 0.359955 0.366694 0.450266 0.458566
## b[7] -0.144226 -0.138809 -0.076306 -0.070219
## rho12 -0.696095 -0.691641 -0.640746 -0.635212
## sd 0.459525 0.460565 0.471271 0.472399
#GBR
gelman.diag(samps2, multivariate = F)
## Potential scale reduction factors:
##
## Point est. Upper C.I.
## Sigma[1,1] 1.01 1.06
## Sigma[2,1] 1.01 1.04
## Sigma[1,2] 1.01 1.04
## Sigma[2,2] 1.01 1.02
## b[1] 1.01 1.04
## b[2] 1.00 1.00
## b[3] 1.00 1.01
## b[4] 1.00 1.01
## b[5] 1.00 1.02
## b[6] 1.01 1.04
## b[7] 1.00 1.01
## rho12 1.01 1.03
## sd 1.00 1.01
dic.samples(mod2, n.iter = 1000,type = "pD")
## Mean deviance: 25006
## penalty 6457
## Penalized deviance: 31463