The outcome of interest here is the child’s height for age, which is a continuous outcome.

Loading packages

library(haven)
library (car)
library(lmtest)
library(arm)
library(lme4)

Loading data:

eclsk11<-read_sas("P:/dem7473/data/childk4p.sas7bdat")
names(eclsk11)<-tolower(names(eclsk11))

#getting out the only the variables we need
myvars<-c("x_chsex_r", "x4height",  "x4age", "x1locale", "x_raceth_r", "x2povty", "x12par1ed_i","p1curmar", "x1htotal", "x1mscalk4",  "childid", "s1_id", "p2safepl","x2krceth", "x4sesl_i", "x_distpov")

#subsetting the data
eclsk.sub<-eclsk11[,myvars]
rm(eclsk11); gc()

##           used (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells 1845252 98.6    3591291  191.8   3591291  191.8
## Vcells 3525282 26.9  681442396 5199.0 783630191 5978.7

Recoding variables

#Child's height for age, continuous outcome
eclsk.sub$height4<-ifelse(eclsk.sub$x4height<=-7, NA, eclsk.sub$x4height)

#Child characteristics
#Child's sex: recode as male =1
eclsk.sub$male<-Recode(eclsk.sub$x_chsex_r, recodes="1=1; 2=0; -9=NA")

#Age
eclsk.sub$age_yrs4<-ifelse(eclsk.sub$x4age<0, NA, eclsk.sub$x4age/12)

eclsk.sub<- eclsk.sub[is.na(eclsk.sub$age_yrs4)==F, ]

#Height for age z score standardized by sex and age
eclsk.sub$height_z4<-ave(eclsk.sub$height4, as.factor(paste(round(eclsk.sub$age_yrs4, 1.5), eclsk.sub$male)), FUN=scale)

#SES
eclsk.sub$hhses4<-ifelse(eclsk.sub$x4sesl_i==-9, NA, scale(eclsk.sub$x4sesl_i))

#Recoded race with white, non Hispanic as reference using dummy vars
eclsk.sub$hisp<-recode (eclsk.sub$x_raceth_r, recodes="3:4=1;-9=NA; else=0")
eclsk.sub$black<-recode (eclsk.sub$x_raceth_r, recodes="2=1;-9=NA; else=0")
eclsk.sub$asian<-recode (eclsk.sub$x_raceth_r, recodes="5=1;-9=NA; else=0")
eclsk.sub$nahn<-recode (eclsk.sub$x_raceth_r, recodes="6:7=1;-9=NA; else=0")
eclsk.sub$other<-recode (eclsk.sub$x_raceth_r, recodes="8=1;-9=NA; else=0")

#Parent/mother characteristics
#Mother's education, recode as 2 dummys with HS = reference
eclsk.sub$lths<-recode(eclsk.sub$x12par1ed_i, recodes = "0:2=1; 3:8=0; else = NA")
eclsk.sub$gths<-recode(eclsk.sub$x12par1ed_i, recodes = "1:3=0; 4:8=1; else =NA") 

#marital status, recode as 2 dummys, ref= married
eclsk.sub$single<-recode(eclsk.sub$p1curmar, recodes="4=1; -7:-9=NA; else=0")
eclsk.sub$notmar<-recode(eclsk.sub$p1curmar, recodes="2:3=1; -7:-9=NA; else=0")

#Household level variables
#Urban school location = 1
eclsk.sub$urban<-Recode(eclsk.sub$x1locale, recodes = "1:3=1; 4=0; -1:-9=NA")

#poverty level in poverty = 1
eclsk.sub$pov<-Recode(eclsk.sub$x2povty , recodes ="1:2=1; 3=0; -9=NA")

#Household size
eclsk.sub$hhsize<-eclsk.sub$x1htotal

#school % minority student body
eclsk.sub$minorsch<-ifelse(eclsk.sub$x2krceth <0, NA, eclsk.sub$x2krceth/10)

#Unsafe neighborhood
eclsk.sub$unsafe<-Recode(eclsk.sub$p2safepl , recodes = "1:2='unsafe'; 3='safe'; else=NA", as.factor = T)

#school district poverty
eclsk.sub$dist_pov<-ifelse(eclsk.sub$x_distpov==-9, NA, scale(eclsk.sub$x_distpov))

Showing the first few lines of the data

head(eclsk.sub)

## # A tibble: 6 x 36
##   x_chsex_r x4height x4age x1locale x_raceth_r x2povty x12par1ed_i
##       <dbl>    <dbl> <dbl>    <dbl>      <dbl>   <dbl>       <dbl>
## 1         1     48    91.7        4          1       2           5
## 2         2     49.5  89.6        2          1       3           5
## 3         1     49    84.3       NA          3       3           8
## 4         1     50    89.2        4          1       3           5
## 5         2     49.8  94.8        4          1       2           3
## 6         2     48    81.0        3          1       1           4
## # ... with 29 more variables: p1curmar <dbl>, x1htotal <dbl>,
## #   x1mscalk4 <dbl>, childid <chr>, s1_id <chr>, p2safepl <dbl>,
## #   x2krceth <dbl>, x4sesl_i <dbl>, x_distpov <dbl>, height4 <dbl>,
## #   male <dbl>, age_yrs4 <dbl>, height_z4 <dbl>, hhses4 <dbl>, hisp <dbl>,
## #   black <dbl>, asian <dbl>, nahn <dbl>, other <dbl>, lths <dbl>,
## #   gths <dbl>, single <dbl>, notmar <dbl>, urban <dbl>, pov <dbl>,
## #   hhsize <dbl>, minorsch <dbl>, unsafe <fct>, dist_pov <dbl>

Checking for variation the outcome

Global F test for equality of means

fit0<-lm(height4~factor(s1_id), data=eclsk.sub)
anova(fit0)

## Analysis of Variance Table
## 
## Response: height4
##                  Df Sum Sq Mean Sq F value    Pr(>F)    
## factor(s1_id)   848   7125  8.4016   1.601 < 2.2e-16 ***
## Residuals     14235  74702  5.2478                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The means are not equal. However, the variation is only slight which is expected as the height of the kids will not vary much.

If we look at the height standardized by sex and age

fit01<-lm(height_z4~factor(s1_id), data=eclsk.sub)
anova(fit01)

## Analysis of Variance Table
## 
## Response: height_z4
##                  Df  Sum Sq Mean Sq F value    Pr(>F)    
## factor(s1_id)   847  1103.3 1.30255  1.3658 3.016e-11 ***
## Residuals     14157 13501.7 0.95372                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The means are not equal, but the variation decreases even more.

Fitting the random intercept model

“null” model fit with no predictors for height

fit1<-lmer(height4~1+(1|s1_id), data = eclsk.sub, REML = T, subset=complete.cases(eclsk.sub))
summary(fit1)

## Linear mixed model fit by REML ['lmerMod']
## Formula: height4 ~ 1 + (1 | s1_id)
##    Data: eclsk.sub
##  Subset: complete.cases(eclsk.sub)
## 
## REML criterion at convergence: 36792.5
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.59892 -0.67080 -0.01918  0.65481  2.58777 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  s1_id    (Intercept) 0.217    0.4658  
##  Residual             5.060    2.2495  
## Number of obs: 8186, groups:  s1_id, 818
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept) 48.46548    0.03038    1596

The variance component is 0.217 with a gigantic t-value which is the proof of statistical significance.

The following model will consider the individual and family level variables

fit2<-lmer(height4~male+age_yrs4+hisp+black+asian+nahn+other+lths+gths+hhses4+single+notmar+urban+pov+hhsize+minorsch+unsafe+dist_pov+(1|s1_id), data=eclsk.sub, REML=T, subset=complete.cases(eclsk.sub))

summary(fit2)

## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## height4 ~ male + age_yrs4 + hisp + black + asian + nahn + other +  
##     lths + gths + hhses4 + single + notmar + urban + pov + hhsize +  
##     minorsch + unsafe + dist_pov + (1 | s1_id)
##    Data: eclsk.sub
##  Subset: complete.cases(eclsk.sub)
## 
## REML criterion at convergence: 35638.6
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.3471 -0.6764 -0.0181  0.6465  3.3847 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  s1_id    (Intercept) 0.09105  0.3018  
##  Residual             4.43843  2.1068  
## Number of obs: 8186, groups:  s1_id, 818
## 
## Fixed effects:
##              Estimate Std. Error t value
## (Intercept)  33.79916    0.49945  67.673
## male          0.34214    0.04725   7.241
## age_yrs4      2.07107    0.06794  30.483
## hisp         -0.08291    0.07806  -1.062
## black         0.94364    0.09839   9.591
## asian        -0.66202    0.10453  -6.333
## nahn          0.27744    0.23162   1.198
## other         0.14836    0.11010   1.348
## lths          0.04789    0.09576   0.500
## gths          0.08851    0.07276   1.216
## hhses4        0.06789    0.03956   1.716
## single       -0.05673    0.08256  -0.687
## notmar       -0.03312    0.08357  -0.396
## urban         0.02533    0.02064   1.227
## pov          -0.05395    0.06684  -0.807
## hhsize       -0.06577    0.01918  -3.430
## minorsch     -0.01104    0.01109  -0.995
## unsafeunsafe -0.01491    0.05821  -0.256
## dist_pov     -0.01576    0.03346  -0.471

## 
## Correlation matrix not shown by default, as p = 19 > 12.
## Use print(x, correlation=TRUE)  or
##     vcov(x)        if you need it

The variance component of 0.09105 went down when additional variables are included in the model.

Variance Components and ICC

VarCorr(fit1)$s1_id[1]/((attr(VarCorr(fit1), "sc")^2)+VarCorr(fit1)$s1_id[1])

## [1] 0.04112083

library(sjstats)
icc(fit1)

## 
## Linear mixed model
## 
## Family : gaussian (identity)
## Formula: height4 ~ 1 + (1 | s1_id)
## 
##   ICC (s1_id): 0.0411

For model 2

VarCorr(fit2)$s1_id[1]/((attr(VarCorr(fit2), "sc")^2)+VarCorr(fit2)$s1_id[1])

## [1] 0.02010265

icc(fit2)

## 
## Linear mixed model
## 
## Family : gaussian (identity)
## Formula: height4 ~ male + age_yrs4 + hisp + black + asian + nahn + other + lths + gths + hhses4 + single + notmar + urban + pov + hhsize + minorsch + unsafe + dist_pov + (1 | s1_id)
## 
##   ICC (s1_id): 0.0201

The ICC in the second model goes down to 0.0201 from 0.0411 in the first model. It means that we have reduced the variability children’s height by controlling for additional characteristics.

Compare these two fits using a Likelihood ratio test

anova(fit1, fit2)

## refitting model(s) with ML (instead of REML)

## Data: eclsk.sub
## Subset: complete.cases(eclsk.sub)
## Models:
## fit1: height4 ~ 1 + (1 | s1_id)
## fit2: height4 ~ male + age_yrs4 + hisp + black + asian + nahn + other + 
## fit2:     lths + gths + hhses4 + single + notmar + urban + pov + hhsize + 
## fit2:     minorsch + unsafe + dist_pov + (1 | s1_id)
##      Df   AIC   BIC logLik deviance  Chisq Chi Df Pr(>Chisq)    
## fit1  3 36793 36814 -18394    36787                             
## fit2 21 35604 35751 -17781    35562 1225.5     18  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The second model is a better fit than the first.

DEM 7473 HW 1: Basic Hierarchical Models - Random Intercepts

Muntasir Masum

September 17, 2018