The outcome of interest here is the child’s height for age, which is a continuous outcome.
library(haven)
library (car)
library(lmtest)
library(arm)
library(lme4)
eclsk11<-read_sas("P:/dem7473/data/childk4p.sas7bdat")
names(eclsk11)<-tolower(names(eclsk11))
#getting out the only the variables we need
myvars<-c("x_chsex_r", "x4height", "x4age", "x1locale", "x_raceth_r", "x2povty", "x12par1ed_i","p1curmar", "x1htotal", "x1mscalk4", "childid", "s1_id", "p2safepl","x2krceth", "x4sesl_i", "x_distpov")
#subsetting the data
eclsk.sub<-eclsk11[,myvars]
rm(eclsk11); gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 1845252 98.6 3591291 191.8 3591291 191.8
## Vcells 3525282 26.9 681442396 5199.0 783630191 5978.7
#Child's height for age, continuous outcome
eclsk.sub$height4<-ifelse(eclsk.sub$x4height<=-7, NA, eclsk.sub$x4height)
#Child characteristics
#Child's sex: recode as male =1
eclsk.sub$male<-Recode(eclsk.sub$x_chsex_r, recodes="1=1; 2=0; -9=NA")
#Age
eclsk.sub$age_yrs4<-ifelse(eclsk.sub$x4age<0, NA, eclsk.sub$x4age/12)
eclsk.sub<- eclsk.sub[is.na(eclsk.sub$age_yrs4)==F, ]
#Height for age z score standardized by sex and age
eclsk.sub$height_z4<-ave(eclsk.sub$height4, as.factor(paste(round(eclsk.sub$age_yrs4, 1.5), eclsk.sub$male)), FUN=scale)
#SES
eclsk.sub$hhses4<-ifelse(eclsk.sub$x4sesl_i==-9, NA, scale(eclsk.sub$x4sesl_i))
#Recoded race with white, non Hispanic as reference using dummy vars
eclsk.sub$hisp<-recode (eclsk.sub$x_raceth_r, recodes="3:4=1;-9=NA; else=0")
eclsk.sub$black<-recode (eclsk.sub$x_raceth_r, recodes="2=1;-9=NA; else=0")
eclsk.sub$asian<-recode (eclsk.sub$x_raceth_r, recodes="5=1;-9=NA; else=0")
eclsk.sub$nahn<-recode (eclsk.sub$x_raceth_r, recodes="6:7=1;-9=NA; else=0")
eclsk.sub$other<-recode (eclsk.sub$x_raceth_r, recodes="8=1;-9=NA; else=0")
#Parent/mother characteristics
#Mother's education, recode as 2 dummys with HS = reference
eclsk.sub$lths<-recode(eclsk.sub$x12par1ed_i, recodes = "0:2=1; 3:8=0; else = NA")
eclsk.sub$gths<-recode(eclsk.sub$x12par1ed_i, recodes = "1:3=0; 4:8=1; else =NA")
#marital status, recode as 2 dummys, ref= married
eclsk.sub$single<-recode(eclsk.sub$p1curmar, recodes="4=1; -7:-9=NA; else=0")
eclsk.sub$notmar<-recode(eclsk.sub$p1curmar, recodes="2:3=1; -7:-9=NA; else=0")
#Household level variables
#Urban school location = 1
eclsk.sub$urban<-Recode(eclsk.sub$x1locale, recodes = "1:3=1; 4=0; -1:-9=NA")
#poverty level in poverty = 1
eclsk.sub$pov<-Recode(eclsk.sub$x2povty , recodes ="1:2=1; 3=0; -9=NA")
#Household size
eclsk.sub$hhsize<-eclsk.sub$x1htotal
#school % minority student body
eclsk.sub$minorsch<-ifelse(eclsk.sub$x2krceth <0, NA, eclsk.sub$x2krceth/10)
#Unsafe neighborhood
eclsk.sub$unsafe<-Recode(eclsk.sub$p2safepl , recodes = "1:2='unsafe'; 3='safe'; else=NA", as.factor = T)
#school district poverty
eclsk.sub$dist_pov<-ifelse(eclsk.sub$x_distpov==-9, NA, scale(eclsk.sub$x_distpov))
Showing the first few lines of the data
head(eclsk.sub)
## # A tibble: 6 x 36
## x_chsex_r x4height x4age x1locale x_raceth_r x2povty x12par1ed_i
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 48 91.7 4 1 2 5
## 2 2 49.5 89.6 2 1 3 5
## 3 1 49 84.3 NA 3 3 8
## 4 1 50 89.2 4 1 3 5
## 5 2 49.8 94.8 4 1 2 3
## 6 2 48 81.0 3 1 1 4
## # ... with 29 more variables: p1curmar <dbl>, x1htotal <dbl>,
## # x1mscalk4 <dbl>, childid <chr>, s1_id <chr>, p2safepl <dbl>,
## # x2krceth <dbl>, x4sesl_i <dbl>, x_distpov <dbl>, height4 <dbl>,
## # male <dbl>, age_yrs4 <dbl>, height_z4 <dbl>, hhses4 <dbl>, hisp <dbl>,
## # black <dbl>, asian <dbl>, nahn <dbl>, other <dbl>, lths <dbl>,
## # gths <dbl>, single <dbl>, notmar <dbl>, urban <dbl>, pov <dbl>,
## # hhsize <dbl>, minorsch <dbl>, unsafe <fct>, dist_pov <dbl>
fit0<-lm(height4~factor(s1_id), data=eclsk.sub)
anova(fit0)
## Analysis of Variance Table
##
## Response: height4
## Df Sum Sq Mean Sq F value Pr(>F)
## factor(s1_id) 848 7125 8.4016 1.601 < 2.2e-16 ***
## Residuals 14235 74702 5.2478
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fit01<-lm(height_z4~factor(s1_id), data=eclsk.sub)
anova(fit01)
## Analysis of Variance Table
##
## Response: height_z4
## Df Sum Sq Mean Sq F value Pr(>F)
## factor(s1_id) 847 1103.3 1.30255 1.3658 3.016e-11 ***
## Residuals 14157 13501.7 0.95372
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The means are not equal, but the variation decreases even more.
fit1<-lmer(height4~1+(1|s1_id), data = eclsk.sub, REML = T, subset=complete.cases(eclsk.sub))
summary(fit1)
## Linear mixed model fit by REML ['lmerMod']
## Formula: height4 ~ 1 + (1 | s1_id)
## Data: eclsk.sub
## Subset: complete.cases(eclsk.sub)
##
## REML criterion at convergence: 36792.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.59892 -0.67080 -0.01918 0.65481 2.58777
##
## Random effects:
## Groups Name Variance Std.Dev.
## s1_id (Intercept) 0.217 0.4658
## Residual 5.060 2.2495
## Number of obs: 8186, groups: s1_id, 818
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 48.46548 0.03038 1596
The following model will consider the individual and family level variables
fit2<-lmer(height4~male+age_yrs4+hisp+black+asian+nahn+other+lths+gths+hhses4+single+notmar+urban+pov+hhsize+minorsch+unsafe+dist_pov+(1|s1_id), data=eclsk.sub, REML=T, subset=complete.cases(eclsk.sub))
summary(fit2)
## Linear mixed model fit by REML ['lmerMod']
## Formula:
## height4 ~ male + age_yrs4 + hisp + black + asian + nahn + other +
## lths + gths + hhses4 + single + notmar + urban + pov + hhsize +
## minorsch + unsafe + dist_pov + (1 | s1_id)
## Data: eclsk.sub
## Subset: complete.cases(eclsk.sub)
##
## REML criterion at convergence: 35638.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.3471 -0.6764 -0.0181 0.6465 3.3847
##
## Random effects:
## Groups Name Variance Std.Dev.
## s1_id (Intercept) 0.09105 0.3018
## Residual 4.43843 2.1068
## Number of obs: 8186, groups: s1_id, 818
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 33.79916 0.49945 67.673
## male 0.34214 0.04725 7.241
## age_yrs4 2.07107 0.06794 30.483
## hisp -0.08291 0.07806 -1.062
## black 0.94364 0.09839 9.591
## asian -0.66202 0.10453 -6.333
## nahn 0.27744 0.23162 1.198
## other 0.14836 0.11010 1.348
## lths 0.04789 0.09576 0.500
## gths 0.08851 0.07276 1.216
## hhses4 0.06789 0.03956 1.716
## single -0.05673 0.08256 -0.687
## notmar -0.03312 0.08357 -0.396
## urban 0.02533 0.02064 1.227
## pov -0.05395 0.06684 -0.807
## hhsize -0.06577 0.01918 -3.430
## minorsch -0.01104 0.01109 -0.995
## unsafeunsafe -0.01491 0.05821 -0.256
## dist_pov -0.01576 0.03346 -0.471
##
## Correlation matrix not shown by default, as p = 19 > 12.
## Use print(x, correlation=TRUE) or
## vcov(x) if you need it
The variance component of 0.09105 went down when additional variables are included in the model.
VarCorr(fit1)$s1_id[1]/((attr(VarCorr(fit1), "sc")^2)+VarCorr(fit1)$s1_id[1])
## [1] 0.04112083
library(sjstats)
icc(fit1)
##
## Linear mixed model
##
## Family : gaussian (identity)
## Formula: height4 ~ 1 + (1 | s1_id)
##
## ICC (s1_id): 0.0411
For model 2
VarCorr(fit2)$s1_id[1]/((attr(VarCorr(fit2), "sc")^2)+VarCorr(fit2)$s1_id[1])
## [1] 0.02010265
icc(fit2)
##
## Linear mixed model
##
## Family : gaussian (identity)
## Formula: height4 ~ male + age_yrs4 + hisp + black + asian + nahn + other + lths + gths + hhses4 + single + notmar + urban + pov + hhsize + minorsch + unsafe + dist_pov + (1 | s1_id)
##
## ICC (s1_id): 0.0201
anova(fit1, fit2)
## refitting model(s) with ML (instead of REML)
## Data: eclsk.sub
## Subset: complete.cases(eclsk.sub)
## Models:
## fit1: height4 ~ 1 + (1 | s1_id)
## fit2: height4 ~ male + age_yrs4 + hisp + black + asian + nahn + other +
## fit2: lths + gths + hhses4 + single + notmar + urban + pov + hhsize +
## fit2: minorsch + unsafe + dist_pov + (1 | s1_id)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## fit1 3 36793 36814 -18394 36787
## fit2 21 35604 35751 -17781 35562 1225.5 18 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1