Week 3: Basic Hierarchical Models - Random Intercepts and Slopes

First we load our data

load("~/Google Drive/dem7903_App_Hier/data/eclsk.Rdata")
names(eclsk)<-tolower(names(eclsk))
library (car)
library(lme4)

## Loading required package: Matrix
## Loading required package: Rcpp

library(arm)

## Loading required package: MASS
## 
## arm (Version 1.7-05, built: 2014-8-1)
## 
## Working directory is /Users/ozd504/Google Drive/dem7903_App_Hier/code/code14
## 
## 
## Attaching package: 'arm'
## 
## The following object is masked from 'package:car':
## 
##     logit

#get out only the variables I'm going to use for this example

myvars<-c("gender", "kurban_r", "race", "w1povrty", "wkmomed","p2homecm", "p2cover", "p2curmar", "p2sprhhm", "c2r4mtsc", "p2dentis", "s2_id", "s2kpupri","s2kminor" )
#subset the data
eclsk.sub<-eclsk[,myvars]
rm(eclsk); gc()

##           used (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells 1290867 69.0    1967602  105.1   1967602  105.1
## Vcells 2363459 18.1  489978910 3738.3 408904644 3119.7

Next, I do some recoding of variables using a mixture of the ifelse() function and the recode () function.

#recode our outcomes, the  first is the child's math standardized test score  in Kindergarten
eclsk.sub$math<-ifelse(eclsk.sub$c2r4mtsc<0, NA, eclsk.sub$c2r4mtsc)

#the second outcome is whether each child has seen a dentist within the last year
eclsk.sub$dentist<-recode(eclsk.sub$p2dentis, recodes = "2:3=1; -1:-9= NA; else=0")

#First we recode some Child characteristics
#Child's sex: recode as male =1
eclsk.sub$male<-recode(eclsk.sub$gender, recodes="1=1; 2=0; -9=NA")

#Recode race with white, non Hispanic as reference using dummy vars
eclsk.sub$hisp<-recode (eclsk.sub$race, recodes="3:4=1;-9=NA; else=0")
eclsk.sub$black<-recode (eclsk.sub$race, recodes="2=1;-9=NA; else=0")
eclsk.sub$asian<-recode (eclsk.sub$race, recodes="5=1;-9=NA; else=0")
eclsk.sub$nahn<-recode (eclsk.sub$race, recodes="6:7=1;-9=NA; else=0")
eclsk.sub$other<-recode (eclsk.sub$race, recodes="8=1;-9=NA; else=0")

#insurance coverage
eclsk.sub$covered<-recode(eclsk.sub$p2cover, recodes="1=1; -1:-9=NA; else=0")

#Then we recode some parent/mother characteristics
#Mother's education, recode as 2 dummys with HS = reference
eclsk.sub$mlths<-recode(eclsk.sub$wkmomed, recodes = "1:2=1; 3:9=0; else = NA")
eclsk.sub$mgths<-recode(eclsk.sub$wkmomed, recodes = "1:3=0; 4:9=1; else =NA") 

#marital status, recode as 2 dummys, ref= married
eclsk.sub$single<-recode(eclsk.sub$p2curmar, recodes="5=1; -7:-9=NA; else=0")
eclsk.sub$notmar<-recode(eclsk.sub$p2curmar, recodes="2:4=1; -7:-9=NA; else=0")


#Then we do some household level variables
#Urban residence = 1
eclsk.sub$urban<-recode(eclsk.sub$kurban_r, recodes = "1:2=1; 3=0")

#poverty level in poverty = 1
eclsk.sub$pov<-recode(eclsk.sub$w1povrty , recodes ="1=1; 2=0")

#Household size
eclsk.sub$hhsize<-eclsk.sub$p2sprhhm

#school is private
eclsk.sub$privsch<-ifelse(eclsk.sub$s2kpupri==1, 1, 0)

#school has >50% minority student body
eclsk.sub$minorsch<-recode(eclsk.sub$s2kminor, recodes="1:3=0; 4:5=1; else=NA" )

#Show the first few lines of the data
head(eclsk.sub)

##   gender kurban_r race w1povrty wkmomed p2homecm p2cover p2curmar p2sprhhm
## 1      2        1    1        2       6        1       1        1        4
## 2      2        1    1        2       3        1       1        1        4
## 3      2        1    1        2       9        1       1        1        5
## 4      1        1    1        2       6        1       1        1        4
## 5      2        1    1        2       6        1       1        1        4
## 6      1        1    1        2       8        2       1        1        4
##   c2r4mtsc p2dentis s2_id s2kpupri s2kminor  math dentist male hisp black
## 1    62.24        2  0001        1        1 62.24       1    0    0     0
## 2    67.12        3  0001        1        1 67.12       1    0    0     0
## 3    49.10        2  0001        1        1 49.10       1    0    0     0
## 4    50.50        2  0001        1        1 50.50       1    1    0     0
## 5    54.24        2  0001        1        1 54.24       1    0    0     0
## 6    51.25        2  0001        1        1 51.25       1    1    0     0
##   asian nahn other covered mlths mgths single notmar urban pov hhsize
## 1     0    0     0       1     0     1      0      0     1   0      4
## 2     0    0     0       1     0     0      0      0     1   0      4
## 3     0    0     0       1     0     1      0      0     1   0      5
## 4     0    0     0       1     0     1      0      0     1   0      4
## 5     0    0     0       1     0     1      0      0     1   0      4
## 6     0    0     0       1     0     1      0      0     1   0      4
##   privsch minorsch
## 1       1        0
## 2       1        0
## 3       1        0
## 4       1        0
## 5       1        0
## 6       1        0

The First model will fit a model that considers the individual and family level variables and a random intercept only

fit1<-lmer(math~male+hisp+black+asian+nahn+other+mlths+mgths+single+notmar+urban+pov+hhsize+(1|s2_id), data=eclsk.sub, REML=T)

display(fit1)

## lmer(formula = math ~ male + hisp + black + asian + nahn + other + 
##     mlths + mgths + single + notmar + urban + pov + hhsize + 
##     (1 | s2_id), data = eclsk.sub, REML = T)
##             coef.est coef.se
## (Intercept) 52.46     0.40  
## male         0.23     0.14  
## hisp        -3.53     0.24  
## black       -3.95     0.28  
## asian        0.65     0.36  
## nahn        -2.89     0.53  
## other       -1.93     0.45  
## mlths       -2.36     0.26  
## mgths        2.64     0.17  
## single      -1.72     0.25  
## notmar      -1.23     0.22  
## urban        0.88     0.31  
## pov         -2.41     0.22  
## hhsize      -0.29     0.06  
## 
## Error terms:
##  Groups   Name        Std.Dev.
##  s2_id    (Intercept) 3.18    
##  Residual             8.26    
## ---
## number of obs: 14630, groups: s2_id, 1127
## AIC = 104465, DIC = 104400
## deviance = 104416.7

The second model considers both random intercepts and random slopes, in this case, i’m only considering a random slope for poverty status

fit2<-lmer(math~male+hisp+black+asian+nahn+other+mlths+mgths+single+notmar+urban+pov+hhsize+(1+pov|s2_id), data = eclsk.sub, REML = T)
display(fit2)

## lmer(formula = math ~ male + hisp + black + asian + nahn + other + 
##     mlths + mgths + single + notmar + urban + pov + hhsize + 
##     (1 + pov | s2_id), data = eclsk.sub, REML = T)
##             coef.est coef.se
## (Intercept) 52.47     0.40  
## male         0.23     0.14  
## hisp        -3.54     0.24  
## black       -3.92     0.28  
## asian        0.66     0.36  
## nahn        -2.95     0.53  
## other       -1.94     0.45  
## mlths       -2.37     0.26  
## mgths        2.63     0.17  
## single      -1.71     0.25  
## notmar      -1.22     0.22  
## urban        0.86     0.31  
## pov         -2.52     0.22  
## hhsize      -0.29     0.06  
## 
## Error terms:
##  Groups   Name        Std.Dev. Corr  
##  s2_id    (Intercept) 3.25           
##           pov         0.78     -0.60 
##  Residual             8.26           
## ---
## number of obs: 14630, groups: s2_id, 1127
## AIC = 104466, DIC = 104397
## deviance = 104413.6

anova (fit1, fit2)

## refitting model(s) with ML (instead of REML)

## Data: eclsk.sub
## Models:
## fit1: math ~ male + hisp + black + asian + nahn + other + mlths + mgths + 
## fit1:     single + notmar + urban + pov + hhsize + (1 | s2_id)
## fit2: math ~ male + hisp + black + asian + nahn + other + mlths + mgths + 
## fit2:     single + notmar + urban + pov + hhsize + (1 + pov | s2_id)
##      Df    AIC    BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## fit1 16 104449 104570 -52208   104417                        
## fit2 18 104450 104586 -52207   104414  3.05      2       0.22

In this case, fitting the random slope for poverty isn’t adding anything to the model I interpret this as kids living in housholds below the poverty line face a systematic disadvantage, regardless of school

Now, let’s plot the regression lines for each school!

#get the random effects
rancoefs2<-ranef(fit2)

#here are the first 10 schools random effects
head(rancoefs2$s2_id, n=10)

##      (Intercept)       pov
## 0001     0.46266 -0.067041
## 0002    -0.96343  0.277943
## 0005    -1.99301  0.202019
## 0006    -0.82630  0.103789
## 0007    -0.00813  0.001178
## 0009    -3.92873  0.489163
## 0010    -4.96850  0.746682
## 0011    -2.75480  0.399181
## 0013     5.60871 -0.812722
## 0014    -0.99225  0.143781

#Histograms 
par(mfrow=c(1,2))
hist(rancoefs2$s2_id[,"(Intercept)"], main = "Distribution of Random Intercepts")
hist(rancoefs2$s2_id[,"pov"], main = "Distribution of Random Slopes")

plot of chunk unnamed-chunk-6

#Plot of random slopes, I iterate over all schools, drawing lines
par(mfrow=c(1,1))
plot(NULL, ylim=c(35, 65), xlim=c(0,1),ylab="Math Score", xlab="HH Poverty Status")
title (main="Random Slope and Intercept Model")
cols=sample(rainbow(n=50), size = dim(rancoefs2$s2_id)[1], replace = T)
for (i in 1: dim(rancoefs2$s2_id)[1]){
  
  abline(a=fixef(fit2)[1]+rancoefs2$s2_id[[1]][i], b=fixef(fit2)[13]+rancoefs2$s2_id[[2]][i], col=cols[i],lwd=.5 )
}

Here is the ICC from fit 2

VarCorr(fit2)

##  Groups   Name        Std.Dev. Corr 
##  s2_id    (Intercept) 3.251         
##           pov         0.785    -0.60
##  Residual             8.260

VarCorr(fit2)$s2_id[1]/((attr(VarCorr(fit1), "sc")^2)+VarCorr(fit2)$s2_id[1]+VarCorr(fit2)$s2_id[4])

## [1] 0.133

#the "sc" attribute is the residual variance