Week 4: Basic Hierarchical Models

Here, we will talk about a few more miscellaneous topics: GLMM’s and Cross level interactions The data I will use here is from the Behavioral Risk Factor Surveillance System (BRFSS) SMART county data First we load our data

library(lme4)

## Loading required package: Matrix
## Loading required package: Rcpp

library(arm)

## Loading required package: MASS
## 
## arm (Version 1.7-05, built: 2014-8-1)
## 
## Working directory is /Users/ozd504/Google Drive/dem7903_App_Hier/code/code14

library(car)

## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:arm':
## 
##     logit

library(MASS)
library(ordinal)

## 
## Attaching package: 'ordinal'
## 
## The following object is masked from 'package:arm':
## 
##     ranef
## 
## The following objects are masked from 'package:lme4':
## 
##     ranef, VarCorr

#load brfss
load("~/Google Drive/dem7903_App_Hier/data/brfss_11.Rdata")
nams<-names(brfss_11)
newnames<-gsub("_", "", nams)
names(brfss_11)<-tolower(newnames)

brfss_11$statefip<-sprintf("%02d", brfss_11$state )
brfss_11$cofip<-sprintf("%03d", brfss_11$cnty )
brfss_11$cofips<-paste(brfss_11$statefip, brfss_11$cofip, sep="")

#Outcome is: Now thinking about your physical health, which includes physical illness and injury, for how many days during the past 30 days was your physical health not good?
brfss_11$healthydays<-ifelse(brfss_11$physhlth %in%c(77,99), NA,ifelse(brfss_11$physhlth==88,0, brfss_11$physhlth))

#just for brevity, I just select TX respondents with non missing weights
brfss_11<-brfss_11[is.na(brfss_11$cntywt)==F,]
#insurance
brfss_11$ins<-ifelse(brfss_11$hlthpln1==1,1,0)
#smoking currently
brfss_11$smoke<-recode(brfss_11$smoker3, recodes="1:2=1; 3:4=0; else=NA")
#low physical activity
brfss_11$lowact<-recode(brfss_11$pacat, recodes="1:2=0; 3:4=1; else=NA")
#high blood pressure
brfss_11$hbp<-recode(brfss_11$bphigh4, recodes="1=1; 2:4=0; else=NA")
#high cholesterol
brfss_11$hc<-recode(brfss_11$toldhi2, recodes="1=1; 2=0; else=NA")
#bmi
brfss_11$bmi<-ifelse(is.na(brfss_11$bmi5)==T, NA, brfss_11$bmi5/100)
#poor or fair health
brfss_11$badhealth<-ifelse(brfss_11$genhlth %in% c(4,5),1,0)

#race
brfss_11$black<-recode(brfss_11$racegr2, recodes="2=1; 9=NA; else=0", as.factor.result=T)
brfss_11$white<-recode(brfss_11$racegr2, recodes="1=1; 9=NA; else=0", as.factor.result=T)
brfss_11$other<-recode(brfss_11$racegr2, recodes="3:4=1; 9=NA; else=0", as.factor.result=T)
brfss_11$hispanic<-recode(brfss_11$racegr2, recodes="5=1; 9=NA; else=0", as.factor.result=T)

#have a personal doctor?
brfss_11$doc<-recode(brfss_11$persdoc2, recodes="1:2=1; 3=0; else=NA", as.factor.result=F)

#needed care in last year but couldn't get it because of cost
brfss_11$medacc<-recode(brfss_11$medcost, recodes="1=1;2=0;else=NA")

#education level
brfss_11$lths<-recode(brfss_11$educa, recodes="1:3=1;9=NA; else=0", as.factor.result=F)
brfss_11$coll<-recode(brfss_11$educa, recodes="5:6=1;9=NA; else=0", as.factor.result=F)

#employment
brfss_11$employ<-recode(brfss_11$employ, recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA", as.factor.result=T)
brfss_11$employ<-relevel(brfss_11$employ, ref='Employed')

#marital status
brfss_11$marst<-recode(brfss_11$marital, recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA", as.factor.result=T)
brfss_11$marst<-relevel(brfss_11$marst, ref='married')

#income
brfss_11$inc<-as.factor(ifelse(brfss_11$incomg==9, NA, brfss_11$incomg))

#Age cut into intervals
brfss_11$agec<-cut(brfss_11$age, breaks=c(0,24,39,59,79,99))

Now we will begin fitting the multilevel regression model with the county that the person lives in being the higher level

I will also add in some Census variables from the ACS 2010 5 Year estimates load in ACS data from Factfinder

acsecon<-read.csv("~/Google Drive/dem7903_App_Hier/data/aff_download/ACS_10_5YR_DP03_with_ann.csv")
acsecon$povrate<-acsecon[, "HC03_VC156"]
acsecon$unemployed<-acsecon[, "HC03_VC13"]
acsecon$cofips<-substr(acsecon$GEO.id, 10,14)
acsecon<-acsecon[, c("cofips", "povrate", "unemployed")]

joindata<-acsecon
head(acsecon)

##   cofips povrate unemployed
## 1  01001     7.5        6.2
## 2  01003     9.1        6.6
## 3  01005    19.9        9.6
## 4  01007     9.4        9.1
## 5  01009    10.0        7.5
## 6  01011    22.6       11.2

I also have segregation measures at the count level that I made, I will merge these in too As a rule, when including continuous predictors at the higher level, you should always standardize them This is done for 2 reasons, first is that it makes the computer happy, second is that it makes the interpretation of the higher level variable easier.

load("~/Google Drive/dem7903_App_Hier/data/segregation_county.Rdata")
joindata<-merge(joindata, county_dat, by.x="cofips", by.y="cofips", all.x=T)

joindata$povz<-scale(joindata$povrate, center=T, scale=T)
joindata$unempz<-scale(joindata$unemployed, center=T, scale=T)
joindata$diss.z<-scale(joindata$dissim_wb, center=T, scale=T)
joindata$iso.z<-scale(joindata$isolation_b, center=T, scale=T)
joindata$theil.z<-scale(joindata$TheileH, center=T, scale=T)

head(joindata)

##   cofips povrate unemployed dissim_wb isolation_b interaction_bw TheileH
## 1  01001     7.5        6.2    0.2898     0.27475         0.6897 0.08300
## 2  01003     9.1        6.6    0.3903     0.18464         0.7674 0.09204
## 3  01005    19.9        9.6    0.1914     0.50243         0.4518 0.04899
## 4  01007     9.4        9.1    0.4339     0.33170         0.6477 0.11960
## 5  01009    10.0        7.5    0.3777     0.02201         0.9041 0.05284
## 6  01011    22.6       11.2    0.2412     0.71810         0.2139 0.05627
##      povz    unempz  diss.z   iso.z theil.z
## 1 -0.6960 -0.396271 -0.1041  0.7875  0.1091
## 2 -0.4070 -0.277121  0.4778  0.2928  0.2125
## 3  1.5439  0.616508 -0.6734  2.0376 -0.2797
## 4 -0.3528  0.467570  0.7300  1.1002  0.5276
## 5 -0.2444 -0.009032  0.4046 -0.6001 -0.2357
## 6  2.0316  1.093110 -0.3849  3.2217 -0.1964

#and merge the data back to the kids data
merged<-merge(x=brfss_11, y=joindata, by.x="cofips", by.y="cofips", all.x=T)

## Warning: column names 'cholchk', 'mrace' are duplicated in the result

merged$bmiz<-scale(merged$bmi, center=T, scale=T)
merged$agez<-scale(merged$age, center=T, scale=T)

Here I fit a binomial model (binary logistic regression) for healthcare access (1= has a regular primary care physician, 0=does not) First a simple random intercept model, then a model with a higher level variable, then a cross level interaction model

#################
#### GLMM's #####
#################
#I subset the data to only have states with more than 8 counties sampled
tapply(merged$cofips, merged$state, function(x) length(unique(x))>5)

##     1     2     4     5     6     8     9    10    11    12    13    15 
## FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE 
##    16    17    18    19    20    21    22    23    24    25    26    27 
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE FALSE  TRUE 
##    29    30    31    32    33    34    35    36    37    38    39    40 
## FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE FALSE FALSE  TRUE FALSE 
##    41    42    44    45    46    48    49    50    53    54    55    56 
## FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE FALSE FALSE

merged<-merged[merged$statefip%in%c("06","08","23","30", "31", "25","39","45" ,"48","49"),]

fit.mix.bin<-glmer(doc~agez+lths+coll+black+hispanic+other+(1|cofips), family=binomial, data=merged)
summary(fit.mix.bin)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## doc ~ agez + lths + coll + black + hispanic + other + (1 | cofips)
##    Data: merged
## 
##      AIC      BIC   logLik deviance df.resid 
##    69835    69911   -34910    69819    99208 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -11.748   0.196   0.284   0.405   1.923 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  cofips (Intercept) 0.142    0.377   
## Number of obs: 99216, groups:  cofips, 91
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   1.9297     0.0445    43.4  < 2e-16 ***
## agez          0.7563     0.0106    71.7  < 2e-16 ***
## lths         -0.4527     0.0356   -12.7  < 2e-16 ***
## coll          0.4002     0.0226    17.7  < 2e-16 ***
## black1       -0.1825     0.0416    -4.4  1.2e-05 ***
## hispanic1    -0.5891     0.0320   -18.4  < 2e-16 ***
## other1       -0.3561     0.0401    -8.9  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##           (Intr) agez   lths   coll   black1 hspnc1
## agez       0.074                                   
## lths      -0.184 -0.070                            
## coll      -0.340  0.054  0.385                     
## black1    -0.097  0.048 -0.039  0.059              
## hispanic1 -0.107  0.147 -0.248  0.121  0.164       
## other1    -0.065  0.090 -0.042 -0.011  0.101  0.164

#Then I do a multi-level model where I include the poverty rate and the segregation index at the county level
fit.mix.bin.ml<-glmer(doc~agez+lths+coll+black+hispanic+other+povz+(1|cofips), family=binomial, data=merged)
summary(fit.mix.bin.ml)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: doc ~ agez + lths + coll + black + hispanic + other + povz +  
##     (1 | cofips)
##    Data: merged
## 
##      AIC      BIC   logLik deviance df.resid 
##    69834    69920   -34908    69816    99207 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -11.759   0.196   0.284   0.405   1.922 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  cofips (Intercept) 0.137    0.371   
## Number of obs: 99216, groups:  cofips, 91
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   1.8932     0.0491    38.6  < 2e-16 ***
## agez          0.7565     0.0106    71.7  < 2e-16 ***
## lths         -0.4522     0.0356   -12.7  < 2e-16 ***
## coll          0.3996     0.0226    17.7  < 2e-16 ***
## black1       -0.1794     0.0417    -4.3  1.7e-05 ***
## hispanic1    -0.5886     0.0320   -18.4  < 2e-16 ***
## other1       -0.3547     0.0401    -8.9  < 2e-16 ***
## povz         -0.1114     0.0669    -1.7    0.096 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##           (Intr) agez   lths   coll   black1 hspnc1 other1
## agez       0.060                                          
## lths      -0.170 -0.070                                   
## coll      -0.302  0.054  0.385                            
## black1    -0.108  0.049 -0.038  0.058                     
## hispanic1 -0.102  0.147 -0.248  0.121  0.164              
## other1    -0.068  0.090 -0.042 -0.011  0.101  0.164       
## povz       0.446 -0.016 -0.009  0.015 -0.044 -0.012 -0.020

anova(fit.mix.bin, fit.mix.bin.ml)

## Data: merged
## Models:
## fit.mix.bin: doc ~ agez + lths + coll + black + hispanic + other + (1 | cofips)
## fit.mix.bin.ml: doc ~ agez + lths + coll + black + hispanic + other + povz + 
## fit.mix.bin.ml:     (1 | cofips)
##                Df   AIC   BIC logLik deviance Chisq Chi Df Pr(>Chisq)  
## fit.mix.bin     8 69835 69911 -34910    69819                          
## fit.mix.bin.ml  9 69834 69920 -34908    69816  2.72      1      0.099 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#Little evidence of poverty effect

#Cross level interaction model between poverty and race and segregation and race
#You simply include a term of individualvar*highervar
fit.mix.bin.cl<-glmer(doc~agez+lths+coll+(black+hispanic+other)*povz+(1|cofips), family=binomial, data=merged)
summary(fit.mix.bin.cl)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: doc ~ agez + lths + coll + (black + hispanic + other) * povz +  
##     (1 | cofips)
##    Data: merged
## 
##      AIC      BIC   logLik deviance df.resid 
##    69835    69949   -34905    69811    99204 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -11.762   0.196   0.284   0.404   1.908 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  cofips (Intercept) 0.138    0.372   
## Number of obs: 99216, groups:  cofips, 91
## 
## Fixed effects:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      1.8953     0.0494    38.3  < 2e-16 ***
## agez             0.7565     0.0106    71.7  < 2e-16 ***
## lths            -0.4533     0.0356   -12.7  < 2e-16 ***
## coll             0.4001     0.0226    17.7  < 2e-16 ***
## black1          -0.1780     0.0424    -4.2  2.7e-05 ***
## hispanic1       -0.5676     0.0357   -15.9  < 2e-16 ***
## other1          -0.3833     0.0440    -8.7  < 2e-16 ***
## povz            -0.1063     0.0683    -1.6     0.12    
## black1:povz     -0.0114     0.0721    -0.2     0.87    
## hispanic1:povz   0.0792     0.0550     1.4     0.15    
## other1:povz     -0.1010     0.0615    -1.6     0.10    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) agez   lths   coll   black1 hspnc1 other1 povz   blck1:
## agez         0.059                                                        
## lths        -0.166 -0.070                                                 
## coll        -0.300  0.054  0.384                                          
## black1      -0.123  0.049 -0.041  0.059                                   
## hispanic1   -0.120  0.137 -0.238  0.112  0.192                            
## other1      -0.093  0.084 -0.045 -0.012  0.116  0.180                     
## povz         0.456 -0.017 -0.004  0.013 -0.074 -0.064 -0.076              
## black1:povz -0.076 -0.003 -0.001  0.009  0.167  0.083  0.047 -0.125       
## hispnc1:pvz -0.060  0.010 -0.036  0.009  0.098  0.445  0.082 -0.111  0.136
## other1:povz -0.071  0.005 -0.015 -0.005  0.054  0.086  0.407 -0.130  0.100
##             hspn1:
## agez              
## lths              
## coll              
## black1            
## hispanic1         
## other1            
## povz              
## black1:povz       
## hispnc1:pvz       
## other1:povz  0.144

anova(fit.mix.bin.ml, fit.mix.bin.cl)

## Data: merged
## Models:
## fit.mix.bin.ml: doc ~ agez + lths + coll + black + hispanic + other + povz + 
## fit.mix.bin.ml:     (1 | cofips)
## fit.mix.bin.cl: doc ~ agez + lths + coll + (black + hispanic + other) * povz + 
## fit.mix.bin.cl:     (1 | cofips)
##                Df   AIC   BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## fit.mix.bin.ml  9 69834 69920 -34908    69816                        
## fit.mix.bin.cl 12 69835 69949 -34905    69811   5.6      3       0.13

#Cross level interaction is not significant

Here is an example of using a Poisson outcome, number of healthy days reported in the last month

fit.pois<-glmer(healthydays~(black+hispanic+other)+lths+coll+agez+badhealth+(1|cofips),family=poisson, data=merged)
summary(fit.pois)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: poisson  ( log )
## Formula: healthydays ~ (black + hispanic + other) + lths + coll + agez +  
##     badhealth + (1 | cofips)
##    Data: merged
## 
##      AIC      BIC   logLik deviance df.resid 
##   940865   940950  -470423   940847    97370 
## 
## Scaled residuals: 
##    Min     1Q Median     3Q    Max 
## -4.413 -1.478 -1.374 -0.041 25.571 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  cofips (Intercept) 0.00962  0.0981  
## Number of obs: 97379, groups:  cofips, 91
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.76160    0.01095      70  < 2e-16 ***
## black1      -0.09628    0.00686     -14  < 2e-16 ***
## hispanic1   -0.15690    0.00621     -25  < 2e-16 ***
## other1       0.02596    0.00731       4  0.00038 ***
## lths         0.01640    0.00547       3  0.00273 ** 
## coll        -0.05645    0.00365     -15  < 2e-16 ***
## agez         0.08302    0.00174      48  < 2e-16 ***
## badhealth    1.90296    0.00342     557  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##           (Intr) black1 hspnc1 other1 lths   coll   agez  
## black1    -0.047                                          
## hispanic1 -0.050  0.144                                   
## other1    -0.035  0.089  0.130                            
## lths      -0.098 -0.049 -0.253 -0.051                     
## coll      -0.240  0.037  0.089 -0.020  0.365              
## agez      -0.031  0.084  0.200  0.116 -0.033  0.057       
## badhealth -0.186 -0.069 -0.082 -0.050 -0.076  0.145 -0.186

Perhaps a negative binomial model would be more appropriate, as our outcome is very overdispersed:

fit.nb<-glmer.nb(healthydays~(black+hispanic+other)+lths+coll+agez+badhealth+(1|cofips), data=merged)
summary(fit.nb)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: Negative Binomial(0.1739)  ( log )
## Formula: healthydays ~ (black + hispanic + other) + lths + coll + agez +  
##     badhealth + (1 | cofips)
##    Data: ..2
## 
##      AIC      BIC   logLik deviance df.resid 
##   355794   355889  -177887   355774    97369 
## 
## Scaled residuals: 
##    Min     1Q Median     3Q    Max 
## -0.416 -0.402 -0.399 -0.003  8.639 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  cofips   (Intercept) 0.00738  0.0859  
##  Residual             0.99508  0.9975  
## Number of obs: 97379, groups:  cofips, 91
## 
## Fixed effects:
##             Estimate Std. Error t value Pr(>|z|)    
## (Intercept)  0.79309    0.01938    40.9   <2e-16 ***
## black1      -0.01526    0.03512    -0.4   0.6639    
## hispanic1   -0.00451    0.03107    -0.1   0.8845    
## other1       0.07784    0.03727     2.1   0.0367 *  
## lths         0.09651    0.03340     2.9   0.0039 ** 
## coll        -0.15035    0.01853    -8.1    5e-16 ***
## agez         0.11697    0.00832    14.1   <2e-16 ***
## badhealth    1.87339    0.02136    87.7   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##           (Intr) black1 hspnc1 other1 lths   coll   agez  
## black1    -0.139                                          
## hispanic1 -0.162  0.115                                   
## other1    -0.112  0.076  0.107                            
## lths      -0.317 -0.047 -0.190 -0.035                     
## coll      -0.700  0.028  0.075 -0.002  0.367              
## agez      -0.047  0.055  0.186  0.102 -0.033  0.039       
## badhealth -0.226 -0.039 -0.035 -0.021 -0.066  0.077 -0.121

GLMMs can also be used on ordinal outcomes, for example an ordinal logit:

#Ordinal Logit GLMM
merged$hlth<-recode(merged$genhlth, recodes="1=1; 2=2; 3=3; 4=4;5=5; else=NA", as.factor.result=T)
table(merged$hlth)

## 
##     1     2     3     4     5 
## 19558 33596 30123 12535  4907

fit.mix.clmm<-clmm(hlth~agez+lths+coll+black+hispanic+other+theil.z+povz+(1|cofips), data=merged)
summary(fit.mix.clmm)

## Cumulative Link Mixed Model fitted with the Laplace approximation
## 
## formula: hlth ~ agez + lths + coll + black + hispanic + other + theil.z +  
##     povz + (1 | cofips)
## data:    merged
## 
##  link  threshold nobs  logLik     AIC       niter      max.grad cond.H 
##  logit flexible  99181 -138098.49 276222.97 2528(5059) 1.10e-01 3.2e+02
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  cofips (Intercept) 0.02     0.142   
## Number of groups:  cofips 91 
## 
## Coefficients:
##           Estimate Std. Error z value Pr(>|z|)    
## agez       0.33199    0.00607   54.66  < 2e-16 ***
## lths       0.71778    0.02449   29.31  < 2e-16 ***
## coll      -0.64604    0.01363  -47.38  < 2e-16 ***
## black1     0.61881    0.02600   23.80  < 2e-16 ***
## hispanic1  0.62383    0.02329   26.79  < 2e-16 ***
## other1     0.42949    0.02726   15.76  < 2e-16 ***
## theil.z    0.01256    0.01717    0.73     0.46    
## povz       0.12600    0.02807    4.49  7.2e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Threshold coefficients:
##     Estimate Std. Error z value
## 1|2  -1.8492     0.0242   -76.4
## 2|3  -0.1993     0.0234    -8.5
## 3|4   1.3953     0.0240    58.1
## 4|5   2.8888     0.0271   106.7
## (1905 observations deleted due to missingness)

how do I get ICC from one of these? For the binomial model, the residual variance is a constant of \(\pi^{2}/3\) For E.g. from the first model:

.142/(.142+ (pi^2)/3)

## [1] 0.04138

Week 4: Basic Hierarchical Models - Other Topics