DEM 7283 - Multi-level Models 3

Recode variables
Higher level predictors
Cross level interaction effects
Cross-level interaction model
Generalized Linear Mixed Models
- Multilevel Logistic Model
- Predicted values from the model

In this example, I introduce how to fit the multi-level model using the lme4 package. This example continues from the first example and considers the linear case of the model the use of a higher-level predictor and the formation of a cross level interaction model. The example also considers an individual level binary outcome to illustrate the logistic Generalized Linear Mixed Model (GLMM).

This example shows how to: * Examine variation between groups using fixed effects * Fit the basic random intercept model : +$y_{ij} = \mu + u_{j} + e_{ij}$ with $u_j \sim N(0, \sigma^2)$ Where the intercepts ($u_j$) for each group vary randomly around the overall mean ($\mu$)

*I also illustrate how to include group-level covariates and how to the model for cross level interactions.

The example merges data from the 2016 CDC Behavioral Risk Factor Surveillance System (BRFSS) SMART MSA data. Link and the 2010 American Community Survey 5-year estimates at the MSA level. More details on these data are found below.

#load brfss
library(car)
library(stargazer)
library(survey)
library(sjPlot)
library(ggplot2)
library(pander)
library(knitr)
library(lme4)
load("~/Google Drive/classes/dem7283/class18/data/brfss16_mmsa.Rdata")
set.seed(12345)
#samps<-sample(1:nrow(brfss16m), size = 40000, replace=F)
#brfss16m<-brfss16m[samps,]
#The names in the data are very ugly, so I make them less ugly
nams<-names(brfss16m)
#we see some names are lower case, some are upper and some have a little _ in the first position. This is a nightmare.
newnames<-gsub(pattern = "_",replacement =  "",x =  nams)
names(brfss16m)<-tolower(newnames)

Recode variables

#sex
brfss16m$male<-ifelse(brfss16m$sex==1, 1, 0)

#BMI
brfss16m$bmi<-ifelse(is.na(brfss16m$bmi5)==T, NA, brfss16m$bmi5/100)

#Healthy days
brfss16m$healthdays<-Recode(brfss16m$physhlth, recodes = "88=0; 77=NA; 99=NA")

#Healthy mental health days
brfss16m$healthmdays<-Recode(brfss16m$menthlth, recodes = "88=0; 77=NA; 99=NA")

brfss16m$badhealth<-Recode(brfss16m$genhlth, recodes="4:5=1; 1:3=0; else=NA")
#race/ethnicity
brfss16m$black<-Recode(brfss16m$racegr3, recodes="2=1; 9=NA; else=0")
brfss16m$white<-Recode(brfss16m$racegr3, recodes="1=1; 9=NA; else=0")
brfss16m$other<-Recode(brfss16m$racegr3, recodes="3:4=1; 9=NA; else=0")
brfss16m$hispanic<-Recode(brfss16m$racegr3, recodes="5=1; 9=NA; else=0")

brfss16m$race_eth<-Recode(brfss16m$racegr3, 
recodes="1='nhwhite'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA",
as.factor.result = T)
brfss16m$race_eth<-relevel(brfss16m$race_eth, ref = "nhwhite")

#insurance
brfss16m$ins<-Recode(brfss16m$hlthpln1, recodes ="7:9=NA; 1=1;2=0")

#income grouping
brfss16m$inc<-Recode(brfss16m$incomg, recodes = "9= NA;1='1_lt15k'; 2='2_15-25k';3='3_25-35k';4='4_35-50k';5='5_50kplus'", as.factor.result = T)
brfss16m$inc<-as.ordered(brfss16m$inc)
#education level
brfss16m$educ<-Recode(brfss16m$educa,
recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA",
as.factor.result=T)
brfss16m$educ<-relevel(brfss16m$educ, ref='2hsgrad')

#employloyment
brfss16m$employ<-Recode(brfss16m$employ1,
recodes="1:2='employloyed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA",
as.factor.result=T)
brfss16m$employ<-relevel(brfss16m$employ, ref='employloyed')

#marital status
brfss16m$marst<-Recode(brfss16m$marital,
recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA",
as.factor.result=T)
brfss16m$marst<-relevel(brfss16m$marst, ref='married')

#Age cut into intervals
brfss16m$agec<-cut(brfss16m$age80, breaks=c(0,24,39,59,79,99))

#BMI, in the brfss16ma the bmi variable has 2 implied decimal places,
#so we must divide by 100 to get real bmi's

brfss16m$bmi<-brfss16m$bmi5/100

#smoking currently
brfss16m$smoke<-Recode(brfss16m$smoker3, 
recodes="1:2=1; 3:4=0; else=NA")
#brfss16m$smoke<-relevel(brfss16m$smoke, ref = "NeverSmoked")

brfss16m$obese<-ifelse(is.na(brfss16m$bmi)==T, NA, 
                       ifelse(brfss16m$bmi>30,1,0))

Higher level predictors

We will often be interested in factors at both the individual AND contextual levels. To illustrate this, I will use data from the American Community Survey measured at the MSA level. Specifically, I use the DP3 table, which provides economic characteristics of places, from the 2010 5 year ACS Link.

library(acs)
#Get 2010 ACS median household incomes for tracts in Texas
msaacs<-geo.make(msa="*")

#ACS tables B17 is poverty, B19 is Gini, B25001 is housing vacancy, B25035 is median year built 
acsecon<-acs.fetch(key=mykey, endyear=2010, span=5, geography=msaacs, variable = c("B19083_001","B17001_001","B17001_002" , "B25002_001","B25002_003", "B25035_001"))

colnames(acsecon@estimate)

## [1] "B19083_001" "B17001_001" "B17001_002" "B25002_001" "B25002_003"
## [6] "B25035_001"

msaecon<-data.frame(gini=acsecon@estimate[, "B19083_001"], 
ppoverty=acsecon@estimate[, "B17001_002"]/acsecon@estimate[, "B17001_001"],
giniz=scale(acsecon@estimate[, "B19083_001"]),
pvacant=acsecon@estimate[,"B25002_003"]/acsecon@estimate[, "B25002_001"],
ppovertyz=scale(acsecon@estimate[, "B17001_002"]/acsecon@estimate[, "B17001_001"]), 
pvacantz=scale(acsecon@estimate[,"B25002_003"]/acsecon@estimate[, "B25002_001"]), 
medhouse=acsecon@estimate[, "B25035_001" ],
medhousez=scale(acsecon@estimate[, "B25035_001" ]))

msaecon$ids<-paste(acsecon@geography$metropolitanstatisticalareamicropolitanstatisticalarea)
head(msaecon)

##                                                  gini  ppoverty    giniz
## Adjuntas, PR Micro Area                         0.525 0.5925656 2.718785
## Aguadilla-Isabela-San SebastiÃ¡n, PR Metro Area 0.529 0.5577311 2.847562
## Coamo, PR Micro Area                            0.532 0.5633984 2.944145
## Fajardo, PR Metro Area                          0.483 0.4309070 1.366623
## Guayama, PR Metro Area                          0.518 0.4980518 2.493425
## Jayuya, PR Micro Area                           0.502 0.5446137 1.978315
##                                                   pvacant ppovertyz
## Adjuntas, PR Micro Area                         0.1996029  6.271934
## Aguadilla-Isabela-San SebastiÃ¡n, PR Metro Area 0.2005956  5.762039
## Coamo, PR Micro Area                            0.1500353  5.844994
## Fajardo, PR Metro Area                          0.3143172  3.905632
## Guayama, PR Metro Area                          0.1951184  4.888474
## Jayuya, PR Micro Area                           0.1562109  5.570031
##                                                  pvacantz medhouse
## Adjuntas, PR Micro Area                         0.9053869     1978
## Aguadilla-Isabela-San SebastiÃ¡n, PR Metro Area 0.9199516     1979
## Coamo, PR Micro Area                            0.1781186     1979
## Fajardo, PR Metro Area                          2.5885036     1977
## Guayama, PR Metro Area                          0.8395882     1978
## Jayuya, PR Micro Area                           0.2687281     1978
##                                                 medhousez   ids
## Adjuntas, PR Micro Area                         0.4923898 10260
## Aguadilla-Isabela-San SebastiÃ¡n, PR Metro Area 0.5982027 10380
## Coamo, PR Micro Area                            0.5982027 17620
## Fajardo, PR Metro Area                          0.3865770 21940
## Guayama, PR Metro Area                          0.4923898 25020
## Jayuya, PR Micro Area                           0.4923898 27580

summary(msaecon)

##       gini           ppoverty           giniz             pvacant       
##  Min.   :0.3510   Min.   :0.02425   Min.   :-2.88303   Min.   :0.04246  
##  1st Qu.:0.4190   1st Qu.:0.12308   1st Qu.:-0.69381   1st Qu.:0.09454  
##  Median :0.4390   Median :0.15172   Median :-0.04993   Median :0.12076  
##  Mean   :0.4406   Mean   :0.16409   Mean   : 0.00000   Mean   :0.13790  
##  3rd Qu.:0.4590   3rd Qu.:0.18746   3rd Qu.: 0.59396   3rd Qu.:0.15964  
##  Max.   :0.5730   Max.   :0.59257   Max.   : 4.26411   Max.   :0.63877  
##    ppovertyz          pvacantz          medhouse      medhousez      
##  Min.   :-2.0468   Min.   :-1.4002   Min.   :1940   Min.   :-3.5285  
##  1st Qu.:-0.6002   1st Qu.:-0.6361   1st Qu.:1968   1st Qu.:-0.5657  
##  Median :-0.1810   Median :-0.2514   Median :1975   Median : 0.1750  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   :1973   Mean   : 0.0000  
##  3rd Qu.: 0.3421   3rd Qu.: 0.3190   3rd Qu.:1980   3rd Qu.: 0.7040  
##  Max.   : 6.2719   Max.   : 7.3489   Max.   :1998   Max.   : 2.6086  
##      ids           
##  Length:955        
##  Class :character  
##  Mode  :character  
##                    
##                    
##

Let’s see the geographic variation in these economic indicators:

library(tigris)

## To enable 
## caching of data, set `options(tigris_use_cache = TRUE)` in your R script or .Rprofile.

## 
## Attaching package: 'tigris'

## The following object is masked from 'package:graphics':
## 
##     plot

msa<-core_based_statistical_areas(cb=T)
msa_ec<-geo_join(msa, msaecon, "CBSAFP", "ids", how="inner")

library(RColorBrewer)
library(sp)
spplot(msa_ec, "gini", at=quantile(msa_ec$gini), col.regions=brewer.pal(n=6, "Greys"), col="transparent")

spplot(msa_ec, "pvacant", at=quantile(msa_ec$pvacant), col.regions=brewer.pal(n=6, "Greys"), col="transparent")

spplot(msa_ec, "medhouse", at=quantile(msa_ec$medhouse), col.regions=brewer.pal(n=6, "Greys"), col="transparent")

Merge the MSA data to the BRFSS data

 msa_ec$state<-str_sub(msa_ec$NAME, -2,-1)
msaecon<-merge(msaecon, msa_ec@data[, c(1:7,18)], by.x="ids", by.y="CBSAFP", all.x=T)

joindata<-merge(brfss16m, msaecon, by.x="mmsa",by.y="ids", all.x=T)
joindata$bmiz<-as.numeric(scale(joindata$bmi, center=T, scale=T))
#joindata<-joindata[complete.cases(joindata[, c("bmiz", "race_eth", "agec", "educ", "gini")]),]
#and merge the data back to the kids data
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:acs':
## 
##     combine

## The following object is masked from 'package:car':
## 
##     recode

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

joindata<-joindata%>%
  select(bmiz, obese, mmsa, agec, educ, race_eth,smoke, healthmdays, badhealth,bmi,gini, ppoverty, ppovertyz,giniz,medhouse,medhousez, pvacant, pvacantz, male, mmsawt, mmsaname, state )%>%
  filter(complete.cases(.))


head(joindata[, c("bmiz","obese", "male", "agec", "educ", "gini","medhouse", "pvacant", "mmsa", "state")])

##         bmiz obese male    agec     educ  gini medhouse   pvacant  mmsa
## 1 -0.5184799     0    0 (79,99] 3somecol 0.427     1960 0.1115985 10580
## 2  0.3421579     0    0 (59,79]  2hsgrad 0.427     1960 0.1115985 10580
## 3 -0.2272375     0    1 (79,99] 4colgrad 0.427     1960 0.1115985 10580
## 4 -0.3646778     0    1 (59,79]  2hsgrad 0.427     1960 0.1115985 10580
## 5  0.3683370     1    0 (79,99] 3somecol 0.427     1960 0.1115985 10580
## 6 -1.1026011     0    0 (39,59]  1somehs 0.427     1960 0.1115985 10580
##   state
## 1    NY
## 2    NY
## 3    NY
## 4    NY
## 5    NY
## 6    NY

meanbmi<-mean(joindata$bmi, na.rm=T)
sdbmi<-sd(joindata$bmi, na.rm=T)

Cross level interaction effects

Again, we are assuming a set of propositions where we have an individual level outcome, and predictor variables at both the micro and macro levels. This can be shown in several different ways.

from

The first case is a macro to micro proposition, which may be exemplified by a statement such as: “Individuals in areas with high rates of poverty are more likely to be obese”.

Whereas the second frame illustrates a more specific special case, where there is a macro level effect, net of the individual level predictor, and may be stated “Individuals who are racial/ethnic minorities are more likely to be obese, compared to Non-Hispanic whites, and living in areas of high poverty also increase the risk of obesity”.

The last panel illustrates what is known as a cross level interaction, or a macro-micro interaction. This is where the relationship between x and y is dependent on Z. This leads to the statement “Individuals who are racial/ethnic minorities, and who live in areas with high poverty rates are at higher risk of obesity”.

First, I fit the multi level model, then the cross level interaction model:

#MSA level variables
fit.mix2<-lmer(bmiz~agec+educ+race_eth+medhousez+(1|mmsa), data=joindata)
arm::display(fit.mix2, detail=T)

## lmer(formula = bmiz ~ agec + educ + race_eth + medhousez + (1 | 
##     mmsa), data = joindata)
##                      coef.est coef.se t value
## (Intercept)           -0.41     0.01  -32.45 
## agec(24,39]            0.47     0.01   40.38 
## agec(39,59]            0.63     0.01   58.30 
## agec(59,79]            0.55     0.01   51.26 
## agec(79,99]            0.16     0.01   12.49 
## educ0Prim              0.09     0.02    5.27 
## educ1somehs            0.03     0.01    2.56 
## educ3somecol          -0.01     0.01   -2.29 
## educ4colgrad          -0.20     0.01  -32.77 
## race_ethhispanic       0.11     0.01   10.68 
## race_ethnh black       0.30     0.01   33.46 
## race_ethnh multirace   0.14     0.02    7.32 
## race_ethnh other      -0.16     0.01  -11.45 
## medhousez             -0.03     0.01   -4.91 
## 
## Error terms:
##  Groups   Name        Std.Dev.
##  mmsa     (Intercept) 0.07    
##  Residual             0.98    
## ---
## number of obs: 171535, groups: mmsa, 123
## AIC = 479937, DIC = 479692.3
## deviance = 479798.4

So, we see that in MSAs with newer housing, the BMI is lower on average, net of individual level factors.

Cross-level interaction model

This involves forming the interaction term between an individual level variable and a higher level variable. In this case, we will interact race/ethnicity with houseing age. We can state this kind of hypothesis as “Ethinic minority groups, living in areas with newer housing will have all have lower BMI’s”. It’s important to note the statement, where you situate the individual level characteristic (ethnicity) within the higher level context (housing in MSAs). This will allow us to test whether all groups benefit equally from newer housing.

fit.mix4<-lmer(bmiz~agec+educ+race_eth+medhousez*race_eth+(1|mmsa), joindata, REML=F)
arm::display(fit.mix4, detail=T)

## lmer(formula = bmiz ~ agec + educ + race_eth + medhousez * race_eth + 
##     (1 | mmsa), data = joindata, REML = F)
##                                coef.est coef.se t value
## (Intercept)                     -0.40     0.01  -32.56 
## agec(24,39]                      0.47     0.01   40.36 
## agec(39,59]                      0.63     0.01   58.28 
## agec(59,79]                      0.55     0.01   51.26 
## agec(79,99]                      0.16     0.01   12.49 
## educ0Prim                        0.09     0.02    5.27 
## educ1somehs                      0.03     0.01    2.57 
## educ3somecol                    -0.01     0.01   -2.29 
## educ4colgrad                    -0.20     0.01  -32.79 
## race_ethhispanic                 0.12     0.01   10.54 
## race_ethnh black                 0.29     0.01   30.51 
## race_ethnh multirace             0.14     0.02    7.35 
## race_ethnh other                -0.16     0.01  -11.43 
## medhousez                       -0.04     0.01   -5.17 
## race_ethhispanic:medhousez      -0.02     0.01   -1.48 
## race_ethnh black:medhousez       0.03     0.01    3.50 
## race_ethnh multirace:medhousez  -0.02     0.02   -0.92 
## race_ethnh other:medhousez       0.02     0.01    1.21 
## 
## Error terms:
##  Groups   Name        Std.Dev.
##  mmsa     (Intercept) 0.07    
##  Residual             0.98    
## ---
## number of obs: 171535, groups: mmsa, 123
## AIC = 479821, DIC = 479780.8
## deviance = 479780.8

This basically says that blacks in MSAs with newer than average housing stock (medhousez==1 vs medhousez==0) have higher BMI’s, while the BMI of the other groups does is not affected by housing.

#compare the models with a LRT
anova(fit.mix2, fit.mix4)

## refitting model(s) with ML (instead of REML)

## Data: joindata
## Models:
## fit.mix2: bmiz ~ agec + educ + race_eth + medhousez + (1 | mmsa)
## fit.mix4: bmiz ~ agec + educ + race_eth + medhousez * race_eth + (1 | mmsa)
##          Df    AIC    BIC  logLik deviance  Chisq Chi Df Pr(>Chisq)   
## fit.mix2 16 479830 479991 -239899   479798                            
## fit.mix4 20 479821 480022 -239890   479781 17.585      4   0.001487 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Generalized Linear Mixed Models

Generli

I will illustrate how to fit Generalized Linear Mixed models to outcomes that are not continuous. I will illustrate the Laplace approximation using the glmer() function in the lme4 library.

NOTE GLMM’s can take much longer to estimate than LMM’s, don’t be surprised if your model takes a long time to run. Also don’t be surprised if you see warning messages like:

Warning message: In checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model failed to converge with max|grad| = somenumber (tol = 0.001)

This is a warning message that the computer did not reach a satisfactory numerical solution for your model. This just means we may have to tweak your model a little to get R to converge. This Rpubs document provides an excellent section on the technical matters involved in fitting the models in R.

You may have to refit the model, meaning restart the optimization process where it stopped. This can be done using the refit() function in lme4.

Multilevel Logistic Model

The First model will fit a model that considers a binary response, if a person’s BMI was greater than 30, the cut off for obese weight status.

A research question corresponding to this could be:

“How do sociodemographic factors affect the odds or being obese in US cities?”

If we were to write the symbolic form of this model it would be the logistic regression model:

\[ln \left ( \frac {p_{ij}}{1-p{ij}} \right ) = \beta_{0j} + \sum {\beta_k x_{ik}} \]

\[\beta_{0j} = \beta_0 + u_j\]

\[u_j \sim N(0, \sigma^2_u)\] Note, the logistic regression function does not have a separate independent residual variance at the individual level. In fact, owing to the logistic distribution function, the variance of a logistic distribution with scale parameter equal to 1 is $\frac{\pi^2}{3}$. Link to article about this.

the glmer() function will estimate this model:

joindata$mmsa_fac<-as.factor(joindata$mmsa)
fit.mix.bin<-glmer(obese~male+agec+educ+race_eth+(1|mmsa_fac), family=binomial, joindata,
                   control=glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=2e5)))

## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control
## $checkConv, : Model failed to converge with max|grad| = 0.00102225 (tol =
## 0.001, component 1)

fit.mix.bin<-refit(fit.mix.bin)

## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control
## $checkConv, : Model failed to converge with max|grad| = 0.00138182 (tol =
## 0.001, component 1)

arm::display(fit.mix.bin, detail=T)

## glmer(formula = obese ~ male + agec + educ + race_eth + (1 | 
##     mmsa_fac), data = joindata, family = binomial, control = glmerControl(optimizer = "bobyqa", 
##     optCtrl = list(maxfun = 2e+05)))
##                      coef.est coef.se z value Pr(>|z|)
## (Intercept)           -1.64     0.03  -50.39    0.00  
## male                   0.06     0.01    5.43    0.00  
## agec(24,39]            0.82     0.03   26.74    0.00  
## agec(39,59]            1.11     0.03   38.55    0.00  
## agec(59,79]            0.99     0.03   34.33    0.00  
## agec(79,99]            0.16     0.04    4.49    0.00  
## educ0Prim              0.12     0.04    3.02    0.00  
## educ1somehs            0.11     0.03    4.14    0.00  
## educ3somecol          -0.05     0.01   -3.38    0.00  
## educ4colgrad          -0.43     0.01  -31.09    0.00  
## race_ethhispanic       0.18     0.02    7.87    0.00  
## race_ethnh black       0.52     0.02   27.28    0.00  
## race_ethnh multirace   0.26     0.04    6.15    0.00  
## race_ethnh other      -0.30     0.03   -8.67    0.00  
## 
## Error terms:
##  Groups   Name        Std.Dev.
##  mmsa_fac (Intercept) 0.15    
##  Residual             1.00    
## ---
## number of obs: 171535, groups: mmsa_fac, 123
## AIC = 203084, DIC = 202410.7
## deviance = 202732.4

#odds ratios
exp(fixef(fit.mix.bin)[-1])

##                 male          agec(24,39]          agec(39,59] 
##            1.0608158            2.2602768            3.0462191 
##          agec(59,79]          agec(79,99]            educ0Prim 
##            2.6896637            1.1737133            1.1237945 
##          educ1somehs         educ3somecol         educ4colgrad 
##            1.1154861            0.9527690            0.6503676 
##     race_ethhispanic     race_ethnh black race_ethnh multirace 
##            1.2008227            1.6810032            1.2950267 
##     race_ethnh other 
##            0.7406912

exp(confint(fit.mix.bin, method="Wald"))

##                          2.5 %    97.5 %
## .sig01                      NA        NA
## (Intercept)          0.1822862 0.2070651
## male                 1.0384686 1.0836439
## agec(24,39]          2.1291295 2.3995023
## agec(39,59]          2.8784792 3.2237337
## agec(59,79]          2.5419377 2.8459749
## agec(79,99]          1.0944296 1.2587405
## educ0Prim            1.0418868 1.2121413
## educ1somehs          1.0591970 1.1747664
## educ3somecol         0.9263848 0.9799047
## educ4colgrad         0.6329635 0.6682503
## race_ethhispanic     1.1473301 1.2568094
## race_ethnh black     1.6194325 1.7449147
## race_ethnh multirace 1.1926827 1.4061529
## race_ethnh other     0.6920769 0.7927204

sjp.glmer(fit.mix.bin, facet.grid = F, type="ri.slope", vars=c("educ", "agec"), show.legend = F)

A multilevel logistic model with a higher level variable can be written: \[ln \left ( \frac {p_{ij}}{1-p{ij}} \right ) = \beta_{0j} + \sum {\beta_k x_{ik}} +\gamma z_j\] In this analysis, we will test the hypothesis:

“Inidividuals living in cities with newer housing will have lower odds of being obese, net of individual level factors”

So the hypothesis of interest here involves us testing whether the effect of the housing age medhousez shows a $\gamma$ coefficient different from zero.

fit.mix.bin2<-glmer(obese~male+agec+educ+race_eth+medhousez+(1|mmsa_fac), family=binomial, joindata,
                   control=glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=2e5)))

## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control
## $checkConv, : Model failed to converge with max|grad| = 0.00114374 (tol =
## 0.001, component 1)

fit.mix.bin2<-refit(fit.mix.bin2)
arm::display(fit.mix.bin2, detail=T)

## glmer(formula = obese ~ male + agec + educ + race_eth + medhousez + 
##     (1 | mmsa_fac), data = joindata, family = binomial, control = glmerControl(optimizer = "bobyqa", 
##     optCtrl = list(maxfun = 2e+05)))
##                      coef.est coef.se z value Pr(>|z|)
## (Intercept)           -1.63     0.03  -50.41    0.00  
## male                   0.06     0.01    5.43    0.00  
## agec(24,39]            0.82     0.03   26.68    0.00  
## agec(39,59]            1.11     0.03   38.47    0.00  
## agec(59,79]            0.99     0.03   34.26    0.00  
## agec(79,99]            0.16     0.04    4.47    0.00  
## educ0Prim              0.12     0.04    3.03    0.00  
## educ1somehs            0.11     0.03    4.15    0.00  
## educ3somecol          -0.05     0.01   -3.34    0.00  
## educ4colgrad          -0.43     0.01  -31.05    0.00  
## race_ethhispanic       0.18     0.02    7.95    0.00  
## race_ethnh black       0.52     0.02   27.43    0.00  
## race_ethnh multirace   0.26     0.04    6.19    0.00  
## race_ethnh other      -0.30     0.03   -8.66    0.00  
## medhousez             -0.05     0.01   -3.71    0.00  
## 
## Error terms:
##  Groups   Name        Std.Dev.
##  mmsa_fac (Intercept) 0.14    
##  Residual             1.00    
## ---
## number of obs: 171535, groups: mmsa_fac, 123
## AIC = 203073, DIC = 202427.4
## deviance = 202734.1

#odds ratios
exp(fixef(fit.mix.bin2)[-1])

##                 male          agec(24,39]          agec(39,59] 
##            1.0607928            2.2601965            3.0450495 
##          agec(59,79]          agec(79,99]            educ0Prim 
##            2.6892157            1.1734909            1.1242071 
##          educ1somehs         educ3somecol         educ4colgrad 
##            1.1159785            0.9532566            0.6507260 
##     race_ethhispanic     race_ethnh black race_ethnh multirace 
##            1.2024835            1.6851797            1.2959646 
##     race_ethnh other            medhousez 
##            0.7409598            0.9478214

exp(confint(fit.mix.bin, method="Wald"))

##                          2.5 %    97.5 %
## .sig01                      NA        NA
## (Intercept)          0.1822862 0.2070651
## male                 1.0384686 1.0836439
## agec(24,39]          2.1291295 2.3995023
## agec(39,59]          2.8784792 3.2237337
## agec(59,79]          2.5419377 2.8459749
## agec(79,99]          1.0944296 1.2587405
## educ0Prim            1.0418868 1.2121413
## educ1somehs          1.0591970 1.1747664
## educ3somecol         0.9263848 0.9799047
## educ4colgrad         0.6329635 0.6682503
## race_ethhispanic     1.1473301 1.2568094
## race_ethnh black     1.6194325 1.7449147
## race_ethnh multirace 1.1926827 1.4061529
## race_ethnh other     0.6920769 0.7927204

#LRT between models
anova(fit.mix.bin, fit.mix.bin2)

## Data: joindata
## Models:
## fit.mix.bin: obese ~ male + agec + educ + race_eth + (1 | mmsa_fac)
## fit.mix.bin2: obese ~ male + agec + educ + race_eth + medhousez + (1 | mmsa_fac)
##              Df    AIC    BIC  logLik deviance  Chisq Chi Df Pr(>Chisq)
## fit.mix.bin  15 203084 203235 -101527   203054                         
## fit.mix.bin2 16 203073 203234 -101520   203041 13.146      1  0.0002881
##                 
## fit.mix.bin     
## fit.mix.bin2 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

highmsa<-levels(joindata$mmsa_fac)[as.numeric(lapply(ranef(fit.mix.bin)$mmsa, which.max))]
lowmsa<-levels(joindata$mmsa_fac)[as.numeric(lapply(ranef(fit.mix.bin)$mmsa, which.min))]

In this case, the houseing age ** affects the probability that someone was obese, net of individual factors, although there was quite a lot of variance between cities in the level of obesity.

Predicted values from the model

These predicted values actually allow us to set up a counterfactual type of analysis. We can effectively change the houseing age variable to be older or younger than the real value to see how the outcome changes.

highmsa_house<-msa_ec@data[msa_ec$ids==highmsa,"medhousez"]
lowmsa_house<-msa_ec@data[msa_ec$ids==lowmsa,"medhousez"]

#get newdata using real values
dat1<-expand.grid(male=mean(joindata$male), agec=levels(joindata$agec)[3], educ=levels(joindata$educ), race_eth=levels(joindata$race_eth),mmsa_fac = c(lowmsa, highmsa) , medhousez=c(highmsa_house,lowmsa_house ))

#get newdata using artificially generated housing age values  
dat2<-expand.grid(male=mean(joindata$male), agec=levels(joindata$agec)[3], educ=levels(joindata$educ), race_eth=levels(joindata$race_eth),mmsa_fac = c(lowmsa, highmsa) , medhousez=c( 1))

#predictions
dat1$pred<-predict(fit.mix.bin2, dat1, type="response")
dat2$pred<-predict(fit.mix.bin2, dat2, type="response")


head(dat1[dat1$race_eth=="nh black"&dat1$educ=="2hsgrad",], n=10)

##         male    agec    educ race_eth mmsa_fac  medhousez      pred
## 11 0.4499432 (39,59] 2hsgrad nh black    19740 -0.3541129 0.4240806
## 36 0.4499432 (39,59] 2hsgrad nh black    26580 -0.3541129 0.6008181
## 61 0.4499432 (39,59] 2hsgrad nh black    19740  0.5982027 0.4116671
## 86 0.4499432 (39,59] 2hsgrad nh black    26580  0.5982027 0.5885178

head(dat2[dat2$race_eth=="nh black"&dat2$educ=="2hsgrad",], n=10)

##         male    agec    educ race_eth mmsa_fac medhousez      pred
## 11 0.4499432 (39,59] 2hsgrad nh black    19740         1 0.4064622
## 36 0.4499432 (39,59] 2hsgrad nh black    26580         1 0.5832937

So, we see for instance in Huntington-Ashland, WV-KY-OH, Metropolitan Statistical Area the obesity rate for NH Blacks, with high school education is estimated to be 42.4%, but the houseing is older than average there 0.5982027 , if the housing were newer, the rate would be 40.6%. That is called a counterfactual, because we are artificially changing a variable to a different value, to see the result on the outcome. However, you do not see a change in Denver-Aurora-Lakewood, CO, Metropolitan Statistical Area, because the housing is already new there. Multilevel models are very useful for this.