In this example, I introduce how to fit the multi-level model using the lme4 package. This example continues from the first example and considers the linear case of the model the use of a higher-level predictor and the formation of a cross level interaction model. The example also considers an individual level binary outcome to illustrate the logistic Generalized Linear Mixed Model (GLMM).
This example shows how to: * Examine variation between groups using fixed effects * Fit the basic random intercept model : +\(y_{ij} = \mu + u_{j} + e_{ij}\) with \(u_j \sim N(0, \sigma^2)\) Where the intercepts (\(u_j\)) for each group vary randomly around the overall mean (\(\mu\))
*I also illustrate how to include group-level covariates and how to the model for cross level interactions.
The example merges data from the 2016 CDC Behavioral Risk Factor Surveillance System (BRFSS) SMART MSA data. Link and the 2010 American Community Survey 5-year estimates at the MSA level. More details on these data are found below.
#load brfss
library(car)
library(stargazer)
library(survey)
library(sjPlot)
library(ggplot2)
library(pander)
library(knitr)
library(lme4)
load("~/Google Drive/classes/dem7283/class18/data/brfss16_mmsa.Rdata")
set.seed(12345)
#samps<-sample(1:nrow(brfss16m), size = 40000, replace=F)
#brfss16m<-brfss16m[samps,]
#The names in the data are very ugly, so I make them less ugly
nams<-names(brfss16m)
#we see some names are lower case, some are upper and some have a little _ in the first position. This is a nightmare.
newnames<-gsub(pattern = "_",replacement = "",x = nams)
names(brfss16m)<-tolower(newnames)
#sex
brfss16m$male<-ifelse(brfss16m$sex==1, 1, 0)
#BMI
brfss16m$bmi<-ifelse(is.na(brfss16m$bmi5)==T, NA, brfss16m$bmi5/100)
#Healthy days
brfss16m$healthdays<-Recode(brfss16m$physhlth, recodes = "88=0; 77=NA; 99=NA")
#Healthy mental health days
brfss16m$healthmdays<-Recode(brfss16m$menthlth, recodes = "88=0; 77=NA; 99=NA")
brfss16m$badhealth<-Recode(brfss16m$genhlth, recodes="4:5=1; 1:3=0; else=NA")
#race/ethnicity
brfss16m$black<-Recode(brfss16m$racegr3, recodes="2=1; 9=NA; else=0")
brfss16m$white<-Recode(brfss16m$racegr3, recodes="1=1; 9=NA; else=0")
brfss16m$other<-Recode(brfss16m$racegr3, recodes="3:4=1; 9=NA; else=0")
brfss16m$hispanic<-Recode(brfss16m$racegr3, recodes="5=1; 9=NA; else=0")
brfss16m$race_eth<-Recode(brfss16m$racegr3,
recodes="1='nhwhite'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA",
as.factor.result = T)
brfss16m$race_eth<-relevel(brfss16m$race_eth, ref = "nhwhite")
#insurance
brfss16m$ins<-Recode(brfss16m$hlthpln1, recodes ="7:9=NA; 1=1;2=0")
#income grouping
brfss16m$inc<-Recode(brfss16m$incomg, recodes = "9= NA;1='1_lt15k'; 2='2_15-25k';3='3_25-35k';4='4_35-50k';5='5_50kplus'", as.factor.result = T)
brfss16m$inc<-as.ordered(brfss16m$inc)
#education level
brfss16m$educ<-Recode(brfss16m$educa,
recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA",
as.factor.result=T)
brfss16m$educ<-relevel(brfss16m$educ, ref='2hsgrad')
#employloyment
brfss16m$employ<-Recode(brfss16m$employ1,
recodes="1:2='employloyed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA",
as.factor.result=T)
brfss16m$employ<-relevel(brfss16m$employ, ref='employloyed')
#marital status
brfss16m$marst<-Recode(brfss16m$marital,
recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA",
as.factor.result=T)
brfss16m$marst<-relevel(brfss16m$marst, ref='married')
#Age cut into intervals
brfss16m$agec<-cut(brfss16m$age80, breaks=c(0,24,39,59,79,99))
#BMI, in the brfss16ma the bmi variable has 2 implied decimal places,
#so we must divide by 100 to get real bmi's
brfss16m$bmi<-brfss16m$bmi5/100
#smoking currently
brfss16m$smoke<-Recode(brfss16m$smoker3,
recodes="1:2=1; 3:4=0; else=NA")
#brfss16m$smoke<-relevel(brfss16m$smoke, ref = "NeverSmoked")
brfss16m$obese<-ifelse(is.na(brfss16m$bmi)==T, NA,
ifelse(brfss16m$bmi>30,1,0))
We will often be interested in factors at both the individual AND contextual levels. To illustrate this, I will use data from the American Community Survey measured at the MSA level. Specifically, I use the DP3 table, which provides economic characteristics of places, from the 2010 5 year ACS Link.
library(acs)
#Get 2010 ACS median household incomes for tracts in Texas
msaacs<-geo.make(msa="*")
#ACS tables B17 is poverty, B19 is Gini, B25001 is housing vacancy, B25035 is median year built
acsecon<-acs.fetch(key=mykey, endyear=2010, span=5, geography=msaacs, variable = c("B19083_001","B17001_001","B17001_002" , "B25002_001","B25002_003", "B25035_001"))
colnames(acsecon@estimate)
## [1] "B19083_001" "B17001_001" "B17001_002" "B25002_001" "B25002_003"
## [6] "B25035_001"
msaecon<-data.frame(gini=acsecon@estimate[, "B19083_001"],
ppoverty=acsecon@estimate[, "B17001_002"]/acsecon@estimate[, "B17001_001"],
giniz=scale(acsecon@estimate[, "B19083_001"]),
pvacant=acsecon@estimate[,"B25002_003"]/acsecon@estimate[, "B25002_001"],
ppovertyz=scale(acsecon@estimate[, "B17001_002"]/acsecon@estimate[, "B17001_001"]),
pvacantz=scale(acsecon@estimate[,"B25002_003"]/acsecon@estimate[, "B25002_001"]),
medhouse=acsecon@estimate[, "B25035_001" ],
medhousez=scale(acsecon@estimate[, "B25035_001" ]))
msaecon$ids<-paste(acsecon@geography$metropolitanstatisticalareamicropolitanstatisticalarea)
head(msaecon)
## gini ppoverty giniz
## Adjuntas, PR Micro Area 0.525 0.5925656 2.718785
## Aguadilla-Isabela-San Sebastián, PR Metro Area 0.529 0.5577311 2.847562
## Coamo, PR Micro Area 0.532 0.5633984 2.944145
## Fajardo, PR Metro Area 0.483 0.4309070 1.366623
## Guayama, PR Metro Area 0.518 0.4980518 2.493425
## Jayuya, PR Micro Area 0.502 0.5446137 1.978315
## pvacant ppovertyz
## Adjuntas, PR Micro Area 0.1996029 6.271934
## Aguadilla-Isabela-San Sebastián, PR Metro Area 0.2005956 5.762039
## Coamo, PR Micro Area 0.1500353 5.844994
## Fajardo, PR Metro Area 0.3143172 3.905632
## Guayama, PR Metro Area 0.1951184 4.888474
## Jayuya, PR Micro Area 0.1562109 5.570031
## pvacantz medhouse
## Adjuntas, PR Micro Area 0.9053869 1978
## Aguadilla-Isabela-San Sebastián, PR Metro Area 0.9199516 1979
## Coamo, PR Micro Area 0.1781186 1979
## Fajardo, PR Metro Area 2.5885036 1977
## Guayama, PR Metro Area 0.8395882 1978
## Jayuya, PR Micro Area 0.2687281 1978
## medhousez ids
## Adjuntas, PR Micro Area 0.4923898 10260
## Aguadilla-Isabela-San Sebastián, PR Metro Area 0.5982027 10380
## Coamo, PR Micro Area 0.5982027 17620
## Fajardo, PR Metro Area 0.3865770 21940
## Guayama, PR Metro Area 0.4923898 25020
## Jayuya, PR Micro Area 0.4923898 27580
summary(msaecon)
## gini ppoverty giniz pvacant
## Min. :0.3510 Min. :0.02425 Min. :-2.88303 Min. :0.04246
## 1st Qu.:0.4190 1st Qu.:0.12308 1st Qu.:-0.69381 1st Qu.:0.09454
## Median :0.4390 Median :0.15172 Median :-0.04993 Median :0.12076
## Mean :0.4406 Mean :0.16409 Mean : 0.00000 Mean :0.13790
## 3rd Qu.:0.4590 3rd Qu.:0.18746 3rd Qu.: 0.59396 3rd Qu.:0.15964
## Max. :0.5730 Max. :0.59257 Max. : 4.26411 Max. :0.63877
## ppovertyz pvacantz medhouse medhousez
## Min. :-2.0468 Min. :-1.4002 Min. :1940 Min. :-3.5285
## 1st Qu.:-0.6002 1st Qu.:-0.6361 1st Qu.:1968 1st Qu.:-0.5657
## Median :-0.1810 Median :-0.2514 Median :1975 Median : 0.1750
## Mean : 0.0000 Mean : 0.0000 Mean :1973 Mean : 0.0000
## 3rd Qu.: 0.3421 3rd Qu.: 0.3190 3rd Qu.:1980 3rd Qu.: 0.7040
## Max. : 6.2719 Max. : 7.3489 Max. :1998 Max. : 2.6086
## ids
## Length:955
## Class :character
## Mode :character
##
##
##
Let’s see the geographic variation in these economic indicators:
library(tigris)
## To enable
## caching of data, set `options(tigris_use_cache = TRUE)` in your R script or .Rprofile.
##
## Attaching package: 'tigris'
## The following object is masked from 'package:graphics':
##
## plot
msa<-core_based_statistical_areas(cb=T)
msa_ec<-geo_join(msa, msaecon, "CBSAFP", "ids", how="inner")
library(RColorBrewer)
library(sp)
spplot(msa_ec, "gini", at=quantile(msa_ec$gini), col.regions=brewer.pal(n=6, "Greys"), col="transparent")
spplot(msa_ec, "pvacant", at=quantile(msa_ec$pvacant), col.regions=brewer.pal(n=6, "Greys"), col="transparent")
spplot(msa_ec, "medhouse", at=quantile(msa_ec$medhouse), col.regions=brewer.pal(n=6, "Greys"), col="transparent")
Merge the MSA data to the BRFSS data
msa_ec$state<-str_sub(msa_ec$NAME, -2,-1)
msaecon<-merge(msaecon, msa_ec@data[, c(1:7,18)], by.x="ids", by.y="CBSAFP", all.x=T)
joindata<-merge(brfss16m, msaecon, by.x="mmsa",by.y="ids", all.x=T)
joindata$bmiz<-as.numeric(scale(joindata$bmi, center=T, scale=T))
#joindata<-joindata[complete.cases(joindata[, c("bmiz", "race_eth", "agec", "educ", "gini")]),]
#and merge the data back to the kids data
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:acs':
##
## combine
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
joindata<-joindata%>%
select(bmiz, obese, mmsa, agec, educ, race_eth,smoke, healthmdays, badhealth,bmi,gini, ppoverty, ppovertyz,giniz,medhouse,medhousez, pvacant, pvacantz, male, mmsawt, mmsaname, state )%>%
filter(complete.cases(.))
head(joindata[, c("bmiz","obese", "male", "agec", "educ", "gini","medhouse", "pvacant", "mmsa", "state")])
## bmiz obese male agec educ gini medhouse pvacant mmsa
## 1 -0.5184799 0 0 (79,99] 3somecol 0.427 1960 0.1115985 10580
## 2 0.3421579 0 0 (59,79] 2hsgrad 0.427 1960 0.1115985 10580
## 3 -0.2272375 0 1 (79,99] 4colgrad 0.427 1960 0.1115985 10580
## 4 -0.3646778 0 1 (59,79] 2hsgrad 0.427 1960 0.1115985 10580
## 5 0.3683370 1 0 (79,99] 3somecol 0.427 1960 0.1115985 10580
## 6 -1.1026011 0 0 (39,59] 1somehs 0.427 1960 0.1115985 10580
## state
## 1 NY
## 2 NY
## 3 NY
## 4 NY
## 5 NY
## 6 NY
meanbmi<-mean(joindata$bmi, na.rm=T)
sdbmi<-sd(joindata$bmi, na.rm=T)
Again, we are assuming a set of propositions where we have an individual level outcome, and predictor variables at both the micro and macro levels. This can be shown in several different ways.
from
The first case is a macro to micro proposition, which may be exemplified by a statement such as: “Individuals in areas with high rates of poverty are more likely to be obese”.
Whereas the second frame illustrates a more specific special case, where there is a macro level effect, net of the individual level predictor, and may be stated “Individuals who are racial/ethnic minorities are more likely to be obese, compared to Non-Hispanic whites, and living in areas of high poverty also increase the risk of obesity”.
The last panel illustrates what is known as a cross level interaction, or a macro-micro interaction. This is where the relationship between x and y is dependent on Z. This leads to the statement “Individuals who are racial/ethnic minorities, and who live in areas with high poverty rates are at higher risk of obesity”.
First, I fit the multi level model, then the cross level interaction model:
#MSA level variables
fit.mix2<-lmer(bmiz~agec+educ+race_eth+medhousez+(1|mmsa), data=joindata)
arm::display(fit.mix2, detail=T)
## lmer(formula = bmiz ~ agec + educ + race_eth + medhousez + (1 |
## mmsa), data = joindata)
## coef.est coef.se t value
## (Intercept) -0.41 0.01 -32.45
## agec(24,39] 0.47 0.01 40.38
## agec(39,59] 0.63 0.01 58.30
## agec(59,79] 0.55 0.01 51.26
## agec(79,99] 0.16 0.01 12.49
## educ0Prim 0.09 0.02 5.27
## educ1somehs 0.03 0.01 2.56
## educ3somecol -0.01 0.01 -2.29
## educ4colgrad -0.20 0.01 -32.77
## race_ethhispanic 0.11 0.01 10.68
## race_ethnh black 0.30 0.01 33.46
## race_ethnh multirace 0.14 0.02 7.32
## race_ethnh other -0.16 0.01 -11.45
## medhousez -0.03 0.01 -4.91
##
## Error terms:
## Groups Name Std.Dev.
## mmsa (Intercept) 0.07
## Residual 0.98
## ---
## number of obs: 171535, groups: mmsa, 123
## AIC = 479937, DIC = 479692.3
## deviance = 479798.4
So, we see that in MSAs with newer housing, the BMI is lower on average, net of individual level factors.
This involves forming the interaction term between an individual level variable and a higher level variable. In this case, we will interact race/ethnicity with houseing age. We can state this kind of hypothesis as “Ethinic minority groups, living in areas with newer housing will have all have lower BMI’s”. It’s important to note the statement, where you situate the individual level characteristic (ethnicity) within the higher level context (housing in MSAs). This will allow us to test whether all groups benefit equally from newer housing.
fit.mix4<-lmer(bmiz~agec+educ+race_eth+medhousez*race_eth+(1|mmsa), joindata, REML=F)
arm::display(fit.mix4, detail=T)
## lmer(formula = bmiz ~ agec + educ + race_eth + medhousez * race_eth +
## (1 | mmsa), data = joindata, REML = F)
## coef.est coef.se t value
## (Intercept) -0.40 0.01 -32.56
## agec(24,39] 0.47 0.01 40.36
## agec(39,59] 0.63 0.01 58.28
## agec(59,79] 0.55 0.01 51.26
## agec(79,99] 0.16 0.01 12.49
## educ0Prim 0.09 0.02 5.27
## educ1somehs 0.03 0.01 2.57
## educ3somecol -0.01 0.01 -2.29
## educ4colgrad -0.20 0.01 -32.79
## race_ethhispanic 0.12 0.01 10.54
## race_ethnh black 0.29 0.01 30.51
## race_ethnh multirace 0.14 0.02 7.35
## race_ethnh other -0.16 0.01 -11.43
## medhousez -0.04 0.01 -5.17
## race_ethhispanic:medhousez -0.02 0.01 -1.48
## race_ethnh black:medhousez 0.03 0.01 3.50
## race_ethnh multirace:medhousez -0.02 0.02 -0.92
## race_ethnh other:medhousez 0.02 0.01 1.21
##
## Error terms:
## Groups Name Std.Dev.
## mmsa (Intercept) 0.07
## Residual 0.98
## ---
## number of obs: 171535, groups: mmsa, 123
## AIC = 479821, DIC = 479780.8
## deviance = 479780.8
This basically says that blacks in MSAs with newer than average housing stock (medhousez==1 vs medhousez==0) have higher BMI’s, while the BMI of the other groups does is not affected by housing.
#compare the models with a LRT
anova(fit.mix2, fit.mix4)
## refitting model(s) with ML (instead of REML)
## Data: joindata
## Models:
## fit.mix2: bmiz ~ agec + educ + race_eth + medhousez + (1 | mmsa)
## fit.mix4: bmiz ~ agec + educ + race_eth + medhousez * race_eth + (1 | mmsa)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## fit.mix2 16 479830 479991 -239899 479798
## fit.mix4 20 479821 480022 -239890 479781 17.585 4 0.001487 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Generli
I will illustrate how to fit Generalized Linear Mixed models to outcomes that are not continuous. I will illustrate the Laplace approximation using the glmer() function in the lme4 library.
NOTE GLMM’s can take much longer to estimate than LMM’s, don’t be surprised if your model takes a long time to run. Also don’t be surprised if you see warning messages like:
Warning message: In checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model failed to converge with max|grad| = somenumber (tol = 0.001)
This is a warning message that the computer did not reach a satisfactory numerical solution for your model. This just means we may have to tweak your model a little to get R to converge. This Rpubs document provides an excellent section on the technical matters involved in fitting the models in R.
You may have to refit the model, meaning restart the optimization process where it stopped. This can be done using the refit() function in lme4.
The First model will fit a model that considers a binary response, if a person’s BMI was greater than 30, the cut off for obese weight status.
A research question corresponding to this could be:
“How do sociodemographic factors affect the odds or being obese in US cities?”
If we were to write the symbolic form of this model it would be the logistic regression model:
\[ln \left ( \frac {p_{ij}}{1-p{ij}} \right ) = \beta_{0j} + \sum {\beta_k x_{ik}} \]
\[\beta_{0j} = \beta_0 + u_j\]
\[u_j \sim N(0, \sigma^2_u)\] Note, the logistic regression function does not have a separate independent residual variance at the individual level. In fact, owing to the logistic distribution function, the variance of a logistic distribution with scale parameter equal to 1 is \(\frac{\pi^2}{3}\). Link to article about this.
the glmer() function will estimate this model:
joindata$mmsa_fac<-as.factor(joindata$mmsa)
fit.mix.bin<-glmer(obese~male+agec+educ+race_eth+(1|mmsa_fac), family=binomial, joindata,
control=glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=2e5)))
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control
## $checkConv, : Model failed to converge with max|grad| = 0.00102225 (tol =
## 0.001, component 1)
fit.mix.bin<-refit(fit.mix.bin)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control
## $checkConv, : Model failed to converge with max|grad| = 0.00138182 (tol =
## 0.001, component 1)
arm::display(fit.mix.bin, detail=T)
## glmer(formula = obese ~ male + agec + educ + race_eth + (1 |
## mmsa_fac), data = joindata, family = binomial, control = glmerControl(optimizer = "bobyqa",
## optCtrl = list(maxfun = 2e+05)))
## coef.est coef.se z value Pr(>|z|)
## (Intercept) -1.64 0.03 -50.39 0.00
## male 0.06 0.01 5.43 0.00
## agec(24,39] 0.82 0.03 26.74 0.00
## agec(39,59] 1.11 0.03 38.55 0.00
## agec(59,79] 0.99 0.03 34.33 0.00
## agec(79,99] 0.16 0.04 4.49 0.00
## educ0Prim 0.12 0.04 3.02 0.00
## educ1somehs 0.11 0.03 4.14 0.00
## educ3somecol -0.05 0.01 -3.38 0.00
## educ4colgrad -0.43 0.01 -31.09 0.00
## race_ethhispanic 0.18 0.02 7.87 0.00
## race_ethnh black 0.52 0.02 27.28 0.00
## race_ethnh multirace 0.26 0.04 6.15 0.00
## race_ethnh other -0.30 0.03 -8.67 0.00
##
## Error terms:
## Groups Name Std.Dev.
## mmsa_fac (Intercept) 0.15
## Residual 1.00
## ---
## number of obs: 171535, groups: mmsa_fac, 123
## AIC = 203084, DIC = 202410.7
## deviance = 202732.4
#odds ratios
exp(fixef(fit.mix.bin)[-1])
## male agec(24,39] agec(39,59]
## 1.0608158 2.2602768 3.0462191
## agec(59,79] agec(79,99] educ0Prim
## 2.6896637 1.1737133 1.1237945
## educ1somehs educ3somecol educ4colgrad
## 1.1154861 0.9527690 0.6503676
## race_ethhispanic race_ethnh black race_ethnh multirace
## 1.2008227 1.6810032 1.2950267
## race_ethnh other
## 0.7406912
exp(confint(fit.mix.bin, method="Wald"))
## 2.5 % 97.5 %
## .sig01 NA NA
## (Intercept) 0.1822862 0.2070651
## male 1.0384686 1.0836439
## agec(24,39] 2.1291295 2.3995023
## agec(39,59] 2.8784792 3.2237337
## agec(59,79] 2.5419377 2.8459749
## agec(79,99] 1.0944296 1.2587405
## educ0Prim 1.0418868 1.2121413
## educ1somehs 1.0591970 1.1747664
## educ3somecol 0.9263848 0.9799047
## educ4colgrad 0.6329635 0.6682503
## race_ethhispanic 1.1473301 1.2568094
## race_ethnh black 1.6194325 1.7449147
## race_ethnh multirace 1.1926827 1.4061529
## race_ethnh other 0.6920769 0.7927204
sjp.glmer(fit.mix.bin, facet.grid = F, type="ri.slope", vars=c("educ", "agec"), show.legend = F)
A multilevel logistic model with a higher level variable can be written: \[ln \left ( \frac {p_{ij}}{1-p{ij}} \right ) = \beta_{0j} + \sum {\beta_k x_{ik}} +\gamma z_j\] In this analysis, we will test the hypothesis:
“Inidividuals living in cities with newer housing will have lower odds of being obese, net of individual level factors”
So the hypothesis of interest here involves us testing whether the effect of the housing age medhousez shows a \(\gamma\) coefficient different from zero.
fit.mix.bin2<-glmer(obese~male+agec+educ+race_eth+medhousez+(1|mmsa_fac), family=binomial, joindata,
control=glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=2e5)))
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control
## $checkConv, : Model failed to converge with max|grad| = 0.00114374 (tol =
## 0.001, component 1)
fit.mix.bin2<-refit(fit.mix.bin2)
arm::display(fit.mix.bin2, detail=T)
## glmer(formula = obese ~ male + agec + educ + race_eth + medhousez +
## (1 | mmsa_fac), data = joindata, family = binomial, control = glmerControl(optimizer = "bobyqa",
## optCtrl = list(maxfun = 2e+05)))
## coef.est coef.se z value Pr(>|z|)
## (Intercept) -1.63 0.03 -50.41 0.00
## male 0.06 0.01 5.43 0.00
## agec(24,39] 0.82 0.03 26.68 0.00
## agec(39,59] 1.11 0.03 38.47 0.00
## agec(59,79] 0.99 0.03 34.26 0.00
## agec(79,99] 0.16 0.04 4.47 0.00
## educ0Prim 0.12 0.04 3.03 0.00
## educ1somehs 0.11 0.03 4.15 0.00
## educ3somecol -0.05 0.01 -3.34 0.00
## educ4colgrad -0.43 0.01 -31.05 0.00
## race_ethhispanic 0.18 0.02 7.95 0.00
## race_ethnh black 0.52 0.02 27.43 0.00
## race_ethnh multirace 0.26 0.04 6.19 0.00
## race_ethnh other -0.30 0.03 -8.66 0.00
## medhousez -0.05 0.01 -3.71 0.00
##
## Error terms:
## Groups Name Std.Dev.
## mmsa_fac (Intercept) 0.14
## Residual 1.00
## ---
## number of obs: 171535, groups: mmsa_fac, 123
## AIC = 203073, DIC = 202427.4
## deviance = 202734.1
#odds ratios
exp(fixef(fit.mix.bin2)[-1])
## male agec(24,39] agec(39,59]
## 1.0607928 2.2601965 3.0450495
## agec(59,79] agec(79,99] educ0Prim
## 2.6892157 1.1734909 1.1242071
## educ1somehs educ3somecol educ4colgrad
## 1.1159785 0.9532566 0.6507260
## race_ethhispanic race_ethnh black race_ethnh multirace
## 1.2024835 1.6851797 1.2959646
## race_ethnh other medhousez
## 0.7409598 0.9478214
exp(confint(fit.mix.bin, method="Wald"))
## 2.5 % 97.5 %
## .sig01 NA NA
## (Intercept) 0.1822862 0.2070651
## male 1.0384686 1.0836439
## agec(24,39] 2.1291295 2.3995023
## agec(39,59] 2.8784792 3.2237337
## agec(59,79] 2.5419377 2.8459749
## agec(79,99] 1.0944296 1.2587405
## educ0Prim 1.0418868 1.2121413
## educ1somehs 1.0591970 1.1747664
## educ3somecol 0.9263848 0.9799047
## educ4colgrad 0.6329635 0.6682503
## race_ethhispanic 1.1473301 1.2568094
## race_ethnh black 1.6194325 1.7449147
## race_ethnh multirace 1.1926827 1.4061529
## race_ethnh other 0.6920769 0.7927204
#LRT between models
anova(fit.mix.bin, fit.mix.bin2)
## Data: joindata
## Models:
## fit.mix.bin: obese ~ male + agec + educ + race_eth + (1 | mmsa_fac)
## fit.mix.bin2: obese ~ male + agec + educ + race_eth + medhousez + (1 | mmsa_fac)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## fit.mix.bin 15 203084 203235 -101527 203054
## fit.mix.bin2 16 203073 203234 -101520 203041 13.146 1 0.0002881
##
## fit.mix.bin
## fit.mix.bin2 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
highmsa<-levels(joindata$mmsa_fac)[as.numeric(lapply(ranef(fit.mix.bin)$mmsa, which.max))]
lowmsa<-levels(joindata$mmsa_fac)[as.numeric(lapply(ranef(fit.mix.bin)$mmsa, which.min))]
In this case, the houseing age ** affects the probability that someone was obese, net of individual factors, although there was quite a lot of variance between cities in the level of obesity.
These predicted values actually allow us to set up a counterfactual type of analysis. We can effectively change the houseing age variable to be older or younger than the real value to see how the outcome changes.
highmsa_house<-msa_ec@data[msa_ec$ids==highmsa,"medhousez"]
lowmsa_house<-msa_ec@data[msa_ec$ids==lowmsa,"medhousez"]
#get newdata using real values
dat1<-expand.grid(male=mean(joindata$male), agec=levels(joindata$agec)[3], educ=levels(joindata$educ), race_eth=levels(joindata$race_eth),mmsa_fac = c(lowmsa, highmsa) , medhousez=c(highmsa_house,lowmsa_house ))
#get newdata using artificially generated housing age values
dat2<-expand.grid(male=mean(joindata$male), agec=levels(joindata$agec)[3], educ=levels(joindata$educ), race_eth=levels(joindata$race_eth),mmsa_fac = c(lowmsa, highmsa) , medhousez=c( 1))
#predictions
dat1$pred<-predict(fit.mix.bin2, dat1, type="response")
dat2$pred<-predict(fit.mix.bin2, dat2, type="response")
head(dat1[dat1$race_eth=="nh black"&dat1$educ=="2hsgrad",], n=10)
## male agec educ race_eth mmsa_fac medhousez pred
## 11 0.4499432 (39,59] 2hsgrad nh black 19740 -0.3541129 0.4240806
## 36 0.4499432 (39,59] 2hsgrad nh black 26580 -0.3541129 0.6008181
## 61 0.4499432 (39,59] 2hsgrad nh black 19740 0.5982027 0.4116671
## 86 0.4499432 (39,59] 2hsgrad nh black 26580 0.5982027 0.5885178
head(dat2[dat2$race_eth=="nh black"&dat2$educ=="2hsgrad",], n=10)
## male agec educ race_eth mmsa_fac medhousez pred
## 11 0.4499432 (39,59] 2hsgrad nh black 19740 1 0.4064622
## 36 0.4499432 (39,59] 2hsgrad nh black 26580 1 0.5832937
So, we see for instance in Huntington-Ashland, WV-KY-OH, Metropolitan Statistical Area the obesity rate for NH Blacks, with high school education is estimated to be 42.4%, but the houseing is older than average there 0.5982027 , if the housing were newer, the rate would be 40.6%. That is called a counterfactual, because we are artificially changing a variable to a different value, to see the result on the outcome. However, you do not see a change in Denver-Aurora-Lakewood, CO, Metropolitan Statistical Area, because the housing is already new there. Multilevel models are very useful for this.