Multilevel Models with R

Load packages and BRFSS Data

library(car)
## Loading required package: carData
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(sjPlot)
## Warning: package 'sjPlot' was built under R version 4.0.4
## Registered S3 methods overwritten by 'lme4':
##   method                          from
##   cooks.distance.influence.merMod car 
##   influence.merMod                car 
##   dfbeta.influence.merMod         car 
##   dfbetas.influence.merMod        car
library(ggplot2)
library(pander)
library(knitr)
library(tidycensus)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
load(url("https://github.com/coreysparks/data/blob/master/brfss_2017.Rdata?raw=true"))

set.seed(12345)
#samps<-sample(1:nrow(brfss_17), size = 40000, replace=F)
#brfss_17<-brfss_17[samps,]
#The names in the data are very ugly, so I make them less ugly
nams<-names(brfss_17)
#we see some names are lower case, some are upper and some have a little _ in the first position. This is a nightmare.

newnames<-gsub(pattern = "_",
               replacement =  "",
               x =  nams)

names(brfss_17)<-tolower(newnames)

Recode Variables

#sex
brfss_17$male<-ifelse(brfss_17$sex==1, 1, 0)

#BMI
brfss_17$bmi<-ifelse(is.na(brfss_17$bmi5)==T, NA, brfss_17$bmi5/100)

#Healthy days
brfss_17$healthdays<-Recode(brfss_17$physhlth, recodes = "88=0; 77=NA; 99=NA")

#Healthy mental health days
brfss_17$healthmdays<-Recode(brfss_17$menthlth, recodes = "88=0; 77=NA; 99=NA")

brfss_17$badhealth<-Recode(brfss_17$genhlth, recodes="4:5=1; 1:3=0; else=NA")
#race/ethnicity
brfss_17$black<-Recode(brfss_17$racegr3, recodes="2=1; 9=NA; else=0")
brfss_17$white<-Recode(brfss_17$racegr3, recodes="1=1; 9=NA; else=0")
brfss_17$other<-Recode(brfss_17$racegr3, recodes="3:4=1; 9=NA; else=0")
brfss_17$hispanic<-Recode(brfss_17$racegr3, recodes="5=1; 9=NA; else=0")

brfss_17$race_eth<-Recode(brfss_17$racegr3, 
recodes="1='nhwhite'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA",
as.factor = T)
brfss_17$race_eth<-relevel(brfss_17$race_eth, ref = "nhwhite")

#insurance
brfss_17$ins<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0")

#income grouping
brfss_17$inc<-Recode(brfss_17$incomg, recodes = "9= NA;1='1_lt15k'; 2='2_15-25k';3='3_25-35k';4='4_35-50k';5='5_50kplus'", as.factor = T)
brfss_17$inc<-as.ordered(brfss_17$inc)
#education level
brfss_17$educ<-Recode(brfss_17$educa,
recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA",
as.factor=T)
brfss_17$educ<-relevel(brfss_17$educ, ref='2hsgrad')

#employment
brfss_17$employ<-Recode(brfss_17$employ1,
recodes="1:2='employloyed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA",
as.factor=T)
brfss_17$employ<-relevel(brfss_17$employ, ref='employloyed')

#marital status
brfss_17$marst<-Recode(brfss_17$marital,
recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA",
as.factor=T)
brfss_17$marst<-relevel(brfss_17$marst,
                        ref='married')

#Age cut into intervals
brfss_17$agec<-cut(brfss_17$age80,
                   breaks=c(0,24,39,59,79,99))

#BMI, in the brfss_17a the bmi variable has 2 implied decimal places,
#so we must divide by 100 to get real bmi's

brfss_17$bmi<-brfss_17$bmi5/100

#smoking currently
brfss_17$smoke<-Recode(brfss_17$smoker3, 
recodes="1:2=1; 3:4=0; else=NA")
#brfss_17$smoke<-relevel(brfss_17$smoke, ref = "NeverSmoked")

brfss_17$obese<-ifelse(is.na(brfss_17$bmi)==T, NA, 
                       ifelse(brfss_17$bmi>30,1,0))

1. Estimate the random intercept model with just the higher level units as your predictor (NULL model)

usacs<-get_acs(geography = "metropolitan statistical area/micropolitan statistical area", year = 2011,
                variables=c( "DP05_0001E",
                             "DP03_0062E",
                             "DP04_0003PE") ,
                summary_var = "B01001_001",
                geometry = F,
               output = "wide")
## Getting data from the 2007-2011 5-year ACS
## Using the ACS Data Profile
usacs<-usacs%>%
  mutate(totpop= DP05_0001E,
         medhhinc=DP03_0062E,
         pvacant=DP04_0003PE/100)%>%
  dplyr::select(GEOID,NAME, totpop, medhhinc, pvacant)


head(usacs)
## # A tibble: 6 x 5
##   GEOID NAME                                 totpop medhhinc pvacant
##   <chr> <chr>                                 <dbl>    <dbl>   <dbl>
## 1 15100 Brookings, SD Micro Area              31639    46101   0.107
## 2 15140 Brownsville, TN Micro Area            18847    33504   0.135
## 3 15180 Brownsville-Harlingen, TX Metro Area 400332    32156   0.188
## 4 15220 Brownwood, TX Micro Area              38136    39965   0.264
## 5 15260 Brunswick, GA Metro Area             110738    46199   0.244
## 6 15340 Bucyrus, OH Micro Area                43998    41336   0.113

Clean and standardize contextual data

myscale<-function(x){as.numeric(scale(x))}
usacs<-usacs %>%
  mutate_at(c( "medhhinc", "pvacant"),myscale)

merged<-usacs%>%
  filter(complete.cases(.))
head(merged)
## # A tibble: 6 x 5
##   GEOID NAME                                 totpop medhhinc pvacant
##   <chr> <chr>                                 <dbl>    <dbl>   <dbl>
## 1 15100 Brookings, SD Micro Area              31639   0.0782 -0.485 
## 2 15140 Brownsville, TN Micro Area            18847  -1.21   -0.0850
## 3 15180 Brownsville-Harlingen, TX Metro Area 400332  -1.34    0.673 
## 4 15220 Brownwood, TX Micro Area              38136  -0.548   1.76  
## 5 15260 Brunswick, GA Metro Area             110738   0.0882  1.47  
## 6 15340 Bucyrus, OH Micro Area                43998  -0.408  -0.400
merged<-merge(x=brfss_17,
              y=merged,
              by.x="mmsa",
              by.y="GEOID",
              all.x=F)

Merge MSA data to BRFSS data

merged$bmiz<-as.numeric(scale(merged$bmi,
                              center=T,
                              scale=T))
#merged<-merged[complete.cases(merged[, c("bmiz", "race_eth", "agec", "educ", "gini")]),]
#and merge the data back to the kids data

merged<-merged%>%
  dplyr::select(bmiz, obese, mmsa, agec, educ, race_eth,smoke, healthmdays, badhealth,bmi,medhhinc,pvacant, male, mmsawt, mmsaname )%>%
  filter(complete.cases(.))


head(merged[, c("bmiz", "male", "agec", "educ","medhhinc", "pvacant", "mmsa")])
##         bmiz male    agec     educ  medhhinc    pvacant  mmsa
## 1 -0.7061616    1 (39,59] 3somecol 0.2101566 -0.6427496 10100
## 2 -0.5879484    1 (24,39] 3somecol 0.2101566 -0.6427496 10100
## 3 -1.2732656    0 (59,79] 3somecol 0.2101566 -0.6427496 10100
## 4 -0.8770916    1 (59,79] 4colgrad 0.2101566 -0.6427496 10100
## 5 -0.5000872    0 (59,79]  2hsgrad 0.2101566 -0.6427496 10100
## 6 -0.5639863    0 (39,59]  2hsgrad 0.2101566 -0.6427496 10100

Initial test for significant variation b/w groups using ANOVA

fit.ob<-glm(badhealth~as.factor(mmsa),
            family=binomial,
            merged)

anova(fit.ob, test="Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: badhealth
## 
## Terms added sequentially (first to last)
## 
## 
##                  Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                            159428     149062              
## as.factor(mmsa) 117   2104.3    159311     146958 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# There is significant variation

Estimate with random intercept model

library(lme4)
library(lmerTest)
## Warning: package 'lmerTest' was built under R version 4.0.5
## 
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
## 
##     lmer
## The following object is masked from 'package:stats':
## 
##     step
library(arm)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## arm (Version 1.11-2, built: 2020-7-27)
## Working directory is C:/Users/maman/OneDrive/DEM Spring 2021/DEM 7283 Stats 2
## 
## Attaching package: 'arm'
## The following object is masked from 'package:car':
## 
##     logit
fit<-glmer(badhealth~(1|mmsa),
           family=binomial, merged)
arm::display(fit,
             detail=T)
## glmer(formula = badhealth ~ (1 | mmsa), data = merged, family = binomial)
##             coef.est coef.se z value Pr(>|z|)
## (Intercept)  -1.50     0.03  -56.61    0.00  
## 
## Error terms:
##  Groups   Name        Std.Dev.
##  mmsa     (Intercept) 0.28    
##  Residual             1.00    
## ---
## number of obs: 159429, groups: mmsa, 118
## AIC = 147379, DIC = 146562.8
## deviance = 146968.7
# The ICC is 0.072, calculated by the equation: (.28^2)/(1+.28^2)
# The variance between cities is .28 

2. Fit the multilevel model for your outcome and include both the higher level term and appropriate individual - level predictors

fit2<-glmer(badhealth~male+agec+educ+(1|mmsa),
           family=binomial,merged,
           na.action=na.omit)
arm::display(fit2, detail=T)
## glmer(formula = badhealth ~ male + agec + educ + (1 | mmsa), 
##     data = merged, family = binomial, na.action = na.omit)
##              coef.est coef.se z value Pr(>|z|)
## (Intercept)   -2.10     0.04  -49.00    0.00  
## male          -0.07     0.01   -4.77    0.00  
## agec(24,39]    0.47     0.04   11.36    0.00  
## agec(39,59]    1.05     0.04   27.39    0.00  
## agec(59,79]    1.21     0.04   31.83    0.00  
## agec(79,99]    1.31     0.04   30.93    0.00  
## educ0Prim      0.92     0.04   22.28    0.00  
## educ1somehs    0.77     0.03   26.94    0.00  
## educ3somecol  -0.33     0.02  -19.09    0.00  
## educ4colgrad  -1.12     0.02  -61.84    0.00  
## 
## Error terms:
##  Groups   Name        Std.Dev.
##  mmsa     (Intercept) 0.21    
##  Residual             1.00    
## ---
## number of obs: 159429, groups: mmsa, 118
## AIC = 137845, DIC = 137152.1
## deviance = 137487.4
# The between cities variance decreased to 0.21 after including individual level variables. The ICC is 0.042. According to the model, those with some college or college graduates, have lower odds of reporting poor self-rated health compared to high school graduates. Those with less than high school education are more likely to report poor health.

3. Compare the models from 1 and 2 using a likelihood ratio test

anova(fit, fit2, test="Chisq")
## Data: merged
## Models:
## fit: badhealth ~ (1 | mmsa)
## fit2: badhealth ~ male + agec + educ + (1 | mmsa)
##      npar    AIC    BIC logLik deviance  Chisq Df Pr(>Chisq)    
## fit     2 147379 147399 -73687   147375                         
## fit2   11 137845 137954 -68911   137823 9551.9  9  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# The inclusion of individual level variables improves the overall model fit. AIC decreased from the base model to the final one, this difference is significant.