#load brfss
library(car)
library(stargazer)
library(survey)
library(sjPlot)
library(ggplot2)
library(pander)
library(knitr)
library(dplyr)
library(tidycensus)
library(lme4)
library(lmerTest)
library(arm)
load(url("https://github.com/coreysparks/data/blob/master/brfss_2017.Rdata?raw=true"))
set.seed(12345)
#samps<-sample(1:nrow(brfss_17), size = 40000, replace=F)
#brfss_17<-brfss_17[samps,]
#The names in the data are very ugly, so I make them less ugly
nams<-names(brfss_17)
#we see some names are lower case, some are upper and some have a little _ in the first position. This is a nightmare.
newnames<-gsub(pattern = "_",
replacement = "",
x = nams)
names(brfss_17)<-tolower(newnames)
#brfss_17$badhealth<-ifelse(brfss_17$genhlth %in% c(4,5),1,0)
brfss_17$badhealth<-Recode(brfss_17$genhlth, recodes="4:5=1; 1:3=0; else=NA")
#race/ethnicity
brfss_17$black<-Recode(brfss_17$racegr3, recodes="2=1; 9=NA; else=0")
brfss_17$white<-Recode(brfss_17$racegr3, recodes="1=1; 9=NA; else=0")
brfss_17$other<-Recode(brfss_17$racegr3, recodes="3:4=1; 9=NA; else=0")
brfss_17$hispanic<-Recode(brfss_17$racegr3, recodes="5=1; 9=NA; else=0")
brfss_17$race_eth<-Recode(brfss_17$racegr3,
recodes="1='nhwhite'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA",
as.factor = T)
brfss_17$race_eth<-relevel(brfss_17$race_eth, ref = "nhwhite")
#BMI
brfss_17$bmi<-ifelse(is.na(brfss_17$bmi5)==T, NA, brfss_17$bmi5/100)
#insurance
brfss_17$ins<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0")
#income grouping
brfss_17$inc<-ifelse(brfss_17$incomg==9, NA, brfss_17$incomg)
#education level
brfss_17$educ<-Recode(brfss_17$educa,
recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA",
as.factor=T)
brfss_17$educ<-relevel(brfss_17$educ, ref='2hsgrad')
#employment
brfss_17$employ<-Recode(brfss_17$employ1,
recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA",
as.factor=T)
brfss_17$employ<-relevel(brfss_17$employ, ref='Employed')
#marital status
brfss_17$marst<-Recode(brfss_17$marital,
recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA",
as.factor=T)
brfss_17$marst<-relevel(brfss_17$marst, ref='married')
#Age cut into intervals
brfss_17$agec<-cut(brfss_17$age80, breaks=c(0,24,39,59,79,99))
#BMI, in the brfss_17a the bmi variable has 2 implied decimal places,
#so we must divide by 100 to get real bmi's
brfss_17$bmi<-brfss_17$bmi5/100
#smoking currently
brfss_17$smoke<-Recode(brfss_17$smoker3,
recodes="1:2='Current'; 3='Former';4='NeverSmoked'; else=NA",
as.factor=T)
brfss_17$smoke<-relevel(brfss_17$smoke, ref = "NeverSmoked")
##Exercise
brfss_17$exercise<-Recode(brfss_17$exerany2, recodes ="7:9=NA; 1=1;2=0")
#rEcigeret
brfss_17$ecigaret<-Recode(brfss_17$ecigaret, recodes ="7:9=NA; 1=1;2=0")
##obese
brfss_17$obese<-ifelse(is.na(brfss_17$bmi)==T, NA,
ifelse(brfss_17$bmi>30,1,0))
merged<-brfss_17%>%
dplyr::select(ecigaret, mmsa, agec, educ,obese, race_eth,smoke, badhealth,bmi,exercise, mmsawt, mmsaname)%>%
filter(complete.cases(.))
Conduct a multi-level analysis where you: 1.Estimate the random intercept model with just the higher level units as your predictor (NULL model) a. report the variance components and ICC
fit.an<-lm(bmi~as.factor(mmsa),
brfss_17)
anova(fit.an)
## Analysis of Variance Table
##
## Response: bmi
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(mmsa) 135 71046 526.27 13.819 < 2.2e-16 ***
## Residuals 211169 8041703 38.08
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##variance
##Since the F value is greater than 1 and the p value is small. Shows significant variation in BMI due to the large f value(13.81) and the small pr value. This means that there is variance among higher level units
##ICC
fit<-lmer(bmi~(1|mmsa),
data=merged)
arm::display(fit,
detail=T)
## lmer(formula = bmi ~ (1 | mmsa), data = merged)
## coef.est coef.se t value
## (Intercept) 28.14 0.05 528.30
##
## Error terms:
## Groups Name Std.Dev.
## mmsa (Intercept) 0.59
## Residual 6.18
## ---
## number of obs: 193926, groups: mmsa, 136
## AIC = 1.25694e+06, DIC = 1256922
## deviance = 1256925.8
##ICC (.58^2)/( 6.17 ^2+.58^2) = 0.008759208
##The variation in the ICC means that 2 randomly selected people will be correlated at 0.008, which is really small.
##This shows that the likelihood for 2 people to be identical is statistically small between 2 cities
2.Now fit the multilevel model for your outcome and include both the higher level term and appropriate individual - level predictors a.Report the variance components of the model and describe the results of the model
fit2<-lmer(bmi~race_eth+agec+educ+exercise+(1|mmsa),
data=merged,
na.action=na.omit)
arm::display(fit2, detail=T)
## lmer(formula = bmi ~ race_eth + agec + educ + exercise + (1 |
## mmsa), data = merged, na.action = na.omit)
## coef.est coef.se t value
## (Intercept) 26.91 0.08 344.70
## race_ethhispanic 0.70 0.06 12.36
## race_ethnh black 1.75 0.05 35.94
## race_ethnh multirace 0.89 0.10 8.69
## race_ethnh other -1.05 0.07 -14.63
## agec(24,39] 2.61 0.06 40.51
## agec(39,59] 3.51 0.06 57.94
## agec(59,79] 3.01 0.06 50.01
## agec(79,99] 0.47 0.07 6.25
## educ0Prim 0.11 0.11 1.00
## educ1somehs -0.02 0.07 -0.26
## educ3somecol 0.06 0.04 1.46
## educ4colgrad -0.94 0.04 -26.17
## exercise -1.78 0.03 -55.19
##
## Error terms:
## Groups Name Std.Dev.
## mmsa (Intercept) 0.49
## Residual 5.98
## ---
## number of obs: 193926, groups: mmsa, 136
## AIC = 1.24462e+06, DIC = 1244473
## deviance = 1244530.9
##The individual level characteristics show that
####The ICC shows that the likelihood of 2 people being identical in the model is statistically small
##ICC is lower in this model compared to fit 1
library(sjstats); library(sjPlot)
##
## Attaching package: 'sjstats'
## The following object is masked from 'package:survey':
##
## cv
icc(fit2)
## Warning: 'icc' is deprecated.
## Use 'performance::icc()' instead.
## See help("Deprecated")
## # Intraclass Correlation Coefficient
##
## Adjusted ICC: 0.007
## Conditional ICC: 0.006
plot_model(fit,
type = "re",
sort.est="sort.all",
grid=F)
## Warning: `select_vars()` is deprecated as of dplyr 0.8.4.
## Please use `tidyselect::vars_select()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
##The ICC shows that the likelihood of 2 people being identical in the model is statistically small
##ICC is lower in this model compared to fit 1
##Looking at the error terms, we see that the variance in cities went down therefore reducing the differences in BMI among cities in level 1 and level 2 variance
b.Report the results for the fixed effects
##report the variables in the model
##BMI reduces when you exercise and when you have higher education. Blacks have the highest BMI. Age group (39,59) have the highest BMI
3.Compare the models from 1 and 2 using a likelihood ratio test (anova(fit1, fit2, test=”Chisq”)) .
anova(fit, fit2, data=merged)
## refitting model(s) with ML (instead of REML)
## Data: merged
## Models:
## fit: bmi ~ (1 | mmsa)
## fit2: bmi ~ race_eth + agec + educ + exercise + (1 | mmsa)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## fit 3 1256932 1256962 -628463 1256926
## fit2 16 1244563 1244726 -622265 1244531 12395 13 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
a.Does the inclusion of the individual level variables increase the overall model fit?
##Fit 2 is definitely the better model. The AIC in fit2 is lower. The Chi square is lower making it statistically significant
##This means that it fits the data a lot better than fit1 which was the null model