#load brfss
library(car)
library(stargazer)
library(survey)
library(sjPlot)
library(ggplot2)
library(pander)
library(knitr)
library(dplyr)
library(tidycensus)
library(lme4)
library(lmerTest)
library(arm)
load(url("https://github.com/coreysparks/data/blob/master/brfss_2017.Rdata?raw=true"))

set.seed(12345)
#samps<-sample(1:nrow(brfss_17), size = 40000, replace=F)
#brfss_17<-brfss_17[samps,]
#The names in the data are very ugly, so I make them less ugly
nams<-names(brfss_17)
#we see some names are lower case, some are upper and some have a little _ in the first position. This is a nightmare.

newnames<-gsub(pattern = "_",
               replacement =  "",
               x =  nams)

names(brfss_17)<-tolower(newnames)
#brfss_17$badhealth<-ifelse(brfss_17$genhlth %in% c(4,5),1,0)
brfss_17$badhealth<-Recode(brfss_17$genhlth, recodes="4:5=1; 1:3=0; else=NA")
#race/ethnicity
brfss_17$black<-Recode(brfss_17$racegr3, recodes="2=1; 9=NA; else=0")
brfss_17$white<-Recode(brfss_17$racegr3, recodes="1=1; 9=NA; else=0")
brfss_17$other<-Recode(brfss_17$racegr3, recodes="3:4=1; 9=NA; else=0")
brfss_17$hispanic<-Recode(brfss_17$racegr3, recodes="5=1; 9=NA; else=0")

brfss_17$race_eth<-Recode(brfss_17$racegr3, 
recodes="1='nhwhite'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA",
as.factor = T)
brfss_17$race_eth<-relevel(brfss_17$race_eth, ref = "nhwhite")

#BMI
brfss_17$bmi<-ifelse(is.na(brfss_17$bmi5)==T, NA, brfss_17$bmi5/100)

#insurance
brfss_17$ins<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0")

#income grouping
brfss_17$inc<-ifelse(brfss_17$incomg==9, NA, brfss_17$incomg)

#education level
brfss_17$educ<-Recode(brfss_17$educa,
recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA",
as.factor=T)
brfss_17$educ<-relevel(brfss_17$educ, ref='2hsgrad')

#employment
brfss_17$employ<-Recode(brfss_17$employ1,
recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA",
as.factor=T)
brfss_17$employ<-relevel(brfss_17$employ, ref='Employed')

#marital status
brfss_17$marst<-Recode(brfss_17$marital,
recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA",
as.factor=T)
brfss_17$marst<-relevel(brfss_17$marst, ref='married')

#Age cut into intervals
brfss_17$agec<-cut(brfss_17$age80, breaks=c(0,24,39,59,79,99))

#BMI, in the brfss_17a the bmi variable has 2 implied decimal places,
#so we must divide by 100 to get real bmi's

brfss_17$bmi<-brfss_17$bmi5/100

#smoking currently
brfss_17$smoke<-Recode(brfss_17$smoker3, 
recodes="1:2='Current'; 3='Former';4='NeverSmoked'; else=NA", 
as.factor=T)
brfss_17$smoke<-relevel(brfss_17$smoke, ref = "NeverSmoked")

##Exercise
brfss_17$exercise<-Recode(brfss_17$exerany2, recodes ="7:9=NA; 1=1;2=0")

#rEcigeret
brfss_17$ecigaret<-Recode(brfss_17$ecigaret, recodes ="7:9=NA; 1=1;2=0")

##obese
brfss_17$obese<-ifelse(is.na(brfss_17$bmi)==T, NA, 
                       ifelse(brfss_17$bmi>30,1,0))
merged<-brfss_17%>%
  dplyr::select(ecigaret, mmsa, agec, educ,obese, race_eth,smoke, badhealth,bmi,exercise, mmsawt, mmsaname)%>%
  filter(complete.cases(.))

Conduct a multi-level analysis where you: 1.Estimate the random intercept model with just the higher level units as your predictor (NULL model) a. report the variance components and ICC

fit.an<-lm(bmi~as.factor(mmsa),
           brfss_17)
anova(fit.an)
## Analysis of Variance Table
## 
## Response: bmi
##                     Df  Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(mmsa)    135   71046  526.27  13.819 < 2.2e-16 ***
## Residuals       211169 8041703   38.08                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##variance
##Since the F value is greater than 1 and the p value is small. Shows significant variation in BMI  due to the large  f value(13.81) and the small pr value. This means that there is variance among higher level units 
##ICC
fit<-lmer(bmi~(1|mmsa),
           data=merged)
arm::display(fit,
             detail=T)
## lmer(formula = bmi ~ (1 | mmsa), data = merged)
##             coef.est coef.se t value
## (Intercept)  28.14     0.05  528.30 
## 
## Error terms:
##  Groups   Name        Std.Dev.
##  mmsa     (Intercept) 0.59    
##  Residual             6.18    
## ---
## number of obs: 193926, groups: mmsa, 136
## AIC = 1.25694e+06, DIC = 1256922
## deviance = 1256925.8
##ICC  (.58^2)/( 6.17 ^2+.58^2) = 0.008759208
##The variation in the ICC means that 2 randomly selected people will be correlated at 0.008, which is really small.
##This shows that the likelihood for 2 people to be identical is statistically small between 2 cities

2.Now fit the multilevel model for your outcome and include both the higher level term and appropriate individual - level predictors a.Report the variance components of the model and describe the results of the model

fit2<-lmer(bmi~race_eth+agec+educ+exercise+(1|mmsa),
           data=merged,
           na.action=na.omit)
arm::display(fit2, detail=T)
## lmer(formula = bmi ~ race_eth + agec + educ + exercise + (1 | 
##     mmsa), data = merged, na.action = na.omit)
##                      coef.est coef.se t value
## (Intercept)           26.91     0.08  344.70 
## race_ethhispanic       0.70     0.06   12.36 
## race_ethnh black       1.75     0.05   35.94 
## race_ethnh multirace   0.89     0.10    8.69 
## race_ethnh other      -1.05     0.07  -14.63 
## agec(24,39]            2.61     0.06   40.51 
## agec(39,59]            3.51     0.06   57.94 
## agec(59,79]            3.01     0.06   50.01 
## agec(79,99]            0.47     0.07    6.25 
## educ0Prim              0.11     0.11    1.00 
## educ1somehs           -0.02     0.07   -0.26 
## educ3somecol           0.06     0.04    1.46 
## educ4colgrad          -0.94     0.04  -26.17 
## exercise              -1.78     0.03  -55.19 
## 
## Error terms:
##  Groups   Name        Std.Dev.
##  mmsa     (Intercept) 0.49    
##  Residual             5.98    
## ---
## number of obs: 193926, groups: mmsa, 136
## AIC = 1.24462e+06, DIC = 1244473
## deviance = 1244530.9
##The individual level characteristics show that 
####The ICC shows that the likelihood of 2 people being identical in the model is statistically small
##ICC is lower in this model compared to fit 1
library(sjstats); library(sjPlot)
## 
## Attaching package: 'sjstats'
## The following object is masked from 'package:survey':
## 
##     cv
icc(fit2)
## Warning: 'icc' is deprecated.
## Use 'performance::icc()' instead.
## See help("Deprecated")
## # Intraclass Correlation Coefficient
## 
##      Adjusted ICC: 0.007
##   Conditional ICC: 0.006
plot_model(fit,
           type = "re",
           sort.est="sort.all",
           grid=F)
## Warning: `select_vars()` is deprecated as of dplyr 0.8.4.
## Please use `tidyselect::vars_select()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

##The ICC shows that the likelihood of 2 people being identical in the model is statistically small
##ICC is lower in this model compared to fit 1
##Looking at the error terms, we see that the variance in cities went down therefore reducing the differences in BMI among cities in level 1 and level 2 variance

b.Report the results for the fixed effects

##report the variables in the model
##BMI reduces when you exercise and when you have higher education. Blacks have the highest BMI. Age group (39,59) have the highest BMI

3.Compare the models from 1 and 2 using a likelihood ratio test (anova(fit1, fit2, test=”Chisq”)) .

 anova(fit, fit2, data=merged)
## refitting model(s) with ML (instead of REML)
## Data: merged
## Models:
## fit: bmi ~ (1 | mmsa)
## fit2: bmi ~ race_eth + agec + educ + exercise + (1 | mmsa)
##      npar     AIC     BIC  logLik deviance Chisq Df Pr(>Chisq)    
## fit     3 1256932 1256962 -628463  1256926                        
## fit2   16 1244563 1244726 -622265  1244531 12395 13  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

a.Does the inclusion of the individual level variables increase the overall model fit?

##Fit 2 is definitely the better model. The AIC in fit2 is lower. The Chi square is lower making it statistically significant
##This means that it fits the data a lot better than fit1 which was the null model