For this homework, I am using the BRFSS 2016 data. The continuous outcome variable is the age at which a respondent was told they had diabetes. The individual-level predictors used here are education, obesity, smoking status, age, and sex. The metropolitan statistical areas (MSAs) are included as the higher-level grouping variable.

Loading Libraries

library(car)
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.1. https://CRAN.R-project.org/package=stargazer
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(ggplot2)
library(pander)
library(knitr)

Loading BRFSS Data

load("C:/Users/munta/Google Drive/BRFSS16/brfss16_mmsa.Rdata")
set.seed(12345)
#Cleaning up the names
nams<-names(brfss16m)
#Fixing the capital/lower case
newnames<-gsub(pattern = "_",replacement =  "",x =  nams)
names(brfss16m)<-tolower(newnames)
nams<-names(brfss16m)

Recoding Variables

#sex
brfss16m$male<-ifelse(brfss16m$sex==1, 1, 0)

#BMI
brfss16m$bmi<-ifelse(is.na(brfss16m$bmi5)==T, NA, brfss16m$bmi5/100)

# #Healthy days
# brfss16m$healthdays<-recode(brfss16m$physhlth, recodes = "88=0; 77=NA; 99=NA")

# #Healthy mental health days
# brfss16m$healthmdays<-recode(brfss16m$menthlth, recodes = "88=0; 77=NA; 99=NA")

# brfss16m$badhealth<-recode(brfss16m$genhlth, recodes="4:5=1; 1:3=0; else=NA")

#education level
brfss16m$educ<-recode(brfss16m$educa,
recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA",
as.factor.result=T)
brfss16m$educ<-relevel(brfss16m$educ, ref='2hsgrad')

# #race/ethnicity
# brfss16m$black<-recode(brfss16m$racegr3, recodes="2=1; 9=NA; else=0")
# brfss16m$white<-recode(brfss16m$racegr3, recodes="1=1; 9=NA; else=0")
# brfss16m$other<-recode(brfss16m$racegr3, recodes="3:4=1; 9=NA; else=0")
# brfss16m$hispanic<-recode(brfss16m$racegr3, recodes="5=1; 9=NA; else=0")

# brfss16m$race_eth<-recode(brfss16m$racegr3, 
# recodes="1='nhwhite'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA",
# as.factor.result = T)
# brfss16m$race_eth<-relevel(brfss16m$race_eth, ref = "nhwhite")
# 
# #insurance
# brfss16m$ins<-recode(brfss16m$hlthpln1, recodes ="7:9=NA; 1=1;2=0")
# 
# #income grouping
# brfss16m$inc<-recode(brfss16m$incomg, recodes = "9= NA;1='1_lt15k'; 2='2_15-25k';3='3_25-35k';4='4_35-50k';5='5_50kplus'", as.factor.result = T)
# brfss16m$inc<-as.ordered(brfss16m$inc)
# 
# #employloyment
# brfss16m$employ<-recode(brfss16m$employ1,
# recodes="1:2='employloyed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA",
# as.factor.result=T)
# brfss16m$employ<-relevel(brfss16m$employ, ref='employloyed')
# 
# #marital status
# brfss16m$marst<-recode(brfss16m$marital,
# recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA",
# as.factor.result=T)
# brfss16m$marst<-relevel(brfss16m$marst, ref='married')

#Age cut into intervals
brfss16m$agec<-cut(brfss16m$age80, breaks=c(0,24,39,59,79,99))

#BMI, in the brfss16ma the bmi variable has 2 implied decimal places,
#so we must divide by 100 to get real bmi's

brfss16m$bmi<-brfss16m$bmi5/100

#smoking currently
brfss16m$smoke<-recode(brfss16m$smoker3, recodes="1:2=1; 3:4=0; else=NA")
#brfss16m$smoke<-relevel(brfss16m$smoke, ref = "NeverSmoked")

brfss16m$obese<-ifelse(is.na(brfss16m$bmi)==T, NA, 
                       ifelse(brfss16m$bmi>30,1,0))

#Outcome Variable: How old were you when you were told you have diabetes?
brfss16m$agediab<-recode(brfss16m$diabage2, recodes = "98=NA; 99=NA")
summary(brfss16m$agediab)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    1.00   42.00   52.00   51.22   60.00   93.00  218888
hist(brfss16m$agediab)

The histogram shows that the age at diabetes diagnosis is approximately normally distributed.
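
Since ggplot2 is already loaded, an alternative view of the same distribution (a minimal sketch, not part of the original output) is:

#Histogram of age at diabetes diagnosis with ggplot2 (same data as hist() above)
ggplot(subset(brfss16m, !is.na(agediab)), aes(x = agediab))+
  geom_histogram(binwidth = 5, colour = "white")+
  labs(x = "Age told had diabetes", y = "Count")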

Fitting the multilevel regression model with the MSAs

#people within each msa
head(data.frame(name=table(brfss16m$mmsaname),id=unique(brfss16m$mmsa)))
##                                                          name.Var1
## 1       Albany-Schenectady-Troy, NY, Metropolitan Statistical Area
## 2                   Albuquerque, NM, Metropolitan Statistical Area
## 3 Allentown-Bethlehem-Easton, PA-NJ, Metropolitan Statistical Area
## 4                     Anchorage, AK, Metropolitan Statistical Area
## 5 Atlanta-Sandy Springs-Roswell, GA, Metropolitan Statistical Area
## 6    Augusta-Richmond County, GA-SC, Metropolitan Statistical Area
##   name.Freq    id
## 1      2901 10580
## 2      1300 10740
## 3       675 10900
## 4      1028 11260
## 5      2444 12060
## 6       849 12260
#MSAs
length(table(brfss16m$mmsa))
## [1] 143

Getting complete cases

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
mydata<-brfss16m%>%
  select(agediab, mmsa, educ, agec, obese, smoke, bmi, male, mmsawt, mmsaname )%>%
  filter(complete.cases(.))

head(mydata[, c("agediab", "male", "agec", "educ", "obese", "smoke", "mmsa")])
## # A tibble: 6 x 7
##   agediab  male agec    educ     obese smoke   mmsa
##     <dbl> <dbl> <fct>   <fct>    <dbl> <dbl>  <dbl>
## 1     45.    1. (59,79] 4colgrad    1.    0. 10580.
## 2     42.    1. (59,79] 2hsgrad     0.    0. 10580.
## 3     35.    0. (59,79] 2hsgrad     1.    0. 10580.
## 4     32.    1. (24,39] 3somecol    1.    0. 10580.
## 5     48.    0. (59,79] 2hsgrad     1.    0. 10580.
## 6      7.    0. (39,59] 4colgrad    0.    0. 10580.
#Storing Information
meanagediab<-mean(mydata$agediab, na.rm=T)
sdagediab<-sd(mydata$agediab, na.rm=T)
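
As a quick sanity check (not shown in the original output), we can count how many rows the complete-case filter drops:

#Rows lost to the complete-case filter, and rows retained for analysis
nrow(brfss16m) - nrow(mydata)
nrow(mydata)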

Fixed-effects ANOVA test for variation across MSAs

fit.an<-lm(agediab~as.factor(mmsa), mydata)
anova(fit.an)
## Analysis of Variance Table
## 
## Response: agediab
##                    Df  Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(mmsa)   142   59156  416.59  1.9557 9.372e-11 ***
## Residuals       27094 5771362  213.01                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

So we see statistically significant variation in the age at diabetes diagnosis across the MSAs.
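
To get a sense of the magnitude, the share of the total sum of squares attributable to MSAs (a rough eta-squared computed from the ANOVA table above) is small:

#Between-MSA share of the total sum of squares, from the table above
59156/(59156 + 5771362)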

Random Intercept Model

Now we fit the hierarchical model

library(lme4)
library(lmerTest)
## 
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
## 
##     lmer
## The following object is masked from 'package:stats':
## 
##     step
library(arm)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## arm (Version 1.9-3, built: 2016-11-21)
## Working directory is C:/Users/munta/Dropbox/###Dem 7283 Stat II/HW/HW7
## 
## Attaching package: 'arm'
## The following object is masked from 'package:car':
## 
##     logit

Basic hierarchical model for means of each MSA:

mydata$agediabz<-as.numeric(scale(mydata$agediab, center=T, scale=T))

fit<-lmer(agediabz~(1|mmsa), data=mydata)
arm::display(fit, detail=T)
## lme4::lmer(formula = agediabz ~ (1 | mmsa), data = mydata)
##             coef.est coef.se t value
## (Intercept) 0.00     0.01    0.02   
## 
## Error terms:
##  Groups   Name        Std.Dev.
##  mmsa     (Intercept) 0.07    
##  Residual             1.00    
## ---
## number of obs: 27237, groups: mmsa, 143
## AIC = 77264.3, DIC = 77243.1
## deviance = 77250.7

The random intercept model estimates an MSA-level standard deviation of 0.07 (the square root of the between-MSA variance component), compared with a residual standard deviation of 1.00 on the standardized scale.
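
These estimates imply a small intraclass correlation; a minimal sketch of that calculation, using the fit object above:

#ICC: between-MSA variance as a share of the total variance
vc<-as.data.frame(VarCorr(fit))
vc$vcov[vc$grp=="mmsa"]/sum(vc$vcov)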

Plotting the relationship

mydata$preagediab <- sdagediab*(fitted(fit))+meanagediab

citymeans<-aggregate(cbind(agediab, preagediab)~mmsaname,mydata, mean)
head(citymeans, n=10)
##                                                            mmsaname
## 1        Albany-Schenectady-Troy, NY, Metropolitan Statistical Area
## 2                    Albuquerque, NM, Metropolitan Statistical Area
## 3  Allentown-Bethlehem-Easton, PA-NJ, Metropolitan Statistical Area
## 4                      Anchorage, AK, Metropolitan Statistical Area
## 5  Atlanta-Sandy Springs-Roswell, GA, Metropolitan Statistical Area
## 6     Augusta-Richmond County, GA-SC, Metropolitan Statistical Area
## 7              Austin-Round Rock, TX, Metropolitan Statistical Area
## 8      Baltimore-Columbia-Towson, MD, Metropolitan Statistical Area
## 9                    Baton Rouge, LA, Metropolitan Statistical Area
## 10                       Beckley, WV, Metropolitan Statistical Area
##     agediab preagediab
## 1  50.71181   50.98640
## 2  51.31579   51.34325
## 3  47.52239   50.43676
## 4  49.88991   50.86019
## 5  51.43816   51.40497
## 6  49.80412   50.87096
## 7  51.82659   51.57030
## 8  51.39922   51.39088
## 9  52.42683   51.65857
## 10 48.70213   50.54222
plot(preagediab~agediab, citymeans)

The plot shows a positive relationship between the observed and model-predicted MSA means, with the predicted values shrunk toward the overall mean and centered around ages 50-52.
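
Adding a 45-degree reference line makes the shrinkage of the predicted means toward the overall mean easier to see (a small optional addition, not in the original output):

#Observed vs. predicted MSA means with a 45-degree reference line
plot(preagediab~agediab, citymeans,
     xlab="Observed MSA mean age at diagnosis",
     ylab="Predicted MSA mean age at diagnosis")
abline(0, 1, lty=2)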

Model with individual-level predictors:

Likelihood ratio test to see whether our individual-level variables explain anything about our outcome:

fit2<-lmer(agediabz~male+agec+educ+obese+smoke+(1|mmsa), data=mydata,  na.action=na.omit)
anova(fit, fit2)
## refitting model(s) with ML (instead of REML)
## Data: mydata
## Models:
## object: agediabz ~ (1 | mmsa)
## ..1: agediabz ~ male + agec + educ + obese + smoke + (1 | mmsa)
##        Df   AIC   BIC logLik deviance Chisq Chi Df Pr(>Chisq)    
## object  3 77257 77281 -38625    77251                            
## ..1    14 64646 64761 -32309    64618 12632     11  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

We can see that the individual-level predictors (fit2) significantly improve the model fit.

Test for whether the MSA-level variance is zero

rand(fit2)
## Analysis of Random effects Table:
##      Chi.sq Chi.DF p.value  
## mmsa   2.81      1    0.09 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

After adjusting for the individual-level predictors, only marginally significant variation (p = 0.09) remains in the average age at diabetes diagnosis across MSAs.
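
As a rough check of how much of the remaining variance sits between MSAs (a sketch, assuming the fit2 object from above):

#Adjusted ICC: between-MSA share of the variance after adding predictors
vc2<-as.data.frame(VarCorr(fit2))
vc2$vcov[vc2$grp=="mmsa"]/sum(vc2$vcov)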

Fitting the basic multi-level model

fit.mix<-lmer(agediabz~male+agec+educ+obese+smoke+(male|mmsa), data=mydata)
#test for the random effect
rand(fit.mix)
## Analysis of Random effects Table:
##           Chi.sq Chi.DF p.value
## male:mmsa 0.0303      2       1
display(fit.mix, detail=T)
## lme4::lmer(formula = agediabz ~ male + agec + educ + obese + 
##     smoke + (male | mmsa), data = mydata)
##              coef.est coef.se t value
## (Intercept)   -2.56     0.08  -33.56 
## male           0.03     0.01    3.50 
## agec(24,39]    0.78     0.08    9.82 
## agec(39,59]    1.93     0.08   25.27 
## agec(59,79]    2.78     0.08   36.69 
## agec(79,99]    3.52     0.08   45.67 
## educ0Prim     -0.07     0.03   -2.86 
## educ1somehs   -0.06     0.02   -2.93 
## educ3somecol  -0.03     0.01   -2.34 
## educ4colgrad  -0.02     0.01   -1.62 
## obese          0.02     0.01    1.58 
## smoke         -0.02     0.01   -1.26 
## 
## Error terms:
##  Groups   Name        Std.Dev. Corr  
##  mmsa     (Intercept) 0.03           
##           male        0.00     -1.00 
##  Residual             0.79           
## ---
## number of obs: 27237, groups: mmsa, 143
## AIC = 64728.5, DIC = 64540.1
## deviance = 64618.3

We can see that the standard deviation at the MSA level is 0.03 and the standard deviation at the individual level (the residual standard deviation) is 0.79; these are the square roots of the corresponding variance components. The standard deviation of the male slope is estimated at essentially 0.00 with a correlation of -1.00, which suggests there is little detectable variation in the sex effect across MSAs.
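
To see which MSAs sit above or below the overall mean, the estimated MSA-level deviations (BLUPs) can be inspected directly; a minimal sketch, assuming the fit.mix object from above:

#Estimated MSA-specific deviations for the intercept and the male slope
re<-ranef(fit.mix)$mmsa
head(re)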

Random Slope Model

#To do a random slope model, I do:
fit.mix3<-lmer(agediabz~male+agec+educ+obese+smoke+(male|mmsa), data=mydata, REML=F)

display(fit.mix3, detail=T)
## lme4::lmer(formula = agediabz ~ male + agec + educ + obese + 
##     smoke + (male | mmsa), data = mydata, REML = F)
##              coef.est coef.se t value
## (Intercept)   -2.56     0.08  -33.57 
## male           0.03     0.01    3.51 
## agec(24,39]    0.78     0.08    9.82 
## agec(39,59]    1.93     0.08   25.28 
## agec(59,79]    2.78     0.08   36.70 
## agec(79,99]    3.52     0.08   45.68 
## educ0Prim     -0.08     0.03   -2.86 
## educ1somehs   -0.06     0.02   -2.93 
## educ3somecol  -0.03     0.01   -2.34 
## educ4colgrad  -0.02     0.01   -1.62 
## obese          0.02     0.01    1.58 
## smoke         -0.02     0.01   -1.26 
## 
## Error terms:
##  Groups   Name        Std.Dev. Corr  
##  mmsa     (Intercept) 0.02           
##           male        0.00     -1.00 
##  Residual             0.79           
## ---
## number of obs: 27237, groups: mmsa, 143
## AIC = 64650.3, DIC = 64618.3
## deviance = 64618.3

Under ML estimation the MSA-level intercept standard deviation is 0.02 and the standard deviation of the male slope is essentially 0.00, so these estimates provide little evidence that the effect of sex varies across MSAs.

Comparing the models with an LRT

anova(fit.mix, fit.mix3)
## refitting model(s) with ML (instead of REML)
## Data: mydata
## Models:
## object: agediabz ~ male + agec + educ + obese + smoke + (male | mmsa)
## ..1: agediabz ~ male + agec + educ + obese + smoke + (male | mmsa)
##        Df   AIC   BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## object 16 64650 64782 -32309    64618                        
## ..1    16 64650 64782 -32309    64618     0      0          1

The likelihood ratio test shows no difference between the two fits (Chisq = 0, p = 1). This is expected: fit.mix and fit.mix3 specify the same fixed and random effects and differ only in REML versus ML estimation, and anova() refits both with ML. Together with the rand() test above (p = 1 for the male slope), there is no evidence that the random slope for sex improves the model.
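
A comparison that would actually test the random slope contrasts the random-intercept model with predictors (fit2) against the random-slope model (fit.mix3); a sketch, assuming both objects from above (anova() refits both with ML):

#LRT: random intercept + predictors vs. random slope + predictors
anova(fit2, fit.mix3)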