For this homework, I am using BRFSS 2016 data. The continuous outcome variable is the age at which a respondent was told they had diabetes. The individual-level predictors are education, obesity, smoking status, age, and sex, and the metropolitan statistical area (MSA) is the higher-level grouping variable.
library(car)
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.1. https://CRAN.R-project.org/package=stargazer
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
library(ggplot2)
library(pander)
library(knitr)
load("C:/Users/munta/Google Drive/BRFSS16/brfss16_mmsa.Rdata")
set.seed(12345)
#Cleaning up the names
nams<-names(brfss16m)
#Fixing the capital/lower case
newnames<-gsub(pattern = "_",replacement = "",x = nams)
names(brfss16m)<-tolower(newnames)
nams<-names(brfss16m)
#sex
brfss16m$male<-ifelse(brfss16m$sex==1, 1, 0)
#BMI
brfss16m$bmi<-ifelse(is.na(brfss16m$bmi5)==T, NA, brfss16m$bmi5/100)
# #Healthy days
# brfss16m$healthdays<-recode(brfss16m$physhlth, recodes = "88=0; 77=NA; 99=NA")
# #Healthy mental health days
# brfss16m$healthmdays<-recode(brfss16m$menthlth, recodes = "88=0; 77=NA; 99=NA")
# brfss16m$badhealth<-recode(brfss16m$genhlth, recodes="4:5=1; 1:3=0; else=NA")
#education level
brfss16m$educ<-recode(brfss16m$educa,
recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA",
as.factor.result=T)
brfss16m$educ<-relevel(brfss16m$educ, ref='2hsgrad')
# #race/ethnicity
# brfss16m$black<-recode(brfss16m$racegr3, recodes="2=1; 9=NA; else=0")
# brfss16m$white<-recode(brfss16m$racegr3, recodes="1=1; 9=NA; else=0")
# brfss16m$other<-recode(brfss16m$racegr3, recodes="3:4=1; 9=NA; else=0")
# brfss16m$hispanic<-recode(brfss16m$racegr3, recodes="5=1; 9=NA; else=0")
# brfss16m$race_eth<-recode(brfss16m$racegr3,
# recodes="1='nhwhite'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA",
# as.factor.result = T)
# brfss16m$race_eth<-relevel(brfss16m$race_eth, ref = "nhwhite")
#
# #insurance
# brfss16m$ins<-recode(brfss16m$hlthpln1, recodes ="7:9=NA; 1=1;2=0")
#
# #income grouping
# brfss16m$inc<-recode(brfss16m$incomg, recodes = "9= NA;1='1_lt15k'; 2='2_15-25k';3='3_25-35k';4='4_35-50k';5='5_50kplus'", as.factor.result = T)
# brfss16m$inc<-as.ordered(brfss16m$inc)
#
# #employment
# brfss16m$employ<-recode(brfss16m$employ1,
# recodes="1:2='employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA",
# as.factor.result=T)
# brfss16m$employ<-relevel(brfss16m$employ, ref='employed')
#
# #marital status
# brfss16m$marst<-recode(brfss16m$marital,
# recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA",
# as.factor.result=T)
# brfss16m$marst<-relevel(brfss16m$marst, ref='married')
#Age cut into intervals
brfss16m$agec<-cut(brfss16m$age80, breaks=c(0,24,39,59,79,99))
#BMI, in the brfss16ma the bmi variable has 2 implied decimal places,
#so we must divide by 100 to get real bmi's
brfss16m$bmi<-brfss16m$bmi5/100
#smoking currently
brfss16m$smoke<-recode(brfss16m$smoker3, recodes="1:2=1; 3:4=0; else=NA")
#brfss16m$smoke<-relevel(brfss16m$smoke, ref = "NeverSmoked")
brfss16m$obese<-ifelse(is.na(brfss16m$bmi)==T, NA,
ifelse(brfss16m$bmi>30,1,0))
#Outcome Variable: How old were you when you were told you have diabetes?
brfss16m$agediab<-recode(brfss16m$diabage2, recodes = "98=NA; 99=NA")
summary(brfss16m$agediab)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.00 42.00 52.00 51.22 60.00 93.00 218888
hist(brfss16m$agediab)
The histogram shows that the age at diabetes diagnosis is approximately normally distributed.
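As a quick visual check of that claim, a normal Q-Q plot can be drawn alongside the histogram (a minimal sketch using base R):
#Q-Q plot of the outcome: points near the reference line indicate
#approximate normality
qqnorm(brfss16m$agediab, main="Age at diabetes diagnosis")
qqline(brfss16m$agediab)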
#people within each msa
head(data.frame(name=table(brfss16m$mmsaname),id=unique(brfss16m$mmsa)))
## name.Var1
## 1 Albany-Schenectady-Troy, NY, Metropolitan Statistical Area
## 2 Albuquerque, NM, Metropolitan Statistical Area
## 3 Allentown-Bethlehem-Easton, PA-NJ, Metropolitan Statistical Area
## 4 Anchorage, AK, Metropolitan Statistical Area
## 5 Atlanta-Sandy Springs-Roswell, GA, Metropolitan Statistical Area
## 6 Augusta-Richmond County, GA-SC, Metropolitan Statistical Area
## name.Freq id
## 1 2901 10580
## 2 1300 10740
## 3 675 10900
## 4 1028 11260
## 5 2444 12060
## 6 849 12260
#MSAs
length(table(brfss16m$mmsa))
## [1] 143
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
mydata<-brfss16m%>%
select(agediab, mmsa, educ, agec, obese, smoke, bmi, male, mmsawt, mmsaname )%>%
filter(complete.cases(.))
head(mydata[, c("agediab", "male", "agec", "educ", "obese", "smoke", "mmsa")])
## # A tibble: 6 x 7
## agediab male agec educ obese smoke mmsa
## <dbl> <dbl> <fct> <fct> <dbl> <dbl> <dbl>
## 1 45. 1. (59,79] 4colgrad 1. 0. 10580.
## 2 42. 1. (59,79] 2hsgrad 0. 0. 10580.
## 3 35. 0. (59,79] 2hsgrad 1. 0. 10580.
## 4 32. 1. (24,39] 3somecol 1. 0. 10580.
## 5 48. 0. (59,79] 2hsgrad 1. 0. 10580.
## 6 7. 0. (39,59] 4colgrad 0. 0. 10580.
#Storing Information
meanagediab<-mean(mydata$agediab, na.rm=T)
sdagediab<-sd(mydata$agediab, na.rm=T)
fit.an<-lm(agediab~as.factor(mmsa), mydata)
anova(fit.an)
## Analysis of Variance Table
##
## Response: agediab
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(mmsa) 142 59156 416.59 1.9557 9.372e-11 ***
## Residuals 27094 5771362 213.01
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
So we see statistically significant variation in the age at diabetes diagnosis across the MSAs.
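To put that variation in context, the share of the total sum of squares attributable to MSAs can be computed from the ANOVA table above (a small sketch; the value works out to roughly 1%):
#Proportion of total variation that lies between MSAs in the one-way ANOVA
ss<-anova(fit.an)[["Sum Sq"]]
ss[1]/sum(ss) #roughly 59156/(59156+5771362), about 0.01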
Now we fit the hierarchical model, starting with a random intercept only.
library(lme4)
library(lmerTest)
##
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
##
## lmer
## The following object is masked from 'package:stats':
##
## step
library(arm)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
##
## arm (Version 1.9-3, built: 2016-11-21)
## Working directory is C:/Users/munta/Dropbox/###Dem 7283 Stat II/HW/HW7
##
## Attaching package: 'arm'
## The following object is masked from 'package:car':
##
## logit
mydata$agediabz<-as.numeric(scale(mydata$agediab, center=T, scale=T))
fit<-lmer(agediabz~(1|mmsa), data=mydata)
arm::display(fit, detail=T)
## lme4::lmer(formula = agediabz ~ (1 | mmsa), data = mydata)
## coef.est coef.se t value
## (Intercept) 0.00 0.01 0.02
##
## Error terms:
## Groups Name Std.Dev.
## mmsa (Intercept) 0.07
## Residual 1.00
## ---
## number of obs: 27237, groups: mmsa, 143
## AIC = 77264.3, DIC = 77243.1
## deviance = 77250.7
The random intercept model estimates an MSA-level standard deviation of 0.07 (the square root of the between-MSA variance), compared with a residual standard deviation of 1.00 on the standardized outcome.
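One way to summarize how much of the total variance sits at the MSA level is the intraclass correlation, which can be computed from the model's variance components (a minimal sketch using lme4's VarCorr; the value is about 0.005):
#ICC = between-MSA variance / (between-MSA variance + residual variance)
vc<-as.data.frame(VarCorr(fit))
vc$vcov[1]/sum(vc$vcov) #roughly 0.07^2/(0.07^2 + 1^2)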
mydata$preagediab <- sdagediab*(fitted(fit))+meanagediab
citymeans<-aggregate(cbind(agediab, preagediab)~mmsaname,mydata, mean)
head(citymeans, n=10)
## mmsaname
## 1 Albany-Schenectady-Troy, NY, Metropolitan Statistical Area
## 2 Albuquerque, NM, Metropolitan Statistical Area
## 3 Allentown-Bethlehem-Easton, PA-NJ, Metropolitan Statistical Area
## 4 Anchorage, AK, Metropolitan Statistical Area
## 5 Atlanta-Sandy Springs-Roswell, GA, Metropolitan Statistical Area
## 6 Augusta-Richmond County, GA-SC, Metropolitan Statistical Area
## 7 Austin-Round Rock, TX, Metropolitan Statistical Area
## 8 Baltimore-Columbia-Towson, MD, Metropolitan Statistical Area
## 9 Baton Rouge, LA, Metropolitan Statistical Area
## 10 Beckley, WV, Metropolitan Statistical Area
## agediab preagediab
## 1 50.71181 50.98640
## 2 51.31579 51.34325
## 3 47.52239 50.43676
## 4 49.88991 50.86019
## 5 51.43816 51.40497
## 6 49.80412 50.87096
## 7 51.82659 51.57030
## 8 51.39922 51.39088
## 9 52.42683 51.65857
## 10 48.70213 50.54222
plot(preagediab~agediab, citymeans)
The plot shows a positive relationship between the observed and model-predicted MSA means, with values concentrated between ages 50 and 52; the predicted means are pulled (shrunken) toward the overall mean.
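That shrinkage is easier to see with a 45-degree reference line and a line at the overall mean added to the plot (a sketch using base graphics):
#Predicted vs. observed MSA means; points off the dashed 45-degree line
#show how far each MSA mean is pulled toward the overall mean (gray line)
plot(preagediab~agediab, citymeans,
     xlab="Observed MSA mean age at diagnosis",
     ylab="Predicted MSA mean age at diagnosis")
abline(a=0, b=1, lty=2)
abline(h=meanagediab, col="gray")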
fit2<-lmer(agediabz~male+agec+educ+obese+smoke+(1|mmsa), data=mydata, na.action=na.omit)
anova(fit, fit2)
## refitting model(s) with ML (instead of REML)
## Data: mydata
## Models:
## object: agediabz ~ (1 | mmsa)
## ..1: agediabz ~ male + agec + educ + obese + smoke + (1 | mmsa)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## object 3 77257 77281 -38625 77251
## ..1 14 64646 64761 -32309 64618 12632 11 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
We can see that adding the individual-level predictors (fit2) significantly improves model fit.
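For completeness, the fixed effects and variance components of fit2 can be displayed in the same way as the other models (output omitted here):
#Coefficients and error terms for the random-intercept model with
#individual-level predictors
arm::display(fit2, detail=T)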
rand(fit2)
## Analysis of Random effects Table:
## Chi.sq Chi.DF p.value
## mmsa 2.81 1 0.09 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Some variation in average age at diabetes diagnosis remains across MSAs after adding the individual-level predictors, although it is only marginally significant (p = 0.09).
fit.mix<-lmer(agediabz~male+agec+educ+obese+smoke+(male|mmsa), data=mydata)
#test for the random effect
rand(fit.mix)
## Analysis of Random effects Table:
## Chi.sq Chi.DF p.value
## male:mmsa 0.0303 2 1
display(fit.mix, detail=T)
## lme4::lmer(formula = agediabz ~ male + agec + educ + obese +
## smoke + (male | mmsa), data = mydata)
## coef.est coef.se t value
## (Intercept) -2.56 0.08 -33.56
## male 0.03 0.01 3.50
## agec(24,39] 0.78 0.08 9.82
## agec(39,59] 1.93 0.08 25.27
## agec(59,79] 2.78 0.08 36.69
## agec(79,99] 3.52 0.08 45.67
## educ0Prim -0.07 0.03 -2.86
## educ1somehs -0.06 0.02 -2.93
## educ3somecol -0.03 0.01 -2.34
## educ4colgrad -0.02 0.01 -1.62
## obese 0.02 0.01 1.58
## smoke -0.02 0.01 -1.26
##
## Error terms:
## Groups Name Std.Dev. Corr
## mmsa (Intercept) 0.03
## male 0.00 -1.00
## Residual 0.79
## ---
## number of obs: 27237, groups: mmsa, 143
## AIC = 64728.5, DIC = 64540.1
## deviance = 64618.3
We can see that the MSA-level (intercept) standard deviation is 0.03 and the individual-level (residual) standard deviation is 0.79; each is the square root of the corresponding variance component.
#Refit the random slope model with maximum likelihood (REML=F) for model comparison:
fit.mix3<-lmer(agediabz~male+agec+educ+obese+smoke+(male|mmsa), data=mydata, REML=F)
display(fit.mix3, detail=T)
## lme4::lmer(formula = agediabz ~ male + agec + educ + obese +
## smoke + (male | mmsa), data = mydata, REML = F)
## coef.est coef.se t value
## (Intercept) -2.56 0.08 -33.57
## male 0.03 0.01 3.51
## agec(24,39] 0.78 0.08 9.82
## agec(39,59] 1.93 0.08 25.28
## agec(59,79] 2.78 0.08 36.70
## agec(79,99] 3.52 0.08 45.68
## educ0Prim -0.08 0.03 -2.86
## educ1somehs -0.06 0.02 -2.93
## educ3somecol -0.03 0.01 -2.34
## educ4colgrad -0.02 0.01 -1.62
## obese 0.02 0.01 1.58
## smoke -0.02 0.01 -1.26
##
## Error terms:
## Groups Name Std.Dev. Corr
## mmsa (Intercept) 0.02
## male 0.00 -1.00
## Residual 0.79
## ---
## number of obs: 27237, groups: mmsa, 143
## AIC = 64650.3, DIC = 64618.3
## deviance = 64618.3
Under ML, the MSA-level intercept standard deviation is 0.02 and the standard deviation of the male slope is essentially 0.00, so there is little evidence of non-stationarity, that is, little evidence that the effect of sex varies across MSAs.
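The MSA-specific random effects tell the same story; the male slope deviations are essentially constant (a minimal sketch using lme4's ranef):
#MSA-level random intercepts and male slopes from the random-slope model
re<-ranef(fit.mix)$mmsa
head(re)
apply(re, 2, sd) #the 'male' column has essentially zero spread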
anova(fit.mix, fit.mix3)
## refitting model(s) with ML (instead of REML)
## Data: mydata
## Models:
## object: agediabz ~ male + agec + educ + obese + smoke + (male | mmsa)
## ..1: agediabz ~ male + agec + educ + obese + smoke + (male | mmsa)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## object 16 64650 64782 -32309 64618
## ..1 16 64650 64782 -32309 64618 0 0 1
Because fit.mix and fit.mix3 specify the same random-slope model (one fit with REML, one with ML), both are refit with ML for this comparison and become identical: the likelihood ratio statistic is 0 on 0 degrees of freedom (p = 1), so this particular comparison tells us nothing about the random slope.
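A more informative comparison is between the random-intercept model with predictors (fit2) and the random-slope model (fit.mix), which tests the added slope variance and slope-intercept correlation directly (a sketch; note that likelihood ratio tests on variance components are conservative):
#LRT for the random slope: anova() refits both models with ML and
#compares them; fit.mix adds 2 variance-covariance parameters
anova(fit2, fit.mix)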