All the variables in this dataset were compiled the Behavioral Risk Factor Surveillance System (BRFFS).
For this assignment, we include as a binary outcome whether the person exercised or not in the last 30 days. As our group level variables we consider the Metropolitan Metro areas as well as the Gini coeffiecients. Our individual variables consist of the level of education and race/ethnicity for all participants.
The number of observations for this analysis is: 69,793
RESEARCH QUESTION Our analysis, attempts to respond the following question: do individuals who have a lower level of education, who are racial/ethnic minorities and live in areas with high poverty rates are less likely to exercise than others?
#load brfss
library(car)
library(stargazer)
library(survey)
library(sjPlot)
library(ggplot2)
library(pander)
library(knitr)
library(lme4)
load("C:/Users/PCMcC/Documents/Statistics II/Data for project/brfss16_mmsa.Rdata")
set.seed(12345)
samps<-sample(1:nrow(brfss16m), size = 90000, replace=F)
brfss16m<-brfss16m[samps,]
#Name modification
nams<-names(brfss16m)
#we see some names are lower case, some are upper and some have a little _ in the first position. This is a nightmare.
newnames<-gsub(pattern = "_",replacement = "",x = nams)
names(brfss16m)<-tolower(newnames)
nams<-names(brfss16m)
#we see some names are lower case, some are upper and some have a little _ in the first position. This is a nightmare.
newnames<-gsub(pattern = "_",replacement = "",x = nams)
names(brfss16m)<-tolower(newnames)
#Exercise
#During the past month, other than your regular job, did you participate in any physical activities or exercises such as running, calisthenics, golf, gardening, or walking for exercise? yes=1, no=0, don't now=7, refused=9, not asked=blank
brfss16m$exercise<-recode(brfss16m$exerany2, recodes="1=1; 2=0; else=NA")
table(brfss16m$exercise)
##
## 0 1
## 21592 68259
#sex
brfss16m$female<-ifelse(brfss16m$sex==2, 2, 0)
#Healthy days
brfss16m$healthdays<-recode(brfss16m$physhlth, recodes = "88=0; 77=NA; 99=NA")
#Healthy mental health days
brfss16m$healthmdays<-recode(brfss16m$menthlth, recodes = "88=0; 77=NA; 99=NA")
#Education
brfss16m$education<-recode(brfss16m$educa, recodes="1='no school'; 2='Grade 1-8'; 3='Grade 9-11';4='high school'; 5='some college'; 6='college or more'; 9='NA'",as.factor.result = T)
brfss16m$race_eth<-relevel(brfss16m$education, ref = "college or more")
brfss16m$badhealth<-recode(brfss16m$genhlth, recodes="4:5=1; 1:3=0; else=NA")
#race/ethnicity
brfss16m$black<-recode(brfss16m$racegr3, recodes="2=1; 9=NA; else=0")
brfss16m$white<-recode(brfss16m$racegr3, recodes="1=1; 9=NA; else=0")
brfss16m$other<-recode(brfss16m$racegr3, recodes="3:4=1; 9=NA; else=0")
brfss16m$hispanic<-recode(brfss16m$racegr3, recodes="5=1; 9=NA; else=0")
brfss16m$race_eth<-recode(brfss16m$racegr3,
recodes="1='nhwhite'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA",
as.factor.result = T)
brfss16m$race_eth<-relevel(brfss16m$race_eth, ref = "nhwhite")
#insurance
brfss16m$ins<-recode(brfss16m$hlthpln1, recodes ="7:9=NA; 1=1;2=0")
#education level
brfss16m$educ<-recode(brfss16m$educa,
recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA",
as.factor.result=T)
brfss16m$educ<-relevel(brfss16m$educ, ref='2hsgrad')
#employment
brfss16m$employ<-recode(brfss16m$employ1,
recodes="1:2='employloyed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA",
as.factor.result=T)
brfss16m$employ<-relevel(brfss16m$employ, ref='employloyed')
#marital status
brfss16m$marst<-recode(brfss16m$marital,
recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA",
as.factor.result=T)
brfss16m$marst<-relevel(brfss16m$marst, ref='married')
#Drinking days
brfss16m$daysdrink<-recode(brfss16m$avedrnk2, recodes = "77=NA; 99=NA")
summary(brfss16m$daysdrink)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.00 1.00 2.00 2.16 2.00 76.00 44337
I want to see how many people we have in each MSA in the data:
#Now we will begin fitting the multilevel regression model with the msa
#that the person lives in being the higher level
head(data.frame(name=table(brfss16m$mmsaname),id=unique(brfss16m$mmsa)))
## name.Var1
## 1 Albany-Schenectady-Troy, NY, Metropolitan Statistical Area
## 2 Albuquerque, NM, Metropolitan Statistical Area
## 3 Allentown-Bethlehem-Easton, PA-NJ, Metropolitan Statistical Area
## 4 Anchorage, AK, Metropolitan Statistical Area
## 5 Atlanta-Sandy Springs-Roswell, GA, Metropolitan Statistical Area
## 6 Augusta-Richmond County, GA-SC, Metropolitan Statistical Area
## name.Freq id
## 1 997 39900
## 2 463 45060
## 3 254 41060
## 4 380 45220
## 5 871 33340
## 6 306 16980
#people within each msa
#How many total MSAs are in the data?
length(table(brfss16m$mmsa))
## [1] 143
#counties
We will often be interested in factors at both the individual AND contextual levels. To illustrate this, we will use data from the American Community Survey measured at the MSA level. Specifically, we use the DP3 table, which provides economic characteristics of places, from the 2010 5 year ACS Link.
library(acs)
#Get 2010 ACS median household incomes for tracts in Texas
msaacs<-geo.make(msa="*")
#ACS table B03002 is Hispanic ethnicity by race
acsecon<-acs.fetch(key=pckey, endyear=2010, span=5, geography=msaacs, variable = c("B19083_001","B17001_001","B17001_002" ))
colnames(acsecon@estimate)
## [1] "B19083_001" "B17001_001" "B17001_002"
msaecon<-data.frame(gini=acsecon@estimate[, "B19083_001"],
ppoverty=acsecon@estimate[, "B17001_002"]/acsecon@estimate[, "B17001_001"],
giniz=scale(acsecon@estimate[, "B19083_001"]),
ppovertyz=scale(acsecon@estimate[, "B17001_002"]/acsecon@estimate[, "B17001_001"]))
msaecon$ids<-paste(acsecon@geography$metropolitanstatisticalareamicropolitanstatisticalarea)
Merge the MSA data to the BRFSS data
mixdata<-merge(brfss16m, msaecon, by.x="mmsa",by.y="ids", all.x=T)
mixdata<-mixdata[complete.cases(mixdata[, c("race_eth", "exercise", "educ", "gini")]),]
head(mixdata[, c("race_eth", "exercise", "educ", "gini")])
## race_eth exercise educ gini
## 1 nhwhite 1 4colgrad 0.427
## 2 nhwhite 1 2hsgrad 0.427
## 3 nh other 1 2hsgrad 0.427
## 4 nhwhite 1 4colgrad 0.427
## 5 nhwhite 1 3somecol 0.427
## 6 nhwhite 1 4colgrad 0.427
The MSA Level Variables show that there is a negative relationship between exercises and those with a some highshool or lower as well as black individuals and those who live in areas with a lower level of wealth.
#MSA level variables
fit.mix1<-lmer(exercise~+educ+race_eth+giniz+(1|mmsa), data=mixdata)
arm::display(fit.mix1, detail=T)
## lmer(formula = exercise ~ +educ + race_eth + giniz + (1 | mmsa),
## data = mixdata)
## coef.est coef.se t value
## (Intercept) 0.66 0.00 138.63
## educ0Prim -0.13 0.01 -11.40
## educ1somehs -0.10 0.01 -12.66
## educ3somecol 0.09 0.00 21.89
## educ4colgrad 0.20 0.00 49.87
## race_ethhispanic 0.00 0.01 0.53
## race_ethnh black -0.04 0.01 -6.71
## race_ethnh multirace 0.00 0.01 0.38
## race_ethnh other -0.02 0.01 -2.34
## giniz -0.03 0.01 -5.10
##
## Error terms:
## Groups Name Std.Dev.
## mmsa (Intercept) 0.04
## Residual 0.42
## ---
## number of obs: 69793, groups: mmsa, 123
## AIC = 75656.1, DIC = 75466.2
## deviance = 75549.1
The cross level interaction tells a different story; the ability of non-hispanic blacks with a low level of education (primary or some high school) to exercise is highly influenced by the area’s income or wealth.
#Cross-level interaction model:
fit.mix2<-lmer(exercise~educ+race_eth*educ+giniz+(1|mmsa), mixdata, REML=F)
arm::display(fit.mix2, detail=T)
## lmer(formula = exercise ~ educ + race_eth * educ + giniz + (1 |
## mmsa), data = mixdata, REML = F)
## coef.est coef.se t value
## (Intercept) 0.66 0.00 133.94
## educ0Prim -0.16 0.02 -8.72
## educ1somehs -0.12 0.01 -11.43
## educ3somecol 0.10 0.00 19.60
## educ4colgrad 0.21 0.00 45.61
## race_ethhispanic 0.02 0.01 1.76
## race_ethnh black -0.05 0.01 -4.91
## race_ethnh multirace 0.05 0.03 1.77
## race_ethnh other 0.01 0.02 0.32
## giniz -0.03 0.01 -5.13
## educ0Prim:race_ethhispanic 0.03 0.03 1.18
## educ1somehs:race_ethhispanic 0.04 0.02 1.71
## educ3somecol:race_ethhispanic -0.03 0.02 -1.72
## educ4colgrad:race_ethhispanic -0.05 0.02 -3.26
## educ0Prim:race_ethnh black 0.14 0.04 3.21
## educ1somehs:race_ethnh black 0.07 0.02 3.09
## educ3somecol:race_ethnh black 0.01 0.01 0.43
## educ4colgrad:race_ethnh black 0.01 0.01 0.48
## educ0Prim:race_ethnh multirace -0.21 0.10 -2.19
## educ1somehs:race_ethnh multirace -0.08 0.06 -1.32
## educ3somecol:race_ethnh multirace -0.03 0.03 -0.92
## educ4colgrad:race_ethnh multirace -0.06 0.03 -1.84
## educ0Prim:race_ethnh other 0.01 0.06 0.12
## educ1somehs:race_ethnh other -0.09 0.05 -1.96
## educ3somecol:race_ethnh other 0.01 0.03 0.38
## educ4colgrad:race_ethnh other -0.05 0.02 -2.32
##
## Error terms:
## Groups Name Std.Dev.
## mmsa (Intercept) 0.04
## Residual 0.42
## ---
## number of obs: 69793, groups: mmsa, 123
## AIC = 75543.3, DIC = 75487.3
## deviance = 75487.3
A comparison of the two models shows an interaction.
#compare the models with a LRT. There is evidence of interaction.
anova(fit.mix1, fit.mix2)
## refitting model(s) with ML (instead of REML)
## Data: mixdata
## Models:
## fit.mix1: exercise ~ +educ + race_eth + giniz + (1 | mmsa)
## fit.mix2: exercise ~ educ + race_eth * educ + giniz + (1 | mmsa)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## fit.mix1 12 75573 75683 -37775 75549
## fit.mix2 28 75543 75800 -37744 75487 61.778 16 2.619e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(glm(exercise~ educ+race_eth+giniz+(1|mmsa), mixdata, family=binomial), test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: exercise
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 69792 77684
## educ 4 3927.5 69788 73756 < 2.2e-16 ***
## race_eth 4 104.0 69784 73652 < 2.2e-16 ***
## giniz 1 196.2 69783 73456 < 2.2e-16 ***
## 1 | mmsa 0 0.0 69783 73456
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The following results confirm previous findings; higher levels of education are related with a higher reports of exercising. In fact, individuals with a college education are 3 times more likely to have exercised in the last 30 days when compared to those with just a high school education. Black individuals OR those who live in areas with lower levels of wealth are also less likely to have exercised in the past 30 days.
fit.mix.bin<-glmer(I(exercise>0)~educ+race_eth+giniz+(1|mmsa), family=binomial, mixdata,
control=glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=2e5)))
arm::display(fit.mix.bin, detail=T)
## glmer(formula = I(exercise > 0) ~ educ + race_eth + giniz + (1 |
## mmsa), data = mixdata, family = binomial, control = glmerControl(optimizer = "bobyqa",
## optCtrl = list(maxfun = 2e+05)))
## coef.est coef.se z value Pr(>|z|)
## (Intercept) 0.69 0.03 26.39 0.00
## educ0Prim -0.52 0.06 -9.39 0.00
## educ1somehs -0.42 0.04 -10.63 0.00
## educ3somecol 0.46 0.02 19.85 0.00
## educ4colgrad 1.17 0.02 48.99 0.00
## race_ethhispanic 0.01 0.04 0.14 0.89
## race_ethnh black -0.20 0.03 -6.37 0.00
## race_ethnh multirace 0.02 0.07 0.31 0.76
## race_ethnh other -0.13 0.05 -2.44 0.01
## giniz -0.15 0.03 -4.99 0.00
##
## Error terms:
## Groups Name Std.Dev.
## mmsa (Intercept) 0.21
## Residual 1.00
## ---
## number of obs: 69793, groups: mmsa, 123
## AIC = 73152.6, DIC = 72573.9
## deviance = 72852.2
#odds ratios
exp(fixef(fit.mix.bin)[-1])
## educ0Prim educ1somehs educ3somecol
## 0.5943653 0.6575536 1.5769399
## educ4colgrad race_ethhispanic race_ethnh black
## 3.2173542 1.0051307 0.8169054
## race_ethnh multirace race_ethnh other giniz
## 1.0232416 0.8802847 0.8607573
An multilevel logistic analysis that excludes the GINI coeffecient shows a similar trend. Removing the GINI variable does not seem to have a strong effect in the odd ratios for any group. Although there is a slight shift observed amongst the black group, we can conclude that the level of education is a much more important factor that influences the person’s ability to exercise than it is the level of wealth of the area in which they live.
fit.mix.bin2<-glmer(I(exercise>0)~educ+race_eth+(1|mmsa), family=binomial, mixdata,
control=glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=2e5)))
arm::display(fit.mix.bin, detail=T)
## glmer(formula = I(exercise > 0) ~ educ + race_eth + giniz + (1 |
## mmsa), data = mixdata, family = binomial, control = glmerControl(optimizer = "bobyqa",
## optCtrl = list(maxfun = 2e+05)))
## coef.est coef.se z value Pr(>|z|)
## (Intercept) 0.69 0.03 26.39 0.00
## educ0Prim -0.52 0.06 -9.39 0.00
## educ1somehs -0.42 0.04 -10.63 0.00
## educ3somecol 0.46 0.02 19.85 0.00
## educ4colgrad 1.17 0.02 48.99 0.00
## race_ethhispanic 0.01 0.04 0.14 0.89
## race_ethnh black -0.20 0.03 -6.37 0.00
## race_ethnh multirace 0.02 0.07 0.31 0.76
## race_ethnh other -0.13 0.05 -2.44 0.01
## giniz -0.15 0.03 -4.99 0.00
##
## Error terms:
## Groups Name Std.Dev.
## mmsa (Intercept) 0.21
## Residual 1.00
## ---
## number of obs: 69793, groups: mmsa, 123
## AIC = 73152.6, DIC = 72573.9
## deviance = 72852.2
#odds ratios
exp(fixef(fit.mix.bin2)[-1])
## educ0Prim educ1somehs educ3somecol
## 0.5947029 0.6570282 1.5749383
## educ4colgrad race_ethhispanic race_ethnh black
## 3.2081725 0.9914936 0.8096774
## race_ethnh multirace race_ethnh other
## 1.0223257 0.8782305
#LRT between models
anova(fit.mix.bin, fit.mix.bin2)
## Data: mixdata
## Models:
## fit.mix.bin2: I(exercise > 0) ~ educ + race_eth + (1 | mmsa)
## fit.mix.bin: I(exercise > 0) ~ educ + race_eth + giniz + (1 | mmsa)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## fit.mix.bin2 10 73173 73264 -36576 73153
## fit.mix.bin 11 73153 73253 -36565 73131 22.343 1 2.281e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Going back to our research question, do individuals who have a lower level of education, who are racial/ethnic minorities and live in areas with high poverty rates are less likely to exercise than others?.. The answer is no.
Although racial minorities, primarily black show a disadvantage when compared to other groups, the gap is certainly larger by levels of education than by race_ethnicity. Furthermore, including the GINI variable shows little impact. We conclude that there is a small interaction between the area’s level of wealth and the race/ethnicity of the individual in their likelihood to exercise. The level of education is by far, the strongest determinant in a person likelihood to exercise.