The outcome of interest here is the general health status which is treated as a dichotomous variable and the grouping variable for the analysis is cohorts starting from 1915 to 2010. Our hypothesis is, whether the effect of obesity shows a beta coefficient different from zero.
library(readr)
library(car)
library(haven)
library(tidyr)
library(labelled)
library(ggplot2)
library(foreign)
library(lme4)
library(lmerTest)
library(sjstats)
library(MuMIn)
mydata <- read_dta("C://Users/munta/Google Drive/NHIS/nhis_00006.dta")
#names(mydata)
mydata$yob<-mydata$year-mydata$age
#mydata<-subset (mydata, yob%in%c(1915:2010) & year%in%c(2001:2009))
#mydata$cohort<-car::recode(mydata$yob, recodes="1940:1963='early'; 1964:1988='late'; else=NA", as.factor = T)
#Taking a subset of the total observations
sub<-subset(mydata, mydata$mortelig==1&is.na(mydata$racenew)==F)
samps<-sample(1:length(sub$psu), size = 20000, replace = F)
sub<-sub[samps,] #Using a subset of 20000 samples
#Age
sub$d.age<-ifelse(sub$mortstat==1, sub$mortdody-(sub$year-sub$age) ,
ifelse(sub$mortstat==2, 2009-(sub$year-sub$age), NA))
#Mortality Status
#0=Alive, 1=Dead
sub$d.event<-ifelse(sub$mortstat==1,1,0)
#sub$d.event<-as.character(sub$d.event)
sub$timetodeath<-ifelse(sub$mortstat ==1, sub$mortdody-sub$year, 2009 - sub$year )
sub$d5yr<-ifelse(sub$timetodeath<=5&sub$mortstat==1, 1,0)
#Mortalist Status: Dead or Alive
#sub$mortstat<-recode(sub$mortstat, recodes= "1='Dead'; 2='Alive'; 9=NA", as.factor.result=T)
#Education
sub$moredu<-recode(sub$educrec2, recodes="00=NA; 10:41='LHS'; 42='HS'; 50:53='SomeCollege'; 54:60='College+'; else=NA", as.factor=T, levels = c("LHS", "HS", "SomeCollege", "College+"))
sub$moredu<-relevel(sub$moredu, ref = "LHS")
sub$hs<-ifelse(sub$moredu=='hs or less',1,0)
sub$scol1<-ifelse(sub$moredu=='some coll',1,0)
#Race/Ethnicity
sub$race<-recode(sub$racenew, recodes ="10='wht'; 20 ='blk'; 30:61=NA; 97:99=NA", as.factor=T)
sub$black<-ifelse(sub$race=='blk',1,0)
sub$oth<-ifelse(sub$race=='other',1,0)
sub$hisp<-recode(sub$hispyn, recodes="1=0; 2=1; else=NA")
sub$race_eth[sub$hisp == 0 & sub$race=="wht"]<-"NHWhite"
## Warning: Unknown or uninitialised column: 'race_eth'.
sub$race_eth[sub$hisp == 0 & sub$race=="blk"]<-"NHBlack"
# sub$race_eth[sub$hisp == 0 & sub$race=="other"]<-"NHOther"
sub$race_eth[sub$hisp == 1 ]<-"Hispanic"
sub$race_eth[is.na(sub$hisp) ==T | is.na(sub$race)==T]<-NA
sub$race_eth <- as.factor(sub$race_eth)
sub$race_eth<-relevel(sub$race_eth, ref = "NHWhite")
#Marital Status
sub$married<-recode(sub$marstat, recodes="00=NA; 99=NA; 10:13=1; else=0", as.factor=T )
#sub$sep<-ifelse(sub$married=='separated',1,0)
#sub$nm<-ifelse(sub$married=='nevermarried',1,0)
#sub$married<-relevel(sub$married, ref = "1")
#Smoking Status
#sub$smoke<-recode(sub$smokestatus2, recodes="00=NA; 10:13='CurrentSmoker'; 20='FormerSmoker'; 30='NeverSmoked'; else=NA", as.factor=T)
#sub$smoke<-relevel(sub$smoke, ref = "CurrentSmoker")
sub$currsmoke<-Recode(sub$smokestatus2, recodes="0=NA;90=NA; 10:13=1; else=0", as.factor = F)
#Drinking Status
# sub$alcohol<-recode(sub$alcstat2, recodes= "00=NA; 10='NeverDrinker'; 20:23='FormerDrinker'; 31:32='LightDrinker'; 35='LightDrinker'; 33='ModerateDrinker'; 34='HeavyDrinker'; 97:99=NA", as.factor.result=T)
# sub$alcohol<-relevel(sub$alcohol, ref = "HeavyDrinker")
#Drinking Status
sub$alcohol<-recode(sub$alcstat2, recodes= "00=NA; 10='LifetimeAbstainer'; 20:23='FormerDrinker'; 30:32='LightDrinker'; 35='LightDrinker'; 33='ModerateDrinker'; 34='HeavyDrinker'; 40:43=NA; 97:99=NA", as.factor=T, levels = c("LifetimeAbstainer", "FormerDrinker", "LightDrinker", "ModerateDrinker", "HeavyDrinker"))
sub$alcohol<-relevel(sub$alcohol, ref = "HeavyDrinker")
sub$alcohol2<-recode(sub$alcstat1, recodes= "0=NA; 1='LifetimeAbstainer'; 2='FormerDrinker'; 3='CurrentDrinker'; 9=NA", as.factor=T, levels = c("LifetimeAbstainer", "FormerDrinker", "CurrentDrinker"))
sub$alcohol2<-relevel(sub$alcohol2, ref = "LifetimeAbstainer")
sub$male<-ifelse(sub$sex==1,1,0)
sub$mwt<-sub$mortwt/mean(sub$mortwt, na.rm=T)
#Age cut into intervals
sub<-sub[sub$age>=18,]
sub$agec<-cut(sub$age, breaks =seq(15, 100, 5))
sub$age2<-sub$age^2
sub$birth_year<-ifelse(sub$birthyr>=9997, NA, sub$birthyr)
sub$goodhealth<-recode(sub$health, recodes="1:3=0; 4:5=1; else=NA")
sub$bmi_clean<-ifelse(sub$bmicalc%in%c(0,996),NA, sub$bmicalc)
sub$obese<-ifelse(sub$bmi_clean>=30,1,0)
sub$pov<-Recode(sub$pooryn, recodes = "2=1; 1=0; 9=NA")
sub$cohort2<- cut(sub$birth_year, breaks=seq(1915, 2010, 5))
sub<-subset(sub, complete.cases(moredu, pov, cohort2, currsmoke, alcohol, alcohol2, race_eth, hisp, black, oth, birth_year, goodhealth, bmi_clean, obese, age2, d.event, mortstat, timetodeath, year, agec, male, married, mortwt, strata, psu, sampweight, nhispid))
set.seed(1115)
fit00<-glm(goodhealth~factor(cohort2), data=sub, family = binomial)
anova(fit00, test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: goodhealth
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 5150 4090.8
## factor(cohort2) 15 305.21 5135 3785.5 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fit0<-glm(goodhealth~obese*factor(cohort2), data=sub, family = binomial)
anova(fit0, test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: goodhealth
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 5150 4090.8
## obese 1 52.496 5149 4038.3 4.311e-13 ***
## factor(cohort2) 15 310.489 5134 3727.8 < 2.2e-16 ***
## obese:factor(cohort2) 15 19.212 5119 3708.6 0.2043
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The means are not equal, which means we can run further hierarchical analysis to test the association.
fit1<-glmer(goodhealth~male+moredu+race_eth+obese+alcohol2+currsmoke+married+(1|cohort2), data=sub, family = binomial)
summary(fit1)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## goodhealth ~ male + moredu + race_eth + obese + alcohol2 + currsmoke +
## married + (1 | cohort2)
## Data: sub
##
## AIC BIC logLik deviance df.resid
## 3511.3 3596.4 -1742.6 3485.3 5138
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.6549 -0.3905 -0.2597 -0.1668 8.0162
##
## Random effects:
## Groups Name Variance Std.Dev.
## cohort2 (Intercept) 0.5319 0.7293
## Number of obs: 5151, groups: cohort2, 16
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.26207 0.23327 -5.410 6.29e-08 ***
## male -0.08005 0.09345 -0.857 0.39169
## moreduHS -0.64709 0.11673 -5.544 2.96e-08 ***
## moreduSomeCollege -0.94059 0.12984 -7.244 4.36e-13 ***
## moreduCollege+ -1.59023 0.16209 -9.811 < 2e-16 ***
## race_ethHispanic 0.07145 0.13336 0.536 0.59212
## race_ethNHBlack 0.34739 0.11670 2.977 0.00291 **
## obese 0.58969 0.09571 6.161 7.22e-10 ***
## alcohol2FormerDrinker 0.39550 0.12592 3.141 0.00168 **
## alcohol2CurrentDrinker -0.48516 0.11604 -4.181 2.90e-05 ***
## currsmoke 0.58360 0.10670 5.470 4.51e-08 ***
## married1 -0.14282 0.09330 -1.531 0.12583
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) male mordHS mrdSmC mrdCl+ rc_thH rc_NHB obese alc2FD
## male -0.082
## moreduHS -0.303 0.038
## moredSmCllg -0.298 0.053 0.564
## moreduCllg+ -0.229 -0.001 0.463 0.444
## rac_thHspnc -0.263 -0.016 0.241 0.271 0.239
## rc_thNHBlck -0.206 0.004 0.065 0.086 0.088 0.266
## obese -0.133 0.030 -0.003 0.006 0.030 -0.025 -0.113
## alchl2FrmrD -0.212 -0.182 -0.038 -0.056 -0.057 0.082 0.065 -0.013
## alchl2CrrnD -0.219 -0.224 -0.081 -0.131 -0.153 0.051 0.095 -0.005 0.589
## currsmoke -0.133 -0.050 0.040 0.082 0.159 0.115 0.006 0.104 -0.104
## married1 -0.164 -0.133 -0.026 0.016 -0.018 0.002 0.156 0.013 0.002
## alc2CD crrsmk
## male
## moreduHS
## moredSmCllg
## moreduCllg+
## rac_thHspnc
## rc_thNHBlck
## obese
## alchl2FrmrD
## alchl2CrrnD
## currsmoke -0.184
## married1 0.016 0.116
1)a. The variance component of the model is 0.6521 which is small enough.
1)b. The negative covariates show that, good health is associated to education, race/ethnicity, obesity status, current alcohol drinking and smoking status, as well as to the marital status. Obesity has a big-statistic, which supports our hypothesis that it significantly affects our outcome of good health.
fit2<-glmer(goodhealth~male+moredu+race_eth+obese+alcohol2+currsmoke+married+(1+obese|cohort2), data=sub, family = binomial)
summary(fit2)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## goodhealth ~ male + moredu + race_eth + obese + alcohol2 + currsmoke +
## married + (1 + obese | cohort2)
## Data: sub
##
## AIC BIC logLik deviance df.resid
## 3513.5 3611.7 -1741.8 3483.5 5136
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.6706 -0.3955 -0.2578 -0.1649 8.2955
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## cohort2 (Intercept) 0.59304 0.7701
## obese 0.02332 0.1527 -1.00
## Number of obs: 5151, groups: cohort2, 16
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.27559 0.24191 -5.273 1.34e-07 ***
## male -0.08159 0.09345 -0.873 0.38264
## moreduHS -0.64648 0.11664 -5.543 2.98e-08 ***
## moreduSomeCollege -0.93989 0.12970 -7.246 4.28e-13 ***
## moreduCollege+ -1.58375 0.16200 -9.776 < 2e-16 ***
## race_ethHispanic 0.07231 0.13323 0.543 0.58731
## race_ethNHBlack 0.34461 0.11665 2.954 0.00313 **
## obese 0.62093 0.10486 5.921 3.19e-09 ***
## alcohol2FormerDrinker 0.39518 0.12585 3.140 0.00169 **
## alcohol2CurrentDrinker -0.48696 0.11597 -4.199 2.68e-05 ***
## currsmoke 0.58512 0.10662 5.488 4.07e-08 ***
## married1 -0.14069 0.09328 -1.508 0.13151
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) male mordHS mrdSmC mrdCl+ rc_thH rc_NHB obese alc2FD
## male -0.079
## moreduHS -0.292 0.038
## moredSmCllg -0.287 0.052 0.563
## moreduCllg+ -0.221 -0.001 0.462 0.443
## rac_thHspnc -0.253 -0.015 0.240 0.269 0.237
## rc_thNHBlck -0.197 0.005 0.066 0.087 0.087 0.267
## obese -0.430 0.029 -0.006 0.003 0.030 -0.025 -0.110
## alchl2FrmrD -0.204 -0.182 -0.037 -0.055 -0.057 0.082 0.065 -0.021
## alchl2CrrnD -0.210 -0.224 -0.081 -0.130 -0.153 0.050 0.094 -0.015 0.589
## currsmoke -0.128 -0.049 0.040 0.080 0.158 0.111 0.004 0.090 -0.103
## married1 -0.158 -0.135 -0.025 0.015 -0.018 0.002 0.156 0.002 0.003
## alc2CD crrsmk
## male
## moreduHS
## moredSmCllg
## moreduCllg+
## rac_thHspnc
## rc_thNHBlck
## obese
## alchl2FrmrD
## alchl2CrrnD
## currsmoke -0.183
## married1 0.016 0.115
2)a. The variance component 0.69797 which is slighly bigger than what we have seen in the random intercept model. The z-values did not change much due the gitting of the random slope into the model. However, the model failed to converge whihc is an issue of concern.
anova (fit1, fit2)
## Data: sub
## Models:
## fit1: goodhealth ~ male + moredu + race_eth + obese + alcohol2 + currsmoke +
## fit1: married + (1 | cohort2)
## fit2: goodhealth ~ male + moredu + race_eth + obese + alcohol2 + currsmoke +
## fit2: married + (1 + obese | cohort2)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## fit1 13 3511.3 3596.4 -1742.6 3485.3
## fit2 15 3513.5 3611.7 -1741.8 3483.5 1.7482 2 0.4172
Comparing the AIC values, it is seen that the second model with the random effect of obesity does not seem to fit the data better than the first model without this term in the model.
icc(fit1)
##
## Generalized linear mixed model
##
## Family : binomial (logit)
## Formula: goodhealth ~ male + moredu + race_eth + obese + alcohol2 + currsmoke + married + (1 | cohort2)
##
## ICC (cohort2): 0.1392
icc(fit2)
## Caution! ICC for random-slope-intercept models usually not meaningful. See 'Note' in `?icc`.
##
## Generalized linear mixed model
##
## Family : binomial (logit)
## Formula: goodhealth ~ male + moredu + race_eth + obese + alcohol2 + currsmoke + married + (1 + obese | cohort2)
##
## ICC (cohort2): 0.1527
These results suggest that by including the random slope in fit2, we increase the correlation within cohorts.
Test for random effect
#ranova(fit1)
#ranova(fit2)
model.sel(fit1, fit2)
## Model selection table
## (Int) al2 crr mal mrr mrd obs rac_eth random df logLik
## fit1 -1.262 + 0.5836 -0.08005 + + 0.5897 + c 13 -1742.640
## fit2 -1.276 + 0.5851 -0.08159 + + 0.6209 + 1+o|c 15 -1741.766
## AICc delta weight
## fit1 3511.4 0.00 0.757
## fit2 3513.6 2.27 0.243
## Models ranked by AICc(x)
## Random terms:
## c = '1 | cohort2'
## 1+o|c = '1 + obese | cohort2'
The AIC value suggest that model 2 or ‘fit2’ is not better than the first random intercept model.
#r.squaredGLMM(fit1)
#r.squaredGLMM(fit2)
rancoefs2<-ranef(fit2)
#here are the first 10
head(rancoefs2$cohort2, n=10)
## (Intercept) obese
## (1915,1920] 1.05867825 -0.20993168
## (1920,1925] 1.21040529 -0.24001857
## (1925,1930] 0.47695636 -0.09457855
## (1930,1935] 0.55793925 -0.11063715
## (1935,1940] 0.54016247 -0.10711205
## (1940,1945] 0.77591548 -0.15386095
## (1945,1950] 0.58473508 -0.11595060
## (1950,1955] 0.14317065 -0.02839014
## (1955,1960] -0.09992376 0.01981451
## (1960,1965] -0.33764530 0.06695370
#Histograms of the random effects
par(mfrow=c(1,2))
hist(rancoefs2$cohort2[,"(Intercept)"],xlab= "Intercept Random Effect", main = "Distribution of Random Intercepts")
hist(rancoefs2$cohort2[,"obese"], xlab= "Obese Slope Random Effect", main = "Distribution of Random Slopes")
par(mfrow=c(1,1))
plot(rancoefs2$cohort2[,"(Intercept)"],rancoefs2$cohort2[,"obese"], main="Random effects for each cohort", xlab="Intercept", ylab="Obesity slope" )
par(mfrow=c(1,1))
fixef(fit2)
## (Intercept) male moreduHS
## -1.27559470 -0.08158876 -0.64648074
## moreduSomeCollege moreduCollege+ race_ethHispanic
## -0.93988716 -1.58374697 0.07230720
## race_ethNHBlack obese alcohol2FormerDrinker
## 0.34460921 0.62093226 0.39517676
## alcohol2CurrentDrinker currsmoke married1
## -0.48696339 0.58512345 -0.14068857
summary(fixef(fit2)[1]+rancoefs2$cohort2) # I do this to get values for the range for the y axis.
## (Intercept) obese
## Min. :-2.24284 Min. :-1.516
## 1st Qu.:-2.01207 1st Qu.:-1.388
## Median :-1.25397 Median :-1.280
## Mean :-1.26217 Mean :-1.278
## 3rd Qu.:-0.71096 3rd Qu.:-1.130
## Max. :-0.06519 Max. :-1.084
# plot(NULL, ylim=c(-3, 2), xlim=c(-2,2),
# ylab="General Health",
# xlab="Obesity Status") #You may have to change the ylim() for your outcome.
# title (main="Regression Lines from Random Slope and Intercept Model")
#
# cols=sample(rainbow(n=25), size = dim(rancoefs2$cohort2)[1], replace = T)
#
# for (i in 1: dim(rancoefs2$cohort2)[1]){
# abline(a=fixef(fit2)[1]+rancoefs2$cohort2[[1]][i],
# b=fixef(fit2)[16]+rancoefs2$cohort2[[2]][i],
# col=cols[i],lwd=.5 )}
#
# #this line shows the "average" effect of poverty.
# abline(a=fixef(fit2)[1],
# b=fixef(fit2)[16],
# col=1, lwd=3)
#
# legend("topright", col=1, lwd=5, legend="Average Effect of Obesity")