The outcome of interest here is the general health status which is treated as a dichotomous variable and the grouping variable for the analysis is cohorts starting from 1915 to 2010. Our hypothesis is, whether the effect of obesity shows a beta coefficient different from zero.

Loading Packages

library(readr)
library(car)
library(haven)
library(tidyr)
library(labelled)
library(ggplot2)
library(foreign)
library(lme4)
library(lmerTest)
library(sjstats)
library(MuMIn)

Loading NHIS Data

mydata <- read_dta("C://Users/munta/Google Drive/NHIS/nhis_00006.dta")
#names(mydata)

Recoding Variables

mydata$yob<-mydata$year-mydata$age

#mydata<-subset (mydata, yob%in%c(1915:2010) & year%in%c(2001:2009))

#mydata$cohort<-car::recode(mydata$yob, recodes="1940:1963='early'; 1964:1988='late'; else=NA", as.factor = T)

#Taking a subset of the total observations
sub<-subset(mydata, mydata$mortelig==1&is.na(mydata$racenew)==F)

samps<-sample(1:length(sub$psu), size = 20000, replace = F)
sub<-sub[samps,] #Using a subset of 20000 samples

#Age
sub$d.age<-ifelse(sub$mortstat==1, sub$mortdody-(sub$year-sub$age) ,
                  ifelse(sub$mortstat==2, 2009-(sub$year-sub$age), NA))
#Mortality Status
#0=Alive, 1=Dead
sub$d.event<-ifelse(sub$mortstat==1,1,0)
#sub$d.event<-as.character(sub$d.event)

sub$timetodeath<-ifelse(sub$mortstat ==1, sub$mortdody-sub$year, 2009 - sub$year )
sub$d5yr<-ifelse(sub$timetodeath<=5&sub$mortstat==1, 1,0)

#Mortalist Status: Dead or Alive
#sub$mortstat<-recode(sub$mortstat, recodes= "1='Dead'; 2='Alive'; 9=NA", as.factor.result=T)

#Education 
sub$moredu<-recode(sub$educrec2, recodes="00=NA; 10:41='LHS'; 42='HS'; 50:53='SomeCollege'; 54:60='College+'; else=NA", as.factor=T, levels = c("LHS", "HS", "SomeCollege", "College+"))
sub$moredu<-relevel(sub$moredu, ref = "LHS")

sub$hs<-ifelse(sub$moredu=='hs or less',1,0)
sub$scol1<-ifelse(sub$moredu=='some coll',1,0)

#Race/Ethnicity
sub$race<-recode(sub$racenew, recodes ="10='wht'; 20 ='blk'; 30:61=NA; 97:99=NA", as.factor=T)
sub$black<-ifelse(sub$race=='blk',1,0)
sub$oth<-ifelse(sub$race=='other',1,0)
 
sub$hisp<-recode(sub$hispyn, recodes="1=0; 2=1; else=NA")

sub$race_eth[sub$hisp == 0 & sub$race=="wht"]<-"NHWhite"

## Warning: Unknown or uninitialised column: 'race_eth'.

sub$race_eth[sub$hisp == 0 & sub$race=="blk"]<-"NHBlack"
# sub$race_eth[sub$hisp == 0 & sub$race=="other"]<-"NHOther"
sub$race_eth[sub$hisp == 1 ]<-"Hispanic"
sub$race_eth[is.na(sub$hisp) ==T | is.na(sub$race)==T]<-NA

sub$race_eth <- as.factor(sub$race_eth)
sub$race_eth<-relevel(sub$race_eth, ref = "NHWhite")

#Marital Status
sub$married<-recode(sub$marstat, recodes="00=NA; 99=NA; 10:13=1; else=0", as.factor=T )
#sub$sep<-ifelse(sub$married=='separated',1,0)
#sub$nm<-ifelse(sub$married=='nevermarried',1,0)
#sub$married<-relevel(sub$married, ref = "1")

#Smoking Status
#sub$smoke<-recode(sub$smokestatus2, recodes="00=NA; 10:13='CurrentSmoker'; 20='FormerSmoker'; 30='NeverSmoked'; else=NA", as.factor=T)
#sub$smoke<-relevel(sub$smoke, ref = "CurrentSmoker")

sub$currsmoke<-Recode(sub$smokestatus2, recodes="0=NA;90=NA; 10:13=1; else=0", as.factor = F)

#Drinking Status
# sub$alcohol<-recode(sub$alcstat2, recodes= "00=NA; 10='NeverDrinker'; 20:23='FormerDrinker'; 31:32='LightDrinker'; 35='LightDrinker'; 33='ModerateDrinker'; 34='HeavyDrinker'; 97:99=NA", as.factor.result=T)
# sub$alcohol<-relevel(sub$alcohol, ref = "HeavyDrinker")

#Drinking Status
sub$alcohol<-recode(sub$alcstat2, recodes= "00=NA; 10='LifetimeAbstainer'; 20:23='FormerDrinker'; 30:32='LightDrinker'; 35='LightDrinker'; 33='ModerateDrinker'; 34='HeavyDrinker'; 40:43=NA; 97:99=NA", as.factor=T, levels = c("LifetimeAbstainer", "FormerDrinker", "LightDrinker", "ModerateDrinker", "HeavyDrinker"))
sub$alcohol<-relevel(sub$alcohol, ref = "HeavyDrinker")


sub$alcohol2<-recode(sub$alcstat1, recodes= "0=NA; 1='LifetimeAbstainer'; 2='FormerDrinker'; 3='CurrentDrinker'; 9=NA", as.factor=T, levels = c("LifetimeAbstainer", "FormerDrinker", "CurrentDrinker"))
sub$alcohol2<-relevel(sub$alcohol2, ref = "LifetimeAbstainer")

sub$male<-ifelse(sub$sex==1,1,0)

sub$mwt<-sub$mortwt/mean(sub$mortwt, na.rm=T)

#Age cut into intervals
sub<-sub[sub$age>=18,]
sub$agec<-cut(sub$age, breaks =seq(15, 100, 5))
sub$age2<-sub$age^2

sub$birth_year<-ifelse(sub$birthyr>=9997, NA, sub$birthyr)

sub$goodhealth<-recode(sub$health, recodes="1:3=0; 4:5=1; else=NA")
sub$bmi_clean<-ifelse(sub$bmicalc%in%c(0,996),NA, sub$bmicalc)
sub$obese<-ifelse(sub$bmi_clean>=30,1,0)

sub$pov<-Recode(sub$pooryn, recodes = "2=1; 1=0; 9=NA")

sub$cohort2<- cut(sub$birth_year, breaks=seq(1915, 2010, 5))

sub<-subset(sub, complete.cases(moredu, pov, cohort2, currsmoke, alcohol, alcohol2, race_eth, hisp, black, oth, birth_year, goodhealth, bmi_clean, obese, age2, d.event, mortstat, timetodeath, year, agec, male, married, mortwt, strata, psu, sampweight, nhispid))

set.seed(1115)

Checking for variation in the the outcome

Global F test for equality of means

fit00<-glm(goodhealth~factor(cohort2), data=sub, family = binomial)
anova(fit00, test="Chisq")

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: goodhealth
## 
## Terms added sequentially (first to last)
## 
## 
##                 Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                             5150     4090.8              
## factor(cohort2) 15   305.21      5135     3785.5 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

fit0<-glm(goodhealth~obese*factor(cohort2), data=sub, family = binomial)
anova(fit0, test="Chisq")

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: goodhealth
## 
## Terms added sequentially (first to last)
## 
## 
##                       Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                                   5150     4090.8              
## obese                  1   52.496      5149     4038.3 4.311e-13 ***
## factor(cohort2)       15  310.489      5134     3727.8 < 2.2e-16 ***
## obese:factor(cohort2) 15   19.212      5119     3708.6    0.2043    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The means are not equal, which means we can run further hierarchical analysis to test the association.

Fitting the random intercept model using individual level predictors

fit1<-glmer(goodhealth~male+moredu+race_eth+obese+alcohol2+currsmoke+married+(1|cohort2), data=sub, family = binomial)
summary(fit1)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## goodhealth ~ male + moredu + race_eth + obese + alcohol2 + currsmoke +  
##     married + (1 | cohort2)
##    Data: sub
## 
##      AIC      BIC   logLik deviance df.resid 
##   3511.3   3596.4  -1742.6   3485.3     5138 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.6549 -0.3905 -0.2597 -0.1668  8.0162 
## 
## Random effects:
##  Groups  Name        Variance Std.Dev.
##  cohort2 (Intercept) 0.5319   0.7293  
## Number of obs: 5151, groups:  cohort2, 16
## 
## Fixed effects:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            -1.26207    0.23327  -5.410 6.29e-08 ***
## male                   -0.08005    0.09345  -0.857  0.39169    
## moreduHS               -0.64709    0.11673  -5.544 2.96e-08 ***
## moreduSomeCollege      -0.94059    0.12984  -7.244 4.36e-13 ***
## moreduCollege+         -1.59023    0.16209  -9.811  < 2e-16 ***
## race_ethHispanic        0.07145    0.13336   0.536  0.59212    
## race_ethNHBlack         0.34739    0.11670   2.977  0.00291 ** 
## obese                   0.58969    0.09571   6.161 7.22e-10 ***
## alcohol2FormerDrinker   0.39550    0.12592   3.141  0.00168 ** 
## alcohol2CurrentDrinker -0.48516    0.11604  -4.181 2.90e-05 ***
## currsmoke               0.58360    0.10670   5.470 4.51e-08 ***
## married1               -0.14282    0.09330  -1.531  0.12583    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) male   mordHS mrdSmC mrdCl+ rc_thH rc_NHB obese  alc2FD
## male        -0.082                                                        
## moreduHS    -0.303  0.038                                                 
## moredSmCllg -0.298  0.053  0.564                                          
## moreduCllg+ -0.229 -0.001  0.463  0.444                                   
## rac_thHspnc -0.263 -0.016  0.241  0.271  0.239                            
## rc_thNHBlck -0.206  0.004  0.065  0.086  0.088  0.266                     
## obese       -0.133  0.030 -0.003  0.006  0.030 -0.025 -0.113              
## alchl2FrmrD -0.212 -0.182 -0.038 -0.056 -0.057  0.082  0.065 -0.013       
## alchl2CrrnD -0.219 -0.224 -0.081 -0.131 -0.153  0.051  0.095 -0.005  0.589
## currsmoke   -0.133 -0.050  0.040  0.082  0.159  0.115  0.006  0.104 -0.104
## married1    -0.164 -0.133 -0.026  0.016 -0.018  0.002  0.156  0.013  0.002
##             alc2CD crrsmk
## male                     
## moreduHS                 
## moredSmCllg              
## moreduCllg+              
## rac_thHspnc              
## rc_thNHBlck              
## obese                    
## alchl2FrmrD              
## alchl2CrrnD              
## currsmoke   -0.184       
## married1     0.016  0.116

1)a. The variance component of the model is 0.6521 which is small enough.

1)b. The negative covariates show that, good health is associated to education, race/ethnicity, obesity status, current alcohol drinking and smoking status, as well as to the marital status. Obesity has a big-statistic, which supports our hypothesis that it significantly affects our outcome of good health.

Fitting randome slopes into the random intercept model

fit2<-glmer(goodhealth~male+moredu+race_eth+obese+alcohol2+currsmoke+married+(1+obese|cohort2), data=sub, family = binomial)

summary(fit2)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## goodhealth ~ male + moredu + race_eth + obese + alcohol2 + currsmoke +  
##     married + (1 + obese | cohort2)
##    Data: sub
## 
##      AIC      BIC   logLik deviance df.resid 
##   3513.5   3611.7  -1741.8   3483.5     5136 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.6706 -0.3955 -0.2578 -0.1649  8.2955 
## 
## Random effects:
##  Groups  Name        Variance Std.Dev. Corr 
##  cohort2 (Intercept) 0.59304  0.7701        
##          obese       0.02332  0.1527   -1.00
## Number of obs: 5151, groups:  cohort2, 16
## 
## Fixed effects:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            -1.27559    0.24191  -5.273 1.34e-07 ***
## male                   -0.08159    0.09345  -0.873  0.38264    
## moreduHS               -0.64648    0.11664  -5.543 2.98e-08 ***
## moreduSomeCollege      -0.93989    0.12970  -7.246 4.28e-13 ***
## moreduCollege+         -1.58375    0.16200  -9.776  < 2e-16 ***
## race_ethHispanic        0.07231    0.13323   0.543  0.58731    
## race_ethNHBlack         0.34461    0.11665   2.954  0.00313 ** 
## obese                   0.62093    0.10486   5.921 3.19e-09 ***
## alcohol2FormerDrinker   0.39518    0.12585   3.140  0.00169 ** 
## alcohol2CurrentDrinker -0.48696    0.11597  -4.199 2.68e-05 ***
## currsmoke               0.58512    0.10662   5.488 4.07e-08 ***
## married1               -0.14069    0.09328  -1.508  0.13151    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) male   mordHS mrdSmC mrdCl+ rc_thH rc_NHB obese  alc2FD
## male        -0.079                                                        
## moreduHS    -0.292  0.038                                                 
## moredSmCllg -0.287  0.052  0.563                                          
## moreduCllg+ -0.221 -0.001  0.462  0.443                                   
## rac_thHspnc -0.253 -0.015  0.240  0.269  0.237                            
## rc_thNHBlck -0.197  0.005  0.066  0.087  0.087  0.267                     
## obese       -0.430  0.029 -0.006  0.003  0.030 -0.025 -0.110              
## alchl2FrmrD -0.204 -0.182 -0.037 -0.055 -0.057  0.082  0.065 -0.021       
## alchl2CrrnD -0.210 -0.224 -0.081 -0.130 -0.153  0.050  0.094 -0.015  0.589
## currsmoke   -0.128 -0.049  0.040  0.080  0.158  0.111  0.004  0.090 -0.103
## married1    -0.158 -0.135 -0.025  0.015 -0.018  0.002  0.156  0.002  0.003
##             alc2CD crrsmk
## male                     
## moreduHS                 
## moredSmCllg              
## moreduCllg+              
## rac_thHspnc              
## rc_thNHBlck              
## obese                    
## alchl2FrmrD              
## alchl2CrrnD              
## currsmoke   -0.183       
## married1     0.016  0.115

2)a. The variance component 0.69797 which is slighly bigger than what we have seen in the random intercept model. The z-values did not change much due the gitting of the random slope into the model. However, the model failed to converge whihc is an issue of concern.

likelihood ratio between models

anova (fit1, fit2)

## Data: sub
## Models:
## fit1: goodhealth ~ male + moredu + race_eth + obese + alcohol2 + currsmoke + 
## fit1:     married + (1 | cohort2)
## fit2: goodhealth ~ male + moredu + race_eth + obese + alcohol2 + currsmoke + 
## fit2:     married + (1 + obese | cohort2)
##      Df    AIC    BIC  logLik deviance  Chisq Chi Df Pr(>Chisq)
## fit1 13 3511.3 3596.4 -1742.6   3485.3                         
## fit2 15 3513.5 3611.7 -1741.8   3483.5 1.7482      2     0.4172

Comparing the AIC values, it is seen that the second model with the random effect of obesity does not seem to fit the data better than the first model without this term in the model.

Intra-class correlations

icc(fit1)

## 
## Generalized linear mixed model
## 
## Family : binomial (logit)
## Formula: goodhealth ~ male + moredu + race_eth + obese + alcohol2 + currsmoke + married + (1 | cohort2)
## 
##   ICC (cohort2): 0.1392

icc(fit2)

## Caution! ICC for random-slope-intercept models usually not meaningful. See 'Note' in `?icc`.

## 
## Generalized linear mixed model
## 
## Family : binomial (logit)
## Formula: goodhealth ~ male + moredu + race_eth + obese + alcohol2 + currsmoke + married + (1 + obese | cohort2)
## 
##   ICC (cohort2): 0.1527

These results suggest that by including the random slope in fit2, we increase the correlation within cohorts.

Test for random effect

#ranova(fit1)
#ranova(fit2)

more model selection

model.sel(fit1, fit2)

## Model selection table 
##       (Int) al2    crr      mal mrr mrd    obs rac_eth random df    logLik
## fit1 -1.262   + 0.5836 -0.08005   +   + 0.5897       +      c 13 -1742.640
## fit2 -1.276   + 0.5851 -0.08159   +   + 0.6209       +  1+o|c 15 -1741.766
##        AICc delta weight
## fit1 3511.4  0.00  0.757
## fit2 3513.6  2.27  0.243
## Models ranked by AICc(x) 
## Random terms: 
## c = '1 | cohort2'
## 1+o|c = '1 + obese | cohort2'

The AIC value suggest that model 2 or ‘fit2’ is not better than the first random intercept model.

#r.squaredGLMM(fit1)
#r.squaredGLMM(fit2)

Visualizations of effects

rancoefs2<-ranef(fit2)

#here are the first 10 
head(rancoefs2$cohort2, n=10)

##             (Intercept)       obese
## (1915,1920]  1.05867825 -0.20993168
## (1920,1925]  1.21040529 -0.24001857
## (1925,1930]  0.47695636 -0.09457855
## (1930,1935]  0.55793925 -0.11063715
## (1935,1940]  0.54016247 -0.10711205
## (1940,1945]  0.77591548 -0.15386095
## (1945,1950]  0.58473508 -0.11595060
## (1950,1955]  0.14317065 -0.02839014
## (1955,1960] -0.09992376  0.01981451
## (1960,1965] -0.33764530  0.06695370

#Histograms of the random effects
par(mfrow=c(1,2))
hist(rancoefs2$cohort2[,"(Intercept)"],xlab= "Intercept Random Effect",  main = "Distribution of Random Intercepts")
hist(rancoefs2$cohort2[,"obese"], xlab= "Obese Slope Random Effect", main = "Distribution of Random Slopes")

par(mfrow=c(1,1))
plot(rancoefs2$cohort2[,"(Intercept)"],rancoefs2$cohort2[,"obese"], main="Random effects for each cohort", xlab="Intercept", ylab="Obesity slope" )

par(mfrow=c(1,1))
fixef(fit2)

##            (Intercept)                   male               moreduHS 
##            -1.27559470            -0.08158876            -0.64648074 
##      moreduSomeCollege         moreduCollege+       race_ethHispanic 
##            -0.93988716            -1.58374697             0.07230720 
##        race_ethNHBlack                  obese  alcohol2FormerDrinker 
##             0.34460921             0.62093226             0.39517676 
## alcohol2CurrentDrinker              currsmoke               married1 
##            -0.48696339             0.58512345            -0.14068857

summary(fixef(fit2)[1]+rancoefs2$cohort2) # I do this to get values for the range for the y axis.

##   (Intercept)           obese       
##  Min.   :-2.24284   Min.   :-1.516  
##  1st Qu.:-2.01207   1st Qu.:-1.388  
##  Median :-1.25397   Median :-1.280  
##  Mean   :-1.26217   Mean   :-1.278  
##  3rd Qu.:-0.71096   3rd Qu.:-1.130  
##  Max.   :-0.06519   Max.   :-1.084

# plot(NULL, ylim=c(-3, 2), xlim=c(-2,2),
#      ylab="General Health", 
#      xlab="Obesity Status") #You may have to change the ylim() for your outcome.
#      title (main="Regression Lines from Random Slope and Intercept Model")
# 
# cols=sample(rainbow(n=25), size = dim(rancoefs2$cohort2)[1], replace = T)
# 
# for (i in 1: dim(rancoefs2$cohort2)[1]){
#     abline(a=fixef(fit2)[1]+rancoefs2$cohort2[[1]][i], 
#            b=fixef(fit2)[16]+rancoefs2$cohort2[[2]][i], 
#            col=cols[i],lwd=.5 )}
# 
# #this line shows the "average" effect of poverty.
# abline(a=fixef(fit2)[1], 
#        b=fixef(fit2)[16], 
#        col=1, lwd=3)
# 
# legend("topright", col=1, lwd=5, legend="Average Effect of Obesity")

DEM 7473 HW 2: Basic Hierarchical Models - Random Intercepts + Random Slopes

Muntasir Masum

September 19, 2018