The outcome of interest here is the mortalist status which is treated as a dichotomous variable and the grouping variable for the analysis is cohorts starting from 1915 to 2010.

Loading Packages

library(readr)
library(car)
library(stargazer)
library(survey)
library(survival)
library(cmprsk)
library(questionr)
library(haven)
library(tidyr)
library(labelled)
library(knitr)
library(Hmisc)
library(gmodels)
library(ggplot2)
library(foreign)
library(lme4)

Loading NHIS Data

mydata <- read_dta("C://Users/munta/Google Drive/NHIS/nhis_00006.dta")
#names(mydata)

Recoding Variables

mydata$yob<-mydata$year-mydata$age

#mydata<-subset (mydata, yob%in%c(1915:2010) & year%in%c(2001:2009))

#mydata$cohort<-car::recode(mydata$yob, recodes="1940:1963='early'; 1964:1988='late'; else=NA", as.factor = T)

#Taking a subset of the total observations
sub<-subset(mydata, mydata$mortelig==1&is.na(mydata$racenew)==F)

samps<-sample(1:length(sub$psu), size = 20000, replace = F)
sub<-sub[samps,] #Using a subset of 20000 samples

#Age
sub$d.age<-ifelse(sub$mortstat==1, sub$mortdody-(sub$year-sub$age) ,
                  ifelse(sub$mortstat==2, 2009-(sub$year-sub$age), NA))
#Mortality Status
#0=Alive, 1=Dead
sub$d.event<-ifelse(sub$mortstat==1,1,0)
#sub$d.event<-as.character(sub$d.event)

sub$timetodeath<-ifelse(sub$mortstat ==1, sub$mortdody-sub$year, 2009 - sub$year )
sub$d5yr<-ifelse(sub$timetodeath<=5&sub$mortstat==1, 1,0)

#Mortalist Status: Dead or Alive
#sub$mortstat<-recode(sub$mortstat, recodes= "1='Dead'; 2='Alive'; 9=NA", as.factor.result=T)

#Education 
sub$moredu<-recode(sub$educrec2, recodes="00=NA; 10:41='LHS'; 42='HS'; 50:53='SomeCollege'; 54:60='College+'; else=NA", as.factor=T, levels = c("LHS", "HS", "SomeCollege", "College+"))
sub$moredu<-relevel(sub$moredu, ref = "LHS")

sub$hs<-ifelse(sub$moredu=='hs or less',1,0)
sub$scol1<-ifelse(sub$moredu=='some coll',1,0)

#Race/Ethnicity
sub$race<-recode(sub$racenew, recodes ="10='wht'; 20 ='blk'; 30:61=NA; 97:99=NA", as.factor=T)
sub$black<-ifelse(sub$race=='blk',1,0)
sub$oth<-ifelse(sub$race=='other',1,0)
 
sub$hisp<-recode(sub$hispyn, recodes="1=0; 2=1; else=NA")

sub$race_eth[sub$hisp == 0 & sub$race=="wht"]<-"NHWhite"

## Warning: Unknown or uninitialised column: 'race_eth'.

sub$race_eth[sub$hisp == 0 & sub$race=="blk"]<-"NHBlack"
# sub$race_eth[sub$hisp == 0 & sub$race=="other"]<-"NHOther"
sub$race_eth[sub$hisp == 1 ]<-"Hispanic"
sub$race_eth[is.na(sub$hisp) ==T | is.na(sub$race)==T]<-NA

sub$race_eth <- as.factor(sub$race_eth)
sub$race_eth<-relevel(sub$race_eth, ref = "NHWhite")

#Marital Status
sub$married<-recode(sub$marstat, recodes="00=NA; 99=NA; 10:13=1; else=0", as.factor=T )
#sub$sep<-ifelse(sub$married=='separated',1,0)
#sub$nm<-ifelse(sub$married=='nevermarried',1,0)
#sub$married<-relevel(sub$married, ref = "1")

#Smoking Status
#sub$smoke<-recode(sub$smokestatus2, recodes="00=NA; 10:13='CurrentSmoker'; 20='FormerSmoker'; 30='NeverSmoked'; else=NA", as.factor=T)
#sub$smoke<-relevel(sub$smoke, ref = "CurrentSmoker")

sub$currsmoke<-Recode(sub$smokestatus2, recodes="0=NA;90=NA; 10:13=1; else=0", as.factor = F)

#Drinking Status
# sub$alcohol<-recode(sub$alcstat2, recodes= "00=NA; 10='NeverDrinker'; 20:23='FormerDrinker'; 31:32='LightDrinker'; 35='LightDrinker'; 33='ModerateDrinker'; 34='HeavyDrinker'; 97:99=NA", as.factor.result=T)
# sub$alcohol<-relevel(sub$alcohol, ref = "HeavyDrinker")

#Drinking Status
sub$alcohol<-recode(sub$alcstat2, recodes= "00=NA; 10='LifetimeAbstainer'; 20:23='FormerDrinker'; 30:32='LightDrinker'; 35='LightDrinker'; 33='ModerateDrinker'; 34='HeavyDrinker'; 40:43=NA; 97:99=NA", as.factor=T, levels = c("LifetimeAbstainer", "FormerDrinker", "LightDrinker", "ModerateDrinker", "HeavyDrinker"))
sub$alcohol<-relevel(sub$alcohol, ref = "HeavyDrinker")


sub$alcohol2<-recode(sub$alcstat1, recodes= "0=NA; 1='LifetimeAbstainer'; 2='FormerDrinker'; 3='CurrentDrinker'; 9=NA", as.factor=T, levels = c("LifetimeAbstainer", "FormerDrinker", "CurrentDrinker"))
sub$alcohol2<-relevel(sub$alcohol2, ref = "LifetimeAbstainer")

sub$male<-ifelse(sub$sex==1,1,0)

sub$mwt<-sub$mortwt/mean(sub$mortwt, na.rm=T)

#Age cut into intervals
sub<-sub[sub$age>=18,]
sub$agec<-cut(sub$age, breaks =seq(15, 100, 5))
sub$age2<-sub$age^2

sub$birth_year<-ifelse(sub$birthyr>=9997, NA, sub$birthyr)

sub<-subset(sub, complete.cases(moredu, currsmoke, alcohol, alcohol2, race_eth, birth_year, age2, d.event, mortstat, timetodeath, year, agec, male, married, mortwt, strata, psu, sampweight, nhispid))

sub$cohort2<- cut(sub$birth_year, breaks=seq(1915, 2010, 5))
set.seed(1115)

Checking for variation in the the outcome

Global F test for equality of means

fit0<-glm(d.event~factor(cohort2), family = binomial, data=sub)
anova(fit0, test="Chisq")

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: d.event
## 
## Terms added sequentially (first to last)
## 
## 
##                 Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                             6731     3770.1              
## factor(cohort2) 15   1125.1      6716     2645.0 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The means are not equal, which means we can run further tests and there’s a reason to not do regular regression analysis to test the association.

Fitting the random intercept model

“null” model fit with no predictors for height

fit1<-glmer(d.event~1+(1|cohort2), data = sub,family=binomial)
summary(fit1)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: d.event ~ 1 + (1 | cohort2)
##    Data: sub
## 
##      AIC      BIC   logLik deviance df.resid 
##   2723.9   2737.5  -1359.9   2719.9     6730 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.4614 -0.2413 -0.1264 -0.0843 14.3823 
## 
## Random effects:
##  Groups  Name        Variance Std.Dev.
##  cohort2 (Intercept) 3.489    1.868   
## Number of obs: 6732, groups:  cohort2, 16
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -3.0608     0.4841  -6.323 2.57e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The variance component is 3.832.

The following model will consider other coviates such as drinking status, marital status, smoking status.

fit2<-glmer(d.event~male+moredu+race_eth+alcohol2+currsmoke+married+(1|cohort2), data=sub, family = binomial)

## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control
## $checkConv, : Model failed to converge with max|grad| = 0.00473477 (tol =
## 0.001, component 1)

summary(fit2)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## d.event ~ male + moredu + race_eth + alcohol2 + currsmoke + married +  
##     (1 | cohort2)
##    Data: sub
## 
##      AIC      BIC   logLik deviance df.resid 
##   2654.5   2736.3  -1315.3   2630.5     6720 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.3404 -0.2380 -0.1223 -0.0761 16.4780 
## 
## Random effects:
##  Groups  Name        Variance Std.Dev.
##  cohort2 (Intercept) 3.457    1.859   
## Number of obs: 6732, groups:  cohort2, 16
## 
## Fixed effects:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             -2.9417     0.5059  -5.815 6.06e-09 ***
## male                     0.5297     0.1124   4.711 2.46e-06 ***
## moreduHS                -0.1660     0.1369  -1.212 0.225474    
## moreduSomeCollege       -0.3317     0.1572  -2.110 0.034847 *  
## moreduCollege+          -0.4267     0.1777  -2.402 0.016320 *  
## race_ethHispanic        -0.1015     0.1813  -0.560 0.575679    
## race_ethNHBlack          0.1111     0.1459   0.761 0.446549    
## alcohol2FormerDrinker    0.1790     0.1386   1.291 0.196745    
## alcohol2CurrentDrinker  -0.4455     0.1334  -3.340 0.000837 ***
## currsmoke                0.6605     0.1392   4.746 2.07e-06 ***
## married1                -0.2682     0.1151  -2.330 0.019786 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) male   mordHS mrdSmC mrdCl+ rc_thH rc_NHB alc2FD alc2CD
## male        -0.073                                                        
## moreduHS    -0.167  0.056                                                 
## moredSmCllg -0.158  0.045  0.535                                          
## moreduCllg+ -0.139 -0.007  0.484  0.460                                   
## rac_thHspnc -0.135  0.015  0.214  0.224  0.208                            
## rc_thNHBlck -0.109  0.021  0.087  0.096  0.132  0.199                     
## alchl2FrmrD -0.096 -0.120 -0.050 -0.056 -0.023  0.030 -0.025              
## alchl2CrrnD -0.104 -0.183 -0.095 -0.159 -0.172  0.053  0.062  0.530       
## currsmoke   -0.087 -0.042  0.042  0.088  0.141  0.076  0.031 -0.080 -0.127
## married1    -0.059 -0.241 -0.041 -0.036 -0.030 -0.004  0.137 -0.001 -0.003
##             crrsmk
## male              
## moreduHS          
## moredSmCllg       
## moreduCllg+       
## rac_thHspnc       
## rc_thNHBlck       
## alchl2FrmrD       
## alchl2CrrnD       
## currsmoke         
## married1     0.094
## convergence code: 0
## Model failed to converge with max|grad| = 0.00473477 (tol = 0.001, component 1)

The variance component of 4.059 went up when additional variables are included in the model.

Compare these two fits using a Likelihood ratio test

anova(fit1, fit2)

## Data: sub
## Models:
## fit1: d.event ~ 1 + (1 | cohort2)
## fit2: d.event ~ male + moredu + race_eth + alcohol2 + currsmoke + married + 
## fit2:     (1 | cohort2)
##      Df    AIC    BIC  logLik deviance  Chisq Chi Df Pr(>Chisq)    
## fit1  2 2723.9 2737.5 -1359.9   2719.9                             
## fit2 12 2654.5 2736.3 -1315.3   2630.5 89.314     10  7.325e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The second model is a better fit than the first.

Variance Components and ICC

VarCorr(fit1)$s1_id[1]/((attr(VarCorr(fit1), "sc")^2)+VarCorr(fit1)$s1_id[1])

## numeric(0)

library(sjstats)
icc(fit1)

## 
## Generalized linear mixed model
## 
## Family : binomial (logit)
## Formula: d.event ~ 1 + (1 | cohort2)
## 
##   ICC (cohort2): 0.5147

The ICC in the second model goes down to 0.5381

VarCorr(fit2)$s1_id[1]/((attr(VarCorr(fit2), "sc")^2)+VarCorr(fit2)$s1_id[1])

## numeric(0)

icc(fit2)

## 
## Generalized linear mixed model
## 
## Family : binomial (logit)
## Formula: d.event ~ male + moredu + race_eth + alcohol2 + currsmoke + married + (1 | cohort2)
## 
##   ICC (cohort2): 0.5124

The ICC in the second model went up from 0.5381 to 0.5523. It means that the variability has increased by controlling for additional characteristics.

#require(lattice)
schmeans<-ranef(fit1, condVar=T)
schmeans$cohort2$`(Intercept)`<-schmeans$cohort2$`(Intercept)`+fixef(fit1)[1]
dotplot(schmeans, main=T, ylab="Cohort")

## $cohort2

DEM 7473 HW 1: Basic Hierarchical Models - Random Intercepts

Muntasir Masum

September 19, 2018