The outcome of interest here is the mortalist status which is treated as a dichotomous variable and the grouping variable for the analysis is cohorts starting from 1915 to 2010.
library(readr)
library(car)
library(stargazer)
library(survey)
library(survival)
library(cmprsk)
library(questionr)
library(haven)
library(tidyr)
library(labelled)
library(knitr)
library(Hmisc)
library(gmodels)
library(ggplot2)
library(foreign)
library(lme4)
mydata <- read_dta("C://Users/munta/Google Drive/NHIS/nhis_00006.dta")
#names(mydata)
mydata$yob<-mydata$year-mydata$age
#mydata<-subset (mydata, yob%in%c(1915:2010) & year%in%c(2001:2009))
#mydata$cohort<-car::recode(mydata$yob, recodes="1940:1963='early'; 1964:1988='late'; else=NA", as.factor = T)
#Taking a subset of the total observations
sub<-subset(mydata, mydata$mortelig==1&is.na(mydata$racenew)==F)
samps<-sample(1:length(sub$psu), size = 20000, replace = F)
sub<-sub[samps,] #Using a subset of 20000 samples
#Age
sub$d.age<-ifelse(sub$mortstat==1, sub$mortdody-(sub$year-sub$age) ,
ifelse(sub$mortstat==2, 2009-(sub$year-sub$age), NA))
#Mortality Status
#0=Alive, 1=Dead
sub$d.event<-ifelse(sub$mortstat==1,1,0)
#sub$d.event<-as.character(sub$d.event)
sub$timetodeath<-ifelse(sub$mortstat ==1, sub$mortdody-sub$year, 2009 - sub$year )
sub$d5yr<-ifelse(sub$timetodeath<=5&sub$mortstat==1, 1,0)
#Mortalist Status: Dead or Alive
#sub$mortstat<-recode(sub$mortstat, recodes= "1='Dead'; 2='Alive'; 9=NA", as.factor.result=T)
#Education
sub$moredu<-recode(sub$educrec2, recodes="00=NA; 10:41='LHS'; 42='HS'; 50:53='SomeCollege'; 54:60='College+'; else=NA", as.factor=T, levels = c("LHS", "HS", "SomeCollege", "College+"))
sub$moredu<-relevel(sub$moredu, ref = "LHS")
sub$hs<-ifelse(sub$moredu=='hs or less',1,0)
sub$scol1<-ifelse(sub$moredu=='some coll',1,0)
#Race/Ethnicity
sub$race<-recode(sub$racenew, recodes ="10='wht'; 20 ='blk'; 30:61=NA; 97:99=NA", as.factor=T)
sub$black<-ifelse(sub$race=='blk',1,0)
sub$oth<-ifelse(sub$race=='other',1,0)
sub$hisp<-recode(sub$hispyn, recodes="1=0; 2=1; else=NA")
sub$race_eth[sub$hisp == 0 & sub$race=="wht"]<-"NHWhite"
## Warning: Unknown or uninitialised column: 'race_eth'.
sub$race_eth[sub$hisp == 0 & sub$race=="blk"]<-"NHBlack"
# sub$race_eth[sub$hisp == 0 & sub$race=="other"]<-"NHOther"
sub$race_eth[sub$hisp == 1 ]<-"Hispanic"
sub$race_eth[is.na(sub$hisp) ==T | is.na(sub$race)==T]<-NA
sub$race_eth <- as.factor(sub$race_eth)
sub$race_eth<-relevel(sub$race_eth, ref = "NHWhite")
#Marital Status
sub$married<-recode(sub$marstat, recodes="00=NA; 99=NA; 10:13=1; else=0", as.factor=T )
#sub$sep<-ifelse(sub$married=='separated',1,0)
#sub$nm<-ifelse(sub$married=='nevermarried',1,0)
#sub$married<-relevel(sub$married, ref = "1")
#Smoking Status
#sub$smoke<-recode(sub$smokestatus2, recodes="00=NA; 10:13='CurrentSmoker'; 20='FormerSmoker'; 30='NeverSmoked'; else=NA", as.factor=T)
#sub$smoke<-relevel(sub$smoke, ref = "CurrentSmoker")
sub$currsmoke<-Recode(sub$smokestatus2, recodes="0=NA;90=NA; 10:13=1; else=0", as.factor = F)
#Drinking Status
# sub$alcohol<-recode(sub$alcstat2, recodes= "00=NA; 10='NeverDrinker'; 20:23='FormerDrinker'; 31:32='LightDrinker'; 35='LightDrinker'; 33='ModerateDrinker'; 34='HeavyDrinker'; 97:99=NA", as.factor.result=T)
# sub$alcohol<-relevel(sub$alcohol, ref = "HeavyDrinker")
#Drinking Status
sub$alcohol<-recode(sub$alcstat2, recodes= "00=NA; 10='LifetimeAbstainer'; 20:23='FormerDrinker'; 30:32='LightDrinker'; 35='LightDrinker'; 33='ModerateDrinker'; 34='HeavyDrinker'; 40:43=NA; 97:99=NA", as.factor=T, levels = c("LifetimeAbstainer", "FormerDrinker", "LightDrinker", "ModerateDrinker", "HeavyDrinker"))
sub$alcohol<-relevel(sub$alcohol, ref = "HeavyDrinker")
sub$alcohol2<-recode(sub$alcstat1, recodes= "0=NA; 1='LifetimeAbstainer'; 2='FormerDrinker'; 3='CurrentDrinker'; 9=NA", as.factor=T, levels = c("LifetimeAbstainer", "FormerDrinker", "CurrentDrinker"))
sub$alcohol2<-relevel(sub$alcohol2, ref = "LifetimeAbstainer")
sub$male<-ifelse(sub$sex==1,1,0)
sub$mwt<-sub$mortwt/mean(sub$mortwt, na.rm=T)
#Age cut into intervals
sub<-sub[sub$age>=18,]
sub$agec<-cut(sub$age, breaks =seq(15, 100, 5))
sub$age2<-sub$age^2
sub$birth_year<-ifelse(sub$birthyr>=9997, NA, sub$birthyr)
sub<-subset(sub, complete.cases(moredu, currsmoke, alcohol, alcohol2, race_eth, birth_year, age2, d.event, mortstat, timetodeath, year, agec, male, married, mortwt, strata, psu, sampweight, nhispid))
sub$cohort2<- cut(sub$birth_year, breaks=seq(1915, 2010, 5))
set.seed(1115)
fit0<-glm(d.event~factor(cohort2), family = binomial, data=sub)
anova(fit0, test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: d.event
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 6731 3770.1
## factor(cohort2) 15 1125.1 6716 2645.0 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fit1<-glmer(d.event~1+(1|cohort2), data = sub,family=binomial)
summary(fit1)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: d.event ~ 1 + (1 | cohort2)
## Data: sub
##
## AIC BIC logLik deviance df.resid
## 2723.9 2737.5 -1359.9 2719.9 6730
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.4614 -0.2413 -0.1264 -0.0843 14.3823
##
## Random effects:
## Groups Name Variance Std.Dev.
## cohort2 (Intercept) 3.489 1.868
## Number of obs: 6732, groups: cohort2, 16
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.0608 0.4841 -6.323 2.57e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The following model will consider other coviates such as drinking status, marital status, smoking status.
fit2<-glmer(d.event~male+moredu+race_eth+alcohol2+currsmoke+married+(1|cohort2), data=sub, family = binomial)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control
## $checkConv, : Model failed to converge with max|grad| = 0.00473477 (tol =
## 0.001, component 1)
summary(fit2)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## d.event ~ male + moredu + race_eth + alcohol2 + currsmoke + married +
## (1 | cohort2)
## Data: sub
##
## AIC BIC logLik deviance df.resid
## 2654.5 2736.3 -1315.3 2630.5 6720
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.3404 -0.2380 -0.1223 -0.0761 16.4780
##
## Random effects:
## Groups Name Variance Std.Dev.
## cohort2 (Intercept) 3.457 1.859
## Number of obs: 6732, groups: cohort2, 16
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.9417 0.5059 -5.815 6.06e-09 ***
## male 0.5297 0.1124 4.711 2.46e-06 ***
## moreduHS -0.1660 0.1369 -1.212 0.225474
## moreduSomeCollege -0.3317 0.1572 -2.110 0.034847 *
## moreduCollege+ -0.4267 0.1777 -2.402 0.016320 *
## race_ethHispanic -0.1015 0.1813 -0.560 0.575679
## race_ethNHBlack 0.1111 0.1459 0.761 0.446549
## alcohol2FormerDrinker 0.1790 0.1386 1.291 0.196745
## alcohol2CurrentDrinker -0.4455 0.1334 -3.340 0.000837 ***
## currsmoke 0.6605 0.1392 4.746 2.07e-06 ***
## married1 -0.2682 0.1151 -2.330 0.019786 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) male mordHS mrdSmC mrdCl+ rc_thH rc_NHB alc2FD alc2CD
## male -0.073
## moreduHS -0.167 0.056
## moredSmCllg -0.158 0.045 0.535
## moreduCllg+ -0.139 -0.007 0.484 0.460
## rac_thHspnc -0.135 0.015 0.214 0.224 0.208
## rc_thNHBlck -0.109 0.021 0.087 0.096 0.132 0.199
## alchl2FrmrD -0.096 -0.120 -0.050 -0.056 -0.023 0.030 -0.025
## alchl2CrrnD -0.104 -0.183 -0.095 -0.159 -0.172 0.053 0.062 0.530
## currsmoke -0.087 -0.042 0.042 0.088 0.141 0.076 0.031 -0.080 -0.127
## married1 -0.059 -0.241 -0.041 -0.036 -0.030 -0.004 0.137 -0.001 -0.003
## crrsmk
## male
## moreduHS
## moredSmCllg
## moreduCllg+
## rac_thHspnc
## rc_thNHBlck
## alchl2FrmrD
## alchl2CrrnD
## currsmoke
## married1 0.094
## convergence code: 0
## Model failed to converge with max|grad| = 0.00473477 (tol = 0.001, component 1)
The variance component of 4.059 went up when additional variables are included in the model.
anova(fit1, fit2)
## Data: sub
## Models:
## fit1: d.event ~ 1 + (1 | cohort2)
## fit2: d.event ~ male + moredu + race_eth + alcohol2 + currsmoke + married +
## fit2: (1 | cohort2)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## fit1 2 2723.9 2737.5 -1359.9 2719.9
## fit2 12 2654.5 2736.3 -1315.3 2630.5 89.314 10 7.325e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
VarCorr(fit1)$s1_id[1]/((attr(VarCorr(fit1), "sc")^2)+VarCorr(fit1)$s1_id[1])
## numeric(0)
library(sjstats)
icc(fit1)
##
## Generalized linear mixed model
##
## Family : binomial (logit)
## Formula: d.event ~ 1 + (1 | cohort2)
##
## ICC (cohort2): 0.5147
The ICC in the second model goes down to 0.5381
VarCorr(fit2)$s1_id[1]/((attr(VarCorr(fit2), "sc")^2)+VarCorr(fit2)$s1_id[1])
## numeric(0)
icc(fit2)
##
## Generalized linear mixed model
##
## Family : binomial (logit)
## Formula: d.event ~ male + moredu + race_eth + alcohol2 + currsmoke + married + (1 | cohort2)
##
## ICC (cohort2): 0.5124
#require(lattice)
schmeans<-ranef(fit1, condVar=T)
schmeans$cohort2$`(Intercept)`<-schmeans$cohort2$`(Intercept)`+fixef(fit1)[1]
dotplot(schmeans, main=T, ylab="Cohort")
## $cohort2