This example will illustrate how to fit the discrete time hazard model with group-level frailty to continuous duration data (i.e. person-level data) and a discrete-time (longitudinal) data set. In this example, I will use two data sets, first:

The longitudinal data example uses data from the ECLS-K. Specifically, we will examine the transition into poverty between kindergarten and 8th grade.

The second example uses the event of a child dying before age 5 in the DHS model data file. The data for this example come from the model.data Demographic and Health Survey for 2012 birth history recode file. This file contains information for all births to women in the survey.

Longitudinal Data

As in the other examples, I illustrate fitting these models to data that are longitudinal, instead of person-duration. In this example, we will examine how to fit the Cox model to a longitudinally collected data set.

#Load required libraries
library(foreign)
library(survival)
library(car)
library(survey)
library(lme4)
library(arm)

First we load our data

load("~/Google Drive/dem7223/class2016/data/eclsk.Rdata")
names(eclsk)<-tolower(names(eclsk))
#get out only the variables I'm going to use for this example
myvars<-c( "childid","gender", "race", "r1_kage","r3age", "r4age",
           "r5age", "r6age", "r7age","c1r4mtsc", "c4r4mtsc",
           "c5r4mtsc", "c6r4mtsc", "c7r4mtsc","wkpov_r", "w1povrty",
           "w3povrty", "w5povrty", "w8povrty","wkmomed","w1momed",
           "w3momed", "w5momed", "w8momed", "s2_id", "c1_7fp0",
           "c17fpstr", "c17fppsu")
eclsk<-eclsk[,myvars]


eclsk$age1<-ifelse(eclsk$r1_kage==-9, NA, eclsk$r1_kage/12)
eclsk$age2<-ifelse(eclsk$r3age==-9, NA, eclsk$r3age/12)
#for the later waves, the NCES group the ages into ranges of months, so 1= <105 months, 2=105 to 108 months. So, I fix the age at the midpoint of the interval they give, and make it into years by dividing by 12
eclsk$age3<-recode(eclsk$r5age,recodes="1=105; 2=107; 3=109; 4=112; 5=115; 6=117; -9=NA")/12
eclsk$age4<-recode(eclsk$r7age,recodes="1=155; 2=166; 3=172; 4=178; 5=192; -9=NA")/12


eclsk$pov1<-ifelse(eclsk$wkpov_r==1,1,0)
eclsk$pov2<-ifelse(eclsk$w3povrty==1,1,0)
eclsk$pov3<-ifelse(eclsk$w5povrty==1,1,0)
eclsk$pov4<-ifelse(eclsk$w8povrty==1,1,0)

#mother's education-time varying > hs ==1
eclsk$momedu1<-recode(eclsk$wkmomed, recodes = "1:2='primarysch';3='highschool';4:5='jcollege'; 6:9='college'; else =NA", as.factor.result = T);
eclsk$momedu1<-relevel(eclsk$momedu1, ref = "highschool") 
eclsk$momedu2<-recode(eclsk$w3momed, recodes = "1:2='primarysch';3='highschool';4:5='jcollege'; 6:9='college'; else =NA", as.factor.result = T);
eclsk$momedu2<-relevel(eclsk$momedu2, ref = "highschool")
eclsk$momedu3<-recode(eclsk$w5momed, recodes = "1:2='primarysch';3='highschool';4:5='jcollege'; 6:9='college'; else =NA", as.factor.result = T);
eclsk$momedu3<-relevel(eclsk$momedu3, ref = "highschool") 


#Recode race with white, non Hispanic as reference using dummy vars
eclsk$hisp<-recode (eclsk$race, recodes="3:4=1;-9=NA; else=0")
eclsk$black<-recode (eclsk$race, recodes="2=1;-9=NA; else=0")
eclsk$asian<-recode (eclsk$race, recodes="5=1;-9=NA; else=0")
eclsk$nahn<-recode (eclsk$race, recodes="6:7=1;-9=NA; else=0")
eclsk$other<-recode (eclsk$race, recodes="8=1;-9=NA; else=0")
eclsk$race_gr<-recode(eclsk$race, recodes="3:4='hisp'; 2='nh black'; 5='nh asian'; 6:7='nahn'; 8='other'; 1='nh white'; else=NA", as.factor.result = T)
eclsk$race_gr<-relevel(eclsk$race_gr, ref = 'nh white')
eclsk$male<-recode(eclsk$gender, recodes="1=1; 2=0; -9=NA")

Now, I need to form the transition variable, this is my event variable, and in this case it will be 1 if a child enters poverty between the first wave of the data and the third grade wave, and 0 otherwise. NOTE I need to remove any children who are already in poverty age wave 1, because they are not at risk of experiencing this particular transition.

eclsk<-subset(eclsk, is.na(pov1)==F&is.na(pov2)==F&is.na(pov3)==F&is.na(age1)==F&is.na(age2)==F&is.na(age3)==F&pov1!=1&is.na(eclsk$c1_7fp0)==F&is.na(eclsk$s2_id)==F)

#make an id that is the combination of state and strata 
eclsk$sampleid<-paste(eclsk$c17fpstr, eclsk$c17fppsu)
#within each sampling unit, sum the weights
wts<-tapply(eclsk$c1_7fp0,eclsk$sampleid,sum)
#make a data frame from this
wts<-data.frame(id=names(unlist(wts)), wt=unlist(wts))
#get the unique sampling location ids'
t1<-as.data.frame(table(eclsk$sampleid))
#put all of this into a data set
wts2<-data.frame(ids=wts$id, sumwt=wts$wt, jn=t1$Freq)
#merge all of this back to the original data file
eclsk2<-merge(eclsk, wts2, by.x="sampleid", by.y="ids", all.x=T)
#In the new data set, multiply the original weight by the fraction of the
#sampling unit total population each person represents
eclsk2$swts<-eclsk2$c1_7fp0*(eclsk2$jn/eclsk2$sumwt)

Now we do the entire data set. To analyze data longitudinally, we need to reshape the data from the current “wide” format (repeated measures in columns) to a “long” format (repeated observations in rows). The reshape() function allows us to do this easily. It allows us to specify our repeated measures, time varying covariates as well as time-constant covariates.

e.long<-reshape(eclsk2, idvar="childid", varying=list(age=c("age1","age2", "age3"), age2=c("age2", "age3", "age4"), momedu=c("momedu1", "momedu2", "momedu3")), times=1:3, direction="long" , v.names = c("age_enter", "age_exit", "momedu"))
e.long<-e.long[order(e.long$childid, e.long$time),]
head(e.long)

##            sampleid  childid gender race r1_kage r3age r4age r5age r6age
## 0015006C.1     77 1 0015006C      1    1    76.8 87.83 95.53     6    NA
## 0015006C.2     77 1 0015006C      1    1    76.8 87.83 95.53     6    NA
## 0015006C.3     77 1 0015006C      1    1    76.8 87.83 95.53     6    NA
## 0015014C.1     77 1 0015014C      2    1    64.2 75.23 82.93     2     2
## 0015014C.2     77 1 0015014C      2    1    64.2 75.23 82.93     2     2
## 0015014C.3     77 1 0015014C      2    1    64.2 75.23 82.93     2     2
##            r7age c1r4mtsc c4r4mtsc c5r4mtsc c6r4mtsc c7r4mtsc wkpov_r
## 0015006C.1     4   56.187   51.941   52.684       NA   50.263       2
## 0015006C.2     4   56.187   51.941   52.684       NA   50.263       2
## 0015006C.3     4   56.187   51.941   52.684       NA   50.263       2
## 0015014C.1     2   57.739   55.178   51.977   59.293   60.001       2
## 0015014C.2     2   57.739   55.178   51.977   59.293   60.001       2
## 0015014C.3     2   57.739   55.178   51.977   59.293   60.001       2
##            w1povrty w3povrty w5povrty w8povrty wkmomed w1momed w3momed
## 0015006C.1        2        2        2        2       5       5       6
## 0015006C.2        2        2        2        2       5       5       6
## 0015006C.3        2        2        2        2       5       5       6
## 0015014C.1        2        2        2        2       5       6       6
## 0015014C.2        2        2        2        2       5       6       6
## 0015014C.3        2        2        2        2       5       6       6
##            w5momed w8momed s2_id c1_7fp0 c17fpstr c17fppsu pov1 pov2 pov3
## 0015006C.1       8       8  0015  934.11       77        1    0    0    0
## 0015006C.2       8       8  0015  934.11       77        1    0    0    0
## 0015006C.3       8       8  0015  934.11       77        1    0    0    0
## 0015014C.1       6       6  0015  129.55       77        1    0    0    0
## 0015014C.2       6       6  0015  129.55       77        1    0    0    0
## 0015014C.3       6       6  0015  129.55       77        1    0    0    0
##            pov4 hisp black asian nahn other  race_gr male    sumwt jn
## 0015006C.1    0    0     0     0    0     0 nh white    1 13743.12 58
## 0015006C.2    0    0     0     0    0     0 nh white    1 13743.12 58
## 0015006C.3    0    0     0     0    0     0 nh white    1 13743.12 58
## 0015014C.1    0    0     0     0    0     0 nh white    0 13743.12 58
## 0015014C.2    0    0     0     0    0     0 nh white    0 13743.12 58
## 0015014C.3    0    0     0     0    0     0 nh white    0 13743.12 58
##                swts time age_enter  age_exit   momedu
## 0015006C.1 3.942218    1  6.400000  7.319167 jcollege
## 0015006C.2 3.942218    2  7.319167  9.750000  college
## 0015006C.3 3.942218    3  9.750000 14.833333  college
## 0015014C.1 0.546739    1  5.350000  6.269167 jcollege
## 0015014C.2 0.546739    2  6.269167  8.916667  college
## 0015014C.3 0.546739    3  8.916667 13.833333  college

Since I have 3 risk periods this time, the previous way of coding the transitions has become burdensome. So, instead I hard code it. To do this, I initialize the povtran variable as missing, then hard code the specific transitions I’m interested in. This lets me identify when values between waves stay the same, versus when they change. Also, since I’m initilizing the variable as missing, I dont’ have to recode the missing values as I’ve done in previous examples.

#find which kids failed in the first time period and remove them from the second risk period risk set
e.long$povtran<-NA

e.long$povtran[e.long$pov1==0&e.long$pov2==1&e.long$time==1]<-1
e.long$povtran[e.long$pov2==0&e.long$pov3==1&e.long$time==2]<-1
e.long$povtran[e.long$pov3==0&e.long$pov4==1&e.long$time==3]<-1

e.long$povtran[e.long$pov1==0&e.long$pov2==0&e.long$time==1]<-0
e.long$povtran[e.long$pov2==0&e.long$pov3==0&e.long$time==2]<-0
e.long$povtran[e.long$pov3==0&e.long$pov4==0&e.long$time==3]<-0

#find which kids failed in earlier time periods and remove them from the second & third period risk set
failed1<-which(is.na(e.long$povtran)==T)
e.long<-e.long[-failed1,]

#round age to the nearest year
e.long$age_enter_r<-round(e.long$age_enter, 0)
e.long$age_exit_r<-round(e.long$age_exit, 0)

head(e.long[, c("childid","time", "age_enter", "age_exit", "pov1", "pov2", "pov3", "time", "povtran")], n=10)

##             childid time age_enter  age_exit pov1 pov2 pov3 time.1 povtran
## 0015006C.1 0015006C    1  6.400000  7.319167    0    0    0      1       0
## 0015006C.2 0015006C    2  7.319167  9.750000    0    0    0      2       0
## 0015006C.3 0015006C    3  9.750000 14.833333    0    0    0      3       0
## 0015014C.1 0015014C    1  5.350000  6.269167    0    0    0      1       0
## 0015014C.2 0015014C    2  6.269167  8.916667    0    0    0      2       0
## 0015014C.3 0015014C    3  8.916667 13.833333    0    0    0      3       0
## 0015019C.1 0015019C    1  5.685833  6.605833    0    0    0      1       0
## 0015019C.2 0015019C    2  6.605833  9.333333    0    0    0      2       0
## 0015019C.3 0015019C    3  9.333333 14.333333    0    0    0      3       0
## 0015020C.1 0015020C    1  6.583333  7.802500    0    0    0      1       0

Basic Discrete time model

First, we fit the basic discrete time model for our outcome. Here I’m going to use the new standardized weights I created above in a regular glm() model:

fit1<-glm(povtran~as.factor(time)+momedu+race_gr-1, data=e.long, weights = swts, family=binomial(link="cloglog"))

## Warning: non-integer #successes in a binomial glm!

arm::display(fit1, detail=T)

## glm(formula = povtran ~ as.factor(time) + momedu + race_gr - 
##     1, family = binomial(link = "cloglog"), data = e.long, weights = swts)
##                  coef.est coef.se z value Pr(>|z|)
## as.factor(time)1  -2.47     0.15  -16.07    0.00  
## as.factor(time)2  -2.81     0.18  -15.55    0.00  
## as.factor(time)3  -3.00     0.19  -15.47    0.00  
## momeducollege     -2.64     0.34   -7.72    0.00  
## momedujcollege    -0.75     0.17   -4.55    0.00  
## momeduprimarysch   0.61     0.22    2.78    0.01  
## race_grhisp        0.65     0.19    3.41    0.00  
## race_grnahn        0.99     0.31    3.15    0.00  
## race_grnh asian    0.35     0.42    0.85    0.39  
## race_grnh black    0.79     0.21    3.74    0.00  
## race_grother       0.24     0.42    0.56    0.57  
## ---
##   n = 4757, k = 11
##   residual deviance = 1394.0, null deviance = 9233.2 (difference = 7839.2)

Discrete time model with shared frailty

For the discrete time model, if the logit link is used, then we are effectively fitting a multilevel model for our outcome. The model with only a group frailty component will have the exact same form as the multilevel logit model with a random intecept at the group level: Fit the basic random intercept model using the complementary log-log link function: \(log(-log(1-h(t))) = \beta_{0j} +x'\beta +Z\gamma'\)

with

\(\beta_{0j} = \beta_0 + Z\gamma'+ u_j\)

and

\(u_j\sim N(0, \sigma_u^2)\)

Where the intercepts (\(u_j\)) for each group vary randomly around the overall mean (\(\beta_0\)).

The individual level predictors are incorporated into \(x\), while the group level predictors (if any are measured) are included in \(Z\). If only a random intercept is specified, then \(Z\) is a vector of 1’s.

#this will take a lot longer to fit compared to the ordinary logit
#I also had to include some extra optimization details because the default maximizer didn't return a satisfactory model fit criteria.

fit2<-glmer(povtran~as.factor(time)+momedu+race_gr-1+(1|s2_id),
            data=e.long, weights = swts, family=binomial(link="cloglog"), 
            control=glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=2e5)))

## Warning: non-integer #successes in a binomial glm!

arm::display(fit2, detail=T)

## glmer(formula = povtran ~ as.factor(time) + momedu + race_gr - 
##     1 + (1 | s2_id), data = e.long, family = binomial(link = "cloglog"), 
##     control = glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05)), 
##     weights = swts)
##                  coef.est coef.se z value Pr(>|z|)
## as.factor(time)1  -3.27     0.25  -13.00    0.00  
## as.factor(time)2  -3.37     0.26  -12.99    0.00  
## as.factor(time)3  -3.56     0.27  -13.16    0.00  
## momeducollege     -2.63     0.39   -6.79    0.00  
## momedujcollege    -0.71     0.20   -3.60    0.00  
## momeduprimarysch   0.97     0.28    3.42    0.00  
## race_grhisp        0.55     0.25    2.20    0.03  
## race_grnahn        0.90     0.47    1.90    0.06  
## race_grnh asian    0.67     0.53    1.27    0.20  
## race_grnh black    1.08     0.30    3.57    0.00  
## race_grother      -0.08     0.56   -0.15    0.88  
## 
## Error terms:
##  Groups   Name        Std.Dev.
##  s2_id    (Intercept) 1.32    
##  Residual             1.00    
## ---
## number of obs: 4757, groups: s2_id, 273
## AIC = 1238.9, DIC = 833
## deviance = 1024.0

AIC comparison of the two models:

AIC(fit1) #Regular GLM

## [1] 1441.312

AIC(fit2) #GLMM

## [1] 1238.944

DHS data

load the data

#load the data
model.dat<-read.dta("/Users/ozd504/Google Drive/dem7223/class2016//data/zzbr62dt/ZZBR62FL.DTA", convert.factors = F)

Subset and calculate variables

#We form a subset of variables
sub<-data.frame(CASEID=model.dat$caseid,kidid=paste(model.dat$caseid, model.dat$bidx, sep="-"), v008=model.dat$v008,bord=model.dat$bidx,csex=model.dat$b4,b2=model.dat$b2, b3=model.dat$b3, b5=model.dat$b5, b7=model.dat$b7, ibint=model.dat$b11, rural=model.dat$v025, educ=model.dat$v106,age=model.dat$v012,partneredu=model.dat$v701,partnerage=model.dat$v730, hhses=model.dat$v190, weight=model.dat$v005/1000000, psu=model.dat$v021, strata=model.dat$v022, region=model.dat$v023)


sub$death.age<-ifelse(sub$b5==1,
                          ((((sub$v008))+1900)-(((sub$b3))+1900)) 
                          ,sub$b7)+1

#censoring indicator for death by age 5, in months (<=60 months)
sub$d.event<-ifelse(is.na(sub$b7)==T|sub$b7>60,0,1)
sub$d.eventfac<-factor(sub$d.event); levels(sub$d.eventfac)<-c("Alive at Age 5", "Dead by Age 5")
table(sub$d.eventfac)

## 
## Alive at Age 5  Dead by Age 5 
##          19224           4442

#recodes
sub$male<-ifelse(sub$csex==1,1,0)
sub$educ.high<-ifelse(sub$educ %in% c(2,3), 1, 0)
sub$age2<-sub$age^2
sub$partnerhiedu<-ifelse(sub$partneredu<3,0,ifelse(sub$partneredu%in%c(8,9),NA,1 ))
sub$hises<-ifelse(sub$hhses>3, 1,0)
sub$rural<-ifelse(sub$rural==2,1,0 )

Create the person-period file

The distinction between the way we have been doing things and the discrete time model, is that we treat time discretely, versus continuously. This means that we transform the data from the case-duration data format to the person-period format. For this example, a natural choice would be year, since we have 5 intervals of equal length (12 months each). R provides a useful function called survSplit() in the survival library that will split a continuous duration into discrete periods.

#make person period file
pp<-survSplit(Surv(death.age, d.event)~. , data = sub, cut=seq(12,60,12), episode="year_age")
pp<-pp[order(pp$kidid, pp$year_age),]
head(pp[, c("kidid", "death.age", "d.event", "year_age", "male", "hises")], n=20)

##                kidid death.age d.event year_age male hises
## 1          1  1  2-1         6       0        1    0     0
## 2          1  1  2-2        12       0        1    1     0
## 3          1  1  2-2        24       0        2    1     0
## 4          1  1  2-2        36       0        3    1     0
## 5          1  1  2-2        38       0        4    1     0
## 6          1  1  2-3        12       0        1    1     0
## 7          1  1  2-3        24       0        2    1     0
## 8          1  1  2-3        36       0        3    1     0
## 9          1  1  2-3        48       0        4    1     0
## 10         1  1  2-3        60       0        5    1     0
## 11         1  1  2-3        92       0        6    1     0
## 12         1  1  2-4        12       0        1    1     0
## 13         1  1  2-4        24       0        2    1     0
## 14         1  1  2-4        25       1        3    1     0
## 15         1  1  2-5        12       0        1    1     0
## 16         1  1  2-5        24       0        2    1     0
## 17         1  1  2-5        36       0        3    1     0
## 18         1  1  2-5        48       0        4    1     0
## 19         1  1  2-5        60       0        5    1     0
## 20         1  1  2-5       151       0        6    1     0

We see that each child is not in the data for multiple “risk periods”, until they experience the event (death) or age out of the risk set (year 6 in this case).

Discrete time model

So, the best thing about the discrete time model, is that it’s just logistic regression. Each risk period is treated as a single Bernoulli trial, and the child can either fail (y=1) or not (y=0) in the period. This is how we get the hazard of the event, as the estimated probability of failure in each discrete time period. So, any method you would like to use to model this probability would probably work (logit, probit models), but I will show two standard approaches. We will use the complementary log-log link. This is used because it preserves the proportional hazards property of the model, as in the Cox model.

fitd1<-glm( d.event~factor(year_age)+bord+male+educ.high+I(age/5)+I(hhses>3)-1,family=binomial(link="cloglog"),
                data=pp, weights=weight)

## Warning: non-integer #successes in a binomial glm!

arm::display(fitd1, detail=T)

## glm(formula = d.event ~ factor(year_age) + bord + male + educ.high + 
##     I(age/5) + I(hhses > 3) - 1, family = binomial(link = "cloglog"), 
##     data = pp, weights = weight)
##                   coef.est coef.se z value Pr(>|z|)
## factor(year_age)1  -2.25     0.08  -29.66    0.00  
## factor(year_age)2  -3.47     0.08  -41.73    0.00  
## factor(year_age)3  -3.95     0.09  -44.07    0.00  
## factor(year_age)4  -4.73     0.11  -44.40    0.00  
## factor(year_age)5  -5.12     0.12  -42.24    0.00  
## factor(year_age)6  -5.35     0.13  -40.11    0.00  
## bord                0.17     0.01   23.97    0.00  
## male                0.07     0.03    2.44    0.01  
## educ.high          -0.05     0.05   -0.90    0.37  
## I(age/5)           -0.05     0.01   -4.68    0.00  
## I(hhses > 3)TRUE   -0.18     0.04   -5.20    0.00  
## ---
##   n = 106587, k = 11
##   residual deviance = 30683.3, null deviance = 203511.8 (difference = 172828.5)

Discrete time Model with frailty

This again, is just a GLMM with a complementary log-log link, following the example from above

fitd2<-glmer(d.event~year_age+bord+male+educ.high+I(age/5)+I(hhses>3)+(1|region), family=binomial(link="cloglog") , data=pp, weights=weight,
            control=glmerControl(optimizer="bobyqa",optCtrl=list(maxfun=2e5)))

## Warning: non-integer #successes in a binomial glm!

arm::display(fitd2, detail=T)

## glmer(formula = d.event ~ year_age + bord + male + educ.high + 
##     I(age/5) + I(hhses > 3) + (1 | region), data = pp, family = binomial(link = "cloglog"), 
##     control = glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e+05)), 
##     weights = weight)
##                  coef.est coef.se z value Pr(>|z|)
## (Intercept)       -1.47     0.09  -16.09    0.00  
## year_age          -0.78     0.02  -50.09    0.00  
## bord               0.17     0.01   23.27    0.00  
## male               0.07     0.03    2.17    0.03  
## educ.high         -0.05     0.06   -0.87    0.39  
## I(age/5)          -0.06     0.01   -5.08    0.00  
## I(hhses > 3)TRUE  -0.20     0.05   -4.50    0.00  
## 
## Error terms:
##  Groups   Name        Std.Dev.
##  region   (Intercept) 0.10    
##  Residual             1.00    
## ---
## number of obs: 106587, groups: region, 8
## AIC = 27924.5, DIC = 33725
## deviance = 30816.7

AIC(fitd1)

## [1] 30142.02

AIC(fitd2)

## [1] 27924.47

DEM 7223 - Example 8 - Discrete Time hazard model with frailty

Corey Sparks, PhD

November 9, 2016