This example will illustrate how to fit the discrete time hazard model to longitudinal and continuous duration data (i.e. person-level data).

The first example will use as its outcome variable, the event of a child dying before age 5. The data for this example come from the model.data Demographic and Health Survey for 2012 children’s recode file. This file contains information for all births in the last 5 years prior to the survey.

The longitudinal data example uses data from the ECLS-K. Specifically, we will examine the transition into poverty between kindergarten and 8th grade.

#Load required libraries
library(foreign)
library(survival)
library(car)
library(survey)

Using Longitudinal Data

As in the other examples, I illustrate fitting these models to data that are longitudinal, instead of person-duration. In this example, we will examine how to fit the Cox model to a longitudinally collected data set.

load("~/Google Drive/dem7223/class2016//data/eclsk.Rdata")
names(eclsk)<-tolower(names(eclsk))
#get out only the variables I'm going to use for this example
myvars<-c( "childid","gender", "race", "r1_kage","r3age", "r4age", "r5age", "r6age", "r7age", "wkpov_r","w1povrty","w3povrty", "w5povrty", "w8povrty","wkmomed","w1momed", "w3momed", "w5momed", "w8momed","s2_id", "c1_7fp0", "c17fpstr", "c17fppsu")
eclsk<-eclsk[,myvars]


eclsk$age1<-ifelse(eclsk$r1_kage==-9, NA, eclsk$r1_kage/12)
eclsk$age2<-ifelse(eclsk$r3age==-9, NA, eclsk$r4age/12)
#for the later waves, the NCES group the ages into ranges of months, so 1= <105 months, 2=105 to 108 months. So, I fix the age at the midpoint of the interval they give, and make it into years by dividing by 12
eclsk$age3<-recode(eclsk$r5age,recodes="1=105; 2=107; 3=109; 4=112; 5=115; 6=117; -9=NA")/12
eclsk$age4<-recode(eclsk$r7age,recodes="1=155; 2=166; 3=172; 4=178; 5=192; -9=NA")/12


eclsk$pov1<-ifelse(eclsk$wkpov_r==1,1,0)
eclsk$pov2<-ifelse(eclsk$w3povrty==1,1,0)
eclsk$pov3<-ifelse(eclsk$w5povrty==1,1,0)
eclsk$pov4<-ifelse(eclsk$w8povrty==1,1,0)


#mother's education-time varying > hs ==1
eclsk$momedu1<-recode(eclsk$wkmomed, recodes = "1:2='primarysch';3='highschool';4:5='jcollege'; 6:9='college'; else =NA", as.factor.result = T); eclsk$momedu1<-relevel(eclsk$momedu1, ref = "highschool") 
eclsk$momedu2<-recode(eclsk$w3momed, recodes = "1:2='primarysch';3='highschool';4:5='jcollege'; 6:9='college'; else =NA", as.factor.result = T); eclsk$momedu2<-relevel(eclsk$momedu2, ref = "highschool")
eclsk$momedu3<-recode(eclsk$w5momed, recodes = "1:2='primarysch';3='highschool';4:5='jcollege'; 6:9='college'; else =NA", as.factor.result = T); eclsk$momedu3<-relevel(eclsk$momedu3, ref = "highschool") 



#Recode race with white, non Hispanic as reference using dummy vars
eclsk$hisp<-recode (eclsk$race, recodes="3:4=1;-9=NA; else=0")
eclsk$black<-recode (eclsk$race, recodes="2=1;-9=NA; else=0")
eclsk$asian<-recode (eclsk$race, recodes="5=1;-9=NA; else=0")
eclsk$nahn<-recode (eclsk$race, recodes="6:7=1;-9=NA; else=0")
eclsk$other<-recode (eclsk$race, recodes="8=1;-9=NA; else=0")
eclsk$race_gr<-recode(eclsk$race, recodes="3:4='hisp'; 2='nh black'; 5='nh asian'; 6:7='nahn'; 8='other'; 1='nh white'; else=NA", as.factor.result = T)
eclsk$race_gr<-relevel(eclsk$race_gr, ref = 'nh white')
eclsk$male<-recode(eclsk$gender, recodes="1=1; 2=0; -9=NA")
eclsk$mlths<-recode(eclsk$wkmomed, recodes = "1:2=1; 3:9=0; else = NA")
eclsk$mgths<-recode(eclsk$wkmomed, recodes = "1:3=0; 4:9=1; else =NA") 

Now, I need to form the transition variable, this is my event variable, and in this case it will be 1 if a child enters poverty between the first wave of the data and the third grade wave, and 0 otherwise. NOTE I need to remove any children who are already in poverty age wave 1, because they are not at risk of experiencing this particular transition.

eclsk<-subset(eclsk, is.na(pov1)==F&is.na(pov2)==F&is.na(pov3)==F&is.na(age1)==F&is.na(age2)==F&is.na(age3)==F&pov1!=1&is.na(eclsk$c17fpstr)==F)

Just as before, we reshape the data from the current “wide” format (repeated measures in columns) to a “long” format (repeated observations in rows). The reshape() function allows us to do this easily. It allows us to specify our repeated measures, time varying covariates as well as time-constant covariates.

e.long<-reshape(eclsk, idvar="childid", varying=list(age=c("age1","age2", "age3"), age2=c("age2", "age3", "age4"), momedu=c("momedu1", "momedu2", "momedu3")), times=1:3, direction="long" , v.names = c("age_enter", "age_exit", "momedu"))
e.long<-e.long[order(e.long$childid, e.long$time),]
head(e.long)
##             childid gender race r1_kage r3age r4age r5age r6age r7age
## 0015006C.1 0015006C      1    1    76.8 87.83 95.53     6    NA     4
## 0015006C.2 0015006C      1    1    76.8 87.83 95.53     6    NA     4
## 0015006C.3 0015006C      1    1    76.8 87.83 95.53     6    NA     4
## 0015014C.1 0015014C      2    1    64.2 75.23 82.93     2     2     2
## 0015014C.2 0015014C      2    1    64.2 75.23 82.93     2     2     2
## 0015014C.3 0015014C      2    1    64.2 75.23 82.93     2     2     2
##            wkpov_r w1povrty w3povrty w5povrty w8povrty wkmomed w1momed
## 0015006C.1       2        2        2        2        2       5       5
## 0015006C.2       2        2        2        2        2       5       5
## 0015006C.3       2        2        2        2        2       5       5
## 0015014C.1       2        2        2        2        2       5       6
## 0015014C.2       2        2        2        2        2       5       6
## 0015014C.3       2        2        2        2        2       5       6
##            w3momed w5momed w8momed s2_id c1_7fp0 c17fpstr c17fppsu pov1
## 0015006C.1       6       8       8  0015  934.11       77        1    0
## 0015006C.2       6       8       8  0015  934.11       77        1    0
## 0015006C.3       6       8       8  0015  934.11       77        1    0
## 0015014C.1       6       6       6  0015  129.55       77        1    0
## 0015014C.2       6       6       6  0015  129.55       77        1    0
## 0015014C.3       6       6       6  0015  129.55       77        1    0
##            pov2 pov3 pov4 hisp black asian nahn other  race_gr male mlths
## 0015006C.1    0    0    0    0     0     0    0     0 nh white    1     0
## 0015006C.2    0    0    0    0     0     0    0     0 nh white    1     0
## 0015006C.3    0    0    0    0     0     0    0     0 nh white    1     0
## 0015014C.1    0    0    0    0     0     0    0     0 nh white    0     0
## 0015014C.2    0    0    0    0     0     0    0     0 nh white    0     0
## 0015014C.3    0    0    0    0     0     0    0     0 nh white    0     0
##            mgths time age_enter  age_exit   momedu
## 0015006C.1     1    1  6.400000  7.960833 jcollege
## 0015006C.2     1    2  7.960833  9.750000  college
## 0015006C.3     1    3  9.750000 14.833333  college
## 0015014C.1     1    1  5.350000  6.910833 jcollege
## 0015014C.2     1    2  6.910833  8.916667  college
## 0015014C.3     1    3  8.916667 13.833333  college
#find which kids failed in the first time period and remove them from the second risk period risk set
e.long$povtran<-NA
e.long$povtran[e.long$pov1==0&e.long$pov2==1&e.long$time==1]<-1
e.long$povtran[e.long$pov2==0&e.long$pov3==1&e.long$time==2]<-1
e.long$povtran[e.long$pov3==0&e.long$pov4==1&e.long$time==3]<-1

e.long$povtran[e.long$pov1==0&e.long$pov2==0&e.long$time==1]<-0
e.long$povtran[e.long$pov2==0&e.long$pov3==0&e.long$time==2]<-0
e.long$povtran[e.long$pov3==0&e.long$pov4==0&e.long$time==3]<-0


#find which kids failed in the first time period and remove them from the second risk period risk set
failed1<-which(is.na(e.long$povtran)==T)
e.long<-e.long[-failed1,]
e.long$age1r<-round(e.long$age_enter, 0)
e.long$age2r<-round(e.long$age_exit, 0)

head(e.long[, c("childid", "pov1", "pov2", "pov3", "pov4", "time", "povtran")], n=30)
##             childid pov1 pov2 pov3 pov4 time povtran
## 0015006C.1 0015006C    0    0    0    0    1       0
## 0015006C.2 0015006C    0    0    0    0    2       0
## 0015006C.3 0015006C    0    0    0    0    3       0
## 0015014C.1 0015014C    0    0    0    0    1       0
## 0015014C.2 0015014C    0    0    0    0    2       0
## 0015014C.3 0015014C    0    0    0    0    3       0
## 0015019C.1 0015019C    0    0    0    0    1       0
## 0015019C.2 0015019C    0    0    0    0    2       0
## 0015019C.3 0015019C    0    0    0    0    3       0
## 0015020C.1 0015020C    0    0    0    0    1       0
## 0015020C.2 0015020C    0    0    0    0    2       0
## 0015020C.3 0015020C    0    0    0    0    3       0
## 0015026C.1 0015026C    0    0    0    0    1       0
## 0015026C.2 0015026C    0    0    0    0    2       0
## 0015026C.3 0015026C    0    0    0    0    3       0
## 0021001C.1 0021001C    0    1    1    0    1       1
## 0021002C.1 0021002C    0    0    0    0    1       0
## 0021002C.2 0021002C    0    0    0    0    2       0
## 0021002C.3 0021002C    0    0    0    0    3       0
## 0022002C.1 0022002C    0    0    0    0    1       0
## 0022002C.2 0022002C    0    0    0    0    2       0
## 0022002C.3 0022002C    0    0    0    0    3       0
## 0022003C.1 0022003C    0    0    0    0    1       0
## 0022003C.2 0022003C    0    0    0    0    2       0
## 0022003C.3 0022003C    0    0    0    0    3       0
## 0022005C.1 0022005C    0    0    0    0    1       0
## 0022005C.2 0022005C    0    0    0    0    2       0
## 0022005C.3 0022005C    0    0    0    0    3       0
## 0022009C.1 0022009C    0    1    1    1    1       1
## 0022012C.1 0022012C    0    0    0    0    1       0

Now we fit the discrete time model using full survey design. In the ECLS-K, I use the longitudinal weight for waves 1-7, as well as the associated psu and strata id’s for the longitudinal data from these waves from the parents of the child, since no data from the child themselves are used in the outcome.

options(survey.lonely.psu = "adjust")
des2<-svydesign(ids = ~c17fppsu, strata = ~c17fpstr, weights=~c1_7fp0, data=e.long[complete.cases(e.long),], nest=T)

Discrete time model

Following the notation in the notes, we specify a genearlized linear model for a binary outcome at each time period of risk. In this case, the binomial event is 0 if a child doesn’t transition into poverty between waves, and 1 if they do. This can be specified as a logistic regression model with a choice of link functions. We can use a logit specification for the hazard:

\(\text{logit}(h(t))= \left [ \alpha_1 D_1 +\alpha_2 D_2 + \cdots \alpha_t D_t \right ]+ \sum_k \beta_k x_{k}\)

or, as I do below, use a complementary log-log link:

\(log(-log(1-h(t)))= \left [ \alpha_1 D_1 +\alpha_2 D_2 + \cdots \alpha_t D_t \right ]+ \sum_k \beta_k x_{k}\)

while the choice of link function genearlly has no effect on predictions, the interpretation of the log-log link is the same as the proportional hazards model, instead of the odds ratio interpretation.

#Fit the model
fitl1<-svyglm(povtran~as.factor(time)+momedu+race_gr-1, design=des2, family=binomial(link="cloglog"))
## Warning: non-integer #successes in a binomial glm!
summary(fitl1)
## 
## Call:
## svyglm(formula = povtran ~ as.factor(time) + momedu + race_gr - 
##     1, design = des2, family = binomial(link = "cloglog"))
## 
## Survey design:
## svydesign(ids = ~c17fppsu, strata = ~c17fpstr, weights = ~c1_7fp0, 
##     data = e.long[complete.cases(e.long), ], nest = T)
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## as.factor(time)1  -2.4835     0.3304  -7.518 7.08e-11 ***
## as.factor(time)2  -2.7617     0.3152  -8.762 2.57e-13 ***
## as.factor(time)3  -2.9697     0.4127  -7.196 2.97e-10 ***
## momeducollege     -3.0851     0.6279  -4.913 4.67e-06 ***
## momedujcollege    -0.6937     0.3232  -2.146   0.0349 *  
## momeduprimarysch   0.4619     0.3009   1.535   0.1288    
## race_grhisp        0.6554     0.2389   2.744   0.0075 ** 
## race_grnahn        1.1062     0.5369   2.060   0.0426 *  
## race_grnh asian   -0.8580     0.8327  -1.030   0.3060    
## race_grnh black    0.6738     0.4415   1.526   0.1309    
## race_grother       0.2884     0.7052   0.409   0.6836    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.067241)
## 
## Number of Fisher Scoring iterations: 8
#make a Hazard ratio
sums<-data.frame(summary(fitl1)$coef); sums$HR<-round(exp(sums[,1]), 3)
sums
##                    Estimate Std..Error    t.value     Pr...t..    HR
## as.factor(time)1 -2.4835003  0.3303573 -7.5176193 7.075821e-11 0.083
## as.factor(time)2 -2.7616670  0.3151700 -8.7624669 2.573844e-13 0.063
## as.factor(time)3 -2.9696978  0.4126719 -7.1962691 2.970429e-10 0.051
## momeducollege    -3.0851123  0.6279239 -4.9131945 4.670633e-06 0.046
## momedujcollege   -0.6936799  0.3232399 -2.1460216 3.490211e-02 0.500
## momeduprimarysch  0.4618579  0.3009270  1.5347837 1.287825e-01 1.587
## race_grhisp       0.6554319  0.2388875  2.7436846 7.496800e-03 1.926
## race_grnahn       1.1062070  0.5368948  2.0603793 4.261267e-02 3.023
## race_grnh asian  -0.8579799  0.8327348 -1.0303159 3.059667e-01 0.424
## race_grnh black   0.6737680  0.4414881  1.5261294 1.309216e-01 1.962
## race_grother      0.2884319  0.7052225  0.4089942 6.836368e-01 1.334
dat3<-expand.grid(time=as.factor(c(1,2,3)),momedu=levels(des2$variables$momedu), race_gr=levels(des2$variables$race_gr))
#unfortunately, expand.grid makes some unrealistic cases sometimes, get rid of those.

dat3$fitted<-predict(fitl1, dat3, type="response")
head(dat3)
##   time     momedu  race_gr      fitted
## 1    1 highschool nh white 0.080063476
## 2    2 highschool nh white 0.061231479
## 3    3 highschool nh white 0.050024246
## 4    1    college nh white 0.003808500
## 5    2    college nh white 0.002885020
## 6    3    college nh white 0.002343797

Do some plots, these aren’t very cool, since there is only 2 time periods.

plot(NULL,xlim=c(1,3), type="l",
     ylim=c(min(dat3$fitted), .4),
     ylab="h(t)", xlab="Wave", lwd=3)
lines(fitted~time, dat3[dat3$momedu=="highschool"&dat3$race_gr=="nh white",], col=1, lty=1, lwd=3)
lines(fitted~time, dat3[dat3$momedu=="college" &dat3$race_gr=="nh white",], col=1, lty=2, lwd=3)
lines(fitted~time, dat3[dat3$momedu=="highschool"&dat3$race_gr=="nh black",], col=2, lwd=3)
lines(fitted~time, dat3[dat3$momedu=="college"&dat3$race_gr=="nh black",], col=2, lty=2, lwd=3)
lines(fitted~time, dat3[dat3$momedu=="highschool"&dat3$race_gr=="hisp",], col=3, lwd=3)
lines(fitted~time, dat3[dat3$momedu=="college"&dat3$race_gr=="hisp",], col=3, lty=2, lwd=3)
lines(fitted~time, dat3[dat3$momedu=="highschool"&dat3$race_gr=="nahn",], col=4, lwd=3)
lines(fitted~time, dat3[dat3$momedu=="college"&dat3$race_gr=="nahn",], col=4, lty=2, lwd=3)
lines(fitted~time, dat3[dat3$momedu=="highschool"&dat3$race_gr=="nh asian",], col=5, lwd=3)
lines(fitted~time, dat3[dat3$momedu=="college"&dat3$race_gr=="nh asian",], col=5, lty=2, lwd=3)

legend("topright", legend=c("NH White, HS Edu", "NH White, LT HS Edu", "NH Black, HS Edu", "NH Black, LT HS Edu","Hisp, HS Edu", "Hisp, LT HS Edu","NAHN, HS Edu", "NAHN, LT HS Edu","NH Asian, HS Edu", "NH Asian, LT HS Edu"), col=c(1,1,2,2,3,3,4,4,5,5), lty=rep(c(1,2), 5), cex=.75)
title(main="Hazard of Transitioning into Poverty")

Interaction between time and a covariate

In the discrete time model, it’s easy enough to test if a covariate’s effect is constant across time, just interact it with time in the model.

#Fit the model
fitl2<-svyglm(povtran~as.factor(time)*race_gr+momedu-1, design=des2, family=binomial(link="cloglog"))
## Warning: non-integer #successes in a binomial glm!
summary(fitl2) 
## 
## Call:
## svyglm(formula = povtran ~ as.factor(time) * race_gr + momedu - 
##     1, design = des2, family = binomial(link = "cloglog"))
## 
## Survey design:
## svydesign(ids = ~c17fppsu, strata = ~c17fpstr, weights = ~c1_7fp0, 
##     data = e.long[complete.cases(e.long), ], nest = T)
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## as.factor(time)1                  -2.45126    0.35959  -6.817 2.69e-09 ***
## as.factor(time)2                  -3.10690    0.42913  -7.240 4.55e-10 ***
## as.factor(time)3                  -2.73771    0.46564  -5.879 1.28e-07 ***
## race_grhisp                        0.49418    0.36661   1.348   0.1820    
## race_grnahn                        1.01883    0.75558   1.348   0.1819    
## race_grnh asian                   -1.17354    0.91102  -1.288   0.2019    
## race_grnh black                    0.82668    0.64581   1.280   0.2047    
## race_grother                     -14.41042    0.50180 -28.717  < 2e-16 ***
## momeducollege                     -3.08526    0.62960  -4.900 5.95e-06 ***
## momedujcollege                    -0.67140    0.31432  -2.136   0.0362 *  
## momeduprimarysch                   0.45348    0.30183   1.502   0.1375    
## as.factor(time)2:race_grhisp       0.59332    0.63785   0.930   0.3555    
## as.factor(time)3:race_grhisp       0.04999    0.65246   0.077   0.9391    
## as.factor(time)2:race_grnahn       0.91697    0.65709   1.396   0.1673    
## as.factor(time)3:race_grnahn      -1.62551    1.21261  -1.341   0.1844    
## as.factor(time)2:race_grnh asian   0.90640    1.11668   0.812   0.4197    
## as.factor(time)3:race_grnh asian   0.20746    1.35220   0.153   0.8785    
## as.factor(time)2:race_grnh black   0.22586    1.01820   0.222   0.8251    
## as.factor(time)3:race_grnh black  -1.19578    0.99157  -1.206   0.2319    
## as.factor(time)2:race_grother     16.36122    1.16577  14.035  < 2e-16 ***
## as.factor(time)3:race_grother      0.55741    0.48115   1.158   0.2506    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.084763)
## 
## Number of Fisher Scoring iterations: 16
regTermTest(fitl2, ~as.factor(time):race_gr)
## Wald test for as.factor(time):race_gr
##  in svyglm(formula = povtran ~ as.factor(time) * race_gr + momedu - 
##     1, design = des2, family = binomial(link = "cloglog"))
## F =  28.56107  on  10  and  70  df: p= < 2.22e-16
dat4<-expand.grid(time=c(1,2,3),momedu=levels(des2$variables$momedu), race_gr=levels(des2$variables$race_gr))
#unfortunately, expand.grid makes some unrealistic cases sometimes, get rid of those.

dat4$fitted<-predict(fitl2, dat4, type="response")
head(dat4)
##   time     momedu  race_gr      fitted
## 1    1 highschool nh white 0.082575820
## 2    2 highschool nh white 0.043753523
## 3    3 highschool nh white 0.062668738
## 4    1    college nh white 0.003932485
## 5    2    college nh white 0.002043322
## 6    3    college nh white 0.002954441
plot(NULL,xlim=c(1,3), type="l",
     ylim=c(min(dat4$fitted), .4),
     ylab="h(t)", xlab="Wave", lwd=3)
lines(fitted~time, dat4[dat4$momedu=="highschool"&dat4$race_gr=="nh white",], col=1, lty=1, lwd=3)
lines(fitted~time, dat4[dat4$momedu=="college" &dat4$race_gr=="nh white",], col=1, lty=2, lwd=3)
lines(fitted~time, dat4[dat4$momedu=="highschool"&dat4$race_gr=="nh black",], col=2, lwd=3)
lines(fitted~time, dat4[dat4$momedu=="college"&dat4$race_gr=="nh black",], col=2, lty=2, lwd=3)
lines(fitted~time, dat4[dat4$momedu=="highschool"&dat4$race_gr=="hisp",], col=3, lwd=3)
lines(fitted~time, dat4[dat4$momedu=="college"&dat4$race_gr=="hisp",], col=3, lty=2, lwd=3)
lines(fitted~time, dat4[dat4$momedu=="highschool"&dat4$race_gr=="nahn",], col=4, lwd=3)
lines(fitted~time, dat4[dat4$momedu=="college"&dat4$race_gr=="nahn",], col=4, lty=2, lwd=3)
lines(fitted~time, dat4[dat4$momedu=="highschool"&dat4$race_gr=="nh asian",], col=5, lwd=3)
lines(fitted~time, dat4[dat4$momedu=="college"&dat4$race_gr=="nh asian",], col=5, lty=2, lwd=3)


legend("topright", legend=c("NH White, HS Edu", "NH White, LT HS Edu", "NH Black, HS Edu", "NH Black, LT HS Edu","Hisp, HS Edu", "Hisp, LT HS Edu","NAHN, HS Edu", "NAHN, LT HS Edu","NH Asian, HS Edu", "NH Asian, LT HS Edu"), col=c(1,1,2,2,3,3,4,4,5,5), lty=rep(c(1,2), 5), cex=.75)
title(main="Hazard of Transitioning into Poverty")

Continuous duration outcome

load the data

model.dat<-read.dta("/Users/ozd504/Google Drive/dem7223/class2016/data/zzkr62dt/ZZKR62FL.DTA", convert.factors = F)

Event - Infant Mortality

In the DHS, they record if a child is dead or alive and the age at death if the child is dead. This can be understood using a series of variables about each child.

If the child is alive at the time of interview, then the variable B5==1, and the age at death is censored.

If the age at death is censored, then the age at the date of interview (censored age at death) is the date of the interview - date of birth (in months).

If the child is dead at the time of interview,then the variable B5!=1, then the age at death in months is the variable B7. Here we code this:

#We form a subset of variables
sub<-data.frame(CASEID=model.dat$caseid,kidid=paste(model.dat$caseid, model.dat$bidx, sep="-"), v008=model.dat$v008,bord=model.dat$bidx,csex=model.dat$b4,b2=model.dat$b2, b3=model.dat$b3, b5=model.dat$b5, b7=model.dat$b7, ibint=model.dat$b11, rural=model.dat$v025, educ=model.dat$v106,age=model.dat$v012,partneredu=model.dat$v701,partnerage=model.dat$v730, hhses=model.dat$v190, weight=model.dat$v005/1000000, psu=model.dat$v021, strata=model.dat$v022)

sub$death.age<-ifelse(sub$b5==1,
                          ((((sub$v008))+1900)-(((sub$b3))+1900)) 
                          ,sub$b7)+.0001

#censoring indicator for death by age 5, in months (<=60 months)
sub$d.event<-ifelse(is.na(sub$b7)==T|sub$b7>60,0,1)
sub$d.eventfac<-factor(sub$d.event); levels(sub$d.eventfac)<-c("Alive at Age 5", "Dead by Age 5")
table(sub$d.eventfac)
## 
## Alive at Age 5  Dead by Age 5 
##           5335            633
#recodes
sub$male<-ifelse(sub$csex==1,1,0)
sub$educ.high<-ifelse(sub$educ %in% c(2,3), 1, 0)
sub$age2<-sub$age^2
sub$partnerhiedu<-ifelse(sub$partneredu<3,0,ifelse(sub$partneredu%in%c(8,9),NA,1 ))
sub$hises<-ifelse(sub$hhses>3, 1,0)
sub$rural<-ifelse(sub$rural==2,1,0 )

Create the person-period file

The distinction between the way we have been doing things and the discrete time model, is that we treat time discretely, versus continuously. This means that we transform the data from the case-duration data format to the person-period format. For this example, a natural choice would be year, since we have 5 intervals of equal length (12 months each). R provides a useful function called survSplit() in the survival library that will split a continuous duration into discrete periods.

#make person period file
pp<-survSplit(Surv(death.age, d.event)~. , data = sub, cut=seq(0,60,12), episode="year_age")
pp<-pp[order(pp$kidid, pp$year_age),]
head(pp[, c("kidid", "death.age", "d.event", "year_age", "male", "hises")], n=20)
##                kidid death.age d.event year_age male hises
## 1          1  1  2-1    5.0001       0        2    0     0
## 2          1  1  2-2   12.0000       0        2    1     0
## 3          1  1  2-2   24.0000       0        3    1     0
## 4          1  1  2-2   36.0000       0        4    1     0
## 5          1  1  2-2   37.0001       0        5    1     0
## 6          1  3  2-1   12.0000       0        2    0     0
## 7          1  3  2-1   24.0000       0        3    0     0
## 8          1  3  2-1   30.0001       0        4    0     0
## 9          1  4  3-1   10.0001       0        2    0     0
## 10         1  4  3-2   12.0000       0        2    1     0
## 11         1  4  3-2   24.0000       0        3    1     0
## 12         1  4  3-2   30.0001       0        4    1     0
## 13         1  5  1-1    0.0001       1        2    1     0
## 14         1  5  1-2   12.0000       0        2    0     0
## 15         1  5  1-2   24.0000       0        3    0     0
## 16         1  5  1-2   34.0001       0        4    0     0
## 17         1 15  1-1    1.0001       1        2    0     0
## 18         1 15  1-2   12.0000       0        2    1     0
## 19         1 15  1-2   18.0001       0        3    1     0
## 20         1 15  1-3    3.0001       1        2    1     0

We see that each child is not in the data for multiple “risk periods”, until they experience the event (death) or age out of the risk set (year 6 in this case).

Discrete time model

So, the best thing about the discrete time model, is that it’s just logistic regression. Each risk period is treated as a single Bernoulli trial, and the child can either fail (y=1) or not (y=0) in the period. This is how we get the hazard of the event, as the estimated probability of failure in each discrete time period. So, any method you would like to use to model this probability would probably work (logit, probit models), but I will show two standard approaches. We will use the complementary log-log link. This is used because it preserves the proportional hazards property of the model, as in the Cox model.

#generate survey design

des<-svydesign(ids=~psu, strata = ~strata , weights=~weight, data=pp)

#Fit the basic logistic model with ONLY time in the model
#I do -1 so that no intercept is fit in the model, and we get a hazard estimate for each time period
fit.0<-svyglm(d.event~as.factor(year_age)-1,design=des , family="binomial")
## Warning: non-integer #successes in a binomial glm!
summary(fit.0)
## 
## Call:
## svyglm(formula = d.event ~ as.factor(year_age) - 1, design = des, 
##     family = "binomial")
## 
## Survey design:
## svydesign(ids = ~psu, strata = ~strata, weights = ~weight, data = pp)
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## as.factor(year_age)2 -2.41351    0.05901  -40.90   <2e-16 ***
## as.factor(year_age)3 -3.72379    0.13236  -28.13   <2e-16 ***
## as.factor(year_age)4 -4.28401    0.19830  -21.60   <2e-16 ***
## as.factor(year_age)5 -5.16879    0.33117  -15.61   <2e-16 ***
## as.factor(year_age)6 -4.63002    0.34620  -13.37   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.00006)
## 
## Number of Fisher Scoring iterations: 7
#Plot the hazard function on the probability scale
haz<-1/(1+exp(-coef(fit.0)))
time<-seq(1,5,1)
plot(haz~time, type="l", ylab="h(t)")
title(main="Discrete Time Hazard Function for Child Mortality")

#now we include a single predictor and examine the proportional hazards

fit.1<-svyglm(d.event~as.factor(year_age)+rural-1,design=des , family=binomial(link="cloglog"))
## Warning: non-integer #successes in a binomial glm!
summary(fit.1)
## 
## Call:
## svyglm(formula = d.event ~ as.factor(year_age) + rural - 1, design = des, 
##     family = binomial(link = "cloglog"))
## 
## Survey design:
## svydesign(ids = ~psu, strata = ~strata, weights = ~weight, data = pp)
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## as.factor(year_age)2 -2.32949    0.09087 -25.634   <2e-16 ***
## as.factor(year_age)3 -3.60789    0.15709 -22.967   <2e-16 ***
## as.factor(year_age)4 -4.16113    0.20838 -19.969   <2e-16 ***
## as.factor(year_age)5 -5.04442    0.32888 -15.338   <2e-16 ***
## as.factor(year_age)6 -4.50825    0.35846 -12.577   <2e-16 ***
## rural                -0.19695    0.10632  -1.852   0.0656 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.002157)
## 
## Number of Fisher Scoring iterations: 7

Which shows a lower risk of death for children of higher SES households. In fact, it’s a -27.7% lower risk

Next, I do the plots of the hazard functions the Singer and Willett way. I got this code from Singer and Willett’s example from Ch 11

t<-data.frame(cbind(y=1/(1+exp(-fit.1$linear.predictors)), year=pp$year_age,rural=pp$rural))
t0<-t[t$rural==0,]
t0<-unique(t0[order(t0$year),])
t1<-t[t$rural==1,]
t1<-unique(t1[order(t1$year),])

#Here we plot the hazards, I subtract .5 from each time so it looks like the hazard is at the midpoint of the year
plot(t0$year-.5, t0$y, type="b", ylab="Hazard",  xlim=c(1,6), xlab="Year", col="red", main =c("Discrete time hazard model", "effect of Rural Residence"))
points(t1$year-.5, t1$y, type="b", col="blue")
legend("topright", legend=c("Urban", "Rural"), col=c("red", "blue"),lty=1)

But I like this way too, using expand.grid to make the various types of people, and using predict() get fitted values. Same thing, just more compact:

dat<-expand.grid(year_age=1:5, rural=c(0,1))
dat$fitted<-predict(fit.1, type = "response", newdata=dat)

plot(dat$year[dat$rural==0]-.5, dat$fitted[dat$rural==0], type="b", ylab="Hazard", xlim=c(0,5), xlab="Year", col="red", main =c("Discrete time hazard model", "effect of Rural Residence"))
points(dat$year[dat$rural==1]-.5, dat$fitted[dat$rural==1], type="b", col="blue")
legend("topright", legend=c("Urban", "Rural"), col=c("red", "blue"),lty=1)

See, same thing.

Interaction between covariate and time

fit.2<-svyglm(d.event~as.factor(year_age)*rural-1,design=des , family=binomial(link="cloglog"))
## Warning: non-integer #successes in a binomial glm!
summary(fit.2)
## 
## Call:
## svyglm(formula = d.event ~ as.factor(year_age) * rural - 1, design = des, 
##     family = binomial(link = "cloglog"))
## 
## Survey design:
## svydesign(ids = ~psu, strata = ~strata, weights = ~weight, data = pp)
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## as.factor(year_age)2       -2.31691    0.09705 -23.873  < 2e-16 ***
## as.factor(year_age)3       -3.69673    0.28862 -12.808  < 2e-16 ***
## as.factor(year_age)4       -4.02818    0.34948 -11.526  < 2e-16 ***
## as.factor(year_age)5       -5.38882    0.85647  -6.292 2.30e-09 ***
## as.factor(year_age)6       -4.52065    0.61743  -7.322 7.75e-12 ***
## rural                      -0.21723    0.11920  -1.822   0.0701 .  
## as.factor(year_age)3:rural  0.15855    0.34039   0.466   0.6419    
## as.factor(year_age)4:rural -0.19743    0.43291  -0.456   0.6489    
## as.factor(year_age)5:rural  0.52695    0.93078   0.566   0.5720    
## as.factor(year_age)6:rural  0.04021    0.73653   0.055   0.9565    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.00006)
## 
## Number of Fisher Scoring iterations: 7
regTermTest(fit.2, ~as.factor(year_age):rural)
## Wald test for as.factor(year_age):rural
##  in svyglm(formula = d.event ~ as.factor(year_age) * rural - 1, design = des, 
##     family = binomial(link = "cloglog"))
## F =  0.1956178  on  4  and  181  df: p= 0.94044
#No effect of the interaction

dat<-expand.grid(year_age=1:5, rural=c(0,1))
dat$fitted<-predict(fit.2, type = "response", newdata=dat)
#plot the interaction with time
plot(dat$year[dat$rural==0]-.5, dat$fitted[dat$rural==0], type="b", ylab="Hazard",  xlim=c(0,6), xlab="Year", col="red", main =c("Discrete time hazard model", "effect of rural residence"))
points(dat$year[dat$rural==1]-.5, dat$fitted[dat$rural==1], type="b", col="blue")
legend("topright", legend=c("Urban", "Rural"), col=c("red", "blue"),lty=1)

I can’t see any difference.