This example will illustrate how to fit the discrete time hazard model to longitudinal and continuous duration data (i.e. person-level data).
The first example will use as its outcome variable, the event of a child dying before age 5. The data for this example come from the model.data Demographic and Health Survey for 2012 children’s recode file. This file contains information for all births in the last 5 years prior to the survey.
The longitudinal data example uses data from the ECLS-K. Specifically, we will examine the transition into poverty between kindergarten and 8th grade.
#Load required libraries
library(foreign)
library(survival)
library(car)
library(survey)
As in the other examples, I illustrate fitting these models to data that are longitudinal, instead of person-duration. In this example, we will examine how to fit the Cox model to a longitudinally collected data set.
load("~/Google Drive/dem7223/class2016//data/eclsk.Rdata")
names(eclsk)<-tolower(names(eclsk))
#get out only the variables I'm going to use for this example
myvars<-c( "childid","gender", "race", "r1_kage","r3age", "r4age", "r5age", "r6age", "r7age", "wkpov_r","w1povrty","w3povrty", "w5povrty", "w8povrty","wkmomed","w1momed", "w3momed", "w5momed", "w8momed","s2_id", "c1_7fp0", "c17fpstr", "c17fppsu")
eclsk<-eclsk[,myvars]
eclsk$age1<-ifelse(eclsk$r1_kage==-9, NA, eclsk$r1_kage/12)
eclsk$age2<-ifelse(eclsk$r3age==-9, NA, eclsk$r4age/12)
#for the later waves, the NCES group the ages into ranges of months, so 1= <105 months, 2=105 to 108 months. So, I fix the age at the midpoint of the interval they give, and make it into years by dividing by 12
eclsk$age3<-recode(eclsk$r5age,recodes="1=105; 2=107; 3=109; 4=112; 5=115; 6=117; -9=NA")/12
eclsk$age4<-recode(eclsk$r7age,recodes="1=155; 2=166; 3=172; 4=178; 5=192; -9=NA")/12
eclsk$pov1<-ifelse(eclsk$wkpov_r==1,1,0)
eclsk$pov2<-ifelse(eclsk$w3povrty==1,1,0)
eclsk$pov3<-ifelse(eclsk$w5povrty==1,1,0)
eclsk$pov4<-ifelse(eclsk$w8povrty==1,1,0)
#mother's education-time varying > hs ==1
eclsk$momedu1<-recode(eclsk$wkmomed, recodes = "1:2='primarysch';3='highschool';4:5='jcollege'; 6:9='college'; else =NA", as.factor.result = T); eclsk$momedu1<-relevel(eclsk$momedu1, ref = "highschool")
eclsk$momedu2<-recode(eclsk$w3momed, recodes = "1:2='primarysch';3='highschool';4:5='jcollege'; 6:9='college'; else =NA", as.factor.result = T); eclsk$momedu2<-relevel(eclsk$momedu2, ref = "highschool")
eclsk$momedu3<-recode(eclsk$w5momed, recodes = "1:2='primarysch';3='highschool';4:5='jcollege'; 6:9='college'; else =NA", as.factor.result = T); eclsk$momedu3<-relevel(eclsk$momedu3, ref = "highschool")
#Recode race with white, non Hispanic as reference using dummy vars
eclsk$hisp<-recode (eclsk$race, recodes="3:4=1;-9=NA; else=0")
eclsk$black<-recode (eclsk$race, recodes="2=1;-9=NA; else=0")
eclsk$asian<-recode (eclsk$race, recodes="5=1;-9=NA; else=0")
eclsk$nahn<-recode (eclsk$race, recodes="6:7=1;-9=NA; else=0")
eclsk$other<-recode (eclsk$race, recodes="8=1;-9=NA; else=0")
eclsk$race_gr<-recode(eclsk$race, recodes="3:4='hisp'; 2='nh black'; 5='nh asian'; 6:7='nahn'; 8='other'; 1='nh white'; else=NA", as.factor.result = T)
eclsk$race_gr<-relevel(eclsk$race_gr, ref = 'nh white')
eclsk$male<-recode(eclsk$gender, recodes="1=1; 2=0; -9=NA")
eclsk$mlths<-recode(eclsk$wkmomed, recodes = "1:2=1; 3:9=0; else = NA")
eclsk$mgths<-recode(eclsk$wkmomed, recodes = "1:3=0; 4:9=1; else =NA")
Now, I need to form the transition variable, this is my event variable, and in this case it will be 1 if a child enters poverty between the first wave of the data and the third grade wave, and 0 otherwise. NOTE I need to remove any children who are already in poverty age wave 1, because they are not at risk of experiencing this particular transition.
eclsk<-subset(eclsk, is.na(pov1)==F&is.na(pov2)==F&is.na(pov3)==F&is.na(age1)==F&is.na(age2)==F&is.na(age3)==F&pov1!=1&is.na(eclsk$c17fpstr)==F)
Just as before, we reshape the data from the current “wide” format (repeated measures in columns) to a “long” format (repeated observations in rows). The reshape() function allows us to do this easily. It allows us to specify our repeated measures, time varying covariates as well as time-constant covariates.
e.long<-reshape(eclsk, idvar="childid", varying=list(age=c("age1","age2", "age3"), age2=c("age2", "age3", "age4"), momedu=c("momedu1", "momedu2", "momedu3")), times=1:3, direction="long" , v.names = c("age_enter", "age_exit", "momedu"))
e.long<-e.long[order(e.long$childid, e.long$time),]
head(e.long)
## childid gender race r1_kage r3age r4age r5age r6age r7age
## 0015006C.1 0015006C 1 1 76.8 87.83 95.53 6 NA 4
## 0015006C.2 0015006C 1 1 76.8 87.83 95.53 6 NA 4
## 0015006C.3 0015006C 1 1 76.8 87.83 95.53 6 NA 4
## 0015014C.1 0015014C 2 1 64.2 75.23 82.93 2 2 2
## 0015014C.2 0015014C 2 1 64.2 75.23 82.93 2 2 2
## 0015014C.3 0015014C 2 1 64.2 75.23 82.93 2 2 2
## wkpov_r w1povrty w3povrty w5povrty w8povrty wkmomed w1momed
## 0015006C.1 2 2 2 2 2 5 5
## 0015006C.2 2 2 2 2 2 5 5
## 0015006C.3 2 2 2 2 2 5 5
## 0015014C.1 2 2 2 2 2 5 6
## 0015014C.2 2 2 2 2 2 5 6
## 0015014C.3 2 2 2 2 2 5 6
## w3momed w5momed w8momed s2_id c1_7fp0 c17fpstr c17fppsu pov1
## 0015006C.1 6 8 8 0015 934.11 77 1 0
## 0015006C.2 6 8 8 0015 934.11 77 1 0
## 0015006C.3 6 8 8 0015 934.11 77 1 0
## 0015014C.1 6 6 6 0015 129.55 77 1 0
## 0015014C.2 6 6 6 0015 129.55 77 1 0
## 0015014C.3 6 6 6 0015 129.55 77 1 0
## pov2 pov3 pov4 hisp black asian nahn other race_gr male mlths
## 0015006C.1 0 0 0 0 0 0 0 0 nh white 1 0
## 0015006C.2 0 0 0 0 0 0 0 0 nh white 1 0
## 0015006C.3 0 0 0 0 0 0 0 0 nh white 1 0
## 0015014C.1 0 0 0 0 0 0 0 0 nh white 0 0
## 0015014C.2 0 0 0 0 0 0 0 0 nh white 0 0
## 0015014C.3 0 0 0 0 0 0 0 0 nh white 0 0
## mgths time age_enter age_exit momedu
## 0015006C.1 1 1 6.400000 7.960833 jcollege
## 0015006C.2 1 2 7.960833 9.750000 college
## 0015006C.3 1 3 9.750000 14.833333 college
## 0015014C.1 1 1 5.350000 6.910833 jcollege
## 0015014C.2 1 2 6.910833 8.916667 college
## 0015014C.3 1 3 8.916667 13.833333 college
#find which kids failed in the first time period and remove them from the second risk period risk set
e.long$povtran<-NA
e.long$povtran[e.long$pov1==0&e.long$pov2==1&e.long$time==1]<-1
e.long$povtran[e.long$pov2==0&e.long$pov3==1&e.long$time==2]<-1
e.long$povtran[e.long$pov3==0&e.long$pov4==1&e.long$time==3]<-1
e.long$povtran[e.long$pov1==0&e.long$pov2==0&e.long$time==1]<-0
e.long$povtran[e.long$pov2==0&e.long$pov3==0&e.long$time==2]<-0
e.long$povtran[e.long$pov3==0&e.long$pov4==0&e.long$time==3]<-0
#find which kids failed in the first time period and remove them from the second risk period risk set
failed1<-which(is.na(e.long$povtran)==T)
e.long<-e.long[-failed1,]
e.long$age1r<-round(e.long$age_enter, 0)
e.long$age2r<-round(e.long$age_exit, 0)
head(e.long[, c("childid", "pov1", "pov2", "pov3", "pov4", "time", "povtran")], n=30)
## childid pov1 pov2 pov3 pov4 time povtran
## 0015006C.1 0015006C 0 0 0 0 1 0
## 0015006C.2 0015006C 0 0 0 0 2 0
## 0015006C.3 0015006C 0 0 0 0 3 0
## 0015014C.1 0015014C 0 0 0 0 1 0
## 0015014C.2 0015014C 0 0 0 0 2 0
## 0015014C.3 0015014C 0 0 0 0 3 0
## 0015019C.1 0015019C 0 0 0 0 1 0
## 0015019C.2 0015019C 0 0 0 0 2 0
## 0015019C.3 0015019C 0 0 0 0 3 0
## 0015020C.1 0015020C 0 0 0 0 1 0
## 0015020C.2 0015020C 0 0 0 0 2 0
## 0015020C.3 0015020C 0 0 0 0 3 0
## 0015026C.1 0015026C 0 0 0 0 1 0
## 0015026C.2 0015026C 0 0 0 0 2 0
## 0015026C.3 0015026C 0 0 0 0 3 0
## 0021001C.1 0021001C 0 1 1 0 1 1
## 0021002C.1 0021002C 0 0 0 0 1 0
## 0021002C.2 0021002C 0 0 0 0 2 0
## 0021002C.3 0021002C 0 0 0 0 3 0
## 0022002C.1 0022002C 0 0 0 0 1 0
## 0022002C.2 0022002C 0 0 0 0 2 0
## 0022002C.3 0022002C 0 0 0 0 3 0
## 0022003C.1 0022003C 0 0 0 0 1 0
## 0022003C.2 0022003C 0 0 0 0 2 0
## 0022003C.3 0022003C 0 0 0 0 3 0
## 0022005C.1 0022005C 0 0 0 0 1 0
## 0022005C.2 0022005C 0 0 0 0 2 0
## 0022005C.3 0022005C 0 0 0 0 3 0
## 0022009C.1 0022009C 0 1 1 1 1 1
## 0022012C.1 0022012C 0 0 0 0 1 0
Now we fit the discrete time model using full survey design. In the ECLS-K, I use the longitudinal weight for waves 1-7, as well as the associated psu and strata id’s for the longitudinal data from these waves from the parents of the child, since no data from the child themselves are used in the outcome.
options(survey.lonely.psu = "adjust")
des2<-svydesign(ids = ~c17fppsu, strata = ~c17fpstr, weights=~c1_7fp0, data=e.long[complete.cases(e.long),], nest=T)
Following the notation in the notes, we specify a genearlized linear model for a binary outcome at each time period of risk. In this case, the binomial event is 0 if a child doesn’t transition into poverty between waves, and 1 if they do. This can be specified as a logistic regression model with a choice of link functions. We can use a logit specification for the hazard:
\(\text{logit}(h(t))= \left [ \alpha_1 D_1 +\alpha_2 D_2 + \cdots \alpha_t D_t \right ]+ \sum_k \beta_k x_{k}\)
or, as I do below, use a complementary log-log link:
\(log(-log(1-h(t)))= \left [ \alpha_1 D_1 +\alpha_2 D_2 + \cdots \alpha_t D_t \right ]+ \sum_k \beta_k x_{k}\)
while the choice of link function genearlly has no effect on predictions, the interpretation of the log-log link is the same as the proportional hazards model, instead of the odds ratio interpretation.
#Fit the model
fitl1<-svyglm(povtran~as.factor(time)+momedu+race_gr-1, design=des2, family=binomial(link="cloglog"))
## Warning: non-integer #successes in a binomial glm!
summary(fitl1)
##
## Call:
## svyglm(formula = povtran ~ as.factor(time) + momedu + race_gr -
## 1, design = des2, family = binomial(link = "cloglog"))
##
## Survey design:
## svydesign(ids = ~c17fppsu, strata = ~c17fpstr, weights = ~c1_7fp0,
## data = e.long[complete.cases(e.long), ], nest = T)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## as.factor(time)1 -2.4835 0.3304 -7.518 7.08e-11 ***
## as.factor(time)2 -2.7617 0.3152 -8.762 2.57e-13 ***
## as.factor(time)3 -2.9697 0.4127 -7.196 2.97e-10 ***
## momeducollege -3.0851 0.6279 -4.913 4.67e-06 ***
## momedujcollege -0.6937 0.3232 -2.146 0.0349 *
## momeduprimarysch 0.4619 0.3009 1.535 0.1288
## race_grhisp 0.6554 0.2389 2.744 0.0075 **
## race_grnahn 1.1062 0.5369 2.060 0.0426 *
## race_grnh asian -0.8580 0.8327 -1.030 0.3060
## race_grnh black 0.6738 0.4415 1.526 0.1309
## race_grother 0.2884 0.7052 0.409 0.6836
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1.067241)
##
## Number of Fisher Scoring iterations: 8
#make a Hazard ratio
sums<-data.frame(summary(fitl1)$coef); sums$HR<-round(exp(sums[,1]), 3)
sums
## Estimate Std..Error t.value Pr...t.. HR
## as.factor(time)1 -2.4835003 0.3303573 -7.5176193 7.075821e-11 0.083
## as.factor(time)2 -2.7616670 0.3151700 -8.7624669 2.573844e-13 0.063
## as.factor(time)3 -2.9696978 0.4126719 -7.1962691 2.970429e-10 0.051
## momeducollege -3.0851123 0.6279239 -4.9131945 4.670633e-06 0.046
## momedujcollege -0.6936799 0.3232399 -2.1460216 3.490211e-02 0.500
## momeduprimarysch 0.4618579 0.3009270 1.5347837 1.287825e-01 1.587
## race_grhisp 0.6554319 0.2388875 2.7436846 7.496800e-03 1.926
## race_grnahn 1.1062070 0.5368948 2.0603793 4.261267e-02 3.023
## race_grnh asian -0.8579799 0.8327348 -1.0303159 3.059667e-01 0.424
## race_grnh black 0.6737680 0.4414881 1.5261294 1.309216e-01 1.962
## race_grother 0.2884319 0.7052225 0.4089942 6.836368e-01 1.334
dat3<-expand.grid(time=as.factor(c(1,2,3)),momedu=levels(des2$variables$momedu), race_gr=levels(des2$variables$race_gr))
#unfortunately, expand.grid makes some unrealistic cases sometimes, get rid of those.
dat3$fitted<-predict(fitl1, dat3, type="response")
head(dat3)
## time momedu race_gr fitted
## 1 1 highschool nh white 0.080063476
## 2 2 highschool nh white 0.061231479
## 3 3 highschool nh white 0.050024246
## 4 1 college nh white 0.003808500
## 5 2 college nh white 0.002885020
## 6 3 college nh white 0.002343797
Do some plots, these aren’t very cool, since there is only 2 time periods.
plot(NULL,xlim=c(1,3), type="l",
ylim=c(min(dat3$fitted), .4),
ylab="h(t)", xlab="Wave", lwd=3)
lines(fitted~time, dat3[dat3$momedu=="highschool"&dat3$race_gr=="nh white",], col=1, lty=1, lwd=3)
lines(fitted~time, dat3[dat3$momedu=="college" &dat3$race_gr=="nh white",], col=1, lty=2, lwd=3)
lines(fitted~time, dat3[dat3$momedu=="highschool"&dat3$race_gr=="nh black",], col=2, lwd=3)
lines(fitted~time, dat3[dat3$momedu=="college"&dat3$race_gr=="nh black",], col=2, lty=2, lwd=3)
lines(fitted~time, dat3[dat3$momedu=="highschool"&dat3$race_gr=="hisp",], col=3, lwd=3)
lines(fitted~time, dat3[dat3$momedu=="college"&dat3$race_gr=="hisp",], col=3, lty=2, lwd=3)
lines(fitted~time, dat3[dat3$momedu=="highschool"&dat3$race_gr=="nahn",], col=4, lwd=3)
lines(fitted~time, dat3[dat3$momedu=="college"&dat3$race_gr=="nahn",], col=4, lty=2, lwd=3)
lines(fitted~time, dat3[dat3$momedu=="highschool"&dat3$race_gr=="nh asian",], col=5, lwd=3)
lines(fitted~time, dat3[dat3$momedu=="college"&dat3$race_gr=="nh asian",], col=5, lty=2, lwd=3)
legend("topright", legend=c("NH White, HS Edu", "NH White, LT HS Edu", "NH Black, HS Edu", "NH Black, LT HS Edu","Hisp, HS Edu", "Hisp, LT HS Edu","NAHN, HS Edu", "NAHN, LT HS Edu","NH Asian, HS Edu", "NH Asian, LT HS Edu"), col=c(1,1,2,2,3,3,4,4,5,5), lty=rep(c(1,2), 5), cex=.75)
title(main="Hazard of Transitioning into Poverty")
In the discrete time model, it’s easy enough to test if a covariate’s effect is constant across time, just interact it with time in the model.
#Fit the model
fitl2<-svyglm(povtran~as.factor(time)*race_gr+momedu-1, design=des2, family=binomial(link="cloglog"))
## Warning: non-integer #successes in a binomial glm!
summary(fitl2)
##
## Call:
## svyglm(formula = povtran ~ as.factor(time) * race_gr + momedu -
## 1, design = des2, family = binomial(link = "cloglog"))
##
## Survey design:
## svydesign(ids = ~c17fppsu, strata = ~c17fpstr, weights = ~c1_7fp0,
## data = e.long[complete.cases(e.long), ], nest = T)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## as.factor(time)1 -2.45126 0.35959 -6.817 2.69e-09 ***
## as.factor(time)2 -3.10690 0.42913 -7.240 4.55e-10 ***
## as.factor(time)3 -2.73771 0.46564 -5.879 1.28e-07 ***
## race_grhisp 0.49418 0.36661 1.348 0.1820
## race_grnahn 1.01883 0.75558 1.348 0.1819
## race_grnh asian -1.17354 0.91102 -1.288 0.2019
## race_grnh black 0.82668 0.64581 1.280 0.2047
## race_grother -14.41042 0.50180 -28.717 < 2e-16 ***
## momeducollege -3.08526 0.62960 -4.900 5.95e-06 ***
## momedujcollege -0.67140 0.31432 -2.136 0.0362 *
## momeduprimarysch 0.45348 0.30183 1.502 0.1375
## as.factor(time)2:race_grhisp 0.59332 0.63785 0.930 0.3555
## as.factor(time)3:race_grhisp 0.04999 0.65246 0.077 0.9391
## as.factor(time)2:race_grnahn 0.91697 0.65709 1.396 0.1673
## as.factor(time)3:race_grnahn -1.62551 1.21261 -1.341 0.1844
## as.factor(time)2:race_grnh asian 0.90640 1.11668 0.812 0.4197
## as.factor(time)3:race_grnh asian 0.20746 1.35220 0.153 0.8785
## as.factor(time)2:race_grnh black 0.22586 1.01820 0.222 0.8251
## as.factor(time)3:race_grnh black -1.19578 0.99157 -1.206 0.2319
## as.factor(time)2:race_grother 16.36122 1.16577 14.035 < 2e-16 ***
## as.factor(time)3:race_grother 0.55741 0.48115 1.158 0.2506
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1.084763)
##
## Number of Fisher Scoring iterations: 16
regTermTest(fitl2, ~as.factor(time):race_gr)
## Wald test for as.factor(time):race_gr
## in svyglm(formula = povtran ~ as.factor(time) * race_gr + momedu -
## 1, design = des2, family = binomial(link = "cloglog"))
## F = 28.56107 on 10 and 70 df: p= < 2.22e-16
dat4<-expand.grid(time=c(1,2,3),momedu=levels(des2$variables$momedu), race_gr=levels(des2$variables$race_gr))
#unfortunately, expand.grid makes some unrealistic cases sometimes, get rid of those.
dat4$fitted<-predict(fitl2, dat4, type="response")
head(dat4)
## time momedu race_gr fitted
## 1 1 highschool nh white 0.082575820
## 2 2 highschool nh white 0.043753523
## 3 3 highschool nh white 0.062668738
## 4 1 college nh white 0.003932485
## 5 2 college nh white 0.002043322
## 6 3 college nh white 0.002954441
plot(NULL,xlim=c(1,3), type="l",
ylim=c(min(dat4$fitted), .4),
ylab="h(t)", xlab="Wave", lwd=3)
lines(fitted~time, dat4[dat4$momedu=="highschool"&dat4$race_gr=="nh white",], col=1, lty=1, lwd=3)
lines(fitted~time, dat4[dat4$momedu=="college" &dat4$race_gr=="nh white",], col=1, lty=2, lwd=3)
lines(fitted~time, dat4[dat4$momedu=="highschool"&dat4$race_gr=="nh black",], col=2, lwd=3)
lines(fitted~time, dat4[dat4$momedu=="college"&dat4$race_gr=="nh black",], col=2, lty=2, lwd=3)
lines(fitted~time, dat4[dat4$momedu=="highschool"&dat4$race_gr=="hisp",], col=3, lwd=3)
lines(fitted~time, dat4[dat4$momedu=="college"&dat4$race_gr=="hisp",], col=3, lty=2, lwd=3)
lines(fitted~time, dat4[dat4$momedu=="highschool"&dat4$race_gr=="nahn",], col=4, lwd=3)
lines(fitted~time, dat4[dat4$momedu=="college"&dat4$race_gr=="nahn",], col=4, lty=2, lwd=3)
lines(fitted~time, dat4[dat4$momedu=="highschool"&dat4$race_gr=="nh asian",], col=5, lwd=3)
lines(fitted~time, dat4[dat4$momedu=="college"&dat4$race_gr=="nh asian",], col=5, lty=2, lwd=3)
legend("topright", legend=c("NH White, HS Edu", "NH White, LT HS Edu", "NH Black, HS Edu", "NH Black, LT HS Edu","Hisp, HS Edu", "Hisp, LT HS Edu","NAHN, HS Edu", "NAHN, LT HS Edu","NH Asian, HS Edu", "NH Asian, LT HS Edu"), col=c(1,1,2,2,3,3,4,4,5,5), lty=rep(c(1,2), 5), cex=.75)
title(main="Hazard of Transitioning into Poverty")
model.dat<-read.dta("/Users/ozd504/Google Drive/dem7223/class2016/data/zzkr62dt/ZZKR62FL.DTA", convert.factors = F)
In the DHS, they record if a child is dead or alive and the age at death if the child is dead. This can be understood using a series of variables about each child.
If the child is alive at the time of interview, then the variable B5==1, and the age at death is censored.
If the age at death is censored, then the age at the date of interview (censored age at death) is the date of the interview - date of birth (in months).
If the child is dead at the time of interview,then the variable B5!=1, then the age at death in months is the variable B7. Here we code this:
#We form a subset of variables
sub<-data.frame(CASEID=model.dat$caseid,kidid=paste(model.dat$caseid, model.dat$bidx, sep="-"), v008=model.dat$v008,bord=model.dat$bidx,csex=model.dat$b4,b2=model.dat$b2, b3=model.dat$b3, b5=model.dat$b5, b7=model.dat$b7, ibint=model.dat$b11, rural=model.dat$v025, educ=model.dat$v106,age=model.dat$v012,partneredu=model.dat$v701,partnerage=model.dat$v730, hhses=model.dat$v190, weight=model.dat$v005/1000000, psu=model.dat$v021, strata=model.dat$v022)
sub$death.age<-ifelse(sub$b5==1,
((((sub$v008))+1900)-(((sub$b3))+1900))
,sub$b7)+.0001
#censoring indicator for death by age 5, in months (<=60 months)
sub$d.event<-ifelse(is.na(sub$b7)==T|sub$b7>60,0,1)
sub$d.eventfac<-factor(sub$d.event); levels(sub$d.eventfac)<-c("Alive at Age 5", "Dead by Age 5")
table(sub$d.eventfac)
##
## Alive at Age 5 Dead by Age 5
## 5335 633
#recodes
sub$male<-ifelse(sub$csex==1,1,0)
sub$educ.high<-ifelse(sub$educ %in% c(2,3), 1, 0)
sub$age2<-sub$age^2
sub$partnerhiedu<-ifelse(sub$partneredu<3,0,ifelse(sub$partneredu%in%c(8,9),NA,1 ))
sub$hises<-ifelse(sub$hhses>3, 1,0)
sub$rural<-ifelse(sub$rural==2,1,0 )
The distinction between the way we have been doing things and the discrete time model, is that we treat time discretely, versus continuously. This means that we transform the data from the case-duration data format to the person-period format. For this example, a natural choice would be year, since we have 5 intervals of equal length (12 months each). R provides a useful function called survSplit() in the survival library that will split a continuous duration into discrete periods.
#make person period file
pp<-survSplit(Surv(death.age, d.event)~. , data = sub, cut=seq(0,60,12), episode="year_age")
pp<-pp[order(pp$kidid, pp$year_age),]
head(pp[, c("kidid", "death.age", "d.event", "year_age", "male", "hises")], n=20)
## kidid death.age d.event year_age male hises
## 1 1 1 2-1 5.0001 0 2 0 0
## 2 1 1 2-2 12.0000 0 2 1 0
## 3 1 1 2-2 24.0000 0 3 1 0
## 4 1 1 2-2 36.0000 0 4 1 0
## 5 1 1 2-2 37.0001 0 5 1 0
## 6 1 3 2-1 12.0000 0 2 0 0
## 7 1 3 2-1 24.0000 0 3 0 0
## 8 1 3 2-1 30.0001 0 4 0 0
## 9 1 4 3-1 10.0001 0 2 0 0
## 10 1 4 3-2 12.0000 0 2 1 0
## 11 1 4 3-2 24.0000 0 3 1 0
## 12 1 4 3-2 30.0001 0 4 1 0
## 13 1 5 1-1 0.0001 1 2 1 0
## 14 1 5 1-2 12.0000 0 2 0 0
## 15 1 5 1-2 24.0000 0 3 0 0
## 16 1 5 1-2 34.0001 0 4 0 0
## 17 1 15 1-1 1.0001 1 2 0 0
## 18 1 15 1-2 12.0000 0 2 1 0
## 19 1 15 1-2 18.0001 0 3 1 0
## 20 1 15 1-3 3.0001 1 2 1 0
We see that each child is not in the data for multiple “risk periods”, until they experience the event (death) or age out of the risk set (year 6 in this case).
So, the best thing about the discrete time model, is that it’s just logistic regression. Each risk period is treated as a single Bernoulli trial, and the child can either fail (y=1) or not (y=0) in the period. This is how we get the hazard of the event, as the estimated probability of failure in each discrete time period. So, any method you would like to use to model this probability would probably work (logit, probit models), but I will show two standard approaches. We will use the complementary log-log link. This is used because it preserves the proportional hazards property of the model, as in the Cox model.
#generate survey design
des<-svydesign(ids=~psu, strata = ~strata , weights=~weight, data=pp)
#Fit the basic logistic model with ONLY time in the model
#I do -1 so that no intercept is fit in the model, and we get a hazard estimate for each time period
fit.0<-svyglm(d.event~as.factor(year_age)-1,design=des , family="binomial")
## Warning: non-integer #successes in a binomial glm!
summary(fit.0)
##
## Call:
## svyglm(formula = d.event ~ as.factor(year_age) - 1, design = des,
## family = "binomial")
##
## Survey design:
## svydesign(ids = ~psu, strata = ~strata, weights = ~weight, data = pp)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## as.factor(year_age)2 -2.41351 0.05901 -40.90 <2e-16 ***
## as.factor(year_age)3 -3.72379 0.13236 -28.13 <2e-16 ***
## as.factor(year_age)4 -4.28401 0.19830 -21.60 <2e-16 ***
## as.factor(year_age)5 -5.16879 0.33117 -15.61 <2e-16 ***
## as.factor(year_age)6 -4.63002 0.34620 -13.37 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1.00006)
##
## Number of Fisher Scoring iterations: 7
#Plot the hazard function on the probability scale
haz<-1/(1+exp(-coef(fit.0)))
time<-seq(1,5,1)
plot(haz~time, type="l", ylab="h(t)")
title(main="Discrete Time Hazard Function for Child Mortality")
#now we include a single predictor and examine the proportional hazards
fit.1<-svyglm(d.event~as.factor(year_age)+rural-1,design=des , family=binomial(link="cloglog"))
## Warning: non-integer #successes in a binomial glm!
summary(fit.1)
##
## Call:
## svyglm(formula = d.event ~ as.factor(year_age) + rural - 1, design = des,
## family = binomial(link = "cloglog"))
##
## Survey design:
## svydesign(ids = ~psu, strata = ~strata, weights = ~weight, data = pp)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## as.factor(year_age)2 -2.32949 0.09087 -25.634 <2e-16 ***
## as.factor(year_age)3 -3.60789 0.15709 -22.967 <2e-16 ***
## as.factor(year_age)4 -4.16113 0.20838 -19.969 <2e-16 ***
## as.factor(year_age)5 -5.04442 0.32888 -15.338 <2e-16 ***
## as.factor(year_age)6 -4.50825 0.35846 -12.577 <2e-16 ***
## rural -0.19695 0.10632 -1.852 0.0656 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1.002157)
##
## Number of Fisher Scoring iterations: 7
Which shows a lower risk of death for children of higher SES households. In fact, it’s a -27.7% lower risk
Next, I do the plots of the hazard functions the Singer and Willett way. I got this code from Singer and Willett’s example from Ch 11
t<-data.frame(cbind(y=1/(1+exp(-fit.1$linear.predictors)), year=pp$year_age,rural=pp$rural))
t0<-t[t$rural==0,]
t0<-unique(t0[order(t0$year),])
t1<-t[t$rural==1,]
t1<-unique(t1[order(t1$year),])
#Here we plot the hazards, I subtract .5 from each time so it looks like the hazard is at the midpoint of the year
plot(t0$year-.5, t0$y, type="b", ylab="Hazard", xlim=c(1,6), xlab="Year", col="red", main =c("Discrete time hazard model", "effect of Rural Residence"))
points(t1$year-.5, t1$y, type="b", col="blue")
legend("topright", legend=c("Urban", "Rural"), col=c("red", "blue"),lty=1)
But I like this way too, using expand.grid to make the various types of people, and using predict() get fitted values. Same thing, just more compact:
dat<-expand.grid(year_age=1:5, rural=c(0,1))
dat$fitted<-predict(fit.1, type = "response", newdata=dat)
plot(dat$year[dat$rural==0]-.5, dat$fitted[dat$rural==0], type="b", ylab="Hazard", xlim=c(0,5), xlab="Year", col="red", main =c("Discrete time hazard model", "effect of Rural Residence"))
points(dat$year[dat$rural==1]-.5, dat$fitted[dat$rural==1], type="b", col="blue")
legend("topright", legend=c("Urban", "Rural"), col=c("red", "blue"),lty=1)
See, same thing.
Interaction between covariate and time
fit.2<-svyglm(d.event~as.factor(year_age)*rural-1,design=des , family=binomial(link="cloglog"))
## Warning: non-integer #successes in a binomial glm!
summary(fit.2)
##
## Call:
## svyglm(formula = d.event ~ as.factor(year_age) * rural - 1, design = des,
## family = binomial(link = "cloglog"))
##
## Survey design:
## svydesign(ids = ~psu, strata = ~strata, weights = ~weight, data = pp)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## as.factor(year_age)2 -2.31691 0.09705 -23.873 < 2e-16 ***
## as.factor(year_age)3 -3.69673 0.28862 -12.808 < 2e-16 ***
## as.factor(year_age)4 -4.02818 0.34948 -11.526 < 2e-16 ***
## as.factor(year_age)5 -5.38882 0.85647 -6.292 2.30e-09 ***
## as.factor(year_age)6 -4.52065 0.61743 -7.322 7.75e-12 ***
## rural -0.21723 0.11920 -1.822 0.0701 .
## as.factor(year_age)3:rural 0.15855 0.34039 0.466 0.6419
## as.factor(year_age)4:rural -0.19743 0.43291 -0.456 0.6489
## as.factor(year_age)5:rural 0.52695 0.93078 0.566 0.5720
## as.factor(year_age)6:rural 0.04021 0.73653 0.055 0.9565
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1.00006)
##
## Number of Fisher Scoring iterations: 7
regTermTest(fit.2, ~as.factor(year_age):rural)
## Wald test for as.factor(year_age):rural
## in svyglm(formula = d.event ~ as.factor(year_age) * rural - 1, design = des,
## family = binomial(link = "cloglog"))
## F = 0.1956178 on 4 and 181 df: p= 0.94044
#No effect of the interaction
dat<-expand.grid(year_age=1:5, rural=c(0,1))
dat$fitted<-predict(fit.2, type = "response", newdata=dat)
#plot the interaction with time
plot(dat$year[dat$rural==0]-.5, dat$fitted[dat$rural==0], type="b", ylab="Hazard", xlim=c(0,6), xlab="Year", col="red", main =c("Discrete time hazard model", "effect of rural residence"))
points(dat$year[dat$rural==1]-.5, dat$fitted[dat$rural==1], type="b", col="blue")
legend("topright", legend=c("Urban", "Rural"), col=c("red", "blue"),lty=1)
I can’t see any difference.