This example will illustrate how to fit the discrete time hazard model with group-level frailty to continuous duration data (i.e. person-level data) and a discrete-time (longitudinal) data set. In this example, I will use the event of a child dying before age 5 in Haiti. The data for this example come from the Haitian Demographic and Health Survey for 2012 birth recode file. This file contains information for all live births to women sampled in the survey.
The longitudinal data example uses data from the ECLS-K. Specifically, we will examine the transition into poverty between kindergarten and 8th grade.
#Load required libraries
library(foreign)
library(survival)
library(car)
library(survey)
library(lme4)
library(arm)
#load the data
haiti<-read.dta("/Users/ozd504/Google Drive/dem7223/data/HTBR61FL.DTA", convert.factors = F)
#We form a subset of variables
sub<-data.frame(CASEID=haiti$caseid,kidid=paste(haiti$caseid, haiti$bidx, sep="-"),v008=haiti$v008,bord=haiti$bidx,csex=haiti$b4,b2=haiti$b2, b3=haiti$b3, b5=haiti$b5, b7=haiti$b7, ibint=haiti$b11, rural=haiti$v025, educ=haiti$v106,age=haiti$v012,partneredu=haiti$v701,partnerage=haiti$v730, hhses=haiti$v190, weight=haiti$v005/1000000, psu=haiti$v021, strata=haiti$v022, region=haiti$v023)
sub$death.age<-ifelse(sub$b5==1,
((((sub$v008))+1900)-(((sub$b3))+1900))
,sub$b7)
#censoring indicator for death by age 5, in months (<=60 months)
sub$d.event<-ifelse(is.na(sub$b7)==T|sub$b7>60,0,1)
sub$d.eventfac<-factor(sub$d.event); levels(sub$d.eventfac)<-c("Alive at Age 5", "Dead by Age 5")
table(sub$d.eventfac)
##
## Alive at Age 5 Dead by Age 5
## 25981 3032
#recodes
sub$male<-ifelse(sub$csex==1,1,0)
sub$educ.high<-ifelse(sub$educ %in% c(2,3), 1, 0)
sub$age2<-sub$age^2
sub$partnerhiedu<-ifelse(sub$partneredu<3,0,ifelse(sub$partneredu%in%c(8,9),NA,1 ))
sub$hises<-ifelse(sub$hhses>3, 1,0)
The distinction between the way we have been doing things and the discrete time model, is that we treat time discretely, versus continuously. This means that we transform the data from the case-duration data format to the person-period format. For this example, a natural choice would be year, since we have 5 intervals of equal length (12 months each). R provides a useful function called survSplit()
in the survival
library that will split a continuous duration into discrete periods.
#make person period file
pp<-survSplit(sub, cut=seq(0,60,12), start="start", end="death.age", event="d.event", episode="year")
pp<-pp[order(pp$kidid, pp$year),]
head(pp[, c("kidid", "death.age", "d.event", "start", "year", "male", "hises")], n=20)
## kidid death.age d.event start year male hises
## 29014 1 2 4-1 12 0 0 1 0 1
## 58027 1 2 4-1 24 0 12 2 0 1
## 87040 1 2 4-1 36 0 24 3 0 1
## 116053 1 2 4-1 48 0 36 4 0 1
## 145066 1 2 4-1 60 0 48 5 0 1
## 174079 1 2 4-1 102 0 60 6 0 1
## 29015 1 2 4-2 12 0 0 1 1 1
## 58028 1 2 4-2 24 0 12 2 1 1
## 87041 1 2 4-2 36 0 24 3 1 1
## 116054 1 2 4-2 48 0 36 4 1 1
## 145067 1 2 4-2 60 0 48 5 1 1
## 174080 1 2 4-2 132 0 60 6 1 1
## 29016 1 2 4-3 12 0 0 1 1 1
## 58029 1 2 4-3 24 0 12 2 1 1
## 87042 1 2 4-3 36 0 24 3 1 1
## 116055 1 2 4-3 48 0 36 4 1 1
## 145068 1 2 4-3 60 0 48 5 1 1
## 174081 1 2 4-3 188 0 60 6 1 1
## 29017 1 3 1-1 12 0 0 1 0 1
## 58030 1 3 1-1 24 0 12 2 0 1
library(maptools)
## Loading required package: sp
## Checking rgeos availability: TRUE
htshp<-readShapePoly("~/Google Drive/dem7223/data/sdr_subnational_data_2015-03-18/shps/sdr_subnational_data_dhs_2012.shp")
plot(htshp)
text(getSpPPolygonsLabptSlots(htshp), labels=as.character(htshp$DHSREGFR), cex=0.6)
## Warning: use coordinates method
#Since the region variable in the shapefile doesn't include two of the areas, I need to make them an ID number:
htshp$reg_merge<-ifelse(htshp$DHSREGFR=="Aire Métropolitaine", 1, ifelse(htshp$DHSREGFR=="Reste-Ouest",2, htshp$REGCODE+1))
htshp$reg_merge
## [1] 3 4 5 6 7 8 9 10 11 2 1
htshp$z_net<-scale(htshp$iC6D4E13)
htshp$z_edu<-scale(htshp$iA999FA)
pp2<-merge(pp, htshp@data, by.x="region", by.y="reg_merge")
We see that each child is not in the data for multiple “risk periods”, until they experience the event (death) or age out of the risk set (year 6 in this case).
As in the other examples, I illustrate fitting these models to data that are longitudinal, instead of person-duration. In this example, we will examine how to fit the discrete time hazard model with frailty to a longitudinally collected data set.
First we load our data and do some recodes and variable construction
load("~/Google Drive/dem7903_App_Hier/data/eclsk.Rdata")
names(eclsk)<-tolower(names(eclsk))
#get out only the variables I'm going to use for this example
myvars<-c( "childid","gender", "race", "r1_kage","r4age", "r5age", "r6age", "r7age","c1r4mtsc", "c4r4mtsc", "c5r4mtsc", "c6r4mtsc", "c7r4mtsc", "w1povrty","w1povrty","w3povrty", "w5povrty", "w8povrty","wkmomed", "s2_id", "c1_5fp0", "c15fpstr", "c15fppsu")
eclsk<-eclsk[,myvars]
eclsk$age1<-ifelse(eclsk$r1_kage==-9, NA, eclsk$r1_kage/12)
eclsk$age2<-ifelse(eclsk$r4age==-9, NA, eclsk$r4age/12)
#for the later waves, the NCES group the ages into ranges of months, so 1= <105 months, 2=105 to 108 months. So, I fix the age at the midpoint of the interval they give, and make it into years by dividing by 12
eclsk$age3<-recode(eclsk$r5age,recodes="1=105; 2=107; 3=109; 4=112; 5=115; 6=117; -9=NA")/12
eclsk$pov1<-ifelse(eclsk$w1povrty==1,1,0)
eclsk$pov2<-ifelse(eclsk$w3povrty==1,1,0)
eclsk$pov3<-ifelse(eclsk$w5povrty==1,1,0)
#Recode race with white, non Hispanic as reference using dummy vars
eclsk$hisp<-recode (eclsk$race, recodes="3:4=1;-9=NA; else=0")
eclsk$black<-recode (eclsk$race, recodes="2=1;-9=NA; else=0")
eclsk$asian<-recode (eclsk$race, recodes="5=1;-9=NA; else=0")
eclsk$nahn<-recode (eclsk$race, recodes="6:7=1;-9=NA; else=0")
eclsk$other<-recode (eclsk$race, recodes="8=1;-9=NA; else=0")
eclsk$male<-recode(eclsk$gender, recodes="1=1; 2=0; -9=NA")
eclsk$mlths<-recode(eclsk$wkmomed, recodes = "1:2=1; 3:9=0; else = NA")
eclsk$mgths<-recode(eclsk$wkmomed, recodes = "1:3=0; 4:9=1; else =NA")
Now, I need to form the transition variable, this is my event variable, and in this case it will be 1 if a child enters poverty between the first wave of the data and the third grade wave, and 0 otherwise. NOTE I need to remove any children who are already in poverty age wave 1, because they are not at risk of experiencing this particular transition.
eclsk<-subset(eclsk, is.na(pov1)==F&is.na(pov2)==F&is.na(pov3)==F&is.na(age1)==F&is.na(age2)==F&is.na(age3)==F&pov1!=1&is.na(eclsk$c15fpstr)==F)
eclsk$povtran1<-ifelse(eclsk$pov1==0&eclsk$pov2==0, 0,1)
eclsk$povtran2<-ifelse(eclsk$povtran1==1, NA,ifelse(eclsk$pov2==0&eclsk$pov3==0,0,1))
We can rescale the survey weights for use in multilevel models follwing the presentation in Carle, 2009.
#make an id that is the combination of strata and psu
eclsk$sampleid<-paste(eclsk$c15fpstr, eclsk$c15fppsu)
#within each sampling unit, sum the weights
wts<-tapply(eclsk$c1_5fp0,eclsk$sampleid,sum)
#make a data frame from this
wts<-data.frame(id=names(unlist(wts)), wt=unlist(wts))
#get the unique sampling location ids'
t1<-as.data.frame(table(eclsk$sampleid))
#put all of this into a data set
wts2<-data.frame(ids=wts$id, sumwt=wts$wt, jn=t1$Freq)
#merge all of this back to the original data file
eclsk2<-merge(eclsk, wts2, by.x="sampleid", by.y="ids", all.x=T)
#In the new data set, multiply the original weight by the fraction of the
#sampling unit total population each person represents
eclsk2$swts<-eclsk2$c1_5fp0*(eclsk2$jn/eclsk2$sumwt)
Now we do the entire data set. To analyze data longitudinally, we need to reshape the data from the current “wide” format (repeated measures in columns) to a “long” format (repeated observations in rows). The reshape()
function allows us to do this easily. It allows us to specify our repeated measures, time varying covariates as well as time-constant covariates.
e.long<-reshape(eclsk2, idvar="childid", varying=list(age=c("age1","age2"), age2=c("age2", "age3"), povtran=c("povtran1", "povtran2")), times=1:2, direction="long" , drop = names(eclsk)[4:19])
e.long<-e.long[order(e.long$childid, e.long$time),]
#find which kids failed in the first time period and remove them from the second risk period risk set
failed1<-which(is.na(e.long$povtran1)==T)
e.long<-e.long[-failed1,]
e.long$age1r<-round(e.long$age1, 0)
e.long$age2r<-round(e.long$age2, 0)
head(e.long, n=10)
## sampleid childid gender race s2_id c1_5fp0 c15fpstr c15fppsu
## 0001002C.1 63 1 0001002C 2 1 0001 352.35 63 1
## 0001002C.2 63 1 0001002C 2 1 0001 352.35 63 1
## 0001007C.1 63 1 0001007C 1 1 0001 358.99 63 1
## 0001007C.2 63 1 0001007C 1 1 0001 358.99 63 1
## 0001010C.1 63 1 0001010C 2 1 0001 352.35 63 1
## 0001010C.2 63 1 0001010C 2 1 0001 352.35 63 1
## 0002004C.1 63 1 0002004C 2 1 0002 228.05 63 1
## 0002004C.2 63 1 0002004C 2 1 0002 228.05 63 1
## 0002006C.1 63 1 0002006C 2 1 0002 287.25 63 1
## 0002008C.1 63 1 0002008C 2 1 0002 259.19 63 1
## pov1 pov2 pov3 hisp black asian nahn other male mlths mgths
## 0001002C.1 0 0 0 0 0 0 0 0 0 0 0
## 0001002C.2 0 0 0 0 0 0 0 0 0 0 0
## 0001007C.1 0 0 0 0 0 0 0 0 1 0 1
## 0001007C.2 0 0 0 0 0 0 0 0 1 0 1
## 0001010C.1 0 0 0 0 0 0 0 0 0 0 1
## 0001010C.2 0 0 0 0 0 0 0 0 0 0 1
## 0002004C.1 0 0 0 0 0 0 0 0 0 0 1
## 0002004C.2 0 0 0 0 0 0 0 0 0 0 1
## 0002006C.1 0 1 1 0 0 0 0 0 0 NA NA
## 0002008C.1 0 0 0 0 0 0 0 0 0 0 1
## sumwt jn swts time age1 age2 povtran1 age1r
## 0001002C.1 21844.73 72 1.1613419 1 6.433333 7.894167 0 6
## 0001002C.2 21844.73 72 1.1613419 2 7.894167 9.750000 0 8
## 0001007C.1 21844.73 72 1.1832273 1 5.300000 6.825000 0 5
## 0001007C.2 21844.73 72 1.1832273 2 6.825000 8.916667 0 7
## 0001010C.1 21844.73 72 1.1613419 1 5.885833 7.385833 0 6
## 0001010C.2 21844.73 72 1.1613419 2 7.385833 9.333333 0 7
## 0002004C.1 21844.73 72 0.7516504 1 5.450000 6.977500 0 5
## 0002004C.2 21844.73 72 0.7516504 2 6.977500 9.083333 0 7
## 0002006C.1 21844.73 72 0.9467730 1 5.158333 6.685833 1 5
## 0002008C.1 21844.73 72 0.8542875 1 5.905833 7.433333 0 6
## age2r
## 0001002C.1 8
## 0001002C.2 10
## 0001007C.1 7
## 0001007C.2 9
## 0001010C.1 7
## 0001010C.2 9
## 0002004C.1 7
## 0002004C.2 9
## 0002006C.1 7
## 0002008C.1 7
Now we fit the discrete time model and doing fraily by the school identifier. I use the weights calculated above, standardized to the within cluster sample size.
#Fit the model
fitl1<-glmer(povtran1~time+mlths+mgths+black+hisp+other+nahn+(1|s2_id),family=binomial,weights=swts, e.long)
## Warning: non-integer #successes in a binomial glm!
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl =
## control$checkConv, : Model failed to converge with max|grad| = 0.00240573
## (tol = 0.001, component 4)
display(fitl1, detail=T)
## glmer(formula = povtran1 ~ time + mlths + mgths + black + hisp +
## other + nahn + (1 | s2_id), data = e.long, family = binomial,
## weights = swts)
## coef.est coef.se z value Pr(>|z|)
## (Intercept) -2.78 0.17 -16.63 0.00
## time -0.36 0.09 -4.13 0.00
## mlths 0.95 0.13 7.19 0.00
## mgths -0.88 0.10 -8.64 0.00
## black 1.43 0.15 9.55 0.00
## hisp 0.89 0.13 7.03 0.00
## other 0.68 0.29 2.34 0.02
## nahn 1.35 0.24 5.56 0.00
##
## Error terms:
## Groups Name Std.Dev.
## s2_id (Intercept) 1.04
## Residual 1.00
## ---
## number of obs: 12976, groups: s2_id, 898
## AIC = 4640.1, DIC = 4622.1
## deviance = 4622.1