This example will illustrate how to fit the discrete time hazard model to continuous duration data (i.e. person-level data) and a discrete-time (longitudinal) data set. In this example, I will use the event of a child dying before age 5 in Haiti. The data for this example come from the Haitian Demographic and Health Survey for 2012 birth recode file. This file contains information for all live births to women sampled in the survey.

The longitudinal data example uses data from the ECLS-K. Specifically, we will examine the transition into poverty between kindergarten and 8th grade.

#Load required libraries
library(foreign)
library(survival)

## Loading required package: splines

library(car)
library(survey)

## Loading required package: grid
## 
## Attaching package: 'survey'
## 
## The following object is masked from 'package:graphics':
## 
##     dotchart

#load the data
haiti<-read.dta("/Users/ozd504/Google Drive/dem7223/data/HTBR61FL.DTA", convert.factors = F)

#We form a subset of variables
sub<-data.frame(CASEID=haiti$caseid,kidid=paste(haiti$caseid, haiti$bidx, sep="-"), v008=haiti$v008,bord=haiti$bidx,csex=haiti$b4,b2=haiti$b2, b3=haiti$b3, b5=haiti$b5, b7=haiti$b7, ibint=haiti$b11, rural=haiti$v025, educ=haiti$v106,age=haiti$v012,partneredu=haiti$v701,partnerage=haiti$v730, hhses=haiti$v190, weight=haiti$v005/1000000, psu=haiti$v021, strata=haiti$v022)

sub$death.age<-ifelse(sub$b5==1,
                          ((((sub$v008))+1900)-(((sub$b3))+1900)) 
                          ,sub$b7)

#censoring indicator for death by age 5, in months (<=60 months)
sub$d.event<-ifelse(is.na(sub$b7)==T|sub$b7>60,0,1)
sub$d.eventfac<-factor(sub$d.event); levels(sub$d.eventfac)<-c("Alive at Age 5", "Dead by Age 5")
table(sub$d.eventfac)

## 
## Alive at Age 5  Dead by Age 5 
##          25981           3032

#recodes
sub$male<-ifelse(sub$csex==1,1,0)
sub$educ.high<-ifelse(sub$educ %in% c(2,3), 1, 0)
sub$age2<-sub$age^2
sub$partnerhiedu<-ifelse(sub$partneredu<3,0,ifelse(sub$partneredu%in%c(8,9),NA,1 ))
sub$hises<-ifelse(sub$hhses>3, 1,0)

Create the person-period file

The distinction between the way we have been doing things and the discrete time model, is that we treat time discretely, versus continuously. This means that we transform the data from the case-duration data format to the person-period format. For this example, a natural choice would be year, since we have 5 intervals of equal length (12 months each). R provides a useful function called survSplit() in the survival library that will split a continuous duration into discrete periods.

#make person period file
pp<-survSplit(sub, cut=seq(0,60,12), start="start", end="death.age", event="d.event", episode="year")
pp<-pp[order(pp$kidid, pp$year),]
head(pp[, c("kidid", "death.age", "d.event", "start", "year", "male", "hises")], n=20)

##                    kidid death.age d.event start year male hises
## 29014           1 2  4-1        12       0     0    1    0     1
## 58027           1 2  4-1        24       0    12    2    0     1
## 87040           1 2  4-1        36       0    24    3    0     1
## 116053          1 2  4-1        48       0    36    4    0     1
## 145066          1 2  4-1        60       0    48    5    0     1
## 174079          1 2  4-1       102       0    60    6    0     1
## 29015           1 2  4-2        12       0     0    1    1     1
## 58028           1 2  4-2        24       0    12    2    1     1
## 87041           1 2  4-2        36       0    24    3    1     1
## 116054          1 2  4-2        48       0    36    4    1     1
## 145067          1 2  4-2        60       0    48    5    1     1
## 174080          1 2  4-2       132       0    60    6    1     1
## 29016           1 2  4-3        12       0     0    1    1     1
## 58029           1 2  4-3        24       0    12    2    1     1
## 87042           1 2  4-3        36       0    24    3    1     1
## 116055          1 2  4-3        48       0    36    4    1     1
## 145068          1 2  4-3        60       0    48    5    1     1
## 174081          1 2  4-3       188       0    60    6    1     1
## 29017           1 3  1-1        12       0     0    1    0     1
## 58030           1 3  1-1        24       0    12    2    0     1

We see that each child is not in the data for multiple “risk periods”, until they experience the event (death) or age out of the risk set (year 6 in this case).

Discrete time model

So, the best thing about the discrete time model, is that it’s just logistic regression. Each risk period is treated as a single Bernoulli trial, and the child can either fail (y=1) or not (y=0) in the period. This is how we get the hazard of the event, as the estimated probability of failure in each discrete time period. So, any method you would like to use to model this probability would probably work (logit, probit models), but I will show two standard approaches. First, we will use the traditional logit link to the binomial distribution, then we will use the complementary log-log link. The latter is used because it preserves the proportional hazards property of the model, as in the Cox model.

#generate survey design

des<-svydesign(ids=~psu, strata = ~strata , weights=~weight, data=pp)

#Fit the basic logistic model with ONLY time in the model
#I do -1 so that no intercept is fit in the model, and we get a hazard estimate for each time period
fit.0<-svyglm(d.event~as.factor(year)-1,design=des , family="binomial")

## Warning: non-integer #successes in a binomial glm!

summary(fit.0)

## 
## Call:
## svyglm(formula = d.event ~ as.factor(year) - 1, design = des, 
##     family = "binomial")
## 
## Survey design:
## svydesign(ids = ~psu, strata = ~strata, weights = ~weight, data = pp)
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## as.factor(year)1  -2.9997     0.0521   -57.6   <2e-16 ***
## as.factor(year)2  -3.8042     0.0632   -60.2   <2e-16 ***
## as.factor(year)3  -5.0808     0.1084   -46.9   <2e-16 ***
## as.factor(year)4  -5.5318     0.1311   -42.2   <2e-16 ***
## as.factor(year)5  -5.5702     0.1438   -38.7   <2e-16 ***
## as.factor(year)6 -19.6395     0.0250  -786.1   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 0.8616)
## 
## Number of Fisher Scoring iterations: 18

#Plot the hazard function on the probability scale
haz<-1/(1+exp(-coef(fit.0)))
time<-seq(1,6,1)
plot(haz~time, type="l", ylab="h(t)", ylim=c(0,.05))
title(main="Discrete Time Hazard Function for Child Mortality")

plot of chunk models1

#now we include a single predictor and examine the proportional hazards

fit.1<-svyglm(d.event~as.factor(year)+hises-1,design=des , family="binomial")

## Warning: non-integer #successes in a binomial glm!

summary(fit.1)

## 
## Call:
## svyglm(formula = d.event ~ as.factor(year) + hises - 1, design = des, 
##     family = "binomial")
## 
## Survey design:
## svydesign(ids = ~psu, strata = ~strata, weights = ~weight, data = pp)
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## as.factor(year)1  -2.8997     0.0595  -48.73  < 2e-16 ***
## as.factor(year)2  -3.7045     0.0705  -52.56  < 2e-16 ***
## as.factor(year)3  -4.9804     0.1103  -45.14  < 2e-16 ***
## as.factor(year)4  -5.4315     0.1353  -40.14  < 2e-16 ***
## as.factor(year)5  -5.4706     0.1492  -36.68  < 2e-16 ***
## as.factor(year)6 -19.5398     0.0360 -543.13  < 2e-16 ***
## hises             -0.3243     0.0786   -4.13  4.5e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 0.8466)
## 
## Number of Fisher Scoring iterations: 18

Which shows a lower risk of death for children of higher SES households. In fact, it’s a -27.7% lower risk

Next, I do the plots of the hazard functions the Singer and Willett way. I got this code from Singer and Willett’s example from Ch 11

t<-data.frame(cbind(y=1/(1+exp(-fit.1$linear.predictors)), year=pp$year,hises=pp$hises))
t0<-t[t$hises==0,]
t0<-unique(t0[order(t0$year),])
t1<-t[t$hises==1,]
t1<-unique(t1[order(t1$year),])

#Here we plot the hazards, I subtract .5 from each time so it looks like the hazard is at the midpoint of the year
plot(t0$year-.5, t0$y, type="b", ylab="Hazard", ylim=c(0, max(t0$y, t1$y)), xlim=c(0,6), xlab="Year", col="red", main =c("Discrete time hazard model", "effect of Household SES"))
points(t1$year-.5, t1$y, type="b", col="blue")
legend("topright", legend=c("LOW SES", "HIGH SES"), col=c("red", "blue"),lty=1)

plot of chunk unnamed-chunk-2

But I like this way too, using expand.grid to make the various types of people, and using predict() get fitted values. Same thing, just more compact:

dat<-expand.grid(year=1:6, hises=c(0,1))
dat$fitted<-predict(fit.1, type = "response", newdata=dat)

plot(dat$year[dat$hises==0]-.5, dat$fitted[dat$hises==0], type="b", ylab="Hazard", ylim=c(0, max(t0$y, t1$y)), xlim=c(0,6), xlab="Year", col="red", main =c("Discrete time hazard model", "effect of Household SES"))
points(dat$year[dat$hises==1]-.5, dat$fitted[dat$hises==1], type="b", col="blue")
legend("topright", legend=c("LOW SES", "HIGH SES"), col=c("red", "blue"),lty=1)

plot of chunk unnamed-chunk-3

See, same thing.

Interaction between covariate and time

fit.2<-svyglm(d.event~as.factor(year)*hises-1,design=des , family="binomial")

## Warning: non-integer #successes in a binomial glm!

summary(fit.2)

## 
## Call:
## svyglm(formula = d.event ~ as.factor(year) * hises - 1, design = des, 
##     family = "binomial")
## 
## Survey design:
## svydesign(ids = ~psu, strata = ~strata, weights = ~weight, data = pp)
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## as.factor(year)1        -2.9747     0.0653  -45.58  < 2e-16 ***
## as.factor(year)2        -3.6048     0.0700  -51.52  < 2e-16 ***
## as.factor(year)3        -4.8656     0.1110  -43.84  < 2e-16 ***
## as.factor(year)4        -5.3599     0.1418  -37.80  < 2e-16 ***
## as.factor(year)5        -5.3262     0.1558  -34.19  < 2e-16 ***
## as.factor(year)6       -19.6020     0.0350 -559.34  < 2e-16 ***
## hises                   -0.0750     0.0927   -0.81   0.4191    
## as.factor(year)2:hises  -0.6699     0.1660   -4.04  6.5e-05 ***
## as.factor(year)3:hises  -0.7473     0.2834   -2.64   0.0087 ** 
## as.factor(year)4:hises  -0.5375     0.3641   -1.48   0.1406    
## as.factor(year)5:hises  -0.9251     0.3820   -2.42   0.0159 *  
## as.factor(year)6:hises  -0.0421     0.1167   -0.36   0.7187    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 0.8616)
## 
## Number of Fisher Scoring iterations: 18

dat<-expand.grid(year=1:6, hises=c(0,1))
dat$fitted<-predict(fit.2, type = "response", newdata=dat)
#plot the interaction with time
plot(dat$year[dat$hises==0]-.5, dat$fitted[dat$hises==0], type="b", ylab="Hazard", ylim=c(0, max(t0$y, t1$y)), xlim=c(0,6), xlab="Year", col="red", main =c("Discrete time hazard model", "effect of Household SES"))
points(dat$year[dat$hises==1]-.5, dat$fitted[dat$hises==1], type="b", col="blue")
legend("topright", legend=c("LOW SES", "HIGH SES"), col=c("red", "blue"),lty=1)

plot of chunk unnamed-chunk-4

Which shows the bigger gap in the hazard for high SES households in periods 2 &3 (and even 5, although it’s hard to see), as per the output from the model.

log-log link function

I promised I would show this too, nothing heavy, but the log-log link function preserves the proportional hazards property:

fit.3<-svyglm(d.event~as.factor(year)*hises-1,design=des , family=binomial(link="cloglog"))

## Warning: non-integer #successes in a binomial glm!

summary(fit.3)

## 
## Call:
## svyglm(formula = d.event ~ as.factor(year) * hises - 1, design = des, 
##     family = binomial(link = "cloglog"))
## 
## Survey design:
## svydesign(ids = ~psu, strata = ~strata, weights = ~weight, data = pp)
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## as.factor(year)1        -2.9997     0.0637  -47.11  < 2e-16 ***
## as.factor(year)2        -3.6182     0.0690  -52.41  < 2e-16 ***
## as.factor(year)3        -4.8695     0.1106  -44.05  < 2e-16 ***
## as.factor(year)4        -5.3623     0.1415  -37.90  < 2e-16 ***
## as.factor(year)5        -5.3286     0.1554  -34.28  < 2e-16 ***
## as.factor(year)6       -20.4826     0.0350 -584.46  < 2e-16 ***
## hises                   -0.0732     0.0905   -0.81   0.4191    
## as.factor(year)2:hises  -0.6647     0.1641   -4.05  6.1e-05 ***
## as.factor(year)3:hises  -0.7469     0.2825   -2.64   0.0085 ** 
## as.factor(year)4:hises  -0.5382     0.3630   -1.48   0.1389    
## as.factor(year)5:hises  -0.9253     0.3812   -2.43   0.0156 *  
## as.factor(year)6:hises  -0.0673     0.1149   -0.59   0.5583    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 0.8616)
## 
## Number of Fisher Scoring iterations: 19

dat2<-expand.grid(year=1:6, hises=c(0,1))
dat2$fitted<-predict(fit.3, type = "response", newdata=dat)
#plot the interaction with time
plot(dat$year[dat$hises==0]-.5, dat$fitted[dat$hises==0], type="b", ylab="Hazard", ylim=c(0, max(t0$y, t1$y)), xlim=c(0,6), xlab="Year", col="red", main =c("Discrete time hazard model", "effect of Household SES"))
points(dat$year[dat$hises==1]-.5, dat$fitted[dat$hises==1], type="b", col="blue")
legend("topright", legend=c("LOW SES", "HIGH SES"), col=c("red", "blue"),lty=1)

plot of chunk unnamed-chunk-5 I can’t see any difference.

Using Longitudinal Data

As in the other examples, I illustrate fitting these models to data that are longitudinal, instead of person-duration. In this example, we will examine how to fit the Cox model to a longitudinally collected data set.

load("~/Google Drive/dem7903_App_Hier/data/eclsk.Rdata")
names(eclsk)<-tolower(names(eclsk))
#get out only the variables I'm going to use for this example
myvars<-c( "childid","gender", "race", "r1_kage","r4age", "r5age", "r6age", "r7age","c1r4mtsc", "c4r4mtsc", "c5r4mtsc", "c6r4mtsc", "c7r4mtsc", "w1povrty","w1povrty","w3povrty", "w5povrty", "w8povrty","wkmomed", "s2_id", "c1_5fp0", "c15fpstr", "c15fppsu")
eclsk<-eclsk[,myvars]


eclsk$age1<-ifelse(eclsk$r1_kage==-9, NA, eclsk$r1_kage/12)
eclsk$age2<-ifelse(eclsk$r4age==-9, NA, eclsk$r4age/12)
#for the later waves, the NCES group the ages into ranges of months, so 1= <105 months, 2=105 to 108 months. So, I fix the age at the midpoint of the interval they give, and make it into years by dividing by 12
eclsk$age3<-recode(eclsk$r5age,recodes="1=105; 2=107; 3=109; 4=112; 5=115; 6=117; -9=NA")/12

eclsk$pov1<-ifelse(eclsk$w1povrty==1,1,0)
eclsk$pov2<-ifelse(eclsk$w3povrty==1,1,0)
eclsk$pov3<-ifelse(eclsk$w5povrty==1,1,0)

#Recode race with white, non Hispanic as reference using dummy vars
eclsk$hisp<-recode (eclsk$race, recodes="3:4=1;-9=NA; else=0")
eclsk$black<-recode (eclsk$race, recodes="2=1;-9=NA; else=0")
eclsk$asian<-recode (eclsk$race, recodes="5=1;-9=NA; else=0")
eclsk$nahn<-recode (eclsk$race, recodes="6:7=1;-9=NA; else=0")
eclsk$other<-recode (eclsk$race, recodes="8=1;-9=NA; else=0")
eclsk$race_gr<-recode(eclsk$race, recodes="3:4='hisp'; 2='nh black'; 5='nh asian'; 6:7='nahn'; 8='other'; 1='nh white'; else=NA", as.factor.result = T)
eclsk$race_gr<-relevel(eclsk$race_gr, ref = 'nh white')
eclsk$male<-recode(eclsk$gender, recodes="1=1; 2=0; -9=NA")
eclsk$mlths<-recode(eclsk$wkmomed, recodes = "1:2=1; 3:9=0; else = NA")
eclsk$mgths<-recode(eclsk$wkmomed, recodes = "1:3=0; 4:9=1; else =NA")

Now, I need to form the transition variable, this is my event variable, and in this case it will be 1 if a child enters poverty between the first wave of the data and the third grade wave, and 0 otherwise. NOTE I need to remove any children who are already in poverty age wave 1, because they are not at risk of experiencing this particular transition.

eclsk<-subset(eclsk, is.na(pov1)==F&is.na(pov2)==F&is.na(pov3)==F&is.na(age1)==F&is.na(age2)==F&is.na(age3)==F&pov1!=1&is.na(eclsk$c15fpstr)==F)
eclsk$povtran1<-ifelse(eclsk$pov1==0&eclsk$pov2==0, 0,1)
eclsk$povtran2<-ifelse(eclsk$povtran1==1, NA,ifelse(eclsk$pov2==0&eclsk$pov3==0,0,1))

Just as before, we reshape the data from the current “wide” format (repeated measures in columns) to a “long” format (repeated observations in rows). The reshape() function allows us to do this easily. It allows us to specify our repeated measures, time varying covariates as well as time-constant covariates.

e.long<-reshape(eclsk, idvar="childid", varying=list(age=c("age1","age2"), age2=c("age2", "age3"), povtran=c("povtran1", "povtran2")), times=1:2, direction="long" , drop = names(eclsk)[4:20])
e.long<-e.long[order(e.long$childid, e.long$time),]

#find which kids failed in the first time period and remove them from the second risk period risk set
failed1<-which(is.na(e.long$povtran1)==T)
e.long<-e.long[-failed1,]
e.long$age1r<-round(e.long$age1, 0)
e.long$age2r<-round(e.long$age2, 0)
head(e.long, n=10)

##             childid gender race c1_5fp0 c15fpstr c15fppsu pov1 pov2 pov3
## 0001002C.1 0001002C      2    1   352.4       63        1    0    0    0
## 0001002C.2 0001002C      2    1   352.4       63        1    0    0    0
## 0001007C.1 0001007C      1    1   359.0       63        1    0    0    0
## 0001007C.2 0001007C      1    1   359.0       63        1    0    0    0
## 0001010C.1 0001010C      2    1   352.4       63        1    0    0    0
## 0001010C.2 0001010C      2    1   352.4       63        1    0    0    0
## 0002004C.1 0002004C      2    1   228.1       63        1    0    0    0
## 0002004C.2 0002004C      2    1   228.1       63        1    0    0    0
## 0002006C.1 0002006C      2    1   287.2       63        1    0    1    1
## 0002008C.1 0002008C      2    1   259.2       63        1    0    0    0
##            hisp black asian nahn other  race_gr male mlths mgths time
## 0001002C.1    0     0     0    0     0 nh white    0     0     0    1
## 0001002C.2    0     0     0    0     0 nh white    0     0     0    2
## 0001007C.1    0     0     0    0     0 nh white    1     0     1    1
## 0001007C.2    0     0     0    0     0 nh white    1     0     1    2
## 0001010C.1    0     0     0    0     0 nh white    0     0     1    1
## 0001010C.2    0     0     0    0     0 nh white    0     0     1    2
## 0002004C.1    0     0     0    0     0 nh white    0     0     1    1
## 0002004C.2    0     0     0    0     0 nh white    0     0     1    2
## 0002006C.1    0     0     0    0     0 nh white    0    NA    NA    1
## 0002008C.1    0     0     0    0     0 nh white    0     0     1    1
##             age1  age2 povtran1 age1r age2r
## 0001002C.1 6.433 7.894        0     6     8
## 0001002C.2 7.894 9.750        0     8    10
## 0001007C.1 5.300 6.825        0     5     7
## 0001007C.2 6.825 8.917        0     7     9
## 0001010C.1 5.886 7.386        0     6     7
## 0001010C.2 7.386 9.333        0     7     9
## 0002004C.1 5.450 6.978        0     5     7
## 0002004C.2 6.978 9.083        0     7     9
## 0002006C.1 5.158 6.686        1     5     7
## 0002008C.1 5.906 7.433        0     6     7

Now we fit the discrete time model using full survey design. In the ECLS-K, I use the longitudinal weight for waves 1-5, as well as the associated psu and strata id’s for the longitudinal data from these waves from the parents of the child, since no data from the child themselves are used in the outcome.

des2<-svydesign(ids = ~c15fppsu, strata = ~c15fpstr, weights=~c1_5fp0, data=e.long[complete.cases(e.long),], nest=T)

#Fit the model
fitl1<-svyglm(povtran1~as.factor(time)+mlths+mgths+race_gr-1, design=des2, family=binomial)

## Warning: non-integer #successes in a binomial glm!

summary(fitl1)

## 
## Call:
## svyglm(formula = povtran1 ~ as.factor(time) + mlths + mgths + 
##     race_gr - 1, design = des2, family = binomial)
## 
## Survey design:
## svydesign(ids = ~c15fppsu, strata = ~c15fpstr, weights = ~c1_5fp0, 
##     data = e.long[complete.cases(e.long), ], nest = T)
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## as.factor(time)1   -2.601      0.134  -19.44  < 2e-16 ***
## as.factor(time)2   -3.088      0.125  -24.79  < 2e-16 ***
## mlths               0.910      0.140    6.51  2.3e-10 ***
## mgths              -1.029      0.129   -7.97  1.7e-14 ***
## race_grhisp         0.878      0.123    7.16  3.9e-12 ***
## race_grnahn         1.304      0.391    3.33  0.00094 ***
## race_grnh asian     0.196      0.331    0.59  0.55375    
## race_grnh black     1.461      0.175    8.36  1.0e-15 ***
## race_grother        0.940      0.319    2.94  0.00344 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 0.9786)
## 
## Number of Fisher Scoring iterations: 6

dat3<-expand.grid(time=c(1,2), mlths=c(0,1), mgths=c(0,1), race_gr=levels(des2$variables$race_gr))
#unfortunately, expand.grid makes some unrealistic cases sometimes, get rid of those.
rem<-which(apply(dat3[,2:3],1,sum)>1)
dat3<-dat3[-rem,]

dat3$fitted<-predict(fitl1, dat3, type="response")
dat3

##    time mlths mgths  race_gr  fitted
## 1     1     0     0 nh white 0.06910
## 2     2     0     0 nh white 0.04361
## 3     1     1     0 nh white 0.15573
## 4     2     1     0 nh white 0.10178
## 5     1     0     1 nh white 0.02585
## 6     2     0     1 nh white 0.01604
## 9     1     0     0     hisp 0.15147
## 10    2     0     0     hisp 0.09883
## 11    1     1     0     hisp 0.30728
## 12    2     1     0     hisp 0.21415
## 13    1     0     1     hisp 0.05998
## 14    2     0     1     hisp 0.03772
## 17    1     0     0     nahn 0.21476
## 18    2     0     0     nahn 0.14385
## 19    1     1     0     nahn 0.40464
## 20    2     1     0     nahn 0.29455
## 21    1     0     1     nahn 0.08906
## 22    2     0     1     nahn 0.05666
## 25    1     0     0 nh asian 0.08284
## 26    2     0     0 nh asian 0.05257
## 27    1     1     0 nh asian 0.18331
## 28    2     1     0 nh asian 0.12118
## 29    1     0     1 nh asian 0.03128
## 30    2     0     1 nh asian 0.01945
## 33    1     0     0 nh black 0.24231
## 34    2     0     0 nh black 0.16421
## 35    1     1     0 nh black 0.44281
## 36    2     1     0 nh black 0.32806
## 37    1     0     1 nh black 0.10259
## 38    2     0     1 nh black 0.06562
## 41    1     0     0    other 0.15961
## 42    2     0     0    other 0.10449
## 43    1     1     0    other 0.32063
## 44    2     1     0    other 0.22477
## 45    1     0     1    other 0.06358
## 46    2     0     1    other 0.04004

Do some plots, these aren’t very cool, since there is only 2 time periods.

plot(fitted~time, dat3[dat3$mlths==0&dat3$mgths==0&dat3$race_gr=="nh white",], type="l",
     ylim=c(min(dat3$fitted), .6),
     ylab="h(t)", xlab="Wave", lwd=3)
lines(fitted~time, dat3[dat3$mlths==1&dat3$mgths==0&dat3$race_gr=="nh white",], col=1, lty=2, lwd=3)
lines(fitted~time, dat3[dat3$mlths==0&dat3$mgths==0&dat3$race_gr=="nh black",], col=2, lwd=3)
lines(fitted~time, dat3[dat3$mlths==1&dat3$mgths==0&dat3$race_gr=="nh black",], col=2, lty=2, lwd=3)
lines(fitted~time, dat3[dat3$mlths==0&dat3$mgths==0&dat3$race_gr=="hisp",], col=3, lwd=3)
lines(fitted~time, dat3[dat3$mlths==1&dat3$mgths==0&dat3$race_gr=="hisp",], col=3, lty=2, lwd=3)
lines(fitted~time, dat3[dat3$mlths==0&dat3$mgths==0&dat3$race_gr=="nahn",], col=4, lwd=3)
lines(fitted~time, dat3[dat3$mlths==1&dat3$mgths==0&dat3$race_gr=="nahn",], col=4, lty=2, lwd=3)
lines(fitted~time, dat3[dat3$mlths==0&dat3$mgths==0&dat3$race_gr=="nh asian",], col=5, lwd=3)
lines(fitted~time, dat3[dat3$mlths==1&dat3$mgths==0&dat3$race_gr=="nh asian",], col=5, lty=2, lwd=3)

legend("topright", legend=c("NH White, HS Edu", "NH White, LT HS Edu", "NH Black, HS Edu", "NH Black, LT HS Edu","Hisp, HS Edu", "Hisp, LT HS Edu","NAHN, HS Edu", "NAHN, LT HS Edu","NH Asian, HS Edu", "NH Asian, LT HS Edu"), col=c(1,1,2,2,3,3,4,4,5,5), lty=rep(c(1,2), 5), cex=.75)
title(main="Hazard of Transitioning into Poverty")

plot of chunk unnamed-chunk-7

Event History Analysis - Example 6 - Discrete Time Hazard Model

coreysparks

February 25, 2015

Create the person-period file

Discrete time model

log-log link function

Using Longitudinal Data