This example will illustrate how to fit parametric the Cox Proportional hazards model to continuous duration data (i.e. person-level data) and a discrete-time (longitudinal) data set.

The first example uses longitudinal data from the ECLS-K. Specifically, we will examine the transition into poverty between kindergarten and third grade.

In the second example, I use the time between the first and second birth for women in the data as the outcome variable. The data for this example come from the DHS Model data file Demographic and Health Survey for 2012 individual recode file. This file contains information for all women sampled in the survey between the ages of 15 and 49.

Using Longitudinal Data

As in the other examples, I illustrate fitting these models to data that are longitudinal, instead of person-duration. In this example, we will examine how to fit the Cox model to a longitudinally collected data set.

First we load our data

#Load required libraries
library(foreign)
library(survival)
library(car)
library(survey)
library(eha)


load("C:/Users/ozd504/Google Drive/classes/dem7223/class17/data/eclsk.Rdata")
names(eclsk)<-tolower(names(eclsk))
#get out only the variables I'm going to use for this example
myvars<-c( "childid","gender", "race", "r1_kage","r4age", "r5age", "r6age", "r7age","c1r4mtsc", "c4r4mtsc", "c5r4mtsc", "c6r4mtsc", "c7r4mtsc", "w1povrty","w1povrty","w3povrty", "w5povrty", "w8povrty","wkmomed", "s2_id", "c15fppsu", "c15fpstr", "c1_5fp0")
eclsk<-eclsk[,myvars]


eclsk$age1<-ifelse(eclsk$r1_kage==-9, NA, eclsk$r1_kage/12)
eclsk$age2<-ifelse(eclsk$r4age==-9, NA, eclsk$r4age/12)
#for the later waves, the NCES group the ages into ranges of months, so 1= <105 months, 2=105 to 108 months. So, I fix the age at the midpoint of the interval they give, and make it into years by dividing by 12
eclsk$age3<-recode(eclsk$r5age,recodes="1=105; 2=107; 3=109; 4=112; 5=115; 6=117; -9=NA")/12

eclsk$pov1<-ifelse(eclsk$w1povrty==1,1,0)
eclsk$pov2<-ifelse(eclsk$w3povrty==1,1,0)
eclsk$pov3<-ifelse(eclsk$w5povrty==1,1,0)

#Recode race with white, non Hispanic as reference using dummy vars
eclsk$hisp<-recode (eclsk$race, recodes="3:4=1;-9=NA; else=0")
eclsk$black<-recode (eclsk$race, recodes="2=1;-9=NA; else=0")
eclsk$asian<-recode (eclsk$race, recodes="5=1;-9=NA; else=0")
eclsk$nahn<-recode (eclsk$race, recodes="6:7=1;-9=NA; else=0")
eclsk$other<-recode (eclsk$race, recodes="8=1;-9=NA; else=0")

eclsk$male<-recode(eclsk$gender, recodes="1=1; 2=0; -9=NA")
eclsk$mlths<-recode(eclsk$wkmomed, recodes = "1:2=1; 3:9=0; else = NA")
eclsk$mgths<-recode(eclsk$wkmomed, recodes = "1:3=0; 4:9=1; else =NA")

Now, I need to form the transition variable, this is my event variable, and in this case it will be 1 if a child enters poverty between the first wave of the data and the third grade wave, and 0 otherwise. NOTE I need to remove any children who are already in poverty age wave 1, because they are not at risk of experiencing this particular transition.

eclsk<-subset(eclsk, is.na(mlths)==F&is.na(black)==F&is.na(pov1)==F&is.na(pov2)==F&is.na(pov3)==F&is.na(age1)==F&is.na(age2)==F&is.na(age3)==F&pov1!=1)

Now we do the entire data set. To analyze data longitudinally, we need to reshape the data from the current “wide” format (repeated measures in columns) to a “long” format (repeated observations in rows). The reshape() function allows us to do this easily. It allows us to specify our repeated measures, time varying covariates as well as time-constant covariates.

e.long<-reshape(eclsk, idvar="childid", varying=list(c("age1","age2"),
                                                     c("age2", "age3")),
                v.names=c("age_enter", "age_exit"),
                times=1:2, direction="long" )
e.long<-e.long[order(e.long$childid, e.long$time),]

e.long$povtran<-NA

e.long$povtran[e.long$pov1==0&e.long$pov2==1&e.long$time==1]<-1
e.long$povtran[e.long$pov2==0&e.long$pov3==1&e.long$time==2]<-1

e.long$povtran[e.long$pov1==0&e.long$pov2==0&e.long$time==1]<-0
e.long$povtran[e.long$pov2==0&e.long$pov3==0&e.long$time==2]<-0

#find which kids failed in earlier time periods and remove them from the second & third period risk set
failed1<-which(is.na(e.long$povtran)==T)
e.long<-e.long[-failed1,]


e.long$age1r<-round(e.long$age_enter, 0)
e.long$age2r<-round(e.long$age_exit, 0)
head(e.long, n=10)

##             childid gender race r1_kage r4age r5age r6age r7age c1r4mtsc
## 0001002C.1 0001002C      2    1   77.20 94.73     6     4     4   68.324
## 0001002C.2 0001002C      2    1   77.20 94.73     6     4     4   68.324
## 0001007C.1 0001007C      1    1   63.60 81.90     2     2     2   43.836
## 0001007C.2 0001007C      1    1   63.60 81.90     2     2     2   43.836
## 0001010C.1 0001010C      2    1   70.63 88.63     4     3     3   53.324
## 0001010C.2 0001010C      2    1   70.63 88.63     4     3     3   53.324
## 0002004C.1 0002004C      2    1   65.40 83.73     3     2     2   59.598
## 0002004C.2 0002004C      2    1   65.40 83.73     3     2     2   59.598
## 0002008C.1 0002008C      2    1   70.87 89.20     5     3     3   42.235
## 0002008C.2 0002008C      2    1   70.87 89.20     5     3     3   42.235
##            c4r4mtsc c5r4mtsc c6r4mtsc c7r4mtsc w1povrty w1povrty.1
## 0001002C.1   58.738   60.390   62.585   66.481        2          2
## 0001002C.2   58.738   60.390   62.585   66.481        2          2
## 0001007C.1   37.160   40.946   44.830   44.122        2          2
## 0001007C.2   37.160   40.946   44.830   44.122        2          2
## 0001010C.1   52.256   54.986   54.759   50.096        2          2
## 0001010C.2   52.256   54.986   54.759   50.096        2          2
## 0002004C.1   54.965   53.793   57.428   62.880        2          2
## 0002004C.2   54.965   53.793   57.428   62.880        2          2
## 0002008C.1   41.269   40.119   35.635   46.600        2          2
## 0002008C.2   41.269   40.119   35.635   46.600        2          2
##            w3povrty w5povrty w8povrty wkmomed s2_id c15fppsu c15fpstr
## 0001002C.1        2        2        2       3  0001        1       63
## 0001002C.2        2        2        2       3  0001        1       63
## 0001007C.1        2        2        2       6  0001        1       63
## 0001007C.2        2        2        2       6  0001        1       63
## 0001010C.1        2        2        2       7  0001        1       63
## 0001010C.2        2        2        2       7  0001        1       63
## 0002004C.1        2        2        2       6  0002        1       63
## 0002004C.2        2        2        2       6  0002        1       63
## 0002008C.1        2        2        2       4  0002        1       63
## 0002008C.2        2        2        2       4  0002        1       63
##            c1_5fp0 pov1 pov2 pov3 hisp black asian nahn other male mlths
## 0001002C.1  352.35    0    0    0    0     0     0    0     0    0     0
## 0001002C.2  352.35    0    0    0    0     0     0    0     0    0     0
## 0001007C.1  358.99    0    0    0    0     0     0    0     0    1     0
## 0001007C.2  358.99    0    0    0    0     0     0    0     0    1     0
## 0001010C.1  352.35    0    0    0    0     0     0    0     0    0     0
## 0001010C.2  352.35    0    0    0    0     0     0    0     0    0     0
## 0002004C.1  228.05    0    0    0    0     0     0    0     0    0     0
## 0002004C.2  228.05    0    0    0    0     0     0    0     0    0     0
## 0002008C.1  259.19    0    0    0    0     0     0    0     0    0     0
## 0002008C.2  259.19    0    0    0    0     0     0    0     0    0     0
##            mgths time age_enter age_exit povtran age1r age2r
## 0001002C.1     0    1  6.433333 7.894167       0     6     8
## 0001002C.2     0    2  7.894167 9.750000       0     8    10
## 0001007C.1     1    1  5.300000 6.825000       0     5     7
## 0001007C.2     1    2  6.825000 8.916667       0     7     9
## 0001010C.1     1    1  5.885833 7.385833       0     6     7
## 0001010C.2     1    2  7.385833 9.333333       0     7     9
## 0002004C.1     1    1  5.450000 6.977500       0     5     7
## 0002004C.2     1    2  6.977500 9.083333       0     7     9
## 0002008C.1     1    1  5.905833 7.433333       0     6     7
## 0002008C.2     1    2  7.433333 9.583333       0     7    10

Cox regression model

Compared to the parametric models we saw last week, the Cox model See also the partial likelihood paper, does not specify a parametric form for the baseline hazard rate. The model still looks the same as the other proportional hazards models:

\(\text{h}(t) =h_0 \exp \left (x`\beta \right)\)

but \(h_0\) is not a parametric function. Instead, the baseline hazard rate is the empirically observed hazard rate, and the covariates shift it up or down, proportionally.

Using age as the time variable:

Here I use age of the child as the time variable, this should show how children experience poverty during school.

#Cox Model
#interval censored
fitl1<-coxreg(Surv(time = age_enter,time2=age_exit, event = povtran)~mlths+mgths+hisp+black+nahn+asian+other, data=e.long)
summary(fitl1)

## Call:
## coxreg(formula = Surv(time = age_enter, time2 = age_exit, event = povtran) ~ 
##     mlths + mgths + hisp + black + nahn + asian + other, data = e.long)
## 
## Covariate           Mean       Coef     Rel.Risk   S.E.    Wald p
## mlths               0.057     0.742     2.100     0.105     0.000 
## mgths               0.683    -0.983     0.374     0.089     0.000 
## hisp                0.132     0.978     2.659     0.100     0.000 
## black               0.064     1.377     3.964     0.116     0.000 
## nahn                0.023     1.387     4.004     0.168     0.000 
## asian               0.054     0.557     1.745     0.176     0.002 
## other               0.027     0.388     1.474     0.257     0.132 
## 
## Events                    666 
## Total time at risk         23497 
## Max. log. likelihood      -5412.8 
## LR test statistic         575.17 
## Degrees of freedom        7 
## Overall p-value           0

plot(survfit(fitl1), ylim=c(.8,1), 
     main="Survivorship Function for Cox Regression model - Average Child")

The model results (Rel.Risk) show that children with mom’s who have less than a high school education have 2.1 times higher risk of going into poverty during this period, while children whose mother have more than a high school education are 67% less likely to enter poverty, compared to children whose mothers had a high school education. Likewise, Hispanic, Non-Hispanic black, Native American and Asian children all face higher risk of entering poverty, compared to Non-Hispanic whites.

Risk scores

The Cox model generates a “risk score” for each individual. This is just \(\exp (x ` \beta)\), or the exponent of the linear predictor. These are not absolute values that have any real direct interpretation, but they are interpretable in a relative sense. Risk scores >1 indicate that a person has higher risk than the baseline category, while risk scores <1 have lower relative risk, compared to the baseline. If you want to intrepret these, it’s necessary to have the baseline category be a meaningful “type” of person. In our example above, the baseline group would be non-Hispanic whites, with a mother who had a high school education. i.e. all x’s are 0.

e.long$risk<-exp(fitl1$linear.predictors)

#highest risk child
e.long[which.max(e.long$risk),c("childid", "time", "mlths", "mgths", "black", "hisp", "nahn", "other", "asian", "risk")]

##             childid time mlths mgths black hisp nahn other asian     risk
## 0517014C.1 0517014C    1     1     0     0    0    1     0     0 11.78469

#lowest risk child
e.long[which.min(e.long$risk),c("childid", "time", "mlths", "mgths", "black", "hisp", "nahn", "other", "asian", "risk")]

##             childid time mlths mgths black hisp nahn other asian      risk
## 0001007C.1 0001007C    1     0     1     0    0    0     0     0 0.5242608

So, the first of these children has a risk score of 11.78 which means their risk was almost twelve times that of the baseline child. Likewise, the lowest risk child had a risk score of .524 that means their score is almost 50% less than the baseline category.

Fitting the Cox model with survey design information

Now we fit the Cox model using full survey design. In the ECLS-K, I use the longitudinal weight for waves 1-5, as well as the associated psu and strata id’s for the longitudinal data from these waves from the parents of the child, since no data from the child themselves are used in the outcome.

des2<-svydesign(ids = ~c15fppsu, strata = ~c15fpstr, weights=~c1_5fp0, data=e.long[ is.na(e.long$c15fppsu)==F,], nest=T)


#Fit the model
fitl1<-svycoxph(Surv(time =age_enter, time2 = age_exit, event = povtran)~mlths+mgths+hisp+black+nahn+asian+other, design=des2)
summary(fitl1)

## Stratified 1 - level Cluster Sampling design (with replacement)
## With (501) clusters.
## svydesign(ids = ~c15fppsu, strata = ~c15fpstr, weights = ~c1_5fp0, 
##     data = e.long[is.na(e.long$c15fppsu) == F, ], nest = T)

## Call:
## svycoxph(formula = Surv(time = age_enter, time2 = age_exit, event = povtran) ~ 
##     mlths + mgths + hisp + black + nahn + asian + other, design = des2)
## 
##   n= 12980, number of events= 617 
## 
##          coef exp(coef) se(coef)      z Pr(>|z|)    
## mlths  0.8009    2.2275   0.1251  6.404 1.51e-10 ***
## mgths -0.9973    0.3689   0.1240 -8.040 8.88e-16 ***
## hisp   0.8992    2.4576   0.1174  7.659 1.88e-14 ***
## black  1.4335    4.1932   0.1596  8.983  < 2e-16 ***
## nahn   1.1671    3.2127   0.3660  3.189  0.00143 ** 
## asian  0.3189    1.3756   0.2971  1.073  0.28316    
## other  0.9230    2.5168   0.3061  3.015  0.00257 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##       exp(coef) exp(-coef) lower .95 upper .95
## mlths    2.2275     0.4489    1.7433    2.8461
## mgths    0.3689     2.7109    0.2893    0.4704
## hisp     2.4576     0.4069    1.9524    3.0934
## black    4.1932     0.2385    3.0671    5.7329
## nahn     3.2127     0.3113    1.5679    6.5830
## asian    1.3756     0.7269    0.7684    2.4628
## other    2.5168     0.3973    1.3814    4.5855
## 
## Concordance= 0.752  (se = 0.01 )
## Rsquare= NA   (max possible= NA )
## Likelihood ratio test= NA  on 7 df,   p=NA
## Wald test            = 495.8  on 7 df,   p=0
## Score (logrank) test = NA  on 7 df,   p=NA

plot(survfit(fitl1, conf.int = F), ylab="S(t)", xlab="Child Age",  ylim=c(.2,1))
      
lines(survfit(fitl1, newdata = data.frame(mlths=1, mgths=0, black=1, hisp=0,asian=0, other=0, nahn=0), conf.int=F) ,col="red", lty=1)
lines(survfit(fitl1, newdata = data.frame(mlths=0, mgths=0, black=1, hisp=0, other=0, asian=0,nahn=0), conf.int=F) ,col="red", lty=2)

lines(survfit(fitl1, newdata = data.frame(mlths=1, mgths=0, black=0, hisp=1,asian=0, other=0, nahn=0), conf.int=F) ,col="green", lty=1)
lines(survfit(fitl1, newdata = data.frame(mlths=0, mgths=0, black=0, hisp=1, other=0, asian=0,nahn=0), conf.int=F) ,col="green", lty=2)

lines(survfit(fitl1, newdata = data.frame(mlths=1, mgths=0, black=0, hisp=0,asian=0, other=0, nahn=0), conf.int=F) ,col="blue", lty=1)
lines(survfit(fitl1, newdata = data.frame(mlths=0, mgths=0, black=0, hisp=0, other=0, asian=0,nahn=0), conf.int=F) ,col="blue", lty=2)


title(main=c("Survival function for poverty transition between","Kindergarten and 5th Grade"))
legend("bottomleft", 
       legend=c("mean", "Black, < HS ", "Black, HS","Hispanic, < HS ", "Hispanic, HS","NH White, < HS ", "NH White, HS"), col=c(1,"red","red", "green","green", "blue", "blue"), lty=c(1,1,2,1,2,1,2), cex=.8)

Use time instead of age

Next, I will use the time variable we created in e.long as the time axis. This model will not focus on the age of the children, but on the probability of experiencing the transition between waves.

#Cox Model
#interval censored
fitl2<-coxreg(Surv(time = time, event = povtran)~mlths+mgths+hisp+black+nahn+asian+other, data=e.long)
summary(fitl2)

## Call:
## coxreg(formula = Surv(time = time, event = povtran) ~ mlths + 
##     mgths + hisp + black + nahn + asian + other, data = e.long)
## 
## Covariate           Mean       Coef     Rel.Risk   S.E.    Wald p
## mlths               0.055     0.803     2.232     0.105     0.000 
## mgths               0.685    -0.994     0.370     0.089     0.000 
## hisp                0.130     0.912     2.490     0.100     0.000 
## black               0.063     1.322     3.750     0.115     0.000 
## nahn                0.023     1.369     3.933     0.168     0.000 
## asian               0.054     0.458     1.580     0.176     0.009 
## other               0.027     0.355     1.426     0.257     0.168 
## 
## Events                    666 
## Total time at risk         19908 
## Max. log. likelihood      -5847.8 
## LR test statistic         575.59 
## Degrees of freedom        7 
## Overall p-value           0

plot(survfit(fitl2),  ylim=c(.8,1), xlab="Wave",xaxt="n", 
     main="Survivorship Function for Cox Regression model - Average Child")
axis(1, at=c(0,1,2))

#Fit the model
fitl2s<-svycoxph(Surv(time = time, event = povtran)~mlths+mgths+hisp+black+nahn+asian+other, design=des2)
summary(fitl2s)

## Stratified 1 - level Cluster Sampling design (with replacement)
## With (501) clusters.
## svydesign(ids = ~c15fppsu, strata = ~c15fpstr, weights = ~c1_5fp0, 
##     data = e.long[is.na(e.long$c15fppsu) == F, ], nest = T)

## Call:
## svycoxph(formula = Surv(time = time, event = povtran) ~ mlths + 
##     mgths + hisp + black + nahn + asian + other, design = des2)
## 
##   n= 12980, number of events= 617 
## 
##          coef exp(coef) se(coef)      z Pr(>|z|)    
## mlths  0.8410    2.3186   0.1301  6.466 1.00e-10 ***
## mgths -1.0004    0.3677   0.1267 -7.898 2.78e-15 ***
## hisp   0.8431    2.3236   0.1207  6.986 2.82e-12 ***
## black  1.3878    4.0062   0.1640  8.463  < 2e-16 ***
## nahn   1.1961    3.3073   0.3511  3.407 0.000658 ***
## asian  0.2013    1.2229   0.3302  0.610 0.542138    
## other  0.9351    2.5474   0.3226  2.899 0.003744 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##       exp(coef) exp(-coef) lower .95 upper .95
## mlths    2.3186     0.4313    1.7969    2.9918
## mgths    0.3677     2.7194    0.2869    0.4713
## hisp     2.3236     0.4304    1.8342    2.9437
## black    4.0062     0.2496    2.9049    5.5248
## nahn     3.3073     0.3024    1.6618    6.5818
## asian    1.2229     0.8177    0.6403    2.3358
## other    2.5474     0.3926    1.3537    4.7935
## 
## Concordance= 0.757  (se = 0.01 )
## Rsquare= NA   (max possible= NA )
## Likelihood ratio test= NA  on 7 df,   p=NA
## Wald test            = 452.1  on 7 df,   p=0
## Score (logrank) test = NA  on 7 df,   p=NA

plot(survfit(fitl2s, conf.int = F), ylab="S(t)", xlab="Wave", xaxt="n", ylim=c(.2,1))
axis(1, at=c(0,1,2))
      
lines(survfit(fitl2s, newdata = data.frame(mlths=1, mgths=0, black=1, hisp=0,asian=0, other=0, nahn=0), conf.int=F) ,col="red", lty=1)
lines(survfit(fitl2s, newdata = data.frame(mlths=0, mgths=0, black=1, hisp=0, other=0, asian=0,nahn=0), conf.int=F) ,col="red", lty=2)

lines(survfit(fitl2s, newdata = data.frame(mlths=1, mgths=0, black=0, hisp=1,asian=0, other=0, nahn=0), conf.int=F) ,col="green", lty=1)
lines(survfit(fitl2s, newdata = data.frame(mlths=0, mgths=0, black=0, hisp=1, other=0, asian=0,nahn=0), conf.int=F) ,col="green", lty=2)

lines(survfit(fitl2s, newdata = data.frame(mlths=1, mgths=0, black=0, hisp=0,asian=0, other=0, nahn=0), conf.int=F) ,col="blue", lty=1)
lines(survfit(fitl2s, newdata = data.frame(mlths=0, mgths=0, black=0, hisp=0, other=0, asian=0,nahn=0), conf.int=F) ,col="blue", lty=2)


title(main=c("Survival function for poverty transition between","Kindergarten and 5th Grade"))
legend("bottomleft", 
       legend=c("mean", "Black, < HS ", "Black, HS","Hispanic, < HS ", "Hispanic, HS","NH White, < HS ", "NH White, HS"), col=c(1,"red","red", "green","green", "blue", "blue"), lty=c(1,1,2,1,2,1,2))

We see similar results as we did in the age based analysis, but now, we are treating time discretely, and separating it from the child’s age entirely.

Functions of survival time

Here is an example of calculating the functions of survival time. These aren’t very exciting for this example

sf1<-survfit(fitl2s)
H1<--log(sf1$surv)
times<-sf1$time

plot(H1~times, type="s",  ylab="H(t)",ylim=c(0, .1), xlab="Months", xaxt="n", main="Cumulative Hazard plot")
axis(1, at=c(0,1,2))

hs1<-diff(c(0,H1))
plot(hs1~times,type="l", ylab="smoothed h(t)", xlab="Wave", xaxt="n", main="Hazard plot")
axis(1, at=c(0,1,2))

ft<- -diff(sf1$surv)
plot(ft,ylim=c(0, .1),
     ylab="f(t)",xlab="Time in Months",
     main="Probability Density Function")

#here is the cumulative distribution function
Ft<-cumsum(ft)
plot(Ft,  ylab="F(t)",xlab="Time in Months", main="Cumulative Distribution Function")

DHS data example

library(haven)
#load the data
model.dat<-read_dta("C:/Users/ozd504//Google Drive/classes/dem7223/class17/data/zzir62dt/ZZIR62FL.DTA")

In the DHS individual recode file, information on every live birth is collected using a retrospective birth history survey mechanism.

Since our outcome is time between first and second birth, we must select as our risk set, only women who have had a first birth.

The bidx variable indexes the birth history and if bidx_01 is not missing, then the woman should be at risk of having a second birth (i.e. she has had a first birth, i.e. bidx_01==1).

I also select only non-twin births (b0 == 0).

The DHS provides the dates of when each child was born in Century Month Codes.

To get the interval for women who acutally had a second birth, that is the difference between the CMC for the first birth b3_01 and the second birth b3_02, but for women who had not had a second birth by the time of the interview, the censored time between births is the difference between b3_01 and v008, the date of the interview.

We have 6161 women who are at risk of a second birth.

table(is.na(model.dat$bidx_01))

## 
## FALSE  TRUE 
##  6161  2187

#now we extract those women
sub<-subset(model.dat, model.dat$bidx_01==1&model.dat$b0_01==0)

#Here I keep only a few of the variables for the dates, and some characteristics of the women, and details of the survey
sub2<-data.frame(CASEID=sub$caseid, 
                 int.cmc=sub$v008,
                 fbir.cmc=sub$b3_01,
                 sbir.cmc=sub$b3_02,
                 marr.cmc=sub$v509,
                 rural=sub$v025,
                 educ=sub$v106,
                 age=sub$v012/5,
                 partneredu=sub$v701,
                 partnerage=sub$v730,
                 weight=sub$v005/1000000,
                 psu=sub$v021, strata=sub$v022)

Now I need to calculate the birth intervals, both observed and censored, and the event indicator (i.e. did the women have the second birth?)

sub2$secbi<-ifelse(is.na(sub2$sbir.cmc)==T, ((sub2$int.cmc))-((sub2$fbir.cmc)), (sub2$fbir.cmc-sub2$sbir.cmc))
sub2$b2event<-ifelse(is.na(sub2$sbir.cmc)==T,0,1)

Create covariates

Here, we create some predictor variables: Woman’s education (secondary +, vs < secondary), Woman’s age^2, Partner’s education (> secondary school)

sub2$educ.high<-ifelse(sub2$educ %in% c(2,3), 1, 0)
sub2$age2<-(sub2$age)^2
sub2$partnerhiedu<-ifelse(sub2$partneredu<3,0,ifelse(sub2$partneredu%in%c(8,9),NA,1 ))

options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~psu, strata=~strata, data=sub2[sub2$secbi>0,], weight=~weight )

Fit the model

#using coxph in survival library
fit.cox2<-coxph(Surv(secbi,b2event)~educ.high+partnerhiedu+I(age/5)+age2 , data=sub2)
summary(fit.cox2)

## Call:
## coxph(formula = Surv(secbi, b2event) ~ educ.high + partnerhiedu + 
##     I(age/5) + age2, data = sub2)
## 
##   n= 5289, number of events= 4527 
##    (737 observations deleted due to missingness)
## 
##                   coef exp(coef)  se(coef)      z Pr(>|z|)    
## educ.high    -0.392343  0.675472  0.047600 -8.243 2.22e-16 ***
## partnerhiedu -0.194839  0.822967  0.068752 -2.834   0.0046 ** 
## I(age/5)     -0.486017  0.615071  0.399352 -1.217   0.2236    
## age2          0.002542  1.002545  0.005755  0.442   0.6587    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##              exp(coef) exp(-coef) lower .95 upper .95
## educ.high       0.6755     1.4804    0.6153    0.7415
## partnerhiedu    0.8230     1.2151    0.7192    0.9417
## I(age/5)        0.6151     1.6258    0.2812    1.3454
## age2            1.0025     0.9975    0.9913    1.0139
## 
## Concordance= 0.547  (se = 0.005 )
## Rsquare= 0.026   (max possible= 1 )
## Likelihood ratio test= 142  on 4 df,   p=0
## Wald test            = 131.2  on 4 df,   p=0
## Score (logrank) test = 132.5  on 4 df,   p=0

plot(survfit(fit.cox2),   xlim=c(0,60), ylab="S(t)", xlab="Age in Months")
title(main="Survival Function for Second Birth Interval")

#use survey design
des<-svydesign(ids=~psu, strata = ~strata , weights=~weight, data=sub2)

cox.s<-svycoxph(Surv(secbi,b2event)~educ.high+partnerhiedu+I(age/5)+age2, design=des)
summary(cox.s)

## Stratified 1 - level Cluster Sampling design (with replacement)
## With (217) clusters.
## svydesign(ids = ~psu, strata = ~strata, weights = ~weight, data = sub2)

## Call:
## svycoxph(formula = Surv(secbi, b2event) ~ educ.high + partnerhiedu + 
##     I(age/5) + age2, design = des)
## 
##   n= 5289, number of events= 4527 
##    (737 observations deleted due to missingness)
## 
##                   coef exp(coef)  se(coef)      z Pr(>|z|)    
## educ.high    -0.417896  0.658431  0.056984 -7.334 2.24e-13 ***
## partnerhiedu -0.235204  0.790409  0.091225 -2.578  0.00993 ** 
## I(age/5)     -0.985446  0.373273  0.520795 -1.892  0.05846 .  
## age2          0.009808  1.009856  0.007556  1.298  0.19430    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##              exp(coef) exp(-coef) lower .95 upper .95
## educ.high       0.6584     1.5188    0.5889    0.7362
## partnerhiedu    0.7904     1.2652    0.6610    0.9452
## I(age/5)        0.3733     2.6790    0.1345    1.0359
## age2            1.0099     0.9902    0.9950    1.0249
## 
## Concordance= 0.555  (se = 0.005 )
## Rsquare= NA   (max possible= NA )
## Likelihood ratio test= NA  on 4 df,   p=NA
## Wald test            = 98.89  on 4 df,   p=0
## Score (logrank) test = NA  on 4 df,   p=NA

#plot some survival function estimates for various types of children
plot(survfit(cox.s, conf.int = F),  xlim=c(0,60), ylab="S(t)", xlab="Months")
title (main = "Survival Plots for for Second Birth Interval")

dat<-expand.grid(educ.high=c(0,1), partnerhiedu=c(0,1), age=4:8)
dat$age2<-dat$age^2
head(dat)

##   educ.high partnerhiedu age age2
## 1         0            0   4   16
## 2         1            0   4   16
## 3         0            1   4   16
## 4         1            1   4   16
## 5         0            0   5   25
## 6         1            0   5   25

lines(survfit(cox.s, newdata=dat[9,], conf.int = F), col="red")
lines(survfit(cox.s, newdata=dat[12,], conf.int = F), col="green")
legend("bottomleft", legend=c("Means of Covariates", "low edu, age 30", "high edu, age 30"), lty=1, col=c(1,"red", "green"), cex=.8)

Now we look at some more plots we can examine from the models

#Now we look at the cumulative hazard functions
sf1<-survfit(cox.s)
sf2<-survfit(cox.s, newdata=dat[9,])
sf3<-survfit(cox.s,newdata=dat[12,])



H1<--log(sf1$surv)
H2<--log(sf2$surv)
H3<--log(sf3$surv)

times<-sf1$time

plot(H1~times, type="s",  ylab="H(t)",ylim=c(0, 6), xlab="Months")
lines(H2~times,col="red", type="s")
lines(H3~times,col="green", type="s")
legend("topright", legend=c("Means of Covariates", "low edu, age 30", "high edu, age 30"), lty=1, col=c(1,"red", "green"), cex=.8)

#and the hazard function
hs1<-loess(diff(c(0,H1))~times, degree=1, span=.25)
hs2<-loess(diff(c(0,H2))~times, degree=1, span=.25)
hs3<-loess(diff(c(0,H3))~times, degree=1, span=.25)

plot(predict(hs1)~times,type="l", ylab="smoothed h(t)", xlab="Months", ylim=c(0, .04))
title(main="Smoothed hazard plots")
lines(predict(hs2)~times, type="l", col="red")
lines(predict(hs3)~times, type="l", col="green")
legend("topright", legend=c("Means of Covariates", "low edu, age 30", "high edu, age 30"), lty=1, col=c(1,"red", "green"), cex=.8)

DEM 7223 Event History Analysis - Example 4 Cox Proportional Hazards Model Part 1

Corey Sparks, PhD

October 2, 2017