This example will illustrate how to fit the Cox Proportional hazards model to continuous duration data (i.e. person-level data) and a discrete-time (longitudinal) data set.

The first example uses longitudinal data from the ECLS-K. Specifically, we will examine the transition into poverty between kindergarten and third grade.

In the second example, I use the time between the first and second birth for women in the data as the outcome variable. The data for this example come from the DHS Model data file Demographic and Health Survey for 2012 individual recode file. This file contains information for all women sampled in the survey between the ages of 15 and 49.

Using Longitudinal Data

As in the other examples, I illustrate fitting these models to data that are longitudinal, instead of person-duration. In this example, we will examine how to fit the Cox model to a longitudinally collected data set.

First we load our data

#Load required libraries
library(foreign)
library(survival)
library(car)
library(survey)
library(eha)
library(dplyr)
options(survey.lonely.psu = "adjust")
load("~/Google Drive/classes/dem7223/dem7223_19/data/eclsk_k5.Rdata")
names(eclskk5)<-tolower(names(eclskk5))
#get out only the variables I'm going to use for this example
myvars<-c( "childid","x_chsex_r", "x_raceth_r", "x1kage_r","x4age", "x5age", "x6age", "x7age", "x2povty","x4povty_i", "x6povty_i", "x8povty_i","x12par1ed_i", "s2_id", "w6c6p_6psu", "w6c6p_6str", "w6c6p_20")
eclskk5<-eclskk5[,myvars]


eclskk5$age1<-ifelse(eclskk5$x1kage_r==-9, NA, eclskk5$x1kage_r/12)
eclskk5$age2<-ifelse(eclskk5$x4age==-9, NA, eclskk5$x4age/12)
#for the later waves, the NCES group the ages into ranges of months, so 1= <105 months, 2=105 to 108 months. So, I fix the age at the midpoint of the interval they give, and make it into years by dividing by 12
eclskk5$age3<-ifelse(eclskk5$x5age==-9, NA, eclskk5$x5age/12)

eclskk5$pov1<-ifelse(eclskk5$x2povty==1,1,0)
eclskk5$pov2<-ifelse(eclskk5$x4povty_i==1,1,0)
eclskk5$pov3<-ifelse(eclskk5$x6povty_i==1,1,0)

#Recode race with white, non Hispanic as reference using dummy vars
eclskk5$race_rec<-Recode (eclskk5$x_raceth_r, recodes="1 = 'nhwhite';2='nhblack';3:4='hispanic';5='nhasian'; 6:8='other';-9=NA", as.factor = T)
eclskk5$race_rec<-relevel(eclskk5$race_rec, ref = "nhwhite")
eclskk5$male<-Recode(eclskk5$x_chsex_r, recodes="1=1; 2=0; -9=NA")
eclskk5$mlths<-Recode(eclskk5$x12par1ed_i, recodes = "1:2=1; 3:9=0; else = NA")
eclskk5$mgths<-Recode(eclskk5$x12par1ed_i, recodes = "1:3=0; 4:9=1; else =NA") 

Now, I need to form the transition variable, this is my event variable, and in this case it will be 1 if a child enters poverty between the first wave of the data and the third grade wave, and 0 otherwise.

NOTE I need to remove any children who are already in poverty age wave 1, because they are not at risk of experiencing this particular transition. Again, this is called forming the risk set

eclskk5<-subset(eclskk5, is.na(pov1)==F&is.na(pov2)==F&is.na(pov3)==F&is.na(age1)==F&is.na(age2)==F&is.na(age3)==F&pov1!=1)

Now we do the entire data set. To analyze data longitudinally, we need to reshape the data from the current “wide” format (repeated measures in columns) to a “long” format (repeated observations in rows). The reshape() function allows us to do this easily. It allows us to specify our repeated measures, time varying covariates as well as time-constant covariates.

e.long<-reshape(data.frame(eclskk5), idvar="childid", varying=list(c("age1","age2"),
                                                     c("age2", "age3")),
                v.names=c("age_enter", "age_exit"),
                times=1:2, direction="long" )
e.long<-e.long[order(e.long$childid, e.long$time),]

e.long$povtran<-NA

e.long$povtran[e.long$pov1==0&e.long$pov2==1&e.long$time==1]<-1
e.long$povtran[e.long$pov2==0&e.long$pov3==1&e.long$time==2]<-1

e.long$povtran[e.long$pov1==0&e.long$pov2==0&e.long$time==1]<-0
e.long$povtran[e.long$pov2==0&e.long$pov3==0&e.long$time==2]<-0

#find which kids failed in earlier time periods and remove them from the second & third period risk set
failed1<-which(is.na(e.long$povtran)==T)
e.long<-e.long[-failed1,]


e.long$age1r<-round(e.long$age_enter, 0)
e.long$age2r<-round(e.long$age_exit, 0)
head(e.long, n=10)
##             childid x_chsex_r x_raceth_r x1kage_r x4age x5age  x6age
## 10000014.1 10000014         1          1    67.82 85.94 91.73  97.51
## 10000014.2 10000014         1          1    67.82 85.94 91.73  97.51
## 10000020.1 10000020         2          5    68.38 88.57 93.37 100.34
## 10000020.2 10000020         2          5    68.38 88.57 93.37 100.34
## 10000022.1 10000022         2          8    68.61 87.68 92.98  99.19
## 10000022.2 10000022         2          8    68.61 87.68 92.98  99.19
## 10000029.1 10000029         2          1    69.40 86.86 92.68  99.32
## 10000029.2 10000029         2          1    69.40 86.86 92.68  99.32
## 10000034.1 10000034         1          2    76.24 93.30 99.55 105.96
## 10000034.2 10000034         1          2    76.24 93.30 99.55 105.96
##             x7age x2povty x4povty_i x6povty_i x8povty_i x12par1ed_i s2_id
## 10000014.1 106.85       3         3         3         3           3  1433
## 10000014.2 106.85       3         3         3         3           3  1433
## 10000020.1 111.12       3         3         3         3           3  1365
## 10000020.2 111.12       3         3         3         3           3  1365
## 10000022.1 110.99       3         3         3         3           6  1405
## 10000022.2 110.99       3         3         3         3           6  1405
## 10000029.1 110.40       2         2         2         2           1  2042
## 10000029.2 110.40       2         2         2         2           1  2042
## 10000034.1 115.10       2         2         1        NA           3  2008
## 10000034.2 115.10       2         2         1        NA           3  2008
##            w6c6p_6psu w6c6p_6str w6c6p_20 pov1 pov2 pov3 race_rec male
## 10000014.1          2         39 328.0577    0    0    0  nhwhite    1
## 10000014.2          2         39 328.0577    0    0    0  nhwhite    1
## 10000020.1          2         53 136.5265    0    0    0  nhasian    0
## 10000020.2          2         53 136.5265    0    0    0  nhasian    0
## 10000022.1          1         35 163.1234    0    0    0    other    0
## 10000022.2          1         35 163.1234    0    0    0    other    0
## 10000029.1          2         60 341.5456    0    0    0  nhwhite    0
## 10000029.2          2         60 341.5456    0    0    0  nhwhite    0
## 10000034.1          1         50 289.4607    0    0    1  nhblack    1
## 10000034.2          1         50 289.4607    0    0    1  nhblack    1
##            mlths mgths time age_enter age_exit povtran age1r age2r
## 10000014.1     0     0    1  5.651667 7.161667       0     6     7
## 10000014.2     0     0    2  7.161667 7.644167       0     7     8
## 10000020.1     0     0    1  5.698333 7.380833       0     6     7
## 10000020.2     0     0    2  7.380833 7.780833       0     7     8
## 10000022.1     0     1    1  5.717500 7.306667       0     6     7
## 10000022.2     0     1    2  7.306667 7.748333       0     7     8
## 10000029.1     1     0    1  5.783333 7.238333       0     6     7
## 10000029.2     1     0    2  7.238333 7.723333       0     7     8
## 10000034.1     0     0    1  6.353333 7.775000       0     6     8
## 10000034.2     0     0    2  7.775000 8.295833       1     8     8

Cox regression model

Compared to the parametric models we saw last week, the Cox model See also the partial likelihood paper, does not specify a parametric form for the baseline hazard rate. The model still looks the same as the other proportional hazards models:

\(\text{h}(t) =h_0 \exp \left (x`\beta \right)\)

but \(h_0\) is not a parametric function. Instead, the baseline hazard rate is the empirically observed hazard rate, and the covariates shift it up or down, proportionally.

Using age as the time variable:

Here I use age of the child as the time variable, this should show how children experience poverty during school.

#Cox Model
#interval censored
e.long<-e.long%>%
  filter(complete.cases(age_enter, age_exit, povtran, mlths, mgths, race_rec))
fitl1<-coxreg(Surv(time = age_enter,time2=age_exit, event = povtran)~mlths+mgths+race_rec, data=e.long)
summary(fitl1)  
## Call:
## coxreg(formula = Surv(time = age_enter, time2 = age_exit, event = povtran) ~ 
##     mlths + mgths + race_rec, data = e.long)
## 
## Covariate           Mean       Coef     Rel.Risk   S.E.    Wald p
## mlths               0.056     0.566     1.761     0.189     0.003 
## mgths               0.788    -1.132     0.322     0.160     0.000 
## race_rec 
##          nhwhite    0.560     0         1 (reference)
##         hispanic    0.221     1.210     3.353     0.178     0.000 
##          nhasian    0.084     0.586     1.796     0.299     0.050 
##          nhblack    0.065     1.155     3.173     0.254     0.000 
##            other    0.071     0.746     2.110     0.291     0.010 
## 
## Events                    223 
## Total time at risk        3994.3 
## Max. log. likelihood      -1462.5 
## LR test statistic         223.81 
## Degrees of freedom        6 
## Overall p-value           0
plot(survfit(fitl1), ylim=c(.6,1), xlim=c(6, 9),
     main="Survivorship Function for Cox Regression model - Average Child")

The model results (Rel.Risk) show that children with mom’s who have less than a high school education have 1.76 times higher risk of going into poverty during this period, while children whose mother have more than a high school education are r1-round(exp(coef(fitl1)[2]), 2)% less likely to enter poverty, compared to children whose mothers had a high school education. Likewise, Hispanic, Non-Hispanic black, Native American children all face higher risk of entering poverty, compared to Non-Hispanic whites.

Risk scores

The Cox model generates a “risk score” for each individual. This is just \(\exp (x ` \beta)\), or the exponent of the linear predictor. These are not absolute values that have any real direct interpretation, but they are interpretable in a relative sense. Risk scores >1 indicate that a person has higher risk than the baseline category, while risk scores <1 have lower relative risk, compared to the baseline. If you want to intrepret these, it’s necessary to have the baseline category be a meaningful “type” of person. In our example above, the baseline group would be non-Hispanic whites, with a mother who had a high school education. i.e. all x’s are 0.

e.long$risk<-exp(fitl1$linear.predictors)

#highest risk child
e.long[which.max(e.long$risk),c("childid", "age_enter", "mlths", "mgths","race_rec", "risk")]
##      childid age_enter mlths mgths race_rec     risk
## 180 10000744  5.263333     1     0 hispanic 9.076231
#lowest risk child
e.long[which.min(e.long$risk),c("childid", "age_enter", "mlths", "mgths","race_rec", "risk")]
##     childid age_enter mlths mgths race_rec      risk
## 13 10000046    5.9175     0     1  nhwhite 0.4955275

So, the first of these children has a risk score of 9.07 which means their risk was almost nine times that of the baseline child. Likewise, the lowest risk child had a risk score of .495 that means their score is almost 50% less than the baseline category.

Fitting the Cox model with survey design information

Now we fit the Cox model using full survey design. In the ECLS-K, I use the longitudinal weight for waves 1-5, as well as the associated psu and strata id’s for the longitudinal data from these waves from the parents of the child, since no data from the child themselves are used in the outcome.

des2<-svydesign(ids = ~w6c6p_6psu, strata = ~w6c6p_6str, weights=~w6c6p_20, data=e.long[is.na(e.long$w6c6p_6psu)==F,], nest=T)


#Fit the model
fitl1<-svycoxph(Surv(time =age_enter, time2 = age_exit, event = povtran)~mlths+mgths+race_rec, design=des2)
summary(fitl1) 
## Stratified 1 - level Cluster Sampling design (with replacement)
## With (123) clusters.
## svydesign(ids = ~w6c6p_6psu, strata = ~w6c6p_6str, weights = ~w6c6p_20, 
##     data = e.long[is.na(e.long$w6c6p_6psu) == F, ], nest = T)
## Call:
## svycoxph(formula = Surv(time = age_enter, time2 = age_exit, event = povtran) ~ 
##     mlths + mgths + race_rec, design = des2)
## 
##   n= 3938, number of events= 221 
## 
##                     coef exp(coef) se(coef)      z Pr(>|z|)    
## mlths             0.4620    1.5872   0.1931  2.392   0.0168 *  
## mgths            -1.1422    0.3191   0.2191 -5.213 1.86e-07 ***
## race_rechispanic  1.0563    2.8758   0.2091  5.052 4.36e-07 ***
## race_recnhasian   0.4110    1.5084   0.3986  1.031   0.3025    
## race_recnhblack   1.0686    2.9114   0.2325  4.595 4.32e-06 ***
## race_recother     0.4820    1.6193   0.2343  2.057   0.0397 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                  exp(coef) exp(-coef) lower .95 upper .95
## mlths               1.5872     0.6300    1.0870    2.3176
## mgths               0.3191     3.1336    0.2077    0.4903
## race_rechispanic    2.8758     0.3477    1.9090    4.3324
## race_recnhasian     1.5084     0.6630    0.6905    3.2947
## race_recnhblack     2.9114     0.3435    1.8456    4.5924
## race_recother       1.6193     0.6175    1.0230    2.5633
## 
## Concordance= 0.745  (se = 0.023 )
## Likelihood ratio test= NA  on 6 df,   p=NA
## Wald test            = 104.6  on 6 df,   p=<2e-16
## Score (logrank) test = NA  on 6 df,   p=NA
plot(survfit(fitl1, conf.int = F), ylab="S(t)", xlab="Child Age", xlim=c(6, 9))
      
lines(survfit(fitl1, newdata = data.frame(mlths=1, mgths=0, race_rec="nhblack"), conf.int=F) ,col="red", lty=1)
lines(survfit(fitl1, newdata = data.frame(mlths=0, mgths=0, race_rec="nhblack"), conf.int=F) ,col="red", lty=2)

lines(survfit(fitl1, newdata = data.frame(mlths=1, mgths=0, race_rec="hispanic"), conf.int=F) ,col="green", lty=1)
lines(survfit(fitl1, newdata = data.frame(mlths=0, mgths=0, race_rec="hispanic"), conf.int=F) ,col="green", lty=2)

lines(survfit(fitl1, newdata = data.frame(mlths=1, mgths=0, race_rec="nhwhite"), conf.int=F) ,col="blue", lty=1)
lines(survfit(fitl1, newdata = data.frame(mlths=0, mgths=0, race_rec="nhwhite"), conf.int=F) ,col="blue", lty=2)


title(main=c("Survival function for poverty transition between","Kindergarten and 5th Grade"))
legend("bottomleft", 
       legend=c("mean", "Black, < HS ", "Black, HS","Hispanic, < HS ", "Hispanic, HS","NH White, < HS ", "NH White, HS"), col=c(1,"red","red", "green","green", "blue", "blue"), lty=c(1,1,2,1,2,1,2), cex=.8)

Use time instead of age

Next, I will use the time variable we created in e.long as the time axis. This model will not focus on the age of the children, but on the probability of experiencing the transition between waves.

#Cox Model
#interval censored
fitl2<-coxreg(Surv(time = time, event = povtran)~mlths+mgths+race_rec, data=e.long)
summary(fitl2)  
## Call:
## coxreg(formula = Surv(time = time, event = povtran) ~ mlths + 
##     mgths + race_rec, data = e.long)
## 
## Covariate           Mean       Coef     Rel.Risk   S.E.    Wald p
## mlths               0.051     0.501     1.650     0.188     0.008 
## mgths               0.797    -1.213     0.297     0.160     0.000 
## race_rec 
##          nhwhite    0.568     0         1 (reference)
##         hispanic    0.213     1.126     3.083     0.179     0.000 
##          nhasian    0.085     0.452     1.572     0.298     0.129 
##          nhblack    0.063     1.123     3.075     0.252     0.000 
##            other    0.071     0.766     2.150     0.290     0.008 
## 
## Events                    223 
## Total time at risk          5905 
## Max. log. likelihood       -1679 
## LR test statistic         221.42 
## Degrees of freedom        6 
## Overall p-value           0
plot(survfit(fitl2),  ylim=c(.8,1), xlab="Wave",xaxt="n", 
     main="Survivorship Function for Cox Regression model - Average Child")
axis(1, at=c(0,1,2))

The model results (Rel.Risk) show that children with mom’s who have less than a high school education have 2.1 times higher risk of going into poverty during this period, while children whose mother have more than a high school education are 67% less likely to enter poverty, compared to children whose mothers had a high school education. Likewise, Hispanic, Non-Hispanic black, Native American and Asian children all face higher risk of entering poverty, compared to Non-Hispanic whites.

Now we fit the Cox model using full survey design. In the ECLS-K, I use the longitudinal weight for waves 1-5, as well as the associated psu and strata id’s for the longitudinal data from these waves from the parents of the child, since no data from the child themselves are used in the outcome.

#Fit the model
fitl2s<-svycoxph(Surv(time = time, event = povtran)~mlths+mgths+race_rec, design=des2)
summary(fitl2s) 
## Stratified 1 - level Cluster Sampling design (with replacement)
## With (123) clusters.
## svydesign(ids = ~w6c6p_6psu, strata = ~w6c6p_6str, weights = ~w6c6p_20, 
##     data = e.long[is.na(e.long$w6c6p_6psu) == F, ], nest = T)
## Call:
## svycoxph(formula = Surv(time = time, event = povtran) ~ mlths + 
##     mgths + race_rec, design = des2)
## 
##   n= 3938, number of events= 221 
## 
##                     coef exp(coef) se(coef)      z Pr(>|z|)    
## mlths             0.4252    1.5299   0.2121  2.004 0.045018 *  
## mgths            -1.2290    0.2926   0.2195 -5.599 2.16e-08 ***
## race_rechispanic  0.9157    2.4984   0.2929  3.127 0.001768 ** 
## race_recnhasian   0.2686    1.3082   0.4022  0.668 0.504237    
## race_recnhblack   0.9278    2.5290   0.2751  3.373 0.000744 ***
## race_recother     0.4383    1.5501   0.2870  1.527 0.126760    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                  exp(coef) exp(-coef) lower .95 upper .95
## mlths               1.5299     0.6536    1.0095    2.3186
## mgths               0.2926     3.4178    0.1903    0.4499
## race_rechispanic    2.4984     0.4003    1.4073    4.4355
## race_recnhasian     1.3082     0.7644    0.5947    2.8777
## race_recnhblack     2.5290     0.3954    1.4750    4.3360
## race_recother       1.5501     0.6451    0.8831    2.7208
## 
## Concordance= 0.739  (se = 0.022 )
## Likelihood ratio test= NA  on 6 df,   p=NA
## Wald test            = 84.51  on 6 df,   p=4e-16
## Score (logrank) test = NA  on 6 df,   p=NA
plot(survfit(fitl2s, conf.int = F), ylab="S(t)", xlab="Wave", xaxt="n", ylim=c(.2,1))
axis(1, at=c(0,1,2))
      
lines(survfit(fitl2s, newdata = data.frame(mlths=1, mgths=0, race_rec="nhblack"), conf.int=F) ,col="red", lty=1)
lines(survfit(fitl2s, newdata = data.frame(mlths=0, mgths=0, race_rec="nhblack"), conf.int=F) ,col="red", lty=2)

lines(survfit(fitl2s, newdata = data.frame(mlths=1, mgths=0, race_rec="hispanic"), conf.int=F) ,col="green", lty=1)
lines(survfit(fitl2s, newdata = data.frame(mlths=0, mgths=0, race_rec="hispanic"), conf.int=F) ,col="green", lty=2)

lines(survfit(fitl2s, newdata = data.frame(mlths=1, mgths=0, race_rec="nhwhite"), conf.int=F) ,col="blue", lty=1)
lines(survfit(fitl2s, newdata = data.frame(mlths=0, mgths=0, race_rec="nhwhite"), conf.int=F) ,col="blue", lty=2)


title(main=c("Survival function for poverty transition between","Kindergarten and 5th Grade"))
legend("bottomleft", 
       legend=c("mean", "Black, < HS ", "Black, HS","Hispanic, < HS ", "Hispanic, HS","NH White, < HS ", "NH White, HS"), col=c(1,"red","red", "green","green", "blue", "blue"), lty=c(1,1,2,1,2,1,2))

We see similar results as we did in the age based analysis, but now, we are treating time discretely, and separating it from the child’s age entirely.

Functions of survival time

Here is an example of calculating the functions of survival time. These aren’t very exciting for this example

sf1<-survfit(fitl1)
H1<--log(sf1$surv)
times<-sf1$time

plot(H1~times, type="s",  ylab="H(t)",xlab="Child Age", xaxt="n", main="Cumulative Hazard plot")
axis(1, at=c(0,1,2))

hs1<-diff(c(0,H1))
plot(hs1~times,type="l", ylab="smoothed h(t)", xlab="Wave", xaxt="n", main="Hazard plot")
axis(1, at=c(0,1,2))

ft<- -diff(sf1$surv)
plot(ft,ylim=c(0, .1), type="l",
     ylab="f(t)",xlab="Time in Months",
     main="Probability Density Function")

#here is the cumulative distribution function
Ft<-cumsum(ft)
plot(Ft, type="l", ylab="F(t)",xlab="Age of child", main="Cumulative Distribution Function")

DHS data example

library(haven)
#load the data
model.dat<-read_dta("~/Google Drive/classes/dem7223/dem7223_19/data/zzir62dt/ZZIR62FL.DTA")

In the DHS individual recode file, information on every live birth is collected using a retrospective birth history survey mechanism.

Since our outcome is time between first and second birth, we must select as our risk set, only women who have had a first birth.

The bidx variable indexes the birth history and if bidx_01 is not missing, then the woman should be at risk of having a second birth (i.e. she has had a first birth, i.e. bidx_01==1).

I also select only non-twin births (b0 == 0).

The DHS provides the dates of when each child was born in Century Month Codes.

To get the interval for women who acutally had a second birth, that is the difference between the CMC for the first birth b3_01 and the second birth b3_02, but for women who had not had a second birth by the time of the interview, the censored time between births is the difference between b3_01 and v008, the date of the interview.

We have 6161 women who are at risk of a second birth.

table(is.na(model.dat$bidx_01))
## 
## FALSE  TRUE 
##  6161  2187
#now we extract those women
sub<-subset(model.dat, model.dat$bidx_01==1&model.dat$b0_01==0)

#Here I keep only a few of the variables for the dates, and some characteristics of the women, and details of the survey
sub2<-data.frame(CASEID=sub$caseid, 
                 int.cmc=sub$v008,
                 fbir.cmc=sub$b3_01,
                 sbir.cmc=sub$b3_02,
                 marr.cmc=sub$v509,
                 rural=sub$v025,
                 educ=sub$v106,
                 age=sub$v012,
                 partneredu=sub$v701,
                 partnerage=sub$v730,
                 weight=sub$v005/1000000,
                 psu=sub$v021, strata=sub$v022)

sub2$agefb = (sub2$age - (sub2$int.cmc - sub2$fbir.cmc)/12)

Now I need to calculate the birth intervals, both observed and censored, and the event indicator (i.e. did the women have the second birth?)

sub2$secbi<-ifelse(is.na(sub2$sbir.cmc)==T, ((sub2$int.cmc))-((sub2$fbir.cmc)), (sub2$fbir.cmc-sub2$sbir.cmc))
sub2$b2event<-ifelse(is.na(sub2$sbir.cmc)==T,0,1) 

Create covariates

Here, we create some predictor variables: Woman’s education (secondary +, vs < secondary), Woman’s age^2, Partner’s education (> secondary school)

sub2$educ.high<-ifelse(sub2$educ %in% c(2,3), 1, 0)
sub2$age2<-(sub2$agefb)^2
sub2$partnerhiedu<-ifelse(sub2$partneredu<3,0,ifelse(sub2$partneredu%in%c(8,9),NA,1 ))

options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~psu, strata=~strata, data=sub2[sub2$secbi>0,], weight=~weight )

Fit the model

#using coxph in survival library
fit.cox2<-coxph(Surv(secbi,b2event)~educ.high+partnerhiedu+agefb+age2 , data=sub2)
summary(fit.cox2)
## Call:
## coxph(formula = Surv(secbi, b2event) ~ educ.high + partnerhiedu + 
##     agefb + age2, data = sub2)
## 
##   n= 5289, number of events= 4527 
##    (737 observations deleted due to missingness)
## 
##                    coef  exp(coef)   se(coef)       z Pr(>|z|)    
## educ.high    -0.3474108  0.7065151  0.0479067  -7.252 4.11e-13 ***
## partnerhiedu -0.2401339  0.7865225  0.0686891  -3.496 0.000472 ***
## agefb         0.1995273  1.2208256  0.0167700  11.898  < 2e-16 ***
## age2         -0.0033308  0.9966747  0.0002819 -11.816  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##              exp(coef) exp(-coef) lower .95 upper .95
## educ.high       0.7065     1.4154    0.6432    0.7761
## partnerhiedu    0.7865     1.2714    0.6875    0.8999
## agefb           1.2208     0.8191    1.1814    1.2616
## age2            0.9967     1.0033    0.9961    0.9972
## 
## Concordance= 0.541  (se = 0.005 )
## Likelihood ratio test= 259.3  on 4 df,   p=<2e-16
## Wald test            = 231.2  on 4 df,   p=<2e-16
## Score (logrank) test = 234  on 4 df,   p=<2e-16
plot(survfit(fit.cox2),   xlim=c(0,90), ylab="S(t)", xlab="Age in Months")
title(main="Survival Function for Second Birth Interval")

#use survey design
des<-svydesign(ids=~psu, strata = ~strata , weights=~weight, data=sub2)

cox.s<-svycoxph(Surv(secbi,b2event)~educ.high+partnerhiedu+I(agefb/5)+age2, design=des)
summary(cox.s)
## Stratified 1 - level Cluster Sampling design (with replacement)
## With (217) clusters.
## svydesign(ids = ~psu, strata = ~strata, weights = ~weight, data = sub2)
## Call:
## svycoxph(formula = Surv(secbi, b2event) ~ educ.high + partnerhiedu + 
##     I(agefb/5) + age2, design = des)
## 
##   n= 5289, number of events= 4527 
##    (737 observations deleted due to missingness)
## 
##                    coef  exp(coef)   se(coef)       z Pr(>|z|)    
## educ.high    -0.3921114  0.6756288  0.0496199  -7.902 2.74e-15 ***
## partnerhiedu -0.2826167  0.7538087  0.0876592  -3.224  0.00126 ** 
## I(agefb/5)    1.0275279  2.7941498  0.0996731  10.309  < 2e-16 ***
## age2         -0.0034088  0.9965970  0.0003235 -10.537  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##              exp(coef) exp(-coef) lower .95 upper .95
## educ.high       0.6756     1.4801    0.6130    0.7446
## partnerhiedu    0.7538     1.3266    0.6348    0.8951
## I(agefb/5)      2.7941     0.3579    2.2983    3.3970
## age2            0.9966     1.0034    0.9960    0.9972
## 
## Concordance= 0.544  (se = 0.006 )
## Likelihood ratio test= NA  on 4 df,   p=NA
## Wald test            = 139.7  on 4 df,   p=<2e-16
## Score (logrank) test = NA  on 4 df,   p=NA
dat<-expand.grid(educ.high=c(0,1), partnerhiedu=c(0,1), agefb=seq(20,40,5), b2event=1)
dat$age2<-dat$age^2
head(dat)
##   educ.high partnerhiedu agefb b2event age2
## 1         0            0    20       1  400
## 2         1            0    20       1  400
## 3         0            1    20       1  400
## 4         1            1    20       1  400
## 5         0            0    25       1  625
## 6         1            0    25       1  625
#plot some survival function estimates for various types of children
plot(survfit(fit.cox2, conf.int = F), xlim=c(0, 90),  ylab="S(t)", xlab="Months")
title (main = "Survival Plots for for Second Birth Interval")

lines(survfit(fit.cox2, newdata=data.frame(educ.high=0, partnerhiedu=0, agefb=30, age2=900), conf.int = F), col="red")
lines(survfit(fit.cox2, newdata=data.frame(educ.high=1, partnerhiedu=0, agefb=30, age2=900), conf.int = F), col="green")
legend("bottomleft", legend=c("Means of Covariates", "low edu, age 30", "high edu, age 30"), lty=1, col=c(1,"red", "green"), cex=.8)

Now we look at some more plots we can examine from the models

#Now we look at the cumulative hazard functions
sf1<-survfit(fit.cox2)
sf2<-survfit(fit.cox2, newdata=data.frame(educ.high=0, partnerhiedu=0, agefb=30, age2=900))
sf3<-survfit(fit.cox2,newdata=data.frame(educ.high=1, partnerhiedu=0, agefb=30, age2=900))



H1<--log(sf1$surv)
H2<--log(sf2$surv)
H3<--log(sf3$surv)

times<-sf1$time

plot(H1~times, type="s",  ylab="H(t)",ylim=c(0, 6), xlab="Months")
lines(H2~times,col="red", type="s")
lines(H3~times,col="green", type="s")
legend("topright", legend=c("Means of Covariates", "low edu, age 30", "high edu, age 30"), lty=1, col=c(1,"red", "green"), cex=.8)

#and the hazard function
hs1<-loess(diff(c(0,H1))~times, degree=1, span=.25)
hs2<-loess(diff(c(0,H2))~times, degree=1, span=.25)
hs3<-loess(diff(c(0,H3))~times, degree=1, span=.25)

plot(predict(hs1)~times,type="l", ylab="smoothed h(t)", xlab="Months", ylim=c(0, .04))
title(main="Smoothed hazard plots")
lines(predict(hs2)~times, type="l", col="red")
lines(predict(hs3)~times, type="l", col="green")
legend("topright", legend=c("Means of Covariates", "low edu, age 30", "high edu, age 30"), lty=1, col=c(1,"red", "green"), cex=.8)