Event: 2nd birth

Time duration: <=2 years, <=4years, <=6 years of birth interval

Censoring indicator: no 2nd child during the interview

Risk set: Each time point has a risk set for 2nd birth

Data: DHS, 2014

library(haven)
library(survival)
library(car)
library(survey)
library(survminer)
library(ggplot2)
library(ggpubr)
library(muhaz)
library(car)
library(dplyr)

Loading data

dta <- read_dta ("C:/Users/nahin/Google Drive/MSc Demography/Fall 2020/Event History/Term paper/BD_2014_DHS_09042018_147_123449/BDIR72DT/BDIR72FL.dta")

dta <- zap_labels(dta)

Creating subset

sub<-subset(dta,dta$bidx_01==1&dta$b0_01==0)

sub2<-data.frame(CASEID=sub$caseid, 
                 int.cmc=sub$v008,
                 fbir.cmc=sub$b3_01,
                 sbir.cmc=sub$b3_02,
                 marr.cmc=sub$v509,
                 rural=sub$v025,
                 educ=sub$v106,
                 religion=sub$v130,
                 age=sub$v012,
                 partneredu=sub$v701,
                 partnerage=sub$v730,
                 weight=sub$v005/1000000,
                 psu=sub$v021, strata=sub$v022)

sub2$agefb = (sub2$age - (sub2$int.cmc - sub2$fbir.cmc)/12)
#censoring indicator for death by age 5, in months (<=60 months)
sub2$secbi<-ifelse(is.na(sub2$sbir.cmc)==T,
                   ((sub2$int.cmc))-((sub2$fbir.cmc)),
                   (sub2$fbir.cmc-sub2$sbir.cmc))
sub2$b2event<-ifelse(is.na(sub2$sbir.cmc)==T,0,1) 
table(sub2$b2event)
## 
##     0     1 
##  3975 11959
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~psu, strata=~strata, data=sub2[sub2$secbi>0,], weight=~weight )

#Creating covariates:

sub2$rururb <- ifelse(sub2$rural==2,0,1)
sub2$educ.high<-ifelse(sub2$educ %in% c(2,3), 1, 0)


options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~psu, strata=~strata,
               data=sub2[sub2$secbi>0,], weight=~weight )

rep.des<-as.svrepdesign(des, type="bootstrap" )

The person-period file preparation

pp<-survSplit(Surv(secbi, b2event)~. , data = sub2[sub2$secbi>0,],
              cut=c(0, 24, 48, 72),  episode="year_birth")

pp$year <- pp$year_birth-1
pp<-pp[order(pp$CASEID, pp$year_birth),]
head(pp[, c("CASEID", "secbi", "b2event", "year")], n=20)
##             CASEID secbi b2event year
## 1          1  3  2    24       0    1
## 2          1  3  2    48       0    2
## 3          1  3  2    72       0    3
## 4          1  3  2   110       1    4
## 5          1  6  2    24       0    1
## 6          1  6  2    48       0    2
## 7          1  6  2    72       0    3
## 8          1  6  2   114       1    4
## 9          1  9  2    24       0    1
## 10         1  9  2    30       1    2
## 11         1 20  2    24       0    1
## 12         1 20  2    48       0    2
## 13         1 20  2    72       0    3
## 14         1 20  2    98       1    4
## 15         1 23  3    24       0    1
## 16         1 23  3    48       0    2
## 17         1 23  3    64       1    3
## 18         1 26  1    24       0    1
## 19         1 26  1    48       0    2
## 20         1 26  1    72       0    3

The general model and other time specifications

Descriptive analysis

pp%>%
  group_by(year)%>%
  summarise(prop_bir=mean(b2event, na.rm=T))%>%
  ggplot(aes(x=year, y=prop_bir))+
  geom_line()+
  ggtitle(label = "Hazard of having a second birth by year after first birth")

#pp%>%
  #group_by(year, rururb)%>%
  #summarise(prop_bir=mean(b2event, na.rm=T))%>%
  #ggplot(aes(x=year, y=prop_bir))+
  #geom_line(aes(group=factor(rururb), color=factor(rururb) ))+
  #ggtitle(label = "Hazard of having a second birth by year after first birth and Rural&Urban")
#generate survey design
library(survey)
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~psu, strata = ~strata , weights=~weight, data=pp)

# fit.0<-svyglm(b2event~1,design=des , family="binomial")
# summary(fit.0)

fit.0<-svyglm(b2event~as.factor(year)-1,design=des , family="binomial")
summary(fit.0)
## 
## Call:
## svyglm(formula = b2event ~ as.factor(year) - 1, design = des, 
##     family = "binomial")
## 
## Survey design:
## svydesign(ids = ~psu, strata = ~strata, weights = ~weight, data = pp)
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## as.factor(year)1 -1.96791    0.03119  -63.09   <2e-16 ***
## as.factor(year)2 -0.67084    0.03822  -17.55   <2e-16 ***
## as.factor(year)3 -0.44057    0.03237  -13.61   <2e-16 ***
## as.factor(year)4  0.96518    0.04148   23.27   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.000025)
## 
## Number of Fisher Scoring iterations: 4
haz<-1/(1+exp(-coef(fit.0)))
time<-seq(1,4,1)
plot(haz~time, type="l", ylab="h(t)")
title(main="Discrete Time Hazard Function for second birth")

Plot the survival function estimate

St<-NA
time<-1:length(haz)
St[1]<-1-haz[1]
for(i in 2:length(haz)){
  St[i]<-St[i-1]* (1-haz[i])}

St <- c(1, St)
time <- c(0, time)
plot(y=St,x=time, type="l", main="Survival function for second birth interval")

Including main effects of the model

fit.1<-svyglm(b2event~as.factor(year),design=des , family=binomial(link="cloglog"))
summary(fit.1)
## 
## Call:
## svyglm(formula = b2event ~ as.factor(year), design = des, family = binomial(link = "cloglog"))
## 
## Survey design:
## svydesign(ids = ~psu, strata = ~strata, weights = ~weight, data = pp)
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -2.03403    0.02924  -69.57   <2e-16 ***
## as.factor(year)2  1.14962    0.04199   27.38   <2e-16 ***
## as.factor(year)3  1.33472    0.03505   38.08   <2e-16 ***
## as.factor(year)4  2.28706    0.03554   64.35   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.000025)
## 
## Number of Fisher Scoring iterations: 5

###Measure the AIC and delta AIC for at least 3 alternative time specifications

Constant

fit.2<-svyglm(b2event~1,design=des , family="binomial")
summary(fit.2)
## 
## Call:
## svyglm(formula = b2event ~ 1, design = des, family = "binomial")
## 
## Survey design:
## svydesign(ids = ~psu, strata = ~strata, weights = ~weight, data = pp)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.84499    0.01203  -70.21   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.000025)
## 
## Number of Fisher Scoring iterations: 4

Linear term for time

fit.3<-svyglm(b2event~year,design=des , family="binomial")
summary(fit.3)
## 
## Call:
## svyglm(formula = b2event ~ year, design = des, family = "binomial")
## 
## Survey design:
## svydesign(ids = ~psu, strata = ~strata, weights = ~weight, data = pp)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.66318    0.03998  -66.61   <2e-16 ***
## year         0.85602    0.01387   61.70   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 0.9880945)
## 
## Number of Fisher Scoring iterations: 4

Interpretation: The effect of year is positive as the estimate shows a positive value of 0.85. However, the hazard plot says it goes up but not in a linear fashion.

#Quadratic term for time
#fit.s<-svyglm(dta$b2event~year+I(year^2),design=des , family="binomial")
#summary(fit.s)

Cubic term time

fit.c<-svyglm(b2event~year+I(year^2)+I(year^3 ),design=des , family="binomial")
summary(fit.c)
## 
## Call:
## svyglm(formula = b2event ~ year + I(year^2) + I(year^3), design = des, 
##     family = "binomial")
## 
## Survey design:
## svydesign(ids = ~psu, strata = ~strata, weights = ~weight, data = pp)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -6.57410    0.30499  -21.55   <2e-16 ***
## year         7.00818    0.47133   14.87   <2e-16 ***
## I(year^2)   -2.77570    0.20694  -13.41   <2e-16 ***
## I(year^3)    0.37372    0.02719   13.74   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.000025)
## 
## Number of Fisher Scoring iterations: 4
#Spline
#library(splines)
#fit.sp<-svyglm(b2event~ns(year, df = 2),design=des , family="binomial")
#summary(fit.sp)
dat<-expand.grid(year=seq(1,4,1))
dat$const<-predict(fit.2, newdata=dat, type="response")
dat$genmod<-predict(fit.1, newdata=data.frame(year=as.factor(1:4 )), type="response")
dat$lin<-predict(fit.3, newdata=dat, type="response")
dat$cub<-predict(fit.c, newdata=dat, type="response")
#dat$spline<-predict(fit.sp, newdata=expand.grid(year=seq(1,4,1)), type="response")

dat
##   year     const    genmod       lin       cub
## 1    1 0.3004854 0.1226131 0.1409820 0.1226131
## 2    2 0.3004854 0.3383092 0.2786568 0.3383092
## 3    3 0.3004854 0.3916041 0.4762403 0.3916041
## 4    4 0.3004854 0.7241570 0.6815519 0.7241570
plot(genmod~year, dat, type="l", ylab="h(t)", xlab="Time")
title(main="Hazard function from different time parameterizations")
lines(const~year, dat, col=1, lwd=2)
lines(genmod~year, dat, col=2, lwd=2)
lines(lin~year, dat, col=3, lwd=2)
lines(cub~year, dat, col=4, lwd=2)
#lines(spline~year, dat, col=6, lwd=2)

legend("topleft", 
       legend=c("constant", "General Model", "Linear", "Cubic"),
       col=1:4, lwd=1.5)

#AIC table
aic<-round(c(
  fit.2$deviance+2*length(fit.2$coefficients),
  fit.3$deviance+2*length(fit.3$coefficients),
  fit.c$deviance+2*length(fit.c$coefficients),
  fit.1$deviance+2*length(fit.1$coefficients)),2)

#compare all aics to the one from the general model
dif.aic<-round(aic-aic[4],2)
data.frame(model =c( "constant", "linear", "cubic", "general"), aic=aic, aic_dif=dif.aic)
##      model      aic aic_dif
## 1 constant 49191.30 6241.86
## 2   linear 43459.98  510.54
## 3    cubic 42949.44    0.00
## 4  general 42949.44    0.00