Event: 2nd birth
Time duration: <=2 years, <=4years, <=6 years of birth interval
Censoring indicator: no 2nd child during the interview
Risk set: Each time point has a risk set for 2nd birth
Data: DHS, 2014
library(haven)
library(survival)
library(car)
library(survey)
library(survminer)
library(ggplot2)
library(ggpubr)
library(muhaz)
library(car)
library(dplyr)
Loading data
dta <- read_dta ("C:/Users/nahin/Google Drive/MSc Demography/Fall 2020/Event History/Term paper/BD_2014_DHS_09042018_147_123449/BDIR72DT/BDIR72FL.dta")
dta <- zap_labels(dta)
Creating subset
sub<-subset(dta,dta$bidx_01==1&dta$b0_01==0)
sub2<-data.frame(CASEID=sub$caseid,
int.cmc=sub$v008,
fbir.cmc=sub$b3_01,
sbir.cmc=sub$b3_02,
marr.cmc=sub$v509,
rural=sub$v025,
educ=sub$v106,
religion=sub$v130,
age=sub$v012,
partneredu=sub$v701,
partnerage=sub$v730,
weight=sub$v005/1000000,
psu=sub$v021, strata=sub$v022)
sub2$agefb = (sub2$age - (sub2$int.cmc - sub2$fbir.cmc)/12)
#censoring indicator for death by age 5, in months (<=60 months)
sub2$secbi<-ifelse(is.na(sub2$sbir.cmc)==T,
((sub2$int.cmc))-((sub2$fbir.cmc)),
(sub2$fbir.cmc-sub2$sbir.cmc))
sub2$b2event<-ifelse(is.na(sub2$sbir.cmc)==T,0,1)
table(sub2$b2event)
##
## 0 1
## 3975 11959
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~psu, strata=~strata, data=sub2[sub2$secbi>0,], weight=~weight )
#Creating covariates:
sub2$rururb <- ifelse(sub2$rural==2,0,1)
sub2$educ.high<-ifelse(sub2$educ %in% c(2,3), 1, 0)
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~psu, strata=~strata,
data=sub2[sub2$secbi>0,], weight=~weight )
rep.des<-as.svrepdesign(des, type="bootstrap" )
pp<-survSplit(Surv(secbi, b2event)~. , data = sub2[sub2$secbi>0,],
cut=c(0, 24, 48, 72), episode="year_birth")
pp$year <- pp$year_birth-1
pp<-pp[order(pp$CASEID, pp$year_birth),]
head(pp[, c("CASEID", "secbi", "b2event", "year")], n=20)
## CASEID secbi b2event year
## 1 1 3 2 24 0 1
## 2 1 3 2 48 0 2
## 3 1 3 2 72 0 3
## 4 1 3 2 110 1 4
## 5 1 6 2 24 0 1
## 6 1 6 2 48 0 2
## 7 1 6 2 72 0 3
## 8 1 6 2 114 1 4
## 9 1 9 2 24 0 1
## 10 1 9 2 30 1 2
## 11 1 20 2 24 0 1
## 12 1 20 2 48 0 2
## 13 1 20 2 72 0 3
## 14 1 20 2 98 1 4
## 15 1 23 3 24 0 1
## 16 1 23 3 48 0 2
## 17 1 23 3 64 1 3
## 18 1 26 1 24 0 1
## 19 1 26 1 48 0 2
## 20 1 26 1 72 0 3
Descriptive analysis
pp%>%
group_by(year)%>%
summarise(prop_bir=mean(b2event, na.rm=T))%>%
ggplot(aes(x=year, y=prop_bir))+
geom_line()+
ggtitle(label = "Hazard of having a second birth by year after first birth")
#pp%>%
#group_by(year, rururb)%>%
#summarise(prop_bir=mean(b2event, na.rm=T))%>%
#ggplot(aes(x=year, y=prop_bir))+
#geom_line(aes(group=factor(rururb), color=factor(rururb) ))+
#ggtitle(label = "Hazard of having a second birth by year after first birth and Rural&Urban")
#generate survey design
library(survey)
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~psu, strata = ~strata , weights=~weight, data=pp)
# fit.0<-svyglm(b2event~1,design=des , family="binomial")
# summary(fit.0)
fit.0<-svyglm(b2event~as.factor(year)-1,design=des , family="binomial")
summary(fit.0)
##
## Call:
## svyglm(formula = b2event ~ as.factor(year) - 1, design = des,
## family = "binomial")
##
## Survey design:
## svydesign(ids = ~psu, strata = ~strata, weights = ~weight, data = pp)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## as.factor(year)1 -1.96791 0.03119 -63.09 <2e-16 ***
## as.factor(year)2 -0.67084 0.03822 -17.55 <2e-16 ***
## as.factor(year)3 -0.44057 0.03237 -13.61 <2e-16 ***
## as.factor(year)4 0.96518 0.04148 23.27 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1.000025)
##
## Number of Fisher Scoring iterations: 4
haz<-1/(1+exp(-coef(fit.0)))
time<-seq(1,4,1)
plot(haz~time, type="l", ylab="h(t)")
title(main="Discrete Time Hazard Function for second birth")
St<-NA
time<-1:length(haz)
St[1]<-1-haz[1]
for(i in 2:length(haz)){
St[i]<-St[i-1]* (1-haz[i])}
St <- c(1, St)
time <- c(0, time)
plot(y=St,x=time, type="l", main="Survival function for second birth interval")
Including main effects of the model
fit.1<-svyglm(b2event~as.factor(year),design=des , family=binomial(link="cloglog"))
summary(fit.1)
##
## Call:
## svyglm(formula = b2event ~ as.factor(year), design = des, family = binomial(link = "cloglog"))
##
## Survey design:
## svydesign(ids = ~psu, strata = ~strata, weights = ~weight, data = pp)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.03403 0.02924 -69.57 <2e-16 ***
## as.factor(year)2 1.14962 0.04199 27.38 <2e-16 ***
## as.factor(year)3 1.33472 0.03505 38.08 <2e-16 ***
## as.factor(year)4 2.28706 0.03554 64.35 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1.000025)
##
## Number of Fisher Scoring iterations: 5
###Measure the AIC and delta AIC for at least 3 alternative time specifications
Constant
fit.2<-svyglm(b2event~1,design=des , family="binomial")
summary(fit.2)
##
## Call:
## svyglm(formula = b2event ~ 1, design = des, family = "binomial")
##
## Survey design:
## svydesign(ids = ~psu, strata = ~strata, weights = ~weight, data = pp)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.84499 0.01203 -70.21 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1.000025)
##
## Number of Fisher Scoring iterations: 4
Linear term for time
fit.3<-svyglm(b2event~year,design=des , family="binomial")
summary(fit.3)
##
## Call:
## svyglm(formula = b2event ~ year, design = des, family = "binomial")
##
## Survey design:
## svydesign(ids = ~psu, strata = ~strata, weights = ~weight, data = pp)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.66318 0.03998 -66.61 <2e-16 ***
## year 0.85602 0.01387 61.70 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 0.9880945)
##
## Number of Fisher Scoring iterations: 4
Interpretation: The effect of year is positive as the estimate shows a positive value of 0.85. However, the hazard plot says it goes up but not in a linear fashion.
#Quadratic term for time
#fit.s<-svyglm(dta$b2event~year+I(year^2),design=des , family="binomial")
#summary(fit.s)
Cubic term time
fit.c<-svyglm(b2event~year+I(year^2)+I(year^3 ),design=des , family="binomial")
summary(fit.c)
##
## Call:
## svyglm(formula = b2event ~ year + I(year^2) + I(year^3), design = des,
## family = "binomial")
##
## Survey design:
## svydesign(ids = ~psu, strata = ~strata, weights = ~weight, data = pp)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.57410 0.30499 -21.55 <2e-16 ***
## year 7.00818 0.47133 14.87 <2e-16 ***
## I(year^2) -2.77570 0.20694 -13.41 <2e-16 ***
## I(year^3) 0.37372 0.02719 13.74 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1.000025)
##
## Number of Fisher Scoring iterations: 4
#Spline
#library(splines)
#fit.sp<-svyglm(b2event~ns(year, df = 2),design=des , family="binomial")
#summary(fit.sp)
dat<-expand.grid(year=seq(1,4,1))
dat$const<-predict(fit.2, newdata=dat, type="response")
dat$genmod<-predict(fit.1, newdata=data.frame(year=as.factor(1:4 )), type="response")
dat$lin<-predict(fit.3, newdata=dat, type="response")
dat$cub<-predict(fit.c, newdata=dat, type="response")
#dat$spline<-predict(fit.sp, newdata=expand.grid(year=seq(1,4,1)), type="response")
dat
## year const genmod lin cub
## 1 1 0.3004854 0.1226131 0.1409820 0.1226131
## 2 2 0.3004854 0.3383092 0.2786568 0.3383092
## 3 3 0.3004854 0.3916041 0.4762403 0.3916041
## 4 4 0.3004854 0.7241570 0.6815519 0.7241570
plot(genmod~year, dat, type="l", ylab="h(t)", xlab="Time")
title(main="Hazard function from different time parameterizations")
lines(const~year, dat, col=1, lwd=2)
lines(genmod~year, dat, col=2, lwd=2)
lines(lin~year, dat, col=3, lwd=2)
lines(cub~year, dat, col=4, lwd=2)
#lines(spline~year, dat, col=6, lwd=2)
legend("topleft",
legend=c("constant", "General Model", "Linear", "Cubic"),
col=1:4, lwd=1.5)
#AIC table
aic<-round(c(
fit.2$deviance+2*length(fit.2$coefficients),
fit.3$deviance+2*length(fit.3$coefficients),
fit.c$deviance+2*length(fit.c$coefficients),
fit.1$deviance+2*length(fit.1$coefficients)),2)
#compare all aics to the one from the general model
dif.aic<-round(aic-aic[4],2)
data.frame(model =c( "constant", "linear", "cubic", "general"), aic=aic, aic_dif=dif.aic)
## model aic aic_dif
## 1 constant 49191.30 6241.86
## 2 linear 43459.98 510.54
## 3 cubic 42949.44 0.00
## 4 general 42949.44 0.00