Event: 2nd birth
Time duration: <=2 years, <=4years, <=6 years of birth interval
Independent variable: Religion
Censoring indicator: no 2nd child during the interview
Risk set: Each time point has a risk set for 2nd birth
Data: DHS, 2014
library(haven)
library(survival)
library(car)
library(survey)
library(survminer)
library(ggplot2)
library(ggpubr)
library(muhaz)
library(car)
library(dplyr)
Loading data
dta <- read_dta ("C:/Users/nahin/Google Drive/MSc Demography/Fall 2020/Event History/Term paper/BD_2014_DHS_09042018_147_123449/BDIR72DT/BDIR72FL.dta")
dta <- zap_labels(dta)
Creating subset
sub<-subset(dta,dta$bidx_01==1&dta$b0_01==0)
sub2<-data.frame(CASEID=sub$caseid,
int.cmc=sub$v008,
fbir.cmc=sub$b3_01,
sbir.cmc=sub$b3_02,
marr.cmc=sub$v509,
rural=sub$v025,
educ=sub$v106,
religion=sub$v130,
age=sub$v012,
partneredu=sub$v701,
partnerage=sub$v730,
weight=sub$v005/1000000,
psu=sub$v021, strata=sub$v022)
sub2$agefb = (sub2$age - (sub2$int.cmc - sub2$fbir.cmc)/12)
#censoring indicator for death by age 5, in months (<=60 months)
sub2$secbi<-ifelse(is.na(sub2$sbir.cmc)==T,
((sub2$int.cmc))-((sub2$fbir.cmc)),
(sub2$fbir.cmc-sub2$sbir.cmc))
sub2$b2event<-ifelse(is.na(sub2$sbir.cmc)==T,0,1)
table(sub2$b2event)
##
## 0 1
## 3975 11959
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~psu, strata=~strata, data=sub2[sub2$secbi>0,], weight=~weight )
Creating covariates:
place of residence: rural & urban
Education: less than high school and more than high school
sub2$rururb <- ifelse(sub2$rural==2,0,1)
sub2$educ.high<-ifelse(sub2$educ %in% c(2,3), 1, 0)
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~psu, strata=~strata,
data=sub2[sub2$secbi>0,], weight=~weight )
rep.des<-as.svrepdesign(des, type="bootstrap" )
Question a: You must form a person-period data set
pp<-survSplit(Surv(secbi, b2event)~. , data = sub2[sub2$secbi>0,],
cut=c(0, 24, 48, 72), episode="year_birth")
pp$year <- pp$year_birth-1
pp<-pp[order(pp$CASEID, pp$year_birth),]
head(pp[, c("CASEID", "secbi", "b2event", "year", "religion", "agefb")], n=20)
## CASEID secbi b2event year religion agefb
## 1 1 3 2 24 0 1 1 24.91667
## 2 1 3 2 48 0 2 1 24.91667
## 3 1 3 2 72 0 3 1 24.91667
## 4 1 3 2 110 1 4 1 24.91667
## 5 1 6 2 24 0 1 1 33.75000
## 6 1 6 2 48 0 2 1 33.75000
## 7 1 6 2 72 0 3 1 33.75000
## 8 1 6 2 114 1 4 1 33.75000
## 9 1 9 2 24 0 1 1 19.83333
## 10 1 9 2 30 1 2 1 19.83333
## 11 1 20 2 24 0 1 1 23.00000
## 12 1 20 2 48 0 2 1 23.00000
## 13 1 20 2 72 0 3 1 23.00000
## 14 1 20 2 98 1 4 1 23.00000
## 15 1 23 3 24 0 1 1 24.08333
## 16 1 23 3 48 0 2 1 24.08333
## 17 1 23 3 64 1 3 1 24.08333
## 18 1 26 1 24 0 1 1 32.08333
## 19 1 26 1 48 0 2 1 32.08333
## 20 1 26 1 72 0 3 1 32.08333
Question b: Consider both the general model and other time specifications
pp%>%
group_by(year)%>%
summarise(prop_bir=mean(b2event, na.rm=T))%>%
ggplot(aes(x=year, y=prop_bir))+
geom_line()+
ggtitle(label = "Hazard of having a second birth by year after first birth")
## `summarise()` ungrouping output (override with `.groups` argument)
pp%>%
group_by(year, rururb)%>%
summarise(prop_bir=mean(b2event, na.rm=T))%>%
ggplot(aes(x=year, y=prop_bir))+
geom_line(aes(group=factor(rururb), color=factor(rururb) ))+
ggtitle(label = "Hazard of having a second birth by year after first birth and Rural&Urban")
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
Interpretation:
Between 3-4 years time interval, probability of 2nd birth increased compared to other time periods.
Rural areas have higher probability of 2nd child compared to the urban areas in each time points.
des<-svydesign(ids=~psu, strata = ~strata , weights=~weight, data=pp)
fit.0<-svyglm(b2event~as.factor(year)-1,design=des , family="binomial")
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.0)
##
## Call:
## svyglm(formula = b2event ~ as.factor(year) - 1, design = des,
## family = "binomial")
##
## Survey design:
## svydesign(ids = ~psu, strata = ~strata, weights = ~weight, data = pp)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## as.factor(year)1 -1.96791 0.03119 -63.09 <2e-16 ***
## as.factor(year)2 -0.67084 0.03822 -17.55 <2e-16 ***
## as.factor(year)3 -0.44057 0.03237 -13.61 <2e-16 ***
## as.factor(year)4 0.96518 0.04148 23.27 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1.000025)
##
## Number of Fisher Scoring iterations: 4
#Plot the hazard function on the probability scale
haz0<-1/(1+exp(-coef(fit.0)))
time0<-seq(0,72,24)
plot(haz0~time0, type="l", ylab="h(t)")
title(main="Discrete Time Hazard Function for 2nd birth")
Question C: Include all main effects in the model
fit.1<-svyglm(b2event~as.factor(year)+rururb-1,design=des , family=binomial(link="cloglog"))
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.1)
##
## Call:
## svyglm(formula = b2event ~ as.factor(year) + rururb - 1, design = des,
## family = binomial(link = "cloglog"))
##
## Survey design:
## svydesign(ids = ~psu, strata = ~strata, weights = ~weight, data = pp)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## as.factor(year)1 -1.97190 0.03108 -63.441 < 2e-16 ***
## as.factor(year)2 -0.81944 0.03178 -25.784 < 2e-16 ***
## as.factor(year)3 -0.62895 0.02714 -23.173 < 2e-16 ***
## as.factor(year)4 0.33176 0.02558 12.968 < 2e-16 ***
## rururb -0.24180 0.03065 -7.888 1.55e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 0.9998169)
##
## Number of Fisher Scoring iterations: 5
Question d: Test for an interaction between at least two of the predictors
pp1<-survSplit(Surv(secbi, b2event)~. , data = sub2[sub2$secbi>0,],
cut=c(0, 24, 48, 72), episode="year_birth")
pp1$year <- pp1$year_birth-1
pp1<-pp1[order(pp1$CASEID, pp1$year_birth),]
head(pp1[, c("CASEID", "secbi", "b2event", "year", "rururb", "educ.high")], n=20)
## CASEID secbi b2event year rururb educ.high
## 1 1 3 2 24 0 1 0 0
## 2 1 3 2 48 0 2 0 0
## 3 1 3 2 72 0 3 0 0
## 4 1 3 2 110 1 4 0 0
## 5 1 6 2 24 0 1 0 0
## 6 1 6 2 48 0 2 0 0
## 7 1 6 2 72 0 3 0 0
## 8 1 6 2 114 1 4 0 0
## 9 1 9 2 24 0 1 0 0
## 10 1 9 2 30 1 2 0 0
## 11 1 20 2 24 0 1 0 0
## 12 1 20 2 48 0 2 0 0
## 13 1 20 2 72 0 3 0 0
## 14 1 20 2 98 1 4 0 0
## 15 1 23 3 24 0 1 0 0
## 16 1 23 3 48 0 2 0 0
## 17 1 23 3 64 1 3 0 0
## 18 1 26 1 24 0 1 0 0
## 19 1 26 1 48 0 2 0 0
## 20 1 26 1 72 0 3 0 0
pp1%>%
group_by(year)%>%
summarise(prop_bir=mean(b2event, na.rm=T))%>%
ggplot(aes(x=year, y=prop_bir))+
geom_line()+
ggtitle(label = "Hazard of having a second birth by year after first birth")
## `summarise()` ungrouping output (override with `.groups` argument)
pp1%>%
group_by(year, rururb, educ.high)%>%
summarise(prop_bir=mean(b2event, na.rm=T))%>%
ggplot(aes(x=year, y=prop_bir))+
geom_line(aes(group=factor(educ.high), color=factor(educ.high) ))+
ggtitle(label = "Hazard of having a second birth by year after first birth and rural&urban, and education")
## `summarise()` regrouping output by 'year', 'rururb' (override with `.groups` argument)
Interpretation:
Less than high school category has higher probability of having second child in each time period.
Question e: Generate hazard plots for interesting cases highlighting the significant predictors in your analysis
#generate survey design
des<-svydesign(ids=~psu, strata = ~strata , weights=~weight, data=pp1)
#Fit the basic logistic model with ONLY time in the model
#I do -1 so that no intercept is fit in the model, and we get a hazard estimate for each time period
fit.2<-svyglm(b2event~as.factor(year)-1+(educ.high*rururb)-1, design=des , family="binomial")
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.2)
##
## Call:
## svyglm(formula = b2event ~ as.factor(year) - 1 + (educ.high *
## rururb) - 1, design = des, family = "binomial")
##
## Survey design:
## svydesign(ids = ~psu, strata = ~strata, weights = ~weight, data = pp1)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## as.factor(year)1 -1.72977 0.03768 -45.903 < 2e-16 ***
## as.factor(year)2 -0.42234 0.03489 -12.105 < 2e-16 ***
## as.factor(year)3 -0.17558 0.04310 -4.074 5.27e-05 ***
## as.factor(year)4 1.24896 0.05197 24.034 < 2e-16 ***
## educ.high -0.45770 0.04612 -9.924 < 2e-16 ***
## rururb -0.24545 0.05214 -4.708 3.14e-06 ***
## educ.high:rururb 0.01299 0.06635 0.196 0.845
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 0.9979489)
##
## Number of Fisher Scoring iterations: 4
#haz and time need to have equal length
haz2 <-1/(1+exp(-coef(fit.2)))
time2 <- seq(0,72, 12)
plot(haz2 ~ time2, type="l", ylab="h(t)")
title(main="Discrete Time Hazard Function for second birth by education and rural urban")