library(haven)
library(survival)
library(car)
library(survey)
library(survminer)
library(ggplot2)
library(ggpubr)
library(muhaz)
library(car)
library(dplyr)
library(eha)
dat2<-read_dta("https://github.com/coreysparks/data/blob/master/ZZIR62FL.DTA?raw=true")
dat2<-zap_labels(dat2)

Event: Event of 2nd birth Time duration: 1st birth to 2nd birth Independent variable: place of residence (Rural&Urban) and religion (Islam&Hinduism) Censoring indicator: no 2nd child during the interview Risk set:Women had 1st child

a.Did you choose an AFT or PH model and why? I chose Proportional Hazard model because my outcome variable is the second birth after 1st birth based on the characteristics of individual like place of residence and religion.

table(is.na(dat2$bidx_01))
## 
## FALSE  TRUE 
##  6161  2187

Subtracting women who had first birth and who had only singleton birth no twins.

sub<-subset(dat2, dat2$bidx_01==1&dat2$b0_01==0)
sub2<-data.frame(CASEID=sub$caseid, 
                 int.cmc=sub$v008,
                 fbir.cmc=sub$b3_01,
                 sbir.cmc=sub$b3_02,
                 # marr.cmc=sub$v509,
                 rural=sub$v025,
                 religion=sub$v130,
                 # educ=sub$v106,
                 age=sub$v012,
                 # partneredu=sub$v701,
                 # partnerage=sub$v730,
                 weight=sub$v005/1000000,
                 psu=sub$v021, strata=sub$v022)

Calculation of intervals

sub2$agefb = (sub2$age - (sub2$int.cmc - sub2$fbir.cmc)/12)
sub2$secbi<-ifelse(is.na(sub2$sbir.cmc)==T,
                   ((sub2$int.cmc))-((sub2$fbir.cmc)), 
                   (sub2$fbir.cmc-sub2$sbir.cmc))

sub2$b2event<-ifelse(is.na(sub2$sbir.cmc)==T,0,1) 
fit<-survfit(Surv(secbi, b2event)~1, sub2)
fit
## Call: survfit(formula = Surv(secbi, b2event) ~ 1, data = sub2)
## 
##       n  events  median 0.95LCL 0.95UCL 
##    6026    4789      39      38      40
plot(fit, conf.int=T, ylab="S(t)", xlab="Months")
title(main="Survival Function for Second Birth Interval, DHS Model Data")

fit.haz.km<-kphaz.fit(sub2$secbi[sub2$secbi>0],
                      sub2$b2event[sub2$secbi>0] , 
                      method = "product-limit")

#this is a version of the hazard that is smoothed using a kernel-density method
fit.haz.sm<-muhaz(sub2$secbi[sub2$secbi>0], sub2$b2event[sub2$secbi>0] )

#Empirical hazard function (product-limit estimate) plot
kphaz.plot(fit.haz.km,main="Plot of the hazard of having a second birth")
#overlay the smoothed version
lines(fit.haz.sm, col=2, lwd=3)
legend("topleft", legend = c("KM hazard", "Smoothed Hazard"),
       col=c(1,2), lty=c(1,1))

Creating predictor/independent variable

sub2$rururb <- ifelse(sub2$rural==2,0,1)
sub2$relg <- Recode(sub2$religion, recodes="1='Islam'; 2='Hinduism'; else=NA", as.factor=T)

# sub2$educ.high<-ifelse(sub2$educ %in% c(2,3), 1, 0)
# sub2$age2<-(sub2$agefb/5)^2

options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~psu, strata=~strata,
               data=sub2[sub2$secbi>0,], weight=~weight )

rep.des<-as.svrepdesign(des, type="bootstrap" )
  1. Justify what parametric distribution you choose

I chose weibull distribution because it includes covariates and unlike hazard function is not assumed to be constant

  1. Carry out model fit diagnostics for the model

For model diagnostics this diagnostics will carry out: Exponential model, Weibull model, log normal model, and log-logistic model.

  1. Test for an interaction between at least two of the predictors
fit.1<-phreg(Surv(secbi, b2event)~rururb+relg,
                   data=sub2[sub2$secbi>0,],dist = "weibull", shape = 1)

summary(fit.1)
## Call:
## phreg(formula = Surv(secbi, b2event) ~ rururb + relg, data = sub2[sub2$secbi > 
##     0, ], dist = "weibull", shape = 1)
## 
## Covariate          W.mean      Coef Exp(Coef)  se(Coef)    Wald p
## rururb              0.401    -0.270     0.763     0.031     0.000 
## relg 
##         Hinduism    0.780     0         1           (reference)
##            Islam    0.220    -0.155     0.856     0.038     0.000 
## 
## log(scale)                    3.892               0.019     0.000 
## 
##  Shape is fixed at  1 
## 
## Events                    4760 
## Total time at risk        265737 
## Max. log. likelihood      -23851 
## LR test statistic         109.40 
## Degrees of freedom        2 
## Overall p-value           0

Interpretation: As the coefficients are negative it imply the hazard of second birth in urban is 24% lower than rural area. The hazard of second birth for Islam followers is 15% lower compared to Hindu followers.

plot(fit.1)

fit.2<-phreg(Surv(secbi, b2event)~rururb+relg,
             data=sub2[sub2$secbi>0,], dist="weibull")
summary(fit.2)
## Call:
## phreg(formula = Surv(secbi, b2event) ~ rururb + relg, data = sub2[sub2$secbi > 
##     0, ], dist = "weibull")
## 
## Covariate          W.mean      Coef Exp(Coef)  se(Coef)    Wald p
## rururb              0.401    -0.369     0.691     0.031     0.000 
## relg 
##         Hinduism    0.780     0         1           (reference)
##            Islam    0.220    -0.195     0.823     0.038     0.000 
## 
## log(scale)                    3.956               0.012     0.000 
## log(shape)                    0.426               0.010     0.000 
## 
## Events                    4760 
## Total time at risk        265737 
## Max. log. likelihood      -23124 
## LR test statistic         197.36 
## Degrees of freedom        2 
## Overall p-value           0

Interpretation: As the coefficients are negative it imply the hazard of second birth in urban is 31% lower than rural area. The hazard of second birth for Islam followers is 18% lower compared to Hindu followers.

plot(fit.2)

plot(fit.2, fn="haz")
lines(fit.haz.sm, col=2)

The model is poor fit because the hazard function changes the direction.

Log-Normal Model

fit.3<-phreg(Surv(secbi, b2event)~rururb+relg,
             data=sub2[sub2$secbi>0,], dist="lognormal", center=T)
summary(fit.3)
## Call:
## phreg(formula = Surv(secbi, b2event) ~ rururb + relg, data = sub2[sub2$secbi > 
##     0, ], dist = "lognormal", center = T)
## 
## Covariate          W.mean      Coef Exp(Coef)  se(Coef)    Wald p
## (Intercept)                  -2.182               0.137     0.000 
## rururb              0.401    -0.349     0.705     0.031     0.000 
## relg 
##         Hinduism    0.780     0         1           (reference)
##            Islam    0.220    -0.189     0.827     0.038     0.000 
## 
## log(scale)                    2.909               0.031     0.000 
## log(shape)                    1.377               0.055     0.000 
## 
## Events                    4760 
## Total time at risk        265737 
## Max. log. likelihood      -22022 
## LR test statistic         180.22 
## Degrees of freedom        2 
## Overall p-value           0

Interpretation: As the coefficients are negative it imply the hazard of second birth in urban is 30% lower than rural area. The hazard of second birth for Islam followers is 19% lower compared to Hindu followers.

plot(fit.3)

plot(fit.3, fn="haz")
lines(fit.haz.sm, col=2)

Log-normal model did not fit with the empirical hazard model.

Log-logistic Model

fit.4<-phreg(Surv(secbi, b2event)~rururb+relg,
             data=sub2[sub2$secbi>0,], dist="loglogistic", center = T)
summary(fit.4)
## Call:
## phreg(formula = Surv(secbi, b2event) ~ rururb + relg, data = sub2[sub2$secbi > 
##     0, ], dist = "loglogistic", center = T)
## 
## Covariate          W.mean      Coef Exp(Coef)  se(Coef)    Wald p
## (Intercept)                  -0.833               0.048     0.000 
## rururb              0.401    -0.329     0.719     0.031     0.000 
## relg 
##         Hinduism    0.780     0         1           (reference)
##            Islam    0.220    -0.180     0.835     0.038     0.000 
## 
## log(scale)                    3.306               0.016     0.000 
## log(shape)                    1.570               0.028     0.000 
## 
## Events                    4760 
## Total time at risk        265737 
## Max. log. likelihood      -21911 
## LR test statistic         160.51 
## Degrees of freedom        2 
## Overall p-value           0

Interpretation: Interpretation: As the coefficients are negative it imply the hazard of second birth in urban is 29% lower than rural area. The hazard of second birth for Islam followers is 16% lower compared to Hindu followers.

plot(fit.4)

plot(fit.4, fn="haz")
lines(fit.haz.sm, col=2)

The model perfectly fit with the empirical data.

AIC1<--2*fit.1$loglik[2]+2*length(fit.1$coefficients); AIC1
## [1] 47708.51
AIC2<--2*fit.2$loglik[2]+2*length(fit.2$coefficients); AIC2
## [1] 46255.1
AIC3<--2*fit.3$loglik[2]+2*length(fit.3$coefficients); AIC3
## [1] 44054.74
AIC4<--2*fit.4$loglik[2]+2*length(fit.4$coefficients); AIC4
## [1] 43832.39

Based on the minimum AIC value, log logistic hazard function is the best model fit.

Conclusion: Even though all the models provided significant hazard values for each predictors however, the strength of model is how it fits with the empirical data. And we find that clearly from the log logistic function that how well it fits with the empirical data. Also, the lowest AIC value among the models provide evidence to use log-logistic model for this study. To support the argument more graphical and tabular presentation piecewise constant exponential model and cox regression has been done below.

Piecewise constant exponential model

fit.5<-phreg(Surv(secbi, b2event)~rururb+relg,
             data=sub2[sub2$secbi>0,], dist="pch",
             cuts=c(12, 120,220, 320))
summary(fit.5)
## Call:
## phreg(formula = Surv(secbi, b2event) ~ rururb + relg, data = sub2[sub2$secbi > 
##     0, ], dist = "pch", cuts = c(12, 120, 220, 320))
## 
## Covariate          W.mean      Coef Exp(Coef)  se(Coef)    Wald p
## rururb              0.401    -0.306     0.736     0.031     0.000 
## relg 
##         Hinduism    0.780     0         1           (reference)
##            Islam    0.220    -0.173     0.841     0.038     0.000 
## 
## 
## Events                    4760 
## Total time at risk        265737 
## Max. log. likelihood      -22554 
## LR test statistic         140.66 
## Degrees of freedom        2 
## Overall p-value           0
plot(fit.5)

plot(fit.5, fn="haz", ylim=c(0, .05))
lines(fit.haz.sm, col=2)

AIC5<--2*fit.5$loglik[2]+2*length(fit.5$coefficients); AIC5
## [1] 45112.21
AIC4
## [1] 43832.39

Graphical output

emp<-coxreg(Surv(secbi, b2event)~rururb+relg,
            data=sub2[sub2$secbi>0,])

check.dist(sp=emp,pp=fit.1, main = "Empirical vs. Exponential")

The two lines did not match at all. That is the model did not fit well with the data.

check.dist(sp=emp,pp=fit.2, main = "Empirical vs. Weibull")

At the begining the curves were following each other together but later they split away into two different ways.

check.dist(sp=emp,pp=fit.3, main = "Empirical vs. Log-Normal")

It is slightly better than weibull but the gap at the end is bigger.

check.dist(sp=emp,pp=fit.4, main = "Empirical vs. Log-Logistic")

This shows the best fit compared to earlier models because all the way long the curves followed each other and at the end gap is lesser than the other models. That is why this is the best model for to understand the relationship of birth interval, and education and age.

check.dist(sp=emp,pp=fit.5, main = "Empirical vs. PCH")