library(haven)
library(survival)
library(car)
library(survey)
library(survminer)
library(ggplot2)
library(ggpubr)
library(muhaz)
library(car)
library(dplyr)
library(eha)
dat2<-read_dta("https://github.com/coreysparks/data/blob/master/ZZIR62FL.DTA?raw=true")
dat2<-zap_labels(dat2)
Event: Event of 2nd birth Time duration: 1st birth to 2nd birth Independent variable: place of residence (Rural&Urban) and religion (Islam&Hinduism) Censoring indicator: no 2nd child during the interview Risk set:Women had 1st child
a.Did you choose an AFT or PH model and why? I chose Proportional Hazard model because my outcome variable is the second birth after 1st birth based on the characteristics of individual like place of residence and religion.
table(is.na(dat2$bidx_01))
##
## FALSE TRUE
## 6161 2187
Subtracting women who had first birth and who had only singleton birth no twins.
sub<-subset(dat2, dat2$bidx_01==1&dat2$b0_01==0)
sub2<-data.frame(CASEID=sub$caseid,
int.cmc=sub$v008,
fbir.cmc=sub$b3_01,
sbir.cmc=sub$b3_02,
# marr.cmc=sub$v509,
rural=sub$v025,
religion=sub$v130,
# educ=sub$v106,
age=sub$v012,
# partneredu=sub$v701,
# partnerage=sub$v730,
weight=sub$v005/1000000,
psu=sub$v021, strata=sub$v022)
Calculation of intervals
sub2$agefb = (sub2$age - (sub2$int.cmc - sub2$fbir.cmc)/12)
sub2$secbi<-ifelse(is.na(sub2$sbir.cmc)==T,
((sub2$int.cmc))-((sub2$fbir.cmc)),
(sub2$fbir.cmc-sub2$sbir.cmc))
sub2$b2event<-ifelse(is.na(sub2$sbir.cmc)==T,0,1)
fit<-survfit(Surv(secbi, b2event)~1, sub2)
fit
## Call: survfit(formula = Surv(secbi, b2event) ~ 1, data = sub2)
##
## n events median 0.95LCL 0.95UCL
## 6026 4789 39 38 40
plot(fit, conf.int=T, ylab="S(t)", xlab="Months")
title(main="Survival Function for Second Birth Interval, DHS Model Data")
fit.haz.km<-kphaz.fit(sub2$secbi[sub2$secbi>0],
sub2$b2event[sub2$secbi>0] ,
method = "product-limit")
#this is a version of the hazard that is smoothed using a kernel-density method
fit.haz.sm<-muhaz(sub2$secbi[sub2$secbi>0], sub2$b2event[sub2$secbi>0] )
#Empirical hazard function (product-limit estimate) plot
kphaz.plot(fit.haz.km,main="Plot of the hazard of having a second birth")
#overlay the smoothed version
lines(fit.haz.sm, col=2, lwd=3)
legend("topleft", legend = c("KM hazard", "Smoothed Hazard"),
col=c(1,2), lty=c(1,1))
Creating predictor/independent variable
sub2$rururb <- ifelse(sub2$rural==2,0,1)
sub2$relg <- Recode(sub2$religion, recodes="1='Islam'; 2='Hinduism'; else=NA", as.factor=T)
# sub2$educ.high<-ifelse(sub2$educ %in% c(2,3), 1, 0)
# sub2$age2<-(sub2$agefb/5)^2
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~psu, strata=~strata,
data=sub2[sub2$secbi>0,], weight=~weight )
rep.des<-as.svrepdesign(des, type="bootstrap" )
I chose weibull distribution because it includes covariates and unlike hazard function is not assumed to be constant
For model diagnostics this diagnostics will carry out: Exponential model, Weibull model, log normal model, and log-logistic model.
fit.1<-phreg(Surv(secbi, b2event)~rururb+relg,
data=sub2[sub2$secbi>0,],dist = "weibull", shape = 1)
summary(fit.1)
## Call:
## phreg(formula = Surv(secbi, b2event) ~ rururb + relg, data = sub2[sub2$secbi >
## 0, ], dist = "weibull", shape = 1)
##
## Covariate W.mean Coef Exp(Coef) se(Coef) Wald p
## rururb 0.401 -0.270 0.763 0.031 0.000
## relg
## Hinduism 0.780 0 1 (reference)
## Islam 0.220 -0.155 0.856 0.038 0.000
##
## log(scale) 3.892 0.019 0.000
##
## Shape is fixed at 1
##
## Events 4760
## Total time at risk 265737
## Max. log. likelihood -23851
## LR test statistic 109.40
## Degrees of freedom 2
## Overall p-value 0
Interpretation: As the coefficients are negative it imply the hazard of second birth in urban is 24% lower than rural area. The hazard of second birth for Islam followers is 15% lower compared to Hindu followers.
plot(fit.1)
fit.2<-phreg(Surv(secbi, b2event)~rururb+relg,
data=sub2[sub2$secbi>0,], dist="weibull")
summary(fit.2)
## Call:
## phreg(formula = Surv(secbi, b2event) ~ rururb + relg, data = sub2[sub2$secbi >
## 0, ], dist = "weibull")
##
## Covariate W.mean Coef Exp(Coef) se(Coef) Wald p
## rururb 0.401 -0.369 0.691 0.031 0.000
## relg
## Hinduism 0.780 0 1 (reference)
## Islam 0.220 -0.195 0.823 0.038 0.000
##
## log(scale) 3.956 0.012 0.000
## log(shape) 0.426 0.010 0.000
##
## Events 4760
## Total time at risk 265737
## Max. log. likelihood -23124
## LR test statistic 197.36
## Degrees of freedom 2
## Overall p-value 0
Interpretation: As the coefficients are negative it imply the hazard of second birth in urban is 31% lower than rural area. The hazard of second birth for Islam followers is 18% lower compared to Hindu followers.
plot(fit.2)
plot(fit.2, fn="haz")
lines(fit.haz.sm, col=2)
The model is poor fit because the hazard function changes the direction.
Log-Normal Model
fit.3<-phreg(Surv(secbi, b2event)~rururb+relg,
data=sub2[sub2$secbi>0,], dist="lognormal", center=T)
summary(fit.3)
## Call:
## phreg(formula = Surv(secbi, b2event) ~ rururb + relg, data = sub2[sub2$secbi >
## 0, ], dist = "lognormal", center = T)
##
## Covariate W.mean Coef Exp(Coef) se(Coef) Wald p
## (Intercept) -2.182 0.137 0.000
## rururb 0.401 -0.349 0.705 0.031 0.000
## relg
## Hinduism 0.780 0 1 (reference)
## Islam 0.220 -0.189 0.827 0.038 0.000
##
## log(scale) 2.909 0.031 0.000
## log(shape) 1.377 0.055 0.000
##
## Events 4760
## Total time at risk 265737
## Max. log. likelihood -22022
## LR test statistic 180.22
## Degrees of freedom 2
## Overall p-value 0
Interpretation: As the coefficients are negative it imply the hazard of second birth in urban is 30% lower than rural area. The hazard of second birth for Islam followers is 19% lower compared to Hindu followers.
plot(fit.3)
plot(fit.3, fn="haz")
lines(fit.haz.sm, col=2)
Log-normal model did not fit with the empirical hazard model.
Log-logistic Model
fit.4<-phreg(Surv(secbi, b2event)~rururb+relg,
data=sub2[sub2$secbi>0,], dist="loglogistic", center = T)
summary(fit.4)
## Call:
## phreg(formula = Surv(secbi, b2event) ~ rururb + relg, data = sub2[sub2$secbi >
## 0, ], dist = "loglogistic", center = T)
##
## Covariate W.mean Coef Exp(Coef) se(Coef) Wald p
## (Intercept) -0.833 0.048 0.000
## rururb 0.401 -0.329 0.719 0.031 0.000
## relg
## Hinduism 0.780 0 1 (reference)
## Islam 0.220 -0.180 0.835 0.038 0.000
##
## log(scale) 3.306 0.016 0.000
## log(shape) 1.570 0.028 0.000
##
## Events 4760
## Total time at risk 265737
## Max. log. likelihood -21911
## LR test statistic 160.51
## Degrees of freedom 2
## Overall p-value 0
Interpretation: Interpretation: As the coefficients are negative it imply the hazard of second birth in urban is 29% lower than rural area. The hazard of second birth for Islam followers is 16% lower compared to Hindu followers.
plot(fit.4)
plot(fit.4, fn="haz")
lines(fit.haz.sm, col=2)
The model perfectly fit with the empirical data.
AIC1<--2*fit.1$loglik[2]+2*length(fit.1$coefficients); AIC1
## [1] 47708.51
AIC2<--2*fit.2$loglik[2]+2*length(fit.2$coefficients); AIC2
## [1] 46255.1
AIC3<--2*fit.3$loglik[2]+2*length(fit.3$coefficients); AIC3
## [1] 44054.74
AIC4<--2*fit.4$loglik[2]+2*length(fit.4$coefficients); AIC4
## [1] 43832.39
Based on the minimum AIC value, log logistic hazard function is the best model fit.
Conclusion: Even though all the models provided significant hazard values for each predictors however, the strength of model is how it fits with the empirical data. And we find that clearly from the log logistic function that how well it fits with the empirical data. Also, the lowest AIC value among the models provide evidence to use log-logistic model for this study. To support the argument more graphical and tabular presentation piecewise constant exponential model and cox regression has been done below.
Piecewise constant exponential model
fit.5<-phreg(Surv(secbi, b2event)~rururb+relg,
data=sub2[sub2$secbi>0,], dist="pch",
cuts=c(12, 120,220, 320))
summary(fit.5)
## Call:
## phreg(formula = Surv(secbi, b2event) ~ rururb + relg, data = sub2[sub2$secbi >
## 0, ], dist = "pch", cuts = c(12, 120, 220, 320))
##
## Covariate W.mean Coef Exp(Coef) se(Coef) Wald p
## rururb 0.401 -0.306 0.736 0.031 0.000
## relg
## Hinduism 0.780 0 1 (reference)
## Islam 0.220 -0.173 0.841 0.038 0.000
##
##
## Events 4760
## Total time at risk 265737
## Max. log. likelihood -22554
## LR test statistic 140.66
## Degrees of freedom 2
## Overall p-value 0
plot(fit.5)
plot(fit.5, fn="haz", ylim=c(0, .05))
lines(fit.haz.sm, col=2)
AIC5<--2*fit.5$loglik[2]+2*length(fit.5$coefficients); AIC5
## [1] 45112.21
AIC4
## [1] 43832.39
Graphical output
emp<-coxreg(Surv(secbi, b2event)~rururb+relg,
data=sub2[sub2$secbi>0,])
check.dist(sp=emp,pp=fit.1, main = "Empirical vs. Exponential")
The two lines did not match at all. That is the model did not fit well with the data.
check.dist(sp=emp,pp=fit.2, main = "Empirical vs. Weibull")
At the begining the curves were following each other together but later they split away into two different ways.
check.dist(sp=emp,pp=fit.3, main = "Empirical vs. Log-Normal")
It is slightly better than weibull but the gap at the end is bigger.
check.dist(sp=emp,pp=fit.4, main = "Empirical vs. Log-Logistic")
This shows the best fit compared to earlier models because all the way long the curves followed each other and at the end gap is lesser than the other models. That is why this is the best model for to understand the relationship of birth interval, and education and age.
check.dist(sp=emp,pp=fit.5, main = "Empirical vs. PCH")