```{r}
library(haven)
library(tidyverse)
library(dplyr)
library(survival)
library(survminer)
library(ggsurvfit)
library(survey)
```
Discrete-time models analyse time-to-event data when survival times are intrinsically discrete (e.g. number of machine cycles) or grouped into discrete intervals of time (often referred to as interval censoring). -Mills, Chapter 10
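To make that concrete in standard notation (this is general discrete-time survival notation, not something specific to the example below): if follow-up time is divided into intervals indexed by $j$, the discrete-time hazard for person $i$ is the conditional probability of the event in interval $j$ given survival up to it, and the discrete-time logistic model puts covariates on the logit of that hazard:

$$
h_{ij} = \Pr(T_i = j \mid T_i \ge j), \qquad \operatorname{logit}(h_{ij}) = \alpha_j + x_i'\beta .
$$

Fitting this amounts to a binary regression on a person-period file, which is what the rest of this example builds.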
I am again using the 2014 Bangladesh DHS data.
First, reading in the household file for Bangladesh and creating a multigen variable.
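(The household-file chunk itself is not shown in this section. As a placeholder, here is a minimal sketch of what that step could look like; the file name `BDHR72FL.DTA`, the object names `bghh`/`bghh22`, and the use of the `hv101_*` relationship-to-head codes are my assumptions, not taken from the original code.)

```{r}
#| eval: false
# Hypothetical sketch only: read the DHS household recode and flag
# multigenerational households as those containing a grandchild, parent,
# or parent-in-law of the household head (hv101 codes 5-7).
# All names here are assumptions; the original chunk is not shown.
bghh <- read_dta("BDHR72FL.DTA")

bghh22 <- bghh %>%
  mutate(n3gen = rowSums(across(starts_with("hv101_"), ~ .x %in% c(5, 6, 7)), na.rm = TRUE),
         mgenhh = ifelse(n3gen > 0, "mgenhh", "notmgen")) %>%
  select(hhid, mgenhh)
```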
Next, bringing in the individual file and combining.
```{r}
bgin <- read_dta("C:/Users/codar/OneDrive - University of Texas at San Antonio/R 2022/BDIR72FL.DTA")

bgin$hh <- substr(bgin$caseid, 1, 12)

bang1 <- left_join(bgin, bghh22, by = c("hh" = "hhid")) #you used bdat, not bdat2
# names(mdat)
```
Now, I select variables and create survey design.
```{r}
bang2 <- bang1 %>%
  filter(bidx_01 == 1 & b0_01 == 0) %>%
  transmute(CASEID = caseid,
            int.cmc = v008,
            fbir.cmc = b3_01,
            sbir.cmc = b3_02,
            marr.cmc = v509,
            rural = v025,
            educ = v106,
            age = v012,
            agec = cut(v012, breaks = seq(15, 50, 5), include.lowest = T),
            partneredu = v701,
            partnerage = v730,
            weight = v005/1000000,
            psu = v021,
            strata = v022, #by adding them into the transmute, I was able to get them to show up in the mdat2 data frame...why?
            mgenhh = mgenhh,
            hh = hh) %>%
  select(CASEID, int.cmc, fbir.cmc, sbir.cmc, marr.cmc, rural, educ, age, agec,
         partneredu, partnerage, weight, psu, strata, mgenhh, hh) %>%
  mutate(agefb = age - (int.cmc - fbir.cmc)/12)
```
```{r}
#Calculating the birth intervals (observed and censored) and the event indicator (did women have a second birth?)
bang3 <- bang2 %>%
  mutate(secbi = ifelse(is.na(sbir.cmc) == T,
                        int.cmc - fbir.cmc,
                        fbir.cmc - sbir.cmc),
         b2event = ifelse(is.na(sbir.cmc) == T, 0, 1))

#Survey design
options(survey.lonely.psu = "adjust")

des <- svydesign(ids = ~psu,
                 strata = ~strata,
                 data = bang3[bang3$secbi > 0, ],
                 weight = ~weight)
```
A subject-period file, also sometimes referred to as a person-period file or discrete-time data, is arranged by a temporal unit of analysis that serves as a ‘clock’ that counts the exposure time or risk period of each subject. In this type of file, the clock starts for each subject at the onset of risk. -Mills, Chapter 3
```{r}
#Make the person-period file
bb <- survSplit(Surv(secbi, b2event) ~ .,
                data = bang3[bang3$secbi > 0, ],
                cut = seq(0, 60, 12), #changing the middle # changes the output of "secbi" --
                episode = "year_birth")

bb$year <- bb$year_birth - 1
bb <- bb[order(bb$CASEID, bb$year_birth), ]

knitr::kable(head(bb[, c("CASEID", "secbi", "b2event", "year", "educ", "mgenhh", "rural")], n = 20))
```
CASEID | secbi | b2event | year | educ | mgenhh | rural |
---|---|---|---|---|---|---|
1 3 2 | 12 | 0 | 1 | 1 | mgenhh | 2 |
1 3 2 | 24 | 0 | 2 | 1 | mgenhh | 2 |
1 3 2 | 36 | 0 | 3 | 1 | mgenhh | 2 |
1 3 2 | 48 | 0 | 4 | 1 | mgenhh | 2 |
1 3 2 | 60 | 0 | 5 | 1 | mgenhh | 2 |
1 3 2 | 110 | 1 | 6 | 1 | mgenhh | 2 |
1 6 2 | 12 | 0 | 1 | 1 | notmgen | 2 |
1 6 2 | 24 | 0 | 2 | 1 | notmgen | 2 |
1 6 2 | 36 | 0 | 3 | 1 | notmgen | 2 |
1 6 2 | 48 | 0 | 4 | 1 | notmgen | 2 |
1 6 2 | 60 | 0 | 5 | 1 | notmgen | 2 |
1 6 2 | 114 | 1 | 6 | 1 | notmgen | 2 |
1 9 2 | 12 | 0 | 1 | 1 | notmgen | 2 |
1 9 2 | 24 | 0 | 2 | 1 | notmgen | 2 |
1 9 2 | 30 | 1 | 3 | 1 | notmgen | 2 |
1 20 2 | 12 | 0 | 1 | 0 | notmgen | 2 |
1 20 2 | 24 | 0 | 2 | 0 | notmgen | 2 |
1 20 2 | 36 | 0 | 3 | 0 | notmgen | 2 |
1 20 2 | 48 | 0 | 4 | 0 | notmgen | 2 |
1 20 2 | 60 | 0 | 5 | 0 | notmgen | 2 |
The following code was used in the Mills book example. I wasn't able to make it work. In our DHS dataset, which variable would we use for start? Would it be int.cmc or the date of interview? It doesn't seem to work with the interview date variable. In your code, you specify episode as "year_birth", which is optional according to the argument descriptions. Can you explain how/why that was added?
```{r}
#From Mills book
# birthperiod <- survSplit(bang3, cut = c(1:12), end = "b2event", event = "secbi", start = "year_birth")
```
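One note on the `episode=` argument: the Mills-style call above uses the older column-name interface, while the working code above uses the current formula interface of `survSplit()`. As I read the survival package documentation, `episode=` is optional and simply adds a column that numbers the interval (episode) each person-period row falls into, which is then recoded to `year`. A tiny toy illustration (made-up data, not the DHS file):

```{r}
# Toy illustration of survSplit(): one subject whose second birth occurs at
# 30 months, split at 12-month cut points. The 'episode' argument adds an
# interval counter (it starts at 2 here because 0 is included among the cut
# points, which is why the main code subtracts 1 to get 'year').
library(survival)
toy <- data.frame(id = 1, secbi = 30, b2event = 1)
survSplit(Surv(secbi, b2event) ~ ., data = toy,
          cut = seq(0, 60, 12), episode = "year_birth")
# yields three rows covering (0,12], (12,24], (24,30], with the event only in the last row
```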
Fitting the basic logistic model with only time in the model. We have to create a new survey design because we have now created a person-period file.
```{r}
options(survey.lonely.psu = "adjust")

des1 <- survey::svydesign(ids = ~psu,
                          strata = ~strata,
                          data = bb,
                          weight = ~weight)

fit.b <- svyglm(b2event ~ as.factor(year) - 1, design = des1, family = "binomial")
```
Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.b)
Call:
svyglm(formula = b2event ~ as.factor(year) - 1, design = des1,
family = "binomial")
Survey design:
survey::svydesign(ids = ~psu, strata = ~strata, data = bb, weight = ~weight)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
as.factor(year)1 -4.16877 0.08860 -47.05 <2e-16 ***
as.factor(year)2 -2.04914 0.03391 -60.42 <2e-16 ***
as.factor(year)3 -1.47029 0.05241 -28.05 <2e-16 ***
as.factor(year)4 -1.41197 0.03540 -39.89 <2e-16 ***
as.factor(year)5 -1.26657 0.03837 -33.01 <2e-16 ***
as.factor(year)6 1.13695 0.03690 30.81 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1.000015)
Number of Fisher Scoring iterations: 6
This provides a hazard estimate for each time period.
```{r}
haz <- 1/(1 + exp(-coef(fit.b)))
time <- seq(1, 6, 1)

plot(haz ~ time, type = "l", ylab = "h(t)")
title(main = "Discrete Time Hazard Function for birth interval")

St <- NA
time <- 1:length(haz)
St[1] <- 1 - haz[1]
for(i in 2:length(haz)){
  St[i] <- St[i-1] * (1 - haz[i])
}
St <- c(1, St)
time <- c(0, time)

plot(y = St, x = time, type = "l",
     main = "Survival function for second birth interval")
```
```{r}
#| warning: false
#| error: false
#| results: hide
#General model (intercept only)
fit.g <- svyglm(b2event ~ 1,
                design = des1,
                family = binomial(link = "cloglog"))

#Linear term for time
fit.lb <- svyglm(b2event ~ year,
                 design = des1,
                 family = binomial(link = "cloglog"))
summary(fit.lb)
```
```{r}
#| warning: false
#| error: false
#| results: hide
fit.qb <- svyglm(b2event ~ year + I(year^2),
                 design = des1,
                 family = binomial(link = "cloglog"))
summary(fit.qb)
```
```{r}
#| warning: false
#| error: false
#| results: hide
fit.cb <- svyglm(b2event ~ year + I(year^2) + I(year^3),
                 design = des1,
                 family = binomial(link = "cloglog"))
```
```{r}
#| warning: false
#| error: false
#| results: hide
fit.q <- svyglm(b2event ~ year + I(year^2) + I(year^3) + I(year^4),
                design = des1,
                family = binomial(link = "cloglog"))
```
```{r}
#| warning: false
#| error: false
#| results: hide
library(splines)
fit.sp <- svyglm(b2event ~ ns(year, df = 3),
                 design = des1,
                 family = binomial(link = "cloglog"))
```
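All of these alternative specifications use the complementary log-log link rather than the logit. As a general point (standard discrete-time survival theory, not specific to this example), the cloglog link is attractive because it corresponds to a grouped-time version of the continuous-time proportional hazards model:

$$
\log\left[-\log\left(1 - h(t_j \mid x)\right)\right] = \alpha(t_j) + x'\beta,
$$

where $\alpha(t_j)$ is whatever function of time is in the model (linear, quadratic, cubic, quartic, or spline here), and $\exp(\beta)$ can be interpreted as a hazard ratio.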
```{r}
dat <- expand.grid(year = seq(1, 20, 1))

dat$genmod <- predict(fit.g, newdata = data.frame(year = as.factor(1:20)), type = "response")
dat$lin    <- predict(fit.lb, newdata = dat, type = "response")
dat$sq     <- predict(fit.qb, newdata = dat, type = "response")
dat$cub    <- predict(fit.cb, newdata = dat, type = "response")
dat$quart  <- predict(fit.q, newdata = dat, type = "response")
dat$spline <- predict(fit.sp, newdata = expand.grid(year = seq(1, 20, 1)), type = "response")

dat
```
year genmod lin sq cub quart spline
1 1 0.1813186 0.04020198 0.05104934 0.01430889 0.01593735 0.01258671
2 2 0.1813186 0.07405644 0.07647749 0.11550122 0.11059055 0.02208520
3 3 0.1813186 0.13435285 0.12409598 0.19131111 0.19491629 0.03779483
4 4 0.1813186 0.23703485 0.21496689 0.18040223 0.18549099 0.06165278
5 5 0.1813186 0.39788631 0.38424949 0.23589300 0.22652270 0.09376960
6 6 0.1813186 0.61375417 0.65548391 0.75340628 0.75601587 0.13033404
7 7 0.1813186 0.83200088 0.92334787 1.00000000 1.00000000 0.16275251
8 8 0.1813186 0.96473587 0.99887541 1.00000000 1.00000000 0.18037862
9 9 0.1813186 0.99811188 1.00000000 1.00000000 1.00000000 0.18085977
10 10 0.1813186 0.99999220 1.00000000 1.00000000 1.00000000 0.17024388
11 11 0.1813186 1.00000000 1.00000000 1.00000000 1.00000000 0.15602093
12 12 0.1813186 1.00000000 1.00000000 1.00000000 1.00000000 0.14449391
13 13 0.1813186 1.00000000 1.00000000 1.00000000 1.00000000 0.14049818
14 14 0.1813186 1.00000000 1.00000000 1.00000000 1.00000000 0.14895397
15 15 0.1813186 1.00000000 1.00000000 1.00000000 1.00000000 0.17482395
16 16 0.1813186 1.00000000 1.00000000 1.00000000 1.00000000 0.22269761
17 17 0.1813186 1.00000000 1.00000000 1.00000000 1.00000000 0.29991942
18 18 0.1813186 1.00000000 1.00000000 1.00000000 1.00000000 0.41402299
19 19 0.1813186 1.00000000 1.00000000 1.00000000 1.00000000 0.56500697
20 20 0.1813186 1.00000000 1.00000000 1.00000000 1.00000000 0.73327147
```{r}
plot(genmod ~ year, dat, type = "l", ylab = "h(t)", xlab = "Time", ylim = c(0, .25), xlim = c(0, 20))
title(main = "Hazard function from different time parameterizations")
lines(lin ~ year, dat, col = 2, lwd = 2)
lines(sq ~ year, dat, col = 3, lwd = 2)
lines(cub ~ year, dat, col = 4, lwd = 2)
lines(quart ~ year, dat, col = 5, lwd = 2)
lines(spline ~ year, dat, col = 6, lwd = 2)
legend("topleft",
       legend = c("General (constant) model", "Linear", "Square", "Cubic", "Quartic", "Natural spline"),
       col = 1:6, lty = 1, lwd = 1.5)
```
```{r}
#AIC table
aic <- round(c(
  fit.lb$deviance + 2*length(fit.lb$coefficients),
  fit.qb$deviance + 2*length(fit.qb$coefficients),
  fit.cb$deviance + 2*length(fit.cb$coefficients),
  fit.q$deviance  + 2*length(fit.q$coefficients),
  fit.sp$deviance + 2*length(fit.sp$coefficients),
  fit.g$deviance  + 2*length(fit.g$coefficients)), 2)

#compare all aics to the one from the general model
dif.aic <- round(aic - aic[6], 2)

data.frame(model = c("linear", "square", "cubic", "quartic", "spline", "general"),
           aic = aic,
           aic_dif = dif.aic)
```
model aic aic_dif
1 linear 51883.31 -11311.12
2 square 51756.44 -11437.99
3 cubic 49316.54 -13877.89
4 quartic 49305.35 -13889.08
5 spline 49610.72 -13583.71
6 general 63194.43 0.00
After fitting the alternative time specifications and comparing them with the Akaike Information Criterion, the quartic model has the lowest AIC (49305.35), slightly below the cubic (49316.54), and both improve enormously on the constant ("general") specification; a flexible polynomial in time (cubic or quartic) therefore fits these data best.
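If we wanted a formal check on whether the quartic term adds anything beyond the cubic, one option (a sketch only, not run here) is a design-based Wald test of the `I(year^4)` term using `survey::regTermTest()`:

```{r}
#| eval: false
# Sketch: Wald test of the quartic term in fit.q under the survey design.
# A small p-value would support keeping I(year^4); otherwise the more
# parsimonious cubic specification would be preferred.
regTermTest(fit.q, test.terms = ~I(year^4), method = "Wald")
```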