Homework 5 - Cox regression, DEM 7223 - Event History Analysis
load library
library(tidyverse)
library(haven)
library(survival)
library(car)
library(survey)
library(muhaz)
library(eha)
library(ggsurvfit)
library(dplyr)# load the data
d1<-read_dta("GHKR72DT/GHKR72FL.DTA")
d1<-zap_label(d1)d2<-d1 %>%
mutate(caseid=caseid,
death.age=b7,
int.cmc=v008,
fbir.cmc=b3,
marr.cmc=v509,
birth_date=b3,
child_alive=b5,
highses=v190,
educ=v106,
partnedu=v701,
age=v012,
rural=v025,
agec=cut(v012, breaks = seq(15,50,5), include.lowest=T),
partnerage=v730,
weight=v005/1000000,
psu=v021,
strata=v022) %>%
select(caseid,death.age,int.cmc,fbir.cmc,marr.cmc,fbir.cmc,birth_date,child_alive,highses,educ,rural,agec,partnedu, partnerage, weight, psu, strata) %>%
mutate(highses =car::Recode(highses, recodes ="1:3 =0; 4:5 =1; else =NA"))
d2$death.age<-ifelse(d2$child_alive==1,
((((d2$int.cmc)))- (((d2$birth_date))))
,d2$death.age)
# censoring indicator for death by age 1, in months (12 months)
d2$d.event<-ifelse(is.na(d2$death.age) ==T| d2$death.age > 12,0,1)
d2$d.eventfac<-factor(d2$d.event)
levels(d2$d.eventfac)<-c("Alive at 1", "Death by 1")#Creating covariates
d2$educ.high<-ifelse(d2$educ %in% c(2,3), 1, 0)
d2$partnhiedu<-ifelse(d2$partnedu<3,0,
ifelse(d2$partnedu%in%c(8,9),NA,1 ))options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~psu, strata = ~strata,
data = d2, weight = ~weight)Create Person - Period file
d1<- survSplit(Surv(death.age, d.event)~. ,
data = d2[d2$death.age>0,],
cut = seq(0,60,5),
episode = "year_death")
d1$year<-d1$year_death-1
d1<-d1[order(d1$caseid, d1$year_death),]
knitr::kable(head(d1[, c("caseid", "death.age", "d.event", "year", "educ", "highses", "agec", "rural")], n = 20))| caseid | death.age | d.event | year | educ | highses | agec | rural | |
|---|---|---|---|---|---|---|---|---|
| 1 | 1 1 2 | 5 | 0 | 1 | 1 | 0 | (40,45] | 2 |
| 2 | 1 1 2 | 10 | 0 | 2 | 1 | 0 | (40,45] | 2 |
| 3 | 1 1 2 | 15 | 0 | 3 | 1 | 0 | (40,45] | 2 |
| 4 | 1 1 2 | 20 | 0 | 4 | 1 | 0 | (40,45] | 2 |
| 5 | 1 1 2 | 25 | 0 | 5 | 1 | 0 | (40,45] | 2 |
| 6 | 1 1 2 | 30 | 0 | 6 | 1 | 0 | (40,45] | 2 |
| 7 | 1 1 2 | 31 | 0 | 7 | 1 | 0 | (40,45] | 2 |
| 8 | 1 3 2 | 5 | 0 | 1 | 2 | 0 | (20,25] | 2 |
| 9 | 1 3 2 | 10 | 1 | 2 | 2 | 0 | (20,25] | 2 |
| 10 | 1 5 2 | 5 | 0 | 1 | 1 | 1 | (35,40] | 2 |
| 12 | 1 5 2 | 5 | 0 | 1 | 1 | 1 | (35,40] | 2 |
| 11 | 1 5 2 | 7 | 1 | 2 | 1 | 1 | (35,40] | 2 |
| 13 | 1 5 2 | 10 | 0 | 2 | 1 | 1 | (35,40] | 2 |
| 14 | 1 5 2 | 15 | 0 | 3 | 1 | 1 | (35,40] | 2 |
| 15 | 1 5 2 | 20 | 0 | 4 | 1 | 1 | (35,40] | 2 |
| 16 | 1 5 2 | 25 | 0 | 5 | 1 | 1 | (35,40] | 2 |
| 17 | 1 5 2 | 30 | 0 | 6 | 1 | 1 | (35,40] | 2 |
| 18 | 1 5 2 | 35 | 0 | 7 | 1 | 1 | (35,40] | 2 |
| 19 | 1 5 2 | 40 | 0 | 8 | 1 | 1 | (35,40] | 2 |
| 20 | 1 5 2 | 45 | 0 | 9 | 1 | 1 | (35,40] | 2 |
Discrete time model
The General Model
options(survey.lonely.psu = "adjust")
des<-survey::svydesign(ids=~psu,
strata = ~strata,
data = d1,
weight=~weight)fit.gm<-svyglm(d.event~as.factor(year) -1,
design = des,
family = binomial(link="cloglog"))Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.gm)
Call:
svyglm(formula = d.event ~ as.factor(year) - 1, design = des,
family = binomial(link = "cloglog"))
Survey design:
survey::svydesign(ids = ~psu, strata = ~strata, data = d1, weight = ~weight)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
as.factor(year)1 -2.21219 0.04636 -47.72 <2e-16 ***
as.factor(year)2 -2.23171 0.05475 -40.76 <2e-16 ***
as.factor(year)3 -3.02859 0.09756 -31.04 <2e-16 ***
as.factor(year)4 -20.52106 0.03668 -559.41 <2e-16 ***
as.factor(year)5 -20.52194 0.03816 -537.76 <2e-16 ***
as.factor(year)6 -20.52213 0.03770 -544.31 <2e-16 ***
as.factor(year)7 -20.52348 0.03873 -529.94 <2e-16 ***
as.factor(year)8 -20.52290 0.03915 -524.16 <2e-16 ***
as.factor(year)9 -20.51785 0.04063 -505.00 <2e-16 ***
as.factor(year)10 -20.51935 0.04687 -437.77 <2e-16 ***
as.factor(year)11 -20.52747 0.05593 -367.04 <2e-16 ***
as.factor(year)12 -20.51075 0.07376 -278.09 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 0.4417794)
Number of Fisher Scoring iterations: 19
Plotting the Hazard Function
haz<-1/(1+exp(-coef(fit.gm)))
time<-seq(1,12,1)
plot(haz~time, type="l", ylab="h(t)")
title(main="Discrete Time Hazard Function for Infant Mortality")Plotting the Survival Estimate
St<-NA
time<-1:length(haz)
St[1]<-1-haz[1]
for(i in 2:length(haz)){
St[i]<-St[i-1]* (1-haz[i])
}
St<-c(1, St)
time<-c(0, time)
plot(y=St,x=time, type="l",
main="Survival Plot for Infant Mortality")Alternative Time Specification
Constant model
fit.cm<-svyglm(d.event~ year + I(year^2) + I(year^3),
design=des, family=binomial (link="cloglog"))Warning in eval(family$initialize): non-integer #successes in a binomial glm!
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(fit.cm)
Call:
svyglm(formula = d.event ~ year + I(year^2) + I(year^3), design = des,
family = binomial(link = "cloglog"))
Survey design:
survey::svydesign(ids = ~psu, strata = ~strata, data = d1, weight = ~weight)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 11.48022 0.55907 20.53 <2e-16 ***
year -25.34562 0.91562 -27.68 <2e-16 ***
I(year^2) 14.06158 0.42912 32.77 <2e-16 ***
I(year^3) -2.40838 0.05909 -40.76 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 0.4417794)
Number of Fisher Scoring iterations: 25
linear model
fit.lm<-svyglm(d.event~year, design=des,
family=binomial (link="cloglog"))Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.lm)
Call:
svyglm(formula = d.event ~ year, design = des, family = binomial(link = "cloglog"))
Survey design:
survey::svydesign(ids = ~psu, strata = ~strata, data = d1, weight = ~weight)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.17868 0.04697 -25.09 <2e-16 ***
year -0.76734 0.02273 -33.76 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 0.5650687)
Number of Fisher Scoring iterations: 8
Quadratic model
fit.qtc<-svyglm(d.event~year + I(year^2),
design=des, family=binomial (link="cloglog"))Warning in eval(family$initialize): non-integer #successes in a binomial glm!
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(fit.qtc)
Call:
svyglm(formula = d.event ~ year + I(year^2), design = des, family = binomial(link = "cloglog"))
Survey design:
survey::svydesign(ids = ~psu, strata = ~strata, data = d1, weight = ~weight)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3.55573 0.16638 -21.37 <2e-16 ***
year 1.92683 0.17716 10.88 <2e-16 ***
I(year^2) -0.61060 0.04312 -14.16 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 0.4696829)
Number of Fisher Scoring iterations: 12
Quartic model
fit.q<-svyglm(d.event~year+I(year^2)+I(year^3 )+I(year^4),
design=des ,
family=binomial(link="cloglog"))Warning in eval(family$initialize): non-integer #successes in a binomial glm!
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(fit.q)
Call:
svyglm(formula = d.event ~ year + I(year^2) + I(year^3) + I(year^4),
design = des, family = binomial(link = "cloglog"))
Survey design:
survey::svydesign(ids = ~psu, strata = ~strata, data = d1, weight = ~weight)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 18.96746 0.70669 26.84 <2e-16 ***
year -40.91271 1.23114 -33.23 <2e-16 ***
I(year^2) 24.92306 0.65282 38.18 <2e-16 ***
I(year^3) -5.49674 0.12326 -44.59 <2e-16 ***
I(year^4) 0.30675 0.00642 47.78 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 0.4417794)
Number of Fisher Scoring iterations: 23
Spline model
library(splines)
fit.sp<-svyglm(d.event~ns(year, df=3), design=des,
family=binomial (link="cloglog"))Warning in eval(family$initialize): non-integer #successes in a binomial glm!
Warning: glm.fit: algorithm did not converge
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(fit.sp)
Call:
svyglm(formula = d.event ~ ns(year, df = 3), design = des, family = binomial(link = "cloglog"))
Survey design:
survey::svydesign(ids = ~psu, strata = ~strata, data = d1, weight = ~weight)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2.212e+00 4.636e-02 -47.72 < 2e-16 ***
ns(year, df = 3)1 -1.368e+01 2.505e+00 -5.46 8.34e-08 ***
ns(year, df = 3)2 -3.146e+03 1.081e+02 -29.12 < 2e-16 ***
ns(year, df = 3)3 -5.235e+03 1.814e+02 -28.86 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 0.4417794)
Number of Fisher Scoring iterations: 25
dd<- expand.grid(year=seq(1,12,1))
dd$genmod<-predict(fit.gm, newdata = data.frame(year=as.factor(1:12)), type="response")
dd$lin<-predict(fit.lm, newdata=dd, type="response")
dd$sq<-predict(fit.qtc, newdata=dd, type="response")
dd$cub<-predict(fit.cm, newdata=dd, type="response")
dd$quart<-predict(fit.q, newdata=dd, type="response")
dd$spline<-predict(fit.sp, newdata=expand.grid(year=seq(1,12,1)), type="response")
dd year genmod lin sq cub quart
1 1 1.036824e-01 1.331085e-01 1.010351e-01 1.036824e-01 1.036824e-01
2 2 1.017844e-01 6.416241e-02 1.105292e-01 1.017844e-01 1.017844e-01
3 3 4.723214e-02 3.031653e-02 3.726871e-02 4.723214e-02 4.723214e-02
4 4 1.224104e-09 1.419038e-02 3.625063e-03 5.313069e-09 4.687522e-09
5 5 1.223019e-09 6.613030e-03 1.023895e-04 2.220446e-16 2.220446e-16
6 6 1.222785e-09 3.075518e-03 8.513085e-07 2.220446e-16 2.220446e-16
7 7 1.221144e-09 1.428970e-03 2.087047e-09 2.220446e-16 2.220446e-16
8 8 1.221850e-09 6.636451e-04 1.508734e-12 2.220446e-16 2.220446e-16
9 9 1.228041e-09 3.081482e-04 3.216095e-16 2.220446e-16 2.220446e-16
10 10 1.226190e-09 1.430678e-04 2.220446e-16 2.220446e-16 2.220446e-16
11 11 1.216282e-09 6.642096e-05 2.220446e-16 2.220446e-16 2.220446e-16
12 12 1.236791e-09 3.083610e-05 2.220446e-16 2.220446e-16 1.030462e-09
spline
1 1.036824e-01
2 1.000000e+00
3 1.000000e+00
4 1.000000e+00
5 1.000000e+00
6 1.000000e+00
7 2.220446e-16
8 2.220446e-16
9 2.220446e-16
10 2.220446e-16
11 2.220446e-16
12 2.220446e-16
Include all main effects in the model
fit.0<-svyglm(d.event~as.factor(year) + highses + educ.high + partnhiedu + agec,
design = des,
family = binomial(link="cloglog"))Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.0)
Call:
svyglm(formula = d.event ~ as.factor(year) + highses + educ.high +
partnhiedu + agec, design = des, family = binomial(link = "cloglog"))
Survey design:
survey::svydesign(ids = ~psu, strata = ~strata, data = d1, weight = ~weight)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.64797 0.13165 -12.518 < 2e-16 ***
as.factor(year)2 -0.01199 0.07765 -0.154 0.877413
as.factor(year)3 -0.74691 0.11662 -6.405 4.38e-10 ***
as.factor(year)4 -18.25767 0.05979 -305.350 < 2e-16 ***
as.factor(year)5 -18.25454 0.06096 -299.471 < 2e-16 ***
as.factor(year)6 -18.24842 0.06076 -300.322 < 2e-16 ***
as.factor(year)7 -18.24354 0.06191 -294.675 < 2e-16 ***
as.factor(year)8 -18.23796 0.06246 -291.983 < 2e-16 ***
as.factor(year)9 -18.22760 0.06241 -292.079 < 2e-16 ***
as.factor(year)10 -18.22205 0.06629 -274.883 < 2e-16 ***
as.factor(year)11 -18.20833 0.07284 -249.966 < 2e-16 ***
as.factor(year)12 -18.18507 0.08702 -208.980 < 2e-16 ***
highses -0.01724 0.08138 -0.212 0.832328
educ.high 0.03339 0.07159 0.466 0.641177
partnhiedu 0.17808 0.10944 1.627 0.104509
agec(20,25] -0.43561 0.13279 -3.280 0.001130 **
agec(25,30] -0.58583 0.13731 -4.266 2.50e-05 ***
agec(30,35] -0.68271 0.13922 -4.904 1.39e-06 ***
agec(35,40] -0.86841 0.16074 -5.403 1.15e-07 ***
agec(40,45] -1.03830 0.18379 -5.649 3.13e-08 ***
agec(45,50] -1.17353 0.30627 -3.832 0.000148 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 0.4348594)
Number of Fisher Scoring iterations: 19
Testing for interaction between at least two of the predictors
fit.1<-svyglm(d.event~as.factor(year) + highses + educ.high + partnhiedu + agec*highses + educ.high*agec + highses*partnhiedu,
design = des,
family = binomial(link="cloglog"))Warning in eval(family$initialize): non-integer #successes in a binomial glm!
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(fit.1)
Call:
svyglm(formula = d.event ~ as.factor(year) + highses + educ.high +
partnhiedu + agec * highses + educ.high * agec + highses *
partnhiedu, design = des, family = binomial(link = "cloglog"))
Survey design:
survey::svydesign(ids = ~psu, strata = ~strata, data = d1, weight = ~weight)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.75370 0.16709 -10.496 < 2e-16 ***
as.factor(year)2 -0.01040 0.07757 -0.134 0.893460
as.factor(year)3 -0.74318 0.11680 -6.363 5.79e-10 ***
as.factor(year)4 -18.24915 0.05981 -305.093 < 2e-16 ***
as.factor(year)5 -18.24647 0.06095 -299.364 < 2e-16 ***
as.factor(year)6 -18.24035 0.06052 -301.414 < 2e-16 ***
as.factor(year)7 -18.23418 0.06189 -294.605 < 2e-16 ***
as.factor(year)8 -18.22711 0.06226 -292.777 < 2e-16 ***
as.factor(year)9 -18.21914 0.06194 -294.135 < 2e-16 ***
as.factor(year)10 -18.21454 0.06586 -276.547 < 2e-16 ***
as.factor(year)11 -18.20092 0.07310 -248.987 < 2e-16 ***
as.factor(year)12 -18.18373 0.08697 -209.079 < 2e-16 ***
highses -0.01549 0.44062 -0.035 0.971974
educ.high 0.24620 0.23329 1.055 0.291943
partnhiedu 0.40337 0.15290 2.638 0.008682 **
agec(20,25] -0.42834 0.18559 -2.308 0.021547 *
agec(25,30] -0.62337 0.20163 -3.092 0.002140 **
agec(30,35] -0.48086 0.18335 -2.623 0.009082 **
agec(35,40] -0.61526 0.19935 -3.086 0.002178 **
agec(40,45] -0.83996 0.24322 -3.454 0.000617 ***
agec(45,50] -0.77460 0.32646 -2.373 0.018162 *
highses:agec(20,25] 0.12167 0.54153 0.225 0.822357
highses:agec(25,30] 0.24742 0.44819 0.552 0.581248
highses:agec(30,35] -0.21273 0.44689 -0.476 0.634329
highses:agec(35,40] 0.02999 0.47119 0.064 0.949289
highses:agec(40,45] -0.30840 0.67291 -0.458 0.646995
highses:agec(45,50] -14.53364 0.84563 -17.187 < 2e-16 ***
educ.high:agec(20,25] -0.10174 0.26337 -0.386 0.699487
educ.high:agec(25,30] -0.17068 0.27500 -0.621 0.535218
educ.high:agec(30,35] -0.22137 0.27061 -0.818 0.413852
educ.high:agec(35,40] -0.55877 0.34081 -1.640 0.101937
educ.high:agec(40,45] -0.29715 0.44439 -0.669 0.504122
educ.high:agec(45,50] -16.40189 0.54741 -29.963 < 2e-16 ***
highses:partnhiedu -0.28573 0.20989 -1.361 0.174232
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 0.4323507)
Number of Fisher Scoring iterations: 19
Generate hazard plots for interesting cases highlighting the significant predictors in your analysis
plot(genmod~year, dd, type="l", ylab="h(t)", xlab="Time", ylim=c(0, .25), xlim=c(0, 20))
title(main="Hazard function from different time parameterizations")
lines(lin~year, dd, col=2, lwd=2)
lines(sq~year, dd, col=3, lwd=2)
lines(cub~year, dd, col=4, lwd=2)
lines(quart~year, dd, col=5, lwd=2)
lines(spline~year, dd, col=6, lwd=2)
legend("topright",
legend=c("General Model","constant", "Linear","Square", "Cubic", "Quartic", "Natural spline"),
col=c(1:6), lwd=1.5)#AIC table
aic<-round(c(
fit.lm$deviance+2*length(fit.lm$coefficients),
fit.qtc$deviance+2*length(fit.qtc$coefficients),
fit.cm$deviance+2*length(fit.cm$coefficients),
fit.q$deviance+2*length(fit.q$coefficients),
fit.sp$deviance+2*length(fit.sp$coefficients),
fit.gm$deviance+2*length(fit.gm$coefficients)),2)
#comparing all aics to the general model
dif.aic<-round(aic-aic[6],2)
data.frame(model =c( "linear","square", "cubic", "quartic","spline", "general"),
aic=aic,
aic_dif=dif.aic) model aic aic_dif
1 linear 9228.99 359.82
2 square 8897.41 28.24
3 cubic 8853.17 -16.00
4 quartic 8855.17 -14.00
5 spline 8853.17 -16.00
6 general 8869.17 0.00
Interpretation
The AIC’s of the alternative time specifications shows that the cubic and spline AIC fits the data best.