Homework 5 - Cox regression, DEM 7223 - Event History Analysis

Author

Assogba Albert

Published

October 27, 2022

load library

library(tidyverse)
library(haven)
library(survival)
library(car)
library(survey)
library(muhaz)
library(eha)
library(ggsurvfit)
library(dplyr)
# load the data
d1<-read_dta("GHKR72DT/GHKR72FL.DTA")
d1<-zap_label(d1)
d2<-d1 %>%
    mutate(caseid=caseid,
           death.age=b7,
           int.cmc=v008,
           fbir.cmc=b3,
           marr.cmc=v509,
           birth_date=b3,
           child_alive=b5,
           highses=v190,
           educ=v106,
           partnedu=v701,
           age=v012,
           rural=v025,
           agec=cut(v012, breaks = seq(15,50,5), include.lowest=T),
           partnerage=v730,
           weight=v005/1000000,
           psu=v021,
           strata=v022) %>% 
  select(caseid,death.age,int.cmc,fbir.cmc,marr.cmc,fbir.cmc,birth_date,child_alive,highses,educ,rural,agec,partnedu, partnerage, weight, psu, strata) %>% 
    mutate(highses =car::Recode(highses, recodes ="1:3 =0; 4:5 =1; else =NA")) 
 

d2$death.age<-ifelse(d2$child_alive==1,
                      ((((d2$int.cmc)))- (((d2$birth_date))))
                      ,d2$death.age)

# censoring indicator for death by age 1, in months (12 months)

d2$d.event<-ifelse(is.na(d2$death.age) ==T| d2$death.age > 12,0,1)
d2$d.eventfac<-factor(d2$d.event)
levels(d2$d.eventfac)<-c("Alive at 1", "Death by 1")

#Creating covariates

d2$educ.high<-ifelse(d2$educ %in% c(2,3), 1, 0)
d2$partnhiedu<-ifelse(d2$partnedu<3,0,
                           ifelse(d2$partnedu%in%c(8,9),NA,1 ))
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~psu, strata = ~strata,
               data = d2, weight = ~weight)

Create Person - Period file

d1<- survSplit(Surv(death.age, d.event)~. ,
              data = d2[d2$death.age>0,],
              cut = seq(0,60,5),
              episode = "year_death")


d1$year<-d1$year_death-1
d1<-d1[order(d1$caseid, d1$year_death),]
knitr::kable(head(d1[, c("caseid", "death.age", "d.event", "year", "educ", "highses", "agec", "rural")], n = 20))
caseid death.age d.event year educ highses agec rural
1 1 1 2 5 0 1 1 0 (40,45] 2
2 1 1 2 10 0 2 1 0 (40,45] 2
3 1 1 2 15 0 3 1 0 (40,45] 2
4 1 1 2 20 0 4 1 0 (40,45] 2
5 1 1 2 25 0 5 1 0 (40,45] 2
6 1 1 2 30 0 6 1 0 (40,45] 2
7 1 1 2 31 0 7 1 0 (40,45] 2
8 1 3 2 5 0 1 2 0 (20,25] 2
9 1 3 2 10 1 2 2 0 (20,25] 2
10 1 5 2 5 0 1 1 1 (35,40] 2
12 1 5 2 5 0 1 1 1 (35,40] 2
11 1 5 2 7 1 2 1 1 (35,40] 2
13 1 5 2 10 0 2 1 1 (35,40] 2
14 1 5 2 15 0 3 1 1 (35,40] 2
15 1 5 2 20 0 4 1 1 (35,40] 2
16 1 5 2 25 0 5 1 1 (35,40] 2
17 1 5 2 30 0 6 1 1 (35,40] 2
18 1 5 2 35 0 7 1 1 (35,40] 2
19 1 5 2 40 0 8 1 1 (35,40] 2
20 1 5 2 45 0 9 1 1 (35,40] 2

Discrete time model

The General Model

options(survey.lonely.psu = "adjust")
des<-survey::svydesign(ids=~psu,
                strata = ~strata,
                data = d1,
                weight=~weight)
fit.gm<-svyglm(d.event~as.factor(year) -1,
              design = des,
              family = binomial(link="cloglog"))
Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.gm)

Call:
svyglm(formula = d.event ~ as.factor(year) - 1, design = des, 
    family = binomial(link = "cloglog"))

Survey design:
survey::svydesign(ids = ~psu, strata = ~strata, data = d1, weight = ~weight)

Coefficients:
                   Estimate Std. Error t value Pr(>|t|)    
as.factor(year)1   -2.21219    0.04636  -47.72   <2e-16 ***
as.factor(year)2   -2.23171    0.05475  -40.76   <2e-16 ***
as.factor(year)3   -3.02859    0.09756  -31.04   <2e-16 ***
as.factor(year)4  -20.52106    0.03668 -559.41   <2e-16 ***
as.factor(year)5  -20.52194    0.03816 -537.76   <2e-16 ***
as.factor(year)6  -20.52213    0.03770 -544.31   <2e-16 ***
as.factor(year)7  -20.52348    0.03873 -529.94   <2e-16 ***
as.factor(year)8  -20.52290    0.03915 -524.16   <2e-16 ***
as.factor(year)9  -20.51785    0.04063 -505.00   <2e-16 ***
as.factor(year)10 -20.51935    0.04687 -437.77   <2e-16 ***
as.factor(year)11 -20.52747    0.05593 -367.04   <2e-16 ***
as.factor(year)12 -20.51075    0.07376 -278.09   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 0.4417794)

Number of Fisher Scoring iterations: 19

Plotting the Hazard Function

haz<-1/(1+exp(-coef(fit.gm))) 
time<-seq(1,12,1) 
plot(haz~time, type="l", ylab="h(t)") 
title(main="Discrete Time Hazard Function for Infant Mortality")

Plotting the Survival Estimate

St<-NA
time<-1:length(haz)
St[1]<-1-haz[1]
for(i in 2:length(haz)){
St[i]<-St[i-1]* (1-haz[i])
}

St<-c(1, St)
time<-c(0, time)
plot(y=St,x=time, type="l",
main="Survival Plot for Infant Mortality")

Alternative Time Specification

Constant model

fit.cm<-svyglm(d.event~ year + I(year^2) + I(year^3),
design=des, family=binomial (link="cloglog"))
Warning in eval(family$initialize): non-integer #successes in a binomial glm!
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(fit.cm)

Call:
svyglm(formula = d.event ~ year + I(year^2) + I(year^3), design = des, 
    family = binomial(link = "cloglog"))

Survey design:
survey::svydesign(ids = ~psu, strata = ~strata, data = d1, weight = ~weight)

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  11.48022    0.55907   20.53   <2e-16 ***
year        -25.34562    0.91562  -27.68   <2e-16 ***
I(year^2)    14.06158    0.42912   32.77   <2e-16 ***
I(year^3)    -2.40838    0.05909  -40.76   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 0.4417794)

Number of Fisher Scoring iterations: 25

linear model

fit.lm<-svyglm(d.event~year, design=des,
family=binomial (link="cloglog"))
Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.lm)

Call:
svyglm(formula = d.event ~ year, design = des, family = binomial(link = "cloglog"))

Survey design:
survey::svydesign(ids = ~psu, strata = ~strata, data = d1, weight = ~weight)

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -1.17868    0.04697  -25.09   <2e-16 ***
year        -0.76734    0.02273  -33.76   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 0.5650687)

Number of Fisher Scoring iterations: 8

Quadratic model

fit.qtc<-svyglm(d.event~year + I(year^2),
design=des, family=binomial (link="cloglog"))
Warning in eval(family$initialize): non-integer #successes in a binomial glm!
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(fit.qtc)

Call:
svyglm(formula = d.event ~ year + I(year^2), design = des, family = binomial(link = "cloglog"))

Survey design:
survey::svydesign(ids = ~psu, strata = ~strata, data = d1, weight = ~weight)

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -3.55573    0.16638  -21.37   <2e-16 ***
year         1.92683    0.17716   10.88   <2e-16 ***
I(year^2)   -0.61060    0.04312  -14.16   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 0.4696829)

Number of Fisher Scoring iterations: 12

Quartic model

fit.q<-svyglm(d.event~year+I(year^2)+I(year^3 )+I(year^4),
design=des ,
family=binomial(link="cloglog"))
Warning in eval(family$initialize): non-integer #successes in a binomial glm!
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(fit.q)

Call:
svyglm(formula = d.event ~ year + I(year^2) + I(year^3) + I(year^4), 
    design = des, family = binomial(link = "cloglog"))

Survey design:
survey::svydesign(ids = ~psu, strata = ~strata, data = d1, weight = ~weight)

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  18.96746    0.70669   26.84   <2e-16 ***
year        -40.91271    1.23114  -33.23   <2e-16 ***
I(year^2)    24.92306    0.65282   38.18   <2e-16 ***
I(year^3)    -5.49674    0.12326  -44.59   <2e-16 ***
I(year^4)     0.30675    0.00642   47.78   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 0.4417794)

Number of Fisher Scoring iterations: 23

Spline model

library(splines)

fit.sp<-svyglm(d.event~ns(year, df=3), design=des,
family=binomial (link="cloglog"))
Warning in eval(family$initialize): non-integer #successes in a binomial glm!
Warning: glm.fit: algorithm did not converge
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(fit.sp)

Call:
svyglm(formula = d.event ~ ns(year, df = 3), design = des, family = binomial(link = "cloglog"))

Survey design:
survey::svydesign(ids = ~psu, strata = ~strata, data = d1, weight = ~weight)

Coefficients:
                    Estimate Std. Error t value Pr(>|t|)    
(Intercept)       -2.212e+00  4.636e-02  -47.72  < 2e-16 ***
ns(year, df = 3)1 -1.368e+01  2.505e+00   -5.46 8.34e-08 ***
ns(year, df = 3)2 -3.146e+03  1.081e+02  -29.12  < 2e-16 ***
ns(year, df = 3)3 -5.235e+03  1.814e+02  -28.86  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 0.4417794)

Number of Fisher Scoring iterations: 25
dd<- expand.grid(year=seq(1,12,1))
dd$genmod<-predict(fit.gm, newdata = data.frame(year=as.factor(1:12)), type="response")

dd$lin<-predict(fit.lm, newdata=dd, type="response")
dd$sq<-predict(fit.qtc, newdata=dd, type="response")
dd$cub<-predict(fit.cm, newdata=dd, type="response")
dd$quart<-predict(fit.q, newdata=dd, type="response")
dd$spline<-predict(fit.sp, newdata=expand.grid(year=seq(1,12,1)), type="response")

dd
   year       genmod          lin           sq          cub        quart
1     1 1.036824e-01 1.331085e-01 1.010351e-01 1.036824e-01 1.036824e-01
2     2 1.017844e-01 6.416241e-02 1.105292e-01 1.017844e-01 1.017844e-01
3     3 4.723214e-02 3.031653e-02 3.726871e-02 4.723214e-02 4.723214e-02
4     4 1.224104e-09 1.419038e-02 3.625063e-03 5.313069e-09 4.687522e-09
5     5 1.223019e-09 6.613030e-03 1.023895e-04 2.220446e-16 2.220446e-16
6     6 1.222785e-09 3.075518e-03 8.513085e-07 2.220446e-16 2.220446e-16
7     7 1.221144e-09 1.428970e-03 2.087047e-09 2.220446e-16 2.220446e-16
8     8 1.221850e-09 6.636451e-04 1.508734e-12 2.220446e-16 2.220446e-16
9     9 1.228041e-09 3.081482e-04 3.216095e-16 2.220446e-16 2.220446e-16
10   10 1.226190e-09 1.430678e-04 2.220446e-16 2.220446e-16 2.220446e-16
11   11 1.216282e-09 6.642096e-05 2.220446e-16 2.220446e-16 2.220446e-16
12   12 1.236791e-09 3.083610e-05 2.220446e-16 2.220446e-16 1.030462e-09
         spline
1  1.036824e-01
2  1.000000e+00
3  1.000000e+00
4  1.000000e+00
5  1.000000e+00
6  1.000000e+00
7  2.220446e-16
8  2.220446e-16
9  2.220446e-16
10 2.220446e-16
11 2.220446e-16
12 2.220446e-16

Include all main effects in the model

fit.0<-svyglm(d.event~as.factor(year) + highses + educ.high + partnhiedu + agec,
              design = des,
              family = binomial(link="cloglog"))
Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.0)

Call:
svyglm(formula = d.event ~ as.factor(year) + highses + educ.high + 
    partnhiedu + agec, design = des, family = binomial(link = "cloglog"))

Survey design:
survey::svydesign(ids = ~psu, strata = ~strata, data = d1, weight = ~weight)

Coefficients:
                   Estimate Std. Error  t value Pr(>|t|)    
(Intercept)        -1.64797    0.13165  -12.518  < 2e-16 ***
as.factor(year)2   -0.01199    0.07765   -0.154 0.877413    
as.factor(year)3   -0.74691    0.11662   -6.405 4.38e-10 ***
as.factor(year)4  -18.25767    0.05979 -305.350  < 2e-16 ***
as.factor(year)5  -18.25454    0.06096 -299.471  < 2e-16 ***
as.factor(year)6  -18.24842    0.06076 -300.322  < 2e-16 ***
as.factor(year)7  -18.24354    0.06191 -294.675  < 2e-16 ***
as.factor(year)8  -18.23796    0.06246 -291.983  < 2e-16 ***
as.factor(year)9  -18.22760    0.06241 -292.079  < 2e-16 ***
as.factor(year)10 -18.22205    0.06629 -274.883  < 2e-16 ***
as.factor(year)11 -18.20833    0.07284 -249.966  < 2e-16 ***
as.factor(year)12 -18.18507    0.08702 -208.980  < 2e-16 ***
highses            -0.01724    0.08138   -0.212 0.832328    
educ.high           0.03339    0.07159    0.466 0.641177    
partnhiedu          0.17808    0.10944    1.627 0.104509    
agec(20,25]        -0.43561    0.13279   -3.280 0.001130 ** 
agec(25,30]        -0.58583    0.13731   -4.266 2.50e-05 ***
agec(30,35]        -0.68271    0.13922   -4.904 1.39e-06 ***
agec(35,40]        -0.86841    0.16074   -5.403 1.15e-07 ***
agec(40,45]        -1.03830    0.18379   -5.649 3.13e-08 ***
agec(45,50]        -1.17353    0.30627   -3.832 0.000148 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 0.4348594)

Number of Fisher Scoring iterations: 19

Testing for interaction between at least two of the predictors

fit.1<-svyglm(d.event~as.factor(year) + highses + educ.high + partnhiedu + agec*highses + educ.high*agec + highses*partnhiedu,
              design = des,
              family = binomial(link="cloglog"))
Warning in eval(family$initialize): non-integer #successes in a binomial glm!
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(fit.1)

Call:
svyglm(formula = d.event ~ as.factor(year) + highses + educ.high + 
    partnhiedu + agec * highses + educ.high * agec + highses * 
    partnhiedu, design = des, family = binomial(link = "cloglog"))

Survey design:
survey::svydesign(ids = ~psu, strata = ~strata, data = d1, weight = ~weight)

Coefficients:
                       Estimate Std. Error  t value Pr(>|t|)    
(Intercept)            -1.75370    0.16709  -10.496  < 2e-16 ***
as.factor(year)2       -0.01040    0.07757   -0.134 0.893460    
as.factor(year)3       -0.74318    0.11680   -6.363 5.79e-10 ***
as.factor(year)4      -18.24915    0.05981 -305.093  < 2e-16 ***
as.factor(year)5      -18.24647    0.06095 -299.364  < 2e-16 ***
as.factor(year)6      -18.24035    0.06052 -301.414  < 2e-16 ***
as.factor(year)7      -18.23418    0.06189 -294.605  < 2e-16 ***
as.factor(year)8      -18.22711    0.06226 -292.777  < 2e-16 ***
as.factor(year)9      -18.21914    0.06194 -294.135  < 2e-16 ***
as.factor(year)10     -18.21454    0.06586 -276.547  < 2e-16 ***
as.factor(year)11     -18.20092    0.07310 -248.987  < 2e-16 ***
as.factor(year)12     -18.18373    0.08697 -209.079  < 2e-16 ***
highses                -0.01549    0.44062   -0.035 0.971974    
educ.high               0.24620    0.23329    1.055 0.291943    
partnhiedu              0.40337    0.15290    2.638 0.008682 ** 
agec(20,25]            -0.42834    0.18559   -2.308 0.021547 *  
agec(25,30]            -0.62337    0.20163   -3.092 0.002140 ** 
agec(30,35]            -0.48086    0.18335   -2.623 0.009082 ** 
agec(35,40]            -0.61526    0.19935   -3.086 0.002178 ** 
agec(40,45]            -0.83996    0.24322   -3.454 0.000617 ***
agec(45,50]            -0.77460    0.32646   -2.373 0.018162 *  
highses:agec(20,25]     0.12167    0.54153    0.225 0.822357    
highses:agec(25,30]     0.24742    0.44819    0.552 0.581248    
highses:agec(30,35]    -0.21273    0.44689   -0.476 0.634329    
highses:agec(35,40]     0.02999    0.47119    0.064 0.949289    
highses:agec(40,45]    -0.30840    0.67291   -0.458 0.646995    
highses:agec(45,50]   -14.53364    0.84563  -17.187  < 2e-16 ***
educ.high:agec(20,25]  -0.10174    0.26337   -0.386 0.699487    
educ.high:agec(25,30]  -0.17068    0.27500   -0.621 0.535218    
educ.high:agec(30,35]  -0.22137    0.27061   -0.818 0.413852    
educ.high:agec(35,40]  -0.55877    0.34081   -1.640 0.101937    
educ.high:agec(40,45]  -0.29715    0.44439   -0.669 0.504122    
educ.high:agec(45,50] -16.40189    0.54741  -29.963  < 2e-16 ***
highses:partnhiedu     -0.28573    0.20989   -1.361 0.174232    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 0.4323507)

Number of Fisher Scoring iterations: 19

Generate hazard plots for interesting cases highlighting the significant predictors in your analysis

plot(genmod~year, dd, type="l", ylab="h(t)", xlab="Time", ylim=c(0, .25), xlim=c(0, 20))
title(main="Hazard function from different time parameterizations")
lines(lin~year, dd, col=2, lwd=2)
lines(sq~year, dd, col=3, lwd=2)
lines(cub~year, dd, col=4, lwd=2)
lines(quart~year, dd, col=5, lwd=2)
lines(spline~year, dd, col=6, lwd=2)
legend("topright", 
       legend=c("General Model","constant", "Linear","Square", "Cubic", "Quartic", "Natural spline"),
       col=c(1:6), lwd=1.5)

#AIC table
aic<-round(c(
  fit.lm$deviance+2*length(fit.lm$coefficients),
  fit.qtc$deviance+2*length(fit.qtc$coefficients),
  fit.cm$deviance+2*length(fit.cm$coefficients),
  fit.q$deviance+2*length(fit.q$coefficients),
  fit.sp$deviance+2*length(fit.sp$coefficients),
  fit.gm$deviance+2*length(fit.gm$coefficients)),2)

#comparing all aics to the general model
dif.aic<-round(aic-aic[6],2)
data.frame(model =c( "linear","square", "cubic", "quartic","spline", "general"),
           aic=aic,
           aic_dif=dif.aic)
    model     aic aic_dif
1  linear 9228.99  359.82
2  square 8897.41   28.24
3   cubic 8853.17  -16.00
4 quartic 8855.17  -14.00
5  spline 8853.17  -16.00
6 general 8869.17    0.00

Interpretation

The AIC’s of the alternative time specifications shows that the cubic and spline AIC fits the data best.