Event History Analysis Homework 4

Author

Cristina Martinez, MPH

Carry out the following analysis:

#determine date of last interview for each respondent
last_int <- df_raw %>%
  filter(is.na(CV_CHILD_BIRTH_MONTH.01_2019) == T) %>%
  select(PUBID_1997,
         CV_INTERVIEW_CMONTH_1997, CV_INTERVIEW_CMONTH_1998,
         CV_INTERVIEW_CMONTH_1999, CV_INTERVIEW_CMONTH_2000,
         CV_INTERVIEW_CMONTH_2001, CV_INTERVIEW_CMONTH_2002,
         CV_INTERVIEW_CMONTH_2004, CV_INTERVIEW_CMONTH_2005,
         CV_INTERVIEW_CMONTH_2006, CV_INTERVIEW_CMONTH_2007,
         CV_INTERVIEW_CMONTH_2008, CV_INTERVIEW_CMONTH_2009,
         CV_INTERVIEW_CMONTH_2010, CV_INTERVIEW_CMONTH_2011,
         CV_INTERVIEW_CMONTH_2013, CV_INTERVIEW_CMONTH_2015,
         CV_INTERVIEW_CMONTH_2017, CV_INTERVIEW_CMONTH_2019) %>%
  pivot_longer(cols = c(-PUBID_1997), 
               names_to = "interview", 
               values_to = "cmonth") %>%
  group_by(PUBID_1997) %>%
  summarize(age.cens = max(cmonth, na.rm = TRUE))


df_raw <- left_join(df_raw, last_int, by = "PUBID_1997")
# merge custom weights to data frame
df_raw <- left_join(df_raw, weights)
rm(last_int)
df_raw_rc <- df_raw %>%
  mutate(
    #calculate age at first birth
    age97 = CV_AGE.MONTHS._INT_DATE_1997/12,
    months2fb = CV_CHILD_BIRTH_MONTH.01_2019 - CV_AGE.MONTHS._INT_DATE_1997,
    years2fb = months2fb/12,
    age_fb = round(age97+years2fb,2),
    # calculate censored age based on last interview
    months_cens = age.cens-CV_AGE.MONTHS._INT_DATE_1997,
    years_cens = months_cens/12,
    age_cens = round(age97+years_cens,2),
    age_obs = if_else(is.na(age_cens) == T, age_fb, age_cens),
    #age_obs = floor(age_obs0),
    event = if_else(is.na(age_cens) == T, 1, 0),
    #debt_high = as.factor(if_else(CVC_ASSETS_DEBTS_25_XRND > 17000, 1, 0)),
    debt_high = factor(if_else(CVC_ASSETS_DEBTS_25_XRND > 17000, 1, 0)),
    debt = CVC_ASSETS_DEBTS_25_XRND,
    #debt_high = factor(if_else(CVC_ASSETS_DEBTS_25_XRND > 0, 1, 0)),
    #zero_debt = if_else(CVC_ASSETS_DEBTS_25_XRND == 0, 1, 0),
    #male = as.factor(if_else(KEY_SEX_1997 == 1, 1, 0)),
    male = factor(if_else(KEY_SEX_1997 == 1, 1, 0)),
    #bach = as.factor(if_else(CV_HIGHEST_DEGREE_EVER_EDT_2019 >= 5, 1, 0)),
    bach = factor(if_else(CV_HIGHEST_DEGREE_EVER_EDT_2019 >= 5, 1, 0))
    # raceth = case_when(KEY_RACE_ETHNICITY_1997 == 1 ~ "1black",
    #                    KEY_RACE_ETHNICITY_1997 == 2 ~ "2hispanic",
    #                    KEY_RACE_ETHNICITY_1997 == 3 ~ "3mixed race",
    #                    KEY_RACE_ETHNICITY_1997 == 4 ~ "0non-black/non-hisp"),
    # raceth2 = factor(raceth)
  )

df_comp <- df_raw_rc %>%
  select(PUBID_1997, age97, age_obs, event, debt, debt_high, male, bach, VPSU_1997, VSTRAT_1997,
         weights) %>%
  mutate(check = if_else(age97 >= age_obs, 1, 0)) %>%
  filter(check == 0) %>%
  select(-check) %>%
  filter(complete.cases(.))

Fit the Cox model for the data

Cox model using survey design

options(survey.lonely.psu = "adjust")

des <- svydesign(ids = ~VPSU_1997, 
                strata = ~VSTRAT_1997, weights=~weights,
                data=df_comp, nest=T)
#Fit the model
fit.cox2<-svycoxph(Surv(time = age97,
                        time2 = age_obs,
                        event = event) ~ male + bach + debt_high,
                design=des)

summary(fit.cox2) 
Stratified 1 - level Cluster Sampling design (with replacement)
With (234) clusters.
svydesign(ids = ~VPSU_1997, strata = ~VSTRAT_1997, weights = ~weights, 
    data = df_comp, nest = T)
Call:
svycoxph(formula = Surv(time = age97, time2 = age_obs, event = event) ~ 
    male + bach + debt_high, design = des)

  n= 6483, number of events= 4761 

               coef exp(coef) se(coef) robust se       z Pr(>|z|)    
male1      -0.41603   0.65966  0.02973   0.02777 -14.980   <2e-16 ***
bach1      -0.45434   0.63487  0.04608   0.04880  -9.309   <2e-16 ***
debt_high1 -0.07435   0.92835  0.03303   0.03392  -2.192   0.0284 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

           exp(coef) exp(-coef) lower .95 upper .95
male1         0.6597      1.516    0.6247    0.6966
bach1         0.6349      1.575    0.5770    0.6986
debt_high1    0.9283      1.077    0.8686    0.9922

Concordance= 0.588  (se = 0.004 )
Likelihood ratio test= NA  on 3 df,   p=NA
Wald test            = 309.7  on 3 df,   p=<2e-16
Score (logrank) test = NA  on 3 df,   p=NA

  (Note: the likelihood ratio and score tests assume independence of
     observations within a cluster, the Wald and robust score tests do not).

This model shows that being male, having a bachelor’s degree or higher, and having high debt are all associated with a lower risk of transition to first birth. All covariates are significant.

Cox model with interaction term

#Fit the model
fit.cox3<-svycoxph(Surv(time = age97,
                        time2 = age_obs,
                        event = event) ~ male + bach * debt_high,
                design=des)

summary(fit.cox3) 
Stratified 1 - level Cluster Sampling design (with replacement)
With (234) clusters.
svydesign(ids = ~VPSU_1997, strata = ~VSTRAT_1997, weights = ~weights, 
    data = df_comp, nest = T)
Call:
svycoxph(formula = Surv(time = age97, time2 = age_obs, event = event) ~ 
    male + bach * debt_high, design = des)

  n= 6483, number of events= 4761 

                     coef exp(coef) se(coef) robust se       z Pr(>|z|)    
male1            -0.41767   0.65858  0.02974   0.02770 -15.079  < 2e-16 ***
bach1            -0.56099   0.57064  0.06355   0.06236  -8.995  < 2e-16 ***
debt_high1       -0.10988   0.89594  0.03606   0.03855  -2.850  0.00437 ** 
bach1:debt_high1  0.23752   1.26810  0.09251   0.09743   2.438  0.01477 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

                 exp(coef) exp(-coef) lower .95 upper .95
male1               0.6586     1.5184    0.6238    0.6953
bach1               0.5706     1.7524    0.5050    0.6448
debt_high1          0.8959     1.1161    0.8307    0.9663
bach1:debt_high1    1.2681     0.7886    1.0477    1.5349

Concordance= 0.589  (se = 0.004 )
Likelihood ratio test= NA  on 4 df,   p=NA
Wald test            = 354.2  on 4 df,   p=<2e-16
Score (logrank) test = NA  on 4 df,   p=NA

  (Note: the likelihood ratio and score tests assume independence of
     observations within a cluster, the Wald and robust score tests do not).

This interaction model shows the same outcomes as the previous one. Interestingly, having a bachelor’s degree or higher AND high debt is significantly associated with a higher risk of transition to first birth.

Plot survival curves

library(ggsurvfit)

plot(survfit(fit.cox2, conf.int = FALSE),
     ylab = "S(t)",
     xlab = "Age",
     xlim = c(15, 40),
     ylim = c(.1, 1))

lines(survfit(fit.cox2, newdata = data.frame(male = 1, bach = 1, debt_high = 1), conf.int = FALSE), col = "darkmagenta", lty = 1)

lines(survfit(fit.cox2, newdata = data.frame(male = 0, bach = 1, debt_high = 1), conf.int = FALSE), col = "darkorange2", lty = 1)

lines(survfit(fit.cox2, newdata = data.frame(male = 1, bach = 1, debt_high = 0), conf.int = FALSE), col = "darkmagenta", lty = 2)

lines(survfit(fit.cox2, newdata = data.frame(male = 0, bach = 1, debt_high = 0), conf.int = FALSE), col = "darkorange2", lty = 2)

title(main = str_wrap(c("Survival function for transition to first birth for males and female with a bachelor's degree or higher")))

legend("bottomleft",
       legend = c("mean", "Male, high debt", "Female, high debt", "Male, low debt", "Female, low debt"),
       col = c(1, "darkmagenta", "darkorange2", "darkmagenta", "darkorange2"),
       lty = c(1,1,1,2,2), cex = .8)

Hazard Plots

sf1 <- survfit(fit.cox2)
sf2 <- survfit(fit.cox2, newdata = data.frame(male = 1, bach = 1, debt_high = 1))
Warning in model.frame.default(data = structure(list(male = 1, bach = 1, :
variable 'male' is not a factor
Warning in model.frame.default(data = structure(list(male = 1, bach = 1, :
variable 'bach' is not a factor
Warning in model.frame.default(data = structure(list(male = 1, bach = 1, :
variable 'debt_high' is not a factor
sf3 <- survfit(fit.cox2, newdata = data.frame(male = 0, bach = 1, debt_high = 1))
Warning in model.frame.default(data = structure(list(male = 0, bach = 1, :
variable 'male' is not a factor
Warning in model.frame.default(data = structure(list(male = 0, bach = 1, :
variable 'bach' is not a factor
Warning in model.frame.default(data = structure(list(male = 0, bach = 1, :
variable 'debt_high' is not a factor
sf4 <- survfit(fit.cox2, newdata = data.frame(male = 1, bach = 1, debt_high = 0))
Warning in model.frame.default(data = structure(list(male = 1, bach = 1, :
variable 'male' is not a factor
Warning in model.frame.default(data = structure(list(male = 1, bach = 1, :
variable 'bach' is not a factor
Warning in model.frame.default(data = structure(list(male = 1, bach = 1, :
variable 'debt_high' is not a factor
sf5 <- survfit(fit.cox2, newdata = data.frame(male = 0, bach = 1, debt_high = 0))
Warning in model.frame.default(data = structure(list(male = 0, bach = 1, :
variable 'male' is not a factor
Warning in model.frame.default(data = structure(list(male = 0, bach = 1, :
variable 'bach' is not a factor
Warning in model.frame.default(data = structure(list(male = 0, bach = 1, :
variable 'debt_high' is not a factor
h1 <- log(sf1$surv)
h2 <- log(sf2$surv)
h3 <- log(sf3$surv)
h4 <- log(sf4$surv)
h5 <- log(sf5$surv)

test <- data.frame(
  H = c(h1, h2, h3),
  group = c(rep("means", length(h1)),
            rep("male", length(h1)),
            rep("female", length(h1))),
  times = sf1$time
)

test %>%
  ggplot(aes(x = times, y = H)) +
  geom_line(aes(group = group, color = group)) 

Doesn’t seem right….

hazard function

times <- sf1$time
hs1 <- loess(diff(c(0,h1))~times, degree = 1, span = .25)
hs2 <- loess(diff(c(0,h2))~times, degree = 1, span = .25)
hs3 <- loess(diff(c(0,h3))~times, degree = 1, span = .25)
hs4 <- loess(diff(c(0,h4))~times, degree = 1, span = .25)
hs5 <- loess(diff(c(0,h5))~times, degree = 1, span = .25)

plot(predict(hs1)~times, type = "l", ylab = "smoothed h(t)", xlab = "Years")
title(main = "Smoothed hazard plots")
lines(predict(hs2)~times, lty = 1, col = "darkmagenta")
lines(predict(hs3)~times, lty = 1, col = "darkorange2")
lines(predict(hs4)~times, lty = 2, col = "darkmagenta")
lines(predict(hs5)~times, lty = 2, col = "darkorange2")
legend("topright",
       legend = c("Means of Covariates",
                  "Male, bachelor, high debt",
                  "Female, bachlor, high debt",
                  "Male, bachelor, low debt",
                  "Female, bachlor, low debt"),
       lty = c(1,1,1,2,2), col = c(1, "darkmagenta", "darkorange2", "darkmagenta", "darkorange2"), cex = .8)

Not sure these hazards look correct, shouldn’t they be flipped? Females should have higher hazards than males? Are negative hazards possible? Something doesn’t seem right here…

Scoenfeld residuals

schoenresid <- resid(fit.cox2, type = "schoenfeld")

fit.sr <- lm(schoenresid~des$variables$age97[des$variables$event==1])

fit.sr %>%
  broom::tidy() %>%
  filter(term != "(Intercept)") %>%
  select(response, estimate, statistic, p.value)
# A tibble: 3 × 4
  response    estimate statistic p.value
  <chr>          <dbl>     <dbl>   <dbl>
1 male1      -0.00131     -0.262   0.793
2 bach1       0.000988     0.331   0.741
3 debt_high1  0.00289      0.659   0.510
## Another way to measure proportionality
fit.test <- cox.zph(fit.cox2)
fit.test
            chisq df    p
male      0.00353  1 0.95
bach      0.03814  1 0.85
debt_high 0.02058  1 0.89
GLOBAL    0.06036  3 1.00

P values are not significant for both tests, therefore we can assume proportionality in the covariates in this model.

## Plot residuals
plot(fit.test, df = 2)

Plotting residuals confirms proportionality.

Martingale residuals

res.mar <- resid(fit.cox2, type = "martingale")

#plot
scatter.smooth(des$variables$debt_high, res.mar, degree = 2,
               span = 1, ylab = "Martingale Residual",
               col = 1, cex = .5, lpars = list(col = "red", lwd = 3))
title(main = "Martingale residuals for debt")

Martingale residuals also shows proportionality.