Event History Analysis Homework 4

Author

Cristina Martinez, MPH

Carry out the following analysis:

Define your outcome as in HW 1.
Consider what covariates are hypothesized to affect the outcome variable
Define these and construct a Cox model for your outcome
- The outcome for this analysis is transition to first birth. Covariates that will be included in the model are sex, education and debt.

#determine date of last interview for each respondent
last_int <- df_raw %>%
  filter(is.na(CV_CHILD_BIRTH_MONTH.01_2019) == T) %>%
  select(PUBID_1997,
         CV_INTERVIEW_CMONTH_1997, CV_INTERVIEW_CMONTH_1998,
         CV_INTERVIEW_CMONTH_1999, CV_INTERVIEW_CMONTH_2000,
         CV_INTERVIEW_CMONTH_2001, CV_INTERVIEW_CMONTH_2002,
         CV_INTERVIEW_CMONTH_2004, CV_INTERVIEW_CMONTH_2005,
         CV_INTERVIEW_CMONTH_2006, CV_INTERVIEW_CMONTH_2007,
         CV_INTERVIEW_CMONTH_2008, CV_INTERVIEW_CMONTH_2009,
         CV_INTERVIEW_CMONTH_2010, CV_INTERVIEW_CMONTH_2011,
         CV_INTERVIEW_CMONTH_2013, CV_INTERVIEW_CMONTH_2015,
         CV_INTERVIEW_CMONTH_2017, CV_INTERVIEW_CMONTH_2019) %>%
  pivot_longer(cols = c(-PUBID_1997), 
               names_to = "interview", 
               values_to = "cmonth") %>%
  group_by(PUBID_1997) %>%
  summarize(age.cens = max(cmonth, na.rm = TRUE))


df_raw <- left_join(df_raw, last_int, by = "PUBID_1997")
# merge custom weights to data frame
df_raw <- left_join(df_raw, weights)
rm(last_int)

df_raw_rc <- df_raw %>%
  mutate(
    #calculate age at first birth
    age97 = CV_AGE.MONTHS._INT_DATE_1997/12,
    months2fb = CV_CHILD_BIRTH_MONTH.01_2019 - CV_AGE.MONTHS._INT_DATE_1997,
    years2fb = months2fb/12,
    age_fb = round(age97+years2fb,2),
    # calculate censored age based on last interview
    months_cens = age.cens-CV_AGE.MONTHS._INT_DATE_1997,
    years_cens = months_cens/12,
    age_cens = round(age97+years_cens,2),
    age_obs = if_else(is.na(age_cens) == T, age_fb, age_cens),
    #age_obs = floor(age_obs0),
    event = if_else(is.na(age_cens) == T, 1, 0),
    #debt_high = as.factor(if_else(CVC_ASSETS_DEBTS_25_XRND > 17000, 1, 0)),
    debt_high = factor(if_else(CVC_ASSETS_DEBTS_25_XRND > 17000, 1, 0)),
    debt = CVC_ASSETS_DEBTS_25_XRND,
    #debt_high = factor(if_else(CVC_ASSETS_DEBTS_25_XRND > 0, 1, 0)),
    #zero_debt = if_else(CVC_ASSETS_DEBTS_25_XRND == 0, 1, 0),
    #male = as.factor(if_else(KEY_SEX_1997 == 1, 1, 0)),
    male = factor(if_else(KEY_SEX_1997 == 1, 1, 0)),
    #bach = as.factor(if_else(CV_HIGHEST_DEGREE_EVER_EDT_2019 >= 5, 1, 0)),
    bach = factor(if_else(CV_HIGHEST_DEGREE_EVER_EDT_2019 >= 5, 1, 0))
    # raceth = case_when(KEY_RACE_ETHNICITY_1997 == 1 ~ "1black",
    #                    KEY_RACE_ETHNICITY_1997 == 2 ~ "2hispanic",
    #                    KEY_RACE_ETHNICITY_1997 == 3 ~ "3mixed race",
    #                    KEY_RACE_ETHNICITY_1997 == 4 ~ "0non-black/non-hisp"),
    # raceth2 = factor(raceth)
  )

df_comp <- df_raw_rc %>%
  select(PUBID_1997, age97, age_obs, event, debt, debt_high, male, bach, VPSU_1997, VSTRAT_1997,
         weights) %>%
  mutate(check = if_else(age97 >= age_obs, 1, 0)) %>%
  filter(check == 0) %>%
  select(-check) %>%
  filter(complete.cases(.))

Fit the Cox model for the data

Include all the main effects in the model
Test for an interaction between at least two of the predictors
Produce plots for the survival function and the cumulative hazard function for different “risk profiles”, as done in the notes
Evaluate the assumptions of the Cox model, including proportionality of hazards of covariates, and if you use a continuous variable, evaluate linearity of the effect using Martingale residuals

Cox model using survey design

options(survey.lonely.psu = "adjust")

des <- svydesign(ids = ~VPSU_1997, 
                strata = ~VSTRAT_1997, weights=~weights,
                data=df_comp, nest=T)

#Fit the model
fit.cox2<-svycoxph(Surv(time = age97,
                        time2 = age_obs,
                        event = event) ~ male + bach + debt_high,
                design=des)

summary(fit.cox2)

Stratified 1 - level Cluster Sampling design (with replacement)
With (234) clusters.
svydesign(ids = ~VPSU_1997, strata = ~VSTRAT_1997, weights = ~weights, 
    data = df_comp, nest = T)

Call:
svycoxph(formula = Surv(time = age97, time2 = age_obs, event = event) ~ 
    male + bach + debt_high, design = des)

  n= 6483, number of events= 4761 

               coef exp(coef) se(coef) robust se       z Pr(>|z|)    
male1      -0.41603   0.65966  0.02973   0.02777 -14.980   <2e-16 ***
bach1      -0.45434   0.63487  0.04608   0.04880  -9.309   <2e-16 ***
debt_high1 -0.07435   0.92835  0.03303   0.03392  -2.192   0.0284 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

           exp(coef) exp(-coef) lower .95 upper .95
male1         0.6597      1.516    0.6247    0.6966
bach1         0.6349      1.575    0.5770    0.6986
debt_high1    0.9283      1.077    0.8686    0.9922

Concordance= 0.588  (se = 0.004 )
Likelihood ratio test= NA  on 3 df,   p=NA
Wald test            = 309.7  on 3 df,   p=<2e-16
Score (logrank) test = NA  on 3 df,   p=NA

  (Note: the likelihood ratio and score tests assume independence of
     observations within a cluster, the Wald and robust score tests do not).

This model shows that being male, having a bachelor’s degree or higher, and having high debt are all associated with a lower risk of transition to first birth. All covariates are significant.

Cox model with interaction term

#Fit the model
fit.cox3<-svycoxph(Surv(time = age97,
                        time2 = age_obs,
                        event = event) ~ male + bach * debt_high,
                design=des)

summary(fit.cox3)

Stratified 1 - level Cluster Sampling design (with replacement)
With (234) clusters.
svydesign(ids = ~VPSU_1997, strata = ~VSTRAT_1997, weights = ~weights, 
    data = df_comp, nest = T)

Call:
svycoxph(formula = Surv(time = age97, time2 = age_obs, event = event) ~ 
    male + bach * debt_high, design = des)

  n= 6483, number of events= 4761 

                     coef exp(coef) se(coef) robust se       z Pr(>|z|)    
male1            -0.41767   0.65858  0.02974   0.02770 -15.079  < 2e-16 ***
bach1            -0.56099   0.57064  0.06355   0.06236  -8.995  < 2e-16 ***
debt_high1       -0.10988   0.89594  0.03606   0.03855  -2.850  0.00437 ** 
bach1:debt_high1  0.23752   1.26810  0.09251   0.09743   2.438  0.01477 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

                 exp(coef) exp(-coef) lower .95 upper .95
male1               0.6586     1.5184    0.6238    0.6953
bach1               0.5706     1.7524    0.5050    0.6448
debt_high1          0.8959     1.1161    0.8307    0.9663
bach1:debt_high1    1.2681     0.7886    1.0477    1.5349

Concordance= 0.589  (se = 0.004 )
Likelihood ratio test= NA  on 4 df,   p=NA
Wald test            = 354.2  on 4 df,   p=<2e-16
Score (logrank) test = NA  on 4 df,   p=NA

  (Note: the likelihood ratio and score tests assume independence of
     observations within a cluster, the Wald and robust score tests do not).

This interaction model shows the same outcomes as the previous one. Interestingly, having a bachelor’s degree or higher AND high debt is significantly associated with a higher risk of transition to first birth.

Plot survival curves

library(ggsurvfit)

plot(survfit(fit.cox2, conf.int = FALSE),
     ylab = "S(t)",
     xlab = "Age",
     xlim = c(15, 40),
     ylim = c(.1, 1))

lines(survfit(fit.cox2, newdata = data.frame(male = 1, bach = 1, debt_high = 1), conf.int = FALSE), col = "darkmagenta", lty = 1)

lines(survfit(fit.cox2, newdata = data.frame(male = 0, bach = 1, debt_high = 1), conf.int = FALSE), col = "darkorange2", lty = 1)

lines(survfit(fit.cox2, newdata = data.frame(male = 1, bach = 1, debt_high = 0), conf.int = FALSE), col = "darkmagenta", lty = 2)

lines(survfit(fit.cox2, newdata = data.frame(male = 0, bach = 1, debt_high = 0), conf.int = FALSE), col = "darkorange2", lty = 2)

title(main = str_wrap(c("Survival function for transition to first birth for males and female with a bachelor's degree or higher")))

legend("bottomleft",
       legend = c("mean", "Male, high debt", "Female, high debt", "Male, low debt", "Female, low debt"),
       col = c(1, "darkmagenta", "darkorange2", "darkmagenta", "darkorange2"),
       lty = c(1,1,1,2,2), cex = .8)

Hazard Plots

sf1 <- survfit(fit.cox2)
sf2 <- survfit(fit.cox2, newdata = data.frame(male = 1, bach = 1, debt_high = 1))

Warning in model.frame.default(data = structure(list(male = 1, bach = 1, :
variable 'male' is not a factor

Warning in model.frame.default(data = structure(list(male = 1, bach = 1, :
variable 'bach' is not a factor

Warning in model.frame.default(data = structure(list(male = 1, bach = 1, :
variable 'debt_high' is not a factor

sf3 <- survfit(fit.cox2, newdata = data.frame(male = 0, bach = 1, debt_high = 1))

Warning in model.frame.default(data = structure(list(male = 0, bach = 1, :
variable 'male' is not a factor

Warning in model.frame.default(data = structure(list(male = 0, bach = 1, :
variable 'bach' is not a factor

Warning in model.frame.default(data = structure(list(male = 0, bach = 1, :
variable 'debt_high' is not a factor

sf4 <- survfit(fit.cox2, newdata = data.frame(male = 1, bach = 1, debt_high = 0))

Warning in model.frame.default(data = structure(list(male = 1, bach = 1, :
variable 'male' is not a factor

Warning in model.frame.default(data = structure(list(male = 1, bach = 1, :
variable 'bach' is not a factor

Warning in model.frame.default(data = structure(list(male = 1, bach = 1, :
variable 'debt_high' is not a factor

sf5 <- survfit(fit.cox2, newdata = data.frame(male = 0, bach = 1, debt_high = 0))

Warning in model.frame.default(data = structure(list(male = 0, bach = 1, :
variable 'male' is not a factor

Warning in model.frame.default(data = structure(list(male = 0, bach = 1, :
variable 'bach' is not a factor

Warning in model.frame.default(data = structure(list(male = 0, bach = 1, :
variable 'debt_high' is not a factor

h1 <- log(sf1$surv)
h2 <- log(sf2$surv)
h3 <- log(sf3$surv)
h4 <- log(sf4$surv)
h5 <- log(sf5$surv)

test <- data.frame(
  H = c(h1, h2, h3),
  group = c(rep("means", length(h1)),
            rep("male", length(h1)),
            rep("female", length(h1))),
  times = sf1$time
)

test %>%
  ggplot(aes(x = times, y = H)) +
  geom_line(aes(group = group, color = group))

Doesn’t seem right….

hazard function

times <- sf1$time
hs1 <- loess(diff(c(0,h1))~times, degree = 1, span = .25)
hs2 <- loess(diff(c(0,h2))~times, degree = 1, span = .25)
hs3 <- loess(diff(c(0,h3))~times, degree = 1, span = .25)
hs4 <- loess(diff(c(0,h4))~times, degree = 1, span = .25)
hs5 <- loess(diff(c(0,h5))~times, degree = 1, span = .25)

plot(predict(hs1)~times, type = "l", ylab = "smoothed h(t)", xlab = "Years")
title(main = "Smoothed hazard plots")
lines(predict(hs2)~times, lty = 1, col = "darkmagenta")
lines(predict(hs3)~times, lty = 1, col = "darkorange2")
lines(predict(hs4)~times, lty = 2, col = "darkmagenta")
lines(predict(hs5)~times, lty = 2, col = "darkorange2")
legend("topright",
       legend = c("Means of Covariates",
                  "Male, bachelor, high debt",
                  "Female, bachlor, high debt",
                  "Male, bachelor, low debt",
                  "Female, bachlor, low debt"),
       lty = c(1,1,1,2,2), col = c(1, "darkmagenta", "darkorange2", "darkmagenta", "darkorange2"), cex = .8)

Not sure these hazards look correct, shouldn’t they be flipped? Females should have higher hazards than males? Are negative hazards possible? Something doesn’t seem right here…

Scoenfeld residuals

schoenresid <- resid(fit.cox2, type = "schoenfeld")

fit.sr <- lm(schoenresid~des$variables$age97[des$variables$event==1])

fit.sr %>%
  broom::tidy() %>%
  filter(term != "(Intercept)") %>%
  select(response, estimate, statistic, p.value)

# A tibble: 3 × 4
  response    estimate statistic p.value
  <chr>          <dbl>     <dbl>   <dbl>
1 male1      -0.00131     -0.262   0.793
2 bach1       0.000988     0.331   0.741
3 debt_high1  0.00289      0.659   0.510

## Another way to measure proportionality
fit.test <- cox.zph(fit.cox2)
fit.test

            chisq df    p
male      0.00353  1 0.95
bach      0.03814  1 0.85
debt_high 0.02058  1 0.89
GLOBAL    0.06036  3 1.00

P values are not significant for both tests, therefore we can assume proportionality in the covariates in this model.

## Plot residuals
plot(fit.test, df = 2)

Plotting residuals confirms proportionality.

Martingale residuals

res.mar <- resid(fit.cox2, type = "martingale")

#plot
scatter.smooth(des$variables$debt_high, res.mar, degree = 2,
               span = 1, ylab = "Martingale Residual",
               col = 1, cex = .5, lpars = list(col = "red", lwd = 3))
title(main = "Martingale residuals for debt")

Martingale residuals also shows proportionality.