Event History Analysis Homework 3

Author

Cristina Martinez, MPH

options(Ncores = 12)
library(tidyverse)
library(haven)
library(survival)
library(car)
library(survey)
library(muhaz)
library(eha)
library(janitor)

df_cat <- read.csv('categories.csv')
df_raw <- read.csv('new_data.csv')

last_int <- df_raw %>%
  filter(is.na(CV_CHILD_BIRTH_MONTH.01_2019) == T) %>%
  select(PUBID_1997,
         CV_INTERVIEW_CMONTH_1997, CV_INTERVIEW_CMONTH_1998,
         CV_INTERVIEW_CMONTH_1999, CV_INTERVIEW_CMONTH_2000,
         CV_INTERVIEW_CMONTH_2001, CV_INTERVIEW_CMONTH_2002,
         CV_INTERVIEW_CMONTH_2004, CV_INTERVIEW_CMONTH_2005,
         CV_INTERVIEW_CMONTH_2006, CV_INTERVIEW_CMONTH_2007,
         CV_INTERVIEW_CMONTH_2008, CV_INTERVIEW_CMONTH_2009,
         CV_INTERVIEW_CMONTH_2010, CV_INTERVIEW_CMONTH_2011,
         CV_INTERVIEW_CMONTH_2013, CV_INTERVIEW_CMONTH_2015,
         CV_INTERVIEW_CMONTH_2017, CV_INTERVIEW_CMONTH_2019) %>%
  pivot_longer(cols = c(-PUBID_1997), 
               names_to = "interview", 
               values_to = "cmonth") %>%
  group_by(PUBID_1997) %>%
  summarize(age.cens = max(cmonth, na.rm = TRUE))


df_raw <- left_join(df_raw, last_int, by = "PUBID_1997")

df_raw_rc <- df_raw %>%
  mutate(
    #calculate age at first birth
    age97 = CV_AGE.MONTHS._INT_DATE_1997/12,
    months2fb = CV_CHILD_BIRTH_MONTH.01_2019 - CV_AGE.MONTHS._INT_DATE_1997,
    years2fb = months2fb/12,
    age_fb = round(age97+years2fb,1),
    # calculate censored age based on last interview
    months_cens = age.cens-CV_AGE.MONTHS._INT_DATE_1997,
    years_cens = months_cens/12,
    age_cens = round(age97+years_cens,1),
    age_obs = if_else(is.na(age_cens) == T, age_fb, age_cens),
    event = if_else(is.na(age_cens) == T, 1, 0),
    debt_high = as.factor(if_else(CVC_ASSETS_DEBTS_25_XRND > 17000, 1, 0)),
    male = as.factor(if_else(KEY_SEX_1997 == 1, 1, 0)),
    bach = as.factor(if_else(CV_HIGHEST_DEGREE_EVER_EDT_2019 >= 5, 1, 0)),
    raceth = case_when(KEY_RACE_ETHNICITY_1997 == 1 ~ "black",
                       KEY_RACE_ETHNICITY_1997 == 2 ~ "hispanic",
                       KEY_RACE_ETHNICITY_1997 == 3 ~ "mixed race",
                       KEY_RACE_ETHNICITY_1997 == 4 ~ "non-black/non-hisp"),
    raceth2 = factor(KEY_RACE_ETHNICITY_1997)
  ) #%>%
  # select(PUBID_1997:KEY_RACE_ETHNICITY_1997, CV_HIGHEST_DEGREE_EVER_EDT_2010, CV_HIGHEST_DEGREE_EVER_EDT_2019,
  #        CVC_ASSETS_DEBTS_25_XRND:CVC_ASSETS_DEBTS_35_XRND, age97, CV_CHILD_BIRTH_MONTH.01_2019, CV_AGE.MONTHS._INT_DATE_1997, months2fb:debt_cat)

df_raw_comp <- df_raw_rc %>%
  filter(complete.cases(bach, debt_high))

Define your outcome as in HW 1. Also consider what covariates are hypothesized to affect the outcome variable.

Outcome: First birth

Covariates: Debt level, sex, and education

Define these and construct a parametric model for your outcome.

Fit the parametric model of your choosing to the data.

Did you choose an AFT or PH model and why?
- Based on the analysis below, the best model to use with these data are the PH models
Justify what parametric distribution you choose
- Log-logistic has the lowest AIC, but Piecewise Constant Hazard has the best visual fit to the empirical curve.
Interpret your results and write them up
- For both models, there was a lower risk of first birth for males, for those with bachelor’s degree or higher, and for those with high debt at age 25 ($17k or more). All three covariates were highly significant in both models.

library(ggsurvfit)
fit <- survfit2(Surv(age_obs, event)~1, df_raw_comp)

fit %>%
  ggsurvfit() +
  labs(title = "Survival Function for First Birth, NLSY 97",
       y = "S(t)",
       x = "Years")

# Kaplan-Meier
fit.haz.km <- kphaz.fit(df_raw_comp$age_obs, df_raw_comp$event, method = "product-limit")

# Smoothed using a kernel-density method
fit.haz.sm <- muhaz(df_raw_comp$age_obs, df_raw_comp$event)

# Empirical hazard function (product-limit estimate) plot
kphaz.plot(fit.haz.km, main = "Plot of the hazard to first birth")

# Overlay the smoothed version
lines(fit.haz.sm, col = 2, lwd = 3)
legend("topleft",
       legend = c("KM hazard", "Smoothed Hazard"),
       col = c(1,2),
       lty = c(1,1))

Exponential Model

# Why add month?
month <- 1/12
fit.1 <- phreg(Surv(age_obs+month, event) ~ male + bach + debt_high, data = df_raw_comp, 
               dist = "weibull", shape = 1)

summary(fit.1)

Covariate             Mean       Coef     Rel.Risk   S.E.    LR p
male                                                        0.0000 
               0      0.488     0         1 (reference)
               1      0.512    -0.259     0.772     0.029
bach                                                        0.0000 
               0      0.878     0         1 (reference)
               1      0.122    -0.264     0.768     0.049
debt_high                                                   0.1141 
               0      0.734     0         1 (reference)
               1      0.266    -0.053     0.948     0.034

Events                    4817 
Total time at risk        199992 
Max. log. likelihood      -22713 
LR test statistic         106.35 
Degrees of freedom        3 
Overall p-value           0

plot(fit.1, ylim = c(0, 0.075))
lines(fit.haz.sm)

Accelerated Failure Time Model

fit.1.aft <- survreg(Surv(age_obs+month, event) ~ male + bach + debt_high, 
                     data = df_raw_comp, dist = "exponential")
summary(fit.1.aft)


Call:
survreg(formula = Surv(age_obs + month, event) ~ male + bach + 
    debt_high, data = df_raw_comp, dist = "exponential")
             Value Std. Error      z       p
(Intercept) 3.5585     0.0222 160.43 < 2e-16
male1       0.2589     0.0291   8.88 < 2e-16
bach1       0.2641     0.0492   5.36 8.1e-08
debt_high1  0.0530     0.0337   1.57    0.12

Scale fixed at 1 

Exponential distribution
Loglik(model)= -22712.6   Loglik(intercept only)= -22765.7
    Chisq= 106.35 on 3 degrees of freedom, p= 6.7e-23 
Number of Newton-Raphson Iterations: 3 
n= 6539

Weibull Model

# weibull distribution for hazard
fit.2 <- phreg(Surv(age_obs+month, event) ~ male + bach + debt_high, 
               data = df_raw_comp, 
               dist = "weibull")

summary(fit.2)

Covariate             Mean       Coef     Rel.Risk   S.E.    LR p
male                                                        0.0000 
               0      0.488     0         1 (reference)
               1      0.512    -0.443     0.642     0.029
bach                                                        0.0000 
               0      0.878     0         1 (reference)
               1      0.122    -0.489     0.613     0.049
debt_high                                                   0.0002 
               0      0.734     0         1 (reference)
               1      0.266    -0.126     0.882     0.034

Events                    4817 
Total time at risk        199992 
Max. log. likelihood      -19374 
LR test statistic         325.47 
Degrees of freedom        3 
Overall p-value           0

plot(fit.2, fn = "haz")
lines(fit.haz.sm, col = 2)

Log-Normal Model

# log-normal distribution for hazard
fit.3 <- phreg(Surv(age_obs, event) ~ male + bach + debt_high, 
               data = df_raw_comp, dist = "lognormal")

summary(fit.3)

Covariate             Mean       Coef     Rel.Risk   S.E.    LR p
male                                                        0.0000 
               0      0.488     0         1 (reference)
               1      0.512    -1.344     0.261     0.113
bach                                                        0.0000 
               0      0.878     0         1 (reference)
               1      0.122    -0.442     0.643     0.029
debt_high                                                   0.0000 
               0      0.734     0         1 (reference)
               1      0.266    -0.497     0.608     0.049

Events                    4817 
Total time at risk        199447 
Max. log. likelihood      -18811 
LR test statistic         332.62 
Degrees of freedom        3 
Overall p-value           0

# how to interpret? why does it stop?
plot(fit.3, ylim = c(0, 200))

Log logistic model

# log-normal distribution for hazard
fit.4 <- phreg(Surv(age_obs, event) ~ male + bach + debt_high,
               data = df_raw_comp, dist = "loglogistic")

summary(fit.4)

Covariate             Mean       Coef     Rel.Risk   S.E.    LR p
male                                                        0.0000 
               0      0.488     0         1 (reference)
               1      0.512    -1.303     0.272     0.051
bach                                                        0.0000 
               0      0.878     0         1 (reference)
               1      0.122    -0.429     0.651     0.029
debt_high                                                   0.0000 
               0      0.734     0         1 (reference)
               1      0.266    -0.486     0.615     0.049

Events                    4817 
Total time at risk        199447 
Max. log. likelihood      -18595 
LR test statistic         318.02 
Degrees of freedom        3 
Overall p-value           0

plot(fit.4, fn = "haz")
lines(fit.haz.sm, col = 2)

AIC

AIC(fit.1)

[1] 45433.14

AIC(fit.2)

[1] 38757.68

AIC(fit.3)

[1] 37633.52

AIC(fit.4) # log-logistic wins

[1] 37201.82

AIC(fit.1.aft)

[1] 45433.14

Piecewise constant exponential model

# must supply the times for the "pieces" where I expect the hazard to be constant
fit.5 <- pchreg(Surv(age_obs, event) ~ male + bach + debt_high, 
                data = df_raw_comp, 
                cuts = seq(0, 40, 5))

summary(fit.5)

Covariate             Mean       Coef     Rel.Risk   S.E.    LR p
male                                                        0.0000 
               0      0.488     0         1 (reference)
               1      0.512    -0.424     0.655     0.029
bach                                                        0.0000 
               0      0.878     0         1 (reference)
               1      0.122    -0.479     0.620     0.049
debt_high                                                   0.0000 
               0      0.734     0         1 (reference)
               1      0.266    -0.144     0.866     0.034

Events                    4817 
Total time at risk        199447 
Max. log. likelihood      -18740 
LR test statistic         308.95 
Degrees of freedom        3 
Overall p-value           0

Restricted mean survival:  28.51858 in (0, 40]

plot(fit.5, fn = "haz")
lines(fit.haz.sm, col = 2)

AIC(fit.4)

[1] 37201.82

-2 * fit.5$loglik[2] + length(fit.5$coefficients) # AIC calc

[1] 37482.07

Graphical checks on model fit

emp <- coxreg(Surv(age_obs, event) ~ male + bach + debt_high, data = df_raw_comp)

check.dist(sp = emp, pp = fit.1, main = "Empirical vs. Exponential")

Empirical vs. Weibull

check.dist(sp = emp, pp = fit.2, main = "Empirical vs. Weibull")

Empirical vs. Log-Normal

check.dist(sp = emp, pp = fit.3, main = "Empirical vs. Log-Normal")

Fit check: Empirical vs. Log-Logistic

check.dist(sp = emp, pp = fit.4, main = "Empirical vs. Log-Logistic")

Fit check: Empirical vs. PCH

check.dist(sp = emp, pp = fit.5, main = "Empirical vs. PCH")