options(Ncores = 12)
library(tidyverse)
library(haven)
library(survival)
library(car)
library(survey)
library(muhaz)
library(eha)
library(janitor)Event History Analysis Homework 3
df_cat <- read.csv('categories.csv')
df_raw <- read.csv('new_data.csv')last_int <- df_raw %>%
filter(is.na(CV_CHILD_BIRTH_MONTH.01_2019) == T) %>%
select(PUBID_1997,
CV_INTERVIEW_CMONTH_1997, CV_INTERVIEW_CMONTH_1998,
CV_INTERVIEW_CMONTH_1999, CV_INTERVIEW_CMONTH_2000,
CV_INTERVIEW_CMONTH_2001, CV_INTERVIEW_CMONTH_2002,
CV_INTERVIEW_CMONTH_2004, CV_INTERVIEW_CMONTH_2005,
CV_INTERVIEW_CMONTH_2006, CV_INTERVIEW_CMONTH_2007,
CV_INTERVIEW_CMONTH_2008, CV_INTERVIEW_CMONTH_2009,
CV_INTERVIEW_CMONTH_2010, CV_INTERVIEW_CMONTH_2011,
CV_INTERVIEW_CMONTH_2013, CV_INTERVIEW_CMONTH_2015,
CV_INTERVIEW_CMONTH_2017, CV_INTERVIEW_CMONTH_2019) %>%
pivot_longer(cols = c(-PUBID_1997),
names_to = "interview",
values_to = "cmonth") %>%
group_by(PUBID_1997) %>%
summarize(age.cens = max(cmonth, na.rm = TRUE))
df_raw <- left_join(df_raw, last_int, by = "PUBID_1997")df_raw_rc <- df_raw %>%
mutate(
#calculate age at first birth
age97 = CV_AGE.MONTHS._INT_DATE_1997/12,
months2fb = CV_CHILD_BIRTH_MONTH.01_2019 - CV_AGE.MONTHS._INT_DATE_1997,
years2fb = months2fb/12,
age_fb = round(age97+years2fb,1),
# calculate censored age based on last interview
months_cens = age.cens-CV_AGE.MONTHS._INT_DATE_1997,
years_cens = months_cens/12,
age_cens = round(age97+years_cens,1),
age_obs = if_else(is.na(age_cens) == T, age_fb, age_cens),
event = if_else(is.na(age_cens) == T, 1, 0),
debt_high = as.factor(if_else(CVC_ASSETS_DEBTS_25_XRND > 17000, 1, 0)),
male = as.factor(if_else(KEY_SEX_1997 == 1, 1, 0)),
bach = as.factor(if_else(CV_HIGHEST_DEGREE_EVER_EDT_2019 >= 5, 1, 0)),
raceth = case_when(KEY_RACE_ETHNICITY_1997 == 1 ~ "black",
KEY_RACE_ETHNICITY_1997 == 2 ~ "hispanic",
KEY_RACE_ETHNICITY_1997 == 3 ~ "mixed race",
KEY_RACE_ETHNICITY_1997 == 4 ~ "non-black/non-hisp"),
raceth2 = factor(KEY_RACE_ETHNICITY_1997)
) #%>%
# select(PUBID_1997:KEY_RACE_ETHNICITY_1997, CV_HIGHEST_DEGREE_EVER_EDT_2010, CV_HIGHEST_DEGREE_EVER_EDT_2019,
# CVC_ASSETS_DEBTS_25_XRND:CVC_ASSETS_DEBTS_35_XRND, age97, CV_CHILD_BIRTH_MONTH.01_2019, CV_AGE.MONTHS._INT_DATE_1997, months2fb:debt_cat)
df_raw_comp <- df_raw_rc %>%
filter(complete.cases(bach, debt_high))Define your outcome as in HW 1. Also consider what covariates are hypothesized to affect the outcome variable.
Outcome: First birth
Covariates: Debt level, sex, and education
Define these and construct a parametric model for your outcome.
Fit the parametric model of your choosing to the data.
Did you choose an AFT or PH model and why?
- Based on the analysis below, the best model to use with these data are the PH models
Justify what parametric distribution you choose
- Log-logistic has the lowest AIC, but Piecewise Constant Hazard has the best visual fit to the empirical curve.
Interpret your results and write them up
- For both models, there was a lower risk of first birth for males, for those with bachelor’s degree or higher, and for those with high debt at age 25 ($17k or more). All three covariates were highly significant in both models.
library(ggsurvfit)
fit <- survfit2(Surv(age_obs, event)~1, df_raw_comp)
fit %>%
ggsurvfit() +
labs(title = "Survival Function for First Birth, NLSY 97",
y = "S(t)",
x = "Years")# Kaplan-Meier
fit.haz.km <- kphaz.fit(df_raw_comp$age_obs, df_raw_comp$event, method = "product-limit")
# Smoothed using a kernel-density method
fit.haz.sm <- muhaz(df_raw_comp$age_obs, df_raw_comp$event)
# Empirical hazard function (product-limit estimate) plot
kphaz.plot(fit.haz.km, main = "Plot of the hazard to first birth")
# Overlay the smoothed version
lines(fit.haz.sm, col = 2, lwd = 3)
legend("topleft",
legend = c("KM hazard", "Smoothed Hazard"),
col = c(1,2),
lty = c(1,1))Exponential Model
# Why add month?
month <- 1/12
fit.1 <- phreg(Surv(age_obs+month, event) ~ male + bach + debt_high, data = df_raw_comp,
dist = "weibull", shape = 1)
summary(fit.1)Covariate Mean Coef Rel.Risk S.E. LR p
male 0.0000
0 0.488 0 1 (reference)
1 0.512 -0.259 0.772 0.029
bach 0.0000
0 0.878 0 1 (reference)
1 0.122 -0.264 0.768 0.049
debt_high 0.1141
0 0.734 0 1 (reference)
1 0.266 -0.053 0.948 0.034
Events 4817
Total time at risk 199992
Max. log. likelihood -22713
LR test statistic 106.35
Degrees of freedom 3
Overall p-value 0
plot(fit.1, ylim = c(0, 0.075))
lines(fit.haz.sm)Accelerated Failure Time Model
fit.1.aft <- survreg(Surv(age_obs+month, event) ~ male + bach + debt_high,
data = df_raw_comp, dist = "exponential")
summary(fit.1.aft)
Call:
survreg(formula = Surv(age_obs + month, event) ~ male + bach +
debt_high, data = df_raw_comp, dist = "exponential")
Value Std. Error z p
(Intercept) 3.5585 0.0222 160.43 < 2e-16
male1 0.2589 0.0291 8.88 < 2e-16
bach1 0.2641 0.0492 5.36 8.1e-08
debt_high1 0.0530 0.0337 1.57 0.12
Scale fixed at 1
Exponential distribution
Loglik(model)= -22712.6 Loglik(intercept only)= -22765.7
Chisq= 106.35 on 3 degrees of freedom, p= 6.7e-23
Number of Newton-Raphson Iterations: 3
n= 6539
Weibull Model
# weibull distribution for hazard
fit.2 <- phreg(Surv(age_obs+month, event) ~ male + bach + debt_high,
data = df_raw_comp,
dist = "weibull")
summary(fit.2)Covariate Mean Coef Rel.Risk S.E. LR p
male 0.0000
0 0.488 0 1 (reference)
1 0.512 -0.443 0.642 0.029
bach 0.0000
0 0.878 0 1 (reference)
1 0.122 -0.489 0.613 0.049
debt_high 0.0002
0 0.734 0 1 (reference)
1 0.266 -0.126 0.882 0.034
Events 4817
Total time at risk 199992
Max. log. likelihood -19374
LR test statistic 325.47
Degrees of freedom 3
Overall p-value 0
plot(fit.2, fn = "haz")
lines(fit.haz.sm, col = 2)Log-Normal Model
# log-normal distribution for hazard
fit.3 <- phreg(Surv(age_obs, event) ~ male + bach + debt_high,
data = df_raw_comp, dist = "lognormal")
summary(fit.3)Covariate Mean Coef Rel.Risk S.E. LR p
male 0.0000
0 0.488 0 1 (reference)
1 0.512 -1.344 0.261 0.113
bach 0.0000
0 0.878 0 1 (reference)
1 0.122 -0.442 0.643 0.029
debt_high 0.0000
0 0.734 0 1 (reference)
1 0.266 -0.497 0.608 0.049
Events 4817
Total time at risk 199447
Max. log. likelihood -18811
LR test statistic 332.62
Degrees of freedom 3
Overall p-value 0
# how to interpret? why does it stop?
plot(fit.3, ylim = c(0, 200))Log logistic model
# log-normal distribution for hazard
fit.4 <- phreg(Surv(age_obs, event) ~ male + bach + debt_high,
data = df_raw_comp, dist = "loglogistic")
summary(fit.4)Covariate Mean Coef Rel.Risk S.E. LR p
male 0.0000
0 0.488 0 1 (reference)
1 0.512 -1.303 0.272 0.051
bach 0.0000
0 0.878 0 1 (reference)
1 0.122 -0.429 0.651 0.029
debt_high 0.0000
0 0.734 0 1 (reference)
1 0.266 -0.486 0.615 0.049
Events 4817
Total time at risk 199447
Max. log. likelihood -18595
LR test statistic 318.02
Degrees of freedom 3
Overall p-value 0
plot(fit.4, fn = "haz")
lines(fit.haz.sm, col = 2)AIC
AIC(fit.1)[1] 45433.14
AIC(fit.2)[1] 38757.68
AIC(fit.3)[1] 37633.52
AIC(fit.4) # log-logistic wins[1] 37201.82
AIC(fit.1.aft)[1] 45433.14
Piecewise constant exponential model
# must supply the times for the "pieces" where I expect the hazard to be constant
fit.5 <- pchreg(Surv(age_obs, event) ~ male + bach + debt_high,
data = df_raw_comp,
cuts = seq(0, 40, 5))
summary(fit.5)Covariate Mean Coef Rel.Risk S.E. LR p
male 0.0000
0 0.488 0 1 (reference)
1 0.512 -0.424 0.655 0.029
bach 0.0000
0 0.878 0 1 (reference)
1 0.122 -0.479 0.620 0.049
debt_high 0.0000
0 0.734 0 1 (reference)
1 0.266 -0.144 0.866 0.034
Events 4817
Total time at risk 199447
Max. log. likelihood -18740
LR test statistic 308.95
Degrees of freedom 3
Overall p-value 0
Restricted mean survival: 28.51858 in (0, 40]
plot(fit.5, fn = "haz")
lines(fit.haz.sm, col = 2)AIC(fit.4)[1] 37201.82
-2 * fit.5$loglik[2] + length(fit.5$coefficients) # AIC calc[1] 37482.07
Graphical checks on model fit
emp <- coxreg(Surv(age_obs, event) ~ male + bach + debt_high, data = df_raw_comp)
check.dist(sp = emp, pp = fit.1, main = "Empirical vs. Exponential")Empirical vs. Weibull
check.dist(sp = emp, pp = fit.2, main = "Empirical vs. Weibull")Empirical vs. Log-Normal
check.dist(sp = emp, pp = fit.3, main = "Empirical vs. Log-Normal")Fit check: Empirical vs. Log-Logistic
check.dist(sp = emp, pp = fit.4, main = "Empirical vs. Log-Logistic")Fit check: Empirical vs. PCH
check.dist(sp = emp, pp = fit.5, main = "Empirical vs. PCH")