Event History Analysis Homework 4
Carry out the following analysis:
Define your outcome as in HW 1.
Consider what covariates are hypothesized to affect the outcome variable
Define these and construct a Cox model for your outcome
- The outcome for this analysis is transition to first birth. Covariates that will be included in the model are sex, education and debt.
#determine date of last interview for each respondent
last_int <- df_raw %>%
filter(is.na(CV_CHILD_BIRTH_MONTH.01_2019) == T) %>%
select(PUBID_1997,
CV_INTERVIEW_CMONTH_1997, CV_INTERVIEW_CMONTH_1998,
CV_INTERVIEW_CMONTH_1999, CV_INTERVIEW_CMONTH_2000,
CV_INTERVIEW_CMONTH_2001, CV_INTERVIEW_CMONTH_2002,
CV_INTERVIEW_CMONTH_2004, CV_INTERVIEW_CMONTH_2005,
CV_INTERVIEW_CMONTH_2006, CV_INTERVIEW_CMONTH_2007,
CV_INTERVIEW_CMONTH_2008, CV_INTERVIEW_CMONTH_2009,
CV_INTERVIEW_CMONTH_2010, CV_INTERVIEW_CMONTH_2011,
CV_INTERVIEW_CMONTH_2013, CV_INTERVIEW_CMONTH_2015,
CV_INTERVIEW_CMONTH_2017, CV_INTERVIEW_CMONTH_2019) %>%
pivot_longer(cols = c(-PUBID_1997),
names_to = "interview",
values_to = "cmonth") %>%
group_by(PUBID_1997) %>%
summarize(age.cens = max(cmonth, na.rm = TRUE))
df_raw <- left_join(df_raw, last_int, by = "PUBID_1997")
# merge custom weights to data frame
df_raw <- left_join(df_raw, weights)
rm(last_int)df_raw_rc <- df_raw %>%
mutate(
#calculate age at first birth
age97 = CV_AGE.MONTHS._INT_DATE_1997/12,
months2fb = CV_CHILD_BIRTH_MONTH.01_2019 - CV_AGE.MONTHS._INT_DATE_1997,
years2fb = months2fb/12,
age_fb = round(age97+years2fb,2),
# calculate censored age based on last interview
months_cens = age.cens-CV_AGE.MONTHS._INT_DATE_1997,
years_cens = months_cens/12,
age_cens = round(age97+years_cens,2),
age_obs = if_else(is.na(age_cens) == T, age_fb, age_cens),
#age_obs = floor(age_obs0),
event = if_else(is.na(age_cens) == T, 1, 0),
#debt_high = as.factor(if_else(CVC_ASSETS_DEBTS_25_XRND > 17000, 1, 0)),
debt_high = factor(if_else(CVC_ASSETS_DEBTS_25_XRND > 17000, 1, 0)),
debt = CVC_ASSETS_DEBTS_25_XRND,
#debt_high = factor(if_else(CVC_ASSETS_DEBTS_25_XRND > 0, 1, 0)),
#zero_debt = if_else(CVC_ASSETS_DEBTS_25_XRND == 0, 1, 0),
#male = as.factor(if_else(KEY_SEX_1997 == 1, 1, 0)),
male = factor(if_else(KEY_SEX_1997 == 1, 1, 0)),
#bach = as.factor(if_else(CV_HIGHEST_DEGREE_EVER_EDT_2019 >= 5, 1, 0)),
bach = factor(if_else(CV_HIGHEST_DEGREE_EVER_EDT_2019 >= 5, 1, 0))
# raceth = case_when(KEY_RACE_ETHNICITY_1997 == 1 ~ "1black",
# KEY_RACE_ETHNICITY_1997 == 2 ~ "2hispanic",
# KEY_RACE_ETHNICITY_1997 == 3 ~ "3mixed race",
# KEY_RACE_ETHNICITY_1997 == 4 ~ "0non-black/non-hisp"),
# raceth2 = factor(raceth)
)
df_comp <- df_raw_rc %>%
select(PUBID_1997, age97, age_obs, event, debt, debt_high, male, bach, VPSU_1997, VSTRAT_1997,
weights) %>%
mutate(check = if_else(age97 >= age_obs, 1, 0)) %>%
filter(check == 0) %>%
select(-check) %>%
filter(complete.cases(.))Fit the Cox model for the data
Include all the main effects in the model
Test for an interaction between at least two of the predictors
Produce plots for the survival function and the cumulative hazard function for different “risk profiles”, as done in the notes
Evaluate the assumptions of the Cox model, including proportionality of hazards of covariates, and if you use a continuous variable, evaluate linearity of the effect using Martingale residuals
Cox model using survey design
options(survey.lonely.psu = "adjust")
des <- svydesign(ids = ~VPSU_1997,
strata = ~VSTRAT_1997, weights=~weights,
data=df_comp, nest=T)#Fit the model
fit.cox2<-svycoxph(Surv(time = age97,
time2 = age_obs,
event = event) ~ male + bach + debt_high,
design=des)
summary(fit.cox2) Stratified 1 - level Cluster Sampling design (with replacement)
With (234) clusters.
svydesign(ids = ~VPSU_1997, strata = ~VSTRAT_1997, weights = ~weights,
data = df_comp, nest = T)
Call:
svycoxph(formula = Surv(time = age97, time2 = age_obs, event = event) ~
male + bach + debt_high, design = des)
n= 6483, number of events= 4761
coef exp(coef) se(coef) robust se z Pr(>|z|)
male1 -0.41603 0.65966 0.02973 0.02777 -14.980 <2e-16 ***
bach1 -0.45434 0.63487 0.04608 0.04880 -9.309 <2e-16 ***
debt_high1 -0.07435 0.92835 0.03303 0.03392 -2.192 0.0284 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
exp(coef) exp(-coef) lower .95 upper .95
male1 0.6597 1.516 0.6247 0.6966
bach1 0.6349 1.575 0.5770 0.6986
debt_high1 0.9283 1.077 0.8686 0.9922
Concordance= 0.588 (se = 0.004 )
Likelihood ratio test= NA on 3 df, p=NA
Wald test = 309.7 on 3 df, p=<2e-16
Score (logrank) test = NA on 3 df, p=NA
(Note: the likelihood ratio and score tests assume independence of
observations within a cluster, the Wald and robust score tests do not).
This model shows that being male, having a bachelor’s degree or higher, and having high debt are all associated with a lower risk of transition to first birth. All covariates are significant.
Cox model with interaction term
#Fit the model
fit.cox3<-svycoxph(Surv(time = age97,
time2 = age_obs,
event = event) ~ male + bach * debt_high,
design=des)
summary(fit.cox3) Stratified 1 - level Cluster Sampling design (with replacement)
With (234) clusters.
svydesign(ids = ~VPSU_1997, strata = ~VSTRAT_1997, weights = ~weights,
data = df_comp, nest = T)
Call:
svycoxph(formula = Surv(time = age97, time2 = age_obs, event = event) ~
male + bach * debt_high, design = des)
n= 6483, number of events= 4761
coef exp(coef) se(coef) robust se z Pr(>|z|)
male1 -0.41767 0.65858 0.02974 0.02770 -15.079 < 2e-16 ***
bach1 -0.56099 0.57064 0.06355 0.06236 -8.995 < 2e-16 ***
debt_high1 -0.10988 0.89594 0.03606 0.03855 -2.850 0.00437 **
bach1:debt_high1 0.23752 1.26810 0.09251 0.09743 2.438 0.01477 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
exp(coef) exp(-coef) lower .95 upper .95
male1 0.6586 1.5184 0.6238 0.6953
bach1 0.5706 1.7524 0.5050 0.6448
debt_high1 0.8959 1.1161 0.8307 0.9663
bach1:debt_high1 1.2681 0.7886 1.0477 1.5349
Concordance= 0.589 (se = 0.004 )
Likelihood ratio test= NA on 4 df, p=NA
Wald test = 354.2 on 4 df, p=<2e-16
Score (logrank) test = NA on 4 df, p=NA
(Note: the likelihood ratio and score tests assume independence of
observations within a cluster, the Wald and robust score tests do not).
This interaction model shows the same outcomes as the previous one. Interestingly, having a bachelor’s degree or higher AND high debt is significantly associated with a higher risk of transition to first birth.
Plot survival curves
library(ggsurvfit)
plot(survfit(fit.cox2, conf.int = FALSE),
ylab = "S(t)",
xlab = "Age",
xlim = c(15, 40),
ylim = c(.1, 1))
lines(survfit(fit.cox2, newdata = data.frame(male = 1, bach = 1, debt_high = 1), conf.int = FALSE), col = "darkmagenta", lty = 1)
lines(survfit(fit.cox2, newdata = data.frame(male = 0, bach = 1, debt_high = 1), conf.int = FALSE), col = "darkorange2", lty = 1)
lines(survfit(fit.cox2, newdata = data.frame(male = 1, bach = 1, debt_high = 0), conf.int = FALSE), col = "darkmagenta", lty = 2)
lines(survfit(fit.cox2, newdata = data.frame(male = 0, bach = 1, debt_high = 0), conf.int = FALSE), col = "darkorange2", lty = 2)
title(main = str_wrap(c("Survival function for transition to first birth for males and female with a bachelor's degree or higher")))
legend("bottomleft",
legend = c("mean", "Male, high debt", "Female, high debt", "Male, low debt", "Female, low debt"),
col = c(1, "darkmagenta", "darkorange2", "darkmagenta", "darkorange2"),
lty = c(1,1,1,2,2), cex = .8)Hazard Plots
sf1 <- survfit(fit.cox2)
sf2 <- survfit(fit.cox2, newdata = data.frame(male = 1, bach = 1, debt_high = 1))Warning in model.frame.default(data = structure(list(male = 1, bach = 1, :
variable 'male' is not a factor
Warning in model.frame.default(data = structure(list(male = 1, bach = 1, :
variable 'bach' is not a factor
Warning in model.frame.default(data = structure(list(male = 1, bach = 1, :
variable 'debt_high' is not a factor
sf3 <- survfit(fit.cox2, newdata = data.frame(male = 0, bach = 1, debt_high = 1))Warning in model.frame.default(data = structure(list(male = 0, bach = 1, :
variable 'male' is not a factor
Warning in model.frame.default(data = structure(list(male = 0, bach = 1, :
variable 'bach' is not a factor
Warning in model.frame.default(data = structure(list(male = 0, bach = 1, :
variable 'debt_high' is not a factor
sf4 <- survfit(fit.cox2, newdata = data.frame(male = 1, bach = 1, debt_high = 0))Warning in model.frame.default(data = structure(list(male = 1, bach = 1, :
variable 'male' is not a factor
Warning in model.frame.default(data = structure(list(male = 1, bach = 1, :
variable 'bach' is not a factor
Warning in model.frame.default(data = structure(list(male = 1, bach = 1, :
variable 'debt_high' is not a factor
sf5 <- survfit(fit.cox2, newdata = data.frame(male = 0, bach = 1, debt_high = 0))Warning in model.frame.default(data = structure(list(male = 0, bach = 1, :
variable 'male' is not a factor
Warning in model.frame.default(data = structure(list(male = 0, bach = 1, :
variable 'bach' is not a factor
Warning in model.frame.default(data = structure(list(male = 0, bach = 1, :
variable 'debt_high' is not a factor
h1 <- log(sf1$surv)
h2 <- log(sf2$surv)
h3 <- log(sf3$surv)
h4 <- log(sf4$surv)
h5 <- log(sf5$surv)
test <- data.frame(
H = c(h1, h2, h3),
group = c(rep("means", length(h1)),
rep("male", length(h1)),
rep("female", length(h1))),
times = sf1$time
)
test %>%
ggplot(aes(x = times, y = H)) +
geom_line(aes(group = group, color = group)) Doesn’t seem right….
hazard function
times <- sf1$time
hs1 <- loess(diff(c(0,h1))~times, degree = 1, span = .25)
hs2 <- loess(diff(c(0,h2))~times, degree = 1, span = .25)
hs3 <- loess(diff(c(0,h3))~times, degree = 1, span = .25)
hs4 <- loess(diff(c(0,h4))~times, degree = 1, span = .25)
hs5 <- loess(diff(c(0,h5))~times, degree = 1, span = .25)
plot(predict(hs1)~times, type = "l", ylab = "smoothed h(t)", xlab = "Years")
title(main = "Smoothed hazard plots")
lines(predict(hs2)~times, lty = 1, col = "darkmagenta")
lines(predict(hs3)~times, lty = 1, col = "darkorange2")
lines(predict(hs4)~times, lty = 2, col = "darkmagenta")
lines(predict(hs5)~times, lty = 2, col = "darkorange2")
legend("topright",
legend = c("Means of Covariates",
"Male, bachelor, high debt",
"Female, bachlor, high debt",
"Male, bachelor, low debt",
"Female, bachlor, low debt"),
lty = c(1,1,1,2,2), col = c(1, "darkmagenta", "darkorange2", "darkmagenta", "darkorange2"), cex = .8)Not sure these hazards look correct, shouldn’t they be flipped? Females should have higher hazards than males? Are negative hazards possible? Something doesn’t seem right here…
Scoenfeld residuals
schoenresid <- resid(fit.cox2, type = "schoenfeld")
fit.sr <- lm(schoenresid~des$variables$age97[des$variables$event==1])
fit.sr %>%
broom::tidy() %>%
filter(term != "(Intercept)") %>%
select(response, estimate, statistic, p.value)# A tibble: 3 × 4
response estimate statistic p.value
<chr> <dbl> <dbl> <dbl>
1 male1 -0.00131 -0.262 0.793
2 bach1 0.000988 0.331 0.741
3 debt_high1 0.00289 0.659 0.510
## Another way to measure proportionality
fit.test <- cox.zph(fit.cox2)
fit.test chisq df p
male 0.00353 1 0.95
bach 0.03814 1 0.85
debt_high 0.02058 1 0.89
GLOBAL 0.06036 3 1.00
P values are not significant for both tests, therefore we can assume proportionality in the covariates in this model.
## Plot residuals
plot(fit.test, df = 2)Plotting residuals confirms proportionality.
Martingale residuals
res.mar <- resid(fit.cox2, type = "martingale")
#plot
scatter.smooth(des$variables$debt_high, res.mar, degree = 2,
span = 1, ylab = "Martingale Residual",
col = 1, cex = .5, lpars = list(col = "red", lwd = 3))
title(main = "Martingale residuals for debt")Martingale residuals also shows proportionality.