Event History Analysis Homework 5
Fit the discrete time hazard model to your outcome
- You must form a person-period data set
- Consider both the general model and other time specifications
- Include all main effects in the model
- Test for an interaction between at least two of the predictors
- Generate hazard plots for interesting cases highlighting the significant predictors in your analysis
#df_cat <- read.csv('categories.csv')
df_raw <- read.csv('raw_data.csv')
weights <- read.table('customweight.dat', sep=' ') %>%
rename(PUBID_1997 = V1, weights = V2)# Determine latest interview date for those without children
last_int <- df_raw %>%
filter(is.na(CV_CHILD_BIRTH_MONTH.01_2019) == T) %>%
select(PUBID_1997,
CV_INTERVIEW_CMONTH_1997, CV_INTERVIEW_CMONTH_1998,
CV_INTERVIEW_CMONTH_1999, CV_INTERVIEW_CMONTH_2000,
CV_INTERVIEW_CMONTH_2001, CV_INTERVIEW_CMONTH_2002,
CV_INTERVIEW_CMONTH_2004, CV_INTERVIEW_CMONTH_2005,
CV_INTERVIEW_CMONTH_2006, CV_INTERVIEW_CMONTH_2007,
CV_INTERVIEW_CMONTH_2008, CV_INTERVIEW_CMONTH_2009,
CV_INTERVIEW_CMONTH_2010, CV_INTERVIEW_CMONTH_2011,
CV_INTERVIEW_CMONTH_2013, CV_INTERVIEW_CMONTH_2015,
CV_INTERVIEW_CMONTH_2017, CV_INTERVIEW_CMONTH_2019) %>%
pivot_longer(cols = c(-PUBID_1997),
names_to = "interview",
values_to = "cmonth") %>%
group_by(PUBID_1997) %>%
summarize(age.cens = max(cmonth, na.rm = TRUE))
df_raw <- left_join(df_raw, last_int, by = "PUBID_1997")
df_raw <- left_join(df_raw, weights)
rm(last_int)df_raw_rc <- df_raw %>%
mutate(
#calculate age at first birth
age97 = CV_AGE.MONTHS._INT_DATE_1997/12,
months2fb = CV_CHILD_BIRTH_MONTH.01_2019 - CV_AGE.MONTHS._INT_DATE_1997,
years2fb = months2fb/12,
age_fb = round(age97+years2fb,2),
# calculate censored age based on last interview
months_cens = age.cens-CV_AGE.MONTHS._INT_DATE_1997,
years_cens = months_cens/12,
age_cens = round(age97+years_cens,2),
age_obs = if_else(is.na(age_cens) == T, age_fb, age_cens),
event = if_else(is.na(age_cens) == T, 1, 0),
debt_high = if_else(CVC_ASSETS_DEBTS_25_XRND > 17000, 1, 0),
debt_cat = factor(case_when(
CVC_ASSETS_DEBTS_25_XRND == 0 ~ "0No debt",
CVC_ASSETS_DEBTS_25_XRND > 0 & CVC_ASSETS_DEBTS_25_XRND <= 11000 ~ "1low debt",
CVC_ASSETS_DEBTS_25_XRND > 17500 ~ "2high debt")), # I used the mean debt at 25 when all zero debt is removed
sex = factor(if_else(KEY_SEX_1997 == 1, "male", "female")),
#bach = as.factor(if_else(CV_HIGHEST_DEGREE_EVER_EDT_2019 >= 5, 1, 0)),
bach = if_else(CV_HIGHEST_DEGREE_EVER_EDT_2019 >= 5, 1, 0),
raceth = factor(case_when(KEY_RACE_ETHNICITY_1997 == 1 ~ "1black",
KEY_RACE_ETHNICITY_1997 == 2 ~ "2hispanic",
#KEY_RACE_ETHNICITY_1997 == 3 ~ "3mixed race",
KEY_RACE_ETHNICITY_1997 == 4 ~ "0non-black/non-hisp")),
timefb = if_else(event == 1,
CV_CHILD_BIRTH_MONTH.01_2019 - CV_AGE.MONTHS._INT_DATE_1997,
age.cens-CV_AGE.MONTHS._INT_DATE_1997),
) %>%
filter(CV_HIGHEST_DEGREE_EVER_EDT_2019 > 2) # only include those who have attended college in the sample
df_comp <- df_raw_rc %>%
select(PUBID_1997, timefb, age97, age_obs, event, debt_cat, sex, raceth, VPSU_1997, VSTRAT_1997, weights) %>%
# remove those with first birth before first time point
mutate(check = if_else(age97 >= age_obs, 1, 0)) %>%
filter(check == 0) %>%
select(-check) %>%
filter(complete.cases(.))Create person period file
pp <- survSplit(Surv(timefb, event) ~. ,
data = df_comp,
cut = seq(0,336, 12),
episode = "year_birth")
pp$year <- pp$year_birth-1
pp<-pp[order(pp$PUBID_1997, pp$year_birth),]Descriptive analysis
pp %>%
group_by(year) %>%
summarise(prop_bir = mean(event, na.rm = T)) %>%
ggplot(aes(x = year, y = prop_bir)) +
geom_line() +
ggtitle(label = "Hazard of having first birth by year after first interview")pp %>%
group_by(year, debt_cat) %>%
summarise(prop_bir = mean(event, na.rm = T)) %>%
ggplot(aes(x = year, y = prop_bir)) +
geom_line(aes(group = debt_cat, color = debt_cat)) +
ggtitle(label = "Hazard of having first birth by year after first interview by debt category")Discrete time model
# create survey design
options(survey.lonely.psu = "adjust")
des <- svydesign(ids = ~VPSU_1997,
strata = ~VSTRAT_1997,
weights = ~weights,
data = pp, nest = TRUE)General model
The general model with year, debt, sex and race/ethnicity as covariates shows that the risk of first birth gradually increases and then decrease as time continues. Males have a lower risk of first birth than females. Both Black and Hispanic people have a higher risk of birth birth than Non-black/Non-Hispanic people. Debt did not seem to have any significant contribution to risk of first birth.
# general model
fit.0 <- svyglm(event~as.factor(year) + debt_cat + sex + raceth - 1, design = des, family = binomial(link = "cloglog")) # family = "binomial"Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.0)
Call:
svyglm(formula = event ~ as.factor(year) + debt_cat + sex + raceth -
1, design = des, family = binomial(link = "cloglog"))
Survey design:
svydesign(ids = ~VPSU_1997, strata = ~VSTRAT_1997, weights = ~weights,
data = pp, nest = TRUE)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
as.factor(year)1 -8.635177 1.003758 -8.603 5.81e-13 ***
as.factor(year)2 -4.955445 0.276824 -17.901 < 2e-16 ***
as.factor(year)3 -5.086531 0.280259 -18.149 < 2e-16 ***
as.factor(year)4 -4.535878 0.257434 -17.620 < 2e-16 ***
as.factor(year)5 -4.734368 0.243822 -19.417 < 2e-16 ***
as.factor(year)6 -4.425961 0.216025 -20.488 < 2e-16 ***
as.factor(year)7 -4.323438 0.194468 -22.232 < 2e-16 ***
as.factor(year)8 -3.692257 0.180713 -20.432 < 2e-16 ***
as.factor(year)9 -3.639628 0.158521 -22.960 < 2e-16 ***
as.factor(year)10 -3.489790 0.141603 -24.645 < 2e-16 ***
as.factor(year)11 -3.539906 0.168561 -21.001 < 2e-16 ***
as.factor(year)12 -3.073292 0.150974 -20.356 < 2e-16 ***
as.factor(year)13 -2.992874 0.142193 -21.048 < 2e-16 ***
as.factor(year)14 -3.023524 0.152562 -19.818 < 2e-16 ***
as.factor(year)15 -2.953928 0.132292 -22.329 < 2e-16 ***
as.factor(year)16 -2.629849 0.146652 -17.933 < 2e-16 ***
as.factor(year)17 -2.574974 0.144480 -17.822 < 2e-16 ***
as.factor(year)18 -2.636597 0.136313 -19.342 < 2e-16 ***
as.factor(year)19 -2.386863 0.120068 -19.879 < 2e-16 ***
as.factor(year)20 -2.398121 0.125509 -19.107 < 2e-16 ***
as.factor(year)21 -2.495961 0.113632 -21.965 < 2e-16 ***
as.factor(year)22 -2.565367 0.151602 -16.922 < 2e-16 ***
as.factor(year)23 -2.632760 0.135452 -19.437 < 2e-16 ***
as.factor(year)24 -2.330663 0.151345 -15.400 < 2e-16 ***
as.factor(year)25 -2.898878 0.257273 -11.268 < 2e-16 ***
as.factor(year)26 -2.422947 0.206714 -11.721 < 2e-16 ***
as.factor(year)27 -2.965589 0.288455 -10.281 3.16e-16 ***
as.factor(year)28 -3.202921 0.583610 -5.488 4.76e-07 ***
debt_cat1low debt 0.093652 0.095291 0.983 0.3287
debt_cat2high debt 0.101214 0.079882 1.267 0.2089
sexmale -0.368030 0.057196 -6.435 8.81e-09 ***
raceth1black 0.166937 0.074361 2.245 0.0276 *
raceth2hispanic 0.005562 0.095094 0.058 0.9535
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 0.9660403)
Number of Fisher Scoring iterations: 10
General model with interaction
The general model with an interaction term shows similar results to the model above. We see the same increasing and decreasing and increasing trend as time progresses. Debt on its own still does not significantly contribute to risk of first birth. Males have a significantly lower risk of first birth than females. Black people have a significantly higher risk of first birth compared to non-Black/non-Hispanic people. Males with high debt interestingly have a higher risk of first birth.
fit.1 <- svyglm(event~as.factor(year) + debt_cat * sex + raceth - 1,
design = des, family = binomial(link = "cloglog"))Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.1)
Call:
svyglm(formula = event ~ as.factor(year) + debt_cat * sex + raceth -
1, design = des, family = binomial(link = "cloglog"))
Survey design:
svydesign(ids = ~VPSU_1997, strata = ~VSTRAT_1997, weights = ~weights,
data = pp, nest = TRUE)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
as.factor(year)1 -8.5465914 1.0046428 -8.507 1.07e-12 ***
as.factor(year)2 -4.8667513 0.2807721 -17.333 < 2e-16 ***
as.factor(year)3 -4.9977442 0.2770158 -18.041 < 2e-16 ***
as.factor(year)4 -4.4470701 0.2653582 -16.759 < 2e-16 ***
as.factor(year)5 -4.6453568 0.2490277 -18.654 < 2e-16 ***
as.factor(year)6 -4.3367806 0.2255554 -19.227 < 2e-16 ***
as.factor(year)7 -4.2343269 0.1969796 -21.496 < 2e-16 ***
as.factor(year)8 -3.6031428 0.1875829 -19.208 < 2e-16 ***
as.factor(year)9 -3.5502532 0.1589611 -22.334 < 2e-16 ***
as.factor(year)10 -3.3999009 0.1556097 -21.849 < 2e-16 ***
as.factor(year)11 -3.4495884 0.1778034 -19.401 < 2e-16 ***
as.factor(year)12 -2.9828775 0.1583046 -18.843 < 2e-16 ***
as.factor(year)13 -2.9024666 0.1522211 -19.067 < 2e-16 ***
as.factor(year)14 -2.9330999 0.1637182 -17.916 < 2e-16 ***
as.factor(year)15 -2.8636459 0.1484609 -19.289 < 2e-16 ***
as.factor(year)16 -2.5399927 0.1611660 -15.760 < 2e-16 ***
as.factor(year)17 -2.4854256 0.1552027 -16.014 < 2e-16 ***
as.factor(year)18 -2.5474906 0.1462739 -17.416 < 2e-16 ***
as.factor(year)19 -2.2977481 0.1291010 -17.798 < 2e-16 ***
as.factor(year)20 -2.3084133 0.1358616 -16.991 < 2e-16 ***
as.factor(year)21 -2.4058961 0.1200516 -20.041 < 2e-16 ***
as.factor(year)22 -2.4748189 0.1610071 -15.371 < 2e-16 ***
as.factor(year)23 -2.5415193 0.1510860 -16.822 < 2e-16 ***
as.factor(year)24 -2.2384127 0.1641655 -13.635 < 2e-16 ***
as.factor(year)25 -2.8039709 0.2694639 -10.406 2.45e-16 ***
as.factor(year)26 -2.3256415 0.2181368 -10.661 < 2e-16 ***
as.factor(year)27 -2.8667287 0.3013089 -9.514 1.23e-14 ***
as.factor(year)28 -3.1040442 0.5867810 -5.290 1.11e-06 ***
debt_cat1low debt 0.0334879 0.1205671 0.278 0.7819
debt_cat2high debt -0.0360666 0.1043572 -0.346 0.7306
sexmale -0.5691341 0.1254277 -4.538 2.06e-05 ***
raceth1black 0.1688652 0.0748941 2.255 0.0270 *
raceth2hispanic -0.0005682 0.0964884 -0.006 0.9953
debt_cat1low debt:sexmale 0.1279423 0.1473405 0.868 0.3879
debt_cat2high debt:sexmale 0.3382524 0.1364265 2.479 0.0153 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 0.9626439)
Number of Fisher Scoring iterations: 10
Time specifications
Constant model: Shows males have lower risk of first birth and Black people have higher risk of first birth. Having debt seems to increase risk of first birth, but this is not significant.
# constant specification
fit.con <- svyglm(event ~ 1 + debt_cat + sex + raceth,
design = des, family = binomial(link = "cloglog"))Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.con)
Call:
svyglm(formula = event ~ 1 + debt_cat + sex + raceth, design = des,
family = binomial(link = "cloglog"))
Survey design:
svydesign(ids = ~VPSU_1997, strata = ~VSTRAT_1997, weights = ~weights,
data = pp, nest = TRUE)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3.273628 0.062175 -52.652 < 2e-16 ***
debt_cat1low debt 0.066737 0.076355 0.874 0.3841
debt_cat2high debt 0.081380 0.064882 1.254 0.2125
sexmale -0.294888 0.046182 -6.385 4.63e-09 ***
raceth1black 0.130185 0.057922 2.248 0.0267 *
raceth2hispanic 0.008899 0.076639 0.116 0.9078
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 0.9995857)
Number of Fisher Scoring iterations: 6
# Annual Hazard rate
1/(1+exp(-coef(fit.con))) (Intercept) debt_cat1low debt debt_cat2high debt sexmale
0.03648708 0.51667810 0.52033384 0.42680755
raceth1black raceth2hispanic
0.53250035 0.50222464
Linear model: The linear term shows an increase in risk of first birth. Shows the same outcomes as the constant model, however, the risk for Black people is no longer significant.
# Linear term for time using year
fit.lin <- svyglm(event ~ year + debt_cat + sex + raceth,
design = des, family = binomial(link = "cloglog"))Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.lin)
Call:
svyglm(formula = event ~ year + debt_cat + sex + raceth, design = des,
family = binomial(link = "cloglog"))
Survey design:
svydesign(ids = ~VPSU_1997, strata = ~VSTRAT_1997, weights = ~weights,
data = pp, nest = TRUE)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -4.611316 0.106645 -43.240 < 2e-16 ***
year 0.098965 0.003860 25.635 < 2e-16 ***
debt_cat1low debt 0.099372 0.099121 1.003 0.3184
debt_cat2high debt 0.112290 0.084855 1.323 0.1886
sexmale -0.376796 0.059231 -6.361 5.31e-09 ***
raceth1black 0.139408 0.075097 1.856 0.0662 .
raceth2hispanic -0.006002 0.097485 -0.062 0.9510
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 0.9138613)
Number of Fisher Scoring iterations: 7
Squared model: The quadratic term shows a decrease in risk of birth. All other covariates remain the same with a switch back to a significant increased risk of first birth for the Black population.
# square model
fit.sq <- svyglm(event ~ year + I(year^2) + debt_cat + sex + raceth,
design = des, family = binomial(link = "cloglog"))Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.sq)
Call:
svyglm(formula = event ~ year + I(year^2) + debt_cat + sex +
raceth, design = des, family = binomial(link = "cloglog"))
Survey design:
svydesign(ids = ~VPSU_1997, strata = ~VSTRAT_1997, weights = ~weights,
data = pp, nest = TRUE)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -6.1060344 0.1759874 -34.696 < 2e-16 ***
year 0.3410710 0.0219629 15.529 < 2e-16 ***
I(year^2) -0.0080237 0.0007194 -11.153 < 2e-16 ***
debt_cat1low debt 0.0942582 0.0953345 0.989 0.3251
debt_cat2high debt 0.1010526 0.0799508 1.264 0.2091
sexmale -0.3683603 0.0571155 -6.449 3.6e-09 ***
raceth1black 0.1681260 0.0743784 2.260 0.0259 *
raceth2hispanic 0.0062647 0.0950760 0.066 0.9476
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 0.9714705)
Number of Fisher Scoring iterations: 7
Cubic model: The cubic term shows a decreased risk in first birth. However, this is not significant. All other covariates reamin the same with males having a significantly lower risk of first birth and the Black population having a significantly higher risk of first birth.
# cubic model
fit.cube <- svyglm(event ~ year + I(year^2) + I(year^3) + debt_cat + sex + raceth,
design = des, family = binomial(link = "cloglog"))Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.cube)
Call:
svyglm(formula = event ~ year + I(year^2) + I(year^3) + debt_cat +
sex + raceth, design = des, family = binomial(link = "cloglog"))
Survey design:
svydesign(ids = ~VPSU_1997, strata = ~VSTRAT_1997, weights = ~weights,
data = pp, nest = TRUE)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -6.003e+00 2.394e-01 -25.074 < 2e-16 ***
year 3.117e-01 5.421e-02 5.750 9.17e-08 ***
I(year^2) -5.767e-03 4.052e-03 -1.423 0.158
I(year^3) -5.076e-05 9.132e-05 -0.556 0.580
debt_cat1low debt 9.387e-02 9.540e-02 0.984 0.327
debt_cat2high debt 1.009e-01 7.993e-02 1.263 0.210
sexmale -3.683e-01 5.715e-02 -6.445 3.78e-09 ***
raceth1black 1.680e-01 7.438e-02 2.259 0.026 *
raceth2hispanic 6.137e-03 9.513e-02 0.065 0.949
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 0.967206)
Number of Fisher Scoring iterations: 8
Quadratic model: The linear term remains significant and the quadratic term shows a lower risk of first birth, but is not significant. First birth risk remains significant for both Males and the Black population.
# quadratic model
fit.quad <- svyglm(event ~ year + I(year^2) + I(year^3) + I(year^4) + debt_cat + sex + raceth,
design = des, family = binomial(link = "cloglog"))Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.quad)
Call:
svyglm(formula = event ~ year + I(year^2) + I(year^3) + I(year^4) +
debt_cat + sex + raceth, design = des, family = binomial(link = "cloglog"))
Survey design:
svydesign(ids = ~VPSU_1997, strata = ~VSTRAT_1997, weights = ~weights,
data = pp, nest = TRUE)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -6.243e+00 3.589e-01 -17.394 < 2e-16 ***
year 4.220e-01 1.245e-01 3.389 0.000999 ***
I(year^2) -2.041e-02 1.588e-02 -1.285 0.201733
I(year^3) 6.902e-04 8.025e-04 0.860 0.391770
I(year^4) -1.265e-05 1.389e-05 -0.911 0.364645
debt_cat1low debt 9.299e-02 9.520e-02 0.977 0.330975
debt_cat2high debt 1.006e-01 7.982e-02 1.260 0.210514
sexmale -3.679e-01 5.722e-02 -6.430 4.17e-09 ***
raceth1black 1.672e-01 7.432e-02 2.250 0.026592 *
raceth2hispanic 5.811e-03 9.515e-02 0.061 0.951425
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 0.9688199)
Number of Fisher Scoring iterations: 8
Spline: This model shows an increase in risk of first birth and a decrease as time goes on. All four splines are significant. First birth risk remains significant for both Males and the Black population.
# spline model
library(splines)
fit.sp <- svyglm(event ~ ns(year, df = 4) + debt_cat + sex + raceth,
design = des, family = binomial(link = "cloglog"))Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.sp)
Call:
svyglm(formula = event ~ ns(year, df = 4) + debt_cat + sex +
raceth, design = des, family = binomial(link = "cloglog"))
Survey design:
svydesign(ids = ~VPSU_1997, strata = ~VSTRAT_1997, weights = ~weights,
data = pp, nest = TRUE)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -5.86516 0.26760 -21.917 < 2e-16 ***
ns(year, df = 4)1 2.35606 0.23356 10.087 < 2e-16 ***
ns(year, df = 4)2 3.15142 0.17591 17.915 < 2e-16 ***
ns(year, df = 4)3 4.67361 0.51693 9.041 1.08e-14 ***
ns(year, df = 4)4 2.26913 0.17316 13.105 < 2e-16 ***
debt_cat1low debt 0.09439 0.09535 0.990 0.325
debt_cat2high debt 0.10139 0.07999 1.268 0.208
sexmale -0.36873 0.05714 -6.453 3.74e-09 ***
raceth1black 0.16808 0.07440 2.259 0.026 *
raceth2hispanic 0.00615 0.09512 0.065 0.949
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 0.9688535)
Number of Fisher Scoring iterations: 8
Hazards
newdat <- expand.grid(year = unique(pp$year),
debt_cat = unique(pp$debt_cat),
sex = unique(pp$sex),
raceth = unique(pp$raceth))
newdat$h_gen <- predict(fit.0, newdata = newdat, type = "response")
newdat$h_con <- predict(fit.con, newdata = newdat, type = "response")
newdat$h_lin <- predict(fit.lin, newdata = newdat, type = "response")
newdat$h_sq <- predict(fit.sq, newdata = newdat, type = "response")
newdat$h_cube <- predict(fit.cube, newdata = newdat, type = "response")
newdat$h_quad <- predict(fit.quad, newdata = newdat, type = "response")
newdat$h_sp <- predict(fit.sp, newdata = newdat, type = "response")
head(newdat) year debt_cat sex raceth h_gen h_con h_lin
1 1 1low debt female 2hispanic 0.0001962619 0.04002126 0.01197424
2 2 1low debt female 2hispanic 0.0077495702 0.04002126 0.01321163
3 3 1low debt female 2hispanic 0.0068007212 0.04002126 0.01457595
4 4 1low debt female 2hispanic 0.0117656049 0.04002126 0.01608000
5 5 1low debt female 2hispanic 0.0096576805 0.04002126 0.01773784
6 6 1low debt female 2hispanic 0.0131235877 0.04002126 0.01956491
h_sq h_cube h_quad h_sp
1 0.003433472 0.003702164 0.003201968 0.003131672
2 0.004711144 0.004964753 0.004611269 0.003968935
3 0.006360169 0.006576418 0.006422882 0.005015558
4 0.008447832 0.008601696 0.008681687 0.006302296
5 0.011039297 0.011105415 0.011422780 0.007852373
6 0.014192003 0.014148007 0.014670074 0.009674239
AIC table
The square model appears to have the best fit
aic<-round(c(
fit.con$deviance+2*length(fit.con$coefficients),
fit.lin$deviance+2*length(fit.lin$coefficients),
fit.sq$deviance+2*length(fit.sq$coefficients),
fit.cube$deviance+2*length(fit.cube$coefficients),
fit.quad$deviance+2*length(fit.quad$coefficients),
fit.sp$deviance+2*length(fit.sp$coefficients),
fit.0$deviance+2*length(fit.0$coefficients)),2)
#compare all aics to the one from the general model
dif.aic<-round(aic-aic[7],2)
data.frame(model =c("constant", "linear","square", "cubic", "quartic","spline", "general"),
aic=aic,
aic_dif=dif.aic) model aic aic_dif
1 constant 12972.03 906.79
2 linear 12229.68 164.44
3 square 12056.15 -9.09
4 cubic 12057.86 -7.38
5 quartic 12058.99 -6.25
6 spline 12060.09 -5.15
7 general 12065.24 0.00
library(magrittr)
library(data.table)
out <- melt(setDT(newdat),
id = c("year", "debt_cat", "sex","raceth"),
measure.vars = list(haz=c("h_gen", "h_con","h_lin","h_sq", "h_cube", "h_quad", "h_sp")))
head(out, n=20) year debt_cat sex raceth variable value
1: 1 1low debt female 2hispanic h_gen 0.0001962619
2: 2 1low debt female 2hispanic h_gen 0.0077495702
3: 3 1low debt female 2hispanic h_gen 0.0068007212
4: 4 1low debt female 2hispanic h_gen 0.0117656049
5: 5 1low debt female 2hispanic h_gen 0.0096576805
6: 6 1low debt female 2hispanic h_gen 0.0131235877
7: 7 1low debt female 2hispanic h_gen 0.0145300958
8: 8 1low debt female 2hispanic h_gen 0.0271394305
9: 9 1low debt female 2hispanic h_gen 0.0285848362
10: 10 1low debt female 2hispanic h_gen 0.0331281306
11: 11 1low debt female 2hispanic h_gen 0.0315346073
12: 12 1low debt female 2hispanic h_gen 0.0498111863
13: 13 1low debt female 2hispanic h_gen 0.0538680354
14: 14 1low debt female 2hispanic h_gen 0.0522852995
15: 15 1low debt female 2hispanic h_gen 0.0559463964
16: 16 1low debt female 2hispanic h_gen 0.0765222107
17: 17 1low debt female 2hispanic h_gen 0.0806598298
18: 18 1low debt female 2hispanic h_gen 0.0760276406
19: 19 1low debt female 2hispanic h_gen 0.0965232183
20: 20 1low debt female 2hispanic h_gen 0.0954959683
Hazard plots: With these alternative time specification plots we see that for every model, females are at higher risk for firstbirth, and that for most models, the risk of first birth increases, and then decreases. We saw above that the best fit is the Square time specification. Why do the plots look spikey?
library(ggplot2)
out %>%
ggplot(aes(x = year, y = value))+
geom_line(aes(group = sex, color = sex) )+
labs(title = "Hazard function for first birth",
subtitle = "Alternative model specifications")+
xlab("Age") + ylab("H(t)")+
facet_wrap(~variable, scales= "free_y")Don't know how to automatically pick scale for object of type svystat. Defaulting to continuous.
out %>%
ggplot(aes(x = year, y = value))+
geom_line(aes(group = debt_cat, color = debt_cat) )+
labs(title = "Hazard function for first birth",
subtitle = "Alternative model specifications")+
xlab("Age") + ylab("H(t)")+
facet_wrap(~variable, scales= "free_y")Don't know how to automatically pick scale for object of type svystat. Defaulting to continuous.