Event History Analysis Homework 5

Author

Cristina Martinez, MPH

  1. Fit the discrete time hazard model to your outcome

    • You must form a person-period data set
    • Consider both the general model and other time specifications
    • Include all main effects in the model
    • Test for an interaction between at least two of the predictors
    • Generate hazard plots for interesting cases highlighting the significant predictors in your analysis
#df_cat <- read.csv('categories.csv')
df_raw <- read.csv('raw_data.csv')
weights <- read.table('customweight.dat', sep=' ') %>%
  rename(PUBID_1997 = V1, weights = V2)
# Determine latest interview date for those without children
last_int <- df_raw %>%
  filter(is.na(CV_CHILD_BIRTH_MONTH.01_2019) == T) %>%
  select(PUBID_1997,
         CV_INTERVIEW_CMONTH_1997, CV_INTERVIEW_CMONTH_1998,
         CV_INTERVIEW_CMONTH_1999, CV_INTERVIEW_CMONTH_2000,
         CV_INTERVIEW_CMONTH_2001, CV_INTERVIEW_CMONTH_2002,
         CV_INTERVIEW_CMONTH_2004, CV_INTERVIEW_CMONTH_2005,
         CV_INTERVIEW_CMONTH_2006, CV_INTERVIEW_CMONTH_2007,
         CV_INTERVIEW_CMONTH_2008, CV_INTERVIEW_CMONTH_2009,
         CV_INTERVIEW_CMONTH_2010, CV_INTERVIEW_CMONTH_2011,
         CV_INTERVIEW_CMONTH_2013, CV_INTERVIEW_CMONTH_2015,
         CV_INTERVIEW_CMONTH_2017, CV_INTERVIEW_CMONTH_2019) %>%
  pivot_longer(cols = c(-PUBID_1997), 
               names_to = "interview", 
               values_to = "cmonth") %>%
  group_by(PUBID_1997) %>%
  summarize(age.cens = max(cmonth, na.rm = TRUE))


df_raw <- left_join(df_raw, last_int, by = "PUBID_1997")

df_raw <- left_join(df_raw, weights)
rm(last_int)
df_raw_rc <- df_raw %>%
  mutate(
    #calculate age at first birth
    age97 = CV_AGE.MONTHS._INT_DATE_1997/12,
    months2fb = CV_CHILD_BIRTH_MONTH.01_2019 - CV_AGE.MONTHS._INT_DATE_1997,
    years2fb = months2fb/12,
    age_fb = round(age97+years2fb,2),
    # calculate censored age based on last interview
    months_cens = age.cens-CV_AGE.MONTHS._INT_DATE_1997,
    years_cens = months_cens/12,
    age_cens = round(age97+years_cens,2),
    age_obs = if_else(is.na(age_cens) == T, age_fb, age_cens),
    event = if_else(is.na(age_cens) == T, 1, 0),
    debt_high = if_else(CVC_ASSETS_DEBTS_25_XRND > 17000, 1, 0),
    debt_cat = factor(case_when(
      CVC_ASSETS_DEBTS_25_XRND == 0 ~ "0No debt",
      CVC_ASSETS_DEBTS_25_XRND > 0 & CVC_ASSETS_DEBTS_25_XRND <= 11000 ~ "1low debt",
      CVC_ASSETS_DEBTS_25_XRND > 17500 ~ "2high debt")), # I used the mean debt at 25 when all zero debt is removed
    sex = factor(if_else(KEY_SEX_1997 == 1, "male", "female")),
    #bach = as.factor(if_else(CV_HIGHEST_DEGREE_EVER_EDT_2019 >= 5, 1, 0)),
    bach = if_else(CV_HIGHEST_DEGREE_EVER_EDT_2019 >= 5, 1, 0),
    raceth = factor(case_when(KEY_RACE_ETHNICITY_1997 == 1 ~ "1black",
                       KEY_RACE_ETHNICITY_1997 == 2 ~ "2hispanic",
                       #KEY_RACE_ETHNICITY_1997 == 3 ~ "3mixed race",
                       KEY_RACE_ETHNICITY_1997 == 4 ~ "0non-black/non-hisp")),
    timefb = if_else(event == 1,
                         CV_CHILD_BIRTH_MONTH.01_2019 - CV_AGE.MONTHS._INT_DATE_1997,
                         age.cens-CV_AGE.MONTHS._INT_DATE_1997),
  ) %>%
  filter(CV_HIGHEST_DEGREE_EVER_EDT_2019 > 2) # only include those who have attended college in the sample

df_comp <- df_raw_rc %>%
  select(PUBID_1997, timefb, age97, age_obs, event, debt_cat, sex, raceth, VPSU_1997, VSTRAT_1997, weights) %>%
  # remove those with first birth before first time point
  mutate(check = if_else(age97 >= age_obs, 1, 0)) %>%
  filter(check == 0) %>%
  select(-check) %>%
  filter(complete.cases(.))

Create person period file

pp <- survSplit(Surv(timefb, event) ~. ,
                data = df_comp,
                cut = seq(0,336, 12),
                episode = "year_birth")

pp$year <- pp$year_birth-1
pp<-pp[order(pp$PUBID_1997, pp$year_birth),]

Descriptive analysis

pp %>%
  group_by(year) %>%
  summarise(prop_bir = mean(event, na.rm = T)) %>%
  ggplot(aes(x = year, y = prop_bir)) +
  geom_line() +
  ggtitle(label = "Hazard of having first birth by year after first interview")

pp %>%
  group_by(year, debt_cat) %>%
  summarise(prop_bir = mean(event, na.rm = T)) %>%
  ggplot(aes(x = year, y = prop_bir)) +
  geom_line(aes(group = debt_cat, color = debt_cat)) +
  ggtitle(label = "Hazard of having first birth by year after first interview by debt category")

Discrete time model

# create survey design
options(survey.lonely.psu = "adjust")

des <- svydesign(ids = ~VPSU_1997, 
                  strata = ~VSTRAT_1997, 
                  weights = ~weights,
                  data = pp, nest = TRUE)

General model

The general model with year, debt, sex and race/ethnicity as covariates shows that the risk of first birth gradually increases and then decrease as time continues. Males have a lower risk of first birth than females. Both Black and Hispanic people have a higher risk of birth birth than Non-black/Non-Hispanic people. Debt did not seem to have any significant contribution to risk of first birth.

# general model
fit.0 <- svyglm(event~as.factor(year) + debt_cat + sex + raceth - 1, design = des, family = binomial(link = "cloglog")) # family = "binomial"
Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.0)

Call:
svyglm(formula = event ~ as.factor(year) + debt_cat + sex + raceth - 
    1, design = des, family = binomial(link = "cloglog"))

Survey design:
svydesign(ids = ~VPSU_1997, strata = ~VSTRAT_1997, weights = ~weights, 
    data = pp, nest = TRUE)

Coefficients:
                    Estimate Std. Error t value Pr(>|t|)    
as.factor(year)1   -8.635177   1.003758  -8.603 5.81e-13 ***
as.factor(year)2   -4.955445   0.276824 -17.901  < 2e-16 ***
as.factor(year)3   -5.086531   0.280259 -18.149  < 2e-16 ***
as.factor(year)4   -4.535878   0.257434 -17.620  < 2e-16 ***
as.factor(year)5   -4.734368   0.243822 -19.417  < 2e-16 ***
as.factor(year)6   -4.425961   0.216025 -20.488  < 2e-16 ***
as.factor(year)7   -4.323438   0.194468 -22.232  < 2e-16 ***
as.factor(year)8   -3.692257   0.180713 -20.432  < 2e-16 ***
as.factor(year)9   -3.639628   0.158521 -22.960  < 2e-16 ***
as.factor(year)10  -3.489790   0.141603 -24.645  < 2e-16 ***
as.factor(year)11  -3.539906   0.168561 -21.001  < 2e-16 ***
as.factor(year)12  -3.073292   0.150974 -20.356  < 2e-16 ***
as.factor(year)13  -2.992874   0.142193 -21.048  < 2e-16 ***
as.factor(year)14  -3.023524   0.152562 -19.818  < 2e-16 ***
as.factor(year)15  -2.953928   0.132292 -22.329  < 2e-16 ***
as.factor(year)16  -2.629849   0.146652 -17.933  < 2e-16 ***
as.factor(year)17  -2.574974   0.144480 -17.822  < 2e-16 ***
as.factor(year)18  -2.636597   0.136313 -19.342  < 2e-16 ***
as.factor(year)19  -2.386863   0.120068 -19.879  < 2e-16 ***
as.factor(year)20  -2.398121   0.125509 -19.107  < 2e-16 ***
as.factor(year)21  -2.495961   0.113632 -21.965  < 2e-16 ***
as.factor(year)22  -2.565367   0.151602 -16.922  < 2e-16 ***
as.factor(year)23  -2.632760   0.135452 -19.437  < 2e-16 ***
as.factor(year)24  -2.330663   0.151345 -15.400  < 2e-16 ***
as.factor(year)25  -2.898878   0.257273 -11.268  < 2e-16 ***
as.factor(year)26  -2.422947   0.206714 -11.721  < 2e-16 ***
as.factor(year)27  -2.965589   0.288455 -10.281 3.16e-16 ***
as.factor(year)28  -3.202921   0.583610  -5.488 4.76e-07 ***
debt_cat1low debt   0.093652   0.095291   0.983   0.3287    
debt_cat2high debt  0.101214   0.079882   1.267   0.2089    
sexmale            -0.368030   0.057196  -6.435 8.81e-09 ***
raceth1black        0.166937   0.074361   2.245   0.0276 *  
raceth2hispanic     0.005562   0.095094   0.058   0.9535    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 0.9660403)

Number of Fisher Scoring iterations: 10

General model with interaction

The general model with an interaction term shows similar results to the model above. We see the same increasing and decreasing and increasing trend as time progresses. Debt on its own still does not significantly contribute to risk of first birth. Males have a significantly lower risk of first birth than females. Black people have a significantly higher risk of first birth compared to non-Black/non-Hispanic people. Males with high debt interestingly have a higher risk of first birth.

fit.1 <- svyglm(event~as.factor(year) + debt_cat * sex + raceth - 1,
                design = des, family = binomial(link = "cloglog"))
Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.1)

Call:
svyglm(formula = event ~ as.factor(year) + debt_cat * sex + raceth - 
    1, design = des, family = binomial(link = "cloglog"))

Survey design:
svydesign(ids = ~VPSU_1997, strata = ~VSTRAT_1997, weights = ~weights, 
    data = pp, nest = TRUE)

Coefficients:
                             Estimate Std. Error t value Pr(>|t|)    
as.factor(year)1           -8.5465914  1.0046428  -8.507 1.07e-12 ***
as.factor(year)2           -4.8667513  0.2807721 -17.333  < 2e-16 ***
as.factor(year)3           -4.9977442  0.2770158 -18.041  < 2e-16 ***
as.factor(year)4           -4.4470701  0.2653582 -16.759  < 2e-16 ***
as.factor(year)5           -4.6453568  0.2490277 -18.654  < 2e-16 ***
as.factor(year)6           -4.3367806  0.2255554 -19.227  < 2e-16 ***
as.factor(year)7           -4.2343269  0.1969796 -21.496  < 2e-16 ***
as.factor(year)8           -3.6031428  0.1875829 -19.208  < 2e-16 ***
as.factor(year)9           -3.5502532  0.1589611 -22.334  < 2e-16 ***
as.factor(year)10          -3.3999009  0.1556097 -21.849  < 2e-16 ***
as.factor(year)11          -3.4495884  0.1778034 -19.401  < 2e-16 ***
as.factor(year)12          -2.9828775  0.1583046 -18.843  < 2e-16 ***
as.factor(year)13          -2.9024666  0.1522211 -19.067  < 2e-16 ***
as.factor(year)14          -2.9330999  0.1637182 -17.916  < 2e-16 ***
as.factor(year)15          -2.8636459  0.1484609 -19.289  < 2e-16 ***
as.factor(year)16          -2.5399927  0.1611660 -15.760  < 2e-16 ***
as.factor(year)17          -2.4854256  0.1552027 -16.014  < 2e-16 ***
as.factor(year)18          -2.5474906  0.1462739 -17.416  < 2e-16 ***
as.factor(year)19          -2.2977481  0.1291010 -17.798  < 2e-16 ***
as.factor(year)20          -2.3084133  0.1358616 -16.991  < 2e-16 ***
as.factor(year)21          -2.4058961  0.1200516 -20.041  < 2e-16 ***
as.factor(year)22          -2.4748189  0.1610071 -15.371  < 2e-16 ***
as.factor(year)23          -2.5415193  0.1510860 -16.822  < 2e-16 ***
as.factor(year)24          -2.2384127  0.1641655 -13.635  < 2e-16 ***
as.factor(year)25          -2.8039709  0.2694639 -10.406 2.45e-16 ***
as.factor(year)26          -2.3256415  0.2181368 -10.661  < 2e-16 ***
as.factor(year)27          -2.8667287  0.3013089  -9.514 1.23e-14 ***
as.factor(year)28          -3.1040442  0.5867810  -5.290 1.11e-06 ***
debt_cat1low debt           0.0334879  0.1205671   0.278   0.7819    
debt_cat2high debt         -0.0360666  0.1043572  -0.346   0.7306    
sexmale                    -0.5691341  0.1254277  -4.538 2.06e-05 ***
raceth1black                0.1688652  0.0748941   2.255   0.0270 *  
raceth2hispanic            -0.0005682  0.0964884  -0.006   0.9953    
debt_cat1low debt:sexmale   0.1279423  0.1473405   0.868   0.3879    
debt_cat2high debt:sexmale  0.3382524  0.1364265   2.479   0.0153 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 0.9626439)

Number of Fisher Scoring iterations: 10

Time specifications

Constant model: Shows males have lower risk of first birth and Black people have higher risk of first birth. Having debt seems to increase risk of first birth, but this is not significant.

# constant specification
fit.con <- svyglm(event ~ 1 + debt_cat + sex + raceth, 
                  design = des, family = binomial(link = "cloglog"))
Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.con)

Call:
svyglm(formula = event ~ 1 + debt_cat + sex + raceth, design = des, 
    family = binomial(link = "cloglog"))

Survey design:
svydesign(ids = ~VPSU_1997, strata = ~VSTRAT_1997, weights = ~weights, 
    data = pp, nest = TRUE)

Coefficients:
                    Estimate Std. Error t value Pr(>|t|)    
(Intercept)        -3.273628   0.062175 -52.652  < 2e-16 ***
debt_cat1low debt   0.066737   0.076355   0.874   0.3841    
debt_cat2high debt  0.081380   0.064882   1.254   0.2125    
sexmale            -0.294888   0.046182  -6.385 4.63e-09 ***
raceth1black        0.130185   0.057922   2.248   0.0267 *  
raceth2hispanic     0.008899   0.076639   0.116   0.9078    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 0.9995857)

Number of Fisher Scoring iterations: 6
# Annual Hazard rate
1/(1+exp(-coef(fit.con)))
       (Intercept)  debt_cat1low debt debt_cat2high debt            sexmale 
        0.03648708         0.51667810         0.52033384         0.42680755 
      raceth1black    raceth2hispanic 
        0.53250035         0.50222464 

Linear model: The linear term shows an increase in risk of first birth. Shows the same outcomes as the constant model, however, the risk for Black people is no longer significant.

# Linear term for time using year
fit.lin <- svyglm(event ~ year + debt_cat + sex + raceth, 
                   design = des, family = binomial(link = "cloglog"))
Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.lin)

Call:
svyglm(formula = event ~ year + debt_cat + sex + raceth, design = des, 
    family = binomial(link = "cloglog"))

Survey design:
svydesign(ids = ~VPSU_1997, strata = ~VSTRAT_1997, weights = ~weights, 
    data = pp, nest = TRUE)

Coefficients:
                    Estimate Std. Error t value Pr(>|t|)    
(Intercept)        -4.611316   0.106645 -43.240  < 2e-16 ***
year                0.098965   0.003860  25.635  < 2e-16 ***
debt_cat1low debt   0.099372   0.099121   1.003   0.3184    
debt_cat2high debt  0.112290   0.084855   1.323   0.1886    
sexmale            -0.376796   0.059231  -6.361 5.31e-09 ***
raceth1black        0.139408   0.075097   1.856   0.0662 .  
raceth2hispanic    -0.006002   0.097485  -0.062   0.9510    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 0.9138613)

Number of Fisher Scoring iterations: 7

Squared model: The quadratic term shows a decrease in risk of birth. All other covariates remain the same with a switch back to a significant increased risk of first birth for the Black population.

# square model
fit.sq <- svyglm(event ~ year + I(year^2) + debt_cat + sex + raceth, 
                   design = des, family = binomial(link = "cloglog"))
Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.sq)

Call:
svyglm(formula = event ~ year + I(year^2) + debt_cat + sex + 
    raceth, design = des, family = binomial(link = "cloglog"))

Survey design:
svydesign(ids = ~VPSU_1997, strata = ~VSTRAT_1997, weights = ~weights, 
    data = pp, nest = TRUE)

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)        -6.1060344  0.1759874 -34.696  < 2e-16 ***
year                0.3410710  0.0219629  15.529  < 2e-16 ***
I(year^2)          -0.0080237  0.0007194 -11.153  < 2e-16 ***
debt_cat1low debt   0.0942582  0.0953345   0.989   0.3251    
debt_cat2high debt  0.1010526  0.0799508   1.264   0.2091    
sexmale            -0.3683603  0.0571155  -6.449  3.6e-09 ***
raceth1black        0.1681260  0.0743784   2.260   0.0259 *  
raceth2hispanic     0.0062647  0.0950760   0.066   0.9476    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 0.9714705)

Number of Fisher Scoring iterations: 7

Cubic model: The cubic term shows a decreased risk in first birth. However, this is not significant. All other covariates reamin the same with males having a significantly lower risk of first birth and the Black population having a significantly higher risk of first birth.

# cubic model
fit.cube <- svyglm(event ~ year + I(year^2) + I(year^3) + debt_cat + sex + raceth, 
                   design = des, family = binomial(link = "cloglog"))
Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.cube)

Call:
svyglm(formula = event ~ year + I(year^2) + I(year^3) + debt_cat + 
    sex + raceth, design = des, family = binomial(link = "cloglog"))

Survey design:
svydesign(ids = ~VPSU_1997, strata = ~VSTRAT_1997, weights = ~weights, 
    data = pp, nest = TRUE)

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)        -6.003e+00  2.394e-01 -25.074  < 2e-16 ***
year                3.117e-01  5.421e-02   5.750 9.17e-08 ***
I(year^2)          -5.767e-03  4.052e-03  -1.423    0.158    
I(year^3)          -5.076e-05  9.132e-05  -0.556    0.580    
debt_cat1low debt   9.387e-02  9.540e-02   0.984    0.327    
debt_cat2high debt  1.009e-01  7.993e-02   1.263    0.210    
sexmale            -3.683e-01  5.715e-02  -6.445 3.78e-09 ***
raceth1black        1.680e-01  7.438e-02   2.259    0.026 *  
raceth2hispanic     6.137e-03  9.513e-02   0.065    0.949    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 0.967206)

Number of Fisher Scoring iterations: 8

Quadratic model: The linear term remains significant and the quadratic term shows a lower risk of first birth, but is not significant. First birth risk remains significant for both Males and the Black population.

# quadratic model
fit.quad <- svyglm(event ~ year + I(year^2) + I(year^3) + I(year^4) + debt_cat + sex + raceth, 
                   design = des, family = binomial(link = "cloglog"))
Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.quad)

Call:
svyglm(formula = event ~ year + I(year^2) + I(year^3) + I(year^4) + 
    debt_cat + sex + raceth, design = des, family = binomial(link = "cloglog"))

Survey design:
svydesign(ids = ~VPSU_1997, strata = ~VSTRAT_1997, weights = ~weights, 
    data = pp, nest = TRUE)

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)        -6.243e+00  3.589e-01 -17.394  < 2e-16 ***
year                4.220e-01  1.245e-01   3.389 0.000999 ***
I(year^2)          -2.041e-02  1.588e-02  -1.285 0.201733    
I(year^3)           6.902e-04  8.025e-04   0.860 0.391770    
I(year^4)          -1.265e-05  1.389e-05  -0.911 0.364645    
debt_cat1low debt   9.299e-02  9.520e-02   0.977 0.330975    
debt_cat2high debt  1.006e-01  7.982e-02   1.260 0.210514    
sexmale            -3.679e-01  5.722e-02  -6.430 4.17e-09 ***
raceth1black        1.672e-01  7.432e-02   2.250 0.026592 *  
raceth2hispanic     5.811e-03  9.515e-02   0.061 0.951425    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 0.9688199)

Number of Fisher Scoring iterations: 8

Spline: This model shows an increase in risk of first birth and a decrease as time goes on. All four splines are significant. First birth risk remains significant for both Males and the Black population.

# spline model
library(splines)

fit.sp <- svyglm(event ~ ns(year, df = 4) + debt_cat + sex + raceth,
                 design = des, family = binomial(link = "cloglog"))
Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.sp)

Call:
svyglm(formula = event ~ ns(year, df = 4) + debt_cat + sex + 
    raceth, design = des, family = binomial(link = "cloglog"))

Survey design:
svydesign(ids = ~VPSU_1997, strata = ~VSTRAT_1997, weights = ~weights, 
    data = pp, nest = TRUE)

Coefficients:
                   Estimate Std. Error t value Pr(>|t|)    
(Intercept)        -5.86516    0.26760 -21.917  < 2e-16 ***
ns(year, df = 4)1   2.35606    0.23356  10.087  < 2e-16 ***
ns(year, df = 4)2   3.15142    0.17591  17.915  < 2e-16 ***
ns(year, df = 4)3   4.67361    0.51693   9.041 1.08e-14 ***
ns(year, df = 4)4   2.26913    0.17316  13.105  < 2e-16 ***
debt_cat1low debt   0.09439    0.09535   0.990    0.325    
debt_cat2high debt  0.10139    0.07999   1.268    0.208    
sexmale            -0.36873    0.05714  -6.453 3.74e-09 ***
raceth1black        0.16808    0.07440   2.259    0.026 *  
raceth2hispanic     0.00615    0.09512   0.065    0.949    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 0.9688535)

Number of Fisher Scoring iterations: 8

Hazards

newdat <- expand.grid(year = unique(pp$year),
                      debt_cat = unique(pp$debt_cat),
                      sex = unique(pp$sex),
                      raceth = unique(pp$raceth))

newdat$h_gen <- predict(fit.0, newdata = newdat, type = "response")
newdat$h_con <- predict(fit.con, newdata = newdat, type = "response")
newdat$h_lin <- predict(fit.lin, newdata = newdat, type = "response")
newdat$h_sq <- predict(fit.sq, newdata = newdat, type = "response")
newdat$h_cube <- predict(fit.cube, newdata = newdat, type = "response")
newdat$h_quad <- predict(fit.quad, newdata = newdat, type = "response")
newdat$h_sp <- predict(fit.sp, newdata = newdat, type = "response")

head(newdat)
  year  debt_cat    sex    raceth        h_gen      h_con      h_lin
1    1 1low debt female 2hispanic 0.0001962619 0.04002126 0.01197424
2    2 1low debt female 2hispanic 0.0077495702 0.04002126 0.01321163
3    3 1low debt female 2hispanic 0.0068007212 0.04002126 0.01457595
4    4 1low debt female 2hispanic 0.0117656049 0.04002126 0.01608000
5    5 1low debt female 2hispanic 0.0096576805 0.04002126 0.01773784
6    6 1low debt female 2hispanic 0.0131235877 0.04002126 0.01956491
         h_sq      h_cube      h_quad        h_sp
1 0.003433472 0.003702164 0.003201968 0.003131672
2 0.004711144 0.004964753 0.004611269 0.003968935
3 0.006360169 0.006576418 0.006422882 0.005015558
4 0.008447832 0.008601696 0.008681687 0.006302296
5 0.011039297 0.011105415 0.011422780 0.007852373
6 0.014192003 0.014148007 0.014670074 0.009674239

AIC table

The square model appears to have the best fit

aic<-round(c(
  fit.con$deviance+2*length(fit.con$coefficients),
  fit.lin$deviance+2*length(fit.lin$coefficients),
  fit.sq$deviance+2*length(fit.sq$coefficients),
  fit.cube$deviance+2*length(fit.cube$coefficients),
  fit.quad$deviance+2*length(fit.quad$coefficients),
  fit.sp$deviance+2*length(fit.sp$coefficients),
  fit.0$deviance+2*length(fit.0$coefficients)),2)

#compare all aics to the one from the general model
dif.aic<-round(aic-aic[7],2)
data.frame(model =c("constant", "linear","square", "cubic", "quartic","spline", "general"),
           aic=aic,
           aic_dif=dif.aic)
     model      aic aic_dif
1 constant 12972.03  906.79
2   linear 12229.68  164.44
3   square 12056.15   -9.09
4    cubic 12057.86   -7.38
5  quartic 12058.99   -6.25
6   spline 12060.09   -5.15
7  general 12065.24    0.00
library(magrittr)
library(data.table)

out <- melt(setDT(newdat),
            id = c("year", "debt_cat", "sex","raceth"),
            measure.vars = list(haz=c("h_gen", "h_con","h_lin","h_sq", "h_cube", "h_quad", "h_sp")))
head(out, n=20)
    year  debt_cat    sex    raceth variable        value
 1:    1 1low debt female 2hispanic    h_gen 0.0001962619
 2:    2 1low debt female 2hispanic    h_gen 0.0077495702
 3:    3 1low debt female 2hispanic    h_gen 0.0068007212
 4:    4 1low debt female 2hispanic    h_gen 0.0117656049
 5:    5 1low debt female 2hispanic    h_gen 0.0096576805
 6:    6 1low debt female 2hispanic    h_gen 0.0131235877
 7:    7 1low debt female 2hispanic    h_gen 0.0145300958
 8:    8 1low debt female 2hispanic    h_gen 0.0271394305
 9:    9 1low debt female 2hispanic    h_gen 0.0285848362
10:   10 1low debt female 2hispanic    h_gen 0.0331281306
11:   11 1low debt female 2hispanic    h_gen 0.0315346073
12:   12 1low debt female 2hispanic    h_gen 0.0498111863
13:   13 1low debt female 2hispanic    h_gen 0.0538680354
14:   14 1low debt female 2hispanic    h_gen 0.0522852995
15:   15 1low debt female 2hispanic    h_gen 0.0559463964
16:   16 1low debt female 2hispanic    h_gen 0.0765222107
17:   17 1low debt female 2hispanic    h_gen 0.0806598298
18:   18 1low debt female 2hispanic    h_gen 0.0760276406
19:   19 1low debt female 2hispanic    h_gen 0.0965232183
20:   20 1low debt female 2hispanic    h_gen 0.0954959683

Hazard plots: With these alternative time specification plots we see that for every model, females are at higher risk for firstbirth, and that for most models, the risk of first birth increases, and then decreases. We saw above that the best fit is the Square time specification. Why do the plots look spikey?

library(ggplot2)
out %>%
  ggplot(aes(x = year, y = value))+
    geom_line(aes(group = sex, color = sex) )+
    labs(title = "Hazard function for first birth",
    subtitle = "Alternative model specifications")+
    xlab("Age") + ylab("H(t)")+
    facet_wrap(~variable, scales= "free_y")
Don't know how to automatically pick scale for object of type svystat. Defaulting to continuous.

out %>%
  ggplot(aes(x = year, y = value))+
    geom_line(aes(group = debt_cat, color = debt_cat) )+
    labs(title = "Hazard function for first birth",
    subtitle = "Alternative model specifications")+
    xlab("Age") + ylab("H(t)")+
    facet_wrap(~variable, scales= "free_y")
Don't know how to automatically pick scale for object of type svystat. Defaulting to continuous.