Discrete Time Models HW

Importing data and creating coavariates

library(haven)
library(tidyverse)
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
✔ tibble  3.1.8      ✔ dplyr   1.0.10
✔ tidyr   1.2.1      ✔ stringr 1.4.1 
✔ readr   2.1.2      ✔ forcats 0.5.2 
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
library(car)
Loading required package: carData

Attaching package: 'car'

The following object is masked from 'package:dplyr':

    recode

The following object is masked from 'package:purrr':

    some
nhis <- read_stata("C:/Users/maman/OneDrive - University of Texas at San Antonio/Event History Analysis Data/nhis_00002.dta.gz")

nhis <- haven::zap_labels(nhis)

nhis <- nhis %>%
  filter(mortelig == 1)
#Creating covariates
nhis$sex1 <- Recode(nhis$sex,
                    recodes= "1='male'; 2='female'; else=NA")
nhis$cit <- Recode(nhis$citizen,
                      recodes= "1='no'; 2='yes'; else=NA")
nhis$obese <- Recode(nhis$bmicat,
                     recodes= "1:3='no'; 4='yes'; else=NA")
nhis$education <- Recode(nhis$educrec2,
                         recodes= "10:41='less than hs'; 42='hs grad'; 50:53= 'some college'; 54='college degree'; 60='more than college'; else=NA")
nhis$pov <- Recode(nhis$pooryn,
                   recodes= "1='at or above'; 2='below'; else=NA")
nhis$smokstat <- Recode(nhis$smokfreqnow,
                        recodes= "1='non-smoker'; 2='current smoker'; 3='evry day smoker'; else=NA")
nhis$marstat1 <- Recode(nhis$marstat,
                        recodes= "10:13='married'; 20='widowed'; 30='divorced'; 40='separated'; 50='never married'; else=NA")

nhis$white <- Recode(nhis$racenew,
                     recodes= "100=1; 997:999=NA; else=0")
nhis$black <- Recode(nhis$racenew,
                      recodes= "200=1; 997:999=NA; else=0")
nhis$othr <- Recode(nhis$racenew,
                    recodes= "300:542=1; 997:999=NA; else=0")
nhis$hisp <- Recode(nhis$hispeth,
                    recodes= "10=0; 20:70=1; else=NA")
nhis$hisp1 <- Recode(nhis$hispeth,
                    recodes= "10='no'; 20:70='yes'; else=NA")
nhis$race_eth[nhis$hisp == 0 & nhis$white==1]<-"NHWhite"
Warning: Unknown or uninitialised column: `race_eth`.
nhis$race_eth[nhis$hisp == 0 & nhis$black==1]<-"NHBlack"
nhis$race_eth[nhis$hisp == 0 & nhis$othr==1]<-"NHOthr"
nhis$race_eth[nhis$hisp == 1]<-"Hispanic"
nhis$race_eth[is.na(nhis$hisp) ==T | is.na(nhis$white)==T | is.na(nhis$black)==T | is.na(nhis$othr) ==T]<-NA

Discrete Time Models

1.Fit the discrete time hazard model to your outcome

a)You must form a person-period data set

b)Consider both the general model and other time specifications

c)Include all main effects in the model

d)Test for an interaction between at least two of the predictors e)Generate hazard plots for interesting cases highlighting the significant predictors in your analysis

library(survival)
library(eha)

nhis1 <- nhis %>%
  filter(complete.cases(.)) %>% 
  mutate(death_age = ifelse( mortstat ==1, 
                             mortdody - (year - age), 
                             2018 - (year - age)), 
         d.event = ifelse(mortstat == 1, 1, 0))

age_fit <- survfit(Surv(death_age, d.event) ~ 1, 
                   data = nhis1)

Creating person-period data set

pp<-survSplit(Surv(death_age, d.event)~. ,
data = nhis1,
cut=seq(18,100,5), #age 18-100 with 5 year intervals
episode="mort_5_yr")

Survey Design

library(survey)
Loading required package: grid
Loading required package: Matrix

Attaching package: 'Matrix'
The following objects are masked from 'package:tidyr':

    expand, pack, unpack

Attaching package: 'survey'
The following object is masked from 'package:graphics':

    dotchart
des<-svydesign(ids=~psu, strata=~strata,
weights = ~mortwtsa, data=pp, nest=T)

The general model and other model specifications

m0<-svyglm(d.event~ race_eth+obese, #constant model
design=des, family=binomial (link="cloglog"))
Warning in eval(family$initialize): non-integer #successes in a binomial glm!
#General model
m1<-svyglm(d.event~factor(mort_5_yr) + race_eth+obese,
design=des, family=binomial (link="cloglog"))
Warning in eval(family$initialize): non-integer #successes in a binomial glm!
#Linear model
m2<-svyglm(d.event~mort_5_yr+race_eth+obese, design=des,
family=binomial (link="cloglog"))
Warning in eval(family$initialize): non-integer #successes in a binomial glm!
#Quadratic model
m3<-svyglm(d.event~mort_5_yr+ I(mort_5_yr^2)+race_eth+obese,
design=des, family=binomial (link="cloglog"))
Warning in eval(family$initialize): non-integer #successes in a binomial glm!
#Spline model
library(splines)
m4<-svyglm(d.event~ns(mort_5_yr, df=3)+race_eth+obese, design=des,
family=binomial (link="cloglog"))
Warning in eval(family$initialize): non-integer #successes in a binomial glm!

Generating Hazard Plots

newdat<-expand.grid(mort_5_yr = unique(pp$mort_5_yr),
race_eth= unique(pp$race_eth),
obese = unique(pp$obese))

newdat<-newdat%>%
dplyr::filter(complete.cases(.))
newdat$h0<-predict(m0, newdata=newdat, type="response")
newdat$h1<-predict(m1, newdata=newdat, type="response")
newdat$h2<-predict(m2, newdata=newdat, type="response")
newdat$h3<-predict(m3, newdata=newdat, type="response")
newdat$h4<-predict(m4, newdata=newdat, type="response")

head(newdat)
  mort_5_yr race_eth obese         h0           h1           h2           h3
1         1 Hispanic   yes 0.00288786 4.552234e-10 3.233751e-05 0.0001238310
2         2 Hispanic   yes 0.00288786 5.294875e-05 6.023394e-05 0.0001712853
3         3 Hispanic   yes 0.00288786 2.701771e-04 1.121943e-04 0.0002448916
4         4 Hispanic   yes 0.00288786 3.305838e-04 2.089731e-04 0.0003618987
5         5 Hispanic   yes 0.00288786 6.158231e-04 3.892172e-04 0.0005527805
6         6 Hispanic   yes 0.00288786 8.925748e-04 7.248695e-04 0.0008726883
            h4
1 1.288632e-05
2 2.736263e-05
3 5.669379e-05
4 1.118438e-04
5 2.049920e-04
6 3.408039e-04
library(data.table)

Attaching package: 'data.table'
The following objects are masked from 'package:dplyr':

    between, first, last
The following object is masked from 'package:purrr':

    transpose
library(magrittr)

Attaching package: 'magrittr'
The following object is masked from 'package:purrr':

    set_names
The following object is masked from 'package:tidyr':

    extract
out<-melt(setDT(newdat),
id = c("mort_5_yr", "race_eth", "obese"),
measure.vars = list(haz=c("h0", "h1","h2","h3", "h4")))
head(out, n=20)
    mort_5_yr race_eth obese variable       value
 1:         1 Hispanic   yes       h0 0.002887860
 2:         2 Hispanic   yes       h0 0.002887860
 3:         3 Hispanic   yes       h0 0.002887860
 4:         4 Hispanic   yes       h0 0.002887860
 5:         5 Hispanic   yes       h0 0.002887860
 6:         6 Hispanic   yes       h0 0.002887860
 7:         7 Hispanic   yes       h0 0.002887860
 8:         8 Hispanic   yes       h0 0.002887860
 9:         9 Hispanic   yes       h0 0.002887860
10:        10 Hispanic   yes       h0 0.002887860
11:        11 Hispanic   yes       h0 0.002887860
12:        12 Hispanic   yes       h0 0.002887860
13:        13 Hispanic   yes       h0 0.002887860
14:        14 Hispanic   yes       h0 0.002887860
15:        15 Hispanic   yes       h0 0.002887860
16:         1  NHWhite   yes       h0 0.004086106
17:         2  NHWhite   yes       h0 0.004086106
18:         3  NHWhite   yes       h0 0.004086106
19:         4  NHWhite   yes       h0 0.004086106
20:         5  NHWhite   yes       h0 0.004086106
library(ggplot2)
out%>%
dplyr::filter(obese== "yes") %>%   
dplyr::mutate(mod = dplyr::case_when(.$variable =="h0" ~ "Constant",
.$variable =="h1" ~ "General",
.$variable =="h2" ~ "Linear",
.$variable =="h3" ~ "Polynomial - 2",
.$variable =="h4" ~ "Spline"))%>%
ggplot(aes(x = mort_5_yr*5, y=value ))+
geom_line(aes(group=race_eth, color=race_eth) )+
labs(title = "Hazard function for adult mortality",
subtitle = "Alternative model specifications")+
xlab("Age")+ylab("S(t)")+
facet_wrap(~mod)#+ scale_y_continuous(trans = "log10")
Don't know how to automatically pick scale for object of type svystat. Defaulting to continuous.

The plotted hazard rates for each model specification suggest that the mortality risk among obese individuals appears to be similar for Hispanics and non-Hispanic Whites, while non-Hispanic Blacks have the highest rate.

Model fit

#constructing a table of relative model fits
AIC0<-AIC(m0)
Warning in eval(family$initialize): non-integer #successes in a binomial glm!
AIC1<-AIC(m1)
Warning in eval(family$initialize): non-integer #successes in a binomial glm!
AIC2<-AIC(m2)
Warning in eval(family$initialize): non-integer #successes in a binomial glm!
AIC3<-AIC(m3)
Warning in eval(family$initialize): non-integer #successes in a binomial glm!
AIC4<-AIC(m4)
Warning in eval(family$initialize): non-integer #successes in a binomial glm!
AICS<-data.frame(AIC = c(AIC0["AIC"],AIC1["AIC"],AIC2["AIC"],AIC3["AIC"],AIC4["AIC"]),
mod = factor(c("const", "general", "linear", "poly 2", "spline")) )

AICS$mod<-forcats::fct_relevel(AICS$mod,
c("general", "const" , "linear", "poly2", "spline"))
Warning: 1 unknown level in `f`: poly2
AICS$deltaAIC<-AICS$AIC - AICS$AIC[AICS$mod=="general"]

knitr::kable(AICS[, c("mod", "AIC", "deltaAIC")],
caption = "Relative AIC for alternative time specifications")
Relative AIC for alternative time specifications
mod AIC deltaAIC
const 15791.71 4544.15216
general 11247.56 0.00000
linear 11315.69 68.13744
poly 2 11284.06 36.50958
spline 11283.56 36.00395

It seems that out of all the alternative time specifications, the spline fits the general model the best.