Parametric Models HW

library(haven)
library(tidyverse)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
✔ tibble  3.1.8      ✔ dplyr   1.0.10
✔ tidyr   1.2.1      ✔ stringr 1.4.1 
✔ readr   2.1.2      ✔ forcats 0.5.2 
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()

library(car)

Loading required package: carData

Attaching package: 'car'

The following object is masked from 'package:dplyr':

    recode

The following object is masked from 'package:purrr':

    some

nhis <- read_stata("C:/Users/maman/OneDrive - University of Texas at San Antonio/Event History Analysis Data/nhis_00002.dta.gz")

nhis <- haven::zap_labels(nhis)

nhis <- nhis %>%
  filter(mortelig == 1)

nhis$sex1 <- Recode(nhis$sex,
                    recodes= "1='male'; 2='female'; else=NA")
nhis$cit <- Recode(nhis$citizen,
                      recodes= "1='no'; 2='yes'; else=NA")
nhis$obese <- Recode(nhis$bmicat,
                     recodes= "1:3='no'; 4='yes'; else=NA")
nhis$education <- Recode(nhis$educrec2,
                         recodes= "10:41='less than hs'; 42='hs grad'; 50:53= 'some college'; 54='college degree'; 60='more than college'; else=NA")
nhis$pov <- Recode(nhis$pooryn,
                   recodes= "1='at or above'; 2='below'; else=NA")
nhis$smokstat <- Recode(nhis$smokfreqnow,
                        recodes= "1='non-smoker'; 2='current smoker'; 3='evry day smoker'; else=NA")
nhis$marstat1 <- Recode(nhis$marstat,
                        recodes= "10:13='married'; 20='widowed'; 30='divorced'; 40='separated'; 50='never married'; else=NA")

View(nhis)

nhis$hisp <- Recode(nhis$hispeth,
                    recodes= "10='no'; 20:70='yes'; else=NA")
nhis_h <- nhis %>% 
  filter(hisp=="yes")

#xtabs <- xtabs(~ mortstat+year+obese, data=nhis_h)

#View(xtabs)

nhis_h <- nhis_h %>%
  mutate(death_age = ifelse( mortstat ==1, 
                             mortdody - (year - age), 
                             2018 - (year - age)), 
         d.event = ifelse(mortstat == 1, 1, 0))

A. Parametric Models

Carry out the following analysis

Define your outcome as in HW 1. Also consider what covariates are hypothesized to affect the outcome variable.

Define these and construct a parametric model for your outcome. Fit the parametric model of your choosing to the data.

Did you choose an AFT or PH model and why?

The PH model will be used for this project because the aim of this project is to examine the mortality risk associated with obesity status across Hispanic subgroups.

Justify what parametric distribution you choose.

The Gompertz distribution will be used since it’s a widely used model for adult mortality.

Carry out model fit diagnostics for the model.

library(survival)

age_fit <- survfit(Surv(death_age, d.event) ~ 1, 
                   data = nhis_h)

library(ggsurvfit)

age_fit %>%
  ggsurvfit() +
  add_confidence_interval(type = "ribbon") +
  add_quantile()

age_fit%>%
 broom::tidy() %>%
  select(time, estimate) %>%
  mutate(cumhaz = -log(estimate)) %>%
  mutate(haz = c(0, diff(cumhaz)))%>%
  ggplot()+
  aes(x=time, y=haz)+
  geom_line()

library(eha)

nhis_h$sex2 <- Recode(nhis_h$sex,
                    recodes= "1=1; 2=0; else=NA")
nhis_h$pov1 <- Recode(nhis_h$pooryn,
                   recodes= "1=0; 2=1; else=NA")
nhis_h$smk <- Recode(nhis_h$smokfreqnow,
                     recodes= "recodes= 1=0; 2:3=1; else=NA")
nhis_h$cit <- Recode(nhis_h$citizen,
                      recodes= "1=0; 2=1; else=NA")
nhis_h$obese_yn <- Recode(nhis_h$bmicat,
                          recodes= "1:3=0; 4=1; else=NA") #had extra text in the recodes part

 nhis_h <- nhis_h %>%
   filter(is.na(obese_yn)==F, is.na(sex2)==F, is.na(pov1)==F)

fit.1<-phreg(Surv(death_age+.01, d.event)~obese_yn,
             data=nhis_h,
             dist="gompertz")

summary(fit.1)

Covariate             Mean       Coef     Rel.Risk   S.E.    LR p
obese_yn              0.339     0.340     1.406     0.177   0.0596 

Events                    144 
Total time at risk        422284 
Max. log. likelihood      -967.95 
LR test statistic         3.55 
Degrees of freedom        1 
Overall p-value           0.0595613

plot(fit.1)

fit.2<-phreg(Surv(death_age+.01, d.event)~obese_yn,
             data=nhis_h,
             dist="weibull")

summary(fit.2)

Covariate             Mean       Coef     Rel.Risk   S.E.    LR p
obese_yn              0.339     0.295     1.343     0.177   0.1017 

Events                    144 
Total time at risk        422284 
Max. log. likelihood      -983.34 
LR test statistic         2.68 
Degrees of freedom        1 
Overall p-value           0.101735

plot(fit.2)

AIC(fit.1)

[1] 1941.894

AIC(fit.2)

[1] 1972.689

According to AIC the model with the gompertz distribution is the better fit.

Include all main effects in the model.
Test for an interaction between at least two of the predictors.

fit.3<-phreg(Surv(death_age+.01, d.event)~obese_yn+sex2+pov1,
             data=nhis_h,
             dist="gompertz")

summary(fit.3)

Covariate             Mean       Coef     Rel.Risk   S.E.    LR p
obese_yn              0.339     0.345     1.412     0.178   0.0571 
sex2                  0.436     0.721     2.056     0.169   0.0000 
pov1                  0.223     0.218     1.244     0.183   0.2391 

Events                    144 
Total time at risk        422284 
Max. log. likelihood      -958.45 
LR test statistic         22.55 
Degrees of freedom        3 
Overall p-value           5.01399e-05

plot(fit.3)

Interpret your results and write them up.
Provide tabular and graphical output to support your conclusions.

summary(fit.2)

Covariate             Mean       Coef     Rel.Risk   S.E.    LR p
obese_yn              0.339     0.295     1.343     0.177   0.1017 

Events                    144 
Total time at risk        422284 
Max. log. likelihood      -983.34 
LR test statistic         2.68 
Degrees of freedom        1 
Overall p-value           0.101735