DEM 7223 - Event History Analysis - Parametric Hazard Models

Published

September 16, 2022

Libraries

library(haven, quietly=T)
library(survival, quietly=T)
library(car, quietly=T)
library(ipumsr)
library(knitr,quietly = T)
library(survey, quietly=T)

Attaching package: 'survey'
The following object is masked from 'package:graphics':

    dotchart
library(survminer, quietly=T)

Attaching package: 'survminer'
The following object is masked from 'package:survival':

    myeloma
library(ggplot2, quietly=T)
library(ggpubr, quietly=T)
library(muhaz, quietly=T)
library(car, quietly=T)
library(dplyr, quietly=T)

Attaching package: 'dplyr'
The following object is masked from 'package:car':

    recode
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(tidycensus, quietly=T)
library(tidyverse, quietly=T)
── Attaching packages
───────────────────────────────────────
tidyverse 1.3.2 ──
✔ tibble  3.1.8     ✔ purrr   0.3.4
✔ tidyr   1.2.1     ✔ stringr 1.4.1
✔ readr   2.1.2     ✔ forcats 0.5.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ tidyr::expand() masks Matrix::expand()
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
✖ tidyr::pack()   masks Matrix::pack()
✖ dplyr::recode() masks car::recode()
✖ purrr::some()   masks car::some()
✖ tidyr::unpack() masks Matrix::unpack()
library(questionr, quietly=T)
library(forcats, quietly=T)
library(srvyr, quietly=T)

Attaching package: 'srvyr'

The following object is masked from 'package:stats':

    filter
library(caret, quietly=T)

Attaching package: 'caret'

The following object is masked from 'package:purrr':

    lift

The following object is masked from 'package:survival':

    cluster
library(muhaz, quietly=T)
library(eha, quietly=T)
library(ggsurvfit)
library(survival)
setwd("C:/Users/spara/Desktop/event history")

Data

ddi <- read_ipums_ddi("nhis_00018.xml")
data <- read_ipums_micro(ddi)
Use of data from IPUMS NHIS is subject to conditions including that users
should cite the data appropriately. Use command `ipums_conditions()` for more
details.
data<- haven::zap_labels(data)

In this work, I have studied the effect of education, sex and poverty affect the likelihood of adult mortality. I have used IPUMS data (1997-2018) to perform the analysis. Education is measured as less than high school/ higher than high school. Poverty is measured as ratio of family income to poverty threshold being less or greater than 1. Hence, my event variable is adult mortality and my covariates are education, sex, and poverty .

Filtering data (adults + mortality)

data <- data %>%
  filter(MORTELIG == 1)


data <- data %>%
filter(AGE ==18) 

data <- data%>%
filter(complete.cases(.))
data <- data %>%
  mutate(death_age = ifelse( MORTSTAT ==1, 
                             MORTDODY - (YEAR - AGE), 
                             2018 - (YEAR - AGE)), 
         d.event = ifelse(MORTSTAT == 1, 1, 0))



age_fit <- survfit(Surv(death_age, d.event) ~ 1, 
                   data = data)


age_fit %>%
  ggsurvfit() +
  add_confidence_interval(type = "ribbon") +
  add_quantile()

fit.haz.km<-kphaz.fit(data$death_age,
                      data$d.event , 
                      method = "product-limit")
fit.haz.sm<-muhaz(data$death_age, data$d.event )
plot(fit.haz.sm)
lines(fit.haz.sm, col=2)

data$educcat<-car::Recode(data$EDUC, recodes="100:116 ='No HS' ; 200:504='HS or higher' ; else=NA", as.factor=T)

data$educcat<-relevel(data$educcat, ref='No HS') 


data$povertycat<-car::Recode(data$POVERTY, recodes="10:14='Ratio less than 1'; 20:38='Ratio greater than 1'; else=NA", as.factor=T)


data$sex<-as.factor(ifelse(data$SEX==1, "Male", "Female"))


data <- data%>%
filter(complete.cases(.))

Exponential Model

This model suggests men are 3 times more likely to experience mortality compared to female and adults with higher than HS degree are are lower risk of mortality compared to adults with less than HS degree. Finally, adults with family income: poverty threshold ratio less than 1 are at higher risk of mortality.

fit.1<-phreg(Surv(death_age, d.event)~sex+educcat+povertycat*POVERTY,
             data=data,
             dist="weibull",
             shape = 1)

summary(fit.1)
Single term deletions

Model:
Surv(death_age, d.event) ~ sex + educcat + povertycat * POVERTY
                   Df  AIC  LRT Pr(>Chi)    
<none>                4577                  
sex                 1 4647 72.7   <2e-16 ***
educcat             1 4578  3.8    0.052 .  
povertycat:POVERTY  1 4575  0.0    0.852    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Covariate             Mean       Coef     Rel.Risk   S.E.    Wald p
sex 
          Female      0.481     0         1 (reference)
            Male      0.519     1.145     3.144     0.146   0.0000 
educcat 
           No HS      0.455     0         1 (reference)
    HS or higher      0.545    -0.239     0.788     0.123   0.0530 
povertycat 
Ratio greater th      0.757     0         1 (reference)
Ratio less than       0.243    -0.020     0.981     1.501   0.9896 
POVERTY              26.319    -0.024     0.976     0.012   0.0420 
povertycat:POVERTY 
   Ratio less than 1:        -0.023     0.978     0.122    0.8521 

Events                    270 
Total time at risk        545198 
Max. log. likelihood      -2282.3 
LR test statistic         85.13 
Degrees of freedom        5 
Overall p-value           1.11022e-16
plot(fit.1)
lines(fit.haz.sm, col=2)

Weibull Model

This model also suggests adult men higher risk (3.22) to experience mortality compared to female) and adults with higher than HS degree are are lower risk of mortality (0.85) compared to adults with less than HS degree. Finally, adults with family income: poverty threshold ratio less than 1 are at higher risk of mortality (2.439).

fit.2<-phreg(Surv(death_age, d.event)~sex+educcat+povertycat*POVERTY,
             data=data,
             dist="weibull")

summary(fit.2)
Single term deletions

Model:
Surv(death_age, d.event) ~ sex + educcat + povertycat * POVERTY
                   Df  AIC  LRT Pr(>Chi)    
<none>                4078                  
sex                 1 4152 76.1   <2e-16 ***
educcat             1 4077  1.7     0.19    
povertycat:POVERTY  1 4076  0.2     0.67    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Covariate             Mean       Coef     Rel.Risk   S.E.    Wald p
sex 
          Female      0.481     0         1 (reference)
            Male      0.519     1.171     3.226     0.146   0.0000 
educcat 
           No HS      0.455     0         1 (reference)
    HS or higher      0.545    -0.161     0.851     0.124   0.1934 
povertycat 
Ratio greater th      0.757     0         1 (reference)
Ratio less than       0.243    -0.891     0.410     1.587   0.5743 
POVERTY              26.319    -0.024     0.976     0.012   0.0439 
povertycat:POVERTY 
   Ratio less than 1:         0.055     1.057     0.129    0.6702 

Events                    270 
Total time at risk        545198 
Max. log. likelihood      -2031.8 
LR test statistic         85.93 
Degrees of freedom        5 
Overall p-value           0
plot(fit.2, fn = "haz")
lines(fit.haz.sm, col=2)

AFT Model

fit.1.aft<-survreg(Surv(death_age, d.event)~sex+educcat+povertycat*POVERTY,
                   data=data,
                   dist = "exponential" )

summary(fit.1.aft)

Call:
survreg(formula = Surv(death_age, d.event) ~ sex + educcat + 
    povertycat * POVERTY, data = data, dist = "exponential")
                                      Value Std. Error     z       p
(Intercept)                          7.5453     0.3788 19.92 < 2e-16
sexMale                             -1.1455     0.1462 -7.84 4.6e-15
educcatHS or higher                  0.2387     0.1234  1.93   0.053
povertycatRatio less than 1          0.0196     1.5005  0.01   0.990
POVERTY                              0.0241     0.0118  2.03   0.042
povertycatRatio less than 1:POVERTY  0.0227     0.1219  0.19   0.852

Scale fixed at 1 

Exponential distribution
Loglik(model)= -2282.3   Loglik(intercept only)= -2324.8
    Chisq= 85.13 on 5 degrees of freedom, p= 7.1e-17 
Number of Newton-Raphson Iterations: 8 
n= 18642 

Log-normal model

fit.3<-phreg(Surv(death_age, d.event)~sex+educcat+povertycat*POVERTY,
             data=data,
             dist="lognormal")
fail in [dsytrf]; info = 8
Warning in phreg(Surv(death_age, d.event) ~ sex + educcat + povertycat * :
Failed with error code 8
summary(fit.3)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      1       1       1       1       1       1 

Log-logistic model

fit.4<-phreg(Surv(death_age, d.event)~sex+educcat+povertycat*POVERTY,
             data=data,
             dist="loglogistic")
summary(fit.4)
Warning in sqrt(varcoef): NaNs produced
Warning in sqrt(varhaz): NaNs produced
Single term deletions

Model:
Surv(death_age, d.event) ~ sex + educcat + povertycat * POVERTY
                   Df  AIC  LRT Pr(>Chi)    
<none>                4080                  
sex                 1 4142 64.0  1.3e-15 ***
educcat             1 4068 -9.8     1.00    
povertycat:POVERTY  1 4078  0.2     0.67    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Covariate             Mean       Coef     Rel.Risk   S.E.    Wald p
sex 
          Female      0.481     0         1 (reference)
            Male      0.519    15.599 5953118.388       NaN      NaN 
educcat 
           No HS      0.455     0         1 (reference)
    HS or higher      0.545     1.171     3.226     0.146   0.0000 
povertycat 
Ratio greater th      0.757     0         1 (reference)
Ratio less than       0.243    -0.161     0.851     0.124   0.1934 
POVERTY              26.319    -0.891     0.410     1.587   0.5743 
povertycat:POVERTY 
                             -0.024     0.976     0.012    0.0439 

Events                    270 
Total time at risk        545198 
Max. log. likelihood      -2031.8 
LR test statistic         85.92 
Degrees of freedom        5 
Overall p-value           0
plot(fit.4, fn="haz", xlim = c(0, 50) , ylim = c(0, 1) ) 
lines(fit.haz.sm, col=2)

Fit of model

Based on the AIC values obtained below fit4 (log-logistic) model is a best fit.

AIC(fit.1)
[1] 4576.53
AIC(fit.2)
[1] 4077.564
##AIC(fit.3)**
AIC(fit.4)
[1] 4075.564

Piece wise model (exponential)

This model also suggests adult men higher risk (3.6) to experience mortality compared to female and adults with higher than HS degree are are lower risk of mortality (0.785) compared to adults with less than HS degree. Finally, adults with family income: poverty threshold ratio less than 1 are at higher risk of mortality (1.049). Piece Wise model had lowest AIC value so, it is the most suitable model to explain the data.

fit.5<-pchreg(Surv(death_age, d.event)~sex+educcat+povertycat*POVERTY,
             data=data,
             cuts=seq(1, 40, 5))

summary(fit.5)
Single term deletions

Model:
Surv(death_age, d.event) ~ sex + educcat + povertycat * POVERTY
                   Df  AIC  LRT Pr(>Chi)    
<none>                3805                  
sex                 1 3884 81.0   <2e-16 ***
educcat             1 3807  3.6    0.059 .  
povertycat:POVERTY  1 3804  0.1    0.713    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Covariate             Mean       Coef     Rel.Risk   S.E.    Wald p
sex 
          Female      0.481     0         1 (reference)
            Male      0.519     1.281     3.600     0.158   0.0000 
educcat 
           No HS      0.455     0         1 (reference)
    HS or higher      0.545    -0.243     0.785     0.129   0.0604 
povertycat 
Ratio greater th      0.757     0         1 (reference)
Ratio less than       0.243    -0.832     0.435     1.610   0.6054 
POVERTY              26.319    -0.026     0.975     0.012   0.0389 
povertycat:POVERTY 
   Ratio less than 1:         0.048     1.049     0.131    0.7122 

Events                    270 
Total time at risk        545198 
Max. log. likelihood      -1890.7 
LR test statistic         94.35 
Degrees of freedom        5 
Overall p-value           0

Restricted mean survival:  34.81202 in (1, 36] 
plot(fit.5, fn="haz")
lines(fit.haz.sm, col=2)

fit5<--2*fit.5$loglik[2]+length(fit.5$coefficients)
fit5
[1] 3786.366

Graphical checks on the model fit

emp<-coxreg(Surv(death_age, d.event)~sex+educcat+povertycat*POVERTY,
            data=data)

check.dist(sp=emp,pp=fit.1, main = "Empirical vs. Exponential")

check.dist(sp=emp,pp=fit.2, main = "Empirical vs. Weibull")

check.dist(sp=emp,pp=fit.4, main = "Empirical vs. Log-Logistic")

check.dist(sp=emp,pp=fit.5, main = "Empirical vs. PCH")

In the above analysis, I I chose PCH model because it fits better to empirical data compared to other models.