gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 2905173 155.2 5599308 299.1 NA 4006350 214.0
## Vcells 4875103 37.2 10146329 77.5 16384 6992917 53.4
library(ipumsr)
##
## Attaching package: 'ipumsr'
## The following objects are masked from 'package:sjlabelled':
##
## as_factor, zap_labels
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 2909141 155.4 5599308 299.1 NA 4006350 214.0
## Vcells 4879379 37.3 10146329 77.5 16384 6992917 53.4
setwd("~/Downloads")
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 2909137 155.4 5599308 299.1 NA 4006350 214.0
## Vcells 4879536 37.3 10146329 77.5 16384 6992917 53.4
ddi <- read_ipums_ddi("nhis_00008.xml")
data <- read_ipums_micro(ddi)
## Use of data from IPUMS NHIS is subject to conditions including that users
## should cite the data appropriately. Use command `ipums_conditions()` for more
## details.
data<- haven::zap_labels(data)
data <- data %>%
filter(MORTELIG == 1)
data <- data %>%
filter(AGE ==18)
data <- data%>%
filter(complete.cases(.))
data <- data %>%
mutate(death_age = ifelse( MORTSTAT ==1,
MORTDODY - (YEAR - AGE),
2018 - (YEAR - AGE)),
d.event = ifelse(MORTSTAT == 1, 1, 0))
library(survival)
age_fit <- survfit(Surv(death_age, d.event) ~ 1,
data = data)
library(ggsurvfit)
age_fit %>%
ggsurvfit() +
add_confidence_interval(type = "ribbon") +
add_quantile()
library(muhaz)
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 3155916 168.6 8957226 478.4 NA 7074552 377.9
## Vcells 10936104 83.5 72084264 550.0 16384 90105330 687.5
#since these functions don't work with durations of 0, we add a very small amount to the intervals
fit.haz.km<-kphaz.fit(data$death_age,
data$d.event ,
method = "product-limit")
#this is a version of the hazard that is smoothed using a kernel-density method
fit.haz.sm<-muhaz(data$death_age, data$d.event )
library(eha)
# Recoding the variables
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 3163857 169.0 8957226 478.4 NA 7074552 377.9
## Vcells 10989400 83.9 57667412 440.0 16384 90105330 687.5
# BMI
data$bmicat2<-car::Recode(data$BMICAT, recodes="1:2 ='Underweight/Normal weight ' ; 3:4='Overweight/Obesity' ; else=NA", as.factor=T)
data$bmicat2<-relevel(data$bmicat2, ref='Overweight/Obesity')
#Poor or fair self rated health
data$badhealth<-car::Recode(data$HEALTH, recodes="4:5='Poor/Fair Health'; 1:3='Good Health'; else=NA", as.factor=T)
#sex
data$sex2<-as.factor(ifelse(data$SEX==1, "Male", "Female"))
data <- data%>%
filter(complete.cases(.))
#exponential distribution for hazard, here we hard code it to be
#a weibull dist with shape ==1
fit.11<-phreg(Surv(death_age, d.event)~sex2+bmicat2*BMI+badhealth*HEALTH,
data=data,
dist="weibull",
shape = 1)
summary(fit.11)
## Single term deletions
##
## Model:
## Surv(death_age, d.event) ~ sex2 + bmicat2 * BMI + badhealth *
## HEALTH
## Df AIC LRT Pr(>Chi)
## <none> 972
## sex2 1 1000 29.74 5e-08 ***
## bmicat2:BMI 1 970 0.00 0.97
## badhealth:HEALTH 1 970 0.01 0.93
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Covariate Mean Coef Rel.Risk S.E. Wald p
## sex2
## Female 0.504 0 1 (reference)
## Male 0.496 1.702 5.486 0.365 0.0000
## bmicat2
## Overweight/Obesi 0.361 0 1 (reference)
## Underweight/Norm 0.639 0.131 1.140 2.244 0.9535
## BMI 24.491 0.017 1.017 0.042 0.6885
## badhealth
## Good Health 0.962 0 1 (reference)
## Poor/Fair Health 0.038 1.113 3.044 4.509 0.8050
## HEALTH 1.801 -0.040 0.961 0.191 0.8343
## bmicat2:BMI
## Underweight/Normal wei 0.004 1.004 0.094 0.9693
## badhealth:HEALTH
## Poor/Fair Health: 0.096 1.101 1.098 0.9304
##
## Events 56
## Total time at risk 147539
## Max. log. likelihood -478.17
## LR test statistic 37.82
## Degrees of freedom 7
## Overall p-value 3.27279e-06
#exponential distribution for hazard, here we hard code it to be
#a weibull dist with shape ==1
fit.1<-phreg(Surv(death_age, d.event)~sex2+bmicat2*BMI+badhealth*HEALTH,
data=data,
dist="weibull",
shape = 1)
summary(fit.1)
## Single term deletions
##
## Model:
## Surv(death_age, d.event) ~ sex2 + bmicat2 * BMI + badhealth *
## HEALTH
## Df AIC LRT Pr(>Chi)
## <none> 972
## sex2 1 1000 29.74 5e-08 ***
## bmicat2:BMI 1 970 0.00 0.97
## badhealth:HEALTH 1 970 0.01 0.93
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Covariate Mean Coef Rel.Risk S.E. Wald p
## sex2
## Female 0.504 0 1 (reference)
## Male 0.496 1.702 5.486 0.365 0.0000
## bmicat2
## Overweight/Obesi 0.361 0 1 (reference)
## Underweight/Norm 0.639 0.131 1.140 2.244 0.9535
## BMI 24.491 0.017 1.017 0.042 0.6885
## badhealth
## Good Health 0.962 0 1 (reference)
## Poor/Fair Health 0.038 1.113 3.044 4.509 0.8050
## HEALTH 1.801 -0.040 0.961 0.191 0.8343
## bmicat2:BMI
## Underweight/Normal wei 0.004 1.004 0.094 0.9693
## badhealth:HEALTH
## Poor/Fair Health: 0.096 1.101 1.098 0.9304
##
## Events 56
## Total time at risk 147539
## Max. log. likelihood -478.17
## LR test statistic 37.82
## Degrees of freedom 7
## Overall p-value 3.27279e-06
plot(fit.1)
lines(fit.haz.sm, col=2)
#AFT Model specification
fit.1.aft<-survreg(Surv(death_age, d.event)~sex2+bmicat2*BMI+badhealth*HEALTH,
data=data,
dist = "exponential" )
summary(fit.1.aft)
##
## Call:
## survreg(formula = Surv(death_age, d.event) ~ sex2 + bmicat2 *
## BMI + badhealth * HEALTH, data = data, dist = "exponential")
## Value Std. Error z p
## (Intercept) 9.62740 1.34349 7.17 7.7e-13
## sex2Male -1.70222 0.36534 -4.66 3.2e-06
## bmicat2Underweight/Normal weight -0.13096 2.24407 -0.06 0.95
## BMI -0.01677 0.04184 -0.40 0.69
## badhealthPoor/Fair Health -1.11318 4.50913 -0.25 0.81
## HEALTH 0.03994 0.19097 0.21 0.83
## bmicat2Underweight/Normal weight :BMI -0.00362 0.09422 -0.04 0.97
## badhealthPoor/Fair Health:HEALTH -0.09589 1.09792 -0.09 0.93
##
## Scale fixed at 1
##
## Exponential distribution
## Loglik(model)= -478.2 Loglik(intercept only)= -497.1
## Chisq= 37.82 on 7 degrees of freedom, p= 3.3e-06
## Number of Newton-Raphson Iterations: 9
## n= 5545
#Weibull model
#weibull distribution for hazard
fit.2<-phreg(Surv(death_age, d.event)~sex2+bmicat2*BMI+badhealth*HEALTH,
data=data,
dist="weibull")
summary(fit.2)
## Single term deletions
##
## Model:
## Surv(death_age, d.event) ~ sex2 + bmicat2 * BMI + badhealth *
## HEALTH
## Df AIC LRT Pr(>Chi)
## <none> 864
## sex2 1 893 30.99 2.6e-08 ***
## bmicat2:BMI 1 862 0.02 0.89
## badhealth:HEALTH 1 862 0.00 0.97
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Covariate Mean Coef Rel.Risk S.E. Wald p
## sex2
## Female 0.504 0 1 (reference)
## Male 0.496 1.737 5.679 0.366 0.0000
## bmicat2
## Overweight/Obesi 0.361 0 1 (reference)
## Underweight/Norm 0.639 0.493 1.638 2.215 0.8238
## BMI 24.491 0.020 1.020 0.039 0.6172
## badhealth
## Good Health 0.962 0 1 (reference)
## Poor/Fair Health 0.038 1.341 3.822 4.520 0.7668
## HEALTH 1.801 -0.051 0.951 0.191 0.7908
## bmicat2:BMI
## Underweight/Normal wei -0.014 0.987 0.094 0.8853
## badhealth:HEALTH
## Poor/Fair Health: 0.044 1.045 1.100 0.9681
##
## Events 56
## Total time at risk 147539
## Max. log. likelihood -422.83
## LR test statistic 39.56
## Degrees of freedom 7
## Overall p-value 1.529e-06
plot(fit.2, fn = "haz")
lines(fit.haz.sm, col=2)
#re-scaled beta's
(betaHat <- -coef(fit.1.aft)[-1] / fit.1.aft$scale)
## sex2Male bmicat2Underweight/Normal weight
## 1.702221896 0.130959502
## BMI badhealthPoor/Fair Health
## 0.016774398 1.113178737
## HEALTH bmicat2Underweight/Normal weight :BMI
## -0.039943628 0.003624976
## badhealthPoor/Fair Health:HEALTH
## 0.095889780
#beta's from the PH model
coef(fit.1)
## sex2Male bmicat2Underweight/Normal weight
## 1.702221896 0.130959502
## BMI badhealthPoor/Fair Health
## 0.016774398 1.113178737
## HEALTH bmicat2Underweight/Normal weight :BMI
## -0.039943628 0.003624976
## badhealthPoor/Fair Health:HEALTH log(scale)
## 0.095889780 9.627399943
#log-normal distribution for hazard
fit.3<-phreg(Surv(death_age, d.event)~sex2+bmicat2*BMI+badhealth*HEALTH,
data=data,
dist="lognormal")
## fail in [dsytrf]; info = 10
## Warning in phreg(Surv(death_age, d.event) ~ sex2 + bmicat2 * BMI + badhealth * :
## Failed with error code 10
summary(fit.3)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 1 1 1 1 1
#plot the hazard from the log normal vs the empirical hazard
#plot(fit.3)
#lines(fit.haz.sm, col=2)
#Log logistic model
#log-normal distribution for hazard
fit.4<-phreg(Surv(death_age, d.event)~sex2+bmicat2*BMI+badhealth*HEALTH,
data=data,
dist="loglogistic")
summary(fit.4)
## Warning in sqrt(varcoef): NaNs produced
## Warning in sqrt(varhaz): NaNs produced
## Single term deletions
##
## Model:
## Surv(death_age, d.event) ~ sex2 + bmicat2 * BMI + badhealth *
## HEALTH
## Df AIC LRT Pr(>Chi)
## <none> 866
## sex2 1 895 30.99 2.6e-08 ***
## bmicat2:BMI 1 864 0.02 0.89
## badhealth:HEALTH 1 864 0.00 0.97
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Covariate Mean Coef Rel.Risk S.E. Wald p
## sex2
## Female 0.504 0 1 (reference)
## Male 0.496 18.036 68035368.612 NaN NaN
## bmicat2
## Overweight/Obesi 0.361 0 1 (reference)
## Underweight/Norm 0.639 1.737 5.679 0.366 0.0000
## BMI 24.491 0.493 1.638 2.215 0.8238
## badhealth
## Good Health 0.962 0 1 (reference)
## Poor/Fair Health 0.038 0.020 1.020 0.039 0.6172
## HEALTH 1.801 1.341 3.822 4.520 0.7668
## bmicat2:BMI
## HEALTH -0.051 0.951 0.191 0.7908
## badhealth:HEALTH
## bmicat2Underweight/Nor -0.014 0.987 0.094 0.8853
##
## Events 56
## Total time at risk 147539
## Max. log. likelihood -422.83
## LR test statistic 39.56
## Degrees of freedom 7
## Overall p-value 1.52991e-06
#plot the hazard from the log normal vs the empirical hazard
plot(fit.4, fn="haz", xlim = c(0, 60) , ylim = c(0, 60) )
lines(fit.haz.sm, col=2)
AIC(fit.1)
## [1] 972.3436
AIC(fit.2)
## [1] 863.6526
#AIC(fit.3)
AIC(fit.4)
## [1] 861.6526
log-logistic model best fits the data, based on the minimum AIC criteria
#Piecewise constant exponential model
# here I must supply the times for the "pieces" where I expect the hazard to be constant
fit.5<-pchreg(Surv(death_age, d.event)~sex2+bmicat2*BMI+badhealth*HEALTH,
data=data,
cuts=seq(1, 30, 5))
summary(fit.5)
## Single term deletions
##
## Model:
## Surv(death_age, d.event) ~ sex2 + bmicat2 * BMI + badhealth *
## HEALTH
## Df AIC LRT Pr(>Chi)
## <none> 578
## sex2 1 597 21.79 3e-06 ***
## bmicat2:BMI 1 577 1.35 0.25
## badhealth:HEALTH 1 576 0.15 0.70
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Covariate Mean Coef Rel.Risk S.E. Wald p
## sex2
## Female 0.504 0 1 (reference)
## Male 0.496 1.870 6.491 0.484 0.0001
## bmicat2
## Overweight/Obesi 0.361 0 1 (reference)
## Underweight/Norm 0.639 -2.755 0.064 3.042 0.3650
## BMI 24.491 -0.010 0.990 0.063 0.8778
## badhealth
## Good Health 0.962 0 1 (reference)
## Poor/Fair Health 0.038 -0.996 0.369 4.968 0.8411
## HEALTH 1.801 0.153 1.165 0.227 0.5008
## bmicat2:BMI
## Underweight/Normal wei 0.140 1.151 0.124 0.2581
## badhealth:HEALTH
## Poor/Fair Health: 0.480 1.616 1.180 0.6841
##
## Events 56
## Total time at risk 147539
## Max. log. likelihood -276.81
## LR test statistic 29.00
## Degrees of freedom 7
## Overall p-value 0.000144793
##
## Restricted mean survival: 24.9927 in (1, 26]
plot(fit.5, fn="haz")
lines(fit.haz.sm, col=2)
AIC(fit.4)
## [1] 861.6526
-2*fit.5$loglik[2]+length(fit.5$coefficients) #have to construct this ourselves
## [1] 560.6175
#Graphical checks on the model fit
emp<-coxreg(Surv(death_age, d.event)~sex2+bmicat2*BMI+badhealth*HEALTH,
data=data)
check.dist(sp=emp,pp=fit.1, main = "Empirical vs. Exponential")
check.dist(sp=emp,pp=fit.2, main = "Empirical vs. Weibull")
#check.dist(sp=emp,pp=fit.3, main = "Empirical vs. Log-Normal")
check.dist(sp=emp,pp=fit.4, main = "Empirical vs. Log-Logistic")
check.dist(sp=emp,pp=fit.5, main = "Empirical vs. PCH")
I chose the PCH model because it fits well with the empirical data compared to other models. Also the Weibull is not doing bad