The artificial data provided here and the numerical results represent numerical illustrations of the methods proposed in Rubio et al. (2019) . These data and results should not be used to make epidemiological claims.
We refer the reader to Rubio et al. (2019) for a review of excess hazard regression models and the assumptions behind the relative survival framework. The model is formulated in terms of the excess hazard function: \[ h_E(t; {\bf x}, \alpha, \beta, \theta) = h_0(t \exp\{ {\tilde{\bf x}}^{\top}\alpha\}; \theta) \exp\{{\bf x}^{\top}\beta\}, \] where \(h_0(;\theta)\) is a parametric baseline hazard. This model contains the Proportional hazards (\(\alpha=0\)), accelerated hazards (\(\beta=0\)), and accelerated failure time (\(\alpha = \beta\), \(\tilde{\bf x} = {\bf x}\)) models as particular cases.
The data set used here is based on the Simulacrum data set, which contains artificial patient-like cancer data to help researchers gain insights. We consider the cohort of female lung cancer patients diagnosed in 2013, with follow-up until 2019. Background mortality rates were obtained for each cancer patient from population life tables for England defined for each calendar year in 2010-2015, and stratified by single year, age, sex, and deprivation category.
We consider the time dependent effect of the variable age
(\(\tilde{\bf x}\), scaled), and the hazard-level effects (\({\bf x}\)) of age
(scaled) and deprivation
(categorical I-V).
See also: LBANS.
# Required packages
# library(devtools)
#install_github("FJRubio67/GHSurv")
library(GHSurv)
library(knitr)
library(numDeriv)
# Load the simulated data set
load('data_lung.Rda')
dim(df)
## [1] 16828 14
# A snipet of the data frame
head(df)
## time year dep dep.1 dep.2 dep.3 dep.4 dep.5 age age.out sex status
## 1 5.776865000 2013 2 0 1 0 0 0 18 23.77687 1 0
## 2 0.084873371 2013 2 0 1 0 0 0 20 20.08487 1 1
## 3 0.388774810 2013 1 1 0 0 0 0 20 20.38877 1 1
## 4 5.771389500 2013 4 0 0 0 1 0 20 25.77139 1 0
## 5 0.002737851 2013 5 0 0 0 0 1 21 21.00274 1 1
## 6 5.264886900 2013 2 0 1 0 0 0 22 27.26489 1 0
## hrate agec
## 1 0.0001792260 -5.094741
## 2 0.0001898829 -4.912291
## 3 0.0001898829 -4.912291
## 4 0.0001898829 -4.912291
## 5 0.0001944859 -4.821066
## 6 0.0002008885 -4.729842
# Summaries
# Deprivation level
table(df$dep)
##
## 1 2 3 4 5
## 2340 3043 3431 3747 4267
# Age at diagnosis
summary(df$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.00 67.00 74.00 73.85 82.00 99.00
# Proportion of observed times to event
mean(df$status)
## [1] 0.7747801
#******************
# Data preparation
#******************
n.ind <- dim(df)[1] # number of individuals
# Design matrices
X <- as.matrix(cbind( df$agec, df$dep.2, df$dep.3, df$dep.4, df$dep.5 ))
colnames(X) <- cbind("std age", "dep.2", "dep.3", "dep.4", "dep.5")
Xt <- as.matrix(df$agec)
q <- dim(Xt)[2]
p <- dim(X)[2]
# Histogram of survival times
hist(df$time, breaks = 50, probability = T,
xlab = "Time (years)", ylab = "Density", main = "Simulacrum Lung Cancer (F)")
box()
# Required quantities
status <- as.vector(df$status)
times <- as.vector(df$time) # in years
n.obs <- sum(status)
# Population hazard rates for the uncensored individuals
hp.obs <- df$hrate[as.logical(status)]
GHSurv
packageThe GHSurv
R package implements the GH model with lognormal (LNGEH), log-logistic (LLGEH), Gamma (GGEH), Power Generalised Weibull (PGWGEH) , Exponentiated Weibull (EWGEH), and Generalised Gamma (GGGEH) baseline hazards. Maximum likelihood estimates of the parameters can be obtained using the R command GEHMLE
in this package, while confidence intervals of the parameters can be obtained using the command Conf.Int
, as shown below.
#################
# Model fitting
#################
# Optimisation step
# LN baseline
OPT1 <- GEHMLE(init = rep(0,2+p+q), times = times, status = status, hstr = "LNGEH",
des = X, des_t = Xt, hp.obs = hp.obs, method = "nlminb", maxit = 10000)
# PGW baseline
OPT2 <- GEHMLE(init = c(0,0,0.5,rep(0,p+q)), times = times, status = status, hstr = "PGWGEH",
des = X, des_t = Xt, hp.obs = hp.obs, method = "nlminb", maxit = 10000)
# GG baseline
OPT3 <- GEHMLE(init = rep(0,2+p+q), times = times, status = status, hstr = "GGEH",
des = X, des_t = Xt, hp.obs = hp.obs, method = "nlminb", maxit = 10000)
# LL baseline
OPT4 <- GEHMLE(init = rep(0,2+p+q), times = times, status = status, hstr = "LLGEH",
des = X, des_t = Xt, hp.obs = hp.obs, method = "nlminb", maxit = 10000)
# Comparing models
AIC1 <- 2*OPT1$OPT$objective + 2*length(OPT1$OPT$par)
AIC2 <- 2*OPT2$OPT$objective + 2*length(OPT2$OPT$par)
AIC3 <- 2*OPT3$OPT$objective + 2*length(OPT3$OPT$par)
AIC4 <- 2*OPT4$OPT$objective + 2*length(OPT4$OPT$par)
AICs <- c(AIC1,AIC2,AIC3,AIC4)
which.min(AICs)
## [1] 2
# MLE: Best model is the one with PGW baseline hazard
MLE <- c(exp(OPT2$OPT$par[1:3]),OPT2$OPT$par[-c(1:3)])
# Confidence intervals under reparameterisation
CF <- Conf.Int(FUN = OPT2$loglik,MLE = OPT2$OPT$par,level=0.95,index= NULL)
kable( CF, digits = 3)
Lower | Upper | Transf MLE | Std. Error | |
---|---|---|---|---|
par1 | -2.378 | -2.213 | -2.295 | 0.042 |
par2 | 0.112 | 0.184 | 0.148 | 0.018 |
par3 | 1.734 | 1.854 | 1.794 | 0.031 |
par4 | 0.303 | 0.477 | 0.390 | 0.044 |
par5 | -0.084 | 0.013 | -0.036 | 0.025 |
par6 | 0.100 | 0.237 | 0.169 | 0.035 |
par7 | 0.044 | 0.179 | 0.111 | 0.034 |
par8 | 0.080 | 0.212 | 0.146 | 0.034 |
par9 | 0.158 | 0.286 | 0.222 | 0.033 |
# Confidence intervals under original parameterisation
CFO <- CF[,-4]
CFO[1:3,1:3] <- exp(CFO[1:3,1:3])
colnames(CFO) <- c("Lower", "Upper", "MLE")
kable( CFO, digits = 3)
Lower | Upper | MLE | |
---|---|---|---|
par1 | 0.093 | 0.109 | 0.101 |
par2 | 1.118 | 1.202 | 1.159 |
par3 | 5.664 | 6.387 | 6.015 |
par4 | 0.303 | 0.477 | 0.390 |
par5 | -0.084 | 0.013 | -0.036 |
par6 | 0.100 | 0.237 | 0.169 |
par7 | 0.044 | 0.179 | 0.111 |
par8 | 0.080 | 0.212 | 0.146 |
par9 | 0.158 | 0.286 | 0.222 |
######################################################
# Net survival function
# Deprivation I vs Deprivation V vs All
######################################################
NS_PGWGH <- function(t,par,des_t,des){
des_t <- as.matrix(des_t); des <- as.matrix(des)
p <- ncol(des)
pt <- ncol(des_t)
# Classical model
exp.mle0 <- as.vector(exp( des_t%*%par[4:(3+pt)] ))
exp.mledif0 <- as.vector(exp( des%*%par[(4+pt):(3+pt+p)] -des_t%*%par[4:(3+pt)] ))
out <- mean(exp(-chpgw(t*exp.mle0,par[1],par[2],par[3])*exp.mledif0))
return(out)
}
# Deprivation indicators
ind.I <- which(df$dep.1 == 1)
ind.V <- which(df$dep.5 == 1)
# Population Net survival
NS0 <- Vectorize(function(t) NS_PGWGH(t, MLE, Xt, X) )
# Net survival for deprivation I group
NS1 <- Vectorize(function(t) NS_PGWGH(t, MLE, Xt[ind.I,], X[ind.I,]) )
# Net survival for deprivation V group
NS5 <- Vectorize(function(t) NS_PGWGH(t, MLE, Xt[ind.V,], X[ind.V,]) )
# Comparison
curve(NS1, 0, 6, ylim = c(0,1), lwd = 2, main = "Net survival by Deprivation",
xlab = "Time(Years)", ylab = "Net Survival", cex.axis = 1.5, cex.lab = 1.5)
curve(NS5, 0, 6, add = T, lty = 2, lwd = 2)
curve(NS0, 0, 6, add = T, lty = 3, lwd = 2)
legend("topright", col = c("black","black","black"), lty = c(1,2,3), legend = c("Dep-I", "Dep-V", "All"), lwd = c(2,2,2))