Disclaimer

The artificial data provided here and the numerical results represent numerical illustrations of the methods proposed in Rubio et al. (2019) . These data and results should not be used to make epidemiological claims.

The excess hazard regression model

We refer the reader to Rubio et al. (2019) for a review of excess hazard regression models and the assumptions behind the relative survival framework. The model is formulated in terms of the excess hazard function: \[ h_E(t; {\bf x}, \alpha, \beta, \theta) = h_0(t \exp\{ {\tilde{\bf x}}^{\top}\alpha\}; \theta) \exp\{{\bf x}^{\top}\beta\}, \] where \(h_0(;\theta)\) is a parametric baseline hazard. This model contains the Proportional hazards (\(\alpha=0\)), accelerated hazards (\(\beta=0\)), and accelerated failure time (\(\alpha = \beta\), \(\tilde{\bf x} = {\bf x}\)) models as particular cases.

The simulated data set

The data set used here is based on the Simulacrum data set, which contains artificial patient-like cancer data to help researchers gain insights. We consider the cohort of female lung cancer patients diagnosed in 2013, with follow-up until 2019. Background mortality rates were obtained for each cancer patient from population life tables for England defined for each calendar year in 2010-2015, and stratified by single year, age, sex, and deprivation category.

We consider the time dependent effect of the variable age (\(\tilde{\bf x}\), scaled), and the hazard-level effects (\({\bf x}\)) of age (scaled) and deprivation (categorical I-V).

See also: LBANS.

Data Preparation

# Required packages
# library(devtools)
#install_github("FJRubio67/GHSurv")
library(GHSurv)
library(knitr)
library(numDeriv)

# Load the simulated data set
load('data_lung.Rda')

dim(df)
## [1] 16828    14
# A snipet of the data frame
head(df)
##          time year dep dep.1 dep.2 dep.3 dep.4 dep.5 age  age.out sex status
## 1 5.776865000 2013   2     0     1     0     0     0  18 23.77687   1      0
## 2 0.084873371 2013   2     0     1     0     0     0  20 20.08487   1      1
## 3 0.388774810 2013   1     1     0     0     0     0  20 20.38877   1      1
## 4 5.771389500 2013   4     0     0     0     1     0  20 25.77139   1      0
## 5 0.002737851 2013   5     0     0     0     0     1  21 21.00274   1      1
## 6 5.264886900 2013   2     0     1     0     0     0  22 27.26489   1      0
##          hrate      agec
## 1 0.0001792260 -5.094741
## 2 0.0001898829 -4.912291
## 3 0.0001898829 -4.912291
## 4 0.0001898829 -4.912291
## 5 0.0001944859 -4.821066
## 6 0.0002008885 -4.729842
# Summaries

# Deprivation level
table(df$dep)
## 
##    1    2    3    4    5 
## 2340 3043 3431 3747 4267
# Age at diagnosis
summary(df$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   67.00   74.00   73.85   82.00   99.00
# Proportion of observed times to event
mean(df$status)
## [1] 0.7747801
#******************
# Data preparation
#******************

n.ind <- dim(df)[1]  # number of individuals
# Design matrices
X <- as.matrix(cbind( df$agec, df$dep.2, df$dep.3, df$dep.4, df$dep.5 ))
colnames(X) <- cbind("std age", "dep.2", "dep.3", "dep.4", "dep.5")
Xt <- as.matrix(df$agec)
q <- dim(Xt)[2]
p <- dim(X)[2]

# Histogram of survival times
hist(df$time, breaks = 50, probability = T, 
     xlab = "Time (years)", ylab = "Density", main = "Simulacrum Lung Cancer (F)")
box()

# Required quantities
status <- as.vector(df$status)
times <- as.vector(df$time) # in years
n.obs <- sum(status)
# Population hazard rates for the uncensored individuals
hp.obs <- df$hrate[as.logical(status)]

Model fitting using the GHSurv package

The GHSurv R package implements the GH model with lognormal (LNGEH), log-logistic (LLGEH), Gamma (GGEH), Power Generalised Weibull (PGWGEH) , Exponentiated Weibull (EWGEH), and Generalised Gamma (GGGEH) baseline hazards. Maximum likelihood estimates of the parameters can be obtained using the R command GEHMLE in this package, while confidence intervals of the parameters can be obtained using the command Conf.Int, as shown below.

#################
# Model fitting
#################
# Optimisation step

# LN baseline
OPT1 <- GEHMLE(init = rep(0,2+p+q), times = times, status = status, hstr = "LNGEH", 
              des = X, des_t = Xt, hp.obs = hp.obs, method = "nlminb", maxit = 10000)
# PGW baseline
OPT2 <- GEHMLE(init = c(0,0,0.5,rep(0,p+q)), times = times, status = status, hstr = "PGWGEH", 
              des = X, des_t = Xt, hp.obs = hp.obs, method = "nlminb", maxit = 10000)
# GG baseline
OPT3 <- GEHMLE(init = rep(0,2+p+q), times = times, status = status, hstr = "GGEH", 
              des = X, des_t = Xt, hp.obs = hp.obs, method = "nlminb", maxit = 10000)
# LL baseline
OPT4 <- GEHMLE(init = rep(0,2+p+q), times = times, status = status, hstr = "LLGEH", 
              des = X, des_t = Xt, hp.obs = hp.obs, method = "nlminb", maxit = 10000)

# Comparing models 
AIC1 <- 2*OPT1$OPT$objective + 2*length(OPT1$OPT$par)
AIC2 <- 2*OPT2$OPT$objective + 2*length(OPT2$OPT$par)
AIC3 <- 2*OPT3$OPT$objective + 2*length(OPT3$OPT$par)
AIC4 <- 2*OPT4$OPT$objective + 2*length(OPT4$OPT$par)

AICs <- c(AIC1,AIC2,AIC3,AIC4)
which.min(AICs)
## [1] 2

Best model summaries

# MLE: Best model is the one with PGW baseline hazard
MLE <- c(exp(OPT2$OPT$par[1:3]),OPT2$OPT$par[-c(1:3)])

# Confidence intervals under reparameterisation
CF <- Conf.Int(FUN = OPT2$loglik,MLE = OPT2$OPT$par,level=0.95,index= NULL)
kable( CF, digits = 3)
Lower Upper Transf MLE Std. Error
par1 -2.378 -2.213 -2.295 0.042
par2 0.112 0.184 0.148 0.018
par3 1.734 1.854 1.794 0.031
par4 0.303 0.477 0.390 0.044
par5 -0.084 0.013 -0.036 0.025
par6 0.100 0.237 0.169 0.035
par7 0.044 0.179 0.111 0.034
par8 0.080 0.212 0.146 0.034
par9 0.158 0.286 0.222 0.033
# Confidence intervals under original parameterisation
CFO <- CF[,-4]
CFO[1:3,1:3] <- exp(CFO[1:3,1:3])
colnames(CFO) <- c("Lower", "Upper", "MLE")
kable( CFO, digits = 3)
Lower Upper MLE
par1 0.093 0.109 0.101
par2 1.118 1.202 1.159
par3 5.664 6.387 6.015
par4 0.303 0.477 0.390
par5 -0.084 0.013 -0.036
par6 0.100 0.237 0.169
par7 0.044 0.179 0.111
par8 0.080 0.212 0.146
par9 0.158 0.286 0.222

Net survival by deprivation

######################################################
# Net survival function
# Deprivation I vs Deprivation V vs All
######################################################

NS_PGWGH <- function(t,par,des_t,des){
  des_t <- as.matrix(des_t); des <- as.matrix(des)
  p <- ncol(des)
  pt <- ncol(des_t)
  # Classical model
  exp.mle0 <- as.vector(exp( des_t%*%par[4:(3+pt)] ))
  exp.mledif0 <- as.vector(exp( des%*%par[(4+pt):(3+pt+p)]  -des_t%*%par[4:(3+pt)] ))
  out <- mean(exp(-chpgw(t*exp.mle0,par[1],par[2],par[3])*exp.mledif0))  
  return(out)
}

# Deprivation indicators
ind.I <- which(df$dep.1 == 1)
ind.V <- which(df$dep.5 == 1)

# Population Net survival
NS0 <- Vectorize(function(t) NS_PGWGH(t, MLE, Xt, X)  )
# Net survival for deprivation I group
NS1 <- Vectorize(function(t) NS_PGWGH(t, MLE, Xt[ind.I,], X[ind.I,])  )
# Net survival for deprivation V group
NS5 <- Vectorize(function(t) NS_PGWGH(t, MLE, Xt[ind.V,], X[ind.V,])  )

# Comparison
curve(NS1, 0, 6, ylim = c(0,1), lwd = 2, main = "Net survival by Deprivation", 
      xlab = "Time(Years)", ylab = "Net Survival", cex.axis = 1.5, cex.lab = 1.5)
curve(NS5, 0, 6, add = T, lty = 2, lwd = 2)
curve(NS0, 0, 6, add = T, lty = 3, lwd = 2)
legend("topright", col = c("black","black","black"), lty = c(1,2,3), legend = c("Dep-I", "Dep-V", "All"), lwd = c(2,2,2))

References

Rubio, F. J., L. Remontet, N. P. Jewell, and A. Belot. 2019. β€œOn a General Structure for Hazard-Based Regression Models: An Application to Population-Based Cancer Research.” Statistical Methods in Medical Research 28: 2404–17.