Disclaimer

The artificial data provided here and the numerical results represent numerical illustrations of the methods proposed in Rubio et al. (2019) . These data and results should not be used to make epidemiological claims.

The excess hazard regression model

We refer the reader to Rubio et al. (2019) for a review of excess hazard regression models and the assumptions behind the relative survival framework. The model is formulated in terms of the excess hazard function: \[ h_E(t; {\bf x}, \alpha, \beta, \theta) = h_0(t \exp\{ {\tilde{\bf x}}^{\top}\alpha\}; \theta) \exp\{{\bf x}^{\top}\beta\}, \] where \(h_0(;\theta)\) is a parametric baseline hazard. This model contains the Proportional hazards (\(\alpha=0\)), accelerated hazards (\(\beta=0\)), and accelerated failure time (\(\alpha = \beta\), \(\tilde{\bf x} = {\bf x}\)) models as particular cases.

The simulated data set

The data set used here is based on the Simulacrum data set, which contains artificial patient-like cancer data to help researchers gain insights. We consider the cohort of female lung cancer patients diagnosed in 2013, with follow-up until 2019. Background mortality rates were obtained for each cancer patient from population life tables for England defined for each calendar year in 2010-2015, and stratified by single year, age, sex, and deprivation category.

We consider the time dependent effect of the variable age (\(\tilde{\bf x}\), scaled), and the hazard-level effects (\({\bf x}\)) of age (scaled) and deprivation (categorical I-V).

Data Preparation

# Required packages
# library(devtools)
#install_github("FJRubio67/GHSurv")
library(GHSurv)
library(knitr)
library(numDeriv)

# Load the simulated data set
load('data_lung.Rda')

dim(df)

## [1] 16828    14

# A snipet of the data frame
head(df)

##          time year dep dep.1 dep.2 dep.3 dep.4 dep.5 age  age.out sex status
## 1 5.776865000 2013   2     0     1     0     0     0  18 23.77687   1      0
## 2 0.084873371 2013   2     0     1     0     0     0  20 20.08487   1      1
## 3 0.388774810 2013   1     1     0     0     0     0  20 20.38877   1      1
## 4 5.771389500 2013   4     0     0     0     1     0  20 25.77139   1      0
## 5 0.002737851 2013   5     0     0     0     0     1  21 21.00274   1      1
## 6 5.264886900 2013   2     0     1     0     0     0  22 27.26489   1      0
##          hrate      agec
## 1 0.0001792260 -5.094741
## 2 0.0001898829 -4.912291
## 3 0.0001898829 -4.912291
## 4 0.0001898829 -4.912291
## 5 0.0001944859 -4.821066
## 6 0.0002008885 -4.729842

# Summaries

# Deprivation level
table(df$dep)

## 
##    1    2    3    4    5 
## 2340 3043 3431 3747 4267

# Age at diagnosis
summary(df$age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   67.00   74.00   73.85   82.00   99.00

# Proportion of observed times to event
mean(df$status)

## [1] 0.7747801

#******************
# Data preparation
#******************

n.ind <- dim(df)[1]  # number of individuals
# Design matrices
X <- as.matrix(cbind( df$agec, df$dep.2, df$dep.3, df$dep.4, df$dep.5 ))
colnames(X) <- cbind("std age", "dep.2", "dep.3", "dep.4", "dep.5")
Xt <- as.matrix(df$agec)
q <- dim(Xt)[2]
p <- dim(X)[2]

# Histogram of survival times
hist(df$time, breaks = 50, probability = T, 
     xlab = "Time (years)", ylab = "Density", main = "Simulacrum Lung Cancer (F)")
box()

# Required quantities
status <- as.vector(df$status)
times <- as.vector(df$time) # in years
n.obs <- sum(status)
# Population hazard rates for the uncensored individuals
hp.obs <- df$hrate[as.logical(status)]

Model fitting using the `GHSurv` package

The GHSurv R package implements the GH model with lognormal (LNGEH), log-logistic (LLGEH), Gamma (GGEH), Power Generalised Weibull (PGWGEH) , Exponentiated Weibull (EWGEH), and Generalised Gamma (GGGEH) baseline hazards. Maximum likelihood estimates of the parameters can be obtained using the R command GEHMLE in this package, while confidence intervals of the parameters can be obtained using the command Conf.Int, as shown below.

#################
# Model fitting
#################
# Optimisation step

# LN baseline
OPT1 <- GEHMLE(init = rep(0,2+p+q), times = times, status = status, hstr = "LNGEH", 
              des = X, des_t = Xt, hp.obs = hp.obs, method = "nlminb", maxit = 10000)
# PGW baseline
OPT2 <- GEHMLE(init = c(0,0,0.5,rep(0,p+q)), times = times, status = status, hstr = "PGWGEH", 
              des = X, des_t = Xt, hp.obs = hp.obs, method = "nlminb", maxit = 10000)
# GG baseline
OPT3 <- GEHMLE(init = rep(0,2+p+q), times = times, status = status, hstr = "GGEH", 
              des = X, des_t = Xt, hp.obs = hp.obs, method = "nlminb", maxit = 10000)
# LL baseline
OPT4 <- GEHMLE(init = rep(0,2+p+q), times = times, status = status, hstr = "LLGEH", 
              des = X, des_t = Xt, hp.obs = hp.obs, method = "nlminb", maxit = 10000)

# Comparing models 
AIC1 <- 2*OPT1$OPT$objective + 2*length(OPT1$OPT$par)
AIC2 <- 2*OPT2$OPT$objective + 2*length(OPT2$OPT$par)
AIC3 <- 2*OPT3$OPT$objective + 2*length(OPT3$OPT$par)
AIC4 <- 2*OPT4$OPT$objective + 2*length(OPT4$OPT$par)

AICs <- c(AIC1,AIC2,AIC3,AIC4)
which.min(AICs)

## [1] 2

Best model summaries

# MLE: Best model is the one with PGW baseline hazard
MLE <- c(exp(OPT2$OPT$par[1:3]),OPT2$OPT$par[-c(1:3)])

# Confidence intervals under reparameterisation
CF <- Conf.Int(FUN = OPT2$loglik,MLE = OPT2$OPT$par,level=0.95,index= NULL)
kable( CF, digits = 3)

	Lower	Upper	Transf MLE	Std. Error
par1	-2.378	-2.213	-2.295	0.042
par2	0.112	0.184	0.148	0.018
par3	1.734	1.854	1.794	0.031
par4	0.303	0.477	0.390	0.044
par5	-0.084	0.013	-0.036	0.025
par6	0.100	0.237	0.169	0.035
par7	0.044	0.179	0.111	0.034
par8	0.080	0.212	0.146	0.034
par9	0.158	0.286	0.222	0.033

# Confidence intervals under original parameterisation
CFO <- CF[,-4]
CFO[1:3,1:3] <- exp(CFO[1:3,1:3])
colnames(CFO) <- c("Lower", "Upper", "MLE")
kable( CFO, digits = 3)

	Lower	Upper	MLE
par1	0.093	0.109	0.101
par2	1.118	1.202	1.159
par3	5.664	6.387	6.015
par4	0.303	0.477	0.390
par5	-0.084	0.013	-0.036
par6	0.100	0.237	0.169
par7	0.044	0.179	0.111
par8	0.080	0.212	0.146
par9	0.158	0.286	0.222

Net survival by deprivation

######################################################
# Net survival function
# Deprivation I vs Deprivation V vs All
######################################################

NS_PGWGH <- function(t,par,des_t,des){
  des_t <- as.matrix(des_t); des <- as.matrix(des)
  p <- ncol(des)
  pt <- ncol(des_t)
  # Classical model
  exp.mle0 <- as.vector(exp( des_t%*%par[4:(3+pt)] ))
  exp.mledif0 <- as.vector(exp( des%*%par[(4+pt):(3+pt+p)]  -des_t%*%par[4:(3+pt)] ))
  out <- mean(exp(-chpgw(t*exp.mle0,par[1],par[2],par[3])*exp.mledif0))  
  return(out)
}

# Deprivation indicators
ind.I <- which(df$dep.1 == 1)
ind.V <- which(df$dep.5 == 1)

# Population Net survival
NS0 <- Vectorize(function(t) NS_PGWGH(t, MLE, Xt, X)  )
# Net survival for deprivation I group
NS1 <- Vectorize(function(t) NS_PGWGH(t, MLE, Xt[ind.I,], X[ind.I,])  )
# Net survival for deprivation V group
NS5 <- Vectorize(function(t) NS_PGWGH(t, MLE, Xt[ind.V,], X[ind.V,])  )

# Comparison
curve(NS1, 0, 6, ylim = c(0,1), lwd = 2, main = "Net survival by Deprivation", 
      xlab = "Time(Years)", ylab = "Net Survival", cex.axis = 1.5, cex.lab = 1.5)
curve(NS5, 0, 6, add = T, lty = 2, lwd = 2)
curve(NS0, 0, 6, add = T, lty = 3, lwd = 2)
legend("topright", col = c("black","black","black"), lty = c(1,2,3), legend = c("Dep-I", "Dep-V", "All"), lwd = c(2,2,2))

References

Rubio, F. J., L. Remontet, N. P. Jewell, and A. Belot. 2019. “On a General Structure for Hazard-Based Regression Models: An Application to Population-Based Cancer Research.” Statistical Methods in Medical Research 28: 2404–17.

Simulacrum data: lung cancer and Net Survival GH model

F. Javier Rubio and Aurelien Belot

September 28, 2021

Disclaimer

The excess hazard regression model

The simulated data set

Data Preparation

Model fitting using the `GHSurv` package

Best model summaries

Net survival by deprivation

References

Simulacrum data: lung cancer and Net Survival GH model

F. Javier Rubio and Aurelien Belot

September 28, 2021

Disclaimer

The excess hazard regression model

The simulated data set

Data Preparation

Model fitting using the GHSurv package

Best model summaries

Net survival by deprivation

References

Model fitting using the `GHSurv` package