Disclaimer

The artificial data provided here and the numerical results represent numerical illustrations of the methods proposed in Eletti et al. (2021). These data and results should not be used to make epidemiological claims.

The proposed link-based additive excess hazard model

Excess hazard modeling is one of the main tools in population-based cancer survival research. In this setting, due to the absence of reliable information on the cause of death, the total or overall hazard is separated into two components: the excess hazard due to the cancer of interest, and the population hazard due to all other causes of death. We propose a unifying link-based additive modeling framework for the excess hazard that allows for the inclusion of many types of covariate effects, including spatial and time-dependent effects, using any type of smoother, such as thin plate, cubic splines, tensor products and Markov random fields (Eletti et al. 2021). In addition, this framework accounts for all types of censoring as well as left-truncation. Estimation is conducted by using an efficient and stable penalized likelihood-based algorithm whose empirical performance is evaluated through extensive simulation studies. The proposed approach is available in the R package GJRM.

We refer the reader to Rubio et al. (2019) for a review of excess hazard regression models and the assumptions behind the relative survival framework.

The Simulacrum data set

The data set used here is based on The Simulacrum data set, which contains artificial patient-like cancer data to help researchers gain insights. We consider the cohort of female lung cancer patients diagnosed in 2013, with follow-up until 2019. Background mortality rates were obtained for each cancer patient from population life tables for England defined for each calendar year in 2010-2015, and stratified by single year, age, sex, and deprivation category.

Data Summary

# Required packages
library(survival)
library(ranger)
library(ggplot2)
library(dplyr)
library(ggfortify)

# Load the simulated data set
load('data_lung.Rda')

dim(df)

## [1] 16828    14

# A snipet of the data frame
head(df)

##          time year dep dep.1 dep.2 dep.3 dep.4 dep.5 age  age.out sex status
## 1 5.776865000 2013   2     0     1     0     0     0  18 23.77687   1      0
## 2 0.084873371 2013   2     0     1     0     0     0  20 20.08487   1      1
## 3 0.388774810 2013   1     1     0     0     0     0  20 20.38877   1      1
## 4 5.771389500 2013   4     0     0     0     1     0  20 25.77139   1      0
## 5 0.002737851 2013   5     0     0     0     0     1  21 21.00274   1      1
## 6 5.264886900 2013   2     0     1     0     0     0  22 27.26489   1      0
##          hrate      agec
## 1 0.0001792260 -5.094741
## 2 0.0001898829 -4.912291
## 3 0.0001898829 -4.912291
## 4 0.0001898829 -4.912291
## 5 0.0001944859 -4.821066
## 6 0.0002008885 -4.729842

# Histogram of the survival times
hist(df$time, breaks = 30, probability = TRUE, xlab = "Time", main = "Simulated data")
box()

# Summaries

# Deprivation level
table(df$dep)

## 
##    1    2    3    4    5 
## 2340 3043 3431 3747 4267

# Age at diagnosis
summary(df$age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   67.00   74.00   73.85   82.00   99.00

# Proportion of observed times to event
mean(df$status)

## [1] 0.7747801

# Kaplan Meier estimators for each deprivation level
km_dep_fit <- survfit(Surv(time, status) ~ dep, data=df)
autoplot(km_dep_fit, conf.int = FALSE, xlab = "Time", ylab = "Survival")

Modelling using the R package GJRM

#############################################################
#
# GJRM model fitting 
# Data: Women diagnosed with lung cancer, England
# The Simulacrum data set
#
#############################################################


# Load packages:
library(GJRM)
library(knitr)

# Select background mortality rate only for uncensored observations (this is our convention)
hrate.select = df$hrate[df$status == 1]


#################
# Model fitting
#################

# Using only model equation:

eq_GJRM = list(time ~ s(log(time), bs = "mpi") + s(agec, bs='cr') + ti(log(time), agec) +
               dep.2 + dep.3 + dep.4 + dep.5)


# Fitting three different hazard structures and retaining the best model based on smallest AIC:

# Generalised proportional hazards model
out_GJRM_1 = gamlss(eq_GJRM, data = df, surv = TRUE, margin = 'PH',
                    cens = status, type.cens = "R", hrate = hrate.select)

# Generalised proportional odds model
out_GJRM_2 = gamlss(eq_GJRM, data = df, surv = TRUE, margin = 'PO',
                    cens = status, type.cens = "R", hrate = hrate.select)

# Generalised probit model
out_GJRM_3 = gamlss(eq_GJRM, data = df, surv = TRUE, margin = 'probit',
                    cens = status, type.cens = "R", hrate = hrate.select)

best <- which.min(c(AIC(out_GJRM_1), AIC(out_GJRM_2), AIC(out_GJRM_3)))

if(best == 1) out_GJRM <- out_GJRM_1
if(best == 2) out_GJRM <- out_GJRM_2
if(best == 3) out_GJRM <- out_GJRM_3

# Best model
best

## [1] 3

# AIC
c(AIC(out_GJRM_1), AIC(out_GJRM_2), AIC(out_GJRM_3))

## [1] 37672.15 37588.75 37570.98

Net survival estimation for the whole population at specific time points

# Estimate net survival (and confidence intervals) for the whole cohort 
# at times t = 1, 3 , 5 (years):

net.surv <- hazsurv.plot(out_GJRM, type = 'surv', t.range = c(1,5), newdata = df, ls = 3,
                         intervals = TRUE, n.sim = 1000, plot.out = FALSE)

net.surv$s

##        [,1]
## 1 0.4835268
## 2 0.3282661
## 3 0.3009963

net.surv$CIs

##        2.5%     97.5%
## 1 0.4762803 0.4902375
## 2 0.3211438 0.3345267
## 3 0.2935971 0.3079030

Net survival estimation for subgroups of the population (Least vs. Most deprived) with plots

# Estimate net survival (and confidence intervals) for deprivation level 1 (least deprived patients) 
# at times t = 1, 3 , 5 (years):

# Define sub-population index for deprivation level 1
pop.indices.dep1 <- (df$dep.1 == 1)

# Create sub-population data using previous index
data.dep1 <- df[pop.indices.dep1, ]

# Estimate net survival
net.surv.dep1 <- hazsurv.plot(out_GJRM, type = 'surv', newdata = data.dep1, t.range = c(1,5),
                              ls = 3, intervals = TRUE, n.sim = 1000, plot.out = FALSE)

net.surv.dep1$s

##        [,1]
## 1 0.5116463
## 2 0.3549088
## 3 0.3270999

net.surv.dep1$CIs

##        2.5%     97.5%
## 1 0.4957776 0.5263289
## 2 0.3400652 0.3692478
## 3 0.3125600 0.3408104

# Estimate net survival (and confidence intervals) for deprivation level 5 (most deprived patients) 
# at times t = 1, 3 , 5 (years):

# Define sub-population index for deprivation level 5
pop.indices.dep5 <- (df$dep.5 == 1)

# Create sub-population data using previous index
data.dep5 <- df[pop.indices.dep5, ]

# Estimate net survival
net.surv.dep5 <- hazsurv.plot(out_GJRM, type = 'surv', newdata = data.dep5, t.range = c(1,5),
                              ls = 3, intervals = TRUE, n.sim = 1000, plot.out = FALSE)

net.surv.dep5$s

##        [,1]
## 1 0.4607089
## 2 0.3072413
## 3 0.2805275

net.surv.dep5$CIs

##        2.5%     97.5%
## 1 0.4486894 0.4716148
## 2 0.2960779 0.3177889
## 3 0.2696788 0.2907953

# Export results to a table (use library(xtable) to export to LaTeX)
MNSG <- cbind(c(1,3,5), net.surv$s, net.surv$CIs[,1], net.surv$CIs[,2], 
              net.surv.dep1$s, net.surv.dep1$CIs[,1], net.surv.dep1$CIs[,2],
              net.surv.dep5$s, net.surv.dep5$CIs[,1], net.surv.dep5$CIs[,2])

colnames(MNSG) <- c('F-up (yrs)','NS pop','95% LL', '95% UL',
                    'NS dep 1','95% LL', '95% UL',
                    'NS dep 5','95% LL', '95% UL')
kable(MNSG, digits = 3)

F-up (yrs)	NS pop	95% LL	95% UL	NS dep 1	95% LL	95% UL	NS dep 5	95% LL	95% UL
1	0.484	0.476	0.490	0.512	0.496	0.526	0.461	0.449	0.472
3	0.328	0.321	0.335	0.355	0.340	0.369	0.307	0.296	0.318
5	0.301	0.294	0.308	0.327	0.313	0.341	0.281	0.270	0.291

# Plot of net survival by deprivation (least deprived versus most deprived):


net.surv.dep1.plot <- hazsurv.plot(out_GJRM, type = 'surv', newdata = data.dep1, t.range = c(0,5),
                              ls = 100, intervals = TRUE, n.sim = 1000, plot.out = TRUE)

net.surv.dep5.plot <- hazsurv.plot(out_GJRM, type = 'surv', newdata = data.dep5, t.range = c(0,5),
                                   ls = 100, intervals = TRUE, n.sim = 1000, plot.out = TRUE)

# Comparison 
plot(seq(0,5, length.out = 100), net.surv.dep1.plot$s, las=1, type = 'l', lwd = 1.5, ylim = c(0,1),
     ylab = 'Net survival', xlab = 'Follow-up time (years)', cex.axis=1.25, cex.lab=1.25)

polygon(c(seq(0,5, length.out = 100),rev(seq(0,5, length.out = 100))), 
        c(net.surv.dep1.plot$CIs[,2],rev(net.surv.dep1.plot$CIs[,1])),col="grey", border=NA)

polygon(c(seq(0,5, length.out = 100),rev(seq(0,5, length.out = 100))),
        c(net.surv.dep5.plot$CIs[,2],rev(net.surv.dep5.plot$CIs[,1])),col="grey", border=NA)

lines(seq(0,5, length.out = 100), net.surv.dep1.plot$s, lwd = 1.5, lty = 1)
lines(seq(0,5, length.out = 100), net.surv.dep5.plot$s, lwd = 1.5, lty = 2)

legend('topright', legend = c('Least deprived', 'Most deprived'), lwd = 1.2, lty = c(1,5))

References

Eletti, A., G. Marra, M. Quaresma, R. Radice, and F. J. Rubio. 2021. “A Unifying Framework for Flexible Excess Hazard Modelling with Applications in Cancer Epidemiology.” Submitted, na–.

Rubio, F. J., L. Remontet, N. P. Jewell, and A. Belot. 2019. “On a General Structure for Hazard-Based Regression Models: An Application to Population-Based Cancer Research.” Statistical Methods in Medical Research 28: 2404–17.

A unifying framework for flexible excess hazard modelling with applications in cancer epidemiology: The Simulacrum

Alessia Eletti, Giampiero Marra, Manuela Quaresma, Rosalba Radice, and F. Javier Rubio

September 17, 2021