The artificial data provided here and the numerical results represent numerical illustrations of the methods proposed in Eletti et al. (2021). These data and results should not be used to make epidemiological claims.
Excess hazard modeling is one of the main tools in population-based cancer survival research. In this setting, due to the absence of reliable information on the cause of death, the total or overall hazard is separated into two components: the excess hazard due to the cancer of interest, and the population hazard due to all other causes of death. We propose a unifying link-based additive modeling framework for the excess hazard that allows for the inclusion of many types of covariate effects, including spatial and time-dependent effects, using any type of smoother, such as thin plate, cubic splines, tensor products and Markov random fields (Eletti et al. 2021). In addition, this framework accounts for all types of censoring as well as left-truncation. Estimation is conducted by using an efficient and stable penalized likelihood-based algorithm whose empirical performance is evaluated through extensive simulation studies. The proposed approach is available in the R package GJRM
.
We refer the reader to Rubio et al. (2019) for a review of excess hazard regression models and the assumptions behind the relative survival framework.
The data set used here is based on The Simulacrum data set, which contains artificial patient-like cancer data to help researchers gain insights. We consider the cohort of female lung cancer patients diagnosed in 2013, with follow-up until 2019. Background mortality rates were obtained for each cancer patient from population life tables for England defined for each calendar year in 2010-2015, and stratified by single year, age, sex, and deprivation category.
# Required packages
library(survival)
library(ranger)
library(ggplot2)
library(dplyr)
library(ggfortify)
# Load the simulated data set
load('data_lung.Rda')
dim(df)
## [1] 16828 14
# A snipet of the data frame
head(df)
## time year dep dep.1 dep.2 dep.3 dep.4 dep.5 age age.out sex status
## 1 5.776865000 2013 2 0 1 0 0 0 18 23.77687 1 0
## 2 0.084873371 2013 2 0 1 0 0 0 20 20.08487 1 1
## 3 0.388774810 2013 1 1 0 0 0 0 20 20.38877 1 1
## 4 5.771389500 2013 4 0 0 0 1 0 20 25.77139 1 0
## 5 0.002737851 2013 5 0 0 0 0 1 21 21.00274 1 1
## 6 5.264886900 2013 2 0 1 0 0 0 22 27.26489 1 0
## hrate agec
## 1 0.0001792260 -5.094741
## 2 0.0001898829 -4.912291
## 3 0.0001898829 -4.912291
## 4 0.0001898829 -4.912291
## 5 0.0001944859 -4.821066
## 6 0.0002008885 -4.729842
# Histogram of the survival times
hist(df$time, breaks = 30, probability = TRUE, xlab = "Time", main = "Simulated data")
box()
# Summaries
# Deprivation level
table(df$dep)
##
## 1 2 3 4 5
## 2340 3043 3431 3747 4267
# Age at diagnosis
summary(df$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.00 67.00 74.00 73.85 82.00 99.00
# Proportion of observed times to event
mean(df$status)
## [1] 0.7747801
# Kaplan Meier estimators for each deprivation level
km_dep_fit <- survfit(Surv(time, status) ~ dep, data=df)
autoplot(km_dep_fit, conf.int = FALSE, xlab = "Time", ylab = "Survival")
#############################################################
#
# GJRM model fitting
# Data: Women diagnosed with lung cancer, England
# The Simulacrum data set
#
#############################################################
# Load packages:
library(GJRM)
library(knitr)
# Select background mortality rate only for uncensored observations (this is our convention)
hrate.select = df$hrate[df$status == 1]
#################
# Model fitting
#################
# Using only model equation:
eq_GJRM = list(time ~ s(log(time), bs = "mpi") + s(agec, bs='cr') + ti(log(time), agec) +
dep.2 + dep.3 + dep.4 + dep.5)
# Fitting three different hazard structures and retaining the best model based on smallest AIC:
# Generalised proportional hazards model
out_GJRM_1 = gamlss(eq_GJRM, data = df, surv = TRUE, margin = 'PH',
cens = status, type.cens = "R", hrate = hrate.select)
# Generalised proportional odds model
out_GJRM_2 = gamlss(eq_GJRM, data = df, surv = TRUE, margin = 'PO',
cens = status, type.cens = "R", hrate = hrate.select)
# Generalised probit model
out_GJRM_3 = gamlss(eq_GJRM, data = df, surv = TRUE, margin = 'probit',
cens = status, type.cens = "R", hrate = hrate.select)
best <- which.min(c(AIC(out_GJRM_1), AIC(out_GJRM_2), AIC(out_GJRM_3)))
if(best == 1) out_GJRM <- out_GJRM_1
if(best == 2) out_GJRM <- out_GJRM_2
if(best == 3) out_GJRM <- out_GJRM_3
# Best model
best
## [1] 3
# AIC
c(AIC(out_GJRM_1), AIC(out_GJRM_2), AIC(out_GJRM_3))
## [1] 37672.15 37588.75 37570.98
# Estimate net survival (and confidence intervals) for the whole cohort
# at times t = 1, 3 , 5 (years):
net.surv <- hazsurv.plot(out_GJRM, type = 'surv', t.range = c(1,5), newdata = df, ls = 3,
intervals = TRUE, n.sim = 1000, plot.out = FALSE)
net.surv$s
## [,1]
## 1 0.4835268
## 2 0.3282661
## 3 0.3009963
net.surv$CIs
## 2.5% 97.5%
## 1 0.4762803 0.4902375
## 2 0.3211438 0.3345267
## 3 0.2935971 0.3079030
# Estimate net survival (and confidence intervals) for deprivation level 1 (least deprived patients)
# at times t = 1, 3 , 5 (years):
# Define sub-population index for deprivation level 1
pop.indices.dep1 <- (df$dep.1 == 1)
# Create sub-population data using previous index
data.dep1 <- df[pop.indices.dep1, ]
# Estimate net survival
net.surv.dep1 <- hazsurv.plot(out_GJRM, type = 'surv', newdata = data.dep1, t.range = c(1,5),
ls = 3, intervals = TRUE, n.sim = 1000, plot.out = FALSE)
net.surv.dep1$s
## [,1]
## 1 0.5116463
## 2 0.3549088
## 3 0.3270999
net.surv.dep1$CIs
## 2.5% 97.5%
## 1 0.4957776 0.5263289
## 2 0.3400652 0.3692478
## 3 0.3125600 0.3408104
# Estimate net survival (and confidence intervals) for deprivation level 5 (most deprived patients)
# at times t = 1, 3 , 5 (years):
# Define sub-population index for deprivation level 5
pop.indices.dep5 <- (df$dep.5 == 1)
# Create sub-population data using previous index
data.dep5 <- df[pop.indices.dep5, ]
# Estimate net survival
net.surv.dep5 <- hazsurv.plot(out_GJRM, type = 'surv', newdata = data.dep5, t.range = c(1,5),
ls = 3, intervals = TRUE, n.sim = 1000, plot.out = FALSE)
net.surv.dep5$s
## [,1]
## 1 0.4607089
## 2 0.3072413
## 3 0.2805275
net.surv.dep5$CIs
## 2.5% 97.5%
## 1 0.4486894 0.4716148
## 2 0.2960779 0.3177889
## 3 0.2696788 0.2907953
# Export results to a table (use library(xtable) to export to LaTeX)
MNSG <- cbind(c(1,3,5), net.surv$s, net.surv$CIs[,1], net.surv$CIs[,2],
net.surv.dep1$s, net.surv.dep1$CIs[,1], net.surv.dep1$CIs[,2],
net.surv.dep5$s, net.surv.dep5$CIs[,1], net.surv.dep5$CIs[,2])
colnames(MNSG) <- c('F-up (yrs)','NS pop','95% LL', '95% UL',
'NS dep 1','95% LL', '95% UL',
'NS dep 5','95% LL', '95% UL')
kable(MNSG, digits = 3)
F-up (yrs) | NS pop | 95% LL | 95% UL | NS dep 1 | 95% LL | 95% UL | NS dep 5 | 95% LL | 95% UL |
---|---|---|---|---|---|---|---|---|---|
1 | 0.484 | 0.476 | 0.490 | 0.512 | 0.496 | 0.526 | 0.461 | 0.449 | 0.472 |
3 | 0.328 | 0.321 | 0.335 | 0.355 | 0.340 | 0.369 | 0.307 | 0.296 | 0.318 |
5 | 0.301 | 0.294 | 0.308 | 0.327 | 0.313 | 0.341 | 0.281 | 0.270 | 0.291 |
# Plot of net survival by deprivation (least deprived versus most deprived):
net.surv.dep1.plot <- hazsurv.plot(out_GJRM, type = 'surv', newdata = data.dep1, t.range = c(0,5),
ls = 100, intervals = TRUE, n.sim = 1000, plot.out = TRUE)
net.surv.dep5.plot <- hazsurv.plot(out_GJRM, type = 'surv', newdata = data.dep5, t.range = c(0,5),
ls = 100, intervals = TRUE, n.sim = 1000, plot.out = TRUE)
# Comparison
plot(seq(0,5, length.out = 100), net.surv.dep1.plot$s, las=1, type = 'l', lwd = 1.5, ylim = c(0,1),
ylab = 'Net survival', xlab = 'Follow-up time (years)', cex.axis=1.25, cex.lab=1.25)
polygon(c(seq(0,5, length.out = 100),rev(seq(0,5, length.out = 100))),
c(net.surv.dep1.plot$CIs[,2],rev(net.surv.dep1.plot$CIs[,1])),col="grey", border=NA)
polygon(c(seq(0,5, length.out = 100),rev(seq(0,5, length.out = 100))),
c(net.surv.dep5.plot$CIs[,2],rev(net.surv.dep5.plot$CIs[,1])),col="grey", border=NA)
lines(seq(0,5, length.out = 100), net.surv.dep1.plot$s, lwd = 1.5, lty = 1)
lines(seq(0,5, length.out = 100), net.surv.dep5.plot$s, lwd = 1.5, lty = 2)
legend('topright', legend = c('Least deprived', 'Most deprived'), lwd = 1.2, lty = c(1,5))