Disclaimer

The artificial data provided here and the numerical results represent numerical illustrations of the methods proposed in Eletti et al. (2021). These data and results should not be used to make epidemiological claims.

The Simulacrum data set

The data set used here is based on The Simulacrum data set, which contains artificial patient-like cancer data to help researchers gain insights. We consider the cohort of female lung cancer patients diagnosed in 2013, with follow-up until 2019. Background mortality rates were obtained for each cancer patient from population life tables for England defined for each calendar year in 2010-2015, and stratified by single year, age, sex, and deprivation category.

Data Summary

# Required packages
library(survival)
library(ranger)
library(ggplot2)
library(dplyr)
library(ggfortify)

# Load the simulated data set
load('data_lung.Rda')

dim(df)
## [1] 16828    14
# A snipet of the data frame
head(df)
##          time year dep dep.1 dep.2 dep.3 dep.4 dep.5 age  age.out sex status
## 1 5.776865000 2013   2     0     1     0     0     0  18 23.77687   1      0
## 2 0.084873371 2013   2     0     1     0     0     0  20 20.08487   1      1
## 3 0.388774810 2013   1     1     0     0     0     0  20 20.38877   1      1
## 4 5.771389500 2013   4     0     0     0     1     0  20 25.77139   1      0
## 5 0.002737851 2013   5     0     0     0     0     1  21 21.00274   1      1
## 6 5.264886900 2013   2     0     1     0     0     0  22 27.26489   1      0
##          hrate      agec
## 1 0.0001792260 -5.094741
## 2 0.0001898829 -4.912291
## 3 0.0001898829 -4.912291
## 4 0.0001898829 -4.912291
## 5 0.0001944859 -4.821066
## 6 0.0002008885 -4.729842
# Histogram of the survival times
hist(df$time, breaks = 30, probability = TRUE, xlab = "Time", main = "Simulated data")
box()

# Summaries

# Deprivation level
table(df$dep)
## 
##    1    2    3    4    5 
## 2340 3043 3431 3747 4267
# Age at diagnosis
summary(df$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   67.00   74.00   73.85   82.00   99.00
# Proportion of observed times to event
mean(df$status)
## [1] 0.7747801
# Kaplan Meier estimators for each deprivation level
km_dep_fit <- survfit(Surv(time, status) ~ dep, data=df)
autoplot(km_dep_fit, conf.int = FALSE, xlab = "Time", ylab = "Survival")

Modelling using the R package GJRM

#############################################################
#
# GJRM model fitting 
# Data: Women diagnosed with lung cancer, England
# The Simulacrum data set
#
#############################################################


# Load packages:
library(GJRM)
library(knitr)

# Select background mortality rate only for uncensored observations (this is our convention)
hrate.select = df$hrate[df$status == 1]


#################
# Model fitting
#################

# Using only model equation:

eq_GJRM = list(time ~ s(log(time), bs = "mpi") + s(agec, bs='cr') + ti(log(time), agec) +
               dep.2 + dep.3 + dep.4 + dep.5)


# Fitting three different hazard structures and retaining the best model based on smallest AIC:

# Generalised proportional hazards model
out_GJRM_1 = gamlss(eq_GJRM, data = df, surv = TRUE, margin = 'PH',
                    cens = status, type.cens = "R", hrate = hrate.select)

# Generalised proportional odds model
out_GJRM_2 = gamlss(eq_GJRM, data = df, surv = TRUE, margin = 'PO',
                    cens = status, type.cens = "R", hrate = hrate.select)

# Generalised probit model
out_GJRM_3 = gamlss(eq_GJRM, data = df, surv = TRUE, margin = 'probit',
                    cens = status, type.cens = "R", hrate = hrate.select)

best <- which.min(c(AIC(out_GJRM_1), AIC(out_GJRM_2), AIC(out_GJRM_3)))

if(best == 1) out_GJRM <- out_GJRM_1
if(best == 2) out_GJRM <- out_GJRM_2
if(best == 3) out_GJRM <- out_GJRM_3

# Best model
best
## [1] 3
# AIC
c(AIC(out_GJRM_1), AIC(out_GJRM_2), AIC(out_GJRM_3))
## [1] 37672.15 37588.75 37570.98

Net survival estimation for the whole population at specific time points

# Estimate net survival (and confidence intervals) for the whole cohort 
# at times t = 1, 3 , 5 (years):

net.surv <- hazsurv.plot(out_GJRM, type = 'surv', t.range = c(1,5), newdata = df, ls = 3,
                         intervals = TRUE, n.sim = 1000, plot.out = FALSE)
net.surv$s
##        [,1]
## 1 0.4835268
## 2 0.3282661
## 3 0.3009963
net.surv$CIs
##        2.5%     97.5%
## 1 0.4762803 0.4902375
## 2 0.3211438 0.3345267
## 3 0.2935971 0.3079030

Net survival estimation for subgroups of the population (Least vs. Most deprived) with plots

# Estimate net survival (and confidence intervals) for deprivation level 1 (least deprived patients) 
# at times t = 1, 3 , 5 (years):

# Define sub-population index for deprivation level 1
pop.indices.dep1 <- (df$dep.1 == 1)

# Create sub-population data using previous index
data.dep1 <- df[pop.indices.dep1, ]

# Estimate net survival
net.surv.dep1 <- hazsurv.plot(out_GJRM, type = 'surv', newdata = data.dep1, t.range = c(1,5),
                              ls = 3, intervals = TRUE, n.sim = 1000, plot.out = FALSE)
net.surv.dep1$s
##        [,1]
## 1 0.5116463
## 2 0.3549088
## 3 0.3270999
net.surv.dep1$CIs
##        2.5%     97.5%
## 1 0.4957776 0.5263289
## 2 0.3400652 0.3692478
## 3 0.3125600 0.3408104
# Estimate net survival (and confidence intervals) for deprivation level 5 (most deprived patients) 
# at times t = 1, 3 , 5 (years):

# Define sub-population index for deprivation level 5
pop.indices.dep5 <- (df$dep.5 == 1)

# Create sub-population data using previous index
data.dep5 <- df[pop.indices.dep5, ]

# Estimate net survival
net.surv.dep5 <- hazsurv.plot(out_GJRM, type = 'surv', newdata = data.dep5, t.range = c(1,5),
                              ls = 3, intervals = TRUE, n.sim = 1000, plot.out = FALSE)
net.surv.dep5$s
##        [,1]
## 1 0.4607089
## 2 0.3072413
## 3 0.2805275
net.surv.dep5$CIs
##        2.5%     97.5%
## 1 0.4486894 0.4716148
## 2 0.2960779 0.3177889
## 3 0.2696788 0.2907953
# Export results to a table (use library(xtable) to export to LaTeX)
MNSG <- cbind(c(1,3,5), net.surv$s, net.surv$CIs[,1], net.surv$CIs[,2], 
              net.surv.dep1$s, net.surv.dep1$CIs[,1], net.surv.dep1$CIs[,2],
              net.surv.dep5$s, net.surv.dep5$CIs[,1], net.surv.dep5$CIs[,2])

colnames(MNSG) <- c('F-up (yrs)','NS pop','95% LL', '95% UL',
                    'NS dep 1','95% LL', '95% UL',
                    'NS dep 5','95% LL', '95% UL')
kable(MNSG, digits = 3)
F-up (yrs) NS pop 95% LL 95% UL NS dep 1 95% LL 95% UL NS dep 5 95% LL 95% UL
1 0.484 0.476 0.490 0.512 0.496 0.526 0.461 0.449 0.472
3 0.328 0.321 0.335 0.355 0.340 0.369 0.307 0.296 0.318
5 0.301 0.294 0.308 0.327 0.313 0.341 0.281 0.270 0.291
# Plot of net survival by deprivation (least deprived versus most deprived):


net.surv.dep1.plot <- hazsurv.plot(out_GJRM, type = 'surv', newdata = data.dep1, t.range = c(0,5),
                              ls = 100, intervals = TRUE, n.sim = 1000, plot.out = TRUE)

net.surv.dep5.plot <- hazsurv.plot(out_GJRM, type = 'surv', newdata = data.dep5, t.range = c(0,5),
                                   ls = 100, intervals = TRUE, n.sim = 1000, plot.out = TRUE)

# Comparison 
plot(seq(0,5, length.out = 100), net.surv.dep1.plot$s, las=1, type = 'l', lwd = 1.5, ylim = c(0,1),
     ylab = 'Net survival', xlab = 'Follow-up time (years)', cex.axis=1.25, cex.lab=1.25)

polygon(c(seq(0,5, length.out = 100),rev(seq(0,5, length.out = 100))), 
        c(net.surv.dep1.plot$CIs[,2],rev(net.surv.dep1.plot$CIs[,1])),col="grey", border=NA)

polygon(c(seq(0,5, length.out = 100),rev(seq(0,5, length.out = 100))),
        c(net.surv.dep5.plot$CIs[,2],rev(net.surv.dep5.plot$CIs[,1])),col="grey", border=NA)

lines(seq(0,5, length.out = 100), net.surv.dep1.plot$s, lwd = 1.5, lty = 1)
lines(seq(0,5, length.out = 100), net.surv.dep5.plot$s, lwd = 1.5, lty = 2)

legend('topright', legend = c('Least deprived', 'Most deprived'), lwd = 1.2, lty = c(1,5))

References

Eletti, A., G. Marra, M. Quaresma, R. Radice, and F. J. Rubio. 2021. “A Unifying Framework for Flexible Excess Hazard Modelling with Applications in Cancer Epidemiology.” Submitted, na–.
Rubio, F. J., L. Remontet, N. P. Jewell, and A. Belot. 2019. “On a General Structure for Hazard-Based Regression Models: An Application to Population-Based Cancer Research.” Statistical Methods in Medical Research 28: 2404–17.