The data and numerical results presented here are illustrations of the methods proposed in Eletti et al. (2021). They should not be used to make epidemiological claims.
Excess hazard modeling is one of the main tools in population-based cancer survival research. In this setting, owing to the absence of reliable information on the cause of death, the total or overall hazard is separated into two components: the excess hazard due to the cancer of interest, and the population hazard due to all other causes of death. We propose a unifying link-based additive modeling framework for the excess hazard that allows for the inclusion of many types of covariate effects, including spatial and time-dependent effects, using any type of smoother, such as thin plate regression splines, cubic regression splines, tensor products and Markov random fields (Eletti et al. 2021). In addition, this framework accounts for all types of censoring as well as left-truncation. Estimation is carried out using an efficient and stable penalized-likelihood algorithm whose empirical performance was evaluated through extensive simulation studies. The proposed approach is available in the R package GJRM.
We refer the reader to Rubio et al. (2019) for a review of excess hazard regression models and the assumptions behind the relative survival framework.
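In this framework, the overall hazard for a patient with covariates \(\mathbf{x}\), diagnosed at age \(a\), is decomposed as
\[
h(t; \mathbf{x}) = h_P(a + t; \mathbf{z}) + h_E(t; \mathbf{x}), \qquad S_E(t; \mathbf{x}) = \exp\left\{-\int_0^t h_E(u; \mathbf{x})\,\mathrm{d}u\right\},
\]
where \(t\) is the time since diagnosis, \(h_P\) is the population hazard looked up in the life tables using the demographic variables \(\mathbf{z}\) (age, sex, deprivation, calendar year), \(h_E\) is the excess hazard being modeled, and \(S_E\) is the net survival estimated below. (This is the standard notation of the relative survival literature, not copied verbatim from the papers.)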
The data set used here is based on the LeukSurv data set from the R package spBayesSurv (Zhou, Hanson, and Zhang 2020). It contains information on \(n = 1043\) patients with acute myeloid leukemia: the survival time (recorded in days, converted to years below), vital status at the end of follow-up (0 - right-censored, 1 - dead), age (in years), sex (0 - female, 1 - male), white blood cell count at diagnosis (wbc, truncated at 500), the Townsend score (tpi, higher values indicate less affluent areas), and district of residence (district), which provides the spatial information. Background mortality rates were obtained for each cancer patient from national life tables for England for the calendar year 2010, stratified by age, sex, and deprivation (based on the quintiles of the Townsend score). Note, therefore, that the background mortality rates are not the correct ones for this cohort, and this example should be taken only as an illustration of the proposed methods. A proper analysis would require life tables for the corresponding years based on the Townsend score, which is now obsolete.
#############################################################
#
# GJRM spatial model fitting
# Data: Leukemia data set
#############################################################
rm(list=ls())
# Load packages:
library(GJRM)          # model fitting: gamlss(), polys.setup(), polys.map(), hazsurv.plot()
library(xtable)
library(knitr)         # kable()
library(survival)
library(spBayesSurv)   # LeukSurv data and the nwengland.bnd map
library(BayesX)        # read.bnd()
library(R2BayesX)      # bnd2sp()
library(sp)            # SpatialPolygonsDataFrame
library(fields)
library(surveillance)
library(ranger)
library(ggplot2)
library(dplyr)
library(ggfortify)
#------------------------------------------------------------------------
# Life tables
#------------------------------------------------------------------------
LT <- as.data.frame(read.table("England_LT_2010_2015_dep_gor.txt", header = TRUE))
head(LT)
## X_year sex dep age rate gor
## 1 2010 1 9 0 0.0047365 10
## 2 2011 1 9 0 0.0046188 10
## 3 2012 1 9 0 0.0046188 10
## 4 2013 1 9 0 0.0046188 10
## 5 2014 1 9 0 0.0046188 10
## 6 2015 1 9 0 0.0046188 10
# Recode sex from 1/2 (life-table coding) to 0/1, matching the LeukSurv coding
LT$sex <- LT$sex - 1
#----------------------------------------------------------------------------------------------
# Leukemia data
#---------------------------------------------------------------------------------------------
data(LeukSurv)
?LeukSurv
head(LeukSurv)
## time cens xcoord ycoord age sex wbc tpi district
## 1 1 1 0.2050717 0.4972437 61 0 13.3 -1.96 9
## 2 1 1 0.2855568 0.8489526 76 0 450.0 -3.39 7
## 3 1 1 0.1764057 0.7364939 74 0 154.0 -4.95 7
## 4 1 1 0.2447630 0.2105843 79 1 500.0 -1.40 24
## 5 1 1 0.3274531 0.9073870 83 1 160.0 -2.59 7
## 6 1 1 0.6383682 0.3627343 81 1 30.4 0.03 11
dim(LeukSurv)
## [1] 1043 9
# Sample size
n <- nrow(LeukSurv)
# Age
age <- as.numeric(LeukSurv$age)
# Sex
sex <- as.numeric(LeukSurv$sex)
# white blood cell count at diagnosis
wbc <- as.numeric(LeukSurv$wbc)
# Townsend score
tpi <- as.numeric(LeukSurv$tpi)
# Survival time in years (recorded in days in LeukSurv)
time <- LeukSurv$time/365.24
# Age at the end of follow-up, rounded to whole years for the life-table lookup
age.out <- round(age + time)
# Creating deprivation quintiles based on the tpi
quintiles <- quantile(LeukSurv$tpi, c(0.2, 0.4, 0.6, 0.8))
dep0 <- vector()
for (i in 1:n) {
  if (LeukSurv$tpi[i] <= quintiles[1]) dep0[i] <- 1
  if (quintiles[1] < LeukSurv$tpi[i] & LeukSurv$tpi[i] <= quintiles[2]) dep0[i] <- 2
  if (quintiles[2] < LeukSurv$tpi[i] & LeukSurv$tpi[i] <= quintiles[3]) dep0[i] <- 3
  if (quintiles[3] < LeukSurv$tpi[i] & LeukSurv$tpi[i] <= quintiles[4]) dep0[i] <- 4
  if (quintiles[4] < LeukSurv$tpi[i]) dep0[i] <- 5
}
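# Equivalently, the quintile groups can be built in one line with cut(); this
# sketch (an addition to the original script) reproduces the loop exactly,
# since cut() uses right-closed intervals by default, matching the <= comparisons:
# dep0 <- cut(LeukSurv$tpi, breaks = c(-Inf, quintiles, Inf), labels = FALSE)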
# Population hazard rates
# gor = 2 for all patients as this seems to correspond to North West England
# https://www.ons.gov.uk/methodology/geography/ukgeographies/administrativegeography/england
MAT.ID <- as.data.frame(cbind(1:n, rep(2010, n), age.out, sex, dep0, rep(2, n)))
colnames(MAT.ID) <- c("index", "X_year", "age", "sex", "dep", "gor")
hrates <- merge(x = MAT.ID, y = LT, all.x = TRUE,
                by = c("X_year", "age", "sex", "dep", "gor"), sort = FALSE)
hrates <- hrates[order(hrates$index), ]
pop.rates <- hrates$rate
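# Because merge(..., all.x = TRUE) returns NA for any patient whose
# (year, age, sex, dep, gor) combination is absent from the life table,
# a quick sanity check (an addition to the original script) is advisable:
stopifnot(!anyNA(pop.rates))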
# Create data frame for model fitting
df <- data.frame(time = time, age = age,
                 hrate = pop.rates, wbc = wbc, tpi = tpi, sex = sex,
                 status = as.logical(LeukSurv$cens),
                 district = as.factor(LeukSurv$district))
# Select the background mortality rate only for uncensored observations (this is our convention)
hrate.select <- df$hrate[df$status == 1]
# Scale age at diagnosis
df$agec <- as.numeric(scale(df$age))
# Scale white blood cell count at diagnosis
df$wbcc <- as.numeric(scale(df$wbc))
# Scale the Townsend score
df$tpic <- as.numeric(scale(df$tpi))
# Set-up the polygons -----------------------------------------------------------------------------------------------------
# Map object
nwengland <- read.bnd(system.file("otherdata/nwengland.bnd",
                                  package = "spBayesSurv"))
## Note: map consists of 29 polygons
## Note: map consists of 24 regions
## Reading map ... finished
nwsp <- bnd2sp(nwengland)
nwspd <- SpatialPolygonsDataFrame(Sr = nwsp,
                                  data = data.frame(NAME_0 = paste0("X", 1:length(nwsp))))
class(nwspd)
## [1] "SpatialPolygonsDataFrame"
## attr(,"package")
## [1] "sp"
poldistrict <- polys.setup(nwspd)
# Save the polygons
final.poldistrict <- poldistrict$polys
# The polygon list is passed to the MRF smoother through the xt argument:
xt <- list(polys = final.poldistrict)
#################
# Model fitting
#################
eq_GJRM <- list(time ~ s(log(time), bs = "mpi") + s(agec, bs = 'cr') +
                  ti(log(time), agec) + sex + wbcc + tpic +
                  s(district, bs = 'mrf', xt = xt))
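In this formula, s(log(time), bs = "mpi") is a monotonically increasing P-spline of log time, which guarantees a valid (non-increasing) survival function; s(agec, bs = 'cr') is a cubic regression spline of standardized age; ti(log(time), agec) is a tensor-product interaction that lets the age effect vary over time (a time-dependent effect); s(district, bs = 'mrf', xt = xt) is a Markov random field smooth over the districts defined by the polygons in xt; and sex, wbcc and tpic enter as linear effects.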
# Fitting three different hazard structures and retaining the model with the smallest AIC:
start_time <- Sys.time()
# Generalised proportional hazards model
out_GJRM_1 <- gamlss(eq_GJRM, data = df, surv = TRUE, margin = 'PH',
                     cens = status, type.cens = "R", hrate = hrate.select)
end_time <- Sys.time()
run_time <- end_time - start_time
run_time
## Time difference of 16.14725 secs
# Generalised proportional odds model
out_GJRM_2 <- gamlss(eq_GJRM, data = df, surv = TRUE, margin = 'PO',
                     cens = status, type.cens = "R", hrate = hrate.select)
# Generalised probit model
out_GJRM_3 <- gamlss(eq_GJRM, data = df, surv = TRUE, margin = 'probit',
                     cens = status, type.cens = "R", hrate = hrate.select)
best <- which.min(c(AIC(out_GJRM_1), AIC(out_GJRM_2), AIC(out_GJRM_3)))
if(best == 1) out_GJRM <- out_GJRM_1
if(best == 2) out_GJRM <- out_GJRM_2
if(best == 3) out_GJRM <- out_GJRM_3
conv.check(out_GJRM)
##
## Largest absolute gradient value: 0.2152802
## Observed information matrix is positive definite
## Eigenvalue range: [0.234049,475905.9]
##
## Trust region iterations before smoothing parameter estimation: 15
## Loops for smoothing parameter estimation: 5
## Trust region iterations within smoothing loops: 80
## Estimated overall probability range: 0.02820429 0.9970063
## Estimated overall density range: 0.002984713 0.25
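A largest absolute gradient close to zero together with a positive-definite observed information matrix indicates that the algorithm has reached a (local) maximum of the penalized log-likelihood, so the fit can be taken forward.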
best
## [1] 2
c(AIC(out_GJRM_1), AIC(out_GJRM_2), AIC(out_GJRM_3))
## [1] 1382.532 1364.361 1366.792
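The generalised proportional odds model attains the smallest AIC and is therefore retained. Its estimated parametric and smooth components can then be inspected; the sketch below assumes, as with other GJRM model objects, that plot() accepts an eq argument and forwards the remaining arguments to mgcv's smooth-term plotting:
# Summary of the retained model
summary(out_GJRM)
# Smooth effects (baseline, age, tensor interaction, spatial MRF)
plot(out_GJRM, eq = 1, pages = 1, scale = 0, seWithMean = TRUE)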
# Estimate net survival for each district at times t = 1, 3, 5 years
# (point estimates only; set intervals = TRUE for confidence intervals)
net.surv <- matrix(0, ncol = 3, nrow = 24)
# t.range = c(1, 5) with ls = 3 evaluates the three equally spaced times 1, 3, 5
times <- c(1, 5)
for (i in 1:24) {
  df.temp <- df[which(df$district == i), ]
  net.surv[i, ] <- hazsurv.plot(out_GJRM, type = 'surv', t.range = times, ls = 3,
                                newdata = df.temp, intervals = FALSE,
                                n.sim = 1000, plot.out = FALSE)$s
}
# One-year net survival by district
nsd1 <- cbind(1:24, net.surv[, 1])
colnames(nsd1) <- c("District", "1-year Net Survival")
kable(nsd1, digits = 3)
| District | 1-year Net Survival |
|---|---|
| 1 | 0.424 |
| 2 | 0.328 |
| 3 | 0.344 |
| 4 | 0.355 |
| 5 | 0.330 |
| 6 | 0.439 |
| 7 | 0.278 |
| 8 | 0.331 |
| 9 | 0.407 |
| 10 | 0.473 |
| 11 | 0.432 |
| 12 | 0.500 |
| 13 | 0.435 |
| 14 | 0.322 |
| 15 | 0.413 |
| 16 | 0.407 |
| 17 | 0.319 |
| 18 | 0.405 |
| 19 | 0.367 |
| 20 | 0.349 |
| 21 | 0.438 |
| 22 | 0.446 |
| 23 | 0.388 |
| 24 | 0.328 |
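One-year net survival ranges from 0.278 (district 7) to 0.500 (district 12), a geographical spread that the maps below display on a common scale.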
# One-year net survival, LeukSurv
polys.map(final.poldistrict, net.surv[, 1], zlim = c(0, 0.51), las = 1, lab = "",
          cex.main = 1.2, rev.col = FALSE, main = "", scheme = "heat")
# Three-year net survival, LeukSurv
polys.map(final.poldistrict, net.surv[, 2], zlim = c(0, 0.51), las = 1, lab = "",
          cex.main = 1.2, rev.col = FALSE, main = "", scheme = "heat")
# Five-year net survival, LeukSurv
polys.map(final.poldistrict, net.surv[, 3], zlim = c(0, 0.51), las = 1, lab = "",
          cex.main = 1.2, rev.col = FALSE, main = "", scheme = "heat")
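All three maps use the same colour scale (zlim = c(0, 0.51)), so net survival at 1, 3 and 5 years can be compared directly across districts.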