Consider the following joint model, which corresponds to the Simulation Scenario 2 in Alvares and Rubio (2021) (see also this GitHub Repository).
The longitudinal model: \[ y_i(t) = \tilde{\beta}_0 + \tilde{\beta}_1 t + \gamma_1 \left\{t \, \text{age}_i \right\} + \beta_1 \,\text{sex}_i + \beta_2 \, \text{age}_i + b_{0i} + b_{1i}t + \epsilon_i(t). \]
The survival process: \[ h(t \mid {\Psi}_{2i}) = h_{0}\left(t \exp\left\{ \alpha_{1}\left(\gamma_1 \, \text{age}_i +b_{1i} \right) \right\} \mid \theta\right) \exp\left\{ \tilde{\kappa}_1 \text{comorb}_i + \lambda_1 \,\text{sex}_i + \lambda_2 \, \text{age}_i +\alpha_0 b_{0i} \right\}. \]
#################################################################################
# Simulation of design matrices: Scenarios 1 and 2
#################################################################################
# seed : seed for simulation
simDes12 <- function(seed){
set.seed(seed)
# sex : Men ( sex = 0 ) - Women ( sex = 1)
sex <- ifelse(runif(n) < 0.5, 0, 1)
# Age (scaled and centred)
age <- c(runif(n*0.25, min = 30, max = 65), runif(n*0.35, min = 65, max = 75), runif(n*0.40, min = 75, max = 85))
age <- (age - 70)/10
comorb <- ifelse(runif(n) < 0.5, 0, 1)
X <- cbind(sex,age,comorb)
colnames(X) <- c("sex","age","comorb")
return(X)
}
For each individual \(i=1,\dots,n\), and for given values of the parameters and the design matrix:
Random Effects. Simulate \(b_i\sim N_{2}(0,\Sigma)\).
Survival Process. In order to simplify notation, let us denote: \[ A = \exp\left\{ {w}_i^{\top}{\kappa} + \alpha_1\left( \tilde{x}_i^{\top}\gamma + b_{1i} \right) \right\}, \] \[ B = \exp\left\{ \tilde{w}_i^{\top}\tilde{\kappa} + s_i^{\top}\lambda+ \alpha_0 b_{0i} - \left[ {w}_i^{\top}{\kappa} + \alpha_1\left( \tilde{x}_i^{\top}\gamma + b_{1i} \right) \right] \right\}. \] The individual survival function is \(S(t \mid \Psi_{2i}) = \exp\left[-H(t \mid \Psi_{2i})\right]\), we can apply the probability integral transform directly to obtain: \[ t_{i} = \dfrac{F_0^{-1}\left[ 1 -\exp\left\{ \dfrac{\log(1-u_{i})}{B} \right\} \mid \theta \right]}{A}, \] where \(F_0\) is the cumulative distribution function associated to the baseline hazard \(h_0\), and \(u_{i} \sim U(0,1)\).
Longitudinal Process. Once a simulated time-to-event \(t_i\) is obtained from the previous step, specify the distribution of the distance between the repeated observations (e.g.~equidistant or random). This produces the time points \(t_{ij}\), \(j=1,\dots,n_i\), at which the repeated observations are recorded. The longitudinal process simulation is thus obtained by plugging-in the corresponding values of the parameters and covariates in \(\mu_{ij}\), and simulating from the corresponding GLMM based on the joint model proposed in (Alvares and Rubio 2021).
The following R code provides the routines required to simulate these longitudinal and survival processes with a Log-Normal baseline hazard. The simulated data can be used to fit the models in Stan (See GitHub repository). This example corresponds to Scenario 2 in the main paper (Alvares and Rubio 2021).
#########################################################################
# Joint model simulation
#########################################################################
# n : sample size
# seed : seed for simulation
# baseline: baseline hazard distribution. Lognormal (LN), Gamma (G),
# Power Generalised Weibull (PGW)
# Model parameters: beta, beta.tilde, gamma, sigma11, sigma22, rho,
# sigma.re, theta, kappa, kappa.tilde, lambda, alpha
# Design matrices: X.tilde, S, W, W.tilde
simDataJ <- function(n, seed, beta, beta.tilde, gamma, sigma11, sigma22, rho, sigma.re,
theta, kappa, kappa.tilde, lambda, alpha,
cens.times, DBRO, baseline = "LN",
X.tilde, S, W, W.tilde){
times <- ID <- longit.out <- vector()
K <- 2
# Giving the design matrix a matrix format
X.tilde <- as.matrix(X.tilde)
S <- as.matrix(S)
W <- as.matrix(W)
W.tilde <- as.matrix(W.tilde)
# Inner products used in the simulation
S_beta <- as.vector(S%*%beta)
X.tilde_gamma <- as.vector(X.tilde%*%gamma)
W_kappa <- as.vector(W%*%kappa)
S_lambda <- as.vector(S%*%lambda)
W.tilde_kappa.tilde <- as.vector(W.tilde%*%kappa.tilde)
# Link parameters
alpha0 <- alpha[1]; alpha1 <- alpha[2];
#################
# Random effects
#################
bi <- rmvnorm(n, mean = rep(0,K), sigma = Sigma)
###################
# Survival process
###################
# Baseline hazard
if(baseline == "LN") quantf <- function(p) qlnorm(p, theta[1], theta[2])
if(baseline == "G") quantf <- function(p) qgamma(p, theta[1], theta[2])
if(baseline == "PGW") quantf <- function(p) qpgw(p, theta[1], theta[2], theta[3])
# Simulating the times to event
A0 <- as.vector( W_kappa + alpha1*(X.tilde_gamma + bi[,2] ) )
A <- as.vector(exp( A0 ))
B <- as.vector(exp( ( W.tilde_kappa.tilde + S_lambda + alpha0*bi[,1] ) - A0 ))
u = runif(n)
p0 <- as.vector(1 - exp(log(1-u)/B))
times <- as.vector(quantf(p0)/A)
status <- as.vector(times < cens.times)
times <- as.vector(ifelse(status, times, cens.times))
status <- as.numeric(status) # Censoring indicators (1=Observed, 0=Censored)
##############################
# Longitudinal process
##############################
visits.out <- vector()
for(i in 1:n){
visits <- seq(0,times[i], by = DBRO) # number of repeated observations for each individual
yt <- S_beta[i] + beta.tilde[1] + beta.tilde[2]*visits + X.tilde_gamma[i]*visits +
bi[i,1] + bi[i,2]*visits + rnorm(length(visits),0,sigma.re)
longit.out <- c(longit.out,yt)
ID <- c(ID,rep(i,length(visits)))
visits.out <- c(visits.out,visits)
}
#---------------------------------------------------------------------
# Creating the longitudinal and survival processes object
#---------------------------------------------------------------------
long.proc <- as.matrix(cbind(longit.out,ID)) # Longitudinal process
surv.proc <- as.matrix(cbind(times,status)) # Survival process
obj <- list(long.proc,surv.proc,X,visits.out)
names(obj) <- c("longitudinal","survival","X","visits")
return(obj)
}
library(mvtnorm)
# Parameters
n <- 500;
beta.tilde = c(2,0.5); beta = c(0.3,-0.2); gamma = c(-0.5);
sigma11 <- 0.4; sigma22 <- 0.2; rho <- 0.4; sigma.re = 0.2;
Sigma = rbind(c(sigma11^2,sigma11*sigma22*rho),c(sigma11*sigma22*rho,sigma22^2));
kappa <- 0; kappa.tilde <- c(0.3); lambda <- c(0.3,0.5);
theta <- c(0.5,0.5); alpha = c(0.5,1);
# Censoring times and distance between repeated observations
cens.times = 2.1; DBRO = 0.5
# Seed
seed <- 123
# Simulated design matrix
X <- as.matrix(simDes12(seed))
head(X)
## sex age comorb
## [1,] 0 -2.762379 0
## [2,] 1 -2.717455 1
## [3,] 0 -2.995150 0
## [4,] 1 -3.720095 1
## [5,] 1 -2.720910 1
## [6,] 0 -3.376952 0
# Simulated processes
obj <- simDataJ(n = n, seed = seed, beta = beta, beta.tilde = beta.tilde, gamma = gamma,
sigma11 = sigma11, sigma22 = sigma22, rho = rho, sigma.re = sigma.re,
theta = theta, kappa = kappa, kappa.tilde = kappa.tilde, alpha = alpha, lambda = lambda,
X.tilde = X[,2], S = X[,1:2], W = rep(0,n), W.tilde = X[,3],
baseline = "LN", cens.times = cens.times, DBRO = DBRO)
# Required quantities for model fitting
X <- obj$X # design matrix X
n <- nrow(X) # total number of observations
y <- obj$longitudinal[,1] # longitudinal outcomes
ID <- obj$longitudinal[,2] # patient IDs
nid <- length(unique(ID)) # number of patients
status <- obj$survival[,2] # vital status (1 = dead, 0 = alive)
times <- obj$survival[,1] # times to event
visits <- obj$visits # visit times for repeated observations
N <- length(y) # total number of longitudinal outcomes
indobs <- which(status==1) # observed survival times indicator
nobs <- length(indobs) # number of observed survival times
# Visualising the simulated data
# Simulated data
head(X)
## sex age comorb
## [1,] 0 -2.762379 0
## [2,] 1 -2.717455 1
## [3,] 0 -2.995150 0
## [4,] 1 -3.720095 1
## [5,] 1 -2.720910 1
## [6,] 0 -3.376952 0
# Longitudinal process
require(ggplot2)
## Loading required package: ggplot2
long <- data.frame(y = y, visits = visits, ID = ID)
p <- ggplot(data = long, aes(x = visits, y = y, group = ID))
p + geom_line()
# Survival process
hist(times, probability = TRUE, xlab = "time", ylab = "Density")
box()
Alvares, D., and F. J. Rubio. 2021. “A Tractable Bayesian Joint Model for Longitudinal and Survival Data.” Statistics in Medicine.