This is a short example of how to use the SimSurvNMarker package to simulate from joint survival time and marker models. We will skip details about the model as these can be found by calling:
vignette("SimSurvNMarker", package = "SimSurvNMarker")
The main thing we do in this example is consider any bias by simulating from a model and estimating the true model using the joineRML package. We also illustrate the time it takes to simulate from the model.
First we install and load the packages we need:
#####
# install the packages we need
library(remotes)
install_github("boennecd/SimSurvNMarker", upgrade = "never",
ref = "aacbf3c07657161f0ad")
install.packages("joineRML")
#####
# load libraries
library(joineRML, quietly = TRUE)
library(SimSurvNMarker)
library(bench)
Then we check the simulation time:
#####
# setup to perform the simulation
y_max <- 2
g_func <- function(x){
x <- x - y_max / 2
cbind(x = x, x2 = x^2, x3 = x^3)
}
g_func_surv = function(x)
matrix(0., nrow = length(x), ncol = 3L)
m_func <- function(x){
x <- x - y_max / 2
cbind(`(Intercept)` = 1, x = x, x2 = x^2)
}
b_func <- function(x)
cbind(log(x))
M <- 1L
gamma <- rbind(rep(0, M), rep(.5, M), rep(-.4, M))
B <- rbind(rep(-0.25, M), rep(.3, M), rep(-0.15, M))
sig <- diag(rep(.5^2, M), M)
delta <- c(-1, .2, .5)
omega <- 0
alpha <- .5
Psi <- diag(.5^2, NROW(B) * M)
Psi[lower.tri(Psi)] <- Psi[upper.tri(Psi)] <- .1
r_n_marker <- function(id)
6L
r_obs_time <- function(id, n_markes)
seq(0, y_max, length.out = n_markes + 1L)[-n_markes - 1L]
r_right_cens <- function(id)
rexp(1L, rate = 1 / y_max)
r_left_trunc <- function(id)
# no truncation
0
#####
# define functions to do the simulation
#
# Args:
# n_obs: number of individuals to draw
sim_from_SimSurvNMarker <- function(n_obs = 250L){
# sample covariates
X <- cbind(1, runif(n_obs) < .3, rnorm(n_obs))
# functions to return the covariates
r_x <- r_z <- function(id)
X[id, ]
# perform the simulation
gl_dat <- get_gl_rule(30)
sim_joint_data_set(
n_obs = n_obs, B = B, Psi = Psi, omega = omega, delta = delta,
alpha = alpha, sigma = sig, gamma = gamma, b_func = b_func,
m_func = m_func, g_func = g_func, r_z = r_z,
r_left_trunc = r_left_trunc, r_right_cens = r_right_cens,
r_n_marker = r_n_marker, r_x = r_x, r_obs_time = r_obs_time,
y_max = y_max, gl_dat = gl_dat, use_fixed_latent = TRUE,
g_func_surv = g_func_surv)
}
#####
# show one example and check simulation time
set.seed(97788016)
dat <- sim_from_SimSurvNMarker()
# how the data looks
str(dat$marker_data)
#> 'data.frame': 758 obs. of 6 variables:
#> $ obs_time: num 0 0.333 0 0.333 0.667 ...
#> $ Y1 : num 1.197 0.767 0.898 -0.914 -0.245 ...
#> $ X1 : num 1 1 1 1 1 1 1 1 1 1 ...
#> $ X2 : num 1 1 0 0 0 0 0 0 0 0 ...
#> $ X3 : num 1.226 1.226 0.362 0.362 0.362 ...
#> $ id : int 1 1 2 2 2 3 3 3 3 3 ...
str(dat$survival_data)
#> 'data.frame': 250 obs. of 7 variables:
#> $ Z1 : num 1 1 1 1 1 1 1 1 1 1 ...
#> $ Z2 : num 1 0 0 0 0 0 0 1 0 1 ...
#> $ Z3 : num 1.226 0.362 1.153 -0.932 -1.89 ...
#> $ left_trunc: num 0 0 0 0 0 0 0 0 0 0 ...
#> $ y : num 0.527 0.884 2 0.118 0.37 ...
#> $ event : logi TRUE TRUE FALSE TRUE TRUE TRUE ...
#> $ id : int 1 2 3 4 5 6 7 8 9 10 ...
# fraction of observed events
mean(dat$survival_data$event)
#> [1] 0.416
# quantiles of observed event times
quantile(subset(dat$survival_data, event, y)[[1L]])
#> 0% 25% 50% 75% 100%
#> 0.003 0.213 0.541 1.004 1.999
# check the simulation time
mark(sim_from_SimSurvNMarker(), min_iterations = 25L)
#> # A tibble: 1 x 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 sim_from_SimSurvNMarker() 48.7ms 51.7ms 19.4 5.79MB 9.12
It is easy for us to check that the variance of the error term of the markers is correct and that the covariance matrix of the unobserved random effects is correct:
# sample a large number of individuals
set.seed(1L)
system.time(larger_dat <- sim_from_SimSurvNMarker(n_obs = 100000L))
#> user system elapsed
#> 24.483 0.148 24.631
# get the unobersved random effects and check the covariance matrix
Us <- sapply(larger_dat$complete_data, `[[`, "U")
cov(t(Us))
#> [,1] [,2] [,3]
#> [1,] 0.2505 0.0997 0.0992
#> [2,] 0.0997 0.2498 0.0996
#> [3,] 0.0992 0.0996 0.2496
Psi # the true value
#> [,1] [,2] [,3]
#> [1,] 0.25 0.10 0.10
#> [2,] 0.10 0.25 0.10
#> [3,] 0.10 0.10 0.25
# compute the variance of the iid error term
iid_errs <- unlist(sapply(larger_dat$complete_data, function(x){
tis <- x$markers[, "obs_time"]
eta <- drop(x$x %*% gamma) + drop(g_func(tis) %*% B) +
drop(m_func(tis) %*% x$U)
x$markers[, "Y1"] - eta
}))
var(iid_errs)
#> [1] 0.251
sig
#> [,1]
#> [1,] 0.25
Next we check for bias using an MCEM algorithm to estimate the true model in a simulation study:
#####
# estimate a model
packageVersion("joineRML")
#> [1] '0.4.4'
# the seeds we will use
seeds <- c(13908296L, 16427665L, 49144100L, 26108742L, 99992758L, 46136559L, 29661269L, 10716539L, 22023030L, 44836167L, 67930105L, 71303079L, 77670326L, 6099844L, 81273005L, 31185998L, 87465781L, 92293121L, 76372017L, 49615189L, 83714861L, 33588456L, 42858587L, 26229033L, 79946479L, 37859705L, 5742148L, 82166042L, 90986604L, 48958949L, 2330445L, 37790656L, 99833229L, 85058877L, 82763854L, 88216171L, 94616597L, 40880146L, 53502304L, 76020037L, 57354482L, 14531000L, 11331200L, 73875893L, 92071254L, 86828391L, 52351030L, 60851522L, 27934530L, 70627556L, 21588549L, 3317167L, 54191650L, 21570385L, 10431273L, 42834674L, 30021797L, 50293553L, 82738427L, 46452595L, 1587956L, 83694552L, 57489492L, 2278745L, 40782790L, 5376483L, 87093928L, 93851928L, 73050973L, 25434481L, 78601955L, 96396472L, 10572139L, 58873010L, 18918519L, 33056539L, 5450374L, 47601070L, 84295674L, 64029603L, 41355907L, 72477504L, 5653236L, 91748567L, 88505266L, 35294365L, 97222791L, 20407747L, 4336537L, 53952762L, 46254464L, 13805568L, 24344985L, 2053051L, 94522180L, 87899392L, 71547884L, 63152352L, 86435043L, 86550545L)
# perform the simulation study
out <- sapply(seeds, function(s){
out_file <- sprintf("sim-ex-%d.RDS", s)
if(file.exists(out_file)){
message(sprintf("Loading '%s'", out_file))
readRDS(out_file)
} else {
gc(full = TRUE)
message(sprintf("Running results for '%s'", out_file))
# simulate individuals
set.seed(s)
dat <- sim_from_SimSurvNMarker()
# fit model w/ MCEM
fit <- mjoint(
formLongFixed = list(Y1 = Y1 ~ X2 + X3 + g_func(obs_time)),
formLongRandom = list(Y1 = ~ (m_func(obs_time) - 1) | id),
formSurv = Surv(y, event) ~ Z2 + Z3,
data = dat$marker_data, verbose = FALSE,
survData = dat$survival_data, timeVar = "obs_time")
# extract the result
co <- fit$coefficients
beta_is_B <- grepl("^g_func", names(co$beta))
out <- list(
coef = co,
Psi = co$D,
B = co$beta[ beta_is_B],
gamma = co$beta[!beta_is_B],
sig = co$sigma2,
delta = co$gamma[seq_len(length(delta) - 1L)] -
alpha * co$beta[!beta_is_B][-1],
alpha = tail(co$gamma, -(length(delta) - 1)))
saveRDS(out, out_file)
out
}
})
Finally, we compute and show bias and MC error:
comp_bias <- function(res, truth){
n <- NROW(res)
err <- res - rep(truth, each = n)
bias <- colMeans(err)
SE <- apply(res, 2, sd) / sqrt(n)
z_val <- bias / SE
cbind(`mean estimate` = colMeans(res), truth = drop(truth),
bias = bias, SE = SE, `z value` = z_val)
}
comp_bias(do.call(rbind, out["B", ]), B)
#> mean estimate truth bias SE z value
#> g_func(obs_time)x_1 -0.252 -0.25 -0.00220 0.00937 -0.235
#> g_func(obs_time)x2_1 0.295 0.30 -0.00549 0.01434 -0.383
#> g_func(obs_time)x3_1 -0.153 -0.15 -0.00346 0.01597 -0.217
comp_bias(do.call(rbind, out["gamma", ]), gamma)
#> mean estimate truth bias SE z value
#> (Intercept)_1 -0.011 0.0 -0.01096 0.00615 -1.784
#> X2_1 0.506 0.5 0.00637 0.00973 0.655
#> X3_1 -0.404 -0.4 -0.00412 0.00404 -1.017
comp_bias(do.call(rbind, out["sig", ]), sig)
#> mean estimate truth bias SE z value
#> sigma2_1 0.247 0.25 -0.00265 0.00196 -1.35
comp_bias(do.call(rbind, out["delta", ]), delta[-1])
#> mean estimate truth bias SE z value
#> Z2 0.165 0.2 -0.0354 0.0226 -1.57
#> Z3 0.511 0.5 0.0112 0.0102 1.10
comp_bias(do.call(rbind, out["alpha", ]), alpha)
#> mean estimate truth bias SE z value
#> gamma_1 0.522 0.5 0.0216 0.0208 1.04
keep <- which(lower.tri(Psi, TRUE))
comp_bias(do.call(rbind, lapply(out["Psi", ], c))[, keep], c(Psi)[keep])
#> mean estimate truth bias SE z value
#> [1,] 0.2408 0.25 -0.009226 0.00461 -1.9996
#> [2,] 0.0968 0.10 -0.003174 0.00513 -0.6183
#> [3,] 0.0995 0.10 -0.000496 0.00570 -0.0869
#> [4,] 0.2461 0.25 -0.003949 0.00869 -0.4546
#> [5,] 0.0932 0.10 -0.006842 0.00825 -0.8298
#> [6,] 0.2426 0.25 -0.007362 0.01220 -0.6035