Example of Using the SimSurvNMarker Package

This is a short example of how to use the SimSurvNMarker package to simulate from joint survival time and marker models. We will skip details about the model as these can be found by calling:

vignette("SimSurvNMarker", package = "SimSurvNMarker")

The main thing we do in this example is consider any bias by simulating from a model and estimating the true model using the joineRML package. We also illustrate the time it takes to simulate from the model.

First we install and load the packages we need:

#####
# install the packages we need
library(remotes)
install_github("boennecd/SimSurvNMarker", upgrade = "never",
               ref = "aacbf3c07657161f0ad")
install.packages("joineRML")

#####
# load libraries
library(joineRML, quietly = TRUE)
library(SimSurvNMarker)
library(bench)

Then we check the simulation time:

#####
# setup to perform the simulation
y_max <- 2
g_func <- function(x){
  x <- x - y_max / 2
  cbind(x = x, x2 = x^2, x3 = x^3)
}
g_func_surv = function(x)
    matrix(0., nrow = length(x), ncol = 3L)
m_func <- function(x){
  x <- x - y_max / 2
  cbind(`(Intercept)` = 1, x = x, x2 = x^2)
}
b_func <- function(x)
  cbind(log(x))

M <- 1L
gamma <- rbind(rep(0, M), rep(.5, M), rep(-.4, M))
B <- rbind(rep(-0.25, M), rep(.3, M), rep(-0.15, M))
sig <- diag(rep(.5^2, M), M)
delta <- c(-1, .2, .5)
omega <- 0
alpha <- .5
Psi <- diag(.5^2, NROW(B) * M)
Psi[lower.tri(Psi)] <- Psi[upper.tri(Psi)] <- .1

r_n_marker <- function(id)
  6L
r_obs_time <- function(id, n_markes)
  seq(0, y_max, length.out = n_markes + 1L)[-n_markes - 1L]
r_right_cens <- function(id)
  rexp(1L, rate = 1 / y_max)
r_left_trunc <- function(id)
  # no truncation
  0

#####
# define functions to do the simulation
# 
# Args:
#   n_obs: number of individuals to draw
sim_from_SimSurvNMarker <- function(n_obs = 250L){
  # sample covariates 
  X <- cbind(1, runif(n_obs) < .3, rnorm(n_obs))
  
  # functions to return the covariates
  r_x <- r_z <- function(id)
    X[id, ]
  
  # perform the simulation
  gl_dat <- get_gl_rule(30)
  sim_joint_data_set(
    n_obs = n_obs, B = B, Psi = Psi, omega = omega, delta = delta,
    alpha = alpha, sigma = sig, gamma = gamma, b_func = b_func,
    m_func = m_func, g_func = g_func, r_z = r_z,
    r_left_trunc = r_left_trunc, r_right_cens = r_right_cens,
    r_n_marker = r_n_marker, r_x = r_x, r_obs_time = r_obs_time,
    y_max = y_max, gl_dat = gl_dat, use_fixed_latent = TRUE, 
    g_func_surv = g_func_surv)
}

#####
# show one example and check simulation time
set.seed(97788016)
dat <- sim_from_SimSurvNMarker()

# how the data looks
str(dat$marker_data)
#> 'data.frame':    758 obs. of  6 variables:
#>  $ obs_time: num  0 0.333 0 0.333 0.667 ...
#>  $ Y1      : num  1.197 0.767 0.898 -0.914 -0.245 ...
#>  $ X1      : num  1 1 1 1 1 1 1 1 1 1 ...
#>  $ X2      : num  1 1 0 0 0 0 0 0 0 0 ...
#>  $ X3      : num  1.226 1.226 0.362 0.362 0.362 ...
#>  $ id      : int  1 1 2 2 2 3 3 3 3 3 ...
str(dat$survival_data)
#> 'data.frame':    250 obs. of  7 variables:
#>  $ Z1        : num  1 1 1 1 1 1 1 1 1 1 ...
#>  $ Z2        : num  1 0 0 0 0 0 0 1 0 1 ...
#>  $ Z3        : num  1.226 0.362 1.153 -0.932 -1.89 ...
#>  $ left_trunc: num  0 0 0 0 0 0 0 0 0 0 ...
#>  $ y         : num  0.527 0.884 2 0.118 0.37 ...
#>  $ event     : logi  TRUE TRUE FALSE TRUE TRUE TRUE ...
#>  $ id        : int  1 2 3 4 5 6 7 8 9 10 ...

# fraction of observed events
mean(dat$survival_data$event)
#> [1] 0.416

# quantiles of observed event times
quantile(subset(dat$survival_data, event, y)[[1L]])
#>    0%   25%   50%   75%  100% 
#> 0.003 0.213 0.541 1.004 1.999

# check the simulation time
mark(sim_from_SimSurvNMarker(), min_iterations = 25L)
#> # A tibble: 1 x 6
#>   expression                     min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>                <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 sim_from_SimSurvNMarker()   48.7ms   51.7ms      19.4    5.79MB     9.12

It is easy for us to check that the variance of the error term of the markers is correct and that the covariance matrix of the unobserved random effects is correct:

# sample a large number of individuals
set.seed(1L)
system.time(larger_dat <- sim_from_SimSurvNMarker(n_obs = 100000L))
#>    user  system elapsed 
#>  24.483   0.148  24.631

# get the unobersved random effects and check the covariance matrix
Us <- sapply(larger_dat$complete_data, `[[`, "U")
cov(t(Us))
#>        [,1]   [,2]   [,3]
#> [1,] 0.2505 0.0997 0.0992
#> [2,] 0.0997 0.2498 0.0996
#> [3,] 0.0992 0.0996 0.2496
Psi # the true value
#>      [,1] [,2] [,3]
#> [1,] 0.25 0.10 0.10
#> [2,] 0.10 0.25 0.10
#> [3,] 0.10 0.10 0.25

# compute the variance of the iid error term
iid_errs <- unlist(sapply(larger_dat$complete_data, function(x){
  tis <- x$markers[, "obs_time"]
  eta <- drop(x$x %*% gamma) + drop(g_func(tis) %*% B) + 
    drop(m_func(tis) %*% x$U)
  
  x$markers[, "Y1"] - eta
}))
var(iid_errs)
#> [1] 0.251
sig
#>      [,1]
#> [1,] 0.25

Next we check for bias using an MCEM algorithm to estimate the true model in a simulation study:

#####
# estimate a model
packageVersion("joineRML")
#> [1] '0.4.4'

# the seeds we will use
seeds <- c(13908296L, 16427665L, 49144100L, 26108742L, 99992758L, 46136559L, 29661269L, 10716539L, 22023030L, 44836167L, 67930105L, 71303079L, 77670326L, 6099844L, 81273005L, 31185998L, 87465781L, 92293121L, 76372017L, 49615189L, 83714861L, 33588456L, 42858587L, 26229033L, 79946479L, 37859705L, 5742148L, 82166042L, 90986604L, 48958949L, 2330445L, 37790656L, 99833229L, 85058877L, 82763854L, 88216171L, 94616597L, 40880146L, 53502304L, 76020037L, 57354482L, 14531000L, 11331200L, 73875893L, 92071254L, 86828391L, 52351030L, 60851522L, 27934530L, 70627556L, 21588549L, 3317167L, 54191650L, 21570385L, 10431273L, 42834674L, 30021797L, 50293553L, 82738427L, 46452595L, 1587956L, 83694552L, 57489492L, 2278745L, 40782790L, 5376483L, 87093928L, 93851928L, 73050973L, 25434481L, 78601955L, 96396472L, 10572139L, 58873010L, 18918519L, 33056539L, 5450374L, 47601070L, 84295674L, 64029603L, 41355907L, 72477504L, 5653236L, 91748567L, 88505266L, 35294365L, 97222791L, 20407747L, 4336537L, 53952762L, 46254464L, 13805568L, 24344985L, 2053051L, 94522180L, 87899392L, 71547884L, 63152352L, 86435043L, 86550545L)

# perform the simulation study
out <- sapply(seeds, function(s){
  out_file <- sprintf("sim-ex-%d.RDS", s)
  if(file.exists(out_file)){
    message(sprintf("Loading '%s'", out_file))
    readRDS(out_file)
  } else {
    gc(full = TRUE)
    message(sprintf("Running results for '%s'", out_file))
    
    # simulate individuals
    set.seed(s)
    dat <- sim_from_SimSurvNMarker()
    
    # fit model w/ MCEM
    fit <- mjoint(
      formLongFixed = list(Y1 = Y1 ~ X2 + X3 + g_func(obs_time)),
      formLongRandom = list(Y1 = ~ (m_func(obs_time) - 1) | id),
      formSurv = Surv(y, event) ~ Z2 + Z3,
      data = dat$marker_data, verbose = FALSE,
      survData = dat$survival_data, timeVar = "obs_time")
  
    # extract the result
    co <- fit$coefficients
    beta_is_B <- grepl("^g_func", names(co$beta))
    out <- list(
      coef = co, 
      Psi = co$D, 
      B     = co$beta[ beta_is_B], 
      gamma = co$beta[!beta_is_B], 
      sig = co$sigma2,
      delta = co$gamma[seq_len(length(delta) - 1L)] - 
        alpha * co$beta[!beta_is_B][-1], 
      alpha = tail(co$gamma, -(length(delta) - 1)))
    saveRDS(out, out_file)
    out
  }
})

Finally, we compute and show bias and MC error:

comp_bias <- function(res, truth){
  n <- NROW(res)
  err <- res - rep(truth, each = n)
  bias <- colMeans(err)
  SE <- apply(res, 2, sd) / sqrt(n)
  z_val <- bias / SE
  cbind(`mean estimate` = colMeans(res), truth = drop(truth),
        bias = bias, SE = SE, `z value` = z_val)
}

comp_bias(do.call(rbind, out["B", ]), B)
#>                      mean estimate truth     bias      SE z value
#> g_func(obs_time)x_1         -0.252 -0.25 -0.00220 0.00937  -0.235
#> g_func(obs_time)x2_1         0.295  0.30 -0.00549 0.01434  -0.383
#> g_func(obs_time)x3_1        -0.153 -0.15 -0.00346 0.01597  -0.217
comp_bias(do.call(rbind, out["gamma", ]), gamma)
#>               mean estimate truth     bias      SE z value
#> (Intercept)_1        -0.011   0.0 -0.01096 0.00615  -1.784
#> X2_1                  0.506   0.5  0.00637 0.00973   0.655
#> X3_1                 -0.404  -0.4 -0.00412 0.00404  -1.017
comp_bias(do.call(rbind, out["sig", ]), sig)
#>          mean estimate truth     bias      SE z value
#> sigma2_1         0.247  0.25 -0.00265 0.00196   -1.35
comp_bias(do.call(rbind, out["delta", ]), delta[-1])
#>    mean estimate truth    bias     SE z value
#> Z2         0.165   0.2 -0.0354 0.0226   -1.57
#> Z3         0.511   0.5  0.0112 0.0102    1.10
comp_bias(do.call(rbind, out["alpha", ]), alpha)
#>         mean estimate truth   bias     SE z value
#> gamma_1         0.522   0.5 0.0216 0.0208    1.04

keep <- which(lower.tri(Psi, TRUE))
comp_bias(do.call(rbind, lapply(out["Psi", ], c))[, keep], c(Psi)[keep])
#>      mean estimate truth      bias      SE z value
#> [1,]        0.2408  0.25 -0.009226 0.00461 -1.9996
#> [2,]        0.0968  0.10 -0.003174 0.00513 -0.6183
#> [3,]        0.0995  0.10 -0.000496 0.00570 -0.0869
#> [4,]        0.2461  0.25 -0.003949 0.00869 -0.4546
#> [5,]        0.0932  0.10 -0.006842 0.00825 -0.8298
#> [6,]        0.2426  0.25 -0.007362 0.01220 -0.6035

Example of Using the SimSurvNMarker Package

Benjamin Christoffersen

14 September, 2020