First a warning; this document assumes some knowledge of survival analysis concepts.
Data simulation examples are often trivially simple. Sometimes this will suffice, but we often need more complex simulations to capture all the salient features of the data generating mechanism. This post aims to simulate survival data from a clinical trial with participants entering at around 6 months to 1 year of age and being followed until they are 3 years old. The model is based on time (in weeks) of the first event following treatment in the control and treatment groups. Each participant is followed up at 2 weeks, then 2 months post enrolment and then at 1.5, 2, 2.5 and 3 years of age. During these follow up, we determine if the participant has had an event, otherwise they are considered right censored with censoring time dictated by the time of the last review/maximum followup time.
You will need to load a few libraries.
library(flexsurv)
library(data.table)
library(parallel)
library(muhaz)
This function simulates the event data assuming that the survival times from entry follow a weibull distribution (with PH parameterisation). With the parameters used here the median times to event are around 45 and 56 weeks from entry.
get_data <- function(n = 10, # sample size per arm
alpha = 0.95,
logb0 = -4,
logb1 = -0.2,
npwk = 1,
seed = NULL){
if(!is.null(seed)) set.seed(seed)
# Treatment indicator
trt <- rep(0:1, len = n * 2)
# Simulate age at entry and time to event from the enrolment
age0 <- runif(n*2, 26, 52)
y <- sapply(1:(n*2), function(i){
if(trt[i] == 0){
rweibullPH(1, alpha, exp(logb0))
} else {
rweibullPH(1, alpha, exp(logb0+logb1))
}
})
# Accrual - time of entry into study starting from zero (first enrolment marks
# start of trial)
t0 <- c(0, cumsum(rexp( ((n*2)-1) , npwk)))
# We assume current time is enrolment date of last participant
tnow <- max(t0)
# The current age will be the number of weeks between time of enrolment and current date plus
# the age at enrolment in weeks.
agenow <- tnow - t0 + age0
# Knowing the time of enrolment, along with the age at enrolment and the current time
# we can determine what the latest review is. I have coded each review time as 1
# through to 7 with 1 being no review, 2 being the 2 week review, 3 being the 2 month
# review and so on.
# no review, week 2, week 8 post enrolment
# then 78, 104, 130, 156 wks of age
latest_rev <- data.table::fcase(
tnow < t0 + 2, 0L, # no review
tnow >= t0 + 2 & tnow < t0 + 8, 1L, # last review was 2 week fu
# remember the participants were enrolled at age 26 to 52 months.
tnow >= t0 + 8 & agenow < 52, 2L, # last review was 8 wk fu and not reached 1.5 years of age
agenow >= 52 & agenow < 78, 3L, # last review was 18 months of age
agenow >= 78 & agenow < 104, 4L, # last review was 18 months of age
agenow >= 104 & agenow < 130, 5L, # last review was 24 months of age
agenow >= 130 & agenow < 156, 6L, # last review was 30 months of age
agenow >= 156, 7L # last review was 36 months of age
)
# Translate the last review indicator to the age at time of review.
# The runif simulates the 2 wks of allowed slippage behind schedule.
agereview <- data.table::fcase(
latest_rev == 0L, rep(NA_real_, 2*n),
latest_rev == 1L, age0 + 2,
latest_rev == 2L, age0 + 8,
latest_rev == 3L, 52,
latest_rev == 4L, 78,
latest_rev == 5L, 104,
latest_rev == 6L, 130,
latest_rev == 7L, 156
)
# And then from age of review, compute the week of study at which the last
# review occurred as the enrolment date - age of enrolment + age at last review
treview <- t0 - age0 + agereview
d <- data.table(id = 1:(2*n),
trt = trt,
last_rev = latest_rev, # indicates last review
age0 = age0, # age at entry
age_rev = agereview, # age at last review
age_evt = NA_real_, # age at time of event
# what we see. age is either age at time of last review
# or age at time of event depending on whether censored
# or not.
age_evt_hat = NA_real_,
# time to event __from entry__;
# age at event would be age0 + y
y = y, # time to event from entry
# Time to latest review (censoring time) is
# simply age at review minus age at entry
c = agereview - age0, # time to review (censoring)
u = NA_real_, # observed time
evt = 1,
t0 # entry time
)
# The age at which the simulated event would occur
d[, age_evt := age0 + y]
# Is the age at the time of the event lower than the age at latest review?
# If so, we did not see the event at the latest review.
d[, evt := age_evt < age_rev ]
# Depending on whether the event was seen, age_evt_hat is the age
# at time of event or age at time of review.
d[evt == TRUE, age_evt_hat := age_evt]
d[evt == FALSE, age_evt_hat := age_rev]
# Some participants will have not reached review yet and so age_rev
# will be set to NA. Censor those that are NA.
d[last_rev == 0, evt := FALSE]
d[last_rev == 0, age_evt_hat := age0 + 1e-08]
# If we have observed an event set yevt to the simulated event time (y)
# and if not then set the time to latest review.
d[evt == TRUE, u := y]
d[evt == FALSE, u := c]
# In cases where a review hasn't happened set time to event as small value.
d[last_rev == 0, u := 1e-08]
d
}
Sample dataset.
n = 40
alpha = 1 # was 0.95
logb0 = -4
logb1 = -0.2
npwk = 1
d <- get_data(n,
alpha,
logb0,
logb1,
npwk,
seed = 1)
head(d)
## id trt last_rev age0 age_rev age_evt age_evt_hat y c
## 1: 1 0 4 32.90323 78 78.39399 78.00000 45.49076 45.09677
## 2: 2 1 4 35.67522 78 58.27887 58.27887 22.60365 42.32478
## 3: 3 0 5 40.89419 104 90.92273 90.92273 50.02855 63.10581
## 4: 4 1 5 49.61340 104 124.49196 104.00000 74.87856 54.38660
## 5: 5 0 4 31.24373 78 46.43713 46.43713 15.19340 46.75627
## 6: 6 1 5 49.35813 104 155.79395 104.00000 106.43581 54.64187
## u evt t0
## 1: 45.09677 FALSE 0.0000000
## 2: 22.60365 TRUE 0.8323091
## 3: 50.02855 TRUE 1.1416159
## 4: 54.38660 FALSE 1.1858445
## 5: 15.19340 TRUE 2.2209417
## 6: 54.64187 FALSE 2.6683936
The figure below shows the age at entry and the age at which the last follow up review occurred assuming that we looked at the data at the time of the latest enrolment. These are the time-points where we would determine whether the event of interest had occurred. Review 1 is the two week follow-up, review 2 occurs at around 8 weeks and then follow up occurs at 1.5, 2, 2.5 and 3 years of age. As the data is assumed to have been extracted immediately after the last enrolment, some participants will not have had a review yet.
# Review age could be NA if
xx <- c(d[!is.na(age_rev), age_rev])
plot(1, type = "n",
xlim = c(0, max(xx)), ylim = c(1, 2*n),
xlab = "Age (wks)", ylab = "ID",
main = "")
points(d[, age0], d[, id], pch = 10, col = 2)
points(d[, age_rev], d[, id], pch = 3, col = d[, last_rev] + 2)
segments(d[, age0], d[, id],
d[, age_rev], d[, id])
legend("topright",
legend = c("No review", paste0("Review ", 1:(max(d$last_rev)-1))),
col = 1:(max(d$last_rev))+2, pch = c(NA, rep(3, max(d$last_rev)-1)))
legend("topleft", legend = "Entry age", col = 2, pch = 10)
Next, the simulated event times are added to the figure.
Note that the plot shows age as the x axis and so the vertical line just shows the maximum follow up time, which is 156 weeks of age. If I had used time from enrolment, then the final follow up for each individual would occur at \(156\) minus the age at enrolment.
Any events simulated to occur after this age would never be observed because the participant reached their maximum followup. If a patient has a simulated event prior to the maximum follow up, but it occurs after the latest review, we would not see those events at the current time either.
Therefore, a number of the simulated data points will be censored either at the maximum follow up or at the time of the latest review. If no review has occurred, we would just censor the point and assign some very small value like 0.00001 added to the age at entry.
xx <- c(d[!is.na(age_rev), age_rev] , d[ , age_evt] )
plot(1, type = "n",
xlim = c(0, max(xx)), ylim = c(1, 2*n),
xlab = "Age (wks)", ylab = "ID",
main = "")
points(d[, age0], d[, id], pch = 10, col = 1, lwd = 0.5)
points(d[, age_rev], d[, id], pch = 3, col = 1, lwd = 0.5)
# points(d[, ])
abline(v = 156, lwd = 0.5, lty = 1)
points(d[, age_evt], d[, id], pch = 20, col = 2)
This next plot shows the data that would be available for estimating the parameters. For the sake of clarity, I have removed the age at entry and replaced it with a horizontal line that shows the follow up until the age of event (or censoring). You can see how a lot of the event times have been censored with the censoring time either being the time of the last review or the maximum followup.
xx <- c(d[!is.na(age_rev), age_rev] , d[, age_evt_hat] )
pts <- c(4, 20)
plot(1, type = "n",
xlim = c(0, max(xx)), ylim = c(1, 2*n),
xlab = "Age (wks)", ylab = "ID",
main = "")
for(i in 1:(2*n)){
segments(d[i, age0], d[i, id], x1 = d[i, age_evt_hat], y1 = d[i, id], lwd = 0.5)
}
points(d[, age_evt_hat], d[, id], pch = pts[d[, evt] + 1], col = 2)
legend("topright", legend = c("Censored", "Event"), pch = pts, col = 2)
We can compare the theoretical with the empirical distribution of event times, the pdf.
xx <- seq(0.1, 300, len = 1000)
d0 <- dweibullPH(xx, alpha, exp(logb0))
d1 <- dweibullPH(xx, alpha, exp(logb0+logb1))
de0 <- density(d[trt == 0, y])
de1 <- density(d[trt == 1, y])
plot(1, type = "n",
xlim = range(xx), ylim = range(c(d0, d1)),
xlab = "Weeks to event", ylab = "Density",
main = "")
lines(xx, d0)
lines(xx, d1, col = 2)
lines(de0$x, de0$y, col = 1, lty = 2)
lines(de1$x, de1$y, col = 2, lty = 2)
By convention, this is usually presented in terms of survival times with the y-axis being \(1-F(t)\). Both the closed form (solid lines) using the simulation parameters and the Kaplan Meir estimates (dashed lines) from the simulated data are shown. The KM estimates are based on the data that would actually be observed, i.e. it includes a mix of event times and censoring times.
dtmp <- get_data(1000,
alpha,
logb0,
logb1,
npwk)
xx <- seq(0.1, 300, len = 200)
d0 <- 1 - pweibullPH(xx, alpha, exp(logb0))
d1 <- 1 - pweibullPH(xx, alpha, exp(logb0+logb1))
# km estimate
f1 <- survival::survfit(survival::Surv(u, evt) ~ trt, data = dtmp)
plot(1, type = "n",
xlim = range(xx), ylim = range(c(d0, d1)),
xlab = "Weeks to event", ylab = "Survival",
main = "")
lines(xx, d0)
lines(xx, d1, col = 2)
dtmpp <- data.table(t = f1$time, s = f1$surv)
nobs0 <- f1$strata[1]
nobs1 <- f1$strata[2]
lines(f1$time[1:nobs0], f1$surv[1:nobs0], col = 1, lty = 2)
lines(f1$time[(nobs0+1):(nobs0 + nobs1)], f1$surv[(nobs0+1):(nobs0 + nobs1)], col = 2, lty = 2)
legend("topright", legend = c("PBO", "TRT"), lty = 1, col = 1:2)
The hazard function is implied from the parameters and knowledge of the distributional form. Specifically, for the Weibull distribution, when the shape parameter is less than 1, the hazard will decline over time, which we can see by plotting the hazard. Thus, the instantaneous risk of the event decreases over time. You will see that the empirical estimate of hazard functions is typically a crude approximation of the truth.
xx <- seq(0.1, 300, len = 200)
d0 <- hweibullPH(xx, alpha, exp(logb0))
d1 <- hweibullPH(xx, alpha, exp(logb0+logb1))
mh0 <- muhaz(dtmp[trt==0, u], dtmp[trt==0, evt])
mh1 <- muhaz(dtmp[trt==1, u], dtmp[trt==1, evt])
plot(1, type = "n",
xlim = range(xx), ylim = range(c(d0, d1, mh0$haz.est, mh1$haz.est)),
xlab = "Weeks to event", ylab = "Survival",
main = "")
lines(xx, d0)
lines(xx, d1, col = 2)
lines(mh0$est.grid, mh0$haz.est, lty = 2)
lines(mh1$est.grid, mh1$haz.est, col = 2, lty = 2)
The implied distribution of age at the time of the first event following entry into the trial is more complex because this is a linear combination of the age of entry and the time to event from enrolment. The age of entry is a random variable, assumed to be uniformly distributed between 26 and 52 weeks. Additionally, it is conceivable that an event occurs prior to entry into the trial.
xx <- seq(0.1, 300, len = 1000)
de0 <- density(dtmp[trt == 0, age_evt])
de1 <- density(dtmp[trt == 1, age_evt])
f1 <- survival::survfit(survival::Surv(age_evt, as.numeric(evt)) ~ trt, data = dtmp)
par(mfrow = c(1, 2))
plot(1, type = "n",
xlim = range(c(de0$x, de1$x)), ylim = range(c(de0$y, de1$y)),
xlab = "Age (weeks) at first event after entry", ylab = "Density",
main = "")
lines(de0$x, de0$y, col = 1, lty = 2)
lines(de1$x, de1$y, col = 2, lty = 2)
plot(1, type = "n",
xlim = range(xx), ylim = c(0, 1),
xlab = "Age (weeks) at first event after entry", ylab = "Survival",
main = "")
nobs0 <- f1$strata[1]
nobs1 <- f1$strata[2]
lines(f1$time[1:nobs0], f1$surv[1:nobs0], col = 1, lty = 2)
lines(f1$time[(nobs0+1):(nobs0 + nobs1)], f1$surv[(nobs0+1):(nobs0 + nobs1)], col = 2, lty = 2)
abline(v = 26, lwd = 0.3)
legend("topright", legend = c("PBO", "TRT"), lty = 1, col = 1:2)
par(mfrow = c(1, 1))
Simulation is a useful way to get some confidence that an implementation is working as required. Below I simulate the data generation process 500 times with 400 participants per arm and to each dataset fit a WeibullPH model. On average, the parameter estimates derived from these models should be close to the parameters we used to simulate the data.
n <- 400
if(.Platform$OS.type == "unix") {
ncores <- parallel::detectCores()-3
} else {
ncores <- 1
}
lres <- mclapply(X=1:500, mc.cores=ncores, FUN=function(i) {
d <- get_data(n,
alpha,
logb0,
logb1,
npwk)
#d$evt <- 1
f1 <- flexsurvreg(Surv(u, evt) ~ trt , data = d, dist = "weibullph")
cf <- f1$coefficients
cf[c("shape", "scale", "trt")]
})
m1 <- do.call(rbind, lres)
Below are the distributions of each parameter estimate obtained from the simulation study. The suggest that, on average, we are able to recover the data generation parameters that we used. The means of the simulated values are 1, -4.01 and -0.2 and the population parameters are 1, -4 and -0.2 for \(\alpha\), \(\log(\beta_0)\) and \(\log(\beta_1)\) respectively.
par(mfrow = c(2, 2))
hist(exp(m1[, 1]), main = "shape")
abline(v = alpha, col = 2, lwd = 2)
abline(v = mean(exp(m1[, 1])), col = 3, lwd = 3)
hist(m1[, 2], main = "scale")
abline(v = mean(m1[, 2]), col = 3, lwd = 3)
abline(v = logb0, col = 2, lwd = 2)
hist(m1[, 3], main = "logb1")
abline(v = mean(m1[, 3]), col = 3, lwd = 3)
abline(v = logb1, col = 2, lwd = 2)
par(mfrow = c(1, 1))