library(WR)
library(gumbel)
library(tidyverse)
Simulations for residual analysis of win ratio regression
\[ \newcommand{\indep}{\perp \!\!\! \perp} \def\T{{ \mathrm{\scriptscriptstyle T} }} \def\pr{{\rm pr}} \def\d{{\rm d}} \def\W{{\mathcal W}} \def\H{{\mathcal H}} \def\I{{\mathcal I}} \def\E{{\mathcal E}} \def\S{{\mathcal S}} \def\O{{\mathcal O}} \def\Un{{n\choose 2}^{-1}\sum_{i<j}^n\sum} \def\l{(l)} \def\n{{(n)}} \def\v{\varepsilon} \def\bSig\mathbf{\Sigma} \def\Gn{n^{-1/2}\sum_{i=1}^n} \def\Pni{(n-1)^{-1}\sum_{j=1}^{n-1}} \]
Basic functions
Code
# Helper function of generate_gb_data ----------------------------
<- function(death_time, nonfatal_time, censor_time) {
time_fun
<- min(death_time, censor_time)
X <- (death_time <= censor_time) + 0
delta if (nonfatal_time < X) {
<- tibble(
data time = c(nonfatal_time, X),
status = c(2, delta)
)else {
} <- tibble(
data time = X,
status = delta
)
}
return(data)
}
# Simulate bivariate Gumbel random variables --------------------
<- function(n, beta0, lambdaH, lambdaD, kappa, lambdaC, fmin, fmax,
generate_gb_data f_Z1 = NULL) {
# Censoring
# Expn(lambdaC) \wedge U(fmin, fmax)
<- pmin(rexp(n, lambdaC), (fmax - fmin) * runif(n) + fmin)
C # Simulate bivariate Gumbel random variables
<- rgumbel(n, alpha = kappa, dim = 2, method = 1)
outcomes # Simulate covariates (Z1 Bernoulli(0.5); Z2 Z3: N(0, 1))
<- tibble(
Z Z1 = rnorm(n, 0, 1),
# Z1 = 4 * runif(n),
Z2 = rnorm(n, -1, 1),
Z3 = rbinom(n, 1, 0.5)
)# This only goes to outcome generation
if (!is.null(f_Z1)) {
<- Z |>
Z_outcomes mutate(
Z1 = f_Z1(Z1)
)else{
} <- Z
Z_outcomes
}
<- as.matrix(Z_outcomes)
Zmat
# Generate outcomes based on conditional dist
<- outcomes[, 1]
U1 <- outcomes[, 2]
U2 <- -log(U1) / (lambdaH * exp(- Zmat %*% beta0))
nonfatal_time <- -log(U2) / (lambdaD * exp(- Zmat %*% beta0))
death_time # Combine data to wide format
<- tibble(
df_wide id = 1:n,
nonfatal_time = as.vector(nonfatal_time),
death_time = as.vector(death_time),
censor_time = C,
Z
)
# Wide to long
<- df_wide |>
df group_by(id) |>
reframe(
ts = time_fun(death_time, nonfatal_time, censor_time)
|>
) unnest(ts) |>
left_join(
tibble(id = 1:n, Z),
by = "id")
df
}
# Model fitting -----------------------------
<- function(df) {
fit_pw_model
<- df$id
id <- df$time
time <- df$status
status <- df[, 4:ncol(df)]
Z
<- pwreg1(id, time, status, Z)
obj
return(obj)
}
# Extract results from fitted models --------------------------------------
## function to extract results from a single model
<- function(obj) {
extract_beta list(
estimate = obj$beta,
std.error = sqrt(diag(obj$Var))
)
}
## Extract results from all models
<- function(simulation_results, by_n = FALSE) {
tidy_beta_est_err
<- simulation_results |>
stats mutate(
statistics = map(results, extract_beta)
|>
) unnest_wider(statistics) |>
unnest_longer(c(estimate, std.error))
if (by_n) {
|>
stats select(n, sim_id,
term = estimate_id,
estimate,
std.error)else {
} |>
stats select(sim_id,
term = estimate_id,
estimate,
std.error)
}
}
# Summarize results across simulated datasets -----------------------------
<- function(stats, beta0, alpha = 0.05) {
summarize_beta
<- qnorm(1 - alpha / 2)
za
<- unique(stats$term)
terms
<- tibble(
beta0_tibble term = terms,
beta0 = beta0
)
|>
stats left_join(beta0_tibble) |>
group_by(term) |>
summarize(
bias = mean(estimate - beta0),
SE = sd(estimate),
SEE = mean(std.error),
coverage = mean((beta0 > estimate - za * std.error) & (beta0 < estimate + za * std.error))
)
}
<- function(stats, beta0, alpha = 0.05) {
summarize_beta_by_n
<- qnorm(1 - alpha / 2)
za
<- unique(stats$term)
terms
<- tibble(
beta0_tibble term = terms,
beta0 = beta0
)
|>
stats left_join(beta0_tibble) |>
group_by(n, term) |>
summarize(
bias = mean(estimate - beta0),
SE = sd(estimate),
SEE = mean(std.error),
coverage = mean((beta0 > estimate - za * std.error) & (beta0 < estimate + za * std.error))
) }
Data generating process
- Covariates: \(Z = (Z_1, Z_2, Z_3)^{\rm T}\) with mutually independent components
- \(Z_1\sim N(0, 1)\)
- \(Z_2\sim N(-1, 1)\)
- \(Z_3\sim \mbox{Bernoulli}(0.5)\)
- Outcome model: \[\begin{equation}\tag{1}
\pr(D > s, T > t\mid Z) =
\exp\left(-\left[\{\exp(\beta^\T Z)\lambda_Ds\}^\kappa + \{\exp(\beta^\T Z)\lambda_Ht\}^\kappa\right]^{1/\kappa}\right)
\end{equation}\]
- \(\Lambda_D = 0.2\)
- \(\Lambda_H = 2\)
- \(\kappa = 2\)
- \(\beta_0 = (1, 0, -0.5)^\T\)
- Censoring: \(\mbox{Un}[1, 4]\wedge\mbox{Expn}(\lambda_C)\)
- \(\lambda_C = 0.2\)
Checking functional form of \(Z_1\)
Five scenarios
Replace \(Z_1\) in (1) with the following five functions:
- Standard (correct): \(Z_1\)
- Absolute value: \(|Z_1|\)
- Negative thresholding: \(\max(Z_1, -1)\)
- Positive thresholding: \(\min(Z_1, 1)\)
- Exponential: \(\exp(Z_1)/2\)
Generate data
For \(n = 100, 200, 500, 1000\), and 2000, generate \(N=2000\) replicates of data with the above five scenarios.
- Generated data saved in
simulated_datasets/data_**.rds
Code
library(WR)
library(gumbel)
library(tidyverse)
source("basic_functions.R")
# Consider 5 scenarios
# 1. Standard: Z1
# 2. Abs: abs(Z1)
# 3. Negative threshold -1
# 4. Positive threshold 1
# 5. Exoponential: exp(-Z1) / 2
<- function(Z1) abs(Z1)
f_Z1_abs <- function(Z1) pmax(-1, Z1)
f_Z1_thres_neg <- function(Z1) pmin(1, Z1)
f_Z1_thres_pos <- function(Z1) exp(Z1) / 2
f_Z1_exp
## Individual experiments
# Fix parameters
<- 2
kappa #hazard rates
<- 2
lambdaH <- 0.2
lambdaD <- 1
lambdaD #censoring
<- 0.2
lambdaC <- 1
fmin <- 4
fmax # beta0=c(0,-0)
<- c(1, 0, -0.5)
beta0
<- 2000
N set.seed(123)
## Function to simulate N datasets with n samples each
<- function(N, n, beta0, lambdaH, lambdaD, kappa, lambdaC, fmin, fmax, f_Z1 = NULL){
simulate_n tibble(
sim_id = 1:N
%>%
) mutate(
data = map(sim_id, ~ generate_gb_data(n, beta0, lambdaH, lambdaD, kappa, lambdaC, fmin, fmax,
f_Z1 = f_Z1)) # Generate datasets
)
}
# Standard ---------------------------------------------------------
<- tibble(
data_standard n = c(100, 200, 500, 1000, 2000),
data = map(n, ~ simulate_n(N, .x, beta0, lambdaH, lambdaD, kappa, lambdaC, fmin, fmax))
)saveRDS(data_standard, "simulated_datasets/data_standard.rds")
rm(data_standard)
# Abs ---------------------------------------------------------
<- tibble(
data_abs n = c(100, 200, 500, 1000, 2000),
data = map(n, ~ simulate_n(N, .x, beta0, lambdaH, lambdaD, kappa, lambdaC, fmin, fmax, f_Z1 = f_Z1_abs))
)saveRDS(data_abs, "simulated_datasets/data_abs.rds")
rm(data_abs)
# Negative threshold -1 ---------------------------------------------------------
<- tibble(
data_neg n = c(100, 200, 500, 1000, 2000),
data = map(n, ~ simulate_n(N, .x, beta0, lambdaH, lambdaD, kappa, lambdaC, fmin, fmax, f_Z1 = f_Z1_thres_neg))
)saveRDS(data_neg, "simulated_datasets/data_neg.rds")
rm(data_neg)
# Positive threshold 1 ---------------------------------------------------------
<- tibble(
data_pos n = c(100, 200, 500, 1000, 2000),
data = map(n, ~ simulate_n(N, .x, beta0, lambdaH, lambdaD, kappa, lambdaC, fmin, fmax, f_Z1 = f_Z1_thres_pos))
)
saveRDS(data_pos, "simulated_datasets/data_pos.rds")
rm(data_pos)
# Exponential ---------------------------------------------------------
<- tibble(
data_exp n = c(100, 200, 500, 1000, 2000),
data = map(n, ~ simulate_n(N, .x, beta0, lambdaH, lambdaD, kappa, lambdaC, fmin, fmax, f_Z1 = f_Z1_exp))
)
saveRDS(data_exp, "simulated_datasets/data_exp.rds")
rm(data_exp)
# {
# dfs <- readRDS("simulated_datasets/data_exp.rds") |>
# filter(n == 500) |>
# unnest(data) |>
# pull(data)
#
# ## fit model
# obj <- fit_pw_model(dfs[[4]])
#
# ## residuals analysis
# resids <- residuals(obj) |>
# select(id, delta, R, M, r) |>
# left_join(obj$Zn, by = "id")
#
# ## plot
# resids |>
# ggplot(aes(x = Z1, y = M)) +
# geom_point() +
# geom_smooth(se = FALSE) +
# theme_minimal()
# }
Fit data and perform residual analysis
- Read the simulated data from
simulated_datasets/data_**.rds
- Fit model to each dataset and save the fitted models in
fitted_models/mod_fit_**.rds
Code
library(WR)
library(gumbel)
library(tidyverse)
source("basic_functions.R")
# Consider 5 scenarios
# 1. Standard: Z1
# 2. Abs: abs(Z1)
# 3. Negative threshold -1
# 4. Positive threshold 1
# 5. Exoponential: exp(-Z1) / 2
<- function(Z1) abs(Z1)
f_Z1_abs <- function(Z1) pmax(-1, Z1)
f_Z1_thres_neg <- function(Z1) pmin(1, Z1)
f_Z1_thres_pos <- function(Z1) exp(Z1) / 2
f_Z1_exp
# Model fitting for corrected specified Z1 --------------------------------
<- readRDS("simulated_datasets/data_standard.rds") |>
mod_fit_standard unnest(data) |>
mutate(
results = map(data, fit_pw_model)
)
saveRDS(mod_fit_standard, "fitted_models/mod_fit_standard.rds")
rm(mod_fit_standard)
# Model fitting abs --------------------------------
<- readRDS("simulated_datasets/data_abs.rds") |>
mod_fit_abs unnest(data) |>
mutate(
results = map(data, fit_pw_model)
)saveRDS(mod_fit_abs, "fitted_models/mod_fit_abs.rds")
rm(mod_fit_abs)
# Model fitting for neg --------------------------------
<- readRDS("simulated_datasets/data_neg.rds") |>
mod_fit_neg unnest(data) |>
mutate(
results = map(data, fit_pw_model)
)saveRDS(mod_fit_neg, "fitted_models/mod_fit_neg.rds")
rm(mod_fit_neg)
# Model fitting for pos --------------------------------
<- readRDS("simulated_datasets/data_pos.rds") |>
mod_fit_pos unnest(data) |>
mutate(
results = map(data, fit_pw_model)
)saveRDS(mod_fit_pos, "fitted_models/mod_fit_pos.rds")
rm(mod_fit_pos)
# Model fitting for exp --------------------------------
<- readRDS("simulated_datasets/data_exp.rds") |>
mod_fit_exp unnest(data) |>
mutate(
results = map(data, fit_pw_model)
)saveRDS(mod_fit_exp, "fitted_models/mod_fit_exp.rds")
rm(mod_fit_exp)