Simulations for residual analysis of win ratio regression

Author

Lu Mao

Published

December 22, 2024

\[ \newcommand{\indep}{\perp \!\!\! \perp} \def\T{{ \mathrm{\scriptscriptstyle T} }} \def\pr{{\rm pr}} \def\d{{\rm d}} \def\W{{\mathcal W}} \def\H{{\mathcal H}} \def\I{{\mathcal I}} \def\E{{\mathcal E}} \def\S{{\mathcal S}} \def\O{{\mathcal O}} \def\Un{{n\choose 2}^{-1}\sum_{i<j}^n\sum} \def\l{(l)} \def\n{{(n)}} \def\v{\varepsilon} \def\bSig\mathbf{\Sigma} \def\Gn{n^{-1/2}\sum_{i=1}^n} \def\Pni{(n-1)^{-1}\sum_{j=1}^{n-1}} \]

library(WR)
library(gumbel)
library(tidyverse)

Basic functions

Code
# Helper function of generate_gb_data ----------------------------
time_fun <- function(death_time, nonfatal_time, censor_time) {

  X <- min(death_time, censor_time)
  delta <- (death_time <= censor_time) + 0
  if (nonfatal_time < X) {
    data <- tibble(
      time = c(nonfatal_time, X),
      status = c(2, delta)
    )
  } else {
    data <- tibble(
      time = X,
      status = delta
    )
  }


  return(data)
}

# Simulate bivariate Gumbel random variables --------------------
generate_gb_data <- function(n, beta0, lambdaH, lambdaD, kappa, lambdaC, fmin, fmax,
                             f_Z1 = NULL) {
  # Censoring
  # Expn(lambdaC) \wedge U(fmin, fmax)
  C <- pmin(rexp(n, lambdaC), (fmax - fmin) * runif(n) + fmin)
  # Simulate bivariate Gumbel random variables
  outcomes <- rgumbel(n, alpha = kappa, dim = 2, method = 1)
  # Simulate covariates (Z1 Bernoulli(0.5); Z2 Z3: N(0, 1))
  Z <- tibble(
    Z1 = rnorm(n, 0, 1),
    # Z1 = 4 * runif(n),
    Z2 = rnorm(n, -1, 1),
    Z3 = rbinom(n, 1, 0.5)
  )
  # This only goes to outcome generation
  if (!is.null(f_Z1)) {
    Z_outcomes <- Z |>
      mutate(
        Z1 = f_Z1(Z1)
      )
  } else{
    Z_outcomes <- Z
  }



  Zmat <- as.matrix(Z_outcomes)

  # Generate outcomes based on conditional dist
  U1 <- outcomes[, 1]
  U2 <- outcomes[, 2]
  nonfatal_time <- -log(U1) / (lambdaH * exp(- Zmat %*% beta0))
  death_time <- -log(U2) / (lambdaD * exp(- Zmat %*% beta0))
  # Combine data to wide format
  df_wide <- tibble(
    id = 1:n,
    nonfatal_time = as.vector(nonfatal_time),
    death_time = as.vector(death_time),
    censor_time = C,
    Z
  )

  # Wide to long
  df <- df_wide |>
    group_by(id) |>
    reframe(
      ts = time_fun(death_time, nonfatal_time, censor_time)
    ) |>
    unnest(ts) |>
    left_join(
      tibble(id = 1:n, Z),
      by = "id")

  df
}

# Model fitting  -----------------------------
fit_pw_model <- function(df) {

  id <- df$id
  time <- df$time
  status <- df$status
  Z <- df[, 4:ncol(df)]

  obj <- pwreg1(id, time, status, Z)

  return(obj)

}


# Extract results from fitted models --------------------------------------

## function to extract results from a single model
extract_beta <- function(obj) {
  list(
    estimate = obj$beta,
    std.error = sqrt(diag(obj$Var))
  )
}

## Extract results from all models
tidy_beta_est_err <- function(simulation_results, by_n = FALSE) {

  stats <- simulation_results |>
    mutate(
      statistics = map(results, extract_beta)
    ) |>
    unnest_wider(statistics) |>
    unnest_longer(c(estimate, std.error))


  if (by_n) {
    stats |>
    select(n, sim_id,
           term = estimate_id,
           estimate,
           std.error)
    } else {
    stats |>
      select(sim_id,
           term = estimate_id,
           estimate,
           std.error)
  }
}


# Summarize results across simulated datasets -----------------------------

summarize_beta <- function(stats, beta0, alpha = 0.05) {

  za <- qnorm(1 - alpha / 2)

  terms <- unique(stats$term)

  beta0_tibble <- tibble(
    term = terms,
    beta0 = beta0
  )

  stats |>
    left_join(beta0_tibble) |>
    group_by(term) |>
    summarize(
      bias = mean(estimate - beta0),
      SE = sd(estimate),
      SEE = mean(std.error),
      coverage = mean((beta0 > estimate - za * std.error) & (beta0 < estimate + za * std.error))
    )
}


summarize_beta_by_n <- function(stats, beta0, alpha = 0.05) {

  za <- qnorm(1 - alpha / 2)

  terms <- unique(stats$term)

  beta0_tibble <- tibble(
    term = terms,
    beta0 = beta0
  )

  stats |>
    left_join(beta0_tibble) |>
    group_by(n, term) |>
    summarize(
      bias = mean(estimate - beta0),
      SE = sd(estimate),
      SEE = mean(std.error),
      coverage = mean((beta0 > estimate - za * std.error) & (beta0 < estimate + za * std.error))
    )
}

Data generating process

  • Covariates: \(Z = (Z_1, Z_2, Z_3)^{\rm T}\) with mutually independent components
    • \(Z_1\sim N(0, 1)\)
    • \(Z_2\sim N(-1, 1)\)
    • \(Z_3\sim \mbox{Bernoulli}(0.5)\)
  • Outcome model: \[\begin{equation}\tag{1} \pr(D > s, T > t\mid Z) = \exp\left(-\left[\{\exp(\beta^\T Z)\lambda_Ds\}^\kappa + \{\exp(\beta^\T Z)\lambda_Ht\}^\kappa\right]^{1/\kappa}\right) \end{equation}\]
    • \(\Lambda_D = 0.2\)
    • \(\Lambda_H = 2\)
    • \(\kappa = 2\)
    • \(\beta_0 = (1, 0, -0.5)^\T\)
  • Censoring: \(\mbox{Un}[1, 4]\wedge\mbox{Expn}(\lambda_C)\)
    • \(\lambda_C = 0.2\)

Checking functional form of \(Z_1\)

Five scenarios

Replace \(Z_1\) in (1) with the following five functions:

  1. Standard (correct): \(Z_1\)
  2. Absolute value: \(|Z_1|\)
  3. Negative thresholding: \(\max(Z_1, -1)\)
  4. Positive thresholding: \(\min(Z_1, 1)\)
  5. Exponential: \(\exp(Z_1)/2\)

Generate data

For \(n = 100, 200, 500, 1000\), and 2000, generate \(N=2000\) replicates of data with the above five scenarios.

  • Generated data saved in simulated_datasets/data_**.rds
Code
library(WR)
library(gumbel)
library(tidyverse)

source("basic_functions.R")
# Consider 5 scenarios
  # 1. Standard: Z1
  # 2. Abs: abs(Z1)
  # 3. Negative threshold -1
  # 4. Positive threshold 1
  # 5. Exoponential: exp(-Z1) / 2

f_Z1_abs <- function(Z1) abs(Z1)
f_Z1_thres_neg <- function(Z1) pmax(-1, Z1)
f_Z1_thres_pos <- function(Z1) pmin(1, Z1)
f_Z1_exp <- function(Z1) exp(Z1) / 2


## Individual experiments
# Fix parameters
kappa <- 2
#hazard rates
lambdaH <- 2
lambdaD <- 0.2
lambdaD <- 1
#censoring
lambdaC <- 0.2
fmin <- 1
fmax <- 4
# beta0=c(0,-0)
beta0 <- c(1, 0, -0.5)

N <- 2000
set.seed(123)
## Function to simulate N datasets with n samples each

simulate_n <- function(N, n, beta0, lambdaH, lambdaD, kappa, lambdaC, fmin, fmax, f_Z1 = NULL){
  tibble(
    sim_id = 1:N
  ) %>%
    mutate(
      data = map(sim_id, ~ generate_gb_data(n, beta0, lambdaH, lambdaD, kappa, lambdaC, fmin, fmax,
                                            f_Z1 = f_Z1)) # Generate datasets
    )
}

# Standard ---------------------------------------------------------
data_standard <- tibble(
  n = c(100, 200, 500, 1000, 2000),
  data = map(n, ~ simulate_n(N, .x, beta0, lambdaH, lambdaD, kappa, lambdaC, fmin, fmax))
)
saveRDS(data_standard, "simulated_datasets/data_standard.rds")
rm(data_standard)

# Abs ---------------------------------------------------------
data_abs <- tibble(
  n = c(100, 200, 500, 1000, 2000),
  data = map(n, ~ simulate_n(N, .x, beta0, lambdaH, lambdaD, kappa, lambdaC, fmin, fmax, f_Z1 = f_Z1_abs))
)
saveRDS(data_abs, "simulated_datasets/data_abs.rds")
rm(data_abs)

# Negative threshold -1 ---------------------------------------------------------
data_neg <- tibble(
  n = c(100, 200, 500, 1000, 2000),
  data = map(n, ~ simulate_n(N, .x, beta0, lambdaH, lambdaD, kappa, lambdaC, fmin, fmax, f_Z1 = f_Z1_thres_neg))
)
saveRDS(data_neg, "simulated_datasets/data_neg.rds")
rm(data_neg)

# Positive threshold 1 ---------------------------------------------------------
data_pos <- tibble(
  n = c(100, 200, 500, 1000, 2000),
  data = map(n, ~ simulate_n(N, .x, beta0, lambdaH, lambdaD, kappa, lambdaC, fmin, fmax, f_Z1 = f_Z1_thres_pos))
)

saveRDS(data_pos, "simulated_datasets/data_pos.rds")
rm(data_pos)

# Exponential ---------------------------------------------------------
data_exp <- tibble(
  n = c(100, 200, 500, 1000, 2000),
  data = map(n, ~ simulate_n(N, .x, beta0, lambdaH, lambdaD, kappa, lambdaC, fmin, fmax, f_Z1 = f_Z1_exp))
)

saveRDS(data_exp, "simulated_datasets/data_exp.rds")
rm(data_exp)


# {
# dfs <- readRDS("simulated_datasets/data_exp.rds") |>
#   filter(n == 500) |>
#   unnest(data) |>
#   pull(data)
#
# ## fit model
# obj <- fit_pw_model(dfs[[4]])
#
# ## residuals analysis
# resids <- residuals(obj) |>
#   select(id, delta, R, M, r) |>
#   left_join(obj$Zn, by = "id")
#
# ## plot
# resids |>
#   ggplot(aes(x = Z1, y = M)) +
#   geom_point() +
#   geom_smooth(se = FALSE) +
#   theme_minimal()
# }

Fit data and perform residual analysis

  • Read the simulated data from simulated_datasets/data_**.rds
  • Fit model to each dataset and save the fitted models in fitted_models/mod_fit_**.rds
Code
library(WR)
library(gumbel)
library(tidyverse)

source("basic_functions.R")
# Consider 5 scenarios
  # 1. Standard: Z1
  # 2. Abs: abs(Z1)
  # 3. Negative threshold -1
  # 4. Positive threshold 1
  # 5. Exoponential: exp(-Z1) / 2

f_Z1_abs <- function(Z1) abs(Z1)
f_Z1_thres_neg <- function(Z1) pmax(-1, Z1)
f_Z1_thres_pos <- function(Z1) pmin(1, Z1)
f_Z1_exp <- function(Z1) exp(Z1) / 2


# Model fitting for corrected specified Z1 --------------------------------

mod_fit_standard <- readRDS("simulated_datasets/data_standard.rds") |>
  unnest(data) |>
  mutate(
    results = map(data, fit_pw_model)
  )

saveRDS(mod_fit_standard, "fitted_models/mod_fit_standard.rds")
rm(mod_fit_standard)

# Model fitting abs --------------------------------
mod_fit_abs <- readRDS("simulated_datasets/data_abs.rds") |>
  unnest(data) |>
  mutate(
    results = map(data, fit_pw_model)
    )
saveRDS(mod_fit_abs, "fitted_models/mod_fit_abs.rds")
rm(mod_fit_abs)

# Model fitting for neg --------------------------------
mod_fit_neg <- readRDS("simulated_datasets/data_neg.rds") |>
  unnest(data) |>
  mutate(
    results = map(data, fit_pw_model)
    )
saveRDS(mod_fit_neg, "fitted_models/mod_fit_neg.rds")
rm(mod_fit_neg)
# Model fitting for pos --------------------------------
mod_fit_pos <- readRDS("simulated_datasets/data_pos.rds") |>
  unnest(data) |>
  mutate(
    results = map(data, fit_pw_model)
    )
saveRDS(mod_fit_pos, "fitted_models/mod_fit_pos.rds")
rm(mod_fit_pos)

# Model fitting for exp --------------------------------
mod_fit_exp <- readRDS("simulated_datasets/data_exp.rds") |>
  unnest(data) |>
  mutate(
    results = map(data, fit_pw_model)
    )
saveRDS(mod_fit_exp, "fitted_models/mod_fit_exp.rds")
rm(mod_fit_exp)

Check estimation/inference in correct model