Regularized win ratio regression for variable selection and risk prediction

Simulation studies and a real-world application

Author

Lu Mao

Published

January 15, 2025

\[ \newcommand{\wh}{\widehat} \newcommand{\wt}{\widetilde} \def\bs{\boldsymbol} \newcommand{\indep}{\perp \!\!\! \perp} \def\T{{ \mathrm{\scriptscriptstyle T} }} \def\pr{{\rm pr}} \def\d{{\rm d}} \def\W{{\mathcal W}} \]

library(WR)
library(survival)
library(tidyverse) # For data wrangling and visualization
library(knitr)
library(glmnet)
library(patchwork)
library(rmt)

This document provides a step-by-step guide to reproduce the numerical results in the simulation studies (Section 3) and the real data analysis (Section 4) of the manuscript.

Basic functions

Function definitions
# Get minimum of a vector; return Inf if the vector is NULL or all elements are NA
min_na_inf <- function(x){
  if (is.null(x) || all(is.na(x))){
    # Return Inf if input is NULL or all elements are NA
    return(Inf)
  } else {
    # Otherwise, return the minimum value, ignoring NA
    return(min(x, na.rm = TRUE))
  }
}


# Extract first time where status == k by time t
# Returns Inf if no such time exists
# Arguments:
# - time: Vector of time points
# - status: Vector of statuses corresponding to time points
# - k: The specific status of interest
# - t: Upper limit of time (default Inf)
extract_time_with_status <- function(time, status, k, t = Inf){
  # Filter times where status equals k and time is within the limit, then find the minimum
  time[status == k & time <= t] |> min_na_inf()
}
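For example, with the usual coding (1 = death, 2 = nonfatal event, 0 = censoring), a hypothetical two-row record with a nonfatal event at \(t = 1\) and death at \(t = 4\) gives:

time <- c(1, 4)
status <- c(2, 1)
extract_time_with_status(time, status, k = 1) # 4 (first death time)
extract_time_with_status(time, status, k = 2) # 1 (first nonfatal time)
extract_time_with_status(time, status, k = 0) # Inf (no such status)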


# Transform outcomes data by standard win function (Pocock's setting)
# Groups data by subject and converts long format to wide format
# Then computes pairwise wins and losses
poc_win <- function(df_y, n){
  df_y_wide <- df_y |>
    group_by(id) |>
    reframe(
      results = long_to_wide_death_nf(time, status)
    ) |>
    mutate(
      names = rep(c("death_time", "nonfatal_time", "fu_time"), n)
    ) |>
    pivot_wider(
      names_from = names,
      values_from = results
    )

  # Generate all pairwise combinations
  pairwise_tibble <- crossing(i = df_y_wide, j = df_y_wide) %>%
    filter(i |> row_number() < j |> row_number()) |>
    unnest(c(i, j), names_sep = "_")


  # Compute pairwise wins and losses
  pw_win <- pairwise_tibble |>
    mutate(
      win_death = case_when(
        j_death_time < pmin(i_death_time, i_fu_time) ~ 1,
        i_death_time < pmin(j_death_time, j_fu_time) ~ -1,
        TRUE ~ 0
      ),
      win_nonfatal = if_else(win_death == 0, case_when(
        j_nonfatal_time < pmin(i_nonfatal_time, i_fu_time) ~ 1,
        i_nonfatal_time < pmin(j_nonfatal_time, j_fu_time) ~ -1,
        TRUE ~ 0
      ), 0),
      win = win_death + win_nonfatal,
      component = case_when(
        win_death != 0 ~ "death",
        win_nonfatal != 0 ~ "nonfatal",
        TRUE ~ "tie"
      )
    ) |>
    select(
      id_i = i_id,
      id_j = j_id,
      win,
      component
    ) |>
    filter(win != 0) # Remove pairs with win == 0

  # Remove pairs with win == 0 from the pairwise tibble as well,
  # by left-joining with the non-tied pairs in pw_win
  pairwise_tibble <- pw_win |> select(id_i, id_j) |>
    left_join(pairwise_tibble, by = c("id_i" = "i_id", "id_j" = "j_id"))

  list(pairwise_tibble = pairwise_tibble, pw_win = pw_win)
}



# Reorganize long outcomes (death, nonfatal, censor) to wide format
# Returns a vector with death time, nonfatal event time, and follow-up time
long_to_wide_death_nf <- function(time, status, deathcode = 1, nfcode = 2, t = Inf){

  fu_time <- min(max(time), t)
  death_time <- extract_time_with_status(time, status, deathcode, fu_time)
  nonfatal_time <- extract_time_with_status(time, status, nfcode, fu_time)

  c(death_time, nonfatal_time, fu_time)
}

# Transform data from subject-wise long format to pairwise win-loss format
# Includes information on the deciding component (death or nonfatal events)
# Arguments:
# - df: Data frame with (id, time, status) and potentially other covariates
pair_win_loss_transform <- function(df) {
  # Separate outcomes with baseline covariates
  # Covariates
  Zn <- df |>
    group_by(id) |>
    slice(1) |>
    select(-c(time, status))
  # Number of covariates
  p <- ncol(Zn) - 1


  # Flatten outcome data and self-pair --------------------------------------
  # Specific to win function wfun
  # df_y -> pairwise_tibble, pw_win

  df_y <- df |>
    select(id, time, status)


  n <- nrow(Zn)

  poc_win_obj <- poc_win(df_y, n)

  # Extract pairwise data
  pairwise_tibble <- poc_win_obj$pairwise_tibble
  pw_win <- poc_win_obj$pw_win
  # Self-pair covariates
  pw_Z <- crossing(i = Zn, j = Zn) %>%
    filter(i |> row_number() < j |> row_number()) |>
    unnest(c(i, j), names_sep = "_") |>
    # Right-join with pw_win to remove pairs with win == 0
    right_join(pw_win |> select(id_i, id_j),
               by = c("i_id" = "id_i", "j_id" = "id_j"))


  # Compute difference in covariates
  pw_Zd <- pw_Z |>
    select(id_i = i_id, id_j = j_id) |>
    bind_cols(
      # Z_i - Z_j
      pw_Z |> select(starts_with("i_")) |> rename_with(~ str_remove(., "i_")) |> select(-id) -
        pw_Z |> select(starts_with("j_")) |> rename_with(~ str_remove(., "j_")) |> select(-id)
    )

  # Merge Zd with pw_win by id_i and id_j
  pw_df <- pw_win |>
    left_join(pw_Zd, by = c("id_i", "id_j")) |>
    mutate(
      win = win |> as.factor(),
      component = component |> as.factor()
    )

  pw_df
}
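As a quick sanity check, here is a toy run on three hypothetical subjects with one covariate (values made up for illustration); pairs tied on both components are dropped:

# Toy long-format data: subject 1 has a nonfatal event at t = 2 and dies at t = 5;
# subjects 2 and 3 are censored at t = 4 and t = 6
df_toy <- tibble(
  id     = c(1, 1, 2, 3),
  time   = c(2, 5, 4, 6),
  status = c(2, 1, 0, 0),
  z1     = c(0.2, 0.2, -1.0, 0.8)
)
# One row per non-tied pair: win (+1/-1), deciding component, covariate difference
pair_win_loss_transform(df_toy)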

# Split data into training and test sets
# Arguments:
# - df: Data frame with (id, time, status) and potentially other covariates
# - prop: Proportion of data to be used for training
# - strata: Whether to stratify the split based on event types
# Returns a list with training and test data frames
wr_split <- function(df, prop = 0.8, strata = TRUE){
  if (strata){
    out_id_train <- df |>
      group_by(id) |>
      summarize(
        out = case_when(
          any(status == 1) & any(status == 2) ~ 1,
          any(status == 1) ~ 2,
          any(status == 2) ~ 3,
          TRUE ~ 4
        )
      ) |>
      group_by(out) |>
      mutate(
        split = sample(c("train", "test"), n(), replace = TRUE, prob = c(prop, 1 - prop))
      ) |>
      ungroup() |>
      select(-out)
  } else {
    out_id_train <- df |>
      distinct(id) |>
      mutate(split = sample(c("train", "test"), n(), replace = TRUE, prob = c(prop, 1 - prop)))
  }

  df_split <- df |>
    inner_join(out_id_train, by = "id")

  list(df_train = df_split |> filter(split == "train") |> select(-split),
       df_test = df_split |> filter(split == "test") |> select(-split))


}


# Main function to fit a penalized win ratio regression with pathwise solution
# Arguments (long format):
# - id, time, status: Subject identifiers, event times, and event statuses
# - Z: Matrix of covariates
# - ...: Additional arguments passed to glmnet (lambda, alpha, etc.)
wrnet <- function(id, time, status, Z, ...){

  df <- tibble(id, time, status, Z)
  # Transform data to pairwise win-loss format
  pw_df <- pair_win_loss_transform(df)

  y <- pw_df$win
  x <- pw_df |> select(-c(id_i, id_j, win, component))

  # Use glmnet for logistic regression

  obj <- glmnet(
    x = x,
    y = y,
    family = "binomial",
    # Constrain intercept = 0
    intercept = FALSE,
    ...
  )

  # Store the subject-level covariate matrix (used later for variable importance)
  obj$Zn <- df |> group_by(id) |> slice(1) |> ungroup() |> select(-c(id, time, status)) |> as.matrix()

  class(obj) <- "wrnet"
  obj
}
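This call fits the pairwise-logistic working model of win ratio regression: for a non-tied pair \((i, j)\),

\[ \pr(i \text{ wins over } j \mid \text{not tied}; Z_i, Z_j) = \frac{\exp\{\beta^\T(Z_i - Z_j)\}}{1 + \exp\{\beta^\T(Z_i - Z_j)\}}, \]

so the win ratio between covariate values \(z_1\) and \(z_0\) is \(\exp\{\beta^\T(z_1 - z_0)\}\). The intercept is constrained to zero because swapping \(i\) and \(j\) flips the sign of the linear predictor, which forces \(\pr(i \text{ wins}) = 1 - \pr(j \text{ wins})\).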

# Create default lambda sequence for regularization
# Arguments:
# - id, time, status: Subject identifiers, event times, and statuses
# - Z: Covariate matrix
create_default_lambda <- function(id, time, status, Z){
  Z <- as.matrix(Z)
  df <- tibble(id, time, status, Z)
  n <- length(unique(id))
  p <- ncol(Z)
  # Transform data to pairwise win-loss format
  pw_df <- pair_win_loss_transform(df)
  y <- pw_df$win
  x <- pw_df |> select(-c(id_i, id_j, win, component))
  x_scaled <- scale(x)
  N <- nrow(x)

  lambda.max <- max(abs(1 / N * t(x_scaled) %*% (1/N - (y == unique(y)[1]))), na.rm = TRUE)
  lambda_min_ratio <- ifelse(n <= p, 0.01, 0.0001)
  lambda <- lambda.max * lambda_min_ratio^seq(0, 1, length.out = 100)


  return(lambda)

}
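The sequence is geometric (equally spaced on the log scale), mimicking glmnet's default: with ratio \(r = 0.01\) if \(n \le p\) and \(r = 10^{-4}\) otherwise,

\[ \lambda_m = \lambda_{\max}\, r^{(m-1)/99}, \qquad m = 1, \ldots, 100, \]

running from \(\lambda_{\max}\), at which all coefficients are zero, down to \(r\lambda_{\max}\).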

# Cross-validation for wrnet
# Arguments:
# - id, time, status: Subject identifiers, event times, and statuses
# - Z: Covariate matrix
# - k: Number of folds
# - strata: Whether to stratify folds based on event types
cv_wrnet <- function(id, time, status, Z, k = 10, strata = TRUE, lambda = NULL, ...){

  # Generate default lambda sequence if not provided
  if (is.null(lambda)){
    lambda <- create_default_lambda(id, time, status, Z)
  }

  df_train <- tibble(id, time, status, Z)

  # Generate folds
  if (strata){
    out_id_train_cv <- df_train |>
      group_by(id) |>
      summarize(
        out = case_when(
          any(status == 1) & any(status == 2) ~ 1,
          any(status == 1) ~ 2,
          any(status == 2) ~ 3,
          TRUE ~ 4
        )
      ) |>
      group_by(out) |>
      mutate(
        fold = sample(1:k, n(), replace = TRUE)
      ) |>
      ungroup() |>
      select(-out)
  } else {
    out_id_train_cv <- df_train |>
      distinct(id) |>
      mutate(fold = sample(1:k, n(), replace = TRUE))
  }

  # Transform each fold to pairwise format
  df_train_cv <- df_train |>
    inner_join(out_id_train_cv, by = "id") |>
    select(fold, everything()) |>
    group_by(fold) |>
    nest() |>
    mutate(
      pw_data = map(data, ~ .x |> pair_win_loss_transform())
    ) |>
    select(-data)

  fit_pw_fold <- function(k, lambda, ...){
    df_train_cv_k <- df_train_cv |> filter(fold != k) |> unnest(pw_data) |> ungroup()
    # Fit regularized logistic regression model
    pw_fit <- glmnet(
      x = df_train_cv_k |> select(-(1:5)) |> as.matrix(),
      y = df_train_cv_k$win,
      family = "binomial",
      # constrain intercept = 0
      intercept = FALSE,
      lambda= lambda,
      ...
    )

    Beta <- as.matrix(coef(pw_fit))[-1, ]

    df_val_cv_k <- df_train_cv |> filter(fold == k) |> unnest(pw_data)|> ungroup()

    pw_win_test_score <- df_val_cv_k |>
      select((1:5)) |>
      bind_cols(
        as_tibble(df_val_cv_k |> select(-(1:5)) |> as.matrix() %*% Beta)
      )

    # Compute concordance index
    cind_lambda <- pw_win_test_score |>
      mutate(
        win = case_when(
          win == "1" ~ 1,
          win == "-1" ~ -1
        ),
        across(-c(fold, id_i,id_j,win,component), \(x) if_else(x == 0, 0.5, sign(x) * win == 1))
      ) |>
      summarize(
        across(-c(fold, id_i,id_j,win,component), mean)
      ) |>
      pivot_longer(
        everything(),
        names_to = "lambda_name",
        values_to = "concordance"
      ) |>
      mutate(
        lambda = pw_fit$lambda
      )

    cind_lambda
  }


  fit_pw_fold_param <- function(k) fit_pw_fold(k, lambda, ...)


  cv_results <- tibble(
    fold = 1:k,
    cind_lambda = map(fold, ~ fit_pw_fold_param(.x))
  )


  cv_results_expanded <- cv_results |>
    unnest(cind_lambda) |>
    group_by(lambda) |>
    summarize(
      concordance = mean(concordance)
    )

  cv_results_expanded

}
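For each held-out fold, the validation metric at each \(\lambda\) is the win-loss concordance over the fold's non-tied pairs \(\W_k\):

\[ \wh C_k(\lambda) = \frac{1}{|\W_k|}\sum_{(i,j)\in\W_k}\left[1\left\{W_{ij}\,\wh\beta_\lambda^\T(Z_i - Z_j) > 0\right\} + 0.5 \times 1\left\{\wh\beta_\lambda^\T(Z_i - Z_j) = 0\right\}\right], \]

where \(W_{ij} = \pm 1\) is the observed win indicator. This is what the across() step above computes, crediting 0.5 to zero scores; the concordance curves are then averaged over the folds.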

# Test the fitted wrnet model on test data
# Arguments:
# - fit: Fitted wrnet object
# - df_test: Test dataset with (id, time, status, covariates)
test_wrnet <- function(fit, df_test){



  # Extract beta
  beta <- as.matrix(fit$beta)

  Zn_test <- df_test |> group_by(id) |> slice(1) |> select(-c(time, status)) |> ungroup()
  score_test <- tibble(
    id = Zn_test$id,
    win_score = Zn_test |> select(-id) |> as.matrix() %*% beta |> as.vector(),
    risk_score = - win_score
  )

  pw_score_test <- crossing(i = score_test, j = score_test) %>%
    filter(i |> row_number() < j |> row_number()) |>
    unnest(c(i, j), names_sep = "_") |>
    select(
      id_i = i_id,
      id_j = j_id,
      win_score_i = i_win_score,
      win_score_j = j_win_score
    )




  df_y_test <- df_test |>
    select(id, time, status)
  n_test <- nrow(Zn_test)

  poc_win_obj <- poc_win(df_y_test, n_test)

  # Extract pairwise data
  pw_win_test <- poc_win_obj$pw_win

  # Merge win-loss outcomes with risk scores
  pw_win_score_test <- pw_win_test |>
    left_join(pw_score_test, by = c("id_i" = "id_i", "id_j" = "id_j"))

  # Overall and component-wise concordance
  concordance <- pw_win_score_test |>
    bind_rows(
      pw_win_score_test |> mutate(component = "overall")
    ) |>
    mutate(
      score_diff = win_score_i - win_score_j,
      concord = if_else(score_diff == 0, 0.5, sign(score_diff)  == win)
    ) |>
    group_by(component) |>
    summarize(
      concordance = mean(concord, na.rm = TRUE)
    )

  list(concordance = concordance, score_test = score_test, pw_win_score_test = pw_win_score_test)
}

# Computes variable importance based on standardized coefficients
# Arguments:
# - fit: Fitted wrnet object
vi_wrnet <- function(fit){

  beta <- fit$beta
  Zn <- fit$Zn

  sds <- apply(Zn, 2, sd)
  importance <- abs(beta * sds)

  obj <- tibble(
    vars = rownames(beta),
    importance = as.vector(importance)
  ) |>
  arrange(
    desc(importance)
  ) |>
    filter(
      importance > 0
    )

  class(obj) <- "vi_wrnet"

  obj
}
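The importance measure is the coefficient in per-SD units of its covariate,

\[ \mathrm{VI}_k = |\wh\beta_k| \times \mathrm{sd}(Z_{\cdot k}), \]

which puts covariates measured on different scales on a comparable footing; unselected variables (zero coefficients) are dropped.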

# Generic for variable importance plots (dispatches to vip.vi_wrnet below;
# defined here since no vip generic is otherwise loaded)
vip <- function(x, ...) UseMethod("vip")

# Plot top k variables based on variable importance
# Arguments:
# - x: Variable importance object (vi_wrnet)
# - k: Number of top variables to display (default is top 10, or all if fewer than 20)
vip.vi_wrnet <- function(x, k = NULL){


  vip_tbl <- tibble(
      vars = x$vars,
      importance = x$importance
    )

  m <- nrow(vip_tbl)

  if (is.null(k)){
    k <- ifelse(m <= 20, m, 10)
  } else {
    k <- min(k, m)
  }

  vip_tbl |>
    head(k) |>
    ggplot(
      aes(y = reorder(vars, importance), x = importance)
    ) +
    geom_col() +
    labs(
      x = "Importance",
      y = NULL
    ) +
    theme_minimal()


}

logit <- function(x) {

  x <- ifelse(x == 1,
              1 - 1e-12,
              x)
  x <- ifelse(x == 0,
              1e-12,
              x)
  # return results
  log(x / (1 - x))
}



# A streamlined function to automate the whole process:
# Data partitioning -> CV -> Final model -> Test
wrnet_pipeline <- function(df,  k = 10, prop = 0.8, lambda = NULL, cox = FALSE, ...){

  # Split data into training and test sets
  split <- wr_split(df, prop = prop)
  df_train <- split$df_train
  df_test <- split$df_test

  # Cross-validation
  cv_results <- cv_wrnet(df_train$id, df_train$time, df_train$status, df_train |> select(-c(id, time, status)),
                         k = k, lambda = lambda, ...)

  # Fit final model
  lambda_final <- cv_results |>
    filter(concordance == max(concordance)) |>
    pull(lambda)

  fit <- wrnet(df_train$id, df_train$time, df_train$status, df_train |> select(-c(id, time, status)),
               lambda = lambda_final, ...)

  beta <- as.vector(fit$beta)
  varnames <- rownames(fit$beta)
  # Test the model
  test_results <- test_wrnet(fit, df_test)
  concordance <- test_results$concordance

  result <- list(varnames = varnames, beta = beta, concordance = concordance)

  if (cox) {
    # Fit regularized Cox model by glmnet
    # Subset to first events
    df_train_tfe <- df_train |>
      group_by(id) |>
      arrange(time) |>
      slice(1) |>
      mutate(
        status = status > 0,
        time = if_else(time <= 0, 0.0001, time) # fix 0 times
      ) |>
      ungroup()
    # Fit regularized cox regression model
    # 10-fold CV
    cox_cvfit <- cv.glmnet(
      x = df_train_tfe |> select(-c(id, time, status)) |> as.matrix(),
      y = df_train_tfe |> select(time, status) |> as.matrix(),
      family = "cox",
      type.measure = "C", # C-index for CV
      ...
    )

    cox_beta <- as.vector(coef(cox_cvfit, s = "lambda.min"))
    # Trick test_wrnet() to get risk score for Cox model
    cox_cvfit$beta <-  - cox_beta
    # Test the final model on test set
    cox_test_result <- cox_cvfit |> test_wrnet(df_test)
    # Test C-index
    cox_concordance <- cox_test_result$concordance

    result$cox_beta <- cox_beta
    result$cox_concordance <- cox_concordance

  }

  result
}
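As a usage sketch, the pipeline can be run in one call on the gbc data from the WR package (the same data used in the minimal reproducible example below); cox = TRUE adds the regularized Cox benchmark:

# Sketch: one-call run on gbc (assumes the functions above are sourced)
library(WR)
data("gbc")
set.seed(123)
res <- wrnet_pipeline(gbc, k = 10, prop = 0.8, cox = TRUE)
res$concordance     # test C-indices, win ratio model
res$cox_concordance # test C-indices, Cox benchmark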



# Functions for simulation studies ----------------------------------------

# Generate AR(1) correlation matrix
ar1_corr_matrix <- function(p, rho) {
  matrix(rho^abs(row(matrix(1:p, p, p)) - col(matrix(1:p, p, p))), nrow = p)
}

# Simulate n-sample from AR(1) correlation matrix
simu_Z_ar1 <- function(n, p, rho) {
  Sigma <- ar1_corr_matrix(p, rho)
  MASS::mvrnorm(n = n, mu = rep(0, p), Sigma = Sigma)
}
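The entries of this correlation matrix are

\[ \Sigma_{jk} = \rho^{|j - k|}, \qquad j, k = 1, \ldots, p, \]

so, for example, ar1_corr_matrix(3, 0.1) has unit diagonal and off-diagonal entries 0.1 and 0.01.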

# Simulate bivariate Gumbel random variables --------------------
generate_gb_ar1Z_data <- function(n, betaD0, betaH0,rho, lambdaH, lambdaD, kappa, lambdaC, fmin, fmax) {
  # Censoring
  # Expn(lambdaC) \wedge U(fmin, fmax)
  C <- pmin(rexp(n, lambdaC), (fmax - fmin) * runif(n) + fmin)
  # Simulate bivariate Gumbel random variables
  outcomes <- rgumbel(n, alpha = kappa, dim = 2, method = 1)

  # Simulate covariates: p-dimensional N(0, 1) with AR(1) correlation
  p <- length(betaD0)
  Z <- as_tibble(simu_Z_ar1(n, p, rho))

  Zmat <- as.matrix(Z)

  # Generate outcomes based on conditional dist
  U1 <- outcomes[, 1]
  U2 <- outcomes[, 2]
  nonfatal_time <- -log(U1) / (lambdaH * exp(- Zmat %*% betaH0))
  death_time <- -log(U2) / (lambdaD * exp(- Zmat %*% betaD0))
  # Combine data to wide format
  df_wide <- tibble(
    id = 1:n,
    nonfatal_time = as.vector(nonfatal_time),
    death_time = as.vector(death_time),
    censor_time = C,
    Z
  )

  # Wide to long
  df <- df_wide |>
    group_by(id) |>
    reframe(
      ts = time_fun(death_time, nonfatal_time, censor_time)
    ) |>
    unnest(ts) |>
    left_join(
      tibble(id = 1:n, Z),
      by = "id")

  df
}



# Helper function of generate_gb_data ----------------------------
time_fun <- function(death_time, nonfatal_time, censor_time) {

  X <- min(death_time, censor_time)
  delta <- (death_time <= censor_time) + 0
  if (nonfatal_time < X) {
    data <- tibble(
      time = c(nonfatal_time, X),
      status = c(2, delta)
    )
  } else {
    data <- tibble(
      time = X,
      status = delta
    )
  }

  return(data)
}

Simulation Studies

Simulation set-up

  • Covariates: \(z = (z_{\cdot 1}, z_{\cdot 2}, \ldots, z_{\cdot p})^\T\)
    • \(z_{\cdot k} \sim N(0,1)\)
    • Correlation structure: AR(1) with \(\rho = 0.1\)
  • Outcome model (its exponential margins are inverted for sampling; see the note after this list): \[\begin{equation}\tag{1} \pr(D > s, T > t\mid z) = \exp\left(-\left[\{\exp(-\beta_D^\T z)\lambda_Ds\}^\kappa + \{\exp(-\beta_H^\T z)\lambda_Ht\}^\kappa\right]^{1/\kappa}\right), \end{equation}\]
    • \(\lambda_D = 0.1\)
    • \(\lambda_H = 1\)
    • \(\kappa = 1.25\)
  • Censoring: \(\mbox{Unif}[0.2, 4]\wedge\mbox{Expn}(\lambda_C)\)
    • \(\lambda_C = 0.02\)
  • Scenarios
    1. Same effect pattern on components \[ \beta_D=\beta_H=(\underbrace{0.5,\ldots, 0.5}_{10}, \underbrace{0,\ldots, 0}_{p - 10}). \]
    2. Different effect patterns on components \[\begin{align*} \beta_D&=(\underbrace{0.75,\ldots, 0.75}_{5}, \underbrace{0,\ldots, 0}_{p - 5});\\ \beta_H&=(\underbrace{0,\ldots, 0}_{5}, \underbrace{0.75,\ldots, 0.75}_{5}, \underbrace{0,\ldots, 0}_{p - 10}) \end{align*}\]
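Setting \(t = 0\) (resp. \(s = 0\)) in model (1) yields exponential margins \(\pr(D > s\mid z) = \exp\{-\exp(-\beta_D^\T z)\lambda_D s\}\) and \(\pr(T > t\mid z) = \exp\{-\exp(-\beta_H^\T z)\lambda_H t\}\), linked by a Gumbel-Hougaard copula with parameter \(\kappa\). The data generator therefore draws copula-linked uniforms \((U_1, U_2)\) via rgumbel() and inverts each margin,

\[ T = \frac{-\log U_1}{\lambda_H\exp(-\beta_H^\T z)}, \qquad D = \frac{-\log U_2}{\lambda_D\exp(-\beta_D^\T z)}, \]

exactly as in the nonfatal_time and death_time lines of generate_gb_ar1Z_data().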
Model parameter specification
## Individual experiments
# Fix parameters
kappa <- 1.25
#hazard rates
lambdaH <- 1
lambdaD <- 0.1
#censoring
lambdaC <- 0.02
fmin <- 0.2
fmax <- 4
# AR1 correlation among covariates
rho <- 0.1
Simulation data generation
library(WR)
library(gumbel)
library(tidyverse)

source("R code/basic_functions.R")
source("R code/model_parameter_specification.R")

N <- 1000
set.seed(123)


## Function to simulate N datasets with n samples each
simulate_n <- function(N, n, betaD0, betaH0,rho, lambdaH, lambdaD, kappa, lambdaC, fmin, fmax){
  tibble(
    sim_id = 1:N
  ) %>%
    mutate(
      data = map(sim_id, ~ generate_gb_ar1Z_data (n, betaD0, betaH0,rho, lambdaH, lambdaD, kappa, lambdaC, fmin, fmax)) # Generate datasets
    )
}


p <- 20

# Generate data with different sample size n
n_list <- c(200, 500, 1000, 2000)
# ========= Two scenarios ==============

# Scenario 1: Same effect pattern for death and nonfatal ------------------

ptt0 <- c(rep(1, 10), rep(0, p - 10))
betaD0 <- 0.5 * ptt0
betaH0 <- 0.5 * ptt0

# Generate and save data
for (n in n_list){
  data <- simulate_n(N, n, betaD0, betaH0, rho, lambdaH, lambdaD, kappa, lambdaC, fmin, fmax)
  saveRDS(data, paste0("simulated_datasets/s1_data_n", n, ".rds"))
  rm(data)
}

# Scenario 2: Different effect pattern for death and nonfatal --------------
ptt1 <- c(rep(1, 5), rep(0, p-5))
ptt2 <- c(rep(0, 5), rep(1, 5), rep(0, p-10))
betaD0 <- 0.75 * ptt1
betaH0 <- 0.75 * ptt2


# Generate and save data
for (n in n_list){
  data <- simulate_n(N, n, betaD0, betaH0, rho, lambdaH, lambdaD, kappa, lambdaC, fmin, fmax)
  saveRDS(data, paste0("simulated_datasets/s2_data_n", n, ".rds"))
  rm(data)
}

Table 1: Sensitivity and Specificity

Model fitting
library(WR)
library(tidyverse)
library(glmnet)

source("R code/basic_functions.R")


# Function to fit wrnet and cox safely without throwing error
wrnet_cox_pipeline <- function(x){
  wrnet_pipeline(x, cox = TRUE)
}

safe_wrnet_pipeline <- safely(wrnet_cox_pipeline)

set.seed(123)

options(warn = -1)
for (n in c(200, 500, 1000, 2000)){
  # Scenario 1
  s1dat <- readRDS(paste0("simulated_datasets/s1_data_n", n, ".rds")) |>
    filter(sim_id<=1000)

  # Fit wrnet and cox to each dataset
   s1wrfit <- s1dat |>
    mutate(fit = map(data, safe_wrnet_pipeline)) |>
    unnest_longer(fit) |>
    filter(fit_id == "result") |>
    select(-c(data, fit_id)) |>
    unnest_wider(fit)

   # Save fitted results
  saveRDS(s1wrfit, paste0("fitted_models/s1_wrnet_fit_n", n, ".rds"))

  # Scenario 2
  s2dat <- readRDS(paste0("simulated_datasets/s2_data_n", n, ".rds")) |>
    filter(sim_id<=1000)

  # Fit wrnet and cox to each dataset
  s2wrfit <- s2dat |>
    mutate(fit = map(data, safe_wrnet_pipeline)) |>
    unnest_longer(fit) |>
    filter(fit_id == "result") |>
    select(-c(data, fit_id)) |>
    unnest_wider(fit)
  # Save fitted results
  saveRDS(s2wrfit, paste0("fitted_models/s2_wrnet_fit_n", n, ".rds"))
}
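In the summary below, each covariate's selection rate is averaged over the \(B = 1000\) replicates; since the true signals are \(z_{\cdot 1}, \ldots, z_{\cdot 10}\) in both scenarios,

\[ \wh{\mathrm{Sens}}_k = \frac{1}{B}\sum_{b=1}^B 1\{\wh\beta_k^{(b)} \neq 0\}\quad (k \le 10), \qquad \wh{\mathrm{Spec}}_k = \frac{1}{B}\sum_{b=1}^B 1\{\wh\beta_k^{(b)} = 0\}\quad (k \ge 11), \]

with the corresponding rates for the regularized Cox fits reported in parentheses.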
Summarize sensitivity and specificity data
# Read in the fitted model results
  wrfit <- bind_rows(
    readRDS(paste0("fitted_models/s1_wrnet_fit_n", 200, ".rds")) |> 
      mutate(n = "n = 200", scn = "Scenario 1"),
    readRDS(paste0("fitted_models/s2_wrnet_fit_n", 200, ".rds")) |>
      mutate(n = "n = 200", scn = "Scenario 2"),
    readRDS(paste0("fitted_models/s1_wrnet_fit_n", 500, ".rds")) |>
      mutate(n = "n = 500", scn = "Scenario 1"),
    readRDS(paste0("fitted_models/s2_wrnet_fit_n", 500, ".rds")) |>
      mutate(n = "n = 500", scn = "Scenario 2"),
    readRDS(paste0("fitted_models/s1_wrnet_fit_n", 1000, ".rds")) |>
      mutate(n = "n = 1000", scn = "Scenario 1"),
    readRDS(paste0("fitted_models/s2_wrnet_fit_n", 1000, ".rds")) |>
      mutate(n = "n = 1000", scn = "Scenario 2"),
    readRDS(paste0("fitted_models/s1_wrnet_fit_n", 2000, ".rds")) |>
      mutate(n = "n = 2000", scn = "Scenario 1"),
    readRDS(paste0("fitted_models/s2_wrnet_fit_n", 2000, ".rds")) |>
      mutate(n = "n = 2000", scn = "Scenario 2")
  ) |> 
    mutate(
      n = fct(n)
    )

# Summarize sensitivity/specificity data
sens_spec <-  wrfit |> 
    unnest(c(varnames, beta, cox_beta)) |> 
    select(n, scn, sim_id, varnames, beta, cox_beta) |> 
    mutate(varnames = fct(varnames)) |> 
    group_by(n, scn, varnames) |>
    summarize(
      across(c(beta, cox_beta), ~ mean(.x != 0))
    ) |> 
    pivot_longer(
      cols = c(beta, cox_beta),
      names_to = "model",
      values_to = "selection_rate" 
    ) |> 
    pivot_wider(
      names_from = varnames,
      values_from = selection_rate
    ) |> 
    mutate(
      across(V11:V20, ~ (1 - .x)),
      across(V1:V20, ~ round(.x, 2))
    ) |> 
    group_by(n, scn) |>
    arrange(model) |> 
    summarize(
      across(V1:V20, \(x) str_c(x[model=="beta"], " (", x[model=="cox_beta"],")")) 
    ) |> 
  pivot_longer(
    cols = V1:V20,
    names_to = "varnames",
    values_to = "value"
  ) |> 
  pivot_wider(
    names_from = c(n, scn),
    values_from = value
  )

# Take a look 
sens_spec

# Take sensitivity data
sens <- sens_spec |> 
  filter(varnames %in% c("V1", "V2", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10")) 
# Simplify column names
colnames(sens)[-1] <- c(rep(c("S1", "S2"), 4))
# Relabel variable names
sens$varnames <- c("$z_{\\cdot 1}$", "$z_{\\cdot 2}$", "$z_{\\cdot 3}$", "$z_{\\cdot 4}$", "$z_{\\cdot 5}$", "$z_{\\cdot 6}$", "$z_{\\cdot 7}$", "$z_{\\cdot 8}$", "$z_{\\cdot 9}$", "$z_{\\cdot 10}$")

# Output in latex code
latex_code <- kable(sens, format = "latex", booktabs = TRUE, escape = FALSE)

# Print LaTeX code
cat(latex_code)
kable(sens)

# Take specificity data
spec <- sens_spec |>
  filter(varnames %in% c("V11", "V12", "V13", "V14", "V15", "V16", "V17", "V18", "V19", "V20"))
# Simplify column names
colnames(spec)[-1] <- c(rep(c("S1", "S2"), 4))
# Relabel variable names
spec$varnames <- c("$z_{\\cdot 11}$", "$z_{\\cdot 12}$", "$z_{\\cdot 13}$", "$z_{\\cdot 14}$", "$z_{\\cdot 15}$", "$z_{\\cdot 16}$", "$z_{\\cdot 17}$", "$z_{\\cdot 18}$", "$z_{\\cdot 19}$", "$z_{\\cdot 20}$")

# Output in latex code
latex_code <- kable(spec, format = "latex", booktabs = TRUE, escape = FALSE)

# Print LaTeX code
cat(latex_code)
kable(spec)

Fig. 1: Boxplots of C-indices

Plot C-indices
# Create data for concordance plot
cind_plot <- wrfit |> 
  unnest(c(concordance, cox_concordance),
         names_sep = "_") |> 
  select(
    n, scn, sim_id, component = concordance_component,
    wrnet = concordance_concordance,
    cox = cox_concordance_concordance
  ) |> 
  pivot_longer(
    cols = c(wrnet, cox),
    names_to = "model",
    values_to = "concordance"
  ) |> 
  mutate(
    component = factor(component, levels = c("overall", "death", "nonfatal"),
                    labels = c("Overall", "Death", "Nonfatal")),
    comp_x = case_when(
      component == "Overall" ~ 1,
      component == "Death" ~ 1.8,
      component == "Nonfatal" ~ 2.2
    ),
    model = factor(model, levels = c("wrnet", "cox"), labels = c("Win ratio", "Cox"))     
    )


# Create the plot
cind_plot |> 
  ggplot(aes(x = comp_x, y = concordance, fill = model)) +
  geom_boxplot(data = cind_plot |> filter(component == "Overall")) +
  geom_boxplot(data = cind_plot |> filter(component != "Overall"), aes(group = interaction(component, model))) +
  facet_grid(n ~ scn, scales = "free_y") +
  scale_x_continuous(limits = c(0.5, 2.5), breaks = c(1, 1.8, 2.2),
                     labels = c("Overall", "Death", "Nonfatal")) +
  # scale_y_continuous() +
  # scale_y_continuous(limits = c(0.1, 1.1)) +
  scale_fill_manual(values = c("#4DAF4A", "#FF7F00")) +
  coord_cartesian(ylim = c(0.45, 0.9)) +
  theme_minimal() +
  labs(
    x = NULL,
    y = "Concordance (Test)",
    fill = NULL
  ) +
  theme(
    legend.position = "top",
    # axis.text.x = element_text(size = 11),
    panel.grid.minor.x = element_blank(),
    # panel.grid.minor.y = element_blank()
  )

# Save the plot
ggsave("figs/fig_cind.png", width = 7.5, height = 9.2)
ggsave("figs/fig_cind.eps", width = 7.5, height = 9.2)

Reproducible code example

Minimal reproducible example
library(tidyverse) # For data wrangling and visualization
library(WR) # To get the gbc data for illustration
source("R code/basic_functions.R")

# Load data
data("gbc")
df <- gbc
df
# Data partitioning
set.seed(123)
obj_split <- df |> wr_split()
# Take training and test set
df_train <- obj_split$df_train
df_test <- obj_split$df_test

# 10-fold CV
set.seed(1234)
obj_cv <- cv_wrnet(df_train$id, df_train$time, df_train$status, df_train |> select(-c(id, time, status)))
# Plot CV results (C-index vs log-lambda)
obj_cv |> 
  ggplot(
    aes(x =  log(lambda), y = concordance)
  ) +
  geom_point() +
  geom_line()
# Optimal lambda
lambda_opt <- obj_cv$lambda[which.max(obj_cv$concordance)]
# Final model
final_fit <- wrnet(df_train$id, df_train$time, df_train$status, df_train |> select(-c(id, time, status)), lambda = lambda_opt)
# Estimated coefficients
final_fit$beta
#> 8 x 1 sparse Matrix of class "dgCMatrix"
#>                      s0
#> hormone     0.306026364
#> age         0.003111462
#> menopause   .          
#> size       -0.007720497
#> grade      -0.285511701
#> nodes      -0.082227827
#> prog_recp   0.001861367
#> estrg_recp  .     
# Variable importance plot
final_fit |> 
  vi_wrnet() |> 
  vip()

# Test the final model on test set ----------------------------
test_result <- final_fit |> test_wrnet(df_test)
# Overall and event-specific C-indices
test_result$concordance
#> # A tibble: 3 × 2
#>   component concordance
#>   <chr>           <dbl>
#> 1 death           0.724
#> 2 nonfatal        0.607
#> 3 overall         0.664

A Real Example

Data preparation

Load and clean data on entire cohort
# Load the data
# Dataset non-public
# Can file a request to NHLBI BioLINCC: https://biolincc.nhlbi.nih.gov/studies/hf-action/
tmp0 <- read_csv("HF-ACTION/hf_action_base.csv",
                 na = "NA")  
# Clean up data
df0 <- tmp0 |> 
  mutate(
    patid = factor(patid),
    across(c(creatnc,totcholc,hema1cc,hemogloc), parse_number),
    sex = factor(sex, levels = c(1, 2), labels = c("Male", "Female")),
    racec = factor(racec, levels = c(1, 2, 3, 4), labels = c("Non-White", "White", "Non-White", "Non-White")),
    trt_ab = factor(trt_ab, levels = c(0, 1), labels = c("UC", "Training")),
    nyhacl = factor(nyhacl, levels = c(2, 3, 4), labels = c("II", "III", "IV")),
    etiology = factor(etiology, levels = c(1, 2), labels = c("Ischemic", "Non-ischemic")),
    cpxdur = pmin(cpxdur, 20),
    sixmwlkd = sixmwlkd / 100
  )|> 
  select(!where(is.character))

# Convert variables to numeric
df <- df0 |> 
  select(- c(dthhosp, cechfcmp, dhfu, cechfcfu, death, deathfu, ceccvcmp, ceccvcfu,
             cpct3m:peakvo12, # post-randomization measurements
             raceblack, raceother, nyhacl, egfrcat, bmicat,  ecgcond,
              cpxdays3,
             com_use.x:GENDER.y)) |> 
  mutate(
    female = (sex == "Female"),
    nonwhite = (racec == "Non-White"),
    trt_ab = (trt_ab == "Training"),
    etiology = (etiology == "Ischemic"),
  ) |> 
  select(- c(sex, racec)) |> 
  # impute missing values by median
  mutate(across(where(is.numeric), ~ ifelse(is.na(.), median(., na.rm = TRUE), .)))

# This dataset contains all outcomes and covariates for the entire cohort
saveRDS(df, "df.rds")
Subset data to high-risk cohort
# Read in the full dataset
df <- readRDS("df.rds")
# Load hfaction data from rmt package
data("hfaction")
# Obtain patient IDs 
df_sub <- hfaction |> group_by(patid) |> slice(1) |> ungroup() |> select(patid) 
# Retrieve data for high-risk cohort
df1 <- df |> inner_join(df_sub, by = "patid")
## This is the analysis dataset
df <- df1 
Data summary
# Number of unique patients
n_distinct(df$patid)
#> [1] 426
p <- ncol(df) - 3 # number of candidate covariates (excluding patid, time, status)
# Outcome summary
df |> 
  group_by(patid) |> 
  summarize( # Subject-level characteristics
    death = any(status == 1),
    hosp = any(status == 2),
    comp = any(status >0),
    fu_time = max(time)
  ) |> 
  summarize( # Summarize across sample
    # N (%) of patients with each outcome
    across(c(death, hosp, comp), ~ paste0(sum(.x), " (", round(mean(.x) * 100, 1), "%)")),
    # Median follow-up time in years
    fu_time = median(fu_time) / 365.25
  )

Data partitioning

Partitioning into training and testing sets
## Partition data into training versus test sets
## stratified by outcome status:
## 1. Death + hosp; 2. Death only; 3. Hosp only; 4. None
# Compute outcome status
out_id <- df |> 
  group_by(patid) |>
  summarize(
    out = case_when(
      any(status == 1) & any(status == 2) ~ 1,
      any(status == 1) ~ 2,
      any(status == 2) ~ 3,
      TRUE ~ 4
    )
  )
# Sample 80% of the data for training by each outcome status
set.seed(123)
out_id_train <- out_id |> 
  group_by(out) |> 
  sample_frac(0.8) 

# Summarize outcome status in training data
out_id_train |> count(out) |> 
  mutate(prop = n / sum(n))

# Partition data
df_train <- df |> 
  inner_join(out_id_train, by = "patid") |> select(-out)
df_test <- df |> 
  anti_join(df_train, by = "patid") |> rename(id = patid)

# Summarize outcome status in test data
out_id |> 
  anti_join(df_train, by = "patid") |> 
  count(out) |> 
  mutate(prop = n / sum(n))

Model training, cross validation, and testing

Train and test model
source("R code/basic_functions.R")

# 10-fold CV
set.seed(1234)
# Range of penalty parameter
lambda_range <- exp(seq(log(0.1), log(0.0002), length.out = 100))
# Run cross validation
obj_cv <- cv_wrnet(df_train$patid, df_train$time, df_train$status, df_train |> select(-c(patid, time, status)), lambda = lambda_range)

# Plot CV results (C-index vs log-lambda)
obj_cv |> 
  ggplot(
    aes(x =  log(lambda), y = concordance)
  ) +
  geom_point() +
  geom_line() 

# Optimal lambda
lambda_opt <- obj_cv$lambda[which.max(obj_cv$concordance)]
lambda_opt
#> [1] 0.04152671
# Fit final model with optimal lambda
final_fit <- wrnet(df_train$patid, df_train$time, df_train$status, df_train |> select(-c(patid, time, status)), lambda = lambda_opt)

# Test the final model on test set
test_result <- final_fit |> test_wrnet(df_test)
# Test C-index
test_result$concordance


# Variable importance
final_fit |> 
  vi_wrnet() |> 
  vip()

Fig. 2: Proper vs naive CV via pairwise logistic model

Naive CV based on pairwise logistic regression
# Transform training data into pairwise format
pw_win_train <- pair_win_loss_transform(df_train |> rename(id = patid))

# Improper (naive) CV: folds split the pairs rather than the subjects,
# so pairs sharing a subject appear in both training and validation folds
# (this will take some time)
set.seed(1234)
pw_cv_fit <- cv.glmnet(
  x = pw_win_train |> select(-(1:4)) |> as.matrix(),
  y = pw_win_train$win,
  family = "binomial",
  type.measure = "class",
  # constrain intercept = 0
  intercept = FALSE,
  lambda = lambda_range
)

# saveRDS(pw_cv_fit, "naive_cv.rds")
# plot(pw_cv_fit)

# Collect CV data from naive and proper methods for comparison
# Naive
cv_errors_naive <- tibble(
  lambda = pw_cv_fit$lambda,
  accuracy = 1 - pw_cv_fit$cvm,
  method = "Naive"
)
# Proper
cv_errors_proper <- tibble(
  lambda = obj_cv$lambda,
  accuracy = obj_cv$concordance,
  method = "Proper"
)

# Optimal concordance from proper CV
max_concordance <- max(obj_cv$concordance)
# Transformation function for second y-axis (for improper CV)
lin_trans <- function(x, scale){
  0.5 + scale * (x - 0.5)
}
# Create plot
cv_errors_proper |> 
  ggplot(aes(x = log(lambda), y = accuracy, color = method)) +
  geom_point() +
  geom_line() +
  geom_point(data = cv_errors_naive, aes(y = lin_trans(accuracy, 0.4)))  +
  geom_line(data = cv_errors_naive, aes(y = lin_trans(accuracy, 0.4))) +
  geom_segment(x = log(lambda_opt), xend = log(lambda_opt), y = 0, yend = max_concordance, linetype = 2, alpha = 0.2) +
  annotate(
    "text",
    x = log(lambda_opt),
    y = 0.5,
    label = round(log(lambda_opt), 1),
    hjust = 1.2,
    vjust = 1,
    size = 3
  ) +
  scale_color_brewer(palette = "Set1", breaks = c("Proper", "Naive")) +
  scale_y_continuous("Concordance", breaks = c(0.5, 0.55, 0.6),
                     sec.axis = sec_axis(~lin_trans(., 1/0.4), 
                                         name = "Classification accuracy",
                                         breaks = c(0.5, 0.6, 0.7)
                                         )) +
    labs(
    x = expression(log(lambda)),
    y = "Concordance",
    color = NULL
  ) +
  coord_cartesian(ylim = c(0.5, 0.61)) +
  theme_minimal() +
    theme(
    legend.position = "top",
    axis.title.y.right = element_text(color = "#E41A1C"),  # Color for secondary axis title
    axis.title.y.left = element_text(color = "#377EB8"),   # Color for primary axis title
    axis.text.y.right = element_text(color = "#E41A1C"),   # Color for secondary axis text
    axis.text.y.left = element_text(color = "#377EB8"),     # Color for primary axis text
    axis.line.y.right = element_line(color = "#E41A1C"),    # Color for secondary axis line
    axis.line.y.left = element_line(color = "#377EB8")      # Color for primary axis line
  )

# Save the plot
ggsave("figs/fig_cv_errors.png", width = 7.5, height = 4)
ggsave("figs/fig_cv_errors.eps", device = cairo_ps, width = 7.5, height = 4)

Table 2: Comparison with regularized Cox model

Fit regularized Cox model
# Subset to first events
df_train_tfe <- df_train |> 
  group_by(patid) |> 
  arrange(time) |> 
  slice(1) |> 
  mutate(
    status = status > 0,
    time = if_else(time <= 0, 0.0001, time) # fix 0 times
  ) |> 
  ungroup()
  

# Fit regularized cox regression model
# 10-fold CV
set.seed(1234)
cox_cvfit <- cv.glmnet(
  x = df_train_tfe |> select(-c(patid, time, status)) |> as.matrix(),
  y = df_train_tfe |> select(time, status) |> as.matrix(),
  family = "cox", 
  type.measure = "C" # C-index for CV
)

# saveRDS(cox_cvfit, "cox_cvfit.rds")
# cox_cvfit <- readRDS("cox_cvfit.rds") # reload cached fit if previously saved

# plot(cox_cvfit)

# Trick test_wrnet() to get risk score for Cox model
cox_cvfit$beta <-  - coef(cox_cvfit, s = "lambda.min")
# How many covariates are selected
sum(cox_cvfit$beta != 0)
#> 32
# Test the final model on test set
cox_test_result <- cox_cvfit |> test_wrnet(df_test)
# Test C-index

# Combine with wrnet model
tbl_cind <- cox_test_result$concordance |> 
  mutate(
    model = "Cox",
    .before = 1
  ) |> 
  bind_rows(
    test_result$concordance |> 
      mutate(
        model = "Win ratio",
        .before = 1
      )
  ) |> 
  mutate(
    component = str_to_title(component),
    concordance = round(concordance, 3)
  ) |> 
  pivot_wider(
    names_from = model,
    values_from = concordance
  )

# Output to latex
tbl_cind |> 
  kable(format = "latex", booktabs = TRUE, escape = FALSE)

Fig. 4: Variable importance comparison

Variable importance comparison
# Trick vi_wrnet() to get variable importance for Cox model
cox_cvfit$Zn <- final_fit$Zn
# plot vip for Cox 
cox_cvfit |> 
  vi_wrnet() |> 
  vip(k = 20)

# Collect VI data from the Cox and win ratio models
vi_cox <- cox_cvfit |> 
  vi_wrnet()
vi_wr <- final_fit |> 
  vi_wrnet()
# Combine with wrnet model
vis <- tibble(
    vars = vi_cox$vars[1:20],
    importance = vi_cox$importance[1:20],
    model = "Cox"
  ) |> 
  bind_rows(
    tibble(
      vars = vi_wr$vars[1:20],
      importance = vi_wr$importance[1:20],
      model = "Win ratio"
    )
  )

# Intersection of top 20 variables
vi_both <- vis |> 
  count(vars) |> 
  filter(
    n == 2
  ) |> select(-n)

# Shared top-20 variables, ordered by importance in the win ratio model
vis |> 
  filter(model == "Win ratio") |> 
  arrange(desc(importance)) |>
  right_join(vi_both, by = "vars") |> 
  pull(vars)

# Factor levels for relabeling: union of both models' top-20 variables,
# in order of first appearance (assumed to match the order of the labels below)
uvars <- vis |> distinct(vars)

# Format variable names
vis1 <- vis |> 
  mutate(
    vars = factor(vars, levels = uvars$vars,
              labels =
                c("KCCQ sympt freq",
                  "Valve surgery",
                  "Female vs male",
                  "Education level",
                  "KCCQ sympt stability < 50",
                  "KCCQ social limitation",
                  "MV peak E/A velocity",
                  "VT percent max VO2",
                  "Biplane LVEF",
                  "Loop diuretic dose",
                  "Left atrial dimension",
                  "Both diuretics",
                  "Work in 6MW",
                  "Height",
                  "IVCD ventri conduction",
                  "Mitral regurgit grade 3+",
                  "Systolic blood pressure",
                  "Weber class",
                  "MV peak E velocity",
                  "Mitral regurgit grade 6",
                  "Best LVEF",
                  "NYHA class",
                  "CPX heart rate",
                  "Mitral regurgit pct",
                  "On loop diuretic",
                  "Mitral regurgit grade \u2264 2",
                  "Peak VO2"
                  )
              )
  )

# Cox plot
vip_cox <- vis1 |> 
  filter(model == "Cox") |> 
  ggplot(
    aes(y = reorder(vars, importance), x = importance)
  ) +
  geom_col(width = 0.8) +
  labs(
    y = NULL,
    x = "Importance",
    title = "Cox"
  ) +
  scale_x_continuous(expand = expansion(c(0, 0.05))) +
  theme_minimal() +
  theme(
    #center title
    plot.title = element_text(hjust = 0.5, size = 11),
    panel.grid.major.y = element_blank()
  )


# WR plot
vip_wr <- vis1 |> 
  filter(model == "Win ratio") |> 
  ggplot(
    aes(y = reorder(vars, importance), x = importance)
  ) +
  geom_col(width = 0.8) +
  labs(
    y = NULL,
    x = "Importance",
    title = "Win ratio"
  ) +
  scale_x_continuous(expand = expansion(c(0, 0.05)),
                     breaks = c(0, 0.05, 0.1, 0.15)) +
  theme_minimal() +
  theme(
    #center title
    plot.title = element_text(hjust = 0.5, size = 11),
    panel.grid.major.y = element_blank()
  )


# Combine plots
vip_cox + vip_wr
# Save figure
ggsave("figs/fig_vip.png", width = 8, height = 6)
ggsave("figs/fig_vip.eps", device = cairo_ps, width = 8, height = 6)

Fig. 3: Risk prediction assessment

Risk prediction assessment
# Group outcomes on test set
df_test_risk_score <- df_test |> 
  group_by(
    id
  ) |> 
  summarize(
    out = case_when(
      any(status == 1) ~ "Death",
      any(status == 2) ~ "Hosp alone",
      TRUE ~ "Event-free"
    ),
    out = factor(out, levels = c("Event-free", "Hosp alone", "Death"))
  ) |> 
  left_join(
    test_result$score_test |> select(id, risk_score),
    by = "id"
  ) |> 
  left_join(
    cox_test_result$score_test |> select(id, risk_score) |> rename(risk_score_cox = risk_score),
    by = "id"
  ) |> 
  # compute the percentiles of risk_score
  mutate(
    risk_score_pct = rank(risk_score) / n(),
    risk_score_cox_pct = rank(risk_score_cox) / n() 
  ) 
  

# Compute R-squared between the two risk scores
R2 <- cor(df_test_risk_score$risk_score, df_test_risk_score$risk_score_cox)^2

# Left panel: correlation of risk scores
fig_rs_corr <- df_test_risk_score |> 
  ggplot(
    aes(x = risk_score_pct, y = risk_score_cox_pct, color = out)
  ) +
  geom_point(size = 4) +
  # Annotate R-squared
  annotate(
    "text",
    x = 0.15,
    y = 0.95,
    label = bquote(R^2 == .(round(R2, 2))),
    size = 4
  ) +
  geom_abline(linetype = 2) +
  labs(
    x = "Win-ratio risk score percentile",
    y = "Cox risk score percentile",
    color = NULL
  ) +
  # use a gradient color scale for fill
  scale_color_brewer(palette = "Reds") +
  scale_x_continuous(labels = scales::percent) +
  scale_y_continuous(labels = scales::percent) +
  coord_fixed() +
  theme_minimal() +
  theme(
        legend.position = "top"
  )


# Right panel: boxplot of risk scores
fig_rs_box <- df_test_risk_score |> 
  select(-c(risk_score, risk_score_cox)) |>
  pivot_longer(
    cols = c(risk_score_pct, risk_score_cox_pct),
    names_to = "model",
    values_to = "score"
  ) |>
  mutate(
    model = factor(model, levels = c("risk_score_cox_pct", "risk_score_pct"),
                   labels = c("Cox", "Win ratio"))
  ) |> 
  ggplot(
    aes(x = model, y = score, fill = out)
  ) +
  geom_boxplot() +
  # use a gradient color scale for fill
  scale_fill_brewer(palette = "Reds") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = NULL,
    y = "Predicted risk score percentile",
    fill = NULL
  ) +
  theme_minimal() +
  theme(
    legend.position = "top",
    axis.text.x = element_text(size = 11)
  )

# Combine two panels
fig_preds <- fig_rs_corr + fig_rs_box 
# Print figure
fig_preds
# Save figure
ggsave("figs/fig_preds.png", fig_preds, width = 7.5, height = 4)
ggsave("figs/fig_preds.eps", fig_preds, width = 7.5, height = 4)

Table 3: Inference of win ratio model on entire dataset

Inference of win ratio model on entire dataset
# Selected covariates: top-20 win ratio variables by importance
# Rebuild the full (train + test) high-risk analysis dataset from the
# cleaned cohort data, which carries the derived variables (e.g., female)
df <- df_sub |> left_join(readRDS("df.rds"), by = "patid")
df_Z <- df |> 
  select(any_of(vi_wr$vars[1:20]))

# Fit the model
pw_obj <- pwreg1(df$patid, df$time, df$status, df_Z)

beta <- pw_obj$beta
se <- sqrt(diag(pw_obj$Var))

# Rescale loopdose & walkwork by 100
beta[names(beta) %in% c("loopdose", "walkwork")] <- beta[names(beta) %in% c("loopdose", "walkwork")] * 100
se[names(se) %in% c("loopdose", "walkwork")] <- se[names(se) %in% c("loopdose", "walkwork")] * 100
# Construct win ratio, 95% CI, and p-value
wr <- exp(beta)
wr_up <- exp(beta + 1.96 * se)
wr_low <- exp(beta - 1.96 * se)
pval <- 2 * pnorm(-abs(beta / se))
# Tabulate
tbl_inf <- tibble(
  vars = vi_wr$vars[1:20],
  beta = beta,
  se = se,
  wr = round(wr, 2),
  wr_low = round(wr_low, 2),
  wr_up = round(wr_up, 2),
  pval = round(pval, 3)
) |> 
  # format
  mutate(
    `Win ratio` = wr,
    `95% CI` = str_c("[", wr_low, ", ", wr_up, "]")
  ) |> 
  select(
    vars, `Win ratio`, `95% CI`, pval
  )
# Format variable names
tbl_inf1 <- tbl_inf |> 
  mutate(
    vars = factor(vars, 
              levels = c("female", 
                         "EDUCAT", 
                         "valvsurg", 
                         "loopdiur", 
                         "loopdose",
                         "bothdiur", 
                         "KCCQSFB", 
                         "KCCQSLSB", 
                         "kccqssb1", 
                         "walkwork", 
                         "nyhaclbn", 
                         "sbp", 
                         "vtpctvo2", 
                         "cpxhrr", 
                         "peakvo2", 
                         "eavelrat", 
                         "bestlvef", 
                         "lftatrdm", 
                         "mitregrg", 
                         "mrgra2"),
              labels =
                    c("Female vs male",
                      "Education level (1-6)",
                      "Valve surgery (y vs n)",
                      "On loop diuretic",
                      "Loop diuretic dose (mg)",
                      "On both diuretics  (y vs n)",
                      "KCCQ symptoms freq",
                      "KCCQ social limitation score",
                      "KCCQ symptom stability < 50 (y vs n)",
                      "Work in 6MW (kJ)",
                      "NYHA class (III/IV vs II)",
                      "Systolic blood pressure (mmHg)",
                      "VT percent max VO2 (%)",
                      "CPX heart rate (bpm)",
                      "Peak VO2 (mL/kg/min)",
                      "MV peak E/A velocity ",
                      "Best LVEF (%)",
                      "Left atrial dimension (mm)",
                      "Mitral regurgit percent (%)",
                      "Mitral regurgit grade 2 (y vs n)")
              
                      
                      )
              ) |> 
  arrange(vars)
  
# Print in latex
tbl_inf1 |> 
  kable(format = "latex", booktabs = TRUE, escape = FALSE)
# Print table
tbl_inf1 |> kable()