1. Generate the True Population

In this section, we create a simulated “true” population of 100,000 patients with known characteristics to serve as our ground truth. Each patient has an age, a biomarker level, and a disease score. We define a rare subgroup as the patients in the top 1% of biomarker levels; this is the population we want to study, but one for which real-world data typically yields very few samples.

library(ggplot2)
library(dplyr)
library(MASS)   # mvrnorm() for multivariate normal sampling; masks dplyr::select()
library(tidyr)
library(gridExtra)
set.seed(123)

n_population <- 100000
age_mean <- 55
age_sd <- 10
biomarker1_mean <- 50
biomarker1_sd <- 10

true_population <- data.frame(
  age = rnorm(n_population, age_mean, age_sd),
  biomarker1 = rnorm(n_population, biomarker1_mean, biomarker1_sd)
)

# Disease score depends linearly on age and biomarker1, plus Gaussian noise
true_population$disease_score <- 
  1 + 0.15 * true_population$age + 0.3 * true_population$biomarker1 + 
  rnorm(n_population, 0, 3)

# Rare subgroup: top 1% of biomarker1 values
biomarker1_threshold <- quantile(true_population$biomarker1, 0.99)
true_population$subgroup <- (true_population$biomarker1 >= biomarker1_threshold)

# Ground-truth quantities used for comparison throughout
true_mean <- mean(true_population$disease_score)
true_subgroup_mean <- mean(true_population$disease_score[true_population$subgroup])
subgroup_percent <- mean(true_population$subgroup) * 100

cat("True mean disease score:", round(true_mean, 2), "\n")
## True mean disease score: 24.27
cat("True subgroup mean disease score:", round(true_subgroup_mean, 2), "\n")
## True subgroup mean disease score: 32.36
cat("Subgroup percentage:", round(subgroup_percent, 2), "%\n")
## Subgroup percentage: 1 %
cat("Biomarker1 threshold for subgroup:", round(biomarker1_threshold, 2), "\n")
## Biomarker1 threshold for subgroup: 73.22
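
As a quick sanity check (an addition, not part of the original notebook), these empirical values can be compared against closed-form quantities implied by the simulation model:

# Added sanity checks; not part of the original analysis.
# The empirical threshold should sit near the theoretical 99th percentile:
qnorm(0.99, mean = biomarker1_mean, sd = biomarker1_sd)  # ~73.26 vs. 73.22 above

# The true subgroup mean should be near its expectation under the linear model,
# using the truncated-normal mean E[X | X > q] = mu + sd * dnorm(z) / (1 - pnorm(z)):
z <- qnorm(0.99)
1 + 0.15 * age_mean +
  0.3 * (biomarker1_mean + biomarker1_sd * dnorm(z) / 0.01)  # ~32.25 vs. 32.36 above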

2. Synthetic Data Function and Experiments

Here we define our synthetic data generation approach and run the comparison experiments. The get_sd_estimate function fits a multivariate normal distribution (mean vector and covariance matrix) to a real-world data sample, then draws a much larger synthetic population from it, 100 synthetic patients for every real one, to better estimate rare subgroup outcomes.

We then run 100 experiments. Each time we draw a small sample of 1,000 patients from the true population (10 subgroup and 990 non-subgroup patients, matching the 1% prevalence) to simulate limited real-world data, and compare two approaches: direct estimation from the small sample versus synthetic data augmentation.

get_sd_estimate <- function(rwd_sample, n_synthetic = 100000) {
  # Fit a multivariate normal to the RWD sample (means + covariance)
  vars <- c("age", "biomarker1", "disease_score")
  mu <- colMeans(rwd_sample[, vars])
  Sigma <- cov(rwd_sample[, vars])
  
  # Draw a synthetic population ~100x larger than the 1,000-patient sample
  synthetic_samples <- mvrnorm(n_synthetic, mu = mu, Sigma = Sigma)
  
  synthetic_patients <- data.frame(
    age = synthetic_samples[, 1],
    biomarker1 = synthetic_samples[, 2],
    disease_score = synthetic_samples[, 3]
  )
  
  # Re-derive the top-1% threshold from the synthetic data itself
  biomarker1_sd_threshold <- quantile(synthetic_patients$biomarker1, 0.99)
  synthetic_patients$subgroup <- (synthetic_patients$biomarker1 >= biomarker1_sd_threshold)
  
  synthetic_subgroup <- synthetic_patients[synthetic_patients$subgroup, ]
  
  list(
    mean_score = mean(synthetic_subgroup$disease_score),
    synthetic_data = synthetic_patients,
    synthetic_subgroup = synthetic_subgroup,
    threshold = biomarker1_sd_threshold,
    subgroup_size = nrow(synthetic_subgroup)
  )
}
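
As a standalone illustration (not part of the experiments below; the seed and the demo_sample/demo_result names are arbitrary), the function can be called on a single random sample to inspect its output structure:

# Illustrative one-off call; exact numbers depend on the seed
set.seed(42)
demo_sample <- true_population[sample(nrow(true_population), 1000), ]
demo_result <- get_sd_estimate(demo_sample)
str(demo_result[c("mean_score", "threshold", "subgroup_size")])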

# Index subgroup and non-subgroup patients for stratified sampling
subgroup_indices <- which(true_population$subgroup)
nonsubgroup_indices <- which(!true_population$subgroup)

n_experiments <- 100
seeds <- 777:(777 + n_experiments - 1)

results <- data.frame(
  seed = seeds,
  direct_estimate = numeric(n_experiments),
  sd_estimate = numeric(n_experiments),
  direct_error = numeric(n_experiments),
  sd_error = numeric(n_experiments),
  better_method = character(n_experiments),
  rwd_subgroup_size = numeric(n_experiments),
  sd_subgroup_size = numeric(n_experiments)
)

for (i in 1:n_experiments) {
  set.seed(seeds[i])
  
  # Stratified 1,000-patient RWD sample: 10 subgroup, 990 non-subgroup,
  # matching the population's 1% subgroup prevalence
  sampled_subgroup <- sample(subgroup_indices, 10)
  sampled_nonsubgroup <- sample(nonsubgroup_indices, 990)
  rwd_sample <- true_population[c(sampled_subgroup, sampled_nonsubgroup), ]
  
  # Direct estimate: mean score of the 10 observed subgroup patients
  direct_subgroup <- rwd_sample[rwd_sample$subgroup == TRUE, ]
  direct_estimate <- mean(direct_subgroup$disease_score)
  
  # Synthetic data estimate from the augmented population
  sd_result <- get_sd_estimate(rwd_sample)
  sd_estimate <- sd_result$mean_score
  
  direct_error <- abs(direct_estimate - true_subgroup_mean)
  sd_error <- abs(sd_estimate - true_subgroup_mean)
  
  better_method <- ifelse(sd_error < direct_error, "Synthetic Data", "RWD")
  
  results[i, ] <- list(
    seeds[i], direct_estimate, sd_estimate,
    direct_error, sd_error, better_method,
    nrow(direct_subgroup), sd_result$subgroup_size
  )
}
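
Before summarizing, it can help to glance at the paired estimates from the first few experiments (an added convenience, not in the original notebook):

# Peek at the first experiments, added for illustration
head(results[, c("seed", "direct_estimate", "sd_estimate", "better_method")])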

3. Results Summary

This section calculates key performance metrics comparing the two approaches. We measure each method's absolute error relative to the true subgroup mean, and count how often each method came closer. These summary statistics tell us which approach is more accurate on average.

mean_direct_error <- mean(results$direct_error, na.rm = TRUE)
mean_sd_error <- mean(results$sd_error, na.rm = TRUE)
median_direct_error <- median(results$direct_error, na.rm = TRUE)
median_sd_error <- median(results$sd_error, na.rm = TRUE)
sd_wins <- sum(results$better_method == "Synthetic Data")
rwd_wins <- sum(results$better_method == "RWD")

cat("Average RWD error:", round(mean_direct_error, 2), "\n")
## Average RWD error: 0.88
cat("Average Synthetic Data error:", round(mean_sd_error, 2), "\n")
## Average Synthetic Data error: 0.31
cat("Median RWD error:", round(median_direct_error, 2), "\n")
## Median RWD error: 0.78
cat("Median Synthetic Data error:", round(median_sd_error, 2), "\n")
## Median Synthetic Data error: 0.27
cat("Synthetic Data won in", sd_wins, "experiments\n")
## Synthetic Data won in 79 experiments
cat("RWD won in", rwd_wins, "experiments\n")
## RWD won in 21 experiments
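
Because both errors in each experiment come from the same RWD sample, a paired test is one way to check whether the gap between methods is systematic; this test is an addition, not part of the original analysis:

# Paired Wilcoxon signed-rank test on per-experiment absolute errors (added);
# a small p-value suggests the error difference is not just sampling noise
wilcox.test(results$direct_error, results$sd_error, paired = TRUE)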

4. Visualizations

We visualize the results in three ways: the distribution of errors for each method, the paired errors per experiment, and a tally of which method performed better.

results_long <- pivot_longer(results, 
                             cols = c("direct_error", "sd_error"),
                             names_to = "method", values_to = "error") %>%
  mutate(method = factor(method, 
                         levels = c("direct_error", "sd_error"),
                         labels = c("RWD Samples", "Synthetic Samples")))

ggplot(results_long, aes(x = method, y = error, fill = method)) +
  geom_boxplot() +
  geom_jitter(width = 0.2, alpha = 0.6) +
  labs(title = "Error Distribution Across 100 Experiments",
       x = "Method", y = "Absolute Error") +
  theme_bw() +
  scale_fill_manual(values = c("RWD Samples" = "darkred", "Synthetic Samples" = "darkblue"))

# The paired bar chart reuses the same long-format errors
ggplot(results_long, aes(x = factor(seed), y = error, fill = method)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Estimation Error by Experiment", x = "Seed", y = "Absolute Error") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_x_discrete(breaks = function(x) x[seq(1, length(x), 10)]) +
  scale_fill_manual(values = c("RWD Samples" = "darkred", "Synthetic Samples" = "darkblue"))

method_counts <- data.frame(
  method = c("Synthetic Data Better", "RWD Better"),
  count = c(sd_wins, rwd_wins)
)

ggplot(method_counts, aes(x = "", y = count, fill = method)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar("y") +
  labs(title = "Which Method Performed Better?") +
  theme_minimal() +
  theme(axis.title = element_blank(),
        axis.text = element_blank(),
        panel.grid = element_blank())

✅ Conclusion

This notebook demonstrates how synthetic data augmentation, built from a representative RWD sample, can improve the estimation of outcomes in rare subgroups. Synthetic data won in 79 of 100 experiments here, but the choice between direct RWD estimates and model-based estimates should always weigh context, modeling assumptions, and data quality.