simulate_crt <- function(
n_clusters = 82,
m_mean = 18,
CV = 0.98,
p0 = 0.50,
p1 = 0.70,
rho = 0.20,
re_dist = "gamma",
alpha = 0.3, # weak-moderate correlation between u_j and baseline outcome rate
tau = 0.45, # SD of baseline noise
beta_baseline = 0.2, # weak-moderate pos correlation: baseline outcome rate -> outcome, independent of u_j
age_mean = 35,
age_sd = 12,
sex_prob = 0.40
){
# (1) Compute OR and intercept
OR <- p0_p1_to_OR(p0, p1)
beta0 <- p_to_beta0(p0)
beta1 <- log(OR)
beta0_adj <- beta0 - 1.0
# (2) Generate clusters
sizes <- generate_cluster_sizes(n_clusters, m_mean, CV)
sigma_b <- icc_to_sigma(rho)
u_j <- generate_u(n_clusters, sigma_b, dist = re_dist)
arm_assign <- sample(rep(0:1, length.out = n_clusters))
# (3) Baseline IGRA testing rate
eps <- rnorm(n_clusters, 0, tau)
logit_b <- qlogis(p0) + alpha * u_j + eps
baseline_rate <- plogis(logit_b)
# (4) site
site <- sample(1:7, n_clusters, replace = TRUE,
prob = c(0.05, 0.15, 0.20, 0.10, 0.15, 0.25, 0.10))
# (5) Individual-level simulation
ind_list <- vector("list", length = n_clusters)
for(j in seq_len(n_clusters)){
nj <- sizes[j]
age_j <- rnorm(nj, mean = age_mean, sd = age_sd)
sex_j <- rbinom(nj, 1, sex_prob)
logit_baseline_j <- qlogis(baseline_rate[j])
linpred_j <- beta0_adj +
beta1 * arm_assign[j] +
beta_baseline * logit_baseline_j +
beta_site * site[j] +
u_j[j]
p_ij <- plogis(linpred_j)
y_ij <- rbinom(nj, 1, p_ij)
ind_list[[j]] <- data.frame(
cluster = j,
arm = arm_assign[j],
age = age_j,
sex = sex_j,
site = site[j],
baseline_rate = baseline_rate[j],
u_j = u_j[j],
p = p_ij,
y = y_ij
)
}
df_ind <- do.call(rbind, ind_list)
# (6) Cluster-level summary
df_cluster <- aggregate(y ~ cluster + arm, data = df_ind, sum)
df_cluster$size <- aggregate(y ~ cluster, data = df_ind, length)$y
cluster_meta <- data.frame(
cluster = seq_len(n_clusters),
arm = arm_assign,
site = site,
baseline_rate = baseline_rate,
u_j = u_j
)
df_sim <- merge(df_cluster, cluster_meta, by = c("cluster","arm"))
df_sim <- df_sim[order(df_sim$cluster),
c("cluster","arm","size","y","baseline_rate",
"site","u_j")]
return(list(
individual = df_ind,
cluster = df_sim
))
}
# Default simulation
# sim_data <- simulate_crt()
# df_ind <- sim_data$individual
# df_cluster <- sim_data$cluster
# Number of simulations
n_sims <- 100
set.seed(20250809)
# Storage
results <- data.frame(
sim = 1:n_sims,
unadj_signif = NA,
adj_signif = NA,
beta_unadj = NA,
beta_adj = NA,
results$OR_unadj <- NA,
results$OR_unadj_lower <- NA,
results$OR_unadj_upper <- NA,
results$OR_adj <- NA,
results$OR_adj_lower <- NA,
results$OR_adj_upper <- NA
)
for(i in seq_len(n_sims)){
# (1) Simulate trial
sim_data <- simulate_crt()
df_ind <- sim_data$individual
# (2) Prepare age spline
age_spline <- as.data.frame(ns(df_ind$age,
knots = quantile(df_ind$age, probs=c(0.1,0.5,0.9))))
colnames(age_spline) <- paste0("age_spline", seq_len(ncol(age_spline)))
df_ind <- cbind(df_ind, age_spline)
# (3) Ensure factors
df_ind$arm <- factor(df_ind$arm, levels = c(0,1))
df_ind$sex <- factor(df_ind$sex, levels = c(0,1))
df_ind$site <- factor(df_ind$site, levels = c(1,2,3,4,5,6,7))
# (4) Unadjusted model
form_unadj <- as.formula(paste("y ~ arm + (1 | cluster)"))
model_unadj <- lme4::glmer(
formula = form_unadj,
family = binomial(link = "logit"),
data = df_ind,
control = lme4::glmerControl(optimizer = "bobyqa")
)
coef_name_unadj <- grep("^arm", names(fixef(model_unadj)), value=TRUE)
beta1_unadj <- fixef(model_unadj)[coef_name_unadj]
se1_unadj <- summary(model_unadj)$coefficients[coef_name_unadj,"Std. Error"]
pval_unadj <- summary(model_unadj)$coefficients[coef_name_unadj,"Pr(>|z|)"]
# (5) Fully adjusted model
spline_cols <- colnames(df_ind)[grepl("^age_spline", colnames(df_ind))]
form_adj <- as.formula(
paste("y ~ arm + baseline_rate + site + sex +",
paste(spline_cols, collapse = " + "),
"+ (1 | cluster)")
)
model_adj <- lme4::glmer(
formula = form_adj,
family = binomial(link = "logit"),
data = df_ind,
control = lme4::glmerControl(optimizer = "bobyqa")
)
coef_name_adj <- grep("^arm", names(fixef(model_adj)), value=TRUE)
beta1_adj <- fixef(model_adj)[coef_name_adj]
se1_adj <- summary(model_adj)$coefficients[coef_name_adj,"Std. Error"]
pval_adj <- summary(model_adj)$coefficients[coef_name_adj,"Pr(>|z|)"]
# (6) Save results including OR and CI
CI_unadj <- exp(confint(model_unadj, method="Wald")[coef_name_unadj, ])
results$OR_unadj[i] <- exp(beta1_unadj)
results$OR_unadj_lower[i] <- CI_unadj[1]
results$OR_unadj_upper[i] <- CI_unadj[2]
CI_adj <- exp(confint(model_adj, method="Wald")[coef_name_adj, ])
results$OR_adj[i] <- exp(beta1_adj)
results$OR_adj_lower[i] <- CI_adj[1]
results$OR_adj_upper[i] <- CI_adj[2]
results$unadj_signif[i] <- (pval_unadj < 0.05)
results$adj_signif[i] <- (pval_adj < 0.05)
}
# (7) Compute estimated power
power_unadj <- mean(results$unadj_signif)
power_adj <- mean(results$adj_signif)
cat("Estimated power (unadjusted) =", round(power_unadj,4), "\n")