knitr::opts_chunk$set(warning = FALSE)
knitr::opts_chunk$set(include = TRUE)
options(scipen = 999) ## turning off scientific notation
library(lme4) ## for clustered regressions
## Loading required package: Matrix
#library(ICC)
library(tidyverse) ## for many things
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tidyr::pack() masks Matrix::pack()
## ✖ tidyr::unpack() masks Matrix::unpack()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor) ## for %>% function mostly
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
#install.packages("simstudy")
library(simstudy) ## for generating simulations
#install.packages("ordinal")
library(ordinal) ## for ordinal outcome data
##
## Attaching package: 'ordinal'
##
## The following object is masked from 'package:dplyr':
##
## slice
#install.packages("misty")
#install.packages("furrr")
library(future) # backend
library(furrr) # parallel version of purrr, for running multiple cores at once
## Helper function for OR to delta
increase_odds <- function(p, OR) {
#enter a baseline prob (p) and an Odds Ratio (OR)
#model returns the delta, the percentage point increase
current_odds <- p/(1-p)
new_odds <- current_odds * OR
new_p <- new_odds/ (1 + new_odds)
delta <- new_p - p
delta <- ifelse(p < 0.999999999, delta, 0)
return(delta)
}
## helper code to shrink probabilities
affine_transform <- function(x, minmax){
y <- minmax/(1-2*minmax)
return((x + y)/(1+2*y))
}
## Main function
make_breastfed_data <- function(p0, delta, ordinal_or = 1.5, num_facilities, mean_workers_per_facility, mean_mothers_per_worker, var_across_facilities = 1, num_workers_variance = 1, var_within_facilities = 1, num_mothers_variance = 1, minmax_fac = 0.02, minmax_worker = 0.01, baseprobs = c(73/126, 13/126, 40/126) ) {
odds_ratio <- ((p0 + delta)/(1 - p0 - delta)) / ((p0)/(1-p0)) ## calculates OR from p0 and delta. If you tell it you have a baseline of 90% and anticipate 5pp increase, it converts this into an OR of 1.9. As baseline is just an average, sometimes you'll have higher or lower values that are generated, and that you then need to apply an OR to.
## The way 'simstudy' works is first by defining variables relationships (defData) then generating data (genData). You start with the biggest clusters, and then continue adding new details for each cluster.
# generating Facility Data
gen_facility <- defData(id = "facilityID", varname = "int_facility_prob",
dist = "beta",
formula = p0, # r program needs the mean of the beta distribution
variance = var_across_facilities) ## tried diff values, variance of 1 works best
gen_facility <- defData(gen_facility, varname = "facility_prob",
dist = "normal",
formula = "affine_transform(int_facility_prob, ..minmax_fac)",
variance = 0) # 0 b/c there's no randomness
gen_facility <- defData(gen_facility, varname = "intermediate_num_workers",
dist = "gamma",
formula = 7,
variance = num_workers_variance) ## 1 is the dispersion parameter
gen_facility <- defData(gen_facility, varname = "num_workers", dist = "normal", formula = "round(intermediate_num_workers) + 1", variance = 0)
dfacilities <- genData(num_facilities, gen_facility) %>% select(-c(int_facility_prob,intermediate_num_workers))
## generating Worker Data
gen_workers <- defDataAdd(varname = "int_worker_prob",
dist = "beta",
formula = "facility_prob", # mean is just facility_prob (p)
variance = var_within_facilities)
gen_workers <- defDataAdd(gen_workers,
varname = "worker_prob", dist = "normal",
formula = "affine_transform(int_worker_prob, ..minmax_worker)",
variance = 0)
gen_workers <- defDataAdd(gen_workers,
varname = "intermediate_num_mothers", dist = "gamma",
formula = mean_mothers_per_worker,
variance = num_mothers_variance)
gen_workers <- defDataAdd(gen_workers,
varname = "num_mothers", dist = "normal",
formula = "round(intermediate_num_mothers)",
variance = 0)
dworkers <- genCluster(dfacilities, "facilityID",
numIndsVar = "num_workers",
level1ID = "workerID")
dworkers <- addColumns(gen_workers, dworkers)
dworkers <- dworkers %>% select(-c(int_worker_prob,intermediate_num_mothers))
#### generating Parent Data
gen_parents <- defDataAdd(varname = "treatment",
dist = "binary", formula = 0.5)
gen_parents <- defDataAdd(gen_parents, varname = "OR", dist = "normal",
formula = odds_ratio,
variance = 0)
gen_parents <- defDataAdd(gen_parents, varname = "worker_delta",
dist = "normal",
formula = "increase_odds(worker_prob, OR)",
variance = 0)
gen_parents <- defDataAdd(gen_parents, varname = "breastfed",
dist = "binary",
formula = "worker_prob + treatment*worker_delta")
gen_parents <- defDataAdd(gen_parents,
varname = "ordinal_effect",
formula = "..ordinal_or*treatment + worker_prob",
dist = "nonrandom")
dparents <- genCluster(dworkers, "workerID",
numIndsVar = "num_mothers",
level1ID = "parentID")
dparents <- addColumns(gen_parents, dparents)
dparents <- genOrdCat(dparents, id = "parentID", adjVar = "ordinal_effect", baseprobs, catVar = "ordinal_outcome")
## removing workers that don't have parents in pre and post
dparents <- dparents %>%
group_by(workerID) %>%
filter(n_distinct(treatment) == 2) %>%
ungroup()
return(dparents)
}
demo_generated <- make_breastfed_data(p0 = 0.8,
delta = 0.05,
#ordinal_or = 1.5,
num_facilities = 20,
mean_workers_per_facility = 7,
mean_mothers_per_worker = 8,
# var_across_facilities = 1, num_workers_variance = 1, var_within_facilities = 1, num_mothers_variance = 1, minmax_fac = 0.02, minmax_worker = 0.01,
# baseprobs = c(73/126, 13/126, 40/126)
)
demo_generated %>%
group_by(treatment) %>%
summarise(
n_facilities = n_distinct(facilityID),
n_workers = n_distinct(workerID),
# workers per facility
avg_workers_per_facility = n_workers / n_facilities,
# parents per worker (each row is a parent)
n_parents = n(),
avg_parents_per_worker = n_parents / n_workers,
.groups = "drop"
) %>%
mutate(treatment = if_else(treatment == 1, "treat", "control")) %>%
pivot_wider(
names_from = treatment,
values_from = c(
n_facilities, n_workers, n_parents,
avg_workers_per_facility, avg_parents_per_worker
),
names_glue = "obs_{.value}_{treatment}"
)
demo_generated %>% head(50)
demo_generated %>% summary()
## workerID facilityID facility_prob num_workers
## Min. : 1.00 Min. : 1.0 Min. :0.1872 Min. : 1.00
## 1st Qu.: 33.00 1st Qu.: 6.0 1st Qu.:0.5413 1st Qu.: 5.00
## Median : 77.00 Median :10.0 Median :0.9333 Median :12.00
## Mean : 75.79 Mean :10.7 Mean :0.7747 Mean :12.65
## 3rd Qu.:112.50 3rd Qu.:15.0 3rd Qu.:0.9800 3rd Qu.:18.00
## Max. :153.00 Max. :20.0 Max. :0.9800 Max. :25.00
## worker_prob num_mothers parentID treatment
## Min. :0.01004 Min. : 2.0 Min. : 1.0 Min. :0.000
## 1st Qu.:0.41966 1st Qu.: 9.0 1st Qu.: 312.5 1st Qu.:0.000
## Median :0.98887 Median :15.0 Median : 635.0 Median :1.000
## Mean :0.72743 Mean :16.9 Mean : 636.6 Mean :0.502
## 3rd Qu.:0.99000 3rd Qu.:22.0 3rd Qu.: 954.5 3rd Qu.:1.000
## Max. :0.99000 Max. :37.0 Max. :1281.0 Max. :1.000
## OR worker_delta breastfed ordinal_effect
## Min. :1.417 Min. :0.002920 Min. :0.0000 Min. :0.01004
## 1st Qu.:1.417 1st Qu.:0.002920 1st Qu.:0.0000 1st Qu.:0.98629
## Median :1.417 Median :0.003247 Median :1.0000 Median :1.51004
## Mean :1.417 Mean :0.022615 Mean :0.7304 Mean :1.48046
## 3rd Qu.:1.417 3rd Qu.:0.043631 3rd Qu.:1.0000 3rd Qu.:2.48942
## Max. :1.417 Max. :0.086422 Max. :1.0000 Max. :2.49000
## ordinal_outcome
## 1:332
## 2:103
## 3:804
##
##
##
dim(demo_generated)
## [1] 1239 13
model0 <- suppressMessages(glmer(data = demo_generated,
formula = breastfed ~ treatment + (1 | facilityID ) + (1 | workerID),
family = binomial)) ## the actual model
model1 <- suppressMessages(glmer(data = demo_generated,
formula = breastfed ~ 1 + (1 | facilityID ) + (1 | workerID),
family = binomial)) ## the null model
exp_coef_demo <- exp(summary(model0)$coefficients[2,1]) # exponentiation the logged odds of treatment, this value is an Odds Ratio
observed_delta_demo <- increase_odds(0.8, exp_coef_demo) ## the first line of this was in the code above
coef_pval_demo <- summary(model0)$coefficients[2,4] # p-value of treatment coefficient
model_pval_demo <- anova(model0, model1, test = "Chisq")[2,8] # Likelihood-ratio test between models
exp_coef_demo ## the Odds Ratio on the binary outcome
## [1] 1.903377
observed_delta_demo ## the percentage point change for binary outcome, provided for ease of interpretation
## [1] 0.08390328
coef_pval_demo ## p-value on the coefficient itself
## [1] 0.006602305
model_pval_demo ## p-value of the model when compared to a null model, provided for alternative p-value
## [1] 0.006082161
model3 <- clmm(ordinal_outcome ~ treatment + (1 | facilityID) + (1 | workerID), data = demo_generated)
summary(model3)
## Cumulative Link Mixed Model fitted with the Laplace approximation
##
## formula: ordinal_outcome ~ treatment + (1 | facilityID) + (1 | workerID)
## data: demo_generated
##
## link threshold nobs logLik AIC niter max.grad cond.H
## logit flexible 1239 -969.71 1949.41 349(761) 6.68e-05 5.5e+01
##
## Random effects:
## Groups Name Variance Std.Dev.
## workerID (Intercept) 0.02581 0.1607
## facilityID (Intercept) 0.08058 0.2839
## Number of groups: workerID 113, facilityID 20
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## treatment 1.4386 0.1305 11.02 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Threshold coefficients:
## Estimate Std. Error z value
## 1|2 -0.39837 0.11321 -3.519
## 2|3 0.04283 0.11227 0.382
ordinal_coef_pval <- summary(model3)$coefficients[3,4]
ordinal_coef_pval
## [1] 0.0000000000000000000000000003060279
full_simulation <- function(p0, delta, ordinal_or = 1.5, num_facilities, mean_workers_per_facility, mean_mothers_per_worker, var_across_facilities = 1, num_workers_variance = 1, var_within_facilities = 1, num_mothers_variance = 1, minmax_fac = 0.02, minmax_worker = 0.01, baseprobs = c(73/126, 13/126, 40/126) ) {
dparents <- make_breastfed_data(p0 = p0, delta = delta, ordinal_or = ordinal_or, num_facilities = num_facilities, mean_workers_per_facility = mean_workers_per_facility, mean_mothers_per_worker = mean_mothers_per_worker, var_across_facilities = var_across_facilities, num_workers_variance = num_workers_variance, var_within_facilities = var_within_facilities, num_mothers_variance = num_mothers_variance, minmax_fac = minmax_fac, minmax_worker = minmax_worker, baseprobs = baseprobs )
dparents_summary <- dparents %>%
group_by(treatment) %>%
summarise(
n_facilities = n_distinct(facilityID),
n_workers = n_distinct(workerID),
# workers per facility
avg_workers_per_facility = n_workers / n_facilities,
# parents per worker (each row is a parent)
n_parents = n(),
avg_parents_per_worker = n_parents / n_workers,
.groups = "drop"
) %>%
mutate(treatment = if_else(treatment == 1, "treat", "control")) %>%
pivot_wider(
names_from = treatment,
values_from = c(
n_facilities, n_workers, n_parents,
avg_workers_per_facility, avg_parents_per_worker
),
names_glue = "obs_{.value}_{treatment}"
)
m0 <- suppressMessages(glmer(data = dparents,
formula = breastfed ~ treatment + (1 | facilityID ) + (1 | workerID),
family = binomial)) ## the actual model
m1 <- suppressMessages(glmer(data = dparents,
formula = breastfed ~ 1 + (1 | facilityID ) + (1 | workerID),
family = binomial)) ## the null model
exp_coef <- exp(summary(m0)$coefficients[2,1]) # exponentiation the logged odds of treatment, this value is an Odds Ratio
observed_delta <- increase_odds(p0, exp_coef)
coef_pval <- summary(m0)$coefficients[2,4] # p-value of treatment coefficient
model_pval <- anova(m0, m1, test = "Chisq")[2,8] # Likelihood-ratio test between models
### ordinal tests
m3 <- clmm(ordinal_outcome ~ treatment + (1 | facilityID) + (1 | workerID), data = dparents)
coef_ordinal <- summary(m3)$coefficients[3,1]
ordinal_coef_pval <- summary(m3)$coefficients[3,4]
results <- cbind(p0, delta, num_facilities, exp_coef, observed_delta, coef_ordinal, coef_pval, model_pval, ordinal_coef_pval, dparents_summary)
return(results)
}
full_simulation(p0 = 0.8,
delta = 0.05,
#ordinal_or = 1.5,
num_facilities = 20,
mean_workers_per_facility = 7,
mean_mothers_per_worker = 8,
# var_across_facilities = 1, num_workers_variance = 1, var_within_facilities = 1, num_mothers_variance = 1, minmax_fac = 0.02, minmax_worker = 0.01,
#baseprobs = c(73/126, 13/126, 40/126)
)
p0_vals <- c(0.4, 0.5, 0.6, 0.7, 0.8) ## change this
delta_vals <- c(0.04, 0.06, 0.08, 0.10, 0.12, 0.14) ## change this
num_facilities_vals <- c(10, 20, 30, 40, 50 ) ## change this
params <- tidyr::expand_grid(p0 = p0_vals, delta = delta_vals, num_facilities = num_facilities_vals)
params_rep <- params %>%
mutate(rep = list(1:10)) %>% ## the latter number is the number of simulations you wish to run.
tidyr::unnest(rep)
# Set up parallel back-end (use all cores minus 1)
plan(multisession, workers = availableCores() - 3)
res_all8 <- future_pmap_dfr(
params_rep,
~ full_simulation(
p0 = ..1,
delta = ..2,
num_facilities = ..3,
mean_workers_per_facility = 7, # you can change
mean_mothers_per_worker = 8 # you can change
## you can change the values below
#ordinal_or = 1.5,
#var_across_facilities = 1, num_workers_variance = 1, var_within_facilities = 1, num_mothers_variance = 1, minmax_fac = 0.02, minmax_worker = 0.01
),
.options = furrr_options(
packages = c("simstudy", "dplyr", "tidyr", "janitor", "lme4", "ordinal"), # packages your functions use
globals = c("affine_transform", "full_simulation", "make_breastfed_data", "increase_odds") ), # exports custom functions
.progress = TRUE
)
# Clean up (optional)
plan(sequential) # back to single-core
all_simulations <- rbind(res_all, res_all2, res_all3, res_all4, res_all5, res_all6, res_all6, res_all7, res_all8)
all_simulations <- cbind(params_rep, all_simulations)
write.csv(all_simulations, "all_simulation.csv")
all_sim_summary <- cbind(params_rep, all_simulations) %>%
group_by(p0, delta, num_facilities) %>% summarise(
num_sim = n(),
ave_or = round(mean(exp_coef),3),
ave_delta = round(mean(observed_delta),3),
coef_power = round(mean(coef_pval < 0.05),3),
model_power = mean(model_pval < 0.05),
ordinal_power = mean(ordinal_coef_pval <0.05))
all_sim_summary %>% ggplot(aes(x= delta, y = coef_power, color = factor(num_facilities))) + facet_wrap(~p0) + geom_line() + theme_bw()
all_simulations %>% group_by(p0, delta, num_facilities) %>% summarise(
ave_or = round(mean(exp_coef),3),
ave_delta = round(mean(observed_delta),3),
coef_power = round(mean(coef_pval < 0.05),3),
model_power = mean(model_pval < 0.05),
ordinal_power = mean(ordinal_coef_pval <0.05))
library(misty) ## for multilevel ICC
## |-------------------------------------|
## | misty 0.7.6 (2025-10-26) |
## | Miscellaneous Functions T. Yanagida |
## |-------------------------------------|
##
## Attaching package: 'misty'
## The following object is masked from 'package:janitor':
##
## crosstab
head(demo_generated)
#####
##### ICC calculations
misty::multilevel.icc(data = demo_generated,
breastfed,
cluster = c("facilityID", "workerID"))
## L3 L2
## 0.4257190 0.2563135
misty::multilevel.icc(data = demo_generated,
breastfed,
cluster = c( "workerID"))
## [1] 0.6860078
misty::multilevel.icc(data = demo_generated,
breastfed,
cluster = c( "facilityID"))
## [1] 0.52387
demo_generated %>% group_by(workerID) %>%
summarise(facilityID = mean(facilityID),
ave_worker_prob = mean(breastfed)) %>%
misty::multilevel.icc(ave_worker_prob,
cluster = "facilityID")
## [1] 0.5607145
#####
dpilot <- read.csv("C:/Users/kinze/Downloads/Pilot data for Michael - Health Learn_Phase 2_Clean Dataset_DEIDENTIFIED.csv")
misty::multilevel.icc(dpilot, EIBF, cluster = "Facility.code")
## [1] 0.3050684
misty::multilevel.icc(dpilot, Immediate.SSC, cluster = "Facility.code")
## [1] 0.3827044
summary(dpilot)
## Facility.code resp_id Immediate.SSC EIBF
## Length:126 Min. :13151757 Min. :0.0000 Min. :0.0000
## Class :character 1st Qu.:13177348 1st Qu.:0.0000 1st Qu.:1.0000
## Mode :character Median :16200686 Median :1.0000 Median :1.0000
## Mean :16202404 Mean :0.5556 Mean :0.8968
## 3rd Qu.:19232019 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :19248898 Max. :1.0000 Max. :1.0000
====================================================== POWER SIMULATION: SHARED LATENT VARIABLE + LRT MIXED MODEL ====================================================== Developed pro bono for HealthLearn by Martina Jakob This script simulates a two-period (pre–post) clustered experiment with binary outcomes and intraclass correlation (ICC).
Simulation setup: - J: number of clusters (workers) - p0: baseline prevalence (pre-treatment mean) - delta: treatment effect in percentage points (post - pre) - rho: intraclass correlation coefficient (binary scale) - alpha: significance level for hypothesis test (default 0.05) - R: number of simulation replications
Data-generating process: 1. Each cluster j has a latent random quantile U_j (shared across pre/post) determining correlated pre/post cluster probabilities. 2. Cluster-level probabilities (p_pre_j, p_post_j) follow Beta distributions with means p0 and p0 + delta, and common (a+b) = 1/rho - 1. → Ensures equal ICC across waves. 3. Each cluster has a variable number of observations (0–15, mean ≈8) drawn from a gamma-like discrete distribution. 4. Binary outcomes Y_ijt ~ Bernoulli(p_jt).
Estimation and inference: - Fits two nested logistic mixed-effects models with random intercepts: m0: y ~ 1 + (1 | cluster) m1: y ~ wave + (1 | cluster) - Compares m0 vs m1 using a likelihood-ratio test (Chi-square). - Power = share of replications where p-value < alpha.
Output: - Diagnostic stats for first simulation (means, ICCs, correlations, cluster size) - Estimated statistical power across R runs
Limitations: - Shared cluster error (U_j) across waves may be over-optimistic, inducing high pre-post correlation (especially under high ICC). Power may be overestimated. - No attrition or missing data modeled. ======================================================
# -----------------------------
# Helper: Beta parameters for mean & ICC
# -----------------------------
ab_from_pr <- function(p, rho) {
k <- 1 / rho - 1
c(a = p * k, b = (1 - p) * k)
}
##M's
ab_from_pr(0.5, 0.25)
# this returns a and b parameters (successes and failures)
curve(dbeta(x, 1000, 1000))
# -----------------------------
# Helper: discrete cluster-size distribution (0–15, mean ≈8)
# -----------------------------
rdisc_counts <- function(n_clusters) {
vals <- 0:15
probs <- dgamma(vals + 1, shape = 3, rate = 0.25) ## returns probability (density) at each of these specific integeres
probs <- probs / sum(probs) ## summing to 1 b/c
sample(vals, n_clusters, replace = TRUE, prob = probs)
}
rdisc_counts(1000) %>% as.data.frame() %>%
ggplot(aes(.)) + geom_histogram()
rdisc_counts(1000) %>% tabyl()
as.data.frame() %>%
ggplot(aes(.)) + geom_histogram()
stat_ecdf(geom = "step") + geom_density()
p0 <- 0.45
delta <- 0.11
rho <- 0.1
J <- 100 # 100 workers
cbind(p_pre_j, p_post_j) %>% ggplot(aes()) + geom_density(aes(x=p_pre_j)) + geom_density(aes(x=p_post_j))
hist(n_pre)
# -----------------------------
# One simulation replicate
# -----------------------------
simulate_once_latent <- function(J, p0, delta, rho, alpha = 0.05, show_checks = TRUE) {
# Beta parameters for pre/post
ab0 <- ab_from_pr(p0, rho)
ab1 <- ab_from_pr(p0 + delta, rho)
# shared latent quantile per cluster (common cluster-level error)
U <- runif(J) #generating quantiles
p_pre_j <- qbeta(U, ab0["a"], ab0["b"]) # returning a probability associated w/ each quantile from the beta distribution specified
p_post_j <- qbeta(U, ab1["a"], ab1["b"])
# variable cluster sizes
n_pre <- pmax(rdisc_counts(J), 1L) ## seems unnecessary, could have simply changed rdisc_counts to not allow for values of 0.
n_post <- pmax(rdisc_counts(J), 1L)
# simulate individuals
make_period <- function(flag, nvec) {
do.call(rbind, lapply(seq_len(J), function(j) {
n <- nvec[j]
data.frame(cluster = j, wave = flag)[rep(1, n), ]
}))
}
df <- rbind(make_period("pre", n_pre), make_period("post", n_post))
cluster <- df$cluster
df$y <- ifelse(df$wave == "pre",## generating outcome variables
rbinom(nrow(df), 1, p_pre_j[cluster]),
rbinom(nrow(df), 1, p_post_j[cluster]))
df$wave <- relevel(factor(df$wave), ref = "pre")
# optional diagnostics
if (show_checks) {
mean_pre <- mean(df$y[df$wave=="pre"])
mean_post <- mean(df$y[df$wave=="post"])
# Use ICC package for accurate ICC calculation
icc_pre <- ICCbare(cluster, y, data = subset(df, wave == "pre"))
icc_post <- ICCbare(cluster, y, data = subset(df, wave == "post"))
cm <- aggregate(y ~ cluster + wave, df, mean)
w <- reshape(cm, idvar = "cluster", timevar = "wave", direction = "wide")
corr_pre_post <- cor(w$y.pre, w$y.post)
mean_cluster_size <- mean(c(n_pre, n_post))
cat(sprintf("\nMeans: pre=%.3f post=%.3f (target %.2f → %.2f)\n",
mean_pre, mean_post, p0, p0 + delta))
cat(sprintf("ICCs: pre=%.3f post=%.3f (target %.2f)\n",
icc_pre, icc_post, rho))
cat(sprintf("Corr(cluster means pre, post): %.3f (shared latent)\n", corr_pre_post))
cat(sprintf("Average cluster size: %.2f (across both waves)\n", mean_cluster_size))
}
# -----------------------------------
# Fit clustered mixed-effects model
# -----------------------------------
m0 <- suppressMessages(glmer(y ~ 1 + (1 | cluster), data = df, family = binomial))
m1 <- suppressMessages(glmer(y ~ wave + (1 | cluster), data = df, family = binomial))
# Likelihood-ratio test between models
pval <- as.numeric(anova(m0, m1, test = "Chisq")$`Pr(>Chisq)`[2])
as.integer(pval < alpha)
}
# -----------------------------
# Repeat simulation for power
# -----------------------------
simulate_power_latent <- function(R = 200, J = 100, p0 = 0.5, delta = 0.1,
rho = 0.1, alpha = 0.05, verbose = TRUE) {
results <- replicate(R, simulate_once_latent(J, p0, delta, rho, alpha, show_checks = (R == 1)))
power_est <- mean(results)
if (verbose) {
cat(sprintf("\nEstimated power (LRT mixed model) = %.3f (R=%d, J=%d, p0=%.2f, delta=%.2f, rho=%.2f)\n",
power_est, R, J, p0, delta, rho))
}
return(power_est)
}
# -----------------------------
# Example run
# -----------------------------
# Check diagnostics
simulate_once_latent(J = 1000, p0 = 0.6, delta = 0.07, rho = 0.1, show_checks = TRUE)
# Compute power
set.seed(123)
simulate_power_latent(R = 100, J = 100, p0 = 0.6, delta = 0.07, rho = 0.05)