knitr::opts_chunk$set(warning = FALSE)
knitr::opts_chunk$set(include = TRUE) 
options(scipen = 999) ## turning off scientific notation


library(lme4)  ## for clustered regressions 
## Loading required package: Matrix
#library(ICC)
library(tidyverse) ## for many things 
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ tidyr::pack()   masks Matrix::pack()
## ✖ tidyr::unpack() masks Matrix::unpack()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor) ## for %>% function mostly
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
#install.packages("simstudy")
library(simstudy)  ## for generating simulations 
#install.packages("ordinal")
library(ordinal) ## for ordinal outcome data
## 
## Attaching package: 'ordinal'
## 
## The following object is masked from 'package:dplyr':
## 
##     slice
#install.packages("misty")

#install.packages("furrr")
library(future) # backend
library(furrr)  # parallel version of purrr, for running multiple cores at once

Important Code

## Helper function for OR to delta
increase_odds <- function(p, OR) {
  #enter a baseline prob (p) and an Odds Ratio (OR)
  #model returns the delta, the percentage point increase 
  current_odds <- p/(1-p)
  new_odds <- current_odds * OR 
  new_p <- new_odds/ (1 + new_odds)
  delta <- new_p - p
  delta <- ifelse(p < 0.999999999, delta, 0)
  return(delta)
} 


## helper code to shrink probabilities 
affine_transform <- function(x, minmax){
  y <- minmax/(1-2*minmax)
  return((x + y)/(1+2*y))
}


## Main function

make_breastfed_data <- function(p0, delta, ordinal_or = 1.5, num_facilities, mean_workers_per_facility, mean_mothers_per_worker, var_across_facilities = 1, num_workers_variance = 1, var_within_facilities = 1, num_mothers_variance = 1, minmax_fac = 0.02, minmax_worker = 0.01, baseprobs = c(73/126, 13/126, 40/126)  ) {


odds_ratio <- ((p0 + delta)/(1 - p0 - delta)) / ((p0)/(1-p0)) ## calculates OR from p0 and delta. If you tell it you have a baseline of 90% and anticipate 5pp increase, it converts this into an OR of 1.9. As baseline is just an average, sometimes you'll have higher or lower values that are generated, and that you then need to apply an OR to. 


## The way 'simstudy' works is first by defining variables relationships (defData) then generating data (genData). You start with the biggest clusters, and then continue adding new details for each cluster. 

# generating Facility Data 
gen_facility <- defData(id = "facilityID", varname = "int_facility_prob", 
                        dist = "beta", 
                       formula = p0, # r program needs the mean of the beta distribution
                       variance = var_across_facilities) ## tried diff values, variance of 1 works best

gen_facility <- defData(gen_facility, varname = "facility_prob", 
                        dist = "normal", 
                        formula = "affine_transform(int_facility_prob, ..minmax_fac)", 
                        variance = 0) # 0 b/c there's no randomness

gen_facility <- defData(gen_facility, varname = "intermediate_num_workers", 
                        dist = "gamma", 
                        formula = 7, 
                        variance = num_workers_variance) ## 1 is the dispersion parameter

gen_facility <- defData(gen_facility, varname = "num_workers", dist = "normal", formula = "round(intermediate_num_workers) + 1", variance = 0)

dfacilities <- genData(num_facilities, gen_facility) %>% select(-c(int_facility_prob,intermediate_num_workers))

## generating Worker Data 
gen_workers <- defDataAdd(varname = "int_worker_prob", 
                          dist = "beta", 
                          formula  = "facility_prob",   # mean is just facility_prob (p)
                          variance = var_within_facilities) 


gen_workers <- defDataAdd(gen_workers, 
                          varname = "worker_prob", dist = "normal", 
                          formula = "affine_transform(int_worker_prob, ..minmax_worker)", 
                          variance = 0)

gen_workers <- defDataAdd(gen_workers, 
                          varname = "intermediate_num_mothers", dist = "gamma", 
                          formula = mean_mothers_per_worker, 
                          variance = num_mothers_variance)

gen_workers <- defDataAdd(gen_workers, 
                          varname = "num_mothers", dist = "normal", 
                          formula = "round(intermediate_num_mothers)",
                          variance = 0)


dworkers <- genCluster(dfacilities, "facilityID", 
                       numIndsVar = "num_workers", 
                       level1ID = "workerID")

dworkers <- addColumns(gen_workers, dworkers)

dworkers <- dworkers %>% select(-c(int_worker_prob,intermediate_num_mothers))



#### generating Parent Data 
gen_parents <- defDataAdd(varname = "treatment", 
                          dist = "binary", formula = 0.5) 

gen_parents <- defDataAdd(gen_parents, varname = "OR", dist = "normal", 
                          formula = odds_ratio, 
                          variance = 0)

gen_parents <- defDataAdd(gen_parents, varname = "worker_delta", 
                         dist = "normal", 
                         formula = "increase_odds(worker_prob, OR)", 
                         variance = 0)

gen_parents <- defDataAdd(gen_parents, varname = "breastfed", 
                         dist = "binary", 
                         formula = "worker_prob + treatment*worker_delta")

gen_parents <- defDataAdd(gen_parents, 
                          varname = "ordinal_effect", 
                          formula = "..ordinal_or*treatment + worker_prob", 
                          dist = "nonrandom")

dparents <- genCluster(dworkers, "workerID", 
                       numIndsVar = "num_mothers", 
                       level1ID = "parentID")

dparents <- addColumns(gen_parents, dparents)


dparents <- genOrdCat(dparents, id = "parentID", adjVar = "ordinal_effect", baseprobs, catVar = "ordinal_outcome")


## removing workers that don't have parents in pre and post 
dparents <- dparents %>%
  group_by(workerID) %>%
  filter(n_distinct(treatment) == 2) %>%
  ungroup()  

return(dparents)
  
}
demo_generated <- make_breastfed_data(p0 = 0.8, 
                    delta = 0.05, 
                    #ordinal_or = 1.5, 
                    num_facilities = 20, 
                    mean_workers_per_facility = 7,
                    mean_mothers_per_worker = 8,
                   # var_across_facilities = 1, num_workers_variance = 1, var_within_facilities = 1, num_mothers_variance = 1, minmax_fac = 0.02, minmax_worker = 0.01, 
                  # baseprobs = c(73/126, 13/126, 40/126)
                   )


demo_generated %>% 
  group_by(treatment) %>%
  summarise(
    n_facilities = n_distinct(facilityID),
    n_workers    = n_distinct(workerID),
    # workers per facility
    avg_workers_per_facility = n_workers / n_facilities,
    # parents per worker (each row is a parent)
    n_parents    = n(), 
    avg_parents_per_worker = n_parents / n_workers,
    .groups = "drop"
  ) %>% 
  mutate(treatment = if_else(treatment == 1, "treat", "control")) %>%
  pivot_wider(
    names_from  = treatment,
    values_from = c(
      n_facilities, n_workers, n_parents,
      avg_workers_per_facility, avg_parents_per_worker
    ),
    names_glue = "obs_{.value}_{treatment}"
  )
demo_generated %>% head(50)
demo_generated %>% summary()
##     workerID        facilityID   facility_prob     num_workers   
##  Min.   :  1.00   Min.   : 1.0   Min.   :0.1872   Min.   : 1.00  
##  1st Qu.: 33.00   1st Qu.: 6.0   1st Qu.:0.5413   1st Qu.: 5.00  
##  Median : 77.00   Median :10.0   Median :0.9333   Median :12.00  
##  Mean   : 75.79   Mean   :10.7   Mean   :0.7747   Mean   :12.65  
##  3rd Qu.:112.50   3rd Qu.:15.0   3rd Qu.:0.9800   3rd Qu.:18.00  
##  Max.   :153.00   Max.   :20.0   Max.   :0.9800   Max.   :25.00  
##   worker_prob       num_mothers      parentID        treatment    
##  Min.   :0.01004   Min.   : 2.0   Min.   :   1.0   Min.   :0.000  
##  1st Qu.:0.41966   1st Qu.: 9.0   1st Qu.: 312.5   1st Qu.:0.000  
##  Median :0.98887   Median :15.0   Median : 635.0   Median :1.000  
##  Mean   :0.72743   Mean   :16.9   Mean   : 636.6   Mean   :0.502  
##  3rd Qu.:0.99000   3rd Qu.:22.0   3rd Qu.: 954.5   3rd Qu.:1.000  
##  Max.   :0.99000   Max.   :37.0   Max.   :1281.0   Max.   :1.000  
##        OR         worker_delta        breastfed      ordinal_effect   
##  Min.   :1.417   Min.   :0.002920   Min.   :0.0000   Min.   :0.01004  
##  1st Qu.:1.417   1st Qu.:0.002920   1st Qu.:0.0000   1st Qu.:0.98629  
##  Median :1.417   Median :0.003247   Median :1.0000   Median :1.51004  
##  Mean   :1.417   Mean   :0.022615   Mean   :0.7304   Mean   :1.48046  
##  3rd Qu.:1.417   3rd Qu.:0.043631   3rd Qu.:1.0000   3rd Qu.:2.48942  
##  Max.   :1.417   Max.   :0.086422   Max.   :1.0000   Max.   :2.49000  
##  ordinal_outcome
##  1:332          
##  2:103          
##  3:804          
##                 
##                 
## 
dim(demo_generated)
## [1] 1239   13
model0 <- suppressMessages(glmer(data = demo_generated, 
      formula = breastfed ~ treatment + (1 | facilityID ) + (1 | workerID), 
      family = binomial)) ## the actual model 

model1 <- suppressMessages(glmer(data = demo_generated, 
      formula = breastfed ~ 1 + (1 | facilityID ) + (1 | workerID), 
      family = binomial))  ## the null model 


  
  exp_coef_demo <- exp(summary(model0)$coefficients[2,1])  # exponentiation the logged odds of treatment, this value is an Odds Ratio
 observed_delta_demo <- increase_odds(0.8, exp_coef_demo) ## the first line of this was in the code above 
  
  coef_pval_demo <- summary(model0)$coefficients[2,4] # p-value of treatment coefficient 
  model_pval_demo <- anova(model0, model1, test = "Chisq")[2,8] # Likelihood-ratio test between models

  
exp_coef_demo ## the Odds Ratio on the binary outcome 
## [1] 1.903377
observed_delta_demo ## the percentage point change for binary outcome, provided for ease of interpretation 
## [1] 0.08390328
coef_pval_demo ## p-value on the coefficient itself 
## [1] 0.006602305
model_pval_demo ## p-value of the model when compared to a null model, provided for alternative p-value 
## [1] 0.006082161
model3 <- clmm(ordinal_outcome ~ treatment + (1 | facilityID) + (1 | workerID), data = demo_generated) 

summary(model3)
## Cumulative Link Mixed Model fitted with the Laplace approximation
## 
## formula: ordinal_outcome ~ treatment + (1 | facilityID) + (1 | workerID)
## data:    demo_generated
## 
##  link  threshold nobs logLik  AIC     niter    max.grad cond.H 
##  logit flexible  1239 -969.71 1949.41 349(761) 6.68e-05 5.5e+01
## 
## Random effects:
##  Groups     Name        Variance Std.Dev.
##  workerID   (Intercept) 0.02581  0.1607  
##  facilityID (Intercept) 0.08058  0.2839  
## Number of groups:  workerID 113,  facilityID 20 
## 
## Coefficients:
##           Estimate Std. Error z value            Pr(>|z|)    
## treatment   1.4386     0.1305   11.02 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Threshold coefficients:
##     Estimate Std. Error z value
## 1|2 -0.39837    0.11321  -3.519
## 2|3  0.04283    0.11227   0.382
ordinal_coef_pval <- summary(model3)$coefficients[3,4]

ordinal_coef_pval
## [1] 0.0000000000000000000000000003060279
full_simulation <- function(p0, delta, ordinal_or = 1.5, num_facilities, mean_workers_per_facility, mean_mothers_per_worker, var_across_facilities = 1, num_workers_variance = 1, var_within_facilities = 1, num_mothers_variance = 1, minmax_fac = 0.02, minmax_worker = 0.01, baseprobs = c(73/126, 13/126, 40/126) ) {

dparents <- make_breastfed_data(p0 = p0, delta = delta, ordinal_or = ordinal_or, num_facilities = num_facilities, mean_workers_per_facility = mean_workers_per_facility, mean_mothers_per_worker = mean_mothers_per_worker, var_across_facilities = var_across_facilities, num_workers_variance = num_workers_variance, var_within_facilities = var_within_facilities, num_mothers_variance = num_mothers_variance, minmax_fac = minmax_fac, minmax_worker = minmax_worker, baseprobs = baseprobs )

dparents_summary <- dparents %>%
  group_by(treatment) %>%
  summarise(
    n_facilities = n_distinct(facilityID),
    n_workers    = n_distinct(workerID),
    # workers per facility
    avg_workers_per_facility = n_workers / n_facilities,
    # parents per worker (each row is a parent)
    n_parents    = n(), 
    avg_parents_per_worker = n_parents / n_workers,
    .groups = "drop"
  ) %>% 
  mutate(treatment = if_else(treatment == 1, "treat", "control")) %>%
  pivot_wider(
    names_from  = treatment,
    values_from = c(
      n_facilities, n_workers, n_parents,
      avg_workers_per_facility, avg_parents_per_worker
    ),
    names_glue = "obs_{.value}_{treatment}"
  )


m0 <- suppressMessages(glmer(data = dparents, 
      formula = breastfed ~ treatment + (1 | facilityID ) + (1 | workerID), 
      family = binomial)) ## the actual model 

m1 <- suppressMessages(glmer(data = dparents, 
      formula = breastfed ~ 1 + (1 | facilityID ) + (1 | workerID), 
      family = binomial))  ## the null model 


  
  exp_coef <- exp(summary(m0)$coefficients[2,1])  # exponentiation the logged odds of treatment, this value is an Odds Ratio
 observed_delta <- increase_odds(p0, exp_coef)
  
  coef_pval <- summary(m0)$coefficients[2,4] # p-value of treatment coefficient 
  model_pval <- anova(m0, m1, test = "Chisq")[2,8] # Likelihood-ratio test between models
  
  
  
### ordinal tests 
  
m3 <- clmm(ordinal_outcome ~ treatment + (1 | facilityID) + (1 | workerID), data = dparents) 

coef_ordinal <- summary(m3)$coefficients[3,1]

ordinal_coef_pval <- summary(m3)$coefficients[3,4]

results <- cbind(p0, delta, num_facilities, exp_coef, observed_delta, coef_ordinal, coef_pval, model_pval, ordinal_coef_pval, dparents_summary)

return(results)
  
}  
full_simulation(p0 = 0.8, 
                    delta = 0.05, 
                    #ordinal_or = 1.5, 
                    num_facilities = 20, 
                    mean_workers_per_facility = 7,
                    mean_mothers_per_worker = 8,
                   # var_across_facilities = 1, num_workers_variance = 1, var_within_facilities = 1, num_mothers_variance = 1, minmax_fac = 0.02, minmax_worker = 0.01, 
                #baseprobs = c(73/126, 13/126, 40/126)
                   )
p0_vals    <- c(0.4, 0.5, 0.6, 0.7, 0.8) ## change this 
delta_vals <- c(0.04, 0.06, 0.08, 0.10, 0.12, 0.14) ## change this 
num_facilities_vals <- c(10, 20, 30, 40, 50 ) ## change this 



params <- tidyr::expand_grid(p0 = p0_vals, delta = delta_vals, num_facilities = num_facilities_vals)
params_rep <- params %>%
  mutate(rep = list(1:10)) %>% ## the latter number is the number of simulations you wish to run. 
  tidyr::unnest(rep)


# Set up parallel back-end (use all cores minus 1)
plan(multisession, workers = availableCores() - 3)



res_all8 <- future_pmap_dfr(
  params_rep,
  ~ full_simulation(
    p0 = ..1,
    delta = ..2, 
    num_facilities = ..3,
    mean_workers_per_facility = 7, # you can change
    mean_mothers_per_worker = 8 # you can change 
    ## you can change the values below 
    #ordinal_or = 1.5, 
    #var_across_facilities = 1, num_workers_variance = 1, var_within_facilities = 1, num_mothers_variance = 1, minmax_fac = 0.02, minmax_worker = 0.01
  ),
      .options = furrr_options(
        packages = c("simstudy", "dplyr", "tidyr", "janitor", "lme4", "ordinal"),  # packages your functions use
        globals  = c("affine_transform", "full_simulation", "make_breastfed_data", "increase_odds") ),  # exports custom functions 
  .progress = TRUE
)




# Clean up (optional)
plan(sequential)  # back to single-core
all_simulations <- rbind(res_all, res_all2, res_all3, res_all4, res_all5, res_all6, res_all6, res_all7, res_all8)

all_simulations <- cbind(params_rep, all_simulations)

write.csv(all_simulations, "all_simulation.csv")

all_sim_summary <- cbind(params_rep, all_simulations) %>% 
  group_by(p0, delta, num_facilities) %>% summarise(
  num_sim = n(),
  ave_or = round(mean(exp_coef),3), 
  ave_delta = round(mean(observed_delta),3), 
  coef_power = round(mean(coef_pval < 0.05),3), 
  model_power = mean(model_pval < 0.05),
  ordinal_power = mean(ordinal_coef_pval <0.05)) 

all_sim_summary %>% ggplot(aes(x= delta, y = coef_power, color = factor(num_facilities))) + facet_wrap(~p0) + geom_line() + theme_bw()

POWER RESULTS

all_simulations %>% group_by(p0, delta, num_facilities) %>% summarise(
  ave_or = round(mean(exp_coef),3), 
  ave_delta = round(mean(observed_delta),3), 
  coef_power = round(mean(coef_pval < 0.05),3), 
  model_power = mean(model_pval < 0.05),
  ordinal_power = mean(ordinal_coef_pval <0.05)) 

Code I’m using to test things

library(misty)  ## for multilevel ICC
## |-------------------------------------|
## | misty 0.7.6 (2025-10-26)            |
## | Miscellaneous Functions T. Yanagida |
## |-------------------------------------|
## 
## Attaching package: 'misty'
## The following object is masked from 'package:janitor':
## 
##     crosstab
head(demo_generated)
#####
##### ICC calculations 

misty::multilevel.icc(data = demo_generated,
               breastfed, 
               cluster = c("facilityID", "workerID"))
##        L3        L2 
## 0.4257190 0.2563135
misty::multilevel.icc(data = demo_generated,
               breastfed, 
               cluster = c( "workerID"))
## [1] 0.6860078
misty::multilevel.icc(data = demo_generated,
               breastfed, 
               cluster = c( "facilityID"))
## [1] 0.52387
demo_generated %>% group_by(workerID) %>% 
  summarise(facilityID = mean(facilityID),
    ave_worker_prob = mean(breastfed)) %>% 
  misty::multilevel.icc(ave_worker_prob,
                        cluster = "facilityID")
## [1] 0.5607145
#####

dpilot <- read.csv("C:/Users/kinze/Downloads/Pilot data for Michael - Health Learn_Phase 2_Clean Dataset_DEIDENTIFIED.csv")
  
misty::multilevel.icc(dpilot, EIBF, cluster = "Facility.code")
## [1] 0.3050684
misty::multilevel.icc(dpilot, Immediate.SSC, cluster = "Facility.code")
## [1] 0.3827044
summary(dpilot)
##  Facility.code         resp_id         Immediate.SSC         EIBF       
##  Length:126         Min.   :13151757   Min.   :0.0000   Min.   :0.0000  
##  Class :character   1st Qu.:13177348   1st Qu.:0.0000   1st Qu.:1.0000  
##  Mode  :character   Median :16200686   Median :1.0000   Median :1.0000  
##                     Mean   :16202404   Mean   :0.5556   Mean   :0.8968  
##                     3rd Qu.:19232019   3rd Qu.:1.0000   3rd Qu.:1.0000  
##                     Max.   :19248898   Max.   :1.0000   Max.   :1.0000

Old Code

====================================================== POWER SIMULATION: SHARED LATENT VARIABLE + LRT MIXED MODEL ====================================================== Developed pro bono for HealthLearn by Martina Jakob This script simulates a two-period (pre–post) clustered experiment with binary outcomes and intraclass correlation (ICC).

Simulation setup: - J: number of clusters (workers) - p0: baseline prevalence (pre-treatment mean) - delta: treatment effect in percentage points (post - pre) - rho: intraclass correlation coefficient (binary scale) - alpha: significance level for hypothesis test (default 0.05) - R: number of simulation replications

Data-generating process: 1. Each cluster j has a latent random quantile U_j (shared across pre/post) determining correlated pre/post cluster probabilities. 2. Cluster-level probabilities (p_pre_j, p_post_j) follow Beta distributions with means p0 and p0 + delta, and common (a+b) = 1/rho - 1. → Ensures equal ICC across waves. 3. Each cluster has a variable number of observations (0–15, mean ≈8) drawn from a gamma-like discrete distribution. 4. Binary outcomes Y_ijt ~ Bernoulli(p_jt).

Estimation and inference: - Fits two nested logistic mixed-effects models with random intercepts: m0: y ~ 1 + (1 | cluster) m1: y ~ wave + (1 | cluster) - Compares m0 vs m1 using a likelihood-ratio test (Chi-square). - Power = share of replications where p-value < alpha.

Output: - Diagnostic stats for first simulation (means, ICCs, correlations, cluster size) - Estimated statistical power across R runs

Limitations: - Shared cluster error (U_j) across waves may be over-optimistic, inducing high pre-post correlation (especially under high ICC). Power may be overestimated. - No attrition or missing data modeled. ======================================================

# -----------------------------
# Helper: Beta parameters for mean & ICC
# -----------------------------
ab_from_pr <- function(p, rho) {
  k <- 1 / rho - 1
  c(a = p * k, b = (1 - p) * k)
}


##M's
ab_from_pr(0.5, 0.25)

# this returns a and b parameters (successes and failures)

curve(dbeta(x, 1000, 1000))

# -----------------------------
# Helper: discrete cluster-size distribution (0–15, mean ≈8)
# -----------------------------
rdisc_counts <- function(n_clusters) {
  vals <- 0:15
  probs <- dgamma(vals + 1, shape = 3, rate = 0.25) ## returns probability (density) at each of these specific integeres 
  probs <- probs / sum(probs)  ## summing to 1 b/c 
  sample(vals, n_clusters, replace = TRUE, prob = probs)
}


rdisc_counts(1000) %>% as.data.frame() %>% 
ggplot(aes(.))  + geom_histogram()


rdisc_counts(1000) %>% tabyl() 
as.data.frame() %>% 
ggplot(aes(.))  + geom_histogram()
  stat_ecdf(geom = "step") + geom_density()
p0 <- 0.45
delta <- 0.11
rho <- 0.1

J <- 100 # 100 workers

cbind(p_pre_j, p_post_j) %>% ggplot(aes()) + geom_density(aes(x=p_pre_j)) + geom_density(aes(x=p_post_j))
hist(n_pre)

# -----------------------------
# One simulation replicate
# -----------------------------
simulate_once_latent <- function(J, p0, delta, rho, alpha = 0.05, show_checks = TRUE) {
  # Beta parameters for pre/post
  ab0 <- ab_from_pr(p0, rho)
  ab1 <- ab_from_pr(p0 + delta, rho)
  
  # shared latent quantile per cluster (common cluster-level error)
  U <- runif(J)  #generating quantiles  
  p_pre_j  <- qbeta(U, ab0["a"], ab0["b"]) # returning a probability associated w/ each quantile from the beta distribution specified
  p_post_j <- qbeta(U, ab1["a"], ab1["b"])
  
  # variable cluster sizes
  n_pre  <- pmax(rdisc_counts(J), 1L) ## seems unnecessary, could have simply changed rdisc_counts to not allow for values of 0. 
  n_post <- pmax(rdisc_counts(J), 1L)
  
  # simulate individuals
  make_period <- function(flag, nvec) {
    do.call(rbind, lapply(seq_len(J), function(j) {
      n <- nvec[j]
      data.frame(cluster = j, wave = flag)[rep(1, n), ]
    }))
  }
  
  df <- rbind(make_period("pre", n_pre), make_period("post", n_post))
  
  cluster <- df$cluster
  df$y <- ifelse(df$wave == "pre",## generating outcome variables 
                 rbinom(nrow(df), 1, p_pre_j[cluster]),
                 rbinom(nrow(df), 1, p_post_j[cluster]))
  df$wave <- relevel(factor(df$wave), ref = "pre")
  
  # optional diagnostics
  if (show_checks) {
    mean_pre  <- mean(df$y[df$wave=="pre"])
    mean_post <- mean(df$y[df$wave=="post"])
    
    # Use ICC package for accurate ICC calculation
    icc_pre  <- ICCbare(cluster, y, data = subset(df, wave == "pre"))
    icc_post <- ICCbare(cluster, y, data = subset(df, wave == "post"))
    
    cm <- aggregate(y ~ cluster + wave, df, mean)
    w  <- reshape(cm, idvar = "cluster", timevar = "wave", direction = "wide")
    corr_pre_post <- cor(w$y.pre, w$y.post)
    mean_cluster_size <- mean(c(n_pre, n_post))
    
    cat(sprintf("\nMeans: pre=%.3f  post=%.3f  (target %.2f → %.2f)\n",
                mean_pre, mean_post, p0, p0 + delta))
    cat(sprintf("ICCs:  pre=%.3f  post=%.3f  (target %.2f)\n",
                icc_pre, icc_post, rho))
    cat(sprintf("Corr(cluster means pre, post): %.3f (shared latent)\n", corr_pre_post))
    cat(sprintf("Average cluster size: %.2f (across both waves)\n", mean_cluster_size))
  }
  
  # -----------------------------------
  # Fit clustered mixed-effects model 
  # -----------------------------------
  m0 <- suppressMessages(glmer(y ~ 1 + (1 | cluster), data = df, family = binomial))
  m1 <- suppressMessages(glmer(y ~ wave + (1 | cluster), data = df, family = binomial))
  
  # Likelihood-ratio test between models
  pval <- as.numeric(anova(m0, m1, test = "Chisq")$`Pr(>Chisq)`[2])
  
  as.integer(pval < alpha)
}
# -----------------------------
# Repeat simulation for power
# -----------------------------
simulate_power_latent <- function(R = 200, J = 100, p0 = 0.5, delta = 0.1,
                                  rho = 0.1, alpha = 0.05, verbose = TRUE) {
  results <- replicate(R, simulate_once_latent(J, p0, delta, rho, alpha, show_checks = (R == 1)))
  power_est <- mean(results)
  if (verbose) {
    cat(sprintf("\nEstimated power (LRT mixed model) = %.3f  (R=%d, J=%d, p0=%.2f, delta=%.2f, rho=%.2f)\n",
                power_est, R, J, p0, delta, rho))
  }
  return(power_est)
}

# -----------------------------
# Example run
# -----------------------------

# Check diagnostics
simulate_once_latent(J = 1000, p0 = 0.6, delta = 0.07, rho = 0.1, show_checks = TRUE)

# Compute power
set.seed(123)
simulate_power_latent(R = 100, J = 100, p0 = 0.6, delta = 0.07, rho = 0.05)