Cleaned Version, just the code needed for simulations.

knitr::opts_chunk$set(warning = FALSE)
knitr::opts_chunk$set(include = TRUE) 
options(scipen = 999) ## turning off scientific notation


library(lme4)  ## for clustered regressions 
## Warning: package 'lme4' was built under R version 4.5.2
## Loading required package: Matrix
#library(ICC)
library(tidyverse) ## for many things 
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ tidyr::pack()   masks Matrix::pack()
## ✖ tidyr::unpack() masks Matrix::unpack()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor) ## for %>% function mostly
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
#install.packages("simstudy")
library(simstudy)  ## for generating simulations 
## Warning: package 'simstudy' was built under R version 4.5.2
#install.packages("ordinal")
library(ordinal) ## for ordinal outcome data
## Warning: package 'ordinal' was built under R version 4.5.2
## 
## Attaching package: 'ordinal'
## 
## The following object is masked from 'package:dplyr':
## 
##     slice
#install.packages("misty")

#install.packages("furrr")
library(future) # backend
## Warning: package 'future' was built under R version 4.5.2
library(furrr)  # parallel version of purrr, for running multiple cores at once
## Warning: package 'furrr' was built under R version 4.5.2

Important Code

## Helper function for OR to delta
increase_odds <- function(p, OR) {
  #enter a baseline prob (p) and an Odds Ratio (OR)
  #model returns the delta, the percentage point increase 
  current_odds <- p/(1-p)
  new_odds <- current_odds * OR 
  new_p <- new_odds/ (1 + new_odds)
  delta <- new_p - p
  delta <- ifelse(p < 0.999999999, delta, 0)
  return(delta)
} 


## helper code to shrink probabilities 
affine_transform <- function(x, minmax){
  y <- minmax/(1-2*minmax)
  return((x + y)/(1+2*y))
}


## Main function

make_breastfed_data <- function(p0, delta, ordinal_or = 1.5, num_facilities, mean_workers_per_facility, mean_mothers_per_worker, var_across_facilities = 1, num_workers_variance = 1, var_within_facilities = 1, num_mothers_variance = 1, minmax_fac = 0.02, minmax_worker = 0.01, baseprobs = c(73/126, 13/126, 40/126)  ) {


odds_ratio <- ((p0 + delta)/(1 - p0 - delta)) / ((p0)/(1-p0)) ## calculates OR from p0 and delta. If you tell it you have a baseline of 90% and anticipate 5pp increase, it converts this into an OR of 1.9. As baseline is just an average, sometimes you'll have higher or lower values that are generated, and that you then need to apply an OR to. 


## The way 'simstudy' works is first by defining variables relationships (defData) then generating data (genData). You start with the biggest clusters, and then continue adding new details for each cluster. 

# generating Facility Data 
gen_facility <- defData(id = "facilityID", varname = "int_facility_prob", 
                        dist = "beta", 
                       formula = p0, # r program needs the mean of the beta distribution
                       variance = var_across_facilities) ## tried diff values, variance of 1 works best

gen_facility <- defData(gen_facility, varname = "facility_prob", 
                        dist = "normal", 
                        formula = "affine_transform(int_facility_prob, ..minmax_fac)", 
                        variance = 0) # 0 b/c there's no randomness

gen_facility <- defData(gen_facility, varname = "intermediate_num_workers", 
                        dist = "gamma", 
                        formula = 7, 
                        variance = num_workers_variance) ## 1 is the dispersion parameter

gen_facility <- defData(gen_facility, varname = "num_workers", dist = "normal", formula = "round(intermediate_num_workers) + 1", variance = 0)

dfacilities <- genData(num_facilities, gen_facility) %>% select(-c(int_facility_prob,intermediate_num_workers))

## generating Worker Data 
gen_workers <- defDataAdd(varname = "int_worker_prob", 
                          dist = "beta", 
                          formula  = "facility_prob",   # mean is just facility_prob (p)
                          variance = var_within_facilities) 


gen_workers <- defDataAdd(gen_workers, 
                          varname = "worker_prob", dist = "normal", 
                          formula = "affine_transform(int_worker_prob, ..minmax_worker)", 
                          variance = 0)

gen_workers <- defDataAdd(gen_workers, 
                          varname = "intermediate_num_mothers", dist = "gamma", 
                          formula = mean_mothers_per_worker, 
                          variance = num_mothers_variance)

gen_workers <- defDataAdd(gen_workers, 
                          varname = "num_mothers", dist = "normal", 
                          formula = "round(intermediate_num_mothers)",
                          variance = 0)


dworkers <- genCluster(dfacilities, "facilityID", 
                       numIndsVar = "num_workers", 
                       level1ID = "workerID")

dworkers <- addColumns(gen_workers, dworkers)

dworkers <- dworkers %>% select(-c(int_worker_prob,intermediate_num_mothers))



#### generating Parent Data 
gen_parents <- defDataAdd(varname = "treatment", 
                          dist = "binary", formula = 0.5) 

gen_parents <- defDataAdd(gen_parents, varname = "OR", dist = "normal", 
                          formula = odds_ratio, 
                          variance = 0)

gen_parents <- defDataAdd(gen_parents, varname = "worker_delta", 
                         dist = "normal", 
                         formula = "increase_odds(worker_prob, OR)", 
                         variance = 0)

gen_parents <- defDataAdd(gen_parents, varname = "breastfed", 
                         dist = "binary", 
                         formula = "worker_prob + treatment*worker_delta")

gen_parents <- defDataAdd(gen_parents, 
                          varname = "ordinal_effect", 
                          formula = "..ordinal_or*treatment + worker_prob", 
                          dist = "nonrandom")

dparents <- genCluster(dworkers, "workerID", 
                       numIndsVar = "num_mothers", 
                       level1ID = "parentID")

dparents <- addColumns(gen_parents, dparents)


dparents <- genOrdCat(dparents, id = "parentID", adjVar = "ordinal_effect", baseprobs, catVar = "ordinal_outcome")


## removing workers that don't have parents in pre and post 
dparents <- dparents %>%
  group_by(workerID) %>%
  filter(n_distinct(treatment) == 2) %>%
  ungroup()  

return(dparents)
  
}

Simulation Function

full_simulation <- function(p0, delta, ordinal_or = 1.5, num_facilities, mean_workers_per_facility, mean_mothers_per_worker, var_across_facilities = 1, num_workers_variance = 1, var_within_facilities = 1, num_mothers_variance = 1, minmax_fac = 0.02, minmax_worker = 0.01, baseprobs = c(73/126, 13/126, 40/126) ) {

dparents <- make_breastfed_data(p0 = p0, delta = delta, ordinal_or = ordinal_or, num_facilities = num_facilities, mean_workers_per_facility = mean_workers_per_facility, mean_mothers_per_worker = mean_mothers_per_worker, var_across_facilities = var_across_facilities, num_workers_variance = num_workers_variance, var_within_facilities = var_within_facilities, num_mothers_variance = num_mothers_variance, minmax_fac = minmax_fac, minmax_worker = minmax_worker, baseprobs = baseprobs )

dparents_summary <- dparents %>%
  group_by(treatment) %>%
  summarise(
    n_facilities = n_distinct(facilityID),
    n_workers    = n_distinct(workerID),
    # workers per facility
    avg_workers_per_facility = n_workers / n_facilities,
    # parents per worker (each row is a parent)
    n_parents    = n(), 
    avg_parents_per_worker = n_parents / n_workers,
    .groups = "drop"
  ) %>% 
  mutate(treatment = if_else(treatment == 1, "treat", "control")) %>%
  pivot_wider(
    names_from  = treatment,
    values_from = c(
      n_facilities, n_workers, n_parents,
      avg_workers_per_facility, avg_parents_per_worker
    ),
    names_glue = "obs_{.value}_{treatment}"
  )


m0 <- suppressMessages(glmer(data = dparents, 
      formula = breastfed ~ treatment + (1 | facilityID ) + (1 | workerID), 
      family = binomial)) ## the actual model 

m1 <- suppressMessages(glmer(data = dparents, 
      formula = breastfed ~ 1 + (1 | facilityID ) + (1 | workerID), 
      family = binomial))  ## the null model 


  
  exp_coef <- exp(summary(m0)$coefficients[2,1])  # exponentiation the logged odds of treatment, this value is an Odds Ratio
 observed_delta <- increase_odds(p0, exp_coef)
  
  coef_pval <- summary(m0)$coefficients[2,4] # p-value of treatment coefficient 
  model_pval <- anova(m0, m1, test = "Chisq")[2,8] # Likelihood-ratio test between models
  
  
  
### ordinal tests 
  
m3 <- clmm(ordinal_outcome ~ treatment + (1 | facilityID) + (1 | workerID), data = dparents) 

coef_ordinal <- summary(m3)$coefficients[3,1]

ordinal_coef_pval <- summary(m3)$coefficients[3,4]

results <- cbind(p0, delta, num_facilities, exp_coef, observed_delta, coef_ordinal, coef_pval, model_pval, ordinal_coef_pval, dparents_summary)

return(results)
  
}  

Simulations, this document shows very few iterations. Change if using in future.

p0_vals    <- c(0.4, 0.5) ## change this 
delta_vals <- c(0.04, 0.10, 0.14) ## change this 
num_facilities_vals <- c(10, 30, 50 ) ## change this 



params <- tidyr::expand_grid(p0 = p0_vals, delta = delta_vals, num_facilities = num_facilities_vals)
params_rep <- params %>%
  mutate(rep = list(1:2)) %>% ## the latter number is the number of simulations you wish to run. 
  tidyr::unnest(rep)


# Set up parallel back-end (use all cores minus 1)
plan(multisession, workers = availableCores() - 3)


res_all <- future_pmap_dfr(
  params_rep,
  ~ full_simulation(
    p0 = ..1,
    delta = ..2, 
    num_facilities = ..3,
    mean_workers_per_facility = 7, # you can change
    mean_mothers_per_worker = 8 # you can change 
    ## you can change the values below 
    #ordinal_or = 1.5, 
    #var_across_facilities = 1, num_workers_variance = 1, var_within_facilities = 1, num_mothers_variance = 1, minmax_fac = 0.02, minmax_worker = 0.01
  ),
      .options = furrr_options(
        packages = c("simstudy", "dplyr", "tidyr", "janitor", "lme4", "ordinal"),  # packages your functions use
        globals  = c("affine_transform", "full_simulation", "make_breastfed_data", "increase_odds") ),  # exports custom functions 
  .progress = TRUE
)



# Clean up (optional)
plan(sequential)  # back to single-core

POWER RESULTS

res_all %>% group_by(p0, delta, num_facilities) %>% summarise(
  ave_or = round(mean(exp_coef),3), 
  ave_delta = round(mean(observed_delta),3), 
  coef_power = round(mean(coef_pval < 0.05),3), 
  model_power = mean(model_pval < 0.05),
  ordinal_power = mean(ordinal_coef_pval <0.05)) 
## `summarise()` has grouped output by 'p0', 'delta'. You can override using the
## `.groups` argument.