knitr::opts_chunk$set(warning = FALSE)
knitr::opts_chunk$set(include = TRUE)
options(scipen = 999) ## turning off scientific notation
library(lme4) ## for clustered regressions
## Warning: package 'lme4' was built under R version 4.5.2
## Loading required package: Matrix
#library(ICC)
library(tidyverse) ## for many things
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tidyr::pack() masks Matrix::pack()
## ✖ tidyr::unpack() masks Matrix::unpack()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor) ## for %>% function mostly
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
#install.packages("simstudy")
library(simstudy) ## for generating simulations
## Warning: package 'simstudy' was built under R version 4.5.2
#install.packages("ordinal")
library(ordinal) ## for ordinal outcome data
## Warning: package 'ordinal' was built under R version 4.5.2
##
## Attaching package: 'ordinal'
##
## The following object is masked from 'package:dplyr':
##
## slice
#install.packages("misty")
#install.packages("furrr")
library(future) # backend
## Warning: package 'future' was built under R version 4.5.2
library(furrr) # parallel version of purrr, for running multiple cores at once
## Warning: package 'furrr' was built under R version 4.5.2
## Helper function for OR to delta
increase_odds <- function(p, OR) {
#enter a baseline prob (p) and an Odds Ratio (OR)
#model returns the delta, the percentage point increase
current_odds <- p/(1-p)
new_odds <- current_odds * OR
new_p <- new_odds/ (1 + new_odds)
delta <- new_p - p
delta <- ifelse(p < 0.999999999, delta, 0)
return(delta)
}
## helper code to shrink probabilities
affine_transform <- function(x, minmax){
y <- minmax/(1-2*minmax)
return((x + y)/(1+2*y))
}
## Main function
make_breastfed_data <- function(p0, delta, ordinal_or = 1.5, num_facilities, mean_workers_per_facility, mean_mothers_per_worker, var_across_facilities = 1, num_workers_variance = 1, var_within_facilities = 1, num_mothers_variance = 1, minmax_fac = 0.02, minmax_worker = 0.01, baseprobs = c(73/126, 13/126, 40/126) ) {
odds_ratio <- ((p0 + delta)/(1 - p0 - delta)) / ((p0)/(1-p0)) ## calculates OR from p0 and delta. If you tell it you have a baseline of 90% and anticipate 5pp increase, it converts this into an OR of 1.9. As baseline is just an average, sometimes you'll have higher or lower values that are generated, and that you then need to apply an OR to.
## The way 'simstudy' works is first by defining variables relationships (defData) then generating data (genData). You start with the biggest clusters, and then continue adding new details for each cluster.
# generating Facility Data
gen_facility <- defData(id = "facilityID", varname = "int_facility_prob",
dist = "beta",
formula = p0, # r program needs the mean of the beta distribution
variance = var_across_facilities) ## tried diff values, variance of 1 works best
gen_facility <- defData(gen_facility, varname = "facility_prob",
dist = "normal",
formula = "affine_transform(int_facility_prob, ..minmax_fac)",
variance = 0) # 0 b/c there's no randomness
gen_facility <- defData(gen_facility, varname = "intermediate_num_workers",
dist = "gamma",
formula = 7,
variance = num_workers_variance) ## 1 is the dispersion parameter
gen_facility <- defData(gen_facility, varname = "num_workers", dist = "normal", formula = "round(intermediate_num_workers) + 1", variance = 0)
dfacilities <- genData(num_facilities, gen_facility) %>% select(-c(int_facility_prob,intermediate_num_workers))
## generating Worker Data
gen_workers <- defDataAdd(varname = "int_worker_prob",
dist = "beta",
formula = "facility_prob", # mean is just facility_prob (p)
variance = var_within_facilities)
gen_workers <- defDataAdd(gen_workers,
varname = "worker_prob", dist = "normal",
formula = "affine_transform(int_worker_prob, ..minmax_worker)",
variance = 0)
gen_workers <- defDataAdd(gen_workers,
varname = "intermediate_num_mothers", dist = "gamma",
formula = mean_mothers_per_worker,
variance = num_mothers_variance)
gen_workers <- defDataAdd(gen_workers,
varname = "num_mothers", dist = "normal",
formula = "round(intermediate_num_mothers)",
variance = 0)
dworkers <- genCluster(dfacilities, "facilityID",
numIndsVar = "num_workers",
level1ID = "workerID")
dworkers <- addColumns(gen_workers, dworkers)
dworkers <- dworkers %>% select(-c(int_worker_prob,intermediate_num_mothers))
#### generating Parent Data
gen_parents <- defDataAdd(varname = "treatment",
dist = "binary", formula = 0.5)
gen_parents <- defDataAdd(gen_parents, varname = "OR", dist = "normal",
formula = odds_ratio,
variance = 0)
gen_parents <- defDataAdd(gen_parents, varname = "worker_delta",
dist = "normal",
formula = "increase_odds(worker_prob, OR)",
variance = 0)
gen_parents <- defDataAdd(gen_parents, varname = "breastfed",
dist = "binary",
formula = "worker_prob + treatment*worker_delta")
gen_parents <- defDataAdd(gen_parents,
varname = "ordinal_effect",
formula = "..ordinal_or*treatment + worker_prob",
dist = "nonrandom")
dparents <- genCluster(dworkers, "workerID",
numIndsVar = "num_mothers",
level1ID = "parentID")
dparents <- addColumns(gen_parents, dparents)
dparents <- genOrdCat(dparents, id = "parentID", adjVar = "ordinal_effect", baseprobs, catVar = "ordinal_outcome")
## removing workers that don't have parents in pre and post
dparents <- dparents %>%
group_by(workerID) %>%
filter(n_distinct(treatment) == 2) %>%
ungroup()
return(dparents)
}
Simulation Function
full_simulation <- function(p0, delta, ordinal_or = 1.5, num_facilities, mean_workers_per_facility, mean_mothers_per_worker, var_across_facilities = 1, num_workers_variance = 1, var_within_facilities = 1, num_mothers_variance = 1, minmax_fac = 0.02, minmax_worker = 0.01, baseprobs = c(73/126, 13/126, 40/126) ) {
dparents <- make_breastfed_data(p0 = p0, delta = delta, ordinal_or = ordinal_or, num_facilities = num_facilities, mean_workers_per_facility = mean_workers_per_facility, mean_mothers_per_worker = mean_mothers_per_worker, var_across_facilities = var_across_facilities, num_workers_variance = num_workers_variance, var_within_facilities = var_within_facilities, num_mothers_variance = num_mothers_variance, minmax_fac = minmax_fac, minmax_worker = minmax_worker, baseprobs = baseprobs )
dparents_summary <- dparents %>%
group_by(treatment) %>%
summarise(
n_facilities = n_distinct(facilityID),
n_workers = n_distinct(workerID),
# workers per facility
avg_workers_per_facility = n_workers / n_facilities,
# parents per worker (each row is a parent)
n_parents = n(),
avg_parents_per_worker = n_parents / n_workers,
.groups = "drop"
) %>%
mutate(treatment = if_else(treatment == 1, "treat", "control")) %>%
pivot_wider(
names_from = treatment,
values_from = c(
n_facilities, n_workers, n_parents,
avg_workers_per_facility, avg_parents_per_worker
),
names_glue = "obs_{.value}_{treatment}"
)
m0 <- suppressMessages(glmer(data = dparents,
formula = breastfed ~ treatment + (1 | facilityID ) + (1 | workerID),
family = binomial)) ## the actual model
m1 <- suppressMessages(glmer(data = dparents,
formula = breastfed ~ 1 + (1 | facilityID ) + (1 | workerID),
family = binomial)) ## the null model
exp_coef <- exp(summary(m0)$coefficients[2,1]) # exponentiation the logged odds of treatment, this value is an Odds Ratio
observed_delta <- increase_odds(p0, exp_coef)
coef_pval <- summary(m0)$coefficients[2,4] # p-value of treatment coefficient
model_pval <- anova(m0, m1, test = "Chisq")[2,8] # Likelihood-ratio test between models
### ordinal tests
m3 <- clmm(ordinal_outcome ~ treatment + (1 | facilityID) + (1 | workerID), data = dparents)
coef_ordinal <- summary(m3)$coefficients[3,1]
ordinal_coef_pval <- summary(m3)$coefficients[3,4]
results <- cbind(p0, delta, num_facilities, exp_coef, observed_delta, coef_ordinal, coef_pval, model_pval, ordinal_coef_pval, dparents_summary)
return(results)
}
Simulations, this document shows very few iterations. Change if using in future.
p0_vals <- c(0.4, 0.5) ## change this
delta_vals <- c(0.04, 0.10, 0.14) ## change this
num_facilities_vals <- c(10, 30, 50 ) ## change this
params <- tidyr::expand_grid(p0 = p0_vals, delta = delta_vals, num_facilities = num_facilities_vals)
params_rep <- params %>%
mutate(rep = list(1:2)) %>% ## the latter number is the number of simulations you wish to run.
tidyr::unnest(rep)
# Set up parallel back-end (use all cores minus 1)
plan(multisession, workers = availableCores() - 3)
res_all <- future_pmap_dfr(
params_rep,
~ full_simulation(
p0 = ..1,
delta = ..2,
num_facilities = ..3,
mean_workers_per_facility = 7, # you can change
mean_mothers_per_worker = 8 # you can change
## you can change the values below
#ordinal_or = 1.5,
#var_across_facilities = 1, num_workers_variance = 1, var_within_facilities = 1, num_mothers_variance = 1, minmax_fac = 0.02, minmax_worker = 0.01
),
.options = furrr_options(
packages = c("simstudy", "dplyr", "tidyr", "janitor", "lme4", "ordinal"), # packages your functions use
globals = c("affine_transform", "full_simulation", "make_breastfed_data", "increase_odds") ), # exports custom functions
.progress = TRUE
)
# Clean up (optional)
plan(sequential) # back to single-core
res_all %>% group_by(p0, delta, num_facilities) %>% summarise(
ave_or = round(mean(exp_coef),3),
ave_delta = round(mean(observed_delta),3),
coef_power = round(mean(coef_pval < 0.05),3),
model_power = mean(model_pval < 0.05),
ordinal_power = mean(ordinal_coef_pval <0.05))
## `summarise()` has grouped output by 'p0', 'delta'. You can override using the
## `.groups` argument.