setwd("C:/Work Files/Dissertations/Donna Manlongat")Donna M. Power Analyses
library(tidyverse)── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.2 ✔ tibble 3.3.0
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
✔ purrr 1.0.4
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ranger) # fast random forest
library(pROC) # AUCType 'citation("pROC")' for a citation.
Attaching package: 'pROC'
The following objects are masked from 'package:stats':
cov, smooth, var
library(rsample) # cross-validationsimulate_predictors <- function(n) {
tibble(
# Binary predictors
ETOH = rbinom(5620, 1, 0.1),
MRJ = rbinom(5620, 1, 0.1),
GES_Hyp = rbinom(5620, 1, 0.2),
Depression = rbinom(5620, 1, 0.2),
Marital_Stat = rbinom(5620,1,0.4),
Discrim = rbinom(5620,1,0.5),
Childhood_Bas_Need = rbinom(5620,1,0.3),
# Ordinal predictors (2–5 levels)
GWG = sample(1:3, 5620, replace = TRUE),
Age = sample(1:3, 5620, replace = TRUE),
EDU = sample(1:4, 5620, replace = TRUE),
INC = sample(1:4, 5620, replace = TRUE),
Life_Stress = sample(1:4, 5620, replace = TRUE),
Resilience = sample(1:5, 5620, replace = TRUE),
Basic_NEED = sample(1:3, 5620, replace = TRUE),
ACE = sample(1:5, 5620, replace = TRUE),
)
}simulate_outcome <- function(X, target_prev = 0.131) {
# True signal: subset of predictors matter
lp <-
0.6 * X$GWG +
0.5 * X$Age +
0.4 * X$EDU +
0.3 * X$INC +
0.5 * X$Life_Stress +
0.4 * X$Resilience +
0.5 * X$Basic_NEED
0.5 * X$ACE
rnorm(nrow(X), 0, 0.8) # noise
# Find intercept to match prevalence
intercept <- uniroot(
function(b) mean(plogis(b + lp)) - target_prev,
interval = c(-10, 10)
)$root
p <- plogis(intercept + lp)
rbinom(nrow(X), 1, p)
}fit_evaluate_rf <- function(data) {
folds <- vfold_cv(data, v = 5, strata = SGA)
aucs <- map_dbl(folds$splits, function(split) {
train <- analysis(split)
test <- assessment(split)
rf <- ranger(
SGA ~ .,
data = train,
probability = TRUE,
num.trees = 500,
mtry = floor(sqrt(ncol(train) - 1)),
min.node.size = 20,
class.weights = c("0" = 1, "1" = 5620 / 738)
)
preds <- predict(rf, test)$predictions[, "1"]
roc_obj <- roc(test$SGA, preds, quiet = TRUE)
as.numeric(auc(roc_obj))
})
mean(aucs)
}run_simulation <- function(
n_sim = 500,
n = 5620,
target_auc = 0.70
) {
aucs <- replicate(n_sim, {
X <- simulate_predictors(n)
y <- simulate_outcome(X)
data <- bind_cols(X, SGA = factor(y, levels = c(0, 1)))
fit_evaluate_rf(data)
})
tibble(
mean_auc = mean(aucs),
sd_auc = sd(aucs),
power = mean(aucs >= target_auc)
)
}set.seed(2026)
results <- run_simulation(
n_sim = 300,
target_auc = 0.75
)
results# A tibble: 1 × 3
mean_auc sd_auc power
<dbl> <dbl> <dbl>
1 0.766 0.00983 0.953