Donna M. Power Analyses

setwd("C:/Work Files/Dissertations/Donna Manlongat")

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ranger)     # fast random forest
library(pROC)       # AUC

Type 'citation("pROC")' for a citation.

Attaching package: 'pROC'

The following objects are masked from 'package:stats':

    cov, smooth, var

library(rsample)    # cross-validation

simulate_predictors <- function(n) {
  tibble(
    # Binary predictors
    ETOH = rbinom(5620, 1, 0.1),
    MRJ = rbinom(5620, 1, 0.1),
    GES_Hyp = rbinom(5620, 1, 0.2),
    Depression = rbinom(5620, 1, 0.2),
    Marital_Stat = rbinom(5620,1,0.4),
    Discrim = rbinom(5620,1,0.5),
    Childhood_Bas_Need = rbinom(5620,1,0.3),
    
    # Ordinal predictors (2–5 levels)
    GWG  = sample(1:3, 5620, replace = TRUE),
    Age  = sample(1:3, 5620, replace = TRUE),
    EDU  = sample(1:4, 5620, replace = TRUE),
    INC  = sample(1:4, 5620, replace = TRUE),
    Life_Stress  = sample(1:4, 5620, replace = TRUE),
    Resilience = sample(1:5, 5620, replace = TRUE),
    Basic_NEED = sample(1:3, 5620, replace = TRUE),
    ACE = sample(1:5, 5620, replace = TRUE),
    
  )
}

simulate_outcome <- function(X, target_prev = 0.131) {

  # True signal: subset of predictors matter
  lp <- 
    0.6 * X$GWG +
    0.5 * X$Age +
    0.4 * X$EDU +
    0.3 * X$INC +
    0.5 * X$Life_Stress +
    0.4 * X$Resilience +
    0.5 * X$Basic_NEED
    0.5 * X$ACE
  
    rnorm(nrow(X), 0, 0.8)  # noise

  # Find intercept to match prevalence
  intercept <- uniroot(
    function(b) mean(plogis(b + lp)) - target_prev,
    interval = c(-10, 10)
  )$root

  p <- plogis(intercept + lp)
  rbinom(nrow(X), 1, p)
}

fit_evaluate_rf <- function(data) {

  folds <- vfold_cv(data, v = 5, strata = SGA)

  aucs <- map_dbl(folds$splits, function(split) {

    train <- analysis(split)
    test  <- assessment(split)

    rf <- ranger(
      SGA ~ .,
      data = train,
      probability = TRUE,
      num.trees = 500,
      mtry = floor(sqrt(ncol(train) - 1)),
      min.node.size = 20,
      class.weights = c("0" = 1, "1" = 5620 / 738)
    )

    preds <- predict(rf, test)$predictions[, "1"]

    roc_obj <- roc(test$SGA, preds, quiet = TRUE)
    as.numeric(auc(roc_obj))
  })

  mean(aucs)
}

run_simulation <- function(
  n_sim = 500,
  n = 5620,
  target_auc = 0.70
) {

  aucs <- replicate(n_sim, {

    X <- simulate_predictors(n)
    y <- simulate_outcome(X)

    data <- bind_cols(X, SGA = factor(y, levels = c(0, 1)))

    fit_evaluate_rf(data)
  })

  tibble(
    mean_auc = mean(aucs),
    sd_auc   = sd(aucs),
    power    = mean(aucs >= target_auc)
  )
}

set.seed(2026)

results <- run_simulation(
  n_sim = 300,
  target_auc = 0.75
)

results

# A tibble: 1 × 3
  mean_auc  sd_auc power
     <dbl>   <dbl> <dbl>
1    0.766 0.00983 0.953