Power analysis for Dana

Power analysis 2*2*2 study: Import for packages:

library(MASS) # install.pacakges('MASS')
library(tidyverse) # install.pacakges('tidyverse')
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.4.0     ✔ purrr   0.3.4
✔ tibble  3.1.7     ✔ dplyr   1.1.0
✔ tidyr   1.2.0     ✔ stringr 1.4.0
✔ readr   2.1.4     ✔ forcats 0.5.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
✖ dplyr::select() masks MASS::select()

Time is a bit limited so I will skip comment! feel free to ask me if there’s questions:

Deploy and simulating

quick test: here’s explanation of log odds;

we expect

\(P(Y = 1) = \frac{e^{X\beta}}{1+e^{X\beta}}\)

therefore we have:

\(\frac{P(Y=1)}{1-P(Y=1)} = \frac{P(Y=1)}{P(Y=0)} = e^{X\beta}\)

hence you notice that this is already the odds ratio,

taking log we get $X\beta$, in our case this is what we set up for equation below:

effect_size[1] * as.numeric(factor1) + effect_size[2] * as.numeric(factor2) + effect_size[3] * as.numeric(factor3)

I hope it makes sense

A quick demo is as below:

set.seed(1)

n <- 500 # sample size we want to fit 
p <- 0.5  # baseline prob of people belong to category 1
effect_size <- c(0.2, 0.2, 0.2)  #this is log odds ratio difference 

# 2*2*2 within group setting
comb <- 2*2*2

# Generate factors
factor1 <- factor(rep(rep(c(0, 1), each = n), times = 4))
factor2 <- factor(rep(rep(c(0, 1), each = n), each = 2, times = 2))
factor3 <- factor(rep(c(0, 1), each = n, times = 4))

# Simulate the log odds of success
log_odds <- effect_size[1] * as.numeric(factor1) + 
            effect_size[2] * as.numeric(factor2) + 
            effect_size[3] * as.numeric(factor3)

# Convert log odds to probability
prob <- exp(log_odds) / (1 + exp(log_odds))

# Generate binary outcome based on the simulated probability
outcome <- rbinom(comb * n, 1, prob)

# Fit logistic regression model
fit <- glm(outcome ~ factor1 + factor2 + factor3, family = binomial())

# Output summary of the model
summary(fit)

Call:
glm(formula = outcome ~ factor1 + factor2 + factor3, family = binomial())

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.6557  -1.4650   0.7987   0.8784   0.9148  

Coefficients: (1 not defined because of singularities)
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)  0.65473    0.05851  11.189  < 2e-16 ***
factor11     0.32435    0.06955   4.664  3.1e-06 ***
factor21     0.09870    0.06941   1.422    0.155    
factor31          NA         NA      NA       NA    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 4868.1  on 3999  degrees of freedom
Residual deviance: 4844.3  on 3997  degrees of freedom
AIC: 4850.3

Number of Fisher Scoring iterations: 4

Now wrapping them up to a function, for interaction effect, you don’t have to change the effect_size default in the function here, you can define it yourself later when calling the function:

# Function to calculate power
power_simulation <- function(n, 
                             alpha = 0.05,
                             effect_size = c(0.2, 0.2, 0.2),
                             p = 0.5) {
  comb  = 2*2*2
  # Generate factors
  factor1 <- factor(rep(rep(c(0, 1), each = n), times = 4))
  factor2 <- factor(rep(rep(c(0, 1), each = n), each = 2, times = 2))
  factor3 <- factor(rep(c(0, 1), each = n, times = 4))
  
  # Simulate the log odds of success
  log_odds <- effect_size[1] * as.numeric(factor1) +
              effect_size[2] * as.numeric(factor2) +
              effect_size[3] * as.numeric(factor3) 
  
  # Convert log odds to probability
  prob <- exp(log_odds) / (1 + exp(log_odds))
  
  # Generate binary outcome based on the simulated probability
  outcome <- rbinom(comb * n, 1, prob)
  fit <-
    glm(outcome ~ factor1 + factor2 + factor3, family = binomial())
  
  # Check if the p-value for the treatment effect is less than alpha
  p_value <- summary(fit)$coefficients[2, 4]
  return(p_value < alpha)
}

Given power level, find sample size

estimate_sample_size <- function(desired_power, 
                                 alpha = 0.05,
                                 effect_size = c(0.2, 0.2, 0.2),
                                 p = 0.5,
                                 step = 10,
                                 max_n = 1000) {
  n <- step  
  estimated_power <- 0
  
  while (estimated_power < desired_power && n <= max_n) {
    # Run power simulation
    n_simulations <- 200  # feel free to change
    results <- replicate(n_simulations, power_simulation(n, alpha, effect_size, p))
    estimated_power <- mean(results)
    
    # Print progress (optional) feel free to turn it on if you need to trouble shooting below for printin
    #cat("Sample size:", n, "Estimated power:", estimated_power, "\n")
    
    n <- n + step
  }
  

  if (estimated_power >= desired_power) {
    return(n - step)  
  } else {
    return(NA)
  }
}
estimate_sample_size(
  0.8,
  alpha = 0.05,
  effect_size = c(0.2, 0.2, 0.2),
  p = 0.5,
  step = 10,
  max_n = 1000
)
[1] 130

Demo how to use this function:

desired_power <- 0.8
sample_size <- estimate_sample_size(desired_power)
print(sample_size)
[1] 120

Plot things with uniform increment on effect size

set.seed(124) # replicable thigns
slice <- 10 # how many time you want to slice up the range of effect size
sample_size <- rep(0, slice) # store estimated sample size needed
inc <- seq(0.1, 0.5, length.out = slice)
for (i in 1:slice) {
  effect_size <-
    c(0, 0, 0) + inc[i] # add more dimension if needed Dana!
  sample_size[i] <- estimate_sample_size(
    desired_power = 0.8,
    # how much power you need, this is in decimal so less than 1
    alpha = 0.05,
    # rejection region
    effect_size,
    # add more dimentions if you need to change the effect_size, to do this you also need to alter the function "power_simulation"
    p = 0.5,
    # baseline success rate
    step = 10,
    # step size or precise level to do this
    max_n = 1000 # max sample size for shuting down the loop
  )
}
library(ggplot2)
df = data.frame(increment = inc, sample_size = sample_size)
ggplot(data = df, aes(x = inc, y = sample_size))+
  coord_cartesian(ylim= c(0,500))+
  geom_line()+
  labs(y = 'Sample Size Needed', 
       x = 'Effect Size at Log Odd Ratio',
       title = 'Sample Size needed for given Log Odd Ratio (Effect Size)')+
  theme_bw()