CHI(?) Power analysis for a logistic regression model of engagement with prompts

Author

Ilya Musabirov

Code
library(rstanarm)
library(dplyr)
library(stringr)
library(glue)
Code
# glue transformer enabling sprintf-style format specs such as {x:.3f};
# used with str_glue() in print_sim_results() below
sprintf_transformer <- function(text, envir) {
  m <- regexpr(":.+$", text)
  if (m != -1) {
    format <- substring(regmatches(text, m), 2)
    regmatches(text, m) <- ""
    res <- eval(parse(text = text, keep.source = FALSE), envir)
    do.call(sprintf, list(glue("%{format}"), res))
  } else {
    eval(parse(text = text, keep.source = FALSE), envir)
  }
}
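As a minimal usage sketch (illustrative, with pi as the example value, not part of the original notebook): the transformer lets a glue/str_glue string carry a sprintf-style format spec after a colon, which is the mechanism print_sim_results() relies on below.

# Illustrative example: sprintf-style formatting inside a glue string
str_glue("pi is roughly {pi:.3f}", .transformer = sprintf_transformer)
#> pi is roughly 3.142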
Code
fit_one_glm <- function(baseline_probability,
                        main_effect, interaction_multiplier, N) {
  # Simulate one 2x2 factorial dataset: engagement probability is
  # baseline_probability with neither factor, main_effect with exactly one
  # factor, and main_effect * interaction_multiplier with both factors
  f1 <- sample(c(TRUE, FALSE), N, replace = TRUE)
  f2 <- sample(c(TRUE, FALSE), N, replace = TRUE)
  y <- rbinom(N, 1, baseline_probability)
  y[f1] <- rbinom(sum(f1), 1, main_effect)
  y[f2] <- rbinom(sum(f2), 1, main_effect)
  y[f1 & f2] <- rbinom(sum(f1 & f2), 1, main_effect * interaction_multiplier)

  # Fit the corresponding logistic regression with the interaction term
  glm(y ~ f1 * f2, family = "binomial")
}
  
  
power_glm <- function(baseline_probability, main_effect,
                      interaction_multiplier, N = 100, nsim = 500) {
  # Repeat the simulate-and-fit cycle nsim times, collecting p-values and
  # coefficient estimates for both main effects and the interaction
  pvals <- data.frame(f1 = numeric(nsim), f2 = numeric(nsim),
                      int = numeric(nsim))
  coefs <- data.frame(f1 = numeric(nsim), f2 = numeric(nsim),
                      int = numeric(nsim), intercept = numeric(nsim))

  for (i in 1:nsim) {
    m1 <- fit_one_glm(baseline_probability, main_effect,
                      interaction_multiplier, N)
    s <- coef(summary(m1))
    pvals[i, "f1"]  <- s["f1TRUE", "Pr(>|z|)"]
    pvals[i, "f2"]  <- s["f2TRUE", "Pr(>|z|)"]
    pvals[i, "int"] <- s["f1TRUE:f2TRUE", "Pr(>|z|)"]
    coefs[i, "f1"]  <- s["f1TRUE", "Estimate"]
    coefs[i, "f2"]  <- s["f2TRUE", "Estimate"]
    coefs[i, "int"] <- s["f1TRUE:f2TRUE", "Estimate"]
    coefs[i, "intercept"] <- s["(Intercept)", "Estimate"]
  }

  # Stack p-values and estimates into one long data frame, tagged by 'what'
  pvals$what <- "p"
  pvals$intercept <- NA
  coefs$what <- "coef"
  rbind(pvals, coefs)
}


print_sim_results <- function(sim) {
  # Power: share of simulations with p < 0.05 for each term
  print("Power")
  sim %>% filter(what == "p") %>%
    select(-what) %>%
    summarise(across(everything(), function(x) sum(x < 0.05) / length(x))) %>%
    print()

  # Median coefficient estimates across simulations (log-odds scale)
  coefs <- sim %>% filter(what == "coef")
  intercept   <- median(coefs$intercept)
  f1          <- median(coefs$f1)
  f2          <- median(coefs$f2)
  interaction <- median(coefs$int)

  # Note: exp() of a log-odds value is an odds (ratio), not a probability;
  # with engagement this rare the two are close, which is why exp() is used
  # as a shorthand for the "probabilities" reported below
  print(str_glue("Baseline probability {exp(intercept):.3f},
                 odds ratio f1 {exp(f1):.3f}, prob f1 {exp(intercept+f1):.3f},
                 odds ratio f2 {exp(f2):.3f}, prob f2 {exp(intercept+f2):.3f},
                 odds ratio interaction {exp(interaction):.3f}",
                 .transformer = sprintf_transformer))
}
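A quick sanity check of this data-generating process (a sketch, not part of the original notebook; the seed and the 0.1/0.2/2.5 values are illustrative and match Scenario 1 below): simulate one dataset and tabulate the realised engagement rate in each of the four cells.

# Sketch: one simulated dataset, Scenario-1-style parameters
set.seed(1)
N <- 600
f1 <- sample(c(TRUE, FALSE), N, replace = TRUE)
f2 <- sample(c(TRUE, FALSE), N, replace = TRUE)
y <- rbinom(N, 1, 0.1)                      # neither factor: ~10%
y[f1] <- rbinom(sum(f1), 1, 0.2)            # f1 only: ~20%
y[f2] <- rbinom(sum(f2), 1, 0.2)            # f2 only: ~20%
y[f1 & f2] <- rbinom(sum(f1 & f2), 1, 0.5)  # both: ~50% (0.2 * 2.5)
aggregate(y, by = list(f1 = f1, f2 = f2), FUN = mean)  # realised rate per cell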

Scenario 1

Baseline engagement ~10%; each factor leads to twice as much engagement (~20%); engagement with both factors (the interaction cell) is ~50% (0.2*2.5). N = 600 students. Power is ~66% for the main effects :( and ~28% for the interaction :((.

Code
power_glm(0.1, 0.2, 2.5, 600, 1000) %>% print_sim_results()
[1] "Power"
     f1    f2   int intercept
1 0.668 0.684 0.275        NA
Baseline probability 0.110, 
odds ratio f1 2.248, prob f1 0.248,  
odds ratio f2 2.265, prob f2 0.250, 
odds ratio interaction 1.777
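A caveat on reading these numbers (an illustrative aside, with values copied from the output above): exp() of a log-odds coefficient is an odds, not a probability, so "Baseline probability 0.110" is really the baseline odds; with engagement this rare the two are close. The exact conversion uses plogis():

b0 <- log(0.110)  # median intercept on the log-odds scale (from the output)
b1 <- log(2.248)  # median f1 coefficient (log odds ratio)
plogis(b0)        # exact baseline probability, ~0.099
plogis(b0 + b1)   # exact probability with f1 on, ~0.198
exp(b0 + b1)      # the odds labelled "prob f1" above, ~0.247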

Scenario 1a

Baseline engagement ~10%; each factor leads to 2.2 times as much engagement (~22%); engagement with both factors is ~55% (0.22*2.5). N = 600 students. Power is ~80% for the main effects :) but ~26% for the interaction :(.

Code
power_glm(0.1, 0.22, 2.5, 600, 1000)  %>% print_sim_results()
[1] "Power"
     f1    f2   int intercept
1 0.819 0.804 0.264        NA
Baseline probability 0.110, 
odds ratio f1 2.546, prob f1 0.281,  
odds ratio f2 2.567, prob f2 0.283, 
odds ratio interaction 1.721

Scenario 2

Baseline engagement ~5%; each factor leads to twice as much engagement (~10%); engagement with both factors is ~25% (0.1*2.5). N = 600 students. Power is ~34% for the main effects :( and ~10% for the interaction :((.

Code
power_glm(0.05, 0.1, 2.5, 600, 1000) %>% print_sim_results()
[1] "Power"
     f1    f2   int intercept
1 0.344 0.344 0.102        NA
Baseline probability 0.052, 
odds ratio f1 2.135, prob f1 0.110,  
odds ratio f2 2.106, prob f2 0.109, 
odds ratio interaction 1.448

Scenario 2a

Baseline engagement ~5%; each factor leads to twice as much engagement (~10%); engagement with both factors is ~25% (0.1*2.5). N = 1800 students. Power is ~82% for the main effects :) but ~20% for the interaction :(.

Code
power_glm(0.05, 0.1, 2.5, 1800, 1000) %>% print_sim_results()
[1] "Power"
     f1    f2   int intercept
1 0.828 0.812 0.198        NA
Baseline probability 0.052, 
odds ratio f1 2.119, prob f1 0.110,  
odds ratio f2 2.142, prob f2 0.111, 
odds ratio interaction 1.406
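Since the interaction is the hardest effect to detect, a natural follow-up (a sketch using power_glm() from above; the candidate sample sizes and the 200 simulations per N are illustrative choices to keep runtime modest) is to sweep the sample size and see where interaction power becomes acceptable:

# Sketch: interaction power as a function of N, Scenario 2 effect sizes
Ns <- c(600, 1200, 1800, 2400, 3600)
power_int <- sapply(Ns, function(n) {
  sim <- power_glm(0.05, 0.1, 2.5, N = n, nsim = 200)
  mean(sim[sim$what == "p", "int"] < 0.05)  # share of sims with p < 0.05
})
data.frame(N = Ns, interaction_power = power_int)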

Bayesian approach

Code
fit_one_bayes <- function(baseline_probability, main_effect,
                          interaction_multiplier, N = 100) {
  # Same data-generating process as fit_one_glm() above
  f1 <- sample(c(TRUE, FALSE), N, replace = TRUE)
  f2 <- sample(c(TRUE, FALSE), N, replace = TRUE)
  y <- rbinom(N, 1, baseline_probability)
  y[f1] <- rbinom(sum(f1), 1, main_effect)
  y[f2] <- rbinom(sum(f2), 1, main_effect)
  y[f1 & f2] <- rbinom(sum(f1 & f2), 1, main_effect * interaction_multiplier)

  # Bayesian logistic regression with rstanarm's default weakly informative
  # priors (note the warning below about the omitted 'data' argument)
  stan_glm(y ~ f1 * f2, family = "binomial")
}
Code
# One Bayesian fit on a single simulated dataset (Scenario 2a parameters)
m1 <- fit_one_bayes(0.05, 0.1, 2.5, 1800)
Warning: Omitting the 'data' argument is not recommended and may not be allowed
in future versions of rstanarm. Some post-estimation functions (in particular
'update', 'loo', 'kfold') are not guaranteed to work properly unless 'data' is
specified as a data frame.

[Stan sampling output for 4 chains omitted: 2000 iterations per chain (1000 warmup + 1000 sampling), roughly 1 second per chain.]
Code
library(bayestestR)
library(insight)
library(ggplot2)


posterior_sum <- describe_posterior(m1)
Possible multicollinearity between f2TRUE and f1TRUE (r = 0.72), f1TRUE:f2TRUE and f1TRUE (r = 0.85), f1TRUE:f2TRUE and f2TRUE (r = 0.83). This might lead to inappropriate results. See 'Details' in '?rope'.
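The correlations flagged in that message can be inspected directly from the posterior draws (a quick sketch, not part of the original output):

# Correlation matrix of the posterior draws underlying the warning above
round(cor(as.matrix(m1)), 2)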
Code
# for a nicer table
print_md(posterior_sum, digits = 2)
Summary of Posterior Distribution

| Parameter     | Median | 95% CI         | pd     | ROPE          | % in ROPE | Rhat  | ESS     |
|---------------|--------|----------------|--------|---------------|-----------|-------|---------|
| (Intercept)   | -3.19  | [-3.69, -2.75] | 100%   | [-0.18, 0.18] | 0%        | 1.003 | 1205.00 |
| f1TRUE        | 1.01   | [ 0.45, 1.59]  | 100%   | [-0.18, 0.18] | 0%        | 1.004 | 1222.00 |
| f2TRUE        | 1.15   | [ 0.62, 1.73]  | 100%   | [-0.18, 0.18] | 0%        | 1.005 | 1317.00 |
| f1TRUE:f2TRUE | -0.04  | [-0.70, 0.62]  | 54.50% | [-0.18, 0.18] | 42.39%    | 1.005 | 1195.00 |
Code
posteriors <- get_parameters(m1)

# Posterior density of the f2 main effect (log-odds scale)
ggplot(posteriors, aes(x = f2TRUE)) +
  geom_density(fill = "orange")

Code
hdi(posteriors$f1TRUE, ci = 0.9)
90% HDI: [0.55, 1.50]
Code
rope_range(m1)
[1] -0.1813799  0.1813799
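The ±0.18 above appears to be bayestestR's default ROPE for logistic models, ±0.1 × π/√3 on the log-odds scale; a quick check of that constant (illustrative, for reference only):

0.1 * pi / sqrt(3)  # = 0.1813799, matching rope_range(m1)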
Code
p_direction(posteriors$f2TRUE)
Probability of Direction: 1.00
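To make the Bayesian analysis comparable to the frequentist power figures above, one could repeat fit_one_bayes() over many simulated datasets and record how often the interaction's credible interval excludes zero. A rough sketch under those assumptions (the helper name power_bayes, the use of rstanarm's posterior_interval(), and the 95% central-interval rule are illustrative choices; each iteration refits a full Stan model, so this is slow):

# Sketch: Bayesian analogue of power_glm(), based on whether the 95%
# credible interval for the interaction excludes zero
power_bayes <- function(baseline_probability, main_effect,
                        interaction_multiplier, N, nsim = 50) {
  excludes_zero <- logical(nsim)
  for (i in seq_len(nsim)) {
    m <- fit_one_bayes(baseline_probability, main_effect,
                       interaction_multiplier, N)
    ci <- posterior_interval(m, prob = 0.95)["f1TRUE:f2TRUE", ]
    excludes_zero[i] <- ci[1] > 0 || ci[2] < 0
  }
  mean(excludes_zero)
}
# Example call (illustrative, Scenario 2a parameters):
# power_bayes(0.05, 0.1, 2.5, 1800, nsim = 50)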