Hurdle Model Simulations

library(lme4)
Loading required package: Matrix
library(GLMMadaptive)

Attaching package: 'GLMMadaptive'
The following object is masked from 'package:lme4':

    negative.binomial
library(tidyverse)
library(tictoc) # timing
library(multidplyr) # parallelization
library(beepr) # beeps

We’re going to do a simulation on the relative power of the hurdle vs. difference score models to detect condition differences.

Dataset generation

We start by generating a dataset. We can do this either via our data, via a fitted model or more artificially. I’m going to start with a simple synthetic dataset.

I’m doing this in the simplest possible way where there is a match vs. a mismatch trial on exactly one trial type. We can generalize this later.

One big question is about what the correlation is between search times and whether you search. We need to assume some random effects in this simulation. Here I’ve set the magnitude of these arbitrarily, and I assume there is no correlation between them (which is largely true in the pilot) and also that there’s no random slope (e.g., difference between kids in the match effect).

make_hurdle_data <- function (n = 100,
                              match_prob = .4,
                              match_time = log(3),
                              mismatch_boost = .3,
                              mismatch_time_boost = log(3), 
                              sub_prob_sd = .05,
                              sub_time_sd = .2) 
{
  expand_grid(trial_type = c("match","mismatch"), 
              subid = 1:n) |>
    group_by(subid) |>
    mutate(sub_match_prob = rnorm(n = 1, mean = match_prob, 
                                  sd = sub_prob_sd),
           sub_match_time = rnorm(n = 1, mean = match_time, 
                                  sd = sub_time_sd),
           searched = c(rbinom(n = 1, size = 1, 
                               prob = sub_match_prob), 
                        rbinom(n = 1, size = 1, 
                               prob = min(sub_match_prob + 
                                 mismatch_boost, 1)))) |>
    mutate(search_time = 
             ifelse(searched == 0, 0, 
                    ifelse(trial_type == "match", 
                           rlnorm(n = 1, 
                                  meanlog = sub_match_time),
                           rlnorm(n = 1, 
                                  meanlog = sub_match_time + 
                                    mismatch_time_boost))))
}

Try this out.

d <- make_hurdle_data()

ggplot(d, aes(x = trial_type, y = search_time)) + 
  geom_jitter(alpha = .4) + 
  stat_summary()
No summary function supplied, defaulting to `mean_se()`

d |> group_by(trial_type) |> summarise(searched = sum(searched))
# A tibble: 2 × 2
  trial_type searched
  <chr>         <int>
1 match            46
2 mismatch         75
d <- make_hurdle_data(mismatch_boost = 0)

ggplot(d, aes(x = trial_type, y = search_time)) + 
  geom_jitter(alpha = .4) + 
  stat_summary()
No summary function supplied, defaulting to `mean_se()`

d |> group_by(trial_type) |> summarise(searched = sum(searched))
# A tibble: 2 × 2
  trial_type searched
  <chr>         <int>
1 match            41
2 mismatch         48

Functions for simulation

Now set up functions to do both analyses for these data and get the relevant p-values.

Here I’m fitting a model that DOES assume correlation between the zero and continuous random effects.

# runs the simple difference score
do_diff_analysis <- function(data) {
  tt <- data |>
    group_by(subid) |>
    summarise(diff_score = search_time[trial_type == "mismatch"] - 
                search_time[trial_type == "match"]) |>
    pull(diff_score) |>
    t.test() 
  
  return(tt$p.value)
}

# runs the model
do_hurdle_analysis <- function(data) {
  mod_hurdle <- mixed_model(search_time ~ trial_type, 
                            random = ~ 1 | subid, 
                            data = data, 
                            family = hurdle.lognormal(), 
                            n_phis = 1,
                            zi_fixed = ~ trial_type, 
                            zi_random = ~ 1 | subid, 
                            control = list(max_coef_value = 100)) # to avoid non-convergence errors
  
  p_vals <- summary(mod_hurdle)[[1]][,"p-value"]
  hurdle_tibble <- tibble(search_time_p = summary(mod_hurdle)[[1]][,"p-value"][2],
                          searched_p = summary(mod_hurdle)[[2]][,"p-value"][2])
  return(hurdle_tibble)
}

# helper function to avoid problems when glmm crashes
safe_hurdle_analysis <- safely(do_hurdle_analysis, otherwise = NA)

Simulations

First test run

Here we run the simulations. They are slow, so we put them in an eval=FALSE and run them only once and cache. We also parallelize all of this to speed up.

cluster <- new_cluster(14) 
cluster_library(cluster, "tidyverse")
cluster_library(cluster, "GLMMadaptive")
cluster_copy(cluster, "make_hurdle_data")
cluster_copy(cluster, "do_diff_analysis")
cluster_copy(cluster, "do_hurdle_analysis")
cluster_copy(cluster, "safe_hurdle_analysis")
tic()
sims <- expand_grid(sim_num = 1:100,
                    n = c(20, 50, 100)) |>
  partition(cluster) |>
  mutate(data = map(n, make_hurdle_data), 
         diff_p = map_dbl(data, do_diff_analysis), 
         hurdle_p = map(data, safe_hurdle_analysis)) |>
  collect() |>
  mutate(hurdle_p = map(hurdle_p, ~.x$result)) |>
  unnest(hurdle_p)
toc()
beep()

save(sims, file = "sim_cache/simple_sims.Rds")

Now let’s average and plot these p-values.

load(file = "sim_cache/simple_sims.Rds")

sims |>
  pivot_longer(ends_with("_p"), names_to = "analysis", values_to = "p") |>
  mutate(sig = as.numeric(p < .05)) |>
  ggplot(aes(x = n, y = sig, col = analysis)) +
  stat_summary(geom = "line") + 
  stat_summary() +
  xlab("N per group") + 
  ylab("Proportion statistically significant")
Warning: Removed 9 rows containing non-finite values (`stat_summary()`).
No summary function supplied, defaulting to `mean_se()`
Warning: Removed 9 rows containing non-finite values (`stat_summary()`).
No summary function supplied, defaulting to `mean_se()`

Interesting. So under our default parameters, the binary variable in general is much lower power, but the search time is higher power. In particular the search time is higher power for low N but then maybe the difference score is a hint better for higher N?

Full simulation

How does this depend on how big each effect is? Let’s do that simulation.

cluster <- new_cluster(14) 
cluster_library(cluster, "tidyverse")
cluster_library(cluster, "GLMMadaptive")
cluster_copy(cluster, "make_hurdle_data")
cluster_copy(cluster, "do_diff_analysis")
cluster_copy(cluster, "do_hurdle_analysis")
cluster_copy(cluster, "safe_hurdle_analysis")
tic()
sims <- expand_grid(sim_num = 1:100,
                    n = c(20, 50, 100), 
                    match_prob = .4,
                    match_time = log(3),
                    mismatch_boost = c(.2, .4),
                    mismatch_time_boost = c(.5, 1.5), 
                    sub_prob_sd = c(.05, .1),
                    sub_time_sd = c(.2, .3)) |>
  partition(cluster) |>
  mutate(data = pmap(list(n = n, 
                          match_prob = match_prob, 
                          match_time = match_time,
                          mismatch_boost = mismatch_boost, 
                          mismatch_time_boost = mismatch_time_boost, 
                          sub_prob_sd = sub_prob_sd, 
                          sub_time_sd = sub_time_sd), 
                     make_hurdle_data), 
         diff_p = map_dbl(data, do_diff_analysis), 
         hurdle_p = map(data, safe_hurdle_analysis)) |>
  collect() |>
  mutate(hurdle_p = map(hurdle_p, ~.x$result)) |>
  unnest(hurdle_p)
toc()
# beep()

save(sims, file = "sim_cache/full_param_sims.Rds")

Now plot.

load(file = "sim_cache/full_param_sims.Rds")

sims |>
  pivot_longer(ends_with("_p"), names_to = "analysis", values_to = "p") |>
  mutate(sig = as.numeric(p < .05)) |>
  ggplot(aes(x = n, y = sig, col = analysis, lty = 
               interaction(as.factor(sub_prob_sd), 
                           as.factor(sub_time_sd)))) +
  stat_summary(geom = "line") + 
  stat_summary() +
  facet_grid(mismatch_boost ~ mismatch_time_boost) +
  xlab("N per group") + 
  ylab("Proportion statistically significant") + 
  scale_linetype_discrete(name = "Random effects") + 
  theme(legend.position = "bottom")
Warning: Removed 51 rows containing non-finite values (`stat_summary()`).
No summary function supplied, defaulting to `mean_se()`
No summary function supplied, defaulting to `mean_se()`
No summary function supplied, defaulting to `mean_se()`
No summary function supplied, defaulting to `mean_se()`
Warning: Removed 51 rows containing non-finite values (`stat_summary()`).
No summary function supplied, defaulting to `mean_se()`
No summary function supplied, defaulting to `mean_se()`
No summary function supplied, defaulting to `mean_se()`
No summary function supplied, defaulting to `mean_se()`

The quadrants show different strengths of the effect on not searching vs. on search time. Red is the difference scores. The different line types are different combos of random effects (high/low variance on not searching vs. high/low variance on search times).

What I see here is that if you have an effect, it can move between search time and searching. In some cases, difference scores can be as good as one or the other - but in some cases it’s substantially worse than one.

Single siimulation result

For debugging purposes.

Let’s dig into what is going on.

data <- sims$data[[4800]]

ggplot(data, aes(x = trial_type, y = search_time)) + 
  geom_jitter(alpha = .4) + 
  stat_summary()
No summary function supplied, defaulting to `mean_se()`

data |> group_by(trial_type) |> 
  summarise(m_searched = mean(searched), 
            m_search_time = mean(search_time), 
            m_search_time_nozero = mean(search_time[search_time > 0]))
# A tibble: 2 × 4
  trial_type m_searched m_search_time m_search_time_nozero
  <chr>           <dbl>         <dbl>                <dbl>
1 match            0.39          2.22                 5.69
2 mismatch         0.57          5.60                 9.82
data |>
    group_by(subid) |>
    summarise(diff_score = search_time[trial_type == "mismatch"] - 
                search_time[trial_type == "match"]) |>
    pull(diff_score) |>
    t.test() 

    One Sample t-test

data:  pull(summarise(group_by(data, subid), diff_score = search_time[trial_type == "mismatch"] - search_time[trial_type == "match"]), diff_score)
t = 2.7798, df = 99, p-value = 0.006511
alternative hypothesis: true mean is not equal to 0
95 percent confidence interval:
 0.9669979 5.7906872
sample estimates:
mean of x 
 3.378843 
mod_hurdle <- mixed_model(search_time ~ trial_type, 
                            random = ~ 1 | subid, 
                            data = data, 
                            family = hurdle.lognormal(), 
                            n_phis = 1,
                            zi_fixed = ~ trial_type, 
                            zi_random = ~ 1 | subid, 
                            control = list(max_coef_value = 100)) # to avoid non-convergence errors

summary(mod_hurdle)

Call:
mixed_model(fixed = search_time ~ trial_type, random = ~1 | subid, 
    data = data, family = hurdle.lognormal(), zi_fixed = ~trial_type, 
    zi_random = ~1 | subid, n_phis = 1, control = list(max_coef_value = 100))

Data Descriptives:
Number of Observations: 200
Number of Groups: 100 

Model:
 family: hurdle log-normal
 link: identity 

Fit statistics:
   log.Lik      AIC      BIC
 -278.1749 572.3498 593.1911

Random effects covariance matrix:
                StdDev    Corr
(Intercept)     0.1794        
zi_(Intercept)  0.7809 -0.1939

Fixed effects:
                   Estimate Std.Err z-value  p-value
(Intercept)          1.1108  0.2347  4.7335  < 1e-04
trial_typemismatch   0.5727  0.2290  2.5012 0.012378

Zero-part coefficients:
                   Estimate Std.Err z-value   p-value
(Intercept)          0.5061  0.2387  2.1208 0.0339381
trial_typemismatch  -0.8276  0.3205 -2.5822 0.0098166

log(residual std. dev.):
  Estimate Std.Err
    0.0638  0.0877

Integration:
method: adaptive Gauss-Hermite quadrature rule
quadrature points: 11

Optimization:
method: hybrid EM and quasi-Newton
converged: TRUE