# Load required packages
library(lme4)
library(simr)
library(tidyverse)
library(ggthemes)

We used the simr package (Green and Macleod 2016) to estimate the sample size required to detect effects with 80% power for three simulated data sets with hierarchical structure. For each data set we run multiple power simulations.

Narrative skills

The power analysis is based on simulated narrative story scores with predictors of the LEVANTE vocabulary score (at time point 1) over 3 test occasions. Data come from groups of children nested within classrooms nested within schools. Power simulation will use linear mixed-effects modelling.

For the simulation we use the following parameter values based on results reported in Babayiğit, Roulstone, and Wren (2021), Gagarina, Bohnacker, and Lindgren (2019), and Tselekidou et al. (2024).

## Intercept and slopes for vocabulary and time effects
fixed <- c(5, # mean narrative school of preschoolers based on Gagarina et al. (2019)
          .2, # moderate effect of vocabulary based on Tselekidou et al. (2024) and Babayiğit et al. (2021)
          .8) # linear growth over time

## Random intercepts for participants clustered by class and school
rand <- list(.75, # variability for schools
             1.5, # Moderate variability for classrooms
             2) # Individual differences for children

## residual variance
res <- 2

The following code is simulating the narrative score data set.

# Description:
# Mixed-effects linear regression predicting narrative structure score from vocabulary.
# Random intercepts for school and class.
set.seed(365)
n_schools <- 20
n_classrooms <- 5
n_children_per_class <- 10
n_timepoints <- 3

# Create identifiers
school <- rep(1:n_schools, each = n_classrooms * n_children_per_class * n_timepoints)
classroom <- rep(1:(n_classrooms * n_schools), each = n_children_per_class * n_timepoints)
child <- rep(1:(n_children_per_class * n_schools * n_classrooms), each = n_timepoints)

# Simulate random effects
school_effects <- rnorm(n_schools, sd = rand[[1]])
class_effects <- rnorm(n_schools * n_classrooms, sd = rand[[2]])
child_effects <- rnorm(n_schools * n_classrooms * n_children_per_class, sd = rand[[3]])

# Create data frame
data <- tibble(child, classroom, school) %>% 
  arrange(school, classroom, child) %>% 
  mutate(time = rep(0:(n_timepoints - 1), n()/3),
          # Simulate predictors and outcome
          school_re = school_effects[school],
          class_re = class_effects[classroom],
          child_re = child_effects[child],
          vocabulary = rnorm(n_schools * n_classrooms * n_children_per_class)[child],
          narrative = fixed[1] + vocabulary * fixed[2] + time * fixed[3] + 
           school_re + class_re + child_re + rnorm(n(), sd = res),
          across(c(school, classroom, child), as.factor))

# Preview data
data
# A tibble: 3,000 × 9
   child classroom school  time school_re class_re child_re vocabulary narrative
   <fct> <fct>     <fct>  <int>     <dbl>    <dbl>    <dbl>      <dbl>     <dbl>
 1 1     1         1          0     0.628  -0.0250    4.26      -2.36       8.35
 2 1     1         1          1     0.628  -0.0250    4.26      -2.36      13.3 
 3 1     1         1          2     0.628  -0.0250    4.26      -2.36       9.87
 4 2     1         1          0     0.628  -0.0250   -1.27       0.972      6.14
 5 2     1         1          1     0.628  -0.0250   -1.27       0.972      3.54
 6 2     1         1          2     0.628  -0.0250   -1.27       0.972      5.39
 7 3     1         1          0     0.628  -0.0250    5.13       1.65      10.4 
 8 3     1         1          1     0.628  -0.0250    5.13       1.65      11.8 
 9 3     1         1          2     0.628  -0.0250    5.13       1.65      14.0 
10 4     1         1          0     0.628  -0.0250   -0.721      0.226      8.33
# ℹ 2,990 more rows

Model coefficients of the simulated linear mixed effects model.

# Fit model
model <- makeLmer(narrative ~ vocabulary + time + (1|school/classroom/child),
                  fixef = fixed, 
                  VarCorr = rand, 
                  sigma = res, 
                  data = data)

model
Linear mixed model fit by REML ['lmerMod']
Formula: narrative ~ vocabulary + time + (1 | school/classroom/child)
   Data: data
REML criterion at convergence: 14449.97
Random effects:
 Groups                 Name        Std.Dev.
 child:classroom:school (Intercept) 0.866   
 classroom:school       (Intercept) 1.225   
 school                 (Intercept) 1.414   
 Residual                           2.000   
Number of obs: 3000, groups:  
child:classroom:school, 1000; classroom:school, 100; school, 20
Fixed Effects:
(Intercept)   vocabulary         time  
        5.0          0.2          0.8  

To determine how many schools we need to test to detect time and vocabulary effects on narrative scores using model comparisons with a power of 80%, we ran multiple power simulations for up to 20 with each 5 classrooms and each 10 children nested within classrooms.

p_curve_vocab <- powerCurve(model, 
                      test = simr::fixed("vocabulary", method = "t"),
                      along = "school", 
                      breaks = seq(4, n_schools, 2))

p_curve_time <- powerCurve(model, 
                      test = simr::fixed("time", method = "t"),
                      along = "school", 
                      breaks = seq(4, n_schools, 2))

Finally we plot the power curve to determine the required number of participants for a power of 80%. The results can be seen in Figure 1. According to these simulation results, we require data from at least 10 schools to be able to detect an effect of vocabulary with a power of 86.4% (95% CI: 84.1%, 88.5%). Power for the fixed effect of time was at ceiling for the smallest number of schools.

Power curve with estimated power over number of schools for the effect of time and vocabulary. Errorbars indicate 95% confidence intervals (CI).

Figure 1: Power curve with estimated power over number of schools for the effect of time and vocabulary. Errorbars indicate 95% confidence intervals (CI).

Lexical processing

Lexical processing will involve recording of eye-fixation data on identifying the picture of a target word vs semantic and phonological competitors plus distractor. Fixation data will be analysed in growth curve analysis using linear mixed-effects modeling to capture the fixation changes over time. Predictors will be included for executive functions and vocabulary as well as fixed effects with polynomial time terms (e.g., linear, quadratic). Random effects will be included for child-level intercepts.

For the simulation we use parameter values based on the results reported in Jeppsen et al. (2025) and Pomper, Kaushanskaya, and Saffran (2022).

# Fixed effects estimates
fixed <- c(0.00, 0.05, 0.02, 0.03, 1.5, -2.0)

# Random effects
rand <- list(0.01)

# residual error
res <- 0.025

The following code is simulating the data:

# Set parameters
set.seed(123)
n_children <- 100
n_timepoints <- 10

# Simulate subject-level predictors
children_data <- tibble(
  child = factor(1:n_children),
  age = rnorm(n_children, mean = 8),  # age in years
  vocabulary = rnorm(n_children, sd = 2),
  exec_function = rnorm(n_children, sd = 2),
  child_effect = rnorm(n_children, sd = unlist(rand)))  # random intercepts

# Expand to timepoints
data <- children_data %>%
  slice(rep(1:n(), each = n_timepoints)) %>%
  mutate(time = rep(seq(0, 1, length.out = n_timepoints), times = n_children),
         fixation = age * fixed[2] + vocabulary * fixed[3] + exec_function * fixed[4] + time * fixed[5] + time^2 * fixed[6] + child_effect + rnorm(n(), sd = res))

# Preview data
data
# A tibble: 1,000 × 7
   child   age vocabulary exec_function child_effect  time fixation
   <fct> <dbl>      <dbl>         <dbl>        <dbl> <dbl>    <dbl>
 1 1      7.44      -1.42          4.40     -0.00715 0       0.466 
 2 1      7.44      -1.42          4.40     -0.00715 0.111   0.581 
 3 1      7.44      -1.42          4.40     -0.00715 0.222   0.687 
 4 1      7.44      -1.42          4.40     -0.00715 0.333   0.745 
 5 1      7.44      -1.42          4.40     -0.00715 0.444   0.757 
 6 1      7.44      -1.42          4.40     -0.00715 0.556   0.643 
 7 1      7.44      -1.42          4.40     -0.00715 0.667   0.571 
 8 1      7.44      -1.42          4.40     -0.00715 0.778   0.444 
 9 1      7.44      -1.42          4.40     -0.00715 0.889   0.208 
10 1      7.44      -1.42          4.40     -0.00715 1      -0.0260
# ℹ 990 more rows

Model coefficients of the simulated linear mixed effects model.

# Simulate model using makeLmer
model <- makeLmer(fixation ~ age + vocabulary + exec_function + time + I(time^2) + (1 | child),
  fixef = fixed,
  VarCorr = rand,
  sigma = res, 
  data = data)

model
Linear mixed model fit by REML ['lmerMod']
Formula: fixation ~ age + vocabulary + exec_function + time + I(time^2) +  
    (1 | child)
   Data: data
REML criterion at convergence: -4088.685
Random effects:
 Groups   Name        Std.Dev.
 child    (Intercept) 0.100   
 Residual             0.025   
Number of obs: 1000, groups:  child, 100
Fixed Effects:
  (Intercept)            age     vocabulary  exec_function           time  
         0.00           0.05           0.02           0.03           1.50  
    I(time^2)  
        -2.00  

To determine the sample size of children to detect effects for executive functions and vocabulary with a power of 80%, we ran multiple power simulations for up to 100 children.

p_curve_vocab <- powerCurve(model, 
                      test = simr::fixed("vocabulary", method = "t"), 
                      along = "child",
                      breaks = seq(20, n_children, 5))

p_curve_exfun <- powerCurve(model, 
                      test = simr::fixed("exec_function", method = "t"), 
                      along = "child",
                      breaks = seq(20, n_children, 5))

Finally we plot the power curve to determine the required number of children for a power of 80%. The results can be seen in Figure 2. According to these simulation results, we require data from at least 65 children to be able to detect an effect of vocabulary with a power of 89.1% (95% CI: 87%, 91%) and at least 30 children to be able to detect an effect of executive functions with a power of 86.1% (95% CI: 83.8%, 88.2%).

Power curve with estimated power over number of children for the effect of executive function and vocabulary on lexical processing. Errorbars indicate 95% confidence intervals (CI).

Figure 2: Power curve with estimated power over number of children for the effect of executive function and vocabulary on lexical processing. Errorbars indicate 95% confidence intervals (CI).

Sentence processing

For sentence processing we use accuracy data (correct picture selection) simulated with predictors of executive function and vocabulary. For modelling accuracy data from sentence comprehension accuracy, we use logistic mixed-effects models with random intercepts for children. The parameter coefficients used for the simulation are based on Thothathiri, Kidd, and Rowland (2025).

# Fixed effects coefficients
fixed <- c(-0.5, 
           .5,  # executive functions effect 
           .35)   # vocabulary effect 

## Random intercepts for children
rand <- list(.5) 

The following code is simulating the accuracy data.

# Set seed for reproducibility
set.seed(123)

# Parameters
n_children <- 100
n_trials_per_child <- 20

# Generate hierarchical IDs
children <- factor(1:n_children)

# Create data frame
data <- expand.grid(
  trial = 1:n_trials_per_child,
  child = children) 

# Simulate EF and Vocabulary scores per child
child_traits <- data %>%
  select(child) %>%
  distinct() %>%
  mutate(exec_function = rnorm(n()),
         vocabulary = rnorm(n()))

# Merge traits into main data
data <- left_join(data, child_traits, by = c("child")) %>% as_tibble()

# Simulate random effects
child_effects <- rnorm(n_children, sd = rand[[1]])

# Linear predictor
data <- data %>%
  mutate(child_re = child_effects[child],
        eta = fixed[1] + exec_function * fixed[2] + vocabulary * fixed[3] + child_re,
        prob = plogis(eta),
        accuracy = rbinom(n(), size = 1, prob = prob))

# Preview data
data
# A tibble: 2,000 × 8
   trial child exec_function vocabulary child_re    eta  prob accuracy
   <int> <fct>         <dbl>      <dbl>    <dbl>  <dbl> <dbl>    <int>
 1     1 1            -0.560     -0.710     1.10 0.0705 0.518        1
 2     2 1            -0.560     -0.710     1.10 0.0705 0.518        0
 3     3 1            -0.560     -0.710     1.10 0.0705 0.518        1
 4     4 1            -0.560     -0.710     1.10 0.0705 0.518        1
 5     5 1            -0.560     -0.710     1.10 0.0705 0.518        1
 6     6 1            -0.560     -0.710     1.10 0.0705 0.518        0
 7     7 1            -0.560     -0.710     1.10 0.0705 0.518        1
 8     8 1            -0.560     -0.710     1.10 0.0705 0.518        0
 9     9 1            -0.560     -0.710     1.10 0.0705 0.518        1
10    10 1            -0.560     -0.710     1.10 0.0705 0.518        1
# ℹ 1,990 more rows

Model coefficients of the simulated logistic mixed-effects model.

# Simulate model
model <- makeGlmer(accuracy ~ exec_function + vocabulary + (1 | child), 
                  family = binomial(),
                  fixef = fixed,
                  VarCorr = rand,
                  data = data)
model
Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: accuracy ~ exec_function + vocabulary + (1 | child)
   Data: data
      AIC       BIC    logLik -2*log(L)  df.resid 
 2660.611  2683.015 -1326.306  2652.611      1996 
Random effects:
 Groups Name        Std.Dev.
 child  (Intercept) 0.7071  
Number of obs: 2000, groups:  child, 100
Fixed Effects:
  (Intercept)  exec_function     vocabulary  
        -0.50           0.50           0.35  

To determine how many children we need to test to detect executive functions and vocabulary effects on comprehension accuracy with a power of 80%, we ran multiple power simulations for up to 100 children.

p_curve_vocab <- powerCurve(model, 
                      test = simr::fixed("vocabulary", method = "z"), 
                      along = "child",
                      breaks = seq(20, n_children, 10))

p_curve_exfun <- powerCurve(model, 
                      test = simr::fixed("exec_function", method = "z"), 
                      along = "child",
                      breaks = seq(20, n_children, 10))

We plot the power curve to determine the required number of children for a power of 80%. The results can be seen in Figure 3. According to these simulation results, we require data from at least 50 children to be able to detect an effect of vocabulary with a power of 83.3% (95% CI: 80.8%, 85.6%) and at least 30 children to be able to detect an effect of executive functions with a power of 86.7% (95% CI: 84.4%, 88.7%).

Power curve with estimated power over number of children for the effect of time and vocabulary on sentence comprehension accuracy. Errorbars indicate 95% confidence intervals (CI).

Figure 3: Power curve with estimated power over number of children for the effect of time and vocabulary on sentence comprehension accuracy. Errorbars indicate 95% confidence intervals (CI).

References

Babayiğit, Selma, Sue Roulstone, and Yvonne Wren. 2021. “Linguistic Comprehension and Narrative Skills Predict Reading Ability: A 9-Year Longitudinal Study.” British Journal of Educational Psychology 91 (1): 148–68. https://doi.org/https://doi.org/10.1111/bjep.12353.
Gagarina, Natalia, Ute Bohnacker, and Josefin Lindgren. 2019. “Macrostructural Organization of Adults’ Oral Narrative Texts.” ZAS Papers in Linguistics 62: 190–208. https://doi.org/10.21248/zaspil.62.2019.449.
Green, Peter, and C. J. Macleod. 2016. SIMR: An R Package for Power Analysis of Generalized Linear Mixed Models by Simulation.” Methods in Ecology and Evolution 7 (4): 493–98. https://doi.org/10.1111/2041-210X.12504.
Jeppsen, C., K. Baxelbaum, B. Tomblin, K. Klein, and B. McMurray. 2025. “The Development of Lexical Processing: Real-Time Phonological Competition and Semantic Activation in School Age Children.” Quarterly Journal of Experimental Psychology 78 (3): 437–58. https://doi.org/10.1177/17470218241244799.
Pomper, R., M. Kaushanskaya, and J. Saffran. 2022. “Change Is Hard: Individual Differences in Children’s Lexical Processing and Executive Functions After a Shift in Dimensions.” Language Learning and Development 18 (2): 229–47. https://doi.org/10.1080/15475441.2021.1947289.
Thothathiri, M., E. Kidd, and C. Rowland. 2025. “The Role of Executive Function in the Processing and Acquisition of Syntax.” Royal Society Open Science 12: 201497. https://doi.org/10.1098/rsos.201497.
Tselekidou, Freideriki, Elizabeth Stadtmiller, Assunta Süss, Katrin Lindner, and Natalia Gagarina. 2024. “Cognitive Skills Differentially Influence Narrative Macrostructure in Bilinguals’ L1 and L2.” Journal of Child Language 52 (6): 1437–47. https://doi.org/10.1017/S0305000924000394.