# Load required packages
library(lme4)
library(simr)
library(tidyverse)
library(ggthemes)
We used the simr package (Green and Macleod 2016) to estimate the sample size required to detect effects with 80% power for three simulated data sets with hierarchical structure. For each data set we run multiple power simulations.
The power analysis is based on simulated narrative story scores with predictors of the LEVANTE vocabulary score (at time point 1) over 3 test occasions. Data come from groups of children nested within classrooms nested within schools. Power simulation will use linear mixed-effects modelling.
For the simulation we use the following parameter values based on results reported in Babayiğit, Roulstone, and Wren (2021), Gagarina, Bohnacker, and Lindgren (2019), and Tselekidou et al. (2024).
## Intercept and slopes for vocabulary and time effects
fixed <- c(5, # mean narrative school of preschoolers based on Gagarina et al. (2019)
.2, # moderate effect of vocabulary based on Tselekidou et al. (2024) and Babayiğit et al. (2021)
.8) # linear growth over time
## Random intercepts for participants clustered by class and school
rand <- list(.75, # variability for schools
1.5, # Moderate variability for classrooms
2) # Individual differences for children
## residual variance
res <- 2
The following code is simulating the narrative score data set.
# Description:
# Mixed-effects linear regression predicting narrative structure score from vocabulary.
# Random intercepts for school and class.
set.seed(365)
n_schools <- 20
n_classrooms <- 5
n_children_per_class <- 10
n_timepoints <- 3
# Create identifiers
school <- rep(1:n_schools, each = n_classrooms * n_children_per_class * n_timepoints)
classroom <- rep(1:(n_classrooms * n_schools), each = n_children_per_class * n_timepoints)
child <- rep(1:(n_children_per_class * n_schools * n_classrooms), each = n_timepoints)
# Simulate random effects
school_effects <- rnorm(n_schools, sd = rand[[1]])
class_effects <- rnorm(n_schools * n_classrooms, sd = rand[[2]])
child_effects <- rnorm(n_schools * n_classrooms * n_children_per_class, sd = rand[[3]])
# Create data frame
data <- tibble(child, classroom, school) %>%
arrange(school, classroom, child) %>%
mutate(time = rep(0:(n_timepoints - 1), n()/3),
# Simulate predictors and outcome
school_re = school_effects[school],
class_re = class_effects[classroom],
child_re = child_effects[child],
vocabulary = rnorm(n_schools * n_classrooms * n_children_per_class)[child],
narrative = fixed[1] + vocabulary * fixed[2] + time * fixed[3] +
school_re + class_re + child_re + rnorm(n(), sd = res),
across(c(school, classroom, child), as.factor))
# Preview data
data
# A tibble: 3,000 × 9
child classroom school time school_re class_re child_re vocabulary narrative
<fct> <fct> <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 1 0 0.628 -0.0250 4.26 -2.36 8.35
2 1 1 1 1 0.628 -0.0250 4.26 -2.36 13.3
3 1 1 1 2 0.628 -0.0250 4.26 -2.36 9.87
4 2 1 1 0 0.628 -0.0250 -1.27 0.972 6.14
5 2 1 1 1 0.628 -0.0250 -1.27 0.972 3.54
6 2 1 1 2 0.628 -0.0250 -1.27 0.972 5.39
7 3 1 1 0 0.628 -0.0250 5.13 1.65 10.4
8 3 1 1 1 0.628 -0.0250 5.13 1.65 11.8
9 3 1 1 2 0.628 -0.0250 5.13 1.65 14.0
10 4 1 1 0 0.628 -0.0250 -0.721 0.226 8.33
# ℹ 2,990 more rows
Model coefficients of the simulated linear mixed effects model.
# Fit model
model <- makeLmer(narrative ~ vocabulary + time + (1|school/classroom/child),
fixef = fixed,
VarCorr = rand,
sigma = res,
data = data)
model
Linear mixed model fit by REML ['lmerMod']
Formula: narrative ~ vocabulary + time + (1 | school/classroom/child)
Data: data
REML criterion at convergence: 14449.97
Random effects:
Groups Name Std.Dev.
child:classroom:school (Intercept) 0.866
classroom:school (Intercept) 1.225
school (Intercept) 1.414
Residual 2.000
Number of obs: 3000, groups:
child:classroom:school, 1000; classroom:school, 100; school, 20
Fixed Effects:
(Intercept) vocabulary time
5.0 0.2 0.8
To determine how many schools we need to test to detect time and vocabulary effects on narrative scores using model comparisons with a power of 80%, we ran multiple power simulations for up to 20 with each 5 classrooms and each 10 children nested within classrooms.
p_curve_vocab <- powerCurve(model,
test = simr::fixed("vocabulary", method = "t"),
along = "school",
breaks = seq(4, n_schools, 2))
p_curve_time <- powerCurve(model,
test = simr::fixed("time", method = "t"),
along = "school",
breaks = seq(4, n_schools, 2))
Finally we plot the power curve to determine the required number of participants for a power of 80%. The results can be seen in Figure 1. According to these simulation results, we require data from at least 10 schools to be able to detect an effect of vocabulary with a power of 86.4% (95% CI: 84.1%, 88.5%). Power for the fixed effect of time was at ceiling for the smallest number of schools.
Figure 1: Power curve with estimated power over number of schools for the effect of time and vocabulary. Errorbars indicate 95% confidence intervals (CI).
Lexical processing will involve recording of eye-fixation data on identifying the picture of a target word vs semantic and phonological competitors plus distractor. Fixation data will be analysed in growth curve analysis using linear mixed-effects modeling to capture the fixation changes over time. Predictors will be included for executive functions and vocabulary as well as fixed effects with polynomial time terms (e.g., linear, quadratic). Random effects will be included for child-level intercepts.
For the simulation we use parameter values based on the results reported in Jeppsen et al. (2025) and Pomper, Kaushanskaya, and Saffran (2022).
# Fixed effects estimates
fixed <- c(0.00, 0.05, 0.02, 0.03, 1.5, -2.0)
# Random effects
rand <- list(0.01)
# residual error
res <- 0.025
The following code is simulating the data:
# Set parameters
set.seed(123)
n_children <- 100
n_timepoints <- 10
# Simulate subject-level predictors
children_data <- tibble(
child = factor(1:n_children),
age = rnorm(n_children, mean = 8), # age in years
vocabulary = rnorm(n_children, sd = 2),
exec_function = rnorm(n_children, sd = 2),
child_effect = rnorm(n_children, sd = unlist(rand))) # random intercepts
# Expand to timepoints
data <- children_data %>%
slice(rep(1:n(), each = n_timepoints)) %>%
mutate(time = rep(seq(0, 1, length.out = n_timepoints), times = n_children),
fixation = age * fixed[2] + vocabulary * fixed[3] + exec_function * fixed[4] + time * fixed[5] + time^2 * fixed[6] + child_effect + rnorm(n(), sd = res))
# Preview data
data
# A tibble: 1,000 × 7
child age vocabulary exec_function child_effect time fixation
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 7.44 -1.42 4.40 -0.00715 0 0.466
2 1 7.44 -1.42 4.40 -0.00715 0.111 0.581
3 1 7.44 -1.42 4.40 -0.00715 0.222 0.687
4 1 7.44 -1.42 4.40 -0.00715 0.333 0.745
5 1 7.44 -1.42 4.40 -0.00715 0.444 0.757
6 1 7.44 -1.42 4.40 -0.00715 0.556 0.643
7 1 7.44 -1.42 4.40 -0.00715 0.667 0.571
8 1 7.44 -1.42 4.40 -0.00715 0.778 0.444
9 1 7.44 -1.42 4.40 -0.00715 0.889 0.208
10 1 7.44 -1.42 4.40 -0.00715 1 -0.0260
# ℹ 990 more rows
Model coefficients of the simulated linear mixed effects model.
# Simulate model using makeLmer
model <- makeLmer(fixation ~ age + vocabulary + exec_function + time + I(time^2) + (1 | child),
fixef = fixed,
VarCorr = rand,
sigma = res,
data = data)
model
Linear mixed model fit by REML ['lmerMod']
Formula: fixation ~ age + vocabulary + exec_function + time + I(time^2) +
(1 | child)
Data: data
REML criterion at convergence: -4088.685
Random effects:
Groups Name Std.Dev.
child (Intercept) 0.100
Residual 0.025
Number of obs: 1000, groups: child, 100
Fixed Effects:
(Intercept) age vocabulary exec_function time
0.00 0.05 0.02 0.03 1.50
I(time^2)
-2.00
To determine the sample size of children to detect effects for executive functions and vocabulary with a power of 80%, we ran multiple power simulations for up to 100 children.
p_curve_vocab <- powerCurve(model,
test = simr::fixed("vocabulary", method = "t"),
along = "child",
breaks = seq(20, n_children, 5))
p_curve_exfun <- powerCurve(model,
test = simr::fixed("exec_function", method = "t"),
along = "child",
breaks = seq(20, n_children, 5))
Finally we plot the power curve to determine the required number of children for a power of 80%. The results can be seen in Figure 2. According to these simulation results, we require data from at least 65 children to be able to detect an effect of vocabulary with a power of 89.1% (95% CI: 87%, 91%) and at least 30 children to be able to detect an effect of executive functions with a power of 86.1% (95% CI: 83.8%, 88.2%).
Figure 2: Power curve with estimated power over number of children for the effect of executive function and vocabulary on lexical processing. Errorbars indicate 95% confidence intervals (CI).
For sentence processing we use accuracy data (correct picture selection) simulated with predictors of executive function and vocabulary. For modelling accuracy data from sentence comprehension accuracy, we use logistic mixed-effects models with random intercepts for children. The parameter coefficients used for the simulation are based on Thothathiri, Kidd, and Rowland (2025).
# Fixed effects coefficients
fixed <- c(-0.5,
.5, # executive functions effect
.35) # vocabulary effect
## Random intercepts for children
rand <- list(.5)
The following code is simulating the accuracy data.
# Set seed for reproducibility
set.seed(123)
# Parameters
n_children <- 100
n_trials_per_child <- 20
# Generate hierarchical IDs
children <- factor(1:n_children)
# Create data frame
data <- expand.grid(
trial = 1:n_trials_per_child,
child = children)
# Simulate EF and Vocabulary scores per child
child_traits <- data %>%
select(child) %>%
distinct() %>%
mutate(exec_function = rnorm(n()),
vocabulary = rnorm(n()))
# Merge traits into main data
data <- left_join(data, child_traits, by = c("child")) %>% as_tibble()
# Simulate random effects
child_effects <- rnorm(n_children, sd = rand[[1]])
# Linear predictor
data <- data %>%
mutate(child_re = child_effects[child],
eta = fixed[1] + exec_function * fixed[2] + vocabulary * fixed[3] + child_re,
prob = plogis(eta),
accuracy = rbinom(n(), size = 1, prob = prob))
# Preview data
data
# A tibble: 2,000 × 8
trial child exec_function vocabulary child_re eta prob accuracy
<int> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 1 1 -0.560 -0.710 1.10 0.0705 0.518 1
2 2 1 -0.560 -0.710 1.10 0.0705 0.518 0
3 3 1 -0.560 -0.710 1.10 0.0705 0.518 1
4 4 1 -0.560 -0.710 1.10 0.0705 0.518 1
5 5 1 -0.560 -0.710 1.10 0.0705 0.518 1
6 6 1 -0.560 -0.710 1.10 0.0705 0.518 0
7 7 1 -0.560 -0.710 1.10 0.0705 0.518 1
8 8 1 -0.560 -0.710 1.10 0.0705 0.518 0
9 9 1 -0.560 -0.710 1.10 0.0705 0.518 1
10 10 1 -0.560 -0.710 1.10 0.0705 0.518 1
# ℹ 1,990 more rows
Model coefficients of the simulated logistic mixed-effects model.
# Simulate model
model <- makeGlmer(accuracy ~ exec_function + vocabulary + (1 | child),
family = binomial(),
fixef = fixed,
VarCorr = rand,
data = data)
model
Generalized linear mixed model fit by maximum likelihood (Laplace
Approximation) [glmerMod]
Family: binomial ( logit )
Formula: accuracy ~ exec_function + vocabulary + (1 | child)
Data: data
AIC BIC logLik -2*log(L) df.resid
2660.611 2683.015 -1326.306 2652.611 1996
Random effects:
Groups Name Std.Dev.
child (Intercept) 0.7071
Number of obs: 2000, groups: child, 100
Fixed Effects:
(Intercept) exec_function vocabulary
-0.50 0.50 0.35
To determine how many children we need to test to detect executive functions and vocabulary effects on comprehension accuracy with a power of 80%, we ran multiple power simulations for up to 100 children.
p_curve_vocab <- powerCurve(model,
test = simr::fixed("vocabulary", method = "z"),
along = "child",
breaks = seq(20, n_children, 10))
p_curve_exfun <- powerCurve(model,
test = simr::fixed("exec_function", method = "z"),
along = "child",
breaks = seq(20, n_children, 10))
We plot the power curve to determine the required number of children for a power of 80%. The results can be seen in Figure 3. According to these simulation results, we require data from at least 50 children to be able to detect an effect of vocabulary with a power of 83.3% (95% CI: 80.8%, 85.6%) and at least 30 children to be able to detect an effect of executive functions with a power of 86.7% (95% CI: 84.4%, 88.7%).
Figure 3: Power curve with estimated power over number of children for the effect of time and vocabulary on sentence comprehension accuracy. Errorbars indicate 95% confidence intervals (CI).