BiLExpo: Hierarchical IRT

Author

Alvin W.M. Tan

Published

October 9, 2025

Setup

Code

bilingual_item_data_clean <- readRDS("data/bilingual_item_data_clean.rds")

Data prep

For this pilot, we use all bilingual item-level data in Wordbank, including English (Irish)–Irish, English (American)–French (Quebecois), and English (American)–Spanish (Mexican).

Code

df_data <- bilingual_item_data_clean |> 
  rename(unilemma = uni_lemma) |> 
  filter(!is.na(unilemma),
         !is.na(value)) |> 
  mutate(uid = glue("{language}_{item_id}"),
         value = ifelse(value == "produces", 1, 0),
         age_sc = scale(age)[,1],
         exp_sc = scale(log1p(exposure_proportion / 100))[,1])

In total, there are 1016 children contributing 2184 administrations of 1466710 trials.

Code

df_data |> 
  group_by(language) |>
  summarise(
    n_admins = n_distinct(data_id),
    n_trials = n()
  )

# A tibble: 5 × 3
  language           n_admins n_trials
  <chr>                 <int>    <int>
1 English (American)     1057   718749
2 English (Irish)          80    51109
3 French (Quebecois)      460   301570
4 Irish                    95    65152
5 Spanish (Mexican)       492   330130

Some children contributed longitudinal data.

Code

df_data |> 
  group_by(child_id) |> 
  summarise(n_admins = n_distinct(data_id)) |> 
  group_by(n_admins) |> 
  summarise(n_children = n())

# A tibble: 6 × 2
  n_admins n_children
     <int>      <int>
1        1        319
2        2        521
3        3         17
4        4         96
5        6         58
6        8          5

Two-stage HIRT

The overall strategy:

Fit a 2PL IRT model to the item-level data, estimating person abilities (theta) and item difficulties and discriminations (beta, alpha). Note that we use the exp(logalpha) formulation to ensure positivity of alpha.
Extract the posterior estimates of theta and beta/logalpha, then regress them on person- and item-level covariates respectively, accounting for uncertainty in the estimates using measurement error models.

Stage 1: IRT

Here we fit a 2PL IRT model, specified as value ~ exp(logalpha) * (theta + beta), with random effects of data_id for theta (person ability), and uid for beta (item difficulty) and logalpha (log item discrimination).

We used variational Bayes (using the mean field algorithm) because it is much faster than MCMC. Prior work (Wu et al., 2020) has shown that variational inference can yield comparable results to MCMC for Wordbank specifically, so we considered it a reasonable approximation. This model takes ~14min to fit for the current data.

Code

stage1_formula <- bf(
  value ~ exp(logalpha) * (theta + beta),
  theta ~ 1 + (1 | data_id),
  beta ~ 1 + (1 |i| uid),
  logalpha ~ 1 + (1 |i| uid),
  nl = TRUE
)

stage1_priors <- c(
  prior(normal(0, 1), class = "b", nlpar = "theta"),
  prior(normal(0, 1), class = "b", nlpar = "beta"),
  prior(normal(0, 1), class = "b", nlpar = "logalpha"),
  prior(normal(0, 1), class = "sd", nlpar = "theta"),
  prior(normal(0, 1), class = "sd", nlpar = "beta"), 
  prior(normal(0, 1), class = "sd", nlpar = "logalpha")
)

stage1_model <- brm(
  formula = stage1_formula,
  data = df_data,
  family = c("bernoulli", "logit"),
  prior = stage1_priors,
  # chains = 4,
  # iter = 2000,
  # cores = 4,
  algorithm = "meanfield",
  iter = 10000,
  tol_rel_obj = 1e-4,
  backend = "cmdstanr",
  threads = threading(2),
  seed = 123
)

# using meanfield algo (variational) took 14min
saveRDS(stage1_model, "models/bilingual_hirt_stage1_model.rds")

Code

stage1_model <- readRDS("models/bilingual_hirt_stage1_model.rds")

Code

summary(stage1_model)

 Family: bernoulli 
  Links: mu = logit 
Formula: value ~ exp(logalpha) * (theta + beta) 
         theta ~ 1 + (1 | data_id)
         beta ~ 1 + (1 | i | uid)
         logalpha ~ 1 + (1 | i | uid)
   Data: df_data (Number of observations: 1466710) 
  Draws: 1 chains, each with iter = 1000; warmup = 0; thin = 1;
         total post-warmup draws = 1000

Multilevel Hyperparameters:
~data_id (Number of levels: 2184) 
                    Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sd(theta_Intercept)     1.46      0.00     1.45     1.47 1.00      955      948

~uid (Number of levels: 3336) 
                                       Estimate Est.Error l-95% CI u-95% CI
sd(beta_Intercept)                         1.21      0.00     1.20     1.22
sd(logalpha_Intercept)                     0.23      0.00     0.22     0.23
cor(beta_Intercept,logalpha_Intercept)    -0.13      0.01    -0.16    -0.11
                                       Rhat Bulk_ESS Tail_ESS
sd(beta_Intercept)                     1.00     1024      874
sd(logalpha_Intercept)                 1.00      981      845
cor(beta_Intercept,logalpha_Intercept) 1.00      610      794

Regression Coefficients:
                   Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
theta_Intercept       -0.09      0.00    -0.10    -0.09 1.00      916      710
beta_Intercept        -0.43      0.00    -0.44    -0.42 1.00     1083      941
logalpha_Intercept     0.21      0.00     0.20     0.22 1.00      905      915

Draws were sampled using variational(meanfield).

Code

plot(stage1_model, ask = FALSE)

Code

# pp_check(stage1_model, ndraws = 100)

Code

stage1_formula_rasch <- bf(
  value ~ theta + beta,
  theta ~ 1 + (1 | data_id),
  beta ~ 1 + (1 |i| uid),
  nl = TRUE
)

stage1_priors_rasch <- c(
  prior(normal(0, 1), class = "b", nlpar = "theta"),
  prior(normal(0, 1), class = "b", nlpar = "beta"),
  prior(normal(0, 1), class = "sd", nlpar = "theta"),
  prior(normal(0, 1), class = "sd", nlpar = "beta")
)

stage1_model_rasch <- brm(
  formula = stage1_formula_rasch,
  data = df_data,
  family = c("bernoulli", "logit"),
  prior = stage1_priors_rasch,
  # chains = 4,
  # iter = 2000,
  # cores = 4,
  algorithm = "meanfield",
  iter = 10000,
  tol_rel_obj = 1e-4,
  backend = "cmdstanr",
  threads = threading(2),
  seed = 123
)

saveRDS(stage1_model_rasch, "models/bilingual_hirt_stage1_model_rasch.rds")

Code

stage1_model_rasch <- readRDS("models/bilingual_hirt_stage1_model_rasch.rds")

Failed to calculate LOO/WAIC—the session aborts, likely due to OOM errors.

Stage 2: Covariates

Now we extract posterior estimates, and regress theta on age*exposure, and beta/logalpha on item language and unilemma. The mi() syntax is used to account for measurement error in the estimates. (These models are relatively fast and are fit on-the-fly during knitting.)

Note that here we fit a linear exposure term; we should be able to switch this out for other nonlinear expressions to test our hypotheses.

Code

theta_ranef <- ranef(stage1_model, summary = TRUE)$data_id[, , "theta_Intercept"]
theta_fixed <- fixef(stage1_model, summary = TRUE, nlpar = "theta")[1, "Estimate"]
theta_estimates <- tibble(
  data_id = rownames(theta_ranef),
  theta_est = theta_fixed + theta_ranef[, "Estimate"],
  theta_se = theta_ranef[, "Est.Error"]
)

beta_ranef <- ranef(stage1_model, summary = TRUE)$uid[, , "beta_Intercept"]
beta_fixed <- fixef(stage1_model, summary = TRUE, nlpar = "beta")[1, "Estimate"]
beta_estimates <- tibble(
  uid = rownames(beta_ranef),
  beta_est = beta_fixed + beta_ranef[, "Estimate"],
  beta_se = beta_ranef[, "Est.Error"]
)

logalpha_ranef <- ranef(stage1_model, summary = TRUE)$uid[, , "logalpha_Intercept"]
logalpha_fixed <- fixef(stage1_model, summary = TRUE, nlpar = "logalpha")[1, "Estimate"]
logalpha_estimates <- tibble(
  uid = rownames(logalpha_ranef),
  logalpha_est = logalpha_fixed + logalpha_ranef[, "Estimate"],
  logalpha_se = logalpha_ranef[, "Est.Error"]
)

Code

child_data <- df_data |>
  select(data_id, child_id, age_sc, exp_sc, language, age, exposure_proportion) |>
  distinct() |>
  mutate(data_id = as.character(data_id),
         child_id = as.character(child_id)) |> 
  left_join(theta_estimates, by = join_by(data_id))

# Item-level data with covariates
item_data <- df_data |>
  select(uid, unilemma, language) |>
  distinct() |>
  left_join(beta_estimates, by = join_by(uid)) |>
  left_join(logalpha_estimates, by = join_by(uid))

Theta model:

Code

summary(stage2_theta_model)

 Family: gaussian 
  Links: mu = identity; sigma = identity 
Formula: theta_est | mi(theta_se) ~ age_sc * exp_sc + (1 | child_id) 
   Data: child_data (Number of observations: 2184) 
  Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
         total post-warmup draws = 4000

Multilevel Hyperparameters:
~child_id (Number of levels: 1016) 
              Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sd(Intercept)     1.00      0.04     0.91     1.08 1.00     1137     1630

Regression Coefficients:
              Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
Intercept        -0.87      0.04    -0.95    -0.79 1.00     2741     2446
age_sc            1.01      0.03     0.95     1.08 1.00     3796     3018
exp_sc            0.78      0.03     0.72     0.84 1.00     5249     3189
age_sc:exp_sc     0.08      0.03     0.02     0.14 1.00     5672     3025

Further Distributional Parameters:
      Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sigma     1.18      0.02     1.13     1.23 1.00     1784     2133

Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).

Beta model:

Code

summary(stage2_beta_model)

Warning: There were 13 divergent transitions after warmup. Increasing
adapt_delta above 0.8 may help. See
http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup

 Family: gaussian 
  Links: mu = identity; sigma = identity 
Formula: beta_est | mi(beta_se) ~ (1 | unilemma) + (1 | language) 
   Data: item_data (Number of observations: 3336) 
  Draws: 4 chains, each with iter = 4000; warmup = 2000; thin = 1;
         total post-warmup draws = 8000

Multilevel Hyperparameters:
~language (Number of levels: 5) 
              Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sd(Intercept)     0.38      0.19     0.17     0.86 1.00     2873     3396

~unilemma (Number of levels: 993) 
              Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sd(Intercept)     1.09      0.03     1.03     1.15 1.00     2265     4459

Regression Coefficients:
          Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
Intercept    -0.45      0.18    -0.80    -0.04 1.00     2513     3097

Further Distributional Parameters:
      Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sigma     0.84      0.01     0.81     0.86 1.00     5955     5453

Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).

Code

beta_unilemma_var <- VarCorr(stage2_beta_model)$unilemma$sd[1, "Estimate"]^2
beta_language_var <- VarCorr(stage2_beta_model)$language$sd[1, "Estimate"]^2
beta_uid_var <- VarCorr(stage2_beta_model)$uid$sd[1, "Estimate"]^2
beta_residual_var <- VarCorr(stage2_beta_model)$residual$sd[1, "Estimate"]^2

beta_total_var <- beta_unilemma_var + beta_language_var + beta_uid_var + beta_residual_var

Logalpha model:

Code

summary(stage2_logalpha_model)

Warning: There were 39 divergent transitions after warmup. Increasing
adapt_delta above 0.8 may help. See
http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup

 Family: gaussian 
  Links: mu = identity; sigma = identity 
Formula: logalpha_est | mi(logalpha_se) ~ (1 | unilemma) + (1 | language) 
   Data: item_data (Number of observations: 3336) 
  Draws: 4 chains, each with iter = 4000; warmup = 2000; thin = 1;
         total post-warmup draws = 8000

Multilevel Hyperparameters:
~language (Number of levels: 5) 
              Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sd(Intercept)     0.03      0.02     0.01     0.08 1.00     1365     1068

~unilemma (Number of levels: 993) 
              Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sd(Intercept)     0.17      0.01     0.16     0.18 1.00     1462     2947

Regression Coefficients:
          Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
Intercept    -0.10      0.02    -0.14    -0.07 1.01     1252      852

Further Distributional Parameters:
      Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sigma     0.12      0.00     0.11     0.13 1.00     1225     2263

Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).

Code

logalpha_unilemma_var <- VarCorr(stage2_logalpha_model)$unilemma$sd[1, "Estimate"]^2
logalpha_language_var <- VarCorr(stage2_logalpha_model)$language$sd[1, "Estimate"]^2
logalpha_uid_var <- VarCorr(stage2_logalpha_model)$uid$sd[1, "Estimate"]^2
logalpha_residual_var <- VarCorr(stage2_logalpha_model)$residual$sd[1, "Estimate"]^2

logalpha_total_var <- logalpha_unilemma_var + logalpha_language_var + logalpha_uid_var + logalpha_residual_var

Plots

Plots for sanity checking

Code

p_age <- child_data |> 
  ggplot(aes(x = age, y = theta_est)) +
  geom_point(aes(size = 1/theta_se), alpha = 0.1) +
  geom_smooth(method = "lm", formula = y ~ x) +
  labs(x = "Age (mo)", y = "Theta estimate",
       size = "Precision")
p_age

Code

p_exp <- child_data |>
  ggplot(aes(x = exposure_proportion, y = theta_est)) +
  geom_point(aes(size = 1/theta_se), alpha = 0.1) +
  geom_smooth(method = "lm", formula = y ~ x) +
  labs(x = "Language exposure (%)", y = "Theta estimate",
       size = "Precision")
p_exp

Code

p_theta_lang <- child_data |> 
  ggplot(aes(x = theta_est)) +
  geom_density(aes(fill = language), alpha = 0.4) +
  labs(x = "Theta estimate", fill = "Language",
       size = "Precision")
p_theta_lang

Code

p_diff_lang <- item_data |> 
  ggplot(aes(x = beta_est)) +
  geom_density(aes(fill = language), alpha = 0.4) +
  labs(x = "Beta estimate", fill = "Language",
       size = "Precision")
p_diff_lang

Code

p_disc_lang <- item_data |>
  ggplot(aes(x = exp(logalpha_est))) +
  geom_density(aes(fill = language), alpha = 0.4) +
  labs(x = "Alpha estimate (exp(logalpha))", fill = "Language",
       size = "Precision")
p_disc_lang

Fit statistics

We evaluate the fit of the IRT model using Bayesian analogues of in- and outfit. (Code for posterior predictive checks is also available but it takes a long time to make draws.)

Code

pp_check(stage1_model, ndraws = 10)

Code

# extract parameter estimates (posterior means only)
theta_ranef <- ranef(stage1_model, summary = TRUE)$data_id[, , "theta_Intercept"]
beta_ranef <- ranef(stage1_model, summary = TRUE)$uid[, , "beta_Intercept"]
logalpha_ranef <- ranef(stage1_model, summary = TRUE)$uid[, , "logalpha_Intercept"]

theta_fixed <- fixef(stage1_model, summary = TRUE, nlpar = "theta")[1, "Estimate"]
beta_fixed <- fixef(stage1_model, summary = TRUE, nlpar = "beta")[1, "Estimate"]
logalpha_fixed <- fixef(stage1_model, summary = TRUE, nlpar = "logalpha")[1, "Estimate"]

data <- stage1_model$data |>
  mutate(
    theta = theta_fixed + theta_ranef[as.character(data_id), "Estimate"],
    beta = beta_fixed + beta_ranef[as.character(uid), "Estimate"],
    alpha = exp(logalpha_fixed + logalpha_ranef[as.character(uid), "Estimate"])
  )

# calculate predicted probabilities
data <- data |>
  mutate(
    p_pred = plogis(alpha * (theta + beta)),
    variance = p_pred * (1 - p_pred),
    residual = value - p_pred,
    z_residual = residual / sqrt(variance),
    z_residual_sq = z_residual^2
  )

item_fit <- data |>
  group_by(uid) |>
  summarise(
    n_obs = n(),
    # outfit: unweighted mean square
    outfit_msq = mean(z_residual_sq, na.rm = TRUE),
    outfit_zscore = (mean(z_residual_sq, na.rm = TRUE) - 1) / sqrt(2 / n()),
    # infit: variance-weighted mean square
    infit_msq = sum(z_residual_sq * variance, na.rm = TRUE) / 
      sum(variance, na.rm = TRUE),
    infit_zscore = (sum(z_residual_sq * variance, na.rm = TRUE) / 
                      sum(variance, na.rm = TRUE) - 1) / 
      sqrt(sum(variance^2, na.rm = TRUE) / sum(variance, na.rm = TRUE)^2),
    .groups = "drop"
  )

person_fit <- data |>
  group_by(data_id) |>
  summarise(
    n_obs = n(),
    outfit_msq = mean(z_residual_sq, na.rm = TRUE),
    outfit_zscore = (mean(z_residual_sq, na.rm = TRUE) - 1) / sqrt(2 / n()),
    infit_msq = sum(z_residual_sq * variance, na.rm = TRUE) / 
      sum(variance, na.rm = TRUE),
    infit_zscore = (sum(z_residual_sq * variance, na.rm = TRUE) / 
                      sum(variance, na.rm = TRUE) - 1) / 
      sqrt(sum(variance^2, na.rm = TRUE) / sum(variance, na.rm = TRUE)^2),
    .groups = "drop"
  )

Item fit plots

Code

p_item_infit <- item_fit |>
  ggplot(aes(x = infit_msq)) +
  geom_vline(xintercept = 1, col = "gray32") +
  geom_vline(xintercept = 0.7, col = "gray32", lty = "dashed") +
  geom_vline(xintercept = 1.3, col = "gray32", lty = "dashed") +
  geom_density() +
  labs(x = "Item infit mean square", y = "Density")
p_item_infit

Code

p_item_outfit <- item_fit |>
  ggplot(aes(x = outfit_msq)) +
  geom_vline(xintercept = 1, col = "gray32") +
  geom_vline(xintercept = 0.7, col = "gray32", lty = "dashed") +
  geom_vline(xintercept = 1.3, col = "gray32", lty = "dashed") +
  geom_density() +
  labs(x = "Item outfit mean square", y = "Density")
p_item_outfit