Code
bilingual_item_data_clean <- readRDS("data/bilingual_item_data_clean.rds")bilingual_item_data_clean <- readRDS("data/bilingual_item_data_clean.rds")For this pilot, we use all bilingual item-level data in Wordbank, including English (Irish)–Irish, English (American)–French (Quebecois), and English (American)–Spanish (Mexican).
df_data <- bilingual_item_data_clean |>
rename(unilemma = uni_lemma) |>
filter(!is.na(unilemma),
!is.na(value)) |>
mutate(uid = glue("{language}_{item_id}"),
value = ifelse(value == "produces", 1, 0),
age_sc = scale(age)[,1],
exp_sc = scale(log1p(exposure_proportion / 100))[,1])In total, there are 1016 children contributing 2184 administrations of 1466710 trials.
df_data |>
group_by(language) |>
summarise(
n_admins = n_distinct(data_id),
n_trials = n()
)# A tibble: 5 × 3
language n_admins n_trials
<chr> <int> <int>
1 English (American) 1057 718749
2 English (Irish) 80 51109
3 French (Quebecois) 460 301570
4 Irish 95 65152
5 Spanish (Mexican) 492 330130
Some children contributed longitudinal data.
df_data |>
group_by(child_id) |>
summarise(n_admins = n_distinct(data_id)) |>
group_by(n_admins) |>
summarise(n_children = n())# A tibble: 6 × 2
n_admins n_children
<int> <int>
1 1 319
2 2 521
3 3 17
4 4 96
5 6 58
6 8 5
The overall strategy:
Here we fit a 2PL IRT model, specified as value ~ exp(logalpha) * (theta + beta), with random effects of data_id for theta (person ability), and uid for beta (item difficulty) and logalpha (log item discrimination).
We used variational Bayes (using the mean field algorithm) because it is much faster than MCMC. Prior work (Wu et al., 2020) has shown that variational inference can yield comparable results to MCMC for Wordbank specifically, so we considered it a reasonable approximation. This model takes ~14min to fit for the current data.
stage1_formula <- bf(
value ~ exp(logalpha) * (theta + beta),
theta ~ 1 + (1 | data_id),
beta ~ 1 + (1 |i| uid),
logalpha ~ 1 + (1 |i| uid),
nl = TRUE
)
stage1_priors <- c(
prior(normal(0, 1), class = "b", nlpar = "theta"),
prior(normal(0, 1), class = "b", nlpar = "beta"),
prior(normal(0, 1), class = "b", nlpar = "logalpha"),
prior(normal(0, 1), class = "sd", nlpar = "theta"),
prior(normal(0, 1), class = "sd", nlpar = "beta"),
prior(normal(0, 1), class = "sd", nlpar = "logalpha")
)
stage1_model <- brm(
formula = stage1_formula,
data = df_data,
family = c("bernoulli", "logit"),
prior = stage1_priors,
# chains = 4,
# iter = 2000,
# cores = 4,
algorithm = "meanfield",
iter = 10000,
tol_rel_obj = 1e-4,
backend = "cmdstanr",
threads = threading(2),
seed = 123
)
# using meanfield algo (variational) took 14min
saveRDS(stage1_model, "models/bilingual_hirt_stage1_model.rds")stage1_model <- readRDS("models/bilingual_hirt_stage1_model.rds")summary(stage1_model) Family: bernoulli
Links: mu = logit
Formula: value ~ exp(logalpha) * (theta + beta)
theta ~ 1 + (1 | data_id)
beta ~ 1 + (1 | i | uid)
logalpha ~ 1 + (1 | i | uid)
Data: df_data (Number of observations: 1466710)
Draws: 1 chains, each with iter = 1000; warmup = 0; thin = 1;
total post-warmup draws = 1000
Multilevel Hyperparameters:
~data_id (Number of levels: 2184)
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sd(theta_Intercept) 1.46 0.00 1.45 1.47 1.00 955 948
~uid (Number of levels: 3336)
Estimate Est.Error l-95% CI u-95% CI
sd(beta_Intercept) 1.21 0.00 1.20 1.22
sd(logalpha_Intercept) 0.23 0.00 0.22 0.23
cor(beta_Intercept,logalpha_Intercept) -0.13 0.01 -0.16 -0.11
Rhat Bulk_ESS Tail_ESS
sd(beta_Intercept) 1.00 1024 874
sd(logalpha_Intercept) 1.00 981 845
cor(beta_Intercept,logalpha_Intercept) 1.00 610 794
Regression Coefficients:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
theta_Intercept -0.09 0.00 -0.10 -0.09 1.00 916 710
beta_Intercept -0.43 0.00 -0.44 -0.42 1.00 1083 941
logalpha_Intercept 0.21 0.00 0.20 0.22 1.00 905 915
Draws were sampled using variational(meanfield).
plot(stage1_model, ask = FALSE)# pp_check(stage1_model, ndraws = 100)stage1_formula_rasch <- bf(
value ~ theta + beta,
theta ~ 1 + (1 | data_id),
beta ~ 1 + (1 |i| uid),
nl = TRUE
)
stage1_priors_rasch <- c(
prior(normal(0, 1), class = "b", nlpar = "theta"),
prior(normal(0, 1), class = "b", nlpar = "beta"),
prior(normal(0, 1), class = "sd", nlpar = "theta"),
prior(normal(0, 1), class = "sd", nlpar = "beta")
)
stage1_model_rasch <- brm(
formula = stage1_formula_rasch,
data = df_data,
family = c("bernoulli", "logit"),
prior = stage1_priors_rasch,
# chains = 4,
# iter = 2000,
# cores = 4,
algorithm = "meanfield",
iter = 10000,
tol_rel_obj = 1e-4,
backend = "cmdstanr",
threads = threading(2),
seed = 123
)
saveRDS(stage1_model_rasch, "models/bilingual_hirt_stage1_model_rasch.rds")stage1_model_rasch <- readRDS("models/bilingual_hirt_stage1_model_rasch.rds")Failed to calculate LOO/WAIC—the session aborts, likely due to OOM errors.
Now we extract posterior estimates, and regress theta on age*exposure, and beta/logalpha on item language and unilemma. The mi() syntax is used to account for measurement error in the estimates. (These models are relatively fast and are fit on-the-fly during knitting.)
Note that here we fit a linear exposure term; we should be able to switch this out for other nonlinear expressions to test our hypotheses.
theta_ranef <- ranef(stage1_model, summary = TRUE)$data_id[, , "theta_Intercept"]
theta_fixed <- fixef(stage1_model, summary = TRUE, nlpar = "theta")[1, "Estimate"]
theta_estimates <- tibble(
data_id = rownames(theta_ranef),
theta_est = theta_fixed + theta_ranef[, "Estimate"],
theta_se = theta_ranef[, "Est.Error"]
)
beta_ranef <- ranef(stage1_model, summary = TRUE)$uid[, , "beta_Intercept"]
beta_fixed <- fixef(stage1_model, summary = TRUE, nlpar = "beta")[1, "Estimate"]
beta_estimates <- tibble(
uid = rownames(beta_ranef),
beta_est = beta_fixed + beta_ranef[, "Estimate"],
beta_se = beta_ranef[, "Est.Error"]
)
logalpha_ranef <- ranef(stage1_model, summary = TRUE)$uid[, , "logalpha_Intercept"]
logalpha_fixed <- fixef(stage1_model, summary = TRUE, nlpar = "logalpha")[1, "Estimate"]
logalpha_estimates <- tibble(
uid = rownames(logalpha_ranef),
logalpha_est = logalpha_fixed + logalpha_ranef[, "Estimate"],
logalpha_se = logalpha_ranef[, "Est.Error"]
)child_data <- df_data |>
select(data_id, child_id, age_sc, exp_sc, language, age, exposure_proportion) |>
distinct() |>
mutate(data_id = as.character(data_id),
child_id = as.character(child_id)) |>
left_join(theta_estimates, by = join_by(data_id))
# Item-level data with covariates
item_data <- df_data |>
select(uid, unilemma, language) |>
distinct() |>
left_join(beta_estimates, by = join_by(uid)) |>
left_join(logalpha_estimates, by = join_by(uid))Theta model:
summary(stage2_theta_model) Family: gaussian
Links: mu = identity; sigma = identity
Formula: theta_est | mi(theta_se) ~ age_sc * exp_sc + (1 | child_id)
Data: child_data (Number of observations: 2184)
Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
total post-warmup draws = 4000
Multilevel Hyperparameters:
~child_id (Number of levels: 1016)
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sd(Intercept) 1.00 0.04 0.91 1.08 1.00 1137 1630
Regression Coefficients:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
Intercept -0.87 0.04 -0.95 -0.79 1.00 2741 2446
age_sc 1.01 0.03 0.95 1.08 1.00 3796 3018
exp_sc 0.78 0.03 0.72 0.84 1.00 5249 3189
age_sc:exp_sc 0.08 0.03 0.02 0.14 1.00 5672 3025
Further Distributional Parameters:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sigma 1.18 0.02 1.13 1.23 1.00 1784 2133
Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).
Beta model:
summary(stage2_beta_model)Warning: There were 13 divergent transitions after warmup. Increasing
adapt_delta above 0.8 may help. See
http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
Family: gaussian
Links: mu = identity; sigma = identity
Formula: beta_est | mi(beta_se) ~ (1 | unilemma) + (1 | language)
Data: item_data (Number of observations: 3336)
Draws: 4 chains, each with iter = 4000; warmup = 2000; thin = 1;
total post-warmup draws = 8000
Multilevel Hyperparameters:
~language (Number of levels: 5)
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sd(Intercept) 0.38 0.19 0.17 0.86 1.00 2873 3396
~unilemma (Number of levels: 993)
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sd(Intercept) 1.09 0.03 1.03 1.15 1.00 2265 4459
Regression Coefficients:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
Intercept -0.45 0.18 -0.80 -0.04 1.00 2513 3097
Further Distributional Parameters:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sigma 0.84 0.01 0.81 0.86 1.00 5955 5453
Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).
beta_unilemma_var <- VarCorr(stage2_beta_model)$unilemma$sd[1, "Estimate"]^2
beta_language_var <- VarCorr(stage2_beta_model)$language$sd[1, "Estimate"]^2
beta_uid_var <- VarCorr(stage2_beta_model)$uid$sd[1, "Estimate"]^2
beta_residual_var <- VarCorr(stage2_beta_model)$residual$sd[1, "Estimate"]^2
beta_total_var <- beta_unilemma_var + beta_language_var + beta_uid_var + beta_residual_varLogalpha model:
summary(stage2_logalpha_model)Warning: There were 39 divergent transitions after warmup. Increasing
adapt_delta above 0.8 may help. See
http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
Family: gaussian
Links: mu = identity; sigma = identity
Formula: logalpha_est | mi(logalpha_se) ~ (1 | unilemma) + (1 | language)
Data: item_data (Number of observations: 3336)
Draws: 4 chains, each with iter = 4000; warmup = 2000; thin = 1;
total post-warmup draws = 8000
Multilevel Hyperparameters:
~language (Number of levels: 5)
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sd(Intercept) 0.03 0.02 0.01 0.08 1.00 1365 1068
~unilemma (Number of levels: 993)
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sd(Intercept) 0.17 0.01 0.16 0.18 1.00 1462 2947
Regression Coefficients:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
Intercept -0.10 0.02 -0.14 -0.07 1.01 1252 852
Further Distributional Parameters:
Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
sigma 0.12 0.00 0.11 0.13 1.00 1225 2263
Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).
logalpha_unilemma_var <- VarCorr(stage2_logalpha_model)$unilemma$sd[1, "Estimate"]^2
logalpha_language_var <- VarCorr(stage2_logalpha_model)$language$sd[1, "Estimate"]^2
logalpha_uid_var <- VarCorr(stage2_logalpha_model)$uid$sd[1, "Estimate"]^2
logalpha_residual_var <- VarCorr(stage2_logalpha_model)$residual$sd[1, "Estimate"]^2
logalpha_total_var <- logalpha_unilemma_var + logalpha_language_var + logalpha_uid_var + logalpha_residual_varPlots for sanity checking
p_age <- child_data |>
ggplot(aes(x = age, y = theta_est)) +
geom_point(aes(size = 1/theta_se), alpha = 0.1) +
geom_smooth(method = "lm", formula = y ~ x) +
labs(x = "Age (mo)", y = "Theta estimate",
size = "Precision")
p_agep_exp <- child_data |>
ggplot(aes(x = exposure_proportion, y = theta_est)) +
geom_point(aes(size = 1/theta_se), alpha = 0.1) +
geom_smooth(method = "lm", formula = y ~ x) +
labs(x = "Language exposure (%)", y = "Theta estimate",
size = "Precision")
p_expp_theta_lang <- child_data |>
ggplot(aes(x = theta_est)) +
geom_density(aes(fill = language), alpha = 0.4) +
labs(x = "Theta estimate", fill = "Language",
size = "Precision")
p_theta_langp_diff_lang <- item_data |>
ggplot(aes(x = beta_est)) +
geom_density(aes(fill = language), alpha = 0.4) +
labs(x = "Beta estimate", fill = "Language",
size = "Precision")
p_diff_langp_disc_lang <- item_data |>
ggplot(aes(x = exp(logalpha_est))) +
geom_density(aes(fill = language), alpha = 0.4) +
labs(x = "Alpha estimate (exp(logalpha))", fill = "Language",
size = "Precision")
p_disc_langWe evaluate the fit of the IRT model using Bayesian analogues of in- and outfit. (Code for posterior predictive checks is also available but it takes a long time to make draws.)
pp_check(stage1_model, ndraws = 10)# extract parameter estimates (posterior means only)
theta_ranef <- ranef(stage1_model, summary = TRUE)$data_id[, , "theta_Intercept"]
beta_ranef <- ranef(stage1_model, summary = TRUE)$uid[, , "beta_Intercept"]
logalpha_ranef <- ranef(stage1_model, summary = TRUE)$uid[, , "logalpha_Intercept"]
theta_fixed <- fixef(stage1_model, summary = TRUE, nlpar = "theta")[1, "Estimate"]
beta_fixed <- fixef(stage1_model, summary = TRUE, nlpar = "beta")[1, "Estimate"]
logalpha_fixed <- fixef(stage1_model, summary = TRUE, nlpar = "logalpha")[1, "Estimate"]
data <- stage1_model$data |>
mutate(
theta = theta_fixed + theta_ranef[as.character(data_id), "Estimate"],
beta = beta_fixed + beta_ranef[as.character(uid), "Estimate"],
alpha = exp(logalpha_fixed + logalpha_ranef[as.character(uid), "Estimate"])
)
# calculate predicted probabilities
data <- data |>
mutate(
p_pred = plogis(alpha * (theta + beta)),
variance = p_pred * (1 - p_pred),
residual = value - p_pred,
z_residual = residual / sqrt(variance),
z_residual_sq = z_residual^2
)
item_fit <- data |>
group_by(uid) |>
summarise(
n_obs = n(),
# outfit: unweighted mean square
outfit_msq = mean(z_residual_sq, na.rm = TRUE),
outfit_zscore = (mean(z_residual_sq, na.rm = TRUE) - 1) / sqrt(2 / n()),
# infit: variance-weighted mean square
infit_msq = sum(z_residual_sq * variance, na.rm = TRUE) /
sum(variance, na.rm = TRUE),
infit_zscore = (sum(z_residual_sq * variance, na.rm = TRUE) /
sum(variance, na.rm = TRUE) - 1) /
sqrt(sum(variance^2, na.rm = TRUE) / sum(variance, na.rm = TRUE)^2),
.groups = "drop"
)
person_fit <- data |>
group_by(data_id) |>
summarise(
n_obs = n(),
outfit_msq = mean(z_residual_sq, na.rm = TRUE),
outfit_zscore = (mean(z_residual_sq, na.rm = TRUE) - 1) / sqrt(2 / n()),
infit_msq = sum(z_residual_sq * variance, na.rm = TRUE) /
sum(variance, na.rm = TRUE),
infit_zscore = (sum(z_residual_sq * variance, na.rm = TRUE) /
sum(variance, na.rm = TRUE) - 1) /
sqrt(sum(variance^2, na.rm = TRUE) / sum(variance, na.rm = TRUE)^2),
.groups = "drop"
)p_item_infit <- item_fit |>
ggplot(aes(x = infit_msq)) +
geom_vline(xintercept = 1, col = "gray32") +
geom_vline(xintercept = 0.7, col = "gray32", lty = "dashed") +
geom_vline(xintercept = 1.3, col = "gray32", lty = "dashed") +
geom_density() +
labs(x = "Item infit mean square", y = "Density")
p_item_infitp_item_outfit <- item_fit |>
ggplot(aes(x = outfit_msq)) +
geom_vline(xintercept = 1, col = "gray32") +
geom_vline(xintercept = 0.7, col = "gray32", lty = "dashed") +
geom_vline(xintercept = 1.3, col = "gray32", lty = "dashed") +
geom_density() +
labs(x = "Item outfit mean square", y = "Density")
p_item_outfit