LEVANTE Vocab

Author

Fionnuala O’Reilly

Required packages

Code
#| echo: false

load_or_install <- function(pkg) {
  # Attach pkg if available; otherwise install it from CRAN, then attach
  if (!require(pkg, character.only = TRUE)) {
    install.packages(pkg)
    library(pkg, character.only = TRUE)
  }
}

load_or_install("tidyverse")
Loading required package: tidyverse
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Code
load_or_install("glue")
Loading required package: glue
Code
load_or_install("here")
Loading required package: here
here() starts at /Users/foreilly/Documents/levante-pilots
Code
load_or_install("viridis")
Loading required package: viridis
Loading required package: viridisLite
Code
load_or_install("purrr")
load_or_install("readr")
load_or_install("dplyr")
load_or_install("tidyr")
load_or_install("quarto")
Loading required package: quarto
Code
load_or_install("forcats")
load_or_install("ggplot2")
load_or_install("ggrepel")
Loading required package: ggrepel
Code
load_or_install("broom")
Loading required package: broom

Helper functions
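The task loader used below, load_task_data(), is defined in a hidden chunk. As a rough sketch of what it is assumed to do (not the actual implementation): it reads trial-level data for one task and joins run-level metadata such as participant age, which is what produces the join message in the next chunk.

Code
# Hypothetical sketch of the hidden helper -- NOT the actual implementation.
# The file paths and object names here are assumptions for illustration only.
load_task_data_sketch <- function(task) {
  trials <- readRDS(here("01_fetched_data", paste0(task, "_trials.rds")))  # assumed path
  runs   <- readRDS(here("01_fetched_data", "run_metadata.rds"))           # assumed path
  trials |>
    left_join(runs)  # joins by site and run_id, matching the message below
}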

Get vocab data

Code
vocab <- load_task_data("vocab")
Joining with `by = join_by(site, run_id)`
Code
vocab
# A tibble: 43,762 × 15
   site     task_id dataset        user_id run_id trial_id trial_number item_uid
   <chr>    <chr>   <chr>          <chr>   <chr>  <chr>           <int> <chr>   
 1 ca_pilot vocab   ca_western_pi… G38GNy… WJIaT… Wd6JBua…          127 vocab__…
 2 ca_pilot vocab   ca_western_pi… flAV0x… vcU18… 0o2wktk…          127 vocab__…
 3 ca_pilot vocab   ca_western_pi… HTGMsm… cZyyu… vAj95cp…          127 vocab__…
 4 ca_pilot vocab   ca_western_pi… s1woiH… B2nN3… 1rFYxSI…          127 vocab__…
 5 ca_pilot vocab   ca_western_pi… LQmMTj… DqXYo… eWaTDb6…          127 vocab__…
 6 ca_pilot vocab   ca_western_pi… NvrKIV… qmlxg… OBotUVi…          127 vocab__…
 7 ca_pilot vocab   ca_western_pi… Udjgwj… mHDym… W0eGRQj…          127 vocab__…
 8 ca_pilot vocab   ca_western_pi… nHKQ5o… Ft0Bg… fOPjXPU…          127 vocab__…
 9 ca_pilot vocab   ca_western_pi… LQmMTj… DqXYo… OjAeT8l…           87 vocab__…
10 ca_pilot vocab   ca_western_pi… NxYnKU… LhAmP… wNMDm1Q…           87 vocab__…
# ℹ 43,752 more rows
# ℹ 7 more variables: item_group <chr>, item <chr>, correct <lgl>, rt <chr>,
#   rt_numeric <dbl>, timestamp <dttm>, age <dbl>

Checking n’s

Code
vocab |> 
  distinct(site, user_id) |> 
  count(site)
# A tibble: 3 × 2
  site         n
  <chr>    <int>
1 ca_pilot    20
2 co_pilot   171
3 de_pilot   196
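Some participants may have more than one run; a quick check of runs per participant (a sketch using the vocab data loaded above):

Code
# Distribution of runs per participant, by site (flags repeat testers)
vocab |>
  distinct(site, user_id, run_id) |>
  count(site, user_id, name = "n_runs") |>
  count(site, n_runs)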

Get run-level accuracy

Code
vocab_runs <- vocab |>
  group_by(site, user_id, run_id) |>
  summarise(
    correct = mean(correct, na.rm = TRUE),  # proportion correct per run
    age = mean(age, na.rm = TRUE),
    n_items = n_distinct(item_uid),
    .groups = "drop"
  )
Code
ggplot(vocab_runs, aes(x = age, y = correct)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "gam") +
  theme_minimal(base_family = "sans") +
  ylim(0, 1) +
  facet_wrap(~site)
`geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'
Warning: Removed 15 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 15 rows containing missing values or values outside the scale range
(`geom_point()`).

Item completion patterns across countries

Code
ggplot(vocab_runs, aes(x = n_items)) +
  geom_histogram(binwidth = 1) +
  facet_wrap(~site) + 
  theme_minimal()

Examining least and most difficult items

Code
# Calculate item-level accuracy
item_accuracy <- vocab |>
  group_by(item_uid) |>
  summarise(proportion_correct = mean(correct, na.rm = TRUE)) |>
  arrange(proportion_correct) |>
  mutate(item_rank = row_number())
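Raw per-item proportions can be noisy when an item has few trials. A sketch of attaching 95% Wilson score intervals to the same per-item estimates (column names follow the code above):

Code
# Per-item accuracy with 95% Wilson score intervals
z <- qnorm(0.975)
item_accuracy_ci <- vocab |>
  group_by(item_uid) |>
  summarise(
    n = sum(!is.na(correct)),
    p = mean(correct, na.rm = TRUE),
    .groups = "drop"
  ) |>
  mutate(
    center   = (p + z^2 / (2 * n)) / (1 + z^2 / n),
    half     = z * sqrt(p * (1 - p) / n + z^2 / (4 * n^2)) / (1 + z^2 / n),
    ci_lower = pmax(0, center - half),
    ci_upper = pmin(1, center + half)
  )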

Plot

Code
# Plot all items
ggplot(item_accuracy, aes(x = reorder(item_uid, proportion_correct), y = proportion_correct)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Proportion Correct by Vocabulary Item (All Items)",
    x = "Vocabulary Item",
    y = "Proportion Correct"
  ) +
  theme_minimal()

Note that this ranking does not account for item difficulty as estimated in the IRT models below.

Subset of items

Code
# Filter every 5th item
sampled_items <- item_accuracy |>
  filter(item_rank %% 5 == 0)

# Plot sampled items
ggplot(sampled_items, aes(x = reorder(item_uid, proportion_correct), y = proportion_correct)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Proportion Correct (Sampled Vocabulary Items)",
    x = "Vocabulary Item (every 5th)",
    y = "Proportion Correct"
  ) +
  theme_minimal()

By age

Code
# Create age brackets
vocab_binned <- vocab |>
  mutate(age_bracket = cut(age, c(4, 7, 9, 13), include.lowest = TRUE)) |>
  filter(!is.na(age_bracket))

summary(vocab$age)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  5.153   7.518   9.114   9.243  11.179  13.413    1790 
Code
table(is.na(vocab$age))

FALSE  TRUE 
41972  1790 
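To confirm that each bracket has reasonable coverage, a quick tabulation of the binned data (using vocab_binned from above):

Code
# Trials and participants per age bracket
vocab_binned |>
  group_by(age_bracket) |>
  summarise(
    n_trials = n(),
    n_participants = n_distinct(user_id),
    .groups = "drop"
  )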
Code
# Accuracy by age
item_accuracy_by_bracket <- vocab_binned |>
  group_by(age_bracket, item_uid) |>
  summarise(
    proportion_correct = mean(correct, na.rm = TRUE),
    n = n(),
    .groups = "drop"
  )

# Top 10 easiest and 10 hardest per age bin 
easiest_hardest_by_bracket <- item_accuracy_by_bracket |>
  group_by(age_bracket) |>
  arrange(proportion_correct) |>
  mutate(rank = row_number()) |>
  filter(rank <= 10 | rank > (n() - 10)) |>
  distinct(age_bracket, item_uid, .keep_all = TRUE)

# Plot (without sample size labels)
ggplot(easiest_hardest_by_bracket, aes(x = reorder(item_uid, proportion_correct), y = proportion_correct, fill = age_bracket)) +
  geom_col(show.legend = FALSE) +
  scale_fill_viridis_d() +
  coord_flip() +
  facet_wrap(~ age_bracket, scales = "free_y") +
  labs(
    title = "Easiest and Hardest Vocabulary Items by Age Bracket",
    x = "Vocabulary Item",
    y = "Proportion Correct"
  ) +
  ylim(0, 1.1) +
  theme_minimal()

By country

Code
# Accuracy per item by site
item_accuracy_by_site <- vocab |>
  filter(site %in% c("ca_pilot", "co_pilot", "de_pilot")) |>
  group_by(site, item_uid) |>
  summarise(
    proportion_correct = mean(correct, na.rm = TRUE),
    n = n(),  # sample size
    .groups = "drop"
  )

# Top 10 easiest and hardest items per site
easiest_hardest_by_site <- item_accuracy_by_site |>
  group_by(site) |>
  arrange(proportion_correct) |>
  mutate(rank = row_number()) |>
  filter(rank <= 10 | rank > (n() - 10)) |>
  ungroup()

Plot

Code
# Plot 
ggplot(easiest_hardest_by_site, aes(x = reorder(item_uid, proportion_correct), y = proportion_correct, fill = site)) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(label = glue::glue("{n}")),
            hjust = -0.1, size = 1.5) +
  coord_flip() +
  scale_fill_viridis_d() +
  facet_wrap(~ site, scales = "free_y") +
  labs(
    title = "Easiest and Hardest Vocabulary Items by Country",
    x = "Vocabulary Item",
    y = "Proportion Correct"
  ) +
  ylim(0, 1.05) +
  theme_minimal()

Examining item accuracy (proportion correct) across sites, adjusting for age

Code
# Safe GLM wrapper to skip items that crash (e.g. single-site items)
safe_glm <- purrr::possibly(
  function(df) glm(correct ~ site + age, data = df, family = binomial()),
  otherwise = NULL
)

# Fit per-item logistic models and extract site effects
item_models <- vocab %>%
  filter(site %in% c("ca_pilot", "co_pilot", "de_pilot")) %>%
  group_by(item_uid) %>%
  nest() %>%
  mutate(
    model = map(data, safe_glm),
    tidied = map(model, ~ if (!is.null(.x)) tidy(.x) else tibble())
  ) %>%
  unnest(tidied)

# Extract site effects (excluding intercept and age)
site_effects <- item_models %>%
  filter(term %in% c("siteco_pilot", "sitede_pilot")) %>%
  mutate(abs_estimate = abs(estimate))

# Get top 20 items with largest site differences (adjusted for age)
top_items_adj <- site_effects %>%
  group_by(item_uid) %>%
  summarise(site_diff = max(abs_estimate), .groups = "drop") %>%
  arrange(desc(site_diff)) %>%
  slice_head(n = 20)

# Bring in raw accuracy data for plotting
top_items_accuracy <- item_accuracy_by_site %>%
  filter(item_uid %in% top_items_adj$item_uid)

# Plot raw accuracy for these items (to visualize effect scale)
ggplot(top_items_accuracy, aes(x = proportion_correct, y = reorder(item_uid, proportion_correct), color = site)) +
  geom_point(size = 3) +
  geom_line(aes(group = item_uid), color = "gray70", linewidth = 0.6) +
  scale_color_viridis_d() +
  labs(
    title = "Top 20 vocabulary items with cross-site accuracy gaps (age-adjusted)",
    x = "Proportion Correct",
    y = "Vocabulary Item"
  ) +
  theme_minimal(base_size = 10) +
  theme(legend.position = "bottom")
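The site effects above are on the log-odds scale, with ca_pilot as the reference level (the first site level, as the term names siteco_pilot and sitede_pilot show). Exponentiating converts them to odds ratios, which are easier to read; a sketch:

Code
# Convert log-odds site effects to odds ratios relative to ca_pilot
site_effects %>%
  mutate(
    odds_ratio = exp(estimate),                    # > 1: higher odds of correct than ca_pilot
    or_lower   = exp(estimate - 1.96 * std.error),
    or_upper   = exp(estimate + 1.96 * std.error)
  ) %>%
  select(item_uid, term, odds_ratio, or_lower, or_upper, abs_estimate) %>%
  arrange(desc(abs_estimate))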

Same graph, re-ordered by the size of the difference between sites

Code
# Vector of items ordered by difficulty gap across sites
item_order <- top_items_adj %>%
  arrange(desc(site_diff)) %>%
  pull(item_uid)

# Reorder factor levels in accuracy data
top_items_accuracy$item_uid <- factor(top_items_accuracy$item_uid, levels = item_order)

# Plot again
ggplot(top_items_accuracy, aes(x = proportion_correct, y = item_uid, color = site)) +
  geom_point(size = 3) +
  geom_line(aes(group = item_uid), color = "gray70", linewidth = 0.6) +
  scale_color_viridis_d() +
  labs(
    title = "Top 20 vocabulary items with site differences (age-adjusted)",
    subtitle = "Ranked by age-adjusted site effects from logistic regression; plotted using raw accuracy",
    x = "Proportion Correct",
    y = "Vocabulary Item"
  ) +
  theme_minimal(base_size = 10) +
  theme(legend.position = "bottom")

The items near the top of the plot had the largest model-estimated site effects after adjusting for age; these are not necessarily the items with the largest raw proportion gaps.
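One way to see how the two orderings relate is to compare each item's raw cross-site accuracy gap with its model-estimated site effect (a sketch using objects defined above):

Code
# Raw accuracy gap vs. age-adjusted site effect, per item
raw_vs_adjusted <- item_accuracy_by_site %>%
  group_by(item_uid) %>%
  summarise(raw_gap = max(proportion_correct) - min(proportion_correct),
            .groups = "drop") %>%
  inner_join(top_items_adj, by = "item_uid")

# Correlation between the raw and model-based orderings
cor(raw_vs_adjusted$raw_gap, raw_vs_adjusted$site_diff)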

Load IRT results

Code
# Load multigroup IRT model outputs
best_multigroup <- readRDS(here("02_scored_data", "irt_outputs", "multigroup_best_outputs.rds"))
multigroup_scores <- readRDS(here("02_scoring_outputs", "scores", "scores_multigroup.rds"))

Multigroup models

Code
run_ages <- vocab |>
  select(site, run_id, user_id, age) |>
  distinct()

multigroup_scores_vocab <- multigroup_scores |>
  filter(task_id == "vocab") |>
  select(site, task_id, user_id, run_id, metric_type, metric_value) |>
  left_join(run_ages, by = join_by(site, user_id, run_id))
Code
ggplot(multigroup_scores_vocab, aes(x = age, y = metric_value, col = site)) + 
  geom_point() + 
  theme_minimal(base_family = "sans") +
  geom_smooth(method = "gam")
`geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'
Warning: Removed 15 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 15 rows containing missing values or values outside the scale range
(`geom_point()`).

Extract the coefficients

Code
coefs_vocab_list <- best_multigroup |>
  filter(task_id == "vocab") |>
  pull(coefs)

coefs_vocab <- coefs_vocab_list[[1]]  # extract the coefficient tibble

# Item difficulty by site (flip d so that higher values correspond to harder items)
vocab_difficulty_by_site <- coefs_vocab |>
  select(site, item, d) |>
  mutate(d_flipped = -d)

glimpse(coefs_vocab)
Rows: 492
Columns: 6
$ site <chr> "ca_pilot", "ca_pilot", "ca_pilot", "ca_pilot", "ca_pilot", "ca_p…
$ item <chr> "vocab__acorn_1", "vocab__aloe_1", "vocab__antenna_1", "vocab__ar…
$ a1   <dbl> 1.4637683, 0.6520752, 1.5212775, 0.7595099, 2.0618756, 1.7595194,…
$ d    <dbl> 2.115200957, -0.206423135, 1.781436109, 0.686630980, 2.710615580,…
$ g    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ u    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
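These appear to be mirt-style slope-intercept parameters: with g = 0 and u = 1, the 2PL model is P(correct) = logistic(a1 * theta + d), so d is an easiness intercept and the classical difficulty is b = -d / a1. A sketch of the conversion:

Code
# Convert slope-intercept parameters (a1, d) to classical 2PL difficulty b = -d / a1
coefs_vocab_b <- coefs_vocab |>
  mutate(b = -d / a1)   # higher b = harder item, on the theta scale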

Difficulty across sites

Code
# IRT-derived difficulty by site
ggplot(vocab_difficulty_by_site, aes(x = d_flipped, y = reorder(item, d), fill = site)) +
  geom_col(position = "dodge") +
  facet_wrap(~ site, scales = "free_y") +
  labs(
    title = "IRT-Derived Difficulty of Vocabulary Items by Country",
    x = "Item Difficulty (-d; higher = harder)",
    y = "Vocabulary Item"
  ) +
  theme_minimal(base_size = 10)

The 20 vocabulary items with the greatest cross-site variability in IRT-derived difficulty (not discrimination)

Note: this figure shows the 20 vocabulary items with the largest apparent variation in difficulty across sites, based on IRT-derived item parameters. Because the multigroup model used here constrains item parameters to be equal across groups, these differences should be interpreted with caution; they reflect structural model outputs rather than true cross-site differences.

Code

# Compute the range of difficulty for each item
top_diff_items <- coefs_vocab %>%
  group_by(item) %>%
  summarise(diff_range = max(d) - min(d), .groups = "drop") %>%
  arrange(desc(diff_range)) %>%
  slice_head(n = 20)

# Coefficients for these items
coefs_top_items <- coefs_vocab %>%
  filter(item %in% top_diff_items$item)

Plot

Code
# Clean item labels for plotting (stringr is attached as part of the tidyverse)
coefs_top_items <- coefs_top_items %>%
  mutate(
    item_clean = item |>
      str_remove("^vocab__") |>         # remove the 'vocab__' prefix
      str_remove("_[0-9]+$") |>         # remove trailing '_1', '_2', etc.
      str_replace_all("_", " ")         # replace remaining underscores with spaces
  )

# Order items so those with the greatest cross-site difference appear first
item_order <- coefs_top_items %>%
  group_by(item_clean) %>%
  summarise(var_d = max(d) - min(d)) %>%
  arrange(desc(var_d)) %>%
  pull(item_clean)

coefs_top_items$item_clean <- factor(coefs_top_items$item_clean, levels = item_order)

# Plot
ggplot(coefs_top_items, aes(x = -d, y = a1, color = site, label = item_clean)) +
  geom_point(size = 3, alpha = 0.8) +
  geom_text_repel(size = 3, max.overlaps = 20) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "gray70") +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray70") +
  labs(
    title = "Vocabulary Item Difficulty vs. Discrimination (Top 20 Most Variable Items)",
    x = "Item Difficulty (-d)",
    y = "Item Discrimination (a1)"
  ) +
  scale_color_viridis_d() +
  theme_minimal(base_size = 12) +
  theme(legend.position = "bottom")