Let's take a step back from implementing bespoke Stan code for the full Standard Model, with real, unscaled units for input rate, word frequency, and difficulty. That approach ran into convergence issues in our preferred model, which suggests identifiability problems (see also the discussion in Bürkner, 2017: unless the scale and location of at least one set of parameters, either ability (our rate) or difficulty, are fixed, the model is likely unidentifiable). It seems sensible to first run some basic IRT models in brms, then add the hyperpriors and the person and item covariates of greatest interest for the Standard Model. To start, we fit the Rasch and 2PL models, and then add age, CHILDES word frequency, and lexical category to the preferred of the two. We examine which of these models converge and which covariates are justified by leave-one-out cross-validation (LOO-CV). For now, we fit a sample of 50 words and 200 subjects, which takes only a few minutes per model.

library(brms)       # bf(), brm(), set_prior(), loo()
library(tidyverse)  # dplyr/tidyr wrangling and ggplot2
library(tidybayes)  # spread_draws(), mean_qi(), geom_pointinterval()
library(gridExtra)  # grid.arrange()
library(GGally)     # ggpairs()
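
For concreteness, here is a minimal sketch of how such a subsample could be drawn. This is hypothetical: it assumes a full long-format data frame d_full with one row per child-word pair and the columns used below (person, item, produces, age, lexical_class, prob).

set.seed(1)  # arbitrary seed, for reproducibility only
words_samp   <- sample(unique(d_full$item), 50)
persons_samp <- sample(unique(d_full$person), 200)
d_sm <- d_full %>%
  filter(item %in% words_samp, person %in% persons_samp)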

Rasch model

# partial pooling for both item and person 
# (standard IRT would not pool item pars: 0 + item + (1 | person))
formula_1pl <- bf(produces ~ 1 + (1 | item) + (1 | person))
family = brmsfamily(family = "bernoulli", link = "logit")

# change priors on the hyperparameters defining the
# covariance matrix of the person or item parameters 
# that is on the SDs and correlation matrices
prior_1pl = set_prior("cauchy(0, 5)", class = "sd", group = "person") +
        set_prior("cauchy(0, 5)", class = "sd", group = "item")
        #set_prior("lkj(2)", class = "cor", group = "person")

fit_1pl <- brm(
  formula = formula_1pl,
  data = d_sm,
  family = family,
  prior = prior_1pl, 
  iter = N_ITER, warmup = N_WARMUP, cores = 4, chains = 4,
  file = "1pl.rds"
)

#summary(fit_1pl)
plot(fit_1pl)

# coef vs ranef: do we want to include overall effects (i.e., the global intercept for the present model) in the computation of the individual coefficients or not?
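# a minimal illustration (commented out; assumes fit_1pl as above):
# ranef() gives deviations from the global intercept, while coef() adds
# the intercept back in, so the two differ by b_Intercept:
# head(ranef(fit_1pl)$item[, , "Intercept"])
# head(coef(fit_1pl)$item[, , "Intercept"])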

#get_variables(fit_1pl)

fit_1pl %>% spread_draws(b_Intercept) %>% mean_qi(.width=.9)
##   b_Intercept    .lower     .upper .width .point .interval
## 1   -1.179453 -1.806336 -0.5706104    0.9   mean        qi
m1_item <- fit_1pl %>% spread_draws(r_item[item,term]) %>%
  mean_qi(.width=.9)
m1_person <- fit_1pl %>% spread_draws(r_person[person,term]) %>%
  mean_qi(.width=.9)
#ranef(fit_1pl)$item

p1 <- m1_person %>% ggplot(aes(x=r_person, y=reorder(person, -r_person), xmin=.lower, xmax=.upper)) + 
   geom_pointinterval() + ylab("Child") + xlab("Ability")

p2 <- m1_item %>% ggplot(aes(x= r_item, y=reorder(item, -r_item), xmin=.lower, xmax=.upper)) + 
   geom_pointinterval() + ylab("Item") + xlab("Item Easiness")

grid.arrange(p1, p2, ncol=2)

#library(ggmcmc)
#m1 <- ggs(fit_1pl)
# label=words_samp_sm
#ggs_caterpillar(m1, family = "item") # + = easier
#ggs_caterpillar(m1, family = "person")

2PL model

We also fit a discrimination parameter per item (α_i), assumed to be > 0. Positivity is enforced by modeling discrimination on the log scale, so α_i = exp(logα_i), and the linear predictor becomes exp(logalpha) * eta, where eta is the sum of person ability and item easiness.

# eta = sum of person ability and item easiness
# logalpha = log discrimination
# item easiness and discrimination are correlated: |i|
formula_2pl <- bf(
  produces ~ exp(logalpha) * eta,
  eta ~ 1 + (1 |i| item) + (1 | person),
  logalpha ~ 1 + (1 |i| item),
  nl = TRUE
)

prior_2pl <- prior("normal(0, 5)", class = "b", nlpar = "eta") +
  prior("normal(0, 1)", class = "b", nlpar = "logalpha") +
  prior("constant(1)", class = "sd", group = "person", nlpar = "eta") +
  prior("normal(0, 3)", class = "sd", group = "item", nlpar = "eta") +
  prior("normal(0, 1)", class = "sd", group = "item", nlpar = "logalpha")

fit_2pl <- brm(
  formula = formula_2pl,
  data = d_sm,
  family = family,
  prior = prior_2pl,
  iter = N_ITER, warmup = N_WARMUP, cores = 4, chains = 4,
  file = "2pl.rds"
)

summary(fit_2pl)
##  Family: bernoulli 
##   Links: mu = logit 
## Formula: produces ~ exp(logalpha) * eta 
##          eta ~ 1 + (1 | i | item) + (1 | person)
##          logalpha ~ 1 + (1 | i | item)
##    Data: d_sm (Number of observations: 10000) 
## Samples: 4 chains, each with iter = 6000; warmup = 3000; thin = 1;
##          total post-warmup samples = 12000
## 
## Group-Level Effects: 
## ~item (Number of levels: 50) 
##                                       Estimate Est.Error l-95% CI u-95% CI Rhat
## sd(eta_Intercept)                         0.71      0.09     0.56     0.90 1.00
## sd(logalpha_Intercept)                    0.24      0.04     0.17     0.32 1.00
## cor(eta_Intercept,logalpha_Intercept)     0.37      0.18    -0.02     0.69 1.00
##                                       Bulk_ESS Tail_ESS
## sd(eta_Intercept)                         1670     2777
## sd(logalpha_Intercept)                    5372     7789
## cor(eta_Intercept,logalpha_Intercept)     6338     8623
## 
## ~person (Number of levels: 200) 
##                   Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(eta_Intercept)     1.00      0.00     1.00     1.00 1.00    12000    12000
## 
## Population-Level Effects: 
##                    Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## eta_Intercept         -0.39      0.13    -0.63    -0.15 1.01      654     1473
## logalpha_Intercept     1.14      0.07     1.01     1.28 1.00     1321     2436
## 
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
# Bulk ESS for the population-level eta intercept is on the low side (654 above)
#m2 <- ggs(fit_2pl)
#ggs_caterpillar(m2, family = "item") # + = easier
#ggs_caterpillar(m2, family = "person")

Compare 1PL and 2PL Model Fits

#pp_check(fit_1pl)
#pp_check(fit_2pl)
m1m2 <- loo(fit_1pl, fit_2pl) # smaller LOOIC is better
## Warning: Found 2 observations with a pareto_k > 0.7 in model 'fit_2pl'. It is
## recommended to set 'moment_match = TRUE' in order to perform moment matching for
## problematic observations.
# loo_compare(m1m2$loos)
# log_lik() can give pointwise log-lik estimate

Model fit was assessed via approximate leave-one-out cross-validation (LOO-CV), revealing a LOOIC difference of ΔLOOIC = 61.18 in favor of the 2PL model, a difference that is notable relative to its standard error (SE = 17.71). Nevertheless, for the following analyses we will continue to use the simpler 1PL model.
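
For reference, a minimal sketch of inspecting the ELPD comparison directly (m1m2 is the object returned by loo() above; elpd_diff and se_diff are on the ELPD scale, with LOOIC = -2 * elpd):

comp <- loo_compare(m1m2$loos)   # list of loo objects from loo(fit_1pl, fit_2pl)
print(comp, simplify = FALSE)    # elpd_diff, se_diff, and looic per model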

Age covariate

We now include child’s age as a covariate.

formula_1pl_age <- bf(produces ~ 1 + age + (1 | item) + (1 | person))
# do we need to scale and center age?
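# one option (hypothetical column name age_sc): standardizing age puts
# the prior and the intercept on a common scale, e.g.
# d_sm$age_sc <- as.numeric(scale(d_sm$age))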

fit_1pl_age <- brm(
  formula = formula_1pl_age,
  data = d_sm,
  family = family,
  prior = prior_1pl,
  iter = N_ITER, warmup = N_WARMUP, cores = 4, chains = 4,
  file = "1pl_age.rds"
)

#summary(fit_1pl_age)

age_par <- fit_1pl_age %>% spread_draws(`b_.*`, regex = TRUE) %>% mean_qi(.width=.9)


m1age <- loo(fit_1pl, fit_1pl_age)

The age coefficient has M = 0.49, with 90% credible interval [0.43, 0.54]. Model comparison reveals a LOOIC difference of ΔLOOIC = -4.49 against the model with age, although this value is quite small both on an absolute scale and relative to its standard error (SE = 5.43). Next we test the 1PL model with a lexical class item covariate.

Lexical category

#formula_1pl_cat <- bf(produces ~ 1 + lexical_class + (0 + lexical_class | item) + (1 | person))
formula_1pl_cat <- bf(produces ~ 1 + lexical_class + (1 | item) + (1 | person))
#d_sm$lexical_class =  as.factor(d_sm$lexical_class)

fit_1pl_cat <- brm(
  formula = formula_1pl_cat,
  data = d_sm,
  family = family,
  prior = prior_1pl,
  iter = N_ITER, warmup = N_WARMUP, cores = 4, chains = 4,
  file = "1pl_lexcat.rds"
)

# There were 306 transitions after warmup that exceeded the maximum treedepth. Increase max_treedepth above 10.
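# a possible remedy (hypothetical value; mirrors the control settings used
# for the word frequency + lexical class fit below):
# control = list(max_treedepth = 12)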

m1cat <- loo(fit_1pl, fit_1pl_cat)

#coef(fit_1pl_cat)$item

#get_variables(fit_1pl_cat)[1:20]

#lexcat_pars <- fit_1pl_cat %>% spread_draws(`b_.*`[category], regex = TRUE) %>% mean_qi(.width=.9)


m_lexcat_item <- fit_1pl_cat %>% spread_draws(r_item[item,term]) %>%
  mean_qi(.width=.9)
m_lexcat_person <- fit_1pl_cat %>% spread_draws(r_person[person,term]) %>%
  mean_qi(.width=.9)
#ranef(fit_1pl)$item

p1 <- m_lexcat_person %>% ggplot(aes(x=r_person, y=reorder(person, -r_person), xmin=.lower, xmax=.upper)) + 
   geom_pointinterval() + ylab("Child") + xlab("Ability")

p2 <- m_lexcat_item %>% ggplot(aes(x= r_item, y=reorder(item, -r_item), xmin=.lower, xmax=.upper)) + 
   geom_pointinterval() + ylab("Item") + xlab("Item Easiness")

#grid.arrange(p1, p2, ncol=2)

lexcat_pars <- fit_1pl_cat %>% spread_draws(`b_lexical_class.*`, regex=T) %>% 
  pivot_longer(cols = starts_with("b_lexical_class"), names_to="lexical_class", names_prefix= "b_lexical_class") %>% group_by(lexical_class) %>%
  tidyboot::tidyboot_mean(value)

lexcat_pars %>% ggplot(aes(x=reorder(lexical_class, -mean), y=mean)) +
  geom_point() + geom_linerange(aes(ymin=ci_lower, ymax=ci_upper)) +
  geom_hline(yintercept=0, linetype='dashed') + 
  ylab("Coefficient") + xlab("Lexical Class")

# transform logit parms back to probabilities and compare posterior
#conditional_effects(fit_1pl_cat, "lexical_class")
#hyp <- "lexical_classnouns - lexical_classverbs > 0"
#hypothesis(fit_1pl_cat, hyp, class = "sd", group = "id")
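
Following the commented-out idea above, a minimal sketch (assuming fit_1pl_cat from this section) of mapping the logit-scale intercept back to the probability scale, with random effects set to zero:

# implied production probability for the reference lexical class,
# for an average child and an average item within that class
fit_1pl_cat %>%
  spread_draws(b_Intercept) %>%
  mutate(p = plogis(b_Intercept)) %>%
  mean_qi(p, .width = .9)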

"Other" words are easiest (the random sample includes relatively easy items such as "daddy", "baby", "pattycake", and "cockadoodledoo"), followed by nouns, while verbs and function words are much more difficult.

Model comparison reveals a LOOIC difference of ΔLOOIC = -1.71 against the model with lexical class; this value is small on an absolute scale, though somewhat larger than its standard error (SE = 0.76). Next we test the 1PL model with word frequency (from CHILDES) as an item covariate.

Word frequency

formula_1pl_freq <- bf(produces ~ 1 + prob + (1 | item) + (1 | person))

fit_1pl_freq <- brm(
  formula = formula_1pl_freq,
  data = d_sm,
  family = family,
  prior = prior_1pl,
  iter = N_ITER, warmup = N_WARMUP, cores = 4, chains = 4,
  file = "1pl_freq.rds"
)


m1freq <- loo(fit_1pl, fit_1pl_freq)
## Warning: Found 1 observations with a pareto_k > 0.7 in model 'fit_1pl_freq'. It
## is recommended to set 'moment_match = TRUE' in order to perform moment matching
## for problematic observations.
#get_variables(fit_1pl_freq)

freq_pars <- fit_1pl_freq %>% spread_draws(`b_.*`, regex = TRUE) %>% mean_qi(.width=.9)

m_freq_item <- fit_1pl_freq %>% spread_draws(r_item[item,term]) %>%
  mean_qi(.width=.9)
m_freq_person <- fit_1pl_freq %>% spread_draws(r_person[person,term]) %>%
  mean_qi(.width=.9)
#ranef(fit_1pl)$item

p1 <- m_freq_person %>% ggplot(aes(x=r_person, y=reorder(person, -r_person), xmin=.lower, xmax=.upper)) + 
   geom_pointinterval() + ylab("Child") + xlab("Ability")

p2 <- m_freq_item %>% ggplot(aes(x= r_item, y=reorder(item, -r_item), xmin=.lower, xmax=.upper)) + 
   geom_pointinterval() + ylab("Item") + xlab("Item Easiness")

grid.arrange(p1, p2, ncol=2)

The word frequency coefficient has M = -14.88, with a very wide 90% credible interval [-91.92, 62.67] that spans zero. Model comparison reveals a LOOIC difference of ΔLOOIC = -1.26 against the model with word frequency; this value is small on an absolute scale, though larger than its standard error (SE = 0.43). Next we test the 1PL model with word frequency and lexical class as item covariates.

Word frequency and lexical class

(Did not converge: need to re-run.)

It may be better to skip directly to evaluating

produces ~ 1 + age + lexical_class * prob + (1 | item) + (1 | person)

and also compare to a model allowing the effect of age to vary across items:

produces ~ 1 + age + lexical_class * prob + (0 + age | item) + (1 | person)

and to a model allowing the effect of lexical class to vary across children:

produces ~ 1 + age + lexical_class * prob + (1 | item) + (0 + lexical_class | person)
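
A sketch of what fitting the first of these could look like (not run here; it reuses the same priors, family, and sampler settings, plus the stricter control settings from the fit below; the file name is hypothetical):

formula_1pl_full <- bf(produces ~ 1 + age + lexical_class * prob +
                         (1 | item) + (1 | person))
# fit_1pl_full <- brm(formula_1pl_full, data = d_sm, family = family,
#                     prior = prior_1pl, iter = N_ITER, warmup = N_WARMUP,
#                     chains = 4, cores = 4,
#                     control = list(adapt_delta = 0.98, max_treedepth = 14),
#                     file = "1pl_full.rds")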

formula_1pl_cat_freq <- bf(produces ~ 1 + lexical_class * prob + (1 | item) + (1 | person))

fit_1pl_cat_freq <- brm(
  formula = formula_1pl_cat_freq,
  data = d_sm,
  family = family,
  prior = prior_1pl,
  iter = N_ITER, warmup = N_WARMUP, cores = 4, chains = 4,
  control = list(adapt_delta=0.98, max_treedepth=14),
  file = "1pl_lexcat_freq.rds"
)
# max_treedepth > 13 needed

# 25 hrs for 50 words, 200 subjects...

cat_freq_pars <- fit_1pl_cat_freq %>% spread_draws(`b_.*`, regex = TRUE) %>% mean_qi(.width=.9)

m_cat_freq_item <- fit_1pl_cat_freq %>% spread_draws(r_item[item,term]) %>%
  mean_qi(.width=.9)
m_cat_freq_person <- fit_1pl_cat_freq %>% spread_draws(r_person[person,term]) %>%
  mean_qi(.width=.9)

catfreq_pars <- fit_1pl_cat_freq %>% spread_draws(`b_lexical_class.*`, regex=T) %>% 
  pivot_longer(cols = starts_with("b_lexical_class"), names_to="lexical_class", names_prefix= "b_lexical_class") %>% group_by(lexical_class) %>%
  tidyboot::tidyboot_mean(value)

catfreq_pars$Interaction = rep(c("Main Effect","WF Interaction"), 4)

catfreq_pars %>% 
  ggplot(aes(x=lexical_class, y=mean)) +
  facet_wrap(~Interaction, scale="free") + 
  geom_point() + geom_linerange(aes(ymin=ci_lower, ymax=ci_upper)) +
  geom_hline(yintercept=0, linetype='dashed') + 
  ylab("Coefficient") + xlab("Lexical Class")

m1lcf <- loo(fit_1pl, fit_1pl_cat_freq)

Model comparison to the 1PL model reveals a LOOIC difference of ΔLOOIC = 0.12, which is negligible both on an absolute scale and relative to its standard error (SE = 2.28).

Correlations of item difficulty estimates across models

m1_item$Model = "1PL"
m_freq_item$Model = "Word Freq"
m_lexcat_item$Model = "Lex Cat"
m_cat_freq_item$Model = "Word Freq + Lex Cat"
m1_person$Model = "1PL"
m_freq_person$Model = "Word Freq"
m_lexcat_person$Model = "Lex Cat"
m_cat_freq_person$Model = "Word Freq + Lex Cat"

item_pars <- bind_rows(m1_item, m_freq_item, m_lexcat_item, m_cat_freq_item)
person_pars <- bind_rows(m1_person, m_freq_person, m_lexcat_person, m_cat_freq_person)

item_info <- d_sm %>% group_by(item) %>% 
  summarise(lexical_class = lexical_class[1]) 

ip <- item_pars %>% 
  pivot_wider(id_cols = "item", names_from="Model", values_from="r_item") %>%
  left_join(item_info, by="item")

ggpairs(ip, columns=2:5, ggplot2::aes(colour=lexical_class))

Correlations of person ability estimates across models

pp_info <- d_sm %>% group_by(person) %>% 
  summarise(age = age[1], production = production[1]) # cor = .75

pp <- person_pars %>% 
  pivot_wider(id_cols = "person", names_from="Model", values_from="r_person") %>%
  left_join(pp_info, by="person")

ggpairs(pp, columns=2:ncol(pp)) 

Both ability and item parameter estimates are strongly correlated across the different models. Ability scores in all models are better predictors of children’s overall CDI:WS sumscores than age is.
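
As a quick check of that last claim, a sketch using the pp data frame built above (the `1PL` column is that model's ability estimate):

# correlations with the CDI:WS production sumscore
cor(pp$`1PL`, pp$production)   # ability estimate from the 1PL model
cor(pp$age, pp$production)     # age alone (~.75 per the comment above)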