Loading data

Package loading.

library(tidyverse)
## Warning: package 'tibble' was built under R version 3.5.2
## Warning: package 'tidyr' was built under R version 3.5.2
## Warning: package 'purrr' was built under R version 3.5.2
## Warning: package 'dplyr' was built under R version 3.5.2
## Warning: package 'stringr' was built under R version 3.5.2
library(brms)
## Warning: package 'brms' was built under R version 3.5.2
## Warning: package 'Rcpp' was built under R version 3.5.2
library(wordbankr)
theme_set(langcog::theme_mikabr())
knitr::opts_chunk$set(cache = TRUE, warn = FALSE, message = FALSE)

Data loading

wg_admins <- get_administration_data(language = "English (American)", 
                                     form = "WS", original_ids = TRUE)
## Warning in .local(conn, statement, ...): Decimal MySQL column 6 imported as
## numeric

## Warning in .local(conn, statement, ...): Decimal MySQL column 6 imported as
## numeric
wg_long <- wg_admins %>%
  group_by(original_id, mom_ed) %>% 
  count %>%
  filter(n == 2,  !is.na(mom_ed))
## Warning: Factor `mom_ed` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: Factor `mom_ed` contains implicit NA, consider using
## `forcats::fct_explicit_na`
wg_long_admins <- left_join(wg_long, wg_admins)

wg_items <- get_item_data(language = "English (American)", 
                          form = "WS")
## Warning in .local(conn, statement, ...): Decimal MySQL column 6 imported as
## numeric
## Warning in .local(conn, statement, ...): Decimal MySQL column 6 imported as
## numeric
wg_all <- get_instrument_data(language = "English (American)", 
                                     form = "WS", 
                              items = filter(wg_items, 
                                             type == "word") %>%
                                pull(item_id))

wg_long_byitem <- filter(wg_all, data_id %in% wg_long_admins$data_id) 

Make main dataframe.

d <- left_join(wg_long_byitem, 
               select(wg_long_admins, 
                      data_id, original_id, age, mom_ed, sex)) %>%
  mutate(produces = as.numeric(value == "produces")) %>%
  select(-value) %>%
  left_join(select(wg_items, num_item_id, definition, category)) %>%
  rename(person = original_id, 
         item = definition) %>%
  select(person, age, sex, mom_ed, category, item, produces)

Subsampling

ms <- d %>%
  group_by(person, age, sex) %>%
  summarise(produces = sum(produces))

ggplot(ms, 
       aes(x = age, y = produces, group = person, col = sex)) + 
  geom_jitter(alpha = .2, width = .2)+ 
  geom_line(alpha = .2) +
  geom_smooth(aes(group = sex), span = 2, se = FALSE) + 
  ggthemes::scale_color_solarized()

Break into two time periods and subsample.

earlier <- d %>%
  group_by(person, age) %>%
  count() %>%
  group_by(person) %>%
  mutate(earlier = ifelse(age == age[1], TRUE, FALSE),
         twos = ifelse(age[1] == 24, TRUE, FALSE)) %>%
  select(-n)

d <- left_join(d, earlier)

d_sub <- filter(d, 
                category == "food_drink", 
                twos)

d1_sub <- filter(d_sub, earlier)
d2_sub <- filter(d_sub, !earlier)

Plotting the subsampled data.

ms <- d_sub %>%
  group_by(person, age, sex) %>%
  summarise(produces = sum(produces))

ggplot(ms, 
       aes(x = age, y = produces, group = person, col = sex)) + 
  geom_jitter(alpha = .2, width = .2)+ 
  geom_line(alpha = .2) +
  geom_smooth(aes(group = sex), span = 2, se = FALSE) + 
  ggthemes::scale_color_solarized() +
  langcog::theme_mikabr() 

Models

1PL with no sex differences

First fit 1pl to time 1.

formula_1pl <- produces ~ 1 + (1|person) + (1|item)
  
prior_1pl <-
  prior("normal(0, 3)", class = "sd", group = "person") + 
  prior("normal(0, 3)", class = "sd", group = "item")

mod_d1_1pl <- brm(formula = formula_1pl,
              prior = prior_1pl, 
              family = brmsfamily(family = "bernoulli", link = "logit"),
              data = d1_sub)
## 
## SAMPLING FOR MODEL '080fe31e0b6ab0c59347934ef33710e4' NOW (CHAIN 1).
## Chain 1: 
## Chain 1: Gradient evaluation took 0.000273 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 2.73 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1: 
## Chain 1: 
## Chain 1: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 1: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 1: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 1: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 1: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 1: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 1: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 1: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 1: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 1: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 1: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 1: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 1: 
## Chain 1:  Elapsed Time: 4.37235 seconds (Warm-up)
## Chain 1:                4.67327 seconds (Sampling)
## Chain 1:                9.04562 seconds (Total)
## Chain 1: 
## 
## SAMPLING FOR MODEL '080fe31e0b6ab0c59347934ef33710e4' NOW (CHAIN 2).
## Chain 2: 
## Chain 2: Gradient evaluation took 0.000148 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 1.48 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2: 
## Chain 2: 
## Chain 2: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 2: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 2: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 2: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 2: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 2: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 2: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 2: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 2: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 2: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 2: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 2: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 2: 
## Chain 2:  Elapsed Time: 4.4231 seconds (Warm-up)
## Chain 2:                4.72401 seconds (Sampling)
## Chain 2:                9.14711 seconds (Total)
## Chain 2: 
## 
## SAMPLING FOR MODEL '080fe31e0b6ab0c59347934ef33710e4' NOW (CHAIN 3).
## Chain 3: 
## Chain 3: Gradient evaluation took 0.000151 seconds
## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 1.51 seconds.
## Chain 3: Adjust your expectations accordingly!
## Chain 3: 
## Chain 3: 
## Chain 3: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 3: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 3: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 3: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 3: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 3: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 3: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 3: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 3: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 3: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 3: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 3: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 3: 
## Chain 3:  Elapsed Time: 4.43297 seconds (Warm-up)
## Chain 3:                4.62197 seconds (Sampling)
## Chain 3:                9.05494 seconds (Total)
## Chain 3: 
## 
## SAMPLING FOR MODEL '080fe31e0b6ab0c59347934ef33710e4' NOW (CHAIN 4).
## Chain 4: 
## Chain 4: Gradient evaluation took 0.000146 seconds
## Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 1.46 seconds.
## Chain 4: Adjust your expectations accordingly!
## Chain 4: 
## Chain 4: 
## Chain 4: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 4: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 4: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 4: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 4: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 4: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 4: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 4: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 4: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 4: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 4: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 4: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 4: 
## Chain 4:  Elapsed Time: 4.30439 seconds (Warm-up)
## Chain 4:                4.67911 seconds (Sampling)
## Chain 4:                8.9835 seconds (Total)
## Chain 4:

Basic summary of the model.

summary(mod_d1_1pl)
##  Family: bernoulli 
##   Links: mu = logit 
## Formula: produces ~ 1 + (1 | person) + (1 | item) 
##    Data: d1_sub (Number of observations: 1700) 
## Samples: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
##          total post-warmup samples = 4000
## 
## Group-Level Effects: 
## ~item (Number of levels: 68) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     1.38      0.15     1.10     1.70 1.00     1507     2600
## 
## ~person (Number of levels: 25) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     1.87      0.32     1.35     2.59 1.01     1026     1875
## 
## Population-Level Effects: 
##           Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept    -0.77      0.43    -1.58     0.09 1.00      608      933
## 
## Samples were drawn using sampling(NUTS). For each parameter, Eff.Sample 
## is a crude measure of effective sample size, and Rhat is the potential 
## scale reduction factor on split chains (at convergence, Rhat = 1).

Examine parameters.

plot(mod_d1_1pl)

Plot items.

items_1pl <- as_tibble(coef(mod_d1_1pl)$item[, , "Intercept"]) %>%
  mutate(item = rownames(coef(mod_d1_1pl)$item[, , "Intercept"]), 
         item = fct_reorder(item, Estimate))

ggplot(items_1pl, 
       aes(x = item, y = Estimate)) + 
  geom_pointrange(aes(ymin = Q2.5, ymax = Q97.5)) + 
  coord_flip()

Now plot people.

persons_1pl <- as_tibble(ranef(mod_d1_1pl)$person[, , "Intercept"]) %>%
  mutate(person = rownames(ranef(mod_d1_1pl)$person[, , "Intercept"])) %>%
  left_join(d1_sub %>%
              group_by(person, age, sex) %>% count()) %>%
  mutate(person = fct_reorder(person, Estimate))

ggplot(persons_1pl, 
       aes(x = person, y = Estimate, col = sex)) + 
  geom_pointrange(aes(ymin = Q2.5, ymax = Q97.5)) + 
  coord_flip()

1PL with sex differences and DIF

Now think about modeling sex differences.

Classically, DIF would be written as a fied effect of sex plus a random effect of sex by item.

formula_1pl_dif <- produces ~ sex + (1 | person) + (sex | item)
  
prior_1pl_dif <-
  prior("normal(0, 3)", class = "sd", group = "person") + 
  prior("normal(0, 3)", class = "sd", group = "item") 

mod_d1_1pl_dif <- brm(formula = formula_1pl_dif,
                      prior = prior_1pl_dif, 
                      family = brmsfamily(family = "bernoulli", 
                                          link = "logit"),
                      data = d1_sub)
## 
## SAMPLING FOR MODEL '8bf3d3d3fa426de24ee1bbe596d3b3ba' NOW (CHAIN 1).
## Chain 1: 
## Chain 1: Gradient evaluation took 0.000381 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 3.81 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1: 
## Chain 1: 
## Chain 1: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 1: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 1: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 1: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 1: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 1: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 1: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 1: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 1: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 1: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 1: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 1: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 1: 
## Chain 1:  Elapsed Time: 10.7652 seconds (Warm-up)
## Chain 1:                7.69702 seconds (Sampling)
## Chain 1:                18.4623 seconds (Total)
## Chain 1: 
## 
## SAMPLING FOR MODEL '8bf3d3d3fa426de24ee1bbe596d3b3ba' NOW (CHAIN 2).
## Chain 2: 
## Chain 2: Gradient evaluation took 0.000239 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 2.39 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2: 
## Chain 2: 
## Chain 2: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 2: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 2: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 2: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 2: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 2: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 2: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 2: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 2: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 2: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 2: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 2: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 2: 
## Chain 2:  Elapsed Time: 10.9499 seconds (Warm-up)
## Chain 2:                7.89602 seconds (Sampling)
## Chain 2:                18.846 seconds (Total)
## Chain 2: 
## 
## SAMPLING FOR MODEL '8bf3d3d3fa426de24ee1bbe596d3b3ba' NOW (CHAIN 3).
## Chain 3: 
## Chain 3: Gradient evaluation took 0.000245 seconds
## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 2.45 seconds.
## Chain 3: Adjust your expectations accordingly!
## Chain 3: 
## Chain 3: 
## Chain 3: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 3: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 3: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 3: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 3: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 3: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 3: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 3: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 3: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 3: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 3: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 3: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 3: 
## Chain 3:  Elapsed Time: 11.284 seconds (Warm-up)
## Chain 3:                7.65485 seconds (Sampling)
## Chain 3:                18.9388 seconds (Total)
## Chain 3: 
## 
## SAMPLING FOR MODEL '8bf3d3d3fa426de24ee1bbe596d3b3ba' NOW (CHAIN 4).
## Chain 4: 
## Chain 4: Gradient evaluation took 0.000251 seconds
## Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 2.51 seconds.
## Chain 4: Adjust your expectations accordingly!
## Chain 4: 
## Chain 4: 
## Chain 4: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 4: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 4: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 4: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 4: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 4: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 4: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 4: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 4: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 4: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 4: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 4: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 4: 
## Chain 4:  Elapsed Time: 11.3786 seconds (Warm-up)
## Chain 4:                7.76694 seconds (Sampling)
## Chain 4:                19.1455 seconds (Total)
## Chain 4:
plot(mod_d1_1pl_dif, pars = c("b_sexMale", "sd_item__sexMale"))

Plot items and DIF scores. Spoiler, there are no DIF effects on our subsample, food items.

items_1pl_dif <- bind_rows(
  as_tibble(coef(mod_d1_1pl_dif)$item[, , "Intercept"]) %>% 
    mutate(coef = "Intercept"), 
  as_tibble(ranef(mod_d1_1pl_dif)$item[, , "sexMale"]) %>% 
    mutate(coef = "sexMale")) %>%
  mutate(item = rep(rownames(coef(mod_d1_1pl_dif)$item[, , "Intercept"]),2), 
         item = fct_reorder(item, Estimate))

ggplot(items_1pl_dif, 
       aes(x = item, y = Estimate)) + 
  geom_pointrange(aes(ymin = Q2.5, ymax = Q97.5)) + 
  coord_flip() + 
  facet_grid(.~coef, scales = "free_x")

Now plot people.

persons_1pl_dif <- as_tibble(ranef(mod_d1_1pl_dif)$person[, , "Intercept"]) %>%
  mutate(person = rownames(ranef(mod_d1_1pl_dif)$person[, , "Intercept"])) %>%
  left_join(d1_sub %>%
              group_by(person, age, sex) %>% count()) %>%
  mutate(person = fct_reorder(person, Estimate))

persons_1pl_dif_coef <- as_tibble(coef(mod_d1_1pl_dif)$person[, , "Intercept"]) %>%
  mutate(person = rownames(coef(mod_d1_1pl_dif)$person[, , "Intercept"])) %>%
  left_join(d1_sub %>%
              group_by(person, age, sex) %>% count()) %>%
  mutate(person = fct_reorder(person, Estimate))

ggplot(persons_1pl_dif, 
       aes(x = person, y = Estimate, col = sex)) + 
  geom_pointrange(aes(ymin = Q2.5, ymax = Q97.5)) + 
  coord_flip()

Comparison of DIF and non-DIF models

Plot sex-DIF vs. non-sex-DIF estimates for people, showing the differences in the random ability estimates.

compare_dif <- tibble(estimate = persons_1pl$Estimate, 
                      estimate_dif = persons_1pl_dif$Estimate,
                      estimate_dif_coef = persons_1pl_dif_coef$Estimate,
                      sex = persons_1pl$sex)

ggplot(data = compare_dif, 
       aes(x = estimate, y = estimate_dif, 
           col = sex)) +
  geom_point() + 
  geom_abline(lty = 2) 

# p2 <- ggplot(data = compare_dif, 
#        aes(x = estimate, y = estimate_dif_coef, 
#            col = sex)) +
#   geom_point() + 
#   geom_abline(lty = 2) + 
#   theme(legend.pos = "bottom")
# 
# cowplot::plot_grid(p1,p2)

You can get:

  1. Estimate of absolute ability. How good are you compared to other people.
  2. Demographically-adjusted estimate of ability. Effectively, how good are are you compared to other people, if you weren’t in your demographic group.

This second is of interest in reasoning about “ability in context.”

One way of formalizing the affirmative action hypothesis is the idea that, for some demographics, (2) should be a better predictor of future performance than (1).