library(dplyr)
library(tidyr)
library(purrr)
library(readr)
library(ggplot2)
library(langcog)
library(wordbankr)
library(boot)
library(lazyeval)
theme_set(theme_mikabr())

Data loading

Connect to the Wordbank database and pull out the raw data.

data_mode <- "remote"

admins <- get_administration_data(mode = data_mode) %>%
  select(data_id, age, language, form)

items <- get_item_data(mode = data_mode) %>%
  mutate(num_item_id = as.numeric(substr(item_id, 6, nchar(item_id))),
         definition = tolower(definition))

words <- items %>%
  filter(type == "word", !is.na(uni_lemma), form == "WG")

invalid_uni_lemmas <- words %>%
  group_by(uni_lemma) %>%
  filter(n() > 1,
         length(unique(lexical_class)) > 1) %>%
  arrange(language, uni_lemma)

# Pull the item-by-administration data for one instrument (language/form pair).
get_inst_data <- function(inst_items) {
  inst_language <- unique(inst_items$language)
  inst_form <- unique(inst_items$form)
  inst_admins <- filter(admins, language == inst_language, form == inst_form)
  get_instrument_data(instrument_language = inst_language,
                      instrument_form = inst_form,
                      items = inst_items$item_id,
                      administrations = inst_admins,
                      iteminfo = inst_items,
                      mode = data_mode) %>%
    filter(!is.na(age)) %>%
    mutate(produces = !is.na(value) & value == "produces",
           understands = !is.na(value) & (value == "understands" | value == "produces")) %>%
    select(-value) %>%
    gather(measure, value, produces, understands) %>%
    mutate(language = inst_language,
           form = inst_form)
}

items_by_inst <- split(words, paste(words$language, words$form, sep = "_"))
raw_data <- map(items_by_inst, get_inst_data)
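A quick sanity check (a sketch): there should be one element per language/form instrument, each a long data frame of child-by-item responses.

names(raw_data)
head(raw_data$English_WG)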

Exploratory analyses

Explore AOAs for English.

ewg <- raw_data$English_WG

ms <- ewg %>%
  group_by(definition, age, category) %>%
  summarise(prop = mean(value), 
            num_true = sum(value), 
            num_false = sum(!value))

ggplot(ms, aes(age, prop, col = definition)) + 
  geom_line() + 
  facet_wrap(~category) +
  scale_colour_solarized(guide=FALSE)

Let’s compare some methods. First, try empirical quantiles: the earliest age at which more than half of children produce each word. This loses many words whose production proportion never goes above 50%, since min over an empty age vector is infinite.

empirical_aoas <- ms %>%
  group_by(definition, category) %>%
  summarise(empirical_aoa = min(age[prop > .5]))
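Words that never cross 50% come out with infinite AOAs. A minimal sketch (assuming we just want them out of the plots below) converts these to NA:

# min() over an empty vector returns Inf; treat those words as missing
empirical_aoas <- empirical_aoas %>%
  ungroup() %>%
  mutate(empirical_aoa = ifelse(is.finite(empirical_aoa), empirical_aoa, NA_real_))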

qplot(empirical_aoa, data = empirical_aoas)

qplot(empirical_aoa, facets = ~ category, data = empirical_aoas)

Basic models

Basic GLM

Now let’s try the basic GLM version.

fit_glm <- function(data) {
  model <- glm(cbind(num_true, num_false) ~ age, family = "binomial", 
               data = data)
  # AOA = 50% point of the logistic curve: -intercept / slope
  aoa <- -model$coefficients[["(Intercept)"]] / model$coefficients[["age"]]
  
  data.frame(definition = data$definition[1],
             category = data$category[1],
             glm_aoa = aoa)
}

glm_aoas <- ms %>%
  split(.$definition) %>%
  map(fit_glm) %>%
  bind_rows

qplot(glm_aoa, data = glm_aoas)

qplot(glm_aoa, facets = ~ category, data = glm_aoas)
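Which fits are extreme? A quick check (a sketch, mirroring the robust-GLM listing below):

# words with negative or implausibly late GLM AOAs
glm_aoas$definition[glm_aoas$glm_aoa < 0 | glm_aoas$glm_aoa > 50]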

Compare. The GLM gives an AOA for every word, but many of the estimates seem crazy.

aoas <- full_join(empirical_aoas, glm_aoas)

ggplot(aoas, aes(x = empirical_aoa, y = glm_aoa, col = category)) + 
  geom_point() + 
  geom_text(aes(label = definition), nudge_y = 3) +
  scale_colour_solarized(guide=FALSE) 

aoas %>%
  gather(measure, aoa, empirical_aoa, glm_aoa) %>%
  ggplot(aes(x = aoa)) + facet_grid(.~measure) + geom_histogram()

Robust GLM

library(robustbase)  # provides glmrob

fit_rglm <- function(data) {
  model <- glmrob(cbind(num_true, num_false) ~ age, family = "binomial", 
                  data = data)
  aoa <- -model$coefficients[["(Intercept)"]] / model$coefficients[["age"]]
  
  data.frame(definition = data$definition[1],
             category = data$category[1],
             rglm_aoa = aoa)
}

rglm_aoas <- ms %>%
  split(.$definition) %>%
  map(fit_rglm) %>%
  bind_rows

qplot(rglm_aoa, data = rglm_aoas)

qplot(rglm_aoa, data = rglm_aoas) + 
  xlim(c(0,50))

qplot(rglm_aoa, facets = ~ category, data = rglm_aoas)

Looks totally reasonable except for a few crazy ones.

rglm_aoas$definition[rglm_aoas$rglm_aoa < 0] 
## [1] "pattycake"
rglm_aoas$definition[rglm_aoas$rglm_aoa > 50] 
## [1] "brother" "sister"

Bayes GLM

Can we use a Bayesian GLM to regularize things a bit? Let’s explore a single curve. In practice, bayesglm does essentially the same thing as plain GLM here because of the huge amount of data, no matter what the prior is. We also include the robust GLM from above for comparison.

library(arm)  # provides bayesglm

target_words <- c("pattycake", "mommy*", "bye", "tomorrow", "play pen", "today",
                  "child", "all", "night night", "brother", "daddy", "when",
                  sample(unique(ms$definition), 20))

# Fit GLM, Bayesian GLM, and robust GLM to one word's data and return
# predicted curves over ages 0-36.
do.fits <- function(data) {
  glm_model <- glm(cbind(num_true, num_false) ~ age, 
                   family = "binomial", 
                   data = data)
  bayes_model <- bayesglm(cbind(num_true, num_false) ~ age, 
                          family = binomial(link="logit"),
                          prior.mean = .25, 
                          prior.scale = c(.01), 
                          prior.mean.for.intercept = 0, 
                          prior.scale.for.intercept = 2.5,
                          prior.df=Inf, 
                          data = data)
  rob_model <- glmrob(cbind(num_true, num_false) ~ age, 
                      family = binomial(link="logit"), data = data)
  
  
  fits <- data.frame(age = 0:36) %>%
    mutate(glm = predict(glm_model, 
                         type = "response", 
                         data.frame(age = age)), 
           bayes = predict(bayes_model, 
                           type = "response", 
                           data.frame(age = age)), 
           robust = predict(rob_model, 
                            type = "response", 
                            data.frame(age = age)), 
           definition = data$definition[1]) %>%
    left_join(data) 
  
  return(fits)
}

mms <- ms %>%
  filter(definition %in% target_words) %>%
  group_by(definition) %>%
  do(do.fits(.)) 

ggplot(mms, aes(x = age)) + 
  geom_line(aes(y = glm, colour = "glm")) + 
  geom_line(aes(y = bayes, colour = "bayes")) +
  geom_line(aes(y = robust, colour = "robust")) + 
  scale_colour_manual(values = c("glm" = "red", 
                                 "bayes" = "blue", 
                                 "robust" = "green"), 
                      name = "model") +
  geom_point(aes(y = prop), col = "black") + 
  ylim(c(0,1)) + 
  facet_wrap(~definition)

Now apply this more broadly. Note that the 50% point for logistic regression is \(-\beta_0 / \beta_1\), where \(\beta_0\) is the intercept and \(\beta_1\) is the age slope.
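Spelled out: the model is \(\mathrm{logit}(p) = \beta_0 + \beta_1 \cdot \mathrm{age}\), and \(p = .5\) exactly when the linear predictor is zero, so

\[
\beta_0 + \beta_1 \cdot \mathrm{age}_{50} = 0 \quad \Rightarrow \quad \mathrm{age}_{50} = -\frac{\beta_0}{\beta_1}.
\]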

fit_bglm <- function(data) {
  model <- bayesglm(cbind(num_true, num_false) ~ age, family = "binomial", 
                    prior.mean = .25, 
                    prior.scale = c(.01), 
                    prior.mean.for.intercept = 0, 
                    prior.scale.for.intercept = 2.5,
                    prior.df = Inf,
                    data = data)
  aoa <- -model$coefficients[["(Intercept)"]] / model$coefficients[["age"]]
  
  data.frame(definition = data$definition[1],
             category = data$category[1],
             bglm_aoa = aoa)
}

bglm_aoas <- ms %>%
  split(.$definition) %>%
  map(fit_bglm) %>%
  bind_rows

qplot(bglm_aoa, data = bglm_aoas)

qplot(bglm_aoa, facets = ~ category, data = bglm_aoas)

qplot(bglm_aoas$bglm_aoa, glm_aoas$glm_aoa, label = bglm_aoas$definition, 
      geom = "text")

BayesGLM with these made-up parameters fixes some of the issues and produces a better-looking distribution with fewer crazy outliers (though still some). But how do we get around the “made-up parameter” problem?

Hierarchical GLM model

Set some Stan settings.

library(rstan)
library(readr)
library(dplyr)

rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())
ms <- read_csv("engdata.csv")  # English summary data (as computed above), reloaded so this section runs standalone

The model. Constraints: the group-level slope mean and both SDs are constrained to be positive, encoding the assumption that, on average, words become more likely to be produced with age.

hierarchical_logit <-'
data {                            
  int<lower=1> W; // number of words
  int<lower=1> A; // number of age bins
  vector[A] age; // ages (months)
  int<lower=0> produces[W,A]; // production counts
  int<lower=0> attempts[W,A]; // numbers of children asked
}

parameters {
  real mu_i;             // intercept mean
  real<lower=0> sigma_i; // intercept SD
  
  real<lower=0> mu_s;             // slope mean
  real<lower=0> sigma_s; // slope SD

  vector[W] intercept; // word intercepts
  vector[W] slope; // word slopes
}

transformed parameters {
  matrix[W,A] p; // logit-scale production probabilities

  for (w in 1:W) 
    for (a in 1:A)
      p[w,a] = intercept[w] + (slope[w] * age[a]);
}

model {
  mu_i ~ normal(0, 10); 
  sigma_i ~ normal(0, 1);

  mu_s ~ normal(0, .25);
  sigma_s ~ normal(0, .25);

  intercept ~ normal(mu_i, sigma_i);
  slope ~ normal(mu_s, sigma_s);
  
  for (w in 1:W) 
    for (a in 1:A)
      produces[w,a] ~ binomial_logit(attempts[w,a], p[w,a]);
}
'

Now reformat the data to Stan’s input format and run the sampler.

model.data <- ms %>%
  ungroup %>%
  mutate(word = definition, 
         n = num_true,
         N = num_true + num_false) %>%
  # filter(word %in% c("daddy*","mommy*","no","bye")) %>%
  dplyr::select(word, age, category, n, N)

ages <- unique(model.data$age)
n.words <- length(unique(model.data$word))

# Reshape counts into W x A matrices; this assumes model.data is ordered
# by word and then by age (one row per word/age combination).
dat <- list(age = ages,
            produces = matrix(model.data$n, 
                              nrow = n.words, ncol = length(ages), byrow = TRUE),
            attempts = matrix(model.data$N, 
                              nrow = n.words, ncol = length(ages), byrow = TRUE),
            W = n.words, 
            A = length(ages))
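A quick spot-check of the ordering assumption built into the reshape above (a sketch):

# row 1 of the produces matrix should match the counts for the first word
first_word <- model.data$word[1]
stopifnot(all(dat$produces[1, ] == filter(model.data, word == first_word)$n))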

samps <- stan(model_code = hierarchical_logit, 
              cores = 4, 
              data = dat, iter = 200, warmup=100, chains = 4, 
              pars = c("mu_i", "sigma_i", "mu_s", "sigma_s", "slope", "intercept"))

Diagnostics.

traceplot(samps, pars = c("mu_i","sigma_i","mu_s","sigma_s"), 
                 inc_warmup = TRUE)
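Beyond the traceplots, a quick numerical check (a sketch): \(\hat{R}\) values near 1 and reasonable effective sample sizes suggest the chains have mixed.

# R-hat and effective sample size for the hyperparameters
hyper_summary <- summary(samps, pars = c("mu_i", "sigma_i", "mu_s", "sigma_s"))$summary
hyper_summary[, c("n_eff", "Rhat")]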

Explore parameters.

library(stringr)
library(tidyr)

coefs <- data.frame(summary(samps)$summary)
coefs$name <- rownames(coefs)

word_ids <- model.data %>% 
  group_by(word) %>% 
  summarise(category = category[1]) %>%
  mutate(word_id = 1:n())

words <- coefs %>% 
  filter(str_detect(name, "slope") | str_detect(name, "intercept")) %>%
  separate(name, c("variable", "word_id"), "\\[") %>%
  mutate(word_id = as.numeric(str_replace(word_id, "]", ""))) %>%
  left_join(word_ids) %>%
  dplyr::select(mean, variable, word, category) %>%
  spread(variable, mean) %>%
  mutate(aoa = -intercept / slope) %>% # 50% point, as above
  arrange(aoa) %>%
  mutate(word = factor(word, 
                        levels = word,
                        labels = word))

And plot again:

age_range <- 0:36
preds <- words %>%
  group_by(word, category) %>%
  do(data.frame(age = age_range, 
                pred.prop = boot::inv.logit(.$intercept + age_range * .$slope))) %>%
  left_join(model.data) %>%
  mutate(prop = n/N) 

ggplot(filter(preds, word %in% target_words), aes(x = age, y = prop)) + 
  geom_point() + 
  facet_wrap(~word) + 
  geom_line(aes(x = age, y = pred.prop))

Check the histogram.

hglm.aoas <- words  # aoa = -intercept / slope, computed above

qplot(aoa, geom = "blank", data = hglm.aoas) + 
  geom_histogram(aes(y = ..density..)) + 
  geom_density(col = "red")

What are the crazy ones?

hglm.aoas$word[hglm.aoas$aoa <0]
## [1] pattycake
## 396 Levels: brother sister night night play pen babysitter ... pattycake
hglm.aoas$word[hglm.aoas$aoa >50]
## [1] brother
## 396 Levels: brother sister night night play pen babysitter ... pattycake

Don’t know what’s going on with these but everything else looks good.

Full comparison between models

Sample words

Merge with previous data frame.

all.preds <- left_join(mms, 
                       preds %>% 
                         ungroup %>%
                         rename(hglm = pred.prop, 
                                definition = word) %>% 
                         dplyr::select(definition, age, hglm)) %>%
  dplyr::select(-num_true, -num_false)

ggplot(all.preds, aes(x = age)) + 
  geom_line(aes(y = glm, colour = "glm")) + 
  geom_line(aes(y = bayes, colour = "bayes")) +
  geom_line(aes(y = robust, colour = "robust")) + 
  geom_line(aes(y = hglm, colour = "hglm")) + 
  geom_point(aes(y = prop), col = "black") + 
  ylim(c(0,1)) + 
  facet_wrap(~definition) + 
  scale_colour_manual(values = c(glm = "red", 
                                 bayes = "blue", 
                                 robust = "green", 
                                 hglm = "orange"), 
                      name = "model")

AOA distribution

aoas <- full_join(full_join(glm_aoas, full_join(bglm_aoas, rglm_aoas)), 
                  empirical_aoas) %>%
  full_join(hglm.aoas %>% 
              rename(definition = word, hglm_aoa = aoa) %>% 
              dplyr::select(-intercept, -slope))
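Before plotting, a quick tally (a sketch) of how many words get a finite estimate under each method:

# count finite AOA estimates per method
aoas %>%
  gather(measure, aoa, ends_with("aoa")) %>%
  group_by(measure) %>%
  summarise(n_finite = sum(is.finite(aoa)))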

Plot.

qplot(aoa, facets = ~ measure, 
      data = aoas %>% gather(measure, aoa, ends_with("aoa")))

And without the outliers.

qplot(aoa, facets = ~ measure, 
      data = aoas %>% 
        gather(measure, aoa, ends_with("aoa")) %>%
        filter(aoa > 0 & aoa < 50))

And pairs plots.

pair_data <- aoas %>% 
  dplyr::select(ends_with("aoa")) %>%
  mutate(empirical_aoa = ifelse(is.infinite(empirical_aoa), 
                                NA, empirical_aoa)) %>%
  filter(glm_aoa > 0, glm_aoa < 50, rglm_aoa > 0, rglm_aoa < 50)

GGally::ggpairs(pair_data) 
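A numeric companion to the pairs plot (a sketch): pairwise correlations between methods, using pairwise-complete observations.

cor(pair_data, use = "pairwise.complete.obs")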

Compute across all languages

Fit models to predict each item’s age of acquisition.

Commented out for computation time reasons.

# fit_inst_measure_uni <- function(inst_measure_uni_data) {
#   ages <- min(inst_measure_uni_data$age):max(inst_measure_uni_data$age)
#   model <- glm(cbind(num_true, num_false) ~ age, family = "binomial",
#                data = inst_measure_uni_data)
#   fit <- predict(model, newdata = data.frame(age = ages), se.fit = TRUE)
#   aoa <- -model$coefficients[["(Intercept)"]] / model$coefficients[["age"]]
#   constants <- inst_measure_uni_data %>%
#     select(language, form, measure, lexical_category, lexical_class, uni_lemma,
#            words) %>%
#     distinct()
#   props <- inst_measure_uni_data %>%
#     ungroup() %>%
#     select(age, prop)
#   data.frame(age = ages, fit_prop = inv.logit(fit$fit), fit_se = fit$se.fit,
#              aoa = aoa, language = constants$language, form = constants$form,
#              measure = constants$measure,
#              lexical_category = constants$lexical_category,
#              lexical_class = constants$lexical_class,
#              uni_lemma = constants$uni_lemma, words = constants$words) %>%
#     left_join(props)
# }
# 
# fit_inst_measure <- function(inst_measure_data) {
#   inst_measure_by_uni <- inst_measure_data %>%
#     group_by(language, form, measure,
#              lexical_category, lexical_class, uni_lemma,
#              age, data_id) %>%
#     summarise(uni_value = any(value),
#               words = paste(definition, collapse = ", ")) %>%
#     group_by(language, form, measure,
#              lexical_category, lexical_class, uni_lemma, words,
#              age) %>%
#     summarise(num_true = sum(uni_value, na.rm = TRUE),
#               num_false = n() - num_true,
#               prop = mean(uni_value, na.rm = TRUE))
#   inst_measure_by_uni %>%
#     split(paste(.$lexical_category, .$lexical_class, .$uni_lemma)) %>%
#     map(fit_inst_measure_uni) %>%
#     bind_rows()
# }
# 
# fit_inst <- function(inst_data) {
#   inst_data %>%
#     split(.$measure) %>%
#     map(fit_inst_measure) %>%
#     bind_rows()
# }
# 
# prop_data <- map(raw_data, fit_inst)
# all_prop_data <- bind_rows(prop_data)
# #write_csv(all_prop_data, "app/all_prop_data.csv")
# aoa_data <- all_prop_data %>%
#   select(language, form, measure, lexical_category, lexical_class, uni_lemma, 
#          words, aoa) %>%
#   distinct() %>%
#   filter(aoa > 5 & aoa < 31)
# #write_csv(aoa_data, "aoa_data.csv")
# ggplot(aoa_data, aes(x = aoa, fill = language)) +
#   facet_wrap(~language) +
#   geom_bar() +
#   xlab("Age of Acquisition (months)") +
#   ylab("Count") +
#   scale_fill_solarized(guide = FALSE)