library(dplyr)
library(tidyr)
library(purrr)
library(readr)
library(ggplot2)
library(langcog)
library(wordbankr)
library(boot)
library(lazyeval)
theme_set(theme_mikabr())
Connect to the Wordbank database and pull out the raw data.
data_mode <- "remote"
admins <- get_administration_data(mode = data_mode) %>%
select(data_id, age, language, form)
items <- get_item_data(mode = data_mode) %>%
mutate(num_item_id = as.numeric(substr(item_id, 6, nchar(item_id))),
definition = tolower(definition))
words <- items %>%
filter(type == "word", !is.na(uni_lemma), form == "WG")
invalid_uni_lemmas <- words %>%
group_by(uni_lemma) %>%
filter(n() > 1,
length(unique(lexical_class)) > 1) %>%
arrange(language, uni_lemma)
get_inst_data <- function(inst_items) {
  # Pull the raw responses for one instrument (language + form) and
  # binarize them into produces/understands measures.
  inst_language <- unique(inst_items$language)
  inst_form <- unique(inst_items$form)
  inst_admins <- filter(admins, language == inst_language, form == inst_form)
  get_instrument_data(instrument_language = inst_language,
                      instrument_form = inst_form,
                      items = inst_items$item_id,
                      administrations = inst_admins,
                      iteminfo = inst_items,
                      mode = data_mode) %>%
    filter(!is.na(age)) %>%
    # "produces" implies "understands".
    mutate(produces = !is.na(value) & value == "produces",
           understands = !is.na(value) &
             (value == "understands" | value == "produces")) %>%
    select(-value) %>%
    gather(measure, value, produces, understands) %>%
    mutate(language = inst_language,
           form = inst_form)
}
items_by_inst <- split(words, paste(words$language, words$form, sep = "_"))
raw_data <- map(items_by_inst, get_inst_data)
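A quick look at what came back (a check not in the original): one long-format data frame per language-form instrument.
# Each element holds the binarized responses for one instrument.
names(raw_data)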
Explore AOAs for English.
ewg <- raw_data$English_WG
ms <- ewg %>%
group_by(definition, age, category) %>%
summarise(prop = mean(value),
num_true = sum(value),
num_false = sum(!value))
ggplot(ms, aes(age, prop, col = definition)) +
geom_line() +
facet_wrap(~category) +
scale_colour_solarized(guide=FALSE)
Let’s compare some methods. First, try empirical quantiles: the first age at which the observed proportion crosses 50%. This loses the many words whose curves never go above 50% in the sampled age range, since the minimum over an empty set is infinite.
empirical_aoas <- ms %>%
group_by(definition, category) %>%
summarise(empirical_aoa = min(age[prop > .5]))
qplot(empirical_aoa, data = empirical_aoas)
qplot(empirical_aoa, facets = ~ category, data = empirical_aoas)
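To quantify how many words are lost this way (a quick check, not in the original analysis):
# Words whose proportion never exceeds 50% get an infinite empirical
# AOA, because min() over an empty vector returns Inf.
sum(is.infinite(empirical_aoas$empirical_aoa))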
Now let’s try the basic GLM version.
fit_glm <- function(data) {
  model <- glm(cbind(num_true, num_false) ~ age, family = "binomial",
               data = data)
  # The logistic curve crosses 50% where the linear predictor is zero,
  # i.e. at age = -intercept / slope.
  aoa <- -model$coefficients[["(Intercept)"]] / model$coefficients[["age"]]
  data.frame(definition = data$definition[1],
             category = data$category[1],
             glm_aoa = aoa)
}
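As a usage sketch for a single word (assuming “dog” appears among the definitions in ms):
# One-row data frame with the GLM-based AOA for a single word.
fit_glm(filter(ms, definition == "dog"))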
glm_aoas <- ms %>%
split(.$definition) %>%
map(fit_glm) %>%
bind_rows
qplot(glm_aoa, data = glm_aoas)
qplot(glm_aoa, facets = ~ category, data = glm_aoas)
Compare. The GLM fit yields an AOA estimate for every word, but many of the values are implausible: negative, or far beyond the sampled age range.
aoas <- full_join(empirical_aoas, glm_aoas)
ggplot(aoas, aes(x = empirical_aoa, y = glm_aoa, col = category)) +
geom_point() +
geom_text(aes(label = definition), nudge_y = 3) +
scale_colour_solarized(guide=FALSE)
aoas %>%
gather(measure, aoa, empirical_aoa, glm_aoa) %>%
ggplot(aes(x = aoa)) + facet_grid(.~measure) + geom_histogram()
library(robustbase)  # for glmrob
fit_rglm <- function(data) {
  model <- glmrob(cbind(num_true, num_false) ~ age, family = "binomial",
                  data = data)
  aoa <- -model$coefficients[["(Intercept)"]] / model$coefficients[["age"]]
  data.frame(definition = data$definition[1],
             category = data$category[1],
             rglm_aoa = aoa)
}
rglm_aoas <- ms %>%
split(.$definition) %>%
map(fit_rglm) %>%
bind_rows
qplot(rglm_aoa, data = rglm_aoas)
qplot(rglm_aoa, data = rglm_aoas) +
xlim(c(0,50))
qplot(rglm_aoa, facets = ~ category, data = rglm_aoas)
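Counting the extreme estimates directly (a quick check, not in the original):
# Robust-GLM AOAs outside a plausible 0-50 month window.
sum(rglm_aoas$rglm_aoa < 0 | rglm_aoas$rglm_aoa > 50)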
Looks totally reasonable except for a few extreme ones.
rglm_aoas$definition[rglm_aoas$rglm_aoa < 0]
## [1] "pattycake"
rglm_aoas$definition[rglm_aoas$rglm_aoa > 50]
## [1] "brother" "sister"
Can we do a BayesGLM to regularize things a bit? Let’s explore a single curve first. It turns out that, with this much data, bayesglm does essentially the same thing as glm no matter what prior we choose. Let’s also include the robust GLM in the comparison.
library(arm)
library(robustbase)
target_words <- c("pattycake", "mommy*", "bye", "tomorrow", "play pen",
                  "today", "child", "all", "night night", "brother",
                  "daddy", "when",
                  sample(unique(ms$definition), 20))
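One quick way to see how little the prior matters here (an illustrative check; again assumes “dog” is among the definitions): compare coefficients from glm and bayesglm fit to the same word.
d <- filter(ms, definition == "dog")
# With this many observations per word the likelihood swamps the prior,
# so the two coefficient vectors should be nearly identical.
coef(glm(cbind(num_true, num_false) ~ age, family = "binomial", data = d))
coef(bayesglm(cbind(num_true, num_false) ~ age, family = "binomial", data = d))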
do.fits <- function(data) {
  # Fit the same acquisition curve three ways: plain GLM, Bayesian GLM
  # with hand-picked priors, and robust GLM.
  glm_model <- glm(cbind(num_true, num_false) ~ age,
                   family = "binomial",
                   data = data)
  bayes_model <- bayesglm(cbind(num_true, num_false) ~ age,
                          family = binomial(link = "logit"),
                          prior.mean = .25,
                          prior.scale = .01,
                          prior.mean.for.intercept = 0,
                          prior.scale.for.intercept = 2.5,
                          prior.df = Inf,
                          data = data)
  rob_model <- glmrob(cbind(num_true, num_false) ~ age,
                      family = binomial(link = "logit"), data = data)
  fits <- data.frame(age = 0:36) %>%
    mutate(glm = predict(glm_model, newdata = data.frame(age = age),
                         type = "response"),
           bayes = predict(bayes_model, newdata = data.frame(age = age),
                           type = "response"),
           robust = predict(rob_model, newdata = data.frame(age = age),
                            type = "response"),
           definition = data$definition[1]) %>%
    left_join(data)
  return(fits)
}
mms <- ms %>%
filter(definition %in% target_words) %>%
group_by(definition) %>%
do(do.fits(.))
ggplot(mms, aes(x = age)) +
  geom_line(aes(y = glm, colour = "glm")) +
  geom_line(aes(y = bayes, colour = "bayes")) +
  geom_line(aes(y = robust, colour = "robust")) +
  # Map colour inside aes() so scale_colour_manual produces a legend.
  scale_colour_manual(values = c("glm" = "red",
                                 "bayes" = "blue",
                                 "robust" = "green"),
                      name = "model") +
  geom_point(aes(y = prop), col = "black") +
  ylim(c(0, 1)) +
  facet_wrap(~definition)
Now apply this more broadly. Note that the 50% point for logistic regression is \(-\beta_0 / \beta_1\): setting \(\mathrm{logit}(p) = \beta_0 + \beta_1 \cdot \mathrm{age}\) to zero (i.e. \(p = 0.5\)) and solving gives \(\mathrm{age} = -\beta_0 / \beta_1\).
fit_bglm <- function(data) {
model <- bayesglm(cbind(num_true, num_false) ~ age, family = "binomial",
prior.mean = .25,
prior.scale = c(.01),
prior.mean.for.intercept = 0,
prior.scale.for.intercept = 2.5,
prior.df=Inf,
data = data)
aoa <- -model$coefficients[["(Intercept)"]] / model$coefficients[["age"]]
data.frame(definition = data$definition[1],
category = data$category[1],
bglm_aoa = aoa)
}
bglm_aoas <- ms %>%
split(.$definition) %>%
map(fit_bglm) %>%
bind_rows
qplot(bglm_aoa, data = bglm_aoas)
qplot(bglm_aoa, facets = ~ category, data = bglm_aoas)
qplot(bglm_aoas$bglm_aoa, glm_aoas$glm_aoa, label = bglm_aoas$definition,
geom = "text")
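Since both tables were built by splitting on definition, their rows align, so we can also quantify the agreement (a quick check, not in the original; the extreme GLM values will distort this):
# Correlation between Bayesian-GLM and plain-GLM AOA estimates.
cor(bglm_aoas$bglm_aoa, glm_aoas$glm_aoa)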
BayesGLM with these hand-picked prior parameters fixes some of the issues and produces a better-looking distribution with fewer extreme outliers (though still some). But how do we get around the hand-picked-parameter problem? One answer is to estimate the priors from the data themselves with a hierarchical model, which is what the Stan model below does.
Set some Stan settings.
library(rstan)
library(readr)
library(dplyr)
rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())
ms <- read_csv("engdata.csv")
The model. Constraints: the group-level slope mean and SDs are restricted to be positive, and each word’s intercept and slope are drawn from shared normal distributions, so extreme curves are shrunk toward the group-level means.
hierarchical_logit <- '
data {
  int<lower=1> W;             // number of words
  int<lower=1> A;             // number of ages
  vector[A] age;              // ages (months)
  int<lower=0> produces[W,A]; // successes per word and age
  int<lower=0> attempts[W,A]; // trials per word and age
}
parameters {
  real mu_i;                  // intercept mean
  real<lower=0> sigma_i;      // intercept SD
  real<lower=0> mu_s;         // slope mean (constrained positive)
  real<lower=0> sigma_s;      // slope SD
  vector[W] intercept;        // per-word intercepts
  vector[W] slope;            // per-word slopes
}
transformed parameters {
  matrix[W,A] p;              // logit-scale production probability
  for (w in 1:W)
    for (a in 1:A)
      p[w,a] = intercept[w] + (slope[w] * age[a]);
}
model {
  mu_i ~ normal(0, 10);
  sigma_i ~ normal(0, 1);
  mu_s ~ normal(0, .25);
  sigma_s ~ normal(0, .25);
  intercept ~ normal(mu_i, sigma_i);
  slope ~ normal(mu_s, sigma_s);
  for (w in 1:W)
    for (a in 1:A)
      produces[w,a] ~ binomial_logit(attempts[w,a], p[w,a]);
}
'
Now reshape the data into Stan’s input format and compute.
model.data <- ms %>%
  ungroup %>%
  mutate(word = definition,
         n = num_true,
         N = num_true + num_false) %>%
  # filter(word %in% c("daddy*","mommy*","no","bye")) %>%
  dplyr::select(word, age, category, n, N) %>%
  # Word-major ordering is what the byrow = TRUE matrices below assume.
  arrange(word, age)
ages <- unique(model.data$age)
n.words <- length(unique(model.data$word))
dat <- list(age = ages,
produces = matrix(model.data$n,
nrow=n.words, ncol=length(ages), byrow = TRUE),
attempts = matrix(model.data$N,
nrow=n.words, ncol=length(ages), byrow = TRUE),
W = n.words,
A = length(ages))
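The matrix() calls above recycle silently if the word-by-age grid is incomplete, so a defensive assertion (not in the original) is cheap insurance:
# Every word must contribute exactly one row per age for the
# byrow = TRUE reshaping to line up with word_ids below.
stopifnot(nrow(model.data) == n.words * length(ages))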
samps <- stan(model_code = hierarchical_logit,
cores = 4,
data = dat, iter = 200, warmup=100, chains = 4,
pars = c("mu_i", "sigma_i", "mu_s", "sigma_s", "slope", "intercept"))
Diagnostics.
traceplot(samps, pars = c("mu_i","sigma_i","mu_s","sigma_s"),
inc_warmup = TRUE)
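Alongside the traceplots, the split R-hat statistics from rstan’s summary give a quick numerical convergence check:
# R-hat values near 1 indicate that the four chains have mixed well.
summary(samps)$summary[c("mu_i", "sigma_i", "mu_s", "sigma_s"), "Rhat"]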
Explore parameters.
library(stringr)
library(tidyr)
coefs <- data.frame(summary(samps)$summary)
coefs$name <- rownames(coefs)
word_ids <- model.data %>%
group_by(word) %>%
summarise(category = category[1]) %>%
mutate(word_id = 1:n())
words <- coefs %>%
filter(str_detect(name, "slope") | str_detect(name, "intercept")) %>%
separate(name, c("variable", "word_id"), "\\[") %>%
mutate(word_id = as.numeric(str_replace(word_id, "]", ""))) %>%
left_join(word_ids) %>%
dplyr::select(mean, variable, word, category) %>%
spread(variable, mean) %>%
mutate(aoa = -intercept / slope) %>% # the 50% point, as in the GLMs
arrange(aoa) %>%
mutate(word = factor(word,
levels = word,
labels = word))
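Since the table is now arranged by AOA, a peek at its head (a quick check, not in the original) should show the earliest-acquired words:
# Earliest-acquired words under the hierarchical model.
head(dplyr::select(words, word, aoa))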
And plot again:
age_range <- 0:36
preds <- words %>%
group_by(word, category) %>%
do(data.frame(age = age_range,
pred.prop = boot::inv.logit(.$intercept + age_range * .$slope))) %>%
left_join(model.data) %>%
mutate(prop = n/N)
ggplot(filter(preds, word %in% target_words), aes(x = age, y = prop)) +
geom_point() +
facet_wrap(~word) +
geom_line(aes(x = age, y = pred.prop))
Check the histogram.
hglm.aoas <- words  # aoa is already -intercept / slope from above
qplot(aoa, geom = "blank", data = hglm.aoas) +
geom_histogram(aes(y = ..density..)) +
geom_density(col = "red")
What are the crazy ones?
hglm.aoas$word[hglm.aoas$aoa <0]
## [1] pattycake
## 396 Levels: brother sister night night play pen babysitter ... pattycake
hglm.aoas$word[hglm.aoas$aoa >50]
## [1] brother
## 396 Levels: brother sister night night play pen babysitter ... pattycake
Don’t know what’s going on with these, but everything else looks good.
Merge with previous data frame.
all.preds <- left_join(mms,
preds %>%
ungroup %>%
rename(hglm = pred.prop,
definition = word) %>%
dplyr::select(definition, age, hglm)) %>%
dplyr::select(-num_true, -num_false)
ggplot(all.preds, aes(x = age)) +
  geom_line(aes(y = glm, colour = "glm")) +
  geom_line(aes(y = bayes, colour = "bayes")) +
  geom_line(aes(y = robust, colour = "robust")) +
  geom_line(aes(y = hglm, colour = "hglm")) +
  geom_point(aes(y = prop), col = "black") +
  ylim(c(0, 1)) +
  facet_wrap(~definition) +
  # Colours mapped in aes() so the manual scale yields a legend.
  scale_colour_manual(values = c(glm = "red",
                                 bayes = "blue",
                                 robust = "green",
                                 hglm = "orange"),
                      name = "model")
aoas <- full_join(full_join(glm_aoas, full_join(bglm_aoas, rglm_aoas)), empirical_aoas) %>%
full_join(hglm.aoas %>% rename(definition = word,
hglm_aoa = aoa) %>%
dplyr::select(-intercept, -slope))
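Before plotting, a tally of finite estimates per method (an illustrative check, not in the original) shows how many words each approach actually covers:
# Count usable (finite, non-missing) AOA estimates for each method.
aoas %>%
  ungroup() %>%
  gather(measure, aoa, ends_with("aoa")) %>%
  group_by(measure) %>%
  summarise(n_finite = sum(is.finite(aoa)))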
Plot.
qplot(aoa,facets = ~ measure,
data = aoas %>% gather(measure, aoa, ends_with("aoa")))
And without the outliers.
qplot(aoa,facets = ~ measure,
data = aoas %>%
gather(measure, aoa, ends_with("aoa")) %>%
filter(aoa > 0 & aoa < 50))
And pairs plots.
pair_data <- aoas %>%
dplyr::select(ends_with("aoa")) %>%
mutate(empirical_aoa = ifelse(is.infinite(empirical_aoa),
NA, empirical_aoa)) %>%
filter(glm_aoa > 0, glm_aoa < 50, rglm_aoa > 0, rglm_aoa < 50)
GGally::ggpairs(pair_data)
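For a single-number summary of how well the methods agree (a quick check, not in the original):
# Pairwise Pearson correlations between AOA estimates, using
# pairwise-complete observations to handle missing values.
cor(pair_data, use = "pairwise.complete.obs")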
Fit models to predict each item’s age of acquisition.
Commented out for computation time reasons.
# fit_inst_measure_uni <- function(inst_measure_uni_data) {
# ages <- min(inst_measure_uni_data$age):max(inst_measure_uni_data$age)
# model <- glm(cbind(num_true, num_false) ~ age, family = "binomial",
# data = inst_measure_uni_data)
# fit <- predict(model, newdata = data.frame(age = ages), se.fit = TRUE)
# aoa <- -model$coefficients[["(Intercept)"]] / model$coefficients[["age"]]
# constants <- inst_measure_uni_data %>%
# select(language, form, measure, lexical_category, lexical_class, uni_lemma,
# words) %>%
# distinct()
# props <- inst_measure_uni_data %>%
# ungroup() %>%
# select(age, prop)
# data.frame(age = ages, fit_prop = inv.logit(fit$fit), fit_se = fit$se.fit,
# aoa = aoa, language = constants$language, form = constants$form,
# measure = constants$measure,
# lexical_category = constants$lexical_category,
# lexical_class = constants$lexical_class,
# uni_lemma = constants$uni_lemma, words = constants$words) %>%
# left_join(props)
# }
#
# fit_inst_measure <- function(inst_measure_data) {
# inst_measure_by_uni <- inst_measure_data %>%
# group_by(language, form, measure,
# lexical_category, lexical_class, uni_lemma,
# age, data_id) %>%
# summarise(uni_value = any(value),
# words = paste(definition, collapse = ", ")) %>%
# group_by(language, form, measure,
# lexical_category, lexical_class, uni_lemma, words,
# age) %>%
# summarise(num_true = sum(uni_value, na.rm = TRUE),
# num_false = n() - num_true,
# prop = mean(uni_value, na.rm = TRUE))
# inst_measure_by_uni %>%
# split(paste(.$lexical_category, .$lexical_class, .$uni_lemma)) %>%
# map(fit_inst_measure_uni) %>%
# bind_rows()
# }
#
# fit_inst <- function(inst_data) {
# inst_data %>%
# split(.$measure) %>%
# map(fit_inst_measure) %>%
# bind_rows()
# }
#
# prop_data <- map(raw_data, fit_inst)
# all_prop_data <- bind_rows(prop_data)
# #write_csv(all_prop_data, "app/all_prop_data.csv")
# aoa_data <- all_prop_data %>%
# select(language, form, measure, lexical_category, lexical_class, uni_lemma,
# words, aoa) %>%
# distinct() %>%
# filter(aoa > 5 & aoa < 31)
# #write_csv(aoa_data, "aoa_data.csv")
# ggplot(aoa_data, aes(x = aoa, fill = language)) +
# facet_wrap(~language) +
# geom_bar() +
# xlab("Age of Acquisition (months)") +
# ylab("Count") +
# scale_fill_solarized(guide = FALSE)