Primary analysis
Model selection
American English Data
Let’s look at curve shapes on the English data alone. We’ll start with means here but investigate medians below.
eng_vocab_data <- vocab_data %>%
  filter(language == "English") %>%
  group_by(momed, age) %>%
  summarise(production_prop = mean(production_prop, na.rm = TRUE),
            n = n()) %>%
  gather(measure, proportion, production_prop)
Start with a simple model and build.
mod <- glm(cbind(production, no_production) ~ age * momed,
           family = "binomial",
           data = filter(vocab_data, language == "English"))

kable(summary(mod)$coef, digits = 3)
|                            | Estimate | Std. Error |  z value | Pr(>\|z\|) |
|----------------------------|---------:|-----------:|---------:|-----------:|
| (Intercept)                |   -5.627 |      0.039 | -145.667 |      0.000 |
| age                        |    0.207 |      0.002 |  128.209 |      0.000 |
| momedSecondary             |    0.139 |      0.041 |    3.426 |      0.001 |
| momedCollege and Above     |   -0.440 |      0.040 |  -10.907 |      0.000 |
| age:momedSecondary         |    0.010 |      0.002 |    6.147 |      0.000 |
| age:momedCollege and Above |    0.044 |      0.002 |   26.209 |      0.000 |
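To make the log-odds scale a bit more concrete, here's a quick sketch that converts the (rounded) coefficients above into predicted proportions at a given age. I'm assuming the reference level is the lowest momed category, so treat these numbers as approximate illustrations only.

# Back-of-the-envelope check using the rounded coefficients from the table.
# plogis() maps log-odds to proportions.
age_months <- 24

# Reference (lowest) momed level:
plogis(-5.627 + 0.207 * age_months)                      # roughly 0.34

# College and Above (main-effect offset plus interaction slope):
plogis((-5.627 - 0.440) + (0.207 + 0.044) * age_months)  # roughly 0.49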
Plot.
eng_vocab_data <- eng_vocab_data %>%
  list %>%
  map_df(function(x) {
    x$preds <- predict(mod, newdata = x, type = "response")
    return(x)
  })

ggplot(eng_vocab_data,
       aes(x = age, y = proportion, colour = momed,
           label = momed)) +
  geom_point(aes(size = n), alpha = .3) +
  geom_line(aes(y = preds)) +
  scale_colour_solarized(name = "") +
  scale_size_continuous(range = c(.3, 3),
                        name = "N") +
  scale_x_continuous("Age (months)") +
  scale_y_continuous(name = "Mean Productive Vocabulary",
                     limits = c(0, 1)) +
  theme(legend.position = "bottom")
Note that this model doesn’t do badly, but it somewhat overshoots the observed proportions at the bottom of the age range.
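A quick numeric check of that impression, just as a sketch: compare observed and fitted proportions at the youngest ages, using the preds column computed above (the two-month window is arbitrary).

# Observed vs. fitted proportions in the youngest two months of data.
eng_vocab_data %>%
  filter(age <= min(age) + 2) %>%
  group_by(momed) %>%
  summarise(observed = mean(proportion),
            fitted = mean(preds))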
A larger model, now including a quadratic term.
mod <- glm(cbind(production, no_production) ~ age * momed +
             I(age^2) * momed,
           family = "binomial",
           data = filter(vocab_data, language == "English"))

eng_vocab_data <- eng_vocab_data %>%
  list %>%
  map_df(function(x) {
    x$preds <- predict(mod, newdata = x, type = "response")
    return(x)
  })

ggplot(eng_vocab_data,
       aes(x = age, y = proportion, colour = momed,
           label = momed)) +
  geom_point(aes(size = n), alpha = .3) +
  geom_line(aes(y = preds)) +
  scale_colour_solarized(name = "") +
  scale_size_continuous(range = c(.3, 3),
                        name = "N") +
  scale_x_continuous("Age (months)") +
  scale_y_continuous(name = "Mean Productive Vocabulary",
                     limits = c(0, 1)) +
  theme(legend.position = "bottom")
Let’s try treating the ordered momed predictor as continuous (numeric coding). This assumes linear spacing between the education levels, but it uses fewer parameters and so helps guard against overfitting. Note: I tried to use ordPens for this, but it is very inflexible as a package (e.g., it can’t handle the cbind(production, no_production) form for logistic regression).
vocab_data$momed_continuous <- scale(as.numeric(vocab_data$momed),
                                     center = TRUE, scale = FALSE)
eng_vocab_data$momed_continuous <- scale(as.numeric(eng_vocab_data$momed),
                                         center = TRUE, scale = FALSE)

mod_cont <- glm(cbind(production, no_production) ~ age * momed_continuous +
                  I(age^2) * momed_continuous,
                family = "binomial",
                data = filter(vocab_data, language == "English"))

eng_vocab_data <- eng_vocab_data %>%
  list %>%
  map_df(function(x) {
    x$preds <- predict(mod_cont, newdata = x, type = "response")
    return(x)
  })

ggplot(eng_vocab_data,
       aes(x = age, y = proportion, colour = momed,
           label = momed)) +
  geom_point(aes(size = n), alpha = .3) +
  geom_line(aes(y = preds)) +
  scale_colour_solarized(name = "") +
  scale_size_continuous(range = c(.3, 3),
                        name = "N") +
  scale_x_continuous("Age (months)") +
  scale_y_continuous(name = "Mean Productive Vocabulary",
                     limits = c(0, 1)) +
  theme(legend.position = "bottom")
This is more parsimonious, but empirically it does look like it overshoots the below-secondary category by a lot. This model looks better with four momed levels; with three it’s clearly wrong.
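One way to put a number on the parsimony/fit tradeoff, as a sketch: here mod is the quadratic model with momed as a factor and mod_cont the continuous-coding version, both fit to the same English data.

# Compare factor-coded vs. continuous-coded quadratic models.
# Lower AIC means a better penalized fit; the df column shows how many
# parameters the continuous coding saves.
AIC(mod, mod_cont)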
eng_vocab_data$momed_continuous <- scale(as.numeric(eng_vocab_data$momed),
                                         center = TRUE, scale = FALSE) # not sure why but you have to do this again

mod_cont_cubic <- glm(cbind(production, no_production) ~ age * momed +
                        I(age^2) * momed + I(age^3) * momed,
                      family = "binomial",
                      data = filter(vocab_data, language == "English"))

eng_vocab_data <- eng_vocab_data %>%
  list %>%
  map_df(function(x) {
    x$preds <- predict(mod_cont_cubic, newdata = x, type = "response")
    return(x)
  })

ggplot(eng_vocab_data,
       aes(x = age, y = proportion, colour = momed,
           label = momed)) +
  geom_point(aes(size = n), alpha = .3) +
  geom_line(aes(y = preds)) +
  scale_colour_solarized(name = "") +
  scale_size_continuous(range = c(.3, 3),
                        name = "N") +
  scale_x_continuous("Age (months)") +
  scale_y_continuous(name = "Mean Productive Vocabulary",
                     limits = c(0, 1)) +
  theme(legend.position = "bottom")
The cubic increases fit because we have so much data (with this many observations almost any extra term will), but I’m not convinced it’s capturing anything meaningful.
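For completeness, a likelihood-ratio comparison of the quadratic and cubic factor models, just as a sketch; with this much data even a tiny improvement will come out significant, so I’d weight the visual checks more heavily.

# Nested comparison: quadratic vs. cubic (both with momed as a factor).
# Interpret the size of the deviance change, not just the p-value.
anova(mod, mod_cont_cubic, test = "Chisq")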
Median-based
There are two variants on this analysis that we can use: mean-based and median-based. The median-based variant is more robust to kid-to-kid variability (especially at the low end, where a few outliers can throw off estimates), but the mean-based statistics allow for easier fitting of mixed models across languages. Both variants share the same model specification; the - momed term drops the main effect of momed, so the groups share an intercept and differ only in their age trajectories.

model_spec <- formula(cbind(production, no_production) ~ age * momed + I(age^2) * momed - momed)
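A quick way to confirm what that formula produces, just as an inspection sketch (assuming vocab_data is in memory): look at the column names of the design matrix.

# There should be no stand-alone momed columns, only age and age^2 terms
# plus their interactions with momed.
colnames(model.matrix(model_spec, data = filter(vocab_data, language == "English")))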
overall_vocab_data_median <- vocab_data %>%
  group_by(language, momed, age) %>%
  summarise(production_prop = median(production_prop, na.rm = TRUE),
            n = n()) %>%
  mutate(language_name = factor(language,
                                levels = c("English", "Hebrew", "Spanish",
                                           "Norwegian", "Danish"),
                                labels = c("United States", "Israel", "Mexico",
                                           "Norway", "Denmark")),
         language_n = paste0(language_name, " (N=", ns[ns$language == language[1], 2], ")"))

vocab_models_median <- vocab_data %>%
  split(.$language) %>%
  map(function(x) {
    robustbase::glmrob(model_spec,
                       family = "binomial",
                       data = x)
  })

overall_vocab_data_median <- overall_vocab_data_median %>%
  split(.$language) %>%
  map_df(function(x) {
    x$preds <- predict(vocab_models_median[[x$language[1]]],
                       newdata = x, type = "response")
    return(x)
  })

ggplot(overall_vocab_data_median,
       aes(x = age, y = production_prop, colour = momed,
           label = momed)) +
  facet_wrap(~ language_n) +
  geom_point(aes(size = n), alpha = .3) +
  geom_line(aes(y = preds)) +
  scale_colour_solarized(name = "") +
  scale_size_continuous(range = c(.3, 3),
                        name = "N") +
  scale_x_continuous("Age (months)") +
  scale_y_continuous(name = "Median Productive Vocabulary",
                     limits = c(0, 1)) +
  theme(legend.position = "bottom")
Mean-based
overall_vocab_data_mean <- vocab_data %>%
  group_by(language, momed, age) %>%
  summarise(production_prop = mean(production_prop, na.rm = TRUE),
            n = n()) %>%
  mutate(language_name = factor(language,
                                levels = c("English", "Hebrew", "Spanish",
                                           "Norwegian", "Danish"),
                                labels = c("United States", "Israel", "Mexico",
                                           "Norway", "Denmark")),
         language_n = paste0(language_name, " (N=", ns[ns$language == language[1], 2], ")"))

vocab_models_mean <- vocab_data %>%
  split(.$language) %>%
  map(function(x) {
    glm(model_spec,
        family = "binomial",
        data = x)
  })

overall_vocab_data_mean <- overall_vocab_data_mean %>%
  split(.$language) %>%
  map_df(function(x) {
    x$preds <- predict(vocab_models_mean[[x$language[1]]],
                       newdata = x, type = "response")
    return(x)
  })

ggplot(overall_vocab_data_mean,
       aes(x = age, y = production_prop, colour = momed,
           label = momed)) +
  facet_wrap(~ language_n) +
  geom_point(aes(size = n), alpha = .3) +
  geom_line(aes(y = preds)) +
  scale_colour_solarized(name = "") +
  scale_size_continuous(range = c(.3, 3),
                        name = "N") +
  scale_x_continuous("Age (months)") +
  scale_y_continuous(name = "Mean Productive Vocabulary",
                     limits = c(0, 1)) +
  theme(legend.position = "bottom")
We’re going to adopt the median approach. The means feel wrong: they show a much more gradual emergence because substantial outliers get averaged in (e.g., a few high scorers when the mode is 0 at the bottom of the age range, or a few low scorers when the mode is high at the top).
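To see where the two summaries diverge most, a quick sketch (assuming vocab_data is still in memory) comparing mean and median proportions within each language/momed/age cell:

# Large mean-median gaps at the youngest and oldest ages would be consistent
# with outliers dragging the mean around.
vocab_data %>%
  group_by(language, momed, age) %>%
  summarise(mean_prop = mean(production_prop, na.rm = TRUE),
            median_prop = median(production_prop, na.rm = TRUE)) %>%
  mutate(gap = mean_prop - median_prop) %>%
  arrange(desc(abs(gap))) %>%
  head(10)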
CIs
Inspired by https://rpubs.com/aammd/boot_regression_dplyr.
n_rep <- 50

means <- vocab_data %>%
  group_by(language, momed, age) %>%
  summarise(production_prop = mean(production_prop, na.rm = TRUE),
            n = n())

do_sim <- function(vocab_data, means, sim_num) {
  # Resample the data with replacement within each language, refit the
  # robust model per language, then predict on the summary grid.
  vocab_models <- vocab_data %>%
    split(.$language) %>%
    map_df(function(x) sample_frac(x, replace = TRUE)) %>%
    split(.$language) %>%
    map(function(x) robustbase::glmrob(model_spec, family = "binomial", data = x))

  means %>%
    mutate(sim_num = sim_num) %>%
    split(.$language) %>%
    map_df(function(x) {
      x$preds <- predict(vocab_models[[x$language[1]]],
                         newdata = x, type = "response")
      return(x)
    })
}

# Run n_rep bootstrap replicates.
sims <- data.frame(sim_num = 1:n_rep) %>%
  rowwise() %>%
  do(do_sim(vocab_data, means, .$sim_num))
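One note: sample_frac draws differently on every run, so the simulated curves (and hence the CIs below) will wiggle from knit to knit. Setting a seed before building sims makes them reproducible; the particular seed value here is arbitrary.

# Any fixed seed works; this just pins down the bootstrap resamples.
set.seed(42)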
Plot variability.
ggplot(sims,
       aes(x = age, y = preds, colour = momed,
           group = interaction(momed, sim_num),
           label = momed)) +
  facet_wrap(~ language) +
  geom_line(alpha = .1) +
  scale_colour_solarized(name = "") +
  scale_x_continuous("Age (months)") +
  scale_y_continuous(name = "Mean Productive Vocabulary",
                     limits = c(0, 1)) +
  theme(legend.position = "bottom")
Plot ribbons.
ms <- sims %>%
  group_by(language, age, momed) %>%
  summarise(lower = quantile(preds, .025),
            upper = quantile(preds, .975))

ggplot(ms,
       aes(x = age, ymin = lower, ymax = upper, fill = momed)) +
  facet_wrap(~ language) +
  geom_ribbon(alpha = .5) +
  scale_fill_solarized(name = "") +
  scale_x_continuous("Age (months)") +
  scale_y_continuous(name = "Mean Productive Vocabulary",
                     limits = c(0, 1)) +
  theme(legend.position = "bottom")