Load required libraries.
library(wordbankr)
library(langcog)
library(dplyr)
library(ggplot2)
library(directlabels)
library(purrr)
library(tidyr)
theme_set(theme_mikabr())
font <- theme_mikabr()$text$family
mode <- "remote"
Get administration data and filter to administrations of Words & Sentences that have momed coded.
vocab_admins <- get_administration_data(mode = mode, original_ids=TRUE) %>%
rename(momed = mom_ed) %>%
filter(!is.na(momed))
Get first admins.
first_longitudinals <- vocab_admins %>%
filter(longitudinal) %>%
group_by(source_name, original_id) %>%
arrange(age) %>%
slice(1)
vocab_admins <- vocab_admins %>%
filter(!longitudinal |
(longitudinal & (data_id %in% first_longitudinals$data_id)))
Get item information to find the number of items on each language’s form.
num_words <- get_item_data(mode = mode) %>%
filter(type == "word") %>%
group_by(language, form) %>%
summarise(n = n())
Normalize productive vocabulary size as a proportion of items and calculate median vocabulary size for each language, momed level, and age.
vocab_data <- vocab_admins %>%
left_join(num_words) %>%
filter(form %in% c("WG","WS","TEDS Twos")) %>%
mutate(production = as.numeric(production),
no_production = n - production,
comprehension = as.numeric(comprehension),
production_prop = as.numeric(production) / n,
comprehension_prop = ifelse(form == "WG",
as.numeric(comprehension) / n, NA),
momed = `levels<-`(momed,
list("None, Primary, Some Secondary" = "None",
"None, Primary, Some Secondary" = "Primary",
"None, Primary, Some Secondary" = "Some Secondary",
"Secondary" = "Secondary",
"College" = "Some College",
"College" = "College",
"Graduate" = "Some Graduate",
"Graduate" = "Graduate")))
Included languages.
included_languages <- c("Danish","English","Spanish","Norwegian","Hebrew", "English (British)")
Get Ns.
vocab_data %>%
filter(language %in% included_languages) %>%
group_by(language) %>%
summarise(n = n()) %>%
kable
language | n |
---|---|
Danish | 3640 |
English | 3792 |
English (British) | 10925 |
Hebrew | 313 |
Norwegian | 7427 |
Spanish | 1068 |
Get means.
overall_vocab_data <- vocab_data %>%
group_by(language, momed, age) %>%
summarise(production_prop = median(production_prop, na.rm=TRUE),
comprehension_prop = median(comprehension_prop, na.rm=TRUE),
n = n()) %>%
gather(measure, proportion, production_prop, comprehension_prop)
Subsidiary data for plots.
ns <- filter(overall_vocab_data,
language %in% included_languages,
measure == "production_prop") %>%
group_by(language) %>%
summarize(n = sum(n))
Fit model and plot.
vocab_models <- vocab_data %>%
split(.$language) %>%
map(function(x) {robustbase::glmrob(cbind(production, no_production) ~ age * momed +
I(age^2) - momed,
family = "binomial",
data = x)})
overall_vocab_data <- overall_vocab_data %>%
split(.$language) %>%
map_df(function(x) {
x$preds <- predict(vocab_models[[x$language[1]]],
newdata = x, type = "response")
return(x)
})
plot_data <- filter(overall_vocab_data,
language %in% included_languages,
measure == "production_prop") %>%
mutate(language_name = factor(language,
levels = c("English", "Hebrew", "Spanish",
"Norwegian", "Danish", "English (British)"),
labels = c("United States", "Israel", "Mexico",
"Norway", "Denmark", "United Kingdom")),
language_n = paste0(language_name," (N=", ns[ns$language==language[1],2] ,")"))
ggplot(plot_data,
aes(x = age, y = proportion, colour = momed,
label = momed)) +
facet_wrap( ~ language_n) +
geom_point(aes(size = n), alpha = .3) +
geom_line(aes(y = preds)) +
scale_colour_solarized(name = "") +
scale_size_continuous(range = c(.3, 3),
name = "N") +
scale_x_continuous("Age (months)") +
scale_y_continuous(name = "Median Productive Vocabulary",
limits = c(0,1)) +
theme(legend.position = "bottom")
Simple percentage gain (ratio of percentiles).
d_mean <- overall_vocab_data %>%
ungroup() %>%
filter(language %in% included_languages,
measure == "production_prop") %>%
select(-proportion, -measure, -n) %>%
mutate(momed = factor(momed, levels = c("None, Primary, Some Secondary",
"Secondary",
"College",
"Graduate"),
labels = c("NPSS","S","CSC","GSG"))) %>%
spread(momed, preds) %>%
mutate(r1 = S/NPSS,
r2 = CSC/S,
r3 = GSG/CSC) %>%
mutate(r_mean = rowMeans(cbind(r1, r2, r3), na.rm=TRUE))
ggplot(d_mean,
aes(x = age, y = r_mean, col = language)) +
geom_point()
Summary.
(Note missing data for Hebrew before 19 mo).
d_mean %>%
group_by(language) %>%
filter(!(language == "Hebrew" & age < 19)) %>%
summarise(max_pct_diff = max(r_mean, na.rm=TRUE),
mean_pct_diff = mean(r_mean, na.rm=TRUE),
secondary_pct_diff = mean(r1, na.rm=TRUE)) %>%
dplyr::arrange(mean_pct_diff) %>%
kable(digits=2)
language | max_pct_diff | mean_pct_diff | secondary_pct_diff |
---|---|---|---|
English (British) | 1.06 | 1.04 | 1.02 |
Danish | 1.07 | 1.05 | 1.23 |
Norwegian | 1.12 | 1.08 | 1.18 |
Spanish | 1.17 | 1.15 | 1.13 |
English | 1.26 | 1.20 | 1.39 |
Hebrew | 1.28 | 1.26 | 1.82 |
Plot these.
d_mean %>%
select(-starts_with("d"), -NPSS, -S, -CSC, -GSG, -r_mean) %>%
gather(momed, preds, starts_with("r")) %>%
ggplot(aes(x = age, y = preds, col = momed)) +
geom_point() +
facet_wrap(~language)