Load required libraries.

library(wordbankr)
library(langcog)
library(dplyr)
library(ggplot2)
library(directlabels)
library(purrr)
library(tidyr)
theme_set(theme_mikabr())
font <- theme_mikabr()$text$family
mode <- "remote"

Get administration data and filter to administrations of Words & Sentences that have momed coded.

vocab_admins <- get_administration_data(mode = mode, original_ids=TRUE) %>%
  rename(momed = mom_ed) %>%
  filter(!is.na(momed))

Get first admins.

first_longitudinals <- vocab_admins %>%
  filter(longitudinal) %>%
  group_by(source_name, original_id) %>%
  arrange(age) %>%
  slice(1)

vocab_admins <- vocab_admins %>%
      filter(!longitudinal |
               (longitudinal & (data_id %in% first_longitudinals$data_id)))

Get item information to find the number of items on each language’s form.

num_words <- get_item_data(mode = mode) %>%
  filter(type == "word") %>%
  group_by(language, form) %>%
  summarise(n = n())

Normalize productive vocabulary size as a proportion of items and calculate median vocabulary size for each language, momed level, and age.

vocab_data <- vocab_admins %>%
  left_join(num_words) %>%
  filter(form %in% c("WG","WS","TEDS Twos")) %>%
  mutate(production = as.numeric(production), 
         no_production = n - production, 
         comprehension = as.numeric(comprehension),
         production_prop = as.numeric(production) / n,
         comprehension_prop = ifelse(form == "WG", 
                                     as.numeric(comprehension) / n, NA),
         momed = `levels<-`(momed,
                            list("None, Primary, Some Secondary" = "None",
                                 "None, Primary, Some Secondary" = "Primary",
                                 "None, Primary, Some Secondary" = "Some Secondary",
                                 "Secondary" = "Secondary",
                                 "College" = "Some College",
                                 "College" = "College",
                                 "Graduate" = "Some Graduate",
                                 "Graduate" = "Graduate")))

Included languages.

included_languages <- c("Danish","English","Spanish","Norwegian","Hebrew", "English (British)")

Get Ns.

vocab_data %>%
  filter(language %in% included_languages) %>%
  group_by(language) %>%
  summarise(n = n()) %>%
  kable
language n
Danish 3640
English 3792
English (British) 10925
Hebrew 313
Norwegian 7427
Spanish 1068

Plotting

Get means.

overall_vocab_data <- vocab_data %>%
  group_by(language, momed, age) %>%
  summarise(production_prop = median(production_prop, na.rm=TRUE), 
            comprehension_prop = median(comprehension_prop, na.rm=TRUE), 
            n = n()) %>%
  gather(measure, proportion, production_prop, comprehension_prop)

Subsidiary data for plots.

ns <- filter(overall_vocab_data, 
             language %in% included_languages, 
             measure == "production_prop") %>% 
  group_by(language) %>% 
  summarize(n = sum(n))

Fit model and plot.

vocab_models <- vocab_data %>%
  split(.$language) %>%
  map(function(x) {robustbase::glmrob(cbind(production, no_production) ~ age * momed + 
                                        I(age^2)  - momed,
                             family = "binomial",
                             data = x)}) 

overall_vocab_data <- overall_vocab_data %>%
  split(.$language) %>%
  map_df(function(x) {
    x$preds <- predict(vocab_models[[x$language[1]]], 
                       newdata = x, type = "response")
    return(x)
  })
                
plot_data <- filter(overall_vocab_data, 
                    language %in% included_languages, 
                    measure == "production_prop") %>%
mutate(language_name = factor(language, 
                              levels = c("English", "Hebrew", "Spanish",
                                         "Norwegian", "Danish", "English (British)"),
                              labels = c("United States", "Israel", "Mexico", 
                                         "Norway", "Denmark", "United Kingdom")),
       language_n = paste0(language_name," (N=", ns[ns$language==language[1],2] ,")"))

ggplot(plot_data,
       aes(x = age, y = proportion, colour = momed, 
           label = momed)) +
  facet_wrap( ~ language_n) +
  geom_point(aes(size = n), alpha = .3) +
  geom_line(aes(y = preds)) +
  scale_colour_solarized(name = "") +
  scale_size_continuous(range = c(.3, 3), 
                        name = "N") + 
  scale_x_continuous("Age (months)") +
  scale_y_continuous(name = "Median Productive Vocabulary",
                     limits = c(0,1)) +
  theme(legend.position = "bottom")

Effect size

Simple percentage gain (ratio of percentiles).

d_mean <- overall_vocab_data %>%
  ungroup() %>%
  filter(language %in% included_languages, 
         measure == "production_prop") %>%
  select(-proportion, -measure, -n) %>%
  mutate(momed = factor(momed, levels = c("None, Primary, Some Secondary",
                                   "Secondary",
                                   "College", 
                                   "Graduate"), 
                        labels = c("NPSS","S","CSC","GSG"))) %>%
  spread(momed, preds) %>%
  mutate(r1 = S/NPSS,
         r2 = CSC/S,
         r3 = GSG/CSC) %>%
  mutate(r_mean = rowMeans(cbind(r1, r2, r3), na.rm=TRUE))


ggplot(d_mean, 
       aes(x = age, y = r_mean, col = language)) +
  geom_point()

Summary.

(Note missing data for Hebrew before 19 mo).

d_mean %>%
  group_by(language) %>%
  filter(!(language == "Hebrew" & age < 19)) %>%
  summarise(max_pct_diff = max(r_mean, na.rm=TRUE), 
            mean_pct_diff = mean(r_mean, na.rm=TRUE), 
            secondary_pct_diff = mean(r1, na.rm=TRUE)) %>%
  dplyr::arrange(mean_pct_diff) %>%
  kable(digits=2)
language max_pct_diff mean_pct_diff secondary_pct_diff
English (British) 1.06 1.04 1.02
Danish 1.07 1.05 1.23
Norwegian 1.12 1.08 1.18
Spanish 1.17 1.15 1.13
English 1.26 1.20 1.39
Hebrew 1.28 1.26 1.82

Plot these.

d_mean %>%
  select(-starts_with("d"), -NPSS, -S, -CSC, -GSG, -r_mean) %>%
  gather(momed, preds, starts_with("r")) %>%
  ggplot(aes(x = age, y = preds, col = momed)) + 
  geom_point() + 
  facet_wrap(~language)