Notes

This document describes analyses of the relationship between maternal education and parent-reported productive vocabulary across languages.

Outstanding items:

Done:

Summary: I think this is important and interesting and deserves to be out there. I’ve thought more about the effect size question and I think we are doing something reasonable. My major concerns:

Note that the most recent worldbank GINI coefficients for the different countries are as follows:

Preliminaries

Load required libraries.

library(tidyverse)
library(wordbankr)
library(langcog)
library(scales)
library(forcats)
library(broom)

theme_set(theme_mikabr())
font <- theme_mikabr()$text$family
mode <- "remote"

Data

Set what languages we include.

included_languages <- c("Danish", "English", "Spanish", "Norwegian", "Hebrew") 
  #, "Hebrew", "English (British)")
kable(data.frame(included_languages))

included_languages

Danish
English
Spanish
Norwegian
Hebrew

Get administration data and filter to administrations of Words & Sentences that have momed coded.

vocab_admins <- get_administration_data(mode = mode, 
                                        original_ids=TRUE) %>%
  rename(momed = mom_ed) %>%
  filter(!is.na(momed), 
         language %in% included_languages)

Filter these data to ensure that we have the first administration only for longitudinal data.

first_longitudinals <- vocab_admins %>%
  filter(longitudinal) %>%
  group_by(source_name, original_id) %>%
  arrange(age) %>%
  slice(1)

vocab_admins <- vocab_admins %>%
  filter(!longitudinal |
           (longitudinal & (data_id %in% first_longitudinals$data_id)))

Get item information to find the number of items on each language’s form.

num_words <- get_item_data(mode = mode) %>%
  filter(type == "word", 
         language %in% included_languages) %>%
  group_by(language, form) %>%
  summarise(n = n())

kable(num_words)
language form n
Danish WG 410
Danish WS 725
English WG 396
English WS 680
Hebrew WG 443
Hebrew WS 622
Norwegian WG 395
Norwegian WS 731
Spanish WG 428
Spanish WS 680

Make the main data frame.

Normalize productive vocabulary size as a proportion of items and calculate median vocabulary size for each language, momed level, and age.

vocab_data <- vocab_admins %>%
  left_join(num_words) %>%
  filter(form %in% c("WG","WS","TEDS Twos")) %>%
  mutate(production = as.numeric(production), 
         no_production = n - production, 
         comprehension = as.numeric(comprehension),
         production_prop = as.numeric(production) / n,
         comprehension_prop = ifelse(form == "WG", 
                                     as.numeric(comprehension) / n, NA))

There are a total of 16240 children in this analysis, across 0 languages.

Momed distributions and releveling

Examine momed distributions in the data, prior to releveling.

momeds <- vocab_data %>%
  group_by(language, momed) %>%
  summarise(ns = n()) %>%
  mutate(prop = ns / sum(ns))

ggplot(momeds, aes(x = momed, fill = momed, y = prop)) + 
  geom_bar(stat="identity") + 
  geom_text(aes(label=ns), vjust = -.25) + 
  facet_wrap(~language) + 
  scale_y_continuous(labels = percent, lim = c(0,1)) +
  scale_fill_solarized(guide=FALSE) + 
  xlab("Mother's Education Level") + 
  ylab("") + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5)) 

Note there is an error in the TEDS data momed coding:

  • 1: no qualifications
  • 2: CSE grade 2-5 or O-level/GCSE grade D-G
  • 3: CSE grade 1 or O-level/GCSE grade A-C
  • 4: A-level or S-level
  • 5: HNC
  • 6: HND
  • 7: undergraduate
  • 8: postgraduate

Right now, 2 is coded as primary and 3 was some secondary. This is incorrect - 2 and 3 denote different grades in different secondary exit exams. In particular, both should be coded as secondary.

Let’s fix that.

vocab_data$momed[vocab_data$language == "English (British)" &
                   vocab_data$momed == "Primary"] <- "Secondary"

vocab_data$momed[vocab_data$language == "English (British)" &
                   vocab_data$momed == "Some Secondary"] <- "Secondary"

We also have the issue that Spanish should have some college be college.

vocab_data$momed[vocab_data$language == "Spanish" &
                   vocab_data$momed == "Some College"] <- "College"

One possible releveling.

vocab_data$momed_original <- vocab_data$momed
vocab_data$momed <- fct_recode(vocab_data$momed,
                               "Below Secondary" = "None",
                               "Below Secondary" = "Some Secondary",
                               "Below Secondary" = "Primary",
                               "Secondary" = "Secondary",
                               "Secondary" = "Some College",
                               "College and Above" = "College",
                               "College and Above" = "Some Graduate",
                               "College and Above" = "Graduate")

This evens things out somewhat across languages.

momeds <- vocab_data %>%
  group_by(language, momed) %>%
  summarise(ns = n()) %>%
  mutate(prop = ns / sum(ns))

ggplot(momeds, aes(x = momed, fill = momed, y = prop)) + 
  geom_bar(stat="identity") + 
  geom_text(aes(label=ns), vjust = -.25) + 
  facet_wrap(~language) + 
  scale_y_continuous(labels = percent, lim = c(0,1)) +
  scale_fill_solarized(guide=FALSE) + 
  xlab("Mother's Education Level") + 
  ylab("") + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5)) 

Age distribution

What’s the age distribution across languages?

ggplot(vocab_data, 
       aes(x = age, fill = form)) + 
  geom_bar() + 
  facet_wrap(~language) + 
  scale_y_log10(breaks=c(10,100,1000)) + 
  scale_fill_solarized()

Comparison of absolute differences

Let’s look at absolute differences between languages.

overall_vocab_data_median <- vocab_data %>%
  group_by(language, age) %>%
  summarise(production_prop = median(production_prop, na.rm=TRUE), 
            n = n()) 

ggplot(overall_vocab_data_median,
       aes(x = age, y = production_prop, colour = language)) +
  geom_point(aes(size = n), alpha = .3) +
  geom_smooth(se = FALSE) + 
  scale_colour_solarized(name = "") +
  scale_size_continuous(range = c(.3, 3), 
                        name = "N") + 
  scale_x_continuous("Age (months)") +
  scale_y_continuous(name = "Median Productive Vocabulary",
                     limits = c(0,1)) +
  theme(legend.position = "bottom")

The major note here is that Danish and Spanish are low in absolute terms, while the other languages seem largely on the same trajectory (perhaps Hebrew is a bit high). These are likely caused at least in part by the relative makeup of the forms - some designers ended up with easier and harder forms. Plus the number of items differs susbstantially between forms:

ns <- vocab_data %>%
  group_by(language) %>%
  summarise(n = n())

kable(ns)
language n
Danish 3640
English 3792
Hebrew 313
Norwegian 7427
Spanish 1068

In sum, only relative differences here are really interpretable, and we should be a little careful even with those given the different parts of the curve that are represented in different datasets.

Primary analysis

Model selection

American English Data

Let’s look at curve shapes on the English data alone. We’ll start with means here but investigate medians below.

eng_vocab_data <- vocab_data %>%
  filter(language == "English") %>%
  group_by(momed, age) %>%
  summarise(production_prop = mean(production_prop, na.rm=TRUE), 
            n = n()) %>%
  gather(measure, proportion, production_prop)

Start with a simple model and build.

mod <- glm(cbind(production, no_production) ~ age * momed,
                          family = "binomial",
                          data = filter(vocab_data, language == "English"))
kable(summary(mod)$coef, digits=3)
Estimate Std. Error z value Pr(>|z|)
(Intercept) -5.627 0.039 -145.667 0.000
age 0.207 0.002 128.209 0.000
momedSecondary 0.139 0.041 3.426 0.001
momedCollege and Above -0.440 0.040 -10.907 0.000
age:momedSecondary 0.010 0.002 6.147 0.000
age:momedCollege and Above 0.044 0.002 26.209 0.000

Plot.

eng_vocab_data <- eng_vocab_data %>%
  list %>%
  map_df(function(x) {
    x$preds <- predict(mod, newdata = x, type = "response")
    return(x)
  })

ggplot(eng_vocab_data,
       aes(x = age, y = proportion, colour = momed, 
           label = momed)) +
  geom_point(aes(size = n), alpha = .3) +
  geom_line(aes(y = preds)) +
  scale_colour_solarized(name = "") +
  scale_size_continuous(range = c(.3, 3), 
                        name = "N") + 
  scale_x_continuous("Age (months)") +
  scale_y_continuous(name = "Mean Productive Vocabulary",
                     limits = c(0,1)) +
  theme(legend.position = "bottom")

Note that this model doesn’t do badly, but it somewhat overshoots down at the bottom.

A larger model, now including a quadratic term.

mod <- glm(cbind(production, no_production) ~ age * momed + 
                            I(age^2) * momed,
                          family = "binomial",
                          data = filter(vocab_data, language == "English"))

eng_vocab_data <- eng_vocab_data %>%
  list %>%
  map_df(function(x) {
    x$preds <- predict(mod, newdata = x, type = "response")
    return(x)
  })

ggplot(eng_vocab_data,
       aes(x = age, y = proportion, colour = momed, 
           label = momed)) +
  geom_point(aes(size = n), alpha = .3) +
  geom_line(aes(y = preds)) +
  scale_colour_solarized(name = "") +
  scale_size_continuous(range = c(.3, 3), 
                        name = "N") + 
  scale_x_continuous("Age (months)") +
  scale_y_continuous(name = "Mean Productive Vocabulary",
                     limits = c(0,1)) +
  theme(legend.position = "bottom")

Let’s try ordering the predictors - this assumes linear spacing between them but will deal with overfitting. Note: I tried to use ordPens for this, but it is very inflexible as a package (e.g., can’t deal with the cbind(production, no_production) form for logistic).

vocab_data$momed_continuous <- scale(as.numeric(vocab_data$momed), 
                                     center=TRUE, scale=FALSE)
eng_vocab_data$momed_continuous <- scale(as.numeric(eng_vocab_data$momed), 
                                     center=TRUE, scale=FALSE)


mod_cont <- glm(cbind(production, no_production) ~ age * momed_continuous + 
                            I(age^2) * momed_continuous,
                          family = "binomial",
                          data = filter(vocab_data, language=="English"))

eng_vocab_data <- eng_vocab_data %>%
  list %>%
  map_df(function(x) {
    x$preds <- predict(mod_cont, newdata = x, type = "response")
    return(x)
  })

ggplot(eng_vocab_data,
       aes(x = age, y = proportion, colour = momed, 
           label = momed)) +
  geom_point(aes(size = n), alpha = .3) +
  geom_line(aes(y = preds)) +
  scale_colour_solarized(name = "") +
  scale_size_continuous(range = c(.3, 3), 
                        name = "N") + 
  scale_x_continuous("Age (months)") +
  scale_y_continuous(name = "Mean Productive Vocabulary",
                     limits = c(0,1)) +
  theme(legend.position = "bottom")

This is more parsimonious, but does empirically look like it overshoots the below secondary category by a lot. This model looks better with four momed levels - with three it’s clearly wrong.

eng_vocab_data$momed_continuous <- scale(as.numeric(eng_vocab_data$momed), 
                                     center=TRUE, scale=FALSE) # not sure why but you have to do this again.

mod_cont_cubic <- glm(cbind(production, no_production) ~ age * momed +
                            I(age^2) * momed + I(age^3) * momed ,
                          family = "binomial",
                          data = filter(vocab_data, language=="English"))

eng_vocab_data <- eng_vocab_data %>%
  list %>%
  map_df(function(x) {
    x$preds <- predict(mod_cont_cubic, newdata = x, type = "response")
    return(x)
  })

ggplot(eng_vocab_data,
       aes(x = age, y = proportion, colour = momed, 
           label = momed)) +
  geom_point(aes(size = n), alpha = .3) +
  geom_line(aes(y = preds)) +
  scale_colour_solarized(name = "") +
  scale_size_continuous(range = c(.3, 3), 
                        name = "N") + 
  scale_x_continuous("Age (months)") +
  scale_y_continuous(name = "Mean Productive Vocabulary",
                     limits = c(0,1)) +
  theme(legend.position = "bottom")

The cubic increases fit because we have so much data, but I’m not convinced.

Median-based

model_spec <- formula(cbind(production, no_production) ~ age * momed  + I(age^2) * momed - momed)

There are two variants on this analysis that we can use: mean-based and median-based. The median-based variant is a bit more robust across kid-wise variability (especially at the low end, where a few outliers can throw off estimates), but the mean-based statistics allow for easier fitting of mixed models across languages.

overall_vocab_data_median <- vocab_data %>%
  group_by(language, momed, age) %>%
  summarise(production_prop = median(production_prop, na.rm=TRUE), 
            n = n()) %>%
  mutate(language_name = factor(language, 
                                levels = c("English", "Hebrew", "Spanish",
                                           "Norwegian", "Danish"),
                                labels = c("United States", "Israel", "Mexico", 
                                           "Norway", "Denmark")),
         language_n = paste0(language_name," (N=", ns[ns$language==language[1],2] ,")"))

vocab_models_median <- vocab_data %>%
  split(.$language) %>%
  map(function(x) {robustbase::glmrob(model_spec,
                                      family = "binomial",
                                      data = x)}) 

overall_vocab_data_median <- overall_vocab_data_median %>%
  split(.$language) %>%
  map_df(function(x) {
    x$preds <- predict(vocab_models_median[[x$language[1]]], 
                       newdata = x, type = "response")
    return(x)
  })

ggplot(overall_vocab_data_median,
       aes(x = age, y = production_prop, colour = momed, 
           label = momed)) +
  facet_wrap( ~ language_n) +
  geom_point(aes(size = n), alpha = .3) +
  geom_line(aes(y = preds)) +
  scale_colour_solarized(name = "") +
  scale_size_continuous(range = c(.3, 3), 
                        name = "N") + 
  scale_x_continuous("Age (months)") +
  scale_y_continuous(name = "Median Productive Vocabulary",
                     limits = c(0,1)) +
  theme(legend.position = "bottom")

Mean-based

overall_vocab_data_mean <- vocab_data %>%
  group_by(language, momed, age) %>%
  summarise(production_prop = mean(production_prop, na.rm=TRUE), 
            n = n()) %>%
  mutate(language_name = factor(language, 
                                levels = c("English", "Hebrew", "Spanish",
                                           "Norwegian", "Danish"),
                                labels = c("United States", "Israel", "Mexico", 
                                           "Norway", "Denmark")),
         language_n = paste0(language_name," (N=", ns[ns$language==language[1],2] ,")"))

vocab_models_mean <- vocab_data %>%
  split(.$language) %>%
  map(function(x) {glm(model_spec,
                       family = "binomial",
                       data = x)}) 

overall_vocab_data_mean <- overall_vocab_data_mean %>%
  split(.$language) %>%
  map_df(function(x) {
    x$preds <- predict(vocab_models_mean[[x$language[1]]], 
                       newdata = x, type = "response")
    return(x)
  })

ggplot(overall_vocab_data_mean,
       aes(x = age, y = production_prop, colour = momed, 
           label = momed)) +
  facet_wrap( ~ language_n) +
  geom_point(aes(size = n), alpha = .3) +
  geom_line(aes(y = preds)) +
  scale_colour_solarized(name = "") +
  scale_size_continuous(range = c(.3, 3), 
                        name = "N") + 
  scale_x_continuous("Age (months)") +
  scale_y_continuous(name = "Mean Productive Vocabulary",
                     limits = c(0,1)) +
  theme(legend.position = "bottom")

We’re going to adopt the median approach. The means feel like they are wrong because they show a much more gradual emergence due to the averaging-in of substantial outliers (e.g., when the mode is 0 at the bottom or when the mode is high at the top).

CIs

Inspired by https://rpubs.com/aammd/boot_regression_dplyr.

n_rep <- 50

means <- vocab_data %>%
  group_by(language, momed, age) %>%
  summarise(production_prop = mean(production_prop, na.rm=TRUE), 
            n = n())

do_sim <- function (vocab_data, means, sim_num) {
  vocab_models <- vocab_data %>%
    split(.$language) %>%
    map_df(function(x) sample_frac(x, replace = TRUE)) %>%
    split(.$language) %>%
    map(function(x) robustbase::glmrob(model_spec, family = "binomial", data = x)) 
  
  means %>%
    mutate(sim_num = sim_num) %>%
    split(.$language) %>%
    map_df(function(x) {
      x$preds <- predict(vocab_models[[x$language[1]]], 
                         newdata = x, type = "response")
      return(x)
    })
}

sims <- data.frame(sim_num = 1:n_rep) %>%
  rowwise() %>%
  do(do_sim(vocab_data, means, .$sim_num))

Plot variability.

ggplot(sims,
       aes(x = age, y = preds, colour = momed, group = interaction(momed,sim_num),
           label = momed)) +
  facet_wrap( ~ language) +
  geom_line(alpha = .1) +
  scale_colour_solarized(name = "") +
  scale_x_continuous("Age (months)") +
  scale_y_continuous(name = "Mean Productive Vocabulary",
                     limits = c(0,1)) +
  theme(legend.position = "bottom")

Plot ribbons.

ms <- sims %>%
  group_by(language, age, momed) %>%
  summarise(lower = quantile(preds, .025),
            upper = quantile(preds, .975))

ggplot(ms, 
      aes(x = age, ymin = lower, ymax = upper, fill = momed)) +
  facet_wrap(~language) +
  geom_ribbon(alpha = .5) +
  scale_fill_solarized(name = "") +
  scale_x_continuous("Age (months)") +
  scale_y_continuous(name = "Mean Productive Vocabulary",
                     limits = c(0,1)) +
  theme(legend.position = "bottom")

Effect size

We’re looking for a nice, easily-interpretable measure of effect size to convey the difference between these curves. Let’s try to take the model predictions and plot the percentage gain between strata.

ns <- overall_vocab_data_median %>%
  group_by(language, age) %>%
  summarise(n = sum(n))

es <- overall_vocab_data_median %>%
  ungroup %>%
  select(-production_prop, -n) %>%
  mutate(momed = factor(momed, levels = c("Below Secondary",
                                          "Secondary",
                                          "College and Above"),
                        labels = c("BS","S","CA"))) %>%
  spread(momed, preds) %>%
  mutate(bs_s = S/BS,
         s_c = CA/S) %>%
  mutate(es_mean = rowMeans(cbind(bs_s, s_c), na.rm=TRUE)) %>%
  left_join(ns)

Mean stratum difference across languages. (Note very sparse data for Hebrew before 19 mo).

n_cutoff <- 20

Dropping ages with less than 20 kids.

ggplot(filter(es, n > n_cutoff),
       aes(x = age, y = es_mean, col = language)) +
  geom_point(aes(size=n)) + 
  geom_smooth(se=FALSE, span = 1) + 
  ylab("Difference between mom ed strata (%)") + 
  xlab("Age (months)") + 
  scale_colour_solarized()

Stratum difference by stratum and language.

es_long <- es %>%
  filter(n > n_cutoff) %>%
  select(age, language, bs_s, s_c) %>%
  gather(stratum, es, bs_s, s_c)

ggplot(es_long,
       aes(x = age, y = es, col = stratum)) +
  facet_wrap(~language) + 
  geom_point() + 
  geom_hline(yintercept = 1, lty = 2, col = "black") + 
  ylab("Difference between mom ed strata (%)") + 
  xlab("Age (months)") + 
  scale_colour_solarized()

Summary statistics.

es %>%
  group_by(language) %>%
  filter(n > n_cutoff) %>%
  summarise(max_pct_diff = max(es_mean, na.rm=TRUE),
            mean_pct_diff = mean(es_mean, na.rm=TRUE),
            secondary_pct_diff = mean(bs_s, na.rm=TRUE)) %>%
  dplyr::arrange(mean_pct_diff) %>%
  kable(digits=2)
language max_pct_diff mean_pct_diff secondary_pct_diff
Norwegian 1.07 1.01 1.00
Danish 1.07 1.05 1.05
Spanish 1.18 1.10 1.11
English 1.30 1.22 1.36
Hebrew 1.54 1.53 1.86

With CIs

USe the simulations from above.

es_sims <- sims %>% 
  split(.$sim_num) %>%
  map_df(function(x) {
    x %>%
      ungroup %>%
      select(-n, -production_prop) %>%
      mutate(momed = factor(momed, levels = c("Below Secondary",
                                              "Secondary",
                                              "College and Above"),
                            labels = c("BS","S","CA"))) %>%
      spread(momed, preds) %>%
      mutate(bs_s = S/BS,
             s_c = CA/S) %>%
      mutate(es_mean = rowMeans(cbind(bs_s, s_c), na.rm=TRUE)) %>%
      select(sim_num, language, age, bs_s, s_c, es_mean) %>%
      left_join(ns)
  })

es_ms <- es_sims %>%
  filter(n > n_cutoff) %>%
  group_by(language, age, n) %>%
  summarise(mean = mean(es_mean, na.rm=TRUE), 
            lower = quantile(es_mean, .024), 
            upper = quantile(es_mean, .975))

Plot these with ribbons now.

ggplot(filter(es_ms),
       aes(x = age, y = mean)) +
  geom_point(aes(size=n, col = language)) + 
  geom_line(aes(col = language)) + 
  geom_line(aes(y = lower, col = language), lty = 2) + 
  geom_line(aes(y = upper, col = language), lty = 2) + 
  geom_ribbon(aes(ymin = lower, ymax = upper, fill = language), alpha = .1) +
  scale_color_solarized() + 
  scale_fill_solarized() + 
  ylab("Difference between mom ed strata (%)") + 
  xlab("Age (months)")

ES Comparison to Gender

It’s informative to look at how big these differences are, when compared to gender differences. They are not as big, but they get bigger with development (at least until the form ceilings out), while the gender ones are biggest (as a proportion difference) at the beginning.

sex_model_spec <- formula(cbind(production, no_production) ~ age * sex  + I(age^2) * sex - sex)

overall_vocab_data_median <- vocab_data %>%
  group_by(language, sex, age) %>%
  summarise(production_prop = median(production_prop, na.rm=TRUE), 
            n = n()) %>%
  mutate(language_name = factor(language, 
                                levels = c("English", "Spanish", "Hebrew",
                                           "Norwegian", "Danish"),
                                labels = c("United States", "Mexico", "Israel",
                                           "Norway", "Denmark")),
         language_n = paste0(language_name," (N=", ns[ns$language==language[1],2] ,")"))

vocab_models_median <- vocab_data %>%
  split(.$language) %>%
  map(function(x) {robustbase::glmrob(sex_model_spec,
                                      family = "binomial",
                                      data = x)}) 

overall_vocab_data_median <- overall_vocab_data_median %>%
  split(.$language) %>%
  map_df(function(x) {
    x$preds <- predict(vocab_models_median[[x$language[1]]], 
                       newdata = x, type = "response")
    return(x)
  })

ggplot(overall_vocab_data_median,
       aes(x = age, y = production_prop, colour = sex, 
           label = sex)) +
  facet_wrap( ~ language_n) +
  geom_point(aes(size = n), alpha = .3) +
  geom_line(aes(y = preds)) +
  scale_colour_solarized(name = "") +
  scale_size_continuous(range = c(.3, 3), 
                        name = "N") + 
  scale_x_continuous("Age (months)") +
  scale_y_continuous(name = "Median Productive Vocabulary",
                     limits = c(0,1)) +
  theme(legend.position = "bottom")

ns <- overall_vocab_data_median %>%
  group_by(language, age) %>%
  summarise(n = sum(n))

gender_es <- overall_vocab_data_median %>%
  ungroup %>%
  select(-production_prop, -n) %>%
  spread(sex, preds) %>%
  mutate(es_mean = Female/Male) %>%
  left_join(ns)

Mean stratum difference across languages.

ggplot(filter(gender_es, n > n_cutoff),
       aes(x = age, y = es_mean, col = language)) +
  geom_point(aes(size=n)) + 
  geom_smooth(se=FALSE, span = 1) + 
  ylab("Difference between genders (%)") + 
  xlab("Age (months)") + 
  scale_colour_solarized()

Summary statistics.

gender_es %>%
  group_by(language) %>%
  filter(n > n_cutoff) %>%
  summarise(max_pct_diff = max(es_mean, na.rm=TRUE),
            mean_pct_diff = mean(es_mean, na.rm=TRUE)) %>%
  dplyr::arrange(mean_pct_diff) %>%
  kable(digits=2)
language max_pct_diff mean_pct_diff
Spanish 1.30 1.20
Danish 1.43 1.23
Norwegian 1.53 1.31
Hebrew 1.43 1.32
English 1.48 1.35