Bilingual vocabulary: Lexical category bias

Author

Alvin W.M. Tan

Published

March 21, 2024

In this document we explore the noun, verb, and function word bias for bilinguals; we hypothesise that these bias values will lie somewhere between the biases for monolinguals speaking either of the bilinguals’ two languages.

Malaysian CDI

We use data from Malaysian infants (speaking some combination of English, Malay, and Mandarin) contributed by Jun Ho Chai (Chai et al., 2021), which has previously been used to examine verb bias in Malay–English and Mandarin–English speakers.

Show code
library(here)
library(broom)
library(langcog)
library(tidyverse)
library(knitr)
set.seed(42)

Category proportion curves

Load the data, and join in lexical categories via semantic categories.

Show code
# bad but temporary
data <- read_csv(here(CDI_LOC, "wordbank_import/Bilingual/chai/mcdi-m_instrument_data_ws.csv"))
categories <- read_csv(here(CDI_LOC, "wordbank/raw_data/categories.csv"),
                       col_names = c("category", "lex_class", "lex_cat"))
Show code
data_cat <- data |> 
  mutate(category = case_when(category == "people_names" ~ "people",
                              category == "prepositions_locations" ~ "locations",
                              category == "quantifiers_articles" ~ "quantifiers",
                              .default = category)) |> 
  left_join(categories, by = "category") |> 
  mutate(across(ends_with("Expo"), \(x) round(x, 7)))

Group the participants by language group (monolingual, bilingual (3 combinations), trilingual). Participants are considered to have exposure to a language if they receive >10% of their language input in that language.

Show code
lex_class <- data_cat |> 
  group_by(data_id, age, sex, language, 
           MalayExpo, EngExpo, ChiExpo, Ethnicity, lex_class) |> 
  summarise(produces = sum(value == 2),
            understands = sum(value > 0),
            n = n()) |> 
  filter(lex_class != "other") |> 
  mutate(prop_class = produces / n,
         prop_total = sum(produces) / sum(n)) |> 
  ungroup() |> 
  mutate(across(ends_with("Expo"), 
                \(x) {as.numeric(cut(x, 
                                     breaks = c(0, 0.1, 1), 
                                     labels = c(0, 1), 
                                     include.lowest = T)) - 1},
                .names = "{col}_fct"),
         lang_group = case_when(MalayExpo_fct + EngExpo_fct + ChiExpo_fct == 1 ~ "mon",
                                MalayExpo_fct + EngExpo_fct + ChiExpo_fct == 3 ~ "tri",
                                EngExpo_fct + MalayExpo_fct == 2 ~ "eng-msa",
                                EngExpo_fct + ChiExpo_fct == 2 ~ "eng-cmn",
                                MalayExpo_fct + ChiExpo_fct == 2 ~ "msa-cmn"),
         lex_class = lex_class |> as_factor() |> fct_shift(1))
Show code
lex_class |> 
  select(data_id, lang_group) |> 
  distinct() |> 
  count(lang_group) |> 
  kable()
lang_group n
eng-cmn 54
eng-msa 163
mon 347
msa-cmn 1
tri 4

Run constrained LMs.

Show code
pts <- seq(0, 1, 0.01)

lex_class_cleaned <- lex_class |> 
  filter(prop_total != 0) |> 
  group_by(language, lex_class, lang_group) |> 
  nest(data = -c("language", "lex_class", "lang_group")) |> 
  filter(lang_group != "msa-cmn", # only 1 ppt
         lang_group != "tri", # only 4 ppts
         !(language == "Malay" && lang_group == "eng-cmn"),
         !(language == "Mandarin" && lang_group == "eng-msa"))

lex_preds_models <- lex_class_cleaned |> 
  mutate(model = map(data, ~ langcog::clm(prop_class ~ I(prop_total^3) + 
                                            I(prop_total^2) + prop_total - 1, 
                                          data = .)),
         predictions = map(model, ~ broom:::augment.lm(., newdata = 
                                                         tibble(prop_total = pts))))

lex_preds <- lex_preds_models |> 
  select(language, lex_class, lang_group, predictions) |> 
  unnest(predictions) |> 
  rename(prop_class = .fitted)

Visualise the results.

Show code
ggplot(lex_preds, aes(x = prop_total, y = prop_class, col = lex_class)) +
  geom_abline(intercept = 0, slope = 1, lty = "dashed") +
  facet_grid(language ~ lang_group) +
  coord_fixed(xlim = c(0, 1), ylim = c(0, 1)) +
  geom_point(data = lex_class |> 
               filter(prop_total != 0, lang_group != "msa-cmn", lang_group != "tri"), 
             alpha = .1, size = 0.6) +
  geom_line(linewidth = 1.5, alpha = .9) +
  theme_classic() +
  theme(axis.line.x = element_line(linewidth = 0),
        axis.line.y = element_line(linewidth = 0),
        strip.background = element_blank(),
        panel.border = element_rect(colour = "black", fill = NA, linewidth = 1)) +
  scale_colour_manual(values = c("#4476AA", "#DECC77", "#CD6677")) +
  labs(x = "Vocabulary size", y = "Proportion of category", col = "Lexical class") +
  scale_x_continuous(breaks = c(0, 0.5, 1)) +
  scale_y_continuous(breaks = c(0, 0.5, 1))

Bootstrapped curve–diagonal areas

We use the difference in the area between the curve and the diagonal as a measure for bias. We randomly resample the whole population 1000x with replacement, and recompute the area measurement. Code here is adapted from the Wordbank book.

Show code
poly_area <- function(group_data) {
  model <- tryCatch(
    langcog::clm(prop_class ~ I(prop_total^3) + I(prop_total^2) + prop_total - 1, 
                 data = group_data),
    error = function(e) return(NULL)
  )
  if (is.null(model)) return(NA)
  # this is some magic that I don't understand lol
  return((model$solution %*% c(1/4, 1/3, 1/2) - 0.5)[1]) 
}

sample_areas <- function(vocab_data, nboot = 1000, verbose = FALSE) {
  
  sample_area <- function(i) {
    if (verbose & i%%10 == 0) {print(i)}
    vocab_data |> 
      group_by(data_id) |> 
      select(-produces, -understands, -n) |> 
      # pivot wider to sample participants, not administrations
      pivot_wider(names_from = c(language, lex_class),
                  values_from = c(prop_class, prop_total)) |> 
      group_by(lang_group) |> 
      slice_sample(prop = 1, replace = TRUE) |> 
      # pivoting back requires a bit more finesse because of the data structure
      pivot_longer(cols = starts_with("prop_"),
                   names_to = c("prop_type", "language", "lex_class"),
                   names_pattern = "(prop_[a-z]*)_([A-Z][a-z]*)_([a-z_]*)") |> 
      pivot_wider(names_from = prop_type,
                  values_from = value,
                  values_fn = unique) |> 
      filter(!is.na(prop_total)) |> 
      group_by(language, lang_group, lex_class) |> 
      nest(data = -c("language", "lang_group", "lex_class")) |> 
      mutate(area = map_dbl(data, poly_area),
             sample = i) |> 
      select(-data)
  }
  
  map_df(1:nboot, sample_area)
}

sampled_areas <- sample_areas(lex_class_cleaned |> unnest(data))

Generate summaries of the area data.

Show code
area_summary <- sampled_areas |> 
  filter(!is.na(area)) |> 
  group_by(language, lang_group, lex_class) |> 
  summarise(mean = mean(area),
            ci_lower = langcog::ci_lower(area),
            ci_upper = langcog::ci_upper(area))

Visualise the areas.

Show code
ggplot(area_summary) +
  geom_vline(xintercept = 0, lty = "dashed") +
  geom_pointrange(aes(x = mean, xmin = ci_lower, xmax = ci_upper, y = language, col = lang_group),
                  position = position_dodge(width = .6)) +
  facet_grid(lex_class ~ .) +
  theme_classic() +
  theme(axis.line.x = element_line(linewidth = 0),
        axis.line.y = element_line(linewidth = 0),
        strip.background = element_blank(),
        panel.border = element_rect(colour = "black", fill = NA, linewidth = 1)) +
  scale_colour_manual(values = c("#A9CD66", "#66CDB2", "#B766CD")) +
  labs(x = "Category bias", y = "Language", col = "Language group")

Bias differences across groups

If we only observe the English bias values for predicates, we notice that both bilingual groups have less bias against predicates than the monolingual group. We can investigate this numerically by conducting a permutation test, in which we permute the group labels and examine the probability that the observed difference in areas would arise under the null hypothesis. We can do this for all lexical categories (and indeed for all languages—but for now let’s just look at English).

Show code
observed_areas <- lex_class_cleaned |> 
  mutate(area = map_dbl(data, poly_area)) |> 
  select(-data)

observed_areas_wide <- observed_areas |> 
  pivot_wider(names_from = lang_group,
              values_from = area) |> 
  mutate(engmsa_minus_mon = `eng-msa` - mon,
         engcmn_minus_mon = `eng-cmn` - mon)

observed_areas_diff <- observed_areas_wide |> 
  select(-`eng-msa`, -`eng-cmn`, -mon) |> 
  pivot_longer(cols = c("engmsa_minus_mon", "engcmn_minus_mon"), 
               names_to = "comparison", 
               values_to = "area_diff")
Show code
permute_areas <- function(vocab_data, nboot = 1000, verbose = FALSE, group = NULL) {
  
  permute_area <- function(i) {
    if (verbose & i%%10 == 0) {print(i)}
    vocab_data |> 
      group_by(data_id) |> 
      select(-produces, -understands, -n) |> 
      # pivot wider to sample participants, not administrations
      pivot_wider(names_from = c(language, lex_class),
                  values_from = c(prop_class, prop_total)) |> 
      group_by(across(all_of(group))) |> 
      mutate(lang_group = sample(lang_group, replace = FALSE)) |> 
      # pivoting back requires a bit more finesse because of the data structure
      pivot_longer(cols = starts_with("prop_"),
                   names_to = c("prop_type", "language", "lex_class"),
                   names_pattern = "(prop_[a-z]*)_([A-Z][a-z]*)_([a-z_]*)") |> 
      pivot_wider(names_from = prop_type,
                  values_from = value,
                  values_fn = unique) |> 
      filter(!is.na(prop_total)) |> 
      group_by(language, lang_group, lex_class) |> 
      nest(data = -c("language", "lang_group", "lex_class")) |> 
      mutate(area = map_dbl(data, poly_area),
             sample = i) |> 
      select(-data)
  }
  
  map_df(1:nboot, permute_area)
}

permuted_areas <- permute_areas(lex_class_cleaned |> unnest(data))

permuted_areas_wide <- permuted_areas |> 
  pivot_wider(names_from = lang_group,
              values_from = area) |> 
  mutate(engmsa_minus_mon = `eng-msa` - mon,
         engcmn_minus_mon = `eng-cmn` - mon)

permuted_areas_diff <- permuted_areas_wide |> 
  select(-`eng-msa`, -`eng-cmn`, -mon) |> 
  pivot_longer(cols = c("engmsa_minus_mon", "engcmn_minus_mon"), 
               names_to = "comparison", 
               values_to = "area_diff")

Visualise the distribution in bias values. Densities indicate permuted samples; dashed line indicates observed value.

Show code
ggplot() +
  geom_density(data = permuted_areas_diff |> filter(language == "English"), 
               mapping = aes(x = area_diff, col = lex_class)) +
  geom_vline(data = observed_areas_diff |> filter(language == "English"),
             mapping = aes(xintercept = area_diff, col = lex_class),
             lty = "dashed") +
  facet_grid(lex_class ~ comparison) +
  theme_classic() +
  theme(axis.line.x = element_line(linewidth = 0),
        axis.line.y = element_line(linewidth = 0),
        strip.background = element_blank(),
        panel.border = element_rect(colour = "black", fill = NA, linewidth = 1)) +
  scale_colour_manual(values = c("#4476AA", "#DECC77", "#CD6677")) +
  labs(x = "Category bias", y = "Probability density", col = "Lexical category")

Calculate two-sided \(p\) values.

Show code
permuted_areas_diff |> 
  left_join(observed_areas_diff, 
            by = c("language", "lex_class", "comparison"),
            suffix = c("_perm", "_obs")) |> 
  mutate(obs_beats_perm = abs(area_diff_perm) - abs(area_diff_obs) > 0) |> 
  group_by(language, lex_class, comparison) |> 
  summarise(obs_p = sum(obs_beats_perm, na.rm = TRUE) / n()) |> 
  filter(!(language == "Malay" & comparison == "engcmn_minus_mon"),
         !(language == "Mandarin" & comparison == "engmsa_minus_mon")) |> 
  arrange(comparison) |> 
  kable()
language lex_class comparison obs_p
English function_words engcmn_minus_mon 0.632
English nouns engcmn_minus_mon 0.273
English predicates engcmn_minus_mon 0.284
Mandarin function_words engcmn_minus_mon 0.585
Mandarin nouns engcmn_minus_mon 0.478
Mandarin predicates engcmn_minus_mon 0.567
English function_words engmsa_minus_mon 0.132
English nouns engmsa_minus_mon 0.237
English predicates engmsa_minus_mon 0.694
Malay function_words engmsa_minus_mon 0.333
Malay nouns engmsa_minus_mon 0.203
Malay predicates engmsa_minus_mon 0.380

Nothing is significant at \(\alpha = 0.05\), or even at \(\alpha = 0.10\). Perhaps there is a better way to investigate whether there is a difference, but at least it does not show up using this method.

Oxford CDI

We use data from Serene Siow to explore European bilinguals—noting that most of these languages exhibit a noun bias.

Category proportion curves

Load the data and join in categories.

Show code
sem_cats <- c("Sounds", "Animals", "Vehicles", "Toys", "Food and Drink", 
              "Body Parts", "Clothes", "Furniture and Rooms", "Outside", 
              "Household items", "People", "Games and Routines", "Action Words", 
              "Descriptive Words", "Question words", "Time", 
              "Pronouns and Possessives", "Prepositions and Location Words", 
              "Quantifiers")

namefun <- \(nm) make.unique(nm, sep = "_")

ox_files <- list.files(OX_LOC, "*.xlsx")

ox_long <- lapply(ox_files, 
                  \(x) {readxl::read_xlsx(here(OX_LOC, x), skip = 1, 
                                          .name_repair = namefun) |> 
                      pivot_longer(cols = starts_with(sem_cats),
                                   names_to = c("category", "item"),
                                   names_pattern = "([A-Za-z ]*) - (.*)") |> 
                      mutate(file_name = x,
                             other_lang = str_extract(x, "(?<=_)[A-Z][a-z]*(?=.xlsx)"))}) |> 
  bind_rows() |> 
  filter(!(`Response ID` == "Response ID")) |>  # data error in German
  group_by(`Response ID`)

items <- colnames(readxl::read_xlsx(here(OX_LOC, ox_files[1]), skip = 1, 
                                        .name_repair = namefun))
eng_items <- str_split(items[33:450], " - ") |> 
  as.data.frame() |> t() |> as_tibble() |> 
  `colnames<-`(value = c("category", "item")) |> 
  mutate(is_eng_form = 1)
Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
`.name_repair` is omitted as of tibble 2.0.0.
ℹ Using compatibility `.name_repair`.
Show code
orig_cats <- c("sounds", "animals", "vehicles", "toys", "food_drink",
               "body_parts", "clothing", "furniture_rooms", "outside",
               "household", "people", "games_routines", "action_words",
               "descriptive_words", "question_words", "time_words",
               "pronouns", "locations", "quantifiers")

ox_long_cats <- ox_long |> 
  left_join(eng_items, by = c("category", "item")) |> 
  mutate(category = orig_cats[match(category, sem_cats)],
         is_eng_form = replace_na(is_eng_form, 0)) |> 
  left_join(categories, by = "category")

Group participants by language group; here most categories that are not “eng-l2” are too sparse so we exclude them. Note that we combine all “moneng” into a general monolingual English category.

Show code
ox_exp <- ox_long |> 
  select(`Response ID`, other_lang, starts_with("Overall language exposure - "), 
         -`Overall language exposure - Other - Text`) |> 
  rename_with(\(x) {str_replace(x, "Overall language exposure - ", "Expo")}) |> 
  distinct() |> 
  mutate(ExpoSecond = coalesce(ExpoDutch, ExpoFrench, ExpoGerman, ExpoItalian, ExpoPolish, ExpoPortuguese, ExpoSpanish)) |> 
  select(data_id = `Response ID`, other_lang, ExpoEnglish, ExpoSecond, ExpoOther) |> 
  mutate(across(starts_with("Expo"), 
                \(x) {as.numeric(cut(as.numeric(x), 
                                     breaks = c(0, 10, 100), 
                                     labels = c(0, 1), 
                                     include.lowest = T)) - 1},
                .names = "{col}_fct"),
         lang_group = case_when(ExpoEnglish_fct == 1 & ExpoSecond_fct + ExpoOther_fct == 0 ~ "moneng",
                                ExpoEnglish_fct == 0 & ExpoSecond_fct + ExpoOther_fct == 1 ~ "monl2",
                                ExpoEnglish_fct + ExpoSecond_fct + ExpoOther_fct == 3 ~ "tri",
                                ExpoEnglish_fct + ExpoSecond_fct == 2 ~ "eng-l2",
                                .default = "other"))
Show code
ox_lex_class <- ox_long_cats |> 
  select(data_id = `Response ID`, age = age_months, sex = `Child's gender`, 
         is_eng_form, lex_class, value) |> 
  group_by(data_id, age, sex, is_eng_form, lex_class) |> 
  summarise(produces = sum(str_detect(value, "/"), na.rm = TRUE),
            understands = sum(str_starts(value, "N", negate = TRUE), na.rm = TRUE) + produces,
            n = n()) |> 
  filter(lex_class != "other") |> 
  left_join(ox_exp, by = "data_id") |> 
  mutate(language = ifelse(is_eng_form, "English", other_lang)) |> 
  group_by(data_id, language) |> 
  mutate(prop_class = produces / n,
         prop_total = sum(produces, na.rm = T) / sum(n, na.rm = T),
         lex_class = lex_class |> as_factor() |> fct_shift(1))
Show code
ox_lex_class |> 
  ungroup() |> 
  select(data_id, other_lang, lang_group) |>
  distinct() |> 
  ungroup() |> 
  count(lang_group, other_lang) |> 
  kable()
lang_group other_lang n
eng-l2 Dutch 38
eng-l2 French 130
eng-l2 German 64
eng-l2 Germany 65
eng-l2 Italian 125
eng-l2 Netherlands 35
eng-l2 Polish 143
eng-l2 Portuguese 76
eng-l2 Spain 105
eng-l2 Spanish 240
moneng Dutch 1
moneng French 5
moneng German 3
moneng Germany 1
moneng Italian 2
moneng Netherlands 2
moneng Polish 3
moneng Portuguese 3
moneng Spain 2
moneng Spanish 3
monl2 French 6
monl2 Germany 2
monl2 Italian 7
monl2 Polish 11
monl2 Portuguese 11
monl2 Spanish 14
other Italian 1
other Spain 1
other Spanish 1
tri French 1
tri Germany 1
tri Spain 3
tri Spanish 6

Run constrained LMs.

Show code
ox_lex_class_cleaned <- ox_lex_class |> 
  filter(prop_total != 0) |> 
  mutate(other_lang = ifelse(lang_group == "moneng", "English", other_lang)) |> 
  group_by(language, other_lang, lex_class, lang_group) |> 
  nest(data = -c("language", "other_lang", "lex_class", "lang_group")) |> 
  filter(lang_group %in% c("moneng", "eng-l2"),
         !(lang_group == "moneng" & language != "English")) |> 
  mutate(lang_group = other_lang)

ox_lex_preds_models <- ox_lex_class_cleaned |> 
  mutate(model = map(data, ~ langcog::clm(prop_class ~ I(prop_total^3) + 
                                            I(prop_total^2) + prop_total - 1, 
                                          data = .)),
         predictions = map(model, ~ broom:::augment.lm(., newdata = 
                                                         tibble(prop_total = pts))))

ox_lex_preds <- ox_lex_preds_models |> 
  select(language, other_lang, lex_class, lang_group, predictions) |> 
  unnest(predictions) |> 
  rename(prop_class = .fitted)

Visualise the results.

Show code
ggplot(ox_lex_preds |> filter(!lang_group %in% c("Germany", "Netherlands", "Spain")), 
       aes(x = prop_total, y = prop_class, col = lex_class)) +
  geom_abline(intercept = 0, slope = 1, lty = "dashed") +
  facet_grid(factor(language == "English") |> fct_recode(English = "TRUE", Other = "FALSE") |> 
               fct_relevel("English") ~ 
               factor(lang_group) |> fct_relevel("English")) +
  coord_fixed(xlim = c(0, 1), ylim = c(0, 1)) +
  geom_point(data = ox_lex_class |> 
               filter(prop_total != 0, lang_group %in% c("moneng", "eng-l2")) |> 
               mutate(lang_group = other_lang) |> 
               filter(!lang_group %in% c("Germany", "Netherlands", "Spain")), 
             alpha = .1, size = 0.6) +
  geom_line(linewidth = 1, alpha = .9) +
  theme_classic() +
  theme(axis.line.x = element_line(linewidth = 0),
        axis.line.y = element_line(linewidth = 0),
        strip.background = element_blank(),
        panel.border = element_rect(colour = "black", fill = NA, linewidth = 1)) +
  scale_colour_manual(values = c("#4476AA", "#DECC77", "#CD6677")) +
  labs(x = "Vocabulary size", y = "Proportion of category", col = "Lexical class") +
  scale_x_continuous(breaks = c(0, 0.5, 1)) +
  scale_y_continuous(breaks = c(0, 0.5, 1))
Warning: 1 unknown level in `f`: English
1 unknown level in `f`: English

Bootstrapped curve–diagonal areas

Show code
ox_sampled_areas <- sample_areas(ox_lex_class_cleaned |> unnest(data))

Generate summaries of the area data.

Show code
ox_area_summary <- ox_sampled_areas |> 
  filter(!is.na(area)) |> 
  group_by(language, lang_group, lex_class) |> 
  summarise(mean = mean(area),
            ci_lower = langcog::ci_lower(area),
            ci_upper = langcog::ci_upper(area)) |> 
  mutate(language = case_when(
    language == "Germany" ~ "German",
    language == "Netherlands" ~ "Dutch",
    language == "Spain" ~ "Spanish",
    .default = language
  ))

Visualise the areas. We first show the plot for English, then for the other languages which have a British vs local comparison (Germany, the Netherlands, Spain).

UK bilinguals English vocabularies

Show code
ggplot(ox_area_summary |> filter(language %in% c("English"), !(lang_group %in% c("Germany", "Netherlands", "Spain")))) +
  geom_vline(xintercept = 0, lty = "dashed") +
  geom_pointrange(aes(x = mean, xmin = ci_lower, xmax = ci_upper, y = language, col = lang_group),
                  position = position_dodge(width = .6)) +
  facet_grid(lex_class ~ .) +
  theme_classic() +
  theme(axis.line.x = element_line(linewidth = 0),
        axis.line.y = element_line(linewidth = 0),
        strip.background = element_blank(),
        panel.border = element_rect(colour = "black", fill = NA, linewidth = 1)) +
  # scale_colour_manual(values = c("#A9CD66", "#66CDB2", "#B766CD")) +
  labs(x = "Category bias", y = "Language", col = "Language group")

British vs local comparisons

Show code
ggplot(ox_area_summary |> 
         filter(language %in% c("German", "Dutch", "Spanish")) |> 
         mutate(lang_group = ifelse(lang_group %in% c("German", "Dutch", "Spanish"), "UK", lang_group))) +
  geom_vline(xintercept = 0, lty = "dashed") +
  geom_pointrange(aes(x = mean, xmin = ci_lower, xmax = ci_upper, y = language, col = lang_group),
                  position = position_dodge(width = .6)) +
  facet_grid(lex_class ~ .) +
  theme_classic() +
  theme(axis.line.x = element_line(linewidth = 0),
        axis.line.y = element_line(linewidth = 0),
        strip.background = element_blank(),
        panel.border = element_rect(colour = "black", fill = NA, linewidth = 1)) +
  # scale_colour_manual(values = c("#A9CD66", "#66CDB2", "#B766CD")) +
  labs(x = "Category bias", y = "Language", col = "Country")

Something surprising is the difference in the non-English language results for British vs local populations, where the British sample seems to be more extreme than the local sample.

Bias differences across groups

Again, we test for bias differences statistically by using permutation tests. We do this in two steps: first, we consider only the English vocabularies from UK monolingual and bilingual children.

UK bilinguals English vocabularies

Show code
ox_observed_areas <- ox_lex_class_cleaned |> 
  mutate(area = map_dbl(data, poly_area)) |> 
  select(-data)

ox_observed_areas_wide_eng <- ox_observed_areas |> 
  filter(language == "English",
         !(lang_group %in% c("Germany", "Netherlands", "Spain"))) |> 
  ungroup() |> 
  select(-other_lang) |> 
  pivot_wider(names_from = lang_group,
              values_from = area) |> 
  mutate(across(-c("lex_class", "language"), \(x) {x - English}))

ox_observed_areas_diff_eng <- ox_observed_areas_wide_eng |> 
  pivot_longer(cols = -c("lex_class", "language"), 
               names_to = "comparison", 
               values_to = "area_diff")
Show code
ox_permuted_areas_eng <- permute_areas(ox_lex_class_cleaned |> unnest(data) |> 
                                         filter(language == "English",
                                                !(lang_group %in% c("Germany", "Netherlands", "Spain"))))

ox_permuted_areas_wide_eng <- ox_permuted_areas_eng |> 
  filter(language == "English",
         !(lang_group %in% c("Germany", "Netherlands", "Spain"))) |> 
  ungroup() |> 
  pivot_wider(names_from = lang_group,
              values_from = area) |> 
  mutate(across(-c("lex_class", "language", "sample"), \(x) {x - English}))

ox_permuted_areas_diff_eng <- ox_permuted_areas_wide_eng |> 
  pivot_longer(cols = -c("lex_class", "language", "sample"), 
               names_to = "comparison", 
               values_to = "area_diff")

Densities indicate permuted samples; dashed line indicates observed value.

Show code
ggplot() +
  geom_density(data = ox_permuted_areas_diff_eng |> filter(comparison != "English"), 
               mapping = aes(x = area_diff, col = lex_class)) +
  geom_vline(data = ox_observed_areas_diff_eng |> filter(comparison != "English"),
             mapping = aes(xintercept = area_diff, col = lex_class),
             lty = "dashed") +
  facet_grid(lex_class ~ comparison) +
  theme_classic() +
  theme(axis.line.x = element_line(linewidth = 0),
        axis.line.y = element_line(linewidth = 0),
        strip.background = element_blank(),
        panel.border = element_rect(colour = "black", fill = NA, linewidth = 1)) +
  scale_colour_manual(values = c("#4476AA", "#DECC77", "#CD6677")) +
  labs(x = "Category bias", y = "Probability density", col = "Lexical category")

We see almost no effect for function words, but possibly some effects for the nouns and predicates.

Calculate two-sided \(p\) values.

Show code
ox_permuted_areas_diff_eng |> 
  left_join(ox_observed_areas_diff_eng, 
            by = c("language", "lex_class", "comparison"),
            suffix = c("_perm", "_obs")) |> 
  mutate(obs_beats_perm = abs(area_diff_perm) - abs(area_diff_obs) > 0) |> 
  group_by(language, lex_class, comparison) |> 
  summarise(obs_p = sum(obs_beats_perm, na.rm = TRUE) / n()) |> 
  filter(comparison != "English") |> 
  arrange(comparison) |> 
  kable()
language lex_class comparison obs_p
English function_words Dutch 0.533
English nouns Dutch 0.059
English predicates Dutch 0.043
English function_words French 0.434
English nouns French 0.195
English predicates French 0.237
English function_words German 0.712
English nouns German 0.558
English predicates German 0.620
English function_words Italian 0.632
English nouns Italian 0.393
English predicates Italian 0.421
English function_words Polish 0.993
English nouns Polish 0.172
English predicates Polish 0.070
English function_words Portuguese 0.621
English nouns Portuguese 0.480
English predicates Portuguese 0.553
English function_words Spanish 0.677
English nouns Spanish 0.270
English predicates Spanish 0.243

There is a marginal difference for English–Dutch bilinguals (vs English monolinguals) for predicates at \(\alpha = 0.05\) but I am inclined to not give that too much weight.

British vs local comparisons

Now, we compare German, Dutch, and Spanish vocabularies for British vs local samples.

Show code
ox_observed_areas_wide_oth <- ox_observed_areas |> 
  mutate(language = case_when(
    language == "Germany" ~ "German",
    language == "Netherlands" ~ "Dutch",
    language == "Spain" ~ "Spanish",
    .default = language
  ),
  lang_group = ifelse(lang_group %in% c("German", "Dutch", "Spanish"), "UK", lang_group)) |> 
  filter(language %in% c("German", "Dutch", "Spanish")) |> 
  ungroup() |> 
  select(-other_lang) |> 
  pivot_wider(names_from = lang_group,
              values_from = area) |> 
  mutate(across(-c("lex_class", "language"), \(x) {x - UK}))

ox_observed_areas_diff_oth <- ox_observed_areas_wide_oth |> 
  pivot_longer(cols = -c("lex_class", "language"), 
               names_to = "comparison", 
               values_to = "area_diff") |> 
  filter(!is.na(area_diff), comparison != "UK")
Show code
ox_permuted_areas_oth <- permute_areas(ox_lex_class_cleaned |> unnest(data) |> 
                                         filter(lang_group == language &
                                                  language %in% c("German", "Dutch", "Spanish",
                                                                  "Germany", "Netherlands", "Spain")) |> 
                                         mutate(other_lang = case_when(
                                           other_lang == "Germany" ~ "German",
                                           other_lang == "Netherlands" ~ "Dutch",
                                           other_lang == "Spain" ~ "Spanish",
                                           .default = other_lang
                                         ),
                                         language = case_when(
                                           language == "Germany" ~ "German",
                                           language == "Netherlands" ~ "Dutch",
                                           language == "Spain" ~ "Spanish",
                                           .default = language
                                         )),
                                       group = "other_lang")

ox_permuted_areas_wide_oth <- ox_permuted_areas_oth |> 
  filter(language %in% c("German", "Dutch", "Spanish")) |> 
  mutate(lang_group = ifelse(language == lang_group, "UK", lang_group)) |> 
  filter(!is.na(area)) |> 
  ungroup() |> 
  pivot_wider(names_from = lang_group,
              values_from = area) |> 
  mutate(across(-c("lex_class", "language", "sample"), \(x) {x - UK}))

ox_permuted_areas_diff_oth <- ox_permuted_areas_wide_oth |> 
  pivot_longer(cols = -c("lex_class", "language", "sample"), 
               names_to = "comparison", 
               values_to = "area_diff") |> 
  filter(!is.na(area_diff), comparison != "UK")
Show code
ggplot() +
  geom_density(data = ox_permuted_areas_diff_oth |> filter(comparison != "UK"), 
               mapping = aes(x = area_diff, col = lex_class)) +
  geom_vline(data = ox_observed_areas_diff_oth |> filter(comparison != "UK"),
             mapping = aes(xintercept = area_diff, col = lex_class),
             lty = "dashed") +
  facet_grid(lex_class ~ comparison) +
  theme_classic() +
  theme(axis.line.x = element_line(linewidth = 0),
        axis.line.y = element_line(linewidth = 0),
        strip.background = element_blank(),
        panel.border = element_rect(colour = "black", fill = NA, linewidth = 1)) +
  scale_colour_manual(values = c("#4476AA", "#DECC77", "#CD6677")) +
  labs(x = "Category bias", y = "Probability density", col = "Lexical category")

Calculate two-sided \(p\) values.

Show code
ox_permuted_areas_diff_oth |> 
  left_join(ox_observed_areas_diff_oth, 
            by = c("language", "lex_class", "comparison"),
            suffix = c("_perm", "_obs")) |> 
  mutate(obs_beats_perm = abs(area_diff_perm) - abs(area_diff_obs) > 0) |> 
  group_by(language, lex_class, comparison) |> 
  summarise(obs_p = sum(obs_beats_perm, na.rm = TRUE) / n()) |> 
  filter(comparison != "UK") |> 
  arrange(comparison) |> 
  kable()
language lex_class comparison obs_p
German function_words Germany 0.000
German nouns Germany 0.011
German predicates Germany 0.691
Dutch function_words Netherlands 0.165
Dutch nouns Netherlands 0.019
Dutch predicates Netherlands 0.051
Spanish function_words Spain 0.000
Spanish nouns Spain 0.001
Spanish predicates Spain 0.120

Here we see pretty clear differences for most British vs local comparisons.