Bilingual vocabulary: Lexical category bias

Author

Alvin W.M. Tan

Published

March 21, 2024

In this document we explore the noun, verb, and function word bias for bilinguals; we hypothesise that these bias values will lie somewhere between the biases for monolinguals speaking either of the bilinguals’ two languages.

Malaysian CDI

We use data from Malaysian infants (speaking some combination of English, Malay, and Mandarin) contributed by Jun Ho Chai (Chai et al., 2021), which has previously been used to examine verb bias in Malay–English and Mandarin–English speakers.

Show code

library(here)
library(broom)
library(langcog)
library(tidyverse)
library(knitr)
set.seed(42)

Category proportion curves

Load the data, and join in lexical categories via semantic categories.

Show code

# bad but temporary
data <- read_csv(here(CDI_LOC, "wordbank_import/Bilingual/chai/mcdi-m_instrument_data_ws.csv"))
categories <- read_csv(here(CDI_LOC, "wordbank/raw_data/categories.csv"),
                       col_names = c("category", "lex_class", "lex_cat"))

Show code

data_cat <- data |> 
  mutate(category = case_when(category == "people_names" ~ "people",
                              category == "prepositions_locations" ~ "locations",
                              category == "quantifiers_articles" ~ "quantifiers",
                              .default = category)) |> 
  left_join(categories, by = "category") |> 
  mutate(across(ends_with("Expo"), \(x) round(x, 7)))

Group the participants by language group (monolingual, bilingual (3 combinations), trilingual). Participants are considered to have exposure to a language if they receive >10% of their language input in that language.

Show code

lex_class <- data_cat |> 
  group_by(data_id, age, sex, language, 
           MalayExpo, EngExpo, ChiExpo, Ethnicity, lex_class) |> 
  summarise(produces = sum(value == 2),
            understands = sum(value > 0),
            n = n()) |> 
  filter(lex_class != "other") |> 
  mutate(prop_class = produces / n,
         prop_total = sum(produces) / sum(n)) |> 
  ungroup() |> 
  mutate(across(ends_with("Expo"), 
                \(x) {as.numeric(cut(x, 
                                     breaks = c(0, 0.1, 1), 
                                     labels = c(0, 1), 
                                     include.lowest = T)) - 1},
                .names = "{col}_fct"),
         lang_group = case_when(MalayExpo_fct + EngExpo_fct + ChiExpo_fct == 1 ~ "mon",
                                MalayExpo_fct + EngExpo_fct + ChiExpo_fct == 3 ~ "tri",
                                EngExpo_fct + MalayExpo_fct == 2 ~ "eng-msa",
                                EngExpo_fct + ChiExpo_fct == 2 ~ "eng-cmn",
                                MalayExpo_fct + ChiExpo_fct == 2 ~ "msa-cmn"),
         lex_class = lex_class |> as_factor() |> fct_shift(1))

Show code

lex_class |> 
  select(data_id, lang_group) |> 
  distinct() |> 
  count(lang_group) |> 
  kable()

lang_group	n
eng-cmn	54
eng-msa	163
mon	347
msa-cmn	1
tri	4

Run constrained LMs.

Show code

pts <- seq(0, 1, 0.01)

lex_class_cleaned <- lex_class |> 
  filter(prop_total != 0) |> 
  group_by(language, lex_class, lang_group) |> 
  nest(data = -c("language", "lex_class", "lang_group")) |> 
  filter(lang_group != "msa-cmn", # only 1 ppt
         lang_group != "tri", # only 4 ppts
         !(language == "Malay" && lang_group == "eng-cmn"),
         !(language == "Mandarin" && lang_group == "eng-msa"))

lex_preds_models <- lex_class_cleaned |> 
  mutate(model = map(data, ~ langcog::clm(prop_class ~ I(prop_total^3) + 
                                            I(prop_total^2) + prop_total - 1, 
                                          data = .)),
         predictions = map(model, ~ broom:::augment.lm(., newdata = 
                                                         tibble(prop_total = pts))))

lex_preds <- lex_preds_models |> 
  select(language, lex_class, lang_group, predictions) |> 
  unnest(predictions) |> 
  rename(prop_class = .fitted)

Visualise the results.

Show code

ggplot(lex_preds, aes(x = prop_total, y = prop_class, col = lex_class)) +
  geom_abline(intercept = 0, slope = 1, lty = "dashed") +
  facet_grid(language ~ lang_group) +
  coord_fixed(xlim = c(0, 1), ylim = c(0, 1)) +
  geom_point(data = lex_class |> 
               filter(prop_total != 0, lang_group != "msa-cmn", lang_group != "tri"), 
             alpha = .1, size = 0.6) +
  geom_line(linewidth = 1.5, alpha = .9) +
  theme_classic() +
  theme(axis.line.x = element_line(linewidth = 0),
        axis.line.y = element_line(linewidth = 0),
        strip.background = element_blank(),
        panel.border = element_rect(colour = "black", fill = NA, linewidth = 1)) +
  scale_colour_manual(values = c("#4476AA", "#DECC77", "#CD6677")) +
  labs(x = "Vocabulary size", y = "Proportion of category", col = "Lexical class") +
  scale_x_continuous(breaks = c(0, 0.5, 1)) +
  scale_y_continuous(breaks = c(0, 0.5, 1))

Bootstrapped curve–diagonal areas

We use the difference in the area between the curve and the diagonal as a measure for bias. We randomly resample the whole population 1000x with replacement, and recompute the area measurement. Code here is adapted from the Wordbank book.

Show code

poly_area <- function(group_data) {
  model <- tryCatch(
    langcog::clm(prop_class ~ I(prop_total^3) + I(prop_total^2) + prop_total - 1, 
                 data = group_data),
    error = function(e) return(NULL)
  )
  if (is.null(model)) return(NA)
  # this is some magic that I don't understand lol
  return((model$solution %*% c(1/4, 1/3, 1/2) - 0.5)[1]) 
}

sample_areas <- function(vocab_data, nboot = 1000, verbose = FALSE) {
  
  sample_area <- function(i) {
    if (verbose & i%%10 == 0) {print(i)}
    vocab_data |> 
      group_by(data_id) |> 
      select(-produces, -understands, -n) |> 
      # pivot wider to sample participants, not administrations
      pivot_wider(names_from = c(language, lex_class),
                  values_from = c(prop_class, prop_total)) |> 
      group_by(lang_group) |> 
      slice_sample(prop = 1, replace = TRUE) |> 
      # pivoting back requires a bit more finesse because of the data structure
      pivot_longer(cols = starts_with("prop_"),
                   names_to = c("prop_type", "language", "lex_class"),
                   names_pattern = "(prop_[a-z]*)_([A-Z][a-z]*)_([a-z_]*)") |> 
      pivot_wider(names_from = prop_type,
                  values_from = value,
                  values_fn = unique) |> 
      filter(!is.na(prop_total)) |> 
      group_by(language, lang_group, lex_class) |> 
      nest(data = -c("language", "lang_group", "lex_class")) |> 
      mutate(area = map_dbl(data, poly_area),
             sample = i) |> 
      select(-data)
  }
  
  map_df(1:nboot, sample_area)
}

sampled_areas <- sample_areas(lex_class_cleaned |> unnest(data))

Generate summaries of the area data.

Show code

area_summary <- sampled_areas |> 
  filter(!is.na(area)) |> 
  group_by(language, lang_group, lex_class) |> 
  summarise(mean = mean(area),
            ci_lower = langcog::ci_lower(area),
            ci_upper = langcog::ci_upper(area))

Visualise the areas.

Show code

ggplot(area_summary) +
  geom_vline(xintercept = 0, lty = "dashed") +
  geom_pointrange(aes(x = mean, xmin = ci_lower, xmax = ci_upper, y = language, col = lang_group),
                  position = position_dodge(width = .6)) +
  facet_grid(lex_class ~ .) +
  theme_classic() +
  theme(axis.line.x = element_line(linewidth = 0),
        axis.line.y = element_line(linewidth = 0),
        strip.background = element_blank(),
        panel.border = element_rect(colour = "black", fill = NA, linewidth = 1)) +
  scale_colour_manual(values = c("#A9CD66", "#66CDB2", "#B766CD")) +
  labs(x = "Category bias", y = "Language", col = "Language group")

Bias differences across groups

If we only observe the English bias values for predicates, we notice that both bilingual groups have less bias against predicates than the monolingual group. We can investigate this numerically by conducting a permutation test, in which we permute the group labels and examine the probability that the observed difference in areas would arise under the null hypothesis. We can do this for all lexical categories (and indeed for all languages—but for now let’s just look at English).

Show code

observed_areas <- lex_class_cleaned |> 
  mutate(area = map_dbl(data, poly_area)) |> 
  select(-data)

observed_areas_wide <- observed_areas |> 
  pivot_wider(names_from = lang_group,
              values_from = area) |> 
  mutate(engmsa_minus_mon = `eng-msa` - mon,
         engcmn_minus_mon = `eng-cmn` - mon)

observed_areas_diff <- observed_areas_wide |> 
  select(-`eng-msa`, -`eng-cmn`, -mon) |> 
  pivot_longer(cols = c("engmsa_minus_mon", "engcmn_minus_mon"), 
               names_to = "comparison", 
               values_to = "area_diff")

Show code

permute_areas <- function(vocab_data, nboot = 1000, verbose = FALSE, group = NULL) {
  
  permute_area <- function(i) {
    if (verbose & i%%10 == 0) {print(i)}
    vocab_data |> 
      group_by(data_id) |> 
      select(-produces, -understands, -n) |> 
      # pivot wider to sample participants, not administrations
      pivot_wider(names_from = c(language, lex_class),
                  values_from = c(prop_class, prop_total)) |> 
      group_by(across(all_of(group))) |> 
      mutate(lang_group = sample(lang_group, replace = FALSE)) |> 
      # pivoting back requires a bit more finesse because of the data structure
      pivot_longer(cols = starts_with("prop_"),
                   names_to = c("prop_type", "language", "lex_class"),
                   names_pattern = "(prop_[a-z]*)_([A-Z][a-z]*)_([a-z_]*)") |> 
      pivot_wider(names_from = prop_type,
                  values_from = value,
                  values_fn = unique) |> 
      filter(!is.na(prop_total)) |> 
      group_by(language, lang_group, lex_class) |> 
      nest(data = -c("language", "lang_group", "lex_class")) |> 
      mutate(area = map_dbl(data, poly_area),
             sample = i) |> 
      select(-data)
  }
  
  map_df(1:nboot, permute_area)
}

permuted_areas <- permute_areas(lex_class_cleaned |> unnest(data))

permuted_areas_wide <- permuted_areas |> 
  pivot_wider(names_from = lang_group,
              values_from = area) |> 
  mutate(engmsa_minus_mon = `eng-msa` - mon,
         engcmn_minus_mon = `eng-cmn` - mon)

permuted_areas_diff <- permuted_areas_wide |> 
  select(-`eng-msa`, -`eng-cmn`, -mon) |> 
  pivot_longer(cols = c("engmsa_minus_mon", "engcmn_minus_mon"), 
               names_to = "comparison", 
               values_to = "area_diff")

Visualise the distribution in bias values. Densities indicate permuted samples; dashed line indicates observed value.

Show code

ggplot() +
  geom_density(data = permuted_areas_diff |> filter(language == "English"), 
               mapping = aes(x = area_diff, col = lex_class)) +
  geom_vline(data = observed_areas_diff |> filter(language == "English"),
             mapping = aes(xintercept = area_diff, col = lex_class),
             lty = "dashed") +
  facet_grid(lex_class ~ comparison) +
  theme_classic() +
  theme(axis.line.x = element_line(linewidth = 0),
        axis.line.y = element_line(linewidth = 0),
        strip.background = element_blank(),
        panel.border = element_rect(colour = "black", fill = NA, linewidth = 1)) +
  scale_colour_manual(values = c("#4476AA", "#DECC77", "#CD6677")) +
  labs(x = "Category bias", y = "Probability density", col = "Lexical category")

Calculate two-sided \(p\) values.

Show code

permuted_areas_diff |> 
  left_join(observed_areas_diff, 
            by = c("language", "lex_class", "comparison"),
            suffix = c("_perm", "_obs")) |> 
  mutate(obs_beats_perm = abs(area_diff_perm) - abs(area_diff_obs) > 0) |> 
  group_by(language, lex_class, comparison) |> 
  summarise(obs_p = sum(obs_beats_perm, na.rm = TRUE) / n()) |> 
  filter(!(language == "Malay" & comparison == "engcmn_minus_mon"),
         !(language == "Mandarin" & comparison == "engmsa_minus_mon")) |> 
  arrange(comparison) |> 
  kable()

language	lex_class	comparison	obs_p
English	function_words	engcmn_minus_mon	0.632
English	nouns	engcmn_minus_mon	0.273
English	predicates	engcmn_minus_mon	0.284
Mandarin	function_words	engcmn_minus_mon	0.585
Mandarin	nouns	engcmn_minus_mon	0.478
Mandarin	predicates	engcmn_minus_mon	0.567
English	function_words	engmsa_minus_mon	0.132
English	nouns	engmsa_minus_mon	0.237
English	predicates	engmsa_minus_mon	0.694
Malay	function_words	engmsa_minus_mon	0.333
Malay	nouns	engmsa_minus_mon	0.203
Malay	predicates	engmsa_minus_mon	0.380

Nothing is significant at \(\alpha = 0.05\), or even at \(\alpha = 0.10\). Perhaps there is a better way to investigate whether there is a difference, but at least it does not show up using this method.

Oxford CDI

We use data from Serene Siow to explore European bilinguals—noting that most of these languages exhibit a noun bias.

Category proportion curves

Load the data and join in categories.

Show code

sem_cats <- c("Sounds", "Animals", "Vehicles", "Toys", "Food and Drink", 
              "Body Parts", "Clothes", "Furniture and Rooms", "Outside", 
              "Household items", "People", "Games and Routines", "Action Words", 
              "Descriptive Words", "Question words", "Time", 
              "Pronouns and Possessives", "Prepositions and Location Words", 
              "Quantifiers")

namefun <- \(nm) make.unique(nm, sep = "_")

ox_files <- list.files(OX_LOC, "*.xlsx")

ox_long <- lapply(ox_files, 
                  \(x) {readxl::read_xlsx(here(OX_LOC, x), skip = 1, 
                                          .name_repair = namefun) |> 
                      pivot_longer(cols = starts_with(sem_cats),
                                   names_to = c("category", "item"),
                                   names_pattern = "([A-Za-z ]*) - (.*)") |> 
                      mutate(file_name = x,
                             other_lang = str_extract(x, "(?<=_)[A-Z][a-z]*(?=.xlsx)"))}) |> 
  bind_rows() |> 
  filter(!(`Response ID` == "Response ID")) |>  # data error in German
  group_by(`Response ID`)

items <- colnames(readxl::read_xlsx(here(OX_LOC, ox_files[1]), skip = 1, 
                                        .name_repair = namefun))
eng_items <- str_split(items[33:450], " - ") |> 
  as.data.frame() |> t() |> as_tibble() |> 
  `colnames<-`(value = c("category", "item")) |> 
  mutate(is_eng_form = 1)

Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
`.name_repair` is omitted as of tibble 2.0.0.
ℹ Using compatibility `.name_repair`.

Show code

orig_cats <- c("sounds", "animals", "vehicles", "toys", "food_drink",
               "body_parts", "clothing", "furniture_rooms", "outside",
               "household", "people", "games_routines", "action_words",
               "descriptive_words", "question_words", "time_words",
               "pronouns", "locations", "quantifiers")

ox_long_cats <- ox_long |> 
  left_join(eng_items, by = c("category", "item")) |> 
  mutate(category = orig_cats[match(category, sem_cats)],
         is_eng_form = replace_na(is_eng_form, 0)) |> 
  left_join(categories, by = "category")

Group participants by language group; here most categories that are not “eng-l2” are too sparse so we exclude them. Note that we combine all “moneng” into a general monolingual English category.

Show code

ox_exp <- ox_long |> 
  select(`Response ID`, other_lang, starts_with("Overall language exposure - "), 
         -`Overall language exposure - Other - Text`) |> 
  rename_with(\(x) {str_replace(x, "Overall language exposure - ", "Expo")}) |> 
  distinct() |> 
  mutate(ExpoSecond = coalesce(ExpoDutch, ExpoFrench, ExpoGerman, ExpoItalian, ExpoPolish, ExpoPortuguese, ExpoSpanish)) |> 
  select(data_id = `Response ID`, other_lang, ExpoEnglish, ExpoSecond, ExpoOther) |> 
  mutate(across(starts_with("Expo"), 
                \(x) {as.numeric(cut(as.numeric(x), 
                                     breaks = c(0, 10, 100), 
                                     labels = c(0, 1), 
                                     include.lowest = T)) - 1},
                .names = "{col}_fct"),
         lang_group = case_when(ExpoEnglish_fct == 1 & ExpoSecond_fct + ExpoOther_fct == 0 ~ "moneng",
                                ExpoEnglish_fct == 0 & ExpoSecond_fct + ExpoOther_fct == 1 ~ "monl2",
                                ExpoEnglish_fct + ExpoSecond_fct + ExpoOther_fct == 3 ~ "tri",
                                ExpoEnglish_fct + ExpoSecond_fct == 2 ~ "eng-l2",
                                .default = "other"))

Show code

ox_lex_class <- ox_long_cats |> 
  select(data_id = `Response ID`, age = age_months, sex = `Child's gender`, 
         is_eng_form, lex_class, value) |> 
  group_by(data_id, age, sex, is_eng_form, lex_class) |> 
  summarise(produces = sum(str_detect(value, "/"), na.rm = TRUE),
            understands = sum(str_starts(value, "N", negate = TRUE), na.rm = TRUE) + produces,
            n = n()) |> 
  filter(lex_class != "other") |> 
  left_join(ox_exp, by = "data_id") |> 
  mutate(language = ifelse(is_eng_form, "English", other_lang)) |> 
  group_by(data_id, language) |> 
  mutate(prop_class = produces / n,
         prop_total = sum(produces, na.rm = T) / sum(n, na.rm = T),
         lex_class = lex_class |> as_factor() |> fct_shift(1))

Show code

ox_lex_class |> 
  ungroup() |> 
  select(data_id, other_lang, lang_group) |>
  distinct() |> 
  ungroup() |> 
  count(lang_group, other_lang) |> 
  kable()

lang_group	other_lang	n
eng-l2	Dutch	38
eng-l2	French	130
eng-l2	German	64
eng-l2	Germany	65
eng-l2	Italian	125
eng-l2	Netherlands	35
eng-l2	Polish	143
eng-l2	Portuguese	76
eng-l2	Spain	105
eng-l2	Spanish	240
moneng	Dutch	1
moneng	French	5
moneng	German	3
moneng	Germany	1
moneng	Italian	2
moneng	Netherlands	2
moneng	Polish	3
moneng	Portuguese	3
moneng	Spain	2
moneng	Spanish	3
monl2	French	6
monl2	Germany	2
monl2	Italian	7
monl2	Polish	11
monl2	Portuguese	11
monl2	Spanish	14
other	Italian	1
other	Spain	1
other	Spanish	1
tri	French	1
tri	Germany	1
tri	Spain	3
tri	Spanish	6

Run constrained LMs.

Show code

ox_lex_class_cleaned <- ox_lex_class |> 
  filter(prop_total != 0) |> 
  mutate(other_lang = ifelse(lang_group == "moneng", "English", other_lang)) |> 
  group_by(language, other_lang, lex_class, lang_group) |> 
  nest(data = -c("language", "other_lang", "lex_class", "lang_group")) |> 
  filter(lang_group %in% c("moneng", "eng-l2"),
         !(lang_group == "moneng" & language != "English")) |> 
  mutate(lang_group = other_lang)

ox_lex_preds_models <- ox_lex_class_cleaned |> 
  mutate(model = map(data, ~ langcog::clm(prop_class ~ I(prop_total^3) + 
                                            I(prop_total^2) + prop_total - 1, 
                                          data = .)),
         predictions = map(model, ~ broom:::augment.lm(., newdata = 
                                                         tibble(prop_total = pts))))

ox_lex_preds <- ox_lex_preds_models |> 
  select(language, other_lang, lex_class, lang_group, predictions) |> 
  unnest(predictions) |> 
  rename(prop_class = .fitted)

Visualise the results.

Show code

ggplot(ox_lex_preds |> filter(!lang_group %in% c("Germany", "Netherlands", "Spain")), 
       aes(x = prop_total, y = prop_class, col = lex_class)) +
  geom_abline(intercept = 0, slope = 1, lty = "dashed") +
  facet_grid(factor(language == "English") |> fct_recode(English = "TRUE", Other = "FALSE") |> 
               fct_relevel("English") ~ 
               factor(lang_group) |> fct_relevel("English")) +
  coord_fixed(xlim = c(0, 1), ylim = c(0, 1)) +
  geom_point(data = ox_lex_class |> 
               filter(prop_total != 0, lang_group %in% c("moneng", "eng-l2")) |> 
               mutate(lang_group = other_lang) |> 
               filter(!lang_group %in% c("Germany", "Netherlands", "Spain")), 
             alpha = .1, size = 0.6) +
  geom_line(linewidth = 1, alpha = .9) +
  theme_classic() +
  theme(axis.line.x = element_line(linewidth = 0),
        axis.line.y = element_line(linewidth = 0),
        strip.background = element_blank(),
        panel.border = element_rect(colour = "black", fill = NA, linewidth = 1)) +
  scale_colour_manual(values = c("#4476AA", "#DECC77", "#CD6677")) +
  labs(x = "Vocabulary size", y = "Proportion of category", col = "Lexical class") +
  scale_x_continuous(breaks = c(0, 0.5, 1)) +
  scale_y_continuous(breaks = c(0, 0.5, 1))

Warning: 1 unknown level in `f`: English
1 unknown level in `f`: English

Bootstrapped curve–diagonal areas

Show code

ox_sampled_areas <- sample_areas(ox_lex_class_cleaned |> unnest(data))

Generate summaries of the area data.

Show code

ox_area_summary <- ox_sampled_areas |> 
  filter(!is.na(area)) |> 
  group_by(language, lang_group, lex_class) |> 
  summarise(mean = mean(area),
            ci_lower = langcog::ci_lower(area),
            ci_upper = langcog::ci_upper(area)) |> 
  mutate(language = case_when(
    language == "Germany" ~ "German",
    language == "Netherlands" ~ "Dutch",
    language == "Spain" ~ "Spanish",
    .default = language
  ))

Visualise the areas. We first show the plot for English, then for the other languages which have a British vs local comparison (Germany, the Netherlands, Spain).

UK bilinguals English vocabularies

Show code

ggplot(ox_area_summary |> filter(language %in% c("English"), !(lang_group %in% c("Germany", "Netherlands", "Spain")))) +
  geom_vline(xintercept = 0, lty = "dashed") +
  geom_pointrange(aes(x = mean, xmin = ci_lower, xmax = ci_upper, y = language, col = lang_group),
                  position = position_dodge(width = .6)) +
  facet_grid(lex_class ~ .) +
  theme_classic() +
  theme(axis.line.x = element_line(linewidth = 0),
        axis.line.y = element_line(linewidth = 0),
        strip.background = element_blank(),
        panel.border = element_rect(colour = "black", fill = NA, linewidth = 1)) +
  # scale_colour_manual(values = c("#A9CD66", "#66CDB2", "#B766CD")) +
  labs(x = "Category bias", y = "Language", col = "Language group")

British vs local comparisons

Show code

ggplot(ox_area_summary |> 
         filter(language %in% c("German", "Dutch", "Spanish")) |> 
         mutate(lang_group = ifelse(lang_group %in% c("German", "Dutch", "Spanish"), "UK", lang_group))) +
  geom_vline(xintercept = 0, lty = "dashed") +
  geom_pointrange(aes(x = mean, xmin = ci_lower, xmax = ci_upper, y = language, col = lang_group),
                  position = position_dodge(width = .6)) +
  facet_grid(lex_class ~ .) +
  theme_classic() +
  theme(axis.line.x = element_line(linewidth = 0),
        axis.line.y = element_line(linewidth = 0),
        strip.background = element_blank(),
        panel.border = element_rect(colour = "black", fill = NA, linewidth = 1)) +
  # scale_colour_manual(values = c("#A9CD66", "#66CDB2", "#B766CD")) +
  labs(x = "Category bias", y = "Language", col = "Country")

Something surprising is the difference in the non-English language results for British vs local populations, where the British sample seems to be more extreme than the local sample.

Bias differences across groups

Again, we test for bias differences statistically by using permutation tests. We do this in two steps: first, we consider only the English vocabularies from UK monolingual and bilingual children.

UK bilinguals English vocabularies

Show code

ox_observed_areas <- ox_lex_class_cleaned |> 
  mutate(area = map_dbl(data, poly_area)) |> 
  select(-data)

ox_observed_areas_wide_eng <- ox_observed_areas |> 
  filter(language == "English",
         !(lang_group %in% c("Germany", "Netherlands", "Spain"))) |> 
  ungroup() |> 
  select(-other_lang) |> 
  pivot_wider(names_from = lang_group,
              values_from = area) |> 
  mutate(across(-c("lex_class", "language"), \(x) {x - English}))

ox_observed_areas_diff_eng <- ox_observed_areas_wide_eng |> 
  pivot_longer(cols = -c("lex_class", "language"), 
               names_to = "comparison", 
               values_to = "area_diff")

Show code

ox_permuted_areas_eng <- permute_areas(ox_lex_class_cleaned |> unnest(data) |> 
                                         filter(language == "English",
                                                !(lang_group %in% c("Germany", "Netherlands", "Spain"))))

ox_permuted_areas_wide_eng <- ox_permuted_areas_eng |> 
  filter(language == "English",
         !(lang_group %in% c("Germany", "Netherlands", "Spain"))) |> 
  ungroup() |> 
  pivot_wider(names_from = lang_group,
              values_from = area) |> 
  mutate(across(-c("lex_class", "language", "sample"), \(x) {x - English}))

ox_permuted_areas_diff_eng <- ox_permuted_areas_wide_eng |> 
  pivot_longer(cols = -c("lex_class", "language", "sample"), 
               names_to = "comparison", 
               values_to = "area_diff")

Densities indicate permuted samples; dashed line indicates observed value.

Show code

ggplot() +
  geom_density(data = ox_permuted_areas_diff_eng |> filter(comparison != "English"), 
               mapping = aes(x = area_diff, col = lex_class)) +
  geom_vline(data = ox_observed_areas_diff_eng |> filter(comparison != "English"),
             mapping = aes(xintercept = area_diff, col = lex_class),
             lty = "dashed") +
  facet_grid(lex_class ~ comparison) +
  theme_classic() +
  theme(axis.line.x = element_line(linewidth = 0),
        axis.line.y = element_line(linewidth = 0),
        strip.background = element_blank(),
        panel.border = element_rect(colour = "black", fill = NA, linewidth = 1)) +
  scale_colour_manual(values = c("#4476AA", "#DECC77", "#CD6677")) +
  labs(x = "Category bias", y = "Probability density", col = "Lexical category")

We see almost no effect for function words, but possibly some effects for the nouns and predicates.

Calculate two-sided \(p\) values.

Show code

ox_permuted_areas_diff_eng |> 
  left_join(ox_observed_areas_diff_eng, 
            by = c("language", "lex_class", "comparison"),
            suffix = c("_perm", "_obs")) |> 
  mutate(obs_beats_perm = abs(area_diff_perm) - abs(area_diff_obs) > 0) |> 
  group_by(language, lex_class, comparison) |> 
  summarise(obs_p = sum(obs_beats_perm, na.rm = TRUE) / n()) |> 
  filter(comparison != "English") |> 
  arrange(comparison) |> 
  kable()

language	lex_class	comparison	obs_p
English	function_words	Dutch	0.533
English	nouns	Dutch	0.059
English	predicates	Dutch	0.043
English	function_words	French	0.434
English	nouns	French	0.195
English	predicates	French	0.237
English	function_words	German	0.712
English	nouns	German	0.558
English	predicates	German	0.620
English	function_words	Italian	0.632
English	nouns	Italian	0.393
English	predicates	Italian	0.421
English	function_words	Polish	0.993
English	nouns	Polish	0.172
English	predicates	Polish	0.070
English	function_words	Portuguese	0.621
English	nouns	Portuguese	0.480
English	predicates	Portuguese	0.553
English	function_words	Spanish	0.677
English	nouns	Spanish	0.270
English	predicates	Spanish	0.243

There is a marginal difference for English–Dutch bilinguals (vs English monolinguals) for predicates at \(\alpha = 0.05\) but I am inclined to not give that too much weight.

British vs local comparisons

Now, we compare German, Dutch, and Spanish vocabularies for British vs local samples.

Show code

ox_observed_areas_wide_oth <- ox_observed_areas |> 
  mutate(language = case_when(
    language == "Germany" ~ "German",
    language == "Netherlands" ~ "Dutch",
    language == "Spain" ~ "Spanish",
    .default = language
  ),
  lang_group = ifelse(lang_group %in% c("German", "Dutch", "Spanish"), "UK", lang_group)) |> 
  filter(language %in% c("German", "Dutch", "Spanish")) |> 
  ungroup() |> 
  select(-other_lang) |> 
  pivot_wider(names_from = lang_group,
              values_from = area) |> 
  mutate(across(-c("lex_class", "language"), \(x) {x - UK}))

ox_observed_areas_diff_oth <- ox_observed_areas_wide_oth |> 
  pivot_longer(cols = -c("lex_class", "language"), 
               names_to = "comparison", 
               values_to = "area_diff") |> 
  filter(!is.na(area_diff), comparison != "UK")

Show code

ox_permuted_areas_oth <- permute_areas(ox_lex_class_cleaned |> unnest(data) |> 
                                         filter(lang_group == language &
                                                  language %in% c("German", "Dutch", "Spanish",
                                                                  "Germany", "Netherlands", "Spain")) |> 
                                         mutate(other_lang = case_when(
                                           other_lang == "Germany" ~ "German",
                                           other_lang == "Netherlands" ~ "Dutch",
                                           other_lang == "Spain" ~ "Spanish",
                                           .default = other_lang
                                         ),
                                         language = case_when(
                                           language == "Germany" ~ "German",
                                           language == "Netherlands" ~ "Dutch",
                                           language == "Spain" ~ "Spanish",
                                           .default = language
                                         )),
                                       group = "other_lang")

ox_permuted_areas_wide_oth <- ox_permuted_areas_oth |> 
  filter(language %in% c("German", "Dutch", "Spanish")) |> 
  mutate(lang_group = ifelse(language == lang_group, "UK", lang_group)) |> 
  filter(!is.na(area)) |> 
  ungroup() |> 
  pivot_wider(names_from = lang_group,
              values_from = area) |> 
  mutate(across(-c("lex_class", "language", "sample"), \(x) {x - UK}))

ox_permuted_areas_diff_oth <- ox_permuted_areas_wide_oth |> 
  pivot_longer(cols = -c("lex_class", "language", "sample"), 
               names_to = "comparison", 
               values_to = "area_diff") |> 
  filter(!is.na(area_diff), comparison != "UK")

Show code

ggplot() +
  geom_density(data = ox_permuted_areas_diff_oth |> filter(comparison != "UK"), 
               mapping = aes(x = area_diff, col = lex_class)) +
  geom_vline(data = ox_observed_areas_diff_oth |> filter(comparison != "UK"),
             mapping = aes(xintercept = area_diff, col = lex_class),
             lty = "dashed") +
  facet_grid(lex_class ~ comparison) +
  theme_classic() +
  theme(axis.line.x = element_line(linewidth = 0),
        axis.line.y = element_line(linewidth = 0),
        strip.background = element_blank(),
        panel.border = element_rect(colour = "black", fill = NA, linewidth = 1)) +
  scale_colour_manual(values = c("#4476AA", "#DECC77", "#CD6677")) +
  labs(x = "Category bias", y = "Probability density", col = "Lexical category")

Calculate two-sided \(p\) values.

Show code

ox_permuted_areas_diff_oth |> 
  left_join(ox_observed_areas_diff_oth, 
            by = c("language", "lex_class", "comparison"),
            suffix = c("_perm", "_obs")) |> 
  mutate(obs_beats_perm = abs(area_diff_perm) - abs(area_diff_obs) > 0) |> 
  group_by(language, lex_class, comparison) |> 
  summarise(obs_p = sum(obs_beats_perm, na.rm = TRUE) / n()) |> 
  filter(comparison != "UK") |> 
  arrange(comparison) |> 
  kable()

language	lex_class	comparison	obs_p
German	function_words	Germany	0.000
German	nouns	Germany	0.011
German	predicates	Germany	0.691
Dutch	function_words	Netherlands	0.165
Dutch	nouns	Netherlands	0.019
Dutch	predicates	Netherlands	0.051
Spanish	function_words	Spain	0.000
Spanish	nouns	Spain	0.001
Spanish	predicates	Spain	0.120

Here we see pretty clear differences for most British vs local comparisons.