Show code
library(here)
library(broom)
library(langcog)
library(tidyverse)
library(knitr)
set.seed(42)In this document we explore the noun, verb, and function word bias for bilinguals; we hypothesise that these bias values will lie somewhere between the biases for monolinguals speaking either of the bilinguals’ two languages.
We use data from Malaysian infants (speaking some combination of English, Malay, and Mandarin) contributed by Jun Ho Chai (Chai et al., 2021), which has previously been used to examine verb bias in Malay–English and Mandarin–English speakers.
library(here)
library(broom)
library(langcog)
library(tidyverse)
library(knitr)
set.seed(42)Load the data, and join in lexical categories via semantic categories.
# bad but temporary
data <- read_csv(here(CDI_LOC, "wordbank_import/Bilingual/chai/mcdi-m_instrument_data_ws.csv"))
categories <- read_csv(here(CDI_LOC, "wordbank/raw_data/categories.csv"),
col_names = c("category", "lex_class", "lex_cat"))data_cat <- data |>
mutate(category = case_when(category == "people_names" ~ "people",
category == "prepositions_locations" ~ "locations",
category == "quantifiers_articles" ~ "quantifiers",
.default = category)) |>
left_join(categories, by = "category") |>
mutate(across(ends_with("Expo"), \(x) round(x, 7)))Group the participants by language group (monolingual, bilingual (3 combinations), trilingual). Participants are considered to have exposure to a language if they receive >10% of their language input in that language.
lex_class <- data_cat |>
group_by(data_id, age, sex, language,
MalayExpo, EngExpo, ChiExpo, Ethnicity, lex_class) |>
summarise(produces = sum(value == 2),
understands = sum(value > 0),
n = n()) |>
filter(lex_class != "other") |>
mutate(prop_class = produces / n,
prop_total = sum(produces) / sum(n)) |>
ungroup() |>
mutate(across(ends_with("Expo"),
\(x) {as.numeric(cut(x,
breaks = c(0, 0.1, 1),
labels = c(0, 1),
include.lowest = T)) - 1},
.names = "{col}_fct"),
lang_group = case_when(MalayExpo_fct + EngExpo_fct + ChiExpo_fct == 1 ~ "mon",
MalayExpo_fct + EngExpo_fct + ChiExpo_fct == 3 ~ "tri",
EngExpo_fct + MalayExpo_fct == 2 ~ "eng-msa",
EngExpo_fct + ChiExpo_fct == 2 ~ "eng-cmn",
MalayExpo_fct + ChiExpo_fct == 2 ~ "msa-cmn"),
lex_class = lex_class |> as_factor() |> fct_shift(1))lex_class |>
select(data_id, lang_group) |>
distinct() |>
count(lang_group) |>
kable()| lang_group | n |
|---|---|
| eng-cmn | 54 |
| eng-msa | 163 |
| mon | 347 |
| msa-cmn | 1 |
| tri | 4 |
Run constrained LMs.
pts <- seq(0, 1, 0.01)
lex_class_cleaned <- lex_class |>
filter(prop_total != 0) |>
group_by(language, lex_class, lang_group) |>
nest(data = -c("language", "lex_class", "lang_group")) |>
filter(lang_group != "msa-cmn", # only 1 ppt
lang_group != "tri", # only 4 ppts
!(language == "Malay" && lang_group == "eng-cmn"),
!(language == "Mandarin" && lang_group == "eng-msa"))
lex_preds_models <- lex_class_cleaned |>
mutate(model = map(data, ~ langcog::clm(prop_class ~ I(prop_total^3) +
I(prop_total^2) + prop_total - 1,
data = .)),
predictions = map(model, ~ broom:::augment.lm(., newdata =
tibble(prop_total = pts))))
lex_preds <- lex_preds_models |>
select(language, lex_class, lang_group, predictions) |>
unnest(predictions) |>
rename(prop_class = .fitted)Visualise the results.
ggplot(lex_preds, aes(x = prop_total, y = prop_class, col = lex_class)) +
geom_abline(intercept = 0, slope = 1, lty = "dashed") +
facet_grid(language ~ lang_group) +
coord_fixed(xlim = c(0, 1), ylim = c(0, 1)) +
geom_point(data = lex_class |>
filter(prop_total != 0, lang_group != "msa-cmn", lang_group != "tri"),
alpha = .1, size = 0.6) +
geom_line(linewidth = 1.5, alpha = .9) +
theme_classic() +
theme(axis.line.x = element_line(linewidth = 0),
axis.line.y = element_line(linewidth = 0),
strip.background = element_blank(),
panel.border = element_rect(colour = "black", fill = NA, linewidth = 1)) +
scale_colour_manual(values = c("#4476AA", "#DECC77", "#CD6677")) +
labs(x = "Vocabulary size", y = "Proportion of category", col = "Lexical class") +
scale_x_continuous(breaks = c(0, 0.5, 1)) +
scale_y_continuous(breaks = c(0, 0.5, 1))We use the difference in the area between the curve and the diagonal as a measure for bias. We randomly resample the whole population 1000x with replacement, and recompute the area measurement. Code here is adapted from the Wordbank book.
poly_area <- function(group_data) {
model <- tryCatch(
langcog::clm(prop_class ~ I(prop_total^3) + I(prop_total^2) + prop_total - 1,
data = group_data),
error = function(e) return(NULL)
)
if (is.null(model)) return(NA)
# this is some magic that I don't understand lol
return((model$solution %*% c(1/4, 1/3, 1/2) - 0.5)[1])
}
sample_areas <- function(vocab_data, nboot = 1000, verbose = FALSE) {
sample_area <- function(i) {
if (verbose & i%%10 == 0) {print(i)}
vocab_data |>
group_by(data_id) |>
select(-produces, -understands, -n) |>
# pivot wider to sample participants, not administrations
pivot_wider(names_from = c(language, lex_class),
values_from = c(prop_class, prop_total)) |>
group_by(lang_group) |>
slice_sample(prop = 1, replace = TRUE) |>
# pivoting back requires a bit more finesse because of the data structure
pivot_longer(cols = starts_with("prop_"),
names_to = c("prop_type", "language", "lex_class"),
names_pattern = "(prop_[a-z]*)_([A-Z][a-z]*)_([a-z_]*)") |>
pivot_wider(names_from = prop_type,
values_from = value,
values_fn = unique) |>
filter(!is.na(prop_total)) |>
group_by(language, lang_group, lex_class) |>
nest(data = -c("language", "lang_group", "lex_class")) |>
mutate(area = map_dbl(data, poly_area),
sample = i) |>
select(-data)
}
map_df(1:nboot, sample_area)
}
sampled_areas <- sample_areas(lex_class_cleaned |> unnest(data))Generate summaries of the area data.
area_summary <- sampled_areas |>
filter(!is.na(area)) |>
group_by(language, lang_group, lex_class) |>
summarise(mean = mean(area),
ci_lower = langcog::ci_lower(area),
ci_upper = langcog::ci_upper(area))Visualise the areas.
ggplot(area_summary) +
geom_vline(xintercept = 0, lty = "dashed") +
geom_pointrange(aes(x = mean, xmin = ci_lower, xmax = ci_upper, y = language, col = lang_group),
position = position_dodge(width = .6)) +
facet_grid(lex_class ~ .) +
theme_classic() +
theme(axis.line.x = element_line(linewidth = 0),
axis.line.y = element_line(linewidth = 0),
strip.background = element_blank(),
panel.border = element_rect(colour = "black", fill = NA, linewidth = 1)) +
scale_colour_manual(values = c("#A9CD66", "#66CDB2", "#B766CD")) +
labs(x = "Category bias", y = "Language", col = "Language group")If we only observe the English bias values for predicates, we notice that both bilingual groups have less bias against predicates than the monolingual group. We can investigate this numerically by conducting a permutation test, in which we permute the group labels and examine the probability that the observed difference in areas would arise under the null hypothesis. We can do this for all lexical categories (and indeed for all languages—but for now let’s just look at English).
observed_areas <- lex_class_cleaned |>
mutate(area = map_dbl(data, poly_area)) |>
select(-data)
observed_areas_wide <- observed_areas |>
pivot_wider(names_from = lang_group,
values_from = area) |>
mutate(engmsa_minus_mon = `eng-msa` - mon,
engcmn_minus_mon = `eng-cmn` - mon)
observed_areas_diff <- observed_areas_wide |>
select(-`eng-msa`, -`eng-cmn`, -mon) |>
pivot_longer(cols = c("engmsa_minus_mon", "engcmn_minus_mon"),
names_to = "comparison",
values_to = "area_diff")permute_areas <- function(vocab_data, nboot = 1000, verbose = FALSE, group = NULL) {
permute_area <- function(i) {
if (verbose & i%%10 == 0) {print(i)}
vocab_data |>
group_by(data_id) |>
select(-produces, -understands, -n) |>
# pivot wider to sample participants, not administrations
pivot_wider(names_from = c(language, lex_class),
values_from = c(prop_class, prop_total)) |>
group_by(across(all_of(group))) |>
mutate(lang_group = sample(lang_group, replace = FALSE)) |>
# pivoting back requires a bit more finesse because of the data structure
pivot_longer(cols = starts_with("prop_"),
names_to = c("prop_type", "language", "lex_class"),
names_pattern = "(prop_[a-z]*)_([A-Z][a-z]*)_([a-z_]*)") |>
pivot_wider(names_from = prop_type,
values_from = value,
values_fn = unique) |>
filter(!is.na(prop_total)) |>
group_by(language, lang_group, lex_class) |>
nest(data = -c("language", "lang_group", "lex_class")) |>
mutate(area = map_dbl(data, poly_area),
sample = i) |>
select(-data)
}
map_df(1:nboot, permute_area)
}
permuted_areas <- permute_areas(lex_class_cleaned |> unnest(data))
permuted_areas_wide <- permuted_areas |>
pivot_wider(names_from = lang_group,
values_from = area) |>
mutate(engmsa_minus_mon = `eng-msa` - mon,
engcmn_minus_mon = `eng-cmn` - mon)
permuted_areas_diff <- permuted_areas_wide |>
select(-`eng-msa`, -`eng-cmn`, -mon) |>
pivot_longer(cols = c("engmsa_minus_mon", "engcmn_minus_mon"),
names_to = "comparison",
values_to = "area_diff")Visualise the distribution in bias values. Densities indicate permuted samples; dashed line indicates observed value.
ggplot() +
geom_density(data = permuted_areas_diff |> filter(language == "English"),
mapping = aes(x = area_diff, col = lex_class)) +
geom_vline(data = observed_areas_diff |> filter(language == "English"),
mapping = aes(xintercept = area_diff, col = lex_class),
lty = "dashed") +
facet_grid(lex_class ~ comparison) +
theme_classic() +
theme(axis.line.x = element_line(linewidth = 0),
axis.line.y = element_line(linewidth = 0),
strip.background = element_blank(),
panel.border = element_rect(colour = "black", fill = NA, linewidth = 1)) +
scale_colour_manual(values = c("#4476AA", "#DECC77", "#CD6677")) +
labs(x = "Category bias", y = "Probability density", col = "Lexical category")Calculate two-sided \(p\) values.
permuted_areas_diff |>
left_join(observed_areas_diff,
by = c("language", "lex_class", "comparison"),
suffix = c("_perm", "_obs")) |>
mutate(obs_beats_perm = abs(area_diff_perm) - abs(area_diff_obs) > 0) |>
group_by(language, lex_class, comparison) |>
summarise(obs_p = sum(obs_beats_perm, na.rm = TRUE) / n()) |>
filter(!(language == "Malay" & comparison == "engcmn_minus_mon"),
!(language == "Mandarin" & comparison == "engmsa_minus_mon")) |>
arrange(comparison) |>
kable()| language | lex_class | comparison | obs_p |
|---|---|---|---|
| English | function_words | engcmn_minus_mon | 0.632 |
| English | nouns | engcmn_minus_mon | 0.273 |
| English | predicates | engcmn_minus_mon | 0.284 |
| Mandarin | function_words | engcmn_minus_mon | 0.585 |
| Mandarin | nouns | engcmn_minus_mon | 0.478 |
| Mandarin | predicates | engcmn_minus_mon | 0.567 |
| English | function_words | engmsa_minus_mon | 0.132 |
| English | nouns | engmsa_minus_mon | 0.237 |
| English | predicates | engmsa_minus_mon | 0.694 |
| Malay | function_words | engmsa_minus_mon | 0.333 |
| Malay | nouns | engmsa_minus_mon | 0.203 |
| Malay | predicates | engmsa_minus_mon | 0.380 |
Nothing is significant at \(\alpha = 0.05\), or even at \(\alpha = 0.10\). Perhaps there is a better way to investigate whether there is a difference, but at least it does not show up using this method.
We use data from Serene Siow to explore European bilinguals—noting that most of these languages exhibit a noun bias.
Load the data and join in categories.
sem_cats <- c("Sounds", "Animals", "Vehicles", "Toys", "Food and Drink",
"Body Parts", "Clothes", "Furniture and Rooms", "Outside",
"Household items", "People", "Games and Routines", "Action Words",
"Descriptive Words", "Question words", "Time",
"Pronouns and Possessives", "Prepositions and Location Words",
"Quantifiers")
namefun <- \(nm) make.unique(nm, sep = "_")
ox_files <- list.files(OX_LOC, "*.xlsx")
ox_long <- lapply(ox_files,
\(x) {readxl::read_xlsx(here(OX_LOC, x), skip = 1,
.name_repair = namefun) |>
pivot_longer(cols = starts_with(sem_cats),
names_to = c("category", "item"),
names_pattern = "([A-Za-z ]*) - (.*)") |>
mutate(file_name = x,
other_lang = str_extract(x, "(?<=_)[A-Z][a-z]*(?=.xlsx)"))}) |>
bind_rows() |>
filter(!(`Response ID` == "Response ID")) |> # data error in German
group_by(`Response ID`)
items <- colnames(readxl::read_xlsx(here(OX_LOC, ox_files[1]), skip = 1,
.name_repair = namefun))
eng_items <- str_split(items[33:450], " - ") |>
as.data.frame() |> t() |> as_tibble() |>
`colnames<-`(value = c("category", "item")) |>
mutate(is_eng_form = 1)Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
`.name_repair` is omitted as of tibble 2.0.0.
ℹ Using compatibility `.name_repair`.
orig_cats <- c("sounds", "animals", "vehicles", "toys", "food_drink",
"body_parts", "clothing", "furniture_rooms", "outside",
"household", "people", "games_routines", "action_words",
"descriptive_words", "question_words", "time_words",
"pronouns", "locations", "quantifiers")
ox_long_cats <- ox_long |>
left_join(eng_items, by = c("category", "item")) |>
mutate(category = orig_cats[match(category, sem_cats)],
is_eng_form = replace_na(is_eng_form, 0)) |>
left_join(categories, by = "category")Group participants by language group; here most categories that are not “eng-l2” are too sparse so we exclude them. Note that we combine all “moneng” into a general monolingual English category.
ox_exp <- ox_long |>
select(`Response ID`, other_lang, starts_with("Overall language exposure - "),
-`Overall language exposure - Other - Text`) |>
rename_with(\(x) {str_replace(x, "Overall language exposure - ", "Expo")}) |>
distinct() |>
mutate(ExpoSecond = coalesce(ExpoDutch, ExpoFrench, ExpoGerman, ExpoItalian, ExpoPolish, ExpoPortuguese, ExpoSpanish)) |>
select(data_id = `Response ID`, other_lang, ExpoEnglish, ExpoSecond, ExpoOther) |>
mutate(across(starts_with("Expo"),
\(x) {as.numeric(cut(as.numeric(x),
breaks = c(0, 10, 100),
labels = c(0, 1),
include.lowest = T)) - 1},
.names = "{col}_fct"),
lang_group = case_when(ExpoEnglish_fct == 1 & ExpoSecond_fct + ExpoOther_fct == 0 ~ "moneng",
ExpoEnglish_fct == 0 & ExpoSecond_fct + ExpoOther_fct == 1 ~ "monl2",
ExpoEnglish_fct + ExpoSecond_fct + ExpoOther_fct == 3 ~ "tri",
ExpoEnglish_fct + ExpoSecond_fct == 2 ~ "eng-l2",
.default = "other"))ox_lex_class <- ox_long_cats |>
select(data_id = `Response ID`, age = age_months, sex = `Child's gender`,
is_eng_form, lex_class, value) |>
group_by(data_id, age, sex, is_eng_form, lex_class) |>
summarise(produces = sum(str_detect(value, "/"), na.rm = TRUE),
understands = sum(str_starts(value, "N", negate = TRUE), na.rm = TRUE) + produces,
n = n()) |>
filter(lex_class != "other") |>
left_join(ox_exp, by = "data_id") |>
mutate(language = ifelse(is_eng_form, "English", other_lang)) |>
group_by(data_id, language) |>
mutate(prop_class = produces / n,
prop_total = sum(produces, na.rm = T) / sum(n, na.rm = T),
lex_class = lex_class |> as_factor() |> fct_shift(1))ox_lex_class |>
ungroup() |>
select(data_id, other_lang, lang_group) |>
distinct() |>
ungroup() |>
count(lang_group, other_lang) |>
kable()| lang_group | other_lang | n |
|---|---|---|
| eng-l2 | Dutch | 38 |
| eng-l2 | French | 130 |
| eng-l2 | German | 64 |
| eng-l2 | Germany | 65 |
| eng-l2 | Italian | 125 |
| eng-l2 | Netherlands | 35 |
| eng-l2 | Polish | 143 |
| eng-l2 | Portuguese | 76 |
| eng-l2 | Spain | 105 |
| eng-l2 | Spanish | 240 |
| moneng | Dutch | 1 |
| moneng | French | 5 |
| moneng | German | 3 |
| moneng | Germany | 1 |
| moneng | Italian | 2 |
| moneng | Netherlands | 2 |
| moneng | Polish | 3 |
| moneng | Portuguese | 3 |
| moneng | Spain | 2 |
| moneng | Spanish | 3 |
| monl2 | French | 6 |
| monl2 | Germany | 2 |
| monl2 | Italian | 7 |
| monl2 | Polish | 11 |
| monl2 | Portuguese | 11 |
| monl2 | Spanish | 14 |
| other | Italian | 1 |
| other | Spain | 1 |
| other | Spanish | 1 |
| tri | French | 1 |
| tri | Germany | 1 |
| tri | Spain | 3 |
| tri | Spanish | 6 |
Run constrained LMs.
ox_lex_class_cleaned <- ox_lex_class |>
filter(prop_total != 0) |>
mutate(other_lang = ifelse(lang_group == "moneng", "English", other_lang)) |>
group_by(language, other_lang, lex_class, lang_group) |>
nest(data = -c("language", "other_lang", "lex_class", "lang_group")) |>
filter(lang_group %in% c("moneng", "eng-l2"),
!(lang_group == "moneng" & language != "English")) |>
mutate(lang_group = other_lang)
ox_lex_preds_models <- ox_lex_class_cleaned |>
mutate(model = map(data, ~ langcog::clm(prop_class ~ I(prop_total^3) +
I(prop_total^2) + prop_total - 1,
data = .)),
predictions = map(model, ~ broom:::augment.lm(., newdata =
tibble(prop_total = pts))))
ox_lex_preds <- ox_lex_preds_models |>
select(language, other_lang, lex_class, lang_group, predictions) |>
unnest(predictions) |>
rename(prop_class = .fitted)Visualise the results.
ggplot(ox_lex_preds |> filter(!lang_group %in% c("Germany", "Netherlands", "Spain")),
aes(x = prop_total, y = prop_class, col = lex_class)) +
geom_abline(intercept = 0, slope = 1, lty = "dashed") +
facet_grid(factor(language == "English") |> fct_recode(English = "TRUE", Other = "FALSE") |>
fct_relevel("English") ~
factor(lang_group) |> fct_relevel("English")) +
coord_fixed(xlim = c(0, 1), ylim = c(0, 1)) +
geom_point(data = ox_lex_class |>
filter(prop_total != 0, lang_group %in% c("moneng", "eng-l2")) |>
mutate(lang_group = other_lang) |>
filter(!lang_group %in% c("Germany", "Netherlands", "Spain")),
alpha = .1, size = 0.6) +
geom_line(linewidth = 1, alpha = .9) +
theme_classic() +
theme(axis.line.x = element_line(linewidth = 0),
axis.line.y = element_line(linewidth = 0),
strip.background = element_blank(),
panel.border = element_rect(colour = "black", fill = NA, linewidth = 1)) +
scale_colour_manual(values = c("#4476AA", "#DECC77", "#CD6677")) +
labs(x = "Vocabulary size", y = "Proportion of category", col = "Lexical class") +
scale_x_continuous(breaks = c(0, 0.5, 1)) +
scale_y_continuous(breaks = c(0, 0.5, 1))Warning: 1 unknown level in `f`: English
1 unknown level in `f`: English
ox_sampled_areas <- sample_areas(ox_lex_class_cleaned |> unnest(data))Generate summaries of the area data.
ox_area_summary <- ox_sampled_areas |>
filter(!is.na(area)) |>
group_by(language, lang_group, lex_class) |>
summarise(mean = mean(area),
ci_lower = langcog::ci_lower(area),
ci_upper = langcog::ci_upper(area)) |>
mutate(language = case_when(
language == "Germany" ~ "German",
language == "Netherlands" ~ "Dutch",
language == "Spain" ~ "Spanish",
.default = language
))Visualise the areas. We first show the plot for English, then for the other languages which have a British vs local comparison (Germany, the Netherlands, Spain).
ggplot(ox_area_summary |> filter(language %in% c("English"), !(lang_group %in% c("Germany", "Netherlands", "Spain")))) +
geom_vline(xintercept = 0, lty = "dashed") +
geom_pointrange(aes(x = mean, xmin = ci_lower, xmax = ci_upper, y = language, col = lang_group),
position = position_dodge(width = .6)) +
facet_grid(lex_class ~ .) +
theme_classic() +
theme(axis.line.x = element_line(linewidth = 0),
axis.line.y = element_line(linewidth = 0),
strip.background = element_blank(),
panel.border = element_rect(colour = "black", fill = NA, linewidth = 1)) +
# scale_colour_manual(values = c("#A9CD66", "#66CDB2", "#B766CD")) +
labs(x = "Category bias", y = "Language", col = "Language group")ggplot(ox_area_summary |>
filter(language %in% c("German", "Dutch", "Spanish")) |>
mutate(lang_group = ifelse(lang_group %in% c("German", "Dutch", "Spanish"), "UK", lang_group))) +
geom_vline(xintercept = 0, lty = "dashed") +
geom_pointrange(aes(x = mean, xmin = ci_lower, xmax = ci_upper, y = language, col = lang_group),
position = position_dodge(width = .6)) +
facet_grid(lex_class ~ .) +
theme_classic() +
theme(axis.line.x = element_line(linewidth = 0),
axis.line.y = element_line(linewidth = 0),
strip.background = element_blank(),
panel.border = element_rect(colour = "black", fill = NA, linewidth = 1)) +
# scale_colour_manual(values = c("#A9CD66", "#66CDB2", "#B766CD")) +
labs(x = "Category bias", y = "Language", col = "Country")Something surprising is the difference in the non-English language results for British vs local populations, where the British sample seems to be more extreme than the local sample.
Again, we test for bias differences statistically by using permutation tests. We do this in two steps: first, we consider only the English vocabularies from UK monolingual and bilingual children.
ox_observed_areas <- ox_lex_class_cleaned |>
mutate(area = map_dbl(data, poly_area)) |>
select(-data)
ox_observed_areas_wide_eng <- ox_observed_areas |>
filter(language == "English",
!(lang_group %in% c("Germany", "Netherlands", "Spain"))) |>
ungroup() |>
select(-other_lang) |>
pivot_wider(names_from = lang_group,
values_from = area) |>
mutate(across(-c("lex_class", "language"), \(x) {x - English}))
ox_observed_areas_diff_eng <- ox_observed_areas_wide_eng |>
pivot_longer(cols = -c("lex_class", "language"),
names_to = "comparison",
values_to = "area_diff")ox_permuted_areas_eng <- permute_areas(ox_lex_class_cleaned |> unnest(data) |>
filter(language == "English",
!(lang_group %in% c("Germany", "Netherlands", "Spain"))))
ox_permuted_areas_wide_eng <- ox_permuted_areas_eng |>
filter(language == "English",
!(lang_group %in% c("Germany", "Netherlands", "Spain"))) |>
ungroup() |>
pivot_wider(names_from = lang_group,
values_from = area) |>
mutate(across(-c("lex_class", "language", "sample"), \(x) {x - English}))
ox_permuted_areas_diff_eng <- ox_permuted_areas_wide_eng |>
pivot_longer(cols = -c("lex_class", "language", "sample"),
names_to = "comparison",
values_to = "area_diff")Densities indicate permuted samples; dashed line indicates observed value.
ggplot() +
geom_density(data = ox_permuted_areas_diff_eng |> filter(comparison != "English"),
mapping = aes(x = area_diff, col = lex_class)) +
geom_vline(data = ox_observed_areas_diff_eng |> filter(comparison != "English"),
mapping = aes(xintercept = area_diff, col = lex_class),
lty = "dashed") +
facet_grid(lex_class ~ comparison) +
theme_classic() +
theme(axis.line.x = element_line(linewidth = 0),
axis.line.y = element_line(linewidth = 0),
strip.background = element_blank(),
panel.border = element_rect(colour = "black", fill = NA, linewidth = 1)) +
scale_colour_manual(values = c("#4476AA", "#DECC77", "#CD6677")) +
labs(x = "Category bias", y = "Probability density", col = "Lexical category")We see almost no effect for function words, but possibly some effects for the nouns and predicates.
Calculate two-sided \(p\) values.
ox_permuted_areas_diff_eng |>
left_join(ox_observed_areas_diff_eng,
by = c("language", "lex_class", "comparison"),
suffix = c("_perm", "_obs")) |>
mutate(obs_beats_perm = abs(area_diff_perm) - abs(area_diff_obs) > 0) |>
group_by(language, lex_class, comparison) |>
summarise(obs_p = sum(obs_beats_perm, na.rm = TRUE) / n()) |>
filter(comparison != "English") |>
arrange(comparison) |>
kable()| language | lex_class | comparison | obs_p |
|---|---|---|---|
| English | function_words | Dutch | 0.533 |
| English | nouns | Dutch | 0.059 |
| English | predicates | Dutch | 0.043 |
| English | function_words | French | 0.434 |
| English | nouns | French | 0.195 |
| English | predicates | French | 0.237 |
| English | function_words | German | 0.712 |
| English | nouns | German | 0.558 |
| English | predicates | German | 0.620 |
| English | function_words | Italian | 0.632 |
| English | nouns | Italian | 0.393 |
| English | predicates | Italian | 0.421 |
| English | function_words | Polish | 0.993 |
| English | nouns | Polish | 0.172 |
| English | predicates | Polish | 0.070 |
| English | function_words | Portuguese | 0.621 |
| English | nouns | Portuguese | 0.480 |
| English | predicates | Portuguese | 0.553 |
| English | function_words | Spanish | 0.677 |
| English | nouns | Spanish | 0.270 |
| English | predicates | Spanish | 0.243 |
There is a marginal difference for English–Dutch bilinguals (vs English monolinguals) for predicates at \(\alpha = 0.05\) but I am inclined to not give that too much weight.
Now, we compare German, Dutch, and Spanish vocabularies for British vs local samples.
ox_observed_areas_wide_oth <- ox_observed_areas |>
mutate(language = case_when(
language == "Germany" ~ "German",
language == "Netherlands" ~ "Dutch",
language == "Spain" ~ "Spanish",
.default = language
),
lang_group = ifelse(lang_group %in% c("German", "Dutch", "Spanish"), "UK", lang_group)) |>
filter(language %in% c("German", "Dutch", "Spanish")) |>
ungroup() |>
select(-other_lang) |>
pivot_wider(names_from = lang_group,
values_from = area) |>
mutate(across(-c("lex_class", "language"), \(x) {x - UK}))
ox_observed_areas_diff_oth <- ox_observed_areas_wide_oth |>
pivot_longer(cols = -c("lex_class", "language"),
names_to = "comparison",
values_to = "area_diff") |>
filter(!is.na(area_diff), comparison != "UK")ox_permuted_areas_oth <- permute_areas(ox_lex_class_cleaned |> unnest(data) |>
filter(lang_group == language &
language %in% c("German", "Dutch", "Spanish",
"Germany", "Netherlands", "Spain")) |>
mutate(other_lang = case_when(
other_lang == "Germany" ~ "German",
other_lang == "Netherlands" ~ "Dutch",
other_lang == "Spain" ~ "Spanish",
.default = other_lang
),
language = case_when(
language == "Germany" ~ "German",
language == "Netherlands" ~ "Dutch",
language == "Spain" ~ "Spanish",
.default = language
)),
group = "other_lang")
ox_permuted_areas_wide_oth <- ox_permuted_areas_oth |>
filter(language %in% c("German", "Dutch", "Spanish")) |>
mutate(lang_group = ifelse(language == lang_group, "UK", lang_group)) |>
filter(!is.na(area)) |>
ungroup() |>
pivot_wider(names_from = lang_group,
values_from = area) |>
mutate(across(-c("lex_class", "language", "sample"), \(x) {x - UK}))
ox_permuted_areas_diff_oth <- ox_permuted_areas_wide_oth |>
pivot_longer(cols = -c("lex_class", "language", "sample"),
names_to = "comparison",
values_to = "area_diff") |>
filter(!is.na(area_diff), comparison != "UK")ggplot() +
geom_density(data = ox_permuted_areas_diff_oth |> filter(comparison != "UK"),
mapping = aes(x = area_diff, col = lex_class)) +
geom_vline(data = ox_observed_areas_diff_oth |> filter(comparison != "UK"),
mapping = aes(xintercept = area_diff, col = lex_class),
lty = "dashed") +
facet_grid(lex_class ~ comparison) +
theme_classic() +
theme(axis.line.x = element_line(linewidth = 0),
axis.line.y = element_line(linewidth = 0),
strip.background = element_blank(),
panel.border = element_rect(colour = "black", fill = NA, linewidth = 1)) +
scale_colour_manual(values = c("#4476AA", "#DECC77", "#CD6677")) +
labs(x = "Category bias", y = "Probability density", col = "Lexical category")Calculate two-sided \(p\) values.
ox_permuted_areas_diff_oth |>
left_join(ox_observed_areas_diff_oth,
by = c("language", "lex_class", "comparison"),
suffix = c("_perm", "_obs")) |>
mutate(obs_beats_perm = abs(area_diff_perm) - abs(area_diff_obs) > 0) |>
group_by(language, lex_class, comparison) |>
summarise(obs_p = sum(obs_beats_perm, na.rm = TRUE) / n()) |>
filter(comparison != "UK") |>
arrange(comparison) |>
kable()| language | lex_class | comparison | obs_p |
|---|---|---|---|
| German | function_words | Germany | 0.000 |
| German | nouns | Germany | 0.011 |
| German | predicates | Germany | 0.691 |
| Dutch | function_words | Netherlands | 0.165 |
| Dutch | nouns | Netherlands | 0.019 |
| Dutch | predicates | Netherlands | 0.051 |
| Spanish | function_words | Spain | 0.000 |
| Spanish | nouns | Spain | 0.001 |
| Spanish | predicates | Spain | 0.120 |
Here we see pretty clear differences for most British vs local comparisons.