# ---- Load inputs ----

# Behavioral (IAT) measures per language; keep only the columns used below and
# expose the wiki language code under the shared key name `language_code`.
BEHAVIORAL_IAT_PATH <- here("data/study0/processed/by_language_df.csv")
iat_behavioral_es <- read_csv(BEHAVIORAL_IAT_PATH) %>%
  select(
    language_code = wiki_language_code,
    median_country_age,
    es_iat_sex_age_order_explicit_resid,
    es_iat_sex_age_order_implicit_resid,
    per_women_stem_2012_2017,
    n_participants
  )

# Language IAT effect sizes from the "wiki native" exploratory study; the
# column is renamed so it does not collide with study 1b's `lang_es_wiki`.
LANG_PATH <- here("exploratory_studies/16_wiki_native/data/06_iat_es/iat_es_lang.csv")
lang_iat_native <- read_csv(LANG_PATH) %>%
  rename(lang_es_wiki_native = "lang_es_wiki")

# study 1b
LANG_IAT_PATH <- here("data/study1b/iat_es_lang.csv")
iat_lang_es <- read_csv(LANG_IAT_PATH)

# One (language_code, family) row per distinct pair.
LANG_FAMILY_PATH <- here("data/study0/processed/top_lang_by_country_ethnologue.csv")
lang_family <- read_csv(LANG_FAMILY_PATH) %>%
  select(language_code = wiki_language_code, family) %>%
  distinct()

# study 2 measures (included here for making single grand table)
BY_LANGUAGE_OCCUPATION_PATH <- here("data/study2/occupation_gender_score_by_language.csv")
occupation_semantics <- read_csv(BY_LANGUAGE_OCCUPATION_PATH)

OCCUPATION_OVERLAP_PATH <- here("data/study2/occupation_gender_scores.csv")
by_lang_scores <- read_csv(OCCUPATION_OVERLAP_PATH)

# Language names: keep the first row seen for each language code.
LANGUAGE_NAME_PATH <- here("data/study0/processed/lang_name_to_wiki_iso.csv")
language_names <- read_csv(LANGUAGE_NAME_PATH) %>%
  distinct(wiki_language_code, .keep_all = TRUE) %>%
  rename(language_code = "wiki_language_code")
# combine lang and behavioral and family info
# Left joins throughout, so every row of iat_behavioral_es is retained. Joins
# without an explicit `by` fall back to the columns the two tables share.
all_es <- iat_behavioral_es %>%
  left_join(iat_lang_es, by = "language_code") %>%
  left_join(lang_iat_native, by = "language_code") %>%
  left_join(lang_family) %>%
  left_join(occupation_semantics) %>% # include study 2 measure here so can make table
  left_join(by_lang_scores) %>%
  left_join(language_names) %>%
  select(
    language_code,
    language_name,
    family,
    n_participants,
    es_iat_sex_age_order_implicit_resid,
    es_iat_sex_age_order_explicit_resid,
    median_country_age,
    per_women_stem_2012_2017,
    lang_es_sub,
    lang_es_wiki,
    lang_es_wiki_native,
    mean_prop_distinct_occs,
    subt_occu_semantics_fm,
    wiki_occu_semantics_fm
  )
# remove exclusions and fix croatian to be mean of hr and sr (only in wiki)
EXCLUSIONS_PATH <- here("data/study1b/language_exclusions.csv")
exclusions <- read_csv(EXCLUSIONS_PATH)

# Path is given relative to the project root (no leading "/"), matching
# LANG_PATH above. A leading slash makes the component look absolute, which
# newer here/rprojroot versions return unchanged instead of anchoring it at
# the project root.
EXCLUSIONS_WIKI_NATIVE_PATH <- here("exploratory_studies/16_wiki_native/data/06_iat_es/language_exclusions.csv")
exclusions_wiki_native <- read_csv(EXCLUSIONS_WIKI_NATIVE_PATH)

# Replacement values for Croatian: mean of the hr and sr effect sizes.
hr_new_wiki <- mean(c(
  filter(iat_lang_es, language_code == "hr") %>% pull(lang_es_wiki),
  filter(iat_lang_es, language_code == "sr") %>% pull(lang_es_wiki)
))
hr_new_wiki_native <- mean(c(
  filter(lang_iat_native, language_code == "hr") %>% pull(lang_es_wiki_native),
  filter(lang_iat_native, language_code == "sr") %>% pull(lang_es_wiki_native)
))

all_es_tidy <- all_es %>%
  # The exclude_* flag columns are expected to come from these two joins
  # (which file supplies which flag is not visible here).
  left_join(exclusions) %>%
  left_join(exclusions_wiki_native) %>%
  # Blank out a measure where its exclusion flag is TRUE; rows whose flag is
  # FALSE or missing fall through to the original value.
  mutate(
    lang_es_wiki = case_when(
      exclude_wiki == TRUE ~ NA_real_,
      TRUE ~ lang_es_wiki
    ),
    lang_es_sub = case_when(
      exclude_sub == TRUE ~ NA_real_,
      TRUE ~ lang_es_sub
    ),
    lang_es_wiki_native = case_when(
      exclude_wiki_native == TRUE ~ NA_real_,
      TRUE ~ lang_es_wiki_native
    )
  ) %>%
  select(-exclude_wiki, -exclude_sub, -exclude_wiki_native) %>%
  # Overwrite the Croatian row with the hr/sr means computed above.
  mutate(
    lang_es_wiki = case_when(
      language_code == "hr" ~ hr_new_wiki,
      TRUE ~ lang_es_wiki
    ),
    lang_es_wiki_native = case_when(
      language_code == "hr" ~ hr_new_wiki_native,
      TRUE ~ lang_es_wiki_native
    ),
    lang_es_sub = case_when(
      language_code == "hr" ~ NA_real_, # sr is missing from sub
      TRUE ~ lang_es_sub
    )
  ) %>%
  # Original note: exclude proportion overlap measure (study 2) in zulu.
  # NOTE(review): this removes the entire "zu" row, not just that one measure.
  filter(language_code != "zu")
# corr of lang, behavioral, etc.
# Select the measures to correlate and give them their display names. The
# column order here fixes the row/column order of the correlation matrices.
all_corr_vars <- all_es_tidy %>%
  select(
    `Language IAT (Subtitle)` = lang_es_sub,
    `Language IAT (Wikipedia)` = lang_es_wiki,
    `Language IAT (Wikipedia, native)` = lang_es_wiki_native,
    `Occupation Bias (Subtitle)` = subt_occu_semantics_fm,
    `Occupation Bias (Wikipedia)` = wiki_occu_semantics_fm,
    `Prop. Gendered Occupation Labels` = mean_prop_distinct_occs,
    `Residualized Explicit Bias` = es_iat_sex_age_order_explicit_resid,
    `Residualized Implicit Bias (IAT)` = es_iat_sex_age_order_implicit_resid,
    `Percent Women in STEM` = per_women_stem_2012_2017,
    `Median Country Age` = median_country_age
  )

# Simple correlations: run the test once and reuse it for both r and p.
simple_corr_test <- psych::corr.test(all_corr_vars, adjust = "none")
simple_corr <- simple_corr_test$r %>%
  as_tibble(rownames = "rowname") %>%
  gather("var2", "simple_r", -rowname)
simple_corr_p <- simple_corr_test$p %>%
  as_tibble(rownames = "rowname") %>%
  gather("var2", "simple_p", -rowname)

# Partial correlations controlling for median country age. The x set is every
# column except the control, derived from the column names so that adding or
# reordering measures cannot silently drop one (a hard-coded 1:8 skipped the
# ninth column, "Percent Women in STEM").
PARTIAL_CONTROL_VAR <- "Median Country Age"
partial_x_idx <- which(names(all_corr_vars) != PARTIAL_CONTROL_VAR)
partial_psych_obj <- psych::partial.r(
  data = all_corr_vars,
  x = partial_x_idx,
  y = PARTIAL_CONTROL_VAR
)

# n is reduced by one, presumably for the single partialled-out variable.
partial_corr_test <- psych::corr.p(
  partial_psych_obj,
  n = nrow(all_corr_vars) - 1,
  adjust = "none"
)
partial_corr <- partial_corr_test$r %>%
  psych_to_mat() %>%
  as_tibble(rownames = "rowname") %>%
  gather("var2", "partial_r", -rowname)
partial_corr_p <- partial_corr_test$p %>%
  psych_to_mat() %>%
  as_tibble(rownames = "rowname") %>%
  gather("var2", "partial_p", -rowname)

# One row per (rowname, var2) pair with simple and partial r and p; pairs with
# no partial correlation (those involving the control variable) get NA.
corr_join_keys <- c("rowname", "var2")
tidy_corrs <- simple_corr %>%
  left_join(simple_corr_p, by = corr_join_keys) %>%
  left_join(partial_corr, by = corr_join_keys) %>%
  left_join(partial_corr_p, by = corr_join_keys)
# Annotation data for the plot below: for each language IAT measure, the simple
# correlation with residualized implicit bias formatted as "r = ...", together
# with the fixed x/y position where the label is drawn.
corr_text_df <- tidy_corrs %>%
  filter(
    rowname == "Residualized Implicit Bias (IAT)" &
      var2 %in% c(
        "Language IAT (Subtitle)",
        "Language IAT (Wikipedia)",
        "Language IAT (Wikipedia, native)"
      )
  ) %>%
  transmute(
    # Facet labels must match the `model` levels used by the plot.
    model = fct_recode(
      var2,
      "Subtitle Embeddings" = "Language IAT (Subtitle)",
      "Wikipedia Embeddings" = "Language IAT (Wikipedia)",
      "Wikipedia Embeddings, native" = "Language IAT (Wikipedia, native)"
    ),
    simple_r = paste0("r = ", f_num(simple_r, 2)),
    x = .85,
    y = -.07
  )
# plot lang vs behavioral
# One facet per language-embedding model: linguistic effect size (x) against
# residualized implicit bias (y), points sized by number of participants.
all_es_tidy %>%
  select(
    language_name, lang_es_sub, lang_es_wiki, lang_es_wiki_native,
    es_iat_sex_age_order_implicit_resid, n_participants
  ) %>%
  # Stack the three language measures into (model, lang_es) long form.
  gather(
    "model", "lang_es",
    -language_name, -es_iat_sex_age_order_implicit_resid, -n_participants
  ) %>%
  mutate(model = fct_recode(
    model,
    "Subtitle Embeddings" = "lang_es_sub",
    "Wikipedia Embeddings" = "lang_es_wiki",
    "Wikipedia Embeddings, native" = "lang_es_wiki_native"
  )) %>%
  ggplot(aes(
    x = lang_es,
    y = es_iat_sex_age_order_implicit_resid,
    size = n_participants
  )) +
  facet_wrap(. ~ model) +
  geom_smooth(method = "lm", alpha = .1, size = .9) +
  geom_point(alpha = .2) +
  ggrepel::geom_text_repel(
    aes(label = language_name),
    size = 2, box.padding = 0.1
  ) +
  # `labels` is spelled out in full rather than relying on partial argument
  # matching of `label`.
  scale_x_continuous(
    breaks = c(-.3, -0, .5, 1),
    labels = c("\n(male-\nfamily)", "0", ".5", "1\n(male-\ncareer)"),
    limits = c(-.35, 1.1)
  ) +
  scale_y_continuous(
    breaks = c(-.075, -.05, -.025, 0, .025, .05),
    labels = c(
      "-.075\n(male-\nfamily)", "-.05", "-.025", "0", ".025",
      ".05\n(male-\ncareer)"
    ),
    limits = c(-.08, .06)
  ) +
  scale_size(trans = "log10", labels = scales::comma, name = "N participants") +
  # Per-facet correlation label, positioned by the x/y columns of corr_text_df.
  geom_text(
    data = corr_text_df,
    aes(label = simple_r, x = x, y = y),
    color = "red", size = 4
  ) +
  ggtitle("Psychological and Linguistic Gender Biases") +
  ylab("Implicit Gender Bias (residualized)\n") +
  xlab("\nLinguistic Gender Bias\n (effect size)") +
  theme_classic() +
  theme(legend.position = "bottom")

# Format every off-diagonal correlation for printing: r to two decimals,
# followed by a significance marker (** p < .01, * p < .05, + p < .1).
print_tidy_corrs <- tidy_corrs %>%
  filter(rowname != var2) %>% # drop each variable paired with itself
  mutate_at(
    vars(simple_r, partial_r),
    ~ format(round(., 2), nsmall = 2) %>% f_num(., digits = 2)
  ) %>%
  mutate_at(
    vars(simple_p, partial_p),
    ~ case_when(
      . < .01 ~ "**",
      . < .05 ~ "*",
      . < .1 ~ "+",
      TRUE ~ ""
    )
  ) %>%
  mutate(
    # Pairs without a partial correlation print as an empty cell.
    r_partial_print = case_when(
      !is.na(partial_r) ~ paste0(partial_r, partial_p),
      TRUE ~ ""
    ),
    r_simple_print = paste0(simple_r, simple_p)
  ) %>%
  select(rowname, var2, r_simple_print, r_partial_print)

# Wide simple-correlation panel. `~` lambdas replace the deprecated funs().
tidy_corrs_to_print_simple <- print_tidy_corrs %>%
  select(-r_partial_print) %>%
  spread(var2, r_simple_print) %>%
  mutate_all(~ replace_na(., "")) %>%
  select(
    "rowname",
    "Language IAT (Subtitle)",
    "Language IAT (Wikipedia, native)",
    "Language IAT (Wikipedia)",
    "Median Country Age",
    "Occupation Bias (Subtitle)",
    "Occupation Bias (Wikipedia)",
    "Percent Women in STEM",
    "Prop. Gendered Occupation Labels",
    "Residualized Explicit Bias",
    "Residualized Implicit Bias (IAT)"
  ) %>%
  rename(" " = "rowname")

# Wide partial-correlation panel with the same columns as the simple panel.
# "Percent Women in STEM" is kept so that bind_rows() below does not have to
# fill a missing column with NA, which would render as literal "NA" cells.
tidy_corrs_to_print_partial <- print_tidy_corrs %>%
  select(-r_simple_print) %>%
  spread(var2, r_partial_print) %>%
  mutate_all(~ replace_na(., "")) %>%
  select(
    "rowname",
    "Language IAT (Subtitle)",
    "Language IAT (Wikipedia, native)",
    "Language IAT (Wikipedia)",
    "Median Country Age",
    "Occupation Bias (Subtitle)",
    "Occupation Bias (Wikipedia)",
    "Percent Women in STEM",
    "Prop. Gendered Occupation Labels",
    "Residualized Explicit Bias",
    "Residualized Implicit Bias (IAT)"
  ) %>%
  rename(" " = "rowname") %>%
  # The control variable has no partial correlation with itself or others.
  mutate("Median Country Age" = " ")

# Disabled manual row reordering (the indices cover only nine rows):
#tidy_corrs_to_print_reordered_simple <- tidy_corrs_to_print_simple[c(8,9,6,1,2,7,4,5,3),]
#tidy_corrs_to_print_reordered_partial <- tidy_corrs_to_print_partial[c(8,9,6,1,2,7,4,5,3),]

# Simple panel on top, partial panel underneath.
tidy_corrs_to_print_reordered <- bind_rows(
  tidy_corrs_to_print_simple,
  tidy_corrs_to_print_partial
)
# Render the combined correlation table. Row-group boundaries are derived from
# the panel sizes instead of hard-coded numbers, so the "Partial Correlations"
# group always spans every row of the bottom panel.
n_simple_rows <- nrow(tidy_corrs_to_print_simple)
n_table_rows <- nrow(tidy_corrs_to_print_reordered)

kable(
  tidy_corrs_to_print_reordered,
  booktabs = TRUE,
  escape = FALSE,
  caption = "Correlation (Pearson's r) for all measures in Study 1 and 2 at the level of languages. Top panel shows simple correlations; bottom panel shows partial correlations controlling for median country age. Single asterisks indicate p < .05 and double asterisks indicate p < .01. The + symbol indicates a marginally significant p-value, p < .1.",
  align = "l",
  longtable = TRUE # this makes it so the table doesn't go at end
) %>%
  kable_styling(full_width = FALSE, font_size = 10) %>%
  kableExtra::group_rows(
    group_label = "Simple Correlations",
    start_row = 1,
    end_row = n_simple_rows
  ) %>%
  kableExtra::group_rows(
    group_label = "Partial Correlations",
    start_row = n_simple_rows + 1,
    end_row = n_table_rows
  ) %>%
  row_spec(0, angle = 90) # rotate the header row
Correlation (Pearson’s r) for all measures in Study 1 and 2 at the level of languages. Top panel shows simple correlations; bottom panel shows partial correlations controlling for median country age. Single asterisks indicate p < .05 and double asterisks indicate p < .01. The + symbol indicates a marginally significant p-value, p < .1.
|
|
Language IAT (Subtitle)
|
Language IAT (Wikipedia, native)
|
Language IAT (Wikipedia)
|
Median Country Age
|
Occupation Bias (Subtitle)
|
Occupation Bias (Wikipedia)
|
Percent Women in STEM
|
Prop. Gendered Occupation Labels
|
Residualized Explicit Bias
|
Residualized Implicit Bias (IAT)
|
|
Simple Correlations
|
|
Language IAT (Subtitle)
|
|
.49*
|
.51*
|
.31
|
.42+
|
.40+
|
-.55*
|
.28
|
-.08
|
.50*
|
|
Language IAT (Wikipedia, native)
|
.49*
|
|
.90**
|
.48*
|
.25
|
.46*
|
-.39+
|
.23
|
.21
|
.60**
|
|
Language IAT (Wikipedia)
|
.51*
|
.90**
|
|
.25
|
.28
|
.44*
|
-.19
|
.18
|
.34+
|
.48*
|
|
Median Country Age
|
.31
|
.48*
|
.25
|
|
.36
|
.34+
|
-.42+
|
.35+
|
-.07
|
.61**
|
|
Occupation Bias (Subtitle)
|
.42+
|
.25
|
.28
|
.36
|
|
.80**
|
-.39
|
.75**
|
.28
|
.64**
|
|
Occupation Bias (Wikipedia)
|
.40+
|
.46*
|
.44*
|
.34+
|
.80**
|
|
-.32
|
.70**
|
.29
|
.59**
|
|
Percent Women in STEM
|
-.55*
|
-.39+
|
-.19
|
-.42+
|
-.39
|
-.32
|
|
-.35
|
.18
|
-.53*
|
|
Prop. Gendered Occupation Labels
|
.28
|
.23
|
.18
|
.35+
|
.75**
|
.70**
|
-.35
|
|
.11
|
.57**
|
|
Residualized Explicit Bias
|
-.08
|
.21
|
.34+
|
-.07
|
.28
|
.29
|
.18
|
.11
|
|
.18
|
|
Residualized Implicit Bias (IAT)
|
.50*
|
.60**
|
.48*
|
.61**
|
.64**
|
.59**
|
-.53*
|
.57**
|
.18
|
|
|
Partial Correlations
|
|
Language IAT (Subtitle)
|
|
.41*
|
.47*
|
|
.35+
|
.33
|
NA
|
.20
|
-.06
|
.42*
|
|
Language IAT (Wikipedia, native)
|
.41*
|
|
.92**
|
|
.09
|
.36+
|
NA
|
.07
|
.28
|
.44*
|
|
Language IAT (Wikipedia)
|
.47*
|
.92**
|
|
|
.21
|
.39+
|
NA
|
.11
|
.38+
|
.43*
|
|
Median Country Age
|
|
|
|
|
|
|
NA
|
|
|
|
|
Occupation Bias (Subtitle)
|
.35+
|
.09
|
.21
|
|
|
.77**
|
NA
|
.71**
|
.33
|
.57**
|
|
Occupation Bias (Wikipedia)
|
.33
|
.36+
|
.39+
|
|
.77**
|
|
NA
|
.66**
|
.34
|
.52**
|
|
Percent Women in STEM
|
|
|
|
|
|
|
NA
|
|
|
|
|
Prop. Gendered Occupation Labels
|
.20
|
.07
|
.11
|
|
.71**
|
.66**
|
NA
|
|
.14
|
.48*
|
|
Residualized Explicit Bias
|
-.06
|
.28
|
.38+
|
|
.33
|
.34
|
NA
|
.14
|
|
.28
|
|
Residualized Implicit Bias (IAT)
|
.42*
|
.44*
|
.43*
|
|
.57**
|
.52**
|
NA
|
.48*
|
.28
|
|