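Embedding-based gender scores for occupations are correlated with human gender ratings of those occupations, but the correlation is no stronger when the embeddings and the ratings come from the same language than when they come from different languages.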

Embedding scores:

# Location of the per-language embedding gender scores for occupations.
EMBEDDING_SCORE_PATH <- "embedding_gender_bias_occupations.csv"

# Read the scores with explicit column names, then discard every row whose
# `occupation` field is the literal string "male_targets" or "female_targets".
# NOTE(review): those rows are presumably header/metadata lines embedded in
# the CSV -- confirm against the file.
embedding_scores <- EMBEDDING_SCORE_PATH %>%
  read_csv(
    col_names = c(
      "wiki_lang_code",
      "occupation",
      "female_targets",
      "male_targets",
      "male_score"
    )
  ) %>%
  filter(!occupation %in% c("male_targets", "female_targets"))

# Tabulate the number of embedding-score rows per Wikipedia language code.
embedding_scores %>%
  count(wiki_lang_code) %>%
  kable()
wiki_lang_code n
de 391
en 418
fr 409
it 408
no 306
# Histogram of the embedding male score, one panel per language.
ggplot(embedding_scores, aes(x = male_score, fill = wiki_lang_code)) +
  facet_wrap(~wiki_lang_code) +
  geom_histogram() +
  theme_classic() +
  # The fill colour only repeats the facet label, so the legend is hidden.
  theme(legend.position = "none")

Human ratings:

# Location of the human occupation gender norms.
HUMAN_RATINGS_PATH <- "mirsesky_norms_clean.csv"

# Load the human ratings, drop the Czech and Slovak rows, and replace the
# spelled-out `language` column with a two-letter `wiki_lang_code` matching
# the codes used in `embedding_scores`. Any language not listed in the
# `case_when()` below is mapped to NA.
human_ratings <- HUMAN_RATINGS_PATH %>%
  read_csv() %>%
  filter(!language %in% c("czech", "slovak")) %>%
  mutate(
    wiki_lang_code = case_when(
      language == "english" ~ "en",
      language == "norwegian" ~ "no",
      language == "french" ~ "fr",
      language == "german" ~ "de",
      language == "italian" ~ "it"
    )
  ) %>%
  select(-language)

# Tabulate the number of human-rating rows per Wikipedia language code.
human_ratings %>%
  count(wiki_lang_code) %>%
  kable()
wiki_lang_code n
de 422
en 422
fr 422
it 422
no 422
# Histogram of the mean human gender rating, one panel per language.
ggplot(human_ratings, aes(x = mean_gender_rating, fill = wiki_lang_code)) +
  facet_wrap(~wiki_lang_code) +
  geom_histogram() +
  theme_classic() +
  # The fill colour only repeats the facet label, so the legend is hidden.
  theme(legend.position = "none")

# Combine embedding scores and human ratings into one table, keeping rows that
# appear in either source. No `by` is supplied, so the join keys are whatever
# columns the two tables have in common.
# NOTE(review): presumably `wiki_lang_code` and `occupation` -- confirm against
# the columns of `human_ratings`.
all_scores <- full_join(
  select(embedding_scores, -c(female_targets, male_targets)),
  human_ratings
)

# Within-language relationship between the embedding male score (x) and the
# human rating (y), with a linear fit, one panel per language.
# The human rating is negated so the y axis reads as a "human male score".
# NOTE(review): this assumes the raw `mean_gender_rating` increases toward
# "female" -- confirm against the norms' coding.
all_scores %>%
  ggplot(aes(y = -mean_gender_rating, x = male_score)) +
  geom_point(aes(color = wiki_lang_code)) +
  xlab("embedding male score") +
  ylab("human male score") +
  geom_smooth(method = "lm") +
  # `scales` is spelled out in full; the previous `scale =` only worked through
  # R's partial argument matching.
  facet_wrap(~wiki_lang_code, scales = "free") +
  theme_classic() +
  theme(legend.position = "none")

# Cross-language table: every human rating is joined to the embedding scores,
# with the language code of each side kept in its own column
# (`human_wiki_lang_code` vs. `embedding_wiki_lang_code`). Because the language
# columns are renamed apart before the join, language is not a join key.
# NOTE(review): the implicit join key is presumably `occupation` -- confirm.
full_df <- human_ratings %>%
  rename(human_wiki_lang_code = wiki_lang_code) %>%
  left_join(
    embedding_scores %>%
      select(-female_targets, -male_targets) %>%
      rename(embedding_wiki_lang_code = wiki_lang_code)
  ) %>%
  # Drop human ratings that found no embedding score at all.
  filter(!is.na(embedding_wiki_lang_code)) %>%
  # Flag pairs where ratings and embeddings come from the same language.
  mutate(same_lang = human_wiki_lang_code == embedding_wiki_lang_code)

# For each human-rating language (panel), draw one linear fit per embedding
# language (colour). The line type distinguishes the same-language fit from
# the cross-language fits.
ggplot(
  full_df,
  aes(
    x = male_score,
    y = -mean_gender_rating,
    color = embedding_wiki_lang_code,
    group = embedding_wiki_lang_code
  )
) +
  facet_wrap(~human_wiki_lang_code) +
  geom_smooth(aes(linetype = same_lang), method = "lm") +
  xlab("embedding male score") +
  ylab("human male score") +
  theme_classic()

# Heatmap of the correlation between embedding male score and negated human
# rating for every (human language, embedding language) pair.
full_df %>%
  group_by(human_wiki_lang_code, embedding_wiki_lang_code) %>%
  summarise(
    cor = cor(
      male_score,
      -mean_gender_rating,
      # Use only rows where both values are present.
      use = "pairwise.complete.obs"
    )
  ) %>%
  ggplot(
    aes(
      x = human_wiki_lang_code,
      y = embedding_wiki_lang_code,
      fill = cor
    )
  ) +
  geom_tile() +
  scale_fill_gradient(high = "red", low = "white") +
  theme_classic()

# Per human language, summarise the pairwise correlations separately for the
# same-language pair and the cross-language pairs.
# `multi_boot_standard()` is defined outside this chunk; the plot that consumes
# this table reads `mean`, `ci_lower` and `ci_upper` from its result.
# NOTE(review): within each human language the `same_lang == TRUE` cell holds a
# single correlation, so its bootstrap interval is presumably degenerate --
# confirm this is intended.
same_lang_corrs <- full_df %>%
  group_by(human_wiki_lang_code, embedding_wiki_lang_code) %>%
  summarise(
    cor = cor(
      male_score,
      -mean_gender_rating,
      use = "pairwise.complete.obs"
    )
  ) %>%
  mutate(same_lang = human_wiki_lang_code == embedding_wiki_lang_code) %>%
  group_by(human_wiki_lang_code, same_lang) %>%
  multi_boot_standard(col = "cor")

# Mean correlation (point) with its interval (error bar) for each human
# language, split by whether the embeddings come from the same language.
# The grouping aesthetic is spelled `group`; the earlier `gorup` was not a
# recognised aesthetic, so that mapping was never applied.
ggplot(
  same_lang_corrs,
  aes(x = human_wiki_lang_code, group = same_lang, color = same_lang)
) +
  geom_point(aes(y = mean), size = 4) +
  geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper)) +
  theme_classic()