true
# Read the score table from the CSV at INFILE.
INFILE <- "data/gender_bias_by_word_occupation_sub.csv"
distances <- read_csv(INFILE)

# Language codes that are labelled "few" in num_forms; any other language
# code is labelled "many".
few_forms <- c("fa", "es", "ja", "ko", "ms", "fi", "en", "tr")

# Drop rows without a male_score, then tag each remaining row with its
# num_forms label.
tidy_dists <- mutate(
  filter(distances, !is.na(male_score)),
  num_forms = if_else(language_code %in% few_forms, "few", "many")
)
# Density of male_score, one facet per word, with a dashed reference line
# at a score of 0.
# The reference line is given as a fixed `xintercept` argument rather than
# inside aes(): a constant wrapped in aes() is evaluated against the plot
# data, so the identical line would be redrawn once for every row.
tidy_dists %>%
  ggplot(aes(x = male_score)) +
  geom_density(fill = "grey") +
  geom_vline(xintercept = 0, linetype = 2) +
  facet_wrap(~word) +
  theme_classic()

# Point-and-interval plot of male_score summarised per word, with words
# ordered along the x axis by their summarised mean.
# NOTE(review): multi_boot_standard() is defined outside this file; the plot
# relies on it returning `mean`, `ci_lower` and `ci_upper` columns per group.
tidy_dists %>%
  group_by(word) %>%
  multi_boot_standard(col = "male_score") %>%
  ggplot(aes(x = reorder(word, mean), y = mean)) +
  geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
  # Fixed reference line at 0: passed as an argument, not via aes(), so it
  # is drawn once instead of once per row of the summarised data.
  geom_hline(yintercept = 0, linetype = 2) +
  ylab("Male Embedding Score") +
  xlab("Occupation Name") +
  theme_classic(base_size = 15) +
  # Rotate the word labels so they do not overlap along the x axis.
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 90, hjust = 1))

# Point-and-interval plot of male_score summarised per word within each
# num_forms group ("few" / "many"), coloured by group, with a linear fit
# drawn per group.
# NOTE(review): multi_boot_standard() is defined outside this file; the plot
# relies on it returning `mean`, `ci_lower` and `ci_upper` columns per group.
tidy_dists %>%
  group_by(num_forms, word) %>%
  multi_boot_standard(col = "male_score") %>%
  ggplot(aes(x = reorder(word, mean), y = mean,
             group = num_forms, color = num_forms)) +
  geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
  geom_smooth(method = "lm") +
  # Fixed reference line at 0: passed as an argument, not via aes(), so it
  # is drawn once instead of once per row of the summarised data.
  geom_hline(yintercept = 0, linetype = 2) +
  ylab("Male Embedding Score") +
  xlab("Occupation Name") +
  theme_classic(base_size = 15) +
  # Rotate the word labels so they do not overlap along the x axis.
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Nurses vs. Doctors

# Keep only the two occupations being compared.
doc_nurse <- filter(tidy_dists, word %in% c("doctor/physician", "nurse"))

# One black line per language_code joining its scores for the two words,
# with points coloured by word. The language code is printed in black at
# x = 2.05 for the "nurse" rows only.
ggplot(
  doc_nurse,
  aes(x = word, y = male_score, group = language_code, color = word)
) +
  geom_line(color = "black") +
  geom_point(size = 2.4) +
  geom_text(
    data = filter(doc_nurse, word == "nurse"),
    aes(x = 2.05, y = male_score, label = language_code),
    color = "black"
  ) +
  labs(x = "Occupation Name", y = "Male Embedding Score") +
  theme_classic(base_size = 15) +
  theme(legend.position = "none")