Word gender bias

# Input files, located with here().
KIDCORPUS <- here("data/processed/books/tidy_full_corpus_all.csv")
TIDY_MODEL_SCORES <- here("data/processed/other/tidy_gender_scores.csv")

# Load the per-model gender scores and tag every row with a coarse model
# family derived from `path`: "kid" for the downsampled kid-book model, "wiki"
# for the wiki model, and "adult" for any other value.
model_biases <- mutate(
  read_csv(TIDY_MODEL_SCORES),
  model_type = case_when(
    path == "downsampled_kidbook" ~ "kid",
    path == "wiki" ~ "wiki",
    TRUE ~ "adult"
  )
)


# Average male_score over all rows that share a (model_type, word) pair, so
# each word has a single score within each model type.
# TODO: consider keeping only words that appear in many models, e.g. based on
# count(model_biases, word).
model_scores <- model_biases %>%
  group_by(model_type, word) %>%
  summarize(male_score = mean(male_score)) %>%
  # summarize() only peels off the innermost grouping level (word); drop the
  # remaining model_type grouping so later verbs are not silently grouped.
  ungroup()

# Tally how many rows of the kid-book corpus each word occupies; count()
# stores the tally in column `n`.
# NOTE(review): presumably the corpus has one row per token, which would make
# `n` a word frequency -- confirm against the corpus file.
kidbook_word_counts <- count(read_csv(KIDCORPUS), word)

# File holding the human gender ratings.
GENDER_NORMS <- here("data/processed/words/gender_ratings_mean.csv")

# Keep only the word and its `mean` column, exposing the latter under the
# more descriptive name human_gender_rating.
gender_norms <- select(
  read_csv(GENDER_NORMS),
  word,
  human_gender_rating = mean
)

# Attach the model scores to every human-rated word, then discard rated words
# that no model scored (their model_type is NA after the left join).
all_scores <- gender_norms %>%
  # Join on `word` explicitly rather than relying on the implicit natural
  # join, which prints a message and would silently pick up any other column
  # the two tables came to share.
  left_join(model_scores, by = "word") %>%
  filter(!is.na(model_type))

# Words that occupy exactly three rows of all_scores.
# NOTE(review): three matches the number of model_type labels (kid, wiki,
# adult), so this presumably keeps words scored by all three model types --
# confirm that each word appears at most once per model type.
good_words <- all_scores %>%
  count(word) %>%
  filter(n == 3) %>%
  pull(word)

# Final analysis table: restrict to the words in good_words, keep only words
# whose kid-book count `n` exceeds 50, and z-score male_score separately
# within each model type.
all_scores_tidy <- all_scores %>%
  filter(word %in% good_words) %>%
  # Explicit key instead of an implicit natural join.
  left_join(kidbook_word_counts, by = "word") %>%
  # Words absent from the kid-book counts have n = NA and are dropped here
  # too, because filter() discards rows whose condition is NA.
  filter(n > 50) %>%
  group_by(model_type) %>%
  # scale() returns a one-column matrix carrying centering/scaling
  # attributes; as.numeric() turns it back into a plain numeric vector so
  # male_score stays an ordinary column.
  mutate(male_score = as.numeric(scale(male_score))) %>%
  ungroup()

# Scatter plot of human gender rating against the scaled model score, one
# facet per model type, each with a linear fit drawn over the points.
ggplot(
  all_scores_tidy,
  aes(x = male_score, y = human_gender_rating, color = model_type)
) +
  geom_point(alpha = .2) +
  geom_smooth(method = "lm") +
  facet_wrap(~model_type) +
  theme_classic()

# Same relationship without the points or facets: the linear fits for all
# model types are drawn in a single panel so they can be compared directly.
ggplot(
  all_scores_tidy,
  aes(x = male_score, y = human_gender_rating, color = model_type)
) +
  geom_smooth(method = "lm") +
  theme_classic()

 # Correlation between model score and human gender rating, computed
 # separately for each model type with cor.test() and printed as a table with
 # one row per model type.
 all_scores_tidy %>%
   group_by(model_type) %>%
   nest() %>%
   mutate(test = map(data, ~ tidy(cor.test(.x$male_score,
                                           .x$human_gender_rating)))) %>%
   select(-data) %>%
   # Name the list-column to expand: calling unnest() with no columns is
   # deprecated since tidyr 1.0.0 and emits a warning.
   unnest(test) %>%
   kable()

model_type	estimate	statistic	p.value	parameter	conf.low	conf.high	method	alternative
adult	-0.0254469	-0.5229155	0.6013075	422	-0.1203885	0.0699560	Pearson’s product-moment correlation	two.sided
kid	-0.1069391	-2.2094814	0.0276775	422	-0.2001342	-0.0118262	Pearson’s product-moment correlation	two.sided
wiki	-0.6456074	-17.3667810	0.0000000	422	-0.6979297	-0.5864298	Pearson’s product-moment correlation	two.sided

Word gender bias

Molly Lewis

2019-05-30