# Input file locations, built with here().
KIDCORPUS <- here("data/processed/books/tidy_full_corpus_all.csv")
TIDY_MODEL_SCORES <- here("data/processed/other/tidy_gender_scores.csv")

# Model scores, each row tagged with a coarse model type derived from `path`:
# "kid" for the downsampled kid-book model, "wiki" for wiki, and "adult" for
# every other path.
model_biases <- mutate(
  read_csv(TIDY_MODEL_SCORES),
  model_type = case_when(
    path == "downsampled_kidbook" ~ "kid",
    path == "wiki" ~ "wiki",
    TRUE ~ "adult"
  )
)


# Mean male_score for each word within each model type.
# TODO: consider filtering to words that are in lots of models, e.g. via
#   word_counts <- count(model_biases, word)
model_scores <- model_biases %>%
  group_by(model_type, word) %>%
  summarize(male_score = mean(male_score)) %>%
  # summarize() only peels off the last grouping variable (`word`); drop the
  # leftover `model_type` grouping so it cannot leak into later steps.
  ungroup()

# Number of rows per `word` in the KIDCORPUS file (stored in column `n`).
kidbook_word_counts <- count(read_csv(KIDCORPUS), word)
# Human gender ratings: keep only `word` and the `mean` rating column, the
# latter exposed under the name `human_gender_rating`.
GENDER_NORMS <- here("data/processed/words/gender_ratings_mean.csv")

gender_norms <- select(
  read_csv(GENDER_NORMS),
  word,
  human_gender_rating = mean
)

# Attach the per-model-type scores to every human-rated word (one row per
# word x model type). Rated words that have no model score at all come out of
# the left join with model_type = NA and are removed.
all_scores <- gender_norms %>%
  # Explicit key: avoids a silent natural join on whatever columns happen to
  # share a name, and the accompanying "Joining, by = ..." message.
  left_join(model_scores, by = "word") %>%
  filter(!is.na(model_type))

# Words that occur in exactly three rows of all_scores (presumably one row per
# model type -- confirm if the set of model types changes).
good_words <- with(count(all_scores, word), word[n == 3])

# Analysis table: rows of all_scores restricted to good_words and to words
# with more than 50 occurrences in the kid-book word counts, with male_score
# standardized (z-scored) separately within each model type.
all_scores_tidy <- all_scores %>%
  filter(word %in% good_words) %>%
  left_join(kidbook_word_counts, by = "word") %>%
  # `n` comes from kidbook_word_counts; words missing from it have n = NA and
  # are dropped by this filter too.
  filter(n > 50) %>%
  group_by(model_type) %>%
  # scale() returns a one-column matrix with attributes; as.numeric() keeps
  # male_score a plain numeric column instead of a matrix column.
  mutate(male_score = as.numeric(scale(male_score))) %>%
  ungroup()

# Human gender rating against model male_score: translucent points plus a
# linear fit, drawn in a separate facet for each model type.
ggplot(
  all_scores_tidy,
  aes(x = male_score, y = human_gender_rating, color = model_type)
) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm") +
  facet_wrap(~model_type) +
  theme_classic()

# Linear fits only, with all model types overlaid on a single panel so the
# fitted lines can be compared directly.
ggplot(
  all_scores_tidy,
  aes(x = male_score, y = human_gender_rating, color = model_type)
) +
  geom_smooth(method = "lm") +
  theme_classic()