# Input file locations, built with here().
KIDCORPUS <- here("data/processed/books/tidy_full_corpus_all.csv")
TIDY_MODEL_SCORES <- here("data/processed/other/tidy_gender_scores.csv")

# Read the model score table and tag every row with a `model_type` derived
# from `path`: "kid" for "downsampled_kidbook", "wiki" for "wiki", and "adult"
# for any other path.
model_biases <- mutate(
  read_csv(TIDY_MODEL_SCORES),
  model_type = case_when(
    path == "downsampled_kidbook" ~ "kid",
    path == "wiki" ~ "wiki",
    TRUE ~ "adult"
  )
)


# Mean `male_score` for every (model_type, word) pair.
# Idea: consider restricting to words that occur in many models, e.g. starting
# from `word_counts <- count(model_biases, word)`.
model_scores <- model_biases %>%
  group_by(model_type, word) %>%
  summarize(male_score = mean(male_score)) %>%
  # summarize() peels off only the last grouping level, so the result would
  # otherwise stay grouped by model_type; return a plain tibble instead.
  ungroup()

# Row count per distinct `word` in the KIDCORPUS file (stored in column `n`).
kidbook_word_counts <- count(read_csv(KIDCORPUS), word)
GENDER_NORMS <- here("data/processed/words/gender_ratings_mean.csv")

# Keep only `word` and the `mean` column of the ratings file, exposing the
# latter under the name `human_gender_rating`.
gender_norms <- select(
  read_csv(GENDER_NORMS),
  word,
  human_gender_rating = mean
)

# Attach the per-model scores to the human ratings, one row per
# (word, model_type). The join key is spelled out so that it cannot silently
# change if either table later gains another shared column.
all_scores <- gender_norms %>%
  left_join(model_scores, by = "word") %>%
  # Rated words with no model score come back with NA model_type; drop them.
  filter(!is.na(model_type))

# Words that occupy exactly three rows of `all_scores`.
# NOTE(review): presumably 3 = one row for each model type (kid, wiki, adult),
# i.e. words scored by every model type -- confirm.
good_words <- all_scores %>%
  count(word) %>%
  filter(n == 3) %>%
  pull(word)

# Analysis table: restrict to `good_words`, keep words whose count `n` in
# `kidbook_word_counts` exceeds 50, and z-score `male_score` within each
# model type.
all_scores_tidy <- all_scores %>%
  filter(word %in% good_words) %>%
  left_join(kidbook_word_counts, by = "word") %>%
  # Words missing from kidbook_word_counts have NA `n` and are dropped here too.
  filter(n > 50) %>%
  group_by(model_type) %>%
  # scale() returns a one-column matrix carrying centering/scaling attributes;
  # as.numeric() keeps the same values as a plain numeric column.
  mutate(male_score = as.numeric(scale(male_score))) %>%
  # Do not leak the grouping into later steps; consumers that need it regroup.
  ungroup()

# Scatter of human rating against scaled model score, one panel per model
# type, each with a linear fit drawn over the points.
ggplot(
  all_scores_tidy,
  aes(male_score, human_gender_rating, color = model_type)
) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm") +
  facet_wrap(~ model_type) +
  theme_classic()

# Linear fits only, all model types overlaid in a single panel and
# distinguished by color.
ggplot(
  all_scores_tidy,
  aes(male_score, human_gender_rating, color = model_type)
) +
  geom_smooth(method = "lm") +
  theme_classic()

 # Correlation test (cor.test) between scaled model score and human rating,
 # run separately for each model type and printed as one table row per type.
 all_scores_tidy %>%
   group_by(model_type) %>%
   nest() %>%
   mutate(test = map(data, ~ tidy(cor.test(.x$male_score,
                                           .x$human_gender_rating)))) %>%
   select(-data) %>%
   # Name the list-column explicitly: a bare unnest() is deprecated in
   # tidyr >= 1.0, while unnest(test) works on both old and new tidyr.
   unnest(test) %>%
   kable()
#> | model_type | estimate | statistic | p.value | parameter | conf.low | conf.high | method | alternative |
#> |:-----------|---------:|----------:|--------:|----------:|---------:|----------:|:-------|:------------|
#> | adult | -0.0254469 | -0.5229155 | 0.6013075 | 422 | -0.1203885 | 0.0699560 | Pearson's product-moment correlation | two.sided |
#> | kid | -0.1069391 | -2.2094814 | 0.0276775 | 422 | -0.2001342 | -0.0118262 | Pearson's product-moment correlation | two.sided |
#> | wiki | -0.6456074 | -17.3667810 | 0.0000000 | 422 | -0.6979297 | -0.5864298 | Pearson's product-moment correlation | two.sided |