# Input paths, resolved relative to the project root with here().
KIDCORPUS <- here("data/processed/books/tidy_full_corpus_all.csv")
TIDY_MODEL_SCORES <- here("data/processed/other/tidy_gender_scores.csv")

# Per-model gender scores. Each row's `path` is collapsed into one of three
# model types: "kid" (downsampled_kidbook), "wiki", or "adult" (any other path).
model_biases <- read_csv(TIDY_MODEL_SCORES) %>%
  mutate(
    model_type = case_when(
      path == "downsampled_kidbook" ~ "kid",
      path == "wiki" ~ "wiki",
      TRUE ~ "adult"
    )
  )

# Mean male_score for every (model_type, word) pair.
# ungroup() removes the model_type grouping that summarize() would otherwise
# leave attached to the result.
model_scores <- model_biases %>%
  group_by(model_type, word) %>%
  summarize(male_score = mean(male_score)) %>%
  ungroup()

# TODO: consider filtering to words that are in lots of models, e.g. with
#   word_counts <- count(model_biases, word)
# Number of rows per `word` in the kid book corpus file; count() puts the
# tally in column `n`.
kidbook_word_counts <- count(read_csv(KIDCORPUS), word)
# Human gender ratings: keep only `word` and the `mean` column, exposing the
# latter under the name `human_gender_rating`.
GENDER_NORMS <- here("data/processed/words/gender_ratings_mean.csv")
gender_norms <- select(
  read_csv(GENDER_NORMS),
  word,
  human_gender_rating = mean
)
# Attach the model scores to the human norms. The join key is spelled out
# (`word` is the only column the two tables have in common) so the join cannot
# silently pick up extra keys and does not print a "Joining, by" message.
# Norm words with no model score come back with NA model_type and are dropped.
all_scores <- gender_norms %>%
  left_join(model_scores, by = "word") %>%
  filter(!is.na(model_type))

# Words with exactly three rows in all_scores. model_type takes three values
# (kid, wiki, adult), so this is meant to keep words scored by every model
# type -- assumes each word appears only once in gender_norms; TODO confirm.
good_words <- all_scores %>%
  count(word) %>%
  filter(n == 3) %>%
  pull(word)
# Analysis table: words present in good_words whose kid-book corpus count
# (`n`, joined from kidbook_word_counts) is above 50, with male_score
# standardized separately within each model_type.
all_scores_tidy <- all_scores %>%
  filter(word %in% good_words) %>%
  # Explicit key: `word` is the only column shared with kidbook_word_counts.
  left_join(kidbook_word_counts, by = "word") %>%
  # Words missing from the corpus have NA `n` and are dropped here as well.
  filter(n > 50) %>%
  group_by(model_type) %>%
  # scale() returns a one-column matrix; as.numeric() keeps male_score a plain
  # numeric vector instead of a matrix column inside the data frame.
  mutate(male_score = as.numeric(scale(male_score))) %>%
  # Do not leave the result grouped; later steps that need groups set them.
  ungroup()
# Scatter of human gender rating against standardized model male_score, one
# panel per model type, each with a linear fit on top of the points.
ggplot(
  all_scores_tidy,
  aes(x = male_score, y = human_gender_rating, color = model_type)
) +
  geom_point(alpha = .2) +
  geom_smooth(method = "lm") +
  facet_wrap(~model_type) +
  theme_classic()

# Same relationship with only the linear fits drawn, all model types sharing
# one panel so the lines can be compared directly.
ggplot(
  data = all_scores_tidy,
  mapping = aes(
    x = male_score,
    y = human_gender_rating,
    color = model_type
  )
) +
  geom_smooth(method = "lm") +
  theme_classic()

# Correlation between model male_score and human gender rating, computed
# separately for each model type with cor.test() and printed as one table row
# per model type.
all_scores_tidy %>%
  group_by(model_type) %>%
  nest() %>%
  mutate(
    test = map(
      data,
      function(d) tidy(cor.test(d$male_score, d$human_gender_rating))
    )
  ) %>%
  select(-data) %>%
  # Name the list-column explicitly: calling unnest() with no columns is
  # deprecated in tidyr >= 1.0 and raises a warning.
  unnest(test) %>%
  kable()
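| model_type | estimate | statistic | p.value | parameter | conf.low | conf.high | method | alternative |
|:-----------|---------:|----------:|--------:|----------:|---------:|----------:|:-------|:------------|
| adult | -0.0254469 | -0.5229155 | 0.6013075 | 422 | -0.1203885 | 0.0699560 | Pearson’s product-moment correlation | two.sided |
| kid | -0.1069391 | -2.2094814 | 0.0276775 | 422 | -0.2001342 | -0.0118262 | Pearson’s product-moment correlation | two.sided |
| wiki | -0.6456074 | -17.3667810 | 0.0000000 | 422 | -0.6979297 | -0.5864298 | Pearson’s product-moment correlation | two.sided |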