# Directory holding the per-model gender-rating files.
MODELS <- here("exploratory_studies/15_udbank/gender_ratings/")

# List the rating files once, so that loading and the word-coverage check
# below are guaranteed to refer to the same set of files.
model_files <- list.files(MODELS, full.names = TRUE)

# Stack every rating file and recover each model's training settings from
# substrings of its `model` identifier. Settings whose pattern is absent from
# the identifier become NA (or "other" for `type`).
all_model_ratings <- map_df(model_files, fread) %>%
  mutate(
    # NOTE(review): identifiers containing "conll2018" are labelled
    # "conll2017" -- confirm that this mismatch is intentional.
    corpus = case_when(
      str_detect(model, "conll2018") ~ "conll2017",
      str_detect(model, "ud_corpus") ~ "ud_corpus"
    ),
    # "15ep" is tested before "5ep" because "5ep" is a substring of "15ep".
    epochs = case_when(
      str_detect(model, "15ep") ~ 15,
      str_detect(model, "30ep") ~ 30,
      str_detect(model, "50ep") ~ 50,
      str_detect(model, "5ep") ~ 5
    ),
    # NOTE(review): unlike the other sizes, "100" is matched without the
    # trailing "d", so it matches any identifier containing "100" -- verify
    # against the actual model names.
    size = case_when(
      str_detect(model, "300d") ~ "300",
      str_detect(model, "200d") ~ "200",
      str_detect(model, "100") ~ "100",
      str_detect(model, "150d") ~ "150"
    ),
    # "noSubword1" is tested before its prefix "noSubword".
    type = case_when(
      str_detect(model, "noSubword1") ~ "noSubword1",
      str_detect(model, "noSubword") ~ "noSubword",
      str_detect(model, "10neg") ~ "10neg",
      TRUE ~ "other"
    )
  ) %>%
  select(corpus, epochs, size, type, word, male_score)

# Words whose total row count equals the number of rating files; presumably
# each file contributes at most one row per word, so these are the words rated
# by every model -- TODO confirm.
target_words <- count(all_model_ratings, word) %>%
  filter(n == length(model_files)) %>%
  pull(word)
# Path to the human norms file.
GENDER_NORMS <- here("data/study1a/raw/GlasgowNorms.csv")

# Human gender ratings, one row per distinct (word, rating) pair.
# Each entry in `word` is reduced to the text before its first space and
# lower-cased. The split is vectorised over the whole column, so no row-wise
# evaluation is needed.
# NOTE(review): entries that share a first token but differ in rating remain
# as separate rows after `distinct()` -- confirm that this is intended.
glasgow_norms <- read_csv(GENDER_NORMS) %>%
  select(word, human_gender_rating = GEND_M) %>%
  mutate(word = tolower(str_split_fixed(word, " ", 2)[, 1])) %>%
  distinct()
# Attach the human rating to every model rating, then keep only the target
# words. The join key is spelled out rather than inferred: `word` is the only
# column the two tables share.
all_ratings <- all_model_ratings %>%
  left_join(glasgow_norms, by = "word") %>%
  filter(word %in% target_words)
# Model score against human rating, with a linear fit per panel.
# Panels: one row per epochs/size/type combination, one column per corpus.
ggplot(
  data = all_ratings,
  mapping = aes(x = human_gender_rating, y = male_score)
) +
  geom_point(alpha = .2, size = .2) +
  geom_smooth(method = "lm") +
  facet_grid(epochs + size + type ~ corpus) +
  theme_bw()
# Correlation test between human ratings and model scores, computed separately
# for every corpus/epochs/size/type combination and tidied to one row each.
cor_values <- all_ratings %>%
  group_by(corpus, epochs, size, type) %>%
  nest() %>%
  mutate(
    temp = map(
      data,
      ~ tidy(cor.test(.x$human_gender_rating, .x$male_score))
    )
  ) %>%
  select(-data) %>%
  # Name the list-column explicitly: calling `unnest()` without a column is
  # deprecated in tidyr >= 1.0.0.
  unnest(temp) %>%
  # Newer tidyr keeps the grouping through `nest()`; drop it so the result is
  # a plain table.
  ungroup()

kable(cor_values)
| corpus | epochs | size | type | estimate | statistic | p.value | parameter | conf.low | conf.high | method | alternative |
|---|---|---|---|---|---|---|---|---|---|---|---|
| conll2017 | NA | 100 | other | 0.5054474 | 30.880281 | 0.00e+00 | 2779 | 0.4772443 | 0.5326104 | Pearson’s product-moment correlation | two.sided |
| ud_corpus | 15 | 150 | noSubword | 0.1371568 | 7.299370 | 0.00e+00 | 2779 | 0.1005000 | 0.1734416 | Pearson’s product-moment correlation | two.sided |
| ud_corpus | 15 | 150 | other | 0.0989046 | 5.239567 | 2.00e-07 | 2779 | 0.0619633 | 0.1355753 | Pearson’s product-moment correlation | two.sided |
| ud_corpus | 15 | 200 | 10neg | 0.1313567 | 6.985152 | 0.00e+00 | 2779 | 0.0946497 | 0.1677069 | Pearson’s product-moment correlation | two.sided |
| ud_corpus | 15 | 200 | noSubword | 0.1336441 | 7.108985 | 0.00e+00 | 2779 | 0.0969567 | 0.1699689 | Pearson’s product-moment correlation | two.sided |
| ud_corpus | 15 | 200 | noSubword1 | 0.1875419 | 10.065088 | 0.00e+00 | 2779 | 0.1514284 | 0.2231554 | Pearson’s product-moment correlation | two.sided |
| ud_corpus | 15 | 200 | other | 0.1103832 | 5.854764 | 0.00e+00 | 2779 | 0.0735158 | 0.1469494 | Pearson’s product-moment correlation | two.sided |
| ud_corpus | 30 | 200 | other | 0.1103498 | 5.852967 | 0.00e+00 | 2779 | 0.0734821 | 0.1469162 | Pearson’s product-moment correlation | two.sided |
| ud_corpus | 50 | 200 | other | 0.1064774 | 5.645176 | 0.00e+00 | 2779 | 0.0695837 | 0.1430802 | Pearson’s product-moment correlation | two.sided |
| ud_corpus | 5 | 200 | other | 0.0831735 | 4.399839 | 1.12e-05 | 2779 | 0.0461471 | 0.1199717 | Pearson’s product-moment correlation | two.sided |
| ud_corpus | 15 | 300 | other | 0.1122461 | 5.954820 | 0.00e+00 | 2779 | 0.0753915 | 0.1487944 | Pearson’s product-moment correlation | two.sided |
| ud_corpus | 5 | 300 | other | 0.0746779 | 3.947760 | 8.08e-05 | 2779 | 0.0376132 | 0.1115374 | Pearson’s product-moment correlation | two.sided |
# Target word lists for the career/family comparison.
CAREER_WORDS <- c(
  "career", "executive", "management", "professional",
  "corporation", "salary", "office", "business"
)
FAMILY_WORDS <- c(
  "family", "home", "parents", "children",
  "cousins", "marriage", "wedding", "relatives"
)

# Bootstrapped summary of `male_score` for each word list within each model
# configuration, rendered as a table.
all_ratings %>%
  filter(word %in% c(CAREER_WORDS, FAMILY_WORDS)) %>%
  # After the filter every remaining word is in one of the two lists, so a
  # two-way split labels the rows exactly.
  mutate(
    career_word_type = if_else(word %in% CAREER_WORDS, "career", "family")
  ) %>%
  group_by(corpus, epochs, size, type, career_word_type) %>%
  multi_boot_standard(col = "male_score") %>%
  kable()
| corpus | epochs | size | type | career_word_type | ci_lower | ci_upper | mean |
|---|---|---|---|---|---|---|---|
| conll2017 | NA | 100 | other | career | 0.0056570 | 0.0456183 | 0.0269064 |
| conll2017 | NA | 100 | other | family | -0.1074745 | -0.0212240 | -0.0631271 |
| ud_corpus | 5 | 200 | other | career | -0.0513356 | -0.0016441 | -0.0285404 |
| ud_corpus | 5 | 200 | other | family | -0.0767936 | 0.0123628 | -0.0333798 |
| ud_corpus | 5 | 300 | other | career | -0.0522652 | -0.0034116 | -0.0304499 |
| ud_corpus | 5 | 300 | other | family | -0.0763923 | 0.0159493 | -0.0324062 |
| ud_corpus | 15 | 150 | noSubword | career | -0.0612325 | 0.0061413 | -0.0288450 |
| ud_corpus | 15 | 150 | noSubword | family | -0.0666119 | -0.0020531 | -0.0323858 |
| ud_corpus | 15 | 150 | other | career | -0.0353894 | 0.0042530 | -0.0161106 |
| ud_corpus | 15 | 150 | other | family | -0.0716662 | 0.0252384 | -0.0232139 |
| ud_corpus | 15 | 200 | 10neg | career | -0.0443594 | 0.0041684 | -0.0202151 |
| ud_corpus | 15 | 200 | 10neg | family | -0.0691812 | 0.0153592 | -0.0244123 |
| ud_corpus | 15 | 200 | noSubword | career | -0.0622161 | -0.0044000 | -0.0310811 |
| ud_corpus | 15 | 200 | noSubword | family | -0.0659102 | -0.0101882 | -0.0390177 |
| ud_corpus | 15 | 200 | noSubword1 | career | -0.0837939 | 0.0004136 | -0.0424895 |
| ud_corpus | 15 | 200 | noSubword1 | family | -0.1087512 | -0.0435951 | -0.0768896 |
| ud_corpus | 15 | 200 | other | career | -0.0415709 | 0.0090273 | -0.0175203 |
| ud_corpus | 15 | 200 | other | family | -0.0760934 | 0.0113404 | -0.0302534 |
| ud_corpus | 15 | 300 | other | career | -0.0409650 | 0.0032161 | -0.0201452 |
| ud_corpus | 15 | 300 | other | family | -0.0807131 | -0.0000623 | -0.0403372 |
| ud_corpus | 30 | 200 | other | career | -0.0285922 | 0.0032447 | -0.0131271 |
| ud_corpus | 30 | 200 | other | family | -0.0543396 | 0.0171815 | -0.0213092 |
| ud_corpus | 50 | 200 | other | career | -0.0282649 | -0.0017467 | -0.0146070 |
| ud_corpus | 50 | 200 | other | family | -0.0528261 | 0.0104990 | -0.0239562 |