Get human gender correlations by model

# Directory whose files are each read with fread() and stacked below.
MODELS <- here("exploratory_studies/15_udbank/gender_ratings/")

# Stack every ratings file into one table, then derive the training settings
# (corpus, epochs, size, type) from substrings of the `model` column.
# case_when() returns the first matching branch, so a longer pattern has to
# come before any shorter pattern it contains ("15ep" before "5ep",
# "noSubword1" before "noSubword").
all_model_ratings <- map_df(list.files(MODELS, full.names = TRUE), fread) %>%
  mutate(
    # NOTE(review): the "conll2018" pattern is labelled "conll2017" -- confirm
    # that the mismatch is intentional.
    corpus = case_when(
      str_detect(model, "conll2018") ~ "conll2017",
      str_detect(model, "ud_corpus") ~ "ud_corpus"
    ),
    # A model string with none of these epoch markers yields NA.
    epochs = case_when(
      str_detect(model, "15ep") ~ 15,
      str_detect(model, "30ep") ~ 30,
      str_detect(model, "50ep") ~ 50,
      str_detect(model, "5ep") ~ 5
    ),
    # NOTE(review): "100" has no trailing "d" unlike the other sizes, so it
    # matches any model string containing "100" that was not already claimed
    # by "300d" or "200d" -- confirm this is deliberate.
    size = case_when(
      str_detect(model, "300d") ~ "300",
      str_detect(model, "200d") ~ "200",
      str_detect(model, "100") ~ "100",
      str_detect(model, "150d") ~ "150"
    ),
    # Anything without a recognised variant marker is grouped under "other".
    type = case_when(
      str_detect(model, "noSubword1") ~ "noSubword1",
      str_detect(model, "noSubword") ~ "noSubword",
      str_detect(model, "10neg") ~ "10neg",
      TRUE ~ "other"
    )
  ) %>%
  select(corpus, epochs, size, type, word, male_score)

# Keep only the words whose number of rows in all_model_ratings equals the
# number of files in MODELS.
target_words <- all_model_ratings %>%
  count(word) %>%
  filter(n == n_distinct(list.files(MODELS))) %>%
  pull(word)

GENDER_NORMS <- here("data/study1a/raw/GlasgowNorms.csv")

# Human gender ratings: keep the word and its GEND_M value, reduce each entry
# to its lower-cased first space-delimited token, and drop exact duplicate
# (word, rating) rows. The token is taken with a vectorised regex (everything
# before the first space), so no rowwise() pass is needed.
# Entries that share a first token but differ in rating remain as separate
# rows after distinct().
glasgow_norms <- read_csv(GENDER_NORMS) %>%
  select(word, human_gender_rating = GEND_M) %>%
  mutate(word = tolower(str_extract(word, "^[^ ]*"))) %>%
  distinct()

# Attach the human rating to every model rating for the target words.
# The join key is spelled out so the result cannot silently change if the two
# tables ever gain another same-named column. Filtering first gives the same
# rows in the same order while joining fewer of them.
# Words absent from glasgow_norms keep their rows with an NA rating; a word
# with several rows in glasgow_norms is repeated once per row.
all_ratings <- all_model_ratings %>%
  filter(word %in% target_words) %>%
  left_join(glasgow_norms, by = "word")

# Scatter of model male_score against the human rating with a linear fit,
# one panel per epochs/size/type combination (rows) and corpus (columns).
all_ratings %>%
  ggplot(aes(human_gender_rating, male_score)) +
  geom_point(alpha = .2, size = .2) +
  geom_smooth(method = "lm") +
  facet_grid(epochs + size + type ~ corpus) +
  theme_bw()

# One cor.test() per model configuration, comparing human ratings with model
# scores; tidy() turns each test into a one-row data frame.
cor_values <- all_ratings %>%
  group_by(corpus, epochs, size, type) %>%
  nest() %>%
  mutate(
    temp = map(data, ~ tidy(cor.test(.$human_gender_rating, .$male_score)))
  ) %>%
  select(-data) %>%
  # Name the list-column explicitly: a bare unnest() is deprecated in
  # tidyr >= 1.0, while unnest(temp) works in both old and new versions.
  unnest(temp) %>%
  # Drop any grouping left over from group_by() so later verbs see a plain
  # table.
  ungroup()

kable(cor_values)

corpus	epochs	size	type	estimate	statistic	p.value	parameter	conf.low	conf.high	method	alternative
conll2017	NA	100	other	0.5054474	30.880281	0.00e+00	2779	0.4772443	0.5326104	Pearson’s product-moment correlation	two.sided
ud_corpus	15	150	noSubword	0.1371568	7.299370	0.00e+00	2779	0.1005000	0.1734416	Pearson’s product-moment correlation	two.sided
ud_corpus	15	150	other	0.0989046	5.239567	2.00e-07	2779	0.0619633	0.1355753	Pearson’s product-moment correlation	two.sided
ud_corpus	15	200	10neg	0.1313567	6.985152	0.00e+00	2779	0.0946497	0.1677069	Pearson’s product-moment correlation	two.sided
ud_corpus	15	200	noSubword	0.1336441	7.108985	0.00e+00	2779	0.0969567	0.1699689	Pearson’s product-moment correlation	two.sided
ud_corpus	15	200	noSubword1	0.1875419	10.065088	0.00e+00	2779	0.1514284	0.2231554	Pearson’s product-moment correlation	two.sided
ud_corpus	15	200	other	0.1103832	5.854764	0.00e+00	2779	0.0735158	0.1469494	Pearson’s product-moment correlation	two.sided
ud_corpus	30	200	other	0.1103498	5.852967	0.00e+00	2779	0.0734821	0.1469162	Pearson’s product-moment correlation	two.sided
ud_corpus	50	200	other	0.1064774	5.645176	0.00e+00	2779	0.0695837	0.1430802	Pearson’s product-moment correlation	two.sided
ud_corpus	5	200	other	0.0831735	4.399839	1.12e-05	2779	0.0461471	0.1199717	Pearson’s product-moment correlation	two.sided
ud_corpus	15	300	other	0.1122461	5.954820	0.00e+00	2779	0.0753915	0.1487944	Pearson’s product-moment correlation	two.sided
ud_corpus	5	300	other	0.0746779	3.947760	8.08e-05	2779	0.0376132	0.1115374	Pearson’s product-moment correlation	two.sided

# Two word lists used to compare model male_score between "career" and
# "family" vocabulary.
CAREER_WORDS <- c(
  "career", "executive", "management", "professional",
  "corporation", "salary", "office", "business"
)

FAMILY_WORDS <- c(
  "family", "home", "parents", "children",
  "cousins", "marriage", "wedding", "relatives"
)

# parents and cousins
# NOTE(review): the note above is kept from the original; what it flags about
# these two words is not evident from the code -- confirm with the author.
#
# Summarise male_score per model configuration and word list. After the
# membership filter every remaining word is in exactly one of the two lists,
# so a two-way if_else() labels the rows.
all_ratings %>%
  filter(word %in% c(CAREER_WORDS, FAMILY_WORDS)) %>%
  mutate(
    career_word_type = if_else(word %in% CAREER_WORDS, "career", "family")
  ) %>%
  group_by(corpus, epochs, size, type, career_word_type) %>%
  multi_boot_standard(col = "male_score") %>%
  kable()

corpus	epochs	size	type	career_word_type	ci_lower	ci_upper	mean
conll2017	NA	100	other	career	0.0056570	0.0456183	0.0269064
conll2017	NA	100	other	family	-0.1074745	-0.0212240	-0.0631271
ud_corpus	5	200	other	career	-0.0513356	-0.0016441	-0.0285404
ud_corpus	5	200	other	family	-0.0767936	0.0123628	-0.0333798
ud_corpus	5	300	other	career	-0.0522652	-0.0034116	-0.0304499
ud_corpus	5	300	other	family	-0.0763923	0.0159493	-0.0324062
ud_corpus	15	150	noSubword	career	-0.0612325	0.0061413	-0.0288450
ud_corpus	15	150	noSubword	family	-0.0666119	-0.0020531	-0.0323858
ud_corpus	15	150	other	career	-0.0353894	0.0042530	-0.0161106
ud_corpus	15	150	other	family	-0.0716662	0.0252384	-0.0232139
ud_corpus	15	200	10neg	career	-0.0443594	0.0041684	-0.0202151
ud_corpus	15	200	10neg	family	-0.0691812	0.0153592	-0.0244123
ud_corpus	15	200	noSubword	career	-0.0622161	-0.0044000	-0.0310811
ud_corpus	15	200	noSubword	family	-0.0659102	-0.0101882	-0.0390177
ud_corpus	15	200	noSubword1	career	-0.0837939	0.0004136	-0.0424895
ud_corpus	15	200	noSubword1	family	-0.1087512	-0.0435951	-0.0768896
ud_corpus	15	200	other	career	-0.0415709	0.0090273	-0.0175203
ud_corpus	15	200	other	family	-0.0760934	0.0113404	-0.0302534
ud_corpus	15	300	other	career	-0.0409650	0.0032161	-0.0201452
ud_corpus	15	300	other	family	-0.0807131	-0.0000623	-0.0403372
ud_corpus	30	200	other	career	-0.0285922	0.0032447	-0.0131271
ud_corpus	30	200	other	family	-0.0543396	0.0171815	-0.0213092
ud_corpus	50	200	other	career	-0.0282649	-0.0017467	-0.0146070
ud_corpus	50	200	other	family	-0.0528261	0.0104990	-0.0239562

Get human gender correlations by model

Molly Lewis

2019-09-19