Word gender bias

# Read the tidy gender scores; the columns used below are model_type, run,
# word and male_score.
SCORE_PATH <- here("analysis/books/adult_comparision/correlation_analysis_2/tidy_gender_scores.csv")
model_biases <- read_csv(SCORE_PATH)

# Average male_score over every row sharing a model_type/word pair.
# summarize() only peels off the last grouping variable, so ungroup() is
# needed to stop the result from staying grouped by model_type.
model_scores <- model_biases %>%
  group_by(model_type, word) %>%
  summarize(male_score = mean(male_score)) %>%
  ungroup()

# Merge with human data
GENDER_NORMS <- here("data/processed/words/gender_ratings_mean.csv")

gender_words <- read_csv(GENDER_NORMS)

# Normalize each rated item so it can be matched against the model vocabulary:
# keep the text before the first space, lowercase it, and strip punctuation.
# str_extract() is vectorized and yields the same first token that splitting
# on " " and taking element 1 would.
# NOTE(review): distinct() keeps only the first row for each normalized word,
# so the mean() below is taken over a single rating per word. If the intent is
# to average all ratings that collapse to the same word, drop the distinct()
# step -- confirm before changing, since it alters the reported correlations.
gender_norms <- gender_words %>%
  mutate(word = str_extract(word, "^[^ ]*"),
         word = tolower(word),
         word = str_remove_all(word, '[:punct:]')) %>%
  distinct(word, .keep_all = TRUE) %>%
  group_by(word) %>%
  summarize(human_gender_rating = mean(mean, na.rm = TRUE))


# Keep only words that have both a human rating and a model score. The join
# key is spelled out so the match cannot silently widen to other columns that
# might later share a name.
all_scores <- gender_norms %>%
  inner_join(model_scores, by = "word")

# Human rating against model score, coloured by model_type, with a linear
# fit drawn on top of the points.
ggplot(all_scores,
       aes(x = male_score, y = human_gender_rating, color = model_type)) +
  geom_point(size = .5, alpha = .2) +
  geom_smooth(method = "lm") +
  theme_classic()

# Correlation test between model score and human rating, one per model_type.
# unnest() names the list-column explicitly (a bare unnest() is deprecated in
# tidyr >= 1.0) and the reported columns are selected by name rather than by
# position, so the table does not depend on tidy()'s column order.
all_scores %>%
  group_by(model_type) %>%
  nest() %>%
  mutate(test = map(data, ~ tidy(cor.test(.x$male_score,
                                          .x$human_gender_rating)))) %>%
  select(-data) %>%
  unnest(test) %>%
  select(model_type, estimate, statistic, p.value,
         parameter, conf.low, conf.high) %>%
  # Flag correlations significant at the .05 level.
  mutate(sig = case_when(p.value < .05 ~ "*", TRUE ~ "")) %>%
  kable()

model_type	estimate	statistic	p.value	parameter	conf.low	conf.high	sig
coca	-0.4080415	-15.36596	0	1182	-0.4544480	-0.3594261	*
kid	-0.3612783	-12.93749	0	1115	-0.4112195	-0.3091748	*

 # Same correlation test, but on the un-averaged scores: one test per
 # model_type/run combination. The join key, the unnested list-column and the
 # reported columns are all named explicitly instead of relying on defaults
 # and column positions.
 gender_norms %>%
  inner_join(model_biases, by = "word") %>%
  group_by(model_type, run) %>%
  nest() %>%
  mutate(test = map(data, ~ tidy(cor.test(.x$male_score,
                                          .x$human_gender_rating)))) %>%
  select(-data) %>%
  unnest(test) %>%
  select(model_type, run, estimate, statistic, p.value,
         parameter, conf.low, conf.high) %>%
  # Flag correlations significant at the .05 level.
  mutate(sig = case_when(p.value < .05 ~ "*", TRUE ~ "")) %>%
  kable()

model_type	run	estimate	statistic	parameter	conf.low	conf.high	sig
coca	1	-0.3005056	-9.087952	832	-0.3610265	-0.2374641	*
coca	10	-0.3015096	-8.949804	801	-0.3631195	-0.2372746	*
coca	2	-0.3352277	-10.244789	829	-0.3942478	-0.2734536	*
coca	3	-0.3960748	-12.329411	817	-0.4523073	-0.3367057	*
coca	4	-0.2815993	-8.383340	816	-0.3435154	-0.2172458	*
coca	5	-0.3375960	-10.307739	826	-0.3966052	-0.2758084	*
coca	6	-0.3766578	-11.699999	828	-0.4335935	-0.3167266	*
coca	7	-0.3281883	-9.894183	811	-0.3881862	-0.2654202	*
coca	8	-0.2932725	-8.874891	837	-0.3539302	-0.2301579	*
coca	9	-0.3610774	-11.181957	834	-0.4186327	-0.3006332	*
kid	1	-0.3392428	-12.041968	1115	-0.3901350	-0.2862842	*
kid	10	-0.3244020	-11.451617	1115	-0.3759045	-0.2709014	*
kid	2	-0.3497329	-12.465340	1115	-0.4001790	-0.2971738	*
kid	3	-0.3341407	-11.837901	1115	-0.3852454	-0.2809926	*
kid	4	-0.3497485	-12.465973	1115	-0.4001939	-0.2971900	*
kid	5	-0.3104181	-10.904019	1115	-0.3624735	-0.2564319	*
kid	6	-0.3103981	-10.903241	1115	-0.3624543	-0.2564112	*
kid	7	-0.3521110	-12.562052	1115	-0.4024543	-0.2996444	*
kid	8	-0.3212276	-11.326595	1115	-0.3728575	-0.2676147	*
kid	9	-0.3284790	-11.612826	1115	-0.3798163	-0.2751246	*

Same words across all models

# Words scored by every model. all_scores has one row per word/model_type
# pair, so a word is common when its row count equals the number of distinct
# model types (computed rather than hard-coded as 2).
common_words <- all_scores %>%
  count(word) %>%
  filter(n == n_distinct(all_scores$model_type)) %>%
  pull(word)

# Same scatter as above, restricted to the words listed in common_words.
shared_word_scores <- filter(all_scores, word %in% common_words)

ggplot(shared_word_scores,
       aes(x = male_score, y = human_gender_rating, color = model_type)) +
  geom_point(size = .5, alpha = .2) +
  geom_smooth(method = "lm") +
  theme_classic()

# Per-model correlation test restricted to the words listed in common_words.
# The list-column is unnested by name (a bare unnest() is deprecated in
# tidyr >= 1.0) and the reported columns are selected by name, not position.
all_scores %>%
  filter(word %in% common_words) %>%
  group_by(model_type) %>%
  nest() %>%
  mutate(test = map(data, ~ tidy(cor.test(.x$male_score,
                                          .x$human_gender_rating)))) %>%
  select(-data) %>%
  unnest(test) %>%
  select(model_type, estimate, statistic, p.value,
         parameter, conf.low, conf.high) %>%
  # Flag correlations significant at the .05 level.
  mutate(sig = case_when(p.value < .05 ~ "*", TRUE ~ "")) %>%
  kable()

model_type	estimate	statistic	p.value	parameter	conf.low	conf.high	sig
coca	-0.4562223	-14.95605	0	851	-0.5077967	-0.4013893	*
kid	-0.3867795	-12.23535	0	851	-0.4424183	-0.3281747	*

Middle range only

# Scatter limited to words whose model score lies strictly between -.1 and .1.
middle_scores <- filter(all_scores, abs(male_score) < .1)

ggplot(middle_scores,
       aes(x = male_score, y = human_gender_rating, color = model_type)) +
  geom_point(size = .5, alpha = .2) +
  geom_smooth(method = "lm") +
  theme_classic()

# Per-model correlation test limited to words whose model score lies strictly
# between -.1 and .1. The list-column is unnested by name (a bare unnest() is
# deprecated in tidyr >= 1.0) and the reported columns are selected by name.
all_scores %>%
  filter(male_score > -.1, male_score < .1) %>%
  group_by(model_type) %>%
  nest() %>%
  mutate(test = map(data, ~ tidy(cor.test(.x$male_score,
                                          .x$human_gender_rating)))) %>%
  select(-data) %>%
  unnest(test) %>%
  select(model_type, estimate, statistic, p.value,
         parameter, conf.low, conf.high) %>%
  # Flag correlations significant at the .05 level.
  mutate(sig = case_when(p.value < .05 ~ "*", TRUE ~ "")) %>%
  kable()

model_type	estimate	statistic	p.value	parameter	conf.low	conf.high	sig
coca	-0.3142719	-11.323480	0	1170	-0.3649659	-0.2617199	*
kid	-0.2822794	-9.763462	0	1101	-0.3357123	-0.2270358	*

Word gender bias

Molly Lewis

2019-07-30