Word gender bias from models

Raw Scores
Distributions
Correlation
Quartile Comparison
Summary Table
- Kid -> Coca
- Kid -> Adult human (ratings)

In this analysis I’m comparing word gender scores derived from different corpora. Within each corpus, a gender score is derived for each word by measuring its distance to a set of MALE anchor words (“son”, “his”, “him”, “he”, “brother”, “boy”, “man”) and a set of FEMALE anchor words (“daughter”, “hers”, “her”, “she”, “sister”, “girl”, “woman”), and taking the difference (mean male - mean female distance). I’ve also included the human ratings we collected for comparison.

“coca”, “kid”, and “wiki” refer to male scores derived from the corresponding corpus. “human” refers to the gender ratings that we collected from participants.

Raw Scores

# Tidy gender scores; the code below reads its model_type, word and
# male_score columns.
SCORE_PATH <- here("analysis/books/adult_comparision/correlation_analysis_2/tidy_gender_scores2.csv")
model_biases <- read_csv(SCORE_PATH)

# Female and male anchor words; these are excluded from the scored vocabulary.
ANCHOR_WORDS <- c(
  "daughter", "hers", "her", "she", "sister", "girl", "woman",
  "son", "his", "him", "he", "brother", "boy", "man"
)

# One mean male_score per model_type/word pair, with the anchor words removed.
# ungroup() drops the model_type grouping that summarize() leaves behind so
# later steps operate on a plain (ungrouped) table.
model_scores <- model_biases %>%
  group_by(model_type, word) %>%
  summarize(male_score = mean(male_score)) %>%
  ungroup() %>%
  filter(!(word %in% ANCHOR_WORDS))

# Human gender ratings; the code below reads its word and mean columns.
GENDER_NORMS <- here("data/processed/words/gender_ratings_mean.csv")

gender_words <- read_csv(GENDER_NORMS)
gender_norms <- gender_words %>%
  # Normalise each entry: keep the text before the first space, lower-case it,
  # then strip punctuation.
  mutate(word = map_chr(word, ~unlist(str_split(.x, " "))[[1]]),
         word = tolower(word),
         word = str_remove_all(word, '[:punct:]')) %>%
  # NOTE(review): distinct() keeps only the first row per normalised word, so
  # the mean below is taken over a single row per word. Confirm that dropping
  # the remaining rows (rather than averaging them) is intended.
  distinct(word, .keep_all = TRUE) %>%
  group_by(word) %>%
  summarize(human_gender_rating = mean(mean, na.rm = TRUE))

# One row per word with a column per model type plus the human rating.
# scale_this() is defined outside this section and is applied to each measure.
model_scores_wide <- model_scores %>%
  spread(model_type, male_score) %>%
  # Join on the shared word column explicitly instead of relying on the
  # implicit natural join (which also emits a "Joining, by" message).
  inner_join(gender_norms, by = "word") %>%
  mutate(coca = scale_this(coca),
         kid = scale_this(kid),
         wiki = scale_this(wiki),
         human = -scale_this(human_gender_rating)) %>% # ratings and model scores oriented differently
  select(-human_gender_rating) %>%
  # Keep only words that have a value for every measure.
  drop_na()

# Interactive table of the wide scores, numeric columns rounded to 4 decimals.
model_scores_wide %>%
  mutate_if(is.numeric, round, 4) %>%
  DT::datatable()

Distributions

# Long format (word, measure, value) used by the density plot and SD table.
dists <- gather(
  select(model_scores_wide, word, coca, kid, wiki, human),
  key = "measure",
  value = "value",
  -word
)

# Density of the scores, one facet per measure; the legend is hidden because
# the facet strips already name each measure.
ggplot(
  data = dists,
  mapping = aes(x = value, group = measure, fill = measure)
) +
  geom_density(alpha = 0.4) +
  facet_grid(. ~ measure) +
  labs(x = "male score") +
  theme_classic() +
  theme(legend.position = "none")

# Standard deviation of the scores within each measure, shown as a table.
measure_sds <- summarize(group_by(dists, measure), sd = sd(value))
kable(measure_sds, caption = "SDs by Corpus")

SDs by Corpus
measure	sd
coca	0.9660780
human	0.8844889
kid	0.9828390
wiki	0.9392539

Correlation

Kid ~ Coca Adult

# Scatter of kid (x) against coca (y) scores with a fitted linear trend.
model_scores_wide %>%
  ggplot(aes(x = kid, y = coca)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm") +
  theme_classic()

# Correlation test between the coca and kid columns, tidied into a table.
kid_coca_cor <- cor.test(model_scores_wide$coca, model_scores_wide$kid)
kable(tidy(kid_coca_cor))

estimate	statistic	p.value	parameter	conf.low	conf.high	method	alternative
0.1761631	5.18372	3e-07	839	0.109869	0.2408968	Pearson’s product-moment correlation	two.sided

Kid ~ Human

# Scatter of kid (x) against human (y) scores with a fitted linear trend.
model_scores_wide %>%
  ggplot(aes(x = kid, y = human)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm") +
  theme_classic()

# Correlation test between the human and kid columns, tidied into a table.
kid_human_cor <- cor.test(model_scores_wide$human, model_scores_wide$kid)
kable(tidy(kid_human_cor))

estimate	statistic	p.value	parameter	conf.low	conf.high	method	alternative
0.2552892	7.647998	0	839	0.1909827	0.3174138	Pearson’s product-moment correlation	two.sided

Kid ~ Wiki Adult

# Scatter of kid (x) against wiki (y) scores with a fitted linear trend.
model_scores_wide %>%
  ggplot(aes(x = kid, y = wiki)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm") +
  theme_classic()

# Correlation test between the wiki and kid columns, tidied into a table.
kid_wiki_cor <- cor.test(model_scores_wide$wiki, model_scores_wide$kid)
kable(tidy(kid_wiki_cor))

estimate	statistic	p.value	parameter	conf.low	conf.high	method	alternative
0.236491	7.050065	0	839	0.1716324	0.2993084	Pearson’s product-moment correlation	two.sided

Coca Adult ~ Wiki Adult

# Scatter of coca (x) against wiki (y) scores with a fitted linear trend.
model_scores_wide %>%
  ggplot(aes(x = coca, y = wiki)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm") +
  theme_classic()

# Correlation test between the wiki and coca columns, tidied into a table.
coca_wiki_cor <- cor.test(model_scores_wide$wiki, model_scores_wide$coca)
kable(tidy(coca_wiki_cor))

estimate	statistic	p.value	parameter	conf.low	conf.high	method	alternative
0.3683333	11.47578	0	839	0.3084102	0.4253447	Pearson’s product-moment correlation	two.sided

Coca Adult ~ Human

# Scatter of coca (x) against human (y) scores with a fitted linear trend.
model_scores_wide %>%
  ggplot(aes(x = coca, y = human)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm") +
  theme_classic()

# Correlation test between the human and coca columns, tidied into a table.
coca_human_cor <- cor.test(model_scores_wide$human, model_scores_wide$coca)
kable(tidy(coca_human_cor))

estimate	statistic	p.value	parameter	conf.low	conf.high	method	alternative
0.2888854	8.740365	0	839	0.2256904	0.3496593	Pearson’s product-moment correlation	two.sided

Wiki Adult ~ Human

# Scatter of wiki (x) against human (y) scores with a fitted linear trend.
model_scores_wide %>%
  ggplot(aes(x = wiki, y = human)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm") +
  theme_classic()

# Correlation test between the wiki and human columns, tidied into a table.
wiki_human_cor <- cor.test(model_scores_wide$wiki, model_scores_wide$human)
kable(tidy(wiki_human_cor))

estimate	statistic	p.value	parameter	conf.low	conf.high	method	alternative
0.6147889	22.57874	0	839	0.5709143	0.6551621	Pearson’s product-moment correlation	two.sided

Quartile Comparison

Kid ~ Coca Adult

# Number of rank-based bins each measure is split into.
NUMTILES <- 4

# Bin every measure with ntile() (tile 1 holds the lowest scores) and label
# the extremes: lowest tile "female", highest tile "male", anything in
# between "neither". The top tile is compared against NUMTILES rather than a
# literal 4 so the labels stay correct if NUMTILES is changed.
quartiles_wide <- model_scores_wide %>%
  mutate(coca_quartile = ntile(coca, NUMTILES),
         kid_quartile = ntile(kid, NUMTILES),
         wiki_quartile = ntile(wiki, NUMTILES),
         human_quartile = ntile(human, NUMTILES),
         coca_type = case_when(coca_quartile == 1 ~ "female",
                               coca_quartile == NUMTILES ~ "male",
                               TRUE ~ "neither"),
         wiki_type = case_when(wiki_quartile == 1 ~ "female",
                               wiki_quartile == NUMTILES ~ "male",
                               TRUE ~ "neither"),
         kid_type = case_when(kid_quartile == 1 ~ "female",
                              kid_quartile == NUMTILES ~ "male",
                              TRUE ~ "neither"),
         human_type = case_when(human_quartile == 1 ~ "female",
                                human_quartile == NUMTILES ~ "male",
                                TRUE ~ "neither"))

# Count words in every coca/kid quartile pairing and record which of the two
# places the pairing in the higher (more male) quartile.
quartile_table <- quartiles_wide %>%
  count(coca_quartile, kid_quartile) %>%
  mutate(
    delta = case_when(
      kid_quartile == coca_quartile ~ "same",
      kid_quartile > coca_quartile ~ "kid more male",
      coca_quartile > kid_quartile ~ "coca more male"
    )
  )

# Alluvial diagram: kid quartiles on the first axis flowing into coca
# quartiles on the second, with flows coloured by delta.
ggplot(
  quartile_table,
  aes(axis1 = kid_quartile, axis2 = coca_quartile, y = n, fill = delta)
) +
  geom_alluvium() +
  geom_stratum(fill = "grey") +
  geom_text(stat = "stratum", label.strata = TRUE) +
  scale_x_discrete(limits = c("kid", "coca"), expand = c(0.1, 0.1)) +
  scale_fill_manual(values = c("blue", "green", "red")) +
  labs(title = "Male Score Quartile Counts by Training Corpus",
       y = "Male score quartile") +
  theme_classic()

Kid ~ Wiki Adult

# Count words in every wiki/kid quartile pairing and record which of the two
# places the pairing in the higher (more male) quartile.
quartile_table <- quartiles_wide %>%
  count(wiki_quartile, kid_quartile) %>%
  mutate(
    delta = case_when(
      kid_quartile == wiki_quartile ~ "same",
      kid_quartile > wiki_quartile ~ "kid more male",
      wiki_quartile > kid_quartile ~ "wiki more male"
    )
  )

# Alluvial diagram: kid quartiles on the first axis flowing into wiki
# quartiles on the second, with flows coloured by delta.
ggplot(
  quartile_table,
  aes(axis1 = kid_quartile, axis2 = wiki_quartile, y = n, fill = delta)
) +
  geom_alluvium() +
  geom_stratum(fill = "grey") +
  geom_text(stat = "stratum", label.strata = TRUE) +
  scale_x_discrete(limits = c("kid", "wiki"), expand = c(0.1, 0.1)) +
  scale_fill_manual(values = c("green", "red", "blue")) +
  labs(title = "Male Score Quartile Counts by Training Corpus",
       y = "Male score quartile") +
  theme_classic()

Kid ~ Human

# Count words in every human/kid quartile pairing and record which of the two
# places the pairing in the higher (more male) quartile.
quartile_table <- quartiles_wide %>%
  count(human_quartile, kid_quartile) %>%
  mutate(delta = case_when(human_quartile > kid_quartile ~ "human more male",
                           kid_quartile > human_quartile ~ "kid more male",
                           kid_quartile == human_quartile ~ "same"))

# Alluvial diagram: kid quartiles on the first axis flowing into human-rating
# quartiles on the second, with flows coloured by delta.
ggplot(quartile_table,
       aes(axis1 = kid_quartile,
           axis2 = human_quartile,
           y = n,
           fill = delta)) +
  geom_alluvium() +
  geom_stratum(fill = "grey") +
  geom_text(stat = "stratum",
            label.strata = TRUE) +
  # Colours are bound to delta values by name so the mapping cannot shift if
  # one of the categories happens to be absent from the data.
  scale_fill_manual(values = c("human more male" = "blue",
                               "kid more male" = "green",
                               "same" = "red")) +
  # The second axis shows human_quartile, so it is labelled "human"
  # (it was previously mislabelled "wiki").
  scale_x_discrete(limits = c("kid", "human"),
                   expand = c(.1, .1)) +
  theme_classic() +
  labs(title = "Male Score Quartile Counts by Training Corpus",
       y = "Male score quartile")

Coca Adult ~ Wiki Adult

# Count words in every wiki/coca quartile pairing and record which of the two
# places the pairing in the higher (more male) quartile.
quartile_table <- quartiles_wide %>%
  count(wiki_quartile, coca_quartile) %>%
  mutate(delta = case_when(coca_quartile > wiki_quartile ~ "coca more male",
                           wiki_quartile > coca_quartile ~ "wiki more male",
                           wiki_quartile == coca_quartile ~ "same"))

# Alluvial diagram: coca quartiles on the first axis flowing into wiki
# quartiles on the second, with flows coloured by delta.
ggplot(quartile_table,
       aes(axis1 = coca_quartile,
           axis2 = wiki_quartile,
           y = n,
           fill = delta)) +
  geom_alluvium() +
  geom_stratum(fill = "grey") +
  geom_text(stat = "stratum",
            label.strata = TRUE) +
  # Colours are bound to delta values by name so the mapping cannot shift if
  # one of the categories happens to be absent from the data.
  scale_fill_manual(values = c("coca more male" = "green",
                               "same" = "red",
                               "wiki more male" = "blue")) +
  # The first axis shows coca_quartile, so it is labelled "coca"
  # (it was previously mislabelled "kid").
  scale_x_discrete(limits = c("coca", "wiki"),
                   expand = c(.1, .1)) +
  theme_classic() +
  labs(title = "Male Score Quartile Counts by Training Corpus",
       y = "Male score quartile")

Summary Table

This is the table you were suggesting. Words are defined as female/male biased if they’re in the bottom/top quartile of male scores, respectively; all other words are labelled “neither”.

Kid -> Coca

# Cross-tab of kid bias category (rows) against coca bias category (columns).
kid_by_coca <- quartiles_wide %>%
  count(kid_type, coca_type) %>%
  spread(coca_type, n)

kable(
  kid_by_coca,
  col.names = c("Bias in kid corpus", "Coca biased female", "Coca biased male", "Coca biased neither")
)

Bias in kid corpus	Coca biased female	Coca biased male	Coca biased neither
female	71	38	102
male	38	60	112
neither	102	112	206

Kid -> Adult human (ratings)

# Cross-tab of kid bias category (rows) against human-rating bias category
# (columns).
kid_by_human <- quartiles_wide %>%
  count(kid_type, human_type) %>%
  spread(human_type, n)

kable(
  kid_by_human,
  col.names = c("Bias in kid corpus", "Adult human biased female", "Adult human biased male", "Adult human biased neither")
)

Bias in kid corpus	Adult human biased female	Adult human biased male	Adult human biased neither
female	80	31	100
male	41	59	110
neither	90	120	210

Word gender bias from models

Molly Lewis

2019-07-30

Raw Scores

Distributions

Correlation

Kid ~ Coca Adult

Kid ~ Human

Kid ~ Wiki Adult

Coca Adult ~ Wiki Adult

Coca Adult ~ Human

Wiki Adult ~ Human

Quartile Comparison

Kid ~ Coca Adult

Kid ~ Wiki Adult

Kid ~ Human

Coca Adult ~ Wiki Adult

Summary Table

Kid -> Coca

Kid -> Adult human (ratings)