In this analysis I’m comparing the gender scores of words, each derived from the corpus itself, across different corpora. A gender score is derived for each word by measuring its distance to a set of MALE anchor words (“son”, “his”, “him”, “he”, “brother”, “boy”, “man”) and a set of FEMALE anchor words (“daughter”, “hers”, “her”, “she”, “sister”, “girl”, “woman”), and taking the difference (mean male - mean female distance). I’ve also included the human ratings we collected for comparison.

“coca”, “kid”, and “wiki” refer to male scores derived from the corresponding corpus. “human” refers to the gender ratings that we collected from participants.

Raw Scores

# Tidy per-word gender scores, located relative to the project root.
SCORE_PATH <- here("analysis/books/adult_comparision/correlation_analysis_2/tidy_gender_scores2.csv")
model_biases <- read_csv(SCORE_PATH)

# The male and female anchor words; they are filtered out of the scored
# vocabulary below.
ANCHOR_WORDS <- c("daughter", "hers", "her", "she", "sister", "girl", "woman",
                  "son", "his", "him", "he", "brother", "boy", "man")

# One mean male score per (model_type, word) pair, anchor words excluded.
model_scores <- model_biases %>%
  group_by(model_type, word) %>%
  summarize(male_score = mean(male_score)) %>%
  ungroup() %>% # summarize() only drops the last grouping level (`word`)
  filter(!(word %in% ANCHOR_WORDS))

# Mean human gender ratings, located relative to the project root.
GENDER_NORMS <- here("data/processed/words/gender_ratings_mean.csv")

gender_words <- read_csv(GENDER_NORMS)

# Reduce every rated item to a single lowercase token without punctuation,
# then keep one rating per resulting word.
gender_norms <- gender_words %>%
  mutate(word = map_chr(word, ~unlist(str_split(.x, " "))[[1]]), # first space-separated token
         word = tolower(word),
         word = str_remove_all(word, '[:punct:]')) %>%
  # NOTE(review): distinct() keeps only the first row per word, so the mean
  # computed below is over a single value. If items that normalise to the same
  # word were meant to be averaged, this distinct() should go -- confirm.
  distinct(word, .keep_all = TRUE) %>%
  group_by(word) %>%
  summarize(human_gender_rating  = mean(mean, na.rm = TRUE))

# Reshape to one row per word with one column per corpus, attach the human
# ratings, and rescale every measure with scale_this() (defined outside this
# section; presumably a z-score -- confirm).
model_scores_wide <- model_scores %>%
  spread(model_type, male_score)  %>%
  inner_join(gender_norms, by = "word") %>% # explicit key instead of a natural join
  mutate(coca = scale_this(coca),
         kid = scale_this(kid),
         wiki = scale_this(wiki),
         human = -scale_this(human_gender_rating)) %>% # ratings and model scores oriented differently
  select(-human_gender_rating) %>%
  # NOTE(review): incomplete rows are dropped only after scaling, so the
  # retained rows need not have unit SD on each measure -- confirm intended.
  drop_na()

# Interactive table of the wide scores, numeric columns rounded to 4 places.
model_scores_wide %>%
  mutate_if(is.numeric, round, 4) %>%
  DT::datatable()

Distributions

# Long format: one row per (word, measure) pair.
dists <- gather(
  select(model_scores_wide, word, coca, kid, wiki, human),
  key = "measure", value = "value", -word
)

# Density of the scores, one facet per measure; the legend is redundant with
# the facet strips and is therefore hidden.
dists %>%
  ggplot(aes(x = value, group = measure, fill = measure)) +
  geom_density(alpha = .4) +
  facet_grid(. ~ measure) +
  labs(x = "male score") +
  theme_classic() +
  theme(legend.position = "none")

# Standard deviation of each measure, rendered as a captioned table.
kable(
  summarize(group_by(dists, measure), sd = sd(value)),
  caption = "SDs by Corpus"
)
SDs by Corpus
measure sd
coca 0.9660780
human 0.8844889
kid 0.9828390
wiki 0.9392539

Correlation

Kid ~ Coca Adult

# Scatter of kid (x) against coca (y) scores with a fitted linear trend.
model_scores_wide %>%
  ggplot(aes(x = kid, y = coca)) +
  geom_point(alpha = .5) +
  geom_smooth(method = "lm") +
  theme_classic()

# Correlation test between the coca and kid scores, as a one-row table.
kable(tidy(cor.test(model_scores_wide[["coca"]], model_scores_wide[["kid"]])))
estimate statistic p.value parameter conf.low conf.high method alternative
0.1761631 5.18372 3e-07 839 0.109869 0.2408968 Pearson’s product-moment correlation two.sided

Kid ~ Human

# Scatter of kid (x) against human (y) scores with a fitted linear trend.
model_scores_wide %>%
  ggplot(aes(x = kid, y = human)) +
  geom_point(alpha = .5) +
  geom_smooth(method = "lm") +
  theme_classic()

# Correlation test between the human and kid scores, as a one-row table.
kable(tidy(cor.test(model_scores_wide[["human"]], model_scores_wide[["kid"]])))
estimate statistic p.value parameter conf.low conf.high method alternative
0.2552892 7.647998 0 839 0.1909827 0.3174138 Pearson’s product-moment correlation two.sided

Kid ~ Wiki Adult

# Scatter of kid (x) against wiki (y) scores with a fitted linear trend.
model_scores_wide %>%
  ggplot(aes(x = kid, y = wiki)) +
  geom_point(alpha = .5) +
  geom_smooth(method = "lm") +
  theme_classic()

# Correlation test between the wiki and kid scores, as a one-row table.
kable(tidy(cor.test(model_scores_wide[["wiki"]], model_scores_wide[["kid"]])))
estimate statistic p.value parameter conf.low conf.high method alternative
0.236491 7.050065 0 839 0.1716324 0.2993084 Pearson’s product-moment correlation two.sided

Coca Adult ~ Wiki Adult

# Scatter of coca (x) against wiki (y) scores with a fitted linear trend.
model_scores_wide %>%
  ggplot(aes(x = coca, y = wiki)) +
  geom_point(alpha = .5) +
  geom_smooth(method = "lm") +
  theme_classic()

# Correlation test between the wiki and coca scores, as a one-row table.
kable(tidy(cor.test(model_scores_wide[["wiki"]], model_scores_wide[["coca"]])))
estimate statistic p.value parameter conf.low conf.high method alternative
0.3683333 11.47578 0 839 0.3084102 0.4253447 Pearson’s product-moment correlation two.sided

Coca Adult ~ Human

# Scatter of coca (x) against human (y) scores with a fitted linear trend.
model_scores_wide %>%
  ggplot(aes(x = coca, y = human)) +
  geom_point(alpha = .5) +
  geom_smooth(method = "lm") +
  theme_classic()

# Correlation test between the human and coca scores, as a one-row table.
kable(tidy(cor.test(model_scores_wide[["human"]], model_scores_wide[["coca"]])))
estimate statistic p.value parameter conf.low conf.high method alternative
0.2888854 8.740365 0 839 0.2256904 0.3496593 Pearson’s product-moment correlation two.sided

Wiki Adult ~ Human

# Scatter of wiki (x) against human (y) scores with a fitted linear trend.
model_scores_wide %>%
  ggplot(aes(x = wiki, y = human)) +
  geom_point(alpha = .5) +
  geom_smooth(method = "lm") +
  theme_classic()

# Correlation test between the wiki and human scores, as a one-row table.
kable(tidy(cor.test(model_scores_wide[["wiki"]], model_scores_wide[["human"]])))
estimate statistic p.value parameter conf.low conf.high method alternative
0.6147889 22.57874 0 839 0.5709143 0.6551621 Pearson’s product-moment correlation two.sided

Quartile Comparison

Kid ~ Coca Adult

# Number of equal-sized rank bins each measure is cut into.
NUMTILES <- 4

# Label a vector of ntile() bins: the lowest bin is "female", the highest bin
# is "male", and everything else (including NA) is "neither". The top bin is
# taken from `n_tiles` rather than hard-coded, so the labels stay correct if
# NUMTILES changes.
quartile_type <- function(quartile, n_tiles = NUMTILES) {
  case_when(quartile == 1 ~ "female",
            quartile == n_tiles ~ "male",
            TRUE ~ "neither")
}

# Add, for every measure, its rank bin and the corresponding bias label.
quartiles_wide <- model_scores_wide %>%
  mutate(coca_quartile = ntile(coca, NUMTILES),
         kid_quartile = ntile(kid, NUMTILES),
         wiki_quartile = ntile(wiki, NUMTILES),
         human_quartile = ntile(human, NUMTILES),
         coca_type = quartile_type(coca_quartile),
         wiki_type = quartile_type(wiki_quartile),
         kid_type = quartile_type(kid_quartile),
         human_type = quartile_type(human_quartile))

# Word counts for every (coca quartile, kid quartile) cell, each cell tagged
# with which of the two ranks the words as more male.
quartile_table <- count(quartiles_wide, coca_quartile, kid_quartile)
quartile_table <- mutate(
  quartile_table,
  delta = case_when(kid_quartile == coca_quartile ~ "same",
                    kid_quartile > coca_quartile ~ "kid more male",
                    coca_quartile > kid_quartile ~ "coca more male")
)

# Alluvial diagram: kid quartiles on the first axis, coca quartiles on the
# second, flows sized by word count and coloured by `delta`.
quartile_table %>%
  ggplot(aes(axis1 = kid_quartile,
             axis2 = coca_quartile,
             y = n,
             fill = delta)) +
  geom_alluvium() +
  geom_stratum(fill = "grey") +
  geom_text(stat = "stratum", label.strata = TRUE) +
  scale_fill_manual(values = c('blue', 'green', 'red')) +
  scale_x_discrete(limits = c("kid", "coca"), expand = c(.1, .1)) +
  labs(title = "Male Score Quartile Counts by Training Corpus",
       y = "Male score quartile") +
  theme_classic()

Kid ~ Wiki Adult

# Word counts for every (wiki quartile, kid quartile) cell, each cell tagged
# with which of the two ranks the words as more male.
quartile_table <- count(quartiles_wide, wiki_quartile, kid_quartile)
quartile_table <- mutate(
  quartile_table,
  delta = case_when(kid_quartile == wiki_quartile ~ "same",
                    kid_quartile > wiki_quartile ~ "kid more male",
                    wiki_quartile > kid_quartile ~ "wiki more male")
)

# Alluvial diagram: kid quartiles on the first axis, wiki quartiles on the
# second, flows sized by word count and coloured by `delta`.
quartile_table %>%
  ggplot(aes(axis1 = kid_quartile,
             axis2 = wiki_quartile,
             y = n,
             fill = delta)) +
  geom_alluvium() +
  geom_stratum(fill = "grey") +
  geom_text(stat = "stratum", label.strata = TRUE) +
  scale_fill_manual(values = c('green', 'red', 'blue')) +
  scale_x_discrete(limits = c("kid", "wiki"), expand = c(.1, .1)) +
  labs(title = "Male Score Quartile Counts by Training Corpus",
       y = "Male score quartile") +
  theme_classic()

Kid ~ Human

# Word counts for every (human quartile, kid quartile) cell, each cell tagged
# with which of the two ranks the words as more male.
quartile_table <- quartiles_wide %>%
  count(human_quartile, kid_quartile) %>%
  mutate(delta = case_when(human_quartile > kid_quartile ~ "human more male",
                            kid_quartile > human_quartile ~ "kid more male",
                            kid_quartile == human_quartile ~ "same"))

# Alluvial diagram: kid quartiles on the first axis, human-rating quartiles on
# the second, flows sized by word count and coloured by `delta`.
ggplot(quartile_table,
       aes(axis1 = kid_quartile,
           axis2 = human_quartile,
           y = n,
           fill = delta)) +
  geom_alluvium() +
  geom_stratum(fill = "grey") +
  geom_text(stat = "stratum", 
            label.strata = TRUE) +
  scale_fill_manual(values = c('blue', 'green', 'red')) +
  # Axis labels must follow axis1/axis2: the second axis is human, not wiki.
  scale_x_discrete(limits = c("kid", "human"),
                   expand = c(.1, .1)) +
  theme_classic() +
      labs(title = "Male Score Quartile Counts by Training Corpus",
       y = "Male score quartile") 

Coca Adult ~ Wiki Adult

# Word counts for every (wiki quartile, coca quartile) cell, each cell tagged
# with which of the two corpora ranks the words as more male.
quartile_table <- quartiles_wide %>%
  count(wiki_quartile, coca_quartile) %>%
  mutate(delta = case_when(coca_quartile > wiki_quartile ~ "coca more male",
                            wiki_quartile > coca_quartile ~ "wiki more male",
                            wiki_quartile == coca_quartile ~ "same"))

# Alluvial diagram: coca quartiles on the first axis, wiki quartiles on the
# second, flows sized by word count and coloured by `delta`.
ggplot(quartile_table,
       aes(axis1 = coca_quartile,
           axis2 = wiki_quartile,
           y = n,
           fill = delta)) +
  geom_alluvium() +
  geom_stratum(fill = "grey") +
  geom_text(stat = "stratum", 
            label.strata = TRUE) +
  scale_fill_manual(values = c('green', 'red', 'blue')) +
  # Axis labels must follow axis1/axis2: the first axis is coca, not kid.
  scale_x_discrete(limits = c("coca", "wiki"),
                   expand = c(.1, .1)) +
  theme_classic() +
      labs(title = "Male Score Quartile Counts by Training Corpus",
       y = "Male score quartile") 

Summary Table

This is the table you were suggesting. Words are defined as male/female biased if they’re in the top/bottom quartile of male scores, respectively.

Kid -> Coca

# Cross-tabulate the kid bias label (rows) against the coca bias label
# (columns); cells are word counts.
kable(
  spread(count(quartiles_wide, kid_type, coca_type), key = coca_type, value = n),
  col.names = c("Bias in kid corpus", "Coca biased female", "Coca biased male", "Coca biased neither")
)
Bias in kid corpus Coca biased female Coca biased male Coca biased neither
female 71 38 102
male 38 60 112
neither 102 112 206

Kid -> Adult human (ratings)

# Cross-tabulate the kid bias label (rows) against the human-rating bias label
# (columns); cells are word counts.
kable(
  spread(count(quartiles_wide, kid_type, human_type), key = human_type, value = n),
  col.names = c("Bias in kid corpus", "Adult human biased female", "Adult human biased male", "Adult human biased neither")
)
Bias in kid corpus Adult human biased female Adult human biased male Adult human biased neither
female 80 31 100
male 41 59 110
neither 90 120 210