In this analysis I’m comparing the gender scores of words derived from the corpus itself, across different corpora. A gender score is derived for each word by measuring its distance to a set of MALE anchor words (“son”, “his”, “him”, “he”, “brother”, “boy”, “man”) and FEMALE anchor words (“daughter”, “hers”, “her”, “she”, “sister”, “girl”, “woman”), and taking the difference (mean male - mean female distance). I’ve also included the human ratings we collected for comparison.
“coca”, “kid”, and “wiki” refer to male scores derived from the corresponding corpus. “human” refers to the gender ratings that we collected from participants.
Raw Scores
# Load the tidy per-model gender scores and collapse them to a single mean
# male score for every (model_type, word) pair.
SCORE_PATH <- here("analysis/books/adult_comparision/correlation_analysis_2/tidy_gender_scores2.csv")
model_biases <- read_csv(SCORE_PATH)
model_scores <- model_biases %>%
  group_by(model_type, word) %>%
  summarize(male_score = mean(male_score)) %>%
  # summarize() only drops the last grouping variable, so the result would
  # still be grouped by model_type; ungroup explicitly so later steps never
  # run per-group by accident.
  ungroup() %>%
  # The anchor words define the gender axis, so they are removed from the
  # set of scored words.
  filter(!(word %in% c("daughter", "hers", "her", "she", "sister", "girl", "woman",
                       "son", "his", "him", "he", "brother", "boy", "man")))
# Load the human gender ratings and reduce them to one rating per word.
GENDER_NORMS <- here("data/processed/words/gender_ratings_mean.csv")
gender_words <- read_csv(GENDER_NORMS)
gender_norms <- gender_words %>%
  # Normalise the word column: keep only the first space-separated token,
  # lower-case it, and strip punctuation.
  mutate(
    word = map_chr(word, ~ unlist(str_split(.x, " "))[[1]]),
    word = tolower(word),
    word = str_remove_all(word, '[:punct:]')
  ) %>%
  # NOTE(review): distinct() keeps only the first row for each normalised
  # word, so the mean below is always taken over a single row. If the intent
  # was to average duplicate entries, distinct() should be dropped -- confirm.
  distinct(word, .keep_all = TRUE) %>%
  group_by(word) %>%
  summarize(human_gender_rating = mean(mean, na.rm = TRUE))
# One row per word with a column per corpus plus the human rating, restricted
# to words that have a value for every measure.
model_scores_wide <- model_scores %>%
  spread(model_type, male_score) %>%
  # Join on the word explicitly instead of relying on an implicit natural
  # join over whatever columns happen to share a name.
  inner_join(gender_norms, by = "word") %>%
  # scale_this() is defined outside this section -- presumably a z-score;
  # confirm. NOTE(review): scaling is applied before drop_na(), so the rows
  # that survive need not have unit SD within each column.
  mutate(
    coca = scale_this(coca),
    kid = scale_this(kid),
    wiki = scale_this(wiki),
    # ratings and model scores are oriented differently, so flip the sign
    human = -scale_this(human_gender_rating)
  ) %>%
  select(-human_gender_rating) %>%
  drop_na()
# Interactive table of the scaled scores, rounded to four decimals for display.
model_scores_wide %>%
  mutate_if(is.numeric, round, 4) %>%
  DT::datatable()
Distributions
# Long format: one row per (word, measure) pair, used for the density plots
# and the SD table.
dists <- gather(
  select(model_scores_wide, word, coca, kid, wiki, human),
  key = "measure",
  value = "value",
  -word
)
# Density of the scaled male scores, one facet per measure.
ggplot(dists, aes(x = value, fill = measure, group = measure)) +
  geom_density(alpha = .4) +
  facet_grid(. ~ measure) +
  labs(x = "male score") +
  theme_classic() +
  theme(legend.position = "none")

# Standard deviation of the scaled scores within each measure.
kable(
  summarize(group_by(dists, measure), sd = sd(value)),
  caption = "SDs by Corpus"
)
SDs by Corpus

| measure | sd |
|---|---|
| coca | 0.9660780 |
| human | 0.8844889 |
| kid | 0.9828390 |
| wiki | 0.9392539 |
Correlation
Kid ~ Coca Adult
# Scatter of kid vs. COCA male scores with a linear fit.
ggplot(model_scores_wide, aes(kid, coca)) +
  geom_point(alpha = .5) +
  geom_smooth(method = "lm") +
  theme_classic()

# Pearson correlation between the two measures, shown as a one-row table.
kable(tidy(with(model_scores_wide, cor.test(coca, kid))))
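| estimate | statistic | p.value | parameter | conf.low | conf.high | method | alternative |
|---|---|---|---|---|---|---|---|
| 0.1761631 | 5.18372 | 3e-07 | 839 | 0.109869 | 0.2408968 | Pearson’s product-moment correlation | two.sided |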
Kid ~ Human
# Scatter of kid male scores vs. (sign-flipped) human ratings with a linear fit.
ggplot(model_scores_wide, aes(kid, human)) +
  geom_point(alpha = .5) +
  geom_smooth(method = "lm") +
  theme_classic()

# Pearson correlation between the two measures, shown as a one-row table.
kable(tidy(with(model_scores_wide, cor.test(human, kid))))
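| estimate | statistic | p.value | parameter | conf.low | conf.high | method | alternative |
|---|---|---|---|---|---|---|---|
| 0.2552892 | 7.647998 | 0 | 839 | 0.1909827 | 0.3174138 | Pearson’s product-moment correlation | two.sided |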
Kid ~ Wiki Adult
# Scatter of kid vs. Wikipedia male scores with a linear fit.
ggplot(model_scores_wide, aes(kid, wiki)) +
  geom_point(alpha = .5) +
  geom_smooth(method = "lm") +
  theme_classic()

# Pearson correlation between the two measures, shown as a one-row table.
kable(tidy(with(model_scores_wide, cor.test(wiki, kid))))
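| estimate | statistic | p.value | parameter | conf.low | conf.high | method | alternative |
|---|---|---|---|---|---|---|---|
| 0.236491 | 7.050065 | 0 | 839 | 0.1716324 | 0.2993084 | Pearson’s product-moment correlation | two.sided |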
Coca Adult ~ Wiki Adult
# Scatter of COCA vs. Wikipedia male scores with a linear fit.
ggplot(model_scores_wide, aes(coca, wiki)) +
  geom_point(alpha = .5) +
  geom_smooth(method = "lm") +
  theme_classic()

# Pearson correlation between the two measures, shown as a one-row table.
kable(tidy(with(model_scores_wide, cor.test(wiki, coca))))
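| estimate | statistic | p.value | parameter | conf.low | conf.high | method | alternative |
|---|---|---|---|---|---|---|---|
| 0.3683333 | 11.47578 | 0 | 839 | 0.3084102 | 0.4253447 | Pearson’s product-moment correlation | two.sided |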
Coca Adult ~ Human
# Scatter of COCA male scores vs. (sign-flipped) human ratings with a linear fit.
ggplot(model_scores_wide, aes(coca, human)) +
  geom_point(alpha = .5) +
  geom_smooth(method = "lm") +
  theme_classic()

# Pearson correlation between the two measures, shown as a one-row table.
kable(tidy(with(model_scores_wide, cor.test(human, coca))))
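| estimate | statistic | p.value | parameter | conf.low | conf.high | method | alternative |
|---|---|---|---|---|---|---|---|
| 0.2888854 | 8.740365 | 0 | 839 | 0.2256904 | 0.3496593 | Pearson’s product-moment correlation | two.sided |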
Wiki Adult ~ Human
# Scatter of Wikipedia male scores vs. (sign-flipped) human ratings with a
# linear fit.
ggplot(model_scores_wide, aes(wiki, human)) +
  geom_point(alpha = .5) +
  geom_smooth(method = "lm") +
  theme_classic()

# Pearson correlation between the two measures, shown as a one-row table.
kable(tidy(with(model_scores_wide, cor.test(wiki, human))))
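| estimate | statistic | p.value | parameter | conf.low | conf.high | method | alternative |
|---|---|---|---|---|---|---|---|
| 0.6147889 | 22.57874 | 0 | 839 | 0.5709143 | 0.6551621 | Pearson’s product-moment correlation | two.sided |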
Quartile Comparison
Kid ~ Coca Adult
# Number of rank-based bins each measure is cut into.
NUMTILES <- 4

# Bin every measure into NUMTILES rank groups, then label the lowest bin
# "female", bin 4 "male", and everything in between "neither".
# (The labels test for bins 1 and 4 literally, independent of NUMTILES.)
quartiles_wide <- model_scores_wide %>%
  mutate(
    coca_quartile = ntile(coca, NUMTILES),
    kid_quartile = ntile(kid, NUMTILES),
    wiki_quartile = ntile(wiki, NUMTILES),
    human_quartile = ntile(human, NUMTILES)
  ) %>%
  mutate(
    coca_type = case_when(
      coca_quartile == 4 ~ "male",
      coca_quartile == 1 ~ "female",
      TRUE ~ "neither"
    ),
    wiki_type = case_when(
      wiki_quartile == 4 ~ "male",
      wiki_quartile == 1 ~ "female",
      TRUE ~ "neither"
    ),
    kid_type = case_when(
      kid_quartile == 4 ~ "male",
      kid_quartile == 1 ~ "female",
      TRUE ~ "neither"
    ),
    human_type = case_when(
      human_quartile == 4 ~ "male",
      human_quartile == 1 ~ "female",
      TRUE ~ "neither"
    )
  )
# Cross-tabulate COCA and kid quartiles and tag each cell with which measure
# places the words in the higher (more male) quartile.
quartile_table <- quartiles_wide %>%
  count(coca_quartile, kid_quartile) %>%
  mutate(
    delta = case_when(
      kid_quartile == coca_quartile ~ "same",
      kid_quartile > coca_quartile ~ "kid more male",
      coca_quartile > kid_quartile ~ "coca more male"
    )
  )

# Alluvial diagram: kid quartile on the left axis, COCA quartile on the right;
# flow width is the number of words in each quartile pair.
ggplot(
  quartile_table,
  aes(y = n, axis1 = kid_quartile, axis2 = coca_quartile, fill = delta)
) +
  geom_alluvium() +
  geom_stratum(fill = "grey") +
  geom_text(stat = "stratum", label.strata = TRUE) +
  scale_x_discrete(limits = c("kid", "coca"), expand = c(.1, .1)) +
  # unnamed colours are assigned to the delta labels in their sorted order
  scale_fill_manual(values = c('blue', 'green', 'red')) +
  labs(
    title = "Male Score Quartile Counts by Training Corpus",
    y = "Male score quartile"
  ) +
  theme_classic()

Kid ~ Wiki Adult
# Cross-tabulate Wikipedia and kid quartiles and tag each cell with which
# measure places the words in the higher (more male) quartile.
quartile_table <- quartiles_wide %>%
  count(wiki_quartile, kid_quartile) %>%
  mutate(
    delta = case_when(
      kid_quartile == wiki_quartile ~ "same",
      kid_quartile > wiki_quartile ~ "kid more male",
      wiki_quartile > kid_quartile ~ "wiki more male"
    )
  )

# Alluvial diagram: kid quartile on the left axis, Wikipedia quartile on the
# right; flow width is the number of words in each quartile pair.
ggplot(
  quartile_table,
  aes(y = n, axis1 = kid_quartile, axis2 = wiki_quartile, fill = delta)
) +
  geom_alluvium() +
  geom_stratum(fill = "grey") +
  geom_text(stat = "stratum", label.strata = TRUE) +
  scale_x_discrete(limits = c("kid", "wiki"), expand = c(.1, .1)) +
  # unnamed colours are assigned to the delta labels in their sorted order
  scale_fill_manual(values = c('green', 'red', 'blue')) +
  labs(
    title = "Male Score Quartile Counts by Training Corpus",
    y = "Male score quartile"
  ) +
  theme_classic()

Kid ~ Human
# Cross-tabulate human-rating and kid quartiles and tag each cell with which
# measure places the words in the higher (more male) quartile.
quartile_table <- quartiles_wide %>%
  count(human_quartile, kid_quartile) %>%
  mutate(delta = case_when(human_quartile > kid_quartile ~ "human more male",
                           kid_quartile > human_quartile ~ "kid more male",
                           kid_quartile == human_quartile ~ "same"))

# Alluvial diagram: kid quartile on the left axis, human-rating quartile on
# the right; flow width is the number of words in each quartile pair.
ggplot(quartile_table,
       aes(axis1 = kid_quartile,
           axis2 = human_quartile,
           y = n,
           fill = delta)) +
  geom_alluvium() +
  geom_stratum(fill = "grey") +
  geom_text(stat = "stratum",
            label.strata = TRUE) +
  # unnamed colours are assigned to the delta labels in their sorted order
  scale_fill_manual(values = c('blue', 'green', 'red')) +
  # x-axis labels follow the axis1/axis2 order: kid first, human second
  scale_x_discrete(limits = c("kid", "human"),
                   expand = c(.1, .1)) +
  theme_classic() +
  labs(title = "Male Score Quartile Counts by Training Corpus",
       y = "Male score quartile")

Coca Adult ~ Wiki Adult
# Cross-tabulate Wikipedia and COCA quartiles and tag each cell with which
# corpus places the words in the higher (more male) quartile.
quartile_table <- quartiles_wide %>%
  count(wiki_quartile, coca_quartile) %>%
  mutate(delta = case_when(coca_quartile > wiki_quartile ~ "coca more male",
                           wiki_quartile > coca_quartile ~ "wiki more male",
                           wiki_quartile == coca_quartile ~ "same"))

# Alluvial diagram: COCA quartile on the left axis, Wikipedia quartile on the
# right; flow width is the number of words in each quartile pair.
ggplot(quartile_table,
       aes(axis1 = coca_quartile,
           axis2 = wiki_quartile,
           y = n,
           fill = delta)) +
  geom_alluvium() +
  geom_stratum(fill = "grey") +
  geom_text(stat = "stratum",
            label.strata = TRUE) +
  # unnamed colours are assigned to the delta labels in their sorted order
  scale_fill_manual(values = c('green', 'red', 'blue')) +
  # x-axis labels follow the axis1/axis2 order: coca first, wiki second
  scale_x_discrete(limits = c("coca", "wiki"),
                   expand = c(.1, .1)) +
  theme_classic() +
  labs(title = "Male Score Quartile Counts by Training Corpus",
       y = "Male score quartile")

Summary Table
This is the table you were suggesting. Words are defined as female/male biased if they’re in the bottom/top quartile of male scores, respectively.
Kid -> Coca
# Contingency table of bias labels: kid corpus (rows) by COCA (columns).
quartiles_wide %>%
  count(kid_type, coca_type) %>%
  spread(key = coca_type, value = n) %>%
  kable(
    col.names = c(
      "Bias in kid corpus",
      "Coca biased female",
      "Coca biased male",
      "Coca biased neither"
    )
  )
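| Bias in kid corpus | Coca biased female | Coca biased male | Coca biased neither |
|---|---|---|---|
| female | 71 | 38 | 102 |
| male | 38 | 60 | 112 |
| neither | 102 | 112 | 206 |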
Kid -> Adult human (ratings)
# Contingency table of bias labels: kid corpus (rows) by human ratings (columns).
quartiles_wide %>%
  count(kid_type, human_type) %>%
  spread(key = human_type, value = n) %>%
  kable(
    col.names = c(
      "Bias in kid corpus",
      "Adult human biased female",
      "Adult human biased male",
      "Adult human biased neither"
    )
  )
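| Bias in kid corpus | Adult human biased female | Adult human biased male | Adult human biased neither |
|---|---|---|---|
| female | 80 | 31 | 100 |
| male | 41 | 59 | 110 |
| neither | 90 | 120 | 210 |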