# Path to the per-cluster summary file covering all models.
CLUSTER_PATH <- here("analysis/books/adult_comparision/cluster_analysis/data/clusters_all_models_common.csv")
# Read the cluster table, supplying the column names explicitly.
clusters <- read_csv(
  CLUSTER_PATH,
  col_names = c(
    "cluster_id", "n_words", "mean_gender",
    "tsne_x", "tsne_y", "effect_size",
    "is_biased", "bias_type", "model_type", "run"
  )
)
# Rows with a missing run number are assigned run 1.
clusters <- clusters %>%
  mutate(run = case_when(!is.na(run) ~ run, TRUE ~ 1))
Clusters are identified as gendered based on human judgments. `NA`s are cases where the cluster contained fewer than 3 words.
# Number of clusters of each bias type per model/run, drawn as one line per
# bias type.
clusters %>%
  # One row per cluster within a model/run before counting.
  distinct(model_type, run, cluster_id, bias_type) %>%
  count(model_type, run, bias_type) %>%
  # Combinations that never occur are added with a count of zero.
  complete(model_type, run, bias_type, fill = list(n = 0)) %>%
  # Keep every coca run, but only run 1 for any other model type.
  filter(run < 2 | model_type == "coca") %>%
  # Build a single "<model_type>_<run>" label for the x axis.
  unite(col = "model", model_type, run, sep = "_") %>%
  ggplot(aes(x = model, y = n, color = bias_type, group = bias_type)) +
  geom_point() +
  geom_line() +
  ylab("N clusters") +
  theme_classic() +
  # Rotate the model labels so they do not overlap.
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
Excluding NA and “neither” categories:
# Same count-per-model plot, restricted to clusters labelled "female" or
# "male".
clusters %>%
  filter(bias_type %in% c("female", "male")) %>%
  # One row per cluster within a model/run before counting.
  distinct(model_type, run, cluster_id, bias_type) %>%
  count(model_type, run, bias_type) %>%
  # Combinations that never occur are added with a count of zero.
  complete(model_type, run, bias_type, fill = list(n = 0)) %>%
  # Keep every coca run, but only run 1 for any other model type.
  filter(run < 2 | model_type == "coca") %>%
  # Build a single "<model_type>_<run>" label for the x axis.
  unite(col = "model", model_type, run, sep = "_") %>%
  ggplot(aes(x = model, y = n, color = bias_type, group = bias_type)) +
  geom_point() +
  geom_line() +
  ylab("N clusters") +
  theme_classic() +
  # Rotate the model labels so they do not overlap.
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
Taking the mean across coca runs. Dashed lines correspond to the mean in the kid corpus.
# Count female/male clusters per model and run, then summarise the counts
# within each model_type x bias_type cell.
mean_n_clusters <- clusters %>%
  filter(bias_type %in% c("female", "male")) %>%
  # One row per cluster within a model/run before counting.
  distinct(model_type, run, cluster_id, bias_type) %>%
  count(model_type, run, bias_type) %>%
  # Runs with no cluster of a given bias type contribute a count of zero.
  complete(model_type, run, bias_type, fill = list(n = 0)) %>%
  # Keep every coca run, but only run 1 for any other model type.
  filter(run < 2 | model_type == "coca") %>%
  group_by(model_type, bias_type) %>%
  # NOTE(review): multi_boot_standard() is defined outside this file; the plot
  # that follows reads `mean`, `ci_lower` and `ci_upper` from its result.
  multi_boot_standard(col = "n")
# Bars: mean number of biased clusters in coca, with interval ranges.
# Dashed horizontal lines: the corresponding means for the kid model.
mean_n_clusters %>%
  filter(model_type == "coca") %>%
  ggplot(aes(x = bias_type, y = mean, fill = bias_type)) +
  geom_bar(stat = "identity") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  xlab("Coca bias type") +
  ylab("mean n biased clusters in coca") +
  geom_hline(
    data = filter(mean_n_clusters, model_type == "kid"),
    mapping = aes(yintercept = mean, color = bias_type),
    linetype = "dashed"
  ) +
  theme_classic() +
  theme(legend.position = "none")
Of those clusters that are gendered, what’s the distribution of how gendered they are?
# Density of mean_gender over biased clusters, one panel per model/run.
ggplot(
  # One row per biased cluster within a model/run.
  distinct(
    filter(clusters, is_biased),
    model_type, run, cluster_id, mean_gender
  ),
  aes(x = mean_gender)
) +
  geom_density() +
  # geom_vline(aes(xintercept = 0), linetype = 2) +
  facet_wrap(model_type + run ~ .) +
  theme_classic()
# Total number of words (summed over clusters) for each model and run.
kable(
  summarize(
    group_by(clusters, model_type, run),
    n = sum(n_words)
  )
)
| model_type | run | n |
|---|---|---|
| coca | 1 | 513 |
| coca | 2 | 513 |
| coca | 3 | 513 |
| coca | 4 | 513 |
| coca | 5 | 513 |
| coca | 6 | 513 |
| coca | 7 | 513 |
| coca | 8 | 513 |
| coca | 9 | 513 |
| coca | 10 | 513 |
| kid | 1 | 513 |
Looking only at words that are common across all models.
# Path to the per-word cluster assignments covering all models.
WORD_PATH <- here("analysis/books/adult_comparision/cluster_analysis/data/by_word_clusters_all_models_common.csv")
# Read the word table, supplying the column names explicitly.
words <- read_csv(
  WORD_PATH,
  col_names = c(
    "word", "tsne_X", "tsne_Y", "cluster_id", "is_gendered",
    "gender_bias", "effect_size", "mean_gender", "model_type", "run"
  )
)
# Rows with a missing run number are assigned run 1.
words <- words %>%
  mutate(run = case_when(!is.na(run) ~ run, TRUE ~ 1))
# Words that occur under every model_type: split the word column by model
# type and intersect the resulting vectors.
common_words <- reduce(
  split(words$word, words$model_type),
  intersect
)
# Interactive table of gendered words that are common to all models.
words %>%
  filter(word %in% common_words) %>%
  filter(is_gendered) %>%
  # One row per (model_type, word, gender_bias); the remaining columns are
  # taken from the first matching row. Spelled TRUE rather than T because T is
  # an ordinary variable that can be reassigned.
  distinct(model_type, word, gender_bias, .keep_all = TRUE) %>%
  select(model_type, gender_bias, word, cluster_id, effect_size) %>%
  arrange(model_type, gender_bias, cluster_id, word, effect_size) %>%
  DT::datatable()
Words in clusters with “she” vs. “he”. No clear qualitative pattern.
# Words (restricted to those common to all models) from every cluster that
# contains "she" and/or "he", with one logical flag column for each pronoun.
targ_words <- words %>%
  filter(word %in% common_words) %>%
  # Collapse each model/run/cluster into a single row holding its words.
  group_by(model_type, run, cluster_id) %>%
  nest() %>%
  # Drop the grouping explicitly: newer tidyr keeps it after nest(), and
  # later distinct() calls would then carry the grouping columns along.
  ungroup() %>%
  mutate(
    contains_she = map_lgl(data, ~ "she" %in% .$word),
    contains_he = map_lgl(data, ~ "he" %in% .$word)
  ) %>%
  filter(contains_she | contains_he) %>%
  # Name the list column: calling unnest() with no columns is deprecated.
  unnest(data)
# Unique (model_type, word) pairs from clusters that contain "he".
data.frame(
  distinct(
    filter(targ_words, contains_he),
    model_type, word
  )
)
## model_type word
## 1 kid he
## 2 kid him
## 3 kid his
## 4 kid run
## 5 kid took
## 6 kid under
## 7 kid went
## 8 coca he
## 9 coca left
## 10 coca little
## 11 coca my
## 12 coca she
## 13 coca together
## 14 coca everything
## 15 coca made
## 16 coca see
## 17 coca tried
## 18 coca and
## 19 coca came
## 20 coca long
## 21 coca under
## 22 coca while
## 23 coca still
## 24 coca him
## 25 coca house
## 26 coca like
## 27 coca man
## 28 coca without
## 29 coca himself
## 30 coca let
## 31 coca next
## 32 coca once
## 33 coca along
## 34 coca coming
## 35 coca enough
## 36 coca same
## 37 coca some
## 38 coca time
## 39 coca told
# Unique (model_type, word) pairs from clusters that contain "she".
data.frame(
  distinct(
    filter(targ_words, contains_she),
    model_type, word
  )
)
## model_type word
## 1 kid brought
## 2 kid her
## 3 kid home
## 4 kid nothing
## 5 kid set
## 6 kid she
## 7 coca he
## 8 coca left
## 9 coca little
## 10 coca my
## 11 coca she
## 12 coca together
## 13 coca father
## 14 coca great
## 15 coca men
## 16 coca and
## 17 coca came
## 18 coca long
## 19 coca under
## 20 coca while
## 21 coca still
## 22 coca before
## 23 coca both
## 24 coca hard
## 25 coca made
## 26 coca near
## 27 coca then
## 28 coca watching
## 29 coca big
## 30 coca high
## 31 coca her
## 32 coca herself
## 33 coca his
## 34 coca over
## 35 coca felt
## 36 coca him
## 37 coca like
## 38 coca one
## 39 coca way
Words plotted in 2D. Only “gendered” words are shown (`gender` > 4 or < 2).
# Path to the per-word 2D coordinates.
WORD_COORDS <- here("analysis/books/adult_comparision/cluster_analysis/data/by_word_coordinates.csv")
# Read the coordinates, supplying the column names explicitly.
word_cords <- read_csv(
  WORD_COORDS,
  col_names = c("tsne_X", "tsne_Y", "word", "gender", "model_type", "run")
)
# Scatter of words whose gender value is below 2 or above 4, coloured on a
# blue-white-red scale centred at 3, one panel per model/run.
word_cords %>%
  filter(gender < 2 | gender > 4) %>%
  ggplot(aes(x = tsne_X, y = tsne_Y, color = gender)) +
  geom_point(alpha = 0.2) +
  scale_color_gradient2(
    low = "blue", mid = "white", high = "red",
    midpoint = 3, space = "Lab"
  ) +
  facet_wrap(model_type + run ~ .) +
  theme_classic()