# Load the cluster-level table for all models. Column names are supplied
# explicitly via `col_names` instead of being read from the file.
CLUSTER_PATH <- here(
  "analysis/books/adult_comparision/cluster_analysis/data/clusters_all_models.csv"
)
clusters <- read_csv(
  CLUSTER_PATH,
  col_names = c(
    "cluster_id", "n_words", "mean_gender", "tsne_x", "tsne_y",
    "effect_size", "is_biased", "bias_type", "model_type", "run"
  )
) %>%
  # Rows without a run number are assigned run 1.
  mutate(run = if_else(is.na(run), 1, run))
Clusters are identified as gendered based on human judgments. NAs are cases where the cluster contained fewer than 3 words.
# Number of clusters of each bias type per model. One row per unique cluster
# is counted; all coca runs are kept, other model types only for runs below 2.
clusters %>%
  distinct(model_type, run, cluster_id, bias_type) %>%
  count(model_type, run, bias_type) %>%
  # Combinations of model type, run and bias type with no clusters get n = 0.
  complete(model_type, run, bias_type, fill = list(n = 0)) %>%
  filter(run < 2 | model_type == "coca") %>%
  # Combine model type and run into a single x-axis label, e.g. "coca_3".
  unite(col = "model", model_type, run, sep = "_") %>%
  ggplot(mapping = aes(x = model, y = n, group = bias_type, colour = bias_type)) +
  geom_point() +
  geom_line() +
  labs(y = "N clusters") +
  theme_classic() +
  theme(axis.text.x = element_text(hjust = 1, angle = 90))
Excluding NA and “neither” categories:
# Same count-per-model plot, restricted to clusters whose bias type is
# "female" or "male".
clusters %>%
  filter(bias_type %in% c("female", "male")) %>%
  distinct(model_type, run, cluster_id, bias_type) %>%
  count(model_type, run, bias_type) %>%
  # Combinations of model type, run and bias type with no clusters get n = 0.
  complete(model_type, run, bias_type, fill = list(n = 0)) %>%
  filter(run < 2 | model_type == "coca") %>%
  # Combine model type and run into a single x-axis label, e.g. "coca_3".
  unite(col = "model", model_type, run, sep = "_") %>%
  ggplot(mapping = aes(x = model, y = n, group = bias_type, colour = bias_type)) +
  geom_point() +
  geom_line() +
  labs(y = "N clusters") +
  theme_classic() +
  theme(axis.text.x = element_text(hjust = 1, angle = 90))
Taking the mean across coca runs. Dashed lines correspond to the mean in the kid corpus.
# Per-run counts of female/male clusters, summarised within each
# model type x bias type group by multi_boot_standard().
# NOTE(review): multi_boot_standard() is presumably the langcog bootstrap
# helper; the plot that follows reads its `mean`, `ci_lower` and `ci_upper`
# columns - confirm.
mean_n_clusters <- clusters %>%
  filter(bias_type %in% c("female", "male")) %>%
  distinct(model_type, run, cluster_id, bias_type) %>%
  count(model_type, run, bias_type) %>%
  # Combinations of model type, run and bias type with no clusters get n = 0.
  complete(model_type, run, bias_type, fill = list(n = 0)) %>%
  # Keep every coca run; other model types only for runs below 2.
  filter(run < 2 | model_type == "coca") %>%
  group_by(model_type, bias_type) %>%
  multi_boot_standard(col = "n")
# Bars: summary value per bias type for coca, with ci_lower/ci_upper ranges.
# Dashed horizontal lines: the corresponding values for the kid model.
mean_n_clusters %>%
  filter(model_type == "coca") %>%
  ggplot(aes(x = bias_type, y = mean, fill = bias_type)) +
  geom_col() +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  geom_hline(
    data = filter(mean_n_clusters, model_type == "kid"),
    mapping = aes(yintercept = mean, colour = bias_type),
    linetype = 2
  ) +
  labs(x = "Coca bias type", y = "mean n biased clusters in coca") +
  theme_classic() +
  theme(legend.position = "none")
Of those clusters that are gendered, what’s the distribution of how gendered they are?
# Density of mean_gender over clusters flagged `is_biased`, one panel per
# model type x run combination.
clusters %>%
  filter(is_biased) %>%
  distinct(model_type, run, cluster_id, mean_gender) %>%
  ggplot(mapping = aes(x = mean_gender)) +
  geom_density() +
  # geom_vline(aes(xintercept = 0), linetype = 2) +
  facet_wrap(~ model_type + run) +
  theme_classic()
The above analysis isn’t totally fair, though, because there are more words in the coca texts that are missing from the norms we collected.
# Sum of `n_words` over the rows of `clusters` for each model type and run,
# rendered as a table.
clusters %>%
  group_by(model_type, run) %>%
  summarise(n = sum(n_words)) %>%
  kable()
model_type | run | n |
---|---|---|
coca | 1 | 834 |
coca | 2 | 831 |
coca | 3 | 819 |
coca | 4 | 818 |
coca | 5 | 828 |
coca | 6 | 830 |
coca | 7 | 813 |
coca | 8 | 839 |
coca | 9 | 836 |
coca | 10 | 803 |
kid | 1 | 1121 |
Looking only at words that are common across all models.
# Load the word-level cluster assignments for all models. Column names are
# supplied explicitly via `col_names` instead of being read from the file.
WORD_PATH <- here(
  "analysis/books/adult_comparision/cluster_analysis/data/by_word_clusters_all_models.csv"
)
words <- read_csv(
  WORD_PATH,
  col_names = c(
    "word", "tsne_X", "tsne_Y", "cluster_id", "is_gendered",
    "gender_bias", "effect_size", "mean_gender", "model_type", "run"
  )
) %>%
  # Rows without a run number are assigned run 1.
  mutate(run = if_else(is.na(run), 1, run))
# Words present in every model type. The split is by `model_type` only, so
# all runs of a model type are pooled before the intersection is taken.
common_words <- reduce(
  with(words, split(word, model_type)),
  intersect
)
# Interactive table of gendered words shared by all model types: one row per
# unique (model_type, word, gender_bias), sorted for easier browsing.
words %>%
  filter(word %in% common_words) %>%
  filter(is_gendered) %>%
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
  distinct(model_type, word, gender_bias, .keep_all = TRUE) %>%
  select(model_type, gender_bias, word, cluster_id, effect_size) %>%
  arrange(model_type, gender_bias, cluster_id, word, effect_size) %>%
  DT::datatable()
Words in clusters with “she” vs. “he”. No clear qualitative pattern.
# All common words belonging to a cluster (within a model type and run) that
# contains "she" and/or "he", with flags recording which pronoun is present.
targ_words <- words %>%
  filter(word %in% common_words) %>%
  group_by(model_type, run, cluster_id) %>%
  nest() %>%
  # Current tidyr keeps the grouping after nest(); drop it so that later
  # distinct()/filter() calls do not silently carry run and cluster_id along.
  ungroup() %>%
  mutate(
    contains_she = map_lgl(data, ~ "she" %in% .x$word),
    contains_he = map_lgl(data, ~ "he" %in% .x$word)
  ) %>%
  filter(contains_she | contains_he) %>%
  # Name the list-column explicitly; a bare unnest() is deprecated.
  unnest(cols = c(data))
# Unique words, per model type, found in clusters that contain "he".
filter(targ_words, contains_he) %>%
  distinct(model_type, word) %>%
  data.frame()
## model_type word
## 1 kid away
## 2 kid back
## 3 kid down
## 4 kid he
## 5 kid him
## 6 kid himself
## 7 kid his
## 8 kid instead
## 9 kid out
## 10 kid run
## 11 kid then
## 12 kid tired
## 13 kid up
## 14 coca away
## 15 coca big
## 16 coca bit
## 17 coca drive
## 18 coca fell
## 19 coca he
## 20 coca her
## 21 coca left
## 22 coca little
## 23 coca put
## 24 coca saw
## 25 coca she
## 26 coca smiled
## 27 coca stopped
## 28 coca could
## 29 coca empty
## 30 coca end
## 31 coca hard
## 32 coca remember
## 33 coca same
## 34 coca see
## 35 coca something
## 36 coca wondered
## 37 coca almost
## 38 coca grace
## 39 coca house
## 40 coca large
## 41 coca seemed
## 42 coca tried
## 43 coca again
## 44 coca before
## 45 coca him
## 46 coca went
## 47 coca another
## 48 coca man
## 49 coca now
## 50 coca once
## 51 coca own
## 52 coca some
## 53 coca there
## 54 coca turn
## 55 coca gave
## 56 coca having
## 57 coca heard
## 58 coca mother
## 59 coca must
## 60 coca way
## 61 coca enough
## 62 coca full
## 63 coca help
## 64 coca like
## 65 coca more
## 66 coca perhaps
## 67 coca spoke
## 68 coca than
## 69 coca world
## 70 coca brought
## 71 coca finally
## 72 coca himself
## 73 coca while
## 74 coca eye
## 75 coca finished
## 76 coca rest
## 77 coca seen
## 78 coca woman
## 79 coca half
## 80 coca hot
## 81 coca look
## 82 coca started
## 83 coca used
## 84 coca worked
# Unique words, per model type, found in clusters that contain "she".
filter(targ_words, contains_she) %>%
  distinct(model_type, word) %>%
  data.frame()
## model_type word
## 1 kid animals
## 2 kid decided
## 3 kid had
## 4 kid hand
## 5 kid other
## 6 kid place
## 7 kid pretty
## 8 kid she
## 9 kid still
## 10 kid their
## 11 kid they
## 12 kid were
## 13 coca away
## 14 coca big
## 15 coca bit
## 16 coca drive
## 17 coca fell
## 18 coca he
## 19 coca her
## 20 coca left
## 21 coca little
## 22 coca put
## 23 coca saw
## 24 coca she
## 25 coca smiled
## 26 coca stopped
## 27 coca came
## 28 coca his
## 29 coca later
## 30 coca moment
## 31 coca stand
## 32 coca another
## 33 coca cold
## 34 coca like
## 35 coca looked
## 36 coca looking
## 37 coca my
## 38 coca off
## 39 coca out
## 40 coca then
## 41 coca took
## 42 coca turn
## 43 coca under
## 44 coca up
## 45 coca again
## 46 coca before
## 47 coca hard
## 48 coca him
## 49 coca went
## 50 coca along
## 51 coca and
## 52 coca between
## 53 coca car
## 54 coca got
## 55 coca high
## 56 coca long
## 57 coca other
## 58 coca still
## 59 coca them
## 60 coca walking
## 61 coca woman
## 62 coca almost
## 63 coca boy
## 64 coca quiet
## 65 coca table
## 66 coca teeth
## 67 coca turned
## 68 coca watching
## 69 coca back
## 70 coca herself
## 71 coca mother
## 72 coca over
## 73 coca top
## 74 coca brought
## 75 coca finally
## 76 coca himself
## 77 coca while
## 78 coca felt
## 79 coca heard
## 80 coca ran
## 81 coca last
## 82 coca made
## 83 coca station
Words plotted in 2D. Only “gendered” words are shown, i.e., those with a gender value > 4 or < 2.
# Load per-word 2D coordinates; column names are supplied explicitly via
# `col_names` instead of being read from the file.
WORD_COORDS <- here(
  "analysis/books/adult_comparision/cluster_analysis/data/by_word_coordinates.csv"
)
word_cords <- read_csv(
  WORD_COORDS,
  col_names = c("tsne_X", "tsne_Y", "word", "gender", "model_type", "run")
)
# Scatter of words with gender above 4 or below 2, coloured on a diverging
# scale centred at 3, one panel per model type x run combination.
word_cords %>%
  filter(gender < 2 | gender > 4) %>%
  ggplot(mapping = aes(x = tsne_X, y = tsne_Y, colour = gender)) +
  geom_point(alpha = .2) +
  scale_color_gradient2(
    low = "blue", mid = "white", high = "red",
    midpoint = 3, space = "Lab"
  ) +
  facet_wrap(~ model_type + run) +
  theme_classic()