# Cluster-level table covering every model type and run.
CLUSTER_PATH <- here(
  "analysis/books/adult_comparision/cluster_analysis/data/clusters_all_models_common.csv"
)

# Column names are supplied explicitly, so the first row of the file is read
# as data rather than as a header.
clusters <- read_csv(
  CLUSTER_PATH,
  col_names = c(
    "cluster_id", "n_words", "mean_gender", "tsne_x", "tsne_y",
    "effect_size", "is_biased", "bias_type", "model_type", "run"
  )
) %>%
  # Rows with a missing run number are assigned run 1.
  mutate(run = if_else(is.na(run), 1, run))

Gender distribution of clusters by run

Clusters are identified as gendered based on human judgments. NAs are cases where the cluster contained fewer than 3 words.

# Number of clusters of each bias type, per model type and run.
clusters %>%
  distinct(model_type, run, cluster_id, bias_type) %>%
  count(model_type, run, bias_type) %>%
  # Make absent model/run/bias_type combinations explicit with a count of 0.
  complete(model_type, run, bias_type, fill = list(n = 0)) %>%
  # Keep every coca run; other model types keep only runs numbered below 2.
  # This must follow complete(), which also creates rows for runs that a
  # model type never had.
  filter(model_type == "coca" | run < 2) %>%
  # Sort by numeric run before building the label, then freeze that order as
  # factor levels. As a plain string "coca_10" sorts before "coca_2", which
  # would scramble the x axis and the connecting line.
  arrange(model_type, run) %>%
  unite("model", model_type, run, sep = "_") %>%
  mutate(model = factor(model, levels = unique(model))) %>%
  ggplot(aes(x = model, y = n, color = bias_type, group = bias_type)) +
  geom_point() +
  geom_line() +
  ylab("N clusters") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Excluding NA and “neither” categories:

# Number of female- and male-biased clusters, per model type and run.
clusters %>%
  # Drops the "neither" category and clusters with a missing bias type.
  filter(bias_type %in% c("female", "male")) %>%
  distinct(model_type, run, cluster_id, bias_type) %>%
  count(model_type, run, bias_type) %>%
  # Make absent model/run/bias_type combinations explicit with a count of 0.
  complete(model_type, run, bias_type, fill = list(n = 0)) %>%
  # Keep every coca run; other model types keep only runs numbered below 2.
  # This must follow complete(), which also creates rows for runs that a
  # model type never had.
  filter(model_type == "coca" | run < 2) %>%
  # Sort by numeric run before building the label, then freeze that order as
  # factor levels. As a plain string "coca_10" sorts before "coca_2", which
  # would scramble the x axis and the connecting line.
  arrange(model_type, run) %>%
  unite("model", model_type, run, sep = "_") %>%
  mutate(model = factor(model, levels = unique(model))) %>%
  ggplot(aes(x = model, y = n, color = bias_type, group = bias_type)) +
  geom_point() +
  geom_line() +
  ylab("N clusters") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Taking the mean across coca runs. Dashed lines correspond to the mean in the kid corpus.

# Per-run counts of female- and male-biased clusters, summarised within each
# model type and bias type by multi_boot_standard() on the count column `n`.
# The plot that follows reads `mean`, `ci_lower` and `ci_upper` from the result.
# NOTE(review): presumably a bootstrapped mean and confidence interval; confirm
# against the multi_boot_standard() documentation.
mean_n_clusters <- multi_boot_standard(
  clusters %>%
    filter(bias_type %in% c("female", "male")) %>%
    distinct(model_type, run, cluster_id, bias_type) %>%
    count(model_type, run, bias_type) %>%
    # Zero-fill absent combinations first, then drop the filled-in rows for
    # non-coca runs numbered 2 and above.
    complete(model_type, run, bias_type, fill = list(n = 0)) %>%
    filter(model_type == "coca" | run < 2) %>%
    group_by(model_type, bias_type),
  col = "n"
)

# Bars: mean number of biased clusters across coca runs, with interval ranges.
# Dashed horizontal lines: the corresponding kid means, coloured by bias type.
mean_n_clusters %>%
  filter(model_type == "coca") %>%
  ggplot(aes(x = bias_type, y = mean, fill = bias_type)) +
  geom_col() +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  geom_hline(
    data = filter(mean_n_clusters, model_type == "kid"),
    aes(yintercept = mean, color = bias_type),
    linetype = 2
  ) +
  labs(x = "Coca bias type", y = "mean n biased clusters in coca") +
  theme_classic() +
  theme(legend.position = "none")

Of those clusters that are gendered, what’s the distribution of how gendered they are?

# Density of cluster-level mean gender for clusters flagged as biased, with one
# panel per model type and run.
ggplot(
  data = clusters %>%
    filter(is_biased) %>%
    # One row per cluster, so each cluster contributes once to the density.
    distinct(model_type, run, cluster_id, mean_gender),
  mapping = aes(x = mean_gender)
) +
  geom_density() +
  # Reference line at 0 is currently disabled:
  # geom_vline(aes(xintercept = 0), linetype = 2) +
  facet_wrap(model_type + run ~ .) +
  theme_classic()

# Total number of words (summed over cluster rows) per model type and run,
# rendered as a table.
kable(
  clusters %>%
    group_by(model_type, run) %>%
    summarise(n = sum(n_words))
)
model_type run n
coca 1 513
coca 2 513
coca 3 513
coca 4 513
coca 5 513
coca 6 513
coca 7 513
coca 8 513
coca 9 513
coca 10 513
kid 1 513

Word level

Looking only at words that are common across all models.

# Word-level table: one row per word per model type and run.
WORD_PATH <- here(
  "analysis/books/adult_comparision/cluster_analysis/data/by_word_clusters_all_models_common.csv"
)

# Column names are supplied explicitly, so the first row of the file is read
# as data rather than as a header.
words <- read_csv(
  WORD_PATH,
  col_names = c(
    "word", "tsne_X", "tsne_Y", "cluster_id", "is_gendered",
    "gender_bias", "effect_size", "mean_gender", "model_type", "run"
  )
) %>%
  # Rows with a missing run number are assigned run 1.
  mutate(run = if_else(is.na(run), 1, run))


# Words that occur under every model type: split the word column by model type
# and intersect the resulting vectors. The split is by model type only, not by
# run.
common_words <- reduce(
  split(words[["word"]], words[["model_type"]]),
  intersect
)

# Interactive table of gendered words that are shared across model types, with
# one row per model type / word / gender bias combination.
words %>%
  filter(word %in% common_words) %>%
  filter(is_gendered) %>%
  # .keep_all retains the remaining columns from the first row found for each
  # combination, so cluster_id and effect_size come from that first row only.
  # Spelled TRUE rather than T, because T is an ordinary variable that can be
  # reassigned.
  distinct(model_type, word, gender_bias, .keep_all = TRUE) %>%
  select(model_type, gender_bias, word, cluster_id, effect_size) %>%
  arrange(model_type, gender_bias, cluster_id, word, effect_size) %>%
  DT::datatable()

Words in clusters with “she” vs. “he”. No clear qualitative pattern.

# Words belonging to any cluster (within a model type and run) that contains
# "she" or "he". Each row keeps two flags recording which pronoun(s) its
# cluster contains.
targ_words <- words %>%
  filter(word %in% common_words) %>%
  group_by(model_type, run, cluster_id) %>%
  nest() %>%
  # On tidyr >= 1.0 the nested frame stays grouped. Drop the grouping so that
  # later distinct(model_type, word) calls return only those two columns and
  # do not carry run and cluster_id along as grouping variables.
  ungroup() %>%
  mutate(
    contains_she = map_lgl(data, ~ "she" %in% .x$word),
    contains_he = map_lgl(data, ~ "he" %in% .x$word)
  ) %>%
  filter(contains_she | contains_he) %>%
  # Name the list-column explicitly; a bare unnest() is deprecated.
  unnest(data)

# Distinct words, per model type, that share a cluster with "he". Converted to
# a base data frame so that every row is printed.
data.frame(
  distinct(
    filter(targ_words, contains_he),
    model_type, word
  )
)
##    model_type       word
## 1         kid         he
## 2         kid        him
## 3         kid        his
## 4         kid        run
## 5         kid       took
## 6         kid      under
## 7         kid       went
## 8        coca         he
## 9        coca       left
## 10       coca     little
## 11       coca         my
## 12       coca        she
## 13       coca   together
## 14       coca everything
## 15       coca       made
## 16       coca        see
## 17       coca      tried
## 18       coca        and
## 19       coca       came
## 20       coca       long
## 21       coca      under
## 22       coca      while
## 23       coca      still
## 24       coca        him
## 25       coca      house
## 26       coca       like
## 27       coca        man
## 28       coca    without
## 29       coca    himself
## 30       coca        let
## 31       coca       next
## 32       coca       once
## 33       coca      along
## 34       coca     coming
## 35       coca     enough
## 36       coca       same
## 37       coca       some
## 38       coca       time
## 39       coca       told
# Distinct words, per model type, that share a cluster with "she". Converted to
# a base data frame so that every row is printed.
data.frame(
  distinct(
    filter(targ_words, contains_she),
    model_type, word
  )
)
##    model_type     word
## 1         kid  brought
## 2         kid      her
## 3         kid     home
## 4         kid  nothing
## 5         kid      set
## 6         kid      she
## 7        coca       he
## 8        coca     left
## 9        coca   little
## 10       coca       my
## 11       coca      she
## 12       coca together
## 13       coca   father
## 14       coca    great
## 15       coca      men
## 16       coca      and
## 17       coca     came
## 18       coca     long
## 19       coca    under
## 20       coca    while
## 21       coca    still
## 22       coca   before
## 23       coca     both
## 24       coca     hard
## 25       coca     made
## 26       coca     near
## 27       coca     then
## 28       coca watching
## 29       coca      big
## 30       coca     high
## 31       coca      her
## 32       coca  herself
## 33       coca      his
## 34       coca     over
## 35       coca     felt
## 36       coca      him
## 37       coca     like
## 38       coca      one
## 39       coca      way

Words plotted in 2D; only “gendered” words (gender > 4 or < 2) are shown.

# Per-word 2-D coordinates with a gender value, for each model type and run.
WORD_COORDS <- here("analysis/books/adult_comparision/cluster_analysis/data/by_word_coordinates.csv")

# Column names are supplied explicitly, so the first row of the file is read
# as data rather than as a header.
word_cords <- read_csv(
  WORD_COORDS,
  col_names = c("tsne_X", "tsne_Y", "word", "gender", "model_type", "run")
) %>%
  # Recode a missing run to 1, as the cluster-level and word-level loaders in
  # this analysis do, so that facet labels by model type and run agree.
  # NOTE(review): assumes this file marks single-run models with a missing run
  # as the other two files do; this is a no-op if run is never missing.
  mutate(run = case_when(is.na(run) ~ 1, TRUE ~ run))

# Scatter of words in the 2-D space, one panel per model type and run. Only
# words with gender above 4 or below 2 are drawn. Colour diverges from blue
# (low) through white at the midpoint of 3 to red (high).
word_cords %>%
  filter(gender > 4 | gender < 2) %>%
  ggplot(aes(x = tsne_X, y = tsne_Y, color = gender)) +
  geom_point(alpha = .2) +
  scale_color_gradient2(
    low = "blue", mid = "white", high = "red",
    midpoint = 3, space = "Lab"
  ) +
  facet_wrap(model_type + run ~ .) +
  theme_classic()