# Location of the per-cluster summary file that covers every model and run.
CLUSTER_PATH <- here(
  "analysis/books/adult_comparision/cluster_analysis/data/clusters_all_models.csv"
)

# Read the cluster table with explicit column names; any row whose `run`
# is missing is assigned run 1.
clusters <- read_csv(
  CLUSTER_PATH,
  col_names = c(
    "cluster_id", "n_words", "mean_gender", "tsne_x", "tsne_y",
    "effect_size", "is_biased", "bias_type", "model_type", "run"
  )
) %>%
  mutate(run = if_else(is.na(run), 1, run))

Gender distribution of clusters by run

Clusters are identified as gendered based on human judgments. NAs are cases where the cluster contained fewer than 3 words.

# Number of distinct clusters of each bias type (including NA) per model/run.
# Combinations with no clusters are filled in with zero, and for models
# other than coca only run 1 is kept.
ggplot(
  data = clusters %>%
    distinct(model_type, run, cluster_id, bias_type) %>%
    count(model_type, run, bias_type) %>%
    complete(model_type, run, bias_type, fill = list(n = 0)) %>%
    filter(run < 2 | model_type == "coca") %>%
    unite("model", model_type, run, sep = "_"),
  mapping = aes(x = model, y = n, color = bias_type, group = bias_type)
) +
  geom_point() +
  geom_line() +
  labs(y = "N clusters") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Excluding NA and “neither” categories:

# Same count as the previous plot, restricted to clusters whose bias type
# is "female" or "male". Missing model/run/bias combinations count as zero,
# and for models other than coca only run 1 is kept.
ggplot(
  data = clusters %>%
    filter(bias_type %in% c("female", "male")) %>%
    distinct(model_type, run, cluster_id, bias_type) %>%
    count(model_type, run, bias_type) %>%
    complete(model_type, run, bias_type, fill = list(n = 0)) %>%
    filter(run < 2 | model_type == "coca") %>%
    unite("model", model_type, run, sep = "_"),
  mapping = aes(x = model, y = n, color = bias_type, group = bias_type)
) +
  geom_point() +
  geom_line() +
  labs(y = "N clusters") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Taking the mean across coca runs. Dashed lines correspond to mean in kid corpus.

# Per-run counts of female/male clusters (zero-filled), summarised within
# each model type and bias type by multi_boot_standard(), which supplies
# the `mean`, `ci_lower` and `ci_upper` columns used in the plot below.
mean_n_clusters <- clusters %>%
  filter(bias_type %in% c("female", "male")) %>%
  distinct(model_type, run, cluster_id, bias_type) %>%
  count(model_type, run, bias_type) %>%
  complete(model_type, run, bias_type, fill = list(n = 0)) %>%
  filter(run < 2 | model_type == "coca") %>%
  group_by(model_type, bias_type) %>%
  multi_boot_standard(col = "n")

# Bars and interval ranges show the coca rows; the dashed horizontal lines
# show the kid rows.
mean_n_clusters %>%
  filter(model_type == "coca") %>%
  ggplot(aes(x = bias_type, y = mean, fill = bias_type)) +
  geom_col() +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  geom_hline(
    data = filter(mean_n_clusters, model_type == "kid"),
    mapping = aes(yintercept = mean, color = bias_type),
    linetype = 2
  ) +
  labs(x = "Coca bias type", y = "mean n biased clusters in coca") +
  theme_classic() +
  theme(legend.position = "none")

Of those clusters that are gendered, what’s the distribution of how gendered they are?

# Density of `mean_gender` over the distinct clusters flagged `is_biased`,
# drawn in a separate panel for each model type / run combination.
ggplot(
  data = clusters %>%
    filter(is_biased) %>%
    distinct(model_type, run, cluster_id, mean_gender),
  mapping = aes(x = mean_gender)
) +
  geom_density() +
  # geom_vline(aes(xintercept = 0), linetype = 2) +
  facet_wrap(model_type + run ~ .) +
  theme_classic()

The above analysis isn’t totally fair, though, because there are more words in the coca texts that are missing from the norms we collected.

# Sum of `n_words` for each model type and run, rendered as a table.
kable(
  summarize(
    group_by(clusters, model_type, run),
    n = sum(n_words)
  )
)
model_type run n
coca 1 834
coca 2 831
coca 3 819
coca 4 818
coca 5 828
coca 6 830
coca 7 813
coca 8 839
coca 9 836
coca 10 803
kid 1 1121

Word level

Looking only at words that are common across all models.

# Location of the per-word cluster assignments for every model and run.
WORD_PATH <- here("analysis/books/adult_comparision/cluster_analysis/data/by_word_clusters_all_models.csv")

# Read the word table with explicit column names; any row whose `run` is
# missing is assigned run 1.
words <- read_csv(
  WORD_PATH,
  col_names = c(
    "word", "tsne_X", "tsne_Y", "cluster_id", "is_gendered",
    "gender_bias", "effect_size", "mean_gender", "model_type", "run"
  )
) %>%
  mutate(run = case_when(is.na(run) ~ 1, TRUE ~ run))

# Words that occur under every model type.
# NOTE(review): the split is by `model_type` only, so a word counts as
# common if it appears in at least one run of each model type, not
# necessarily in every run -- confirm this is the intended definition.
common_words <- split(words$word, words$model_type) %>%
  reduce(intersect)

# Interactive table of the common words flagged `is_gendered`: one row per
# model type / word / gender bias (the first matching row supplies the other
# columns).
words %>%
  filter(word %in% common_words) %>%
  filter(is_gendered) %>%
  # TRUE rather than T: `T` is an ordinary variable that can be reassigned.
  distinct(model_type, word, gender_bias, .keep_all = TRUE) %>%
  select(model_type, gender_bias, word, cluster_id, effect_size) %>%
  arrange(model_type, gender_bias, cluster_id, word, effect_size) %>%
  DT::datatable()

Words in clusters with “she” vs. “he”. No clear qualitative pattern.

# Common words belonging to any cluster (within a model type and run) that
# contains "she" or "he". `contains_she` / `contains_he` record which of the
# two pronouns the word's cluster contains.
targ_words <- words %>%
  filter(word %in% common_words) %>%
  group_by(model_type, run, cluster_id) %>%
  nest() %>%
  mutate(contains_she = map_lgl(data, ~ "she" %in% .$word),
         contains_he = map_lgl(data, ~ "he" %in% .$word)) %>%
  filter(contains_she | contains_he) %>%
  # Name the list-column explicitly: calling unnest() without `cols` is
  # deprecated in tidyr >= 1.0.
  unnest(cols = c(data)) %>%
  # tidyr >= 1.0 keeps the group_by() grouping through nest()/unnest().
  # Drop it so that distinct(model_type, word) below does not also retain
  # `run` and `cluster_id`.
  ungroup()

# Unique words, per model type, that share a cluster with "he".
targ_words %>%
  filter(contains_he) %>%
  distinct(model_type, word) %>%
  data.frame()
##    model_type      word
## 1         kid      away
## 2         kid      back
## 3         kid      down
## 4         kid        he
## 5         kid       him
## 6         kid   himself
## 7         kid       his
## 8         kid   instead
## 9         kid       out
## 10        kid       run
## 11        kid      then
## 12        kid     tired
## 13        kid        up
## 14       coca      away
## 15       coca       big
## 16       coca       bit
## 17       coca     drive
## 18       coca      fell
## 19       coca        he
## 20       coca       her
## 21       coca      left
## 22       coca    little
## 23       coca       put
## 24       coca       saw
## 25       coca       she
## 26       coca    smiled
## 27       coca   stopped
## 28       coca     could
## 29       coca     empty
## 30       coca       end
## 31       coca      hard
## 32       coca  remember
## 33       coca      same
## 34       coca       see
## 35       coca something
## 36       coca  wondered
## 37       coca    almost
## 38       coca     grace
## 39       coca     house
## 40       coca     large
## 41       coca    seemed
## 42       coca     tried
## 43       coca     again
## 44       coca    before
## 45       coca       him
## 46       coca      went
## 47       coca   another
## 48       coca       man
## 49       coca       now
## 50       coca      once
## 51       coca       own
## 52       coca      some
## 53       coca     there
## 54       coca      turn
## 55       coca      gave
## 56       coca    having
## 57       coca     heard
## 58       coca    mother
## 59       coca      must
## 60       coca       way
## 61       coca    enough
## 62       coca      full
## 63       coca      help
## 64       coca      like
## 65       coca      more
## 66       coca   perhaps
## 67       coca     spoke
## 68       coca      than
## 69       coca     world
## 70       coca   brought
## 71       coca   finally
## 72       coca   himself
## 73       coca     while
## 74       coca       eye
## 75       coca  finished
## 76       coca      rest
## 77       coca      seen
## 78       coca     woman
## 79       coca      half
## 80       coca       hot
## 81       coca      look
## 82       coca   started
## 83       coca      used
## 84       coca    worked
# Unique words, per model type, that share a cluster with "she".
# ungroup() is a no-op on ungrouped data; it guards against `targ_words`
# still carrying its model_type/run/cluster_id grouping (tidyr >= 1.0),
# in which case distinct() would also keep those grouping columns.
targ_words %>%
  ungroup() %>%
  filter(contains_she) %>%
  distinct(model_type, word) %>%
  data.frame()
##    model_type     word
## 1         kid  animals
## 2         kid  decided
## 3         kid      had
## 4         kid     hand
## 5         kid    other
## 6         kid    place
## 7         kid   pretty
## 8         kid      she
## 9         kid    still
## 10        kid    their
## 11        kid     they
## 12        kid     were
## 13       coca     away
## 14       coca      big
## 15       coca      bit
## 16       coca    drive
## 17       coca     fell
## 18       coca       he
## 19       coca      her
## 20       coca     left
## 21       coca   little
## 22       coca      put
## 23       coca      saw
## 24       coca      she
## 25       coca   smiled
## 26       coca  stopped
## 27       coca     came
## 28       coca      his
## 29       coca    later
## 30       coca   moment
## 31       coca    stand
## 32       coca  another
## 33       coca     cold
## 34       coca     like
## 35       coca   looked
## 36       coca  looking
## 37       coca       my
## 38       coca      off
## 39       coca      out
## 40       coca     then
## 41       coca     took
## 42       coca     turn
## 43       coca    under
## 44       coca       up
## 45       coca    again
## 46       coca   before
## 47       coca     hard
## 48       coca      him
## 49       coca     went
## 50       coca    along
## 51       coca      and
## 52       coca  between
## 53       coca      car
## 54       coca      got
## 55       coca     high
## 56       coca     long
## 57       coca    other
## 58       coca    still
## 59       coca     them
## 60       coca  walking
## 61       coca    woman
## 62       coca   almost
## 63       coca      boy
## 64       coca    quiet
## 65       coca    table
## 66       coca    teeth
## 67       coca   turned
## 68       coca watching
## 69       coca     back
## 70       coca  herself
## 71       coca   mother
## 72       coca     over
## 73       coca      top
## 74       coca  brought
## 75       coca  finally
## 76       coca  himself
## 77       coca    while
## 78       coca     felt
## 79       coca    heard
## 80       coca      ran
## 81       coca     last
## 82       coca     made
## 83       coca  station

Words plotted in 2D. Only “gendered” words (gender > 4 or < 2) are shown.

# Location of the per-word 2-D coordinates for every model and run.
WORD_COORDS <- here("analysis/books/adult_comparision/cluster_analysis/data/by_word_coordinates.csv")

# NOTE(review): unlike the other two files read in this document, missing
# `run` values are not replaced with 1 here -- confirm that is intended.
word_cords <- read_csv(
  WORD_COORDS,
  col_names = c("tsne_X", "tsne_Y", "word", "gender", "model_type", "run")
)

# Scatter of words with gender below 2 or above 4, colored on a
# blue-white-red scale centered at 3, one panel per model type / run.
word_cords %>%
  filter(gender < 2 | gender > 4) %>%
  ggplot(aes(x = tsne_X, y = tsne_Y, color = gender)) +
  geom_point(alpha = .2) +
  scale_color_gradient2(
    low = "blue", mid = "white", high = "red",
    midpoint = 3, space = "Lab"
  ) +
  facet_wrap(model_type + run ~ .) +
  theme_classic()