Montag children’s book corpus

Train word2vec model on montag corpus.
- Book centroids
Relative position of gender-related words
Gender score of each word
Gender score by book

FINDINGS:

The N closest words to “boy” and “girl” are almost non-overlapping, suggesting that the concepts of maleness and femaleness are located in different parts of semantic space (maybe related to valence?)
There are more male-biased books than female-biased books (61% vs. 39%).
There’s less variability in semantic space for female-biased books, compared to male-biased books.

Train word2vec model on montag corpus.

Book centroids

This is the distribution of books in semantic space using t-SNE coordinates. The centroid of each book is calculated as the sum of the word vectors of all words in the book, with each word weighted by the number of times it appears in that book.

# Word vectors of the model trained on the corpus: a `target_word` column
# plus the embedding dimensions.
model_df <- read_csv("w2v_montag_corpus.csv")

# Number of times each word occurs in each book (columns: title,
# target_word, n). The `word` column is renamed to `target_word` while
# counting so that it matches the key used in `model_df`.
montag_counts <- read_csv("tidy_montag_corpus.csv") %>%
  count(title, target_word = word)

# Compute the location of one book in the word-embedding space.
#
# The book vector is the sum of the vectors of the book's words, each
# weighted by the number of times the word appears in the book.
#
# Args:
#   book_title: title of the book to locate; matched against
#     `book_data$title`.
#   model: table of word vectors with a `target_word` key column and
#     embedding columns V1 ... V200.
#   book_data: data.table with one row per (title, target_word) pair and
#     a count column `n`. It must be a data.table because the row filter
#     below uses data.table's `[i]` syntax.
#
# Returns:
#   A one-row data.frame: `title` followed by one column per embedding
#   dimension.
get_one_essay_location <- function(book_title, 
                                   model, 
                                   book_data){
  
  # attach a word vector to every word of this book; all.x = TRUE keeps
  # words that are missing from the model (their vector columns are NA)
  merged_vectors <- book_data[title == book_title] %>%
    merge(model, all.x = TRUE, by = "target_word")
  
  # split the merged table into the embedding matrix and the word counts
  word_vectors <- merged_vectors %>%
    select(V1:V200) %>%
    as.matrix()
  counts <- merged_vectors$n

  # `counts` has one entry per row, so column-major recycling multiplies
  # row i of the matrix by counts[i]
  count_weighted_vecs <- word_vectors * counts

  # sum across words; na.rm = TRUE skips words that have no vector
  book_vector <- t(colSums(count_weighted_vecs, na.rm = TRUE))

  # return the result visibly (the function used to end in an assignment,
  # which returned the value invisibly)
  data.frame(title = book_title, book_vector)
}

# Locate every book in embedding space: one row per title, one column per
# embedding dimension.
essay_locations <- unique(montag_counts$title) %>%
  map_df(get_one_essay_location,
         model = model_df,
         book_data = as.data.table(montag_counts))

# Project the book vectors (every column except the title) to 2D.
tsne_out <- essay_locations[, -1] %>%
  as.matrix() %>%
  Rtsne::Rtsne()
tsne_dims <- as.data.frame(tsne_out$Y) %>%
  rename(tsne_X = V1, tsne_Y = V2)

# The write below is disabled; the coordinates that get plotted are the
# ones read back from the saved file.
#write_csv(tsne_dims, "book_tsne_montag_w2v.csv")

# Rows of the saved file are labelled with the titles in the same order
# that was used to build `essay_locations`.
tsne_dims <- read_csv("book_tsne_montag_w2v.csv") %>%
  bind_cols(title = unique(montag_counts$title))

ggplot(tsne_dims, aes(x = tsne_X, y = tsne_Y)) +
  geom_text(aes(label = title), size = 2) +
  theme_void()

Relative position of gender-related words

Get the N closest words (N = 200) to each word in the “boy” and “girl” word pairs. Then plot them in 2D space and compare the two sets of words on independent ratings (the Glasgow and Brysbaert norms).

# Read in the human rating norms.

# Glasgow norms: keep the word and its GEND_M column (presumably the mean
# gender rating -- TODO confirm against the norms' codebook).
gender_norms <- read_csv("GlasgowNorms.csv") %>%
  select(word, gender_maleness = GEND_M)

# Affective norms. Columns are selected by name rather than by position
# (previously 2, 3, 6, 9) so that a change in the file's column order
# cannot silently pick the wrong ratings.
affect_norms <- read_csv("BRM-emot-submit.csv") %>%
  select(word = Word,
         valence = V.Mean.Sum,
         # NOTE(review): A.Mean.Sum looks like the arousal rating; the
         # column name `affect` is kept so downstream labels do not change.
         affect = A.Mean.Sum,
         dominance = D.Mean.Sum)

Boy vs. girl

Distribution in 2d w2v space

The large point corresponds to the target boy and girl words.

BOY_WORD <- "boy"
GIRL_WORD <- "girl"

# Nearest neighbours of each target word. 201 words are requested and the
# first row is dropped (presumably the query word itself), leaving 200.
boy_words <- closest_to(model, BOY_WORD, n = 201, fancy_names = FALSE) %>%
  slice(-1) %>%
  mutate(word_type = "boy")
  # (disabled) mutate(word = text_tokens(word, stemmer = "en") %>% unlist())

girl_words <- closest_to(model, GIRL_WORD, n = 201, fancy_names = FALSE) %>%
  slice(-1) %>%
  mutate(word_type = "girl")
  # (disabled) mutate(word = text_tokens(word, stemmer = "en") %>% unlist())

# Embedding vectors for every neighbour plus the two target words, one row
# per distinct word (a word can be a neighbour of both targets).
word_dims <- bind_rows(boy_words, girl_words) %>%
  select(-similarity, -word_type) %>%
  add_row(word = c(GIRL_WORD, BOY_WORD)) %>%
  rename(target_word = word) %>%
  left_join(model_df, by = "target_word") %>%
  distinct(target_word, .keep_all = TRUE)

# 2D t-SNE projection of the vectors (every column except the word itself)
tsne_out <- Rtsne::Rtsne(as.matrix(word_dims[, -1]))
tsne_dims <- as.data.frame(tsne_out$Y) %>%
  rename(tsne_X = V1, tsne_Y = V2) %>%
  bind_cols(target_word = word_dims$target_word) %>%
  select(target_word, everything())

# coordinates of the neighbour words, labelled by the target they belong to
tsne_words <- bind_rows(boy_words, girl_words) %>%
  select(-similarity) %>%
  rename(target_word = word) %>%
  left_join(tsne_dims, by = "target_word")

# coordinates of the two target words themselves (drawn as large points)
targ_words <- tsne_dims %>%
  filter(target_word %in% c(BOY_WORD, GIRL_WORD)) %>%
  mutate(word_type = ifelse(target_word == BOY_WORD, "boy", "girl"))

ggplot(tsne_words, aes(x = tsne_X, y = tsne_Y)) +
  geom_text(aes(label = target_word, color = word_type), size = 2) +
  geom_point(aes(color = word_type), data = targ_words, size = 4) +
  # colours are named so the mapping does not depend on level order
  scale_color_manual(values = c(boy = "blue", girl = "red")) +
  theme_void()

Independent ratings

# Mean human rating of the neighbour words, per norm and per target gender.
# Error bars use the ci_lower / ci_upper columns returned by
# multi_boot_standard().
bind_rows(boy_words, girl_words) %>%
  left_join(gender_norms, by = "word") %>%
  left_join(affect_norms, by = "word") %>%
  # long format: one row per word x norm; the identifier columns are named
  # explicitly instead of being excluded by position
  gather("norm", "rating", -word, -similarity, -word_type) %>%
  group_by(norm, word_type) %>%
  multi_boot_standard(col = "rating", na.rm = TRUE) %>%
  ggplot(aes(x = fct_rev(word_type), y = mean, fill = word_type)) +
    facet_wrap(~norm, nrow = 1) +
    geom_bar(stat = "identity") +
    geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
    ggtitle(paste0(BOY_WORD, " vs. ", GIRL_WORD)) +
    ylab("Mean rating") +
    xlab("gender of target words") +
    ylim(0, 7) +
    theme_minimal() +
    theme(legend.position = "none")

Him vs. her

Distribution in 2d w2v space

BOY_WORD <- "him"
GIRL_WORD <- "her"

# Nearest neighbours of each target word. 201 words are requested and the
# first row is dropped (presumably the query word itself), leaving 200.
boy_words <- closest_to(model, BOY_WORD, n = 201, fancy_names = FALSE) %>%
  slice(-1) %>%
  mutate(word_type = "boy")
  # (disabled) mutate(word = text_tokens(word, stemmer = "en") %>% unlist())

girl_words <- closest_to(model, GIRL_WORD, n = 201, fancy_names = FALSE) %>%
  slice(-1) %>%
  mutate(word_type = "girl")
  # (disabled) mutate(word = text_tokens(word, stemmer = "en") %>% unlist())

# Embedding vectors for every neighbour plus the two target words, one row
# per distinct word (a word can be a neighbour of both targets).
word_dims <- bind_rows(boy_words, girl_words) %>%
  select(-similarity, -word_type) %>%
  add_row(word = c(GIRL_WORD, BOY_WORD)) %>%
  rename(target_word = word) %>%
  left_join(model_df, by = "target_word") %>%
  distinct(target_word, .keep_all = TRUE)

# 2D t-SNE projection of the vectors (every column except the word itself)
tsne_out <- Rtsne::Rtsne(as.matrix(word_dims[, -1]))
tsne_dims <- as.data.frame(tsne_out$Y) %>%
  rename(tsne_X = V1, tsne_Y = V2) %>%
  bind_cols(target_word = word_dims$target_word) %>%
  select(target_word, everything())

# coordinates of the neighbour words, labelled by the target they belong to
tsne_words <- bind_rows(boy_words, girl_words) %>%
  select(-similarity) %>%
  rename(target_word = word) %>%
  left_join(tsne_dims, by = "target_word")

# coordinates of the two target words themselves (drawn as large points)
targ_words <- tsne_dims %>%
  filter(target_word %in% c(BOY_WORD, GIRL_WORD)) %>%
  mutate(word_type = ifelse(target_word == BOY_WORD, "boy", "girl"))

ggplot(tsne_words, aes(x = tsne_X, y = tsne_Y)) +
  geom_text(aes(label = target_word, color = word_type), size = 2) +
  geom_point(aes(color = word_type), data = targ_words, size = 4) +
  # colours are named so the mapping does not depend on level order
  scale_color_manual(values = c(boy = "blue", girl = "red")) +
  theme_void()

Independent ratings

# Mean human rating of the neighbour words, per norm and per target gender.
# Error bars use the ci_lower / ci_upper columns returned by
# multi_boot_standard().
bind_rows(boy_words, girl_words) %>%
  left_join(gender_norms, by = "word") %>%
  left_join(affect_norms, by = "word") %>%
  # long format: one row per word x norm; the identifier columns are named
  # explicitly instead of being excluded by position
  gather("norm", "rating", -word, -similarity, -word_type) %>%
  group_by(norm, word_type) %>%
  multi_boot_standard(col = "rating", na.rm = TRUE) %>%
  ggplot(aes(x = fct_rev(word_type), y = mean, fill = word_type)) +
    facet_wrap(~norm, nrow = 1) +
    geom_bar(stat = "identity") +
    geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
    ggtitle(paste0(BOY_WORD, " vs. ", GIRL_WORD)) +
    ylab("Mean rating") +
    xlab("gender of target words") +
    ylim(0, 7) +
    theme_minimal() +
    theme(legend.position = "none")

His vs. her

Distribution in 2d w2v space

BOY_WORD <- "his"
GIRL_WORD <- "her"

# Nearest neighbours of each target word. 201 words are requested and the
# first row is dropped (presumably the query word itself), leaving 200.
boy_words <- closest_to(model, BOY_WORD, n = 201, fancy_names = FALSE) %>%
  slice(-1) %>%
  mutate(word_type = "boy")
  # (disabled) mutate(word = text_tokens(word, stemmer = "en") %>% unlist())

girl_words <- closest_to(model, GIRL_WORD, n = 201, fancy_names = FALSE) %>%
  slice(-1) %>%
  mutate(word_type = "girl")
  # (disabled) mutate(word = text_tokens(word, stemmer = "en") %>% unlist())

# Embedding vectors for every neighbour plus the two target words, one row
# per distinct word (a word can be a neighbour of both targets).
word_dims <- bind_rows(boy_words, girl_words) %>%
  select(-similarity, -word_type) %>%
  add_row(word = c(GIRL_WORD, BOY_WORD)) %>%
  rename(target_word = word) %>%
  left_join(model_df, by = "target_word") %>%
  distinct(target_word, .keep_all = TRUE)

# 2D t-SNE projection of the vectors (every column except the word itself)
tsne_out <- Rtsne::Rtsne(as.matrix(word_dims[, -1]))
tsne_dims <- as.data.frame(tsne_out$Y) %>%
  rename(tsne_X = V1, tsne_Y = V2) %>%
  bind_cols(target_word = word_dims$target_word) %>%
  select(target_word, everything())

# coordinates of the neighbour words, labelled by the target they belong to
tsne_words <- bind_rows(boy_words, girl_words) %>%
  select(-similarity) %>%
  rename(target_word = word) %>%
  left_join(tsne_dims, by = "target_word")

# coordinates of the two target words themselves (drawn as large points)
targ_words <- tsne_dims %>%
  filter(target_word %in% c(BOY_WORD, GIRL_WORD)) %>%
  mutate(word_type = ifelse(target_word == BOY_WORD, "boy", "girl"))

ggplot(tsne_words, aes(x = tsne_X, y = tsne_Y)) +
  geom_text(aes(label = target_word, color = word_type), size = 2) +
  geom_point(aes(color = word_type), data = targ_words, size = 4) +
  # colours are named so the mapping does not depend on level order
  scale_color_manual(values = c(boy = "blue", girl = "red")) +
  theme_void()

Independent ratings

# Mean human rating of the neighbour words, per norm and per target gender.
# Error bars use the ci_lower / ci_upper columns returned by
# multi_boot_standard().
bind_rows(boy_words, girl_words) %>%
  left_join(gender_norms, by = "word") %>%
  left_join(affect_norms, by = "word") %>%
  # long format: one row per word x norm; the identifier columns are named
  # explicitly instead of being excluded by position
  gather("norm", "rating", -word, -similarity, -word_type) %>%
  group_by(norm, word_type) %>%
  multi_boot_standard(col = "rating", na.rm = TRUE) %>%
  ggplot(aes(x = fct_rev(word_type), y = mean, fill = word_type)) +
    facet_wrap(~norm, nrow = 1) +
    geom_bar(stat = "identity") +
    geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
    ggtitle(paste0(BOY_WORD, " vs. ", GIRL_WORD)) +
    ylab("Mean rating") +
    xlab("gender of target words") +
    ylim(0, 7) +
    theme_minimal() +
    theme(legend.position = "none")

Gender score of each word

To calculate gender bias score for each word, I took the mean distance to 4 “male” words (“boy”, “man”, “he”, “him”) and the mean distance to 4 “female” words (“girl”, “woman”, “she”, “her”), and subtracted the two. Positive numbers indicate male bias.

# Per-word gender scores from the corpus-trained model. Exact duplicate
# rows are dropped, then every word on the SMART stop-word list is removed.
w2v_norms <- read_csv("gender_score_output_montag_trained.csv", 
                      col_names = c("target_word", "female_score",
                                    "male_score", "gender_score")) %>%
  distinct() %>%
  anti_join(filter(stop_words, lexicon == "SMART"),
            by = c("target_word" = "word"))

# Distribution of the gender score over words.
ggplot(w2v_norms, aes(x = gender_score)) +
  geom_histogram() +
  labs(title = "Gender score distribution from word embedding models",
       x = "Gender score + => more male") +
  theme_classic()

Glasgow norms:

# Glasgow norms: the word plus every column whose name contains "_M".
norms <- read_csv("GlasgowNorms.csv") %>%
  select(word, GEND_M, contains("_M"))

# Pair each word's embedding-based gender score with its human gender
# rating; words without a rating keep NA.
all_norms <- w2v_norms %>%
  left_join(norms, by = c("target_word" = "word")) %>%
  select(target_word, gender_score, gender_rating = GEND_M)

# Human rating (x) against embedding score (y) with a linear fit.
ggplot(all_norms, aes(x = gender_rating, y = gender_score)) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_x_continuous(name = "mean rating",
                     limits = c(1, 7),
                     breaks = 1:7) +
  labs(title = "Correlation between gender norms and word embedding gender score",
       y = "word2vec gender score") +
  theme_classic()

# Pearson correlation between the two measures.
cor_summary <- cor.test(all_norms$gender_score, all_norms$gender_rating)
cor_summary

## 
##  Pearson's product-moment correlation
## 
## data:  all_norms$gender_score and all_norms$gender_rating
## t = 6.9028, df = 586, p-value = 1.329e-11
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1977479 0.3473762
## sample estimates:
##       cor 
## 0.2742209

These scores are correlated with the Glasgow norms, though not as strongly as when using the Wikipedia-trained model.

Gender score by book

Here I calculated a gender score for each book as the mean gender score of all the words in the book (unweighted for word frequency.)

# Corpus table, with `word` renamed to match the key used in `w2v_norms`.
montag <- rename(read_csv("tidy_montag_corpus.csv"), target_word = word)

# Attach each word's gender score (natural join on the shared columns).
tidy_with_norms <- left_join(montag, w2v_norms)

# Mean gender score per book; rows without a score are ignored.
# NOTE(review): every row of the corpus table contributes to the mean. If
# the table has one row per token (the count() over the same file earlier
# suggests it does), frequent words count more than once, i.e. the mean is
# frequency-weighted -- confirm against the description of this section.
norm_means <- multi_boot_standard(group_by(tidy_with_norms, title),
                                  col = "gender_score",
                                  na.rm = TRUE)

# Books ordered from lowest to highest mean score.
norm_means_wide <- arrange(select(norm_means, title, mean), mean)

norm_means_wide %>%
  kable("html") %>%
  kable_styling() %>%
  scroll_box(width = "900px", height = "400px")

title	mean
Chrysanthemum	-0.2227945
Ladybug Girl at the Beach	-0.1717888
Olivia	-0.1559726
The Other Side	-0.1403173
Bread and Jam for Frances	-0.1326910
Knuffle Bunny	-0.1258607
Olivia. . . and the Missing Toy	-0.1219377
The Keeping Quilt	-0.1141147
A Bad Case of Stripes	-0.1125367
Lilly’s Purple Plastic Purse	-0.1109904
Goldilicious	-0.1070088
Miss Rumphius	-0.1057535
Blueberries for Sal	-0.0969059
The Pigeon Finds a Hot Dog!	-0.0916011
Charlie and the New Baby	-0.0694802
Angelina Ice Skates	-0.0671414
Stellaluna	-0.0607183
The Napping House	-0.0578504
Madeline	-0.0549459
I’m a Big Sister	-0.0542753
Cloudy With a Chance of Meatballs	-0.0516419
Brown Bear, Brown Bear, What Do You See?	-0.0405251
Show Dog	-0.0391585
The Gardener	-0.0343828
The Little House	-0.0316518
The Berenstain Bears and the Green-Eyed Monster	-0.0292539
Maisy Goes to the Library	-0.0289920
The Grouchy Ladybug	-0.0281236
The Very Hungry Caterpillar	-0.0269398
Mike Mulligan and his Steam Shovel	-0.0225999
George and Martha	-0.0194237
The Hat	-0.0162605
Make Way for Ducklings	-0.0138288
Duck on a Bike	-0.0129937
Arthur Writes a Story	-0.0102899
Winter Days in the Big Woods	-0.0089948
Click, Clack, Moo Cows that Type	-0.0065756
Don’t Let the Pigeon Drive the Bus	-0.0052883
Dragons Love Tacos	-0.0031539
That Is Not a Good Idea!	0.0011166
The Carrot Seed	0.0024524
Maisy Goes Camping	0.0052526
A Sick Day for Amos McGee	0.0060247
The Story of Ferdinand	0.0062799
The Duckling gets a cookie!?	0.0099374
Are You My Mother?	0.0109353
Froggy Goes to Bed	0.0112074
The Berenstain Bears Forget Their Manners	0.0126178
Bunny Party	0.0156525
Llama llama home with mama	0.0182262
Horton Hears a Who!	0.0192898
Bark, George	0.0201041
Alexander and the Terrible, Horrible, No Good, Very Bad Day	0.0212947
Harry the Dirty Dog	0.0236787
Good Night Gorilla	0.0242415
Llama Llama Red Pajama	0.0245242
The Day the Crayons Quit	0.0246210
The Lorax	0.0256525
The Tale of Peter Rabbit	0.0265083
Own Moon	0.0281096
The Polar Express	0.0282551
Corduroy	0.0310155
The Paper Bag Princess	0.0349305
The Runaway Bunny	0.0358249
Love You Forever	0.0378168
There’s an Alligator Under My Bed	0.0384450
Bear Wants More	0.0399634
When Dinosaurs came with everything	0.0417785
Chicka Chicka 1-2-3	0.0441899
No, David!	0.0445534
The True Story of the 3 little pigs!	0.0460763
Goodnight Moon	0.0464993
Pete the Cat: The Wheels on the Bus	0.0468702
The Story of Babar	0.0491472
How to Train a Train	0.0530617
Train	0.0538341
The snowy Day	0.0577400
The Cat in The Hat	0.0588785
Sylvester and the Magic Pebble	0.0645792
Oh, the Places You’ll Go	0.0645905
How Do Dinosaurs Say Good Night?	0.0658704
Dear Zoo	0.0682707
Chicka Chicka Boom Boom	0.0703642
Green Eggs and Ham	0.0752307
This is Not My Hat	0.0769937
Little Blue Truck Leads the Way	0.0777590
Arnie the Doughnut	0.0784456
Guess How Much I Love You	0.0821203
Clifford at the Circus	0.0921021
Dinosaur Rescue	0.0937580
Where the Wild Things Are	0.0946174
Harold and the Purple Crayon	0.0965765
The Little Engine That Could	0.1002806
Curious George Takes a Job	0.1093987
Trashy Town	0.1342198
The Giving Tree	0.1385967
Curious George	0.1437146
If You Give a Moose a Muffin	0.1470233
Caps for Sale	0.1474325
If You Give a Mouse a Cookie	0.1590399

# Distribution of the per-book mean gender score; the dashed line marks 0.
ggplot(norm_means_wide, aes(x = mean)) +
  geom_histogram() +
  geom_vline(aes(xintercept = 0), color = "red", linetype = 2) +
  ggtitle("mean rating by book") +
  xlab("word2vec gender rating (+ => more male)") +
  theme_classic()

#norm_means_wide %>%
 # mutate(male = ifelse(mean > 0, 1, 0)) %>%
 # ungroup()%>%
 # count(male)

# Rows of the saved t-SNE file follow the order of
# unique(montag_counts$title) (that is how the file is labelled where it
# is first read). The per-book scores are looked up by title with match()
# instead of being bound by position, so the two tables cannot be silently
# misaligned if their row orders ever differ.
tsne_titles <- unique(montag_counts$title)
tsne_dims <- read_csv("book_tsne_montag_w2v.csv") %>%
  bind_cols(gender_rating = norm_means$mean[match(tsne_titles,
                                                  norm_means$title)]) %>%
  bind_cols(title = tsne_titles) %>%
  mutate(gender_rating_cat = ifelse(gender_rating > 0,
                                    "male", "female"))

# Books in t-SNE space, titles coloured by gender score (red = negative,
# blue = positive).
ggplot(tsne_dims,
       aes(x = tsne_X, y = tsne_Y, color = gender_rating)) +
  geom_text(aes(label = title, color = gender_rating), size = 2) +
  scale_color_gradient2(low = "red", high = "blue") +
  theme_void()

# Same layout drawn as points instead of titles.
ggplot(tsne_dims,
       aes(x = tsne_X, y = tsne_Y, color = gender_rating)) +
  geom_point() +
  scale_color_gradient2(low = "red", high = "blue") +
  theme_void()

Montag children’s book corpus

training w2v on corpus

Molly Lewis

2018-03-23

Train word2vec model on montag corpus.

Book centroids

Gender score of each word

Gender score by book