FINDINGS:
This is the distribution of books in semantic space using t-SNE coordinates. The centroid of each book is calculated as the count-weighted sum of the word vectors of all the words in the book.
# Word vectors, later merged onto each book's words via `target_word`.
model_df <- read_csv("w2v_montag_corpus.csv")

# Per-book word tallies: one row per (title, target_word) with its count `n`.
# The `word` column is renamed to `target_word` as part of the tally.
montag_counts <- read_csv("tidy_montag_corpus.csv") %>%
  count(title, target_word = word)
# Locate one book in the word-vector space.
#
# The book's location is the sum of the vectors of its words, each vector
# multiplied by the number of times the word occurs in that book.
#
# Arguments:
#   book_title  title of the book to locate (matched against book_data$title)
#   model       table with a `target_word` column and vector columns V1..V200
#   book_data   table with columns `title`, `target_word` and `n` (word count);
#               may be a data.frame, tibble or data.table
#
# Returns a one-row data.frame: `title` followed by the 200 summed dimensions.
get_one_essay_location <- function(book_title,
                                   model,
                                   book_data) {
  # Work on plain data frames so that row/column indexing means the same thing
  # whatever table class the caller passes in.
  book_rows <- as.data.frame(book_data)
  is_this_book <- !is.na(book_rows$title) & book_rows$title == book_title
  book_rows <- book_rows[is_this_book, , drop = FALSE]

  # attach the word vector to every word of this book; words missing from the
  # model are kept (all.x) and get NA vectors
  merged_vectors <- merge(book_rows, as.data.frame(model),
                          all.x = TRUE, by = "target_word")

  # pieces of the book-vector computation
  vector_cols <- paste0("V", 1:200)
  word_vectors <- as.matrix(merged_vectors[, vector_cols, drop = FALSE])
  counts <- merged_vectors$n

  # `counts` has one entry per row, so it recycles down each column and every
  # word's vector is scaled by that word's count
  count_weighted_vecs <- word_vectors * counts

  # sum across words; na.rm drops the words that have no vector in the model
  essay_vector_count_weighted <- t(colSums(count_weighted_vecs, na.rm = TRUE))

  data.frame(title = book_title, essay_vector_count_weighted)
}
# One row per book: the title followed by that book's location vector.
essay_locations <- unique(montag_counts$title) %>%
  map_df(get_one_essay_location,
         model = model_df,
         book_data = as.data.table(montag_counts))
# Run t-SNE on the book vectors (every column except the leading title) and
# name the two resulting coordinate columns.
# NOTE(review): no seed is set before this call, so the layout presumably
# differs between runs — confirm whether that matters for the cached file.
tsne_out <- Rtsne::Rtsne(as.matrix(essay_locations[, -1]))
tsne_dims <- rename(as.data.frame(tsne_out$Y),
                    tsne_X = V1,
                    tsne_Y = V2)
# Disabled export of the coordinates; the file it names is the one read below.
# write_csv(tsne_dims, "book_tsne_montag_w2v.csv")

# Reload the saved coordinates and label each row with its book title.
# Titles are attached by position, in the order of unique(montag_counts$title).
tsne_dims <- bind_cols(
  read_csv("book_tsne_montag_w2v.csv"),
  title = unique(montag_counts$title)
)
# Books placed at their t-SNE coordinates, drawn as their titles.
ggplot(tsne_dims, aes(x = tsne_X, y = tsne_Y, label = title)) +
  geom_text(size = 2) +
  theme_void()
To calculate gender bias score for each word, I took the mean distance to 4 “male” words (“boy”, “man”, “he”, “him”) and the mean distance to 4 “female” words (“girl”, “woman”, “she”, “her”), and subtracted the two. Positive numbers indicate male bias.
# Per-word gender scores. The column names are supplied here rather than read
# from the file.
w2v_norms <- read_csv(
  "gender_score_output_montag_trained.csv",
  col_names = c("target_word", "female_score", "male_score", "gender_score")
) %>%
  # drop rows that are exact duplicates across all columns
  distinct() %>%
  # remove words listed in the SMART stop-word lexicon
  anti_join(
    filter(stop_words, lexicon == "SMART"),
    by = c("target_word" = "word")
  )
# Distribution of the per-word gender scores.
ggplot(w2v_norms, aes(x = gender_score)) +
  geom_histogram() +
  labs(
    title = "Gender score distribution from word embedding models",
    x = "Gender score + => more male"
  ) +
  theme_classic()
Glasgow norms:
# Human ratings: keep the word, GEND_M, and every column whose name
# contains "_M".
norms <- select(
  read_csv("GlasgowNorms.csv"),
  word, GEND_M, contains("_M")
)

# Pair each scored word with its GEND_M rating (NA where the word has no
# rating), keeping only the columns needed for the comparison.
all_norms <- w2v_norms %>%
  left_join(norms, by = c("target_word" = "word")) %>%
  select(target_word, gender_score, gender_rating = GEND_M)
# Embedding gender score against the human gender rating, with a linear fit.
ggplot(all_norms, aes(x = gender_rating, y = gender_score)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    title = "Correlation between gender norms and word embedding gender score",
    y = "word2vec gender score"
  ) +
  scale_x_continuous(
    name = "mean rating",
    limits = c(1, 7),
    breaks = 1:7
  ) +
  theme_classic()
# Correlation test between the embedding score and the human rating.
cor_summary <- cor.test(
  x = all_norms$gender_score,
  y = all_norms$gender_rating
)
# show the test result
cor_summary
##
## Pearson's product-moment correlation
##
## data: all_norms$gender_score and all_norms$gender_rating
## t = 6.9028, df = 586, p-value = 1.329e-11
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1977479 0.3473762
## sample estimates:
## cor
## 0.2742209
These gender scores are correlated with the Glasgow norms, though not as strongly as when using the Wikipedia-trained corpus.
Here I calculated a gender score for each book as the mean gender score of all the words in the book (unweighted for word frequency).
# Tidy corpus with the word column named to match the gender-score table.
montag <- read_csv("tidy_montag_corpus.csv") %>%
  rename(target_word = word)

# Attach each word's gender scores; words without a score get NA.
# The join key is spelled out so the join cannot silently widen to other
# columns the two tables might share.
tidy_with_norms <- montag %>%
  left_join(w2v_norms, by = "target_word")

# Mean gender score per book, ignoring words without a score.
# NOTE(review): the mean is taken over every row of the tidy corpus. If that
# file has one row per token (it is tallied with count(title, target_word)
# elsewhere in this report, which suggests repeated words), frequent words
# weigh more — confirm whether a frequency-unweighted mean was intended.
norm_means <- tidy_with_norms %>%
  group_by(title) %>%
  multi_boot_standard(col = "gender_score", na.rm = TRUE)

# Titles with their mean score, from lowest to highest.
norm_means_wide <- norm_means %>%
  select(title, mean) %>%
  arrange(mean)
# Render the per-book means as a scrollable HTML table.
norm_means_wide %>%
  kable(format = "html") %>%
  kable_styling() %>%
  scroll_box(width = "900px", height = "400px")
| title | mean |
|---|---|
| Chrysanthemum | -0.2227945 |
| Ladybug Girl at the Beach | -0.1717888 |
| Olivia | -0.1559726 |
| The Other Side | -0.1403173 |
| Bread and Jam for Frances | -0.1326910 |
| Knuffle Bunny | -0.1258607 |
| Olivia. . . and the Missing Toy | -0.1219377 |
| The Keeping Quilt | -0.1141147 |
| A Bad Case of Stripes | -0.1125367 |
| Lilly’s Purple Plastic Purse | -0.1109904 |
| Goldilicious | -0.1070088 |
| Miss Rumphius | -0.1057535 |
| Blueberries for Sal | -0.0969059 |
| The Pigeon Finds a Hot Dog! | -0.0916011 |
| Charlie and the New Baby | -0.0694802 |
| Angelina Ice Skates | -0.0671414 |
| Stellaluna | -0.0607183 |
| The Napping House | -0.0578504 |
| Madeline | -0.0549459 |
| I’m a Big Sister | -0.0542753 |
| Cloudy With a Chance of Meatballs | -0.0516419 |
| Brown Bear, Brown Bear, What Do You See? | -0.0405251 |
| Show Dog | -0.0391585 |
| The Gardener | -0.0343828 |
| The Little House | -0.0316518 |
| The Berenstain Bears and the Green-Eyed Monster | -0.0292539 |
| Maisy Goes to the Library | -0.0289920 |
| The Grouchy Ladybug | -0.0281236 |
| The Very Hungry Caterpillar | -0.0269398 |
| Mike Mulligan and his Steam Shovel | -0.0225999 |
| George and Martha | -0.0194237 |
| The Hat | -0.0162605 |
| Make Way for Ducklings | -0.0138288 |
| Duck on a Bike | -0.0129937 |
| Arthur Writes a Story | -0.0102899 |
| Winter Days in the Big Woods | -0.0089948 |
| Click, Clack, Moo Cows that Type | -0.0065756 |
| Don’t Let the Pigeon Drive the Bus | -0.0052883 |
| Dragons Love Tacos | -0.0031539 |
| That Is Not a Good Idea! | 0.0011166 |
| The Carrot Seed | 0.0024524 |
| Maisy Goes Camping | 0.0052526 |
| A Sick Day for Amos McGee | 0.0060247 |
| The Story of Ferdinand | 0.0062799 |
| The Duckling gets a cookie!? | 0.0099374 |
| Are You My Mother? | 0.0109353 |
| Froggy Goes to Bed | 0.0112074 |
| The Berenstain Bears Forget Their Manners | 0.0126178 |
| Bunny Party | 0.0156525 |
| Llama llama home with mama | 0.0182262 |
| Horton Hears a Who! | 0.0192898 |
| Bark, George | 0.0201041 |
| Alexander and the Terrible, Horrible, No Good, Very Bad Day | 0.0212947 |
| Harry the Dirty Dog | 0.0236787 |
| Good Night Gorilla | 0.0242415 |
| Llama Llama Red Pajama | 0.0245242 |
| The Day the Crayons Quit | 0.0246210 |
| The Lorax | 0.0256525 |
| The Tale of Peter Rabbit | 0.0265083 |
| Own Moon | 0.0281096 |
| The Polar Express | 0.0282551 |
| Corduroy | 0.0310155 |
| The Paper Bag Princess | 0.0349305 |
| The Runaway Bunny | 0.0358249 |
| Love You Forever | 0.0378168 |
| There’s an Alligator Under My Bed | 0.0384450 |
| Bear Wants More | 0.0399634 |
| When Dinosaurs came with everything | 0.0417785 |
| Chicka Chicka 1-2-3 | 0.0441899 |
| No, David! | 0.0445534 |
| The True Story of the 3 little pigs! | 0.0460763 |
| Goodnight Moon | 0.0464993 |
| Pete the Cat: The Wheels on the Bus | 0.0468702 |
| The Story of Babar | 0.0491472 |
| How to Train a Train | 0.0530617 |
| Train | 0.0538341 |
| The snowy Day | 0.0577400 |
| The Cat in The Hat | 0.0588785 |
| Sylvester and the Magic Pebble | 0.0645792 |
| Oh, the Places You’ll Go | 0.0645905 |
| How Do Dinosaurs Say Good Night? | 0.0658704 |
| Dear Zoo | 0.0682707 |
| Chicka Chicka Boom Boom | 0.0703642 |
| Green Eggs and Ham | 0.0752307 |
| This is Not My Hat | 0.0769937 |
| Little Blue Truck Leads the Way | 0.0777590 |
| Arnie the Doughnut | 0.0784456 |
| Guess How Much I Love You | 0.0821203 |
| Clifford at the Circus | 0.0921021 |
| Dinosaur Rescue | 0.0937580 |
| Where the Wild Things Are | 0.0946174 |
| Harold and the Purple Crayon | 0.0965765 |
| The Little Engine That Could | 0.1002806 |
| Curious George Takes a Job | 0.1093987 |
| Trashy Town | 0.1342198 |
| The Giving Tree | 0.1385967 |
| Curious George | 0.1437146 |
| If You Give a Moose a Muffin | 0.1470233 |
| Caps for Sale | 0.1474325 |
| If You Give a Mouse a Cookie | 0.1590399 |
# Distribution of the per-book mean scores, with a dashed red line at zero.
ggplot(norm_means_wide, aes(x = mean)) +
  geom_histogram() +
  geom_vline(aes(xintercept = 0), color = "red", linetype = 2) +
  labs(
    title = "mean rating by book",
    x = "word2vec gender rating (+ => more male)"
  ) +
  theme_classic()
#norm_means_wide %>%
# mutate(male = ifelse(mean > 0, 1, 0)) %>%
# ungroup()%>%
# count(male)
# Saved t-SNE coordinates, labelled with their book titles and the per-book
# mean gender score.
# Titles are attached in the order of unique(montag_counts$title), the same
# labelling used for these coordinates elsewhere in this report. The scores are
# then matched by title, not by row position, so a different row order in
# `norm_means` cannot pair a book with another book's score.
tsne_dims <- read_csv("book_tsne_montag_w2v.csv") %>%
  bind_cols(title = unique(montag_counts$title)) %>%
  left_join(select(norm_means, title, gender_rating = mean),
            by = "title") %>%
  # positive mean score => "male", otherwise "female"
  mutate(gender_rating_cat = ifelse(gender_rating > 0,
                                    "male", "female"))
# Book titles at their t-SNE coordinates, coloured by mean gender score
# (red = low end of the scale, blue = high end).
ggplot(tsne_dims,
       aes(x = tsne_X, y = tsne_Y, color = gender_rating, label = title)) +
  geom_text(size = 2) +
  # geom_point() +
  scale_color_gradient2(low = "red", high = "blue") +
  theme_void()
# Same layout drawn as points instead of titles, coloured by mean gender score.
ggplot(data = tsne_dims,
       mapping = aes(x = tsne_X, y = tsne_Y, color = gender_rating)) +
  geom_point() +
  # geom_text(aes(label = title, color = gender_rating), size = 2) +
  scale_color_gradient2(low = "red", high = "blue") +
  theme_void()