# title: "WPA Slave Narratives - GloVe word embeddings"
# author: "T Yasin"
#
# This uses the text2vec package with GloVe word embeddings.
# See also a word2vec implementation in R from Ben Schmidt:
# devtools::install_github("benschmidt/wordVectors")
# Install the GitHub-hosted packages used below. These calls run every time
# the script is executed; comment them out once the packages are installed.
devtools::install_github("lmullen/tractarian")
devtools::install_github("lmullen/WPAnarratives")
devtools::install_github("lmullen/tokenizers")
devtools::install_github("juliasilge/janeaustenr")

# Attach the packages, one call per line (dplyr was listed twice originally).
library(tractarian)
library(magrittr)
library(dplyr)
library(readr)
library(WPAnarratives)
library(tokenizers)
library(text2vec)
library(Matrix)
library(ggplot2)
library(ggrepel)
library(broom)
# Split every narrative's text into word tokens.
tokens <- tokenize_words(wpa_narratives$text)

# First pass over the tokens: build the vocabulary, then prune it using the
# count and document-proportion thresholds below.
it <- itoken(tokens)
vocab <- create_vocabulary(it)
vocab_pruned <- prune_vocabulary(
  vocab,
  term_count_min = 5,
  doc_proportion_max = 0.9
)
vectorizer <- vocab_vectorizer(
  vocab_pruned,
  grow_dtm = TRUE,
  skip_grams_window = 10L
)

# Second pass: a fresh iterator is created before building the corpus
# (presumably because the first one was consumed by create_vocabulary()).
it <- itoken(tokens)
corpus <- create_corpus(it, vectorizer)

# Term co-occurrence matrix extracted from the corpus.
tcm <- get_tcm(corpus)

# Fraction of non-zero cells in the co-occurrence matrix.
nnzero(tcm) / length(tcm)
# Please change numThreads to 1 or 2 when using the server in class.
# Number of threads RcppParallel may use while fitting the model.
RcppParallel::setThreadOptions(numThreads = 5)

# Fit GloVe on the term co-occurrence matrix.
glove_fit <- glove(tcm, word_vectors_size = 100, x_max = 10, num_iters = 20)

# The fit holds two sets of word vectors; the embedding used below is their
# element-wise sum, with rows labelled by the terms of the co-occurrence matrix.
word_vectors <- glove_fit$word_vectors[[1]] + glove_fit$word_vectors[[2]]
rownames(word_vectors) <- rownames(tcm)

# Euclidean norm of each word's vector, precomputed for the cosine helper.
word_vectors_norm <- sqrt(rowSums(word_vectors^2))
#' Look up the embedding row(s) for one or more words.
#'
#' @param word Character vector of words that appear as row names of the
#'   global `word_vectors` matrix. Any other index type accepted by `[` is
#'   passed through unchanged.
#' @return A matrix with one row per requested word. `drop = FALSE` keeps the
#'   matrix shape even for a single word, so results can be added or
#'   subtracted and passed on to `closest_to()` / `similarities()`.
word_vec <- function(word) {
  if (is.character(word)) {
    # Name the offending words instead of failing with a bare
    # "subscript out of bounds".
    unknown <- setdiff(word, rownames(word_vectors))
    if (length(unknown) > 0) {
      stop(
        "Not in the embedding vocabulary: ",
        paste(unknown, collapse = ", "),
        call. = FALSE
      )
    }
  }
  word_vectors[word, , drop = FALSE]
}
#' List the vocabulary words that score highest against a query vector.
#'
#' @param word_vec A one-row matrix in the embedding space, e.g. the result
#'   of `word_vec("live")` or a difference of two such results.
#' @param n How many top-scoring words to return.
#' @return A named numeric vector of the `n` largest scores from text2vec's
#'   internal `cosine()` helper, in decreasing order.
closest_to <- function(word_vec, n = 10) {
  scores <- text2vec:::cosine(word_vec, word_vectors, word_vectors_norm)
  # Only the first query row is ranked.
  ranked <- sort(scores[1, ], decreasing = TRUE)
  head(ranked, n)
}
#' Score every vocabulary word against one or more query vectors.
#'
#' @param word_vec A matrix with one row per query, as returned by
#'   `word_vec()`; its row names become the score column names.
#' @return A data frame with a character column `word` (the vocabulary term)
#'   followed by one numeric column per query row, holding the scores from
#'   text2vec's internal `cosine()` helper.
similarities <- function(word_vec) {
  scores <- text2vec:::cosine(word_vec, word_vectors, word_vectors_norm)
  # Transpose so that vocabulary words run down the rows.
  by_word <- t(scores)
  # Built with base R: broom::tidy() on a matrix is deprecated and not
  # available in later broom releases.
  data.frame(
    word = rownames(by_word),
    by_word,
    row.names = NULL,
    stringsAsFactors = FALSE
  )
}
# Top-scoring neighbours of a few individual words.
word_vec("live") %>% closest_to()
word_vec("freedom") %>% closest_to()
word_vec("dead") %>% closest_to()
word_vec("sold") %>% closest_to()

# Neighbours of the vector difference "live" minus "death".
(word_vec("live") - word_vec("death")) %>% closest_to()
# I chose the following test words: work, free, whip, and pray.
# Words whose scores against the whole vocabulary are tabulated in `word_sim`
# (one column per test word).
test_words <- c("work", "free", "whip", "pray")
word_sim <- word_vec(test_words) %>% similarities()

# Words to label in the plots. Entries are matched against the tokenized
# vocabulary, which tokenize_words() lower-cases by default, so "god" and
# "egypt" are written in lower case here. A multi-word entry such as
# "with child" cannot match a single token.
interesting_words <- c(
  "pray", "bow", "sell", "dead", "master", "hit", "whip", "love", "good",
  "mistress", "freedom", "escape", "nigger", "free", "soap", "mammy", "wash",
  "hair", "clothes", "land", "house", "overseer", "color", "mulatto", "baby",
  "sold", "plant", "black", "light", "dark", "father", "hair", "sing", "god",
  "egypt", "old", "country", "water", "song", "hum", "drum", "music", "boat",
  "learn", "read", "write", "girl", "boy", "with child", "run", "ran",
  "chopped", "foot"
)
# Plot the selected words by their scores against "freedom" (x) and "work" (y).
# `word_sim` only has columns for `test_words`, which does not include
# "freedom", so the two axis words are scored here directly.
word_vec(c("freedom", "work")) %>%
  similarities() %>%
  filter(word %in% interesting_words) %>%
  ggplot(aes(x = freedom, y = work, label = word)) +
  geom_text() + # alternatively, use geom_point() + geom_text_repel()
  theme_bw() +
  lims(x = c(-1, 1), y = c(-1, 1)) +
  labs(title = "Relationship of words to 'work' and 'freedom'")
# Plot the selected words by their scores against "free" (x) and "dead" (y).
# `word_sim` only has columns for `test_words`, which does not include "dead",
# so the two axis words are scored here directly.
word_vec(c("free", "dead")) %>%
  similarities() %>%
  filter(word %in% interesting_words) %>%
  ggplot(aes(x = free, y = dead, label = word)) +
  geom_text() + # alternatively, use geom_point() + geom_text_repel()
  theme_bw() +
  lims(x = c(-1.01, 1.01), y = c(-1.01, 1.01)) +
  labs(title = "Relationship of words to 'free' and 'dead'")