title: "WPA Slave Narratives - GloVe word embeddings"
author: "T Yasin"

# This uses the text2vec package with GloVe word embeddings.
# See also a word2vec implementation in R from Ben Schmidt:
# devtools::install_github("benschmidt/wordVectors")

# One-time setup: install the GitHub-hosted packages this analysis relies on.
# (Repository names must be plain double-quoted strings; typographic quotes
# are not valid R syntax.)
devtools::install_github("lmullen/tractarian")

devtools::install_github("lmullen/WPAnarratives")

devtools::install_github("lmullen/tokenizers")

devtools::install_github("juliasilge/janeaustenr")

# Packages used throughout the analysis, one call per line so the file parses.
# Attach order is unchanged; the repeated library(dplyr) call was a no-op and
# has been dropped.
library(tractarian)
library(magrittr)
library(dplyr)
library(readr)
library(WPAnarratives)
library(tokenizers)
library(text2vec)
library(Matrix)
library(ggplot2)
library(ggrepel)
library(broom)

# Split every narrative into word tokens.
tokens <- tokenize_words(wpa_narratives$text)

# First pass over the tokens: build the vocabulary, then prune it with a
# minimum term count of 5 and a maximum document proportion of 0.9.
it <- itoken(tokens)
vocab <- create_vocabulary(it)
vocab_pruned <- prune_vocabulary(
  vocab,
  term_count_min = 5,
  doc_proportion_max = 0.9
)
vectorizer <- vocab_vectorizer(
  vocab_pruned,
  grow_dtm = TRUE,
  skip_grams_window = 10L
)

# Second pass: a new iterator is created before building the corpus
# (presumably because the first one was consumed by create_vocabulary()).
it <- itoken(tokens)
corpus <- create_corpus(it, vectorizer)

# Term co-occurrence matrix extracted from the corpus.
tcm <- get_tcm(corpus)

# Share of non-zero cells in the co-occurrence matrix.
nnzero(tcm) / length(tcm)

Please change `numThreads` to 1 or 2 when using the server in class.

# Number of threads RcppParallel may use for the fit below.
RcppParallel::setThreadOptions(numThreads = 5)

# Fit GloVe on the co-occurrence matrix: 100-dimensional vectors, 20 iterations.
glove_fit <- glove(tcm, word_vectors_size = 100, x_max = 10, num_iters = 20)

# The fit stores two sets of word vectors; add them element-wise to get one
# embedding per term, and label the rows with the terms of the tcm.
word_vectors <- glove_fit$word_vectors[[1]] + glove_fit$word_vectors[[2]]
rownames(word_vectors) <- rownames(tcm)

# Euclidean length of every embedding, cached for the cosine computations.
word_vectors_norm <- sqrt(rowSums(word_vectors^2))

#' Look up the embedding of one or more words.
#'
#' Reads from the global `word_vectors` matrix, whose row names are the
#' vocabulary terms.
#'
#' @param word Character vector of vocabulary terms.
#' @return A matrix with one row per requested word. `drop = FALSE` keeps the
#'   matrix shape even for a single word, so results can be added, subtracted
#'   and passed on to `closest_to()` / `similarities()`.
word_vec <- function(word) {
  # Fail early with a readable message instead of "subscript out of bounds"
  # when a word did not survive vocabulary pruning.
  if (is.character(word)) {
    unknown <- setdiff(word, rownames(word_vectors))
    if (length(unknown) > 0) {
      stop(
        "Not in the embedding vocabulary: ",
        paste(unknown, collapse = ", "),
        call. = FALSE
      )
    }
  }
  word_vectors[word, , drop = FALSE]
}

#' Find the vocabulary terms most similar to a query vector.
#'
#' Cosine similarity is computed directly in base R against the global
#' `word_vectors` matrix (using the cached row norms in `word_vectors_norm`)
#' rather than through the unexported `text2vec:::cosine`, which is not part
#' of the package's public interface.
#'
#' @param word_vec A one-row numeric matrix, e.g. the result of `word_vec()` or
#'   arithmetic on such results. Only the first row is used.
#' @param n Number of top matches to return.
#' @return A named numeric vector of the `n` highest cosine similarities,
#'   sorted in decreasing order; names are the vocabulary terms.
closest_to <- function(word_vec, n = 10) {
  query_norm <- sqrt(rowSums(word_vec^2))
  # Rows = query vectors, columns = vocabulary terms.
  cos_sim <- tcrossprod(word_vec, word_vectors) /
    outer(query_norm, word_vectors_norm)
  head(sort(cos_sim[1, ], decreasing = TRUE), n)
}

#' Cosine similarity of every vocabulary term to each query vector.
#'
#' Cosine similarity is computed directly in base R against the global
#' `word_vectors` matrix (using the cached row norms in `word_vectors_norm`)
#' rather than through the unexported `text2vec:::cosine`, which is not part
#' of the package's public interface.
#'
#' @param word_vec A numeric matrix with one row per query, e.g. the result of
#'   `word_vec()` for one or more words.
#' @return The transposed similarity matrix run through `tidy()`: one row per
#'   vocabulary term, with the term in column `word` and one similarity column
#'   per row of `word_vec`.
similarities <- function(word_vec) {
  query_norm <- sqrt(rowSums(word_vec^2))
  # Rows = query vectors, columns = vocabulary terms.
  cos_sim <- tcrossprod(word_vec, word_vectors) /
    outer(query_norm, word_vectors_norm)
  cos_sim %>%
    t() %>%
    tidy() %>%
    rename(word = .rownames)
}

# Nearest neighbours of a few individual words (one expression per line so
# each result is evaluated and printed separately).
word_vec("live") %>% closest_to()
word_vec("freedom") %>% closest_to()
word_vec("dead") %>% closest_to()
word_vec("sold") %>% closest_to()

# Vector arithmetic: terms closest to "live" with "death" subtracted out.
(word_vec("live") - word_vec("death")) %>% closest_to()

I chose the following test words: work, free, whip, and pray.

# Words whose similarity to the rest of the vocabulary is tabulated.
test_words <- c("work", "free", "whip", "pray")

# One row per vocabulary term, one similarity column per test word.
word_sim <- word_vec(test_words) %>% similarities()

# Terms to keep when plotting. Entries are lower case because
# tokenize_words() lowercases tokens by default, so capitalised entries
# ("God", "Egypt") could never match; the duplicate "hair" was dropped.
# NOTE(review): "with child" is two words and will not match any single-word
# token; kept for the record of what was intended.
interesting_words <- c(
  "pray", "bow", "sell", "dead", "master", "hit", "whip", "love", "good",
  "mistress", "freedom", "escape", "nigger", "free", "soap", "mammy", "wash",
  "hair", "clothes", "land", "house", "overseer", "color", "mulatto", "baby",
  "sold", "plant", "black", "light", "dark", "father", "sing", "god", "egypt",
  "old", "country", "water", "song", "hum", "drum", "music", "boat", "learn",
  "read", "write", "girl", "boy", "with child", "run", "ran", "chopped", "foot"
)



# Scatter the interesting words by their similarity to "freedom" (x) and
# "work" (y). The similarity table is built here for exactly these two axis
# words: `word_sim` only has columns for `test_words`, which do not include
# "freedom". The filtered table must also be piped into ggplot(); without the
# pipe, ggplot() receives the aes() mapping as its data argument.
word_vec(c("freedom", "work")) %>%
  similarities() %>%
  filter(word %in% interesting_words) %>%
  ggplot(aes(x = freedom, y = work, label = word)) +
  geom_text() + # alternatively, use geom_point() + geom_text_repel()
  theme_bw() +
  lims(x = c(-1, 1), y = c(-1, 1)) +
  labs(title = "Relationship of words to 'work' and 'freedom'")

# Scatter the interesting words by their similarity to "free" (x) and
# "dead" (y). The similarity table is built here for exactly these two axis
# words: `word_sim` only has columns for `test_words`, which do not include
# "dead". Each layer sits on its own line so the inline comment no longer
# swallows the rest of the pipeline.
word_vec(c("free", "dead")) %>%
  similarities() %>%
  filter(word %in% interesting_words) %>%
  ggplot(aes(x = free, y = dead, label = word)) +
  geom_text() + # alternatively, use geom_point() + geom_text_repel()
  theme_bw() +
  lims(x = c(-1.01, 1.01), y = c(-1.01, 1.01)) +
  labs(title = "Relationship of words to 'free' and 'dead'")