Well, here is what I came up with. The issue is accounting for documents that are shorter than the window size.
slide_windows <- function(tbl, doc_var, window_size) {
    # each word gets a skipgram (window_size words) starting on the first
    # e.g. skipgram 1 starts on word 1, skipgram 2 starts on word 2
    each_total <- tbl %>%
        group_by(!!doc_var) %>%
        mutate(doc_total = n(),
               each_total = pmin(doc_total, window_size, na.rm = TRUE)) %>%
        pull(each_total)

    rle_each <- rle(each_total)
    counts <- rle_each[["lengths"]]
    counts[rle_each$values != window_size] <- 1

    # each word gets a skipgram window, starting on the first
    # account for documents shorter than the window
    id_counts <- rep(rle_each$values, counts)
    window_id <- rep(seq_along(id_counts), id_counts)

    # within each skipgram, there are window_size many offsets
    indexer <- (seq_along(rle_each[["values"]]) - 1) %>%
        map2(rle_each[["values"]] - 1,
             ~ seq.int(.x, .x + .y)) %>%
        map2(counts, ~ rep(.x, .y)) %>%
        flatten_int() +
        window_id

    ## here is where I gave up and used dplyr
    tbl[indexer, ] %>%
        bind_cols(data_frame(window_id)) %>%
        group_by(window_id) %>%
        filter(n_distinct(!!doc_var) == 1) %>%
        ungroup()
}
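To see how the short-document accounting plays out, here is a toy check of my own (not from the original analysis): with a window of 3, a two-word document collapses to a single two-word window, while partial windows that would run past the end of a document get dropped by the n_distinct() filter.

library(tidyverse)

## toy data: document 1 has only two words, shorter than the window
toy <- tibble(line = c(1, 1, 2, 2, 2, 2),
              word = c("a", "b", "c", "d", "e", "f"))

## document 1 yields one window ("a", "b"); document 2 yields the
## full three-word windows ("c", "d", "e") and ("d", "e", "f")
toy %>% slide_windows(quo(line), 3)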
Here is something else I tried for finding the window IDs that contain multiple document IDs, but it is SLOOOOOOOOOOW…
find_overlaps <- tbl[[quo_name(doc_var)]][indexer]  ## !! only works inside tidyeval verbs

include <- seq_along(id_counts) %>%
    map(~ find_overlaps[window_id == .x]) %>% ## SUPER DUPER SLOW PART
    map_lgl(~ diff(range(.x)) < .Machine$double.eps ^ 0.5)

## include is one flag per window, so expand it to one flag per row
keep <- include[window_id]
tbl[indexer[keep], ] %>%
    bind_cols(data_frame(window_id = window_id[keep]))
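The per-window map() is the bottleneck there; the same per-window range could probably be computed in one pass with base tapply(). An untested sketch:

## sketch: compute the per-window range of document IDs all at once,
## instead of subsetting find_overlaps once per window
doc_range <- tapply(find_overlaps, window_id, function(x) diff(range(x)))
include <- unname(doc_range < .Machine$double.eps ^ 0.5)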
Let’s try it out.
library(janeaustenr)
library(tidytext)
library(tidyverse)

skipgrams <- austen_books() %>%
    mutate(line_number = row_number()) %>%
    unnest_tokens(word, text) %>%
    add_count(word) %>%
    filter(n >= 20) %>%
    slide_windows(quo(line_number), 8) ## when I test this out, it's ~10% faster than the method from my first blog post
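Before going further, here is a quick sanity check of my own (not from the original post) that the windows behave: every window should stay within a single line and contain at most eight words.

## each kept window should cover exactly one line_number,
## and never more than window_size = 8 words
skipgrams %>%
    group_by(window_id) %>%
    summarise(n_lines = n_distinct(line_number),
              n_words = n()) %>%
    summarise(max_lines = max(n_lines), max_words = max(n_words))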
library(widyr)

tidy_word_vectors <- skipgrams %>%
    pairwise_pmi(word, window_id) %>%
    widely_svd(item1, item2, pmi, nv = 256, maxit = 1000)
It’s not drastically faster, but it is easier to reason about.
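For context, pairwise_pmi() computes the pointwise mutual information for each pair of words: the log ratio of how often two words actually share a window to how often they would if they were independent. Here is a hand-rolled sketch with made-up counts (not widyr's actual implementation):

## PMI(a, b) = log( p(a, b) / (p(a) * p(b)) ), estimated from window counts
pmi_sketch <- function(n_ab, n_a, n_b, n_windows) {
    p_ab <- n_ab / n_windows
    p_a  <- n_a / n_windows
    p_b  <- n_b / n_windows
    log(p_ab / (p_a * p_b))
}

pmi_sketch(n_ab = 4, n_a = 10, n_b = 12, n_windows = 100)  ## positive: co-occur more than chance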
Can we find synonyms? Yes, one at a time.
tidy_word_vectors %>%
    widely(~ . %*% (.["netherfield", ]), sort = TRUE)(item1, dimension, value)
## # A tibble: 2,633 x 3
##    item1       item2   value
##    <chr>       <int>   <dbl>
##  1 netherfield     1 0.0146
##  2 he              1 0.0107
##  3 longbourn       1 0.00997
##  4 following       1 0.00981
##  5 his             1 0.00953
##  6 at              1 0.00938
##  7 barton          1 0.00937
##  8 next            1 0.00931
##  9 cottage         1 0.00924
## 10 london          1 0.00921
## # ... with 2,623 more rows
Nice to be able to do this so easily.
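Since this will come up again for other query words, the one-liner could be wrapped in a small helper; nearest_synonyms() here is a name I made up for this sketch, not a widyr function.

## hypothetical convenience wrapper around the same matrix product
nearest_synonyms <- function(df, token) {
    df %>%
        widely(~ . %*% (.[token, ]), sort = TRUE)(item1, dimension, value) %>%
        select(-item2)
}

tidy_word_vectors %>% nearest_synonyms("netherfield")

And here is what the first few principal components look like.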
tidy_word_vectors %>%
    filter(dimension <= 12) %>%
    group_by(dimension) %>%
    top_n(12, abs(value)) %>%
    ungroup() %>%
    mutate(item1 = reorder(item1, value)) %>%
    group_by(dimension, item1) %>%
    arrange(desc(value)) %>%
    ungroup() %>%
    mutate(item1 = factor(paste(item1, dimension, sep = "__"),
                          levels = rev(paste(item1, dimension, sep = "__"))),
           dimension = factor(paste0("Dimension ", dimension),
                              levels = paste0("Dimension ", as.factor(1:12)))) %>%
    ggplot(aes(item1, value, fill = dimension)) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~dimension, scales = "free_y", nrow = 4) +
    scale_x_discrete(labels = function(x) gsub("__.+$", "", x)) +
    coord_flip() +
    labs(x = NULL, y = "Value",
         title = "First 12 principal components of Jane Austen's novels",
         subtitle = "Top 12 words that contribute the most to the principal components that explain the most variation")