Well, here is what I came up with. The issue is accounting for documents that are shorter than the window size.
slide_windows <- function(tbl, doc_var, window_size) {
    # each word gets a skipgram (window_size words) starting on the first
    # e.g. skipgram 1 starts on word 1, skipgram 2 starts on word 2
    each_total <- tbl %>%
        group_by(!!doc_var) %>%
        mutate(doc_total = n(),
               each_total = pmin(doc_total, window_size, na.rm = TRUE)) %>%
        pull(each_total)

    rle_each <- rle(each_total)
    counts <- rle_each[["lengths"]]
    counts[rle_each$values != window_size] <- 1

    # each word gets a skipgram window, starting on the first
    # account for documents shorter than the window
    id_counts <- rep(rle_each$values, counts)
    window_id <- rep(seq_along(id_counts), id_counts)

    # within each skipgram, there are window_size many offsets
    indexer <- (seq_along(rle_each[["values"]]) - 1) %>%
        map2(rle_each[["values"]] - 1,
             ~ seq.int(.x, .x + .y)) %>%
        map2(counts, ~ rep(.x, .y)) %>%
        flatten_int() +
        window_id

    ## here is where I gave up and used dplyr
    tbl[indexer, ] %>%
        bind_cols(data_frame(window_id)) %>%
        group_by(window_id) %>%
        filter(n_distinct(!!doc_var) == 1) %>%
        ungroup()
}
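To see how the short-document accounting plays out, here is a toy check of my own (not from the original analysis): with a window of 3, a two-word document collapses to a single two-word window, while partial windows that would run past the end of a document get dropped by the n_distinct() filter.

library(tidyverse)

## toy data: document 1 has only two words, shorter than the window
toy <- tibble(line = c(1, 1, 2, 2, 2, 2),
              word = c("a", "b", "c", "d", "e", "f"))

## document 1 yields one window ("a", "b"); document 2 yields the
## full three-word windows ("c", "d", "e") and ("d", "e", "f")
toy %>% slide_windows(quo(line), 3)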
Here is something else I tried for finding the window IDs that contain multiple document IDs, but it is SLOOOOOOOOOOW…
find_overlaps <- tbl[[quo_name(doc_var)]][indexer]  ## !! only works inside tidyeval verbs

include <- seq_along(id_counts) %>%
    map(~ find_overlaps[window_id == .x]) %>% ## SUPER DUPER SLOW PART
    map_lgl(~ diff(range(.x)) < .Machine$double.eps ^ 0.5)

## include is one flag per window, so expand it to one flag per row
keep <- include[window_id]
tbl[indexer[keep], ] %>%
    bind_cols(data_frame(window_id = window_id[keep]))
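The per-window map() is the bottleneck there; the same per-window range could probably be computed in one pass with base tapply(). An untested sketch:

## sketch: compute the per-window range of document IDs all at once,
## instead of subsetting find_overlaps once per window
doc_range <- tapply(find_overlaps, window_id, function(x) diff(range(x)))
include <- unname(doc_range < .Machine$double.eps ^ 0.5)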
Let’s try it out.
library(janeaustenr)
library(tidytext)
library(tidyverse)

skipgrams <- austen_books() %>%
    mutate(line_number = row_number()) %>%
    unnest_tokens(word, text) %>%
    add_count(word) %>%
    filter(n >= 20) %>%
    slide_windows(quo(line_number), 8) ## when I test this out, it's ~10% faster than the method from my first blog post
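Before going further, here is a quick sanity check of my own (not from the original post) that the windows behave: every window should stay within a single line and contain at most eight words.

## each kept window should cover exactly one line_number,
## and never more than window_size = 8 words
skipgrams %>%
    group_by(window_id) %>%
    summarise(n_lines = n_distinct(line_number),
              n_words = n()) %>%
    summarise(max_lines = max(n_lines), max_words = max(n_words))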
library(widyr)

tidy_word_vectors <- skipgrams %>%
    pairwise_pmi(word, window_id) %>%
    widely_svd(item1, item2, pmi, nv = 256, maxit = 1000)
It’s not drastically faster, but it is easier to reason about.
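For context, pairwise_pmi() computes the pointwise mutual information for each pair of words: the log ratio of how often two words actually share a window to how often they would if they were independent. Here is a hand-rolled sketch with made-up counts (not widyr's actual implementation):

## PMI(a, b) = log( p(a, b) / (p(a) * p(b)) ), estimated from window counts
pmi_sketch <- function(n_ab, n_a, n_b, n_windows) {
    p_ab <- n_ab / n_windows
    p_a  <- n_a / n_windows
    p_b  <- n_b / n_windows
    log(p_ab / (p_a * p_b))
}

pmi_sketch(n_ab = 4, n_a = 10, n_b = 12, n_windows = 100)  ## positive: co-occur more than chance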
Can we find synonyms? Yes, one at a time.
tidy_word_vectors %>%
    widely(~ . %*% (.["netherfield", ]), sort = TRUE)(item1, dimension, value)
## # A tibble: 2,633 x 3
##    item1       item2   value
##    <chr>       <int>   <dbl>
##  1 netherfield     1 0.0146
##  2 he              1 0.0107
##  3 longbourn       1 0.00997
##  4 following       1 0.00981
##  5 his             1 0.00953
##  6 at              1 0.00938
##  7 barton          1 0.00937
##  8 next            1 0.00931
##  9 cottage         1 0.00924
## 10 london          1 0.00921
## # ... with 2,623 more rows
Nice to be able to do this so easily.
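Since this will come up again for other query words, the one-liner could be wrapped in a small helper; nearest_synonyms() here is a name I made up for this sketch, not a widyr function.

## hypothetical convenience wrapper around the same matrix product
nearest_synonyms <- function(df, token) {
    df %>%
        widely(~ . %*% (.[token, ]), sort = TRUE)(item1, dimension, value) %>%
        select(-item2)
}

tidy_word_vectors %>% nearest_synonyms("netherfield")

And here is what the first few principal components look like.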
tidy_word_vectors %>%
    filter(dimension <= 12) %>%
    group_by(dimension) %>%
    top_n(12, abs(value)) %>%
    ungroup() %>%
    mutate(item1 = reorder(item1, value)) %>%
    group_by(dimension, item1) %>%
    arrange(desc(value)) %>%
    ungroup() %>%
    mutate(item1 = factor(paste(item1, dimension, sep = "__"),
                          levels = rev(paste(item1, dimension, sep = "__"))),
           dimension = factor(paste0("Dimension ", dimension),
                              levels = paste0("Dimension ", as.factor(1:12)))) %>%
    ggplot(aes(item1, value, fill = dimension)) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~dimension, scales = "free_y", nrow = 4) +
    scale_x_discrete(labels = function(x) gsub("__.+$", "", x)) +
    coord_flip() +
    labs(x = NULL, y = "Value",
         title = "First 12 principal components of Jane Austen's novels",
         subtitle = "Top 12 words that contribute the most to the principal components that explain the most variation")