Getting some text to model

First, I will get some text to use in topic modeling. I’ll use the same text we currently have in the chapter on topic modeling using the topicmodels package.

library(tidyverse)
library(gutenbergr)
library(tidytext)
library(stringr)

titles <- c("Twenty Thousand Leagues under the Sea", "The War of the Worlds",
            "Pride and Prejudice", "Great Expectations")
books <- gutenberg_works(title %in% titles) %>%
    gutenberg_download(meta_fields = "title")

by_chapter <- books %>%
  group_by(title) %>%
  mutate(chapter = cumsum(str_detect(text, regex("^chapter ", ignore_case = TRUE)))) %>%
  ungroup() %>%
  filter(chapter > 0) %>%
  unite(document, title, chapter)

word_counts <- by_chapter %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words) %>%
  count(document, word, sort = TRUE) %>%
  ungroup()

word_counts
## # A tibble: 104,722 x 3
##                    document    word     n
##                       <chr>   <chr> <int>
##  1    Great Expectations_57     joe    88
##  2     Great Expectations_7     joe    70
##  3    Great Expectations_17   biddy    63
##  4    Great Expectations_27     joe    58
##  5    Great Expectations_38 estella    58
##  6     Great Expectations_2     joe    56
##  7    Great Expectations_23  pocket    53
##  8    Great Expectations_15     joe    50
##  9    Great Expectations_18     joe    50
## 10 The War of the Worlds_16 brother    50
## # ... with 104,712 more rows

The input to topic modeling using stm can be a quanteda dfm.

library(stm)
library(quanteda)


chapters_dfm <- word_counts %>%
  cast_dfm(document, word, n)

chapters_dfm
## Document-feature matrix of: 193 documents, 18,215 features (97% sparse).
stm_model <- stm(chapters_dfm, K = 4, verbose = FALSE)

We successfully fit a stm model.

Now what?

summary(stm_model)
## A topic model with 4 topics, 193 documents and a 18215 word dictionary.
## Topic 1 Top Words:
##       Highest Prob: captain, nautilus, sea, nemo, ned, conseil, land 
##       FREX: captain, nautilus, nemo, ned, conseil, canadian, ocean 
##       Lift: bullets, canoes, declivity, dillon, galleons, gallons, layers 
##       Score: nautilus, nemo, ned, conseil, captain, canadian, ocean 
## Topic 2 Top Words:
##       Highest Prob: joe, miss, time, pip, looked, herbert, hand 
##       FREX: joe, pip, herbert, wemmick, havisham, estella, biddy 
##       Lift: jaggers's, joseph, a.m, a'most, aback, abased, abear 
##       Score: joe, pip, biddy, jaggers, wemmick, havisham, estella 
## Topic 3 Top Words:
##       Highest Prob: elizabeth, darcy, miss, bennet, jane, bingley, time 
##       FREX: elizabeth, darcy, bennet, jane, bingley, wickham, collins 
##       Lift: _affect_, _am_, _any_, _appearance_, _be_, _begin_, _being 
##       Score: elizabeth, darcy, bennet, bingley, wickham, jane, lydia 
## Topic 4 Top Words:
##       Highest Prob: people, martians, time, black, night, road, brother 
##       FREX: martians, martian, woking, mars, curate, pine, artilleryman 
##       Lift: _been_, _blotted_, _brutes_, _budded_, _daily, _dead_, _en 
##       Score: martians, martian, woking, cylinder, mars, curate, ulla

That looks promising.

Tidy

tidy.stm <- function(x, matrix = c("beta", "gamma", "theta"), log = FALSE, document_names = NULL, ...) {
  matrix <- match.arg(matrix)
  if (matrix == "beta") {
    mat <- x$beta
  } else {
    mat <- x$theta
  }

  ret <- reshape2::melt(mat) %>%
    tbl_df()

  if (matrix == "beta") {
    ret <- transmute(ret, topic = Var1, term = x$vocab[Var2], beta = value)
  } else {
    ret <- transmute(ret, document = Var1, topic = Var2, gamma = value)
    if (!is.null(document_names)) {
      ret$document <- document_names[ret$document]
    }
  }

  if (matrix == "beta" && !log) {
    ret[[matrix]] <- exp(ret[[matrix]])
  } else if (matrix %in% c("gamma", "theta") && log) {
    ret[[matrix]] <- log(ret[[matrix]])
  }
  ret
}

BUT WHERE ARE THE DOCUMENT NAMES??? Are they really gone? Yes :(

tidy.stm(stm_model, matrix = "beta") %>% 
  group_by(topic) %>%
  top_n(5, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
## # A tibble: 20 x 3
##    topic      term        beta
##    <int>     <chr>       <dbl>
##  1     1   captain 0.015548063
##  2     1  nautilus 0.013082513
##  3     1       sea 0.008994842
##  4     1      nemo 0.008730061
##  5     1       ned 0.008050777
##  6     2       joe 0.013191613
##  7     2      miss 0.007078365
##  8     2      time 0.006713371
##  9     2       pip 0.006223539
## 10     2    looked 0.006105482
## 11     3 elizabeth 0.015626831
## 12     3     darcy 0.009763497
## 13     3      miss 0.007750863
## 14     3    bennet 0.007695625
## 15     3      jane 0.007224465
## 16     4    people 0.006872080
## 17     4  martians 0.006564288
## 18     4      time 0.005457814
## 19     4     black 0.005377354
## 20     4     night 0.004489902

YASSSS

tidy.stm(stm_model, matrix = "theta") ## without document names
## # A tibble: 772 x 3
##    document topic        gamma
##       <int> <int>        <dbl>
##  1        1     1 0.0001829624
##  2        2     1 0.0001392330
##  3        3     1 0.0003031470
##  4        4     1 0.0001593147
##  5        5     1 0.0003052634
##  6        6     1 0.0001645394
##  7        7     1 0.0023414604
##  8        8     1 0.0001995044
##  9        9     1 0.0001106603
## 10       10     1 0.0001953030
## # ... with 762 more rows
tidy.stm(stm_model, matrix = "gamma", 
         document_names = rownames(chapters_dfm)) %>% ## with document names
  separate(document, c("title", "chapter"), sep = "_", convert = TRUE) %>%
  mutate(title = reorder(title, gamma * topic)) %>%
  ggplot(aes(factor(topic), gamma)) +
  geom_boxplot(position = "dodge") +
  facet_wrap(~ title)

Added an optional second argument of document names

Augment

augment.stm <- function(x, data, ...) {
  
  if (missing(data)) {
    stop("data argument must be provided in order to augment a stm model")
  }
  
  if (inherits(data, "data.frame") &&
      (all(c("document", "term") %in% colnames(data)))) {
    data$value <- 1
    mat <- cast_dfm(data, document, term, value)
  } else if (inherits(data, "dfmSparse")) {
    mat <- data
    data <- tidy(mat)
  } else {
    stop("data argument must either be a dfmSparse ",
         "(from quanteda) or a table with document and term columns")
  }
  
  beta <- t(as.matrix(x$beta$logbeta[[1]]))
  theta <- x$theta
  
  term_indices <- match(data$term, x$vocab)
  doc_indices <- match(data$document, rownames(mat))

  products <- beta[term_indices, ] * theta[doc_indices, ]
  keep <- !is.na(term_indices) & !is.na(doc_indices)

  data$.topic <- NA
  data$.topic[keep] <- apply(products[keep, ], 1, which.max)
  data
}

augment.stm(stm_model, chapters_dfm)
## # A tibble: 104,722 x 4
##                 document  term count .topic
##                    <chr> <chr> <dbl>  <int>
##  1 Great Expectations_57   joe    88      1
##  2  Great Expectations_7   joe    70      1
##  3 Great Expectations_17   joe     5      4
##  4 Great Expectations_27   joe    58      1
##  5  Great Expectations_2   joe    56      1
##  6 Great Expectations_23   joe     1      1
##  7 Great Expectations_15   joe    50      1
##  8 Great Expectations_18   joe    50      1
##  9  Great Expectations_9   joe    44      1
## 10 Great Expectations_13   joe    40      1
## # ... with 104,712 more rows
augment.stm(stm_model, chapters_dfm) %>%
  separate(document, c("title", "chapter"), sep = "_", convert = TRUE) %>%
  count(title, .topic) %>%
  arrange(title, desc(n))
## # A tibble: 12 x 3
##                                    title .topic     n
##                                    <chr>  <int> <int>
##  1                    Great Expectations      1 25302
##  2                    Great Expectations      3  5992
##  3                    Great Expectations      4  5405
##  4                   Pride and Prejudice      4 17785
##  5                   Pride and Prejudice      1  4551
##  6                   Pride and Prejudice      2  3945
##  7                 The War of the Worlds      3  7510
##  8                 The War of the Worlds      1  5550
##  9                 The War of the Worlds      2  2307
## 10 Twenty Thousand Leagues under the Sea      2 15450
## 11 Twenty Thousand Leagues under the Sea      4  7783
## 12 Twenty Thousand Leagues under the Sea      3  3142

OK, that worked now!

Glance

glance.stm <- function(x, ...) {
  ret <- data_frame(k = as.integer(x$settings$dim$K), 
                    docs = x$settings$dim$N, 
                    terms = x$settings$dim$V,
                    iter = length(x$convergence$bound),
                    alpha = x$settings$init$alpha)

  ret
}

glance.stm(stm_model)
## # A tibble: 1 x 5
##       k  docs terms  iter alpha
##   <int> <int> <int> <int> <dbl>
## 1     4   193 18215    83  12.5

That is working well, seems like.