First, I will get some text to use in topic modeling. I’ll use the same text we currently have in the chapter on topic modeling using the topicmodels package.
library(tidyverse)
library(gutenbergr)
library(tidytext)
library(stringr)
titles <- c("Twenty Thousand Leagues under the Sea", "The War of the Worlds",
"Pride and Prejudice", "Great Expectations")
books <- gutenberg_works(title %in% titles) %>%
gutenberg_download(meta_fields = "title")
by_chapter <- books %>%
group_by(title) %>%
mutate(chapter = cumsum(str_detect(text, regex("^chapter ", ignore_case = TRUE)))) %>%
ungroup() %>%
filter(chapter > 0) %>%
unite(document, title, chapter)
word_counts <- by_chapter %>%
unnest_tokens(word, text) %>%
anti_join(stop_words) %>%
count(document, word, sort = TRUE) %>%
ungroup()
word_counts
## # A tibble: 104,722 x 3
## document word n
## <chr> <chr> <int>
## 1 Great Expectations_57 joe 88
## 2 Great Expectations_7 joe 70
## 3 Great Expectations_17 biddy 63
## 4 Great Expectations_27 joe 58
## 5 Great Expectations_38 estella 58
## 6 Great Expectations_2 joe 56
## 7 Great Expectations_23 pocket 53
## 8 Great Expectations_15 joe 50
## 9 Great Expectations_18 joe 50
## 10 The War of the Worlds_16 brother 50
## # ... with 104,712 more rows
The input to topic modeling using stm can be a quanteda dfm.
library(stm)
library(quanteda)
chapters_dfm <- word_counts %>%
cast_dfm(document, word, n)
chapters_dfm
## Document-feature matrix of: 193 documents, 18,215 features (97% sparse).
stm_model <- stm(chapters_dfm, K = 4, verbose = FALSE)
We successfully fit a stm model.
summary(stm_model)
## A topic model with 4 topics, 193 documents and a 18215 word dictionary.
## Topic 1 Top Words:
## Highest Prob: captain, nautilus, sea, nemo, ned, conseil, land
## FREX: captain, nautilus, nemo, ned, conseil, canadian, ocean
## Lift: bullets, canoes, declivity, dillon, galleons, gallons, layers
## Score: nautilus, nemo, ned, conseil, captain, canadian, ocean
## Topic 2 Top Words:
## Highest Prob: joe, miss, time, pip, looked, herbert, hand
## FREX: joe, pip, herbert, wemmick, havisham, estella, biddy
## Lift: jaggers's, joseph, a.m, a'most, aback, abased, abear
## Score: joe, pip, biddy, jaggers, wemmick, havisham, estella
## Topic 3 Top Words:
## Highest Prob: elizabeth, darcy, miss, bennet, jane, bingley, time
## FREX: elizabeth, darcy, bennet, jane, bingley, wickham, collins
## Lift: _affect_, _am_, _any_, _appearance_, _be_, _begin_, _being
## Score: elizabeth, darcy, bennet, bingley, wickham, jane, lydia
## Topic 4 Top Words:
## Highest Prob: people, martians, time, black, night, road, brother
## FREX: martians, martian, woking, mars, curate, pine, artilleryman
## Lift: _been_, _blotted_, _brutes_, _budded_, _daily, _dead_, _en
## Score: martians, martian, woking, cylinder, mars, curate, ulla
That looks promising.
tidy.stm <- function(x, matrix = c("beta", "gamma", "theta"), log = FALSE, document_names = NULL, ...) {
matrix <- match.arg(matrix)
if (matrix == "beta") {
mat <- x$beta
} else {
mat <- x$theta
}
ret <- reshape2::melt(mat) %>%
tbl_df()
if (matrix == "beta") {
ret <- transmute(ret, topic = Var1, term = x$vocab[Var2], beta = value)
} else {
ret <- transmute(ret, document = Var1, topic = Var2, gamma = value)
if (!is.null(document_names)) {
ret$document <- document_names[ret$document]
}
}
if (matrix == "beta" && !log) {
ret[[matrix]] <- exp(ret[[matrix]])
} else if (matrix %in% c("gamma", "theta") && log) {
ret[[matrix]] <- log(ret[[matrix]])
}
ret
}
BUT WHERE ARE THE DOCUMENT NAMES??? Are they really gone? Yes :(
tidy.stm(stm_model, matrix = "beta") %>%
group_by(topic) %>%
top_n(5, beta) %>%
ungroup() %>%
arrange(topic, -beta)
## # A tibble: 20 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 captain 0.015548063
## 2 1 nautilus 0.013082513
## 3 1 sea 0.008994842
## 4 1 nemo 0.008730061
## 5 1 ned 0.008050777
## 6 2 joe 0.013191613
## 7 2 miss 0.007078365
## 8 2 time 0.006713371
## 9 2 pip 0.006223539
## 10 2 looked 0.006105482
## 11 3 elizabeth 0.015626831
## 12 3 darcy 0.009763497
## 13 3 miss 0.007750863
## 14 3 bennet 0.007695625
## 15 3 jane 0.007224465
## 16 4 people 0.006872080
## 17 4 martians 0.006564288
## 18 4 time 0.005457814
## 19 4 black 0.005377354
## 20 4 night 0.004489902
YASSSS
tidy.stm(stm_model, matrix = "theta") ## without document names
## # A tibble: 772 x 3
## document topic gamma
## <int> <int> <dbl>
## 1 1 1 0.0001829624
## 2 2 1 0.0001392330
## 3 3 1 0.0003031470
## 4 4 1 0.0001593147
## 5 5 1 0.0003052634
## 6 6 1 0.0001645394
## 7 7 1 0.0023414604
## 8 8 1 0.0001995044
## 9 9 1 0.0001106603
## 10 10 1 0.0001953030
## # ... with 762 more rows
tidy.stm(stm_model, matrix = "gamma",
document_names = rownames(chapters_dfm)) %>% ## with document names
separate(document, c("title", "chapter"), sep = "_", convert = TRUE) %>%
mutate(title = reorder(title, gamma * topic)) %>%
ggplot(aes(factor(topic), gamma)) +
geom_boxplot(position = "dodge") +
facet_wrap(~ title)
Added an optional second argument of document names
augment.stm <- function(x, data, ...) {
if (missing(data)) {
stop("data argument must be provided in order to augment a stm model")
}
if (inherits(data, "data.frame") &&
(all(c("document", "term") %in% colnames(data)))) {
data$value <- 1
mat <- cast_dfm(data, document, term, value)
} else if (inherits(data, "dfmSparse")) {
mat <- data
data <- tidy(mat)
} else {
stop("data argument must either be a dfmSparse ",
"(from quanteda) or a table with document and term columns")
}
beta <- t(as.matrix(x$beta$logbeta[[1]]))
theta <- x$theta
term_indices <- match(data$term, x$vocab)
doc_indices <- match(data$document, rownames(mat))
products <- beta[term_indices, ] * theta[doc_indices, ]
keep <- !is.na(term_indices) & !is.na(doc_indices)
data$.topic <- NA
data$.topic[keep] <- apply(products[keep, ], 1, which.max)
data
}
augment.stm(stm_model, chapters_dfm)
## # A tibble: 104,722 x 4
## document term count .topic
## <chr> <chr> <dbl> <int>
## 1 Great Expectations_57 joe 88 1
## 2 Great Expectations_7 joe 70 1
## 3 Great Expectations_17 joe 5 4
## 4 Great Expectations_27 joe 58 1
## 5 Great Expectations_2 joe 56 1
## 6 Great Expectations_23 joe 1 1
## 7 Great Expectations_15 joe 50 1
## 8 Great Expectations_18 joe 50 1
## 9 Great Expectations_9 joe 44 1
## 10 Great Expectations_13 joe 40 1
## # ... with 104,712 more rows
augment.stm(stm_model, chapters_dfm) %>%
separate(document, c("title", "chapter"), sep = "_", convert = TRUE) %>%
count(title, .topic) %>%
arrange(title, desc(n))
## # A tibble: 12 x 3
## title .topic n
## <chr> <int> <int>
## 1 Great Expectations 1 25302
## 2 Great Expectations 3 5992
## 3 Great Expectations 4 5405
## 4 Pride and Prejudice 4 17785
## 5 Pride and Prejudice 1 4551
## 6 Pride and Prejudice 2 3945
## 7 The War of the Worlds 3 7510
## 8 The War of the Worlds 1 5550
## 9 The War of the Worlds 2 2307
## 10 Twenty Thousand Leagues under the Sea 2 15450
## 11 Twenty Thousand Leagues under the Sea 4 7783
## 12 Twenty Thousand Leagues under the Sea 3 3142
OK, that worked now!
glance.stm <- function(x, ...) {
ret <- data_frame(k = as.integer(x$settings$dim$K),
docs = x$settings$dim$N,
terms = x$settings$dim$V,
iter = length(x$convergence$bound),
alpha = x$settings$init$alpha)
ret
}
glance.stm(stm_model)
## # A tibble: 1 x 5
## k docs terms iter alpha
## <int> <int> <int> <int> <dbl>
## 1 4 193 18215 83 12.5
That is working well, seems like.