This code is adapted from the tidytext book (chapters 4 and 8), the stm package paper and vignette, and the quanteda package documentation; see those sources for full citations.
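The packages below are assumed to be attached before running any of the code; adjust the list to the parts of the workflow you actually use.
library(tidyverse)            # readr, dplyr, tidyr, stringr, ggplot2
library(tidytext)             # unnest_tokens, bind_tf_idf, sentiment lexicons
library(widyr)                # pairwise_count, pairwise_cor
library(igraph)               # graph_from_data_frame
library(ggraph)               # network plots
library(stm)                  # structural topic models
library(quanteda)             # tokens, fcm
library(quanteda.textplots)   # textplot_network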
Load the text data, then create separate tables: a document text corpus and document metadata. This example uses a single file in which the header "EXCERPT" marks the start of each document; additional metadata variables would also be useful (a sketch follows the data-prep code below). One table holds each document as a continuous text block; the other holds each line as a separate entry. Adjust for your corpus.
source_text <- "C:/Users/newsomevw/OneDrive - National Institutes of Health/Desktop/TM scripts/longer_interview_chirp_sample.txt"
original_interview <- read_lines(source_text, locale = locale(encoding = "UTF-8"))
original_interview <- original_interview[original_interview != ""]
interview_df <- tibble(text = original_interview) %>%
mutate(excerpt = ifelse(str_detect(text, "^EXCERPT"), text, NA), .before = 1) %>%
fill(excerpt, .direction = "down") %>%
filter(!str_detect(text, "^EXCERPT")) %>%
extract(text, into = c("speaker", "text"), regex = "^(SW|CG):\\s*(.*)") %>%
mutate(line_id = row_number(), .before = 1)
interview_text <- interview_df %>%
select(line_id, text)
interview_whole_df <- interview_df %>%
select(-line_id) %>%
group_by(excerpt, speaker) %>%
summarise(text = paste(text, collapse = " "), .groups = "drop") %>%
mutate(doc_id = row_number(), .before = 1)
interview_whole_text <- interview_whole_df %>%
select(doc_id, text)
interview_df <- interview_df %>%
select(-text)
interview_whole_df <- interview_whole_df %>%
select(-text)
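As a hypothetical example of the additional metadata mentioned above, a per-document line count can be derived from the line-level table and joined onto the whole-document metadata; the variable name n_lines is illustrative only.
doc_line_counts <- interview_df %>%
  count(excerpt, speaker, name = "n_lines")
interview_whole_df <- interview_whole_df %>%
  left_join(doc_line_counts, by = c("excerpt", "speaker"))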
Use the unnest_tokens function from tidytext to create a token table. Load the stop-word lexicon and remove stop words, then confirm that they carry little analytic value here: plotting the top word frequencies shows that the most frequent words before removal are stop words that say little about the corpus vocabulary. Also tokenize the line-level table for later functions.
data(stop_words)
tidy_interview_token_withstop <- interview_whole_text %>%
unnest_tokens(word, text)
tidy_interview_token <- tidy_interview_token_withstop %>%
anti_join(stop_words)
tidy_interview_token_withstop %>%
count(word, sort = TRUE)
## # A tibble: 1,834 × 2
## word n
## <chr> <int>
## 1 i 375
## 2 the 348
## 3 and 263
## 4 to 250
## 5 that 249
## 6 a 205
## 7 of 188
## 8 she 183
## 9 it 179
## 10 is 170
## # ℹ 1,824 more rows
tidy_interview_token %>%
count(word, sort = TRUE)
## # A tibble: 1,462 × 2
## word n
## <chr> <int>
## 1 people 108
## 2 support 84
## 3 i’m 83
## 4 don’t 62
## 5 feel 58
## 6 that’s 56
## 7 close 55
## 8 doesn’t 53
## 9 it’s 48
## 10 lot 41
## # ℹ 1,452 more rows
tidy_interview_token %>%
count(word, sort = TRUE) %>%
filter(n > 15) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word)) +
geom_col() +
labs(y = NULL)
tidy_interview_line_token <- interview_text %>%
unnest_tokens(word, text)
tidy_interview_line_token <- tidy_interview_line_token %>%
anti_join(stop_words)
Calculate term frequency-inverse document frequency (tf-idf) for each word relative to its document, using the bind_tf_idf function from tidytext (a manual cross-check follows the output below).
interview_doc_words <- tidy_interview_token %>%
count(doc_id, word, sort = TRUE)
total_doc_words <- interview_doc_words %>%
group_by(doc_id) %>%
summarize(total = sum(n))
interview_doc_words <- left_join(interview_doc_words, total_doc_words)
interview_doc_tf_idf <- interview_doc_words %>%
bind_tf_idf(word, doc_id, n)
interview_doc_tf_idf %>%
arrange(desc(tf_idf))
## # A tibble: 2,977 × 7
## doc_id word n total tf idf tf_idf
## <int> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 13 15001 14 366 0.0383 3.00 0.115
## 2 10 ecology 2 42 0.0476 2.30 0.110
## 3 6 belongs 2 63 0.0317 3.00 0.0951
## 4 18 17001 2 50 0.04 2.30 0.0921
## 5 16 active 1 39 0.0256 3.00 0.0768
## 6 16 flexibility 1 39 0.0256 3.00 0.0768
## 7 16 receive 1 39 0.0256 3.00 0.0768
## 8 16 sorting 1 39 0.0256 3.00 0.0768
## 9 16 speaking 1 39 0.0256 3.00 0.0768
## 10 4 19001 2 63 0.0317 2.30 0.0731
## # ℹ 2,967 more rows
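As a cross-check on what bind_tf_idf computes (not part of the original workflow), the same quantities can be derived by hand: tf is the word's count divided by the document's word total, and idf is the natural log of the number of documents divided by the number of documents containing the word.
n_docs <- n_distinct(interview_doc_words$doc_id)
interview_doc_words %>%
  group_by(word) %>%
  mutate(idf = log(n_docs / n_distinct(doc_id))) %>%   # docs containing each word
  ungroup() %>%
  mutate(tf = n / total,
         tf_idf = tf * idf) %>%
  arrange(desc(tf_idf))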
Describe basic sentiment for the words in the corpus using the Bing lexicon. Further dictionary-based analysis may examine specific process words, thematic words, etc.; a sketch follows the plot below.
interview_sentiments <- interview_doc_words %>%
inner_join(get_sentiments("bing"), by = "word") %>%
count(word, sentiment, wt = n, sort = TRUE)
interview_sentiments %>%
group_by(sentiment) %>%
slice_max(n, n = 15) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(x = "Contribution to sentiment",
y = NULL)
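As a sketch of that further dictionary-based analysis, a small custom lexicon of thematic words can be joined to the document-word counts the same way as the Bing lexicon; the terms below are illustrative, not a validated dictionary.
support_dict <- tibble(
  word  = c("support", "help", "close", "trust", "care"),   # hypothetical theme words
  theme = "social support"
)
interview_doc_words %>%
  inner_join(support_dict, by = "word") %>%
  count(doc_id, theme, wt = n, name = "theme_words")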
Create a table of word bigrams to investigate their tf-idf. Having observed the impact of stop words above, remove bigrams containing them before calculating tf-idf. Adjust n in unnest_tokens to observe different n-grams.
tidy_interview_bigrams <- interview_whole_text %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
filter(!is.na(bigram))
tidy_interview_bigrams %>%
count(bigram, sort = TRUE)
## # A tibble: 8,313 × 2
## bigram n
## <chr> <int>
## 1 and i 42
## 2 a lot 41
## 3 in the 37
## 4 i don’t 33
## 5 i think 31
## 6 people who 29
## 7 if i 28
## 8 she doesn’t 25
## 9 of the 24
## 10 who can 24
## # ℹ 8,303 more rows
bigrams_separated <- tidy_interview_bigrams %>%
separate(bigram, c("word1", "word2"), sep = " ")
bigrams_filtered <- bigrams_separated %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
bigrams_counts <- bigrams_filtered %>%
count(word1, word2, sort = TRUE)
bigrams_united <- bigrams_filtered %>%
unite(bigram, word1, word2, sep = " ")
interview_bigram_tf_idf <- bigrams_united %>%
count(doc_id, bigram) %>%
bind_tf_idf(bigram, doc_id, n) %>%
arrange(desc(tf_idf))
interview_bigram_tf_idf
## # A tibble: 1,125 × 6
## doc_id bigram n tf idf tf_idf
## <int> <chr> <int> <dbl> <dbl> <dbl>
## 1 14 close that’s 1 1 4.63 4.63
## 2 14 equate closeness 1 1 4.63 4.63
## 3 14 explain mm 1 1 4.63 4.63
## 4 14 feels real 1 1 4.63 4.63
## 5 14 practically supportive 1 1 4.63 4.63
## 6 14 support feels 1 1 4.63 4.63
## 7 16 16001 stopped 1 1 4.63 4.63
## 8 16 active sorting 1 1 4.63 4.63
## 9 16 collaborative that’s 1 1 4.63 4.63
## 10 16 flexibility mm 1 1 4.63 4.63
## # ℹ 1,115 more rows
tidy_interview_trigrams <- interview_whole_text %>%
unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
filter(!is.na(trigram))
tidy_interview_trigrams %>%
count(trigram, sort = TRUE)
## # A tibble: 11,263 × 2
## trigram n
## <chr> <int>
## 1 a lot of 23
## 2 one of the 11
## 3 are there people 10
## 4 i used to 10
## 5 if i say 10
## 6 i have to 9
## 7 there people who 8
## 8 and i think 7
## 9 how do you 7
## 10 i think i 7
## # ℹ 11,253 more rows
trigrams_separated <- tidy_interview_trigrams %>%
separate(trigram, c("word1", "word2", "word3"), sep = " ")
trigrams_filtered <- trigrams_separated %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word) %>%
filter(!word3 %in% stop_words$word)
trigrams_counts <- trigrams_filtered %>%
count(word1, word2, word3, sort = TRUE)
trigrams_united <- trigrams_filtered %>%
unite(trigram, word1, word2, word3, sep = " ")
interview_trigram_tf_idf <- trigrams_united %>%
count(doc_id, trigram) %>%
bind_tf_idf(trigram, doc_id, n) %>%
arrange(desc(tf_idf))
interview_trigram_tf_idf
## # A tibble: 313 × 6
## doc_id trigram n tf idf tf_idf
## <int> <chr> <int> <dbl> <dbl> <dbl>
## 1 6 physically closer translate 1 1 5.04 5.04
## 2 6 view mm hm 1 1 5.04 5.04
## 3 4 doesn’t sound cold 1 1 4.63 4.63
## 4 4 feels supportive you’ve 1 1 4.63 4.63
## 5 14 emotionally close that’s 1 1 4.63 4.63
## 6 14 explain mm hm 1 1 4.63 4.63
## 7 14 support feels real 1 1 4.63 4.63
## 8 12 approach communicated trust 1 1 4.34 4.34
## 9 12 fairly stable medically 1 1 4.34 4.34
## 10 12 support system changed 1 1 4.34 4.34
## # ℹ 303 more rows
Additional analysis of the impact of "not" and of broader negation words (not, no, never, without). A different sentiment lexicon (AFINN) is used here because it provides a numerical sentiment score. Note that the plotted colors correspond to the original sentiment of the word following the negation; with the negation, the conveyed sentiment is the opposite (an adjustment sketch follows the plots below).
bigrams_separated %>%
filter(word1 == "not") %>%
count(word1, word2, sort = TRUE)
## # A tibble: 92 × 3
## word1 word2 n
## <chr> <chr> <int>
## 1 not just 8
## 2 not close 7
## 3 not the 7
## 4 not having 4
## 5 not really 4
## 6 not to 4
## 7 not always 3
## 8 not because 3
## 9 not emotionally 3
## 10 not especially 3
## # ℹ 82 more rows
tidy_interview_not_words <- bigrams_separated %>%
filter(word1 == "not") %>%
inner_join(get_sentiments("afinn"), by = c(word2 = "word")) %>%
count(word2, value, sort = TRUE)
tidy_interview_not_words %>%
mutate(contribution = n * value) %>%
arrange(desc(abs(contribution))) %>%
mutate(word2 = reorder(word2, contribution)) %>%
ggplot(aes(n * value, word2, fill = n * value > 0)) +
geom_col(show.legend = FALSE) +
labs(x = "Sentiment value * number of occurrences",
y = "Words preceded by \"not\"")
negation_words <- c("not", "no", "never", "without")
bigrams_separated %>%
filter(word1 %in% negation_words) %>%
count(word1, word2, sort = TRUE)
## # A tibble: 132 × 3
## word1 word2 n
## <chr> <chr> <int>
## 1 not just 8
## 2 not close 7
## 3 not the 7
## 4 not having 4
## 5 not really 4
## 6 not to 4
## 7 no i 3
## 8 not always 3
## 9 not because 3
## 10 not emotionally 3
## # ℹ 122 more rows
tidy_interview_negation_words <- bigrams_separated %>%
filter(word1 %in% negation_words) %>%
inner_join(get_sentiments("afinn"), by = c(word2 = "word")) %>%
count(word1, word2, value, sort = TRUE)
tidy_interview_negation_words %>%
mutate(contribution = n * value) %>%
arrange(desc(abs(contribution))) %>%
mutate(word2 = reorder(word2, contribution)) %>%
ggplot(aes(n * value, word2, fill = n * value > 0)) +
geom_col(show.legend = FALSE) +
labs(x = "Sentiment value * number of occurrences",
y = "Words preceded by a negation word")
Plot a network of bigrams, weighted by occurrence frequency, then show the difference between an undirected and a directed network for this corpus. Adjust the frequency filter for your corpus.
bigram_graph <- bigrams_counts %>%
filter(n > 2) %>%
graph_from_data_frame()
set.seed(2017)
ggraph(bigram_graph, layout = "fr") +
geom_edge_link() +
geom_node_point() +
geom_node_text(aes(label = name), vjust = 1, hjust = 1)
set.seed(2020)
a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
ggraph(bigram_graph, layout = "fr") +
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
arrow = a, end_cap = circle(.07, 'inches')) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
theme_void()
Other functionality to potentially include: word features from the cleanNLP package, lemmatization (a sketch follows), and visualizing metadata.
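A minimal lemmatization sketch, assuming the textstem package is available: each token is replaced by its dictionary form before recounting.
library(textstem)
tidy_interview_lemmas <- tidy_interview_token %>%
  mutate(lemma = lemmatize_words(word)) %>%
  count(lemma, sort = TRUE)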
First, pre-process the data and set up the variable names the stm package expects. Note that this package does not handle stop words in the same way as tidytext. Decide on a lower frequency threshold for terms. Un-comment the pdf() and dev.off() lines to save outputs in PDF format.
set.seed(23456)
processed <- textProcessor(documents = interview_whole_text$text, metadata = interview_whole_df)
## Building corpus...
## Converting to Lower Case...
## Removing punctuation...
## Removing stopwords...
## Removing numbers...
## Stemming...
## Creating Output...
out <- prepDocuments(documents = processed$documents,
vocab = processed$vocab,
meta = processed$meta)
## Removing 670 of 1292 terms (670 of 3631 tokens) due to frequency
## Your corpus now has 20 documents, 622 terms and 2961 tokens.
docs <- out$documents
vocab <- out$vocab
meta <- out$meta
plotRemoved(processed$documents, lower.thresh = seq(1, 200, by = 100))
# plotRemoved shows, for each candidate lower.thresh value (the minimum number of
# documents a term must appear in), how many documents, words, and tokens would be
# dropped at that threshold; for a corpus this small, a finer sequence such as
# seq(1, 10, by = 1) may be more informative.
Fit the starting model, adjusting thresholds as needed, and try a range of K values to see what number of topics works best for the corpus. Also create a short version of each document (its first 200 characters) for displaying representative quotes later; the sketch below covers both steps.
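The chunk below sketches both steps: shortdoc keeps the first 200 characters of each document for quote display (assuming prepDocuments dropped no documents, so the indices still line up), and searchK fits models over a few candidate K values; the K range shown is illustrative.
shortdoc <- substr(interview_whole_text$text, 1, 200)
kResult <- searchK(out$documents, out$vocab, K = c(6, 8, 10, 12),
                   prevalence = ~ doc_id, data = out$meta, seed = 23456)
plot(kResult)   # held-out likelihood, residuals, semantic coherence by K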
Use selectModel to run expectation-maximization from multiple starting values and keep the best-performing runs. Set K to the number of topics chosen for this corpus (8 here). Plot semantic coherence and exclusivity for the candidate models to quantify the results, then keep the run that best balances the two.
modelSelect <- selectModel(out$documents, out$vocab, K = 8,
prevalence =~ doc_id, max.em.its = 75,
data = out$meta, runs = 20, seed = 23456)
# pdf("stmVignette-009.pdf")
plotModels(modelSelect)
# dev.off()
selectedmodel <- modelSelect$runout[[3]]
Use word-level metrics, including FREX (frequency-exclusivity), to characterize each topic's content. Then find representative documents and quotes whose words summarize the topic's frequent themes. Note that the topic's words are more distinct in some quotes than in others.
labelTopics(selectedmodel, c(1, 7))
## Topic 1 Top Words:
## Highest Prob: can, like, just, work, help, need, also
## FREX: decis, burden, work, absolut, everyth, concern, day
## Lift: find, food, incred, sibl, slept, remind, walk
## Score: absolut, thought, burden, “’m, meant, everyth, also
## Topic 7 Top Words:
## Highest Prob: peopl, feel, close, ask, becom, don’t, time
## FREX: peopl, feel, becom, ask, close, trust, don’t
## Lift: peopl, right, becom, feel, ask, close, life
## Score: feel, peopl, close, becom, ask, don’t, right
thoughts1 <- findThoughts(selectedmodel, texts = shortdoc,
n = 2, topics = 1)$docs[[1]]
thoughts7 <- findThoughts(selectedmodel, texts = shortdoc,
n = 2, topics = 7)$docs[[1]]
# pdf("stmVignette-015.pdf")
par(mfrow = c(2, 1), mar = c(.5, .5, 1, .5))
plotQuote(thoughts1, width = 40, maxwidth = 120, main = "Topic 1")
plotQuote(thoughts7, width = 40, maxwidth = 120, main = "Topic 7")
# dev.off()
Extend the stm with other variables as covariates drawn from the metadata; this is flexible, so adjust for your corpus. Examine the estimated covariate effects on topic prevalence and the expected topic proportions of the documents. Note that time variables (dates) require different handling, as in the commented example below.
out$meta$excerpt <- as.factor(out$meta$excerpt)
prep <- estimateEffect(1:8 ~ excerpt, selectedmodel,
metadata = out$meta, uncertainty = "Global")
summary(prep, topics = 1)
##
## Call:
## estimateEffect(formula = 1:8 ~ excerpt, stmobj = selectedmodel,
## metadata = out$meta, uncertainty = "Global")
##
##
## Topic 1:
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.02118 0.16042 0.132 0.898
## excerptEXCERPT 10 0.06456 0.23408 0.276 0.788
## excerptEXCERPT 2 0.03896 0.23233 0.168 0.870
## excerptEXCERPT 3 -0.01800 0.22452 -0.080 0.938
## excerptEXCERPT 4 0.40368 0.22694 1.779 0.106
## excerptEXCERPT 5 -0.01547 0.22699 -0.068 0.947
## excerptEXCERPT 6 0.04381 0.22742 0.193 0.851
## excerptEXCERPT 7 0.14183 0.23437 0.605 0.559
## excerptEXCERPT 8 0.07055 0.23986 0.294 0.775
## excerptEXCERPT 9 -0.01947 0.22305 -0.087 0.932
# pdf("stmVignette-017.pdf")
plot(selectedmodel, type = "summary", xlim = c(0, .3))
# dev.off()
# pdf("stmVignette-019.pdf")
# plot(prep, "day", method = "continuous", topics = 13,
# model = z, printlegend = FALSE, xaxt = "n", xlab = "Time (2008)")
# monthseq <- seq(from = as.Date("2008-01-01"),
# to = as.Date("2008-12-01"), by = "month")
# monthnames <- months(monthseq)
# axis(1,
# at = as.numeric(monthseq) - min(as.numeric(monthseq)),
# labels = monthnames)
# dev.off()
Other plots include stm word clouds, topic correlations, and model convergence. Topics may then go through further examination and curation, especially human validation.
# pdf("stmVignette-025.pdf")
cloud(selectedmodel, topic = 7, scale = c(2, .25))
# dev.off()
mod.out.corr <- topicCorr(selectedmodel)
# pdf("stmVignette-027.pdf")
plot(mod.out.corr)
# dev.off()
# pdf("stmVignette-028.pdf")
plot(selectedmodel$convergence$bound, type = "l",
ylab = "Approximate Objective",
main = "Convergence")
# dev.off()
Using the widyr package with the tidytext tokens, examine word-pair co-occurrence across documents. Consider which measures and network statistics are important to describe; one sketch follows the network plot below.
tidy_interview_pairs <- tidy_interview_token %>%
pairwise_count(word, doc_id, sort = TRUE, upper = FALSE)
# network
set.seed(1234)
tidy_interview_pairs %>%
filter(n >= 15) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = n, edge_width = n), edge_colour = "cyan4") +
geom_node_point(size = 5) +
geom_node_text(aes(label = name), repel = TRUE,
point.padding = unit(0.2, "lines")) +
theme_void()
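One way to quantify the co-occurrence network beyond the plot is to compute basic node statistics; the sketch below uses igraph's degree and betweenness on the same filtered pair counts.
pair_graph <- tidy_interview_pairs %>%
  filter(n >= 15) %>%
  graph_from_data_frame(directed = FALSE)
tibble(word = names(degree(pair_graph)),
       degree = degree(pair_graph),                # number of connected words
       betweenness = betweenness(pair_graph)) %>%  # bridging position in the network
  arrange(desc(degree))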
Calculate the correlation (phi coefficient) between pairs of words, then build a network with the same steps.
# word correlation
tidy_interview_cors <- tidy_interview_token %>%
group_by(word) %>%
filter(n() >= 15) %>%
pairwise_cor(word, doc_id, sort = TRUE, upper = FALSE)
set.seed(1234)
tidy_interview_cors %>%
filter(correlation > .6) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation, edge_width = correlation), edge_colour = "royalblue") +
geom_node_point(size = 5) +
geom_node_text(aes(label = name), repel = TRUE,
point.padding = unit(0.2, "lines")) +
theme_void()
Use the quanteda package to create a feature co-occurrence matrix (fcm), then run essentially the same procedure to measure word co-occurrence. Here each interview line is the unit of text. The same analysis can also be done over a sliding window of words instead of the whole line.
word_tokens <- tokens(interview_text$text, remove_punct = TRUE) %>%
tokens_tolower() %>%
tokens_remove(pattern = stopwords("english"), padding = FALSE)
interview_fcm <- fcm(word_tokens, context = "document")
top_feats <- rowSums(interview_fcm) %>%
sort(decreasing = TRUE) %>%
head(25)
fcm_subset <- fcm_select(interview_fcm, pattern = names(top_feats))
set.seed(2017)
fcm_subset %>%
textplot_network(min_freq = 0.67)
interview_win_fcm <- fcm(word_tokens, context = "window", window = 5)
top_feats <- rowSums(interview_win_fcm) %>%
sort(decreasing = TRUE) %>%
head(25)
fcm_win_subset <- fcm_select(interview_win_fcm, pattern = names(top_feats))
set.seed(2017)
fcm_win_subset %>%
textplot_network(min_freq = 0.67)
Again, measure feature co-occurrence, this time using each document as the unit of text. These networks could be made more quantitatively informative for this corpus (a sketch follows the plot below), and the visualization could also be improved for clarity.
word_tokens <- tokens(interview_whole_text$text, remove_punct = TRUE) %>%
tokens_tolower() %>%
tokens_remove(pattern = stopwords("english"), padding = FALSE)
interview_fcm <- fcm(word_tokens, context = "document")
top_feats <- rowSums(interview_fcm) %>%
sort(decreasing = TRUE) %>%
head(25)
fcm_subset <- fcm_select(interview_fcm, pattern = names(top_feats))
set.seed(2017)
fcm_subset %>%
textplot_network(min_freq = 0.67)
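To add a quantitative summary to the document-level network, the fcm can be converted to an igraph object and weighted degree (strength) computed for each feature; this sketch assumes the default upper-triangular form that fcm() returns.
fcm_graph <- graph_from_adjacency_matrix(as.matrix(fcm_subset),
                                         mode = "upper",   # fcm is upper-triangular by default
                                         weighted = TRUE, diag = FALSE)
sort(strength(fcm_graph), decreasing = TRUE)   # total co-occurrence weight per feature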