This code is adapted from tidytext ch 4, ch 8, and stm package paper. Sources link/cite.
Load text data, then create separate tables: document text corpus and document metadata. This example is comprised of a doc with the header “EXCERPT” to mark separate documents within the whole doc. Additional metadata variables would be useful. One table has each doc as a continuous text block; the other table has each line as a separate entry. Adjust for corpus.
original_interview <- read_lines("C:/Users/newsomevw/OneDrive - National Institutes of Health/Desktop/TM scripts/longer_interview_chirp_sample.txt", locale = locale(encoding = "UTF-8"))
original_interview <- original_interview[original_interview != ""]
interview_df <- tibble(text = original_interview) %>%
mutate(excerpt = ifelse(str_detect(text, "^EXCERPT"), text, NA), .before = 1) %>%
fill(excerpt, .direction = "down") %>%
filter(!str_detect(text, "^EXCERPT")) %>%
extract(text, into = c("speaker", "text"), regex = "^(SW|CG):\\s*(.*)") %>%
mutate(line_id = row_number(), .before = 1)
interview_text <- interview_df %>%
select(line_id, text)
interview_whole_df <- interview_df %>%
select(-line_id) %>%
group_by(excerpt, speaker) %>%
summarise(text = paste(text, collapse = " "), .groups = "drop") %>%
mutate(doc_id = row_number(), .before = 1)
interview_whole_text <- interview_whole_df %>%
select(doc_id, text)
interview_df <- interview_df %>%
select(-text)
interview_whole_df <- interview_whole_df %>%
select(-text)
Use the unnest_tokens function from tidytext to create a token table. Load the library of stop words, then confirm that none of the stop words are useful measures. Plot the top word frequency, observing that stop words are less useful to the corpus dict. Tokenize the separate line table for later functions.
data(stop_words)
tidy_interview_token_withstop <- interview_whole_text %>%
unnest_tokens(word, text)
tidy_interview_token <- tidy_interview_token_withstop %>%
anti_join(stop_words)
tidy_interview_token_withstop %>%
count(word, sort = TRUE)
## # A tibble: 1,834 × 2
## word n
## <chr> <int>
## 1 i 375
## 2 the 348
## 3 and 263
## 4 to 250
## 5 that 249
## 6 a 205
## 7 of 188
## 8 she 183
## 9 it 179
## 10 is 170
## # ℹ 1,824 more rows
tidy_interview_token %>%
count(word, sort = TRUE)
## # A tibble: 1,462 × 2
## word n
## <chr> <int>
## 1 people 108
## 2 support 84
## 3 i’m 83
## 4 don’t 62
## 5 feel 58
## 6 that’s 56
## 7 close 55
## 8 doesn’t 53
## 9 it’s 48
## 10 lot 41
## # ℹ 1,452 more rows
tidy_interview_token %>%
count(word, sort = TRUE) %>%
filter(n > 15) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word)) +
geom_col() +
labs(y = NULL)
tidy_interview_line_token <- interview_text %>%
unnest_tokens(word, text)
tidy_interview_line_token <- tidy_interview_line_token %>%
anti_join(stop_words)
Calculate term frequency-inverse document frequency for each word, relative to the document. Use the bind function from tidytext.
interview_doc_words <- tidy_interview_token %>%
count(doc_id, word, sort = TRUE)
total_doc_words <- interview_doc_words %>%
group_by(doc_id) %>%
summarize(total = sum(n))
interview_doc_words <- left_join(interview_doc_words, total_doc_words)
interview_doc_tf_idf <- interview_doc_words %>%
bind_tf_idf(word, doc_id, n)
interview_doc_tf_idf %>%
arrange(desc(tf_idf))
## # A tibble: 2,977 × 7
## doc_id word n total tf idf tf_idf
## <int> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 13 15001 14 366 0.0383 3.00 0.115
## 2 10 ecology 2 42 0.0476 2.30 0.110
## 3 6 belongs 2 63 0.0317 3.00 0.0951
## 4 18 17001 2 50 0.04 2.30 0.0921
## 5 16 active 1 39 0.0256 3.00 0.0768
## 6 16 flexibility 1 39 0.0256 3.00 0.0768
## 7 16 receive 1 39 0.0256 3.00 0.0768
## 8 16 sorting 1 39 0.0256 3.00 0.0768
## 9 16 speaking 1 39 0.0256 3.00 0.0768
## 10 4 19001 2 63 0.0317 2.30 0.0731
## # ℹ 2,967 more rows
Basic sentiments are described for the words in the corpus. Further dictionary-based analysis may examine specific process words, thematic words, etc.
interview_sentiments <- interview_doc_words %>%
inner_join(get_sentiments("bing"), by = "word") %>%
count(word, sentiment, sort = TRUE)
interview_sentiments %>%
group_by(sentiment) %>%
slice_max(n, n = 15) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(x = "Contribution to sentiment",
y = NULL)
Create a table of word bigrams, to investigate their TF-IDF and the impact of not and broader negation words (not, no, never, without). Remove stop words from bigrams prior to calculating TF-IDF. A different sentiment lexicon is used, for a numerical sentiment score. Also note that the colors plotted correspond to the original sentiment- the sentiment is opposite, with negation.
tidy_interview_bigrams <- interview_whole_text %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
filter(!is.na(bigram))
tidy_interview_bigrams %>%
count(bigram, sort = TRUE)
## # A tibble: 8,313 × 2
## bigram n
## <chr> <int>
## 1 and i 42
## 2 a lot 41
## 3 in the 37
## 4 i don’t 33
## 5 i think 31
## 6 people who 29
## 7 if i 28
## 8 she doesn’t 25
## 9 of the 24
## 10 who can 24
## # ℹ 8,303 more rows
bigrams_separated <- tidy_interview_bigrams %>%
separate(bigram, c("word1", "word2"), sep = " ")
interview_bigrams_filtered <- bigrams_separated %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
interview_bigram_counts <- interview_bigrams_filtered %>%
count(word1, word2, sort = TRUE)
tidy_interview_bigrams <- interview_bigrams_filtered %>%
unite(bigram, word1, word2, sep = " ")
interview_bigram_tf_idf <- tidy_interview_bigrams %>%
count(doc_id, bigram) %>%
bind_tf_idf(doc_id, bigram, n) %>%
arrange(desc(tf_idf))
bigrams_separated %>%
filter(word1 == "not") %>%
count(word1, word2, sort = TRUE)
## # A tibble: 92 × 3
## word1 word2 n
## <chr> <chr> <int>
## 1 not just 8
## 2 not close 7
## 3 not the 7
## 4 not having 4
## 5 not really 4
## 6 not to 4
## 7 not always 3
## 8 not because 3
## 9 not emotionally 3
## 10 not especially 3
## # ℹ 82 more rows
tidy_interview_not_words <- bigrams_separated %>%
filter(word1 == "not") %>%
inner_join(get_sentiments("afinn"), by = c(word2 = "word")) %>%
count(word2, value, sort = TRUE)
tidy_interview_not_words %>%
mutate(contribution = n * value) %>%
arrange(desc(abs(contribution))) %>%
mutate(word2 = reorder(word2, contribution)) %>%
ggplot(aes(n * value, word2, fill = n * value > 0)) +
geom_col(show.legend = FALSE) +
labs(x = "Sentiment value * number of occurrences",
y = "Words preceded by \"not\"")
negation_words <- c("not", "no", "never", "without")
bigrams_separated %>%
filter(word1 %in% negation_words) %>%
count(word1, word2, sort = TRUE)
## # A tibble: 132 × 3
## word1 word2 n
## <chr> <chr> <int>
## 1 not just 8
## 2 not close 7
## 3 not the 7
## 4 not having 4
## 5 not really 4
## 6 not to 4
## 7 no i 3
## 8 not always 3
## 9 not because 3
## 10 not emotionally 3
## # ℹ 122 more rows
tidy_interview_negation_words <- bigrams_separated %>%
filter(word1 %in% negation_words) %>%
inner_join(get_sentiments("afinn"), by = c(word2 = "word")) %>%
count(word1, word2, value, sort = TRUE)
tidy_interview_negation_words %>%
mutate(contribution = n * value) %>%
arrange(desc(abs(contribution))) %>%
mutate(word2 = reorder(word2, contribution)) %>%
ggplot(aes(n * value, word2, fill = n * value > 0)) +
geom_col(show.legend = FALSE) +
labs(x = "Sentiment value * number of occurrences",
y = "Words preceded by a negation word")
Plot a network of bigrams, weighted by occurrence frequency. Then show the difference between a directed and undirected network, for the corpus.Adjust for corpus.
bigram_graph <- interview_bigram_counts %>%
filter(n > 2) %>%
graph_from_data_frame()
set.seed(2017)
ggraph(bigram_graph, layout = "fr") +
geom_edge_link() +
geom_node_point() +
geom_node_text(aes(label = name), vjust = 1, hjust = 1)
set.seed(2020)
a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
ggraph(bigram_graph, layout = "fr") +
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
arrow = a, end_cap = circle(.07, 'inches')) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
theme_void()
other functionality to include: clean NLP package word features, lemma,
visualize metadata.
First, pre-process the data and organize proper variable names for the stm package.Note that this package does not display the stop words in the same way as tidytext. Decide lower threshold. May un-comment pdf lines to save outputs to pdf format.
set.seed(23456)
processed <- textProcessor(documents = interview_text$text, metadata = interview_df)
## Building corpus...
## Converting to Lower Case...
## Removing punctuation...
## Removing stopwords...
## Removing numbers...
## Stemming...
## Creating Output...
out <- prepDocuments(documents = processed$documents,
vocab = processed$vocab,
meta = processed$meta)
## Removing 650 of 1292 terms (650 of 5321 tokens) due to frequency
## Removing 2 Documents with No Words
## Your corpus now has 307 documents, 642 terms and 4671 tokens.
docs <- out$documents
vocab <- out$vocab
meta <- out$meta
plotRemoved(processed$documents, lower.thresh = seq(1, 200, by = 100))
# what is this plot showing? what parameters are input and what outputs are removed?
Fit the starting model, adjusting thresholds as needed. Then try running stm with a range of K values to see what number of topics works best. Shortdoc run, to see first 200 chars and confirm success
Iterate through Expectation-Maximization to select the best stm output. Adjust K for the optimal number of topics for the corpus. Plot semantic coherence and exclusivity of the sets of topics, to quantify the results.
Use metrics including FREX to characterize the models performance, words content. Find representative documents for the topic and quotes summarizing the themes of the topic.
labelTopics(modelPrevFit, c(6, 13, 18))
## Topic 6 Top Words:
## Highest Prob: less, mayb, respond, leav, say, know, don’t
## FREX: respond, leav, less, mayb, confus, respit, review
## Lift: respit, respond, review, fall, confus, convers, worker
## Score: respond, respit, leav, mayb, review, convers, new
## Topic 13 Top Words:
## Highest Prob: can, stay, didn’t, lot, one, peopl, ask
## FREX: didn’t, stay, lot, can, love, matter, “’s
## Lift: didn’t, tidi, join, parent, answer, pull, stay
## Score: didn’t, stay, tidi, answer, pull, lot, parent
## Topic 18 Top Words:
## Highest Prob: made, time, say, learn, can, becom, thought
## FREX: made, learn, thought, time, becam, expect, without
## Lift: made, appreci, charg, thought, expect, neat, becam
## Score: made, thought, becam, learn, charg, expect, appreci
thoughts3 <- findThoughts(modelPrevFit, texts = shortdoc,
n = 2, topics = 6)$docs[[1]]
thoughts20 <- findThoughts(modelPrevFit, texts = shortdoc,
n = 2, topics = 18)$docs[[1]]
# pdf("stmVignette-015.pdf")
par(mfrow = c(2, 1), mar = c(.5, .5, 1, .5))
plotQuote(thoughts3, width = 40, main = "Topic 6")
plotQuote(thoughts20, width = 40, main = "Topic 18")
# dev.off()
Adjust the stm by including other variables: covariates from metadata. Flexible, adjust for corpus.Look at the proportion of topics and expected doc composition (?). Note that time variables require different handling, for dates.
meta$speaker <- as.factor(meta$excerpt)
prep <- estimateEffect(1:20 ~ speaker, modelPrevFit,
meta = out$meta, uncertainty = "Global")
summary(prep, topics = 1)
##
## Call:
## estimateEffect(formula = 1:20 ~ speaker, stmobj = modelPrevFit,
## metadata = out$meta, uncertainty = "Global")
##
##
## Topic 1:
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.04616 0.01123 4.112 5.05e-05 ***
## speakerSW 0.01849 0.01697 1.090 0.277
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# pdf("stmVignette-017.pdf")
plot(modelPrevFit, type = "summary", xlim = c(0, .3))
# dev.off()
# pdf("stmVignette-019.pdf")
# plot(prep, "day", method = "continuous", topics = 13,
# model = z, printlegend = FALSE, xaxt = "n", xlab = "Time (2008)")
# monthseq <- seq(from = as.Date("2008-01-01"),
# to = as.Date("2008-12-01"), by = "month")
# monthnames <- months(monthseq)
# axis(1,
# at = as.numeric(monthseq) - min(as.numeric(monthseq)),
# labels = monthnames)
# dev.off()
Other plots that include stm word clouds, topic correlations, and convergence.
# pdf("stmVignette-025.pdf")
cloud(modelPrevFit, topic = 13, scale = c(2, .25))
# dev.off()
mod.out.corr <- topicCorr(modelPrevFit)
# pdf("stmVignette-027.pdf")
plot(mod.out.corr)
# dev.off()
# pdf("stmVignette-028.pdf")
plot(modelPrevFit$convergence$bound, type = "l",
ylab = "Approximate Objective",
main = "Convergence")
# dev.off()
Again using tidytext, examine word pair co-occurrence in the documents. Consider what measures and network statistics are important to describe.
tidy_interview_pairs <- tidy_interview_token %>%
pairwise_count(word, doc_id, sort = TRUE, upper = FALSE)
# network
set.seed(1234)
tidy_interview_pairs %>%
filter(n >= 10) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = n, edge_width = n), edge_colour = "cyan4") +
geom_node_point(size = 5) +
geom_node_text(aes(label = name), repel = TRUE,
point.padding = unit(0.2, "lines")) +
theme_void()
## Warning: The `trans` argument of `continuous_scale()` is deprecated as of ggplot2 3.5.0.
## ℹ Please use the `transform` argument instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Correlation between pairs of words, running the same steps.
# word correlation
tidy_interview_cors <- tidy_interview_token %>%
group_by(word) %>%
filter(n() >= 10) %>%
pairwise_cor(word, doc_id, sort = TRUE, upper = FALSE)
set.seed(1234)
tidy_interview_cors %>%
filter(correlation > .6) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation, edge_width = correlation), edge_colour = "royalblue") +
geom_node_point(size = 5) +
geom_node_text(aes(label = name), repel = TRUE,
point.padding = unit(0.2, "lines")) +
theme_void()
Use the quanteda package to create a feature co-occurrence matrix, then run essentially the same procedure to measure word co-occurrence. Use the interview line as the unit of length of text.
word_tokens <- tokens(c(interview_text$text)) %>%
tokens(remove_punct = TRUE) %>%
tokens_tolower() %>%
tokens_remove(pattern = stopwords("english"), padding = FALSE)
interview_fcm <- fcm(word_tokens, context = "document")
top_feats <- rowSums(interview_fcm) %>%
sort(decreasing = TRUE) %>%
head(25)
fcm_subset <- fcm_select(interview_fcm, pattern = names(top_feats))
set.seed(2017)
fcm_select(fcm_subset) %>%
textplot_network(min_freq = 0.5)
Again, measure and track feature co-occurrence. This time, use each doc as the unit of text length.
is this significantly different from the other network within this group of methods? how would the markers realistically be interpreted, concluded with the analysis? quantification, network, and markers bio with caregiving, descriptions, text
word_tokens <- tokens(c(interview_whole_text$text)) %>%
tokens(remove_punct = TRUE) %>%
tokens_tolower() %>%
tokens_remove(pattern = stopwords("english"), padding = FALSE)
interview_fcm <- fcm(word_tokens, context = "document")
top_feats <- rowSums(interview_fcm) %>%
sort(decreasing = TRUE) %>%
head(25)
fcm_subset <- fcm_select(interview_fcm, pattern = names(top_feats))
set.seed(2017)
fcm_select(fcm_subset) %>%
textplot_network(min_freq = 0.5)