# install.packages(c("tidyverse", "tidytext", "gutenbergr", "textdata"))
library(tidyverse)
library(tidytext)
library(gutenbergr)
austen <- gutenberg_download(1342) %>%
  mutate(line = row_number())
| gutenberg_id | text | line |
|---|---|---|
| 1342 | THERE IS AN ILLUSTRATED EDITION OF THIS TITLE WHICH MAY VIEWED AT EBOOK | 1 |
| 1342 | [# 42671 ] | 2 |
| 1342 |  | 3 |
| 1342 | cover | 4 |
| 1342 |  | 5 |
| 1342 |  | 6 |
austen_tokens <- austen %>%
  unnest_tokens(input = text, output = word, token = "words")
| gutenberg_id | line | word |
|---|---|---|
| 1342 | 1 | there |
| 1342 | 1 | is |
| 1342 | 1 | an |
| 1342 | 1 | illustrated |
| 1342 | 1 | edition |
| 1342 | 1 | of |
Try downloading and tokenizing a copy of another book from Project Gutenberg.
Save the results as `my_document` and `my_words`.
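For example, a minimal sketch assuming you pick *Frankenstein* (Gutenberg ID 84); any other book ID works the same way:
my_document <- gutenberg_download(84) %>%  # 84 is Frankenstein; swap in your own ID
  mutate(line = row_number())
my_words <- my_document %>%
  unnest_tokens(input = text, output = word, token = "words")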
counts <- austen_tokens %>%
  group_by(word) %>%
  tally() %>%
  arrange(-n)
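As an aside, `count(word, sort = TRUE)` is a one-line equivalent of this `group_by()` + `tally()` + `arrange()` chain.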
| word | n |
|---|---|
| the | 4333 |
| to | 4163 |
| of | 3612 |
| and | 3586 |
| her | 2202 |
| i | 2047 |
Next, we can use a regular expression to strip non-letter characters from each token: the pattern `[^a-zA-Z]` matches anything that is not a letter.
tokens_clean <- austen_tokens %>%
  mutate(word = str_remove_all(string = word, pattern = "[^a-zA-Z]"))
counts_clean <- tokens_clean %>%
  group_by(word) %>%
  tally() %>%
  arrange(-n)
| word | n |
|---|---|
| the | 4333 |
| to | 4164 |
| of | 3613 |
| and | 3586 |
| her | 2226 |
| i | 2069 |
Take a few minutes to find the top 10 most frequent words from your book (`my_document`).
What are they? Put a few of them in the chat.
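A sketch of one approach, applying the same regex cleaning to `my_words` from the earlier exercise:
my_counts <- my_words %>%
  mutate(word = str_remove_all(string = word, pattern = "[^a-zA-Z]")) %>%
  count(word, sort = TRUE)  # shorthand for the group_by/tally/arrange chain above
head(my_counts, 10)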
tidytext ships with a stop_words data frame that combines three stop-word lexicons (SMART, snowball, and onix).
data(stop_words)
| word | lexicon |
|---|---|
| a | SMART |
| a's | SMART |
| able | SMART |
| about | SMART |
| above | SMART |
| according | SMART |
| accordingly | SMART |
| across | SMART |
| actually | SMART |
| after | SMART |
my_stop_words <- stop_words$word
counts_filtered <- counts_clean %>%
  filter(!(word %in% my_stop_words))
| word | n |
|---|---|
| elizabeth | 596 |
| darcy | 374 |
| bennet | 295 |
| miss | 283 |
| jane | 264 |
| bingley | 258 |
| time | 203 |
| lady | 183 |
| sister | 180 |
| wickham | 162 |
We can also append our own entries: the empty string "" (left behind when the regex cleaning stripped every character from a token), plus any words we want to exclude, such as "chapter" and "darcy".
my_stop_words <- c(my_stop_words, "", "chapter", "darcy")
counts_filtered <- counts_clean %>%
  filter(!(word %in% my_stop_words))
| word | n |
|---|---|
| elizabeth | 596 |
| bennet | 295 |
| miss | 283 |
| jane | 264 |
| bingley | 258 |
| time | 203 |
| lady | 183 |
| sister | 180 |
| wickham | 162 |
| dear | 158 |
Add some custom stop words to your own list.
What are they? What new words made it into your top words after this?
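A sketch with hypothetical custom stop words ("volume", "thy"); yours will depend on your book:
my_stop_words <- c(stop_words$word, "", "chapter", "volume", "thy")  # "volume" and "thy" are made-up examples
my_counts_filtered <- my_counts %>%
  filter(!(word %in% my_stop_words))
head(my_counts_filtered, 10)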
write_csv(counts_filtered, "data/counts_filtered.csv")  # assumes a data/ folder already exists
Save your word frequencies to a CSV on your computer.
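For example (the path here is just an illustration; write_csv() will not create missing folders):
write_csv(my_counts_filtered, "my_counts_filtered.csv")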
Basic sentiment analysis (ignoring negating words like "not good"; we simply score each word on its own).
Let's begin by loading a sentiment dictionary. The AFINN lexicon assigns each word an integer score from -5 (most negative) to +5 (most positive); fetching it the first time requires the textdata package we installed earlier.
sentiments <- get_sentiments("afinn")
| word | value |
|---|---|
| abandon | -2 |
| abandoned | -2 |
| abandons | -2 |
| abducted | -2 |
| abduction | -2 |
| abductions | -2 |
| abhor | -3 |
| abhorred | -3 |
| abhorrent | -3 |
| abhors | -3 |
Let's collapse the whole book into a single string, break that string into sentences, and then break each sentence into words.
austen_text <- tibble(text = paste(austen$text, collapse = " "))
austen_sentences <- austen_text %>%
  unnest_tokens(input = text, output = sentence, token = "sentences") %>%
  mutate(line = row_number())
sentence_words <- austen_sentences %>%
  unnest_tokens(input = sentence, output = word, token = "words")
Now that we have a sentence-by-sentence, word-by-word breakdown, we can assign average sentiment to each sentence.
word_sentiments <- sentence_words %>%
  inner_join(sentiments, by = "word")
sentence_sentiments <- word_sentiments %>%
  group_by(line) %>%
  summarize(mean_sentiment = mean(value, na.rm = TRUE))
ggplot(sentence_sentiments, aes(x = line, y = mean_sentiment)) +
  geom_col() +
  geom_smooth()
Plot the sentiment as it changes throughout your book.
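A sketch that reruns the same pipeline on `my_document` from the first exercise:
my_text <- tibble(text = paste(my_document$text, collapse = " "))
my_sentence_words <- my_text %>%
  unnest_tokens(input = text, output = sentence, token = "sentences") %>%
  mutate(line = row_number()) %>%  # line = sentence number
  unnest_tokens(input = sentence, output = word, token = "words")
my_sentence_sentiments <- my_sentence_words %>%
  inner_join(sentiments, by = "word") %>%
  group_by(line) %>%
  summarize(mean_sentiment = mean(value))
ggplot(my_sentence_sentiments, aes(x = line, y = mean_sentiment)) +
  geom_col() +
  geom_smooth()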
We can use `%/%` (integer division) to break the book into 50-sentence sections: `line %/% 50` maps sentence 120, for example, to section 2, since `120 %/% 50` is `2`.
section_sentiments <- word_sentiments %>%
  mutate(section = line %/% 50) %>%
  group_by(section) %>%
  summarize(mean_sentiment = mean(value, na.rm = TRUE))
ggplot(section_sentiments, aes(x = section, y = mean_sentiment)) +
  geom_col() +
  geom_smooth()
This tutorial borrows very heavily from *Text Mining with R*:
Silge, Julia, and David Robinson. *Text Mining with R: A Tidy Approach*. 1st edition. Beijing; Boston: O'Reilly Media, 2017. https://www.tidytextmining.com/index.html.