library(tidyverse)
library(ggplot2)
library(tidytext)
library(textdata)
library(janeaustenr)
library(gutenbergr)
library(SentimentAnalysis)
The objective was to re-create the code supplied in Chapter 2 of Text Mining with R and then extend the exercise with a new corpus and lexicon. Re-creating the Chapter 2 analysis went smoothly, thanks to the thorough steps provided in the textbook. Extending the analysis raised some issues with the formatting of the new texts, particularly finding the right regex to partition the new corpus into chapters; a quick sanity check of that regex is sketched just after the tokenization step below. Finding an additional lexicon was straightforward, as the SentimentAnalysis R package ships with several dictionaries, of which we used the Harvard-IV (General Inquirer) dictionary.
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(linenumber = row_number(),
         chapter = cumsum(str_detect(text,
                                     regex("^chapter [\\divxlc]",
                                           ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)
head(tidy_books)
## # A tibble: 6 x 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Sense & Sensibility 1 0 sense
## 2 Sense & Sensibility 1 0 and
## 3 Sense & Sensibility 1 0 sensibility
## 4 Sense & Sensibility 3 0 by
## 5 Sense & Sensibility 3 0 jane
## 6 Sense & Sensibility 3 0 austen
# Download Ethics by Spinoza from gutenbergr
ethics_raw <- gutenberg_download(3800)
ethics <- as_tibble(ethics_raw)
ethics <- ethics %>% add_column(book = "Ethics")
# Download Theologico-Political by Spinoza from gutenbergr
theologico_prt1_raw <- gutenberg_download(989)
theologico_prt1 <- as_tibble(theologico_prt1_raw)
theologico_prt1 <- theologico_prt1 %>% add_column(book = "Theologico-Political Part 1")
# Concat Spinoza books into one dataframe
spinoza_books_raw <- bind_rows(ethics, theologico_prt1)
# Tokenize the text so that each word is its own row
spinoza_books <- spinoza_books_raw %>%
  group_by(book) %>%
  mutate(linenumber = row_number(),
         # Match "PART", "Chapter", or "CHAPTER " followed by a
         # digit/Roman-numeral character to count section headings
         chapter = cumsum(str_detect(text,
                                     regex("PART|Chapter|CHAPTER [\\dIVXLC]")))) %>%
  ungroup() %>%
  unnest_tokens(word, text)
tail(spinoza_books)
## # A tibble: 6 x 5
## gutenberg_id book linenumber chapter word
## <int> <chr> <int> <int> <chr>
## 1 989 Theologico-Political Part 1 2853 5 end
## 2 989 Theologico-Political Part 1 2853 5 of
## 3 989 Theologico-Political Part 1 2853 5 endnotes
## 4 989 Theologico-Political Part 1 2853 5 to
## 5 989 Theologico-Political Part 1 2853 5 part
## 6 989 Theologico-Political Part 1 2853 5 i
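As a quick sanity check on the chapter regex mentioned above, it can be tested against a few representative heading lines before trusting the cumulative chapter counts. The sample strings below are illustrative stand-ins rather than lines copied verbatim from the Gutenberg files:
# Illustrative heading styles similar to those in the two Spinoza texts
sample_lines <- c("PART I. CONCERNING GOD",
                  "CHAPTER I",
                  "Chapter XI",
                  "An ordinary line of prose")
# The same pattern used in the tokenization step above
str_detect(sample_lines, regex("PART|Chapter|CHAPTER [\\dIVXLC]"))
# Expect TRUE for the three headings and FALSE for the prose line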
nrc_joy <- get_sentiments("nrc") %>%
  filter(sentiment == "joy")
tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## # A tibble: 303 x 2
## word n
## <chr> <int>
## 1 good 359
## 2 young 192
## 3 friend 166
## 4 hope 143
## 5 happy 125
## 6 love 117
## 7 deal 92
## 8 found 92
## 9 present 89
## 10 kind 82
## # … with 293 more rows
# Filter for words with positive sentiment
# Note: DictionaryGI is Harvard-IV dictionary from SentimentAnalysis
gi_positive <- as_tibble(DictionaryGI$positive)
gi_positive <- gi_positive %>% rename(word = value)
# Filter for words with negative sentiment (DictionaryGI is from SentimentAnalysis)
gi_negative <- as_tibble(DictionaryGI$negative)
gi_negative <- gi_negative %>% rename(word = value)
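Before joining, it can help to peek at the dictionary object itself. DictionaryGI should be a plain list holding character vectors of positive and negative terms; the exact word counts depend on the installed SentimentAnalysis version:
# Inspect the structure of the Harvard-IV (General Inquirer) dictionary
str(DictionaryGI, list.len = 2)
lengths(DictionaryGI[c("positive", "negative")])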
# Inner join words from Ethics and positive words from Harvard-IV lexicon
spinoza_positive <- spinoza_books %>%
  filter(book == "Ethics") %>%
  inner_join(gi_positive) %>%
  count(word, sort = TRUE)
head(spinoza_positive)
## # A tibble: 6 x 2
## word n
## <chr> <int>
## 1 mind 580
## 2 human 294
## 3 love 251
## 4 pleasure 213
## 5 knowledge 208
## 6 good 181
jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)
head(jane_austen_sentiment)
## # A tibble: 6 x 5
## book index negative positive sentiment
## <fct> <dbl> <int> <int> <int>
## 1 Sense & Sensibility 0 16 32 16
## 2 Sense & Sensibility 1 19 53 34
## 3 Sense & Sensibility 2 12 31 19
## 4 Sense & Sensibility 3 15 31 16
## 5 Sense & Sensibility 4 16 34 18
## 6 Sense & Sensibility 5 16 51 35
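The index column comes from integer division: linenumber %/% 80 maps every line to an 80-line block, so each bar in the plots below summarizes 80 lines of text. A minimal illustration:
# Integer division buckets line numbers into 80-line sections
c(1, 79, 80, 159, 160) %/% 80
## [1] 0 0 1 1 2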
# Add column indicating sentiment (needed in long format)
gi_positive_df <- gi_positive %>% add_column(sentiment = "positive")
gi_negative_df <- gi_negative %>% add_column(sentiment = "negative")
# Concat positive and negative words into one dataframe
gi_dict <- bind_rows(gi_positive_df, gi_negative_df)
# Evaluate sentiment 80 lines at a time
spinoza_sentiment <- spinoza_books %>%
  inner_join(gi_dict) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)
head(spinoza_sentiment)
## # A tibble: 6 x 5
## book index negative positive sentiment
## <chr> <dbl> <int> <int> <int>
## 1 Ethics 0 8 11 3
## 2 Ethics 1 9 18 9
## 3 Ethics 2 19 47 28
## 4 Ethics 3 12 23 11
## 5 Ethics 4 23 26 3
## 6 Ethics 5 13 14 1
ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")
# View net sentiment over the course of the book
ggplot(spinoza_sentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")
pride_prejudice <- tidy_books %>%
  filter(book == "Pride & Prejudice")
afinn <- pride_prejudice %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(index = linenumber %/% 80) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")
bing_and_nrc <- bind_rows(
  pride_prejudice %>%
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  pride_prejudice %>%
    inner_join(get_sentiments("nrc") %>%
                 filter(sentiment %in% c("positive",
                                         "negative"))
    ) %>%
    mutate(method = "NRC")) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment,
              values_from = n,
              values_fill = 0) %>%
  mutate(sentiment = positive - negative)
bind_rows(afinn,
          bing_and_nrc) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")
ethics_book <- spinoza_books %>%
  filter(book == "Ethics")
afinn <- ethics_book %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(index = linenumber %/% 80) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")
bing_and_gi <- bind_rows(
  ethics_book %>%
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  ethics_book %>%
    inner_join(gi_dict %>%
                 filter(sentiment %in% c("positive",
                                         "negative"))
    ) %>%
    mutate(method = "Harvard-IV")) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment,
              values_from = n,
              values_fill = 0) %>%
  mutate(sentiment = positive - negative)
bind_rows(afinn,
          bing_and_gi) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")
bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment",
       y = NULL)
# Label the sentiment of each word
gi_word_counts <- spinoza_books %>%
  inner_join(gi_dict) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
# Plot top negative and positive words by count
gi_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment",
       y = NULL)
custom_stop_words_ja <- bind_rows(tibble(word = c("miss"),
                                         lexicon = c("custom")),
                                  stop_words)
custom_stop_words_ja
## # A tibble: 1,150 x 2
## word lexicon
## <chr> <chr>
## 1 miss custom
## 2 a SMART
## 3 a's SMART
## 4 able SMART
## 5 about SMART
## 6 above SMART
## 7 according SMART
## 8 accordingly SMART
## 9 across SMART
## 10 actually SMART
## # … with 1,140 more rows
custom_stop_words_sp <- bind_rows(tibble(word = c("mind", "order"),
                                         lexicon = c("custom", "custom")),
                                  stop_words)
custom_stop_words_sp
## # A tibble: 1,151 x 2
## word lexicon
## <chr> <chr>
## 1 mind custom
## 2 order custom
## 3 a SMART
## 4 a's SMART
## 5 able SMART
## 6 about SMART
## 7 above SMART
## 8 according SMART
## 9 accordingly SMART
## 10 across SMART
## # … with 1,141 more rows
p_and_p_sentences <- tibble(text = prideprejudice) %>%
  unnest_tokens(sentence, text, token = "sentences")
austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(chapter, text, token = "regex",
                pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
  ungroup()
austen_chapters %>%
  group_by(book) %>%
  summarise(chapters = n())
## # A tibble: 6 x 2
## book chapters
## * <fct> <int>
## 1 Sense & Sensibility 51
## 2 Pride & Prejudice 62
## 3 Mansfield Park 49
## 4 Emma 56
## 5 Northanger Abbey 32
## 6 Persuasion 25
ethics_text <- spinoza_books_raw %>%
  filter(book == "Ethics")
# Tokenize chapters with regex
spinoza_chapters <- spinoza_books_raw %>%
  group_by(book) %>%
  unnest_tokens(chapter, text, token = "regex",
                pattern = "PART|Chapter|CHAPTER [\\dIVXLC]") %>%
  ungroup()
# Confirm chapters were partitioned correctly
spinoza_chapters %>%
  group_by(book) %>%
  summarise(chapters = n())
## # A tibble: 2 x 2
## book chapters
## * <chr> <int>
## 1 Ethics 5
## 2 Theologico-Political Part 1 6
bingnegative <- get_sentiments("bing") %>%
  filter(sentiment == "negative")
wordcounts <- tidy_books %>%
  group_by(book, chapter) %>%
  summarize(words = n())
tidy_books %>%
  semi_join(bingnegative) %>%
  group_by(book, chapter) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts, by = c("book", "chapter")) %>%
  mutate(ratio = negativewords/words) %>%
  filter(chapter != 0) %>%
  slice_max(ratio, n = 1) %>%
  ungroup()
## # A tibble: 6 x 5
## book chapter negativewords words ratio
## <fct> <int> <int> <int> <dbl>
## 1 Sense & Sensibility 43 161 3405 0.0473
## 2 Pride & Prejudice 34 111 2104 0.0528
## 3 Mansfield Park 46 173 3685 0.0469
## 4 Emma 15 151 3340 0.0452
## 5 Northanger Abbey 21 149 2982 0.0500
## 6 Persuasion 4 62 1807 0.0343
# Positive words from the Harvard-IV dictionary
gi_positive_dict <- gi_dict %>%
  filter(sentiment == "positive")
# Word count of each chapter
wordcounts_sp <- spinoza_books %>%
  group_by(book, chapter) %>%
  summarize(words = n())
# Highest positive word ratio of each book
spinoza_books %>%
  semi_join(gi_positive_dict) %>%
  group_by(book, chapter) %>%
  summarize(positivewords = n()) %>%
  left_join(wordcounts_sp, by = c("book", "chapter")) %>%
  mutate(ratio = positivewords/words) %>%
  slice_max(ratio, n = 1) %>%
  ungroup()
## # A tibble: 2 x 5
## book chapter positivewords words ratio
## <chr> <int> <int> <int> <dbl>
## 1 Ethics 4 770 10915 0.0705
## 2 Theologico-Political Part 1 2 47 415 0.113
Again, the meticulous documentation in Text Mining with R made re-creating the primary code seamless. There are two main takeaways from extending the sentiment analysis. First, the task once again confirms that the majority of a data scientist's time goes to formatting data. Second, these analyses need customization: many of the words the lexicons score as positive or negative are questionable in the context of this text. Anyone building sentiment analysis into a production application will need to review the dictionaries being used thoroughly, and should consider a domain-specific lexicon when one is available. While every lexicon used in this analysis had weak points, AFINN seems the strongest because it offers graded degrees of positive and negative sentiment rather than a binary label.
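As a small illustration of that kind of review (a rough sketch assuming the objects built above are still in memory, with a candidate word list that is illustrative rather than exhaustive), one can check which Harvard-IV "positive" words are really just Spinoza's technical vocabulary, and look at how AFINN grades intensity:
# Several of Spinoza's terms of art are scored as positive by Harvard-IV
philosophy_terms <- c("mind", "human", "knowledge")
gi_positive %>% filter(word %in% philosophy_terms)
# AFINN assigns integer scores from -5 to +5, preserving intensity
get_sentiments("afinn") %>% count(value)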
Julia Silge and David Robinson (2017). Text Mining with R: A Tidy Approach, Chapter 2. O'Reilly Media.
David Robinson (2020). gutenbergr: Search and download public domain texts from Project Gutenberg. R package version 0.2.0.
Stefan Feuerriegel and Nicolas Proellochs (2021). SentimentAnalysis: a powerful toolchain facilitating the sentiment analysis of textual contents in R. R package version 1.3-4.
This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.