library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidytext)
library(textdata)
library(stringr)
library(janeaustenr)
library(syuzhet)
Base code from chapter 2 of Text Mining with R that we will be working with. It builds a tidy, one-word-per-row data frame from a collection of works by Jane Austen.
tidy_books <- austen_books() %>%
group_by(book) %>%
mutate(
linenumber = row_number(),
chapter = cumsum(str_detect(text,
regex("^chapter [\\divxlc]",
ignore_case = TRUE)))) %>%
ungroup() %>%
unnest_tokens(word, text)
glimpse(tidy_books)
## Rows: 725,055
## Columns: 4
## $ book <fct> Sense & Sensibility, Sense & Sensibility, Sense & Sensibili…
## $ linenumber <int> 1, 1, 1, 3, 3, 3, 5, 10, 10, 13, 13, 13, 13, 13, 13, 13, 13…
## $ chapter <int> 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ word <chr> "sense", "and", "sensibility", "by", "jane", "austen", "181…
Silge J, Robinson D. Text Mining with R: A Tidy Approach. O'Reilly Media; August 1, 2017. Section 2.2. https://www.tidytextmining.com/
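As a quick aside (my own spot-check, not from the book): the regex above matches lines that begin with "chapter" followed by an Arabic digit or a Roman-numeral character.
str_detect(c("CHAPTER I", "Chapter 12", "the chapter ended"),
           regex("^chapter [\\divxlc]", ignore_case = TRUE))
## [1]  TRUE  TRUE FALSE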
devtools::install_github("EmilHvitfeldt/sherlock")
## Skipping install of 'sherlock' from a github remote, the SHA1 (38584034) has not changed since last install.
## Use `force = TRUE` to force installation
library(sherlock)
ls("package:sherlock")
## [1] "holmes"
tidy_holmes <- holmes %>%
group_by(book) %>%
mutate(
linenumber = row_number(),
chapter = cumsum(str_detect(text,
regex("^chapter [\\divxlc]",
ignore_case = TRUE)))) %>%
ungroup() %>%
unnest_tokens(word, text)
data(stop_words) # loads a data frame of stop words
tidy_holmes <- tidy_holmes %>%
anti_join(stop_words) %>% filter(chapter != 0) # drop text before the first chapter
## Joining with `by = join_by(word)`
head(tidy_holmes)
tidy_holmes %>% distinct(book)
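As an extra sanity check (my addition, not part of the original analysis), we can count how many words remain in each book after removing stop words and front matter:
tidy_holmes %>% count(book, sort = TRUE)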
Here I chose to compare the syuzhet sentiment scores to one of the lexicons from chapter 2 of Text Mining with R: A Tidy Approach. In this case I used the nrc lexicon and compared it to the syuzhet lexicon in the visualization that follows. The nrc lexicon associates each word with a sentiment category; I used only its positive and negative words to create a sentiment total.
I wanted to create one data frame to use for visualizing the sentiment trend throughout the story lines.
Step 1: Create separate data frames holding each lexicon's dictionary of words.
syuzhet <- as_tibble(get_sentiment_dictionary(dictionary = "syuzhet", language = "english")) # word + value columns
nrc <- get_sentiments("nrc") %>%
filter(sentiment %in% c("positive",
"negative"))
Step 2: I joined each lexicon's dictionary with the words in the tidy_holmes data frame and then combined the two. Following a procedure similar to the one in the book, I created columns with the total sentiment across 25-line sections of each book.
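(A small illustration, not from the book: %/% is integer division, so linenumber %/% 25 assigns every run of 25 consecutive line numbers to the same index.)
c(0, 24, 25, 49, 50) %/% 25
## [1] 0 0 1 1 2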
nrc_holmes <- tidy_holmes %>% inner_join(nrc, relationship = "many-to-many") %>%
count(book, index = linenumber %/% 25, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
mutate(sentiment = positive - negative, lexicon = "nrc")
## Joining with `by = join_by(word)`
syuzhet_holmes <- tidy_holmes %>% inner_join(syuzhet, relationship = "many-to-many") %>%
mutate(index = linenumber %/% 25) %>%
group_by(book, index) %>%
summarise(sentiment = sum(value)) %>%
mutate(lexicon = "syuzhet")
## Joining with `by = join_by(word)`
## `summarise()` has grouped output by 'book'. You can override using the
## `.groups` argument.
sentiment_df <- bind_rows(nrc_holmes, syuzhet_holmes) %>% select(book, index, sentiment, lexicon)
head(sentiment_df)
sentiment_df %>% filter(lexicon == "syuzhet") %>%
ggplot(aes(index, sentiment, fill = book)) +
geom_col(show.legend = FALSE) +
facet_wrap(~book, ncol = 2, scales = "free_y")
sentiment_df %>% filter(book == "The Hound of the Baskervilles") %>%
ggplot(aes(index, sentiment, fill = lexicon)) +
geom_col(show.legend = FALSE) +
facet_wrap(~lexicon, ncol = 1, scales = "free_y")
Across all the books in the Sherlock Holmes data set we see that the sentiment fluctuates from beginning to end, and most books tend to end with a neutral tone. Looking at the arc of "The Hound of the Baskervilles", the two lexicons follow a similar trend except at index 6, where the sentiment is evaluated quite differently. Further investigation reveals that the two lexicons did not identify all of the same words. This may be a result of filtering the NRC dictionary down to only its positive and negative words.
# Side-by-side listing of the words each lexicon scored in index 6.
# Note: bind_cols() requires both pieces to have the same number of rows.
bind_cols(
  tidy_holmes %>% mutate(index = linenumber %/% 25) %>%
    filter(index == 6 & book == "The Hound of the Baskervilles") %>%
    inner_join(syuzhet, relationship = "many-to-many") %>%
    select(SYUZHET_word = word, SYUZHET_value = value),
  tidy_holmes %>% mutate(index = linenumber %/% 25) %>%
    filter(index == 6 & book == "The Hound of the Baskervilles") %>%
    inner_join(nrc, relationship = "many-to-many") %>%
    # score positives +1 and negatives -1 to mirror positive - negative above
    mutate(sentiment = ifelse(sentiment == "positive", 1, -1)) %>%
    select(NRC_word = word, NRC_value = sentiment)
)
## Joining with `by = join_by(word)`
## Joining with `by = join_by(word)`
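Because bind_cols() requires both pieces to have the same number of rows, a safer variant (my sketch, not part of the original analysis) joins the two score sets on the word itself, so a word found by only one lexicon shows up with an NA instead of breaking the comparison:
section_words <- tidy_holmes %>%
  mutate(index = linenumber %/% 25) %>%
  filter(index == 6 & book == "The Hound of the Baskervilles")
full_join(
  section_words %>%
    inner_join(syuzhet, relationship = "many-to-many") %>%
    select(word, SYUZHET_value = value),
  section_words %>%
    inner_join(nrc, relationship = "many-to-many") %>%
    mutate(NRC_value = ifelse(sentiment == "positive", 1, -1)) %>%
    select(word, NRC_value),
  by = "word", relationship = "many-to-many"
)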