# Show each chunk's source code alongside its output in the knitted document.
knitr::opts_chunk$set(
  echo = TRUE
)
# Load necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.5.3
library(janeaustenr)
## Warning: package 'janeaustenr' was built under R version 4.5.3
library(gutenbergr)
## Warning: package 'gutenbergr' was built under R version 4.5.3
library(syuzhet)
## Warning: package 'syuzhet' was built under R version 4.5.3
# Make the minimal theme the session-wide ggplot2 default so that every
# figure below shares the same look.
ggplot2::theme_set(
  ggplot2::theme_minimal()
)
# Tokenise Jane Austen's novels into one word per row. Before tokenising,
# every line is tagged (separately within each book) with its line number and
# with a running chapter counter that increases on each chapter heading.
austen_tidy <- austen_books() |>
  mutate(
    linenumber = row_number(),
    chapter = cumsum(
      str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE))
    ),
    .by = book
  ) |>
  unnest_tokens(output = word, input = text)
# Calculate net sentiment using the Bing lexicon.
#
# Words are bucketed into sections of 80 source lines per book (`index`), the
# positive and negative lexicon hits are counted per section, and the net
# score is positive minus negative.
austen_sentiment <- austen_tidy %>%
  # The same word occurs on many text rows, and dplyr reported that some words
  # also match more than one lexicon row. That fan-out is expected here, so the
  # relationship is declared explicitly instead of relying on a warning.
  inner_join(
    get_sentiments("bing"),
    by = "word",
    relationship = "many-to-many"
  ) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  # values_fill = 0 keeps sections that have hits of only one polarity.
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)
# Draw one bar-chart panel per novel: net sentiment of each 80-line section
# along the course of the book. Panels get independent x axes because the
# novels differ in length.
ggplot(
  data = austen_sentiment,
  mapping = aes(x = index, y = sentiment, fill = book)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(book), ncol = 2, scales = "free_x") +
  labs(
    title = "Sentiment Trajectories in Jane Austen's Novels",
    subtitle = "Reproduced using the Bing lexicon",
    x = "Narrative Time (Index)",
    y = "Net Sentiment"
  )

# Download the raw text of Project Gutenberg book 36 (plotted below as
# 'The War of the Worlds').
wells_raw <- gutenberg_download(36)
## Using mirror https://aleph.pglaf.org.

# Tokenise the novel into one word per row with stop words removed. Before
# tokenising, each line is tagged with its line number, the book part it
# belongs to, and its chapter number within that part.
wells_tidy <- wells_raw %>%
  mutate(
    linenumber = row_number(),
    # Running count of "BOOK ONE" / "BOOK TWO" headings; lines that precede
    # the first heading keep book_part == 0.
    book_part = cumsum(
      str_detect(text, regex("^BOOK (ONE|TWO)", ignore_case = TRUE))
    )
  ) %>%
  # Count chapter headings separately inside each book part so that the
  # numbering restarts at 1 in the second book instead of continuing from the
  # first.
  group_by(book_part) %>%
  mutate(
    # A heading is a line starting with "CHAPTER" or with a Roman numeral
    # (I to XXXIX) followed by a period. The lookahead requires at least one
    # numeral character, so a bare leading "." no longer counts, and the
    # IX / IV alternatives cover the subtractive forms ("IV.", "IX.", "XIV.")
    # that the previous pattern could not match.
    chapter = cumsum(
      str_detect(
        text,
        regex(
          "^(CHAPTER|(?=[IVX])X{0,3}(IX|IV|V?I{0,3})\\.)",
          ignore_case = TRUE
        )
      )
    )
  ) %>%
  ungroup() %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word")
# Using NRC Lexicon for the extension.
#
# Net sentiment (positive minus negative NRC word counts) for every chapter of
# each book part, plus a human-readable facet label for the part.
wells_nrc <- wells_tidy %>%
  # Rows with book_part == 0 precede the first "BOOK ..." heading. They
  # belong to neither book and would otherwise fall into the "Book II" branch
  # of the label below, so they are dropped.
  filter(book_part >= 1) %>%
  # dplyr reported that words match several NRC rows, and every word also
  # occurs on many text rows, so the many-to-many relationship is declared
  # explicitly.
  inner_join(
    get_sentiments("nrc"),
    by = "word",
    relationship = "many-to-many"
  ) %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  count(book_part, chapter, sentiment) %>%
  # values_fill = 0 keeps chapters that have hits of only one polarity.
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(
    sentiment = positive - negative,
    part_label = if_else(
      book_part == 1,
      "Book I: The Coming of the Martians",
      "Book II: The Earth Under the Martians"
    )
  )
# Bar chart of net NRC sentiment per chapter, with one panel per book part.
# The x axes are independent so that each part only spans its own chapters.
ggplot(
  data = wells_nrc,
  mapping = aes(x = chapter, y = sentiment, fill = part_label)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(part_label), scales = "free_x") +
  labs(
    title = "Sentiment Arc of 'The War of the Worlds'",
    subtitle = "Detailed Chapter Analysis (NRC Lexicon)",
    x = "Chapter Number",
    y = "Net Sentiment"
  )

# Join every downloaded line into a single string so that syuzhet can split
# the text into sentences itself.
wells_text <- wells_raw[["text"]] %>%
  paste(collapse = " ")

# Score each sentence with the Bing method, giving one value per sentence.
wells_sentiment_v <- wells_text %>%
  get_sentences() %>%
  get_sentiment(method = "bing")

# Plot the sentence-level scores with syuzhet's simple_plot().
simple_plot(
  raw_values = wells_sentiment_v,
  title = "Emotional Shape of 'The War of the Worlds'",
  legend_pos = "bottom"
)
