# Session setup: knitr chunk defaults, package attaches, and the global
# ggplot2 theme. Lines starting with `##` are console output captured when the
# document was knitted; they are kept for reference only.

# Make `echo = TRUE` the default option for every knitr chunk.
knitr::opts_chunk$set(echo = TRUE)


# Load necessary libraries
# tidyverse supplies the dplyr/tidyr/stringr/ggplot2 functions and the %>% pipe
# used throughout this script.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# tidytext: unnest_tokens(), get_sentiments(), and the stop_words table.
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.5.3
# janeaustenr: austen_books().
library(janeaustenr)
## Warning: package 'janeaustenr' was built under R version 4.5.3
# gutenbergr: gutenberg_download().
library(gutenbergr)
## Warning: package 'gutenbergr' was built under R version 4.5.3
# syuzhet: get_sentences(), get_sentiment(), and simple_plot().
library(syuzhet)
## Warning: package 'syuzhet' was built under R version 4.5.3
# Set a clean theme for all plots
theme_set(theme_minimal())

# Tokenize Jane Austen's novels into one word per row.
# Before tokenizing, every line of text is tagged with its position inside
# its own book (`linenumber`) and with a running chapter counter that
# increases each time a line starts with a "Chapter <number/numeral>" heading.
austen_tidy <- austen_books() |>
  mutate(
    linenumber = row_number(),
    chapter = cumsum(
      str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE))
    ),
    # Per-operation grouping: counters restart for each book and the result
    # comes back ungrouped.
    .by = book
  ) |>
  unnest_tokens(output = word, input = text)

# Calculate net sentiment using the Bing lexicon.
# Each novel is cut into sections of 80 lines (`index = linenumber %/% 80`);
# for every section the number of positive and negative words is counted and
# `sentiment` is their difference (positive - negative).
austen_sentiment <- austen_tidy %>%
  # Every word occurs many times in the novels, and some words appear more
  # than once in the Bing lexicon, so this join is many-to-many by design.
  # Declaring the relationship leaves the result unchanged and avoids dplyr's
  # "unexpected many-to-many relationship" warning.
  inner_join(
    get_sentiments("bing"),
    by = "word",
    relationship = "many-to-many"
  ) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  # One row per (book, index); a section lacking one polarity gets a 0 count.
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)
# Plot the net-sentiment trajectory of each novel: one panel per book, one bar
# per 80-line section. The x axis is freed per panel because the novels differ
# in length; the legend is dropped because the panel titles already name the
# books.
ggplot(
  data = austen_sentiment,
  mapping = aes(x = index, y = sentiment, fill = book)
) +
  facet_wrap(facets = ~book, ncol = 2, scales = "free_x") +
  geom_col(show.legend = FALSE) +
  labs(
    x = "Narrative Time (Index)",
    y = "Net Sentiment",
    title = "Sentiment Trajectories in Jane Austen's Novels",
    subtitle = "Reproduced using the Bing lexicon"
  )

# Download Project Gutenberg text 36 (the raw lines analysed below) and
# tokenize it into one word per row.

wells_raw <- gutenberg_download(36)
## Using mirror https://aleph.pglaf.org.

# Stop with a clear message if nothing was downloaded, instead of letting the
# later steps fail on an empty table.
if (is.null(wells_raw) || nrow(wells_raw) == 0) {
  stop(
    "gutenberg_download(36) returned no text; check the network or mirror.",
    call. = FALSE
  )
}

# Columns added before tokenizing:
#   linenumber - position of the line in the downloaded text
#   book_part  - running count of lines starting with "BOOK ONE"/"BOOK TWO"
#                (0 for anything before the first such heading)
#   chapter    - running count of chapter headings over the whole text (it is
#                not reset at the start of each book part)
# NOTE(review): if the text contains a table of contents whose entries start at
# the beginning of a line, those entries are counted as headings too - confirm
# against the downloaded text.
wells_tidy <- wells_raw %>%
  mutate(
    linenumber = row_number(),
    # Detect Book I and Book II
    book_part = cumsum(
      str_detect(text, regex("^BOOK (ONE|TWO)", ignore_case = TRUE))
    ),
    # A chapter heading is a line starting with "CHAPTER" or with a Roman
    # numeral (1-39) followed by a period, e.g. "I.", "IV.", "IX.", "XIV.".
    #   (?=[IVX])  requires at least one numeral character, so a bare "."
    #              at the start of a line is not a heading
    #   (IX|IV|..) covers the subtractive forms IV and IX, which the earlier
    #              pattern could not match
    #   (?!\\w)    rejects abbreviations such as "i.e." under ignore_case
    chapter = cumsum(
      str_detect(
        text,
        regex(
          "^(CHAPTER|(?=[IVX])X{0,3}(IX|IV|V?I{0,3})\\.(?!\\w))",
          ignore_case = TRUE
        )
      )
    )
  ) %>%
  unnest_tokens(word, text) %>%
  # Drop common function words so they do not take part in the sentiment join.
  anti_join(stop_words, by = "word")

# Using NRC Lexicon for the extension.
# Net sentiment (positive - negative word count) per chapter, labelled with
# the book part the chapter belongs to.
wells_nrc <- wells_tidy %>%
  # Rows before the first "BOOK ..." heading have book_part == 0. They belong
  # to neither part and would otherwise fall into the "Book II" branch of the
  # ifelse() below, so they are excluded.
  filter(book_part >= 1) %>%
  # NRC lists a word once per category it belongs to and every word occurs
  # many times in the text, so the join is many-to-many by design. Declaring
  # it avoids dplyr's "unexpected many-to-many relationship" warning.
  inner_join(
    get_sentiments("nrc"),
    by = "word",
    relationship = "many-to-many"
  ) %>%
  # Keep only the two polarity categories; the other NRC categories are unused.
  filter(sentiment %in% c("positive", "negative")) %>%
  count(book_part, chapter, sentiment) %>%
  # One row per (book_part, chapter); a missing polarity becomes a 0 count.
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(
    sentiment = positive - negative,
    part_label = ifelse(
      book_part == 1,
      "Book I: The Coming of the Martians",
      "Book II: The Earth Under the Martians"
    )
  )
# Bar chart of net NRC sentiment per chapter, one panel per book part.
# The x axis is freed per panel so each part only spans its own chapters.
ggplot(
  data = wells_nrc,
  mapping = aes(x = chapter, y = sentiment, fill = part_label)
) +
  facet_wrap(facets = ~part_label, scales = "free_x") +
  geom_col(show.legend = FALSE) +
  labs(
    x = "Chapter Number",
    y = "Net Sentiment",
    title = "Sentiment Arc of 'The War of the Worlds'",
    subtitle = "Detailed Chapter Analysis (NRC Lexicon)"
  )

# Sentence-level sentiment with syuzhet: join all downloaded lines into one
# string, split it into sentences, and score each sentence with the "bing"
# method.
wells_text <- paste(wells_raw[["text"]], collapse = " ")
wells_sentiment_v <- wells_text |>
  get_sentences() |>
  get_sentiment(method = "bing")

# Draw syuzhet's summary plot of the sentence scores.
simple_plot(
  wells_sentiment_v,
  legend_pos = "bottom",
  title = "Emotional Shape of 'The War of the Worlds'"
)