knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.5.3
library(janeaustenr)
## Warning: package 'janeaustenr' was built under R version 4.5.3
library(stringr)
library(textdata)
## Warning: package 'textdata' was built under R version 4.5.3
### Step 1: Reproduce the Base Example
# In Chapter 2 of "Text Mining with R", the authors track how sentiment rises and falls over the course of a book. We start by replicating that basic workflow using the Bing lexicon.
#Source: Silge & Robinson, Text Mining with R
# Build a one-word-per-row table for Pride & Prejudice. Each line of the novel
# is tagged with its line number and a running chapter counter before it is
# tokenised, so every word keeps the position of the line it came from.
tidy_books <- austen_books() %>%
  filter(book == "Pride & Prejudice") %>%
  mutate(linenumber = seq_len(n())) %>%
  mutate(
    # A line that starts with "chapter" followed by a digit or roman-numeral
    # letter marks a chapter heading; cumsum() turns those hits into a counter.
    chapter = cumsum(
      str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE))
    )
  ) %>%
  unnest_tokens(output = word, input = text)
# Net Bing sentiment per 80-line section of the novel.
#   index     - section number (linenumber %/% 80, so sections start at 0)
#   positive  - number of positive-lexicon words in the section
#   negative  - number of negative-lexicon words in the section
#   sentiment - positive minus negative
# The join key is spelled out so the join cannot silently widen to other
# shared columns and does not emit a "Joining with" message.
pride_prejudice_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(index = linenumber %/% 80, sentiment) %>%
  # values_fill = 0 covers sections in which one polarity never occurs.
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)
# Bar chart of the net Bing sentiment score for each section of the novel,
# in reading order.
pride_prejudice_sentiment %>%
  ggplot(mapping = aes(x = index, y = sentiment)) +
  geom_col(show.legend = FALSE, fill = "steelblue") +
  labs(
    x = "Trajectory",
    y = "Sentiment Score",
    title = "Sentiment through Pride & Prejudice (Bing Lexicon)"
  )

### Step 2: Extension of the Analysis
# For the extension, I use a different sentiment lexicon: Loughran-McDonald. It was originally developed for financial documents, and instead of a single positive/negative split it assigns words to categories such as "litigious", "uncertainty", and "constraining" (alongside "positive" and "negative").
# Load the Loughran-McDonald lexicon (columns: word, sentiment).
loughran <- get_sentiments("loughran")

# Count how often each (word, category) pair occurs in Pride & Prejudice,
# most frequent first.
# A lexicon word can belong to more than one category, and the same word
# occurs on many rows of tidy_books, so this join is many-to-many by design.
# Declaring the key and the relationship documents that intent and stops dplyr
# from warning about an "unexpected" many-to-many match.
loughran_analysis <- tidy_books %>%
  inner_join(loughran, by = "word", relationship = "many-to-many") %>%
  count(word, sentiment, sort = TRUE)
# Compare how strongly each Loughran-McDonald category is represented.
# loughran_analysis has one row per (word, category), so it is first collapsed
# to one total per category. Plotting the per-word rows directly made
# geom_col() stack them while reorder() sorted categories by their *mean*
# per-word count, so the bar order did not match the bar lengths.
loughran_analysis %>%
  # Only words that occur more than five times contribute to the totals.
  filter(n > 5) %>%
  count(sentiment, wt = n, name = "n") %>%
  ggplot(aes(x = reorder(sentiment, n), y = n)) +
  geom_col(fill = "darkred") +
  coord_flip() +
  labs(title = "Loughran-McDonald Categories in Pride & Prejudice",
       x = "Emotion Category", y = "Word Count")

### Analysis of Differences
# The original example from the book uses the Bing lexicon, which provides a simple binary classification (positive vs. negative). This is useful for seeing the general "mood" of each 80-line section of the text.
# In contrast, the Loughran-McDonald lexicon (used in the extension) reveals more nuanced categories. Even in a 19th-century novel, it identified high levels of "uncertainty" and "litigious" language, which likely reflects the complex social negotiations and marriage contracts discussed in the plot. While Bing shows whether a passage is positive or negative overall, Loughran-McDonald helps explain the nature of the tension.