Part 1 consists of items 2-15, which are based on example code from Chapter 2 of Text Mining with R: A Tidy Approach. See the reference citation below.
Silge, J., & Robinson, D. (2017). Sentiment analysis with tidy data. In Text Mining with R: A Tidy Approach (Chapter 2). O’Reilly Media. https://www.tidytextmining.com/sentiment.html
library(tidytext)
library(tidyverse)
library(ggplot2)
# Preview the AFINN lexicon: one row per word with a numeric `value` score.
get_sentiments("afinn")
## # A tibble: 2,477 × 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # ℹ 2,467 more rows
# Preview the Bing lexicon: one row per word with a `sentiment` label.
get_sentiments("bing")
## # A tibble: 6,786 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # ℹ 6,776 more rows
# Preview the NRC lexicon: a word can appear on several rows, one per
# associated `sentiment` category (e.g. "abandon" -> fear, negative, sadness).
get_sentiments("nrc")
## # A tibble: 13,872 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # ℹ 13,862 more rows
library(janeaustenr)
library(dplyr)
library(stringr)
# Tokenise the Austen novels into one word per row, remembering which line
# and chapter of its book each word came from.
tidy_books <- austen_books() %>%
  mutate(
    # Line position within each book (restarts at 1 for every novel).
    linenumber = row_number(),
    # Running count of chapter headings seen so far; 0 precedes chapter 1.
    chapter = cumsum(
      str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE))
    ),
    .by = book
  ) %>%
  unnest_tokens(output = word, input = text)
# NRC entries tagged with the "joy" emotion.
nrc_joy <- filter(get_sentiments("nrc"), sentiment == "joy")

# Most frequent joy words in "Emma".
filter(tidy_books, book == "Emma") %>%
  inner_join(nrc_joy, relationship = "many-to-many") %>%
  count(word, sort = TRUE)
## # A tibble: 301 × 2
##    word          n
##    <chr>     <int>
##  1 good        359
##  2 friend      166
##  3 hope        143
##  4 happy       125
##  5 love        117
##  6 deal         92
##  7 found        92
##  8 present      89
##  9 kind         82
## 10 happiness    76
## # ℹ 291 more rows
library(tidyr)
# Net Bing sentiment (positive minus negative word count) for every
# 80-line section of each novel.
jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing"), relationship = "many-to-many") %>%
  mutate(index = linenumber %/% 80) %>%
  count(book, index, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)

# One panel per novel; each bar is one 80-line section.
ggplot(
  jane_austen_sentiment,
  aes(x = index, y = sentiment, fill = book)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(book), ncol = 2, scales = "free_x")
First, use filter() to choose only the words from the one novel we are interested in.
# Keep only the tokens that belong to "Pride & Prejudice", then print them.
pride_prejudice <- filter(tidy_books, book == "Pride & Prejudice")

pride_prejudice
## # A tibble: 122,204 × 4
##    book              linenumber chapter word     
##    <fct>                  <int>   <int> <chr>    
##  1 Pride & Prejudice          1       0 pride    
##  2 Pride & Prejudice          1       0 and      
##  3 Pride & Prejudice          1       0 prejudice
##  4 Pride & Prejudice          3       0 by       
##  5 Pride & Prejudice          3       0 jane     
##  6 Pride & Prejudice          3       0 austen   
##  7 Pride & Prejudice          7       1 chapter  
##  8 Pride & Prejudice          7       1 1        
##  9 Pride & Prejudice         10       1 it       
## 10 Pride & Prejudice         10       1 is       
## # ℹ 122,194 more rows
# AFINN: sum the numeric word scores within each 80-line section.
afinn <- pride_prejudice %>%
  inner_join(get_sentiments("afinn"), relationship = "many-to-many") %>%
  mutate(index = linenumber %/% 80) %>%
  group_by(index) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")
# Bing and NRC both label words positive/negative, so they share one
# pipeline: tag each match with its lexicon, count labels per 80-line
# section, then take positive minus negative.
bing_and_nrc <- bind_rows(
  "Bing et al." = inner_join(
    pride_prejudice,
    get_sentiments("bing"),
    relationship = "many-to-many"
  ),
  # NRC also carries emotion categories; keep only its polarity labels.
  "NRC" = inner_join(
    pride_prejudice,
    filter(get_sentiments("nrc"), sentiment %in% c("positive", "negative")),
    relationship = "many-to-many"
  ),
  .id = "method"
) %>%
  mutate(index = linenumber %/% 80) %>%
  count(method, index, sentiment) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) %>%
  mutate(sentiment = positive - negative)
# Stack the three lexicon results and draw one panel per method.
bind_rows(afinn, bing_and_nrc) %>%
  ggplot(aes(x = index, y = sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(method), ncol = 1, scales = "free_y")
# How many NRC words are labelled positive vs. negative?
get_sentiments("nrc") %>%
  filter(sentiment == "positive" | sentiment == "negative") %>%
  count(sentiment)
## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   3316
## 2 positive   2308
# The same tally for the Bing lexicon.
count(get_sentiments("bing"), sentiment)
## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005
# Frequency of every Bing-labelled word across all the novels,
# most common first.
bing_word_counts <- inner_join(
  tidy_books,
  get_sentiments("bing"),
  relationship = "many-to-many"
) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()

bing_word_counts
## # A tibble: 2,585 × 3
##    word     sentiment     n
##    <chr>    <chr>     <int>
##  1 miss     negative   1855
##  2 well     positive   1523
##  3 good     positive   1380
##  4 great    positive    981
##  5 like     positive    725
##  6 better   positive    639
##  7 enough   positive    613
##  8 happy    positive    534
##  9 love     positive    495
## 10 pleasure positive    462
## # ℹ 2,575 more rows
# Ten most frequent words per sentiment label, one panel per label.
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 10) %>%
  ungroup() %>%
  # Order the word factor by count so bars are sorted within the plot.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = n, y = word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(sentiment), scales = "free_y") +
  labs(
    x = "Contribution to sentiment",
    y = NULL
  )
library(wordcloud)
# Word cloud of the 100 most frequent non-stop-words across the novels.
tidy_books %>%
  # Name the join key explicitly: this avoids the "Joining with `by = ...`"
  # message and keeps the join from silently widening if either table ever
  # gains another shared column name.
  anti_join(stop_words, by = "word") %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
library(reshape2)
# Comparison cloud: reshape the counts into a word-by-sentiment matrix
# (missing combinations filled with 0), which comparison.cloud() consumes.
tidy_books %>%
  inner_join(get_sentiments("bing"), relationship = "many-to-many") %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(formula = word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    colors = c("gray20", "gray80"),
    max.words = 100
  )
# Split each novel into chapter-sized tokens using a regex delimiter.
# NOTE(review): `|` binds loosest, so the pattern reads as
# "Chapter" OR "CHAPTER <digit/roman-numeral char>"; the character class
# applies only to the upper-case alternative. Confirm that is intended.
austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(
    output = chapter,
    input = text,
    token = "regex",
    pattern = "Chapter|CHAPTER [\\dIVXLC]"
  ) %>%
  ungroup()

# Number of chapter tokens per book.
count(austen_chapters, book, name = "chapters")
## # A tibble: 6 × 2
##   book                chapters
##   <fct>                  <int>
## 1 Sense & Sensibility       51
## 2 Pride & Prejudice         62
## 3 Mansfield Park            49
## 4 Emma                      56
## 5 Northanger Abbey          32
## 6 Persuasion                25
library(tidytext)
library(gutenbergr)
library(dplyr)
library(stringr)
# Fetch Pride and Prejudice from Project Gutenberg (ID 1342).
# NOTE: this reuses the name `pride_prejudice`, replacing the tokenised
# janeaustenr subset assigned earlier in the script.
pride_prejudice <- gutenberg_download(gutenberg_id = 1342)

# Number every line, tag it with a running chapter count (0 until the first
# chapter heading), then split the text into one word per row.
tidy_pp <- pride_prejudice %>%
  mutate(linenumber = row_number()) %>%
  mutate(
    chapter = cumsum(
      str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE))
    )
  ) %>%
  unnest_tokens(output = word, input = text)
# Joy in Pride and Prejudice: the 20 most frequent NRC "joy" words.
nrc_joy <- filter(get_sentiments("nrc"), sentiment == "joy")

tidy_pp %>%
  inner_join(nrc_joy, relationship = "many-to-many") %>%
  count(word, sort = TRUE) %>%
  slice_head(n = 20)
## # A tibble: 20 × 2
##    word          n
##    <chr>     <int>
##  1 good        208
##  2 hope        125
##  3 mother      112
##  4 friend      107
##  5 love        102
##  6 happy        83
##  7 daughter     77
##  8 happiness    72
##  9 kind         71
## 10 present      71
## 11 found        68
## 12 marriage     67
## 13 affection    61
## 14 pride        55
## 15 marry        46
## 16 engaged      40
## 17 fortune      39
## 18 pleased      39
## 19 spirits      39
## 20 feeling      37
# Net Bing sentiment (positive minus negative word count) per chapter and
# 80-line segment. Because `chapter` is one of the count keys, a segment
# that straddles a chapter boundary produces one row per chapter.
pp_sentiment <- tidy_pp %>%
  inner_join(
    get_sentiments("bing"),
    by = "word",
    relationship = "many-to-many"
  ) %>%
  count(chapter, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)

# Visualize the emotional arc
pp_sentiment %>%
  # Collapse to one row per segment first; otherwise geom_col() stacks the
  # partial per-chapter bars that share an index and the bar height no
  # longer equals the segment's net score.
  group_by(index) %>%
  summarise(sentiment = sum(sentiment)) %>%
  ggplot(aes(index, sentiment, fill = sentiment > 0)) +
  geom_col(show.legend = FALSE) +
  # Named values pin each colour to its logical level, so the mapping stays
  # correct even if only one of TRUE/FALSE occurs in the data.
  scale_fill_manual(values = c("FALSE" = "firebrick", "TRUE" = "steelblue")) +
  labs(title = "Sentiment Arc of Pride and Prejudice",
       subtitle = "Emotional trajectory throughout the novel",
       x = "Narrative Progress (80-line segments)",
       y = "Sentiment Score",
       caption = "Using Bing Lexicon") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 16))
# AFINN: sum the numeric word scores within each 80-line segment.
afinn_pp <- tidy_pp %>%
  inner_join(get_sentiments("afinn"), relationship = "many-to-many") %>%
  mutate(index = linenumber %/% 80) %>%
  group_by(index) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")
# Bing and NRC polarity for the Gutenberg text: tag each match with its
# lexicon, count labels per 80-line segment, then take positive - negative.
bing_and_nrc_pp <- bind_rows(
  "Bing et al." = inner_join(
    tidy_pp,
    get_sentiments("bing"),
    relationship = "many-to-many"
  ),
  # Keep only NRC's polarity labels, not its emotion categories.
  "NRC" = inner_join(
    tidy_pp,
    filter(get_sentiments("nrc"), sentiment %in% c("positive", "negative")),
    relationship = "many-to-many"
  ),
  .id = "method"
) %>%
  mutate(index = linenumber %/% 80) %>%
  count(method, index, sentiment) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) %>%
  mutate(sentiment = positive - negative)
# Compare lexicons: one panel per method, each with its own y scale.
bind_rows(afinn_pp, bing_and_nrc_pp) %>%
  ggplot(aes(x = index, y = sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(method), ncol = 1, scales = "free_y") +
  labs(
    title = "Comparing Sentiment Lexicons on Pride and Prejudice",
    x = "Narrative Progress",
    y = "Sentiment Score"
  ) +
  theme_minimal()
# Analyze 8 emotions beyond just positive/negative:
# drop NRC's two polarity labels and keep only the emotion categories.
nrc_emotions <- get_sentiments("nrc") %>%
  filter(!sentiment %in% c("positive", "negative"))

# Count emotion words per 80-line segment, plus a running total per emotion.
pp_emotions <- tidy_pp %>%
  inner_join(nrc_emotions, by = "word", relationship = "many-to-many") %>%
  count(index = linenumber %/% 80, sentiment) %>%
  group_by(sentiment) %>%
  mutate(cumulative = cumsum(n)) %>%
  # Drop the grouping so later verbs on pp_emotions are not silently
  # evaluated per sentiment.
  ungroup()
# Visualize emotional dimensions over narrative: one smoothed trend line
# per emotion, each in its own panel with an independent y scale.
ggplot(pp_emotions, aes(index, n, color = sentiment)) +
  # `linewidth` replaces `size` for line geoms (ggplot2 >= 3.4.0); using
  # `size` here raises a deprecation warning.
  geom_smooth(se = FALSE, linewidth = 1.2) +
  facet_wrap(~sentiment, ncol = 2, scales = "free_y") +
  labs(title = "Eight Emotional Dimensions in Pride and Prejudice",
       subtitle = "NRC Emotion Lexicon Analysis",
       x = "Narrative Progress",
       y = "Emotion Frequency") +
  theme_minimal() +
  theme(legend.position = "none",
        plot.title = element_text(face = "bold"))
# Per-chapter Bing sentiment: word counts, net score, and positive share.
pp_chapter_sentiment <- tidy_pp %>%
  # Chapter 0 is everything before the first chapter heading.
  filter(chapter > 0) %>%
  inner_join(get_sentiments("bing"), relationship = "many-to-many") %>%
  count(chapter, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(
    total_words = positive + negative,
    sentiment_score = positive - negative,
    # Share of sentiment-bearing words that are positive.
    sentiment_ratio = positive / total_words
  )
# Five chapters with the highest net sentiment score.
pp_chapter_sentiment %>%
  arrange(desc(sentiment_score)) %>%
  slice_head(n = 5)
## # A tibble: 5 × 6
##   chapter negative positive total_words sentiment_score sentiment_ratio
##     <int>    <int>    <int>       <int>           <int>           <dbl>
## 1      43       97      222         319             125           0.696
## 2      18      158      227         385              69           0.590
## 3      16       92      160         252              68           0.635
## 4       6       70      130         200              60           0.65 
## 5      49       44      102         146              58           0.699
# Five chapters with the lowest net sentiment score.
pp_chapter_sentiment %>%
  arrange(sentiment_score) %>%
  slice_head(n = 5)
## # A tibble: 5 × 6
##   chapter negative positive total_words sentiment_score sentiment_ratio
##     <int>    <int>    <int>       <int>           <int>           <dbl>
## 1      46      141       94         235             -47           0.4  
## 2      34      109       72         181             -37           0.398
## 3      36       96       74         170             -22           0.435
## 4      41      114       94         208             -20           0.452
## 5      45       69       54         123             -15           0.439
# Visualize chapter sentiment: one bar per chapter, coloured by the sign of
# its net score.
ggplot(pp_chapter_sentiment, aes(chapter, sentiment_score, fill = sentiment_score > 0)) +
  geom_col(show.legend = FALSE) +
  # Name the values so the colours are tied to the logical levels rather
  # than to level order; unnamed values would paint positive chapters coral
  # if no chapter had a non-positive score, contradicting the subtitle.
  scale_fill_manual(values = c("FALSE" = "coral", "TRUE" = "skyblue")) +
  labs(title = "Sentiment by Chapter in Pride and Prejudice",
       x = "Chapter Number",
       y = "Net Sentiment Score",
       subtitle = "Positive chapters in blue, negative in coral") +
  theme_minimal()
# Frequency of every Bing-labelled word in the novel, most common first.
pp_word_counts <- inner_join(
  tidy_pp,
  get_sentiments("bing"),
  relationship = "many-to-many"
) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
# Top 15 words by sentiment, drawn as one panel per sentiment label.
pp_word_counts %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 15) %>%
  ungroup() %>%
  # reorder_within() + scale_y_reordered() sort the bars independently
  # inside each facet.
  mutate(word = reorder_within(word, by = n, within = sentiment)) %>%
  ggplot(aes(x = n, y = word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(sentiment), scales = "free") +
  scale_y_reordered() +
  scale_fill_manual(
    values = c("negative" = "indianred", "positive" = "seagreen")
  ) +
  labs(
    title = "Most Frequent Sentiment Words in Pride and Prejudice",
    x = "Word Frequency",
    y = NULL
  ) +
  theme_minimal()
### Wordcloud visualization
library(wordcloud)
# Overall word cloud: the 100 most frequent non-stop-words, with the most
# frequent placed first (random.order = FALSE).
tidy_pp %>%
  # Name the join key explicitly: this avoids the "Joining with `by = ...`"
  # message and keeps the join from silently widening if either table ever
  # gains another shared column name.
  anti_join(stop_words, by = "word") %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100, 
                 colors = brewer.pal(8, "Dark2"),
                 random.order = FALSE))
# Sentiment comparison cloud
library(reshape2)
# Reshape the counts into a word-by-sentiment matrix (missing combinations
# filled with 0) and draw the two sentiment labels as contrasting clouds.
tidy_pp %>%
  inner_join(get_sentiments("bing"), relationship = "many-to-many") %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(formula = word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    colors = c("firebrick", "steelblue"),
    max.words = 100,
    title.size = 1.5
  )
### Character-specific Sentiment Analysis (Advanced)
# Main characters to look for (lower case, to match the tokenised words).
characters <- c("elizabeth", "darcy", "bingley", "jane", "wickham", "lydia")

# Keep every word that sits on a line mentioning at least one character.
character_context <- tidy_pp %>%
  mutate(
    word_lower = tolower(word),
    is_character = word_lower %in% characters
  ) %>%
  # Flag the whole line if any of its words is a character name.
  mutate(character_present = any(is_character), .by = linenumber) %>%
  filter(character_present)
# Sentiment when each character is mentioned: one row per
# (line, character) pair, joined to the mean AFINN score of that line.
# Lines without any AFINN word get NA from the left join.
character_sentiment <- character_context %>%
  filter(word %in% characters) %>%
  select(linenumber, character_name = word, chapter) %>%
  distinct() %>%
  left_join(
    tidy_pp %>%
      inner_join(get_sentiments("afinn"), relationship = "many-to-many") %>%
      group_by(linenumber) %>%
      summarise(line_sentiment = mean(value)),
    by = "linenumber"
  )
# Mean and median line sentiment per character, highest mean first.
# `mentions` counts every mention line, including those whose sentiment
# is NA.
group_by(character_sentiment, character_name) %>%
  summarise(
    avg_sentiment = mean(line_sentiment, na.rm = TRUE),
    median_sentiment = median(line_sentiment, na.rm = TRUE),
    mentions = n()
  ) %>%
  arrange(desc(avg_sentiment))
## # A tibble: 6 × 4
##   character_name avg_sentiment median_sentiment mentions
##   <chr>                  <dbl>            <dbl>    <int>
## 1 wickham                0.516              1        167
## 2 lydia                  0.348              0.5      134
## 3 elizabeth              0.278              0.5      604
## 4 jane                   0.234              0.5      270
## 5 darcy                  0.104              0        381
## 6 bingley               -0.282             -1        261
# Boxplot of line-level AFINN sentiment per character, ordered by median.
# Lines with no AFINN match (NA) are dropped before plotting.
character_sentiment %>%
  filter(!is.na(line_sentiment)) %>%
  ggplot(
    aes(
      x = reorder(character_name, line_sentiment, FUN = median),
      y = line_sentiment,
      fill = character_name
    )
  ) +
  geom_boxplot(show.legend = FALSE) +
  coord_flip() +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "Sentiment Distribution When Characters Are Mentioned",
    subtitle = "Pride and Prejudice Character Analysis",
    x = "Character",
    y = "Sentiment Score (AFINN)"
  ) +
  theme_minimal()