Sentiment Analysis: A Tidy Approach

1 Part 1

Part 1 consists of items 2-15 and is based on example code from Chapter 2 of Text Mining with R: A Tidy Approach. See the reference citation below.

Silge, J., & Robinson, D. (2017). Sentiment analysis with tidy data. In Text Mining with R: A Tidy Approach (Chapter 2). O’Reilly Media. https://www.tidytextmining.com/sentiment.html

2 Analyzing sentiment lexicons

library(tidytext)
library(tidyverse)
library(ggplot2)

# AFINN lexicon: one row per word, with a numeric `value` score.
get_sentiments("afinn")

## # A tibble: 2,477 × 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # ℹ 2,467 more rows

# Bing lexicon: one row per word, with a `sentiment` label
# ("negative" or "positive").
get_sentiments("bing")

## # A tibble: 6,786 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # ℹ 6,776 more rows

# NRC lexicon: a word may appear on several rows, one per sentiment
# category it is tagged with (e.g. "abandon" -> fear, negative, sadness).
get_sentiments("nrc")

## # A tibble: 13,872 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # ℹ 13,862 more rows

3 Sentiment Analysis Part 1: Convert texts to tidy format using unnest_tokens()

library(janeaustenr)
library(dplyr)
library(stringr)

# Tokenise the Austen novels to one word per row. The line number and the
# running chapter counter are computed within each book *before*
# tokenising, so every word keeps the line and chapter it came from.
tidy_books <- austen_books() %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(
      str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE))
    ),
    .by = book
  ) %>%
  unnest_tokens(output = word, input = text)

4 Sentiment Analysis Part 2: Conduct the sentiment analysis on the tidy data

# NRC entries tagged with the "joy" category.
nrc_joy <- filter(get_sentiments("nrc"), sentiment == "joy")

# Joy words occurring in Emma, most frequent first.
tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy, relationship = "many-to-many") %>%
  count(word) %>%
  arrange(desc(n))

## # A tibble: 301 × 2
##    word          n
##    <chr>     <int>
##  1 good        359
##  2 friend      166
##  3 hope        143
##  4 happy       125
##  5 love        117
##  6 deal         92
##  7 found        92
##  8 present      89
##  9 kind         82
## 10 happiness    76
## # ℹ 291 more rows

5 Use pivot_wider() to get positive and negative sentiments in separate columns

library(tidyr)

# Net Bing sentiment (positive minus negative word count) for each
# 80-line section of every novel.
jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing"), relationship = "many-to-many") %>%
  mutate(index = linenumber %/% 80) %>%
  count(book, index, sentiment) %>%
  pivot_wider(values_from = n, names_from = sentiment, values_fill = 0) %>%
  mutate(sentiment = positive - negative)

6 Plot the sentiment scores

# One panel per novel; each bar is the net sentiment of one section.
ggplot(
  data = jane_austen_sentiment,
  mapping = aes(x = index, y = sentiment, fill = book)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(book), ncol = 2, scales = "free_x")

7 Comparing the three sentiment dictionaries on Pride and Prejudice

First, use filter() to choose only the words from the one novel we are interested in.

# Restrict the tokenised corpus to a single novel, then display it.
pride_prejudice <- filter(tidy_books, book == "Pride & Prejudice")

pride_prejudice

## # A tibble: 122,204 × 4
##    book              linenumber chapter word     
##    <fct>                  <int>   <int> <chr>    
##  1 Pride & Prejudice          1       0 pride    
##  2 Pride & Prejudice          1       0 and      
##  3 Pride & Prejudice          1       0 prejudice
##  4 Pride & Prejudice          3       0 by       
##  5 Pride & Prejudice          3       0 jane     
##  6 Pride & Prejudice          3       0 austen   
##  7 Pride & Prejudice          7       1 chapter  
##  8 Pride & Prejudice          7       1 1        
##  9 Pride & Prejudice         10       1 it       
## 10 Pride & Prejudice         10       1 is       
## # ℹ 122,194 more rows

8 Use inner_join() to calculate the sentiment in different ways

# AFINN: sum the numeric word scores inside each 80-line section.
afinn <- pride_prejudice %>%
  inner_join(get_sentiments("afinn"), relationship = "many-to-many") %>%
  mutate(index = linenumber %/% 80) %>%
  group_by(index) %>%
  summarise(sentiment = sum(value), .groups = "drop") %>%
  mutate(method = "AFINN")

# Bing and NRC: both are categorical, so stack the two joins, count
# positive and negative words per section, and take the difference.
# NRC is restricted to its "positive"/"negative" categories first.
bing_and_nrc <- bind_rows(
  pride_prejudice %>%
    inner_join(get_sentiments("bing"), relationship = "many-to-many") %>%
    mutate(method = "Bing et al."),
  pride_prejudice %>%
    inner_join(
      filter(get_sentiments("nrc"), sentiment %in% c("positive", "negative")),
      relationship = "many-to-many"
    ) %>%
    mutate(method = "NRC")
) %>%
  mutate(index = linenumber %/% 80) %>%
  count(method, index, sentiment) %>%
  pivot_wider(values_from = n, names_from = sentiment, values_fill = 0) %>%
  mutate(sentiment = positive - negative)

9 Bind the rows from afinn, bing and nrc and visualize them

# Stack the three per-section results and draw one panel per lexicon.
bind_rows(afinn, bing_and_nrc) %>%
  ggplot(mapping = aes(x = index, y = sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(method), ncol = 1, scales = "free_y")

10 A look at the number of positive and negative words in these lexicons

# Number of NRC entries in each of the two polarity categories.
get_sentiments("nrc") %>%
  filter(sentiment == "positive" | sentiment == "negative") %>%
  count(sentiment)

## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   3316
## 2 positive   2308

# Number of Bing entries per sentiment label.
count(get_sentiments("bing"), sentiment)

## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005

11 Count of common positive and negative words

# How often each Bing-scored word occurs across the novels, together with
# its sentiment label, most frequent first. count() on ungrouped input
# already returns an ungrouped tibble.
bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing"), relationship = "many-to-many") %>%
  count(word, sentiment) %>%
  arrange(desc(n))

bing_word_counts

## # A tibble: 2,585 × 3
##    word     sentiment     n
##    <chr>    <chr>     <int>
##  1 miss     negative   1855
##  2 well     positive   1523
##  3 good     positive   1380
##  4 great    positive    981
##  5 like     positive    725
##  6 better   positive    639
##  7 enough   positive    613
##  8 happy    positive    534
##  9 love     positive    495
## 10 pleasure positive    462
## # ℹ 2,575 more rows

12 Visualization of positive and negative words

# Ten most frequent words per sentiment label, one panel per label, with
# bars ordered by frequency.
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(mapping = aes(x = n, y = word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(sentiment), scales = "free_y") +
  labs(
    y = NULL,
    x = "Contribution to sentiment"
  )

13 Create a word cloud for the Jane Austen novels

library(wordcloud)

# Word cloud for the whole corpus: drop every token that matches an entry
# in `stop_words` (no `by` is given, so the join uses the columns the two
# tables share), count the remaining words, and draw at most 100 of them.
tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))

14 Reshaping the word cloud by positive and negative

library(reshape2)

# Comparison cloud: count Bing-scored words, reshape to a word-by-sentiment
# matrix of counts (0 where a word lacks that label), and plot the two
# sentiment columns against each other.
# NOTE(review): the two colours are presumably assigned to the matrix
# columns in order (negative, positive) -- confirm against the plot.
tidy_books %>%
  inner_join(get_sentiments("bing"), relationship = "many-to-many") %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)

15 Alternative approach: Use unnest_tokens() to split the text into a dataframe by chapter

# Split each book's text into chunks using a regex tokenizer, so that each
# row of the result holds the text between two pattern matches.
# NOTE(review): `|` has the lowest precedence in a regex, so this pattern
# reads as "Chapter" OR "CHAPTER [\\dIVXLC]" -- the character class only
# constrains the upper-case alternative, and a bare "Chapter" anywhere in
# the text also acts as a split point. Confirm that this is intended.
austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(chapter, text, token = "regex", 
                pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
  ungroup()

# Number of chunks produced for each book.
count(austen_chapters, book, name = "chapters")

## # A tibble: 6 × 2
##   book                chapters
##   <fct>                  <int>
## 1 Sense & Sensibility       51
## 2 Pride & Prejudice         62
## 3 Mansfield Park            49
## 4 Emma                      56
## 5 Northanger Abbey          32
## 6 Persuasion                25

16 Part 2: Sentiment analysis of Pride and Prejudice

library(tidytext)
library(gutenbergr)
library(dplyr)
library(stringr)

# Download Pride and Prejudice (Project Gutenberg ID 1342).
# Note: this reassigns `pride_prejudice`, replacing the tokenised subset
# of `tidy_books` that Part 1 stored under the same name.
pride_prejudice <- gutenberg_download(gutenberg_id = 1342)

# Tag every line with its position and a running chapter count, then
# tokenise to one word per row.
tidy_pp <- pride_prejudice %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(
      str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE))
    )
  ) %>%
  unnest_tokens(output = word, input = text)

17 Basic Sentiment Analysis with NRC Lexicon

# Start with joy: NRC entries tagged "joy" ...
nrc_joy <- filter(get_sentiments("nrc"), sentiment == "joy")

# ... and the 20 joy words that occur most often in the novel.
tidy_pp %>%
  inner_join(nrc_joy, relationship = "many-to-many") %>%
  count(word) %>%
  arrange(desc(n)) %>%
  slice_head(n = 20)

## # A tibble: 20 × 2
##    word          n
##    <chr>     <int>
##  1 good        208
##  2 hope        125
##  3 mother      112
##  4 friend      107
##  5 love        102
##  6 happy        83
##  7 daughter     77
##  8 happiness    72
##  9 kind         71
## 10 present      71
## 11 found        68
## 12 marriage     67
## 13 affection    61
## 14 pride        55
## 15 marry        46
## 16 engaged      40
## 17 fortune      39
## 18 pleased      39
## 19 spirits      39
## 20 feeling      37

18 Sentiment Arc Analysis - Track the emotional journey

# Net Bing sentiment (positive minus negative word count) per 80-line
# segment. Segments are keyed by `index` alone: counting by chapter as well
# would split every segment that straddles a chapter boundary into several
# rows, and geom_col() would then stack those partial scores instead of
# showing one net value per segment.
pp_sentiment <- tidy_pp %>%
  inner_join(get_sentiments("bing"), relationship = "many-to-many") %>%
  count(index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)

# Visualize the emotional arc
ggplot(pp_sentiment, aes(index, sentiment, fill = sentiment > 0)) +
  geom_col(show.legend = FALSE) +
  # Colours are named so the mapping does not depend on level order or on
  # both TRUE and FALSE being present in the data.
  scale_fill_manual(values = c("FALSE" = "firebrick", "TRUE" = "steelblue")) +
  labs(title = "Sentiment Arc of Pride and Prejudice",
       subtitle = "Emotional trajectory throughout the novel",
       x = "Narrative Progress (80-line segments)",
       y = "Sentiment Score",
       caption = "Using Bing Lexicon") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 16))

19 Compare Three Sentiment Lexicons

# AFINN: sum the numeric word scores inside each 80-line segment.
afinn_pp <- tidy_pp %>%
  inner_join(get_sentiments("afinn"), relationship = "many-to-many") %>%
  mutate(index = linenumber %/% 80) %>%
  group_by(index) %>%
  summarise(sentiment = sum(value), .groups = "drop") %>%
  mutate(method = "AFINN")

# Bing and NRC: stack the two joins, count positive and negative words per
# segment and lexicon, then take the difference. NRC is restricted to its
# "positive"/"negative" categories first.
bing_and_nrc_pp <- bind_rows(
  tidy_pp %>%
    inner_join(get_sentiments("bing"), relationship = "many-to-many") %>%
    mutate(method = "Bing et al."),
  tidy_pp %>%
    inner_join(
      filter(get_sentiments("nrc"), sentiment %in% c("positive", "negative")),
      relationship = "many-to-many"
    ) %>%
    mutate(method = "NRC")
) %>%
  mutate(index = linenumber %/% 80) %>%
  count(method, index, sentiment) %>%
  pivot_wider(values_from = n, names_from = sentiment, values_fill = 0) %>%
  mutate(sentiment = positive - negative)

# Compare lexicons: one panel per method, each with its own y scale.
bind_rows(afinn_pp, bing_and_nrc_pp) %>%
  ggplot(mapping = aes(x = index, y = sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(method), ncol = 1, scales = "free_y") +
  labs(
    title = "Comparing Sentiment Lexicons on Pride and Prejudice",
    y = "Sentiment Score",
    x = "Narrative Progress"
  ) +
  theme_minimal()

20 Advanced: Multi-Dimensional Emotion Analysis with NRC

# Analyze the NRC emotion categories, i.e. everything in the lexicon
# except the plain "positive"/"negative" polarity labels.
nrc_emotions <- get_sentiments("nrc") %>%
  filter(!sentiment %in% c("positive", "negative"))

# Count emotion words per 80-line segment, plus a running total for each
# emotion. The result is ungrouped so that later verbs applied to
# `pp_emotions` do not silently operate per emotion.
pp_emotions <- tidy_pp %>%
  inner_join(nrc_emotions, relationship = "many-to-many") %>%
  count(index = linenumber %/% 80, sentiment) %>%
  group_by(sentiment) %>%
  mutate(cumulative = cumsum(n)) %>%
  ungroup()

# Visualize emotional dimensions over narrative
ggplot(pp_emotions, aes(index, n, color = sentiment)) +
  # `linewidth` replaces `size` for line geoms (ggplot2 >= 3.4.0); `size`
  # still works there but raises a deprecation warning.
  geom_smooth(se = FALSE, linewidth = 1.2) +
  facet_wrap(~sentiment, ncol = 2, scales = "free_y") +
  labs(title = "Eight Emotional Dimensions in Pride and Prejudice",
       subtitle = "NRC Emotion Lexicon Analysis",
       x = "Narrative Progress",
       y = "Emotion Frequency") +
  theme_minimal() +
  theme(legend.position = "none",
        plot.title = element_text(face = "bold"))

21 Chapter-by-Chapter Sentiment Analysis

# Per-chapter Bing sentiment. Tokens that precede the first chapter
# heading (chapter 0) are left out.
pp_chapter_sentiment <- tidy_pp %>%
  filter(chapter >= 1) %>%
  inner_join(get_sentiments("bing"), relationship = "many-to-many") %>%
  count(chapter, sentiment) %>%
  pivot_wider(values_from = n, names_from = sentiment, values_fill = 0) %>%
  mutate(
    total_words = positive + negative,
    sentiment_score = positive - negative,
    # Share of the scored words that are positive.
    sentiment_ratio = positive / total_words
  )

22 Identify most positive and negative chapters

# Five chapters with the highest net sentiment score.
pp_chapter_sentiment %>%
  arrange(desc(sentiment_score)) %>%
  slice_head(n = 5)

## # A tibble: 5 × 6
##   chapter negative positive total_words sentiment_score sentiment_ratio
##     <int>    <int>    <int>       <int>           <int>           <dbl>
## 1      43       97      222         319             125           0.696
## 2      18      158      227         385              69           0.590
## 3      16       92      160         252              68           0.635
## 4       6       70      130         200              60           0.65 
## 5      49       44      102         146              58           0.699

# Five chapters with the lowest net sentiment score.
pp_chapter_sentiment %>%
  arrange(sentiment_score) %>%
  slice_head(n = 5)

## # A tibble: 5 × 6
##   chapter negative positive total_words sentiment_score sentiment_ratio
##     <int>    <int>    <int>       <int>           <int>           <dbl>
## 1      46      141       94         235             -47           0.4  
## 2      34      109       72         181             -37           0.398
## 3      36       96       74         170             -22           0.435
## 4      41      114       94         208             -20           0.452
## 5      45       69       54         123             -15           0.439

# Visualize chapter sentiment: one bar per chapter, coloured by whether the
# net score is above zero.
ggplot(pp_chapter_sentiment, aes(chapter, sentiment_score, fill = sentiment_score > 0)) +
  geom_col(show.legend = FALSE) +
  # Colours are named rather than positional, so the subtitle stays true
  # even if only one of TRUE/FALSE occurs in the data.
  scale_fill_manual(values = c("FALSE" = "coral", "TRUE" = "skyblue")) +
  labs(title = "Sentiment by Chapter in Pride and Prejudice",
       x = "Chapter Number",
       y = "Net Sentiment Score",
       subtitle = "Positive chapters in blue, negative in coral") +
  theme_minimal()

23 Most Impactful Words Analysis

# Frequency of every Bing-scored word in the novel, most frequent first.
# count() on ungrouped input already returns an ungrouped tibble.
pp_word_counts <- tidy_pp %>%
  inner_join(get_sentiments("bing"), relationship = "many-to-many") %>%
  count(word, sentiment) %>%
  arrange(desc(n))

# Top 15 words by sentiment; reorder_within()/scale_y_reordered() order the
# bars separately inside each panel.
pp_word_counts %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 15) %>%
  ungroup() %>%
  mutate(word = reorder_within(word, n, sentiment)) %>%
  ggplot(mapping = aes(x = n, y = word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(sentiment), scales = "free") +
  scale_y_reordered() +
  scale_fill_manual(
    values = c("positive" = "seagreen", "negative" = "indianred")
  ) +
  labs(
    title = "Most Frequent Sentiment Words in Pride and Prejudice",
    y = NULL,
    x = "Word Frequency"
  ) +
  theme_minimal()

### Wordcloud visualization

library(wordcloud)

# Overall word cloud: drop every token that matches an entry in
# `stop_words`, count the remaining words, and draw at most 100 of them
# using an 8-colour "Dark2" palette.
# NOTE(review): brewer.pal() is not provided by any package this file
# loads explicitly; presumably RColorBrewer is attached along with
# wordcloud -- confirm.
tidy_pp %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100, 
                 colors = brewer.pal(8, "Dark2"),
                 random.order = FALSE))

# Sentiment comparison cloud
library(reshape2)

# Comparison cloud: count Bing-scored words, reshape to a word-by-sentiment
# matrix of counts (0 where a word lacks that label), and plot the two
# sentiment columns against each other.
# NOTE(review): the two colours are presumably assigned to the matrix
# columns in order (negative, positive) -- confirm against the plot.
tidy_pp %>%
  inner_join(get_sentiments("bing"), relationship = "many-to-many") %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("firebrick", "steelblue"),
                   max.words = 100,
                   title.size = 1.5)

### Character-specific Sentiment Analysis (Advanced)

# Names to look for, in lower case to match the tokenised text.
characters <- c("elizabeth", "darcy", "bingley", "jane", "wickham", "lydia")

# Every token from any line that names at least one of the characters.
character_context <- tidy_pp %>%
  mutate(
    word_lower = tolower(word),
    is_character = word_lower %in% characters
  ) %>%
  mutate(character_present = any(is_character), .by = linenumber) %>%
  filter(character_present)

# One row per (line, character) mention, paired with the mean AFINN score
# of the scored words on that line. Lines with no AFINN-scored word get NA.
character_sentiment <- character_context %>%
  filter(word %in% characters) %>%
  select(linenumber, character_name = word, chapter) %>%
  distinct() %>%
  left_join(
    tidy_pp %>%
      inner_join(get_sentiments("afinn"), relationship = "many-to-many") %>%
      summarise(line_sentiment = mean(value), .by = linenumber),
    by = "linenumber"
  )

# Per-character summary, highest mean sentiment first. `mentions` counts
# every (line, character) row, including lines whose sentiment is NA; the
# mean and median ignore those NA lines.
character_sentiment %>%
  group_by(character_name) %>%
  summarise(
    avg_sentiment = mean(line_sentiment, na.rm = TRUE),
    median_sentiment = median(line_sentiment, na.rm = TRUE),
    mentions = n(),
    .groups = "drop"
  ) %>%
  arrange(desc(avg_sentiment))

## # A tibble: 6 × 4
##   character_name avg_sentiment median_sentiment mentions
##   <chr>                  <dbl>            <dbl>    <int>
## 1 wickham                0.516              1        167
## 2 lydia                  0.348              0.5      134
## 3 elizabeth              0.278              0.5      604
## 4 jane                   0.234              0.5      270
## 5 darcy                  0.104              0        381
## 6 bingley               -0.282             -1        261

# Horizontal box plots of line-level sentiment for each character, ordered
# by median; lines without a sentiment score are left out.
character_sentiment %>%
  filter(!is.na(line_sentiment)) %>%
  ggplot(
    mapping = aes(
      x = reorder(character_name, line_sentiment, FUN = median),
      y = line_sentiment,
      fill = character_name
    )
  ) +
  geom_boxplot(show.legend = FALSE) +
  coord_flip() +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "Sentiment Distribution When Characters Are Mentioned",
    subtitle = "Pride and Prejudice Character Analysis",
    y = "Sentiment Score (AFINN)",
    x = "Character"
  ) +
  theme_minimal()

24 Part 1: Traditional Tidy Text Approach (Jane Austen)

Uses tidytext package with lexicon-based methods (AFINN, Bing, NRC)
Word-by-word sentiment scoring
Effective for literary text analysis

25 Part 2: Advanced Sentiment Analysis (Pride and Prejudice)

Employs multi-dimensional emotion analysis using the NRC lexicon to track eight distinct emotions (joy, anger, fear, trust, anticipation, surprise, sadness, disgust) across the narrative arc
Implements context-aware sentiment scoring with sentimentr package, which accounts for valence shifters like negations (“not happy”) and amplifiers (“very good”) for more nuanced analysis
Includes character-specific sentiment tracking to analyze how emotional tone shifts when major characters (Elizabeth, Darcy, Wickham) are mentioned, revealing character development patterns
Compares three distinct lexicons (AFINN, Bing, NRC) at both chapter and sentence levels to demonstrate methodological rigor and validate findings across different sentiment