Overview: Practice with sentiment analysis and the tidyverse

I am going to start by getting the primary example code from Chapter 2: Sentiment analysis with tidy data from Text Mining with R: A Tidy Approach.

The sentiments datasets

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidytext)

#get AFINN-111 sentiments from http://www2.imm.dtu.dk/pubdb/views/publication_details.php?id=6010 
#assigns words with a score that runs between -5 and 5, with negative scores indicating negative sentiment and positive scores indicating positive sentiment
get_sentiments("afinn")
## # A tibble: 2,477 × 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # ℹ 2,467 more rows
#get bing sentiments from https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html
#labels each word as either positive or negative (the sentiment column)
get_sentiments("bing")
## # A tibble: 6,786 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # ℹ 6,776 more rows
#get NRC Word-Emotion Association Lexicon from http://saifmohammad.com/WebPages/lexicons.html
#categorizes words in a binary fashion (“yes”/“no”) into categories of positive, negative, anger, anticipation, disgust, fear, joy, sadness, surprise, and trust
get_sentiments("nrc")
## # A tibble: 13,872 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # ℹ 13,862 more rows

Sentiment analysis with inner join

library(janeaustenr)
library(stringr)

# Tokenize the Austen novels into tidy form (one word per row). Before
# tokenizing, tag every line with its position within its own book and with a
# running count of chapter headings seen so far in that book, so each word can
# be traced back to where it occurs.
tidy_books <- austen_books() %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(
      str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE))
    ),
    .by = book
  ) %>%
  unnest_tokens(output = word, input = text)
# Keep only the NRC entries tagged with the "joy" emotion.
nrc_joy <- filter(get_sentiments("nrc"), sentiment == "joy")

# Sentiment analysis as a join: restrict the tidy words to Emma, keep the ones
# that also appear in the joy lexicon, then tally each word and list the most
# frequent first.
tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word) %>%
  arrange(desc(n))
## Joining with `by = join_by(word)`
## # A tibble: 301 × 2
##    word          n
##    <chr>     <int>
##  1 good        359
##  2 friend      166
##  3 hope        143
##  4 happy       125
##  5 love        117
##  6 deal         92
##  7 found        92
##  8 present      89
##  9 kind         82
## 10 happiness    76
## # ℹ 291 more rows
# Net sentiment per 80-line section of each novel: label words with the Bing
# lexicon, count positive and negative words in every section, spread the two
# counts into their own columns (0 when a section has none), and subtract
# negative from positive.
jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  mutate(index = linenumber %/% 80) %>%
  count(book, index, sentiment) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) %>%
  mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 435434 of `x` matches multiple rows in `y`.
## ℹ Row 5051 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
# One panel per novel: net sentiment of each 80-line section, in reading order.
jane_austen_sentiment %>%
  ggplot(aes(x = index, y = sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(book), ncol = 2, scales = "free_x")

Comparing the three sentiment dictionaries

# Restrict the tidy words to a single novel, Pride & Prejudice.
pride_prejudice <- filter(tidy_books, book == "Pride & Prejudice")
# AFINN gives each word a numeric value, so the net sentiment of a section is
# the sum of its word values. Sections are 80-line chunks obtained by integer
# division (%/%) of the line number.
afinn <- pride_prejudice %>%
  inner_join(get_sentiments("afinn")) %>%
  mutate(index = linenumber %/% 80) %>%
  group_by(index) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")
## Joining with `by = join_by(word)`
# Bing and NRC label words as positive/negative rather than scoring them, so
# their net sentiment per 80-line section is (positive count - negative count).
# Each lexicon is joined separately; the list names become the `method` label
# when the two results are stacked.
bing_and_nrc <- bind_rows(
  list(
    "Bing et al." = pride_prejudice %>%
      inner_join(get_sentiments("bing")),
    "NRC" = pride_prejudice %>%
      # NRC also carries emotion categories; keep only its polarity labels
      inner_join(get_sentiments("nrc") %>%
                   filter(sentiment %in% c("positive",
                                           "negative")))
  ),
  .id = "method"
) %>%
  mutate(index = linenumber %/% 80) %>%
  count(method, index, sentiment) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) %>%
  mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("nrc") %>% filter(sentiment %in% : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 215 of `x` matches multiple rows in `y`.
## ℹ Row 5178 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
# Stack the three per-section results and draw one panel per lexicon, each
# with its own y axis.
bind_rows(afinn, bing_and_nrc) %>%
  ggplot(aes(x = index, y = sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(method), ncol = 1, scales = "free_y")

# How many positive vs. negative entries does the NRC lexicon contain?
get_sentiments("nrc") %>%
  filter(sentiment == "positive" | sentiment == "negative") %>%
  count(sentiment)
## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   3316
## 2 positive   2308
# The same tally for the Bing lexicon.
count(get_sentiments("bing"), sentiment)
## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005

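The result for the NRC lexicon is biased higher in sentiment than the Bing et al. result. One reason is that the ratio of negative to positive words is higher in the Bing lexicon (4781 to 2005) than in the NRC lexicon (3316 to 2308).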

Most common positive and negative words

# Count how often each word occurs under each Bing sentiment label across all
# the novels, most frequent first.
bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment) %>%
  arrange(desc(n)) %>%
  ungroup()
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 435434 of `x` matches multiple rows in `y`.
## ℹ Row 5051 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
#print the per-word counts (already sorted from most to least common)
bing_word_counts
## # A tibble: 2,585 × 3
##    word     sentiment     n
##    <chr>    <chr>     <int>
##  1 miss     negative   1855
##  2 well     positive   1523
##  3 good     positive   1380
##  4 great    positive    981
##  5 like     positive    725
##  6 better   positive    639
##  7 enough   positive    613
##  8 happy    positive    534
##  9 love     positive    495
## 10 pleasure positive    462
## # ℹ 2,575 more rows
# Bar chart of the top 10 words by count within each sentiment. Words are
# reordered by count so the bars are sorted, and each sentiment panel has its
# own word axis.
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = n, y = word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(sentiment), scales = "free_y") +
  labs(x = "Contribution to sentiment", y = NULL)

Expand code

I am introducing a sentiment lexicon from Kaggle’s Sentiment Analysis Word Lists Dataset (‘https://www.kaggle.com/datasets/prajwalkanade/sentiment-analysis-word-lists-dataset?resource=download’).

# Create a positive words data frame: the file is read without a header row,
# so its single column arrives as V1 and is renamed to `word`.
positive <- rename(
  read.delim('https://raw.githubusercontent.com/evelynbartley/Data-607/main/positive-words.txt', header = FALSE),
  word = V1
)

# Create a negative words data frame the same way.
negative <- rename(
  read.delim('https://raw.githubusercontent.com/evelynbartley/Data-607/main/negative-words.txt', header = FALSE),
  word = V1
)

I am using the gutenbergr package (‘https://docs.ropensci.org/gutenbergr/’) to access free ebooks. I downloaded War and Peace with gutenberg_download() and War and Peace’s id number.

library(gutenbergr)

# Look up the Project Gutenberg catalogue entry titled "War and Peace".
filter(gutenberg_works(), title == "War and Peace")
## # A tibble: 1 × 8
##   gutenberg_id title     author gutenberg_author_id language gutenberg_bookshelf
##          <int> <chr>     <chr>                <int> <chr>    <chr>              
## 1         2600 War and … Tolst…                 136 en       Napoleonic(Bookshe…
## # ℹ 2 more variables: rights <chr>, has_text <lgl>
# Download the text of War and Peace by its Project Gutenberg id.
WarandPeace <- gutenberg_download(gutenberg_id = 2600)
## Determining mirror for Project Gutenberg from https://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org

Tidy!

# Make the text tidy: one word per row.
# Each line is first tagged with its line number and with a running count of
# the chapter headings seen so far, then the text is split into words.
# No ungroup() is needed here: unlike the Austen pipeline, this data is never
# grouped (it holds a single book).
WarandPeace_tidy <- WarandPeace %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(str_detect(text, 
                                regex("^chapter [\\divxlc]", 
                                      ignore_case = TRUE)))) %>%
  unnest_tokens(word, text)

Get rid of stop words

# Load the stop-word list and drop every word of the book that appears in it.
data(stop_words)
WarandPeace_tidy <- anti_join(WarandPeace_tidy, stop_words)
## Joining with `by = join_by(word)`

Perform the sentiment analysis

# Words of War and Peace that appear in the positive word list, tallied and
# listed from most to least common.
WarandPeace_tidy %>%
  inner_join(positive) %>%
  count(word) %>%
  arrange(desc(n))
## Joining with `by = join_by(word)`
## # A tibble: 1,007 × 2
##    word          n
##    <chr>     <int>
##  1 love        456
##  2 smile       416
##  3 happy       194
##  4 ready       165
##  5 silent      164
##  6 honor       157
##  7 free        151
##  8 smiling     145
##  9 glad        129
## 10 happiness   129
## # ℹ 997 more rows
# Words of War and Peace that appear in the negative word list, tallied and
# listed from most to least common.
WarandPeace_tidy %>%
  inner_join(negative) %>%
  count(word) %>%
  arrange(desc(n))
## Joining with `by = join_by(word)`
## # A tibble: 1,878 × 2
##    word           n
##    <chr>      <int>
##  1 enemy        207
##  2 death        188
##  3 impossible   178
##  4 strange      159
##  5 terrible     157
##  6 afraid       148
##  7 lost         140
##  8 cold         138
##  9 killed       129
## 10 fell         125
## # ℹ 1,868 more rows
# Keep only the NRC entries tagged with the "disgust" emotion.
nrc_disgust <- filter(get_sentiments("nrc"), sentiment == "disgust")

# Join the book's words against the disgust lexicon and tally the matches,
# most common first.
WarandPeace_tidy %>%
  inner_join(nrc_disgust) %>%
  count(word) %>%
  arrange(desc(n))
## Joining with `by = join_by(word)`
## # A tibble: 513 × 2
##    word          n
##    <chr>     <int>
##  1 feeling     320
##  2 enemy       207
##  3 death       188
##  4 terrible    157
##  5 bad         124
##  6 angry       109
##  7 gray         86
##  8 lying        82
##  9 ill          79
## 10 suffering    73
## # ℹ 503 more rows

I think it’s funny that the nrc lexicon includes “feeling” as a word of disgust. Feelings are disgusting!

# Keep only the NRC entries tagged with the "sadness" emotion.
nrc_sadness <- filter(get_sentiments("nrc"), sentiment == "sadness")

# Join the book's words against the sadness lexicon and tally the matches,
# most common first.
WarandPeace_tidy %>%
  inner_join(nrc_sadness) %>%
  count(word) %>%
  arrange(desc(n))
## Joining with `by = join_by(word)`
## # A tibble: 678 × 2
##    word           n
##    <chr>      <int>
##  1 feeling      320
##  2 leave        210
##  3 mother       204
##  4 death        188
##  5 soldier      187
##  6 impossible   178
##  7 terrible     157
##  8 lost         140
##  9 fell         125
## 10 bad          124
## # ℹ 668 more rows
# Net sentiment per 50-line section of War and Peace.
# Words are labelled with the Bing lexicon and counted per section and
# sentiment; pivot_wider() puts the negative and positive counts in separate
# columns (0 where a section has none) and the net sentiment is
# positive - negative.
# count() has no `book` column to group by here (single book), so the stray
# empty first argument left over from the multi-book version is removed.
WarandPeace_sentiment_50 <- WarandPeace_tidy %>%
  inner_join(get_sentiments("bing")) %>%
  count(index = linenumber %/% 50, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>% 
  mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 10888 of `x` matches multiple rows in `y`.
## ℹ Row 5873 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
# Plot the net sentiment of each 50-line section across the novel.
WarandPeace_sentiment_50 %>%
  ggplot(aes(x = index, y = sentiment)) +
  geom_col()

The plot shows that there are more overall negative sentiment sections than positive sentiment sections.

# Net sentiment per 200-line section of War and Peace (larger sections than
# the 50-line version, otherwise the same computation).
# Words are labelled with the Bing lexicon and counted per section and
# sentiment; pivot_wider() puts the negative and positive counts in separate
# columns (0 where a section has none) and the net sentiment is
# positive - negative.
# count() has no `book` column to group by here (single book), so the stray
# empty first argument left over from the multi-book version is removed.
WarandPeace_sentiment_200 <- WarandPeace_tidy %>%
  inner_join(get_sentiments("bing")) %>%
  count(index = linenumber %/% 200, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>% 
  mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 10888 of `x` matches multiple rows in `y`.
## ℹ Row 5873 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
# Plot the net sentiment of each 200-line section across the novel.
WarandPeace_sentiment_200 %>%
  ggplot(aes(x = index, y = sentiment)) +
  geom_col()

With the section size increased from 50 to 200 lines, I get a lot more negative sections in the book than positive ones.

I want to combine my Kaggle sentiment lexicons into one data frame.

# Tag each Kaggle word list with its polarity, then stack the two lists into a
# single lexicon with `word` and `sentiment` columns.
positive <- mutate(positive, sentiment = "positive")
negative <- mutate(negative, sentiment = "negative")
kaggle_sentiments <- rbind(positive, negative)

# Tally the lexicon entries by polarity.
count(kaggle_sentiments, sentiment)
##   sentiment    n
## 1  negative 4783
## 2  positive 2006

There are more than twice as many negative words as positive words.

# Count how often each word of the book occurs under each Kaggle sentiment
# label, most frequent first, to see which words drive each sentiment.
kaggle_word_counts <- WarandPeace_tidy %>%
  inner_join(kaggle_sentiments) %>%
  count(word, sentiment) %>%
  arrange(desc(n)) %>%
  ungroup()
## Joining with `by = join_by(word)`
## Warning in inner_join(., kaggle_sentiments): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 10888 of `x` matches multiple rows in `y`.
## ℹ Row 6152 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
#print the per-word counts (already sorted from most to least common)
kaggle_word_counts
## # A tibble: 2,885 × 3
##    word       sentiment     n
##    <chr>      <chr>     <int>
##  1 love       positive    456
##  2 smile      positive    416
##  3 enemy      negative    207
##  4 happy      positive    194
##  5 death      negative    188
##  6 impossible negative    178
##  7 ready      positive    165
##  8 silent     positive    164
##  9 strange    negative    159
## 10 honor      positive    157
## # ℹ 2,875 more rows
# Bar chart of the top 10 words by count within each sentiment. Words are
# reordered by count so the bars are sorted, and each sentiment panel has its
# own word axis.
kaggle_word_counts %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = n, y = word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(sentiment), scales = "free_y") +
  labs(x = "Contribution to sentiment", y = NULL)

“Love” contributed the most to sentiment: it is the most frequent sentiment word in the book (456 occurrences, positive).

Wordclouds!

library(wordcloud)
## Loading required package: RColorBrewer
# Word cloud of up to 100 words, sized by how often each occurs in the book.
# WarandPeace_tidy already had its stop words removed right after it was
# tokenized, so a second anti_join(stop_words) here would be a no-op and is
# omitted (along with the join message it used to print).
WarandPeace_tidy %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))

library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
# Comparison cloud of negative vs. positive words. comparison.cloud() needs a
# matrix, so the per-word counts are cast with acast() into one row per word
# and one column per sentiment (0 where a word lacks that sentiment).
WarandPeace_tidy %>%
  inner_join(kaggle_sentiments) %>%
  count(word, sentiment) %>%
  arrange(desc(n)) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    colors = c("gray80", "gray20"),
    max.words = 100
  )
## Joining with `by = join_by(word)`
## Warning in inner_join(., kaggle_sentiments): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 10888 of `x` matches multiple rows in `y`.
## ℹ Row 6152 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

## Conclusions

It was interesting to do this sentiment analysis on a book I have only ever heard about. I wonder if I would agree or disagree that the sentiment of War and Peace is mostly negative. Probably.

I liked this assignment because it introduced a field of data science that can be biased like humans are biased. It’s interesting that the analysis of sentiment can change based on the lexicon you use to analyze sentiment.