Week 10 assignment

Overview: Practice with sentiment analysis and the tidyverse

I am going to start by getting the primary example code from Chapter 2: Sentiment analysis with tidy data from Text Mining with R: A Tidy Approach.

The sentiments datasets

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(tidytext)

#get AFINN-111 sentiments from http://www2.imm.dtu.dk/pubdb/views/publication_details.php?id=6010 
#assigns words with a score that runs between -5 and 5, with negative scores indicating negative sentiment and positive scores indicating positive sentiment
get_sentiments("afinn")

## # A tibble: 2,477 × 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # ℹ 2,467 more rows

#get bing sentiments from https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html
#categorizes words in a binary fashion into positive and negative categories
get_sentiments("bing")

## # A tibble: 6,786 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # ℹ 6,776 more rows

#get NRC Word-Emotion Association Lexicon from http://saifmohammad.com/WebPages/lexicons.html
#categorizes words in a binary fashion ("yes"/"no") into categories of positive, negative, anger, anticipation, disgust, fear, joy, sadness, surprise, and trust
get_sentiments("nrc")

## # A tibble: 13,872 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # ℹ 13,862 more rows

Sentiment analysis with inner join

library(janeaustenr)
library(stringr)

# Tokenise the Austen novels: one word per row, with every word tagged by the
# line it came from (numbered within its own book) and a running chapter count
tidy_books <- austen_books() %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(
      str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE))
    ),
    .by = book
  ) %>%
  unnest_tokens(word, text)

# keep only the NRC entries whose sentiment label is "joy"
nrc_joy <- filter(get_sentiments("nrc"), sentiment == "joy")

# sentiment analysis as an inner join: keep the words of Emma that also appear
# in the NRC joy list, then tally them with the most frequent first
tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word) %>%
  arrange(desc(n))

## Joining with `by = join_by(word)`

## # A tibble: 301 × 2
##    word          n
##    <chr>     <int>
##  1 good        359
##  2 friend      166
##  3 hope        143
##  4 happy       125
##  5 love        117
##  6 deal         92
##  7 found        92
##  8 present      89
##  9 kind         82
## 10 happiness    76
## # ℹ 291 more rows

# Net Bing sentiment for each 80-line section of every Austen novel: tally the
# positive and negative words per section, use pivot_wider() to put the two
# tallies in separate columns, then compute net sentiment (positive - negative)
jane_austen_sentiment <- tidy_books %>%
  inner_join(
    get_sentiments("bing"),
    by = "word",
    # some Bing words occupy more than one lexicon row, so several matches per
    # word are expected here; declaring that avoids the many-to-many warning
    relationship = "many-to-many"
  ) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)

## Joining with `by = join_by(word)`

## Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 435434 of `x` matches multiple rows in `y`.
## ℹ Row 5051 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

# one panel per novel: net sentiment of each section along the story
jane_austen_sentiment %>%
  ggplot(aes(x = index, y = sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(book), ncol = 2, scales = "free_x")

Comparing the three sentiment dictionaries

# pull out the tokens that belong to Pride & Prejudice
pride_prejudice <- filter(tidy_books, book == "Pride & Prejudice")

# AFINN scores are numeric, so the net sentiment of a section is simply the sum
# of its word scores; integer division (%/%) groups lines into 80-line sections
afinn <- pride_prejudice %>%
  inner_join(get_sentiments("afinn")) %>%
  mutate(index = linenumber %/% 80) %>%
  group_by(index) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")

## Joining with `by = join_by(word)`

# Net sentiment per 80-line section of Pride & Prejudice for the two
# categorical lexicons (NRC restricted to its positive/negative labels):
# tag each join result with its method, stack them, then count and difference
bing_and_nrc <- bind_rows(
  mutate(
    inner_join(pride_prejudice, get_sentiments("bing")),
    method = "Bing et al."
  ),
  mutate(
    inner_join(
      pride_prejudice,
      filter(get_sentiments("nrc"), sentiment %in% c("positive", "negative"))
    ),
    method = "NRC"
  )
) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)

## Joining with `by = join_by(word)`
## Joining with `by = join_by(word)`

## Warning in inner_join(., get_sentiments("nrc") %>% filter(sentiment %in% : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 215 of `x` matches multiple rows in `y`.
## ℹ Row 5178 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

# stack the three methods and draw one panel per lexicon, each with its own
# y scale so the differently-sized scores stay readable
bind_rows(afinn, bing_and_nrc) %>%
  ggplot(aes(x = index, y = sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(method), ncol = 1, scales = "free_y")

# how many NRC entries are labelled positive vs. negative
get_sentiments("nrc") %>%
  filter(sentiment == "positive" | sentiment == "negative") %>%
  count(sentiment)

## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   3316
## 2 positive   2308

# how many Bing entries are labelled positive vs. negative
count(get_sentiments("bing"), sentiment)

## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005

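The result for the NRC lexicon is biased higher in sentiment than the Bing et al. result. One reason is that, although both lexicons contain more negative than positive words, the ratio of negative to positive words is much higher in the Bing lexicon (4,781 to 2,005) than in the NRC lexicon (3,316 to 2,308).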

Most common positive and negative words

# How often each Bing-lexicon word occurs across all the Austen novels,
# together with its sentiment label, most frequent first.
# count() on ungrouped input already returns an ungrouped tibble, so no
# trailing ungroup() is needed.
bing_word_counts <- tidy_books %>%
  inner_join(
    get_sentiments("bing"),
    by = "word",
    # some Bing words occupy more than one lexicon row, so several matches per
    # word are expected here; declaring that avoids the many-to-many warning
    relationship = "many-to-many"
  ) %>%
  count(word, sentiment, sort = TRUE)

## Joining with `by = join_by(word)`

## Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 435434 of `x` matches multiple rows in `y`.
## ℹ Row 5051 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

bing_word_counts

## # A tibble: 2,585 × 3
##    word     sentiment     n
##    <chr>    <chr>     <int>
##  1 miss     negative   1855
##  2 well     positive   1523
##  3 good     positive   1380
##  4 great    positive    981
##  5 like     positive    725
##  6 better   positive    639
##  7 enough   positive    613
##  8 happy    positive    534
##  9 love     positive    495
## 10 pleasure positive    462
## # ℹ 2,575 more rows

# bar chart of the ten most frequent words within each sentiment, one panel
# per sentiment, with the bars ordered by count
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = n, y = word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(sentiment), scales = "free_y") +
  labs(x = "Contribution to sentiment", y = NULL)

Expand code

I am introducing a sentiment lexicon from Kaggle’s Sentiment Analysis Word Lists Dataset (‘https://www.kaggle.com/datasets/prajwalkanade/sentiment-analysis-word-lists-dataset?resource=download’).

# create a positive words data frame: the file has no header row, so its first
# column arrives as V1 and is renamed to `word` to match the tidy text column
positive <- rename(
  read.delim(
    "https://raw.githubusercontent.com/evelynbartley/Data-607/main/positive-words.txt",
    header = FALSE
  ),
  word = V1
)

# create a negative words data frame in the same way
negative <- rename(
  read.delim(
    "https://raw.githubusercontent.com/evelynbartley/Data-607/main/negative-words.txt",
    header = FALSE
  ),
  word = V1
)

I am using the gutenbergr package (‘https://docs.ropensci.org/gutenbergr/’) to access free ebooks. I downloaded War and Peace with gutenberg_download() and War and Peace’s id number.

library(gutenbergr)

# look up War and Peace in the gutenbergr metadata to find its gutenberg_id
filter(gutenberg_works(), title == "War and Peace")

## # A tibble: 1 × 8
##   gutenberg_id title     author gutenberg_author_id language gutenberg_bookshelf
##          <int> <chr>     <chr>                <int> <chr>    <chr>              
## 1         2600 War and … Tolst…                 136 en       Napoleonic(Bookshe…
## # ℹ 2 more variables: rights <chr>, has_text <lgl>

# download the full text; 2600 is the gutenberg_id returned for War and Peace
# by the gutenberg_works() lookup
WarandPeace <- gutenberg_download(2600)

## Determining mirror for Project Gutenberg from https://www.gutenberg.org/robot/harvest

## Using mirror http://aleph.gutenberg.org

Tidy!

# make text tidy: one word per row, keeping for each word the line it came
# from and a running count of chapter headings seen so far.
# The data is never grouped here, so no ungroup() step is needed.
WarandPeace_tidy <- WarandPeace %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(
      str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE))
    )
  ) %>%
  unnest_tokens(word, text)

Get rid of stop words

# load the stop_words dataset and drop every token that appears in it
data(stop_words)
WarandPeace_tidy <- anti_join(WarandPeace_tidy, stop_words)

## Joining with `by = join_by(word)`

Perform the sentiment analysis

# tally the War and Peace words found in the positive list, most common first
WarandPeace_tidy %>%
  inner_join(positive) %>%
  count(word) %>%
  arrange(desc(n))

## Joining with `by = join_by(word)`

## # A tibble: 1,007 × 2
##    word          n
##    <chr>     <int>
##  1 love        456
##  2 smile       416
##  3 happy       194
##  4 ready       165
##  5 silent      164
##  6 honor       157
##  7 free        151
##  8 smiling     145
##  9 glad        129
## 10 happiness   129
## # ℹ 997 more rows

# tally the War and Peace words found in the negative list, most common first
WarandPeace_tidy %>%
  inner_join(negative) %>%
  count(word) %>%
  arrange(desc(n))

## Joining with `by = join_by(word)`

## # A tibble: 1,878 × 2
##    word           n
##    <chr>      <int>
##  1 enemy        207
##  2 death        188
##  3 impossible   178
##  4 strange      159
##  5 terrible     157
##  6 afraid       148
##  7 lost         140
##  8 cold         138
##  9 killed       129
## 10 fell         125
## # ℹ 1,868 more rows

# keep only the NRC entries whose sentiment label is "disgust"
nrc_disgust <- filter(get_sentiments("nrc"), sentiment == "disgust")

# inner join against the disgust list, then tally the matching War and Peace
# words with the most common first
count(
  inner_join(WarandPeace_tidy, nrc_disgust),
  word,
  sort = TRUE
)

## Joining with `by = join_by(word)`

## # A tibble: 513 × 2
##    word          n
##    <chr>     <int>
##  1 feeling     320
##  2 enemy       207
##  3 death       188
##  4 terrible    157
##  5 bad         124
##  6 angry       109
##  7 gray         86
##  8 lying        82
##  9 ill          79
## 10 suffering    73
## # ℹ 503 more rows

I think it's funny that the nrc lexicon includes “feeling” as a word of disgust. Feelings are disgusting!

# keep only the NRC entries whose sentiment label is "sadness"
nrc_sadness <- filter(get_sentiments("nrc"), sentiment == "sadness")

# inner join against the sadness list, then tally the matching War and Peace
# words with the most common first
count(
  inner_join(WarandPeace_tidy, nrc_sadness),
  word,
  sort = TRUE
)

## Joining with `by = join_by(word)`

## # A tibble: 678 × 2
##    word           n
##    <chr>      <int>
##  1 feeling      320
##  2 leave        210
##  3 mother       204
##  4 death        188
##  5 soldier      187
##  6 impossible   178
##  7 terrible     157
##  8 lost         140
##  9 fell         125
## 10 bad          124
## # ℹ 668 more rows

# Net Bing sentiment for each 50-line section of War and Peace: tally the
# positive and negative words per section, use pivot_wider() to put the two
# tallies in separate columns, then compute net sentiment (positive - negative)
WarandPeace_sentiment_50 <- WarandPeace_tidy %>%
  inner_join(
    get_sentiments("bing"),
    by = "word",
    # some Bing words occupy more than one lexicon row, so several matches per
    # word are expected here; declaring that avoids the many-to-many warning
    relationship = "many-to-many"
  ) %>%
  count(index = linenumber %/% 50, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)

## Joining with `by = join_by(word)`

## Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 10888 of `x` matches multiple rows in `y`.
## ℹ Row 5873 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

# plot the net sentiment of each 50-line section across War and Peace
WarandPeace_sentiment_50 %>%
  ggplot(aes(x = index, y = sentiment)) +
  geom_col()

The plot shows that there are more overall negative sentiment sections than positive sentiment sections.

# Net Bing sentiment for each 200-line section of War and Peace: tally the
# positive and negative words per section, use pivot_wider() to put the two
# tallies in separate columns, then compute net sentiment (positive - negative)
WarandPeace_sentiment_200 <- WarandPeace_tidy %>%
  inner_join(
    get_sentiments("bing"),
    by = "word",
    # some Bing words occupy more than one lexicon row, so several matches per
    # word are expected here; declaring that avoids the many-to-many warning
    relationship = "many-to-many"
  ) %>%
  count(index = linenumber %/% 200, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)

## Joining with `by = join_by(word)`

## Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 10888 of `x` matches multiple rows in `y`.
## ℹ Row 5873 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

# plot the net sentiment of each 200-line section across War and Peace
WarandPeace_sentiment_200 %>%
  ggplot(aes(x = index, y = sentiment)) +
  geom_col()

After changing the section size from 50 to 200 lines, I still see many more negative sections in the book than positive ones.

I want to combine my Kaggle sentiment lexicons into one data frame.

# label each Kaggle word list with its polarity, then stack the two lists
# into a single lexicon
positive <- mutate(positive, sentiment = "positive")
negative <- mutate(negative, sentiment = "negative")
kaggle_sentiments <- rbind(positive, negative)

# number of words carrying each sentiment label in the combined Kaggle lexicon
count(kaggle_sentiments, sentiment)

##   sentiment    n
## 1  negative 4783
## 2  positive 2006

There are more than twice as many negative words as positive words.

# find out how much each word contributed to each sentiment: count every
# (word, sentiment) pair that War and Peace shares with the Kaggle lexicon,
# most frequent first.
# count() on ungrouped input already returns an ungrouped tibble, so no
# trailing ungroup() is needed.
kaggle_word_counts <- WarandPeace_tidy %>%
  inner_join(
    kaggle_sentiments,
    by = "word",
    # some words occupy more than one row of the combined lexicon, so several
    # matches per word are expected; declaring that avoids the join warning
    relationship = "many-to-many"
  ) %>%
  count(word, sentiment, sort = TRUE)

## Joining with `by = join_by(word)`

## Warning in inner_join(., kaggle_sentiments): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 10888 of `x` matches multiple rows in `y`.
## ℹ Row 6152 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

kaggle_word_counts

## # A tibble: 2,885 × 3
##    word       sentiment     n
##    <chr>      <chr>     <int>
##  1 love       positive    456
##  2 smile      positive    416
##  3 enemy      negative    207
##  4 happy      positive    194
##  5 death      negative    188
##  6 impossible negative    178
##  7 ready      positive    165
##  8 silent     positive    164
##  9 strange    negative    159
## 10 honor      positive    157
## # ℹ 2,875 more rows

# bar chart of the ten most frequent words within each sentiment, one panel
# per sentiment, with the bars ordered by count
kaggle_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = n, y = word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(sentiment), scales = "free_y") +
  labs(x = "Contribution to sentiment", y = NULL)

“Love” contributed most to sentiment.

Wordclouds!

library(wordcloud)

## Loading required package: RColorBrewer

# Word cloud of War and Peace word frequencies, capped at 100 words.
# WarandPeace_tidy already had stop_words removed when it was tidied, so a
# second anti_join(stop_words) would be a no-op and is omitted.
WarandPeace_tidy %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))

## Joining with `by = join_by(word)`

library(reshape2)

## 
## Attaching package: 'reshape2'

## The following object is masked from 'package:tidyr':
## 
##     smiths

# Comparison cloud of negative vs. positive words in War and Peace.
# kaggle_word_counts already holds the per-word, per-sentiment counts from the
# join against kaggle_sentiments, so reuse it instead of repeating the join;
# acast() reshapes it into a word-by-sentiment matrix (0 where a word never
# occurs with that sentiment), which is the input comparison.cloud() expects.
kaggle_word_counts %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray80", "gray20"),
                   max.words = 100)

## Joining with `by = join_by(word)`

## Warning in inner_join(., kaggle_sentiments): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 10888 of `x` matches multiple rows in `y`.
## ℹ Row 6152 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

## Conclusions

It was interesting to do this sentiment analysis on a book I have only ever heard about. I wonder if I would agree or disagree that the sentiment of War and Peace is mostly negative. Probably.

I liked this assignment because it introduced a field of data science that can be biased like humans are biased. It's interesting that the analysis of sentiment can change based on the lexicon you use to analyze sentiment.

Week 10 assignment - 607

Evelyn Bartley

2024-03-27