Sentiment Analysis

Citation: https://www.tidytextmining.com/sentiment.html

Further citations appear below.

Load packages

library(janeaustenr)
library(dplyr)
library(stringr)

# for unnest_tokens() function
library(tidytext)

# for get_sentiments() function
library(textdata)

# for pivot_wider() function
library(tidyr)

# for ggplot() function
library(ggplot2)

# to plot wordclouds
library(wordcloud)

# for acast() to cast counts to a matrix
library(reshape2)

# for lemmatize_words() function
library(textstem)

# for corpus() function
library(quanteda)

Tidy data

# read data from austen books, create new variables line number and chapter
# convert the text to the tidy format using unnest_tokens()
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(str_detect(text, 
                                regex("^chapter [\\divxlc]", 
                                      ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)

Sentiment analysis using the NRC lexicon and the joy category

# get the words labeled "joy" from the NRC lexicon
nrc_joy <- get_sentiments("nrc") %>% 
  filter(sentiment == "joy")

# filter for words from the book "Emma"
# join with the joy lexicon and count the most common joy words
tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## # A tibble: 301 × 2
##    word          n
##    <chr>     <int>
##  1 good        359
##  2 friend      166
##  3 hope        143
##  4 happy       125
##  5 love        117
##  6 deal         92
##  7 found        92
##  8 present      89
##  9 kind         82
## 10 happiness    76
## # ℹ 291 more rows

Sentiment analysis using the Bing lexicon

# count up how many positive and negative words there are in defined
# 80-line sections of each book
jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>% 
  mutate(sentiment = positive - negative)

# plot the sentiment scores across the plot trajectory of each novel
ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")

Comparing the three sentiment lexicons

AFINN info: http://www2.imm.dtu.dk/pubdb/views/publication_details.php?id=6010

License: Open Database License (ODbL) v1.0
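
AFINN assigns each word an integer score between -5 and 5, while Bing and NRC label words as positive or negative, so the three lexicons are not directly comparable on an absolute scale. A quick look at the two formats (a minimal sketch, not part of the source):

# peek at the two formats: numeric values (AFINN) vs. categorical labels (Bing)
head(get_sentiments("afinn"))
head(get_sentiments("bing"))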

# filter for words from Pride & Prejudice
pride_prejudice <- tidy_books %>% 
  filter(book == "Pride & Prejudice")

# net sentiment per 80-line section using the AFINN lexicon
afinn <- pride_prejudice %>% 
  inner_join(get_sentiments("afinn")) %>% 
  group_by(index = linenumber %/% 80) %>% 
  summarise(sentiment = sum(value)) %>% 
  mutate(method = "AFINN")

# net sentiment per 80-line section using the Bing and NRC lexicons
bing_and_nrc <- bind_rows(
  pride_prejudice %>% 
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  pride_prejudice %>% 
    inner_join(get_sentiments("nrc") %>% 
                 filter(sentiment %in% c("positive", 
                                         "negative"))
    ) %>%
    mutate(method = "NRC")) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment,
              values_from = n,
              values_fill = 0) %>% 
  mutate(sentiment = positive - negative)

#bind them together and visualize them
bind_rows(afinn, 
          bing_and_nrc) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")

Summary:

The three lexicons give results that differ in an absolute sense but have similar relative trajectories through the novel. All three agree roughly on the overall trends in sentiment through the narrative.
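
One reason the absolute values differ is the makeup of the lexicons: both Bing and NRC contain more negative than positive words, but the ratio of negative to positive words is higher in Bing, which pushes its scores lower. A quick check (a minimal sketch, not part of the original analysis):

# compare the balance of positive vs. negative words in the NRC and Bing lexicons
get_sentiments("nrc") %>% 
  filter(sentiment %in% c("positive", "negative")) %>% 
  count(sentiment)

get_sentiments("bing") %>% 
  count(sentiment)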

Most common positive and negative words

# count each word's contribution to each sentiment
bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()


# plot the top contributing words using ggplot2
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>% 
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment",
       y = NULL)

#custom stop-words list using bind_rows()
custom_stop_words <- bind_rows(tibble(word = c("miss"),  
                                      lexicon = c("custom")), 
                               stop_words)
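
The custom list is not applied in the chunk above; a minimal usage sketch (assumed, not from the source) is to use it in place of stop_words, so that "miss" is also dropped:

# hypothetical usage: remove the custom stop words (including "miss") before counting
tidy_books %>%
  anti_join(custom_stop_words) %>%
  count(word, sort = TRUE)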

Wordclouds

# plot the most common words as a wordcloud
tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))

# comparison cloud of positive vs. negative words
tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)

Sentiment of a sentence as a whole

# tokenizing into sentences
p_and_p_sentences <- tibble(text = prideprejudice) %>% 
  unnest_tokens(sentence, text, token = "sentences")
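
To score each sentence as a whole, one option (a sketch assuming the AFINN lexicon; not shown in the source) is to number the sentences, unnest them into words, and sum the values per sentence:

# assumed sketch: per-sentence sentiment by summing AFINN values within each sentence
p_and_p_sentences %>%
  mutate(sentence_id = row_number()) %>%
  unnest_tokens(word, sentence) %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(sentence_id) %>%
  summarise(sentiment = sum(value))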

# unnest by chapter in each book using a regex pattern
austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(chapter, text, token = "regex", 
                pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
  ungroup()

#number of chapters in each novel (plus an “extra” row for each novel title)
austen_chapters %>% 
  group_by(book) %>% 
  summarise(chapters = n())
## # A tibble: 6 × 2
##   book                chapters
##   <fct>                  <int>
## 1 Sense & Sensibility       51
## 2 Pride & Prejudice         62
## 3 Mansfield Park            49
## 4 Emma                      56
## 5 Northanger Abbey          32
## 6 Persuasion                25

# the chapter regex above gave us a tidy data frame with one word per row;
# now get the list of negative words from the Bing lexicon
bingnegative <- get_sentiments("bing") %>% 
  filter(sentiment == "negative")

# count the total words in each chapter of each book
wordcounts <- tidy_books %>%
  group_by(book, chapter) %>%
  summarize(words = n())

#the most negative chapters in each of Jane Austen’s novels
tidy_books %>%
  semi_join(bingnegative) %>%
  group_by(book, chapter) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts, by = c("book", "chapter")) %>%
  mutate(ratio = negativewords/words) %>%
  filter(chapter != 0) %>%
  slice_max(ratio, n = 1) %>% 
  ungroup()
## # A tibble: 6 × 5
##   book                chapter negativewords words  ratio
##   <fct>                 <int>         <int> <int>  <dbl>
## 1 Sense & Sensibility      43           161  3405 0.0473
## 2 Pride & Prejudice        34           111  2104 0.0528
## 3 Mansfield Park           46           173  3685 0.0469
## 4 Emma                     15           151  3340 0.0452
## 5 Northanger Abbey         21           149  2982 0.0500
## 6 Persuasion                4            62  1807 0.0343

Using the Loughran lexicon, lemmatized with the “textstem” package

Name: Loughran-McDonald Sentiment lexicon
URL: https://sraf.nd.edu/textual-analysis/resources/

Using the crude dataset from the tm package.

# load the Loughran-McDonald sentiment lexicon, then lemmatize and deduplicate its words
lou <- get_sentiments("loughran") # 4150 words
lou$word <- lemmatize_words(lou$word)
lou <- lou %>% distinct() # 2456 words
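
Lemmatization collapses inflected forms to a common base form, which is why distinct() shrinks the lexicon. A small illustration (hypothetical example words):

# hypothetical example: inflected forms typically map to a single lemma such as "achieve"
lemmatize_words(c("achieve", "achieved", "achieves", "achieving"))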


lou_positive_negative <- lou %>%
  filter(sentiment %in% c("positive", "negative"))

# load the crude corpus from the tm package
library(tm)
data(crude)

# convert the corpus to a data frame with one row of text per document
crude_df <- data.frame(text = sapply(crude, as.character), stringsAsFactors = FALSE)

# tokenize and join with the lexicon to count positive and negative words
crude_sentiment <- crude_df %>%
  unnest_tokens(word, text) %>%
  inner_join(lou_positive_negative) %>%
  count(word, sentiment, sort = TRUE)

# top positive and negative words in the crude data
crude_sentiment %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>% 
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment",
       y = NULL)

# comparison cloud of the most common positive and negative words
crude_sentiment %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)