Title: CUNY SPS MDS DATA607 Sentiment Analysis

Author: Charles Ugiagbe

Date: 10/31/2021

library(tidytext)
library(tidyverse)
library(stringr)
library(textdata)

Part 1: Example code from the textbook

The sentiments dataset

get_sentiments("afinn")
## # A tibble: 2,477 x 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # ... with 2,467 more rows
get_sentiments("bing")
## # A tibble: 6,786 x 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # ... with 6,776 more rows
get_sentiments("nrc")
## # A tibble: 13,875 x 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # ... with 13,865 more rows

Sentiment analysis with inner join

library(janeaustenr)
library(dplyr)
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
      ignore_case = TRUE
    )))
  ) %>%
  ungroup() %>%
  unnest_tokens(word, text)
nrc_joy <- get_sentiments("nrc") %>%
  filter(sentiment == "joy")
tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## # A tibble: 301 x 2
##    word          n
##    <chr>     <int>
##  1 good        359
##  2 friend      166
##  3 hope        143
##  4 happy       125
##  5 love        117
##  6 deal         92
##  7 found        92
##  8 present      89
##  9 kind         82
## 10 happiness    76
## # ... with 291 more rows
library(tidyr)
jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
library(ggplot2)
ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")

Comparing the three sentiment dictionaries

pride_prejudice <- tidy_books %>%
  filter(book == "Pride & Prejudice")
pride_prejudice
## # A tibble: 122,204 x 4
##    book              linenumber chapter word     
##    <fct>                  <int>   <int> <chr>    
##  1 Pride & Prejudice          1       0 pride    
##  2 Pride & Prejudice          1       0 and      
##  3 Pride & Prejudice          1       0 prejudice
##  4 Pride & Prejudice          3       0 by       
##  5 Pride & Prejudice          3       0 jane     
##  6 Pride & Prejudice          3       0 austen   
##  7 Pride & Prejudice          7       1 chapter  
##  8 Pride & Prejudice          7       1 1        
##  9 Pride & Prejudice         10       1 it       
## 10 Pride & Prejudice         10       1 is       
## # ... with 122,194 more rows
afinn <- pride_prejudice %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(index = linenumber %/% 80) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")
bing_and_nrc <- bind_rows(
  pride_prejudice %>%
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  pride_prejudice %>%
    inner_join(get_sentiments("nrc") %>%
      filter(sentiment %in% c(
        "positive",
        "negative"
      ))) %>%
    mutate(method = "NRC")
) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
bind_rows(
  afinn,
  bing_and_nrc
) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")

get_sentiments("nrc") %>%
  filter(sentiment %in% c(
    "positive",
    "negative"
  )) %>%
  count(sentiment)
## # A tibble: 2 x 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   3318
## 2 positive   2308
get_sentiments("bing") %>%
  count(sentiment)
## # A tibble: 2 x 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005

Most common positive and negative words

bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
bing_word_counts
## # A tibble: 2,585 x 3
##    word     sentiment     n
##    <chr>    <chr>     <int>
##  1 miss     negative   1855
##  2 well     positive   1523
##  3 good     positive   1380
##  4 great    positive    981
##  5 like     positive    725
##  6 better   positive    639
##  7 enough   positive    613
##  8 happy    positive    534
##  9 love     positive    495
## 10 pleasure positive    462
## # ... with 2,575 more rows
bing_word_counts %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(
    y = "Contribution to sentiment",
    x = NULL
  ) +
  coord_flip()

custom_stop_words <- bind_rows(
  tibble(
    word = c("miss"),
    lexicon = c("custom")
  ),
  stop_words
)
custom_stop_words
## # A tibble: 1,150 x 2
##    word        lexicon
##    <chr>       <chr>  
##  1 miss        custom 
##  2 a           SMART  
##  3 a's         SMART  
##  4 able        SMART  
##  5 about       SMART  
##  6 above       SMART  
##  7 according   SMART  
##  8 accordingly SMART  
##  9 across      SMART  
## 10 actually    SMART  
## # ... with 1,140 more rows

Wordclouds

library(wordcloud)
tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))

library(reshape2)
tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    colors = c("gray20", "gray80"),
    max.words = 100
  )

Source

Silge, J., & Robinson, D. (2017). Text mining with R: A tidy approach. Sebastopol, CA: O’Reilly.

Chapter 2: Sentiment Analysis with Tidy Data

See: www.tidytextmining.com/sentiment.html

Part 2: My Chosen Corpus

Sentiment Analysis of “The Black Experience in America”, a book written by Norman Coombs.

We are going to analyze the book “The Black Experience in America” by Norman Coombs, which is available through the gutenbergr package. The book gives an interpretive insight into the history, struggle, and emancipation of Black people in America. It also discusses the migration of Black people from Africa and the variety of rich contributions they have made to America.

Source: THE BLACK EXPERIENCE IN AMERICA
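Before downloading by ID, the book can also be located in the package metadata. Here is a minimal sketch, assuming the title recorded in gutenberg_metadata contains the phrase "Black Experience" (the exact search string is an assumption):

library(gutenbergr)
library(dplyr)
library(stringr)
# List works whose title mentions "Black Experience", with their IDs
gutenberg_works(str_detect(title, "Black Experience")) %>%
  select(gutenberg_id, title, author)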

Loading the package and tidying the dataset

library(gutenbergr)
# Download book id 67, "THE BLACK EXPERIENCE IN AMERICA"
norman_book <- gutenberg_download(67)
norman_book
## # A tibble: 7,967 x 2
##    gutenberg_id text                                                            
##           <int> <chr>                                                           
##  1           67 "THE BLACK EXPERIENCE IN AMERICA"                               
##  2           67 ""                                                              
##  3           67 "Published electronically by its author, Norman Coombs, and Pro~
##  4           67 "Gutenberg."                                                    
##  5           67 ""                                                              
##  6           67 "(C 1993) by Norman Coombs"                                     
##  7           67 ""                                                              
##  8           67 ""                                                              
##  9           67 "This text is claimed under copyright to protect its integrity,~
## 10           67 "therefore you are required to pass it on intact, but you may m~
## # ... with 7,957 more rows
# Restructure to one-token-per-row and remove stop words
norman_book_tidy <- norman_book %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words)
norman_book_tidy 
## # A tibble: 34,141 x 2
##    gutenberg_id word          
##           <int> <chr>         
##  1           67 black         
##  2           67 experience    
##  3           67 america       
##  4           67 published     
##  5           67 electronically
##  6           67 author        
##  7           67 norman        
##  8           67 coombs        
##  9           67 project       
## 10           67 gutenberg     
## # ... with 34,131 more rows

Net Sentiment analysis across the book per chapter

# Drop blank lines, then add line numbers and chapter numbers
norman_book_chapters <- norman_book %>% 
  filter(text != "") %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(str_detect(text, regex("(Chapter )([\\divxlc])",
      ignore_case = TRUE
    )))
  )
norman_book_chapters
## # A tibble: 7,132 x 4
##    gutenberg_id text                                          linenumber chapter
##           <int> <chr>                                              <int>   <int>
##  1           67 THE BLACK EXPERIENCE IN AMERICA                        1       0
##  2           67 Published electronically by its author, Norm~          2       0
##  3           67 Gutenberg.                                             3       0
##  4           67 (C 1993) by Norman Coombs                              4       0
##  5           67 This text is claimed under copyright to prot~          5       0
##  6           67 therefore you are required to pass it on int~          6       0
##  7           67 to your own copy.  This text may be shared i~          7       0
##  8           67 this header is included.  It may be quoted f~          8       0
##  9           67 authorship is properly credited.  As the boo~          9       0
## 10           67 has chosen to make it freely available.               10       0
## # ... with 7,122 more rows

Tidying by tokenizing and joining the afinn lexicon

# Tokenize norman_book_chapters and join the afinn lexicon
norman_book_chapters_tidy <- norman_book_chapters %>%
  unnest_tokens(word, text) %>%
  inner_join(get_sentiments("afinn"))

Sentiment analysis across the book

norman_books_rows_plot <- norman_book_chapters_tidy %>%
  inner_join(get_sentiments("bing")) %>%
  count(index = linenumber %/% 20, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
ggplot(norman_books_rows_plot, aes(index, sentiment)) +
  geom_col(fill = "red", show.legend = FALSE) +
  labs(title = "Net Sentiment across the book")

Across the 12 chapters contained in the book, we can see that the sentiment varies. Next we look at the net sentiment per chapter and then the overall sentiment of the book.

# Group by chapter and sum the AFINN values
norman_book_chapters_plot <- norman_book_chapters_tidy %>%
  select(chapter, value) %>%
  group_by(chapter) %>%
  summarize(total_sentiment = sum(value))
# Plot
norman_book_chapters_plot %>%
  ggplot(aes(chapter, total_sentiment)) +
  geom_col(fill = "purple") +
  xlab("Index - chapter") +
  ylab("Net Sentiment") +
  labs(title = "Net Sentiment across the book per chapter")

From the graph above we can see that most chapters have a net negative sentiment, with only a few net positive ones; the last chapter has the most negative sentiment, while chapter 2 is the most positive.
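As a quick check on this reading, the extreme chapters can be pulled directly from norman_book_chapters_plot built above. A minimal sketch (slice_min()/slice_max() assume dplyr 1.0 or later):

library(dplyr)
# Chapter with the most negative net sentiment
norman_book_chapters_plot %>%
  slice_min(total_sentiment, n = 1)
# Chapter with the most positive net sentiment
norman_book_chapters_plot %>%
  slice_max(total_sentiment, n = 1)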

Overall sentiment

Let's take a look at the overall sentiment of the entire book using the bing lexicon:

# Get "bing" lexicon for this analysis
norman_book_overall_sentiment <- norman_book %>% 
  unnest_tokens(word, text) %>%
  inner_join(get_sentiments("bing")) %>% 
  count(sentiment) %>%
  mutate(total = n / sum(n))
# Plot
ggplot(norman_book_overall_sentiment) + 
  
  aes(x = sentiment, y = total) + 
  geom_col(fill = "blue") + 
  
  xlab("Sentiment") +
  ylab("Percent") + 
 
  labs(title = "Overall Sentiment") + 
  
  geom_text(aes(label = round(total * 100, 2) , vjust = -.4))

From the plot, it is clear that negative words contribute more to the overall sentiment than positive words.
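To put a number on that gap, the proportions in norman_book_overall_sentiment from the previous chunk can be reshaped and differenced. A small sketch using tidyr's pivot_wider():

library(dplyr)
library(tidyr)
# Spread the positive/negative shares into columns and compute the difference
norman_book_overall_sentiment %>%
  select(sentiment, total) %>%
  pivot_wider(names_from = sentiment, values_from = total) %>%
  mutate(gap = negative - positive)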

Let's now plot the most positive and most negative words, again using the bing lexicon:

Most positive words

norman_book %>%
  unnest_tokens(word, text) %>%
  inner_join(get_sentiments("bing")) %>% 
  filter(sentiment == "positive") %>%
  count(word, sentiment, sort = TRUE) %>% 
  top_n(15) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot() + 
  aes(x = word, y = n) +
  labs(title = "Most Positive Words") + 
  ylab("Contribution to sentiment") + 
  xlab("Word") +
  geom_col(fill = "purple") +
  coord_flip()

Most negative words

norman_book %>%
  unnest_tokens(word, text) %>%
  inner_join(get_sentiments("bing")) %>% 
  filter(sentiment == "negative") %>%
  count(word, sentiment, sort = TRUE) %>% 
  top_n(15) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot() + 
  aes(x = word, y = n) +
  labs(title = "Most Negative Words") + 
  ylab("Contribution to sentiment") + 
  xlab("Word") +
  geom_col(fill = "red") +
  coord_flip()

library(wordcloud)
# Word cloud of the most frequent words (stop words were already removed above)
norman_book_tidy %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))

Sentiment Analysis with the Loughran-McDonald sentiment lexicon

# Get loughran

sentiment <- get_sentiments("loughran")

Negative and Positive words

We want to compare the positive and negative words identified by the loughran lexicon with those identified by the nrc lexicon.

norman_book_chapters %>% 
  unnest_tokens(word, text) %>% 
  inner_join(get_sentiments("loughran")) %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  count(word, sentiment, sort = TRUE) %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ggplot() + 
  aes(x = reorder(word, desc(n)), y = n) + 
  geom_col(fill = "violet") +
  geom_text(aes(label = n, vjust = .4)) + 
  labs(title = "Negative and Positive words") +
  facet_wrap(~sentiment, ncol = 1, scales = "free_x") +
  xlab("Word") + 
  ylab("Count") 

Words associated with positive and negative emotions using the nrc lexicon

This lets us compare how the two lexicons classify words.

norman_book_chapters %>% 
  unnest_tokens(word, text) %>% 
  inner_join(get_sentiments("nrc")) %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  count(word, sentiment, sort = TRUE) %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ggplot() + 
  aes(x = reorder(word, desc(n)), y = n) + 
  geom_col(fill = "violet") +
  geom_text(aes(label = n, vjust = 0.4)) + 
  labs(title = "Negative and Positive words") +
  facet_wrap(~sentiment, ncol = 1, scales = "free_x") +
  xlab("Word") + 
  ylab("Count") 

Findings

Looking at the last two graphs, we can see that the two sentiment lexicons do not classify words in the same way, even when the emotion is the same. For example, the most frequent "positive" words under the loughran lexicon differ from those under the nrc lexicon, and the same holds for the negative words. Thus, the choice of sentiment lexicon should depend on the specific aspects of the text we want the analysis to capture.
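One way to see this disagreement directly is to join the two lexicons on word and cross-tabulate their labels. A minimal sketch, keeping only the positive/negative categories of each lexicon (note that nrc can assign more than one sentiment to a word, so counts are per word-label pair):

library(dplyr)
library(tidytext)
loughran_pn <- get_sentiments("loughran") %>%
  filter(sentiment %in% c("positive", "negative"))
nrc_pn <- get_sentiments("nrc") %>%
  filter(sentiment %in% c("positive", "negative"))
# Words present in both lexicons, with each lexicon's label
inner_join(loughran_pn, nrc_pn, by = "word",
           suffix = c("_loughran", "_nrc")) %>%
  count(sentiment_loughran, sentiment_nrc)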