Description

This assignment focuses on sentiment analysis and uses code examples from the following book:

Silge, J. and Robinson, D. (2020). Text Mining with R: A Tidy Approach. Retrieved from https://www.tidytextmining.com.

Overview of Approach

Per the assignment’s instructions, the first part of this assignment runs the code from the above-referenced book in order to see and learn how sentiment analysis in R works. From there, I extend the code and examples with an additional corpus and lexicon to apply that new knowledge.

Examples From the Book

The following code is an annotated version of the examples from the book (see citation above).

# Load tidytext and retrieve the three sentiment lexicons used below
library(tidytext)
get_sentiments("afinn")
## # A tibble: 2,477 x 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # … with 2,467 more rows
get_sentiments("bing")
## # A tibble: 6,786 x 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # … with 6,776 more rows
get_sentiments("nrc")
## # A tibble: 13,901 x 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # … with 13,891 more rows
# Tidy the Jane Austen texts so the most common sentiment words can be found via an inner join
library(janeaustenr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)

tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
                                            ignore_case = TRUE
    )))
  ) %>%
  ungroup() %>%
  unnest_tokens(word, text)

# Start sentiment analysis
nrc_joy <- get_sentiments("nrc") %>%
  filter(sentiment == "joy")

tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 303 x 2
##    word        n
##    <chr>   <int>
##  1 good      359
##  2 young     192
##  3 friend    166
##  4 hope      143
##  5 happy     125
##  6 love      117
##  7 deal       92
##  8 found      92
##  9 present    89
## 10 kind       82
## # … with 293 more rows
# Spread data, calculate a net sentiment
library(tidyr)

jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
# Plot net sentiment scores
library(ggplot2)

ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")

# Compare the three lexicons: first select the text of interest (Pride & Prejudice)
pride_prejudice <- tidy_books %>%
  filter(book == "Pride & Prejudice")

pride_prejudice
## # A tibble: 122,204 x 4
##    book              linenumber chapter word     
##    <fct>                  <int>   <int> <chr>    
##  1 Pride & Prejudice          1       0 pride    
##  2 Pride & Prejudice          1       0 and      
##  3 Pride & Prejudice          1       0 prejudice
##  4 Pride & Prejudice          3       0 by       
##  5 Pride & Prejudice          3       0 jane     
##  6 Pride & Prejudice          3       0 austen   
##  7 Pride & Prejudice          7       1 chapter  
##  8 Pride & Prejudice          7       1 1        
##  9 Pride & Prejudice         10       1 it       
## 10 Pride & Prejudice         10       1 is       
## # … with 122,194 more rows
# Define larger spans of text
afinn <- pride_prejudice %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(index = linenumber %/% 80) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")
## Joining, by = "word"
## `summarise()` ungrouping output (override with `.groups` argument)
bing_and_nrc <- bind_rows(
  pride_prejudice %>%
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  pride_prejudice %>%
    inner_join(get_sentiments("nrc") %>%
                 filter(sentiment %in% c(
                   "positive",
                   "negative"
                 ))) %>%
    mutate(method = "NRC")
) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
# Bind rows together and visualize them
bind_rows(
  afinn,
  bing_and_nrc
) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")

# Look at positive and negative words in lexicons
get_sentiments("nrc") %>%
  filter(sentiment %in% c(
    "positive",
    "negative"
  )) %>%
  count(sentiment)
## # A tibble: 2 x 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   3324
## 2 positive   2312
get_sentiments("bing") %>%
  count(sentiment)
## # A tibble: 2 x 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005
# Find out how much each word contributed to the sentiment
bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
# Show it visually
bing_word_counts
## # A tibble: 2,585 x 3
##    word     sentiment     n
##    <chr>    <chr>     <int>
##  1 miss     negative   1855
##  2 well     positive   1523
##  3 good     positive   1380
##  4 great    positive    981
##  5 like     positive    725
##  6 better   positive    639
##  7 enough   positive    613
##  8 happy    positive    534
##  9 love     positive    495
## 10 pleasure positive    462
## # … with 2,575 more rows
bing_word_counts %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(
    y = "Contribution to sentiment",
    x = NULL
  ) +
  coord_flip()
## Selecting by n

# Add rows to custom stop words list
custom_stop_words <- bind_rows(
  tibble(
    word = c("miss"),
    lexicon = c("custom")
  ),
  stop_words
)

custom_stop_words
## # A tibble: 1,150 x 2
##    word        lexicon
##    <chr>       <chr>  
##  1 miss        custom 
##  2 a           SMART  
##  3 a's         SMART  
##  4 able        SMART  
##  5 about       SMART  
##  6 above       SMART  
##  7 according   SMART  
##  8 accordingly SMART  
##  9 across      SMART  
## 10 actually    SMART  
## # … with 1,140 more rows
# Create a wordcloud
library(wordcloud)
## Loading required package: RColorBrewer
tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"

# Wordcloud using most common positive and negative words
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    colors = c("gray20", "gray80"),
    max.words = 100
  )
## Joining, by = "word"

# Tokenize text into sentences
PandP_sentences <- tibble(text = prideprejudice) %>%
  unnest_tokens(sentence, text, token = "sentences")

PandP_sentences$sentence[2]
## [1] "however little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered the rightful property of some one or other of their daughters."
# Split into data frames by chapter
austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(chapter, text,
                token = "regex",
                pattern = "Chapter|CHAPTER [\\dIVXLC]"
  ) %>%
  ungroup()

austen_chapters %>%
  group_by(book) %>%
  summarise(chapters = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 6 x 2
##   book                chapters
##   <fct>                  <int>
## 1 Sense & Sensibility       51
## 2 Pride & Prejudice         62
## 3 Mansfield Park            49
## 4 Emma                      56
## 5 Northanger Abbey          32
## 6 Persuasion                25
# For each book, find the chapter with the highest proportion of negative words
bingnegative <- get_sentiments("bing") %>%
  filter(sentiment == "negative")

wordcounts <- tidy_books %>%
  group_by(book, chapter) %>%
  summarize(words = n())
## `summarise()` regrouping output by 'book' (override with `.groups` argument)
tidy_books %>%
  semi_join(bingnegative) %>%
  group_by(book, chapter) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts, by = c("book", "chapter")) %>%
  mutate(ratio = negativewords / words) %>%
  filter(chapter != 0) %>%
  top_n(1) %>%
  ungroup()
## Joining, by = "word"
## `summarise()` regrouping output by 'book' (override with `.groups` argument)
## Selecting by ratio
## # A tibble: 6 x 5
##   book                chapter negativewords words  ratio
##   <fct>                 <int>         <int> <int>  <dbl>
## 1 Sense & Sensibility      43           161  3405 0.0473
## 2 Pride & Prejudice        34           111  2104 0.0528
## 3 Mansfield Park           46           173  3685 0.0469
## 4 Emma                     15           151  3340 0.0452
## 5 Northanger Abbey         21           149  2982 0.0500
## 6 Persuasion                4            62  1807 0.0343
# End of code from book examples

Extend the Code

The following section extends the book's code by applying it to a new corpus and a new lexicon.

For the corpus, I used the gutenbergr package to download the text of the U.S. Declaration of Independence.
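As an aside, the Gutenberg ID passed to gutenberg_download() below can be looked up from the package's metadata. The following is a minimal sketch rather than part of the analysis pipeline, and it assumes the catalog title contains the phrase "Declaration of Independence":

# Hypothetical lookup of the Gutenberg ID used below; gutenberg_works() filters
# the package metadata to works with text available for download
library(gutenbergr)
library(dplyr)
library(stringr)
gutenberg_works() %>%
  filter(str_detect(title, "Declaration of Independence")) %>%
  select(gutenberg_id, title)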

For the lexicon, I used the syuzhet package by Matthew Jockers et al. of the Nebraska Literary Lab. The Syuzhet dictionary ranks each term on a scale of -1 to 1.

More information is available at https://www.rdocumentation.org/packages/syuzhet/versions/1.0.4.

Load Lexicon and Corpus

# Load gutenberg library and download the Declaration of Independence
library(gutenbergr)
decl_ind <- gutenberg_download(1)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
# Load the Syuzhet lexicon
library(syuzhet)
syuzhet_lex <- syuzhet::get_sentiment_dictionary()
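As a quick check on the new lexicon (a minimal sketch; it relies on the word and value columns that the joins below also assume), we can inspect its structure and confirm the documented -1 to 1 scale:

# Inspect the Syuzhet dictionary loaded above and confirm its documented -1 to 1 scale
head(syuzhet_lex)
range(syuzhet_lex$value)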

Tidy the Data

# Tidy the data with an added column to indicate row number
decl_ind_tidy <- decl_ind %>%
  mutate(linenumber = row_number())

# Tokenize the data with each word being a token
decl_ind_tidy <- decl_ind_tidy %>% unnest_tokens(word, text)

# Remove the stop words to focus the analysis on more relevant terms
data("stop_words")
decl_ind_tidy <- decl_ind_tidy %>% 
  anti_join(stop_words)
## Joining, by = "word"
# View most common words
decl_ind_tidy %>%
  count(word, sort = TRUE)
## # A tibble: 2,424 x 2
##    word             n
##    <chr>        <int>
##  1 united          78
##  2 people          53
##  3 constitution    45
##  4 law             44
##  5 time            43
##  6 laws            40
##  7 congress        39
##  8 government      38
##  9 president       38
## 10 war             35
## # … with 2,414 more rows
# Visualize with a horizontal bar chart, filtered to words used more than 25 times
decl_ind_tidy %>% 
  count(word, sort = TRUE) %>%
  filter(n > 25) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()

Add Custom Stop Words

In the above chart we can see that ‘etext’ appears erroneously in the results, so I added it to a custom stop words list.

custom_stop_words <- bind_rows(
  tibble(
    word = c("etext"),
    lexicon = c("custom")
  ),
  stop_words
)

# Use the anti-join function again to strip out the custom stop words
decl_ind_tidy <- decl_ind_tidy %>% 
  anti_join(custom_stop_words)
## Joining, by = "word"
# Visualize the data again to verify 'etext' no longer appears
decl_ind_tidy %>% 
  count(word, sort = TRUE) %>%
  filter(n > 25) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip()

Summarize Sentiment

The code below summarizes the sentiment with three separate lexicons (including the new one) and then plots the summaries next to each other for easy comparison.

# Summarize sentiment using syuzhet lexicon
decl_syuzhet_lex <- decl_ind_tidy %>%
  inner_join(syuzhet_lex) %>%
  group_by(index = linenumber %/% 40) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "SYUZHET")
## Joining, by = "word"
## `summarise()` ungrouping output (override with `.groups` argument)
# Summarize sentiment using the nrc and bing lexicons, in groups of 40 lines
decl_bing_and_nrc <- bind_rows(
  decl_ind_tidy%>%
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  decl_ind_tidy %>%
    inner_join(get_sentiments("nrc") %>%
                 filter(sentiment %in% c(
                   "positive",
                   "negative"
                 ))) %>%
    mutate(method = "NRC")
) %>%
  count(method, index = linenumber %/% 40, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
# Combine the results for use in plotting
# Interestingly the results are pretty different
bind_rows(
  decl_syuzhet_lex,
  decl_bing_and_nrc
) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")

Create Wordcloud

The code below creates a wordcloud, excluding stop words (the custom stop words were already removed above).

decl_ind_tidy %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"
## Warning in wordcloud(word, n, max.words = 100): congress could not be fit on
## page. It will not be plotted.

Conclusion

In conclusion, as described in Text Mining with R: A Tidy Approach, we can use a variety of lexicons for sentiment analysis in R to perform various types of analysis on text. The best lexicon depends on the context of the specific challenge or need, as each has its strengths and weaknesses, although the general format and use of each is similar. For the Declaration of Independence I chose Syuzhet because its -1 to 1 scale gives a more granular breakdown of sentiment, which seems more useful for shorter texts such as this one.
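As a rough illustration of that granularity point, the sketch below (which reuses get_sentiments() and the syuzhet_lex object defined above; its output is not reproduced here) compares the distinct score values in the AFINN and Syuzhet lexicons:

# AFINN assigns integer scores from -5 to 5, while the Syuzhet dictionary assigns
# fractional scores between -1 and 1, allowing finer distinctions between terms
sort(unique(get_sentiments("afinn")$value))
length(unique(syuzhet_lex$value))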