Introduction:

In Text Mining with R, Chapter 2 looks at Sentiment Analysis. In this assignment, you should start by getting the primary example code from chapter 2 working in an R Markdown document. You should provide a citation to this base code. You’re then asked to extend the code in two ways: Work with a different corpus of your choosing, and incorporate at least one additional sentiment lexicon (possibly from another R package that you’ve found through research).

The base code for this assignment is from chapter 2 of Text Mining with R: A Tidy Approach https://www.tidytextmining.com/sentiment.html

library(janeaustenr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(tidytext)
library(tidyr)
library(ggplot2)
library(sentimentr)

Sentiment analysis with inner join

tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(str_detect(text, 
                                regex("^chapter [\\divxlc]", 
                                      ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)

glimpse(tidy_books)
## Rows: 725,055
## Columns: 4
## $ book       <fct> Sense & Sensibility, Sense & Sensibility, Sense & Sensibili…
## $ linenumber <int> 1, 1, 1, 3, 3, 3, 5, 10, 10, 13, 13, 13, 13, 13, 13, 13, 13…
## $ chapter    <int> 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ word       <chr> "sense", "and", "sensibility", "by", "jane", "austen", "181…
nrc_joy <- get_sentiments("nrc") %>% 
  filter(sentiment == "joy")

tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## Joining with `by = join_by(word)`
## # A tibble: 301 × 2
##    word          n
##    <chr>     <int>
##  1 good        359
##  2 friend      166
##  3 hope        143
##  4 happy       125
##  5 love        117
##  6 deal         92
##  7 found        92
##  8 present      89
##  9 kind         82
## 10 happiness    76
## # ℹ 291 more rows
library(tidyr)

jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>% 
  mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 435434 of `x` matches multiple rows in `y`.
## ℹ Row 5051 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")

pride_prejudice <- tidy_books %>% 
  filter(book == "Pride & Prejudice")
afinn <- pride_prejudice %>% 
  inner_join(get_sentiments("afinn")) %>% 
  group_by(index = linenumber %/% 80) %>% 
  summarise(sentiment = sum(value)) %>% 
  mutate(method = "AFINN")
## Joining with `by = join_by(word)`
bing_and_nrc <- bind_rows(
  pride_prejudice %>% 
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  pride_prejudice %>% 
    inner_join(get_sentiments("nrc") %>% 
                 filter(sentiment %in% c("positive", 
                                         "negative"))
    ) %>%
    mutate(method = "NRC")) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment,
              values_from = n,
              values_fill = 0) %>% 
  mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("nrc") %>% filter(sentiment %in% : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 215 of `x` matches multiple rows in `y`.
## ℹ Row 5178 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
bind_rows(afinn, 
          bing_and_nrc) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")

get_sentiments("nrc") %>% 
  filter(sentiment %in% c("positive", "negative")) %>% 
  count(sentiment)
## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   3316
## 2 positive   2308
get_sentiments("bing") %>% 
  count(sentiment)
## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005

Most common positive and negative words

bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 435434 of `x` matches multiple rows in `y`.
## ℹ Row 5051 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
bing_word_counts
## # A tibble: 2,585 × 3
##    word     sentiment     n
##    <chr>    <chr>     <int>
##  1 miss     negative   1855
##  2 well     positive   1523
##  3 good     positive   1380
##  4 great    positive    981
##  5 like     positive    725
##  6 better   positive    639
##  7 enough   positive    613
##  8 happy    positive    534
##  9 love     positive    495
## 10 pleasure positive    462
## # ℹ 2,575 more rows
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>% 
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment",
       y = NULL)

# New Data Set

library(gutenbergr)
gutenberg_metadata
## # A tibble: 72,569 × 8
##    gutenberg_id title    author gutenberg_author_id language gutenberg_bookshelf
##           <int> <chr>    <chr>                <int> <chr>    <chr>              
##  1            1 "The De… Jeffe…                1638 en       "Politics/American…
##  2            2 "The Un… Unite…                   1 en       "Politics/American…
##  3            3 "John F… Kenne…                1666 en       ""                 
##  4            4 "Lincol… Linco…                   3 en       "US Civil War"     
##  5            5 "The Un… Unite…                   1 en       "United States/Pol…
##  6            6 "Give M… Henry…                   4 en       "American Revoluti…
##  7            7 "The Ma… <NA>                    NA en       ""                 
##  8            8 "Abraha… Linco…                   3 en       "US Civil War"     
##  9            9 "Abraha… Linco…                   3 en       "US Civil War"     
## 10           10 "The Ki… <NA>                    NA en       "Banned Books List…
## # ℹ 72,559 more rows
## # ℹ 2 more variables: rights <chr>, has_text <lgl>

Using gutenberg library, I was curious if they had one of my favorite books, so I used gutenberg_metadata and grep function to look for the title

# Search for the book by title
book_title <- "Strange Case of Dr. Jekyll and Mr. Hyde"
drj_info <- gutenberg_metadata[grep(book_title, gutenberg_metadata$title), ] #use partial match with grep function

print(drj_info)
## # A tibble: 2 × 8
##   gutenberg_id title     author gutenberg_author_id language gutenberg_bookshelf
##          <int> <chr>     <chr>                <int> <chr>    <chr>              
## 1           42 The Stra… Steve…                  35 en       Precursors of Scie…
## 2           43 The Stra… Steve…                  35 en       Precursors of Scie…
## # ℹ 2 more variables: rights <chr>, has_text <lgl>
DrJekyll <- gutenberg_download(42)
## Determining mirror for Project Gutenberg from https://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org

Tidy Data

tidy_DrJekyll <- DrJekyll %>%
  mutate(linenumber = row_number(), chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE)))) %>%
  unnest_tokens(word, text)

Analysis

Let’s compare the positive and negative words in bing lexicon and the loughran lexicon

#filtering the loughran lexicon only for positive an negative words
loughran_posneg <- get_sentiments("loughran") %>% 
  filter(sentiment == "positive" | sentiment =="negative")

#creating net sentiment using bing
DrJekyll_bing <- tidy_DrJekyll %>%
  inner_join(get_sentiments("bing")) %>%
  count(index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
#creating net sentiment using loughran
DrJekyll_loughran <- tidy_DrJekyll %>%
  inner_join(loughran_posneg) %>%
  count(index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
par(mfrow=c(1,2))

ggplot(DrJekyll_loughran, aes(index, sentiment)) +
  geom_col(show.legend = FALSE) +
  labs(title = "Sentiment Analysis of Dr. Jekyll using Loughran Lexicon")

ggplot(DrJekyll_bing, aes(index, sentiment)) +
  geom_col(show.legend = FALSE) +
  labs(title = "Sentiment Analysis of Dr. Jekyll using Bing Lexicon")

Conclusion

It seems as though there is a disparity in positive/negative words identified using the different lexicons.

get_sentiments("loughran") %>% 
     filter(sentiment %in% c("positive", 
                             "negative")) %>% 
  count(sentiment)
## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   2355
## 2 positive    354
get_sentiments("bing") %>% 
  count(sentiment)
## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005

This could be due to the difference in the negative and positive sentiments in the lexicons themselves. It is necessary to make this observation when conducting analysis because it could affect our results.