In Text Mining with R, Chapter 2 looks at Sentiment Analysis. In this assignment, you should start by getting the primary example code from chapter 2 working in an R Markdown document. You should provide a citation to this base code. You’re then asked to extend the code in two ways:

1) Work with a different corpus of your choosing, and 2) incorporate at least one additional sentiment lexicon (possibly from another R package that you’ve found through research).

The base code for this assignment is taken directly from Chapter 2 of Text Mining with R: A Tidy Approach (https://www.tidytextmining.com/sentiment.html).

Load the needed libraries and sentiment lexicons:

library(tidytext)
# the Bing lexicon ships with tidytext; the AFINN, NRC, and Loughran lexicons
# are downloaded through the textdata package the first time they are requested
get_sentiments("afinn")
get_sentiments("bing")
get_sentiments("nrc")

We tidy the Jane Austen novels, recording a line number and chapter for each word:

library(janeaustenr)
library(dplyr)
library(stringr)

tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(linenumber = row_number(),
         chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)
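
As a quick illustration (not part of the base code), the chapter-detection regex above matches headings written with digits or Roman numerals but not ordinary prose:

str_detect(c("Chapter 1", "CHAPTER XI", "It is a truth universally acknowledged"),
           regex("^chapter [\\divxlc]", ignore_case = TRUE))
## expected: TRUE TRUE FALSE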

We filter the NRC lexicon for joy words and count the most common ones in Emma:

nrc_joy <- get_sentiments("nrc") %>% 
  filter(sentiment == "joy")

tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## Joining, by = "word"

Comparing counts of positive and negative words across 80-line sections of each book:

library(tidyr)

jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"

We can plot how the net sentiment changes across the sections of each book:

library(ggplot2)

ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")

Filter for just Pride & Prejudice:

pride_prejudice <- tidy_books %>% 
  filter(book == "Pride & Prejudice")

pride_prejudice

Creating a net sentiment for each 80-line section of the book with each of the three lexicons (AFINN assigns numeric scores that are summed; Bing and NRC words are counted and the negatives subtracted from the positives):

afinn <- pride_prejudice %>% 
  inner_join(get_sentiments("afinn")) %>% 
  group_by(index = linenumber %/% 80) %>% 
  summarise(sentiment = sum(value)) %>% 
  mutate(method = "AFINN")
## Joining, by = "word"
bing_and_nrc <- bind_rows(pride_prejudice %>% 
                            inner_join(get_sentiments("bing")) %>%
                            mutate(method = "Bing et al."),
                          pride_prejudice %>% 
                            inner_join(get_sentiments("nrc") %>% 
             filter(sentiment %in% c("positive", "negative"))) %>%
                            mutate(method = "NRC")) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"

Plot the comparison of net sentiment across the three lexicons:

bind_rows(afinn, 
          bing_and_nrc) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")

Looking at the number of negative and positive words in the NRC and Bing lexicons:

get_sentiments("nrc") %>% 
     filter(sentiment %in% c("positive", 
                             "negative")) %>% 
  count(sentiment)
get_sentiments("bing") %>% 
  count(sentiment)

Count sentiment word frequencies:

bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
bing_word_counts

Using ggplot, plot the top 10 words contributing to each sentiment:

bing_word_counts %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()
## Selecting by n

Adding the word "miss" to a custom stop-word list (Bing codes "miss" as negative, but in these novels it is mostly a title for young women):

custom_stop_words <- bind_rows(tibble(word = c("miss"), 
                                          lexicon = c("custom")), 
                               stop_words)

custom_stop_words
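
The custom list is not applied in the base word cloud below, but as a sketch it could be dropped from the counts with an anti_join:

tidy_books %>%
  anti_join(custom_stop_words, by = "word") %>%
  count(word, sort = TRUE)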

Word cloud of the most common words after removing stop words:

library(wordcloud)
tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"

Word cloud split by sentiment:

library(reshape2)
tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)
## Joining, by = "word"

Looking at sentences rather than single words:

PandP_sentences <- tibble(text = prideprejudice) %>% 
  unnest_tokens(sentence, text, token = "sentences")

PandP_sentences$sentence[2]
## [1] "however little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered the rightful property of some one or other of their daughters."

Splitting each book into chapters with a regex tokenizer:

austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(chapter, text, token = "regex", 
                pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
  ungroup()

austen_chapters %>% 
  group_by(book) %>% 
  summarise(chapters = n())

We look at the chapters with the highest proportion of negative words using the Bing lexicon:

bingnegative <- get_sentiments("bing") %>% 
  filter(sentiment == "negative")

wordcounts <- tidy_books %>%
  group_by(book, chapter) %>%
  summarize(words = n())

tidy_books %>%
  semi_join(bingnegative) %>%
  group_by(book, chapter) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts, by = c("book", "chapter")) %>%
  mutate(ratio = negativewords/words) %>%
  filter(chapter != 0) %>%
  top_n(1) %>%
  ungroup()
## Joining, by = "word"
## Selecting by ratio

Additional book with the Loughran lexicon:

Using the gutenberg_metadata table I found that Peter Pan is book ID 16 in the Project Gutenberg library:

library(gutenbergr)
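# The metadata lookup itself is not shown in the base code; a minimal sketch
# (assumes dplyr and stringr loaded above; exact title matching may vary):
gutenberg_metadata %>%
  filter(str_detect(title, "Peter Pan"))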
PeterPan <- gutenberg_download(16) 
tidy_PeterPan <- PeterPan %>%
  mutate(linenumber = row_number(),
         chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE)))) %>%
  unnest_tokens(word, text)
#filtering the loughran lexicon only for positive and negative words
loughran_posneg <- get_sentiments("loughran") %>% 
  filter(sentiment == "positive" | sentiment =="negative")

#creating net sentiment using bing
Peter_bing <- tidy_PeterPan %>%
  inner_join(get_sentiments("bing")) %>%
  count(index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
#creating net sentiment using loughran
Peter_loughran <- tidy_PeterPan %>%
  inner_join(loughran_posneg) %>%
  count(index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
# par() has no effect on ggplot2 plots, so bind the two data frames
# and facet to show the lexicons side by side
bind_rows(Peter_loughran %>% mutate(method = "Loughran"),
          Peter_bing %>% mutate(method = "Bing")) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 2)

We can explain this, as before, by looking at the counts of positive and negative words in the two lexicons. In Loughran, negative words are about seven times more common than positive words, while in Bing the ratio is a bit over 2:1:

get_sentiments("loughran") %>% 
     filter(sentiment %in% c("positive", 
                             "negative")) %>% 
  count(sentiment)
get_sentiments("bing") %>% 
  count(sentiment)

Comparison/Results

We can clearly see that the choice of lexicon can have a big impact on the results, and we need to take this into consideration in any sentiment analysis.
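
One way to quantify this (a sketch beyond the base assignment, using the Peter_bing and Peter_loughran data frames created above) is to join the two per-section sentiment series and check how closely they agree:

Peter_bing %>%
  inner_join(Peter_loughran, by = "index", suffix = c("_bing", "_loughran")) %>%
  summarise(correlation = cor(sentiment_bing, sentiment_loughran))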

GitHub: https://github.com/chilleundso/DATA607/blob/master/Assignment10/Data607_Assignment10.Rmd