Introduction

This assignment conducts a sentiment analysis of a text corpus using the methodology and codebase introduced in Chapter 2, “Sentiment Analysis with Tidy Data,” of the Text Mining with R text.

We extend this code in two ways: a) working with a different corpus of our choosing; and b) incorporating at least one additional sentiment lexicon.

Import Libraries

We begin by importing the libraries that allow us to manipulate our data and ensure it is set up in a tidy format. We also load the libraries that provide the additional corpus and the sentiment lexicons.

knitr::opts_chunk$set(echo = TRUE)

library(tidytext)
library(janeaustenr)
library(dplyr)
library(stringr)
library(tidyr)
library(ggplot2)
library(gutenbergr)

Chapter 2 Tidy Data Code (Part 1):

The code below reproduces the first part of the Chapter 2 workflow: it tidies the Jane Austen novels into a one-word-per-row format, finds the most common “joy” words in Emma using the NRC lexicon, and plots each novel's sentiment trajectory with the Bing lexicon.

tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(linenumber = row_number(),
         # flag lines that open a chapter, e.g. "Chapter 1" or "CHAPTER I"
         chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)  # one token (word) per row


nrc_joy <- get_sentiments('nrc') %>% 
  filter(sentiment == 'joy')


tidy_books %>% 
  filter(book == 'Emma') %>% 
  inner_join(nrc_joy) %>% 
  count(word, sort=TRUE)
## Joining, by = "word"
## # A tibble: 301 × 2
##    word          n
##    <chr>     <int>
##  1 good        359
##  2 friend      166
##  3 hope        143
##  4 happy       125
##  5 love        117
##  6 deal         92
##  7 found        92
##  8 present      89
##  9 kind         82
## 10 happiness    76
## # … with 291 more rows
janeaustensentiment <- tidy_books %>% 
  inner_join(get_sentiments('bing')) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill=0) %>%
  mutate(sentiment = positive-negative)
## Joining, by = "word"
ggplot(janeaustensentiment, aes(index, sentiment, fill=book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol=2, scales="free_x")

Chapter 2 Tidy Data Code (Part 2):

Next, we reproduce the Chapter 2 comparison of the AFINN, Bing, and NRC lexicons on Pride & Prejudice, scoring 80-line sections of the novel with each lexicon.

pride_prejudice <- tidy_books %>%
  filter(book == 'Pride & Prejudice' )

afinn <- pride_prejudice %>% 
  inner_join(get_sentiments('afinn')) %>%
  group_by(index = linenumber %/% 80) %>%
  summarize(sentiment = sum(value)) %>%
  mutate(method = "AFINN")
## Joining, by = "word"
bing <- pride_prejudice %>% 
  inner_join(get_sentiments('bing')) %>%
  mutate(method = "BING")
## Joining, by = "word"
nrc <- pride_prejudice %>% 
  inner_join(get_sentiments('nrc')) %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  mutate(method="NRC")
## Joining, by = "word"
bing_and_nrc <- rbind(bing,nrc) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill=0) %>%
  mutate(sentiment = positive-negative)

bind_rows(afinn, bing_and_nrc) %>%
  ggplot(aes(index, sentiment, fill=method)) +
  geom_col(show.legend=FALSE) +
  facet_wrap(~method, ncol=1, scales="free_y")

Extending the Codebase: Choosing a Corpus

For this portion of the analysis, I use Arthur Conan Doyle's science fiction novel “The Lost World” and evaluate its sentiment. We first survey the Project Gutenberg bookshelves, then look up the book's ID and download the text.

gutenberg_works() %>%
  count(gutenberg_bookshelf, sort = TRUE)
## # A tibble: 770 × 2
##    gutenberg_bookshelf                  n
##    <chr>                            <int>
##  1 <NA>                             42746
##  2 Science Fiction                   1120
##  3 Punch                              471
##  4 Children's Book Series             432
##  5 Bestsellers, American, 1895-1923   330
##  6 World War I                        307
##  7 Historical Fiction                 283
##  8 US Civil War                       265
##  9 Children's Fiction                 248
## 10 Notes and Queries                  219
## # … with 760 more rows
gutenberg_works() %>%
  filter(gutenberg_bookshelf == 'Science Fiction') %>%
  select(title, author, gutenberg_id)
## # A tibble: 1,120 × 3
##    title                                 author                          guten…¹
##    <chr>                                 <chr>                             <int>
##  1 The Gods of Mars                      Burroughs, Edgar Rice                64
##  2 The warlord of Mars                   Burroughs, Edgar Rice                68
##  3 Thuvia, Maid of Mars                  Burroughs, Edgar Rice                72
##  4 The Monster Men                       Burroughs, Edgar Rice                96
##  5 At the Earth's Core                   Burroughs, Edgar Rice               123
##  6 The Poison Belt                       Doyle, Arthur Conan                 126
##  7 The Lost World                        Doyle, Arthur Conan                 139
##  8 The Lost Continent                    Burroughs, Edgar Rice               149
##  9 Twenty Thousand Leagues under the Sea Verne, Jules                        164
## 10 The Lost Continent                    Hyne, Charles John Cutcliffe W…     285
## # … with 1,110 more rows, and abbreviated variable name ¹​gutenberg_id
# look up the Gutenberg ID for "The Lost World"
book_id <- (gutenberg_works() %>%
  select(gutenberg_id, title, author) %>%
  arrange(title) %>%
  filter(title == 'The Lost World'))$gutenberg_id

selected_book <- gutenberg_download(book_id)
## Determining mirror for Project Gutenberg from https://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
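
One caveat worth noting: Project Gutenberg titles are not unique (the Science Fiction listing above shows two different books titled “The Lost Continent”), so filtering on title alone could in principle return several IDs. A minimal defensive alternative, passing filter conditions directly to gutenberg_works(), might look like this; per the listing above it should return the single ID 139:

doyle_id <- gutenberg_works(title == 'The Lost World',
                            author == 'Doyle, Arthur Conan') %>%
  pull(gutenberg_id)  # expected: 139, per the Science Fiction listing above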

Extending the Codebase: Sentiment Analysis Using the Bing Lexicon

We tidy the downloaded text the same way we tidied the Austen novels, then chart its sentiment trajectory with the Bing lexicon, scoring 40-line sections of the book.

tidy_book <- selected_book %>%
  mutate(linenumber = row_number(),
         # chapter headings in this text are upper case, e.g. "CHAPTER I"
         chapter = cumsum(str_detect(text, "CHAPTER [\\w]+"))) %>%
  select(-gutenberg_id) %>%
  unnest_tokens(word, text)
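
As a quick sanity check (my addition, not part of the original assignment), we can confirm the chapter pattern behaves sensibly; Doyle's novel has sixteen chapters, so a count far above that would suggest the pattern is also matching front-matter lines such as a table of contents:

# highest chapter number assigned by the cumulative sum above
tidy_book %>%
  summarize(chapters_detected = max(chapter))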

tidy_book %>%
  mutate(index = linenumber %/% 40) %>%
  inner_join(get_sentiments('bing')) %>%
  count(index, sentiment) %>%
  spread(sentiment,n,fill=0) %>%
  mutate(sentiment = positive-negative,
         sentiment_type = ifelse(sentiment > 0, 'positive', 'negative')) %>%
  ggplot(aes(index,sentiment, fill=sentiment_type)) +
  geom_col(show.legend = FALSE)
## Joining, by = "word"
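
Beyond the overall trajectory, it helps to see which words drive the score. The sketch below is my addition, following the word-contribution analysis from Chapter 2 of the book: it plots the ten words that contribute most to each Bing sentiment category.

bing_word_counts <- tidy_book %>%
  inner_join(get_sentiments('bing'), by = 'word') %>%
  count(word, sentiment, sort = TRUE)

bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%           # top 10 contributors per sentiment
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = 'free_y') +
  labs(x = 'Contribution to sentiment', y = NULL)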

Extending the Codebase: Additional Lexicons

In addition to the AFINN, NRC, and Bing lexicons, I also use the “loughran” (Loughran-McDonald) lexicon. Because Loughran-McDonald includes finance-specific categories such as “litigious” and “uncertainty”, the code below keeps only its positive and negative words. For the sections in this analysis, I use an index length of 80 lines.

index_length <- 80


bing <- tidy_book %>%
  mutate(index = linenumber %/% index_length) %>%
  inner_join(get_sentiments('bing')) %>%
  count(index, sentiment) %>%
  mutate(method = 'bing') %>%
  spread(sentiment,n, fill=0) %>%
  mutate(sentiment = positive-negative) %>%
  select(index, sentiment, method)
## Joining, by = "word"
nrc <- tidy_book %>%
  mutate(index = linenumber %/% index_length) %>%
  inner_join(get_sentiments('nrc')) %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  count(index, sentiment) %>%
  mutate(method = 'nrc') %>%
  spread(sentiment,n, fill=0) %>%
  mutate(sentiment = positive-negative) %>%
  select(index, sentiment, method)
## Joining, by = "word"
loughran <- tidy_book %>%
  mutate(index = linenumber %/% index_length) %>%
  inner_join(get_sentiments('loughran')) %>%
  filter(sentiment %in% c("positive","negative")) %>%
  count(index, sentiment) %>%
  mutate(method = 'loughran') %>%
  spread(sentiment,n, fill=0) %>%
  mutate(sentiment = positive-negative) %>%
  select(index, sentiment, method)
## Joining, by = "word"
afinn <- tidy_book %>%
  mutate(index = linenumber %/% index_length) %>%
  inner_join(get_sentiments('afinn')) %>%
  group_by(index) %>%
  summarize(sentiment = sum(value)) %>%
  mutate(method = 'afinn')
## Joining, by = "word"
# combine the four lexicon summaries and compare them side by side
bind_rows(afinn, bing, nrc, loughran) %>%
  ggplot(aes(index, sentiment, fill=method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol=2, nrow=2)
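
The four facets share a broad trajectory but differ in magnitude, partly because each lexicon matches a different number of the book's words (and because AFINN sums graded scores from -5 to 5 rather than counting words). A rough coverage check, my addition rather than part of the original assignment, counts how many of the book's distinct words appear in each lexicon:

book_words <- tidy_book %>%
  distinct(word)

# distinct words from the book that each lexicon covers
sapply(c('afinn', 'bing', 'nrc', 'loughran'), function(lex) {
  nrow(semi_join(book_words, get_sentiments(lex), by = 'word'))
})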

Conclusion

This project provided a good opportunity to explore sentiment analysis of a text corpus using several sentiment lexicons, and to see how the choice of lexicon affects the resulting sentiment trajectory.