This assignment conducts a sentiment analysis of a text corpus using the methodology and codebase introduced in Chapter 2, "Sentiment Analysis with Tidy Data," of the Text Mining with R text.
We extend that code in two ways: a) working with a different corpus of our choosing; and b) incorporating at least one additional sentiment lexicon.
We begin by loading the libraries that let us manipulate our data and keep it in a tidy format, along with the packages that provide the additional corpus and the sentiment lexicons.
knitr::opts_chunk$set(echo = TRUE)
library(tidytext)
library(janeaustenr)
library(dplyr)
library(stringr)
library(tidyr)
library(ggplot2)
library(gutenbergr)
We first reproduce the code from Chapter 2, which tidies Jane Austen's novels into a one-word-per-row format and scores their sentiment.
# Tidy the Austen corpus: one word per row, annotated with line and chapter numbers
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(linenumber = row_number(),
         chapter = cumsum(str_detect(text,
                                     regex("^chapter [\\divxlc]", ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)
# The most common joy words in "Emma", per the NRC lexicon
nrc_joy <- get_sentiments('nrc') %>%
  filter(sentiment == 'joy')
tidy_books %>%
  filter(book == 'Emma') %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 301 × 2
## word n
## <chr> <int>
## 1 good 359
## 2 friend 166
## 3 hope 143
## 4 happy 125
## 5 love 117
## 6 deal 92
## 7 found 92
## 8 present 89
## 9 kind 82
## 10 happiness 76
## # … with 291 more rows
# Net Bing sentiment (positive minus negative) per 80-line section of each novel
janeaustensentiment <- tidy_books %>%
  inner_join(get_sentiments('bing')) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
ggplot(janeaustensentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")
# Compare the three lexicons on a single novel
pride_prejudice <- tidy_books %>%
  filter(book == 'Pride & Prejudice')
# AFINN assigns numeric scores, so we sum them per 80-line section
afinn <- pride_prejudice %>%
  inner_join(get_sentiments('afinn')) %>%
  group_by(index = linenumber %/% 80) %>%
  summarize(sentiment = sum(value)) %>%
  mutate(method = "AFINN")
## Joining, by = "word"
# Bing and NRC classify words as positive or negative, so we count them instead
bing <- pride_prejudice %>%
  inner_join(get_sentiments('bing')) %>%
  mutate(method = "BING")
## Joining, by = "word"
nrc <- pride_prejudice %>%
  inner_join(get_sentiments('nrc')) %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  mutate(method = "NRC")
## Joining, by = "word"
bing_and_nrc <- rbind(bing, nrc) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
# AFINN sums scores rather than counting words, hence the free y scales
bind_rows(afinn, bing_and_nrc) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")
For this portion of the analysis, I chose the science fiction novel “The Lost World” by Arthur Conan Doyle and evaluated its sentiment. We first browse the Project Gutenberg catalog by bookshelf to locate it.
gutenberg_works() %>%
  count(gutenberg_bookshelf, sort = TRUE)
## # A tibble: 770 × 2
## gutenberg_bookshelf n
## <chr> <int>
## 1 <NA> 42746
## 2 Science Fiction 1120
## 3 Punch 471
## 4 Children's Book Series 432
## 5 Bestsellers, American, 1895-1923 330
## 6 World War I 307
## 7 Historical Fiction 283
## 8 US Civil War 265
## 9 Children's Fiction 248
## 10 Notes and Queries 219
## # … with 760 more rows
gutenberg_works() %>%
  filter(gutenberg_bookshelf == 'Science Fiction') %>%
  select(title, author, gutenberg_id)
## # A tibble: 1,120 × 3
## title author guten…¹
## <chr> <chr> <int>
## 1 The Gods of Mars Burroughs, Edgar Rice 64
## 2 The warlord of Mars Burroughs, Edgar Rice 68
## 3 Thuvia, Maid of Mars Burroughs, Edgar Rice 72
## 4 The Monster Men Burroughs, Edgar Rice 96
## 5 At the Earth's Core Burroughs, Edgar Rice 123
## 6 The Poison Belt Doyle, Arthur Conan 126
## 7 The Lost World Doyle, Arthur Conan 139
## 8 The Lost Continent Burroughs, Edgar Rice 149
## 9 Twenty Thousand Leagues under the Sea Verne, Jules 164
## 10 The Lost Continent Hyne, Charles John Cutcliffe W… 285
## # … with 1,110 more rows, and abbreviated variable name ¹gutenberg_id
# Look up the book's Gutenberg ID by exact title and download it; titles are
# not guaranteed unique in the catalog, so it is worth confirming the match
# (here, Doyle's novel, gutenberg_id 139 in the listing above)
book_id <- (gutenberg_works() %>%
  select(gutenberg_id, title, author) %>%
  arrange(title) %>%
  filter(title == 'The Lost World'))$gutenberg_id
selected_book <- gutenberg_download(book_id)
## Determining mirror for Project Gutenberg from https://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
# Annotate line and chapter numbers, then tokenize into one word per row;
# this edition marks chapters with uppercase "CHAPTER" headings
tidy_book <- selected_book %>%
  mutate(linenumber = row_number(),
         chapter = cumsum(str_detect(text, "CHAPTER [\\w]+"))) %>%
  select(-gutenberg_id) %>%
  unnest_tokens(word, text)
# Net Bing sentiment per 40-line section, colored by its sign
tidy_book %>%
  mutate(index = linenumber %/% 40) %>%
  inner_join(get_sentiments('bing')) %>%
  count(index, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative,
         sentiment_type = ifelse(sentiment > 0, 'positive', 'negative')) %>%
  ggplot(aes(index, sentiment, fill = sentiment_type)) +
  geom_col(show.legend = FALSE)
## Joining, by = "word"
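To see which words drive these swings, we can borrow another technique from Chapter 2 and count each word's contribution to the Bing sentiment. This is an illustrative follow-up sketch, not part of the original assignment code:
# Words contributing most to positive and negative Bing sentiment
tidy_book %>%
  inner_join(get_sentiments('bing'), by = 'word') %>%
  count(word, sentiment, sort = TRUE) %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%
  ungroup()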
In addition to the AFINN, NRC, and Bing lexicons, I also used the “loughran” (Loughran-McDonald) lexicon. For the sectioned comparison below, I used an index length of 80 lines.
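The Loughran-McDonald lexicon was developed for financial documents and classifies words into six categories, only two of which are positive and negative; a quick tally (a diagnostic sketch, not part of the original assignment code) shows why we filter to those two categories below.
# Count the words in each Loughran-McDonald category
get_sentiments('loughran') %>%
  count(sentiment, sort = TRUE)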
index_length <- 80
# Net Bing sentiment per section
bing <- tidy_book %>%
  mutate(index = linenumber %/% index_length) %>%
  inner_join(get_sentiments('bing')) %>%
  count(index, sentiment) %>%
  mutate(method = 'bing') %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative) %>%
  select(index, sentiment, method)
## Joining, by = "word"
# Net NRC sentiment per section, keeping only the positive/negative categories
nrc <- tidy_book %>%
  mutate(index = linenumber %/% index_length) %>%
  inner_join(get_sentiments('nrc')) %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  count(index, sentiment) %>%
  mutate(method = 'nrc') %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative) %>%
  select(index, sentiment, method)
## Joining, by = "word"
# Net Loughran-McDonald sentiment per section, again keeping only positive/negative
loughran <- tidy_book %>%
  mutate(index = linenumber %/% index_length) %>%
  inner_join(get_sentiments('loughran')) %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  count(index, sentiment) %>%
  mutate(method = 'loughran') %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative) %>%
  select(index, sentiment, method)
## Joining, by = "word"
# AFINN: sum the numeric word scores per section
afinn <- tidy_book %>%
  mutate(index = linenumber %/% index_length) %>%
  inner_join(get_sentiments('afinn')) %>%
  group_by(index) %>%
  summarize(sentiment = sum(value)) %>%
  mutate(method = 'afinn')
## Joining, by = "word"
# Compare all four lexicons; AFINN sums scores rather than counting words,
# so it sits on a different scale than the other three (hence free y scales)
rbind(loughran, nrc, afinn, bing) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 2, nrow = 2, scales = "free_y")
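One reason the four panels differ in magnitude is coverage: each lexicon matches a different share of the book's vocabulary. The sketch below (illustrative, not part of the original assignment code) counts the matched tokens per lexicon; note that NRC can match a token more than once, since a word may carry several categories.
# How many tokens from the book each lexicon matches
sapply(c('bing', 'nrc', 'afinn', 'loughran'), function(lex) {
  tidy_book %>%
    inner_join(get_sentiments(lex), by = 'word') %>%
    nrow()
})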
This project was a good opportunity to practice sentiment analysis on a corpus of our choosing and to see how the choice of sentiment lexicon shapes the resulting sentiment trajectory.