Part 1. Get the primary example code from chapter 2 of Text Mining with R and run it.
Part 2. Extend the code in two ways: * Work with a different corpus of your choosing * Incorporate at least one additional sentiment lexicon (possibly from another R package that you’ve found through research).
I have used the janeaustenr, tidyverse, tidytext and gutenbergr libraries.
library(janeaustenr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0 ✓ purrr 0.3.3
## ✓ tibble 2.1.3 ✓ dplyr 0.8.5
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidytext)
# Preview the three sentiment lexicons used in the analysis below. Each call
# is a bare top-level expression, so its result is auto-printed.
# Later code reads a `value` column from "afinn" and a `sentiment` column
# from "bing" and "nrc".
get_sentiments("afinn")
get_sentiments("bing")
get_sentiments("nrc")
# Tokenise the Austen novels into one word per row. Before tokenising, each
# line of text is tagged (within its book) with a running line number and a
# running chapter counter that increases at every line matching the
# chapter-heading pattern.
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(
      str_detect(
        string = text,
        pattern = regex("^chapter [\\divxlc]", ignore_case = TRUE)
      )
    )
  ) %>%
  ungroup() %>%
  unnest_tokens(output = word, input = text)

# Show the tokenised table.
tidy_books
# Net Bing sentiment per 80-line section of each novel: count positive and
# negative words in every section, put the two counts side by side (missing
# combinations become 0), and take positive minus negative.
jane_austen_sentiment <- inner_join(tidy_books, get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  spread(key = sentiment, value = n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
# Bar chart of net sentiment along each novel, one facet per book with its
# own x axis; the legend is hidden because the facets already name the books.
jane_austen_sentiment %>%
  ggplot(mapping = aes(x = index, y = sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")
# Keep only the tokens that belong to "Pride & Prejudice".
pride_prejudice <- filter(tidy_books, book == "Pride & Prejudice")

# Show the subset.
pride_prejudice
# AFINN score per 80-line section of Pride & Prejudice: the lexicon's `value`
# for every matched word is summed within each section, and the rows are
# labelled with the lexicon name for the later comparison plot.
afinn <- pride_prejudice %>%
  inner_join(get_sentiments("afinn")) %>%
  mutate(index = linenumber %/% 80) %>%
  group_by(index) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")
## Joining, by = "word"
# Net sentiment per 80-line section of Pride & Prejudice under the Bing and
# NRC lexicons. NRC is restricted to its "positive"/"negative" labels so that
# both lexicons can be reduced to positive minus negative word counts.
bing_and_nrc <- local({
  bing_hits <- pride_prejudice %>%
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al.")

  nrc_polarity <- get_sentiments("nrc") %>%
    filter(sentiment %in% c("positive", "negative"))

  nrc_hits <- pride_prejudice %>%
    inner_join(nrc_polarity) %>%
    mutate(method = "NRC")

  bind_rows(bing_hits, nrc_hits) %>%
    count(method, index = linenumber %/% 80, sentiment) %>%
    spread(key = sentiment, value = n, fill = 0) %>%
    mutate(sentiment = positive - negative)
})
## Joining, by = "word"
## Joining, by = "word"
# Stack the three per-section sentiment tables and draw one facet per
# lexicon, each with its own y axis.
bind_rows(afinn, bing_and_nrc) %>%
  ggplot(mapping = aes(x = index, y = sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")
# How often each Bing-lexicon word occurs across all the novels, together
# with its sentiment label, most frequent first.
bing_word_counts <- inner_join(tidy_books, get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
# Horizontal bar chart of the ten most frequent words for each sentiment
# label; words are reordered by count so the bars are sorted within a facet.
bing_word_counts %>%
  group_by(sentiment) %>%
  top_n(n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(mapping = aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  coord_flip() +
  labs(x = NULL, y = "Contribution to sentiment")
## Selecting by n
# Preview the "loughran" lexicon, the additional lexicon applied to the
# second corpus further down.
get_sentiments("loughran")
# gutenbergr is loaded for the gutenberg_metadata table and
# gutenberg_download() used immediately below.
library(gutenbergr)
# Auto-print the metadata table (presumably inspected to look up the id of
# the work to download -- TODO confirm).
gutenberg_metadata
# Download Project Gutenberg work number 236; the rows are labelled
# "The Jungle Book" when the text is tidied later. Requires network access.
jungle_book <- gutenberg_download(236)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
# Show the raw downloaded table.
jungle_book
# Build a one-word-per-row table for the downloaded text: skip the first 28
# rows, drop empty lines, keep only the text column, split it into words,
# remove stop words, and tag every row with the book title.
jungle_book_tidy <- jungle_book %>%
  slice(-seq_len(28)) %>%
  # Discard rows whose text is the empty string.
  filter(text != "") %>%
  select(text) %>%
  unnest_tokens(word, text, token = "words") %>%
  anti_join(stop_words) %>%
  mutate(book = "The Jungle Book")
## Joining, by = "word"

# Show the tidied table.
jungle_book_tidy
# 4366 -- NOTE(review): presumably the row count reported by the print above;
# confirm.
# AFINN sentiment for The Jungle Book in buckets of (roughly) 100 words.
#
# Words are numbered within the book *before* joining to the lexicon, so a
# bucket index refers to a position in the stop-word-filtered text rather
# than to a position among the matched words only.
# NOTE: word_count starts at 1, so index 1 covers words 1-99 and every later
# index covers 100 words. The formula is kept as is so that the index stays
# aligned with the Loughran table, which uses the same expression.
#
# Result columns: book, index, sentiment (sum of AFINN `value`), method.
jb_afinn <- jungle_book_tidy %>%
  group_by(book) %>%
  mutate(
    # row_number() is safe for an empty group, unlike 1:n().
    word_count = row_number(),
    index = word_count %/% 100 + 1
  ) %>%
  # Explicit key: `word` is the only column shared with the lexicon.
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(book, index) %>%
  summarise(sentiment = sum(value)) %>%
  # summarise() only peels off the last grouping level; drop the remaining
  # `book` grouping so downstream verbs see a plain table.
  ungroup() %>%
  mutate(method = "AFINN")

# Show the per-bucket AFINN scores.
jb_afinn
# Tally how many words of The Jungle Book fall under each Loughran sentiment
# label, most frequent label first.
#
# An inner join is used so that only words that actually occur in the book
# are counted. A right join would keep every lexicon entry, including words
# absent from the book, and inflate each label's count.
jb_loughran <- jungle_book_tidy %>%
  inner_join(get_sentiments("loughran"), by = "word") %>%
  # Defensive: ignore any lexicon entry that carries no sentiment label.
  filter(!is.na(sentiment)) %>%
  count(sentiment, sort = TRUE)

# Show the label tally.
jb_loughran
# Net Loughran sentiment (positive minus negative word count) for The Jungle
# Book in buckets of (roughly) 100 words. This overwrites the earlier
# `jb_loughran` label tally.
#
# Words are numbered within the book before joining to the lexicon, using the
# same bucket formula as the AFINN table so the two can be plotted against a
# common index. Only the "positive" and "negative" Loughran labels are used.
#
# Result columns: book, index, method, sentiment.
jb_loughran <- jungle_book_tidy %>%
  group_by(book) %>%
  mutate(
    # row_number() is safe for an empty group, unlike 1:n().
    word_count = row_number(),
    index = word_count %/% 100 + 1
  ) %>%
  ungroup() %>%
  inner_join(
    get_sentiments("loughran") %>%
      filter(sentiment %in% c("positive", "negative")),
    # Explicit key: `word` is the only column shared with the lexicon.
    by = "word"
  ) %>%
  mutate(method = "Loughran") %>%
  count(book, method, index, sentiment) %>%
  # One column per label; buckets lacking a label get a count of 0.
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative) %>%
  select(book, index, method, sentiment)

# Show the per-bucket Loughran scores.
jb_loughran
# Stack the AFINN and Loughran tables for The Jungle Book and draw one facet
# per lexicon, each with its own y axis.
list(jb_afinn, jb_loughran) %>%
  bind_rows() %>%
  ggplot(mapping = aes(x = index, y = sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")
The results of a sentiment analysis depend on the lexicon used. Here we have used four lexicons: afinn, bing, nrc and loughran. For the Jungle Book corpus described above, the plot comparing afinn and loughran shows that both follow similar trends, although their scales are different.