In Text Mining with R, Chapter 2 looks at Sentiment Analysis. In this assignment, you should start by getting the primary example code from chapter 2 working in an R Markdown document. You should provide a citation to this base code. You’re then asked to extend the code in two ways: Work with a different corpus of your choosing, and incorporate at least one additional sentiment lexicon (possibly from another R package that you’ve found through research).
The base code for this assignment is from chapter 2 of Text Mining with R: A Tidy Approach https://www.tidytextmining.com/sentiment.html
library(janeaustenr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(tidytext)
library(tidyr)
library(ggplot2)
library(sentimentr)
# Turn the Austen novels into a one-word-per-row table. Before tokenising,
# each line is tagged (within its own book) with its line number and with a
# running count of the chapter-heading lines seen so far.
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(
      str_detect(
        string = text,
        pattern = regex("^chapter [\\divxlc]", ignore_case = TRUE)
      )
    )
  ) %>%
  ungroup() %>%
  unnest_tokens(output = word, input = text)

# Quick structural check of the tokenised table.
glimpse(tidy_books)
## Rows: 725,055
## Columns: 4
## $ book <fct> Sense & Sensibility, Sense & Sensibility, Sense & Sensibili…
## $ linenumber <int> 1, 1, 1, 3, 3, 3, 5, 10, 10, 13, 13, 13, 13, 13, 13, 13, 13…
## $ chapter <int> 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ word <chr> "sense", "and", "sensibility", "by", "jane", "austen", "181…
# NRC lexicon entries tagged with the "joy" sentiment.
nrc_joy <- filter(get_sentiments("nrc"), sentiment == "joy")

# Joy words that occur in Emma, most frequent first. The join key is left
# implicit (the only shared column is `word`).
tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## Joining with `by = join_by(word)`
## # A tibble: 301 × 2
## word n
## <chr> <int>
## 1 good 359
## 2 friend 166
## 3 hope 143
## 4 happy 125
## 5 love 117
## 6 deal 92
## 7 found 92
## 8 present 89
## 9 kind 82
## 10 happiness 76
## # ℹ 291 more rows
library(tidyr)
# Net Bing sentiment (positive minus negative word count) for every 80-line
# section of each novel.
#
# At least one word occurs more than once in the Bing lexicon, so a token can
# match several lexicon rows and dplyr flags the join as an unexpected
# many-to-many relationship. Declaring the relationship keeps the result
# unchanged while stating that the fan-out is expected; the explicit `by`
# documents the join key instead of relying on column-name guessing.
jane_austen_sentiment <- tidy_books %>%
  inner_join(
    get_sentiments("bing"),
    by = join_by(word),
    relationship = "many-to-many"
  ) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)
# One panel per novel; each bar is the net sentiment of one index bucket.
jane_austen_sentiment %>%
  ggplot(aes(x = index, y = sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(book), ncol = 2, scales = "free_x")
# Restrict the lexicon comparison to a single novel.
pride_prejudice <- filter(tidy_books, book == "Pride & Prejudice")

# AFINN supplies a numeric `value` per word, so the sentiment of each
# 80-line section is the sum of the values of its matched words.
afinn <- pride_prejudice %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(index = linenumber %/% 80) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")
## Joining with `by = join_by(word)`
# Net sentiment per 80-line section of Pride & Prejudice under the Bing and
# NRC lexicons. Both label words as positive/negative, so the net score is
# the count of positive matches minus the count of negative matches.
#
# A token can match more than one row of the filtered NRC lexicon, which
# dplyr reports as an unexpected many-to-many join. The relationship is
# declared explicitly on both joins (results are unchanged) and the join key
# is spelled out with `by`.
bing_and_nrc <- bind_rows(
  pride_prejudice %>%
    inner_join(
      get_sentiments("bing"),
      by = join_by(word),
      relationship = "many-to-many"
    ) %>%
    mutate(method = "Bing et al."),
  pride_prejudice %>%
    inner_join(
      get_sentiments("nrc") %>%
        filter(sentiment %in% c("positive", "negative")),
      by = join_by(word),
      relationship = "many-to-many"
    ) %>%
    mutate(method = "NRC")
) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) %>%
  mutate(sentiment = positive - negative)
# Stack the AFINN, Bing and NRC summaries and draw one panel per method,
# each with its own y scale.
afinn %>%
  bind_rows(bing_and_nrc) %>%
  ggplot(aes(x = index, y = sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(method), ncol = 1, scales = "free_y")
# Number of NRC lexicon entries labelled positive vs. negative.
get_sentiments("nrc") %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  count(sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 3316
## 2 positive 2308
# Number of Bing lexicon entries per sentiment label.
count(get_sentiments("bing"), sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
# How often each Bing-lexicon word occurs across all novels, most frequent
# first, together with its sentiment label.
#
# At least one word occurs more than once in the Bing lexicon, so the join is
# declared many-to-many (same result, no "unexpected relationship" warning)
# and the key is given explicitly. count() on an ungrouped table already
# returns an ungrouped result, so no trailing ungroup() is needed.
bing_word_counts <- tidy_books %>%
  inner_join(
    get_sentiments("bing"),
    by = join_by(word),
    relationship = "many-to-many"
  ) %>%
  count(word, sentiment, sort = TRUE)

bing_word_counts
## # A tibble: 2,585 × 3
## word sentiment n
## <chr> <chr> <int>
## 1 miss negative 1855
## 2 well positive 1523
## 3 good positive 1380
## 4 great positive 981
## 5 like positive 725
## 6 better positive 639
## 7 enough positive 613
## 8 happy positive 534
## 9 love positive 495
## 10 pleasure positive 462
## # ℹ 2,575 more rows
# Ten most frequent words for each sentiment, drawn as horizontal bars.
# reorder() sorts the words by count so the bars appear in frequency order.
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = n, y = word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(sentiment), scales = "free_y") +
  labs(
    x = "Contribution to sentiment",
    y = NULL
  )
# New Data Set
library(gutenbergr)
# Show the catalogue of works that ships with gutenbergr.
print(gutenberg_metadata)
## # A tibble: 72,569 × 8
## gutenberg_id title author gutenberg_author_id language gutenberg_bookshelf
## <int> <chr> <chr> <int> <chr> <chr>
## 1 1 "The De… Jeffe… 1638 en "Politics/American…
## 2 2 "The Un… Unite… 1 en "Politics/American…
## 3 3 "John F… Kenne… 1666 en ""
## 4 4 "Lincol… Linco… 3 en "US Civil War"
## 5 5 "The Un… Unite… 1 en "United States/Pol…
## 6 6 "Give M… Henry… 4 en "American Revoluti…
## 7 7 "The Ma… <NA> NA en ""
## 8 8 "Abraha… Linco… 3 en "US Civil War"
## 9 9 "Abraha… Linco… 3 en "US Civil War"
## 10 10 "The Ki… <NA> NA en "Banned Books List…
## # ℹ 72,559 more rows
## # ℹ 2 more variables: rights <chr>, has_text <lgl>
I was curious whether the gutenbergr package included one of my favorite books, so I used gutenberg_metadata and the grep function to look for the title.
# Search for the book by title.
# The title is a literal string, not a pattern: fixed = TRUE stops the "." in
# "Dr." and "Mr." from acting as a regex wildcard, while grep() still matches
# the title as a substring of longer catalogue titles (partial match).
book_title <- "Strange Case of Dr. Jekyll and Mr. Hyde"
drj_info <- gutenberg_metadata[
  grep(book_title, gutenberg_metadata$title, fixed = TRUE),
]
print(drj_info)
## # A tibble: 2 × 8
## gutenberg_id title author gutenberg_author_id language gutenberg_bookshelf
## <int> <chr> <chr> <int> <chr> <chr>
## 1 42 The Stra… Steve… 35 en Precursors of Scie…
## 2 43 The Stra… Steve… 35 en Precursors of Scie…
## # ℹ 2 more variables: rights <chr>, has_text <lgl>
# Fetch the text of Project Gutenberg work 42, one of the IDs returned by the
# title search.
DrJekyll <- gutenberg_download(gutenberg_id = 42)
## Determining mirror for Project Gutenberg from https://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
Tidy Data
# One word per row for the downloaded text, keeping the source line number
# and a running count of chapter-heading lines.
# NOTE(review): the heading pattern is the one used for the Austen novels;
# confirm this text has "Chapter N" headings, otherwise `chapter` stays 0.
tidy_DrJekyll <- DrJekyll %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(
      str_detect(
        string = text,
        pattern = regex("^chapter [\\divxlc]", ignore_case = TRUE)
      )
    )
  ) %>%
  unnest_tokens(output = word, input = text)
Let’s compare the positive and negative words identified by the Bing lexicon with those identified by the Loughran lexicon.
# Keep only the positive and negative entries of the Loughran lexicon.
loughran_posneg <- get_sentiments("loughran") %>%
  filter(sentiment %in% c("positive", "negative"))
# Net sentiment using Bing: positive minus negative word count for every
# 80-line section of the text.
# pivot_wider() replaces the superseded spread(), matching the reshaping used
# for the Austen analysis; values_fill = 0 gives sections that lack one of the
# sentiments a count of zero rather than NA.
DrJekyll_bing <- tidy_DrJekyll %>%
  inner_join(get_sentiments("bing")) %>%
  count(index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
# Net sentiment using Loughran (positive/negative entries only): positive
# minus negative word count for every 80-line section of the text.
# pivot_wider() replaces the superseded spread(), matching the reshaping used
# for the Austen analysis; values_fill = 0 gives sections that lack one of the
# sentiments a count of zero rather than NA.
DrJekyll_loughran <- tidy_DrJekyll %>%
  inner_join(loughran_posneg) %>%
  count(index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
# Show the Loughran and Bing results side by side.
# par(mfrow = ...) only arranges base-graphics plots; ggplot2 draws with grid
# and ignores it, so the two charts were never placed next to each other.
# Instead the two summaries are stacked, labelled by lexicon, and drawn as two
# facets of a single figure (shared y axis, so bar heights are comparable).
bind_rows(
  "Loughran Lexicon" = DrJekyll_loughran,
  "Bing Lexicon" = DrJekyll_bing,
  .id = "lexicon"
) %>%
  # Fix the panel order (Loughran first) instead of the alphabetical default.
  mutate(
    lexicon = factor(lexicon, levels = c("Loughran Lexicon", "Bing Lexicon"))
  ) %>%
  ggplot(aes(index, sentiment)) +
  geom_col() +
  facet_wrap(~lexicon, ncol = 2) +
  labs(title = "Sentiment Analysis of Dr. Jekyll by Lexicon")
It seems as though there is a disparity in positive/negative words identified using the different lexicons.
# Number of Loughran lexicon entries labelled positive vs. negative.
get_sentiments("loughran") %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  count(sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 2355
## 2 positive 354
# Number of Bing lexicon entries per sentiment label, for comparison.
count(get_sentiments("bing"), sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
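This could be due to the different balance of negative and positive words in the lexicons themselves: Loughran contains 2,355 negative but only 354 positive words, whereas Bing contains 4,781 negative and 2,005 positive words. It is important to keep this in mind when conducting an analysis, because the choice of lexicon can affect the results.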