The goal of this document is to reuse the base sentiment analysis code from “Text Mining with R” and apply it to a new corpus and a new lexicon.
Bring in packages.
library(tidytext)
library(textdata)
library(janeaustenr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(tidyr)
library(ggplot2)
library(gutenbergr)
Tidy the selected corpus into a data frame (a sampling step was added to reduce the processing load).
# Build a one-word-per-row table of Jane Austen's novels.
#
# Line numbers and chapter ids are computed on the complete, ordered text
# first and the rows are sampled only afterwards. Sampling up front returns
# the lines in random order, so row_number() and the cumulative chapter count
# would no longer describe a line's position within its book.
# Note: linenumber still refers to the position in the full text, so each
# 80-line bucket used downstream holds only the sampled subset of its lines.
set.seed(1234) # fixed seed so the random sample is reproducible across runs
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
      ignore_case = TRUE
    )))
  ) %>%
  ungroup() %>%
  # Keep 10000 randomly chosen lines (across all books) to cut processing cost.
  sample_n(10000) %>%
  unnest_tokens(word, text)
Word count reflecting “joy” per nrc lexicon
# Restrict the NRC lexicon to the words tagged with the "joy" sentiment.
nrc_joy <- filter(get_sentiments("nrc"), sentiment == "joy")

# Joy words found in "Emma", most frequent first.
count(
  inner_join(filter(tidy_books, book == "Emma"), nrc_joy),
  word,
  sort = TRUE
)
## Joining, by = "word"
## # A tibble: 161 x 2
## word n
## <chr> <int>
## 1 good 45
## 2 hope 25
## 3 happy 23
## 4 young 23
## 5 friend 18
## 6 found 17
## 7 love 17
## 8 kind 16
## 9 pretty 15
## 10 present 14
## # … with 151 more rows
Sentiment score grouped into 80-line buckets
# Net Bing sentiment per book and per bucket of 80 line numbers:
# the number of positive words minus the number of negative words.
jane_austen_sentiment <- local({
  # Keep only the words that appear in the Bing lexicon.
  bing_matches <- inner_join(tidy_books, get_sentiments("bing"))
  # Count words per book, bucket (integer division of the line number) and
  # polarity.
  bucket_counts <- count(
    bing_matches,
    book,
    index = linenumber %/% 80,
    sentiment
  )
  # One column per polarity; buckets lacking a polarity get a count of 0.
  polarity_wide <- spread(bucket_counts, sentiment, n, fill = 0)
  mutate(polarity_wide, sentiment = positive - negative)
})
## Joining, by = "word"
Plot sentiments trend
# Bar chart of the net sentiment per bucket, one panel per book, with an
# independent x axis for each panel.
ggplot(
  data = jane_austen_sentiment,
  mapping = aes(x = index, y = sentiment, fill = book)
) +
  facet_wrap(~book, ncol = 2, scales = "free_x") +
  geom_col(show.legend = FALSE)
Charles Dickens Children Books
# Charles Dickens entries from the Gutenberg metadata whose title does not
# contain "Works", narrowed afterwards to those whose bookshelf mentions
# "Children". The bookshelf filter is deliberately applied as a second step,
# on the result returned by gutenberg_works().
dickens_works <- filter(
  gutenberg_works(author == "Dickens, Charles", !str_detect(title, "Works")),
  str_detect(gutenberg_bookshelf, "Children")
)

# Gutenberg ids of the selected titles.
dickens_works_ids <- dickens_works[["gutenberg_id"]]
Tidy the selected corpus into a data frame (the full downloaded text is used here; no sampling step is applied).
# Download the selected Dickens texts and reshape them into a
# one-word-per-row table with columns book, linenumber, chapter and word.
dickens_childrens_books <- local({
  raw_text <- gutenberg_download(dickens_works_ids)
  # Attach the metadata (joined on the columns both tables share) and use the
  # title as the book label.
  titled_text <- rename(left_join(raw_text, dickens_works), book = title)
  # Lines that start with "chapter" followed by a digit or roman numeral.
  chapter_heading <- regex("^chapter [\\divxlc]", ignore_case = TRUE)
  numbered_text <- titled_text %>%
    group_by(book) %>%
    mutate(
      linenumber = row_number(),
      chapter = cumsum(str_detect(text, chapter_heading))
    ) %>%
    ungroup()
  # Split each line into words and drop the remaining metadata columns.
  select(
    unnest_tokens(numbered_text, word, text),
    book, linenumber, chapter, word
  )
})
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
## Joining, by = "gutenberg_id"
Word count reflecting “joy” per nrc lexicon
Words that you would expect in children's books appear, such as “young” and “daughter.” However, interesting call-outs include “money” and “church,” which may not be associated with children's books these days.
# Restrict the NRC lexicon to the words tagged with the "joy" sentiment
# (recomputed here so this section does not rely on the earlier one).
nrc_joy <- filter(get_sentiments("nrc"), sentiment == "joy")

# Joy words across the selected Dickens books, most frequent first.
count(
  inner_join(dickens_childrens_books, nrc_joy),
  word,
  sort = TRUE
)
## Joining, by = "word"
## # A tibble: 399 x 2
## word n
## <chr> <int>
## 1 good 554
## 2 young 350
## 3 found 224
## 4 money 160
## 5 merry 159
## 6 baby 146
## 7 love 146
## 8 daughter 128
## 9 church 123
## 10 child 121
## # … with 389 more rows
Sentiment score grouped into 80-line buckets
# Net Bing sentiment per Dickens book and per bucket of 80 line numbers:
# the number of positive words minus the number of negative words.
dickens_childrens_books_sentiment <- local({
  # Keep only the words that appear in the Bing lexicon.
  bing_matches <- inner_join(dickens_childrens_books, get_sentiments("bing"))
  # Count words per book, bucket (integer division of the line number) and
  # polarity.
  bucket_counts <- count(
    bing_matches,
    book,
    index = linenumber %/% 80,
    sentiment
  )
  # One column per polarity; buckets lacking a polarity get a count of 0.
  polarity_wide <- spread(bucket_counts, sentiment, n, fill = 0)
  mutate(polarity_wide, sentiment = positive - negative)
})
## Joining, by = "word"
Plot sentiment trends. Example plot using the “bing” lexicon.
# Net Bing sentiment per bucket for "Holiday Romance" only.
dickens_childrens_books_sentiment %>%
  filter(book == "Holiday Romance") %>%
  ggplot(mapping = aes(x = index, y = sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 3, scales = "free_x")
Re-run with different Lexicon
library(SentimentAnalysis)
##
## Attaching package: 'SentimentAnalysis'
## The following object is masked from 'package:base':
##
## write
# Load the DictionaryHE lexicon from the SentimentAnalysis package and reshape
# it into a two-column table (sentiment, word) that can be joined on `word`
# like the tidytext lexicons: positive words first, then negative words.
dict.HE <- local({
  # Load the dictionary once instead of once per polarity.
  he_dictionary <- loadDictionaryHE()
  # stringsAsFactors = FALSE keeps both columns character on every R version,
  # so joining against a character `word` column needs no factor coercion.
  rbind(
    data.frame(
      sentiment = "positive",
      word = he_dictionary[["positiveWords"]],
      stringsAsFactors = FALSE
    ),
    data.frame(
      sentiment = "negative",
      word = he_dictionary[["negativeWords"]],
      stringsAsFactors = FALSE
    )
  )
})
# Net sentiment per Dickens book and per bucket of 80 line numbers, scored
# with the HE dictionary: positive word count minus negative word count.
dickens_childrens_books_sentiment_HE <- local({
  # Keep only the words that appear in the HE dictionary.
  he_matches <- inner_join(dickens_childrens_books, dict.HE)
  # Count words per book, bucket (integer division of the line number) and
  # polarity.
  bucket_counts <- count(
    he_matches,
    book,
    index = linenumber %/% 80,
    sentiment
  )
  # One column per polarity; buckets lacking a polarity get a count of 0.
  polarity_wide <- spread(bucket_counts, sentiment, n, fill = 0)
  mutate(polarity_wide, sentiment = positive - negative)
})
## Joining, by = "word"
Plot sentiments trends
# Net HE-dictionary sentiment per bucket for "Holiday Romance" only.
dickens_childrens_books_sentiment_HE %>%
  filter(book == "Holiday Romance") %>%
  ggplot(mapping = aes(x = index, y = sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 3, scales = "free_x")
We can see that the choice of lexicon can affect the results in a major way, not only in absolute values but also in trends.
Julia Silge and David Robinson. “Text Mining with R.” https://www.tidytextmining.com.