This assignment focuses on sentiment analysis and uses code examples from the following book:
Silge, J. and Robinson, D. (2020). Text Mining with R: A Tidy Approach. Retrieved from https://www.tidytextmining.com.
Per the assignment’s instructions, the first part of this assignment runs the code from the above-referenced book in order to see and learn how sentiment analysis in R works. From there, I extended the code and examples with an additional corpus and dictionary to apply my new knowledge.
The following code is an annotated version of the examples from the book (see citation above).
# Retrieve the three sentiment lexicons used below
library(tidytext)
get_sentiments("afinn")
## # A tibble: 2,477 x 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # … with 2,467 more rows
get_sentiments("bing")
## # A tibble: 6,786 x 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # … with 6,776 more rows
get_sentiments("nrc")
## # A tibble: 13,901 x 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # … with 13,891 more rows
# Find the most common words using an inner join
library(janeaustenr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
tidy_books <- austen_books() %>%
group_by(book) %>%
mutate(
linenumber = row_number(),
chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
ignore_case = TRUE
)))
) %>%
ungroup() %>%
unnest_tokens(word, text)
# Start sentiment analysis
nrc_joy <- get_sentiments("nrc") %>%
filter(sentiment == "joy")
tidy_books %>%
filter(book == "Emma") %>%
inner_join(nrc_joy) %>%
count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 303 x 2
## word n
## <chr> <int>
## 1 good 359
## 2 young 192
## 3 friend 166
## 4 hope 143
## 5 happy 125
## 6 love 117
## 7 deal 92
## 8 found 92
## 9 present 89
## 10 kind 82
## # … with 293 more rows
# Spread data, calculate a net sentiment
library(tidyr)
jane_austen_sentiment <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(book, index = linenumber %/% 80, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
# Plot net sentiment scores
library(ggplot2)
ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
geom_col(show.legend = FALSE) +
facet_wrap(~book, ncol = 2, scales = "free_x")
# Compare the 3 dictionaries: choose the text we are interested in
pride_prejudice <- tidy_books %>%
filter(book == "Pride & Prejudice")
pride_prejudice
## # A tibble: 122,204 x 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Pride & Prejudice 1 0 pride
## 2 Pride & Prejudice 1 0 and
## 3 Pride & Prejudice 1 0 prejudice
## 4 Pride & Prejudice 3 0 by
## 5 Pride & Prejudice 3 0 jane
## 6 Pride & Prejudice 3 0 austen
## 7 Pride & Prejudice 7 1 chapter
## 8 Pride & Prejudice 7 1 1
## 9 Pride & Prejudice 10 1 it
## 10 Pride & Prejudice 10 1 is
## # … with 122,194 more rows
# Define larger spans of text
afinn <- pride_prejudice %>%
inner_join(get_sentiments("afinn")) %>%
group_by(index = linenumber %/% 80) %>%
summarise(sentiment = sum(value)) %>%
mutate(method = "AFINN")
## Joining, by = "word"
## `summarise()` ungrouping output (override with `.groups` argument)
bing_and_nrc <- bind_rows(
pride_prejudice %>%
inner_join(get_sentiments("bing")) %>%
mutate(method = "Bing et al."),
pride_prejudice %>%
inner_join(get_sentiments("nrc") %>%
filter(sentiment %in% c(
"positive",
"negative"
))) %>%
mutate(method = "NRC")
) %>%
count(method, index = linenumber %/% 80, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
# Bind rows together and visualize them
bind_rows(
afinn,
bing_and_nrc
) %>%
ggplot(aes(index, sentiment, fill = method)) +
geom_col(show.legend = FALSE) +
facet_wrap(~method, ncol = 1, scales = "free_y")
# Look at positive and negative words in lexicons
get_sentiments("nrc") %>%
filter(sentiment %in% c(
"positive",
"negative"
)) %>%
count(sentiment)
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 3324
## 2 positive 2312
get_sentiments("bing") %>%
count(sentiment)
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
# Find out how much each word contributed to the sentiment
bing_word_counts <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
# Show it visually
bing_word_counts
## # A tibble: 2,585 x 3
## word sentiment n
## <chr> <chr> <int>
## 1 miss negative 1855
## 2 well positive 1523
## 3 good positive 1380
## 4 great positive 981
## 5 like positive 725
## 6 better positive 639
## 7 enough positive 613
## 8 happy positive 534
## 9 love positive 495
## 10 pleasure positive 462
## # … with 2,575 more rows
bing_word_counts %>%
group_by(sentiment) %>%
top_n(10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(
y = "Contribution to sentiment",
x = NULL
) +
coord_flip()
## Selecting by n
# Add rows to custom stop words list
custom_stop_words <- bind_rows(
tibble(
word = c("miss"),
lexicon = c("custom")
),
stop_words
)
custom_stop_words
## # A tibble: 1,150 x 2
## word lexicon
## <chr> <chr>
## 1 miss custom
## 2 a SMART
## 3 a's SMART
## 4 able SMART
## 5 about SMART
## 6 above SMART
## 7 according SMART
## 8 accordingly SMART
## 9 across SMART
## 10 actually SMART
## # … with 1,140 more rows
# Create a wordcloud
library(wordcloud)
## Loading required package: RColorBrewer
tidy_books %>%
anti_join(stop_words) %>%
count(word) %>%
with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"
# Wordcloud using most common positive and negative words
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(
colors = c("gray20", "gray80"),
max.words = 100
)
## Joining, by = "word"
# Tokenize text into sentences
PandP_sentences <- tibble(text = prideprejudice) %>%
unnest_tokens(sentence, text, token = "sentences")
PandP_sentences$sentence[2]
## [1] "however little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered the rightful property of some one or other of their daughters."
# Split into data frames by chapter
austen_chapters <- austen_books() %>%
group_by(book) %>%
unnest_tokens(chapter, text,
token = "regex",
pattern = "Chapter|CHAPTER [\\dIVXLC]"
) %>%
ungroup()
austen_chapters %>%
group_by(book) %>%
summarise(chapters = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 6 x 2
## book chapters
## <fct> <int>
## 1 Sense & Sensibility 51
## 2 Pride & Prejudice 62
## 3 Mansfield Park 49
## 4 Emma 56
## 5 Northanger Abbey 32
## 6 Persuasion 25
# For each book, find the chapter with the highest proportion of negative words
bingnegative <- get_sentiments("bing") %>%
filter(sentiment == "negative")
wordcounts <- tidy_books %>%
group_by(book, chapter) %>%
summarize(words = n())
## `summarise()` regrouping output by 'book' (override with `.groups` argument)
tidy_books %>%
semi_join(bingnegative) %>%
group_by(book, chapter) %>%
summarize(negativewords = n()) %>%
left_join(wordcounts, by = c("book", "chapter")) %>%
mutate(ratio = negativewords / words) %>%
filter(chapter != 0) %>%
top_n(1) %>%
ungroup()
## Joining, by = "word"
## `summarise()` regrouping output by 'book' (override with `.groups` argument)
## Selecting by ratio
## # A tibble: 6 x 5
## book chapter negativewords words ratio
## <fct> <int> <int> <int> <dbl>
## 1 Sense & Sensibility 43 161 3405 0.0473
## 2 Pride & Prejudice 34 111 2104 0.0528
## 3 Mansfield Park 46 173 3685 0.0469
## 4 Emma 15 151 3340 0.0452
## 5 Northanger Abbey 21 149 2982 0.0500
## 6 Persuasion 4 62 1807 0.0343
# End of code from book examples
The following section extends the examples above by applying them to a new corpus and a new lexicon.
For the corpus, I used the gutenbergr package to download the text of the U.S. Declaration of Independence.
For the lexicon, I used the syuzhet dictionary, created by Matthew Jockers et al. in the Nebraska Literary Lab and distributed with the syuzhet package. It scores each term on a scale of -1 to 1.
More information is available at the link below: https://www.rdocumentation.org/packages/syuzhet/versions/1.0.4
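Before using them, the short sketch below (my own addition, not from the book) shows one way to confirm the Project Gutenberg ID for the Declaration of Independence and to peek at the structure of the syuzhet dictionary; the title search pattern and the syuzhet_dict name are illustrative assumptions.
# Confirm the Gutenberg ID by searching the catalog metadata bundled with gutenbergr
library(gutenbergr)
library(syuzhet)
library(stringr)
gutenberg_works(str_detect(title, "Declaration of Independence"))
# Inspect the syuzhet dictionary: a data frame of words scored between -1 and 1
syuzhet_dict <- get_sentiment_dictionary()
head(syuzhet_dict)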
# Load the gutenbergr library and download the Declaration of Independence
library(gutenbergr)
decl_ind <- gutenberg_download(1)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
# Load the Syuzhet lexicon
library(syuzhet)
syuzhet_lex <- syuzhet::get_sentiment_dictionary()
# Tidy the data with an added column to indicate row number
decl_ind_tidy <- decl_ind %>%
mutate(
linenumber = row_number()
)
# Tokenize the data with each word being a token
decl_ind_tidy <- decl_ind_tidy %>% unnest_tokens(word, text)
# Remove the stop words to focus the analysis on more relevant terms
data("stop_words")
decl_ind_tidy <- decl_ind_tidy %>%
anti_join(stop_words)
## Joining, by = "word"
# View most common words
decl_ind_tidy %>%
count(word, sort = TRUE)
## # A tibble: 2,424 x 2
## word n
## <chr> <int>
## 1 united 78
## 2 people 53
## 3 constitution 45
## 4 law 44
## 5 time 43
## 6 laws 40
## 7 congress 39
## 8 government 38
## 9 president 38
## 10 war 35
## # … with 2,414 more rows
# Visualize with horizontal bar chart, filtered by words used over 25 times
decl_ind_tidy %>%
count(word, sort = TRUE) %>%
filter(n > 25) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n)) +
geom_col() +
xlab(NULL) +
coord_flip()
In the above chart we can see that ‘etext’ appears erroneously in the results, so I added it to a custom stop words list.
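As a quick sanity check (my own addition, not part of the assignment's required code), the snippet below counts how often "etext" appears in the tidied text before it is filtered out; it reuses the decl_ind_tidy object and the dplyr functions loaded earlier.
# Count occurrences of the stray Project Gutenberg term before removing it
decl_ind_tidy %>%
  filter(word == "etext") %>%
  count(word)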
custom_stop_words <- bind_rows(
tibble(
word = c("etext"),
lexicon = c("custom")
),
stop_words
)
# Use the anti-join function again to strip out the custom stop words
decl_ind_tidy <- decl_ind_tidy %>%
anti_join(custom_stop_words)
## Joining, by = "word"
# Visualize the data again to verify 'etext' no longer appears
decl_ind_tidy %>%
count(word, sort = TRUE) %>%
filter(n > 25) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n)) +
geom_col() +
xlab(NULL) +
coord_flip()
The code below summarizes the sentiment with 3 separate lexicons (including the new one), and then plots the summaries next to each other for easy comparison.
# Summarize sentiment using syuzhet lexicon
decl_syuzhet_lex <- decl_ind_tidy %>%
inner_join(syuzhet_lex) %>%
group_by(index = linenumber %/% 40) %>%
summarise(sentiment = sum(value)) %>%
mutate(method = "SYUZHET")
## Joining, by = "word"
## `summarise()` ungrouping output (override with `.groups` argument)
# Summarize sentiment using the nrc and bing lexicons, in groups of 40 lines
decl_bing_and_nrc <- bind_rows(
decl_ind_tidy %>%
inner_join(get_sentiments("bing")) %>%
mutate(method = "Bing et al."),
decl_ind_tidy %>%
inner_join(get_sentiments("nrc") %>%
filter(sentiment %in% c(
"positive",
"negative"
))) %>%
mutate(method = "NRC")
) %>%
count(method, index = linenumber %/% 40, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
# Combine the results for use in plotting
# Interestingly the results are pretty different
bind_rows(
decl_syuzhet_lex,
decl_bing_and_nrc
) %>%
ggplot(aes(index, sentiment, fill = method)) +
geom_col(show.legend = FALSE) +
facet_wrap(~method, ncol = 1, scales = "free_y")
The code below creates a wordcloud, excluding the stop words (including the custom stop word added above).
decl_ind_tidy %>%
anti_join(stop_words) %>%
count(word) %>%
with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"
## Warning in wordcloud(word, n, max.words = 100): congress could not be fit on
## page. It will not be plotted.
In conclusion, as described in Text Mining with R: A Tidy Approach, we can use a variety of dictionaries for sentiment analysis in R to perform various types of analysis on text. The best dictionary depends on the context of the specific challenge or need, as each has its strengths and weaknesses, although the general format and use of each is similar. For the Declaration of Independence I chose Syuzhet because its scale of -1 to 1 gives a more granular breakdown of sentiment, which seems more useful for shorter texts such as this one.
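As a final illustration of that granularity point (a minimal sketch of my own, not code from the book), the snippet below contrasts the Bing lexicon's two categorical labels with the range and number of distinct fractional scores in the syuzhet dictionary.
# Bing assigns only two labels; syuzhet assigns fractional scores in [-1, 1]
library(tidytext)
library(dplyr)
library(syuzhet)
get_sentiments("bing") %>%
  distinct(sentiment)
get_sentiment_dictionary() %>%
  summarise(
    min_score = min(value),
    max_score = max(value),
    distinct_scores = n_distinct(value)
  )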