Example code was downloaded from here
Robinson, Julia Silge and David. “2 Sentiment Analysis with Tidy Data: Text Mining with R.” 2 Sentiment Analysis with Tidy Data | Text Mining with R, https://www.tidytextmining.com/sentiment.html.
Data Sets: Saif M. Mohammad and Peter Turney. (2013), ``Crowdsourcing a Word-Emotion Association Lexicon.’’ Computational Intelligence, 29(3): 436-465. nrc
Finn Arup Nielsen: AFINN
Bing Liu and Collaborators: bing
Loughran Lexicon: Marketing Communications: Web // University of Notre Dame. “Resources // Software Repository for Accounting and Finance // University of Notre Dame.” Software Repository for Accounting and Finance, https://sraf.nd.edu/textual-analysis/resources/#Master%20Dictionary.
Here, the data is loaded and sentiment values are retrieved.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidytext)
library(janeaustenr)
library(dplyr)
library(stringr)
library(tidyr)
library(ggplot2)
library(wordcloud)
## Loading required package: RColorBrewer
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
get_sentiments("afinn")
## # A tibble: 2,477 × 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # … with 2,467 more rows
get_sentiments("bing")
## # A tibble: 6,786 × 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # … with 6,776 more rows
get_sentiments("nrc")
## # A tibble: 13,875 × 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # … with 13,865 more rows
Next, the data was manipulated to isolate Jane Austen’s books and they were further filtered to seek out positive sentiments. Then, the sentiments were plotted and filtered by book.
tidy_books <- austen_books() %>%
group_by(book) %>%
mutate(
linenumber = row_number(),
chapter = cumsum(str_detect(text,
regex("^chapter [\\divxlc]",
ignore_case = TRUE)))) %>%
ungroup() %>%
unnest_tokens(word, text)
nrc_joy <- get_sentiments("nrc") %>%
filter(sentiment == "joy")
tidy_books %>%
filter(book == "Emma") %>%
inner_join(nrc_joy) %>%
count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 301 × 2
## word n
## <chr> <int>
## 1 good 359
## 2 friend 166
## 3 hope 143
## 4 happy 125
## 5 love 117
## 6 deal 92
## 7 found 92
## 8 present 89
## 9 kind 82
## 10 happiness 76
## # … with 291 more rows
jane_austen_sentiment <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(book, index = linenumber %/% 80, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
geom_col(show.legend = FALSE) +
facet_wrap(~book, ncol = 2, scales = "free_x")
Next, focus was placed on Pride and Prejudice. Individual words were selected from the book and sentiments were tabulated on those words. The three data sets, AFINN, bing, and nrc were compared on the same plot. Then, the total positive and negative sentiments are displayed from bing and nrc.
pride_prejudice <- tidy_books %>%
filter(book == "Pride & Prejudice")
pride_prejudice
## # A tibble: 122,204 × 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Pride & Prejudice 1 0 pride
## 2 Pride & Prejudice 1 0 and
## 3 Pride & Prejudice 1 0 prejudice
## 4 Pride & Prejudice 3 0 by
## 5 Pride & Prejudice 3 0 jane
## 6 Pride & Prejudice 3 0 austen
## 7 Pride & Prejudice 7 1 chapter
## 8 Pride & Prejudice 7 1 1
## 9 Pride & Prejudice 10 1 it
## 10 Pride & Prejudice 10 1 is
## # … with 122,194 more rows
afinn <- pride_prejudice %>%
inner_join(get_sentiments("afinn")) %>%
group_by(index = linenumber %/% 80) %>%
summarise(sentiment = sum(value)) %>%
mutate(method = "AFINN")
## Joining, by = "word"
bing_and_nrc <- bind_rows(
pride_prejudice %>%
inner_join(get_sentiments("bing")) %>%
mutate(method = "Bing et al."),
pride_prejudice %>%
inner_join(get_sentiments("nrc") %>%
filter(sentiment %in% c("positive",
"negative"))
) %>%
mutate(method = "NRC")) %>%
count(method, index = linenumber %/% 80, sentiment) %>%
pivot_wider(names_from = sentiment,
values_from = n,
values_fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
bind_rows(afinn,
bing_and_nrc) %>%
ggplot(aes(index, sentiment, fill = method)) +
geom_col(show.legend = FALSE) +
facet_wrap(~method, ncol = 1, scales = "free_y")
get_sentiments("nrc") %>%
filter(sentiment %in% c("positive", "negative")) %>%
count(sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 3318
## 2 positive 2308
get_sentiments("bing") %>%
count(sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
Here, individual words were explored from the bing data set. The words were plotted to visualize the impact each word had on overall sentiment.
bing_word_counts <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
bing_word_counts
## # A tibble: 2,585 × 3
## word sentiment n
## <chr> <chr> <int>
## 1 miss negative 1855
## 2 well positive 1523
## 3 good positive 1380
## 4 great positive 981
## 5 like positive 725
## 6 better positive 639
## 7 enough positive 613
## 8 happy positive 534
## 9 love positive 495
## 10 pleasure positive 462
## # … with 2,575 more rows
bing_word_counts %>%
group_by(sentiment) %>%
slice_max(n, n = 10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(x = "Contribution to sentiment",
y = NULL)
custom_stop_words <- bind_rows(tibble(word = c("miss"),
lexicon = c("custom")),
stop_words)
custom_stop_words
## # A tibble: 1,150 × 2
## word lexicon
## <chr> <chr>
## 1 miss custom
## 2 a SMART
## 3 a's SMART
## 4 able SMART
## 5 about SMART
## 6 above SMART
## 7 according SMART
## 8 accordingly SMART
## 9 across SMART
## 10 actually SMART
## # … with 1,140 more rows
Here, the saddest chapters were selected and displayed in a data frame. The ratio represents the sad words as a ratio to the total words in the chapter.
p_and_p_sentences <- tibble(text = prideprejudice) %>%
unnest_tokens(sentence, text, token = "sentences")
p_and_p_sentences$sentence[2]
## [1] "by jane austen"
austen_chapters <- austen_books() %>%
group_by(book) %>%
unnest_tokens(chapter, text, token = "regex",
pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
ungroup()
austen_chapters %>%
group_by(book) %>%
summarise(chapters = n())
## # A tibble: 6 × 2
## book chapters
## <fct> <int>
## 1 Sense & Sensibility 51
## 2 Pride & Prejudice 62
## 3 Mansfield Park 49
## 4 Emma 56
## 5 Northanger Abbey 32
## 6 Persuasion 25
bingnegative <- get_sentiments("bing") %>%
filter(sentiment == "negative")
wordcounts <- tidy_books %>%
group_by(book, chapter) %>%
summarize(words = n())
## `summarise()` has grouped output by 'book'. You can override using the `.groups` argument.
tidy_books %>%
semi_join(bingnegative) %>%
group_by(book, chapter) %>%
summarize(negativewords = n()) %>%
left_join(wordcounts, by = c("book", "chapter")) %>%
mutate(ratio = negativewords/words) %>%
filter(chapter != 0) %>%
slice_max(ratio, n = 1) %>%
ungroup()
## Joining, by = "word"
## `summarise()` has grouped output by 'book'. You can override using the `.groups` argument.
## # A tibble: 6 × 5
## book chapter negativewords words ratio
## <fct> <int> <int> <int> <dbl>
## 1 Sense & Sensibility 43 161 3405 0.0473
## 2 Pride & Prejudice 34 111 2104 0.0528
## 3 Mansfield Park 46 173 3685 0.0469
## 4 Emma 15 151 3340 0.0452
## 5 Northanger Abbey 21 149 2982 0.0500
## 6 Persuasion 4 62 1807 0.0343
I decided to explore Harry Potter for my corpus of choice. I found an R package online that contains all of the text for the seven main books in the Harry Potter series. The problem I ran into was that I could not load the correct harrypotter package. I used install.packages(harrypotter). This was a mistake. There is another Harry Potter House package that was being confused with the books package, and I could not resolve the issue for quite some time. I had to remove the package and restart R and clear all consoles and values in order to finally be able to call the correct package. The package installation command is copied from the original package on github.
The other sentiment lexicon I found and will be using was found here
Loughran-McDonald Sentiment lexicon
if (packageVersion("devtools") < 1.6) {
install.packages("devtools")
}
devtools::install_github("bradleyboehmke/harrypotter", force = TRUE)
## Downloading GitHub repo bradleyboehmke/harrypotter@HEAD
##
checking for file ‘/private/var/folders/j0/csjb4l51263g59vn3xvc41c00000gn/T/RtmpInknaq/remotes1467634399fa0/bradleyboehmke-harrypotter-51f7146/DESCRIPTION’ ...
✓ checking for file ‘/private/var/folders/j0/csjb4l51263g59vn3xvc41c00000gn/T/RtmpInknaq/remotes1467634399fa0/bradleyboehmke-harrypotter-51f7146/DESCRIPTION’
##
─ preparing ‘harrypotter’:
##
checking DESCRIPTION meta-information ...
✓ checking DESCRIPTION meta-information
##
─ checking for LF line-endings in source and make files and shell scripts
##
─ checking for empty or unneeded directories
##
─ building ‘harrypotter_0.1.0.tar.gz’
##
##
library(harrypotter)
titles <- c("Philosopher's Stone", "Chamber of Secrets", "Prisoner of Azkaban",
"Goblet of Fire", "Order of the Phoenix", "Half-Blood Prince",
"Deathly Hallows")
book <- list(philosophers_stone, chamber_of_secrets, prisoner_of_azkaban,
goblet_of_fire, order_of_the_phoenix, half_blood_prince,
deathly_hallows)
#df <- data.frame(harrypotter::philosophers_stone)
potter_tidy <- list(philosophers_stone, chamber_of_secrets, prisoner_of_azkaban,
goblet_of_fire, order_of_the_phoenix, half_blood_prince,
deathly_hallows) %>%
set_names(titles) %>%
map_df(as_tibble, .id = "book") %>%
mutate(book = factor(book, levels = titles)) %>%
drop_na(value) %>%
group_by(book) %>%
mutate(chapter = row_number(book)) %>%
ungroup() %>%
unnest_tokens(word, value)
potter_sentiment <- potter_tidy %>%
inner_join(get_sentiments("loughran")) %>%
count(book, index = chapter, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
ggplot(potter_sentiment, aes(index, sentiment, fill = book)) +
geom_col(show.legend = FALSE) +
coord_cartesian(ylim = c(-150, 50))+
facet_wrap(~book, ncol = 2, scales = "free_x")+
ggtitle("Harry Potter Sentiments: Loughran Lexicon")+
theme(plot.title = element_text(hjust = 0.5))
potter_sentiment <- potter_tidy %>%
inner_join(get_sentiments("bing")) %>%
count(book, index = chapter, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
ggplot(potter_sentiment, aes(index, sentiment, fill = book)) +
geom_col(show.legend = FALSE) +
coord_cartesian(ylim = c(-200, 100))+
facet_wrap(~book, ncol = 2, scales = "free_x")+
ggtitle("Harry Potter Sentiments: Bing Lexicon")+
theme(plot.title = element_text(hjust = 0.5))
potter_sentiment <- potter_tidy %>%
inner_join(get_sentiments("nrc")) %>%
count(book, index = chapter, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
ggplot(potter_sentiment, aes(index, sentiment, fill = book)) +
geom_col(show.legend = FALSE) +
coord_cartesian(ylim = c(-300, 100))+
facet_wrap(~book, ncol = 2, scales = "free_x")+
ggtitle("Harry Potter Sentiments: NRC Lexicon")+
theme(plot.title = element_text(hjust = 0.5))
Order of the Phoenix and Deathly Hallows are the two most negative books across the seven. The book with the most variation appears to be Half-Blood Prince, which is most clearly evidenced in the visualization using the Bing Lexicon.
From the three different sentiment lexicons, the Harry Potter books on the whole appear to offer overwhelmingly negative sentiments.
After the third book, the books are divided into more chapters and they seem to get progressively more negative. The widest range seems to be in Order of the Phoenix, so I will analyze that book.
order_phx <- potter_tidy %>%
filter(book == "Order of the Phoenix")
order_phx
## # A tibble: 258,763 × 3
## book chapter word
## <fct> <int> <chr>
## 1 Order of the Phoenix 1 dudley
## 2 Order of the Phoenix 1 demented
## 3 Order of the Phoenix 1 the
## 4 Order of the Phoenix 1 hottest
## 5 Order of the Phoenix 1 day
## 6 Order of the Phoenix 1 of
## 7 Order of the Phoenix 1 the
## 8 Order of the Phoenix 1 summer
## 9 Order of the Phoenix 1 so
## 10 Order of the Phoenix 1 far
## # … with 258,753 more rows
afinn <- order_phx %>%
inner_join(get_sentiments("afinn")) %>%
group_by(index = chapter) %>%
summarise(sentiment = sum(value)) %>%
mutate(method = "AFINN")
## Joining, by = "word"
bing_and_nrc <- bind_rows(
order_phx %>%
inner_join(get_sentiments("bing")) %>%
mutate(method = "Bing et al."),
order_phx %>%
inner_join(get_sentiments("nrc") %>%
filter(sentiment %in% c("positive",
"negative"))
) %>%
mutate(method = "NRC")) %>%
count(method, index = chapter, sentiment) %>%
pivot_wider(names_from = sentiment,
values_from = n,
values_fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
loughran <- order_phx %>%
inner_join(get_sentiments("loughran")) %>%
mutate(method = "Loughran") %>%
group_by(index = chapter) %>%
filter(sentiment %in% c("positive", "negative")) %>%
mutate(method = "Loughran") %>%
count(method, index = chapter, sentiment) %>%
pivot_wider(names_from = sentiment,
values_from = n,
values_fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
bind_rows(afinn,
bing_and_nrc, loughran) %>%
ggplot(aes(index, sentiment, fill = method)) +
geom_col(show.legend = FALSE) +
coord_cartesian(ylim = c(-300, 125))+
facet_wrap(~method, ncol = 1, scales = "free_y")
NRC seems the most polarized, so I will use NRC to find the ratio of negative words to total words. One peculiar result is that the Loughran Lexicon did not return any positive sentiments.
get_sentiments("nrc") %>%
filter(sentiment %in% c("positive", "negative")) %>%
count(sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 3318
## 2 positive 2308
get_sentiments("bing") %>%
count(sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
get_sentiments("loughran") %>%
filter(sentiment %in% c("positive", "negative")) %>%
count(sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 2355
## 2 positive 354
stone_chapters <- as.numeric(nrow(data.frame(harrypotter::philosophers_stone)))
chamber_chapters <- as.numeric(nrow(data.frame(harrypotter::chamber_of_secrets)))
prisoner_chapters <- as.numeric(nrow(data.frame(harrypotter::prisoner_of_azkaban)))
goblet_chapters <- as.numeric(nrow(data.frame(harrypotter::goblet_of_fire)))
order_chapters <- as.numeric(nrow(data.frame(harrypotter::order_of_the_phoenix)))
prince_chapters <- as.numeric(nrow(data.frame(harrypotter::half_blood_prince)))
deathly_chapters <- as.numeric(nrow(data.frame(harrypotter::deathly_hallows)))
chapters <- data.frame(c(stone_chapters, chamber_chapters, prisoner_chapters, goblet_chapters, order_chapters, prince_chapters, deathly_chapters))
chapters <- cbind(titles,chapters)
colnames(chapters) <- c("title", "count")
nrcnegative <- get_sentiments("nrc") %>%
filter(sentiment == "negative")
wordcounts <- potter_tidy %>%
group_by(book, chapter) %>%
summarize(words = n())
## `summarise()` has grouped output by 'book'. You can override using the `.groups` argument.
potter_tidy %>%
semi_join(nrcnegative) %>%
group_by(book, chapter) %>%
summarize(negativewords = n()) %>%
left_join(wordcounts, by = c("book", "chapter")) %>%
mutate(ratio = negativewords/words) %>%
filter(chapter != 0) %>%
slice_max(ratio, n = 1) %>%
ungroup()
## Joining, by = "word"
## `summarise()` has grouped output by 'book'. You can override using the `.groups` argument.
## # A tibble: 7 × 5
## book chapter negativewords words ratio
## <fct> <int> <int> <int> <dbl>
## 1 Philosopher's Stone 17 318 5488 0.0579
## 2 Chamber of Secrets 2 199 2894 0.0688
## 3 Prisoner of Azkaban 3 284 4415 0.0643
## 4 Goblet of Fire 32 159 2010 0.0791
## 5 Order of the Phoenix 36 268 3845 0.0697
## 6 Half-Blood Prince 28 274 3597 0.0762
## 7 Deathly Hallows 32 375 5552 0.0675
The most variation is found in the Order of the Phoenix and Deathly Hallows books. The Bing lexicon seems to be the most effective at getting a wider range of outcomes, which is more in line with expectation in a book. Having all of the lexicons available makes for a more accurate idea of the reality of the data. It appears that the Harry Potter books are largely negative, but I would like to explore which emotion is the main cause of the negativity. My guess is that anger would contribute more to the overall sentiment than sadness would.
Sadness contributed on average just slightly more than anger.
ggplot(potter_sentiment, aes(index, sentiment, fill = book, color = book)) +
geom_line(show.legend = FALSE) +
coord_cartesian(ylim = c(-300, 100))+
facet_wrap(~book, ncol = 2, scales = "free_x")+
ggtitle("Harry Potter Sentiments: NRC Lexicon")+
theme(plot.title = element_text(hjust = 0.5))
ggplot(potter_sentiment, aes(index, sentiment, fill = book, color = book)) +
geom_line(show.legend = FALSE) +
coord_cartesian(ylim = c(-300, 100))+
facet_wrap(~book, ncol = 2, scales = "free_x")+
ggtitle("Harry Potter Sentiments: NRC Lexicon")+
theme(plot.title = element_text(hjust = 0.5))+
geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(potter_sentiment, aes(index, sadness, color = book)) +
geom_line()+
facet_wrap(~book, ncol = 2, scales = "free_x")+
ggtitle("Harry Potter Sentiments: Sadness")+
theme(plot.title = element_text(hjust = 0.5))
ggplot(potter_sentiment, aes(index, anger, color = book)) +
geom_line()+
facet_wrap(~book, ncol = 2, scales = "free_x")+
ggtitle("Harry Potter Sentiments: Anger")+
theme(plot.title = element_text(hjust = 0.5))
ggplot(potter_sentiment, aes(index, fear, color = book)) +
geom_line()+
facet_wrap(~book, ncol = 2, scales = "free_x")+
ggtitle("Harry Potter Sentiments: Fear")+
theme(plot.title = element_text(hjust = 0.5))
sentiments_avg <- colMeans(potter_sentiment[3:13])
category <- colnames(potter_sentiment[3:13])
avg <- data.frame()
avg <- data.frame(cbind(category,as.numeric(sentiments_avg)))
colnames(avg) <- c("category", "avg")
rownames(avg) <- c(1:11)
avg$category <- as.factor(avg$category)
avg$avg <- as.numeric(avg$avg)
ggplot(avg, aes(x = category, y = avg, fill = category)) +
geom_col(stat = "identity")+
ggtitle("Harry Potter Sentiment Contributions")+
theme(plot.title = element_text(hjust = 0.5))+
scale_x_discrete(guide = guide_axis(n.dodge = 3))
## Warning: Ignoring unknown parameters: stat
Comments:
I would argue that “miss” is not as negative as it seems. It is very likely that most female characters were referred to as Miss.