library(tidytext)
library(tidyverse)
library(stringr)
library(textdata)
get_sentiments("afinn")
## # A tibble: 2,477 x 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # ... with 2,467 more rows
get_sentiments("bing")
## # A tibble: 6,786 x 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ... with 6,776 more rows
get_sentiments("nrc")
## # A tibble: 13,875 x 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ... with 13,865 more rows
library(janeaustenr)
library(dplyr)
tidy_books <- austen_books() %>%
group_by(book) %>%
mutate(
linenumber = row_number(),
chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
ignore_case = TRUE
)))
) %>%
ungroup() %>%
unnest_tokens(word, text)
nrc_joy <- get_sentiments("nrc") %>%
filter(sentiment == "joy")
tidy_books %>%
filter(book == "Emma") %>%
inner_join(nrc_joy) %>%
count(word, sort = TRUE)
## # A tibble: 301 x 2
## word n
## <chr> <int>
## 1 good 359
## 2 friend 166
## 3 hope 143
## 4 happy 125
## 5 love 117
## 6 deal 92
## 7 found 92
## 8 present 89
## 9 kind 82
## 10 happiness 76
## # ... with 291 more rows
library(tidyr)
jane_austen_sentiment <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(book, index = linenumber %/% 80, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)
library(ggplot2)
ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
geom_col(show.legend = FALSE) +
facet_wrap(~book, ncol = 2, scales = "free_x")
pride_prejudice <- tidy_books %>%
filter(book == "Pride & Prejudice")
pride_prejudice
## # A tibble: 122,204 x 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Pride & Prejudice 1 0 pride
## 2 Pride & Prejudice 1 0 and
## 3 Pride & Prejudice 1 0 prejudice
## 4 Pride & Prejudice 3 0 by
## 5 Pride & Prejudice 3 0 jane
## 6 Pride & Prejudice 3 0 austen
## 7 Pride & Prejudice 7 1 chapter
## 8 Pride & Prejudice 7 1 1
## 9 Pride & Prejudice 10 1 it
## 10 Pride & Prejudice 10 1 is
## # ... with 122,194 more rows
afinn <- pride_prejudice %>%
inner_join(get_sentiments("afinn")) %>%
group_by(index = linenumber %/% 80) %>%
summarise(sentiment = sum(value)) %>%
mutate(method = "AFINN")
bing_and_nrc <- bind_rows(
pride_prejudice %>%
inner_join(get_sentiments("bing")) %>%
mutate(method = "Bing et al."),
pride_prejudice %>%
inner_join(get_sentiments("nrc") %>%
filter(sentiment %in% c(
"positive",
"negative"
))) %>%
mutate(method = "NRC")
) %>%
count(method, index = linenumber %/% 80, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)
bind_rows(
afinn,
bing_and_nrc
) %>%
ggplot(aes(index, sentiment, fill = method)) +
geom_col(show.legend = FALSE) +
facet_wrap(~method, ncol = 1, scales = "free_y")
get_sentiments("nrc") %>%
filter(sentiment %in% c(
"positive",
"negative"
)) %>%
count(sentiment)
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 3318
## 2 positive 2308
get_sentiments("bing") %>%
count(sentiment)
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
bing_word_counts <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
bing_word_counts
## # A tibble: 2,585 x 3
## word sentiment n
## <chr> <chr> <int>
## 1 miss negative 1855
## 2 well positive 1523
## 3 good positive 1380
## 4 great positive 981
## 5 like positive 725
## 6 better positive 639
## 7 enough positive 613
## 8 happy positive 534
## 9 love positive 495
## 10 pleasure positive 462
## # ... with 2,575 more rows
bing_word_counts %>%
group_by(sentiment) %>%
top_n(10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(
y = "Contribution to sentiment",
x = NULL
) +
coord_flip()
custom_stop_words <- bind_rows(
tibble(
word = c("miss"),
lexicon = c("custom")
),
stop_words
)
custom_stop_words
## # A tibble: 1,150 x 2
## word lexicon
## <chr> <chr>
## 1 miss custom
## 2 a SMART
## 3 a's SMART
## 4 able SMART
## 5 about SMART
## 6 above SMART
## 7 according SMART
## 8 accordingly SMART
## 9 across SMART
## 10 actually SMART
## # ... with 1,140 more rows
library(wordcloud)
tidy_books %>%
anti_join(stop_words) %>%
count(word) %>%
with(wordcloud(word, n, max.words = 100))
library(reshape2)
tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(
colors = c("gray20", "gray80"),
max.words = 100
)
Source
Silge, J., & Robinson, D. (2017). Text mining with R: A tidy approach. Sebastopol, CA: O’Reilly.
Chapter 2: Sentiment Analysis with Tidy Data
See: www.tidytextmining.com/sentiment.html
Sentiment Analysis of “The Black Experience in America”, a book written by Norman Coombs.
We are going to analyze the book “The Black Experience in America” by Norman Coombs, available through the gutenbergr package. The book gives an interpretive insight into the history, struggle, and emancipation of Black people in America. It also discusses their migration from Africa and the rich variety of contributions they have made to America.
Source: THE BLACK EXPERIENCE IN AMERICA
library(gutenbergr)
# Download the book with Project Gutenberg id 67, "THE BLACK EXPERIENCE IN AMERICA"
norman_book <- gutenberg_download(67)
norman_book
## # A tibble: 7,967 x 2
## gutenberg_id text
## <int> <chr>
## 1 67 "THE BLACK EXPERIENCE IN AMERICA"
## 2 67 ""
## 3 67 "Published electronically by its author, Norman Coombs, and Pro~
## 4 67 "Gutenberg."
## 5 67 ""
## 6 67 "(C 1993) by Norman Coombs"
## 7 67 ""
## 8 67 ""
## 9 67 "This text is claimed under copyright to protect its integrity,~
## 10 67 "therefore you are required to pass it on intact, but you may m~
## # ... with 7,957 more rows
# Restructure to one-token-per-row and remove stop words
norman_book_tidy <- norman_book %>%
unnest_tokens(word, text) %>%
anti_join(stop_words)
norman_book_tidy
## # A tibble: 34,141 x 2
## gutenberg_id word
## <int> <chr>
## 1 67 black
## 2 67 experience
## 3 67 america
## 4 67 published
## 5 67 electronically
## 6 67 author
## 7 67 norman
## 8 67 coombs
## 9 67 project
## 10 67 gutenberg
## # ... with 34,131 more rows
# Add line numbers and detect chapter breaks
norman_book_chapters <- norman_book %>%
filter(text != "") %>%
mutate(linenumber = row_number(),
chapter = cumsum(str_detect(text, regex("(Chapter )([\\divxlc])",
ignore_case = TRUE
)))
) %>%
ungroup()
norman_book_chapters
## # A tibble: 7,132 x 4
## gutenberg_id text linenumber chapter
## <int> <chr> <int> <int>
## 1 67 THE BLACK EXPERIENCE IN AMERICA 1 0
## 2 67 Published electronically by its author, Norm~ 2 0
## 3 67 Gutenberg. 3 0
## 4 67 (C 1993) by Norman Coombs 4 0
## 5 67 This text is claimed under copyright to prot~ 5 0
## 6 67 therefore you are required to pass it on int~ 6 0
## 7 67 to your own copy. This text may be shared i~ 7 0
## 8 67 this header is included. It may be quoted f~ 8 0
## 9 67 authorship is properly credited. As the boo~ 9 0
## 10 67 has chosen to make it freely available. 10 0
## # ... with 7,122 more rows
Tidying by tokenizing and using the AFINN lexicon
# Tidy norman_book_chapters by tokenizing and joining the AFINN lexicon
norman_book_chapters_tidy <- norman_book_chapters %>%
unnest_tokens(word, text) %>%
inner_join(get_sentiments("afinn"))
# Net Bing sentiment per 20-line chunk. Note: norman_book_chapters_tidy has
# already been inner-joined with AFINN, so only words found in both lexicons remain.
norman_books_rows_plot <- norman_book_chapters_tidy %>%
inner_join(get_sentiments("bing")) %>%
count(index = linenumber %/% 20, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)
ggplot(norman_books_rows_plot, aes(index, sentiment)) +
geom_col(fill = "red", show.legend = FALSE) +
labs(title = "Net sentiment across the book")
The book contains 12 chapters, and the plot shows that sentiment varies across the book. Next we analyze the net sentiment per chapter, and then the overall sentiment of the whole book.
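As a quick sanity check on the chapter detection, we can count how many chapters the regex picked up; a minimal sketch, reusing norman_book_chapters from above (chapter 0 is the front matter before the first heading):
# Sketch: highest chapter index detected by the regex
norman_book_chapters %>%
summarise(n_chapters = max(chapter))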
# Sum the AFINN values per chapter to get the net sentiment
norman_book_chapters_plot <- norman_book_chapters_tidy %>%
select(chapter, value) %>%
group_by(chapter) %>%
summarize(total_sentiment = sum(value))
# Plot
norman_book_chapters_plot %>%
ggplot(aes(chapter, total_sentiment)) +
geom_col(fill = "purple") +
xlab("Index - chapter") +
ylab("Net Sentiment") +
labs(title = "Net Sentiment accross the book per chapter")
From the graph above we can see that the majority of chapters have a net negative sentiment and only a few are positive, with the last chapter being the most negative and chapter 2 the most positive.
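Rather than reading the extremes off the graph, we can pull the same chapters out of norman_book_chapters_plot directly; a small sketch using the top_n() helper already used elsewhere in this document:
# Sketch: chapters with the lowest and highest net AFINN sentiment
norman_book_chapters_plot %>%
top_n(-1, total_sentiment) # most negative chapter
norman_book_chapters_plot %>%
top_n(1, total_sentiment) # most positive chapter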
Let's take a look at the overall sentiment in the entire book using the Bing lexicon:
# Get "bing" lexicon for this analysis
norman_book_overall_sentiment <- norman_book %>%
unnest_tokens(word, text) %>%
inner_join(get_sentiments("bing")) %>%
count(sentiment) %>%
mutate(total = n / sum(n))
# Plot
ggplot(norman_book_overall_sentiment) +
aes(x = sentiment, y = total) +
geom_col(fill = "blue") +
xlab("Sentiment") +
ylab("Percent") +
labs(title = "Overall Sentiment") +
geom_text(aes(label = round(total * 100, 2) , vjust = -.4))
From the plot, it is clear that negative words contribute more to the sentiment than positive words do.
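To put a number on that imbalance, one option is to compute the ratio of negative to positive matches from the same summary table; a small sketch, reusing norman_book_overall_sentiment as built above:
# Sketch: number of negative Bing matches per positive match
norman_book_overall_sentiment %>%
summarise(neg_to_pos_ratio = n[sentiment == "negative"] / n[sentiment == "positive"])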
Let's now plot the most positive and the most negative words, again using the Bing lexicon:
norman_book %>%
unnest_tokens(word, text) %>%
inner_join(get_sentiments("bing")) %>%
filter(sentiment == "positive") %>%
count(word, sentiment, sort = TRUE) %>%
top_n(15) %>%
mutate(word = reorder(word, n)) %>%
ggplot() +
aes(x = word, y = n) +
labs(title = "Most Positive Words") +
ylab("Contribution to sentiment") +
xlab("Word") +
geom_col(fill = "purple") +
coord_flip()
norman_book %>%
unnest_tokens(word, text) %>%
inner_join(get_sentiments("bing")) %>%
filter(sentiment == "negative") %>%
count(word, sentiment, sort = TRUE) %>%
top_n(15) %>%
mutate(word = reorder(word, n)) %>%
ggplot() +
aes(x = word, y = n) +
labs(title = "Most Negative Words") +
ylab("Contribution to sentiment") +
xlab("Word") +
geom_col(fill = "red") +
coord_flip()
library(wordcloud)
# norman_book_tidy already has stop words removed, so the cloud can be built directly
norman_book_tidy %>%
count(word) %>%
with(wordcloud(word, n, max.words = 100))
# Load the Loughran-McDonald lexicon
loughran <- get_sentiments("loughran")
We now want to compare the positive and negative words identified by the NRC lexicon with those identified by the Loughran-McDonald lexicon.
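Before plotting, it is worth checking how many positive and negative entries each lexicon contributes, mirroring the NRC and Bing counts earlier; a quick sketch:
# Sketch: number of positive and negative entries in each lexicon
get_sentiments("loughran") %>%
filter(sentiment %in% c("positive", "negative")) %>%
count(sentiment)
get_sentiments("nrc") %>%
filter(sentiment %in% c("positive", "negative")) %>%
count(sentiment)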
norman_book_chapters %>%
unnest_tokens(word, text) %>%
inner_join(get_sentiments("loughran")) %>%
filter(sentiment %in% c("positive", "negative")) %>%
count(word, sentiment, sort = TRUE) %>%
group_by(sentiment) %>%
top_n(10) %>%
ggplot() +
aes(x = reorder(word, desc(n)), y = n) +
geom_col(fill = "violet") +
geom_text(aes(label = n), vjust = 0.4) +
facet_wrap(~sentiment, ncol = 1, scales = "free_x") +
labs(title = "Negative and positive words (Loughran-McDonald lexicon)") +
xlab("Word") +
ylab("Count")
For comparison, here is the same plot built with the NRC lexicon, to see how the two lexicons classify words:
norman_book_chapters %>%
unnest_tokens(word, text) %>%
inner_join(get_sentiments("nrc")) %>%
filter(sentiment %in% c("positive", "negative")) %>%
count(word, sentiment, sort = TRUE) %>%
group_by(sentiment) %>%
top_n(10) %>%
ggplot() +
aes(x = reorder(word, desc(n)), y = n) +
geom_col(fill = "violet") +
geom_text(aes(label = n), vjust = 0.4) +
facet_wrap(~sentiment, ncol = 1, scales = "free_x") +
labs(title = "Negative and positive words (NRC lexicon)") +
xlab("Word") +
ylab("Count")
Looking at the last two graphs, we can see that the sentiment lexicons do not classify words in the same way, even when the emotion is the same. For example, the most frequent “positive” words in the Loughran-McDonald lexicon are not the same as in the NRC lexicon, and the top negative and positive words differ between the two. Thus, choosing a sentiment lexicon depends on the specific aspects of the text we want our sentiment analysis to capture.
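One way to make that disagreement concrete is to join the two lexicons on word and keep only the words they label with opposite polarity; a minimal sketch (the loughran_pn and nrc_pn names are introduced here just for this check):
# Sketch: words labelled positive by one lexicon and negative by the other
loughran_pn <- get_sentiments("loughran") %>%
filter(sentiment %in% c("positive", "negative")) %>%
rename(loughran = sentiment)
nrc_pn <- get_sentiments("nrc") %>%
filter(sentiment %in% c("positive", "negative")) %>%
rename(nrc = sentiment)
inner_join(loughran_pn, nrc_pn, by = "word") %>%
filter(loughran != nrc) %>%
count(loughran, nrc)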