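The three general-purpose lexicons are retrieved with tidytext's get_sentiments(). The afinn lexicon assigns each word a score between -5 (most negative) and 5 (most positive):
library(tidytext)
get_sentiments("afinn")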
## # A tibble: 2,477 x 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # ... with 2,467 more rows
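The bing lexicon labels words simply as positive or negative:
get_sentiments("bing")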
## # A tibble: 6,786 x 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ... with 6,776 more rows
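The nrc lexicon assigns words to the categories positive, negative, anger, anticipation, disgust, fear, joy, sadness, surprise, and trust:
get_sentiments("nrc")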
## # A tibble: 13,901 x 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ... with 13,891 more rows
library(janeaustenr)
library(dplyr)
library(stringr)
tidy_books <- austen_books() %>%
group_by(book) %>%
mutate(
linenumber = row_number(),
chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
ignore_case = TRUE
)))
) %>%
ungroup() %>%
  unnest_tokens(word, text)

nrc_joy <- get_sentiments("nrc") %>%
  filter(sentiment == "joy")
tidy_books %>%
filter(book == "Emma") %>%
inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 303 x 2
## word n
## <chr> <int>
## 1 good 359
## 2 young 192
## 3 friend 166
## 4 hope 143
## 5 happy 125
## 6 love 117
## 7 deal 92
## 8 found 92
## 9 present 89
## 10 kind 82
## # ... with 293 more rows
library(tidyr)
jane_austen_sentiment <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(book, index = linenumber %/% 80, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
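Here index = linenumber %/% 80 uses integer division to bin the text into sections of 80 lines each; the net sentiment of a section is its count of positive words minus its count of negative words.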
library(ggplot2)
ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")
To compare the three lexicons, we first use filter() to keep only the words from Pride and Prejudice:
pride_prejudice <- tidy_books %>%
  filter(book == "Pride & Prejudice")
pride_prejudice
## # A tibble: 122,204 x 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Pride & Prejudice 1 0 pride
## 2 Pride & Prejudice 1 0 and
## 3 Pride & Prejudice 1 0 prejudice
## 4 Pride & Prejudice 3 0 by
## 5 Pride & Prejudice 3 0 jane
## 6 Pride & Prejudice 3 0 austen
## 7 Pride & Prejudice 7 1 chapter
## 8 Pride & Prejudice 7 1 1
## 9 Pride & Prejudice 10 1 it
## 10 Pride & Prejudice 10 1 is
## # ... with 122,194 more rows
afinn <- pride_prejudice %>%
inner_join(get_sentiments("afinn")) %>%
group_by(index = linenumber %/% 80) %>%
summarise(sentiment = sum(value)) %>%
mutate(method = "AFINN")## Joining, by = "word"
## `summarise()` ungrouping output (override with `.groups` argument)
bing_and_nrc <- bind_rows(
pride_prejudice %>%
inner_join(get_sentiments("bing")) %>%
mutate(method = "Bing et al."),
pride_prejudice %>%
inner_join(get_sentiments("nrc") %>%
filter(sentiment %in% c(
"positive",
"negative"
))) %>%
mutate(method = "NRC")
) %>%
count(method, index = linenumber %/% 80, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
bind_rows(
afinn,
bing_and_nrc
) %>%
ggplot(aes(index, sentiment, fill = method)) +
geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")
Why do the three lexicons give somewhat different pictures? Part of the answer is that they contain different proportions of positive and negative words, which we can count directly. For nrc:
get_sentiments("nrc") %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  count(sentiment)
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 3324
## 2 positive 2312
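And for bing:
get_sentiments("bing") %>%
  count(sentiment)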
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
bing_word_counts <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
bing_word_counts
## # A tibble: 2,585 x 3
## word sentiment n
## <chr> <chr> <int>
## 1 miss negative 1855
## 2 well positive 1523
## 3 good positive 1380
## 4 great positive 981
## 5 like positive 725
## 6 better positive 639
## 7 enough positive 613
## 8 happy positive 534
## 9 love positive 495
## 10 pleasure positive 462
## # ... with 2,575 more rows
bing_word_counts %>%
group_by(sentiment) %>%
top_n(10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(
y = "Contribution to sentiment",
x = NULL
) +
  coord_flip()
## Selecting by n
custom_stop_words <- bind_rows(
tibble(
word = c("miss"),
lexicon = c("custom")
),
stop_words
)
custom_stop_words
## # A tibble: 1,150 x 2
## word lexicon
## <chr> <chr>
## 1 miss custom
## 2 a SMART
## 3 a's SMART
## 4 able SMART
## 5 about SMART
## 6 above SMART
## 7 according SMART
## 8 accordingly SMART
## 9 across SMART
## 10 actually SMART
## # ... with 1,140 more rows
library(wordcloud)
library(reshape2)
tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    colors = c("gray20", "gray80"),
    max.words = 100
  )
## Joining, by = "word"
Silge, J., & Robinson, D. (2017). Text mining with R: A tidy approach. Sebastopol, CA: O’Reilly.
Chapter 2: Sentiment Analysis with Tidy Data
See: www.tidytextmining.com/sentiment.html
We are going to analyze “Glasses”, a story by Henry James available through the gutenbergr package. It is about a young woman whose only asset is a supremely beautiful face; she is about to make a society marriage until her fiancé discovers that, being virtually blind, she needs thick glasses which ruin her looks.
Source: Glasses
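We download the text with gutenbergr's gutenberg_download(); the book's Project Gutenberg ID, 1195, appears in the gutenberg_id column of the output below.
library(gutenbergr)
mybook <- gutenberg_download(1195)
mybook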
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
## # A tibble: 1,803 x 2
## gutenberg_id text
## <int> <chr>
## 1 1195 "GLASSES"
## 2 1195 ""
## 3 1195 ""
## 4 1195 "CHAPTER I"
## 5 1195 ""
## 6 1195 ""
## 7 1195 "Yes indeed, I say to myself, pen in hand, I can keep hold of t~
## 8 1195 "and let it lead me back to the first impression. The little s~
## 9 1195 "there, I can touch it from point to point; for the thread, as ~
## 10 1195 "is a row of coloured beads on a string. None of the beads are~
## # ... with 1,793 more rows
# Restructure to one-token-per-row and remove stop words
mybook_tidy <- mybook %>%
unnest_tokens(word, text) %>%
  anti_join(stop_words)
## Joining, by = "word"
mybook_tidy
## # A tibble: 5,670 x 2
## gutenberg_id word
## <int> <chr>
## 1 1195 glasses
## 2 1195 chapter
## 3 1195 pen
## 4 1195 hand
## 5 1195 hold
## 6 1195 thread
## 7 1195 lead
## 8 1195 impression
## 9 1195 story
## 10 1195 touch
## # ... with 5,660 more rows
# Add line numbers and mark chapter boundaries
mybook_chapters <- mybook %>%
filter(text != "") %>%
mutate(linenumber = row_number(),
chapter = cumsum(str_detect(text, regex("(Chapter )([\\divxlc])",
ignore_case = TRUE
)))
) %>%
ungroup()
mybook_chapters
## # A tibble: 1,560 x 4
## gutenberg_id text linenumber chapter
## <int> <chr> <int> <int>
## 1 1195 GLASSES 1 0
## 2 1195 CHAPTER I 2 1
## 3 1195 Yes indeed, I say to myself, pen in hand, I ~ 3 1
## 4 1195 and let it lead me back to the first impress~ 4 1
## 5 1195 there, I can touch it from point to point; f~ 5 1
## 6 1195 is a row of coloured beads on a string. Non~ 6 1
## 7 1195 least I think they're not: that's exactly wh~ 7 1
## 8 1195 finding out. 8 1
## 9 1195 I had been all summer working hard in town a~ 9 1
## 10 1195 Folkestone for a blow. Art was long, I felt~ 10 1
## # ... with 1,550 more rows
Tidying by tokenizing and using the afinn lexicon
# tidying mybook_chapter by tokenizing and using afinn lexicon
mybook_chapters_tidy <- mybook_chapters %>%
unnest_tokens(word, text) %>%
inner_join(get_sentiments("afinn"))## Joining, by = "word"
mybooks_rows_plot <- mybook_chapters_tidy %>%
inner_join(get_sentiments("bing")) %>%
count(index = linenumber %/% 20, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
ggplot(mybooks_rows_plot, aes(index, sentiment)) +
  geom_col(fill = "violet", show.legend = FALSE) +
  labs(title = "Net Sentiment across the book")
We can see that the sentiment across the book varies. We are now going to analyze the net sentiment per chapter and then the overall sentiment of the book. Note that this book has 13 chapters.
# Grouping needed variables
mybook_chapters_plot <- mybook_chapters_tidy %>%
select(chapter, value) %>%
group_by(chapter) %>%
  summarize(total_sentiment = sum(value))
## `summarise()` ungrouping output (override with `.groups` argument)
# Plot
mybook_chapters_plot %>%
ggplot(aes(chapter, total_sentiment)) +
geom_col(fill = "red") +
xlab("Index - chapter") +
ylab("Net Sentiment") +
labs(title = "Net Sentiment accross the book per chapter") From the graph above we can see that the first 6 chapters have a net positive sentiment with the third one to be the most positive while chapter 7 is the most negative.
Let's take a look at the overall sentiment of the entire book using the bing lexicon:
# Get "bing" lexicon for this analysis
mybook_overall_sentiment <- mybook %>%
unnest_tokens(word, text) %>%
inner_join(get_sentiments("bing")) %>%
count(sentiment) %>%
  mutate(total = n / sum(n))
## Joining, by = "word"
# Plot
ggplot(mybook_overall_sentiment) +
aes(x = sentiment, y = total) +
geom_col(fill = "violet") +
xlab("Sentiment") +
ylab("Percent") +
labs(title = "Overall Sentiment") +
  geom_text(aes(label = round(total * 100, 2)), vjust = -.4)
There is almost as much positive contribution as there is negative.
Let's now plot the most positive and most negative words. We will use the bing lexicon here as well:
mybook %>%
unnest_tokens(word, text) %>%
inner_join(get_sentiments("bing")) %>%
filter(sentiment == "positive") %>%
count(word, sentiment, sort = TRUE) %>%
top_n(15) %>%
mutate(word = reorder(word, n)) %>%
ggplot() +
aes(x = word, y = n) +
labs(title = "Most Positive Words") +
ylab("Contribution to sentiment") +
xlab("Word") +
geom_col(fill = "blue") +
  coord_flip()
## Joining, by = "word"
## Selecting by n
mybook %>%
unnest_tokens(word, text) %>%
inner_join(get_sentiments("bing")) %>%
filter(sentiment == "negative") %>%
count(word, sentiment, sort = TRUE) %>%
top_n(15) %>%
mutate(word = reorder(word, n)) %>%
ggplot() +
aes(x = word, y = n) +
labs(title = "Most Negative Words") +
ylab("Contribution to sentiment") +
xlab("Word") +
geom_col(fill = "red") +
  coord_flip()
## Joining, by = "word"
## Selecting by n
The loughran lexicon labels words with six possible sentiments important in financial contexts: “negative”, “positive”, “litigious”, “uncertainty”, “constraining”, or “superfluous”.
Source: https://emilhvitfeldt.github.io/textdata/reference/lexicon_loughran.html
In this analysis, we are going to explore which words in “Glasses” are associated with “litigious” and “superfluous”.
We chose these categories because they are not really common in daily conversation.
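As a quick check first (a minimal sketch, assuming the loughran lexicon has already been downloaded via textdata), we can count how many words fall into each of its six categories:
get_sentiments("loughran") %>%
  count(sentiment, sort = TRUE)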
mybook_chapters %>%
unnest_tokens(word, text) %>%
inner_join(get_sentiments("loughran")) %>%
filter(sentiment %in% c("litigious", "superfluous")) %>%
count(word, sentiment, sort = TRUE) %>%
group_by(sentiment) %>%
top_n(10) %>%
  ggplot() +
  aes(x = reorder(word, desc(n)), y = n) +
  geom_col(fill = "turquoise") +
  geom_text(aes(label = n), vjust = -.5) +
  labs(title = "Words Associated with Litigious & Superfluous") +
  facet_wrap(~sentiment, ncol = 1, scales = "free_x") +
  xlab("Word") +
  ylab("Count")
## Joining, by = "word"
## Selecting by n
Although both categories are uncommon, we found several words related to “litigious” but only one related to “superfluous”, which is really interesting.
Next, we pull out the positive and negative words according to the loughran lexicon, so that we can compare them with the ones the nrc lexicon gives us.
mybook_chapters %>%
unnest_tokens(word, text) %>%
inner_join(get_sentiments("loughran")) %>%
filter(sentiment %in% c("positive", "negative")) %>%
count(word, sentiment, sort = TRUE) %>%
group_by(sentiment) %>%
top_n(10) %>%
  ggplot() +
  aes(x = reorder(word, desc(n)), y = n) +
  geom_col(fill = "turquoise") +
  geom_text(aes(label = n), vjust = -.5) +
  labs(title = "Positive and Negative Words (loughran)") +
  facet_wrap(~sentiment, ncol = 1, scales = "free_x") +
  xlab("Word") +
  ylab("Count")
## Joining, by = "word"
## Selecting by n
Now let's compare how the two lexicons classify words:
mybook_chapters %>%
unnest_tokens(word, text) %>%
inner_join(get_sentiments("nrc")) %>%
filter(sentiment %in% c("positive", "negative")) %>%
count(word, sentiment, sort = TRUE) %>%
group_by(sentiment) %>%
top_n(10) %>%
  ggplot() +
  aes(x = reorder(word, desc(n)), y = n) +
  geom_col(fill = "turquoise") +
  geom_text(aes(label = n), vjust = -.5) +
  labs(title = "Positive and Negative Words (nrc)") +
  facet_wrap(~sentiment, ncol = 1, scales = "free_x") +
  xlab("Word") +
  ylab("Count")
## Joining, by = "word"
## Selecting by n
Looking at the last two graphs, we can see that sentiment lexicons do not classify words in the same way, even when the emotion is the same. For the “positive” emotion, for example, the most frequent words in the loughran and nrc lexicons differ; only the word “good” appears in both, with the same count. For the negative emotion, on the other hand, all of the words are different. Thus, the choice of sentiment lexicon depends on which aspects of the text we want to base our sentiment analysis on. We should ask ourselves several questions about the text we want to analyze before even starting the analysis or picking a sentiment lexicon.
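One way to make this comparison systematic (a minimal sketch, assuming both lexicons have been downloaded) is to join the two lexicons by word and cross-tabulate their labels; words the lexicons tag differently show up as off-diagonal combinations:
get_sentiments("loughran") %>%
  rename(loughran = sentiment) %>%
  inner_join(get_sentiments("nrc") %>% rename(nrc = sentiment),
    by = "word"
  ) %>%
  count(loughran, nrc, sort = TRUE)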