In Text Mining with R, Chapter 2 looks at Sentiment Analysis. In this assignment, you should start by getting the primary example code from chapter 2 working in an R Markdown document. You should provide a citation to this base code. You’re then asked to extend the code in two ways:
- Work with a different corpus of your choosing, and
- Incorporate at least one additional sentiment lexicon (possibly from another R package that you've found through research).

As usual, please submit links to both an .Rmd file posted in your GitHub repository and to your code on rpubs.com. You may work in a small team on this assignment.
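One note up front on the second requirement: in recent versions of tidytext, the AFINN, NRC, and Loughran lexicons are distributed through the textdata package, which prompts for a one-time download. A minimal sketch of pulling a lexicon that way, assuming textdata is installed:

library(textdata)
# get_sentiments("nrc") and get_sentiments("loughran") delegate to textdata;
# the lexicon_* functions can also be called directly
loughran <- lexicon_loughran()     # tibble with columns: word, sentiment
dplyr::count(loughran, sentiment)  # sentiment categories and their sizes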
library(tidyverse)
## -- Attaching packages ----------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts -------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(janeaustenr)
## Warning: package 'janeaustenr' was built under R version 4.0.3
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.0.3
library(stringr)
library(dplyr)
library(tidyr)
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 4.0.3
## Loading required package: RColorBrewer
I took the code directly from the text: Chapter 2, "Sentiment Analysis with Tidy Data," in Text Mining with R: A Tidy Approach, by Julia Silge and David Robinson, O'Reilly Media, 2017. I also downloaded the NRC lexicon, developed by Saif Mohammad and Peter Turney.
get_sentiments("afinn")
## # A tibble: 2,477 x 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # ... with 2,467 more rows
get_sentiments("bing")
## # A tibble: 6,786 x 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ... with 6,776 more rows
get_sentiments("nrc")
## # A tibble: 13,901 x 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ... with 13,891 more rows
tidy_books <- austen_books() %>%
group_by(book) %>%
mutate(linenumber = row_number(),
chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
ignore_case = TRUE)))) %>%
ungroup() %>%
unnest_tokens(word, text)
nrc_joy <- get_sentiments("nrc") %>%
filter(sentiment == "joy")
tidy_books %>%
filter(book == "Emma") %>%
inner_join(nrc_joy) %>%
count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 303 x 2
## word n
## <chr> <int>
## 1 good 359
## 2 young 192
## 3 friend 166
## 4 hope 143
## 5 happy 125
## 6 love 117
## 7 deal 92
## 8 found 92
## 9 present 89
## 10 kind 82
## # ... with 293 more rows
janeaustensentiment <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(book, index = linenumber %/% 80, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
ggplot(janeaustensentiment, aes(index, sentiment, fill = book)) +
geom_col(show.legend = FALSE) +
facet_wrap(~book, ncol = 2, scales = "free_x")
# comparing 3 sentiment dictionaries
pride_prejudice <- tidy_books %>%
filter(book == "Pride & Prejudice")
pride_prejudice
## # A tibble: 122,204 x 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Pride & Prejudice 1 0 pride
## 2 Pride & Prejudice 1 0 and
## 3 Pride & Prejudice 1 0 prejudice
## 4 Pride & Prejudice 3 0 by
## 5 Pride & Prejudice 3 0 jane
## 6 Pride & Prejudice 3 0 austen
## 7 Pride & Prejudice 7 1 chapter
## 8 Pride & Prejudice 7 1 1
## 9 Pride & Prejudice 10 1 it
## 10 Pride & Prejudice 10 1 is
## # ... with 122,194 more rows
afinn <- pride_prejudice %>%
inner_join(get_sentiments("afinn")) %>%
group_by(index = linenumber %/% 80) %>%
summarise(sentiment = sum(value)) %>%
mutate(method = "AFINN")
## Joining, by = "word"
## `summarise()` ungrouping output (override with `.groups` argument)
bing_and_nrc <- bind_rows(pride_prejudice %>%
inner_join(get_sentiments("bing")) %>%
mutate(method = "Bing et al."),
pride_prejudice %>%
inner_join(get_sentiments("nrc") %>%
filter(sentiment %in% c("positive",
"negative"))) %>%
mutate(method = "NRC")) %>%
count(method, index = linenumber %/% 80, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
get_sentiments("nrc") %>%
filter(sentiment %in% c("positive",
"negative")) %>%
count(sentiment)
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 3324
## 2 positive 2312
get_sentiments("bing") %>%
count(sentiment)
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
# most common positive and negative words
bing_word_counts <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
bing_word_counts
## # A tibble: 2,585 x 3
## word sentiment n
## <chr> <chr> <int>
## 1 miss negative 1855
## 2 well positive 1523
## 3 good positive 1380
## 4 great positive 981
## 5 like positive 725
## 6 better positive 639
## 7 enough positive 613
## 8 happy positive 534
## 9 love positive 495
## 10 pleasure positive 462
## # ... with 2,575 more rows
bing_word_counts %>%
group_by(sentiment) %>%
top_n(10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
coord_flip()
## Selecting by n
custom_stop_words <- bind_rows(tibble(word = c("miss"),
lexicon = c("custom")),
stop_words)
custom_stop_words
## # A tibble: 1,150 x 2
## word lexicon
## <chr> <chr>
## 1 miss custom
## 2 a SMART
## 3 a's SMART
## 4 able SMART
## 5 about SMART
## 6 above SMART
## 7 according SMART
## 8 accordingly SMART
## 9 across SMART
## 10 actually SMART
## # ... with 1,140 more rows
# wordclouds
tidy_books %>%
anti_join(stop_words) %>%
count(word) %>%
with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"
Hamlet, by William Shakespeare, is one of my favorite plays. I decided to use the gutenbergr package to download the play from Project Gutenberg.
library(gutenbergr)
## Warning: package 'gutenbergr' was built under R version 4.0.3
gutenberg_metadata %>%
filter(author == "Shakespeare, William",
title == "Hamlet",
language == "en",
!str_detect(rights, "Copyright"))
## # A tibble: 1 x 8
## gutenberg_id title author gutenberg_autho~ language gutenberg_books~ rights
## <int> <chr> <chr> <int> <chr> <chr> <chr>
## 1 2265 Haml~ Shake~ 65 en Best Books Ever~ Publi~
## # ... with 1 more variable: has_text <lgl>
According to the metadata search, 2265 is the gutenberg_id I need to download Hamlet.
hamlet <- gutenberg_download(2265)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
hamlet
## # A tibble: 5,013 x 2
## gutenberg_id text
## <int> <chr>
## 1 2265 "Executive Director's Notes:"
## 2 2265 ""
## 3 2265 "In addition to the notes below, and so you will *NOT* think al~
## 4 2265 "the spelling errors introduced by the printers of the time hav~
## 5 2265 "been corrected, here are the first few lines of Hamlet, as the~
## 6 2265 "are presented herein:"
## 7 2265 ""
## 8 2265 " Barnardo. Who's there?"
## 9 2265 " Fran. Nay answer me: Stand & vnfold"
## 10 2265 "your selfe"
## # ... with 5,003 more rows
tidy_hamlet <- hamlet %>%
unnest_tokens(word, text)
tidy_hamlet
## # A tibble: 30,384 x 2
## gutenberg_id word
## <int> <chr>
## 1 2265 executive
## 2 2265 director's
## 3 2265 notes
## 4 2265 in
## 5 2265 addition
## 6 2265 to
## 7 2265 the
## 8 2265 notes
## 9 2265 below
## 10 2265 and
## # ... with 30,374 more rows
data(stop_words)
tidy_hamlet <- tidy_hamlet %>%
anti_join(stop_words)
## Joining, by = "word"
tidy_hamlet %>%
count(word, sort = TRUE)
## # A tibble: 4,583 x 2
## word n
## <chr> <int>
## 1 ham 337
## 2 lord 211
## 3 haue 175
## 4 king 172
## 5 thou 105
## 6 hamlet 102
## 7 hor 95
## 8 thy 90
## 9 enter 85
## 10 selfe 69
## # ... with 4,573 more rows
tidy_hamlet %>%
count(word, sort = TRUE) %>%
top_n(20, n) %>%
ggplot(aes(x = fct_reorder(word, n), y = n, fill = word)) +
geom_col(show.legend = FALSE) +
scale_fill_viridis_d(option = "inferno") +
coord_flip() +
xlab(NULL) +
labs(title = "Hamlet - Word Frequency") +
theme_minimal()
The most frequent word is "ham", which is not an ordinary word but the abbreviated speaker label for Hamlet in this edition of the play.
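Since "ham" and "hor" are speaker abbreviations (Hamlet and Horatio) rather than meaningful words, they can be handled the same way "miss" was for Austen. A sketch using only the abbreviations visible in the frequency table above; more labels could be added after further inspection:

# extend the custom stop word idea from the Austen example to
# Hamlet's speaker abbreviations
hamlet_stop_words <- bind_rows(tibble(word = c("ham", "hor"),
                                      lexicon = "custom"),
                               stop_words)
tidy_hamlet %>%
  anti_join(hamlet_stop_words, by = "word") %>%
  count(word, sort = TRUE)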
tidy_hamlet <- hamlet %>%
  mutate(linenumber = row_number(),
         # the chapter regex is carried over from the Austen example; this
         # Folio text is divided into acts and scenes, so chapter stays at 0
         chapter = cumsum(str_detect(text,
                                     regex("^chapter [\\divxlc]",
                                           ignore_case = TRUE)))) %>%
  unnest_tokens(word, text)
sentiment_hamlet <- get_sentiments("nrc")
sentiment_hamlet
## # A tibble: 13,901 x 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ... with 13,891 more rows
nrc_joy <- get_sentiments("nrc") %>%
filter(sentiment == "joy")
nrc_joy
## # A tibble: 689 x 2
## word sentiment
## <chr> <chr>
## 1 absolution joy
## 2 abundance joy
## 3 abundant joy
## 4 accolade joy
## 5 accompaniment joy
## 6 accomplish joy
## 7 accomplished joy
## 8 achieve joy
## 9 achievement joy
## 10 acrobat joy
## # ... with 679 more rows
tidy_hamlet %>%
inner_join(nrc_joy) %>%
count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 119 x 2
## word n
## <chr> <int>
## 1 good 98
## 2 mother 40
## 3 pray 28
## 4 god 26
## 5 sweet 22
## 6 true 22
## 7 art 16
## 8 daughter 16
## 9 marry 15
## 10 youth 14
## # ... with 109 more rows
bing_hamlet <- tidy_hamlet %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
bing_hamlet
## # A tibble: 533 x 3
## word sentiment n
## <chr> <chr> <int>
## 1 good positive 98
## 2 like positive 81
## 3 well positive 71
## 4 death negative 36
## 5 mar negative 31
## 6 dead negative 30
## 7 sweet positive 22
## 8 mad negative 21
## 9 great positive 20
## 10 noble positive 16
## # ... with 523 more rows
bing_hamlet %>%
group_by(sentiment) %>%
top_n(10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
coord_flip() +
geom_text(aes(label = n, hjust = 1.0))
## Selecting by n
"Death", "mar", and "dead" contribute most to negative sentiment in Hamlet, while "good", "like", and "well" contribute most to positive sentiment. (Note that "mar" is most likely the abbreviated speaker label for Marcellus rather than the verb, so it is probably a false positive.)
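To see how sentiment moves across the play, the 80-line chunking from the Austen example can be reused with the linenumber column created above. A sketch of that trajectory under the Bing lexicon:

# net Bing sentiment across Hamlet in 80-line chunks, mirroring the
# index = linenumber %/% 80 approach from the Austen example
hamlet_trajectory <- tidy_hamlet %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
ggplot(hamlet_trajectory, aes(index, sentiment)) +
  geom_col(show.legend = FALSE) +
  labs(title = "Hamlet - Net Bing Sentiment per 80-Line Chunk")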
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.0.3
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
tidy_hamlet %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("gray20", "gray80"),
max.words = 100)
## Joining, by = "word"
I will use the Loughran lexicon, available through get_sentiments("loughran"), to perform an additional sentiment analysis.
get_sentiments("loughran")
## # A tibble: 4,150 x 2
## word sentiment
## <chr> <chr>
## 1 abandon negative
## 2 abandoned negative
## 3 abandoning negative
## 4 abandonment negative
## 5 abandonments negative
## 6 abandons negative
## 7 abdicated negative
## 8 abdicates negative
## 9 abdicating negative
## 10 abdication negative
## # ... with 4,140 more rows
lexi_hamlet <- tidy_hamlet %>%
inner_join(get_sentiments("loughran")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
lexi_hamlet
## # A tibble: 241 x 3
## word sentiment n
## <chr> <chr> <int>
## 1 shall litigious 107
## 2 good positive 98
## 3 may uncertainty 69
## 4 could uncertainty 31
## 5 might uncertainty 30
## 6 against negative 20
## 7 great positive 20
## 8 best positive 14
## 9 better positive 14
## 10 question negative 14
## # ... with 231 more rows
lexi_hamlet %>%
group_by(sentiment) %>%
top_n(10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
coord_flip() +
geom_text(aes(label = n, hjust = 1.0))
## Selecting by n
After applying the Loughran lexicon, "shall" is the most common litigious word in Hamlet (107 occurrences), surpassing even "good" (98), the top positive word from the Bing analysis. "May", "could", and "might" lead the uncertainty category, a dimension the Bing lexicon does not capture at all.
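As a final check on how the lexicon choice shifts the overall picture, the positive and negative counts under Bing and Loughran can be put side by side. A sketch (Loughran's extra categories such as litigious and uncertainty are dropped by the filter):

# compare overall positive vs. negative word counts in Hamlet
# under the Bing and Loughran lexicons
bind_rows(tidy_hamlet %>%
            inner_join(get_sentiments("bing"), by = "word") %>%
            mutate(method = "Bing"),
          tidy_hamlet %>%
            inner_join(get_sentiments("loughran") %>%
                         filter(sentiment %in% c("positive", "negative")),
                       by = "word") %>%
            mutate(method = "Loughran")) %>%
  count(method, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(net = positive - negative)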