In Text Mining with R, Chapter 2 looks at Sentiment Analysis. In this assignment, you should start by getting the primary example code from chapter 2 working in an R Markdown document. You should provide a citation to this base code. You’re then asked to extend the code in two ways:
Work with a different corpus of your choosing, and
Incorporate at least one additional sentiment lexicon (possibly from another R package that you’ve found through research).
The base code for this assignment is from Chapter 2 of Text Mining with R: A Tidy Approach:
https://www.tidytextmining.com/sentiment.html
library("tidyverse")
library("janeaustenr")
library("stringr")
library("tidytext")
# Show the AFINN lexicon: one row per word with a numeric `value` score.
get_sentiments("afinn")
## # A tibble: 2,477 × 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # ℹ 2,467 more rows
# Show the Bing lexicon: one row per word with a categorical `sentiment` label.
get_sentiments("bing")
## # A tibble: 6,786 × 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ℹ 6,776 more rows
# Show the NRC lexicon: a word can appear on several rows, one per `sentiment`
# category it is tagged with.
get_sentiments("nrc")
## # A tibble: 13,872 × 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ℹ 13,862 more rows
# Build a one-word-per-row table of the Austen novels, keeping track of the
# line number and chapter (within each book) that every word came from.
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    # A chapter heading is a line starting with "chapter" followed by a digit
    # or roman-numeral letter; the running count of headings is the chapter
    # number (0 for lines before the first heading).
    chapter = text %>%
      str_detect(regex("^chapter [\\divxlc]", ignore_case = TRUE)) %>%
      cumsum()
  ) %>%
  ungroup() %>%
  unnest_tokens(word, text)
# NRC lexicon: keep only the words tagged with the "joy" sentiment.
nrc_joy <- filter(get_sentiments("nrc"), sentiment == "joy")

# Joy words that occur in "Emma", most frequent first.
tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## # A tibble: 301 × 2
## word n
## <chr> <int>
## 1 good 359
## 2 friend 166
## 3 hope 143
## 4 happy 125
## 5 love 117
## 6 deal 92
## 7 found 92
## 8 present 89
## 9 kind 82
## 10 happiness 76
## # ℹ 291 more rows
# Bing lexicon: net sentiment (positive minus negative word count) for each
# 80-line section of every novel.
jane_austen_sentiment <- tidy_books %>%
  # Name the join key explicitly instead of relying on shared column names.
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  # pivot_wider() replaces the superseded spread(); sections that lack one of
  # the two labels get a count of 0 so the subtraction below is defined.
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)
# Plot the net sentiment of each section, one panel per novel. The x axis is
# freed per panel because the novels differ in length.
jane_austen_sentiment %>%
  ggplot(aes(x = index, y = sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(book), ncol = 2, scales = "free_x")
# Keep only the words that belong to "Pride & Prejudice".
pride_prejudice <- filter(tidy_books, book == "Pride & Prejudice")

pride_prejudice
## # A tibble: 122,204 × 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Pride & Prejudice 1 0 pride
## 2 Pride & Prejudice 1 0 and
## 3 Pride & Prejudice 1 0 prejudice
## 4 Pride & Prejudice 3 0 by
## 5 Pride & Prejudice 3 0 jane
## 6 Pride & Prejudice 3 0 austen
## 7 Pride & Prejudice 7 1 chapter
## 8 Pride & Prejudice 7 1 1
## 9 Pride & Prejudice 10 1 it
## 10 Pride & Prejudice 10 1 is
## # ℹ 122,194 more rows
# AFINN gives every word a numeric score, so the sentiment of an 80-line
# section is simply the sum of the scores of its words.
afinn <- pride_prejudice %>%
  inner_join(get_sentiments("afinn")) %>%
  mutate(index = linenumber %/% 80) %>%
  group_by(index) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")
# Bing and NRC label words as positive/negative rather than scoring them, so
# the sentiment of an 80-line section is (# positive words) - (# negative
# words), computed separately for each lexicon.
bing_and_nrc <- bind_rows(
  pride_prejudice %>%
    inner_join(get_sentiments("bing"), by = "word") %>%
    mutate(method = "Bing et al."),
  pride_prejudice %>%
    inner_join(
      # NRC also carries emotion categories; keep only the two polarities.
      get_sentiments("nrc") %>%
        filter(sentiment %in% c("positive", "negative")),
      by = "word"
    ) %>%
    mutate(method = "NRC")
) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  # pivot_wider() replaces the superseded spread(); missing labels count as 0.
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)
# Stack the per-section results of all three lexicons and draw one panel per
# method; the y axis is freed because the methods use different scales.
bind_rows(afinn, bing_and_nrc) %>%
  ggplot(aes(x = index, y = sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(method), ncol = 1, scales = "free_y")
# How many NRC words are labelled positive and how many negative?
get_sentiments("nrc") %>%
  filter(sentiment == "positive" | sentiment == "negative") %>%
  count(sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 3316
## 2 positive 2308
# Same tally for the Bing lexicon, which only has positive/negative labels.
count(get_sentiments("bing"), sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
# Count how often each Bing-labelled word occurs across all novels, i.e. how
# much every word contributes to its sentiment; most frequent words first.
bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment) %>%
  arrange(desc(n)) %>%
  ungroup()

bing_word_counts
## # A tibble: 2,585 × 3
## word sentiment n
## <chr> <chr> <int>
## 1 miss negative 1855
## 2 well positive 1523
## 3 good positive 1380
## 4 great positive 981
## 5 like positive 725
## 6 better positive 639
## 7 enough positive 613
## 8 happy positive 534
## 9 love positive 495
## 10 pleasure positive 462
## # ℹ 2,575 more rows
# Plot the ten most frequent words for each Bing sentiment.
bing_word_counts %>%
  group_by(sentiment) %>%
  # slice_max() replaces the superseded top_n(10), which picked the ranking
  # column implicitly (the last column); ties are kept, as before.
  slice_max(n, n = 10) %>%
  ungroup() %>%
  # Order the words by frequency so the bars are sorted within the plot.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(
    y = "Contribution to sentiment",
    x = NULL
  ) +
  coord_flip()
# Extend the stock stop-word list with "miss", tagged with its own "custom"
# lexicon label; the custom row is placed first.
custom_stop_words <- tibble(word = "miss", lexicon = "custom") %>%
  bind_rows(stop_words)

custom_stop_words
## # A tibble: 1,150 × 2
## word lexicon
## <chr> <chr>
## 1 miss custom
## 2 a SMART
## 3 a's SMART
## 4 able SMART
## 5 about SMART
## 6 above SMART
## 7 according SMART
## 8 accordingly SMART
## 9 across SMART
## 10 actually SMART
## # ℹ 1,140 more rows
library(wordcloud)
# Word cloud of the 100 most common words across the novels after removing
# the stock stop words (stop_words, not custom_stop_words).
tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(words = word, freq = n, max.words = 100))
library(reshape2)
# Comparison cloud of Bing-labelled words. acast() reshapes the counts into a
# word-by-sentiment matrix (0 where a word lacks that label), which is the
# input comparison.cloud() is given here.
tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    colors = c("gray20", "gray80"),
    max.words = 100
  )
# Tokenize "Pride and Prejudice" into sentences instead of single words.
p_and_p_sentences <- unnest_tokens(
  tibble(text = prideprejudice),
  sentence,
  text,
  token = "sentences"
)

# Look at the second sentence as a quick sanity check.
p_and_p_sentences[["sentence"]][2]
## [1] "by jane austen"
# Split every novel into chapter-sized pieces using a regex tokenizer.
# Note: `|` binds loosest, so the pattern matches either "Chapter" on its own
# or "CHAPTER " followed by a digit / roman-numeral letter.
austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(
    chapter,
    text,
    token = "regex",
    pattern = "Chapter|CHAPTER [\\dIVXLC]"
  ) %>%
  ungroup()

# Number of pieces obtained for each book.
austen_chapters %>%
  count(book, name = "chapters")
## # A tibble: 6 × 2
## book chapters
## <fct> <int>
## 1 Sense & Sensibility 51
## 2 Pride & Prejudice 62
## 3 Mansfield Park 49
## 4 Emma 56
## 5 Northanger Abbey 32
## 6 Persuasion 25
# Find, for each novel, the chapter with the highest share of negative words.

# Negative words according to the Bing lexicon.
bingnegative <- get_sentiments("bing") %>%
  filter(sentiment == "negative")

# Total number of words in every chapter of every book (the denominator).
wordcounts <- tidy_books %>%
  group_by(book, chapter) %>%
  summarize(words = n(), .groups = "drop_last")

tidy_books %>%
  semi_join(bingnegative, by = "word") %>%
  group_by(book, chapter) %>%
  # Keep the result grouped by book on purpose: slice_max() below must pick
  # the top chapter *within each book*. Stating .groups makes that dependency
  # explicit instead of relying on summarize()'s default.
  summarize(negativewords = n(), .groups = "drop_last") %>%
  left_join(wordcounts, by = c("book", "chapter")) %>%
  mutate(ratio = negativewords / words) %>%
  # Chapter 0 is the text before the first chapter heading.
  filter(chapter != 0) %>%
  slice_max(ratio, n = 1) %>%
  ungroup()
## # A tibble: 6 × 5
## book chapter negativewords words ratio
## <fct> <int> <int> <int> <dbl>
## 1 Sense & Sensibility 43 161 3405 0.0473
## 2 Pride & Prejudice 34 111 2104 0.0528
## 3 Mansfield Park 46 173 3685 0.0469
## 4 Emma 15 151 3340 0.0452
## 5 Northanger Abbey 21 149 2982 0.0500
## 6 Persuasion 4 62 1807 0.0343
“Jane Eyre” by Charlotte Brontë will be used.
The Gutenberg ID for “Jane Eyre” is 1260
library(gutenbergr)
# Download the text with Project Gutenberg ID 1260.
jane_eyre <- gutenberg_download(1260)

# Tokenize into one word per row, recording the line number and the chapter
# (running count of lines that start with "chapter" + digit/roman numeral).
tidy_jane <- jane_eyre %>%
  mutate(
    linenumber = row_number(),
    chapter = text %>%
      str_detect(regex("^chapter [\\divxlc]", ignore_case = TRUE)) %>%
      cumsum()
  ) %>%
  unnest_tokens(word, text)
# NRC lexicon: keep only the words tagged with the "fear" sentiment.
fear <- filter(get_sentiments("nrc"), sentiment == "fear")

# Fear words in the novel, most frequent first.
jane_sentiment1 <- tidy_jane %>%
  inner_join(fear) %>%
  count(word) %>%
  arrange(desc(n))

jane_sentiment1
## # A tibble: 675 × 2
## word n
## <chr> <int>
## 1 fire 122
## 2 god 96
## 3 feeling 67
## 4 doubt 62
## 5 fear 62
## 6 death 55
## 7 marry 49
## 8 change 41
## 9 bear 40
## 10 die 38
## # ℹ 665 more rows
# Bing lexicon: proportion of negative words in each chapter.

# Total number of words per chapter (the denominator for the proportions).
word_totals <- tidy_jane %>%
  count(chapter)

jane_sentiment2 <- tidy_jane %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(chapter, sentiment) %>%
  filter(sentiment == "negative") %>%
  # Match totals to chapters by key. Dividing by word_totals$n positionally
  # only works while every chapter has at least one negative word; otherwise
  # the rows misalign or the totals are recycled.
  left_join(word_totals, by = "chapter", suffix = c("", "_total")) %>%
  mutate(p = n / n_total) %>%
  select(chapter, sentiment, n, p) %>%
  arrange(desc(p)) %>%
  # Keep returning a plain data frame so every chapter is printed.
  as.data.frame()

jane_sentiment2
## chapter sentiment n p
## 1 2 negative 181 0.06531938
## 2 6 negative 177 0.06055423
## 3 27 negative 558 0.04943741
## 4 3 negative 158 0.04891641
## 5 7 negative 168 0.04661487
## 6 20 negative 264 0.04488269
## 7 35 negative 194 0.04429224
## 8 4 negative 259 0.04415274
## 9 0 negative 42 0.04379562
## 10 14 negative 212 0.04252758
## 11 9 negative 139 0.04242979
## 12 26 negative 179 0.04146398
## 13 8 negative 125 0.04145937
## 14 21 negative 364 0.04094949
## 15 1 negative 80 0.04094166
## 16 15 negative 205 0.04040205
## 17 25 negative 205 0.04025923
## 18 36 negative 158 0.03979849
## 19 31 negative 122 0.03874246
## 20 28 negative 252 0.03609281
## 21 5 negative 175 0.03468094
## 22 23 negative 138 0.03448276
## 23 33 negative 167 0.03431977
## 24 37 negative 258 0.03414505
## 25 18 negative 201 0.03380992
## 26 12 negative 142 0.03359357
## 27 16 negative 124 0.03272631
## 28 30 negative 122 0.03236074
## 29 24 negative 232 0.03206634
## 30 29 negative 140 0.03052769
## 31 34 negative 285 0.03041298
## 32 32 negative 139 0.02993754
## 33 13 negative 114 0.02810651
## 34 17 negative 226 0.02759800
## 35 22 negative 77 0.02632479
## 36 19 negative 98 0.02564774
## 37 38 negative 42 0.02283850
## 38 11 negative 148 0.02267157
## 39 10 negative 91 0.02068182
# Loughran lexicon: an English sentiment lexicon created for financial
# documents. It labels words with one of six sentiments: "negative",
# "positive", "litigious", "uncertainty", "constraining" or "superfluous".
# Here: the proportion of positive words in each chapter.
jane_sentiment3 <- tidy_jane %>%
  inner_join(get_sentiments("loughran"), by = "word") %>%
  count(chapter, sentiment) %>%
  filter(sentiment == "positive") %>%
  # Match totals to chapters by key. Dividing by word_totals$n positionally
  # only works while every chapter has at least one positive word; otherwise
  # the rows misalign or the totals are recycled.
  left_join(word_totals, by = "chapter", suffix = c("", "_total")) %>%
  mutate(p = n / n_total) %>%
  select(chapter, sentiment, n, p) %>%
  arrange(desc(p)) %>%
  # Keep returning a plain data frame so every chapter is printed.
  as.data.frame()

jane_sentiment3
## chapter sentiment n p
## 1 32 positive 68 0.014645703
## 2 31 positive 45 0.014290251
## 3 30 positive 50 0.013262599
## 4 38 positive 23 0.012506797
## 5 34 positive 114 0.012165190
## 6 9 positive 39 0.011904762
## 7 16 positive 41 0.010820797
## 8 8 positive 31 0.010281924
## 9 10 positive 45 0.010227273
## 10 11 positive 65 0.009957108
## 11 22 positive 28 0.009572650
## 12 6 positive 27 0.009237085
## 13 24 positive 66 0.009122322
## 14 33 positive 44 0.009042335
## 15 35 positive 39 0.008904110
## 16 4 positive 52 0.008864644
## 17 14 positive 43 0.008625878
## 18 18 positive 51 0.008578638
## 19 5 positive 43 0.008521601
## 20 29 positive 39 0.008504143
## 21 0 positive 8 0.008342023
## 22 37 positive 62 0.008205400
## 23 12 positive 34 0.008043530
## 24 15 positive 40 0.007883327
## 25 25 positive 40 0.007855460
## 26 23 positive 31 0.007746127
## 27 21 positive 64 0.007199910
## 28 13 positive 28 0.006903353
## 29 1 positive 13 0.006653019
## 30 19 positive 25 0.006542790
## 31 2 positive 18 0.006495850
## 32 17 positive 52 0.006349982
## 33 36 positive 25 0.006297229
## 34 3 positive 20 0.006191950
## 35 27 positive 69 0.006113228
## 36 20 positive 35 0.005950357
## 37 28 positive 39 0.005585792
## 38 7 positive 20 0.005549390
## 39 26 positive 18 0.004169562
# Bar plot: proportion of positive (Loughran) words per chapter of Jane Eyre,
# chapters ordered from highest to lowest proportion.
# The bars use `p` (the proportion); plotting the raw count `n` would
# contradict the axis label and favour long chapters.
ggplot(
  jane_sentiment3,
  aes(x = reorder(factor(chapter), -p), y = p, fill = factor(chapter))
) +
  geom_col(color = "white") +
  labs(x = "Chapter", y = "Proportion of positive words used") +
  ggtitle("Positive Sentiment in Jane Eyre by Chapter") +
  theme_minimal() +
  theme(legend.position = "none")