In Text Mining with R, Chapter 2 looks at Sentiment Analysis. In this assignment, you should start by getting the primary  example code from chapter 2 working in an R Markdown document. You should provide a citation to this base code.  You’re then asked to extend the code in two ways:
Work with a different corpus of your choosing, and
Incorporate at least one additional sentiment lexicon (possibly from another R package that you’ve found through research).
As usual, please submit links to both an .Rmd file posted in your GitHub repository and to your code on rpubs.com.
You make work on a small team on this assignment.
# Load library
library(stringr)
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.0.5
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.6 v dplyr 1.0.2
## v tidyr 1.1.2 v forcats 0.5.0
## v readr 1.4.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidyr)
library(ggplot2)
library(dplyr)
library(textdata)
## Warning: package 'textdata' was built under R version 4.0.5
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 4.0.5
## Loading required package: RColorBrewer
library(janeaustenr)
## Warning: package 'janeaustenr' was built under R version 4.0.5
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(ggwordcloud)
## Warning: package 'ggwordcloud' was built under R version 4.0.5
library(wordcloud)
library(gutenbergr)
## Warning: package 'gutenbergr' was built under R version 4.0.5
tidy_book <- function(author) {
author %>%
unnest_tokens(word, text) %>%
anti_join(stop_words)
}
facet_bar <- function(df, y, x, by, nrow = 2, ncol = 2, scales = "free") {
mapping <- aes(y = reorder_within({{ y }}, {{ x }}, {{ by }}),
x = {{ x }},
fill = {{ by }})
facet <- facet_wrap(vars({{ by }}),
nrow = nrow,
ncol = ncol,
scales = scales)
ggplot(df, mapping = mapping) +
geom_col(show.legend = FALSE) +
scale_y_reordered() +
facet +
ylab("")
}
tidy_books <- austen_books() %>%
group_by(book) %>%
mutate(linenumber = row_number(),
chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
ignore_case = TRUE)))) %>%
ungroup() %>%
unnest_tokens(word, text)
nrc_joy <- get_sentiments("nrc") %>%
filter(sentiment == "joy")
tidy_books %>%
filter(book == "Emma") %>%
inner_join(nrc_joy) %>%
count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 303 x 2
## word n
## <chr> <int>
## 1 good 359
## 2 young 192
## 3 friend 166
## 4 hope 143
## 5 happy 125
## 6 love 117
## 7 deal 92
## 8 found 92
## 9 present 89
## 10 kind 82
## # ... with 293 more rows
jane_austen_sentiment <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
mutate(index = linenumber %/% 80) %>%
count(book, index, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = list(n = 0)) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
jane_austen_sentiment
## # A tibble: 920 x 5
## book index negative positive sentiment
## <fct> <dbl> <int> <int> <int>
## 1 Sense & Sensibility 0 16 32 16
## 2 Sense & Sensibility 1 19 53 34
## 3 Sense & Sensibility 2 12 31 19
## 4 Sense & Sensibility 3 15 31 16
## 5 Sense & Sensibility 4 16 34 18
## 6 Sense & Sensibility 5 16 51 35
## 7 Sense & Sensibility 6 24 40 16
## 8 Sense & Sensibility 7 23 51 28
## 9 Sense & Sensibility 8 30 40 10
## 10 Sense & Sensibility 9 15 19 4
## # ... with 910 more rows
ggplot(jane_austen_sentiment) +
geom_col(aes(index, sentiment, fill = book), show.legend = F) +
facet_wrap( ~ book, ncol = 2, scales = "free_x")
pride_prejudice <- tidy_books %>%
filter(book == "Pride & Prejudice")
pride_prejudice
## # A tibble: 122,204 x 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Pride & Prejudice 1 0 pride
## 2 Pride & Prejudice 1 0 and
## 3 Pride & Prejudice 1 0 prejudice
## 4 Pride & Prejudice 3 0 by
## 5 Pride & Prejudice 3 0 jane
## 6 Pride & Prejudice 3 0 austen
## 7 Pride & Prejudice 7 1 chapter
## 8 Pride & Prejudice 7 1 1
## 9 Pride & Prejudice 10 1 it
## 10 Pride & Prejudice 10 1 is
## # ... with 122,194 more rows
afinn <- pride_prejudice %>%
inner_join(get_sentiments("afinn")) %>%
mutate(index = linenumber %/% 80) %>%
count(book, index, wt = value, name = "value") %>%
mutate(dict = "afinn") %>%
select(index, value, dict)
## Joining, by = "word"
bing <- pride_prejudice %>%
inner_join(get_sentiments("bing")) %>%
mutate(index = linenumber %/% 80) %>%
count(index, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = list(n = 0)) %>%
mutate(value = positive - negative,
dict = "bing") %>%
select(index, value, dict)
## Joining, by = "word"
nrc <- pride_prejudice %>%
inner_join(get_sentiments("nrc")) %>%
filter(sentiment %in% c("positive", "negative")) %>%
mutate(index = linenumber %/% 80) %>%
count(index, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = list(n = 0)) %>%
mutate(value = positive - negative,
dict = "nrc") %>%
select(index, value, dict)
## Joining, by = "word"
bind_rows(afinn, bing, nrc) %>%
ggplot() +
geom_col(aes(index, value, fill = dict), show.legend = FALSE) +
facet_wrap(~ dict, nrow = 3)
get_sentiments("nrc") %>%
filter(sentiment %in% c("positive", "negative")) %>%
count(sentiment)
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 3324
## 2 positive 2312
get_sentiments("bing") %>%
count(sentiment)
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
bing_word_counts <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
bing_word_counts
## # A tibble: 2,585 x 3
## word sentiment n
## <chr> <chr> <int>
## 1 miss negative 1855
## 2 well positive 1523
## 3 good positive 1380
## 4 great positive 981
## 5 like positive 725
## 6 better positive 639
## 7 enough positive 613
## 8 happy positive 534
## 9 love positive 495
## 10 pleasure positive 462
## # ... with 2,575 more rows
custom_stop_words <- tibble(word = c("miss"), lexicon = c("custom")) %>%
bind_rows(stop_words)
bing_word_counts <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
anti_join(custom_stop_words) %>%
group_by(sentiment) %>%
count(word, sentiment, sort = T) %>%
ungroup()
## Joining, by = "word"
## Joining, by = "word"
bing_word_counts %>%
group_by(sentiment) %>%
top_n(10) %>%
ungroup() %>%
facet_bar(y = word, x = n, by = sentiment, nrow = 1) +
labs(title = "Top 10 words of sentiment in Jane Austen's books")
## Selecting by n
wordcloud_df <-tidy_books %>%
anti_join(custom_stop_words) %>%
inner_join(get_sentiments("bing")) %>%
count(sentiment, word, sort = T) %>%
top_n(200)
## Joining, by = "word"
## Joining, by = "word"
## Selecting by n
wordcloud_df %>%
ggplot() +
geom_text_wordcloud_area(aes(label = word, size = n)) +
scale_size_area(max_size = 15)
wordcloud_df %>%
ggplot() +
geom_text_wordcloud_area(aes(label = word, size = n), shape = "star") +
scale_size_area(max_size = 15)
tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
reshape2::acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("gray20", "gray80"),
max.words = 100)
## Joining, by = "word"
PandP_sentences <- tibble(text = prideprejudice) %>%
unnest_tokens(sentence, text, token = "sentences")
PandP_sentences
## # A tibble: 15,545 x 1
## sentence
## <chr>
## 1 "pride and prejudice"
## 2 "by jane austen"
## 3 "chapter 1"
## 4 "it is a truth universally acknowledged, that a single man in possession"
## 5 "of a good fortune, must be in want of a wife."
## 6 "however little known the feelings or views of such a man may be on his"
## 7 "first entering a neighbourhood, this truth is so well fixed in the minds"
## 8 "of the surrounding families, that he is considered the rightful property"
## 9 "of some one or other of their daughters."
## 10 "\"my dear mr."
## # ... with 15,535 more rows
tibble(text = prideprejudice) %>%
mutate(text = iconv(text, to = "ASCII")) %>%
unnest_tokens(sentence, text, token = "sentences")
## # A tibble: 15,545 x 1
## sentence
## <chr>
## 1 "pride and prejudice"
## 2 "by jane austen"
## 3 "chapter 1"
## 4 "it is a truth universally acknowledged, that a single man in possession"
## 5 "of a good fortune, must be in want of a wife."
## 6 "however little known the feelings or views of such a man may be on his"
## 7 "first entering a neighbourhood, this truth is so well fixed in the minds"
## 8 "of the surrounding families, that he is considered the rightful property"
## 9 "of some one or other of their daughters."
## 10 "\"my dear mr."
## # ... with 15,535 more rows
austen_chapters <- austen_books() %>%
group_by(book) %>%
unnest_tokens(chapter, text, token = "regex",
pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
ungroup()
austen_chapters
## # A tibble: 275 x 2
## book chapter
## <fct> <chr>
## 1 Sense & Sensibi~ "sense and sensibility\n\nby jane austen\n\n(1811)\n\n\n\n\~
## 2 Sense & Sensibi~ "\n\n\nthe family of dashwood had long been settled in suss~
## 3 Sense & Sensibi~ "\n\n\nmrs. john dashwood now installed herself mistress of~
## 4 Sense & Sensibi~ "\n\n\nmrs. dashwood remained at norland several months; no~
## 5 Sense & Sensibi~ "\n\n\n\"what a pity it is, elinor,\" said marianne, \"that~
## 6 Sense & Sensibi~ "\n\n\nno sooner was her answer dispatched, than mrs. dashw~
## 7 Sense & Sensibi~ "\n\n\nthe first part of their journey was performed in too~
## 8 Sense & Sensibi~ "\n\n\nbarton park was about half a mile from the cottage. ~
## 9 Sense & Sensibi~ "\n\n\nmrs. jennings was a widow with an ample jointure. s~
## 10 Sense & Sensibi~ "\n\n\nthe dashwoods were now settled at barton with tolera~
## # ... with 265 more rows
tidy_books %>%
distinct(book, chapter)
## # A tibble: 275 x 2
## book chapter
## <fct> <int>
## 1 Sense & Sensibility 0
## 2 Sense & Sensibility 1
## 3 Sense & Sensibility 2
## 4 Sense & Sensibility 3
## 5 Sense & Sensibility 4
## 6 Sense & Sensibility 5
## 7 Sense & Sensibility 6
## 8 Sense & Sensibility 7
## 9 Sense & Sensibility 8
## 10 Sense & Sensibility 9
## # ... with 265 more rows
tidy_books %>%
group_by(book, chapter) %>%
summarize(str_c(word, collapse = " "))
## `summarise()` regrouping output by 'book' (override with `.groups` argument)
## # A tibble: 275 x 3
## # Groups: book [6]
## book chapter `str_c(word, collapse = " ")`
## <fct> <int> <chr>
## 1 Sense & Sensi~ 0 sense and sensibility by jane austen 1811
## 2 Sense & Sensi~ 1 chapter 1 the family of dashwood had long been settle~
## 3 Sense & Sensi~ 2 chapter 2 mrs john dashwood now installed herself mis~
## 4 Sense & Sensi~ 3 chapter 3 mrs dashwood remained at norland several mo~
## 5 Sense & Sensi~ 4 chapter 4 what a pity it is elinor said marianne that~
## 6 Sense & Sensi~ 5 chapter 5 no sooner was her answer dispatched than mr~
## 7 Sense & Sensi~ 6 chapter 6 the first part of their journey was perform~
## 8 Sense & Sensi~ 7 chapter 7 barton park was about half a mile from the ~
## 9 Sense & Sensi~ 8 chapter 8 mrs jennings was a widow with an ample join~
## 10 Sense & Sensi~ 9 chapter 9 the dashwoods were now settled at barton wi~
## # ... with 265 more rows
bing_negative <- get_sentiments("bing") %>%
filter(sentiment == "negative")
chapter_words <- tidy_books %>%
count(book, chapter)
tidy_books %>%
semi_join(bing_negative) %>%
count(book, chapter, name = "negative_words") %>%
left_join(chapter_words) %>%
mutate(ratio = negative_words / n) %>%
filter(chapter != 0) %>%
group_by(book) %>%
top_n(1)
## Joining, by = "word"
## Joining, by = c("book", "chapter")
## Selecting by ratio
## # A tibble: 6 x 5
## # Groups: book [6]
## book chapter negative_words n ratio
## <fct> <int> <int> <int> <dbl>
## 1 Sense & Sensibility 43 161 3405 0.0473
## 2 Pride & Prejudice 34 111 2104 0.0528
## 3 Mansfield Park 46 173 3685 0.0469
## 4 Emma 15 151 3340 0.0452
## 5 Northanger Abbey 21 149 2982 0.0500
## 6 Persuasion 4 62 1807 0.0343
# filter a book by author title language and copyright
aesopMtdt <-gutenberg_metadata %>%
filter(author == "Aesop",
title == "Aesop's Fables",language == "en",!grepl('Copyrighted', rights))
# load the book using book id
aesopBook <- gutenberg_download(aesopMtdt$gutenberg_author_id[1])
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
#mutate to add author
aesopBookMutated <- aesopBook %>%
select(-gutenberg_id) %>%
mutate(author = "Aesop")
#aesopBookTidy<-tidy_book(aesopBookMutated)
# identify the tokens and remove stop words
aesopBookTidy<-aesopBookMutated %>%
unnest_tokens(word, text) %>%
anti_join(stop_words)
## Joining, by = "word"
#load the sentiment lexicon loughran
get_sentiments("loughran")
## # A tibble: 4,150 x 2
## word sentiment
## <chr> <chr>
## 1 abandon negative
## 2 abandoned negative
## 3 abandoning negative
## 4 abandonment negative
## 5 abandonments negative
## 6 abandons negative
## 7 abdicated negative
## 8 abdicates negative
## 9 abdicating negative
## 10 abdication negative
## # ... with 4,140 more rows
# join the sentiments with tokenized books with count for each sentiment
aesopBook_lexi <- aesopBookTidy %>%
inner_join(get_sentiments("loughran")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
aesopBook_lexi %>%
group_by(sentiment) %>%
top_n(10) %>%
ungroup() %>%mutate(word = reorder(word, n))
## Selecting by n
## # A tibble: 60 x 3
## word sentiment n
## <fct> <chr> <int>
## 1 constitution litigious 471
## 2 legislative litigious 210
## 3 laws litigious 203
## 4 legislature litigious 193
## 5 law litigious 148
## 6 danger negative 134
## 7 courts litigious 129
## 8 force negative 119
## 9 court litigious 105
## 10 jurisdiction litigious 103
## # ... with 50 more rows
#filter out positive and negative sentiments
aesopBook_positive_negative<-aesopBook_lexi %>%filter(sentiment =='positive' | sentiment =='negative')
aesopBook_nrc_lexi <- aesopBookTidy %>%
inner_join(get_sentiments("nrc")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
aesopBook_positive_negative_nrc<-aesopBook_nrc_lexi %>%filter(sentiment =='positive' | sentiment =='negative')
aesopBook_positive_negative %>%
filter(n > 50) %>%
mutate(n = ifelse(sentiment == "negative", -n, n)) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment))+
geom_col() +
coord_flip() +
labs(y = "Sentiment Count")
aesopBook_positive_negative_nrc %>%
filter(n > 100) %>%
mutate(n = ifelse(sentiment == "negative", -n, n)) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment))+
geom_col() +
coord_flip() +
labs(y = "Sentiment Count")
# Word clouts
aesopBookTidy %>%
count(word) %>%
with(wordcloud(word, n, max.words = 100))
# sentiment analysis to tag different sentiments
aesopBookTidy %>%
inner_join(get_sentiments("loughran")) %>%
count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("gray20", "gray80"),
max.words = 100)
## Joining, by = "word"
Identified a different corpus and completed sentimental analysis using loughran lexicon. Also performed a sentimental comparison of the words extracted from the same book using loughran and nrc lexicons. It is clear that different lexicon dictionaries are having different set of sentimental words