knitr::opts_chunk$set(echo = TRUE)
# We need these packages
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.2.1 ✓ purrr 0.3.3
## ✓ tibble 2.1.3 ✓ dplyr 0.8.3
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.4.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(jsonlite)
##
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
##
## flatten
library(knitr)
library(tidytext)
library(janeaustenr)
library(dplyr)
library(stringr)
library(tidyr)
library(ggplot2)
library(wordcloud)
## Loading required package: RColorBrewer
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(gutenbergr)
Overview: Start by getting the primary example code from chapter 2 of “Text Mining with R” working. Then extend it with a different corpus and incorporate at least one additional sentiment lexicon.
get_sentiments("afinn")
## # A tibble: 2,477 x 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # … with 2,467 more rows
get_sentiments("bing")
## # A tibble: 6,786 x 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # … with 6,776 more rows
get_sentiments("nrc")
## # A tibble: 13,901 x 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # … with 13,891 more rows
tidy_books <- austen_books() %>%
group_by(book) %>%
mutate(linenumber = row_number(),
chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
ignore_case = TRUE)))) %>%
ungroup() %>%
unnest_tokens(word, text)
nrc_joy <- get_sentiments("nrc") %>%
filter(sentiment == "joy")
tidy_books %>%
filter(book == "Emma") %>%
inner_join(nrc_joy) %>%
count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 303 x 2
## word n
## <chr> <int>
## 1 good 359
## 2 young 192
## 3 friend 166
## 4 hope 143
## 5 happy 125
## 6 love 117
## 7 deal 92
## 8 found 92
## 9 present 89
## 10 kind 82
## # … with 293 more rows
jane_austen_sentiment <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(book, index = linenumber %/% 80, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
geom_col(show.legend = FALSE) +
facet_wrap(~book, ncol = 2, scales = "free_x")
pride_prejudice <- tidy_books %>%
filter(book == "Pride & Prejudice")
pride_prejudice
## # A tibble: 122,204 x 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Pride & Prejudice 1 0 pride
## 2 Pride & Prejudice 1 0 and
## 3 Pride & Prejudice 1 0 prejudice
## 4 Pride & Prejudice 3 0 by
## 5 Pride & Prejudice 3 0 jane
## 6 Pride & Prejudice 3 0 austen
## 7 Pride & Prejudice 7 1 chapter
## 8 Pride & Prejudice 7 1 1
## 9 Pride & Prejudice 10 1 it
## 10 Pride & Prejudice 10 1 is
## # … with 122,194 more rows
afinn <- pride_prejudice %>%
inner_join(get_sentiments("afinn")) %>%
group_by(index = linenumber %/% 80) %>%
summarise(sentiment = sum(value)) %>%
mutate(method = "AFINN")
## Joining, by = "word"
bing_and_nrc <- bind_rows(pride_prejudice %>%
inner_join(get_sentiments("bing")) %>%
mutate(method = "Bing et al."),
pride_prejudice %>%
inner_join(get_sentiments("nrc") %>%
filter(sentiment %in% c("positive",
"negative"))) %>%
mutate(method = "NRC")) %>%
count(method, index = linenumber %/% 80, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
bind_rows(afinn,
bing_and_nrc) %>%
ggplot(aes(index, sentiment, fill = method)) +
geom_col(show.legend = FALSE) +
facet_wrap(~method, ncol = 1, scales = "free_y")
get_sentiments("nrc") %>%
filter(sentiment %in% c("positive",
"negative")) %>%
count(sentiment)
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 3324
## 2 positive 2312
get_sentiments("bing") %>%
count(sentiment)
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
bing_word_counts <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
bing_word_counts
## # A tibble: 2,585 x 3
## word sentiment n
## <chr> <chr> <int>
## 1 miss negative 1855
## 2 well positive 1523
## 3 good positive 1380
## 4 great positive 981
## 5 like positive 725
## 6 better positive 639
## 7 enough positive 613
## 8 happy positive 534
## 9 love positive 495
## 10 pleasure positive 462
## # … with 2,575 more rows
bing_word_counts %>%
group_by(sentiment) %>%
top_n(10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
coord_flip()
## Selecting by n
custom_stop_words <- bind_rows(tibble(word = c("miss"),
lexicon = c("custom")),
stop_words)
custom_stop_words
## # A tibble: 1,150 x 2
## word lexicon
## <chr> <chr>
## 1 miss custom
## 2 a SMART
## 3 a's SMART
## 4 able SMART
## 5 about SMART
## 6 above SMART
## 7 according SMART
## 8 accordingly SMART
## 9 across SMART
## 10 actually SMART
## # … with 1,140 more rows
tidy_books %>%
anti_join(stop_words) %>%
count(word) %>%
with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"
tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("gray20", "gray80"),
max.words = 100)
## Joining, by = "word"
PandP_sentences <- tibble(text = prideprejudice) %>%
unnest_tokens(sentence, text, token = "sentences")
PandP_sentences$sentence[2]
## [1] "however little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered the rightful property of some one or other of their daughters."
austen_chapters <- austen_books() %>%
group_by(book) %>%
unnest_tokens(chapter, text, token = "regex",
pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
ungroup()
austen_chapters %>%
group_by(book) %>%
summarise(chapters = n())
## # A tibble: 6 x 2
## book chapters
## <fct> <int>
## 1 Sense & Sensibility 51
## 2 Pride & Prejudice 62
## 3 Mansfield Park 49
## 4 Emma 56
## 5 Northanger Abbey 32
## 6 Persuasion 25
bingnegative <- get_sentiments("bing") %>%
filter(sentiment == "negative")
wordcounts <- tidy_books %>%
group_by(book, chapter) %>%
summarize(words = n())
tidy_books %>%
semi_join(bingnegative) %>%
group_by(book, chapter) %>%
summarize(negativewords = n()) %>%
left_join(wordcounts, by = c("book", "chapter")) %>%
mutate(ratio = negativewords/words) %>%
filter(chapter != 0) %>%
top_n(1) %>%
ungroup()
## Joining, by = "word"
## Selecting by ratio
## # A tibble: 6 x 5
## book chapter negativewords words ratio
## <fct> <int> <int> <int> <dbl>
## 1 Sense & Sensibility 43 161 3405 0.0473
## 2 Pride & Prejudice 34 111 2104 0.0528
## 3 Mansfield Park 46 173 3685 0.0469
## 4 Emma 15 151 3340 0.0452
## 5 Northanger Abbey 21 149 2982 0.0500
## 6 Persuasion 4 62 1807 0.0343
# Let's grab a text from the gutenberg package
# Let's work with the US constitution. I wonder if there should be a sentiment in the
# historical document from the founding fathers
gutenberg_metadata %>% filter(title == "The United States Constitution")
## # A tibble: 1 x 8
## gutenberg_id title author gutenberg_autho… language gutenberg_books… rights
## <int> <chr> <chr> <int> <chr> <chr> <chr>
## 1 5 The … Unite… 1 en American Revolu… Publi…
## # … with 1 more variable: has_text <lgl>
const_id = 5
const_text = gutenberg_download(const_id)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
# trim out the meta data stuff at the beginning
const_text = const_text[-(1:34),]
# get in tidy form, and maybe add sections
tidy_const <- const_text %>%
mutate(linenumber = row_number(),
article = cumsum(str_detect(text, regex("^article", ignore_case = TRUE))),
section = cumsum(str_detect(text, regex("^section", ignore_case = TRUE)))) %>%
ungroup() %>%
unnest_tokens(word, text)
divisor = 10
const_sentiment <- tidy_const %>%
inner_join(get_sentiments("bing")) %>%
count(article, index = linenumber %/% divisor, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
# plots of sentiment per article
ggplot(const_sentiment, aes(index, sentiment, fill = article)) +
geom_col(show.legend = FALSE) +
facet_wrap(~article, ncol = 2, scales = "free_x")
afinn <- tidy_const %>%
inner_join(get_sentiments("afinn")) %>%
group_by(index = linenumber %/% divisor) %>%
summarise(sentiment = sum(value)) %>%
mutate(method = "AFINN")
## Joining, by = "word"
bing_and_nrc_and_lough <- bind_rows(tidy_const %>%
inner_join(get_sentiments("bing")) %>%
mutate(method = "Bing et al."),
tidy_const %>%
inner_join(get_sentiments("loughran")) %>%
mutate(method = "Loughran"),
tidy_const %>%
inner_join(get_sentiments("nrc") %>%
filter(sentiment %in% c("positive",
"negative"))) %>%
mutate(method = "NRC")) %>%
count(method, index = linenumber %/% divisor, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
bind_rows(afinn,
bing_and_nrc_and_lough) %>%
ggplot(aes(index, sentiment, fill = method)) +
geom_col(show.legend = FALSE) +
facet_wrap(~method, ncol = 1, scales = "free_y")
get_sentiments("nrc") %>%
filter(sentiment %in% c("positive",
"negative")) %>%
count(sentiment)
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 3324
## 2 positive 2312
get_sentiments("bing") %>%
count(sentiment)
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
Here we added the lexicon Loughran. What is interesting is why NRC has the longer index. I believe it has to do with which words are recognied in each lexicon. We can see the Loughran looks particularly negative while NRC appears most positive.
bing_word_counts <- tidy_const %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
bing_word_counts
## # A tibble: 75 x 3
## word sentiment n
## <chr> <chr> <int>
## 1 vice negative 8
## 2 supreme positive 7
## 3 treason negative 7
## 4 inferior negative 4
## 5 proper positive 4
## 6 trust positive 4
## 7 affirmation positive 3
## 8 debts negative 3
## 9 like positive 3
## 10 objections negative 3
## # … with 65 more rows
bing_word_counts %>%
group_by(sentiment) %>%
top_n(10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
coord_flip()
## Selecting by n
custom_stop_words <- bind_rows(tibble(word = c("vice"),
lexicon = c("custom")),
stop_words)
I suspect that vice here is incorrectly being treated as a negative word when it is actually referring to the vice president.
tidy_const %>%
anti_join(custom_stop_words) %>%
count(word) %>%
with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"
tidy_const %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("gray20", "gray80"),
max.words = 100)
## Joining, by = "word"
##const_sentences <- tibble(text = const_text$text) %>%
# unnest_tokens
bingnegative <- get_sentiments("bing") %>%
filter(sentiment == "negative")
wordcounts <- tidy_const %>%
group_by(article, section) %>%
summarize(words = n())
tidy_const %>%
semi_join(bingnegative) %>%
group_by(article, section) %>%
summarize(negativewords = n()) %>%
left_join(wordcounts, by = c("article", "section")) %>%
mutate(ratio = negativewords/words) %>%
arrange(desc(ratio)) %>%
ungroup()
## Joining, by = "word"
## # A tibble: 17 x 5
## article section negativewords words ratio
## <int> <int> <int> <int> <dbl>
## 1 2 14 3 33 0.0909
## 2 3 17 7 82 0.0854
## 3 4 19 4 124 0.0323
## 4 3 15 2 69 0.0290
## 5 4 20 2 111 0.0180
## 6 1 5 3 179 0.0168
## 7 2 11 11 666 0.0165
## 8 1 8 7 431 0.0162
## 9 1 10 3 188 0.0160
## 10 1 3 5 349 0.0143
## 11 1 6 2 146 0.0137
## 12 2 12 3 225 0.0133
## 13 1 7 4 317 0.0126
## 14 2 13 1 99 0.0101
## 15 5 21 1 145 0.00690
## 16 6 21 1 156 0.00641
## 17 1 2 1 297 0.00337
It looks like Article 2 Section 4 (14 in my analysis) is the most negative. According to this website this article talks about disqualifications of president, vice president, and all civil officers and removal from office via impeachment, conviction of treason, bribery, and other high crimes and misdemeanors. Seems very accurate to be the most negative.
Looking at the loughran lexicon, it appears to be aimed for determining what are liabilities from financial documents. Aside from positive and negative sentiments, there are litiguous, constraining, superfluous, and uncertainty. I’ll dive a bit deeper into uncertanty.
uncertainness <- tidy_const %>%
inner_join(get_sentiments("loughran")) %>%
count(article, index = linenumber %/% divisor, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
select(article, uncertainty) %>%
group_by(article) %>%
summarise(sum = sum(uncertainty), n = n()) %>%
mutate(uncertainness = sum / n) %>%
arrange(desc(uncertainness)) %>%
ungroup
## Joining, by = "word"
uncertainness
## # A tibble: 8 x 4
## article sum n uncertainness
## <int> <dbl> <int> <dbl>
## 1 5 2 2 1
## 2 2 9 11 0.818
## 3 1 19 27 0.704
## 4 4 3 5 0.6
## 5 3 2 5 0.4
## 6 0 0 1 0
## 7 6 0 3 0
## 8 7 0 3 0
ggplot(head(uncertainness, 4), aes(x = reorder(article, -uncertainness), y = uncertainness, fill = uncertainness)) +
geom_col(show.legend = FALSE) +
labs(x = "Article Number", y = "Uncertainty")
Article 5 is about how to make amendments. It seems most likely to contain the most words about uncertainty because it is how the Constitution gets added to. Article 2 is about the Executive branch. This may make sense because the founding fathers were wary of anyone becoming like a king and maybe they were uncertain how much powers the executive branch should have. Article 1 is about the legislative branch and Article 4 is about the states.
Citation: “Silge, Julia, and David Robinson. Text mining with R: A tidy approach.” O’Reilly Media, Inc.“, 2017.”