knitr::opts_chunk$set(echo = TRUE)
# We need these packages
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.2.1     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.3
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(jsonlite)
## 
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
## 
##     flatten
library(knitr)
library(tidytext)
library(janeaustenr)
library(dplyr)
library(stringr)
library(tidyr)
library(ggplot2)
library(wordcloud)
## Loading required package: RColorBrewer
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(gutenbergr)

Overview: Start by getting the primary example code from chapter 2 of “Text Mining with R” working. Then extend it with a different corpus and incorporate at least one additional sentiment lexicon.

2.1

get_sentiments("afinn")
## # A tibble: 2,477 x 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # … with 2,467 more rows
get_sentiments("bing")
## # A tibble: 6,786 x 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # … with 6,776 more rows
get_sentiments("nrc")
## # A tibble: 13,901 x 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # … with 13,891 more rows

2.2

tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(linenumber = row_number(),
         chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]", 
                                                 ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)

nrc_joy <- get_sentiments("nrc") %>% 
  filter(sentiment == "joy")

tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 303 x 2
##    word        n
##    <chr>   <int>
##  1 good      359
##  2 young     192
##  3 friend    166
##  4 hope      143
##  5 happy     125
##  6 love      117
##  7 deal       92
##  8 found      92
##  9 present    89
## 10 kind       82
## # … with 293 more rows
jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")

2.3

pride_prejudice <- tidy_books %>% 
  filter(book == "Pride & Prejudice")

pride_prejudice
## # A tibble: 122,204 x 4
##    book              linenumber chapter word     
##    <fct>                  <int>   <int> <chr>    
##  1 Pride & Prejudice          1       0 pride    
##  2 Pride & Prejudice          1       0 and      
##  3 Pride & Prejudice          1       0 prejudice
##  4 Pride & Prejudice          3       0 by       
##  5 Pride & Prejudice          3       0 jane     
##  6 Pride & Prejudice          3       0 austen   
##  7 Pride & Prejudice          7       1 chapter  
##  8 Pride & Prejudice          7       1 1        
##  9 Pride & Prejudice         10       1 it       
## 10 Pride & Prejudice         10       1 is       
## # … with 122,194 more rows
afinn <- pride_prejudice %>% 
  inner_join(get_sentiments("afinn")) %>% 
  group_by(index = linenumber %/% 80) %>% 
  summarise(sentiment = sum(value)) %>% 
  mutate(method = "AFINN")
## Joining, by = "word"
bing_and_nrc <- bind_rows(pride_prejudice %>% 
                            inner_join(get_sentiments("bing")) %>%
                            mutate(method = "Bing et al."),
                          pride_prejudice %>% 
                            inner_join(get_sentiments("nrc") %>% 
                                         filter(sentiment %in% c("positive", 
                                                                 "negative"))) %>%
                            mutate(method = "NRC")) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
bind_rows(afinn, 
          bing_and_nrc) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")

get_sentiments("nrc") %>% 
     filter(sentiment %in% c("positive", 
                             "negative")) %>% 
  count(sentiment)
## # A tibble: 2 x 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   3324
## 2 positive   2312
get_sentiments("bing") %>% 
  count(sentiment)
## # A tibble: 2 x 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005

2.4

bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
bing_word_counts
## # A tibble: 2,585 x 3
##    word     sentiment     n
##    <chr>    <chr>     <int>
##  1 miss     negative   1855
##  2 well     positive   1523
##  3 good     positive   1380
##  4 great    positive    981
##  5 like     positive    725
##  6 better   positive    639
##  7 enough   positive    613
##  8 happy    positive    534
##  9 love     positive    495
## 10 pleasure positive    462
## # … with 2,575 more rows
bing_word_counts %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()
## Selecting by n

custom_stop_words <- bind_rows(tibble(word = c("miss"), 
                                          lexicon = c("custom")), 
                               stop_words)

custom_stop_words
## # A tibble: 1,150 x 2
##    word        lexicon
##    <chr>       <chr>  
##  1 miss        custom 
##  2 a           SMART  
##  3 a's         SMART  
##  4 able        SMART  
##  5 about       SMART  
##  6 above       SMART  
##  7 according   SMART  
##  8 accordingly SMART  
##  9 across      SMART  
## 10 actually    SMART  
## # … with 1,140 more rows

2.5

tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"

tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)
## Joining, by = "word"

2.6

PandP_sentences <- tibble(text = prideprejudice) %>% 
  unnest_tokens(sentence, text, token = "sentences")
PandP_sentences$sentence[2]
## [1] "however little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered the rightful property of some one or other of their daughters."
austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(chapter, text, token = "regex", 
                pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
  ungroup()

austen_chapters %>% 
  group_by(book) %>% 
  summarise(chapters = n())
## # A tibble: 6 x 2
##   book                chapters
##   <fct>                  <int>
## 1 Sense & Sensibility       51
## 2 Pride & Prejudice         62
## 3 Mansfield Park            49
## 4 Emma                      56
## 5 Northanger Abbey          32
## 6 Persuasion                25
bingnegative <- get_sentiments("bing") %>% 
  filter(sentiment == "negative")

wordcounts <- tidy_books %>%
  group_by(book, chapter) %>%
  summarize(words = n())

tidy_books %>%
  semi_join(bingnegative) %>%
  group_by(book, chapter) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts, by = c("book", "chapter")) %>%
  mutate(ratio = negativewords/words) %>%
  filter(chapter != 0) %>%
  top_n(1) %>%
  ungroup()
## Joining, by = "word"
## Selecting by ratio
## # A tibble: 6 x 5
##   book                chapter negativewords words  ratio
##   <fct>                 <int>         <int> <int>  <dbl>
## 1 Sense & Sensibility      43           161  3405 0.0473
## 2 Pride & Prejudice        34           111  2104 0.0528
## 3 Mansfield Park           46           173  3685 0.0469
## 4 Emma                     15           151  3340 0.0452
## 5 Northanger Abbey         21           149  2982 0.0500
## 6 Persuasion                4            62  1807 0.0343

Now I extend by using a different corpus and another sentiment lexicon. Use the loughran lexicon.

# Let's grab a text from the gutenberg package
# Let's work with the US constitution. I wonder if there should be a sentiment in the
# historical document from the founding fathers
gutenberg_metadata %>% filter(title == "The United States Constitution")
## # A tibble: 1 x 8
##   gutenberg_id title author gutenberg_autho… language gutenberg_books… rights
##          <int> <chr> <chr>             <int> <chr>    <chr>            <chr> 
## 1            5 The … Unite…                1 en       American Revolu… Publi…
## # … with 1 more variable: has_text <lgl>
const_id = 5
const_text = gutenberg_download(const_id)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
# trim out the meta data stuff at the beginning
const_text = const_text[-(1:34),]

# get in tidy form, and maybe add sections
tidy_const <- const_text %>%
  mutate(linenumber = row_number(),
         article = cumsum(str_detect(text, regex("^article", ignore_case = TRUE))),
         section = cumsum(str_detect(text, regex("^section", ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)
divisor = 10
const_sentiment <- tidy_const %>%
  inner_join(get_sentiments("bing")) %>%
  count(article, index = linenumber %/% divisor, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
# plots of sentiment per article
ggplot(const_sentiment, aes(index, sentiment, fill = article)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~article, ncol = 2, scales = "free_x")

afinn <- tidy_const %>% 
  inner_join(get_sentiments("afinn")) %>% 
  group_by(index = linenumber %/% divisor) %>% 
  summarise(sentiment = sum(value)) %>% 
  mutate(method = "AFINN")
## Joining, by = "word"
bing_and_nrc_and_lough <- bind_rows(tidy_const %>% 
                            inner_join(get_sentiments("bing")) %>%
                            mutate(method = "Bing et al."),
                          tidy_const %>%
                            inner_join(get_sentiments("loughran")) %>%
                            mutate(method = "Loughran"),
                          tidy_const %>% 
                            inner_join(get_sentiments("nrc") %>% 
                                         filter(sentiment %in% c("positive", 
                                                                 "negative"))) %>%
                            mutate(method = "NRC")) %>%
  count(method, index = linenumber %/% divisor, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
## Joining, by = "word"
bind_rows(afinn, 
          bing_and_nrc_and_lough) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")

get_sentiments("nrc") %>% 
     filter(sentiment %in% c("positive", 
                             "negative")) %>% 
  count(sentiment)
## # A tibble: 2 x 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   3324
## 2 positive   2312
get_sentiments("bing") %>% 
  count(sentiment)
## # A tibble: 2 x 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005

Here we added the lexicon Loughran. What is interesting is why NRC has the longer index. I believe it has to do with which words are recognied in each lexicon. We can see the Loughran looks particularly negative while NRC appears most positive.

bing_word_counts <- tidy_const %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
bing_word_counts
## # A tibble: 75 x 3
##    word        sentiment     n
##    <chr>       <chr>     <int>
##  1 vice        negative      8
##  2 supreme     positive      7
##  3 treason     negative      7
##  4 inferior    negative      4
##  5 proper      positive      4
##  6 trust       positive      4
##  7 affirmation positive      3
##  8 debts       negative      3
##  9 like        positive      3
## 10 objections  negative      3
## # … with 65 more rows
bing_word_counts %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()
## Selecting by n

custom_stop_words <- bind_rows(tibble(word = c("vice"), 
                                          lexicon = c("custom")), 
                               stop_words)

I suspect that vice here is incorrectly being treated as a negative word when it is actually referring to the vice president.

tidy_const %>%
  anti_join(custom_stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"

tidy_const %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)
## Joining, by = "word"

##const_sentences <- tibble(text = const_text$text) %>% 
#  unnest_tokens

bingnegative <- get_sentiments("bing") %>% 
  filter(sentiment == "negative")

wordcounts <- tidy_const %>%
  group_by(article, section) %>%
  summarize(words = n())

tidy_const %>%
  semi_join(bingnegative) %>%
  group_by(article, section) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts, by = c("article", "section")) %>%
  mutate(ratio = negativewords/words) %>%
  arrange(desc(ratio)) %>%
  ungroup()
## Joining, by = "word"
## # A tibble: 17 x 5
##    article section negativewords words   ratio
##      <int>   <int>         <int> <int>   <dbl>
##  1       2      14             3    33 0.0909 
##  2       3      17             7    82 0.0854 
##  3       4      19             4   124 0.0323 
##  4       3      15             2    69 0.0290 
##  5       4      20             2   111 0.0180 
##  6       1       5             3   179 0.0168 
##  7       2      11            11   666 0.0165 
##  8       1       8             7   431 0.0162 
##  9       1      10             3   188 0.0160 
## 10       1       3             5   349 0.0143 
## 11       1       6             2   146 0.0137 
## 12       2      12             3   225 0.0133 
## 13       1       7             4   317 0.0126 
## 14       2      13             1    99 0.0101 
## 15       5      21             1   145 0.00690
## 16       6      21             1   156 0.00641
## 17       1       2             1   297 0.00337

It looks like Article 2 Section 4 (14 in my analysis) is the most negative. According to this website this article talks about disqualifications of president, vice president, and all civil officers and removal from office via impeachment, conviction of treason, bribery, and other high crimes and misdemeanors. Seems very accurate to be the most negative.

Looking at the loughran lexicon, it appears to be aimed for determining what are liabilities from financial documents. Aside from positive and negative sentiments, there are litiguous, constraining, superfluous, and uncertainty. I’ll dive a bit deeper into uncertanty.

uncertainness <- tidy_const %>%
  inner_join(get_sentiments("loughran")) %>%
  count(article, index = linenumber %/% divisor, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  select(article, uncertainty) %>%
  group_by(article) %>%
  summarise(sum = sum(uncertainty), n = n()) %>%
  mutate(uncertainness = sum / n) %>%
  arrange(desc(uncertainness)) %>%
  ungroup
## Joining, by = "word"
uncertainness
## # A tibble: 8 x 4
##   article   sum     n uncertainness
##     <int> <dbl> <int>         <dbl>
## 1       5     2     2         1    
## 2       2     9    11         0.818
## 3       1    19    27         0.704
## 4       4     3     5         0.6  
## 5       3     2     5         0.4  
## 6       0     0     1         0    
## 7       6     0     3         0    
## 8       7     0     3         0
 ggplot(head(uncertainness, 4), aes(x = reorder(article, -uncertainness), y = uncertainness, fill = uncertainness)) +
  geom_col(show.legend = FALSE) +
  labs(x = "Article Number", y = "Uncertainty")

Article 5 is about how to make amendments. It seems most likely to contain the most words about uncertainty because it is how the Constitution gets added to. Article 2 is about the Executive branch. This may make sense because the founding fathers were wary of anyone becoming like a king and maybe they were uncertain how much powers the executive branch should have. Article 1 is about the legislative branch and Article 4 is about the states.

Citation: “Silge, Julia, and David Robinson. Text mining with R: A tidy approach.” O’Reilly Media, Inc.“, 2017.”