Week_10_Assignment

In Text Mining with R, Chapter 2 looks at Sentiment Analysis. In this assignment, you should start by getting the primary example code from chapter 2 working in an R Markdown document. You should provide a citation to this base code. You’re then asked to extend the code in two ways:

Work with a different corpus of your choosing, and

Incorporate at least one additional sentiment lexicon (possibly from another R package that you’ve found through research).

The base code for this assignment is from Chapter 2 of Text Mining with R: A Tidy Approach

https://www.tidytextmining.com/sentiment.html

Library

library("tidyverse")
library("janeaustenr")
library("stringr")
library("tidytext")

# Preview the AFINN lexicon: each word carries a numeric score in `value`.
get_sentiments(lexicon = "afinn")

## # A tibble: 2,477 × 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # ℹ 2,467 more rows

# Preview the Bing lexicon: each word is tagged in the `sentiment` column.
get_sentiments(lexicon = "bing")

## # A tibble: 6,786 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # ℹ 6,776 more rows

# Preview the NRC lexicon: a word may appear on several rows, one per
# sentiment label attached to it.
get_sentiments(lexicon = "nrc")

## # A tibble: 13,872 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # ℹ 13,862 more rows

Sentiment Analysis with inner join

# Build a one-token-per-row table of the Austen novels. Before tokenizing,
# record each line's position within its book and a running chapter counter
# that increases on every line matching the chapter-heading pattern.
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(linenumber = row_number()) %>%
  mutate(
    chapter = text %>%
      str_detect(regex("^chapter [\\divxlc]", ignore_case = TRUE)) %>%
      cumsum()
  ) %>%
  ungroup() %>%
  unnest_tokens(output = word, input = text)

# NRC lexicon: keep only the words labelled with the "joy" sentiment
nrc_joy <- filter(get_sentiments("nrc"), sentiment == "joy")

# Joy words found in "Emma", most frequent first
tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word) %>%
  arrange(desc(n))

## # A tibble: 301 × 2
##    word          n
##    <chr>     <int>
##  1 good        359
##  2 friend      166
##  3 hope        143
##  4 happy       125
##  5 love        117
##  6 deal         92
##  7 found        92
##  8 present      89
##  9 kind         82
## 10 happiness    76
## # ℹ 291 more rows

# The bing lexicon
#
# Net sentiment (positive minus negative word count) for every 80-line
# section of each novel.
jane_austen_sentiment <- tidy_books %>%
  # Name the join key explicitly instead of relying on the inferred one.
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  # pivot_wider() replaces the superseded spread(); a section with no words of
  # one polarity gets a count of 0 rather than NA.
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)

# Visualization: one panel per novel, net sentiment of each section plotted
# against its index
jane_austen_sentiment %>%
  ggplot(mapping = aes(x = index, y = sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(book), ncol = 2, scales = "free_x")

Comparing the three sentiment dictionaries

# Keep only the tokens that belong to "Pride & Prejudice"

pride_prejudice <- filter(tidy_books, book == "Pride & Prejudice")

pride_prejudice

## # A tibble: 122,204 × 4
##    book              linenumber chapter word     
##    <fct>                  <int>   <int> <chr>    
##  1 Pride & Prejudice          1       0 pride    
##  2 Pride & Prejudice          1       0 and      
##  3 Pride & Prejudice          1       0 prejudice
##  4 Pride & Prejudice          3       0 by       
##  5 Pride & Prejudice          3       0 jane     
##  6 Pride & Prejudice          3       0 austen   
##  7 Pride & Prejudice          7       1 chapter  
##  8 Pride & Prejudice          7       1 1        
##  9 Pride & Prejudice         10       1 it       
## 10 Pride & Prejudice         10       1 is       
## # ℹ 122,194 more rows

# AFINN scores are numeric, so the sentiment of an 80-line section is the sum
# of the scores of the matched words in that section.
afinn <- pride_prejudice %>%
  inner_join(get_sentiments("afinn")) %>%
  mutate(index = linenumber %/% 80) %>%
  group_by(index) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")

# Bing and NRC are used here through their positive/negative labels only:
# count both polarities per 80-line section and take the difference as the
# net sentiment. Each lexicon's rows are tagged with `method` before stacking.
bing_and_nrc <- bind_rows(
  pride_prejudice %>%
    inner_join(get_sentiments("bing"), by = "word") %>%
    mutate(method = "Bing et al."),
  pride_prejudice %>%
    inner_join(
      # NRC also carries emotion labels; keep just the two polarities.
      get_sentiments("nrc") %>%
        filter(sentiment %in% c("positive", "negative")),
      by = "word"
    ) %>%
    mutate(method = "NRC")
) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  # pivot_wider() replaces the superseded spread(); a section with no words of
  # one polarity gets a count of 0 rather than NA.
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)

# Stack the per-section scores of all methods and draw one panel per method,
# each with its own y scale.
list(afinn, bing_and_nrc) %>%
  bind_rows() %>%
  ggplot(mapping = aes(x = index, y = sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(method), ncol = 1, scales = "free_y")

# Number of NRC entries labelled positive vs. negative
get_sentiments("nrc") %>%
  filter(sentiment == "positive" | sentiment == "negative") %>%
  count(sentiment)

## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   3316
## 2 positive   2308

# Number of entries per sentiment label in the Bing lexicon
count(get_sentiments("bing"), sentiment)

## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005

Most common positive and negative words

# Tally every (word, sentiment) pair matched by the Bing lexicon across all
# novels, largest counts first, to see which words drive each sentiment.

bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment) %>%
  arrange(desc(n)) %>%
  ungroup()

bing_word_counts

## # A tibble: 2,585 × 3
##    word     sentiment     n
##    <chr>    <chr>     <int>
##  1 miss     negative   1855
##  2 well     positive   1523
##  3 good     positive   1380
##  4 great    positive    981
##  5 like     positive    725
##  6 better   positive    639
##  7 enough   positive    613
##  8 happy    positive    534
##  9 love     positive    495
## 10 pleasure positive    462
## # ℹ 2,575 more rows

# Plot the ten words that contribute most to each sentiment.
bing_word_counts %>%
  group_by(sentiment) %>%
  # slice_max() replaces the superseded top_n() and names the ranking column
  # explicitly instead of letting it be guessed; ties are still kept.
  slice_max(n, n = 10) %>%
  ungroup() %>%
  # Order the bars by count rather than alphabetically.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(
    y = "Contribution to sentiment",
    x = NULL
  ) +
  coord_flip()

# Stop-word list with a custom entry for "miss" placed ahead of the standard
# entries. "miss" tops the Bing negative counts in these novels.
# NOTE(review): presumably because it is mostly used as a title here - confirm.
custom_stop_words <- tibble(word = "miss", lexicon = "custom") %>%
  bind_rows(stop_words)

custom_stop_words

## # A tibble: 1,150 × 2
##    word        lexicon
##    <chr>       <chr>  
##  1 miss        custom 
##  2 a           SMART  
##  3 a's         SMART  
##  4 able        SMART  
##  5 about       SMART  
##  6 above       SMART  
##  7 according   SMART  
##  8 accordingly SMART  
##  9 across      SMART  
## 10 actually    SMART  
## # ℹ 1,140 more rows

Wordclouds

library(wordcloud)

# Word cloud of the novels after dropping stop words, capped at 100 words
tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(words = word, freq = n, max.words = 100))

library(reshape2)

# Reshape the (word, sentiment) counts into a word-by-sentiment matrix
# (missing combinations filled with 0) and pass it to comparison.cloud().
tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(formula = word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    max.words = 100,
    colors = c("gray20", "gray80")
  )

Looking at units beyond just words

# Tokenize "Pride and Prejudice" into sentences instead of single words
p_and_p_sentences <- unnest_tokens(
  tibble(text = prideprejudice),
  output = sentence,
  input = text,
  token = "sentences"
)

# Inspect the second sentence token
p_and_p_sentences[["sentence"]][2]

## [1] "by jane austen"

# Split each novel into chunks using a regex tokenizer on chapter headings.
# NOTE(review): `|` has the lowest precedence, so the pattern matches either
# "Chapter" on its own or "CHAPTER " followed by one digit/roman-numeral
# character - confirm that this is the intended split rule.
austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(
    output = chapter,
    input = text,
    token = "regex",
    pattern = "Chapter|CHAPTER [\\dIVXLC]"
  ) %>%
  ungroup()

# Number of chunks produced for each novel
austen_chapters %>%
  count(book, name = "chapters")

## # A tibble: 6 × 2
##   book                chapters
##   <fct>                  <int>
## 1 Sense & Sensibility       51
## 2 Pride & Prejudice         62
## 3 Mansfield Park            49
## 4 Emma                      56
## 5 Northanger Abbey          32
## 6 Persuasion                25

# Words that the Bing lexicon labels as negative
bingnegative <- filter(get_sentiments("bing"), sentiment == "negative")

# Total number of words in every chapter of every book
wordcounts <- tidy_books %>%
  group_by(book, chapter) %>%
  summarise(words = n())

# For each book, find the chapter with the highest share of negative words.
# Chapter 0 (text before the first chapter heading) is excluded. The result
# of summarise() is still grouped by book, so slice_max() picks one chapter
# per book.
tidy_books %>%
  semi_join(bingnegative) %>%
  filter(chapter != 0) %>%
  group_by(book, chapter) %>%
  summarise(negativewords = n()) %>%
  left_join(wordcounts, by = c("book", "chapter")) %>%
  mutate(ratio = negativewords / words) %>%
  slice_max(ratio, n = 1) %>%
  ungroup()

## # A tibble: 6 × 5
##   book                chapter negativewords words  ratio
##   <fct>                 <int>         <int> <int>  <dbl>
## 1 Sense & Sensibility      43           161  3405 0.0473
## 2 Pride & Prejudice        34           111  2104 0.0528
## 3 Mansfield Park           46           173  3685 0.0469
## 4 Emma                     15           151  3340 0.0452
## 5 Northanger Abbey         21           149  2982 0.0500
## 6 Persuasion                4            62  1807 0.0343

Work with a different corpus of your choosing and incorporate at least one additional sentiment lexicon (possibly from another R package that you’ve found through research).

“Jane Eyre” by Charlotte Brontë will be used.

The Gutenberg ID for “Jane Eyre” is 1260

library(gutenbergr)

# Download the text with Project Gutenberg ID 1260
jane_eyre <- gutenberg_download(gutenberg_id = 1260)

# Tokenize the data: one word per row, tagged with the source line number and
# a running chapter counter that increases on every chapter-heading line.
tidy_jane <- jane_eyre %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(
      str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE))
    )
  ) %>%
  unnest_tokens(output = word, input = text)

# NRC lexicon: keep only the words labelled with the "fear" sentiment
fear <- filter(get_sentiments("nrc"), sentiment == "fear")

# Fear words found in the tokenized text, most frequent first
jane_sentiment1 <- tidy_jane %>%
  inner_join(fear) %>%
  count(word) %>%
  arrange(desc(n))

jane_sentiment1

## # A tibble: 675 × 2
##    word        n
##    <chr>   <int>
##  1 fire      122
##  2 god        96
##  3 feeling    67
##  4 doubt      62
##  5 fear       62
##  6 death      55
##  7 marry      49
##  8 change     41
##  9 bear       40
## 10 die        38
## # ℹ 665 more rows

# The bing lexicon

# Total number of words in each chapter (columns: chapter, n)
word_totals <- tidy_jane %>%
  group_by(chapter) %>%
  count()

# Proportion of negative (Bing) words per chapter, highest first.
# The chapter totals are attached with a join on `chapter` rather than divided
# in by position: a positional `n / word_totals$n` silently misaligns (or
# recycles) as soon as one chapter has no negative words.
jane_sentiment2 <- tidy_jane %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(chapter, sentiment) %>%
  filter(sentiment == "negative") %>%
  left_join(
    word_totals %>%
      ungroup() %>%
      rename(total_words = n),
    by = "chapter"
  ) %>%
  mutate(p = n / total_words) %>%
  select(chapter, sentiment, n, p) %>%
  arrange(desc(p)) %>%
  # Keep the plain data.frame shape the previous transform() call returned.
  as.data.frame()
jane_sentiment2

##    chapter sentiment   n          p
## 1        2  negative 181 0.06531938
## 2        6  negative 177 0.06055423
## 3       27  negative 558 0.04943741
## 4        3  negative 158 0.04891641
## 5        7  negative 168 0.04661487
## 6       20  negative 264 0.04488269
## 7       35  negative 194 0.04429224
## 8        4  negative 259 0.04415274
## 9        0  negative  42 0.04379562
## 10      14  negative 212 0.04252758
## 11       9  negative 139 0.04242979
## 12      26  negative 179 0.04146398
## 13       8  negative 125 0.04145937
## 14      21  negative 364 0.04094949
## 15       1  negative  80 0.04094166
## 16      15  negative 205 0.04040205
## 17      25  negative 205 0.04025923
## 18      36  negative 158 0.03979849
## 19      31  negative 122 0.03874246
## 20      28  negative 252 0.03609281
## 21       5  negative 175 0.03468094
## 22      23  negative 138 0.03448276
## 23      33  negative 167 0.03431977
## 24      37  negative 258 0.03414505
## 25      18  negative 201 0.03380992
## 26      12  negative 142 0.03359357
## 27      16  negative 124 0.03272631
## 28      30  negative 122 0.03236074
## 29      24  negative 232 0.03206634
## 30      29  negative 140 0.03052769
## 31      34  negative 285 0.03041298
## 32      32  negative 139 0.02993754
## 33      13  negative 114 0.02810651
## 34      17  negative 226 0.02759800
## 35      22  negative  77 0.02632479
## 36      19  negative  98 0.02564774
## 37      38  negative  42 0.02283850
## 38      11  negative 148 0.02267157
## 39      10  negative  91 0.02068182

# Loughran: an English sentiment lexicon created for use with financial
# documents. It labels words with six possible sentiments important in
# financial contexts: "negative", "positive", "litigious", "uncertainty",
# "constraining", or "superfluous".

# Proportion of positive (Loughran) words per chapter, highest first.
# The chapter totals are attached with a join on `chapter` rather than divided
# in by position: a positional `n / word_totals$n` silently misaligns (or
# recycles) as soon as one chapter has no positive words.
jane_sentiment3 <- tidy_jane %>%
  inner_join(get_sentiments("loughran"), by = "word") %>%
  count(chapter, sentiment) %>%
  filter(sentiment == "positive") %>%
  left_join(
    word_totals %>%
      ungroup() %>%
      rename(total_words = n),
    by = "chapter"
  ) %>%
  mutate(p = n / total_words) %>%
  select(chapter, sentiment, n, p) %>%
  arrange(desc(p)) %>%
  # Keep the plain data.frame shape the previous transform() call returned.
  as.data.frame()

jane_sentiment3

##    chapter sentiment   n           p
## 1       32  positive  68 0.014645703
## 2       31  positive  45 0.014290251
## 3       30  positive  50 0.013262599
## 4       38  positive  23 0.012506797
## 5       34  positive 114 0.012165190
## 6        9  positive  39 0.011904762
## 7       16  positive  41 0.010820797
## 8        8  positive  31 0.010281924
## 9       10  positive  45 0.010227273
## 10      11  positive  65 0.009957108
## 11      22  positive  28 0.009572650
## 12       6  positive  27 0.009237085
## 13      24  positive  66 0.009122322
## 14      33  positive  44 0.009042335
## 15      35  positive  39 0.008904110
## 16       4  positive  52 0.008864644
## 17      14  positive  43 0.008625878
## 18      18  positive  51 0.008578638
## 19       5  positive  43 0.008521601
## 20      29  positive  39 0.008504143
## 21       0  positive   8 0.008342023
## 22      37  positive  62 0.008205400
## 23      12  positive  34 0.008043530
## 24      15  positive  40 0.007883327
## 25      25  positive  40 0.007855460
## 26      23  positive  31 0.007746127
## 27      21  positive  64 0.007199910
## 28      13  positive  28 0.006903353
## 29       1  positive  13 0.006653019
## 30      19  positive  25 0.006542790
## 31       2  positive  18 0.006495850
## 32      17  positive  52 0.006349982
## 33      36  positive  25 0.006297229
## 34       3  positive  20 0.006191950
## 35      27  positive  69 0.006113228
## 36      20  positive  35 0.005950357
## 37      28  positive  39 0.005585792
## 38       7  positive  20 0.005549390
## 39      26  positive  18 0.004169562

# Bar plot - Positive Sentiment in Jane Eyre by Chapter
#
# The bars show the proportion `p`, not the raw count `n`, so the plot matches
# its y-axis label and chapters of different length are comparable. Chapters
# are ordered from the highest to the lowest proportion.
ggplot(
  jane_sentiment3,
  aes(x = reorder(factor(chapter), -p), y = p, fill = factor(chapter))
) +
  geom_col(color = "white") +
  labs(x = "Chapter", y = "Proportion of positive words used") +
  ggtitle("Positive Sentiment in Jane Eyre by Chapter") +
  theme_minimal() +
  theme(legend.position = "none")