This week’s assignment is to run the primary code for chapter 2 of ‘Text Mining with R’, which can be found at https://www.tidytextmining.com/sentiment.html.
First, we’ll run the code from the book to demonstrate sentiment analysis in R on the novels of Jane Austen.
Afterwards, we’ll extend the practice using our own example and another sentiment lexicon.
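The lexicon previews below were produced by loading tidytext and calling get_sentiments() on each of the three lexicons; the chunk itself is missing from the output, but per the book it is:
library(tidytext)
# AFINN scores words from -5 to 5; Bing labels words positive/negative;
# NRC tags words with eight emotions plus positive/negative.
get_sentiments("afinn")
get_sentiments("bing")
get_sentiments("nrc")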
## Warning: package 'tidytext' was built under R version 3.6.3
## # A tibble: 2,477 x 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # ... with 2,467 more rows
## # A tibble: 6,786 x 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ... with 6,776 more rows
## # A tibble: 13,901 x 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ... with 13,891 more rows
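The attach messages that follow come from loading the other packages this section relies on:
library(janeaustenr)   # the full text of Jane Austen's six novels
library(dplyr)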
## Warning: package 'janeaustenr' was built under R version 3.6.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(linenumber = row_number(),
         chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
                                                 ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)
nrc_joy <- get_sentiments("nrc") %>%
  filter(sentiment == "joy")
tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 303 x 2
## word n
## <chr> <int>
## 1 good 359
## 2 young 192
## 3 friend 166
## 4 hope 143
## 5 happy 125
## 6 love 117
## 7 deal 92
## 8 found 92
## 9 present 89
## 10 kind 82
## # ... with 293 more rows
library(tidyr)
jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
library(ggplot2)
ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")
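The tibble below is pride_prejudice, the Pride & Prejudice subset used in the lexicon comparison that follows; its defining chunk is missing from the output, but per the book it is:
pride_prejudice <- tidy_books %>%
  filter(book == "Pride & Prejudice")
pride_prejudice
## # A tibble: 122,204 x 4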
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Pride & Prejudice 1 0 pride
## 2 Pride & Prejudice 1 0 and
## 3 Pride & Prejudice 1 0 prejudice
## 4 Pride & Prejudice 3 0 by
## 5 Pride & Prejudice 3 0 jane
## 6 Pride & Prejudice 3 0 austen
## 7 Pride & Prejudice 7 1 chapter
## 8 Pride & Prejudice 7 1 1
## 9 Pride & Prejudice 10 1 it
## 10 Pride & Prejudice 10 1 is
## # ... with 122,194 more rows
afinn <- pride_prejudice %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(index = linenumber %/% 80) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")
## Joining, by = "word"
bing_and_nrc <- bind_rows(pride_prejudice %>%
                            inner_join(get_sentiments("bing")) %>%
                            mutate(method = "Bing et al."),
                          pride_prejudice %>%
                            inner_join(get_sentiments("nrc") %>%
                                         filter(sentiment %in% c("positive",
                                                                 "negative"))) %>%
                            mutate(method = "NRC")) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
bind_rows(afinn,
          bing_and_nrc) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")
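The two small tibbles that follow compare how many positive and negative words each lexicon contains; per the book, the first (NRC) count comes from:
get_sentiments("nrc") %>%
  filter(sentiment %in% c("positive",
                          "negative")) %>%
  count(sentiment)
## # A tibble: 2 x 2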
## sentiment n
## <chr> <int>
## 1 negative 3324
## 2 positive 2312
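And the matching count for the Bing lexicon:
get_sentiments("bing") %>%
  count(sentiment)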
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
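Printing bing_word_counts shows how much each word contributed to each sentiment:
bing_word_counts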
## # A tibble: 2,585 x 3
## word sentiment n
## <chr> <chr> <int>
## 1 miss negative 1855
## 2 well positive 1523
## 3 good positive 1380
## 4 great positive 981
## 5 like positive 725
## 6 better positive 639
## 7 enough positive 613
## 8 happy positive 534
## 9 love positive 495
## 10 pleasure positive 462
## # ... with 2,575 more rows
bing_word_counts %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()
## Selecting by n
custom_stop_words <- bind_rows(tibble(word = c("miss"),
                                      lexicon = c("custom")),
                               stop_words)
custom_stop_words
## # A tibble: 1,150 x 2
## word lexicon
## <chr> <chr>
## 1 miss custom
## 2 a SMART
## 3 a's SMART
## 4 able SMART
## 5 about SMART
## 6 above SMART
## 7 according SMART
## 8 accordingly SMART
## 9 across SMART
## 10 actually SMART
## # ... with 1,140 more rows
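The messages below belong to the word cloud section; the chunk from the book that loads wordcloud and draws the basic cloud (the figure itself is not reproduced here) is:
library(wordcloud)
tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))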
## Warning: package 'wordcloud' was built under R version 3.6.3
## Loading required package: RColorBrewer
## Joining, by = "word"
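The next messages come from loading reshape2, which supplies the acast() used below:
library(reshape2)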
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)
## Joining, by = "word"
PandP_sentences <- tibble(text = prideprejudice) %>%
  unnest_tokens(sentence, text, token = "sentences")
PandP_sentences$sentence[2]
## [1] "however little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered the rightful property of some one or other of their daughters."
austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(chapter, text, token = "regex",
                pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
  ungroup()
austen_chapters %>%
  group_by(book) %>%
  summarise(chapters = n())
## # A tibble: 6 x 2
## book chapters
## <fct> <int>
## 1 Sense & Sensibility 51
## 2 Pride & Prejudice 62
## 3 Mansfield Park 49
## 4 Emma 56
## 5 Northanger Abbey 32
## 6 Persuasion 25
bingnegative <- get_sentiments("bing") %>%
  filter(sentiment == "negative")
wordcounts <- tidy_books %>%
  group_by(book, chapter) %>%
  summarize(words = n())
tidy_books %>%
  semi_join(bingnegative) %>%
  group_by(book, chapter) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts, by = c("book", "chapter")) %>%
  mutate(ratio = negativewords/words) %>%
  filter(chapter != 0) %>%
  top_n(1) %>%
  ungroup()
## Joining, by = "word"
## Selecting by ratio
## # A tibble: 6 x 5
## book chapter negativewords words ratio
## <fct> <int> <int> <int> <dbl>
## 1 Sense & Sensibility 43 161 3405 0.0473
## 2 Pride & Prejudice 34 111 2104 0.0528
## 3 Mansfield Park 46 173 3685 0.0469
## 4 Emma 15 151 3340 0.0452
## 5 Northanger Abbey 21 149 2982 0.0500
## 6 Persuasion 4 62 1807 0.0343
For our practice, let’s look at two books by Voltaire, Candide and Micromegas, and analyze their sentiment using the loughran lexicon. The books have been loaded with the wikisourcer package, tidied with dplyr, and plotted with ggplot.
Let’s begin by loading the books and the lexicon:
options(scipen = 100)
library(scales)
loughran <- get_sentiments('loughran')
#install.packages("wikisourcer")
library(wikisourcer)
## Warning: package 'wikisourcer' was built under R version 3.6.3
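The voltaire data frame used in the next chunk is never defined in the output above. A minimal sketch of how it could be built, assuming wikisource_book() from wikisourcer, hypothetical Wikisource URLs, and a manually added title column:
# Sketch only: the URLs and the added title column are assumptions,
# not necessarily the author's actual code.
candide <- wikisource_book("https://en.wikisource.org/wiki/Candide") %>%
  mutate(title = "Candide")
micromegas <- wikisource_book("https://en.wikisource.org/wiki/Micromegas") %>%
  mutate(title = "Micromegas")
# One row per line of text, with a url column identifying each chapter page
voltaire <- bind_rows(candide, micromegas)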
Next, let’s tidy the data so we can use it in the plots.
tidy_vbooks <- group_by(voltaire, title) %>%
  mutate(linenumber = row_number(),
         chapter = as.integer(str_extract(url, regex("\\d+$", ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)
# This has all words and their sentiments
vsentiment <- inner_join(tidy_vbooks, loughran, by = c('word' = 'word'))
# Let's count words by chapter to determine a denominator
vchap <- group_by(vsentiment, title, chapter) %>%
  summarise(ccount = n())
# This shows each sentiment's frequency as a percent of the chapter's sentiment words
vsum <- group_by(vsentiment, title, chapter, sentiment) %>%
  summarise(count = n()) %>%
  inner_join(vchap, by = c('title' = 'title', 'chapter' = 'chapter')) %>%
  mutate(pct = count / ccount)
Here, we’ll take a look at some plots and make some observations.
# Sentiment percent by chapter and book
ggplot(vsentiment, aes(x = chapter, fill = sentiment)) +
  geom_bar(position = 'fill') +
  facet_wrap(~title, nrow = 2)
## Warning: Removed 17 rows containing non-finite values (stat_count).
This view provides an overview, but let’s break it down a bit so it’s easier to read.
# Sentiment percent by chapter for Candide
filter(vsum, title == 'Candide') %>%
  ggplot(aes(x = chapter, y = pct, fill = sentiment)) +
  geom_col() +
  facet_wrap(~sentiment, nrow = 6, scale = 'free_y') +
  scale_y_continuous(labels = percent_format(accuracy = 1))
It looks like the beginning skews toward the superfluous and positive categories. Also, chapter 15 appears to be the least negative chapter of all.
# Sentiment percent by chapter for Micromegas
filter(vsum, title == 'Micromegas') %>%
  ggplot(aes(x = chapter, y = pct, fill = sentiment)) +
  geom_col() +
  facet_wrap(~sentiment, nrow = 6, scale = 'free_y') +
  scale_y_continuous(labels = percent_format(accuracy = 1))
## Warning: Removed 5 rows containing missing values (position_stack).
This book looks like it starts out positive and loses positivity as it goes on, ending on a superfluous note.
# Count words in these two books
vcount <- group_by(vsentiment, word, sentiment) %>%
  summarize(n = n()) %>%
  arrange(desc(n))
# Take a look at the top 10 words for each sentiment
vcount %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()
## Selecting by n
Looking at this makes me wonder whether the sentiment analysis would differ across translations of the same text.