Overview

In Text Mining with R, Chapter 2 looks at Sentiment Analysis. In this assignment, you should start by getting the primary  example code from chapter 2 working in an R Markdown document. You should provide a citation to this base code.   You’re then asked to extend the code in two ways:

Work with a different corpus of your choosing, and
Incorporate at least one additional sentiment lexicon (possibly from another R package that you’ve found through research).
As usual, please submit links to both an .Rmd file posted in your GitHub repository and to your code on rpubs.com.
You make work on a small team on this assignment.

# Load library

library(stringr)
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.0.5
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.6     v dplyr   1.0.2
## v tidyr   1.1.2     v forcats 0.5.0
## v readr   1.4.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tidyr)
library(ggplot2)
library(dplyr)
library(textdata)
## Warning: package 'textdata' was built under R version 4.0.5
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 4.0.5
## Loading required package: RColorBrewer
library(janeaustenr)
## Warning: package 'janeaustenr' was built under R version 4.0.5
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(ggwordcloud)
## Warning: package 'ggwordcloud' was built under R version 4.0.5
library(wordcloud)
library(gutenbergr)
## Warning: package 'gutenbergr' was built under R version 4.0.5
tidy_book <- function(author) {
  author %>% 
    unnest_tokens(word, text) %>% 
    anti_join(stop_words)
}


facet_bar <- function(df, y, x, by, nrow = 2, ncol = 2, scales = "free") {
  mapping <- aes(y = reorder_within({{ y }}, {{ x }}, {{ by }}), 
                 x = {{ x }}, 
                 fill = {{ by }})
  
  facet <- facet_wrap(vars({{ by }}), 
                      nrow = nrow, 
                      ncol = ncol,
                      scales = scales) 
  
  ggplot(df, mapping = mapping) + 
    geom_col(show.legend = FALSE) + 
    scale_y_reordered() + 
    facet + 
    ylab("")
} 

2.2 Sentiment analysis with inner join

tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(linenumber = row_number(),
         chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]", 
                                                 ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)

nrc_joy <- get_sentiments("nrc") %>%
  filter(sentiment == "joy")

tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 303 x 2
##    word        n
##    <chr>   <int>
##  1 good      359
##  2 young     192
##  3 friend    166
##  4 hope      143
##  5 happy     125
##  6 love      117
##  7 deal       92
##  8 found      92
##  9 present    89
## 10 kind       82
## # ... with 293 more rows
jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  mutate(index = linenumber %/% 80) %>% 
  count(book, index, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = list(n = 0)) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
jane_austen_sentiment
## # A tibble: 920 x 5
##    book                index negative positive sentiment
##    <fct>               <dbl>    <int>    <int>     <int>
##  1 Sense & Sensibility     0       16       32        16
##  2 Sense & Sensibility     1       19       53        34
##  3 Sense & Sensibility     2       12       31        19
##  4 Sense & Sensibility     3       15       31        16
##  5 Sense & Sensibility     4       16       34        18
##  6 Sense & Sensibility     5       16       51        35
##  7 Sense & Sensibility     6       24       40        16
##  8 Sense & Sensibility     7       23       51        28
##  9 Sense & Sensibility     8       30       40        10
## 10 Sense & Sensibility     9       15       19         4
## # ... with 910 more rows
ggplot(jane_austen_sentiment) + 
  geom_col(aes(index, sentiment, fill = book), show.legend = F) + 
  facet_wrap( ~ book, ncol = 2, scales = "free_x") 

2.3 Comparing 3 different dictionaries

pride_prejudice <- tidy_books %>% 
  filter(book == "Pride & Prejudice")

pride_prejudice
## # A tibble: 122,204 x 4
##    book              linenumber chapter word     
##    <fct>                  <int>   <int> <chr>    
##  1 Pride & Prejudice          1       0 pride    
##  2 Pride & Prejudice          1       0 and      
##  3 Pride & Prejudice          1       0 prejudice
##  4 Pride & Prejudice          3       0 by       
##  5 Pride & Prejudice          3       0 jane     
##  6 Pride & Prejudice          3       0 austen   
##  7 Pride & Prejudice          7       1 chapter  
##  8 Pride & Prejudice          7       1 1        
##  9 Pride & Prejudice         10       1 it       
## 10 Pride & Prejudice         10       1 is       
## # ... with 122,194 more rows
afinn <- pride_prejudice %>% 
  inner_join(get_sentiments("afinn")) %>% 
  mutate(index = linenumber %/% 80) %>% 
  count(book, index, wt = value, name = "value") %>%
  mutate(dict = "afinn") %>% 
  select(index, value, dict)
## Joining, by = "word"
bing <- pride_prejudice %>% 
  inner_join(get_sentiments("bing")) %>% 
  mutate(index = linenumber %/% 80) %>% 
  count(index, sentiment) %>% 
  pivot_wider(names_from = sentiment, values_from = n, values_fill = list(n = 0)) %>% 
  mutate(value = positive - negative,
         dict = "bing") %>%  
  select(index, value, dict)
## Joining, by = "word"
nrc <- pride_prejudice %>% 
  inner_join(get_sentiments("nrc")) %>%
  filter(sentiment %in% c("positive", "negative")) %>% 
  mutate(index = linenumber %/% 80) %>% 
  count(index, sentiment) %>% 
  pivot_wider(names_from = sentiment, values_from = n, values_fill = list(n = 0)) %>% 
  mutate(value = positive - negative,
         dict = "nrc") %>% 
  select(index, value, dict)
## Joining, by = "word"
bind_rows(afinn, bing, nrc) %>% 
  ggplot() + 
  geom_col(aes(index, value, fill = dict), show.legend = FALSE) + 
  facet_wrap(~ dict, nrow = 3)

get_sentiments("nrc") %>% 
  filter(sentiment %in% c("positive", "negative")) %>% 
  count(sentiment)
## # A tibble: 2 x 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   3324
## 2 positive   2312
get_sentiments("bing") %>% 
  count(sentiment)
## # A tibble: 2 x 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005

2.4 Most common positive and negative words

bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
bing_word_counts
## # A tibble: 2,585 x 3
##    word     sentiment     n
##    <chr>    <chr>     <int>
##  1 miss     negative   1855
##  2 well     positive   1523
##  3 good     positive   1380
##  4 great    positive    981
##  5 like     positive    725
##  6 better   positive    639
##  7 enough   positive    613
##  8 happy    positive    534
##  9 love     positive    495
## 10 pleasure positive    462
## # ... with 2,575 more rows
custom_stop_words <- tibble(word = c("miss"), lexicon = c("custom")) %>% 
  bind_rows(stop_words)

bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>% 
  anti_join(custom_stop_words) %>%
  group_by(sentiment) %>%
  count(word, sentiment, sort = T) %>% 
  ungroup()
## Joining, by = "word"
## Joining, by = "word"
bing_word_counts %>%
  group_by(sentiment) %>% 
  top_n(10) %>%
  ungroup()  %>% 
  facet_bar(y = word, x = n, by = sentiment, nrow = 1) + 
  labs(title = "Top 10 words of sentiment in Jane Austen's books") 
## Selecting by n

2.5 Wordclouds

wordcloud_df <-tidy_books %>% 
  anti_join(custom_stop_words) %>% 
  inner_join(get_sentiments("bing")) %>% 
  count(sentiment, word, sort = T) %>% 
  top_n(200)
## Joining, by = "word"
## Joining, by = "word"
## Selecting by n
wordcloud_df %>%
  ggplot() + 
  geom_text_wordcloud_area(aes(label = word, size = n)) +
  scale_size_area(max_size = 15)

wordcloud_df %>%
  ggplot() + 
  geom_text_wordcloud_area(aes(label = word, size = n), shape = "star") + 
  scale_size_area(max_size = 15)

tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  reshape2::acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)
## Joining, by = "word"

2.6 Units other than words

PandP_sentences <- tibble(text = prideprejudice) %>% 
  unnest_tokens(sentence, text, token = "sentences")

PandP_sentences
## # A tibble: 15,545 x 1
##    sentence                                                                  
##    <chr>                                                                     
##  1 "pride and prejudice"                                                     
##  2 "by jane austen"                                                          
##  3 "chapter 1"                                                               
##  4 "it is a truth universally acknowledged, that a single man in possession" 
##  5 "of a good fortune, must be in want of a wife."                           
##  6 "however little known the feelings or views of such a man may be on his"  
##  7 "first entering a neighbourhood, this truth is so well fixed in the minds"
##  8 "of the surrounding families, that he is considered the rightful property"
##  9 "of some one or other of their daughters."                                
## 10 "\"my dear mr."                                                           
## # ... with 15,535 more rows
tibble(text = prideprejudice) %>% 
  mutate(text = iconv(text, to = "ASCII")) %>% 
  unnest_tokens(sentence, text, token = "sentences")
## # A tibble: 15,545 x 1
##    sentence                                                                  
##    <chr>                                                                     
##  1 "pride and prejudice"                                                     
##  2 "by jane austen"                                                          
##  3 "chapter 1"                                                               
##  4 "it is a truth universally acknowledged, that a single man in possession" 
##  5 "of a good fortune, must be in want of a wife."                           
##  6 "however little known the feelings or views of such a man may be on his"  
##  7 "first entering a neighbourhood, this truth is so well fixed in the minds"
##  8 "of the surrounding families, that he is considered the rightful property"
##  9 "of some one or other of their daughters."                                
## 10 "\"my dear mr."                                                           
## # ... with 15,535 more rows
austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(chapter, text, token = "regex", 
                pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
  ungroup()

austen_chapters
## # A tibble: 275 x 2
##    book             chapter                                                     
##    <fct>            <chr>                                                       
##  1 Sense & Sensibi~ "sense and sensibility\n\nby jane austen\n\n(1811)\n\n\n\n\~
##  2 Sense & Sensibi~ "\n\n\nthe family of dashwood had long been settled in suss~
##  3 Sense & Sensibi~ "\n\n\nmrs. john dashwood now installed herself mistress of~
##  4 Sense & Sensibi~ "\n\n\nmrs. dashwood remained at norland several months; no~
##  5 Sense & Sensibi~ "\n\n\n\"what a pity it is, elinor,\" said marianne, \"that~
##  6 Sense & Sensibi~ "\n\n\nno sooner was her answer dispatched, than mrs. dashw~
##  7 Sense & Sensibi~ "\n\n\nthe first part of their journey was performed in too~
##  8 Sense & Sensibi~ "\n\n\nbarton park was about half a mile from the cottage. ~
##  9 Sense & Sensibi~ "\n\n\nmrs. jennings was a widow with an ample jointure.  s~
## 10 Sense & Sensibi~ "\n\n\nthe dashwoods were now settled at barton with tolera~
## # ... with 265 more rows
tidy_books %>%
  distinct(book, chapter)
## # A tibble: 275 x 2
##    book                chapter
##    <fct>                 <int>
##  1 Sense & Sensibility       0
##  2 Sense & Sensibility       1
##  3 Sense & Sensibility       2
##  4 Sense & Sensibility       3
##  5 Sense & Sensibility       4
##  6 Sense & Sensibility       5
##  7 Sense & Sensibility       6
##  8 Sense & Sensibility       7
##  9 Sense & Sensibility       8
## 10 Sense & Sensibility       9
## # ... with 265 more rows
tidy_books %>% 
  group_by(book, chapter) %>% 
  summarize(str_c(word, collapse = " "))
## `summarise()` regrouping output by 'book' (override with `.groups` argument)
## # A tibble: 275 x 3
## # Groups:   book [6]
##    book           chapter `str_c(word, collapse = " ")`                         
##    <fct>            <int> <chr>                                                 
##  1 Sense & Sensi~       0 sense and sensibility by jane austen 1811             
##  2 Sense & Sensi~       1 chapter 1 the family of dashwood had long been settle~
##  3 Sense & Sensi~       2 chapter 2 mrs john dashwood now installed herself mis~
##  4 Sense & Sensi~       3 chapter 3 mrs dashwood remained at norland several mo~
##  5 Sense & Sensi~       4 chapter 4 what a pity it is elinor said marianne that~
##  6 Sense & Sensi~       5 chapter 5 no sooner was her answer dispatched than mr~
##  7 Sense & Sensi~       6 chapter 6 the first part of their journey was perform~
##  8 Sense & Sensi~       7 chapter 7 barton park was about half a mile from the ~
##  9 Sense & Sensi~       8 chapter 8 mrs jennings was a widow with an ample join~
## 10 Sense & Sensi~       9 chapter 9 the dashwoods were now settled at barton wi~
## # ... with 265 more rows
bing_negative <- get_sentiments("bing") %>% 
  filter(sentiment == "negative")

chapter_words <- tidy_books %>% 
  count(book, chapter)

tidy_books %>%
  semi_join(bing_negative) %>%
  count(book, chapter, name = "negative_words") %>% 
  left_join(chapter_words) %>%
  mutate(ratio = negative_words / n) %>%
  filter(chapter != 0) %>%
  group_by(book) %>% 
  top_n(1) 
## Joining, by = "word"
## Joining, by = c("book", "chapter")
## Selecting by ratio
## # A tibble: 6 x 5
## # Groups:   book [6]
##   book                chapter negative_words     n  ratio
##   <fct>                 <int>          <int> <int>  <dbl>
## 1 Sense & Sensibility      43            161  3405 0.0473
## 2 Pride & Prejudice        34            111  2104 0.0528
## 3 Mansfield Park           46            173  3685 0.0469
## 4 Emma                     15            151  3340 0.0452
## 5 Northanger Abbey         21            149  2982 0.0500
## 6 Persuasion                4             62  1807 0.0343

Work with a different corpus

# filter a book  by author title language and copyright
aesopMtdt <-gutenberg_metadata %>% 
    filter(author == "Aesop",
           title == "Aesop's Fables",language == "en",!grepl('Copyrighted', rights))

# load the book using book id
aesopBook <- gutenberg_download(aesopMtdt$gutenberg_author_id[1])
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
#mutate to add author
aesopBookMutated <- aesopBook %>% 
  select(-gutenberg_id) %>% 
  mutate(author = "Aesop")

#aesopBookTidy<-tidy_book(aesopBookMutated)

# identify the tokens and remove stop words
aesopBookTidy<-aesopBookMutated %>% 
    unnest_tokens(word, text) %>% 
    anti_join(stop_words)
## Joining, by = "word"

sentiment lexicon loughran

#load the sentiment lexicon loughran
get_sentiments("loughran")
## # A tibble: 4,150 x 2
##    word         sentiment
##    <chr>        <chr>    
##  1 abandon      negative 
##  2 abandoned    negative 
##  3 abandoning   negative 
##  4 abandonment  negative 
##  5 abandonments negative 
##  6 abandons     negative 
##  7 abdicated    negative 
##  8 abdicates    negative 
##  9 abdicating   negative 
## 10 abdication   negative 
## # ... with 4,140 more rows
# join the sentiments with tokenized books with count for each sentiment  
aesopBook_lexi <- aesopBookTidy %>%
inner_join(get_sentiments("loughran")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"

Identify top 10 sentiments

aesopBook_lexi %>%
group_by(sentiment) %>%
top_n(10) %>%
ungroup() %>%mutate(word = reorder(word, n)) 
## Selecting by n
## # A tibble: 60 x 3
##    word         sentiment     n
##    <fct>        <chr>     <int>
##  1 constitution litigious   471
##  2 legislative  litigious   210
##  3 laws         litigious   203
##  4 legislature  litigious   193
##  5 law          litigious   148
##  6 danger       negative    134
##  7 courts       litigious   129
##  8 force        negative    119
##  9 court        litigious   105
## 10 jurisdiction litigious   103
## # ... with 50 more rows

Identify positive and negative sentiments

#filter out positive and negative sentiments
aesopBook_positive_negative<-aesopBook_lexi %>%filter(sentiment =='positive' | sentiment =='negative')

aesopBook_nrc_lexi <- aesopBookTidy %>%
inner_join(get_sentiments("nrc")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
aesopBook_positive_negative_nrc<-aesopBook_nrc_lexi %>%filter(sentiment =='positive' | sentiment =='negative')

Plot for positive and negative sentiments for loughran

aesopBook_positive_negative %>%
     filter(n > 50) %>%
     mutate(n = ifelse(sentiment == "negative", -n, n)) %>%
     mutate(word = reorder(word, n)) %>%
     ggplot(aes(word, n, fill = sentiment))+
     geom_col() +
     coord_flip() +
     labs(y = "Sentiment Count")

Plot for positive and negative sentiments for nrc

aesopBook_positive_negative_nrc %>%
     filter(n > 100) %>%
     mutate(n = ifelse(sentiment == "negative", -n, n)) %>%
     mutate(word = reorder(word, n)) %>%
     ggplot(aes(word, n, fill = sentiment))+
     geom_col() +
     coord_flip() +
     labs(y = "Sentiment Count")

word clout

# Word clouts
aesopBookTidy %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))

# sentiment analysis to tag different sentiments
 aesopBookTidy %>%
  inner_join(get_sentiments("loughran")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)
## Joining, by = "word"

Conclusion

Identified a different corpus and completed sentimental analysis using loughran lexicon. Also performed a sentimental comparison of the words extracted from the same book using loughran and nrc lexicons. It is clear that different lexicon dictionaries are having different set of sentimental words