Intro

This assignment reproduces the sentiment analysis from Chapter 2 of Text Mining with R by Julia Silge and David Robinson (https://www.tidytextmining.com/sentiment.html). The project is then extended with a sentiment analysis of the novel Les Miserables.

Loading libraries and sentiments

library(janeaustenr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(tidyr)
library(tidytext)
# Preview the AFINN lexicon; the result is not assigned, so it is printed.
get_sentiments("afinn")
## # A tibble: 2,477 × 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # … with 2,467 more rows

Reproducing the Project from Chapter 2 of “Text Mining with R”

Tidying the text of Jane Austen’s books. Each row holds one word from one of Austen’s books, along with the line number and the chapter in which it appears.

# Build the tidy one-word-per-row table for the Austen novels.
# Line numbers and the running chapter counter are computed within each
# book before tokenising, so every word keeps the line and chapter of the
# text line it came from.
tidy_books <- group_by(austen_books(), book) %>%
  mutate(
    linenumber = row_number(),
    # the counter goes up by one on every line that looks like a chapter heading
    chapter = cumsum(
      str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE))
    )
  ) %>%
  ungroup() %>%
  unnest_tokens(output = word, input = text)
tidy_books
## # A tibble: 725,055 × 4
##    book                linenumber chapter word       
##    <fct>                    <int>   <int> <chr>      
##  1 Sense & Sensibility          1       0 sense      
##  2 Sense & Sensibility          1       0 and        
##  3 Sense & Sensibility          1       0 sensibility
##  4 Sense & Sensibility          3       0 by         
##  5 Sense & Sensibility          3       0 jane       
##  6 Sense & Sensibility          3       0 austen     
##  7 Sense & Sensibility          5       0 1811       
##  8 Sense & Sensibility         10       1 chapter    
##  9 Sense & Sensibility         10       1 1          
## 10 Sense & Sensibility         13       1 the        
## # … with 725,045 more rows
# unnest_tokens(word, text) splits the text column into one token per row and stores the tokens in a new column named "word"

Using the NRC sentiment lexicon, the words associated with the sentiment “joy” are selected. The words of Austen’s book “Emma” are then matched against this list, and each matching word is counted to show how often it appears throughout the book.

# NRC lexicon restricted to the words tagged with the "joy" sentiment
nrc_joy <- filter(get_sentiments("nrc"), sentiment == "joy")

# Frequency of each joy word in "Emma", most frequent first
inner_join(filter(tidy_books, book == "Emma"), nrc_joy) %>%
  count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 301 × 2
##    word          n
##    <chr>     <int>
##  1 good        359
##  2 friend      166
##  3 hope        143
##  4 happy       125
##  5 love        117
##  6 deal         92
##  7 found        92
##  8 present      89
##  9 kind         82
## 10 happiness    76
## # … with 291 more rows

In analyzing how sentiment changes across Austen’s books, the Bing lexicon is used to analyze every 80 lines of each book in the data set.

# Net Bing sentiment (positive minus negative word count) for each
# 80-line section of every book.
jane_austen_sentiment <- inner_join(tidy_books, get_sentiments("bing")) %>%
  # tally the positive and negative words per book and 80-line section
  count(book, index = linenumber %/% 80, sentiment) %>%
  # one column per sentiment; a section with no words of one kind gets 0
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
jane_austen_sentiment
## # A tibble: 920 × 5
##    book                index negative positive sentiment
##    <fct>               <dbl>    <int>    <int>     <int>
##  1 Sense & Sensibility     0       16       32        16
##  2 Sense & Sensibility     1       19       53        34
##  3 Sense & Sensibility     2       12       31        19
##  4 Sense & Sensibility     3       15       31        16
##  5 Sense & Sensibility     4       16       34        18
##  6 Sense & Sensibility     5       16       51        35
##  7 Sense & Sensibility     6       24       40        16
##  8 Sense & Sensibility     7       23       51        28
##  9 Sense & Sensibility     8       30       40        10
## 10 Sense & Sensibility     9       15       19         4
## # … with 910 more rows

Visualization of the changes in sentiment across each of Austen’s books.

library(ggplot2)
# One panel per book; each bar is the net sentiment of one 80-line section.
jane_austen_sentiment %>%
  ggplot(mapping = aes(x = index, y = sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")

The following 3 chunks look into how the sentiment of the novel “Pride and Prejudice” differs depending on which lexicon is used.

# Keep only the words of "Pride & Prejudice"
pride_prejudice <- filter(tidy_books, book == "Pride & Prejudice")
pride_prejudice
## # A tibble: 122,204 × 4
##    book              linenumber chapter word     
##    <fct>                  <int>   <int> <chr>    
##  1 Pride & Prejudice          1       0 pride    
##  2 Pride & Prejudice          1       0 and      
##  3 Pride & Prejudice          1       0 prejudice
##  4 Pride & Prejudice          3       0 by       
##  5 Pride & Prejudice          3       0 jane     
##  6 Pride & Prejudice          3       0 austen   
##  7 Pride & Prejudice          7       1 chapter  
##  8 Pride & Prejudice          7       1 1        
##  9 Pride & Prejudice         10       1 it       
## 10 Pride & Prejudice         10       1 is       
## # … with 122,194 more rows
# AFINN: add up the signed word scores within each 80-line section
afinn <- inner_join(pride_prejudice, get_sentiments("afinn")) %>%
  group_by(index = linenumber %/% 80) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")
## Joining, by = "word"
afinn
## # A tibble: 163 × 3
##    index sentiment method
##    <dbl>     <dbl> <chr> 
##  1     0        29 AFINN 
##  2     1         0 AFINN 
##  3     2        20 AFINN 
##  4     3        30 AFINN 
##  5     4        62 AFINN 
##  6     5        66 AFINN 
##  7     6        60 AFINN 
##  8     7        18 AFINN 
##  9     8        84 AFINN 
## 10     9        26 AFINN 
## # … with 153 more rows
# Bing and NRC: label each word positive or negative, then compute
# positive minus negative word counts per method and 80-line section.
bing_and_nrc <- bind_rows(
  # Bing lexicon, used as is
  inner_join(pride_prejudice, get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  # NRC lexicon, keeping only its positive/negative categories
  inner_join(
    pride_prejudice,
    filter(get_sentiments("nrc"), sentiment %in% c("positive", "negative"))
  ) %>%
    mutate(method = "NRC")
) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
bing_and_nrc
## # A tibble: 326 × 5
##    method      index negative positive sentiment
##    <chr>       <dbl>    <int>    <int>     <int>
##  1 Bing et al.     0        7       21        14
##  2 Bing et al.     1       20       19        -1
##  3 Bing et al.     2       16       20         4
##  4 Bing et al.     3       19       31        12
##  5 Bing et al.     4       23       47        24
##  6 Bing et al.     5       15       49        34
##  7 Bing et al.     6       18       46        28
##  8 Bing et al.     7       23       33        10
##  9 Bing et al.     8       17       48        31
## 10 Bing et al.     9       22       40        18
## # … with 316 more rows
# Stack the per-section results of all methods and draw one panel per
# method, each with its own y scale.
ggplot(
  bind_rows(afinn, bing_and_nrc),
  aes(x = index, y = sentiment, fill = method)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")

Comparing the number of positive and negative words in the NRC and Bing lexicons

# Number of NRC entries tagged positive vs. negative
count(
  filter(get_sentiments("nrc"), sentiment %in% c("positive", "negative")),
  sentiment
)
## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   3316
## 2 positive   2308
  # Number of Bing entries per sentiment
  count(get_sentiments("bing"), sentiment)
## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005

Retrieving the most frequently used sentimental words.

# How often each Bing-labelled word occurs in tidy_books, most frequent
# first; each word carries its sentiment label.
bing_word_counts <- inner_join(tidy_books, get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
bing_word_counts
## # A tibble: 2,585 × 3
##    word     sentiment     n
##    <chr>    <chr>     <int>
##  1 miss     negative   1855
##  2 well     positive   1523
##  3 good     positive   1380
##  4 great    positive    981
##  5 like     positive    725
##  6 better   positive    639
##  7 enough   positive    613
##  8 happy    positive    534
##  9 love     positive    495
## 10 pleasure positive    462
## # … with 2,575 more rows

Visualizing the top 10 most frequently used negative and positive words across all of Austen’s books (the counts come from the full tidy_books data set, not only “Pride and Prejudice”).

# Bar chart of the 10 most frequent words for each sentiment in
# bing_word_counts.
# slice_max() replaces the superseded top_n(): the ranking column is named
# explicitly instead of being guessed from the last column, which also
# removes the "Selecting by n" message. Ties are still kept.
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 10) %>%
  ungroup() %>%
  # order the bars by frequency rather than alphabetically
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()

Customizing stop words. In these novels the word “miss” is a title for young, unmarried women rather than a verb meaning to long for someone or something, so it is added to a custom stop-word list.

# Stop-word list extended with "miss"; the extra row is tagged with the
# lexicon name "custom" and placed ahead of the standard stop_words rows.
custom_stop_words <- bind_rows(
  tibble(word = "miss", lexicon = "custom"),
  stop_words
)

custom_stop_words
## # A tibble: 1,150 × 2
##    word        lexicon
##    <chr>       <chr>  
##  1 miss        custom 
##  2 a           SMART  
##  3 a's         SMART  
##  4 able        SMART  
##  5 about       SMART  
##  6 above       SMART  
##  7 according   SMART  
##  8 accordingly SMART  
##  9 across      SMART  
## 10 actually    SMART  
## # … with 1,140 more rows

Creating a word cloud.

library(wordcloud)
## Loading required package: RColorBrewer
# Word cloud of word frequencies after removing stop_words; at most 100
# words are drawn.
anti_join(tidy_books, stop_words) %>%
  count(word) %>%
  with(wordcloud(words = word, freq = n, max.words = 100))
## Joining, by = "word"

comparing the most frequent positive and negative words via word cloud.

library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
# Comparison cloud of negative vs. positive words.
# acast() reshapes the counts into a word-by-sentiment matrix, which is the
# input comparison.cloud() receives; word/sentiment pairs that do not occur
# are filled with 0.
inner_join(tidy_books, get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(formula = word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"), max.words = 100)
## Joining, by = "word"

Working with sentences…

# Tokenise "Pride & Prejudice" into sentences instead of single words
p_and_p_sentences <- unnest_tokens(
  tibble(text = prideprejudice),
  output = sentence,
  input = text,
  token = "sentences"
)

# Inspect the second tokenised sentence
p_and_p_sentences[["sentence"]][2]
## [1] "by jane austen"

…and chapters

# Split each book into chunks at the chapter-heading pattern, so that every
# row of the result holds one chunk of text.
austen_chapters <- group_by(austen_books(), book) %>%
  unnest_tokens(
    output = chapter,
    input = text,
    token = "regex",
    pattern = "Chapter|CHAPTER [\\dIVXLC]"
  ) %>%
  ungroup()

# Number of chunks per book
count(austen_chapters, book, name = "chapters")
## # A tibble: 6 × 2
##   book                chapters
##   <fct>                  <int>
## 1 Sense & Sensibility       51
## 2 Pride & Prejudice         62
## 3 Mansfield Park            49
## 4 Emma                      56
## 5 Northanger Abbey          32
## 6 Persuasion                25

Finding, for each book, the chapter with the highest proportion of negative words.

# Negative half of the Bing lexicon
bingnegative <- filter(get_sentiments("bing"), sentiment == "negative")

# Total number of words in every chapter of every book
wordcounts <- summarize(group_by(tidy_books, book, chapter), words = n())
## `summarise()` has grouped output by 'book'. You can override using the
## `.groups` argument.
# For each book, find the chapter with the highest share of negative words.
# Changes from the earlier version, with the same result:
#   * the join key is stated explicitly (no "Joining, by" message);
#   * summarize() drops its grouping and the per-book grouping needed for
#     the ranking is re-established explicitly, instead of relying on the
#     grouping summarize() happens to leave behind;
#   * slice_max() replaces the superseded top_n() and names the ranking
#     column instead of silently using the last one.
tidy_books %>%
  # keep only word occurrences that appear in the negative lexicon
  semi_join(bingnegative, by = "word") %>%
  group_by(book, chapter) %>%
  summarize(negativewords = n(), .groups = "drop") %>%
  left_join(wordcounts, by = c("book", "chapter")) %>%
  mutate(ratio = negativewords / words) %>%
  # chapter 0 is the text before the first chapter heading
  filter(chapter != 0) %>%
  group_by(book) %>%
  slice_max(order_by = ratio, n = 1) %>%
  ungroup()
## # A tibble: 6 × 5
##   book                chapter negativewords words  ratio
##   <fct>                 <int>         <int> <int>  <dbl>
## 1 Sense & Sensibility      43           161  3405 0.0473
## 2 Pride & Prejudice        34           111  2104 0.0528
## 3 Mansfield Park           46           173  3685 0.0469
## 4 Emma                     15           151  3340 0.0452
## 5 Northanger Abbey         21           149  2982 0.0500
## 6 Persuasion                4            62  1807 0.0343

Analyzing Les Miserables

The rest of this assignment extends beyond the reproduced project to include a sentiment analysis of the novel Les Miserables.

The gutenbergr package is used to retrieve the novel, which is then tidied into a workable dataset.

library(gutenbergr)

# Download Les Miserables (Project Gutenberg ID 135).
# With the automatically selected mirror (http://aleph.gutenberg.org) the
# download failed with only a warning, which left every downstream result
# as an empty tibble. A mirror is therefore requested explicitly, and the
# script stops immediately if no text was retrieved.
vhugo <- gutenberg_download(
  135,
  mirror = "http://mirrors.xmission.com/gutenberg/"
)
if (nrow(vhugo) == 0) {
  stop(
    "Les Miserables (Gutenberg ID 135) could not be downloaded; ",
    "try another mirror from https://www.gutenberg.org/MIRRORS.ALL",
    call. = FALSE
  )
}
# Tidy the downloaded text: number the lines, keep a running chapter
# counter, split into one word per row, drop stop words and the ID column.
les_mis <- mutate(
  vhugo,
  linenumber = row_number(),
  chapter = cumsum(
    str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE))
  )
) %>%
  unnest_tokens(output = word, input = text) %>%
  anti_join(stop_words) %>%
  select(-gutenberg_id)
## Joining, by = "word"
les_mis
## # A tibble: 0 × 3
## # … with 3 variables: linenumber <int>, chapter <int>, word <chr>

Retrieving the sentiment score for each chapter

# Chapter score = sum of the AFINN values of the chapter's scored words
les_mis_sentiment <- inner_join(les_mis, get_sentiments("afinn")) %>%
  group_by(chapter) %>%
  summarise(sentiment = sum(value))
## Joining, by = "word"
les_mis_sentiment
## # A tibble: 0 × 2
## # … with 2 variables: chapter <int>, sentiment <dbl>

Visualizing how the sentiment score changes across the 365 chapters of Les Miserables

  # One bar per chapter showing its summed sentiment score; title centred.
  les_mis_sentiment %>%
  ggplot(mapping = aes(x = chapter, y = sentiment)) +
  geom_col() +
  labs(
    title = "Sentiment Analysis of Les Misérables",
    x = "Chapter ",
    y = "Sentiment Score"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

The 10 most positive chapters based on their sentiment scores

# First ten chapters after sorting by score, highest first
head(arrange(les_mis_sentiment, desc(sentiment)), 10)
## # A tibble: 0 × 2
## # … with 2 variables: chapter <int>, sentiment <dbl>

Analyzing the most positive words of the most positive chapter

# Ten highest-scoring AFINN rows of chapter 351.
# NOTE(review): rows are word occurrences, so a word that appears several
# times in the chapter can take up more than one of the ten rows. Confirm
# that this is intended.
lesmis_pos_sentiment <- filter(les_mis, chapter == 351) %>%
  inner_join(get_sentiments("afinn")) %>%
  arrange(desc(value)) %>%
  head(10)
## Joining, by = "word"
lesmis_pos_sentiment
## # A tibble: 0 × 4
## # … with 4 variables: linenumber <int>, chapter <int>, word <chr>, value <dbl>

Visualizing

# Horizontal bar chart of the selected positive words and their AFINN
# values; the plot is stored here and printed later.
lesmis_pos_sentiment_viz <- ggplot(
  lesmis_pos_sentiment,
  aes(x = word, y = value)
) +
  geom_col(fill = "#04D6D9") +
  labs(
    title = "Analysis of the 10 most Positive Words from the most Positive Chapter in Les Miserables",
    x = "Positive Words ",
    y = "Sentiment Score"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) +
  coord_flip()

The 10 most negative chapters based on their sentiment scores

# Last ten chapters after sorting by score, highest first, i.e. the ten
# lowest-scoring chapters, ending with the lowest.
tail(arrange(les_mis_sentiment, desc(sentiment)), 10)
## # A tibble: 0 × 2
## # … with 2 variables: chapter <int>, sentiment <dbl>

Analyzing the most negative words of the most negative chapter

# Ten lowest-scoring AFINN rows of chapter 220 (the tail of a
# descending sort).
# NOTE(review): rows are word occurrences, so a word that appears several
# times in the chapter can take up more than one of the ten rows. Confirm
# that this is intended.
lesmis_neg_sentiment <- filter(les_mis, chapter == 220) %>%
  inner_join(get_sentiments("afinn")) %>%
  arrange(desc(value)) %>%
  tail(10)
## Joining, by = "word"
lesmis_neg_sentiment
## # A tibble: 0 × 4
## # … with 4 variables: linenumber <int>, chapter <int>, word <chr>, value <dbl>

Visualization

# Horizontal bar chart of the selected negative words and their AFINN
# values, with the value axis reversed.
lesmis_neg_sentiment_viz <- ggplot(
  lesmis_neg_sentiment,
  aes(x = word, y = value)
) +
  geom_col(fill = "#D92104") +
  labs(
    title = "Analysis of the 10 most Negative Words in the Most Negative Chapter in Les Miserables",
    x = "Negative Words ",
    y = "Sentiment Score"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_y_reverse() +
  coord_flip()

# Print both stored plots, positive first
lesmis_pos_sentiment_viz

lesmis_neg_sentiment_viz

Conclusion

The analysis of Les Miserables identifies the ten most positive and the ten most negative chapters of the novel. The 10 most positive words in the most positive chapter (351) suggest a theme of success and happiness, while the 10 most negative words in the most negative chapter (220) suggest themes of oppression, suffering, and injustice.