Assignment 10

# Example code from "Text Mining with R", Chapter 2
# Citation: Silge, J., & Robinson, D. (2017). Text Mining with R: A Tidy Approach. O'Reilly Media.

# What are the most common joy words in Emma? 
library(janeaustenr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(stringr)
library(tidytext)
library(textdata)
library(tidyr)
library(ggplot2)

# Build a one-word-per-row table of all Austen novels. Before tokenising, each
# line is tagged (within its book) with its line number and a running chapter
# counter that increases on every line starting with a chapter heading.
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(linenumber = row_number()) %>%
  mutate(chapter = cumsum(str_detect(
    text,
    regex("^chapter [\\divxlc]", ignore_case = TRUE)
  ))) %>%
  ungroup() %>%
  unnest_tokens(output = word, input = text)

head(tidy_books)

## # A tibble: 6 × 4
##   book                linenumber chapter word       
##   <fct>                    <int>   <int> <chr>      
## 1 Sense & Sensibility          1       0 sense      
## 2 Sense & Sensibility          1       0 and        
## 3 Sense & Sensibility          1       0 sensibility
## 4 Sense & Sensibility          3       0 by         
## 5 Sense & Sensibility          3       0 jane       
## 6 Sense & Sensibility          3       0 austen

# Now that the text is in a tidy format with one word per row, we are ready to do the sentiment analysis. First, let’s use the NRC lexicon and filter() for the joy words. Next, let’s filter() the data frame with the text from the books for the words from Emma and then use inner_join() to perform the sentiment analysis.

# Keep only the NRC lexicon entries labelled with the "joy" sentiment.
nrc_joy <- filter(get_sentiments("nrc"), sentiment == "joy")
nrc_joy

## # A tibble: 687 × 2
##    word          sentiment
##    <chr>         <chr>    
##  1 absolution    joy      
##  2 abundance     joy      
##  3 abundant      joy      
##  4 accolade      joy      
##  5 accompaniment joy      
##  6 accomplish    joy      
##  7 accomplished  joy      
##  8 achieve       joy      
##  9 achievement   joy      
## 10 acrobat       joy      
## # ℹ 677 more rows

# Frequency of each NRC "joy" word in Emma, most frequent first.
filter(tidy_books, book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word) %>%
  arrange(desc(n))

## Joining with `by = join_by(word)`

## # A tibble: 301 × 2
##    word          n
##    <chr>     <int>
##  1 good        359
##  2 friend      166
##  3 hope        143
##  4 happy       125
##  5 love        117
##  6 deal         92
##  7 found        92
##  8 present      89
##  9 kind         82
## 10 happiness    76
## # ℹ 291 more rows

head(tidy_books)

## # A tibble: 6 × 4
##   book                linenumber chapter word       
##   <fct>                    <int>   <int> <chr>      
## 1 Sense & Sensibility          1       0 sense      
## 2 Sense & Sensibility          1       0 and        
## 3 Sense & Sensibility          1       0 sensibility
## 4 Sense & Sensibility          3       0 by         
## 5 Sense & Sensibility          3       0 jane       
## 6 Sense & Sensibility          3       0 austen

# We then use pivot_wider() so that we have negative and positive sentiment in separate columns, and lastly calculate a net sentiment (positive - negative)

# Net Bing sentiment per 80-line section of each novel:
# count positive and negative words per (book, section), spread them into
# separate columns, and subtract.
jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  mutate(index = linenumber %/% 80) %>%
  count(book, index, sentiment) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) %>%
  mutate(sentiment = positive - negative)

## Joining with `by = join_by(word)`

## Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 435434 of `x` matches multiple rows in `y`.
## ℹ Row 5051 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

head(jane_austen_sentiment)

## # A tibble: 6 × 5
##   book                index negative positive sentiment
##   <fct>               <dbl>    <int>    <int>     <int>
## 1 Sense & Sensibility     0       16       32        16
## 2 Sense & Sensibility     1       19       53        34
## 3 Sense & Sensibility     2       12       31        19
## 4 Sense & Sensibility     3       15       31        16
## 5 Sense & Sensibility     4       16       34        18
## 6 Sense & Sensibility     5       16       51        35

# Now we can plot these sentiment scores across the plot trajectory of each novel.

# One panel per novel; each bar is the net sentiment of one 80-line section.
jane_austen_sentiment %>%
  ggplot(aes(x = index, y = sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(book), ncol = 2, scales = "free_x")

Comparing the three sentiment dictionaries:

# Keep only the words that belong to Pride & Prejudice.

pride_prejudice <- filter(tidy_books, book == "Pride & Prejudice")

head(pride_prejudice)

## # A tibble: 6 × 4
##   book              linenumber chapter word     
##   <fct>                  <int>   <int> <chr>    
## 1 Pride & Prejudice          1       0 pride    
## 2 Pride & Prejudice          1       0 and      
## 3 Pride & Prejudice          1       0 prejudice
## 4 Pride & Prejudice          3       0 by       
## 5 Pride & Prejudice          3       0 jane     
## 6 Pride & Prejudice          3       0 austen

# Now create a net sentiment for each part of the book using each of the lexicons

# AFINN assigns each word a numeric value; sum those values per 80-line
# section of Pride & Prejudice and label the result with its method.
afinn <- pride_prejudice %>%
  inner_join(get_sentiments("afinn")) %>%
  mutate(index = linenumber %/% 80) %>%
  group_by(index) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")

## Joining with `by = join_by(word)`

head(afinn)

## # A tibble: 6 × 3
##   index sentiment method
##   <dbl>     <dbl> <chr> 
## 1     0        29 AFINN 
## 2     1         0 AFINN 
## 3     2        20 AFINN 
## 4     3        30 AFINN 
## 5     4        62 AFINN 
## 6     5        66 AFINN

# Bing and NRC only label words as positive/negative, so net sentiment is
# (positive count - negative count) per 80-line section, computed per method.
# The list names become the `method` column via `.id`.
bing_and_nrc <- bind_rows(
  list(
    "Bing et al." = inner_join(pride_prejudice, get_sentiments("bing")),
    "NRC" = inner_join(
      pride_prejudice,
      filter(get_sentiments("nrc"), sentiment %in% c("positive", "negative"))
    )
  ),
  .id = "method"
) %>%
  mutate(index = linenumber %/% 80) %>%
  count(method, index, sentiment) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) %>%
  mutate(sentiment = positive - negative)

## Joining with `by = join_by(word)`
## Joining with `by = join_by(word)`

## Warning in inner_join(., get_sentiments("nrc") %>% filter(sentiment %in% : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 215 of `x` matches multiple rows in `y`.
## ℹ Row 5178 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

head(bing_and_nrc)

## # A tibble: 6 × 5
##   method      index negative positive sentiment
##   <chr>       <dbl>    <int>    <int>     <int>
## 1 Bing et al.     0        7       21        14
## 2 Bing et al.     1       20       19        -1
## 3 Bing et al.     2       16       20         4
## 4 Bing et al.     3       19       31        12
## 5 Bing et al.     4       23       47        24
## 6 Bing et al.     5       15       49        34

# Plot net sentiment:

# Stack the three methods and draw one panel per method; the y-axis is free
# because the methods score on different scales.
bind_rows(afinn, bing_and_nrc) %>%
  ggplot(aes(x = index, y = sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(method), ncol = 1, scales = "free_y")

# How many positive and negative words does each lexicon contain?
# NRC: restrict to the two polarity labels first.
get_sentiments("nrc") %>%
  filter(sentiment == "positive" | sentiment == "negative") %>%
  count(sentiment)

## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   3316
## 2 positive   2308

# Bing: every entry is already positive or negative.
count(get_sentiments("bing"), sentiment)

## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005

Most common positive and negative words:

# Count every (word, sentiment) pair across all novels, largest count first.
bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment) %>%
  arrange(desc(n)) %>%
  ungroup()

## Joining with `by = join_by(word)`

## Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 435434 of `x` matches multiple rows in `y`.
## ℹ Row 5051 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

head(bing_word_counts)

## # A tibble: 6 × 3
##   word   sentiment     n
##   <chr>  <chr>     <int>
## 1 miss   negative   1855
## 2 well   positive   1523
## 3 good   positive   1380
## 4 great  positive    981
## 5 like   positive    725
## 6 better positive    639

# Ten most frequent words per sentiment (ties kept), drawn as horizontal bars
# ordered by count, one panel per sentiment.
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = n, y = word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(sentiment), scales = "free_y") +
  xlab("Contribution to sentiment") +
  ylab(NULL)

# Put the word "miss" in front of the standard stop-word list, tagged with a
# "custom" lexicon label.

custom_stop_words <- bind_rows(
  tibble(word = "miss", lexicon = "custom"),
  stop_words
)

head(custom_stop_words)

## # A tibble: 6 × 2
##   word  lexicon
##   <chr> <chr>  
## 1 miss  custom 
## 2 a     SMART  
## 3 a's   SMART  
## 4 able  SMART  
## 5 about SMART  
## 6 above SMART

Wordclouds:

library(wordcloud)

## Loading required package: RColorBrewer

# Word cloud of the 100 most common non-stop-words across all novels.
with(
  count(anti_join(tidy_books, stop_words), word),
  wordcloud(words = word, freq = n, max.words = 100)
)

## Joining with `by = join_by(word)`

library(reshape2)

## 
## Attaching package: 'reshape2'

## The following object is masked from 'package:tidyr':
## 
##     smiths

# Reshape the (word, sentiment) counts into a word-by-sentiment matrix and
# draw a comparison cloud contrasting the two sentiment columns.
tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment) %>%
  arrange(desc(n)) %>%
  acast(formula = word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    colors = c("gray20", "gray80"),
    max.words = 100
  )

## Joining with `by = join_by(word)`

## Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 435434 of `x` matches multiple rows in `y`.
## ℹ Row 5051 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

Looking at units beyond just words:

# Tokenise the raw Pride & Prejudice text with the "sentences" tokenizer.
p_and_p_sentences <- unnest_tokens(
  tibble(text = prideprejudice),
  output = sentence,
  input = text,
  token = "sentences"
)
head(p_and_p_sentences)

## # A tibble: 6 × 1
##   sentence                                                               
##   <chr>                                                                  
## 1 pride and prejudice                                                    
## 2 by jane austen                                                         
## 3 chapter 1                                                              
## 4 it is a truth universally acknowledged, that a single man in possession
## 5 of a good fortune, must be in want of a wife.                          
## 6 however little known the feelings or views of such a man may be on his

p_and_p_sentences$sentence[2]

## [1] "by jane austen"

# Split each novel into chunks at chapter headings using a regex tokenizer.
# Alternation binds loosest in the pattern, so it matches either the bare
# word "Chapter" or "CHAPTER " followed by a digit or roman-numeral letter.
austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(
    output = chapter,
    input = text,
    token = "regex",
    pattern = "Chapter|CHAPTER [\\dIVXLC]"
  ) %>%
  ungroup()

head(austen_chapters)

## # A tibble: 6 × 2
##   book                chapter                                                   
##   <fct>               <chr>                                                     
## 1 Sense & Sensibility "sense and sensibility\n\nby jane austen\n\n(1811)\n\n\n\…
## 2 Sense & Sensibility "\n\n\nthe family of dashwood had long been settled in su…
## 3 Sense & Sensibility "\n\n\nmrs. john dashwood now installed herself mistress …
## 4 Sense & Sensibility "\n\n\nmrs. dashwood remained at norland several months; …
## 5 Sense & Sensibility "\n\n\n\"what a pity it is, elinor,\" said marianne, \"th…
## 6 Sense & Sensibility "\n\n\nno sooner was her answer dispatched, than mrs. das…

# Number of chunks produced for each book.
count(austen_chapters, book, name = "chapters")

## # A tibble: 6 × 2
##   book                chapters
##   <fct>                  <int>
## 1 Sense & Sensibility       51
## 2 Pride & Prejudice         62
## 3 Mansfield Park            49
## 4 Emma                      56
## 5 Northanger Abbey          32
## 6 Persuasion                25

head(austen_chapters)

## # A tibble: 6 × 2
##   book                chapter                                                   
##   <fct>               <chr>                                                     
## 1 Sense & Sensibility "sense and sensibility\n\nby jane austen\n\n(1811)\n\n\n\…
## 2 Sense & Sensibility "\n\n\nthe family of dashwood had long been settled in su…
## 3 Sense & Sensibility "\n\n\nmrs. john dashwood now installed herself mistress …
## 4 Sense & Sensibility "\n\n\nmrs. dashwood remained at norland several months; …
## 5 Sense & Sensibility "\n\n\n\"what a pity it is, elinor,\" said marianne, \"th…
## 6 Sense & Sensibility "\n\n\nno sooner was her answer dispatched, than mrs. das…

# Negative half of the Bing lexicon.
bingnegative <- filter(get_sentiments("bing"), sentiment == "negative")
head(bingnegative)

## # A tibble: 6 × 2
##   word       sentiment
##   <chr>      <chr>    
## 1 2-faces    negative 
## 2 abnormal   negative 
## 3 abolish    negative 
## 4 abominable negative 
## 5 abominably negative 
## 6 abominate  negative

# Total number of words in every (book, chapter). The result stays grouped by
# `book`, since summarising drops only the last grouping level.
wordcounts <- summarize(
  group_by(tidy_books, book, chapter),
  words = n()
)

## `summarise()` has grouped output by 'book'. You can override using the
## `.groups` argument.

head(wordcounts)

## # A tibble: 6 × 3
## # Groups:   book [1]
##   book                chapter words
##   <fct>                 <int> <int>
## 1 Sense & Sensibility       0     7
## 2 Sense & Sensibility       1  1571
## 3 Sense & Sensibility       2  1970
## 4 Sense & Sensibility       3  1538
## 5 Sense & Sensibility       4  1952
## 6 Sense & Sensibility       5  1030

# For each book, find the chapter with the highest share of negative words.
tidy_books %>%
  semi_join(bingnegative) %>%
  group_by(book, chapter) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts, by = c("book", "chapter")) %>%
  # Chapter 0 is the text before the first chapter heading; leave it out.
  filter(chapter != 0) %>%
  mutate(ratio = negativewords / words) %>%
  # The table is still grouped by book here, so this keeps one top row
  # (plus ties) per book.
  slice_max(order_by = ratio, n = 1) %>%
  ungroup()

## Joining with `by = join_by(word)`
## `summarise()` has grouped output by 'book'. You can override using the
## `.groups` argument.

## # A tibble: 6 × 5
##   book                chapter negativewords words  ratio
##   <fct>                 <int>         <int> <int>  <dbl>
## 1 Sense & Sensibility      43           161  3405 0.0473
## 2 Pride & Prejudice        34           111  2104 0.0528
## 3 Mansfield Park           46           173  3685 0.0469
## 4 Emma                     15           151  3340 0.0452
## 5 Northanger Abbey         21           149  2982 0.0500
## 6 Persuasion                4            62  1807 0.0343

head(tidy_books)

## # A tibble: 6 × 4
##   book                linenumber chapter word       
##   <fct>                    <int>   <int> <chr>      
## 1 Sense & Sensibility          1       0 sense      
## 2 Sense & Sensibility          1       0 and        
## 3 Sense & Sensibility          1       0 sensibility
## 4 Sense & Sensibility          3       0 by         
## 5 Sense & Sensibility          3       0 jane       
## 6 Sense & Sensibility          3       0 austen

Now, let’s use the gutenbergr library to analyze another book, “Moby Dick” by Herman Melville, using a similar approach.

library(gutenbergr)

# Download Project Gutenberg text 2701 (Moby Dick) from the given mirror.
moby_dick <- gutenberg_download(
  gutenberg_id = 2701,
  mirror = "http://www.gutenberg.lib.md.us/"
)
head(moby_dick)

## # A tibble: 6 × 2
##   gutenberg_id text                
##          <int> <chr>               
## 1         2701 "MOBY-DICK;"        
## 2         2701 ""                  
## 3         2701 "or, THE WHALE."    
## 4         2701 ""                  
## 5         2701 "By Herman Melville"
## 6         2701 ""

# Remove every character that is neither alphanumeric nor whitespace, then
# tokenise into one word per row. Deleting (rather than spacing out)
# punctuation joins hyphenated words, e.g. "MOBY-DICK" becomes "mobydick".
tidy_moby_dick <- moby_dick %>%
  mutate(text = str_remove_all(text, "[^[:alnum:][:space:]]")) %>%
  unnest_tokens(output = word, input = text)
head(tidy_moby_dick)

## # A tibble: 6 × 2
##   gutenberg_id word    
##          <int> <chr>   
## 1         2701 mobydick
## 2         2701 or      
## 3         2701 the     
## 4         2701 whale   
## 5         2701 by      
## 6         2701 herman

# Sentiment analysis of Moby Dick.
# Start with the NRC "joy" words and how often each one occurs.
nrc_joy <- filter(get_sentiments("nrc"), sentiment == "joy")

tidy_moby_dick %>%
  inner_join(nrc_joy) %>%
  count(word) %>%
  arrange(desc(n))

## Joining with `by = join_by(word)`

## # A tibble: 409 × 2
##    word        n
##    <chr>   <int>
##  1 good      195
##  2 god       118
##  3 found     115
##  4 true       85
##  5 sun        83
##  6 present    79
##  7 green      48
##  8 mighty     47
##  9 art        41
## 10 hope       34
## # ℹ 399 more rows

# Net Bing sentiment per chunk. The chunk index is derived from the row
# number of the joined table, so each chunk is a run of 80 consecutive
# sentiment-bearing words (not 80 lines of text).
moby_dick_sentiment <- tidy_moby_dick %>%
  inner_join(get_sentiments("bing")) %>%
  mutate(index = row_number() %/% 80) %>%
  count(index, sentiment) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) %>%
  mutate(sentiment = positive - negative)

## Joining with `by = join_by(word)`

## Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 65454 of `x` matches multiple rows in `y`.
## ℹ Row 5125 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

head(moby_dick_sentiment)

## # A tibble: 6 × 4
##   index negative positive sentiment
##   <dbl>    <int>    <int>     <int>
## 1     0       43       36        -7
## 2     1       40       40         0
## 3     2       40       40         0
## 4     3       28       52        24
## 5     4       48       32       -16
## 6     5       49       31       -18

# Bar chart of net sentiment per chunk.
moby_dick_sentiment %>%
  ggplot(aes(x = index, y = sentiment)) +
  geom_col(show.legend = FALSE) +
  ggtitle("Sentiment Analysis of Moby Dick") +
  xlab("Index") +
  ylab("Sentiment Score")

# Compare sentiment dictionaries:

# AFINN lexicon: sum the word values over runs of 80 consecutive matched
# words (the index comes from the row number of the joined table).
afinn <- tidy_moby_dick %>%
  inner_join(get_sentiments("afinn")) %>%
  mutate(index = row_number() %/% 80) %>%
  group_by(index) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")

## Joining with `by = join_by(word)`

# Bing and NRC lexicons
# Net sentiment (positive count - negative count) per chunk of 80 matched
# words, computed separately for each lexicon.
bing_and_nrc <- bind_rows(
  tidy_moby_dick %>%
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  tidy_moby_dick %>%
    inner_join(get_sentiments("nrc") %>% 
                 filter(sentiment %in% c("positive", "negative"))) %>%
    mutate(method = "NRC")) %>%
  # Number the rows within each method. Calling row_number() directly inside
  # count() would number the stacked Bing + NRC rows as one sequence, so the
  # NRC chunk indices would start where the Bing ones end and the two methods
  # would not share an x-range in the comparison plot.
  group_by(method) %>%
  mutate(index = row_number() %/% 80) %>%
  ungroup() %>%
  count(method, index, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)

## Joining with `by = join_by(word)`

## Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 65454 of `x` matches multiple rows in `y`.
## ℹ Row 5125 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

## Joining with `by = join_by(word)`

## Warning in inner_join(., get_sentiments("nrc") %>% filter(sentiment %in% : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 882 of `x` matches multiple rows in `y`.
## ℹ Row 5175 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

# One panel per method, free y-axis because the methods score on different
# scales.
bind_rows(afinn, bing_and_nrc) %>%
  ggplot(aes(x = index, y = sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(method), ncol = 1, scales = "free_y") +
  ggtitle("Sentiment Analysis Comparison for Moby Dick") +
  xlab("Index") +
  ylab("Sentiment Score")

Now, let’s use the sentimentr package to perform sentiment analysis on “Moby Dick”:

library(sentimentr)

library(gutenbergr)
# Reuse the copy of Moby Dick already held in `moby_dick` when it exists;
# only contact the mirror again if the object is missing (for example when
# this section is run on its own). This avoids a redundant network download.
if (!exists("moby_dick")) {
  moby_dick <- gutenberg_download(2701, mirror = "http://www.gutenberg.lib.md.us/")
}
head(moby_dick)

## # A tibble: 6 × 2
##   gutenberg_id text                
##          <int> <chr>               
## 1         2701 "MOBY-DICK;"        
## 2         2701 ""                  
## 3         2701 "or, THE WHALE."    
## 4         2701 ""                  
## 5         2701 "By Herman Melville"
## 6         2701 ""

# Prepare the text: remove every character that is neither alphanumeric nor
# whitespace, then tokenise with the "sentences" tokenizer.
# NOTE(review): sentence-ending punctuation is removed before the sentence
# tokenizer runs, so it cannot use those marks as boundaries - confirm that
# this is intended.
tidy_moby_dick <- moby_dick %>%
  mutate(text = str_remove_all(text, "[^[:alnum:][:space:]]")) %>%
  unnest_tokens(output = sentence, input = text, token = "sentences")
head(tidy_moby_dick)

## # A tibble: 6 × 2
##   gutenberg_id sentence                              
##          <int> <chr>                                 
## 1         2701 mobydick                              
## 2         2701 or the whale                          
## 3         2701 by herman melville                    
## 4         2701 contents                              
## 5         2701 etymology                             
## 6         2701 extracts supplied by a subsublibrarian

# perform sentiment analysis using sentimentr:

# calculate sentiment for each row of `tidy_moby_dick`
# Split the text with get_sentences() first (as sentimentr recommends, which
# also avoids its boundary-disambiguation warning), then score with
# sentiment_by(), which returns exactly one row per input element. Plain
# sentiment() returns one row per detected sentence, so its result cannot be
# attached to the data safely: the lengths differ as soon as any row is split
# into more than one sentence.
sentiment_scores <- sentiment_by(get_sentences(tidy_moby_dick$sentence))

# add sentiment scores to the original data
# `ave_sentiment` is the per-element average; for a row holding a single
# sentence this should equal that sentence's own score.
tidy_moby_dick <- tidy_moby_dick %>%
  mutate(sentiment = sentiment_scores$ave_sentiment)
head(tidy_moby_dick)

## # A tibble: 6 × 3
##   gutenberg_id sentence                               sentiment
##          <int> <chr>                                      <dbl>
## 1         2701 mobydick                                       0
## 2         2701 or the whale                                   0
## 3         2701 by herman melville                             0
## 4         2701 contents                                       0
## 5         2701 etymology                                      0
## 6         2701 extracts supplied by a subsublibrarian         0

# Sum the per-row sentiment scores over runs of 80 consecutive rows.
moby_dick_sentiment <- tidy_moby_dick %>%
  group_by(index = row_number() %/% 80) %>%
  summarise(sentiment = sum(sentiment))
head(moby_dick_sentiment)

## # A tibble: 6 × 2
##   index sentiment
##   <dbl>     <dbl>
## 1     0     0.287
## 2     1     1.24 
## 3     2    -0.424
## 4     3     0.223
## 5     4     0.201
## 6     5    -1.25

# Bar chart of the summed sentimentr scores per chunk.
moby_dick_sentiment %>%
  ggplot(aes(x = index, y = sentiment)) +
  geom_col(show.legend = FALSE) +
  ggtitle("Sentiment Analysis of Moby Dick") +
  xlab("Index") +
  ylab("Sentiment Score")

Conclusion:

The graph above shows the emotional ups and downs in Moby Dick:

Big Swings: The emotions change a lot, with some parts feeling positive and others negative, creating a rollercoaster of feelings.
Mostly Negative: There are more dips into negative emotions, which makes sense since Moby Dick is a story with dark themes, like revenge and obsession.
Some Calm Sections: Many points are close to zero, which might be more neutral parts of the story where things aren’t very emotional.
Positive Moments: There are a few spots with higher, positive emotions, possibly showing lighter moments in the story, like friendship or breaks in the action.

Overall, the emotional tone of Moby Dick swings between positive and negative, with more leaning toward the negative side.

Assignment 10

Inna Yedzinovich

2024-11-08