Assignment 10A Code Base

Author

Long Lin

Overview

First I will get the example from Chapter 2 running first.

Reproducing the Base Example

Taken from:

Silge, J., & Robinson, D. (2017). “Sentiment Analysis with Tidy Data.” Text Mining with R. Retrieved from https://www.tidytextmining.com/sentiment.html.

library(tidytext)

get_sentiments("afinn")
# A tibble: 2,477 × 2
   word       value
   <chr>      <dbl>
 1 abandon       -2
 2 abandoned     -2
 3 abandons      -2
 4 abducted      -2
 5 abduction     -2
 6 abductions    -2
 7 abhor         -3
 8 abhorred      -3
 9 abhorrent     -3
10 abhors        -3
# ℹ 2,467 more rows
get_sentiments("bing")
# A tibble: 6,786 × 2
   word        sentiment
   <chr>       <chr>    
 1 2-faces     negative 
 2 abnormal    negative 
 3 abolish     negative 
 4 abominable  negative 
 5 abominably  negative 
 6 abominate   negative 
 7 abomination negative 
 8 abort       negative 
 9 aborted     negative 
10 aborts      negative 
# ℹ 6,776 more rows
get_sentiments("nrc")
# A tibble: 13,872 × 2
   word        sentiment
   <chr>       <chr>    
 1 abacus      trust    
 2 abandon     fear     
 3 abandon     negative 
 4 abandon     sadness  
 5 abandoned   anger    
 6 abandoned   fear     
 7 abandoned   negative 
 8 abandoned   sadness  
 9 abandonment anger    
10 abandonment fear     
# ℹ 13,862 more rows
library(janeaustenr)
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(stringr)

tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(str_detect(text, 
                                regex("^chapter [\\divxlc]", 
                                      ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)
nrc_joy <- get_sentiments("nrc") %>% 
  filter(sentiment == "joy")

tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
Joining with `by = join_by(word)`
# A tibble: 301 × 2
   word          n
   <chr>     <int>
 1 good        359
 2 friend      166
 3 hope        143
 4 happy       125
 5 love        117
 6 deal         92
 7 found        92
 8 present      89
 9 kind         82
10 happiness    76
# ℹ 291 more rows
library(tidyr)

jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>% 
  mutate(sentiment = positive - negative)
Joining with `by = join_by(word)`
Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 435434 of `x` matches multiple rows in `y`.
ℹ Row 5051 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.
library(ggplot2)

ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")

pride_prejudice <- tidy_books %>% 
  filter(book == "Pride & Prejudice")

pride_prejudice
# A tibble: 122,204 × 4
   book              linenumber chapter word     
   <fct>                  <int>   <int> <chr>    
 1 Pride & Prejudice          1       0 pride    
 2 Pride & Prejudice          1       0 and      
 3 Pride & Prejudice          1       0 prejudice
 4 Pride & Prejudice          3       0 by       
 5 Pride & Prejudice          3       0 jane     
 6 Pride & Prejudice          3       0 austen   
 7 Pride & Prejudice          7       1 chapter  
 8 Pride & Prejudice          7       1 1        
 9 Pride & Prejudice         10       1 it       
10 Pride & Prejudice         10       1 is       
# ℹ 122,194 more rows
afinn <- pride_prejudice %>% 
  inner_join(get_sentiments("afinn")) %>% 
  group_by(index = linenumber %/% 80) %>% 
  summarise(sentiment = sum(value)) %>% 
  mutate(method = "AFINN")
Joining with `by = join_by(word)`
bing_and_nrc <- bind_rows(
  pride_prejudice %>% 
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  pride_prejudice %>% 
    inner_join(get_sentiments("nrc") %>% 
                 filter(sentiment %in% c("positive", 
                                         "negative"))
    ) %>%
    mutate(method = "NRC")) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment,
              values_from = n,
              values_fill = 0) %>% 
  mutate(sentiment = positive - negative)
Joining with `by = join_by(word)`
Joining with `by = join_by(word)`
Warning in inner_join(., get_sentiments("nrc") %>% filter(sentiment %in% : Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 215 of `x` matches multiple rows in `y`.
ℹ Row 5178 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.
bind_rows(afinn, 
          bing_and_nrc) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")

get_sentiments("nrc") %>% 
  filter(sentiment %in% c("positive", "negative")) %>% 
  count(sentiment)
# A tibble: 2 × 2
  sentiment     n
  <chr>     <int>
1 negative   3316
2 positive   2308
get_sentiments("bing") %>% 
  count(sentiment)
# A tibble: 2 × 2
  sentiment     n
  <chr>     <int>
1 negative   4781
2 positive   2005
bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
Joining with `by = join_by(word)`
Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 435434 of `x` matches multiple rows in `y`.
ℹ Row 5051 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>% 
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment",
       y = NULL)

custom_stop_words <- bind_rows(tibble(word = c("miss"),  
                                      lexicon = c("custom")), 
                               stop_words)

custom_stop_words
# A tibble: 1,150 × 2
   word        lexicon
   <chr>       <chr>  
 1 miss        custom 
 2 a           SMART  
 3 a's         SMART  
 4 able        SMART  
 5 about       SMART  
 6 above       SMART  
 7 according   SMART  
 8 accordingly SMART  
 9 across      SMART  
10 actually    SMART  
# ℹ 1,140 more rows
library(wordcloud)
Loading required package: RColorBrewer
tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
Joining with `by = join_by(word)`
Warning in wordcloud(word, n, max.words = 100): miss could not be fit on page.
It will not be plotted.

library(reshape2)

Attaching package: 'reshape2'
The following object is masked from 'package:tidyr':

    smiths
tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)
Joining with `by = join_by(word)`
Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 435434 of `x` matches multiple rows in `y`.
ℹ Row 5051 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.

Extend the Analysis

For my extended analysis, I choose to add the Loughran-McDonald lexicon and use The Adventures of Sherlock Holmes within the gutenbergr library as my new text corpus.

First I grabbed the Loughran-McDonald lexicon from the textdata library.

library(tidytext)
library(textdata)

loughran <- get_sentiments("loughran")

Next I downloaded The Adventures of Sherlock Holmes from gutenbergr with the ID value 1661 and tidied the data like the base example by using mutate and unnest_tokens.

library(gutenbergr)
library(dplyr)
library(tidyr)
library(ggplot2)

sherlock_raw <- gutenberg_download(1661)
Using mirror https://aleph.pglaf.org.
sherlock_tidy <- sherlock_raw %>%
  mutate(linenumber = row_number()) %>%
  unnest_tokens(word, text)

Next, I used inner_join with the bing sentiments to calculate the overall sentiment.

sherlock_sentiment <- sherlock_tidy %>%
  inner_join(get_sentiments("bing")) %>%
  count(index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>% 
  mutate(sentiment = positive - negative)
Joining with `by = join_by(word)`

Next, I plotted the sentiment to get a visual of the results of the sentiment calculation.

library(ggplot2)

ggplot(sherlock_sentiment, aes(index, sentiment)) +
  geom_col(fill = "steelblue") +
  theme_minimal() +
  labs(title = "The Adventures of Sherlock Holmes")

Next, I added the Loughran lexicon to the calculations with the afinn, bing, and nrc lexicons.

afinn <- sherlock_tidy %>% 
  inner_join(get_sentiments("afinn")) %>% 
  group_by(index = linenumber %/% 80) %>% 
  summarise(sentiment = sum(value)) %>% 
  mutate(method = "AFINN")
Joining with `by = join_by(word)`
compare_lexicons <- bind_rows(
  sherlock_tidy %>% 
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing"),
  
  sherlock_tidy %>% 
    inner_join(get_sentiments("nrc") %>% 
                 filter(sentiment %in% c("positive", 
                                         "negative"))
    ) %>%
    mutate(method = "NRC"),
    
  sherlock_tidy %>% 
    inner_join(get_sentiments("loughran") %>% 
                 filter(sentiment %in% c("positive", 
                                         "negative"))
    ) %>%
    mutate(method = "Loughran")
) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, 
              values_from = n, 
              values_fill = 0) %>% 
  mutate(sentiment = positive - negative) %>%
  select(method, index, sentiment) # Keeping only columns that match AFINN
Joining with `by = join_by(word)`
Joining with `by = join_by(word)`
Warning in inner_join(., get_sentiments("nrc") %>% filter(sentiment %in% : Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 1790 of `x` matches multiple rows in `y`.
ℹ Row 108 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.
Joining with `by = join_by(word)`

Here I compared the 4 different sentiment lexicons.

bind_rows(afinn, compare_lexicons) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y") +
  theme_minimal() +
  labs(title = "Comparing Sentiment Lexicons: Sherlock Holmes",
       subtitle = "AFINN vs. Bing vs. Loughran vs. NRC",
       y = "Net Sentiment Score",
       x = "Narrative Progress (80-line chunks)")

Here I did a count check on the loughran lexicon, it had different categories compared to the NRC and bing lexicons.

get_sentiments("loughran") %>% 
  count(sentiment)
# A tibble: 6 × 2
  sentiment        n
  <chr>        <int>
1 constraining   186
2 litigious      812
3 negative      2068
4 positive       409
5 superfluous     57
6 uncertainty    318

Then I did a count for the bing words in The Adventures of Sherlock Holmes.

bing_word_counts <- sherlock_tidy %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
Joining with `by = join_by(word)`

Here I did the graph of the bing word counts.

bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>% 
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment",
       y = NULL)

Then I did the calculations for the lougran word counts.

loughran_word_counts <- sherlock_tidy %>%
  inner_join(get_sentiments("loughran")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
Joining with `by = join_by(word)`
Warning in inner_join(., get_sentiments("loughran")): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 240 of `x` matches multiple rows in `y`.
ℹ Row 2679 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.

Then I plotted the results of the loughran word counts.

loughran_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>% 
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free") +
  labs(x = "Contribution to sentiment",
       y = NULL)

Here I added the word miss to stop_words because it comes up often.

custom_stop_words <- bind_rows(tibble(word = c("miss"),  
                                      lexicon = c("custom")), 
                               stop_words)

custom_stop_words
# A tibble: 1,150 × 2
   word        lexicon
   <chr>       <chr>  
 1 miss        custom 
 2 a           SMART  
 3 a's         SMART  
 4 able        SMART  
 5 about       SMART  
 6 above       SMART  
 7 according   SMART  
 8 accordingly SMART  
 9 across      SMART  
10 actually    SMART  
# ℹ 1,140 more rows

Here I created a word cloud for sherlock_tidy.

library(wordcloud)

sherlock_tidy %>%
  anti_join(custom_stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
Joining with `by = join_by(word)`

Then I created a comparison cloud for sherlock_tidy using the bing sentiments.

library(reshape2)

sherlock_tidy %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)
Joining with `by = join_by(word)`

Conclusion

Since the Loughran lexicon is used mostly for financial text, it was interesting to use it for an older novel like The Adventures of Sherlock Holmes. The results from the sentiment analysis of The Adventures of Sherlock Holmes differs from that of the Jane Austen books in the fact that it is more negative with bigger spikes. This is apparent in the ggplots and is likely due to the fact that Sherlock Holmes is a novel that deals with crime and mystery. The results of the Loughran word count was also more negative compared to the other 3 lexicons used, likely due to the fact that it is used mostly for financial text.