This assignment uses example code from the book "Text Mining with R: A Tidy Approach" by Julia Silge and David Robinson (see here: https://www.tidytextmining.com). In particular, it uses the code from Chapter 2, "Sentiment Analysis." The chapter uses three lexicons:

* AFINN from Finn Årup Nielsen,
* bing from Bing Liu and collaborators, and
* nrc from Saif Mohammad and Peter Turney.
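Each lexicon stores its annotations in a slightly different form, which matters later when they are combined (a minimal sketch of their structure; get_sentiments() may prompt to download a lexicon via the textdata package on first use):

library(tidytext)

get_sentiments("afinn")  # one row per word, with an integer value from -5 to 5
get_sentiments("bing")   # one row per word, labelled "positive" or "negative"
get_sentiments("nrc")    # one row per word-emotion pair (joy, trust, anger, ...)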

Below, the chapter's full code and output are presented, though without any annotations.

library(tidytext)

afinn = get_sentiments("afinn")
bing = get_sentiments("bing")
nrc = get_sentiments("nrc")
library(janeaustenr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)

tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(str_detect(text, 
                                regex("^chapter [\\divxlc]", 
                                      ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)
nrc_joy <- get_sentiments("nrc") %>% 
  filter(sentiment == "joy")

tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## Joining with `by = join_by(word)`
## # A tibble: 301 × 2
##    word          n
##    <chr>     <int>
##  1 good        359
##  2 friend      166
##  3 hope        143
##  4 happy       125
##  5 love        117
##  6 deal         92
##  7 found        92
##  8 present      89
##  9 kind         82
## 10 happiness    76
## # ℹ 291 more rows
nrc_joy <- nrc %>% 
  filter(sentiment == "joy")

tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## Joining with `by = join_by(word)`
## # A tibble: 301 × 2
##    word          n
##    <chr>     <int>
##  1 good        359
##  2 friend      166
##  3 hope        143
##  4 happy       125
##  5 love        117
##  6 deal         92
##  7 found        92
##  8 present      89
##  9 kind         82
## 10 happiness    76
## # ℹ 291 more rows
library(tidyr)

jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>% 
  mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 435434 of `x` matches multiple rows in `y`.
## ℹ Row 5051 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
library(ggplot2)

ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")

pride_prejudice <- tidy_books %>% 
  filter(book == "Pride & Prejudice")

pride_prejudice
## # A tibble: 122,204 × 4
##    book              linenumber chapter word     
##    <fct>                  <int>   <int> <chr>    
##  1 Pride & Prejudice          1       0 pride    
##  2 Pride & Prejudice          1       0 and      
##  3 Pride & Prejudice          1       0 prejudice
##  4 Pride & Prejudice          3       0 by       
##  5 Pride & Prejudice          3       0 jane     
##  6 Pride & Prejudice          3       0 austen   
##  7 Pride & Prejudice          7       1 chapter  
##  8 Pride & Prejudice          7       1 1        
##  9 Pride & Prejudice         10       1 it       
## 10 Pride & Prejudice         10       1 is       
## # ℹ 122,194 more rows
afinn <- pride_prejudice %>% 
  inner_join(get_sentiments("afinn")) %>% 
  group_by(index = linenumber %/% 80) %>% 
  summarise(sentiment = sum(value)) %>% 
  mutate(method = "AFINN")
## Joining with `by = join_by(word)`
bing_and_nrc <- bind_rows(
  pride_prejudice %>% 
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  pride_prejudice %>% 
    inner_join(get_sentiments("nrc") %>% 
                 filter(sentiment %in% c("positive", 
                                         "negative"))
    ) %>%
    mutate(method = "NRC")) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment,
              values_from = n,
              values_fill = 0) %>% 
  mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("nrc") %>% filter(sentiment %in% : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 215 of `x` matches multiple rows in `y`.
## ℹ Row 5178 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
afinn <- pride_prejudice %>% 
  inner_join(get_sentiments("afinn")) %>% 
  group_by(index = linenumber %/% 80) %>% 
  summarise(sentiment = sum(value)) %>% 
  mutate(method = "AFINN")
## Joining with `by = join_by(word)`
bing_and_nrc <- bind_rows(
  pride_prejudice %>% 
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  pride_prejudice %>% 
    inner_join(nrc %>% 
                 filter(sentiment %in% c("positive", 
                                         "negative"))
    ) %>%
    mutate(method = "NRC")) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment,
              values_from = n,
              values_fill = 0) %>% 
  mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
## Joining with `by = join_by(word)`
## Warning in inner_join(., nrc %>% filter(sentiment %in% c("positive", "negative"))): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 215 of `x` matches multiple rows in `y`.
## ℹ Row 5178 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
bind_rows(afinn, 
          bing_and_nrc) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")

get_sentiments("nrc") %>% 
  filter(sentiment %in% c("positive", "negative")) %>% 
  count(sentiment)
## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   3316
## 2 positive   2308
nrc %>% 
  filter(sentiment %in% c("positive", "negative")) %>% 
  count(sentiment)
## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   3316
## 2 positive   2308
get_sentiments("bing") %>% 
  count(sentiment)
## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005
bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 435434 of `x` matches multiple rows in `y`.
## ℹ Row 5051 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
bing_word_counts
## # A tibble: 2,585 × 3
##    word     sentiment     n
##    <chr>    <chr>     <int>
##  1 miss     negative   1855
##  2 well     positive   1523
##  3 good     positive   1380
##  4 great    positive    981
##  5 like     positive    725
##  6 better   positive    639
##  7 enough   positive    613
##  8 happy    positive    534
##  9 love     positive    495
## 10 pleasure positive    462
## # ℹ 2,575 more rows
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>% 
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment",
       y = NULL)

custom_stop_words <- bind_rows(tibble(word = c("miss"),  
                                      lexicon = c("custom")), 
                               stop_words)

custom_stop_words
## # A tibble: 1,150 × 2
##    word        lexicon
##    <chr>       <chr>  
##  1 miss        custom 
##  2 a           SMART  
##  3 a's         SMART  
##  4 able        SMART  
##  5 about       SMART  
##  6 above       SMART  
##  7 according   SMART  
##  8 accordingly SMART  
##  9 across      SMART  
## 10 actually    SMART  
## # ℹ 1,140 more rows
library(wordcloud)
## Loading required package: RColorBrewer
tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
## Joining with `by = join_by(word)`

library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 435434 of `x` matches multiple rows in `y`.
## ℹ Row 5051 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(chapter, text, token = "regex", 
                pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
  ungroup()

austen_chapters %>% 
  group_by(book) %>% 
  summarise(chapters = n())
## # A tibble: 6 × 2
##   book                chapters
##   <fct>                  <int>
## 1 Sense & Sensibility       51
## 2 Pride & Prejudice         62
## 3 Mansfield Park            49
## 4 Emma                      56
## 5 Northanger Abbey          32
## 6 Persuasion                25
bingnegative <- get_sentiments("bing") %>% 
  filter(sentiment == "negative")

wordcounts <- tidy_books %>%
  group_by(book, chapter) %>%
  summarize(words = n())
## `summarise()` has grouped output by 'book'. You can override using the
## `.groups` argument.
tidy_books %>%
  semi_join(bingnegative) %>%
  group_by(book, chapter) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts, by = c("book", "chapter")) %>%
  mutate(ratio = negativewords/words) %>%
  filter(chapter != 0) %>%
  slice_max(ratio, n = 1) %>% 
  ungroup()
## Joining with `by = join_by(word)`
## `summarise()` has grouped output by 'book'. You can override using the
## `.groups` argument.
## # A tibble: 6 × 5
##   book                chapter negativewords words  ratio
##   <fct>                 <int>         <int> <int>  <dbl>
## 1 Sense & Sensibility      43           161  3405 0.0473
## 2 Pride & Prejudice        34           111  2104 0.0528
## 3 Mansfield Park           46           173  3685 0.0469
## 4 Emma                     15           151  3340 0.0452
## 5 Northanger Abbey         21           149  2982 0.0500
## 6 Persuasion                4            62  1807 0.0343

Extension

The general idea of this chapter was to provide an overview of how to perform sentiment analysis simply with tidytext and sentiment lexicons. It is a method for identifying the general sentiment-bearing words in a specific text, which can then be used to further understand the text data. Below, this sentiment analysis is extended in two ways: by using a different corpus, namely the headlines of New York Times articles from February 2024 retrieved via the NYT Archive API, and by using a different lexicon, introduced further below. First, a connection to the API is established and the data are pulled by specifying the year and month of interest as well as the API key. Then, the response is converted from JSON into an R data frame for manipulation.

library(httr)
library(jsonlite)
library(wordcloud)

nyt = GET("https://api.nytimes.com/svc/archive/v1/2024/2.json?api-key=47MqAqoMmhQVNdgxdpxgaQ2rlY2GCKRD")
nyt
## Response [https://api.nytimes.com/svc/archive/v1/2024/2.json?api-key=47MqAqoMmhQVNdgxdpxgaQ2rlY2GCKRD]
##   Date: 2024-03-29 16:10
##   Status: 200
##   Content-Type: application/json; charset=UTF-8
##   Size: 13 MB
data = fromJSON(rawToChar(nyt$content))
articles = as.data.frame(data$response)

Examining the articles data frame, it becomes apparent that much of the information is not needed for this project, so most columns are dropped. The full text of the articles is not included, as the New York Times understandably charges for that. Three columns are retained: docs.headline, docs.pub_date, and docs.abstract. Additionally, the assignment requires a new lexicon to be used; here it is the Jockers (2017) lexicon. This lexicon contains 10,738 words and aims to capture emotional shifts in text, with continuous scores ranging from -1 to +1.
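These properties can be sanity-checked directly (a quick sketch; it assumes the lexicon package is installed and that the original columns are named x and y before being renamed below):

nrow(lexicon::hash_sentiment_jockers)     # number of scored words
range(lexicon::hash_sentiment_jockers$y)  # polarity scores, roughly -1 to 1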

jockers = lexicon::hash_sentiment_jockers

articles = articles[,c("docs.headline","docs.pub_date","docs.abstract")]
articles = do.call(data.frame, articles)
articles = articles[,c("docs.headline.main","docs.pub_date","docs.abstract")]
colnames(articles) = c("Headline", "Date", "Abstract")

articles$Date = as.Date(articles$Date, format = "%Y-%m-%d")

articles_words <- articles %>%
  unnest_tokens(word, Headline)

colnames(jockers) = c("word", "sentiment")
jockers_positive = jockers %>%
  filter(sentiment > 0)
jockers_negative = jockers %>%
  filter(sentiment < 0)


articles_words %>%
  inner_join(jockers_positive) %>%
  count(word, sort = T) %>%
  top_n(10)
## Joining with `by = join_by(word)`
## Selecting by n
##           word   n
## 1          new 254
## 2         love  60
## 3        super  56
## 4          big  42
## 5         like  41
## 6         deal  37
## 7         best  35
## 8          aid  34
## 9  connections  32
## 10     supreme  32
articles_words %>%
  inner_join(jockers_negative) %>%
  count(word, sort = T) %>%
  top_n(10)
## Joining with `by = join_by(word)`
## Selecting by n
##        word   n
## 1     trump 174
## 2      dies 101
## 3       war  51
## 4     black  39
## 5     death  39
## 6      fire  30
## 7    police  30
## 8      dead  29
## 9  shooting  24
## 10     lost  20
articles_words = articles_words %>%
  inner_join(jockers)
## Joining with `by = join_by(word)`

The code above does the following: it pulls the Jockers lexicon from the lexicon package, drops all but the relevant columns from the NYT articles and renames them, and unnests the headlines into one word per row. It then separates the positive and negative words in the Jockers lexicon and performs an inner join of the headline words with each. The output shows that the most frequent positive word in February 2024 was "new", followed by "love" and "super". The most frequent negative word, on the other hand, was "trump", which is very interesting: the NYT is almost certainly referring to Donald Trump, whereas the Jockers lexicon is scoring the verb "to trump".
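As a quick check of this point (a minimal sketch reusing the jockers data frame defined above), the score that the lexicon assigns to "trump" can be looked up directly:

jockers %>%
  filter(word == "trump")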

In addition, the net sentiment is analyzed below as a function of time (i.e., the number of positive words minus the number of negative words per day).

articles_words %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))

time_series = articles_words %>%
  group_by(Date) %>%
  summarize(positive = sum(sentiment > 0, na.rm = TRUE), 
            negative = sum(sentiment < 0, na.rm = TRUE)) %>%
  mutate(sentiment_value = positive - negative)
time_series$Date = as.Date(time_series$Date)

ggplot(aes(x = Date, y = sentiment_value), data = time_series) +
  geom_line(color = "darkgreen") +
  theme_minimal() +
  labs(x = 'Date', y = 'Sentiment Score (Positive - Negative)', title = 'Net Sentiment of NYT Headlines') +
  geom_hline(yintercept = mean(time_series$sentiment_value), color="salmon")

The word cloud above shows the most common positive and negative words across all NYT headlines from February 2024.

As can be seen in the time series, the daily sentiment is quite volatile, with the highest value of 30 on 02/13 and the lowest value of -47 on 02/22. Interestingly, because the Jockers lexicon scores each word on a continuous scale, the daily summaries could also be computed on the raw scores rather than on counts of appearances, which would take the magnitude of each word into account. The horizontal line depicts the average sentiment across the month and shows that, on average, the sentiment is more negative than positive. Given that the New York Times is one of the most widely read newspapers in the world, it is not surprising that the average sentiment is negative.
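A sketch of that alternative (reusing the articles_words data frame from above, and simply summing the raw Jockers scores per day so that strongly polar words carry more weight):

time_series_weighted = articles_words %>%
  group_by(Date) %>%
  summarize(sentiment_value = sum(sentiment, na.rm = TRUE))

ggplot(aes(x = Date, y = sentiment_value), data = time_series_weighted) +
  geom_line(color = "darkgreen") +
  theme_minimal() +
  labs(x = 'Date', y = 'Score-weighted sentiment', title = 'Score-Weighted Net Sentiment of NYT Headlines')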

Given that the most frequent negative word is "trump", and that domain knowledge suggests the NYT is referring to Donald Trump, the same time series is re-run below without this word to see how much of an impact it has.

articles_word_no_trump = articles_words[!grepl("trump", articles_words$word),]
time_series_no_trump = articles_word_no_trump %>%
  group_by(Date) %>%
  summarize(positive = sum(sentiment > 0, na.rm = TRUE), 
            negative = sum(sentiment < 0, na.rm = TRUE)) %>%
  mutate(sentiment_value = positive - negative)
time_series_no_trump$Date = as.Date(time_series_no_trump$Date)

ggplot(aes(x = Date, y = sentiment_value), data = time_series_no_trump) +
  geom_line(color = "darkgreen") +
  theme_minimal() +
  labs(x = 'Date', y = 'Sentiment Score (Positive - Negative)', title = 'Net Sentiment of NYT Headlines') +
  geom_hline(yintercept = mean(time_series_no_trump$sentiment_value), color="salmon")

Interestingly, the average sentiment is positive once the word "trump" is excluded. While the day-to-day volatility remains, this contrasts with the general perception that the news is more negative than positive. However, it is important to remember that many words classified as positive or negative can easily be used with the opposite sentiment (for example, through negation or irony), which can distort the results. Additionally, the content of an article may differ considerably from its headline.
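One way to probe this last caveat (a sketch only, reusing the articles data frame and the jockers lexicon from above) would be to repeat the daily counts on the retained abstracts and compare them with the headline series:

abstract_series = articles %>%
  unnest_tokens(word, Abstract) %>%
  inner_join(jockers, by = "word") %>%
  group_by(Date) %>%
  summarize(positive = sum(sentiment > 0, na.rm = TRUE),
            negative = sum(sentiment < 0, na.rm = TRUE)) %>%
  mutate(sentiment_value = positive - negative)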