This assignment uses example code from the book “Text Mining with R: A Tidy Approach” by Julia Silge and David Robinson (see https://www.tidytextmining.com), specifically the code from Chapter 2 on sentiment analysis. The chapter uses three lexicons:

* AFINN from Finn Årup Nielsen,
* bing from Bing Liu and collaborators, and
* nrc from Saif Mohammad and Peter Turney.
Below, the chapter’s full code is presented, without any annotations.
library(tidytext)
afinn = get_sentiments("afinn")
bing = get_sentiments("bing")
nrc = get_sentiments("nrc")
library(janeaustenr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(linenumber = row_number(),
         chapter = cumsum(str_detect(text,
                                     regex("^chapter [\\divxlc]",
                                           ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)
nrc_joy <- get_sentiments("nrc") %>%
filter(sentiment == "joy")
tidy_books %>%
filter(book == "Emma") %>%
inner_join(nrc_joy) %>%
count(word, sort = TRUE)
## Joining with `by = join_by(word)`
## # A tibble: 301 × 2
## word n
## <chr> <int>
## 1 good 359
## 2 friend 166
## 3 hope 143
## 4 happy 125
## 5 love 117
## 6 deal 92
## 7 found 92
## 8 present 89
## 9 kind 82
## 10 happiness 76
## # ℹ 291 more rows
nrc_joy <- nrc %>%
filter(sentiment == "joy")
tidy_books %>%
filter(book == "Emma") %>%
inner_join(nrc_joy) %>%
count(word, sort = TRUE)
## Joining with `by = join_by(word)`
## # A tibble: 301 × 2
## word n
## <chr> <int>
## 1 good 359
## 2 friend 166
## 3 hope 143
## 4 happy 125
## 5 love 117
## 6 deal 92
## 7 found 92
## 8 present 89
## 9 kind 82
## 10 happiness 76
## # ℹ 291 more rows
library(tidyr)
jane_austen_sentiment <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(book, index = linenumber %/% 80, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 435434 of `x` matches multiple rows in `y`.
## ℹ Row 5051 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
library(ggplot2)
ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
geom_col(show.legend = FALSE) +
facet_wrap(~book, ncol = 2, scales = "free_x")
pride_prejudice <- tidy_books %>%
filter(book == "Pride & Prejudice")
pride_prejudice
## # A tibble: 122,204 × 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Pride & Prejudice 1 0 pride
## 2 Pride & Prejudice 1 0 and
## 3 Pride & Prejudice 1 0 prejudice
## 4 Pride & Prejudice 3 0 by
## 5 Pride & Prejudice 3 0 jane
## 6 Pride & Prejudice 3 0 austen
## 7 Pride & Prejudice 7 1 chapter
## 8 Pride & Prejudice 7 1 1
## 9 Pride & Prejudice 10 1 it
## 10 Pride & Prejudice 10 1 is
## # ℹ 122,194 more rows
afinn <- pride_prejudice %>%
inner_join(get_sentiments("afinn")) %>%
group_by(index = linenumber %/% 80) %>%
summarise(sentiment = sum(value)) %>%
mutate(method = "AFINN")
## Joining with `by = join_by(word)`
bing_and_nrc <- bind_rows(
  pride_prejudice %>%
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  pride_prejudice %>%
    inner_join(get_sentiments("nrc") %>%
                 filter(sentiment %in% c("positive", "negative"))) %>%
    mutate(method = "NRC")) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment,
              values_from = n,
              values_fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("nrc") %>% filter(sentiment %in% : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 215 of `x` matches multiple rows in `y`.
## ℹ Row 5178 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
afinn <- pride_prejudice %>%
inner_join(get_sentiments("afinn")) %>%
group_by(index = linenumber %/% 80) %>%
summarise(sentiment = sum(value)) %>%
mutate(method = "AFINN")
## Joining with `by = join_by(word)`
bing_and_nrc <- bind_rows(
  pride_prejudice %>%
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  pride_prejudice %>%
    inner_join(nrc %>%
                 filter(sentiment %in% c("positive", "negative"))) %>%
    mutate(method = "NRC")) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment,
              values_from = n,
              values_fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
## Joining with `by = join_by(word)`
## Warning in inner_join(., nrc %>% filter(sentiment %in% c("positive", "negative"))): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 215 of `x` matches multiple rows in `y`.
## ℹ Row 5178 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
bind_rows(afinn,
bing_and_nrc) %>%
ggplot(aes(index, sentiment, fill = method)) +
geom_col(show.legend = FALSE) +
facet_wrap(~method, ncol = 1, scales = "free_y")
get_sentiments("nrc") %>%
filter(sentiment %in% c("positive", "negative")) %>%
count(sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 3316
## 2 positive 2308
nrc %>%
filter(sentiment %in% c("positive", "negative")) %>%
count(sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 3316
## 2 positive 2308
get_sentiments("bing") %>%
count(sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
bing_word_counts <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 435434 of `x` matches multiple rows in `y`.
## ℹ Row 5051 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
bing_word_counts
## # A tibble: 2,585 × 3
## word sentiment n
## <chr> <chr> <int>
## 1 miss negative 1855
## 2 well positive 1523
## 3 good positive 1380
## 4 great positive 981
## 5 like positive 725
## 6 better positive 639
## 7 enough positive 613
## 8 happy positive 534
## 9 love positive 495
## 10 pleasure positive 462
## # ℹ 2,575 more rows
bing_word_counts %>%
group_by(sentiment) %>%
slice_max(n, n = 10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(x = "Contribution to sentiment",
y = NULL)
custom_stop_words <- bind_rows(tibble(word = c("miss"),
                                      lexicon = c("custom")),
                               stop_words)
custom_stop_words
## # A tibble: 1,150 × 2
## word lexicon
## <chr> <chr>
## 1 miss custom
## 2 a SMART
## 3 a's SMART
## 4 able SMART
## 5 about SMART
## 6 above SMART
## 7 according SMART
## 8 accordingly SMART
## 9 across SMART
## 10 actually SMART
## # ℹ 1,140 more rows
library(wordcloud)
## Loading required package: RColorBrewer
tidy_books %>%
anti_join(stop_words) %>%
count(word) %>%
with(wordcloud(word, n, max.words = 100))
## Joining with `by = join_by(word)`
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("gray20", "gray80"),
max.words = 100)
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 435434 of `x` matches multiple rows in `y`.
## ℹ Row 5051 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(chapter, text, token = "regex",
                pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
  ungroup()
austen_chapters %>%
group_by(book) %>%
summarise(chapters = n())
## # A tibble: 6 × 2
## book chapters
## <fct> <int>
## 1 Sense & Sensibility 51
## 2 Pride & Prejudice 62
## 3 Mansfield Park 49
## 4 Emma 56
## 5 Northanger Abbey 32
## 6 Persuasion 25
bingnegative <- get_sentiments("bing") %>%
filter(sentiment == "negative")
wordcounts <- tidy_books %>%
group_by(book, chapter) %>%
summarize(words = n())
## `summarise()` has grouped output by 'book'. You can override using the
## `.groups` argument.
tidy_books %>%
semi_join(bingnegative) %>%
group_by(book, chapter) %>%
summarize(negativewords = n()) %>%
left_join(wordcounts, by = c("book", "chapter")) %>%
mutate(ratio = negativewords/words) %>%
filter(chapter != 0) %>%
slice_max(ratio, n = 1) %>%
ungroup()
## Joining with `by = join_by(word)`
## `summarise()` has grouped output by 'book'. You can override using the
## `.groups` argument.
## # A tibble: 6 × 5
## book chapter negativewords words ratio
## <fct> <int> <int> <int> <dbl>
## 1 Sense & Sensibility 43 161 3405 0.0473
## 2 Pride & Prejudice 34 111 2104 0.0528
## 3 Mansfield Park 46 173 3685 0.0469
## 4 Emma 15 151 3340 0.0452
## 5 Northanger Abbey 21 149 2982 0.0500
## 6 Persuasion 4 62 1807 0.0343
The general idea of this chapter is to give an overview of how to perform sentiment analysis with tidytext and sentiment lexicons. It is a way to identify the prevailing sentiment-bearing words in a specific text, which can then be used to further understand the text data.

Below, this sentiment analysis is extended in two ways: a different corpus is used, namely the headlines of New York Times articles from February 2024 pulled from the NYT Archive API, and a different lexicon (the Jockers lexicon, introduced further below) is applied. First, a connection to the API is established and the data are pulled by specifying the year and month of interest as well as the API key. The response is then converted from JSON into an R data frame for manipulation.
library(httr)
library(jsonlite)
library(wordcloud)
nyt = GET("https://api.nytimes.com/svc/archive/v1/2024/2.json?api-key=47MqAqoMmhQVNdgxdpxgaQ2rlY2GCKRD")
nyt
## Response [https://api.nytimes.com/svc/archive/v1/2024/2.json?api-key=47MqAqoMmhQVNdgxdpxgaQ2rlY2GCKRD]
## Date: 2024-03-29 16:10
## Status: 200
## Content-Type: application/json; charset=UTF-8
## Size: 13 MB
data = fromJSON(rawToChar(nyt$content))
articles = as.data.frame(data$response)
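For reuse, the request can also be built from a year, a month, and an API key rather than a hardcoded URL. The sketch below is illustrative only and assumes the key is stored in a (hypothetical) environment variable named NYT_API_KEY instead of being embedded in the script.
# Illustrative sketch: build the archive URL from year/month and read the
# API key from the (hypothetical) environment variable NYT_API_KEY.
year <- 2024
month <- 2
url <- sprintf("https://api.nytimes.com/svc/archive/v1/%d/%d.json", year, month)
nyt_resp <- GET(url, query = list(`api-key` = Sys.getenv("NYT_API_KEY")))
stop_for_status(nyt_resp)   # fail early if the request did not succeed
data_resp <- fromJSON(rawToChar(nyt_resp$content))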
Examining the articles data frame, it becomes apparent that it contains a lot of information that is not needed for this project, so most columns are dropped. The full text of the articles is not included, as the New York Times understandably charges for that. Three columns are retained: docs.headline, docs.pub_date, and docs.abstract. Additionally, the assignment requires a new lexicon to be used, which will be the Jockers lexicon from 2017. This lexicon contains 10,738 words and aims to capture emotional shifts in text, with scores ranging from -1 to +1.
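As a quick check of the lexicon described above, its size and score range can be inspected directly. This is a minimal sketch; the lexicon's original column names are x (word) and y (score) before they are renamed further below.
# Inspect the Jockers lexicon: number of entries and range of its scores.
# Columns are x = word and y = score in the raw lexicon object.
nrow(lexicon::hash_sentiment_jockers)
range(lexicon::hash_sentiment_jockers$y)
head(lexicon::hash_sentiment_jockers)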
jockers = lexicon::hash_sentiment_jockers
articles = articles[,c("docs.headline","docs.pub_date","docs.abstract")]
articles = do.call(data.frame, articles)
articles = articles[,c("docs.headline.main","docs.pub_date","docs.abstract")]
colnames(articles) = c("Headline", "Date", "Abstract")
articles$Date = as.Date(articles$Date, format = "%Y-%m-%d")
articles_words <- articles %>%
unnest_tokens(word, Headline)
colnames(jockers) = c("word", "sentiment")
jockers_positive = jockers %>%
filter(sentiment > 0)
jockers_negative = jockers %>%
filter(sentiment < 0)
articles_words %>%
inner_join(jockers_positive) %>%
count(word, sort = T) %>%
top_n(10)
## Joining with `by = join_by(word)`
## Selecting by n
## word n
## 1 new 254
## 2 love 60
## 3 super 56
## 4 big 42
## 5 like 41
## 6 deal 37
## 7 best 35
## 8 aid 34
## 9 connections 32
## 10 supreme 32
articles_words %>%
inner_join(jockers_negative) %>%
count(word, sort = T) %>%
top_n(10)
## Joining with `by = join_by(word)`
## Selecting by n
## word n
## 1 trump 174
## 2 dies 101
## 3 war 51
## 4 black 39
## 5 death 39
## 6 fire 30
## 7 police 30
## 8 dead 29
## 9 shooting 24
## 10 lost 20
articles_words = articles_words %>%
inner_join(jockers)
## Joining with `by = join_by(word)`
The code above did the following: it pulled the Jockers lexicon from the lexicon package, dropped all but the relevant columns from the NYT articles and renamed them, and unnested the headlines into one row per word. It then split the Jockers lexicon into positive and negative words and performed a separate inner join of the headline words with each set. The output shows that the most used positive word in February 2024 was “new”, followed by “love” and “super”. The most common negative word, on the other hand, was “trump”, which is interesting: the NYT headlines are almost certainly referring to Donald Trump, whereas the Jockers lexicon scores the verb “to trump”.
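To see how the lexicon treats this particular word, its entry can be looked up directly; a minimal sketch using the renamed jockers data frame from above:
# Look up the score the Jockers lexicon assigns to "trump";
# the lexicon scores the common verb, not the proper name.
jockers %>%
  filter(word == "trump")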
In addition, the net sentiment is analyzed below as a function of time (i.e., the number of positive words minus the number of negative words per day).
articles_words %>%
count(word) %>%
with(wordcloud(word, n, max.words = 100))
time_series = articles_words %>%
group_by(Date) %>%
summarize(positive = sum(sentiment > 0, na.rm = TRUE),
negative = sum(sentiment < 0, na.rm = TRUE)) %>%
mutate(sentiment_value = positive - negative)
time_series$Date = as.Date(time_series$Date)
ggplot(aes(x = Date, y = sentiment_value), data = time_series) +
geom_line(color = "darkgreen") +
theme_minimal() +
labs(x = 'Date', y = 'Sentiment Score (Positive - Negative)', title = 'Net Sentiment of NYT Headlines') +
geom_hline(yintercept = mean(time_series$sentiment_value), color="salmon")
The word cloud above shows the most common sentiment-bearing words (both positive and negative) across all NYT headlines in February 2024.
As can be seen in the time series, the daily sentiment is quite volatile, with the highest value of 30 on 02/13 and the lowest value of -47 on 02/22. Note that the daily series above counts positive and negative words; since the Jockers lexicon assigns graded scores rather than binary labels, summary statistics could alternatively weight each word by the magnitude of its score instead of merely counting appearances. The horizontal line depicts the average sentiment across the month and shows that, on average, the headlines are more negative than positive. Given that the New York Times is one of the most widely read newspapers in the world, it is perhaps not surprising that the average sentiment is negative.
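To illustrate that point, the daily series could alternatively sum the Jockers scores themselves instead of counting positive and negative words. The sketch below reuses articles_words from above; time_series_weighted is a new name introduced here for illustration only.
# Alternative daily series: sum the graded Jockers scores per day rather
# than counting positive/negative words.
time_series_weighted <- articles_words %>%
  group_by(Date) %>%
  summarize(sentiment_value = sum(sentiment, na.rm = TRUE))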
Given that the most used negative word is “trump”, and that domain knowledge suggests the NYT headlines refer to Donald Trump rather than the verb, the same time series is re-run below without this word to see how much of an impact it has.
# Note: grepl() matches "trump" as a substring, so any word containing it is dropped, not only the exact word
articles_word_no_trump = articles_words[!grepl("trump", articles_words$word),]
time_series_no_trump = articles_word_no_trump %>%
group_by(Date) %>%
summarize(positive = sum(sentiment > 0, na.rm = TRUE),
negative = sum(sentiment < 0, na.rm = TRUE)) %>%
mutate(sentiment_value = positive - negative)
time_series_no_trump$Date = as.Date(time_series_no_trump$Date)
ggplot(aes(x = Date, y = sentiment_value), data = time_series_no_trump) +
geom_line(color = "darkgreen") +
theme_minimal() +
labs(x = 'Date', y = 'Sentiment Score (Positive - Negative)', title = 'Net Sentiment of NYT Headlines') +
geom_hline(yintercept = mean(time_series_no_trump$sentiment_value), color="salmon")
Interestingly, when the word “trump” is excluded, the average sentiment is positive. While the day-to-day volatility remains, this contrasts with the common perception that the news is more negative than positive. It is important to remember, however, that many words classified as positive or negative can easily be used with the opposite sentiment, which can invalidate these results. Additionally, the content of an article may differ altogether from its headline.