Sentiment Analysis with Tidy Data

The sentiments Datasets

We’ll use the tidytext package, which provides access to several sentiment lexicons. Three commonly used general-purpose lexicons are:

  • AFINN
  • Bing
  • NRC
# Load necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidytext)
library(sentimentr)
library(dplyr)
library(ggplot2)
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)

Load the financial data

# Read the labelled financial sentences (columns: Sentence, Sentiment)
# straight from the course repository.
financial_data <- read_csv(
  file = "https://raw.githubusercontent.com/Amish22/DS607/refs/heads/main/financial_data.csv"
)
## Rows: 5842 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Sentence, Sentiment
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Build a one-token-per-row table: lower-case each sentence into a temporary
# `text` column, then split that column into words. The original `Sentence`
# and `Sentiment` columns are carried along on every token row.
financial_data_clean <- unnest_tokens(
  mutate(financial_data, text = str_to_lower(Sentence)),
  output = word,
  input = text
)
head(financial_data_clean)
## # A tibble: 6 × 3
##   Sentence                                                       Sentiment word 
##   <chr>                                                          <chr>     <chr>
## 1 The GeoSolutions technology will leverage Benefon 's GPS solu… positive  the  
## 2 The GeoSolutions technology will leverage Benefon 's GPS solu… positive  geos…
## 3 The GeoSolutions technology will leverage Benefon 's GPS solu… positive  tech…
## 4 The GeoSolutions technology will leverage Benefon 's GPS solu… positive  will 
## 5 The GeoSolutions technology will leverage Benefon 's GPS solu… positive  leve…
## 6 The GeoSolutions technology will leverage Benefon 's GPS solu… positive  bene…

Sentiment Analysis with Inner Join

Calculate Sentiment with Bing, AFINN, and NRC Lexicons

Let’s examine the distribution of positive and negative sentiment within the financial data using the three lexicons.

# Bing sentiment
# Count the tokens that Bing labels positive / negative, put the two counts
# side by side, and take positive minus negative as the overall score.
bing_sentiment <- financial_data_clean %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(sentiment) %>%
  # pivot_wider() replaces the superseded spread(); a label with no matches
  # is filled with 0 so the subtraction below is still defined.
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment_score = positive - negative)

bing_sentiment
## # A tibble: 1 × 3
##   negative positive sentiment_score
##      <dbl>    <dbl>           <dbl>
## 1     1327     1880             553
# AFINN sentiment
# AFINN attaches a numeric `value` to each word; the overall score is the sum
# of those values over every token that appears in the lexicon.
afinn_sentiment <- summarise(
  inner_join(financial_data_clean, get_sentiments("afinn"), by = "word"),
  sentiment_score = sum(value)
)

afinn_sentiment
## # A tibble: 1 × 1
##   sentiment_score
##             <dbl>
## 1            2330
# NRC sentiment
# Keep only NRC's positive/negative labels (the lexicon also carries other
# categories that are not used here).
nrc_polarity <- get_sentiments("nrc") %>%
  filter(sentiment %in% c("positive", "negative"))

# The filtered lexicon has more than one row for some words, and a word can
# occur many times in the corpus, so this join is many-to-many by design.
# Declaring the relationship documents that intent and avoids dplyr's
# "unexpected many-to-many" warning; the joined rows are unchanged.
nrc_sentiment <- financial_data_clean %>%
  inner_join(nrc_polarity, by = "word", relationship = "many-to-many") %>%
  count(sentiment) %>%
  # pivot_wider() replaces the superseded spread().
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment_score = positive - negative)

nrc_sentiment
## # A tibble: 1 × 3
##   negative positive sentiment_score
##      <dbl>    <dbl>           <dbl>
## 1     1887     5273            3386
# Net sentiment per chunk of sentences, one series per lexicon.
#
# The chunk id must be derived from the sentence's position in
# `financial_data` BEFORE joining to a lexicon. Numbering rows after the
# inner join groups every 80 *matched words* instead, so each lexicon ends up
# with differently sized chunks that line up neither with one another nor
# with the sentence-based sentimentr series they are plotted against.
sentence_tokens <- financial_data %>%
  mutate(Sentence_ID = row_number() %/% 80) %>%  # same chunking as sentimentr
  unnest_tokens(word, Sentence)                  # lower-cases by default

# Positive/negative subset of NRC. It has more than one row for some words,
# so the join below is declared many-to-many.
nrc_polarity <- get_sentiments("nrc") %>%
  filter(sentiment %in% c("positive", "negative"))

# Bing: +1 per positive word, -1 per negative word, summed per chunk.
bing_net <- sentence_tokens %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  mutate(score = if_else(sentiment == "positive", 1, -1)) %>%
  group_by(Sentence_ID) %>%
  summarize(net_sentiment = sum(score)) %>%
  mutate(lexicon = "Bing")

# AFINN: sum of the lexicon's numeric word values per chunk.
afinn_net <- sentence_tokens %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(Sentence_ID) %>%
  summarize(net_sentiment = sum(value)) %>%
  mutate(lexicon = "AFINN")

# NRC: scored like Bing, using only its positive/negative labels.
nrc_net <- sentence_tokens %>%
  inner_join(nrc_polarity, by = "word", relationship = "many-to-many") %>%
  mutate(score = if_else(sentiment == "positive", 1, -1)) %>%
  group_by(Sentence_ID) %>%
  summarize(net_sentiment = sum(score)) %>%
  mutate(lexicon = "NRC")
# Sentimentr calculation
# Split the text into sentences explicitly with get_sentences() before
# scoring: sentiment_by() warns when handed a raw character vector because it
# then has to redo sentence boundary detection itself on every call.
sentimentr_net <- financial_data$Sentence %>%
  get_sentences() %>%
  sentiment_by() %>%
  mutate(Sentence_ID = row_number() %/% 80) %>%  # Create Sentence_ID chunks of 80
  group_by(Sentence_ID) %>%
  # Unlike the lexicon series, this one is a per-chunk MEAN of ave_sentiment.
  summarize(net_sentiment = mean(ave_sentiment)) %>%
  mutate(lexicon = "Sentimentr")
sentimentr_net
## # A tibble: 74 × 3
##    Sentence_ID net_sentiment lexicon   
##          <dbl>         <dbl> <chr>     
##  1           0        0.0793 Sentimentr
##  2           1        0.0912 Sentimentr
##  3           2        0.0494 Sentimentr
##  4           3        0.0897 Sentimentr
##  5           4        0.0723 Sentimentr
##  6           5        0.0727 Sentimentr
##  7           6        0.0915 Sentimentr
##  8           7        0.0841 Sentimentr
##  9           8        0.115  Sentimentr
## 10           9        0.0685 Sentimentr
## # ℹ 64 more rows
# Stack the four per-chunk series into one long table; the `lexicon` column
# identifies which method each row came from.
combined_net_sentiment <- bind_rows(
  list(bing_net, afinn_net, nrc_net, sentimentr_net)
)

# Plot comparative sentiment
# NOTE(review): the Sentimentr series is a per-chunk mean while the other
# three are per-chunk sums, so the lines share this y-axis on different scales.
combined_net_sentiment %>%
  ggplot(mapping = aes(x = Sentence_ID, y = net_sentiment, color = lexicon)) +
  geom_line() +
  theme_minimal() +
  labs(
    title = "Comparative Sentiment Analysis Across Lexicons",
    x = "Text Chunk (Sentence_ID)",
    y = "Net Sentiment Score",
    color = "Lexicon"
  )

### Most Common Positive and Negative Words

# Most common positive and negative words using Bing
# Frequency of every (word, sentiment) pair found in the Bing lexicon, most
# frequent first. The join key is given explicitly, matching the other joins
# in this analysis and avoiding the "Joining with `by = ...`" message.
# count() on ungrouped input already returns an ungrouped table, so no
# ungroup() is needed afterwards.
bing_word_counts <- financial_data_clean %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE)
# Visualize most common positive and negative words
# For each sentiment keep the rows with the 10 largest counts, order the words
# by count so the bars are sorted, and draw one free-scaled panel per
# sentiment.
ggplot(
  data = bing_word_counts %>%
    group_by(sentiment) %>%
    slice_max(order_by = n, n = 10) %>%
    ungroup() %>%
    mutate(word = reorder(word, n)),
  mapping = aes(x = n, y = word, fill = sentiment)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(sentiment), scales = "free_y") +
  labs(
    title = "Most Common Positive and Negative Words in Financial Data",
    x = "Contribution to sentiment",
    y = NULL
  )

### Word Cloud of Common Words

# Word frequencies with stop words removed, most frequent first.
word_counts <- count(
  anti_join(financial_data_clean, stop_words, by = "word"),
  word,
  sort = TRUE
)

# Fix the RNG seed so the cloud layout is reproducible between runs.
set.seed(1234)
wordcloud(
  words = word_counts[["word"]],
  freq = word_counts[["n"]],
  min.freq = 2,                     # ignore words seen only once
  max.words = 100,                  # cap the number of words drawn
  random.order = FALSE,             # plot in decreasing frequency
  rot.per = 0.35,                   # proportion of words rotated 90 degrees
  colors = brewer.pal(8, "Dark2")
)

Conclusion

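By using multiple lexicons, we can capture different nuances in sentiment. For example, certain terms in financial contexts may be strongly associated with growth or decline, which could impact financial decision-making. Note that the scores are not on a common scale: Bing and NRC count positive minus negative words, AFINN sums the lexicon's numeric word values, and sentimentr averages a per-sentence score, so compare the direction and shape of the trends rather than their magnitudes. The ability to view sentiment across multiple lexicons provides a deeper, multifaceted understanding of sentiment patterns in financial data.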