sentiments
Datasets
We’ll use the tidytext package, which provides access to several sentiment lexicons. Three commonly used general-purpose lexicons are AFINN, Bing, and NRC.
# Load necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidytext)
library(sentimentr)
library(dplyr)
library(ggplot2)
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
# Download the labelled financial sentences; the CSV has two character
# columns, `Sentence` and `Sentiment`.
financial_data <- read_csv(
  file = "https://raw.githubusercontent.com/Amish22/DS607/refs/heads/main/financial_data.csv"
)
## Rows: 5842 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Sentence, Sentiment
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Build a one-token-per-row table: a lower-cased copy of each sentence is
# split into words, while the original `Sentence` and its `Sentiment` label
# are carried along on every token row.
financial_data_clean <- unnest_tokens(
  mutate(financial_data, text = str_to_lower(Sentence)),
  output = word,
  input = text
)
# Preview the first rows of the tokenized table.
head(financial_data_clean)
## # A tibble: 6 × 3
## Sentence Sentiment word
## <chr> <chr> <chr>
## 1 The GeoSolutions technology will leverage Benefon 's GPS solu… positive the
## 2 The GeoSolutions technology will leverage Benefon 's GPS solu… positive geos…
## 3 The GeoSolutions technology will leverage Benefon 's GPS solu… positive tech…
## 4 The GeoSolutions technology will leverage Benefon 's GPS solu… positive will
## 5 The GeoSolutions technology will leverage Benefon 's GPS solu… positive leve…
## 6 The GeoSolutions technology will leverage Benefon 's GPS solu… positive bene…
Let’s examine the distribution of positive and negative sentiment within the financial data using the three lexicons.
# Bing sentiment
# Count the tokens that the Bing lexicon labels positive or negative, place
# the two counts side by side, and use their difference as the overall score.
bing_sentiment <- financial_data_clean %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(sentiment) %>%
  # pivot_wider() is the maintained replacement for the superseded spread().
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment_score = positive - negative)
bing_sentiment
## # A tibble: 1 × 3
## negative positive sentiment_score
## <dbl> <dbl> <dbl>
## 1 1327 1880 553
# AFINN sentiment
# AFINN attaches a numeric `value` to each word; the overall score is the sum
# of those values over every token that appears in the lexicon.
afinn_sentiment <- summarize(
  inner_join(financial_data_clean, get_sentiments("afinn"), by = "word"),
  sentiment_score = sum(value)
)
afinn_sentiment
## # A tibble: 1 × 1
## sentiment_score
## <dbl>
## 1 2330
# NRC sentiment
# Keep only the positive/negative categories of NRC, count the matching
# tokens per category, and use positive minus negative as the overall score.
nrc_polarity <- get_sentiments("nrc") %>%
  filter(sentiment %in% c("positive", "negative"))
nrc_sentiment <- financial_data_clean %>%
  # A token can occur many times in the text and a lexicon word can carry more
  # than one label, so the join is many-to-many by design; declaring it keeps
  # dplyr from warning about an unexpected relationship.
  inner_join(nrc_polarity, by = "word", relationship = "many-to-many") %>%
  count(sentiment) %>%
  # pivot_wider() is the maintained replacement for the superseded spread().
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment_score = positive - negative)
nrc_sentiment
## # A tibble: 1 × 3
## negative positive sentiment_score
## <dbl> <dbl> <dbl>
## 1 1887 5273 3386
# Net Bing sentiment per chunk of 80 sentences.
# The chunk id is derived from the row index of the sentence each token came
# from. Numbering rows after the lexicon join would instead count matched
# words, so a given chunk id would cover a different part of the corpus for
# every lexicon and the series could not be compared on a shared x-axis.
bing_net <- financial_data %>%
  mutate(sentence_index = row_number()) %>%
  # unnest_tokens() lower-cases tokens by default, so they match the lexicon.
  unnest_tokens(word, Sentence) %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  mutate(score = ifelse(sentiment == "positive", 1, -1)) %>%
  group_by(Sentence_ID = sentence_index %/% 80) %>%
  summarize(net_sentiment = sum(score)) %>%
  mutate(lexicon = "Bing")
# Net AFINN sentiment per chunk of 80 sentences.
# The chunk id is derived from the row index of the sentence each token came
# from. Numbering rows after the lexicon join would instead count matched
# words, so a given chunk id would cover a different part of the corpus for
# every lexicon and the series could not be compared on a shared x-axis.
afinn_net <- financial_data %>%
  mutate(sentence_index = row_number()) %>%
  # unnest_tokens() lower-cases tokens by default, so they match the lexicon.
  unnest_tokens(word, Sentence) %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(Sentence_ID = sentence_index %/% 80) %>%
  summarize(net_sentiment = sum(value)) %>%
  mutate(lexicon = "AFINN")
# Net NRC sentiment (positive/negative categories only) per chunk of 80
# sentences.
# The chunk id is derived from the row index of the sentence each token came
# from. Numbering rows after the lexicon join would instead count matched
# words, so a given chunk id would cover a different part of the corpus for
# every lexicon and the series could not be compared on a shared x-axis.
nrc_net <- financial_data %>%
  mutate(sentence_index = row_number()) %>%
  # unnest_tokens() lower-cases tokens by default, so they match the lexicon.
  unnest_tokens(word, Sentence) %>%
  inner_join(
    get_sentiments("nrc") %>%
      filter(sentiment %in% c("positive", "negative")),
    by = "word",
    # Repeated tokens meet lexicon words that can carry more than one label;
    # declaring the relationship keeps dplyr from warning about it.
    relationship = "many-to-many"
  ) %>%
  mutate(score = ifelse(sentiment == "positive", 1, -1)) %>%
  group_by(Sentence_ID = sentence_index %/% 80) %>%
  summarize(net_sentiment = sum(score)) %>%
  mutate(lexicon = "NRC")
# Sentimentr calculation
# Split the text into sentences explicitly with get_sentences() before
# scoring; passing a raw character vector to sentiment_by() makes it redo
# sentence boundary detection on every call and emit a warning about it.
sentimentr_net <- financial_data$Sentence %>%
  get_sentences() %>%
  sentiment_by() %>%
  mutate(Sentence_ID = row_number() %/% 80) %>% # Create Sentence_ID chunks of 80
  group_by(Sentence_ID) %>%
  # Unlike the word-count lexicons, this series is the mean of the per-text
  # average sentiment within each chunk, not a sum.
  summarize(net_sentiment = mean(ave_sentiment)) %>%
  mutate(lexicon = "Sentimentr")
sentimentr_net
## # A tibble: 74 × 3
## Sentence_ID net_sentiment lexicon
## <dbl> <dbl> <chr>
## 1 0 0.0793 Sentimentr
## 2 1 0.0912 Sentimentr
## 3 2 0.0494 Sentimentr
## 4 3 0.0897 Sentimentr
## 5 4 0.0723 Sentimentr
## 6 5 0.0727 Sentimentr
## 7 6 0.0915 Sentimentr
## 8 7 0.0841 Sentimentr
## 9 8 0.115 Sentimentr
## 10 9 0.0685 Sentimentr
## # ℹ 64 more rows
# Stack the per-chunk series of all four methods into one long table; the
# `lexicon` column identifies which method each row came from.
combined_net_sentiment <- bind_rows(
  bing_net,
  afinn_net,
  nrc_net,
  sentimentr_net
)
# Plot comparative sentiment: one line per lexicon over the chunk index.
ggplot(data = combined_net_sentiment) +
  geom_line(
    mapping = aes(x = Sentence_ID, y = net_sentiment, color = lexicon)
  ) +
  ggtitle("Comparative Sentiment Analysis Across Lexicons") +
  xlab("Text Chunk (Sentence_ID)") +
  ylab("Net Sentiment Score") +
  labs(color = "Lexicon") +
  theme_minimal()
### Most Common Positive and Negative Words
# Most common positive and negative words using Bing
# Tally how often each Bing-labelled word occurs, most frequent first.
bing_word_counts <- financial_data_clean %>%
  # Name the join key explicitly so the join does not depend on (or report)
  # whichever column names the two tables happen to share.
  inner_join(get_sentiments("bing"), by = "word") %>%
  # count() on an ungrouped table returns an ungrouped result, so no
  # ungroup() is needed afterwards.
  count(word, sentiment, sort = TRUE)
# Visualize most common positive and negative words
# Take the ten highest counts within each sentiment, order the words by count
# so the bars are sorted, and draw one panel per sentiment.
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(mapping = aes(x = n, y = word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  # Free y scales let each panel list only its own words.
  facet_wrap(vars(sentiment), scales = "free_y") +
  xlab("Contribution to sentiment") +
  ylab(NULL) +
  ggtitle("Most Common Positive and Negative Words in Financial Data")
### Word Cloud of Common Words
# Word frequencies with stop words removed, most frequent first.
word_counts <- count(
  anti_join(financial_data_clean, stop_words, by = "word"),
  word,
  sort = TRUE
)
# Seed the random number generator before drawing the cloud.
set.seed(1234)
# Draw up to 100 words that occur at least twice; random.order = FALSE plots
# words in decreasing frequency, and rot.per sets the proportion of rotated
# words.
wordcloud(
  words = word_counts[["word"]],
  freq = word_counts[["n"]],
  min.freq = 2,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(n = 8, name = "Dark2")
)
By using multiple lexicons, we can capture different nuances in sentiment. For example, certain terms in financial contexts may be strongly associated with growth or decline, which could impact financial decision-making. The ability to view sentiment across multiple lexicons provides a deeper, multifaceted understanding of sentiment patterns in financial data.