This report provides a static sentiment analysis of a given set of news headlines. It processes the text, calculates sentiment scores using a chosen lexicon, and visualizes the results. All code is hidden by default but can be expanded for review.
This first section loads all necessary libraries and defines the input data for the analysis.
# Load Libraries
library(dplyr)
library(tidytext)
library(ggplot2)
library(stringr)
library(wordcloud)
library(RColorBrewer)
library(DT)
library(knitr)
# Default knitr chunk options applied to every chunk in this report
knitr::opts_chunk$set(
  fig.align = "center", # centre every figure on the page
  warning = FALSE,      # keep warnings out of the rendered output
  message = FALSE,      # keep package/start-up messages out of the output
  echo = TRUE           # include each chunk's source code in the output
)
To run a new report, change the `headline_input` text and the
`lexicon_choice` value in the code below, then re-knit the document.
# Report input: paste the headlines between the quotes, one headline per line.
# Blank lines and leading/trailing whitespace are dropped by analyze_sentiments().
# A literal double quote inside a headline must be written as \" so that it
# does not end the string early.
headline_input <- "
Treasury wary of National State Enterprises Bill – concerns raised about holding company for SOEs
Thousands of vehicle recalls in SA raise questions over standards
Miners' stocks soar along with copper prices – Anglo American, BHP and Glencore
Nedbank nabs FNB Business CEO Andiswa Bata
US treasury head to skip G20 meeting
Activity in residential property market cools
Prosus prices €750m bond
Stefanutti Stocks sells its Mozambique and Mauritius operations
Tharisa reports higher output but lowers its guidance
Merck poised to land $10bn biotech deal
Starlink lands licence to operate in India
SA central bank could lower CPI goal in July
Obsidian fund shuns SA bonds for Latin American yield
"
# Sentiment lexicon used for scoring: "bing", "afinn", or "nrc".
# The value is matched exactly; the code below treats anything other than
# "bing" or "afinn" as the NRC lexicon.
lexicon_choice <- "bing"
The following code processes the raw text into a structured data frame, calculating a sentiment score for each headline based on the chosen lexicon.
#' Score the sentiment of a batch of headlines
#'
#' Splits `headlines_text` into one headline per line, tokenises every
#' headline into words, matches the words against the chosen lexicon and
#' aggregates the matches into a single score per headline.
#'
#' Scoring rules:
#' * `"bing"`: each matched word counts +1 (positive) or -1 (negative).
#' * `"afinn"`: the lexicon's `value` column is summed per headline.
#' * anything else is scored with the NRC lexicon as the number of
#'   positive word matches minus the number of negative word matches. A
#'   warning is raised when the value is not literally `"nrc"`, so that a
#'   typo does not silently change the scoring method.
#'
#' @param headlines_text Character input holding one headline per line.
#'   Blank lines, `NA`s and surrounding whitespace are discarded.
#' @param lexicon_choice Lexicon name: `"bing"` (default), `"afinn"` or
#'   `"nrc"`.
#' @return `NULL` when the input contains no headline; otherwise a data frame
#'   with one row per headline and the columns `headline_id`, `headline`,
#'   `score` (0 when no word matched the lexicon) and `label` (`"Positive"`,
#'   `"Negative"` or `"Neutral"`). For the NRC lexicon the per-headline
#'   `positive` and `negative` match counts are included as well (`NA` for
#'   headlines without any match).
analyze_sentiments <- function(headlines_text, lexicon_choice = "bing") {
  # One headline per line; as.character() lets NULL input reach the
  # "nothing to analyse" exit below instead of failing inside strsplit().
  headlines <- trimws(unlist(strsplit(as.character(headlines_text), "\n")))
  headlines <- headlines[!is.na(headlines) & nzchar(headlines)]
  if (length(headlines) == 0) {
    return(NULL)
  }

  if (!lexicon_choice %in% c("bing", "afinn", "nrc")) {
    warning(
      "Unrecognised lexicon_choice '", lexicon_choice,
      "'; scoring with the \"nrc\" lexicon instead.",
      call. = FALSE
    )
  }

  df <- data.frame(
    headline_id = seq_along(headlines),
    headline = headlines,
    stringsAsFactors = FALSE
  )

  if (lexicon_choice %in% c("bing", "afinn")) {
    lexicon <- get_sentiments(lexicon_choice)
    if (lexicon_choice == "bing") {
      # bing only labels words; convert the labels to a numeric +1 / -1 value
      lexicon <- lexicon %>%
        mutate(value = ifelse(sentiment == "positive", 1, -1))
    }

    sentiment_scored <- df %>%
      # drop = FALSE keeps the headline text, which is needed as a join key
      unnest_tokens(word, headline, drop = FALSE) %>%
      inner_join(lexicon, by = "word") %>%
      group_by(headline_id, headline) %>%
      summarise(score = sum(value), .groups = "drop")
  } else {
    # Keep only the polarity rows *before* joining: NRC lists the same word
    # under several emotion categories, and joining all of them first only
    # multiplies rows that are discarded straight away.
    nrc_polarity <- get_sentiments("nrc") %>%
      filter(sentiment %in% c("positive", "negative"))

    sentiment_scored <- df %>%
      unnest_tokens(word, headline, drop = FALSE) %>%
      inner_join(nrc_polarity, by = "word") %>%
      group_by(headline_id, headline) %>%
      summarise(
        positive = sum(sentiment == "positive"),
        negative = sum(sentiment == "negative"),
        .groups = "drop"
      ) %>%
      mutate(score = positive - negative)
  }

  # Headlines without any lexicon match are absent from sentiment_scored;
  # the left join brings them back and they are scored 0 ("Neutral").
  df %>%
    left_join(sentiment_scored, by = c("headline_id", "headline")) %>%
    mutate(
      score = ifelse(is.na(score), 0, score),
      label = case_when(
        score > 0 ~ "Positive",
        score < 0 ~ "Negative",
        TRUE ~ "Neutral"
      )
    )
}
# Score the configured headlines once; every later section reads this object.
# It is NULL when headline_input contains no non-blank line.
analysis_results <- analyze_sentiments(headline_input, lexicon_choice)
# Overall assessment: counts the headlines per sentiment label and writes a
# summary box (an HTML <div> wrapping a markdown heading and bullet list).
# The verdict compares only the positive and negative counts; neutral
# headlines are reported but do not influence it.
# NOTE(review): the cat() output is raw markup, so this presumably runs in a
# chunk with results = "asis" — confirm against the chunk header.
if (!is.null(analysis_results) && nrow(analysis_results) > 0) {
  data <- analysis_results
  neg_news <- sum(data$label == "Negative")
  pos_news <- sum(data$label == "Positive")
  neu_news <- sum(data$label == "Neutral")

  if (pos_news > neg_news) {
    summary_text <- "Overall sentiment is 'Positive', suggesting bullish signals."
    summary_class <- "summary-positive"
  } else if (neg_news > pos_news) {
    summary_text <- "Overall sentiment is 'Negative', indicating potential bearish signals or risks."
    summary_class <- "summary-negative"
  } else {
    summary_text <- "Sentiment appears 'Mixed or Neutral'. Monitor closely for market-moving catalysts."
    summary_class <- "summary-neutral"
  }

  # The blank lines after the opening tag and before the closing tag matter:
  # a markdown heading or list is only recognised at the start of its own
  # block, not when it shares a line with the <div> tag.
  cat(
    "<div class='summary-box ", summary_class, "'>\n\n",
    "#### Overall Assessment: ", summary_text, "\n\n",
    "- **Total Headlines Analyzed:** ", nrow(data), "\n",
    "- **Positive Headlines:** ", pos_news, "\n",
    "- **Negative Headlines:** ", neg_news, "\n",
    "- **Neutral Headlines:** ", neu_news, "\n",
    "\n</div>\n",
    sep = ""
  )
} else {
  cat("No headlines were processed or found. Please check your inputs.")
}
# Sentiment overview plot. For the bing/afinn lexicons this is a histogram of
# the per-headline scores; for any other lexicon choice it is a bar chart of
# how many headline words fall into each NRC emotion category.
if (!is.null(analysis_results) && nrow(analysis_results) > 0) {
  if (!(lexicon_choice %in% c("bing", "afinn"))) {
    # NRC: count lexicon matches per category, leaving out the two polarity
    # labels so that only the emotion categories are plotted.
    analysis_results %>%
      unnest_tokens(word, headline) %>%
      inner_join(get_sentiments("nrc"), by = "word") %>%
      count(sentiment, sort = TRUE) %>%
      filter(!(sentiment %in% c("positive", "negative"))) %>%
      ggplot(aes(x = reorder(sentiment, n), y = n, fill = sentiment)) +
      geom_col(show.legend = FALSE) +
      coord_flip() +
      labs(
        title = "NRC Emotion Category Word Count",
        x = "Emotion",
        y = "Total Word Count"
      ) +
      theme_minimal(base_size = 15)
  } else {
    # bing / afinn: one bin per integer score, coloured by sentiment label
    analysis_results %>%
      ggplot(aes(x = score, fill = label)) +
      geom_histogram(binwidth = 1, alpha = 0.8, position = "identity") +
      scale_fill_manual(
        name = "Sentiment",
        values = c(
          "Positive" = "#2c7bb6",
          "Negative" = "#d7191c",
          "Neutral" = "#fdae61"
        )
      ) +
      labs(
        title = "Distribution of Sentiment Scores",
        x = "Sentiment Score (Positive - Negative)",
        y = "Number of Headlines"
      ) +
      theme_minimal(base_size = 15)
  }
}
library(tibble)
# Word cloud section.
# * bing / afinn: a comparison cloud that contrasts negative and positive
#   words. The bing lexicon supplies the polarity labels in both cases.
# * any other lexicon choice, or no words left after stop-word removal:
#   a plain frequency word cloud, or a placeholder message when empty.
if (!is.null(analysis_results) && nrow(analysis_results) > 0) {
  cloud_data <- analysis_results %>%
    unnest_tokens(word, headline) %>%
    anti_join(stop_words, by = "word")

  if (lexicon_choice %in% c("bing", "afinn") && nrow(cloud_data) > 0) {
    # One row per word with its negative / positive occurrence counts
    comparison_data <- cloud_data %>%
      inner_join(get_sentiments("bing"), by = "word") %>%
      count(word, sentiment, sort = TRUE) %>%
      tidyr::pivot_wider(
        names_from = sentiment,
        values_from = n,
        values_fill = 0
      )

    if (nrow(comparison_data) == 0) {
      plot.new()
      text(x = 0.5, y = 0.5, "No sentiment words found for comparison cloud.")
    } else if (!all(c("negative", "positive") %in% names(comparison_data))) {
      # comparison.cloud() normalises every column by its total, so a polarity
      # with no words at all (0 / 0) cannot be drawn; show a message instead.
      plot.new()
      text(
        x = 0.5, y = 0.5,
        "Comparison cloud needs both positive and negative words."
      )
    } else {
      comparison_data %>%
        # Colours are matched to columns by position, and pivot_wider() orders
        # its columns by first appearance in the data. Fixing the order keeps
        # negative = red and positive = blue, as in the score histogram.
        select(word, negative, positive) %>%
        # comparison.cloud() expects a numeric matrix with words as row names
        column_to_rownames("word") %>%
        as.matrix() %>%
        comparison.cloud(
          colors = c("#d7191c", "#2c7bb6"),
          max.words = 150,
          title.size = 1.5,
          scale = c(4, 0.8),
          random.order = FALSE
        )
    }
  } else { # Fallback for NRC lexicon or if no data
    word_counts <- count(cloud_data, word, sort = TRUE)
    if (nrow(word_counts) > 0) {
      wordcloud(
        words = word_counts$word,
        freq = word_counts$n,
        # wordcloud() defaults to min.freq = 3, which would drop nearly every
        # word from a short list of headlines; plot words that occur once too.
        min.freq = 1,
        max.words = 100,
        random.order = FALSE,
        colors = brewer.pal(8, "Dark2")
      )
    } else {
      plot.new()
      text(x = 0.5, y = 0.5, "Not enough words to generate a word cloud.")
    }
  }
}
# Interactive results table: one row per headline with its score and label.
# Headlines longer than 80 characters are shortened for display and carry the
# full text as a hover tooltip; scores are coloured red / gray / blue for
# negative / zero / positive values.
if (!is.null(analysis_results) && nrow(analysis_results) > 0) {
  data_for_table <- analysis_results %>%
    select(Headline = headline, `Sentiment Score` = score, `Label` = label)

  datatable(
    data_for_table,
    extensions = "Responsive",
    options = list(
      pageLength = 10,
      columnDefs = list(
        list(
          # Column 0 is the headline because row names are not shown
          targets = 0,
          render = JS(
            "function(data, type, row, meta) {",
            "  if (type !== 'display' || typeof data !== 'string' || data.length <= 80) {",
            "    return data;",
            "  }",
            # Double quotes must be escaped before the text is placed inside
            # title=\"...\", otherwise a quoted headline ends the attribute early.
            "  var tip = data.replace(/\"/g, '&quot;');",
            "  return '<span title=\"' + tip + '\">' + data.substring(0, 80) + '...</span>';",
            "}"
          )
        )
      )
    ),
    rownames = FALSE
  ) %>%
    formatStyle(
      "Sentiment Score",
      # Cut points at +/-0.5 separate negative, zero and positive scores
      color = styleInterval(c(-0.5, 0.5), c("#d7191c", "gray", "#2c7bb6")),
      fontWeight = "bold"
    )
}