This report analyzes recent news headlines for three technology stocks:

- NVIDIA (NVDA)
- Intel (INTC)
- SanDisk (SNDK)

The goal is to compare how these companies are being discussed in the news by using text mining, a word cloud, sentiment analysis, TF-IDF, and stock-price charts.
# Install only the packages that are not already available. Calling
# install.packages() unconditionally would re-download every package on each
# render and needs network access even when nothing is missing.
required_packages <- c(
  "tidyverse", "tidytext", "textdata", "lubridate", "knitr",
  "kableExtra", "httr", "jsonlite", "ggplot2", "RColorBrewer",
  "tidyquant", "scales"
)
missing_packages <- setdiff(required_packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}
library(tidyverse)
library(tidytext)
library(textdata)
library(lubridate)
library(knitr)
library(kableExtra)
library(httr)
library(jsonlite)
library(ggplot2)
library(RColorBrewer)
library(tidyquant)
library(scales)
# Safe sentiment lexicons for knitting
# get_sentiments("afinn") is fetched through the textdata package and can
# trigger an interactive download menu during knitting; get_sentiments("bing")
# ships with tidytext but is wrapped the same way for consistency. These helpers
# use the installed lexicons when available, and otherwise fall back to small
# built-in lexicons so the report still renders without stopping.
# Return the Bing sentiment lexicon as a tibble with `word` and `sentiment`
# columns. If get_sentiments("bing") raises an error, report the reason and
# return a small built-in lexicon (10 positive and 8 negative words) instead,
# so the report still renders.
safe_bing <- function() {
  tryCatch(
    get_sentiments("bing"),
    error = function(e) {
      # Make the fallback visible: results based on 18 words are illustrative only.
      message(
        "Bing lexicon unavailable (", conditionMessage(e),
        "); using the small built-in fallback lexicon."
      )
      tibble(
        word = c(
          "strong", "growth", "bullish", "rise", "rises", "positive",
          "improve", "improves", "wins", "gains",
          "weak", "falls", "pressure", "concern", "concerns",
          "competition", "delays", "negative"
        ),
        sentiment = c(rep("positive", 10), rep("negative", 8))
      )
    }
  )
}
# Return the AFINN sentiment lexicon as a tibble with `word` and `value`
# columns. If get_sentiments("afinn") raises an error (for example because the
# lexicon has not been downloaded and no interactive prompt is possible),
# report the reason and return a small built-in lexicon of 18 scored words
# instead, so the report still renders.
safe_afinn <- function() {
  tryCatch(
    get_sentiments("afinn"),
    error = function(e) {
      # Make the fallback visible: scores based on 18 words are illustrative only.
      message(
        "AFINN lexicon unavailable (", conditionMessage(e),
        "); using the small built-in fallback lexicon."
      )
      tibble(
        word = c(
          "strong", "growth", "bullish", "rise", "rises", "positive",
          "improve", "improves", "wins", "gains",
          "weak", "falls", "pressure", "concern", "concerns",
          "competition", "delays", "negative"
        ),
        value = c(2, 2, 3, 1, 1, 2, 2, 2, 2, 2, -2, -2, -1, -2, -2, -1, -2, -2)
      )
    }
  )
}
# Load both lexicons once up front; the sentiment sections later in the report
# join the headline tokens against these two tibbles.
bing_lexicon <- safe_bing()
afinn_lexicon <- safe_afinn()
## Do you want to download:
## Name: AFINN-111
## URL: http://www2.imm.dtu.dk/pubdb/views/publication_details.php?id=6010
## License: Open Database License (ODbL) v1.0
## Size: 78 KB (cleaned 59 KB)
## Download mechanism: https
This version does not stop the document if your NewsAPI key is missing. If no key is found, it uses a backup headline dataset so the graphs still render.
# Read the NewsAPI key from the environment so the secret never lives in the
# source file. Sys.getenv() returns "" when the variable is not set.
api_key <- Sys.getenv("NEWS_API_KEY")
# For quick testing only, you can paste your key here instead:
# api_key <- ""
# Warn the reader when no key is configured. The test must be against the empty
# string; comparing against a literal key both leaks the credential and never
# fires when the key is actually missing.
if (!nzchar(api_key)) {
  message("No NEWS_API_KEY found. Using backup headline data so the report can still render.")
}
# Fetch recent English-language articles from the NewsAPI "everything" endpoint.
#
# Arguments:
#   query       Search expression sent as the `q` parameter.
#   query_label Company label stored in the `query_label` column of the result.
#   ticker      Ticker symbol stored in the `ticker` column of the result.
#   api_key     NewsAPI key sent as the `apiKey` parameter.
#   page_size   Number of articles requested (`pageSize`); defaults to 25.
#
# Returns a tibble with one row per article (dots in the flattened column names
# are replaced by underscores) plus `query_label` and `ticker` columns. Returns
# an empty tibble when the request cannot be made, the API answers with a
# non-200 status, or no articles come back; the first two cases also raise a
# warning. Callers can therefore fall back to backup data instead of stopping.
fetch_news <- function(query, query_label, ticker, api_key, page_size = 25) {
  # A transport failure (no network, DNS error, ...) raises an R error inside
  # GET(); convert it into a warning so the report keeps rendering.
  response <- tryCatch(
    GET(
      url = "https://newsapi.org/v2/everything",
      query = list(
        q = query,
        language = "en",
        sortBy = "publishedAt",
        pageSize = page_size,
        apiKey = api_key
      )
    ),
    error = function(e) {
      warning("NewsAPI request failed for '", query, "': ", conditionMessage(e))
      NULL
    }
  )
  if (is.null(response)) {
    return(tibble())
  }

  if (status_code(response) != 200) {
    msg <- tryCatch(
      content(response, as = "parsed")$message,
      error = function(e) "Unknown API error"
    )
    # The error body may parse fine but carry no `message` field.
    if (is.null(msg)) {
      msg <- "Unknown API error"
    }
    warning("NewsAPI error for '", query, "': ", msg)
    return(tibble())
  }

  parsed <- content(response, as = "text", encoding = "UTF-8")
  articles <- fromJSON(parsed, flatten = TRUE)$articles
  if (is.null(articles) || length(articles) == 0) {
    return(tibble())
  }

  as_tibble(articles) %>%
    # flatten = TRUE yields names such as "source.name"; use underscores instead.
    rename_with(~ str_replace_all(.x, "\\.", "_")) %>%
    mutate(
      source_name = coalesce(source_name, source_id, "Unknown source"),
      query_label = query_label,
      ticker = ticker
    )
}
# Offline sample of 12 headlines (four per company) used when NewsAPI returns
# nothing. Timestamps are generated relative to the current time, one per day
# going back 12 days.
backup_news <- tibble(
  source_name = rep("Sample News", times = 12),
  title = c(
    "NVIDIA shares rise as AI chip demand stays strong",
    "NVIDIA reports strong growth and bullish investor sentiment",
    "NVIDIA faces pressure from competition in the AI chip market",
    "Intel announces new foundry strategy to improve growth",
    "Intel stock falls after weak guidance concerns investors",
    "Intel gains as analysts see progress in turnaround plan",
    "SanDisk expands storage products as demand improves",
    "SanDisk faces weak pricing pressure in memory market",
    "SanDisk shares rise after positive storage market outlook",
    "NVIDIA wins new enterprise AI customers",
    "Intel delays raise concern among chip investors",
    "SanDisk growth improves after better memory demand"
  ),
  publishedAt = as.character(Sys.time() - days(1:12)),
  # First nine rows: three per company in blocks; last three: one per company.
  query_label = c(
    rep(c("NVIDIA", "Intel", "SanDisk"), each = 3),
    "NVIDIA", "Intel", "SanDisk"
  ),
  ticker = c(
    rep(c("NVDA", "INTC", "SNDK"), each = 3),
    "NVDA", "INTC", "SNDK"
  )
)
# Query NewsAPI once per company when a key is configured; otherwise start
# from an empty tibble.
news_raw <- if (api_key != "") {
  bind_rows(lapply(
    list(
      c("NVIDIA OR NVDA", "NVIDIA", "NVDA"),
      c("Intel OR INTC", "Intel", "INTC"),
      c("SanDisk OR SNDK", "SanDisk", "SNDK")
    ),
    function(spec) fetch_news(spec[[1]], spec[[2]], spec[[3]], api_key)
  ))
} else {
  tibble()
}

# Nothing fetched (no key, or every request came back empty): use the sample.
if (nrow(news_raw) == 0) {
  news_raw <- backup_news
}

glimpse(news_raw)
## Rows: 12
## Columns: 5
## $ source_name <chr> "Sample News", "Sample News", "Sample News", "Sample News"…
## $ title <chr> "NVIDIA shares rise as AI chip demand stays strong", "NVID…
## $ publishedAt <chr> "2026-06-20 18:51:15.905849", "2026-06-19 18:51:15.905849"…
## $ query_label <chr> "NVIDIA", "NVIDIA", "NVIDIA", "Intel", "Intel", "Intel", "…
## $ ticker <chr> "NVDA", "NVDA", "NVDA", "INTC", "INTC", "INTC", "SNDK", "S…
# Normalise raw headlines so near-identical titles can be compared: drop rows
# without a title, parse the publication timestamp into `pub_date`/`pub_day`,
# and build `title_clean` (publisher suffix removed, punctuation replaced by
# spaces, whitespace collapsed, lower-cased).
clean_headlines <- function(news) {
  news %>%
    filter(!is.na(title)) %>%
    mutate(
      pub_date = ymd_hms(publishedAt, quiet = TRUE),
      pub_day = as.Date(pub_date),
      # Strip a trailing " - Publisher" suffix. Whitespace is required on both
      # sides of the dash so hyphenated words ("next-gen", "all-time") are not
      # mistaken for a suffix, which would truncate the headline and create
      # false duplicates.
      title_clean = str_remove(title, "\\s+-\\s+[^-]+$"),
      title_clean = str_squish(str_replace_all(title_clean, "[^[:alnum:][:space:]]", " ")),
      title_clean = str_to_lower(title_clean)
    )
}

# Diagnostic: list headlines whose cleaned title occurs more than once.
clean_headlines(news_raw) %>%
  group_by(title_clean) %>%
  filter(n() > 1) %>%
  arrange(title_clean)
## # A tibble: 0 × 8
## # Groups: title_clean [0]
## # ℹ 8 variables: source_name <chr>, title <chr>, publishedAt <chr>,
## # query_label <chr>, ticker <chr>, pub_date <dttm>, pub_day <date>,
## # title_clean <chr>

# Keep one row per company and cleaned title.
news_clean <- clean_headlines(news_raw) %>%
  distinct(query_label, title_clean, .keep_all = TRUE)

# Rows with a missing title were already dropped in clean_headlines().
news_df <- news_clean
dim(news_df)
## [1] 12 8
# Save the cleaned headlines next to the report.
write.csv(x = news_df, file = "stock_news_df.csv", row.names = FALSE)

# Show the first 12 cleaned headlines as a styled table.
news_df %>%
  head(12) %>%
  select(query_label, ticker, source_name, title, pub_day) %>%
  kable(caption = "Sample Cleaned Stock Headlines") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover"),
    full_width = FALSE
  )
| query_label | ticker | source_name | title | pub_day |
|---|---|---|---|---|
| NVIDIA | NVDA | Sample News | NVIDIA shares rise as AI chip demand stays strong | 2026-06-20 |
| NVIDIA | NVDA | Sample News | NVIDIA reports strong growth and bullish investor sentiment | 2026-06-19 |
| NVIDIA | NVDA | Sample News | NVIDIA faces pressure from competition in the AI chip market | 2026-06-18 |
| Intel | INTC | Sample News | Intel announces new foundry strategy to improve growth | 2026-06-17 |
| Intel | INTC | Sample News | Intel stock falls after weak guidance concerns investors | 2026-06-16 |
| Intel | INTC | Sample News | Intel gains as analysts see progress in turnaround plan | 2026-06-15 |
| SanDisk | SNDK | Sample News | SanDisk expands storage products as demand improves | 2026-06-14 |
| SanDisk | SNDK | Sample News | SanDisk faces weak pricing pressure in memory market | 2026-06-13 |
| SanDisk | SNDK | Sample News | SanDisk shares rise after positive storage market outlook | 2026-06-12 |
| NVIDIA | NVDA | Sample News | NVIDIA wins new enterprise AI customers | 2026-06-11 |
| Intel | INTC | Sample News | Intel delays raise concern among chip investors | 2026-06-10 |
| SanDisk | SNDK | Sample News | SanDisk growth improves after better memory demand | 2026-06-09 |
This table previews the cleaned headline dataset used for the rest of the report. Each row is one headline, and the company and ticker columns make it possible to compare NVIDIA, Intel, and SanDisk separately.
# One row per headline word: split titles into lower-case tokens, drop stop
# words, and discard purely numeric tokens and tokens shorter than 3 characters.
news_tokens <- news_df %>%
  select(query_label, ticker, title) %>%
  unnest_tokens(output = word, input = title) %>%
  anti_join(stop_words, by = "word") %>%
  filter(
    nchar(word) > 2,
    !str_detect(word, "^\\d+$")
  )

head(news_tokens)
## # A tibble: 6 × 3
## query_label ticker word
## <chr> <chr> <chr>
## 1 NVIDIA NVDA nvidia
## 2 NVIDIA NVDA shares
## 3 NVIDIA NVDA rise
## 4 NVIDIA NVDA chip
## 5 NVIDIA NVDA demand
## 6 NVIDIA NVDA stays
Tokenization turns each headline into individual words and removes common stop words. This cleaned word list is the base for the word cloud, sentiment analysis, and TF-IDF results.
The word cloud shows the most common meaningful words in the cleaned stock headlines. Larger words appear more often in the dataset.
# Fix the RNG seed so the random label positions are the same on every render.
set.seed(123)

# Keep the 50 most frequent words; give each a random position in the square
# [-1, 1] x [-1, 1] (x drawn before y) and a text size scaled from its count.
word_cloud_data <- news_tokens %>%
  count(word, sort = TRUE) %>%
  slice_head(n = 50) %>%
  mutate(x = runif(n(), min = -1, max = 1)) %>%
  mutate(y = runif(n(), min = -1, max = 1)) %>%
  mutate(size = scales::rescale(n, to = c(4, 12)))

# Draw the words as text labels, or a titled empty plot when there are none.
if (nrow(word_cloud_data) == 0) {
  ggplot() +
    labs(title = "No words available for word cloud") +
    theme_void()
} else {
  ggplot(
    word_cloud_data,
    aes(x = x, y = y, label = word, size = size, color = n)
  ) +
    # check_overlap hides labels that would be drawn on top of earlier ones.
    geom_text(show.legend = FALSE, check_overlap = TRUE) +
    scale_size_identity() +
    scale_color_gradient(low = "#1b9e77", high = "#d95f02") +
    labs(title = "News Headline Word Cloud - NVIDIA, Intel, and SanDisk") +
    theme_void() +
    theme(
      plot.title = element_text(face = "bold", size = 15, hjust = 0.5)
    )
}
The largest words represent the topics that appear most often in the headline sample. If terms such as chip, AI, growth, demand, pressure, or market appear prominently, that suggests the news coverage is focused on industry trends, company performance, and investor concerns.
# Attach a Bing sentiment label to every headline token found in the lexicon;
# tokens without a match are dropped by the inner join.
sentiment_bing <- inner_join(
  news_tokens,
  bing_lexicon,
  by = "word",
  relationship = "many-to-many"
)
print(sentiment_bing)
## # A tibble: 17 × 4
## query_label ticker word sentiment
## <chr> <chr> <chr> <chr>
## 1 NVIDIA NVDA strong positive
## 2 NVIDIA NVDA strong positive
## 3 NVIDIA NVDA bullish positive
## 4 Intel INTC improve positive
## 5 Intel INTC falls negative
## 6 Intel INTC weak negative
## 7 Intel INTC guidance positive
## 8 Intel INTC concerns negative
## 9 Intel INTC gains positive
## 10 Intel INTC progress positive
## 11 SanDisk SNDK improves positive
## 12 SanDisk SNDK weak negative
## 13 SanDisk SNDK positive positive
## 14 NVIDIA NVDA wins positive
## 15 Intel INTC delays negative
## 16 Intel INTC concern negative
## 17 SanDisk SNDK improves positive
# Bar chart of the (up to) ten most frequent words per sentiment class, or a
# titled empty plot when no token matched the lexicon.
if (nrow(sentiment_bing) == 0) {
  ggplot() +
    labs(title = "No Bing sentiment words found") +
    theme_void()
} else {
  sentiment_bing %>%
    count(word, sentiment, sort = TRUE) %>%
    group_by(sentiment) %>%
    slice_max(n, n = 10, with_ties = FALSE) %>%
    ungroup() %>%
    # Order the bars by frequency rather than alphabetically.
    mutate(word = reorder(word, n)) %>%
    ggplot(aes(x = n, y = word, fill = sentiment)) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~ sentiment, scales = "free_y") +
    scale_fill_manual(
      values = c("positive" = "#2ecc71", "negative" = "#e74c3c")
    ) +
    labs(
      title = "Top Words Driving Sentiment in Stock News Headlines",
      x = "Frequency",
      y = NULL
    ) +
    theme_minimal() +
    theme(
      plot.title = element_text(face = "bold", size = 14),
      strip.text = element_text(face = "bold", size = 12)
    )
}
This chart identifies which positive and negative words appear most often across all headlines. Words with higher counts have more influence on the overall sentiment results.
# Side-by-side bars of positive and negative word counts for each company, or
# a titled empty plot when no token matched the lexicon.
if (nrow(sentiment_bing) == 0) {
  ggplot() +
    labs(title = "No Bing sentiment words found") +
    theme_void()
} else {
  sentiment_bing %>%
    count(query_label, ticker, sentiment) %>%
    ggplot(aes(x = query_label, y = n, fill = sentiment)) +
    geom_col(position = "dodge") +
    scale_fill_manual(
      values = c("positive" = "#2ecc71", "negative" = "#e74c3c")
    ) +
    labs(
      title = "Sentiment Comparison Across Stocks",
      subtitle = "NVIDIA vs. Intel vs. SanDisk",
      x = "Company",
      y = "Number of Sentiment Words",
      fill = "Sentiment"
    ) +
    theme_minimal() +
    theme(
      plot.title = element_text(face = "bold", size = 14),
      legend.position = "bottom"
    )
}
This chart compares the volume of positive and negative sentiment words for each company. A company with more sentiment words may have more news coverage, so the positive-versus-negative balance is more important than the raw count alone.
# Horizontal bar chart of the 20 most frequent headline words, or a titled
# empty plot when there are no tokens.
if (nrow(news_tokens) == 0) {
  ggplot() +
    labs(title = "No words available to plot") +
    theme_void()
} else {
  news_tokens %>%
    count(word, sort = TRUE) %>%
    slice_head(n = 20) %>%
    # Order the bars by count rather than alphabetically.
    mutate(word = fct_reorder(word, n)) %>%
    ggplot(aes(x = n, y = word, fill = n)) +
    geom_col(show.legend = FALSE) +
    scale_fill_gradient(low = "#a8d8ea", high = "#0077b6") +
    labs(
      title = "Top 20 Words in Stock News Headlines",
      x = "Count",
      y = NULL,
      caption = "Source: NewsAPI or backup sample data"
    ) +
    theme_minimal(base_size = 13)
}
This chart shows the most frequent words in the headline dataset after stop words are removed. These words summarize the main themes appearing in the news coverage.
# Per-company AFINN summary: number of matched words, mean score (rounded to
# three decimals) and total score, sorted from most to least positive mean.
afinn_summary <- inner_join(news_tokens, afinn_lexicon, by = "word") %>%
  group_by(query_label, ticker) %>%
  summarise(
    words_matched = n(),
    mean_sentiment = round(mean(value), 3),
    sum_sentiment = sum(value),
    .groups = "drop"
  ) %>%
  arrange(desc(mean_sentiment))

kable(afinn_summary, caption = "AFINN Sentiment Score by Stock") %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
| query_label | ticker | words_matched | mean_sentiment | sum_sentiment |
|---|---|---|---|---|
| NVIDIA | NVDA | 8 | 1.250 | 10 |
| SanDisk | SNDK | 7 | 0.857 | 6 |
| Intel | INTC | 8 | -0.500 | -4 |
The AFINN table gives each stock an average sentiment score based on matched emotional words. Higher mean sentiment suggests more positive headline language, while lower or negative values suggest more cautious or unfavorable language.
# Per-company counts of positive and negative Bing words plus their difference
# (`net` = positive - negative), sorted from most to least positive.
bing_split <- news_tokens %>%
  # Same join contract as the other Bing join in this report.
  inner_join(bing_lexicon, by = "word", relationship = "many-to-many") %>%
  # Fix both sentiment levels so that pivot_wider() always creates a
  # `negative` and a `positive` column, even when every matched word falls
  # into one class; otherwise the `net` calculation fails on a missing column.
  mutate(sentiment = factor(sentiment, levels = c("negative", "positive"))) %>%
  count(query_label, ticker, sentiment) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0L,
    names_expand = TRUE
  ) %>%
  mutate(net = positive - negative) %>%
  arrange(desc(net))

bing_split %>%
  kable(caption = "Bing Sentiment Count by Stock") %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
| query_label | ticker | negative | positive | net |
|---|---|---|---|---|
| NVIDIA | NVDA | 0 | 4 | 4 |
| SanDisk | SNDK | 1 | 3 | 2 |
| Intel | INTC | 5 | 4 | -1 |
The Bing split table separates positive and negative words by company. The net column is positive words minus negative words, which gives a simple comparison of whether each stock’s headlines lean positive or negative.
# Faceted bar chart of the six highest TF-IDF terms per company (each company
# is treated as one document), or a titled empty plot when there are no tokens.
if (nrow(news_tokens) == 0) {
  ggplot() +
    labs(title = "No terms available for TF-IDF") +
    theme_void()
} else {
  news_tokens %>%
    count(query_label, word) %>%
    bind_tf_idf(word, query_label, n) %>%
    group_by(query_label) %>%
    slice_max(tf_idf, n = 6, with_ties = FALSE) %>%
    ungroup() %>%
    # reorder_within() + scale_y_reordered() sort the bars inside each facet.
    mutate(word = reorder_within(word, tf_idf, query_label)) %>%
    ggplot(aes(x = tf_idf, y = word, fill = query_label)) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~ query_label, scales = "free_y") +
    scale_y_reordered() +
    scale_fill_brewer(palette = "Set1") +
    labs(
      title = "Top TF-IDF Terms by Stock",
      x = "TF-IDF Score",
      y = NULL,
      caption = "Source: NewsAPI or backup sample data"
    ) +
    theme_minimal(base_size = 12)
}
TF-IDF highlights words that are especially distinctive for each company, not just common overall. These terms help explain what makes the news coverage for each stock different from the others.
This version uses possibly() so one bad ticker does not
stop the report.
# Download one year of daily prices for a single ticker. Returns an empty
# tibble instead of failing, so one bad ticker does not stop the report.
safe_tq_get <- possibly(function(sym) {
  # %m-% rolls back to the last valid day of the month; a plain `- years(1)`
  # yields NA when today is 29 February.
  prices <- tq_get(
    sym,
    get = "stock.prices",
    from = Sys.Date() %m-% years(1),
    to = Sys.Date()
  )
  # NOTE(review): tq_get() is understood to warn and return NA (rather than
  # raise an error) when a download fails, which possibly() alone would not
  # catch; anything that is not a data frame is treated as "no data".
  if (!is.data.frame(prices)) {
    return(tibble())
  }
  prices
}, otherwise = tibble())

stock_prices <- map_dfr(c("NVDA", "INTC", "SNDK"), safe_tq_get)

# No prices at all (e.g. offline): use a 10-day synthetic series per ticker so
# the charts below still render.
if (nrow(stock_prices) == 0) {
  stock_prices <- tibble(
    symbol = rep(c("NVDA", "INTC", "SNDK"), each = 10),
    date = rep(seq.Date(Sys.Date() - 9, Sys.Date(), by = "day"), times = 3),
    adjusted = c(
      seq(100, 120, length.out = 10),
      seq(100, 92, length.out = 10),
      seq(100, 110, length.out = 10)
    )
  )
}

head(stock_prices)
## # A tibble: 6 × 8
## symbol date open high low close volume adjusted
## <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 NVDA 2025-06-23 142. 145. 142. 144. 154308900 144.
## 2 NVDA 2025-06-24 146. 148. 146. 148. 187566100 148.
## 3 NVDA 2025-06-25 149. 154. 149. 154. 269146500 154.
## 4 NVDA 2025-06-26 156. 157. 154 155. 198145700 155.
## 5 NVDA 2025-06-27 156. 159. 155. 158. 263234500 158.
## 6 NVDA 2025-06-30 158. 159. 156. 158. 194580300 158.
# Line chart of the adjusted closing price, one coloured line per ticker.
ggplot(stock_prices, aes(x = date, y = adjusted, color = symbol)) +
  geom_line(linewidth = 1) +
  scale_y_continuous(labels = dollar) +
  labs(
    title = "Adjusted Stock Prices Over the Past Year",
    subtitle = "NVDA, INTC, and SNDK",
    x = "Date",
    y = "Adjusted Closing Price",
    color = "Ticker"
  ) +
  theme_minimal()
This chart shows the adjusted closing price for each stock over the past year. It helps compare headline sentiment with actual market price movement.
# Rebase every ticker so its earliest adjusted price equals 100.
stock_indexed <- stock_prices %>%
  arrange(date) %>%
  mutate(
    indexed_price = adjusted / first(adjusted) * 100,
    .by = symbol
  )

# Line chart of the rebased series, one coloured line per ticker.
ggplot(stock_indexed, aes(x = date, y = indexed_price, color = symbol)) +
  geom_line(linewidth = 1) +
  labs(
    title = "Indexed Stock Performance",
    subtitle = "Each stock starts at 100",
    x = "Date",
    y = "Indexed Price",
    color = "Ticker"
  ) +
  theme_minimal()
Indexing each stock to 100 makes it easier to compare relative performance even though the stocks trade at different prices. A line above 100 means the stock is up from the start of the period, while a line below 100 means it is down.
# Simple day-over-day return per ticker; the first day of each ticker has no
# previous price and is dropped.
daily_returns <- stock_prices %>%
  arrange(date) %>%
  mutate(
    daily_return = adjusted / lag(adjusted) - 1,
    .by = symbol
  ) %>%
  filter(!is.na(daily_return))

# Line chart of the daily returns, one semi-transparent line per ticker.
ggplot(daily_returns, aes(x = date, y = daily_return, color = symbol)) +
  geom_line(alpha = 0.7) +
  scale_y_continuous(labels = percent) +
  labs(
    title = "Daily Returns by Stock",
    x = "Date",
    y = "Daily Return",
    color = "Ticker"
  ) +
  theme_minimal()
Daily returns show short-term changes in stock performance. Larger swings indicate more volatility, while steady movement suggests a more stable price pattern.
# Per-ticker summary of daily returns: mean, standard deviation, best and worst
# day. The statistics are computed as numbers first and then formatted as
# percentages with two decimals.
daily_returns %>%
  group_by(symbol) %>%
  summarise(
    average_daily_return = mean(daily_return, na.rm = TRUE),
    volatility = sd(daily_return, na.rm = TRUE),
    best_day = max(daily_return, na.rm = TRUE),
    worst_day = min(daily_return, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(across(-symbol, ~ percent(.x, accuracy = 0.01))) %>%
  kable(caption = "Daily Return Summary by Stock") %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
| symbol | average_daily_return | volatility | best_day | worst_day |
|---|---|---|---|---|
| INTC | 0.85% | 4.74% | 23.60% | -17.03% |
| NVDA | 0.18% | 2.23% | 7.87% | -6.20% |
| SNDK | 1.75% | 6.35% | 27.56% | -20.33% |
This summary table compares average daily return, volatility, best day, and worst day for each stock. It provides a quick view of both performance and risk over the period analyzed.
I’m comparing NVIDIA, Intel, and SanDisk using both news sentiment and stock price data. The sentiment analysis shows whether recent headlines use more positive or negative language, while the TF-IDF analysis identifies the words that make each company’s coverage distinctive. The stock-price section adds financial context by showing how the companies performed over the past year.