1. Project Overview

This report analyzes recent news headlines for three technology stocks:

NVIDIA Corporation (NVDA)
Intel Corporation (INTC)
SanDisk Corporation (SNDK)

The goal is to compare how these companies are being discussed in the news by using text mining, a word cloud, sentiment analysis, TF-IDF, and stock-price charts.

2. Setup and Package Installation

# Install only the packages that are not already available. Reinstalling every
# package on each knit is slow, and install.packages() fails in a
# non-interactive session when no CRAN mirror has been chosen, so the mirror is
# given explicitly.
required_packages <- c(
  "tidyverse", "tidytext", "textdata", "lubridate", "knitr",
  "kableExtra", "httr", "jsonlite", "ggplot2", "RColorBrewer",
  "tidyquant", "scales"
)

missing_packages <- required_packages[
  !vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)
]

if (length(missing_packages) > 0) {
  install.packages(missing_packages, repos = "https://cloud.r-project.org")
}

library(tidyverse)
library(tidytext)
library(textdata)
library(lubridate)
library(knitr)
library(kableExtra)
library(httr)
library(jsonlite)
library(ggplot2)
library(RColorBrewer)
library(tidyquant)
library(scales)

# Safe sentiment lexicons for knitting
# get_sentiments("afinn") relies on the textdata package, which opens an
# interactive download menu the first time the lexicon is requested; that menu
# cannot be answered during knitting. The Bing lexicon ships with tidytext, but
# it is wrapped the same way for consistency. These helpers use the installed
# lexicons when available, and otherwise fall back to small built-in lexicons so
# the report still renders without stopping.
# Return the Bing sentiment lexicon as a tibble with columns `word` and
# `sentiment`. If get_sentiments("bing") raises an error, a warning is issued
# and a small built-in lexicon (10 positive, 8 negative words) is returned so
# the report can still render. The warning matters: results computed from the
# fallback are based on only 18 words and should not be read as a full analysis.
safe_bing <- function() {
  tryCatch(
    get_sentiments("bing"),
    error = function(e) {
      warning(
        "Bing lexicon unavailable (", conditionMessage(e),
        "); using the small built-in fallback lexicon.",
        call. = FALSE
      )
      tibble(
        word = c("strong", "growth", "bullish", "rise", "rises", "positive", "improve", "improves", "wins", "gains",
                 "weak", "falls", "pressure", "concern", "concerns", "competition", "delays", "negative"),
        sentiment = c(rep("positive", 10), rep("negative", 8))
      )
    }
  )
}

# Return the AFINN sentiment lexicon as a tibble with columns `word` and
# `value`. If get_sentiments("afinn") raises an error (for example because the
# lexicon has not been downloaded and the download prompt cannot be answered),
# a warning is issued and a small built-in lexicon of 18 scored words is
# returned so the report can still render. The warning makes it visible that
# the AFINN scores in the report come from the fallback, not the real lexicon.
safe_afinn <- function() {
  tryCatch(
    get_sentiments("afinn"),
    error = function(e) {
      warning(
        "AFINN lexicon unavailable (", conditionMessage(e),
        "); using the small built-in fallback lexicon.",
        call. = FALSE
      )
      tibble(
        word = c("strong", "growth", "bullish", "rise", "rises", "positive", "improve", "improves", "wins", "gains",
                 "weak", "falls", "pressure", "concern", "concerns", "competition", "delays", "negative"),
        value = c(2, 2, 3, 1, 1, 2, 2, 2, 2, 2, -2, -2, -1, -2, -2, -1, -2, -2)
      )
    }
  )
}

# Resolve both lexicons once, up front; the sentiment sections below join
# against these two tables instead of calling get_sentiments() again.
bing_lexicon <- safe_bing()
afinn_lexicon <- safe_afinn()

## Do you want to download:
##  Name: AFINN-111 
##  URL: http://www2.imm.dtu.dk/pubdb/views/publication_details.php?id=6010 
##  License: Open Database License (ODbL) v1.0 
##  Size: 78 KB (cleaned 59 KB) 
##  Download mechanism: https

3. Authentication

This version does not stop the document if your NewsAPI key is missing. If no key is found, it uses a backup headline dataset so the graphs still render.

# Read the NewsAPI key from the NEWS_API_KEY environment variable.
# Sys.getenv() returns "" when the variable is not set. Never hard-code a real
# key in this file: the report source is shared, so any key written here leaks.
api_key <- Sys.getenv("NEWS_API_KEY")

# For quick testing only, you can paste your key here instead:
# api_key <- ""

# An empty key means "not configured"; announce that the backup data is used.
if (!nzchar(api_key)) {
  message("No NEWS_API_KEY found. Using backup headline data so the report can still render.")
}

4. Pulling Headlines from NewsAPI

# Fetch recent English-language headlines for one company from the NewsAPI
# "everything" endpoint.
#
# Arguments:
#   query       Search string sent as the `q` parameter (e.g. "NVIDIA OR NVDA").
#   query_label Company label stored in the `query_label` column of the result.
#   ticker      Ticker symbol stored in the `ticker` column of the result.
#   api_key     NewsAPI key sent as the `apiKey` parameter.
#   page_size   Maximum number of articles requested (default 25).
#
# Returns a tibble with one row per article. Dots in the flattened JSON column
# names are replaced by underscores (e.g. `source.name` -> `source_name`).
# Returns an empty tibble, with a warning, when the request cannot be sent,
# when the API answers with a non-200 status, or when no articles come back,
# so a network or API problem never stops the report.
fetch_news <- function(query, query_label, ticker, api_key, page_size = 25) {
  # GET() raises an R error on transport failures (no network, DNS, timeout).
  # Convert that into a warning so the caller can fall back to backup data.
  response <- tryCatch(
    GET(
      url = "https://newsapi.org/v2/everything",
      query = list(
        q        = query,
        language = "en",
        sortBy   = "publishedAt",
        pageSize = page_size,
        apiKey   = api_key
      ),
      httr::timeout(30)
    ),
    error = function(e) {
      warning(
        "NewsAPI request failed for '", query, "': ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )

  if (is.null(response)) {
    return(tibble())
  }

  if (status_code(response) != 200) {
    msg <- tryCatch(content(response, as = "parsed")$message, error = function(e) NULL)
    # The error body may not carry a `message` field at all.
    if (is.null(msg)) {
      msg <- "Unknown API error"
    }
    warning("NewsAPI error for '", query, "': ", msg, call. = FALSE)
    return(tibble())
  }

  parsed <- content(response, as = "text", encoding = "UTF-8")
  articles <- fromJSON(parsed, flatten = TRUE)$articles

  if (is.null(articles) || length(articles) == 0) {
    return(tibble())
  }

  as_tibble(articles) %>%
    rename_with(~ str_replace_all(.x, "\\.", "_")) %>%
    mutate(
      # Prefer the source's display name, then its id, then a placeholder.
      source_name = coalesce(source_name, source_id, "Unknown source"),
      query_label = query_label,
      ticker = ticker
    )
}

# Offline fallback data: twelve hand-written sample headlines, four for each
# company, stamped one per day going back from the time the report is run.
backup_news <- tibble(
  source_name = rep("Sample News", 12),
  title = c(
    "NVIDIA shares rise as AI chip demand stays strong",
    "NVIDIA reports strong growth and bullish investor sentiment",
    "NVIDIA faces pressure from competition in the AI chip market",
    "Intel announces new foundry strategy to improve growth",
    "Intel stock falls after weak guidance concerns investors",
    "Intel gains as analysts see progress in turnaround plan",
    "SanDisk expands storage products as demand improves",
    "SanDisk faces weak pricing pressure in memory market",
    "SanDisk shares rise after positive storage market outlook",
    "NVIDIA wins new enterprise AI customers",
    "Intel delays raise concern among chip investors",
    "SanDisk growth improves after better memory demand"
  ),
  publishedAt = as.character(Sys.time() - days(1:12)),
  # First nine rows: three per company in blocks; last three: one per company.
  query_label = c(
    rep(c("NVIDIA", "Intel", "SanDisk"), each = 3),
    "NVIDIA", "Intel", "SanDisk"
  ),
  ticker = c(
    rep(c("NVDA", "INTC", "SNDK"), each = 3),
    "NVDA", "INTC", "SNDK"
  )
)

# Query NewsAPI once per company when a key is configured; with no key, start
# from an empty tibble.
news_raw <- if (api_key != "") {
  bind_rows(
    fetch_news("NVIDIA OR NVDA", "NVIDIA", "NVDA", api_key),
    fetch_news("Intel OR INTC", "Intel", "INTC", api_key),
    fetch_news("SanDisk OR SNDK", "SanDisk", "SNDK", api_key)
  )
} else {
  tibble()
}

# No rows (no key, or every request came back empty): use the sample headlines.
if (nrow(news_raw) == 0) {
  news_raw <- backup_news
}

glimpse(news_raw)

## Rows: 12
## Columns: 5
## $ source_name <chr> "Sample News", "Sample News", "Sample News", "Sample News"…
## $ title       <chr> "NVIDIA shares rise as AI chip demand stays strong", "NVID…
## $ publishedAt <chr> "2026-06-20 18:51:15.905849", "2026-06-19 18:51:15.905849"…
## $ query_label <chr> "NVIDIA", "NVIDIA", "NVIDIA", "Intel", "Intel", "Intel", "…
## $ ticker      <chr> "NVDA", "NVDA", "NVDA", "INTC", "INTC", "INTC", "SNDK", "S…

5. Inspect for Duplicates

# List every headline whose normalized title occurs more than once, sorted so
# that duplicates sit next to each other.
news_raw %>%
  filter(!is.na(title)) %>%
  mutate(
    pub_date = ymd_hms(publishedAt, quiet = TRUE),
    pub_day = as.Date(pub_date),
    # Strip a trailing " - Source Name" suffix. Whitespace is required on both
    # sides of the hyphen so hyphenated words such as "first-quarter" are not
    # cut off, which would make different headlines look identical.
    title_clean = str_remove(title, "\\s+-\\s+[^-]+$"),
    # Replace punctuation with spaces, collapse whitespace, and lower-case.
    title_clean = str_squish(str_replace_all(title_clean, "[^[:alnum:][:space:]]", " ")),
    title_clean = str_to_lower(title_clean)
  ) %>%
  group_by(title_clean) %>%
  filter(n() > 1) %>%
  ungroup() %>%
  arrange(title_clean)

## # A tibble: 0 × 8
## # Groups:   title_clean [0]
## # ℹ 8 variables: source_name <chr>, title <chr>, publishedAt <chr>,
## #   query_label <chr>, ticker <chr>, pub_date <dttm>, pub_day <date>,
## #   title_clean <chr>

6. Clean and Deduplicate Headlines

# Build the cleaned headline table: drop rows without a title, parse the
# publication timestamp, derive a normalized title, and keep the first headline
# for each (company, normalized title) pair.
news_clean <- news_raw %>%
  filter(!is.na(title)) %>%
  mutate(
    pub_date = ymd_hms(publishedAt, quiet = TRUE),
    pub_day = as.Date(pub_date),
    # Strip a trailing " - Source Name" suffix. Whitespace is required on both
    # sides of the hyphen so hyphenated words such as "first-quarter" are not
    # cut off, which would merge distinct headlines into one dedup key.
    title_clean = str_remove(title, "\\s+-\\s+[^-]+$"),
    # Replace punctuation with spaces, collapse whitespace, and lower-case.
    title_clean = str_squish(str_replace_all(title_clean, "[^[:alnum:][:space:]]", " ")),
    title_clean = str_to_lower(title_clean)
  ) %>%
  distinct(query_label, title_clean, .keep_all = TRUE)

# Rows with a missing title were already removed above, so no second filter is
# needed; news_df is the name the rest of the report uses.
news_df <- news_clean

dim(news_df)

## [1] 12  8

# Save the cleaned headlines as stock_news_df.csv (relative path), without
# row names.
write.csv(news_df, "stock_news_df.csv", row.names = FALSE)

7. Preview the Cleaned Data

# Show the first twelve cleaned headlines as a striped, hoverable HTML table.
news_df %>%
  select(query_label, ticker, source_name, title, pub_day) %>%
  head(n = 12) %>%
  kable(caption = "Sample Cleaned Stock Headlines") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover"),
    full_width = FALSE
  )

Sample Cleaned Stock Headlines
query_label	ticker	source_name	title	pub_day
NVIDIA	NVDA	Sample News	NVIDIA shares rise as AI chip demand stays strong	2026-06-20
NVIDIA	NVDA	Sample News	NVIDIA reports strong growth and bullish investor sentiment	2026-06-19
NVIDIA	NVDA	Sample News	NVIDIA faces pressure from competition in the AI chip market	2026-06-18
Intel	INTC	Sample News	Intel announces new foundry strategy to improve growth	2026-06-17
Intel	INTC	Sample News	Intel stock falls after weak guidance concerns investors	2026-06-16
Intel	INTC	Sample News	Intel gains as analysts see progress in turnaround plan	2026-06-15
SanDisk	SNDK	Sample News	SanDisk expands storage products as demand improves	2026-06-14
SanDisk	SNDK	Sample News	SanDisk faces weak pricing pressure in memory market	2026-06-13
SanDisk	SNDK	Sample News	SanDisk shares rise after positive storage market outlook	2026-06-12
NVIDIA	NVDA	Sample News	NVIDIA wins new enterprise AI customers	2026-06-11
Intel	INTC	Sample News	Intel delays raise concern among chip investors	2026-06-10
SanDisk	SNDK	Sample News	SanDisk growth improves after better memory demand	2026-06-09

This table previews the cleaned headline dataset used for the rest of the report. Each row is one headline, and the company and ticker columns make it possible to compare NVIDIA, Intel, and SanDisk separately.

8. Tokenization

# Split each headline into lower-case word tokens, one row per word, keeping
# the company label and ticker alongside each token.
news_tokens <- news_df %>%
  select(query_label, ticker, title) %>%
  unnest_tokens(word, title) %>%
  anti_join(stop_words, by = "word") %>%
  filter(
    # Drop tokens that are purely digits.
    !str_detect(word, "^\\d+$"),
    # Drop very short tokens as noise, but keep "ai": it is a two-letter word
    # that is central to the coverage of these chip companies.
    nchar(word) > 2 | word %in% c("ai")
  )

head(news_tokens)

## # A tibble: 6 × 3
##   query_label ticker word  
##   <chr>       <chr>  <chr> 
## 1 NVIDIA      NVDA   nvidia
## 2 NVIDIA      NVDA   shares
## 3 NVIDIA      NVDA   rise  
## 4 NVIDIA      NVDA   chip  
## 5 NVIDIA      NVDA   demand
## 6 NVIDIA      NVDA   stays

Tokenization turns each headline into individual words and removes common stop words. This cleaned word list is the base for the word cloud, sentiment analysis, and TF-IDF results.

9. Word Cloud of Headline Terms

The word cloud shows the most common meaningful words in the cleaned stock headlines. Larger words appear more often in the dataset.

# Fix the random seed so the word positions are the same on every knit.
set.seed(123)

# Take the 50 most frequent words, give each a random position in the
# [-1, 1] square, and map its count to a text size between 4 and 12.
word_cloud_data <- news_tokens %>%
  count(word, sort = TRUE) %>%
  slice_head(n = 50) %>%
  mutate(
    x = runif(n(), min = -1, max = 1),
    y = runif(n(), min = -1, max = 1),
    size = scales::rescale(n, to = c(4, 12))
  )

# Draw the words as text labels; with no words, show a placeholder title.
if (nrow(word_cloud_data) == 0) {
  ggplot() +
    labs(title = "No words available for word cloud") +
    theme_void()
} else {
  ggplot(
    word_cloud_data,
    aes(x = x, y = y, label = word, size = size, color = n)
  ) +
    # check_overlap = TRUE skips labels that would overlap an earlier one.
    geom_text(show.legend = FALSE, check_overlap = TRUE) +
    scale_size_identity() +
    scale_color_gradient(low = "#1b9e77", high = "#d95f02") +
    labs(title = "News Headline Word Cloud - NVIDIA, Intel, and SanDisk") +
    theme_void() +
    theme(plot.title = element_text(face = "bold", size = 15, hjust = 0.5))
}

The largest words represent the topics that appear most often in the headline sample. If terms such as chip, AI, growth, demand, pressure, or market appear prominently, that suggests the news coverage is focused on industry trends, company performance, and investor concerns.

10. Bing Sentiment Analysis

# Keep only the tokens that appear in the Bing lexicon and attach their
# positive/negative label.
sentiment_bing <- inner_join(
  news_tokens,
  bing_lexicon,
  by = "word",
  relationship = "many-to-many"
)

print(sentiment_bing)

## # A tibble: 17 × 4
##    query_label ticker word     sentiment
##    <chr>       <chr>  <chr>    <chr>    
##  1 NVIDIA      NVDA   strong   positive 
##  2 NVIDIA      NVDA   strong   positive 
##  3 NVIDIA      NVDA   bullish  positive 
##  4 Intel       INTC   improve  positive 
##  5 Intel       INTC   falls    negative 
##  6 Intel       INTC   weak     negative 
##  7 Intel       INTC   guidance positive 
##  8 Intel       INTC   concerns negative 
##  9 Intel       INTC   gains    positive 
## 10 Intel       INTC   progress positive 
## 11 SanDisk     SNDK   improves positive 
## 12 SanDisk     SNDK   weak     negative 
## 13 SanDisk     SNDK   positive positive 
## 14 NVIDIA      NVDA   wins     positive 
## 15 Intel       INTC   delays   negative 
## 16 Intel       INTC   concern  negative 
## 17 SanDisk     SNDK   improves positive

Graph 1: Top Words Driving Sentiment

# Bar chart of the ten most frequent words per sentiment, one facet for each
# sentiment; with no matched words, show a placeholder title.
if (nrow(sentiment_bing) == 0) {
  ggplot() +
    labs(title = "No Bing sentiment words found") +
    theme_void()
} else {
  sentiment_bing %>%
    count(word, sentiment, sort = TRUE) %>%
    group_by(sentiment) %>%
    slice_max(n, n = 10, with_ties = FALSE) %>%
    ungroup() %>%
    # Order the bars by frequency.
    mutate(word = reorder(word, n)) %>%
    ggplot(aes(x = n, y = word, fill = sentiment)) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~ sentiment, scales = "free_y") +
    scale_fill_manual(
      values = c("positive" = "#2ecc71", "negative" = "#e74c3c")
    ) +
    labs(
      title = "Top Words Driving Sentiment in Stock News Headlines",
      x = "Frequency",
      y = NULL
    ) +
    theme_minimal() +
    theme(
      plot.title = element_text(face = "bold", size = 14),
      strip.text = element_text(face = "bold", size = 12)
    )
}

This chart identifies which positive and negative words appear most often across all headlines. Words with higher counts have more influence on the overall sentiment results.

Graph 2: Sentiment Volume by Company

# Side-by-side bars of positive and negative word counts for each company;
# with no matched words, show a placeholder title.
if (nrow(sentiment_bing) == 0) {
  ggplot() +
    labs(title = "No Bing sentiment words found") +
    theme_void()
} else {
  sentiment_bing %>%
    count(query_label, ticker, sentiment) %>%
    ggplot(aes(x = query_label, y = n, fill = sentiment)) +
    geom_col(position = "dodge") +
    scale_fill_manual(
      values = c("positive" = "#2ecc71", "negative" = "#e74c3c")
    ) +
    labs(
      title = "Sentiment Comparison Across Stocks",
      subtitle = "NVIDIA vs. Intel vs. SanDisk",
      x = "Company",
      y = "Number of Sentiment Words",
      fill = "Sentiment"
    ) +
    theme_minimal() +
    theme(
      plot.title = element_text(face = "bold", size = 14),
      legend.position = "bottom"
    )
}

This chart compares the volume of positive and negative sentiment words for each company. A company with more sentiment words may have more news coverage, so the positive-versus-negative balance is more important than the raw count alone.

11. Top Words Overall

# Horizontal bar chart of the twenty most frequent tokens across all
# headlines; with no tokens, show a placeholder title.
if (nrow(news_tokens) == 0) {
  ggplot() +
    labs(title = "No words available to plot") +
    theme_void()
} else {
  news_tokens %>%
    count(word, sort = TRUE) %>%
    slice_head(n = 20) %>%
    # Order the bars by count.
    mutate(word = fct_reorder(word, n)) %>%
    ggplot(aes(x = n, y = word, fill = n)) +
    geom_col(show.legend = FALSE) +
    scale_fill_gradient(low = "#a8d8ea", high = "#0077b6") +
    labs(
      title = "Top 20 Words in Stock News Headlines",
      x = "Count",
      y = NULL,
      caption = "Source: NewsAPI or backup sample data"
    ) +
    theme_minimal(base_size = 13)
}

This chart shows the most frequent words in the headline dataset after stop words are removed. These words summarize the main themes appearing in the news coverage.

12. AFINN Sentiment Analysis

# Score each company with the AFINN lexicon: number of matched words, mean
# score (rounded to three decimals), and total score, sorted from the most to
# the least positive mean.
afinn_summary <- news_tokens %>%
  inner_join(afinn_lexicon, by = "word") %>%
  group_by(query_label, ticker) %>%
  summarise(
    words_matched = n(),
    mean_sentiment = round(mean(value), 3),
    sum_sentiment = sum(value),
    .groups = "drop"
  ) %>%
  arrange(desc(mean_sentiment))

afinn_summary %>%
  kable(caption = "AFINN Sentiment Score by Stock") %>%
  kable_styling(
    bootstrap_options = "striped",
    full_width = FALSE
  )

AFINN Sentiment Score by Stock
query_label	ticker	words_matched	mean_sentiment	sum_sentiment
NVIDIA	NVDA	8	1.250	10
SanDisk	SNDK	7	0.857	6
Intel	INTC	8	-0.500	-4

The AFINN table gives each stock an average sentiment score based on matched emotional words. Higher mean sentiment suggests more positive headline language, while lower or negative values suggest more cautious or unfavorable language.

13. Bing Sentiment Split Table

# Count positive and negative Bing words per company and compute the net
# balance (positive minus negative), sorted from most to least positive.
bing_split <- news_tokens %>%
  # Same join settings as the sentiment_bing table used for the charts.
  inner_join(bing_lexicon, by = "word", relationship = "many-to-many") %>%
  count(query_label, ticker, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = list(n = 0L)) %>%
  # pivot_wider() only creates a column for sentiments that actually occur.
  # Binding onto an empty template guarantees both columns exist (as NA when
  # absent), so the arithmetic below cannot fail when, for example, no
  # negative word was matched for any company.
  bind_rows(tibble(negative = integer(), positive = integer()), .) %>%
  mutate(
    positive = coalesce(positive, 0L),
    negative = coalesce(negative, 0L),
    net = positive - negative
  ) %>%
  # Fix the column order regardless of which sentiment appeared first.
  select(query_label, ticker, negative, positive, net) %>%
  arrange(desc(net))

bing_split %>%
  kable(caption = "Bing Sentiment Count by Stock") %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Bing Sentiment Count by Stock
query_label	ticker	negative	positive	net
NVIDIA	NVDA	0	4	4
SanDisk	SNDK	1	3	2
Intel	INTC	5	4	-1

The Bing split table separates positive and negative words by company. The net column is positive words minus negative words, which gives a simple comparison of whether each stock’s headlines lean positive or negative.

14. TF-IDF: Distinctive Words by Stock

# Faceted bar chart of the six highest TF-IDF terms per company, treating each
# company's headlines as one document; with no tokens, show a placeholder.
if (nrow(news_tokens) == 0) {
  ggplot() +
    labs(title = "No terms available for TF-IDF") +
    theme_void()
} else {
  news_tokens %>%
    count(query_label, word) %>%
    bind_tf_idf(word, query_label, n) %>%
    group_by(query_label) %>%
    slice_max(tf_idf, n = 6, with_ties = FALSE) %>%
    ungroup() %>%
    # Order the words within each facet separately.
    mutate(word = reorder_within(word, tf_idf, query_label)) %>%
    ggplot(aes(x = tf_idf, y = word, fill = query_label)) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~ query_label, scales = "free_y") +
    scale_y_reordered() +
    scale_fill_brewer(palette = "Set1") +
    labs(
      title = "Top TF-IDF Terms by Stock",
      x = "TF-IDF Score",
      y = NULL,
      caption = "Source: NewsAPI or backup sample data"
    ) +
    theme_minimal(base_size = 12)
}

TF-IDF highlights words that are especially distinctive for each company, not just common overall. These terms help explain what makes the news coverage for each stock different from the others.

15. Stock Price Comparison

This version uses possibly() so one bad ticker does not stop the report.

# Download one year of daily prices for a single ticker. Returns an empty
# tibble instead of failing, so one bad ticker cannot stop the report.
safe_tq_get <- possibly(function(sym) {
  prices <- tq_get(
    sym,
    get = "stock.prices",
    # %m-% rolls back to the last valid day of the month, so the start date is
    # never NA (plain `- years(1)` gives NA on 29 February).
    from = Sys.Date() %m-% years(1),
    to = Sys.Date()
  )

  # tq_get() reports a failed download by returning NA with a warning rather
  # than raising an error, so possibly() alone does not catch it.
  if (!is.data.frame(prices)) {
    return(tibble())
  }

  # Make sure the ticker column is present so the rows can be told apart
  # after binding.
  if (!"symbol" %in% names(prices)) {
    prices <- mutate(prices, symbol = sym, .before = 1)
  }

  prices
}, otherwise = tibble())

stock_prices <- map_dfr(c("NVDA", "INTC", "SNDK"), safe_tq_get)

# If every download failed, use ten days of synthetic prices so the charts
# below still render.
if (nrow(stock_prices) == 0) {
  stock_prices <- tibble(
    symbol = rep(c("NVDA", "INTC", "SNDK"), each = 10),
    date = rep(seq.Date(Sys.Date() - 9, Sys.Date(), by = "day"), times = 3),
    adjusted = c(seq(100, 120, length.out = 10), seq(100, 92, length.out = 10), seq(100, 110, length.out = 10))
  )
}

head(stock_prices)

## # A tibble: 6 × 8
##   symbol date        open  high   low close    volume adjusted
##   <chr>  <date>     <dbl> <dbl> <dbl> <dbl>     <dbl>    <dbl>
## 1 NVDA   2025-06-23  142.  145.  142.  144. 154308900     144.
## 2 NVDA   2025-06-24  146.  148.  146.  148. 187566100     148.
## 3 NVDA   2025-06-25  149.  154.  149.  154. 269146500     154.
## 4 NVDA   2025-06-26  156.  157.  154   155. 198145700     155.
## 5 NVDA   2025-06-27  156.  159.  155.  158. 263234500     158.
## 6 NVDA   2025-06-30  158.  159.  156.  158. 194580300     158.

Graph 3: Adjusted Closing Prices

# Line chart of the adjusted closing price over time, one line per ticker,
# with the y axis formatted as dollars.
stock_prices %>%
  ggplot(aes(x = date, y = adjusted, color = symbol)) +
  geom_line(linewidth = 1) +
  scale_y_continuous(labels = dollar) +
  labs(
    title = "Adjusted Stock Prices Over the Past Year",
    subtitle = "NVDA, INTC, and SNDK",
    x = "Date",
    y = "Adjusted Closing Price",
    color = "Ticker"
  ) +
  theme_minimal()

This chart shows the adjusted closing price for each stock over the past year. It helps compare headline sentiment with actual market price movement.

Graph 4: Indexed Stock Performance

# Rescale each ticker's adjusted price so that its earliest observation equals
# 100, which puts all tickers on a common scale.
stock_indexed <- stock_prices %>%
  arrange(date) %>%
  group_by(symbol) %>%
  mutate(indexed_price = adjusted / first(adjusted) * 100) %>%
  ungroup()

stock_indexed %>%
  ggplot(aes(x = date, y = indexed_price, color = symbol)) +
  geom_line(linewidth = 1) +
  labs(
    title = "Indexed Stock Performance",
    subtitle = "Each stock starts at 100",
    x = "Date",
    y = "Indexed Price",
    color = "Ticker"
  ) +
  theme_minimal()

Indexing each stock to 100 makes it easier to compare relative performance even though the stocks trade at different prices. A line above 100 means the stock is up from the start of the period, while a line below 100 means it is down.

Graph 5: Daily Returns

# Day-over-day simple return per ticker: today's adjusted price divided by the
# previous observation's, minus one. The first row of each ticker has no
# previous price and is dropped.
daily_returns <- stock_prices %>%
  arrange(date) %>%
  group_by(symbol) %>%
  mutate(daily_return = adjusted / lag(adjusted) - 1) %>%
  filter(!is.na(daily_return)) %>%
  ungroup()

daily_returns %>%
  ggplot(aes(x = date, y = daily_return, color = symbol)) +
  geom_line(alpha = 0.7) +
  scale_y_continuous(labels = percent) +
  labs(
    title = "Daily Returns by Stock",
    x = "Date",
    y = "Daily Return",
    color = "Ticker"
  ) +
  theme_minimal()

Daily returns show short-term changes in stock performance. Larger swings indicate more volatility, while steady movement suggests a more stable price pattern.

Stock Return Summary Table

# Per-ticker summary of the daily returns: mean, standard deviation, largest
# gain, and largest loss, each formatted as a percentage with two decimals.
daily_returns %>%
  group_by(symbol) %>%
  summarise(
    average_daily_return = percent(
      mean(daily_return, na.rm = TRUE),
      accuracy = 0.01
    ),
    volatility = percent(
      sd(daily_return, na.rm = TRUE),
      accuracy = 0.01
    ),
    best_day = percent(
      max(daily_return, na.rm = TRUE),
      accuracy = 0.01
    ),
    worst_day = percent(
      min(daily_return, na.rm = TRUE),
      accuracy = 0.01
    ),
    .groups = "drop"
  ) %>%
  kable(caption = "Daily Return Summary by Stock") %>%
  kable_styling(
    bootstrap_options = "striped",
    full_width = FALSE
  )

Daily Return Summary by Stock
symbol	average_daily_return	volatility	best_day	worst_day
INTC	0.85%	4.74%	23.60%	-17.03%
NVDA	0.18%	2.23%	7.87%	-6.20%
SNDK	1.75%	6.35%	27.56%	-20.33%

This summary table compares average daily return, volatility, best day, and worst day for each stock. It provides a quick view of both performance and risk over the period analyzed.

16. Final Summary

I’m comparing NVIDIA, Intel, and SanDisk using both news sentiment and stock price data. The sentiment analysis shows whether recent headlines use more positive or negative language, while the TF-IDF analysis identifies the words that make each company’s coverage distinctive. The stock-price section adds financial context by showing how the companies performed over the past year.

Mining Stock News: Sentiment and Text Analysis of Headlines

NVIDIA, Intel, and SanDisk

Brandon Chin

2026-06-21