# One-time setup: install only the packages that are not already available,
# so re-running or knitting this script does not reinstall everything (and
# hit the network) every time. RColorBrewer is listed because the TF-IDF plot
# calls it via `RColorBrewer::`.
required_pkgs <- c("remotes", "tidyverse", "tidytext", "textdata",
                   "lubridate", "knitr", "kableExtra", "RColorBrewer")
missing_pkgs <- required_pkgs[
  !vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
# newsapi is installed from GitHub (news-r/newsapi) when it is not present.
if (!requireNamespace("newsapi", quietly = TRUE)) {
  remotes::install_github("news-r/newsapi")
}

library(newsapi)
library(tidyverse)
library(tidytext)
library(textdata)
library(lubridate)
library(knitr)
library(kableExtra)
library(ggplot2)

# Read the API key from the environment (set NEWSAPI_API_KEY in .Renviron).
# Sys.getenv() returns "" for an unset variable, so fail early with a clear
# message instead of registering an empty key.
api_key <- Sys.getenv("NEWSAPI_API_KEY")
if (!nzchar(api_key)) {
  stop(
    "NEWSAPI_API_KEY is not set; add it to your .Renviron and restart R.",
    call. = FALSE
  )
}
newsapi_key(api_key)

# Fetch articles for the query "energy", restricted to English
energy_raw <- every_news("energy", language = "en")

# Check the dimensions of the raw data
dim(energy_raw)

## [1] 133   8

# Clean titles and drop duplicate headlines.
#   pub_date / pub_day : parsed publication timestamp and its calendar date
#   title_clean        : lower-cased title with a trailing " - Source" suffix
#                        and punctuation removed; used below as the dedup key
energy_clean <- energy_raw %>%
  filter(!is.na(.data$title)) %>%
  mutate(
    pub_date = ymd_hms(.data$publishedAt, quiet = TRUE),
    pub_day = as.Date(pub_date),
    # Strip only the final " - Source" segment. The dash must have whitespace
    # on both sides so hyphenated words ("one-year", "Fast-Track") are not
    # mistaken for the separator, which would cut off the rest of the
    # headline and make distinct headlines collide in the dedup step.
    title_clean = str_remove(.data$title, "\\s+-\\s+(?:(?!\\s-\\s).)+$"),
    # Replace punctuation with spaces, collapse whitespace, then lower-case
    title_clean = str_squish(
      str_replace_all(title_clean, "[^[:alnum:][:space:]]", " ")
    ),
    title_clean = str_to_lower(title_clean)
  ) %>%
  # Keep the first row for each distinct cleaned title
  distinct(title_clean, .keep_all = TRUE)

# Assemble the final data frame. The per-topic frames are collected in a list
# and row-bound, so further topics can be appended to the list later.
news_df <- list(energy_clean) %>%
  bind_rows() %>%
  filter(!is.na(.data$title))

# Show the first ten cleaned headlines as a styled table.
# Optional source-name columns are included only when they exist.
preview_cols <- c("source.name", "source", "sourceName")
news_df %>%
  select(source, title, pub_day, any_of(preview_cols)) %>%
  slice_head(n = 10) %>%
  kable(caption = "Sample Cleaned Headlines") %>%
  kable_styling(
    full_width = FALSE,
    bootstrap_options = c("striped", "hover")
  )

Sample Cleaned Headlines
source	title	pub_day
Gizmodo.com	Trump Just Lost His War on Wind Energy	2026-06-16
Gizmodo.com	Stressed? Nuropod Says It Can Fix That—by Hacking Your Brain	2026-06-22
Gizmodo.com	Dude, Where’s My Founder? Ashton Kutcher Leaves VC Firm and Shows Which Way the Wind Is Blowing	2026-07-02
Gizmodo.com	A Federal Regulator Wants to Fast-Track AI Data Centers Onto the Power Grid	2026-06-18
Gizmodo.com	OpenAI Adds Fuel to Republican Drive to Label Anti-Data Center Movement a Chinese Psy-Op	2026-06-11
the-verge	New York lawmakers pass one-year ban on new data centers	2026-06-05
the-verge	Amazon’s data centers used 2.5 billion gallons of water last year	2026-06-11
the-verge	Apple’s smart home camera service is starting to impress me	2026-06-16
bbc-news	Oil price falls to levels not seen since before Iran war	2026-06-25
Gizmodo.com	America’s Solar Just Hit a Critical Milestone That Won’t Make Trump Happy	2026-06-11

# Tokenise headlines into one word per row, keeping the source alongside it.
news_tokens <- news_df %>%
  # Keep both the source and the title!
  select(source, title) %>%
  # Headlines often use typographic apostrophes (e.g. "Won’t"), while the
  # stop_words lexicon spells contractions with a straight "'". Normalise
  # them first so contractions are actually removed by the anti_join below.
  mutate(title = str_replace_all(title, "[\u2018\u2019]", "'")) %>%
  unnest_tokens(word, title) %>%
  anti_join(stop_words, by = "word") %>%
  # Drop purely numeric tokens and tokens of one or two characters
  filter(!str_detect(word, "^\\d+$"), nchar(word) > 2)

# Bar chart of the 20 most frequent tokens across all headlines
top_words <- news_tokens %>%
  count(word, sort = TRUE) %>%
  slice_head(n = 20) %>%
  # Order the factor by count so bars are sorted on the y axis
  mutate(word = fct_reorder(word, n))

ggplot(top_words, aes(x = n, y = word, fill = n)) +
  geom_col(show.legend = FALSE) +
  scale_fill_gradient(low = "#a8d8ea", high = "#0077b6") +
  labs(
    title = "Top 20 Words in News Headlines",
    x = "Count",
    y = NULL,
    caption = "Source: NewsAPI"
  ) +
  theme_minimal(base_size = 13)

# Keep only the tokens that appear in the Bing lexicon, attaching each
# token's sentiment label
bing_lexicon <- get_sentiments("bing")
sentiment_bing <- inner_join(
  news_tokens, bing_lexicon,
  by = "word", relationship = "many-to-many"
)

# Plot the ten most frequent words for each sentiment class, one facet each
sentiment_palette <- c("positive" = "#2ecc71", "negative" = "#e74c3c")

top_sentiment_words <- sentiment_bing %>%
  count(word, sentiment, sort = TRUE) %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10, with_ties = FALSE) %>%
  ungroup() %>%
  # Order words by frequency so bars are sorted within the plot
  mutate(word = reorder(word, n))

ggplot(top_sentiment_words, aes(x = n, y = word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  scale_fill_manual(values = sentiment_palette) +
  labs(
    title = "Top Words Driving Sentiment in News Titles",
    x = "Frequency (Word Count)",
    y = NULL
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    strip.text = element_text(face = "bold", size = 12)
  )

# Set TEXTDATA_AGREE before the AFINN lexicon is requested, intended to stop
# a licence prompt from blocking knitting.
# NOTE(review): confirm that textdata actually reads this variable.
Sys.setenv(TEXTDATA_AGREE = "TRUE")

# Overall AFINN score across every matched token. news_tokens also carries a
# `source` column, so this summary could be grouped by source instead.
afinn_matches <- inner_join(
  news_tokens, get_sentiments("afinn"),
  by = "word"
)

afinn_matches %>%
  summarise(
    words_matched = n(),
    mean_sentiment = round(mean(value), 3),
    sum_sentiment = sum(value)
  ) %>%
  kable(caption = "Total AFINN Sentiment Score") %>%
  kable_styling(full_width = FALSE, bootstrap_options = "striped")

Total AFINN Sentiment Score
words_matched	mean_sentiment	sum_sentiment
59	-0.068	-4

# TF-IDF (Term Frequency–Inverse Document Frequency)
# Each source is treated as one document; plot its five highest-scoring terms.
n_sources <- n_distinct(news_tokens$source)

# One fill colour per source, interpolated from the 9-colour "Set1" palette
source_palette <- colorRampPalette(
  RColorBrewer::brewer.pal(9, "Set1")
)(n_sources)

top_tf_idf <- news_tokens %>%
  count(source, word) %>%
  bind_tf_idf(word, source, n) %>%
  group_by(source) %>%
  slice_max(tf_idf, n = 5, with_ties = FALSE) %>%
  ungroup() %>%
  # Truncate long terms, then order them within each source facet
  mutate(word = reorder_within(str_trunc(word, 20), tf_idf, source))

ggplot(top_tf_idf, aes(x = tf_idf, y = word, fill = source)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ source, scales = "free_y", ncol = 4) +
  scale_y_reordered() +
  scale_fill_manual(values = source_palette) +
  labs(
    title = "Top TF-IDF Terms by Source",
    x = "TF-IDF Score",
    y = NULL,
    caption = "Source: NewsAPI"
  ) +
  theme_minimal(base_size = 10) +
  theme(
    strip.text = element_text(size = 8, face = "bold"),
    axis.text.y = element_text(size = 7.5),
    panel.spacing = unit(1.2, "lines")
  )

Preliminary Findings

After reviewing the headlines and the text-analysis charts and plots, I found that not many articles had been published in the last few days. The few articles that were returned contained very few words that could be analyzed for sentiment. Of the seven unique articles, only one sentiment-driving word was found. I searched only one topic, and I know that expanding the search would yield more results for analysis; searching a single topic seems to limit the results. I thought a generic topic like energy would return many results, but that was not the case. In the future, I would expand the number of topics when using the top_headlines function, for example by pairing energy with gasoline. I had to switch to the every_news function to gather more data for sentiment analysis. Once I did that, I had more data to analyze.

News API Sentiment Analysis

Alex Rodriguez

2026-07-04

Preliminary Findings