# One-time setup: install only the packages that are missing, so that
# re-running or knitting this script does not reinstall everything (or fail
# when no CRAN mirror / network is available in a non-interactive session).
# RColorBrewer is listed because the TF-IDF plot calls RColorBrewer::brewer.pal.
cran_pkgs <- c("remotes", "tidyverse", "tidytext", "textdata",
               "lubridate", "knitr", "kableExtra", "RColorBrewer")
missing_pkgs <- cran_pkgs[
  !vapply(cran_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
# newsapi is installed from GitHub rather than CRAN.
if (!requireNamespace("newsapi", quietly = TRUE)) {
  remotes::install_github("news-r/newsapi")
}
library(newsapi)
library(tidyverse)
library(tidytext)
library(textdata)
library(lubridate)
library(knitr)
library(kableExtra)
library(ggplot2)
# Register the NewsAPI key read from the NEWSAPI_API_KEY environment variable
# (set in .Renviron). Stop early with a clear message when it is missing,
# instead of registering an empty key and failing later at request time.
if (!nzchar(Sys.getenv("NEWSAPI_API_KEY"))) {
  stop(
    "NEWSAPI_API_KEY is not set. Add it to your .Renviron file and restart R.",
    call. = FALSE
  )
}
newsapi_key(Sys.getenv("NEWSAPI_API_KEY"))

# Fetch English-language articles for the topic "energy"
energy_raw <- every_news("energy", language = "en")

# Check the dimensions of the raw data
dim(energy_raw)
## [1] 133   8
# Clean the headlines and remove duplicates.
#   pub_date / pub_day : parsed publication timestamp and its calendar date
#   title_clean        : normalised headline used as the de-duplication key
energy_clean <- energy_raw %>%
  filter(!is.na(.data$title)) %>%
  mutate(
    pub_date = ymd_hms(.data$publishedAt, quiet = TRUE),
    pub_day = as.Date(pub_date),
    # Strip a trailing " - Publisher" suffix. The dash must be surrounded by
    # whitespace so that hyphenated words ("Fast-Track", "one-year") are not
    # mistaken for the separator, which would cut the headline short and let
    # unrelated headlines collapse into one during de-duplication.
    title_clean = str_remove(.data$title, "\\s+-\\s+[^-]+$"),
    # Replace punctuation with spaces, collapse whitespace, then lower-case.
    title_clean = str_squish(
      str_replace_all(title_clean, "[^[:alnum:][:space:]]", " ")
    ),
    title_clean = str_to_lower(title_clean)
  ) %>%
  # Keep the first row for each distinct cleaned headline.
  distinct(title_clean, .keep_all = TRUE)

# Assemble the final data frame. bind_rows() is used so that additional
# topic data frames can be appended here later.
news_df <- energy_clean %>%
  bind_rows() %>%
  filter(!is.na(title))

# Show the first ten cleaned headlines as a styled table. The any_of() call
# picks up alternative source-name columns only when they exist.
news_df %>%
  select(
    all_of(c("source", "title", "pub_day")),
    any_of(c("source.name", "source", "sourceName"))
  ) %>%
  head(10) %>%
  kable(caption = "Sample Cleaned Headlines") %>%
  kable_styling(
    full_width = FALSE,
    bootstrap_options = c("striped", "hover")
  )
Sample Cleaned Headlines
source title pub_day
Gizmodo.com Trump Just Lost His War on Wind Energy 2026-06-16
Gizmodo.com Stressed? Nuropod Says It Can Fix That—by Hacking Your Brain 2026-06-22
Gizmodo.com Dude, Where’s My Founder? Ashton Kutcher Leaves VC Firm and Shows Which Way the Wind Is Blowing 2026-07-02
Gizmodo.com A Federal Regulator Wants to Fast-Track AI Data Centers Onto the Power Grid 2026-06-18
Gizmodo.com OpenAI Adds Fuel to Republican Drive to Label Anti-Data Center Movement a Chinese Psy-Op 2026-06-11
the-verge New York lawmakers pass one-year ban on new data centers 2026-06-05
the-verge Amazon’s data centers used 2.5 billion gallons of water last year 2026-06-11
the-verge Apple’s smart home camera service is starting to impress me 2026-06-16
bbc-news Oil price falls to levels not seen since before Iran war 2026-06-25
Gizmodo.com America’s Solar Just Hit a Critical Milestone That Won’t Make Trump Happy 2026-06-11
# Tokenise the headlines into one word per row, carrying the source along
# so later steps can compare sources.
news_tokens <- news_df %>%
  select(source, title) %>%
  unnest_tokens(output = word, input = title) %>%
  # Remove common stop words
  anti_join(stop_words, by = "word") %>%
  # Drop purely numeric tokens and words of one or two characters
  filter(!str_detect(word, "^\\d+$") & nchar(word) > 2)
# Horizontal bar chart of the 20 most frequent words across all headlines;
# bars are ordered and shaded by count.
news_tokens %>%
  count(word, sort = TRUE) %>%
  slice_head(n = 20) %>%
  mutate(word = fct_reorder(word, n)) %>%
  ggplot(aes(n, word, fill = n)) +
  geom_col(show.legend = FALSE) +
  scale_fill_gradient(high = "#0077b6", low = "#a8d8ea") +
  theme_minimal(base_size = 13) +
  labs(
    caption = "Source: NewsAPI",
    title = "Top 20 Words in News Headlines",
    y = NULL,
    x = "Count"
  )

# Keep only the tokens that appear in the Bing lexicon, tagging each with
# its positive/negative label.
sentiment_bing <- inner_join(
  news_tokens,
  get_sentiments("bing"),
  by = "word",
  relationship = "many-to-many"
)

# Plot the ten most frequent words for each sentiment, one facet per label
sentiment_bing %>%
  count(word, sentiment, sort = TRUE) %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10, with_ties = FALSE) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  scale_fill_manual(
    values = c("positive" = "#2ecc71", "negative" = "#e74c3c")
  ) +
  facet_wrap(~sentiment, scales = "free_y") +
  theme_minimal() +
  theme(
    strip.text = element_text(face = "bold", size = 12),
    plot.title = element_text(face = "bold", size = 14)
  ) +
  labs(
    y = NULL,
    x = "Frequency (Word Count)",
    title = "Top Words Driving Sentiment in News Titles"
  )

# Intended to let textdata accept the lexicon license without prompting
# while knitting.
# NOTE(review): confirm that textdata actually reads TEXTDATA_AGREE; if it
# does not, download the AFINN lexicon once interactively before knitting.
Sys.setenv(TEXTDATA_AGREE = "TRUE")

# Overall AFINN score across all headlines: number of matched words, their
# mean score and their total. news_tokens also carries a 'source' column,
# so this could be grouped by source if a per-source breakdown is wanted.
news_tokens %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  summarise(
    words_matched = n(),
    mean_sentiment = round(mean(value), 3),
    sum_sentiment = sum(value)
  ) %>%
  kable(caption = "Total AFINN Sentiment Score") %>%
  kable_styling(full_width = FALSE, bootstrap_options = "striped")
Total AFINN Sentiment Score
words_matched mean_sentiment sum_sentiment
59 -0.068 -4
# TF-IDF (Term Frequency–Inverse Document Frequency), treating each news
# source as one document. The number of sources sets the palette size.
n_sources <- n_distinct(news_tokens$source)

# Top five TF-IDF terms per source, one facet per source
news_tokens %>%
  count(source, word) %>%
  bind_tf_idf(term = word, document = source, n = n) %>%
  group_by(source) %>%
  slice_max(tf_idf, n = 5, with_ties = FALSE) %>%
  ungroup() %>%
  mutate(
    # Shorten long terms so the axis labels stay readable
    word = str_trunc(word, 20),
    # Order terms within each facet rather than globally
    word = reorder_within(word, tf_idf, source)
  ) %>%
  ggplot(aes(tf_idf, word, fill = source)) +
  geom_col(show.legend = FALSE) +
  scale_y_reordered() +
  scale_fill_manual(
    # Stretch the 9-colour Set1 palette to one colour per source
    values = colorRampPalette(RColorBrewer::brewer.pal(9, "Set1"))(n_sources)
  ) +
  facet_wrap(~ source, scales = "free_y", ncol = 4) +
  theme_minimal(base_size = 10) +
  theme(
    panel.spacing = unit(1.2, "lines"),
    axis.text.y = element_text(size = 7.5),
    strip.text = element_text(size = 8, face = "bold")
  ) +
  labs(
    caption = "Source: NewsAPI",
    y = NULL,
    x = "TF-IDF Score",
    title = "Top TF-IDF Terms by Source"
  )

Preliminary Findings

After reviewing the headlines and the text-analysis charts and plots, I found that not many articles had been published in the last few days. The few articles that were returned contained very few words that could be analyzed for sentiment: of the seven unique articles, only one word that drives sentiment was found. I searched only one topic, and I know that expanding the search would yield more results for analysis; searching a single topic seems to limit the results. I thought a generic topic like energy would return many results, but that was not true. In the future, if using the top_headlines function, I would expand the number of topics, for example by using a pair such as energy and gasoline. I had to switch to the every_news function to grab more data for sentiment analysis. Once I did that, I had more data for analysis.