# Install any missing dependencies. Run once in your console if needed:
# packages that are already available are skipped, so re-running the script
# (or knitting the report) does not trigger a reinstall.
required_pkgs <- c(
  "remotes", "tidyverse", "tidytext", "textdata",
  "lubridate", "knitr", "kableExtra"
)
missing_pkgs <- required_pkgs[
  !vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
# newsapi is installed from GitHub via remotes; skip it when already present.
if (!requireNamespace("newsapi", quietly = TRUE)) {
  remotes::install_github("news-r/newsapi")
}
library(newsapi)
library(tidyverse)
library(tidytext)
library(textdata)
library(lubridate)
library(knitr)
library(kableExtra)
library(ggplot2)
# The API key is read from the .Renviron file. Stop early with a clear message
# when it is missing instead of passing an empty key on to newsapi.
if (!nzchar(Sys.getenv("NEWSAPI_API_KEY"))) {
  stop(
    "NEWSAPI_API_KEY is not set; add it to your .Renviron file.",
    call. = FALSE
  )
}
newsapi_key(Sys.getenv("NEWSAPI_API_KEY"))
# Fetch English-language articles for the topic "energy"
energy_raw <- every_news("energy", language = "en")
# Check the dimensions of the raw data
dim(energy_raw)
## [1] 133 8
# Remove duplicates and clean titles.
# Rows without a title are dropped, the publication timestamp is parsed into
# a date-time and a calendar day, and a normalised title (suffix stripped,
# punctuation removed, lower-cased) is used as the de-duplication key.
energy_clean <- energy_raw %>%
  filter(!is.na(.data$title)) %>%
  mutate(
    pub_date = ymd_hms(.data$publishedAt, quiet = TRUE),
    pub_day = as.Date(pub_date),
    # Strip a trailing " - Publisher" suffix. Whitespace is required on both
    # sides of the hyphen so that hyphenated words such as "Fast-Track" or
    # "one-year" are not mistaken for the separator, which would truncate the
    # headline and could merge distinct headlines in distinct() below.
    title_clean = str_remove(.data$title, "\\s+-\\s+[^-]+$"),
    title_clean = str_squish(
      str_replace_all(title_clean, "[^[:alnum:][:space:]]", " ")
    ),
    title_clean = str_to_lower(title_clean)
  ) %>%
  distinct(title_clean, .keep_all = TRUE)
# Combine into our final data frame (in case you add more topics later)
news_df <- bind_rows(energy_clean) %>%
  filter(!is.na(title))
# Render the first ten cleaned headlines as a styled table.
# `source` is always selected explicitly; the alternative source-name columns
# are appended only when they exist in the data.
news_df %>%
  select(
    source, title, pub_day,
    any_of(c("source.name", "sourceName"))
  ) %>%
  head(10) %>%
  kable(caption = "Sample Cleaned Headlines") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover"),
    full_width = FALSE
  )
Sample Cleaned Headlines

| source | title | pub_day |
|---|---|---|
| Gizmodo.com | Trump Just Lost His War on Wind Energy | 2026-06-16 |
| Gizmodo.com | Stressed? Nuropod Says It Can Fix That—by Hacking Your Brain | 2026-06-22 |
| Gizmodo.com | Dude, Where’s My Founder? Ashton Kutcher Leaves VC Firm and Shows Which Way the Wind Is Blowing | 2026-07-02 |
| Gizmodo.com | A Federal Regulator Wants to Fast-Track AI Data Centers Onto the Power Grid | 2026-06-18 |
| Gizmodo.com | OpenAI Adds Fuel to Republican Drive to Label Anti-Data Center Movement a Chinese Psy-Op | 2026-06-11 |
| the-verge | New York lawmakers pass one-year ban on new data centers | 2026-06-05 |
| the-verge | Amazon’s data centers used 2.5 billion gallons of water last year | 2026-06-11 |
| the-verge | Apple’s smart home camera service is starting to impress me | 2026-06-16 |
| bbc-news | Oil price falls to levels not seen since before Iran war | 2026-06-25 |
| Gizmodo.com | America’s Solar Just Hit a Critical Milestone That Won’t Make Trump Happy | 2026-06-11 |
# Tokenise headlines into one word per row, keeping the source alongside
# each word so later steps can compare sources.
news_tokens <- news_df %>%
  select(source, title) %>%
  unnest_tokens(word, title) %>%
  # Drop common stop words
  anti_join(stop_words, by = "word") %>%
  # Drop purely numeric tokens and tokens of one or two characters
  filter(
    str_detect(word, "^\\d+$", negate = TRUE),
    nchar(word) > 2
  )
# Bar chart of the 20 most frequent headline words, most frequent at the top;
# bar colour deepens with the count.
news_tokens %>%
  count(word, sort = TRUE) %>%
  slice_head(n = 20) %>%
  mutate(word = fct_reorder(word, n)) %>%
  ggplot(aes(n, word, fill = n)) +
  geom_col(show.legend = FALSE) +
  scale_fill_gradient(low = "#a8d8ea", high = "#0077b6") +
  labs(
    x = "Count",
    y = NULL,
    title = "Top 20 Words in News Headlines",
    caption = "Source: NewsAPI"
  ) +
  theme_minimal(base_size = 13)

# Keep only the tokens that appear in the Bing lexicon, tagging each with its
# positive/negative label.
sentiment_bing <- news_tokens %>%
  inner_join(
    get_sentiments("bing"),
    by = "word",
    relationship = "many-to-many"
  )
# Plot the ten most frequent words for each sentiment, one panel per sentiment
sentiment_bing %>%
  count(word, sentiment, sort = TRUE) %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10, with_ties = FALSE) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(sentiment), scales = "free_y") +
  scale_fill_manual(
    values = c("negative" = "#e74c3c", "positive" = "#2ecc71")
  ) +
  labs(
    x = "Frequency (Word Count)",
    y = NULL,
    title = "Top Words Driving Sentiment in News Titles"
  ) +
  theme_minimal() +
  theme(
    strip.text = element_text(face = "bold", size = 12),
    plot.title = element_text(face = "bold", size = 14)
  )

# Intended to let textdata accept the lexicon license without a prompt while
# knitting.
# NOTE(review): confirm that textdata actually reads TEXTDATA_AGREE; if it does
# not, download the AFINN lexicon once interactively so it is cached.
Sys.setenv(TEXTDATA_AGREE = "TRUE")
# Overall AFINN score across all headlines: number of matched words, their
# mean score, and their total. (Add a grouping on `source` here to break the
# score down per source.)
news_tokens %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  summarise(
    words_matched = n(),
    mean_sentiment = round(mean(value), digits = 3),
    sum_sentiment = sum(value)
  ) %>%
  kable(caption = "Total AFINN Sentiment Score") %>%
  kable_styling(
    bootstrap_options = "striped",
    full_width = FALSE
  )
Total AFINN Sentiment Score

| words_matched | mean_sentiment | sum_sentiment |
|---|---|---|
| 59 | -0.068 | -4 |
# TF-IDF (Term Frequency–Inverse Document Frequency)
# Each source is treated as one document; the five highest-scoring terms per
# source are plotted in their own panel.
# Number of sources, used to size the colour palette below
n_sources <- n_distinct(news_tokens$source)
news_tokens %>%
  count(source, word) %>%
  bind_tf_idf(word, source, n) %>%
  group_by(source) %>%
  slice_max(tf_idf, n = 5, with_ties = FALSE) %>%
  ungroup() %>%
  mutate(
    # Shorten long terms, then order them within each source's panel
    word = str_trunc(word, 20),
    word = reorder_within(word, tf_idf, source)
  ) %>%
  ggplot(aes(tf_idf, word, fill = source)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(source), scales = "free_y", ncol = 4) +
  scale_y_reordered() +
  # Interpolate the 9-colour Set1 palette to one colour per source
  scale_fill_manual(
    values = colorRampPalette(RColorBrewer::brewer.pal(9, "Set1"))(n_sources)
  ) +
  labs(
    x = "TF-IDF Score",
    y = NULL,
    title = "Top TF-IDF Terms by Source",
    caption = "Source: NewsAPI"
  ) +
  theme_minimal(base_size = 10) +
  theme(
    panel.spacing = unit(1.2, "lines"),
    axis.text.y = element_text(size = 7.5),
    strip.text = element_text(size = 8, face = "bold")
  )

Preliminary Findings
After reviewing the headlines and the text-analysis charts and plots, I
found that not many articles had been published in the last few days. The
few articles that were returned contained very few words that could be
analyzed for sentiment. Of the seven unique articles, only one
sentiment-driving word was found. I searched only one topic, and I know
that expanding the search would yield more results for analysis; searching
a single topic seems to limit the results. I thought a generic topic like
energy would return many results, but that was not the case. In the future,
I would expand the number of topics when using the top_headlines function,
for example by pairing energy with gasoline. I had to switch to the
every_news function to retrieve more data for sentiment analysis. Once I
did that, I had more data to analyze.