News Sentiment & Text Analytics: Public Policy News Topics

## Install only the packages that are not already available, so that
## re-running or re-knitting this script does not reinstall every
## dependency each time.
required_packages <- c("tidyverse", "tidytext", "jsonlite",
                       "wordcloud", "RColorBrewer", "lubridate",
                       "scales", "knitr", "kableExtra", "readr")

## requireNamespace() returns FALSE (instead of erroring) for a missing package.
missing_packages <- required_packages[
  !vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)
]

if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

library(tidyverse)
library(tidytext)
library(jsonlite)
library(wordcloud)
library(RColorBrewer)
library(lubridate)
library(scales)
library(knitr)
library(kableExtra)
library(readr)

## Render a data frame as a styled kable table.
##
## Arguments:
##   data       Table to render (passed to kable() as its first argument).
##   caption    Optional table caption (default NULL).
##   digits     Number of digits passed to kable() (default 3).
##   font_size  Font size passed to kable_styling() (default 8).
##
## Returns the object produced by kable_styling().
safe_kable <- function(data, caption = NULL, digits = 3, font_size = 8) {
  base_table <- kable(
    data,
    caption = caption,
    booktabs = TRUE,
    digits = digits
  )

  kable_styling(
    base_table,
    bootstrap_options = c("striped", "hover", "condensed"),
    latex_options = c("scale_down", "hold_position"),
    full_width = FALSE,
    font_size = font_size
  )
}

## Truncate text on the right so it is at most `width` characters wide.
##
## Arguments:
##   x      Values to shorten; coerced to character first.
##   width  Maximum width passed to stringr::str_trunc() (default 70).
##
## Returns a character vector the same length as `x`.
short_text <- function(x, width = 70) {
  text <- as.character(x)
  stringr::str_trunc(text, width = width, side = "right")
}

## The NewsAPI key is read from the NEWSAPI_KEY environment variable
## (for example, add a line `NEWSAPI_KEY=your_key` to ~/.Renviron and
## restart R). Never hard-code the key in this file: anything knitted,
## shared, or uploaded to GitHub would expose it.
## NOTE(review): a literal key was previously written on this line; treat it
## as leaked and revoke/regenerate it in the NewsAPI account.

api_key <- Sys.getenv("NEWSAPI_KEY", unset = "")

## Fail early with a clear message when the key is missing or is still the
## template placeholder.
if (api_key == "PASTE_YOUR_NEWSAPI_KEY_HERE" || api_key == "") {
  stop(
    "Please set the NEWSAPI_KEY environment variable to your NewsAPI key before knitting.",
    call. = FALSE
  )
}

## Topic set for the in-class NewsAPI webcrawl exercise, one row per topic.
## query_label = clean display name used in tables/plots.
## api_query   = actual NewsAPI search query sent to the API.

topics <- tibble::tibble(
  query_label = c(
    "Trump",
    "DHS",
    "ICE",
    "Immigration Enforcement"
  ),
  api_query = c(
    "Trump",
    "\"Department of Homeland Security\" OR DHS",
    "\"Immigration and Customs Enforcement\" OR (ICE AND immigration)",
    "\"immigration enforcement\" OR \"border enforcement\""
  )
)

## Show the topic table, shortening long API queries to 60 characters.
safe_kable(
  mutate(topics, api_query = short_text(api_query, 60)),
  caption = "NewsAPI Search Topics"
)

NewsAPI Search Topics
query_label	api_query
Trump	Trump
DHS	“Department of Homeland Security” OR DHS
ICE	“Immigration and Customs Enforcement” OR (ICE AND immigra…
Immigration Enforcement	“immigration enforcement” OR “border enforcement”

## Fetch recent English-language articles for one topic from the NewsAPI
## "everything" endpoint.
##
## Arguments:
##   query_label  Display name stored in the `query` column of the result.
##   api_query    Search expression sent to NewsAPI (URL-encoded here).
##   api_key      NewsAPI key appended to the request URL.
##   page_size    Number of articles requested (default 20).
##
## Returns a tibble with one row per article (dots in the flattened column
## names replaced by underscores) plus `query` and `api_query` columns.
## On any failure (request error, non-"ok" status, or no articles) a warning
## is raised and an empty tibble is returned, so the remaining topics can
## still be fetched by the caller.
fetch_news <- function(query_label, api_query, api_key, page_size = 20) {
  url <- paste0(
    "https://newsapi.org/v2/everything?",
    "q=",        URLencode(api_query, reserved = TRUE),
    "&language=en",
    "&sortBy=publishedAt",
    "&pageSize=", page_size,
    "&apiKey=",  api_key
  )

  ## fromJSON() signals an R error on network problems and HTTP error codes,
  ## which would otherwise abort the whole crawl before the status check below.
  response <- tryCatch(
    fromJSON(url, flatten = TRUE),
    error = function(e) {
      msg <- conditionMessage(e)
      ## The error text can contain the request URL; keep the key out of logs.
      if (nzchar(api_key)) {
        msg <- gsub(api_key, "<redacted>", msg, fixed = TRUE)
      }
      warning(
        "NewsAPI request failed for topic: ", query_label,
        ". Message: ", msg,
        call. = FALSE
      )
      NULL
    }
  )

  if (is.null(response)) {
    return(tibble())
  }

  if (!is.null(response$status) && response$status != "ok") {
    warning(
      "NewsAPI request failed for topic: ", query_label,
      ". Message: ", response$message,
      call. = FALSE
    )
    return(tibble())
  }

  if (is.null(response$articles) || length(response$articles) == 0) {
    warning("No articles returned for topic: ", query_label, call. = FALSE)
    return(tibble())
  }

  articles <- as_tibble(response$articles)

  articles %>%
    ## Flattened JSON yields names such as "source.name"; use underscores.
    rename_with(~ str_replace_all(.x, "\\.", "_")) %>%
    mutate(
      query = query_label,
      api_query = api_query
    )
}

## Fetch every topic (one request per row of `topics`) and stack the
## per-topic results into a single tibble.
news_raw <- topics %>%
  pmap(function(query_label, api_query) {
    fetch_news(
      query_label = query_label,
      api_query = api_query,
      api_key = api_key,
      page_size = 20
    )
  }) %>%
  bind_rows()

## Stop early when nothing at all came back.
if (nrow(news_raw) < 1) {
  stop("No news articles were returned. Check your API key, NewsAPI limit, or internet connection.")
}

glimpse(news_raw)

## Rows: 69
## Columns: 11
## $ author      <chr> "ANI", NA, NA, "RTÉ News", "RTÉ News", "X.com", "Sameer Su…
## $ title       <chr> "No nuclear weapon, conditional relief & open Hormuz: Vanc…
## $ description <chr> "US Vice President JD Vance described that the peace agree…
## $ url         <chr> "https://economictimes.indiatimes.com/news/defence/no-nucl…
## $ urlToImage  <chr> "https://img.etimg.com/thumb/msid-131786269,width-1200,hei…
## $ publishedAt <chr> "2026-06-17T03:46:27Z", "2026-06-17T03:45:37Z", "2026-06-1…
## $ content     <chr> "US Vice President JD Vance described that the peace agree…
## $ source_id   <chr> "the-times-of-india", NA, NA, "rte", "rte", NA, NA, NA, "t…
## $ source_name <chr> "The Times of India", "Fark.com", "CNA", "RTE", "RTE", "Fr…
## $ query       <chr> "Trump", "Trump", "Trump", "Trump", "Trump", "Trump", "Tru…
## $ api_query   <chr> "Trump", "Trump", "Trump", "Trump", "Trump", "Trump", "Tru…

## Required group submission item:
## export the raw NewsAPI data to CSV and report the file name.

readr::write_csv(
  x = news_raw,
  file = "newsapi_raw_data_public_policy_topics.csv"
)

cat("Raw NewsAPI data exported to: newsapi_raw_data_public_policy_topics.csv", sep = "")

## Raw NewsAPI data exported to: newsapi_raw_data_public_policy_topics.csv

## Clean the raw articles: drop rows without a title, parse the publication
## timestamp, fill in missing source names, and build a normalised headline
## (`title_clean`) that is used for tokenising and for de-duplication
## within each topic.
news_clean <- news_raw %>%
  filter(!is.na(.data$title)) %>%
  mutate(
    pub_date     = ymd_hms(.data$publishedAt, quiet = TRUE),
    pub_day      = as.Date(pub_date),
    source_clean = coalesce(as.character(.data$source_name), "Unknown Source"),
    ## Strip a trailing " - Publisher" suffix. The hyphen must be surrounded
    ## by whitespace so hyphenated words ("Trump-endorsed", "1-to-2") are
    ## kept; matching a bare hyphen deleted everything after the last hyphen
    ## in the headline.
    title_clean  = str_remove(.data$title, "\\s+-\\s+[^-]+$"),
    ## Replace punctuation with spaces, collapse whitespace, then lower-case.
    title_clean  = str_squish(str_replace_all(title_clean, "[^[:alnum:][:space:]]", " ")),
    title_clean  = str_to_lower(title_clean)
  ) %>%
  distinct(query, title_clean, .keep_all = TRUE)

cat("Total unique headlines:", nrow(news_clean), "\n")

## Total unique headlines: 69

## Show the first 12 cleaned headlines with shortened title/source text.
news_clean %>%
  head(12) %>%
  mutate(
    source_display = short_text(source_clean, 35),
    title_display = short_text(title, 75)
  ) %>%
  select(query, title_display, source_display, pub_day) %>%
  safe_kable(caption = "Sample Cleaned Headlines", font_size = 7)

Sample Cleaned Headlines
query	title_display	source_display	pub_day
Trump	No nuclear weapon, conditional relief & open Hormuz: Vance outlines 3 pi…	The Times of India	2026-06-17
Trump	A little perspective from Mr. Global about how Stinky dun goofed so badl…	Fark.com	2026-06-17
Trump	Vietnam maintains 2026 GDP target despite trade deficit, inflation pressure	CNA	2026-06-17
Trump	Macron to close G7 with AI discussions, Trump dinner	RTE	2026-06-17
Trump	Macron to close G7 summit with AI discussions	RTE	2026-06-17
Trump	JD Vance Just EVISCERATED The Hosts of The View over their false claims …	Freerepublic.com	2026-06-17
Trump	Meghan McCain trashes former co-stars on The View over ‘undisciplined’ J…	Dailymail.com	2026-06-17
Trump	The FDA Just Put Psilocybin and an MDMA-Like Drug on a 1-to-2 Month Appr…	Medical Daily	2026-06-17
Trump	Decision-Day Guide: Warsh faces first big test as Fed Chair	The Times of India	2026-06-17
Trump	Kevin Warsh prepares for first Fed meeting as inflation hits 4%	Crypto Briefing	2026-06-17
Trump	Gold extends gains as rate-hike bets ease ahead of Fed verdict	BusinessLine	2026-06-17
Trump	Alabama US Senate: Voters decide tonight between Trump-endorsed Barry Mo…	Slashdot.org	2026-06-17

## Number of unique headlines per topic, largest first.
headline_counts <- count(news_clean, query, sort = TRUE)

safe_kable(headline_counts, caption = "Unique Headline Count by Topic")

Unique Headline Count by Topic
query	n
ICE	19
DHS	17
Trump	17
Immigration Enforcement	16

## One row per headline word: tokenise the cleaned titles, drop stop words,
## pure-number tokens, and words shorter than three characters.
news_tokens <- news_clean %>%
  select(query, title_clean) %>%
  unnest_tokens(output = word, input = title_clean) %>%
  anti_join(stop_words, by = "word") %>%
  filter(
    str_detect(word, "^\\d+$", negate = TRUE),
    nchar(word) >= 3
  )

## The 20 most frequent words across all topics.
top_words <- news_tokens %>%
  count(word, sort = TRUE) %>%
  head(20)

safe_kable(top_words, caption = "Top 20 Words Across All Headlines")

Top 20 Words Across All Headlines
word	n
trump	17
antifa	10
minnesota	10
federal	9
anti	7
ice	6
charged	5
feds	5
conspiracy	4
immigration	4
iran	4
senate	4
tied	4
alabama	3
block	3
charges	3
court	3
judge	3
operations	3
people	3

## Horizontal bar chart of the top words, ordered by frequency.
ggplot(
  data = mutate(top_words, word = fct_reorder(word, n)),
  mapping = aes(x = n, y = word, fill = n)
) +
  geom_col(show.legend = FALSE) +
  scale_fill_gradient(low = "#a8d8ea", high = "#0077b6") +
  labs(
    title    = "Top 20 Words in News Headlines",
    subtitle = "Trump, DHS, ICE, and Immigration Enforcement",
    caption  = "Source: NewsAPI",
    x        = "Count",
    y        = NULL
  ) +
  theme_minimal(base_size = 13)

## Word frequencies for the cloud; words that occur only once are dropped.
word_freq <- filter(count(news_tokens, word, sort = TRUE), n > 1)

## Fixed seed so the word layout is reproducible between knits.
set.seed(42)

with(
  word_freq,
  wordcloud(
    words = word,
    freq = n,
    min.freq = 1,
    max.words = 80,
    random.order = FALSE,
    colors = brewer.pal(8, "Dark2"),
    scale = c(3.5, 0.5)
  )
)

title("News Headline Word Cloud — Public Policy Topics")

## Web-safe AFINN import.
## This avoids tidytext::get_sentiments("afinn"), which can trigger
## textdata::menu() and break non-interactive R Markdown knitting.

## Read the tab-separated AFINN lexicon (word, score) directly from GitHub.
afinn <- "https://raw.githubusercontent.com/fnielsen/afinn/master/afinn/data/AFINN-en-165.txt" %>%
  read_delim(
    delim = "\t",
    col_names = c("word", "value"),
    show_col_types = FALSE
  )

## Per-topic AFINN summary over the headline words found in the lexicon:
## number of matched words, mean score (3 decimals), and total score,
## ordered from most to least positive mean.
sentiment_afinn <- news_tokens %>%
  inner_join(afinn, by = "word") %>%
  group_by(query) %>%
  summarise(
    total_matched_words = n(),
    mean_sentiment      = round(mean(value), digits = 3),
    sum_sentiment       = sum(value),
    .groups = "drop"
  ) %>%
  arrange(-mean_sentiment)

safe_kable(sentiment_afinn, caption = "AFINN Sentiment Score by Topic")

AFINN Sentiment Score by Topic
query	total_matched_words	mean_sentiment	sum_sentiment
Trump	19	-0.263	-5
DHS	22	-0.364	-8
Immigration Enforcement	29	-1.655	-48
ICE	28	-1.786	-50

## Bar chart of mean AFINN score per topic, coloured by sign of the score.
sentiment_afinn %>%
  mutate(
    sentiment_dir = if_else(mean_sentiment < 0, "Negative", "Positive"),
    query = fct_reorder(query, mean_sentiment)
  ) %>%
  ggplot(mapping = aes(x = mean_sentiment, y = query, fill = sentiment_dir)) +
  geom_col(width = 0.6) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray40") +
  scale_fill_manual(values = c("Positive" = "#2ecc71", "Negative" = "#e74c3c")) +
  labs(
    title    = "Mean AFINN Sentiment Score by Topic",
    subtitle = "Higher scores indicate more positive headline language",
    caption  = "Source: NewsAPI headlines and AFINN sentiment lexicon",
    x        = "Mean Sentiment Score",
    y        = NULL,
    fill     = NULL
  ) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "top")

## Bing sentiment is bundled with tidytext and should knit normally.

bing <- get_sentiments("bing")

## Per-topic counts of Bing-positive and Bing-negative headline words and
## their difference. Counting with summarise() guarantees that both the
## `negative` and `positive` columns exist (as integers) even when no matched
## word carries one of the two polarities; a pivot_wider() on the observed
## sentiment values would omit that column and break later references to it.
sentiment_bing <- news_tokens %>%
  inner_join(bing, by = "word") %>%
  group_by(query) %>%
  summarise(
    negative = sum(sentiment == "negative"),
    positive = sum(sentiment == "positive"),
    .groups = "drop"
  ) %>%
  mutate(net_sentiment = positive - negative)

sentiment_bing %>%
  safe_kable(caption = "Bing Sentiment Count by Topic")

Bing Sentiment Count by Topic
query	negative	positive	net_sentiment
DHS	11	10	-1
ICE	14	2	-12
Immigration Enforcement	11	8	-3
Trump	8	15	7

## Ten most frequent positive and ten most frequent negative headline words,
## shown in one facet per sentiment.
bing_top_words <- function() {
  news_tokens %>%
    inner_join(bing, by = "word") %>%
    count(word, sentiment, sort = TRUE) %>%
    group_by(sentiment) %>%
    slice_head(n = 10) %>%
    ungroup() %>%
    mutate(word = reorder_within(word, n, sentiment))
}

ggplot(bing_top_words(), aes(x = n, y = word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ sentiment, scales = "free_y") +
  scale_y_reordered() +
  scale_fill_manual(values = c("positive" = "#2ecc71", "negative" = "#e74c3c")) +
  labs(
    title   = "Top Positive & Negative Words in Headlines",
    caption = "Source: NewsAPI headlines and Bing sentiment lexicon",
    x       = "Count",
    y       = NULL
  ) +
  theme_minimal(base_size = 12)

## NRC is disabled for the web-knit version because tidytext::get_sentiments("nrc")
## can trigger the same non-interactive textdata download/menu issue.
## If your instructor requires NRC, download/cache it interactively first,
## then remove eval=FALSE from this chunk.

nrc <- get_sentiments("nrc")

## Share of each NRC emotion among a topic's emotion-tagged headline words.
## The positive/negative polarity labels are excluded so only emotions remain.
emotion_nrc <- news_tokens %>%
  inner_join(nrc, by = "word") %>%
  filter(!sentiment %in% c("positive", "negative")) %>%
  count(query, sentiment) %>%
  group_by(query) %>%
  mutate(prop = n / sum(n)) %>%
  ## Drop the grouping so later operations on emotion_nrc are not
  ## silently applied per topic.
  ungroup()

ggplot(emotion_nrc, aes(x = sentiment, y = prop, fill = query)) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = percent_format()) +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title   = "NRC Emotion Proportions by Topic",
    x       = "Emotion",
    y       = "Proportion of Emotional Words",
    fill    = "Topic",
    caption = "Source: NewsAPI headlines and NRC emotion lexicon"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    axis.text.x = element_text(angle = 30, hjust = 1),
    legend.position = "top"
  )

## TF-IDF with each topic treated as one document; keep the 12
## highest-scoring words per topic (ties cut off at 12).
tfidf_words <- news_tokens %>%
  count(query, word) %>%
  bind_tf_idf(term = word, document = query, n = n) %>%
  group_by(query) %>%
  slice_max(order_by = tf_idf, n = 12, with_ties = FALSE) %>%
  ungroup()

## One facet per topic, words ordered by TF-IDF within each facet.
ggplot(
  data = mutate(tfidf_words, word = reorder_within(word, tf_idf, query)),
  mapping = aes(x = tf_idf, y = word, fill = query)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ query, scales = "free_y", ncol = 2) +
  scale_y_reordered() +
  scale_fill_brewer(palette = "Set1") +
  labs(
    title    = "Top 12 TF-IDF Terms by Topic",
    subtitle = "Words most distinctive to each NewsAPI search topic",
    caption  = "Source: NewsAPI headlines",
    x        = "TF-IDF Score",
    y        = NULL
  ) +
  theme_minimal(base_size = 12)

## Combine the AFINN summary, the Bing counts, and the headline counts into
## one table per topic, with presentation-ready column names.
summary_tbl <- sentiment_afinn %>%
  left_join(
    select(sentiment_bing, query, positive, negative, net_sentiment),
    by = "query"
  ) %>%
  left_join(headline_counts, by = "query") %>%
  ## select() renames and orders the columns in a single step.
  select(
    Topic                = query,
    `Unique Headlines`   = n,
    `Words Matched`      = total_matched_words,
    `Mean AFINN`         = mean_sentiment,
    `AFINN Sum`          = sum_sentiment,
    Positive             = positive,
    Negative             = negative,
    `Net Bing Sentiment` = net_sentiment
  )

## Colour the Mean AFINN column (4th): green when >= 0, red otherwise.
safe_kable(
  summary_tbl,
  caption = "Sentiment Summary: All Topics",
  font_size = 7
) %>%
  column_spec(
    4,
    color = ifelse(summary_tbl[["Mean AFINN"]] >= 0, "darkgreen", "red")
  )

Sentiment Summary: All Topics
Topic	Unique Headlines	Words Matched	Mean AFINN	AFINN Sum	Positive	Negative	Net Bing Sentiment
Trump	17	19	-0.263	-5	15	8	7
DHS	17	22	-0.364	-8	10	11	-1
Immigration Enforcement	16	29	-1.655	-48	8	11	-3
ICE	19	28	-1.786	-50	2	14	-12

## Required and optional submission files.
## Required: raw data scraped from NewsAPI.
## Optional but useful: cleaned headlines and sentiment summary.

## Write the raw data, cleaned headlines, and sentiment summary, in that order.
walk2(
  list(news_raw, news_clean, summary_tbl),
  c(
    "newsapi_raw_data_public_policy_topics.csv",
    "newsapi_clean_headlines_public_policy_topics.csv",
    "newsapi_sentiment_summary_public_policy_topics.csv"
  ),
  function(tbl, path) write_csv(tbl, path)
)

## Status line printed before the list of exported files.
cat("Export complete. Files saved to the RStudio Cloud project folder:", sep = "")

## Export complete. Files saved to the RStudio Cloud project folder:

## List entry for the raw-data export.
cat("\n- newsapi_raw_data_public_policy_topics.csv", sep = "")

## 
## - newsapi_raw_data_public_policy_topics.csv

## List entry for the cleaned-headlines export.
cat("\n- newsapi_clean_headlines_public_policy_topics.csv", sep = "")

## 
## - newsapi_clean_headlines_public_policy_topics.csv

## List entry for the sentiment-summary export.
cat("\n- newsapi_sentiment_summary_public_policy_topics.csv", sep = "")

## 
## - newsapi_sentiment_summary_public_policy_topics.csv

## Print a heading followed by an automatically generated two-sentence
## summary; when the summary table is empty, print a fallback note instead.
cat("## Two-Sentence Summary of Findings\n\n")

if (nrow(summary_tbl) == 0) {
  cat("The sentiment summary table did not contain enough matched sentiment terms to automatically generate a finding. The headline and TF-IDF outputs should still be reviewed manually to identify the clearest topic-level patterns.")
} else {
  ## Topic rows with the highest and the lowest mean AFINN score.
  most_positive <- slice(arrange(summary_tbl, desc(`Mean AFINN`)), 1)
  most_negative <- slice(arrange(summary_tbl, `Mean AFINN`), 1)

  cat(
    "Among the four public-policy news topics analyzed from NewsAPI headlines, ",
    most_positive$Topic,
    " had the highest average AFINN sentiment score, while ",
    most_negative$Topic,
    " had the lowest average AFINN sentiment score. ",
    "The TF-IDF results show that each topic was associated with distinct headline language, suggesting that the topics differed not only in sentiment but also in the specific news themes driving the coverage.",
    sep = ""
  )
}

Two-Sentence Summary of Findings

Among the four public-policy news topics analyzed from NewsAPI headlines, Trump had the highest average AFINN sentiment score, while ICE had the lowest average AFINN sentiment score. The TF-IDF results show that each topic was associated with distinct headline language, suggesting that the topics differed not only in sentiment but also in the specific news themes driving the coverage.

News Sentiment & Text Analytics: Public Policy News Topics

Student Example

2026-06-18

Two-Sentence Summary of Findings