install.packages(c("tidyverse", "tidytext", "jsonlite",
                   "wordcloud", "RColorBrewer", "lubridate",
                   "scales", "knitr", "kableExtra", "readr"))
library(tidyverse)
library(tidytext)
library(jsonlite)
library(wordcloud)
library(RColorBrewer)
library(lubridate)
library(scales)
library(knitr)
library(kableExtra)
library(readr)
## Render a data frame as a compact, styled kable table.
##   data:      data frame or tibble to display.
##   caption:   optional table caption (NULL for none).
##   digits:    digits argument forwarded to kable().
##   font_size: font size forwarded to kable_styling().
## Returns the object produced by kable_styling().
safe_kable <- function(data, caption = NULL, digits = 3, font_size = 8) {
  # Build the plain table first, then layer the styling on top of it.
  base_table <- kable(
    data,
    caption = caption,
    booktabs = TRUE,
    digits = digits
  )

  kable_styling(
    base_table,
    bootstrap_options = c("striped", "hover", "condensed"),
    latex_options = c("scale_down", "hold_position"),
    full_width = FALSE,
    font_size = font_size
  )
}

## Shorten text for display: coerce `x` to character and truncate each
## element on the right to at most `width` characters.
short_text <- function(x, width = 70) {
  text <- as.character(x)
  stringr::str_trunc(text, width = width, side = "right")
}
## NewsAPI key.
## The key is read from the NEWSAPI_KEY environment variable so that it never
## appears in this file. Add a line such as
##   NEWSAPI_KEY=your_key_here
## to ~/.Renviron (or a project-level .Renviron that is excluded from version
## control) and restart R before knitting.
## Do not post the key publicly or upload it to GitHub. Any key that was ever
## pasted directly into this script should be treated as exposed and
## regenerated in the NewsAPI account dashboard.

api_key <- Sys.getenv("NEWSAPI_KEY", unset = "")

# Fail early with a clear message when the key is missing or is still the
# template placeholder.
if (!nzchar(api_key) || api_key == "PASTE_YOUR_NEWSAPI_KEY_HERE") {
  stop(
    "Please set the NEWSAPI_KEY environment variable to your NewsAPI key ",
    "before knitting.",
    call. = FALSE
  )
}
## Topic definitions for the in-class NewsAPI webcrawl exercise.
##   query_label: clean display name used in tables and plots.
##   api_query:   the actual search expression sent to NewsAPI.

topics <- tibble::tibble(
  query_label = c("Trump", "DHS", "ICE", "Immigration Enforcement"),
  api_query = c(
    "Trump",
    "\"Department of Homeland Security\" OR DHS",
    "\"Immigration and Customs Enforcement\" OR (ICE AND immigration)",
    "\"immigration enforcement\" OR \"border enforcement\""
  )
)

## Show the topic list, truncating long queries to 60 characters for display.
safe_kable(
  mutate(topics, api_query = short_text(api_query, 60)),
  caption = "NewsAPI Search Topics"
)
NewsAPI Search Topics
query_label api_query
Trump Trump
DHS “Department of Homeland Security” OR DHS
ICE “Immigration and Customs Enforcement” OR (ICE AND immigra…
Immigration Enforcement “immigration enforcement” OR “border enforcement”
## Fetch English-language articles for one topic from the NewsAPI
## "everything" endpoint, sorted by publication time.
##
## Arguments:
##   query_label  Display name of the topic; stored in the `query` column.
##   api_query    Search expression sent to NewsAPI; stored in `api_query`.
##   api_key      NewsAPI key appended to the request URL.
##   page_size    Value sent as the `pageSize` request parameter.
##
## Returns a tibble with one row per article. Dots in the flattened column
## names are replaced by underscores (e.g. source.name -> source_name).
## Every failure mode (request error, non-"ok" status, no articles) emits a
## warning and returns an empty tibble, so a single failing topic cannot
## abort the crawl of the remaining topics.
fetch_news <- function(query_label, api_query, api_key, page_size = 20) {
  url <- paste0(
    "https://newsapi.org/v2/everything?",
    "q=",        URLencode(api_query, reserved = TRUE),
    "&language=en",
    "&sortBy=publishedAt",
    "&pageSize=", page_size,
    "&apiKey=",  api_key
  )

  # fromJSON() signals an R error when the request cannot be completed
  # (for example no connection, or an HTTP error status), so catch it here
  # instead of letting it propagate to the caller.
  response <- tryCatch(
    fromJSON(url, flatten = TRUE),
    error = function(e) {
      detail <- conditionMessage(e)
      # Defensive: never echo the key if the error text happens to quote it.
      if (is.character(api_key) && isTRUE(nzchar(api_key))) {
        detail <- gsub(api_key, "<api key>", detail, fixed = TRUE)
      }
      warning(
        "NewsAPI request failed for topic: ", query_label,
        ". Message: ", detail,
        call. = FALSE
      )
      NULL
    }
  )

  if (is.null(response)) {
    return(tibble())
  }

  # The API reported a problem inside an otherwise readable response.
  if (!is.null(response$status) && response$status != "ok") {
    warning(
      "NewsAPI request failed for topic: ", query_label,
      ". Message: ", response$message,
      call. = FALSE
    )
    return(tibble())
  }

  if (is.null(response$articles) || length(response$articles) == 0) {
    warning("No articles returned for topic: ", query_label, call. = FALSE)
    return(tibble())
  }

  articles <- as_tibble(response$articles)

  # Normalise column names and tag each row with the topic it came from.
  articles %>%
    rename_with(~ str_replace_all(.x, "\\.", "_")) %>%
    mutate(
      query = query_label,
      api_query = api_query
    )
}

## Call fetch_news() once per row of `topics` (columns are matched to the
## function's arguments by name) and stack the per-topic results by row.
news_raw <- topics %>%
  pmap(fetch_news, api_key = api_key, page_size = 20) %>%
  bind_rows()

## Abort when nothing at all came back.
if (nrow(news_raw) < 1) {
  stop("No news articles were returned. Check your API key, NewsAPI limit, or internet connection.")
}

## Quick structural overview of the combined raw data.
glimpse(news_raw)
## Rows: 69
## Columns: 11
## $ author      <chr> "ANI", NA, NA, "RTÉ News", "RTÉ News", "X.com", "Sameer Su…
## $ title       <chr> "No nuclear weapon, conditional relief & open Hormuz: Vanc…
## $ description <chr> "US Vice President JD Vance described that the peace agree…
## $ url         <chr> "https://economictimes.indiatimes.com/news/defence/no-nucl…
## $ urlToImage  <chr> "https://img.etimg.com/thumb/msid-131786269,width-1200,hei…
## $ publishedAt <chr> "2026-06-17T03:46:27Z", "2026-06-17T03:45:37Z", "2026-06-1…
## $ content     <chr> "US Vice President JD Vance described that the peace agree…
## $ source_id   <chr> "the-times-of-india", NA, NA, "rte", "rte", NA, NA, NA, "t…
## $ source_name <chr> "The Times of India", "Fark.com", "CNA", "RTE", "RTE", "Fr…
## $ query       <chr> "Trump", "Trump", "Trump", "Trump", "Trump", "Trump", "Tru…
## $ api_query   <chr> "Trump", "Trump", "Trump", "Trump", "Trump", "Trump", "Tru…
## Required group submission item:
## save the raw NewsAPI results to CSV before any cleaning is applied.

readr::write_csv(news_raw, "newsapi_raw_data_public_policy_topics.csv")

## Confirm the export in the knitted output.
cat("Raw NewsAPI data exported to: newsapi_raw_data_public_policy_topics.csv", sep = "")
## Raw NewsAPI data exported to: newsapi_raw_data_public_policy_topics.csv
## Clean the raw articles:
##   - drop rows without a title;
##   - parse publishedAt into a date-time (pub_date) and a date (pub_day);
##   - replace a missing source name with "Unknown Source";
##   - build title_clean: strip a trailing " - Source" suffix, replace
##     non-alphanumeric characters with spaces, squish whitespace, lower-case;
##   - keep one row per (query, title_clean) pair.
news_clean <- news_raw %>%
  filter(!is.na(.data$title)) %>%
  mutate(
    pub_date    = ymd_hms(.data$publishedAt, quiet = TRUE),
    pub_day     = as.Date(pub_date),
    source_clean = if_else(
      !is.na(.data$source_name),
      as.character(.data$source_name),
      "Unknown Source"
    ),
    # Require whitespace on BOTH sides of the hyphen. Allowing zero whitespace
    # would also match the hyphen inside compound words such as "rate-hike"
    # and cut the headline off at that word.
    title_clean = str_remove(.data$title, "\\s+-\\s+[^-]+$"),
    title_clean = str_squish(str_replace_all(title_clean, "[^[:alnum:][:space:]]", " ")),
    title_clean = str_to_lower(title_clean)
  ) %>%
  distinct(query, title_clean, .keep_all = TRUE)

cat("Total unique headlines:", nrow(news_clean), "\n")
## Total unique headlines: 69
## Preview the first 12 cleaned headlines, truncating long titles and
## source names so the table stays narrow.
safe_kable(
  news_clean %>%
    transmute(
      query,
      title_display = short_text(title, 75),
      source_display = short_text(source_clean, 35),
      pub_day
    ) %>%
    head(12),
  caption = "Sample Cleaned Headlines",
  font_size = 7
)
Sample Cleaned Headlines
query title_display source_display pub_day
Trump No nuclear weapon, conditional relief & open Hormuz: Vance outlines 3 pi… The Times of India 2026-06-17
Trump A little perspective from Mr. Global about how Stinky dun goofed so badl… Fark.com 2026-06-17
Trump Vietnam maintains 2026 GDP target despite trade deficit, inflation pressure CNA 2026-06-17
Trump Macron to close G7 with AI discussions, Trump dinner RTE 2026-06-17
Trump Macron to close G7 summit with AI discussions RTE 2026-06-17
Trump JD Vance Just EVISCERATED The Hosts of The View over their false claims … Freerepublic.com 2026-06-17
Trump Meghan McCain trashes former co-stars on The View over ‘undisciplined’ J… Dailymail.com 2026-06-17
Trump The FDA Just Put Psilocybin and an MDMA-Like Drug on a 1-to-2 Month Appr… Medical Daily 2026-06-17
Trump Decision-Day Guide: Warsh faces first big test as Fed Chair The Times of India 2026-06-17
Trump Kevin Warsh prepares for first Fed meeting as inflation hits 4% Crypto Briefing 2026-06-17
Trump Gold extends gains as rate-hike bets ease ahead of Fed verdict BusinessLine 2026-06-17
Trump Alabama US Senate: Voters decide tonight between Trump-endorsed Barry Mo… Slashdot.org 2026-06-17
## Number of de-duplicated headlines per topic, largest count first.
headline_counts <- news_clean %>%
  count(query) %>%
  arrange(desc(n))

safe_kable(headline_counts, caption = "Unique Headline Count by Topic")
Unique Headline Count by Topic
query n
ICE 19
DHS 17
Trump 17
Immigration Enforcement 16
## Split cleaned headlines into one word per row, then drop stop words,
## purely numeric tokens, and tokens of one or two characters.
news_tokens <- news_clean %>%
  select(query, title_clean) %>%
  unnest_tokens(output = word, input = title_clean) %>%
  anti_join(stop_words, by = "word") %>%
  filter(!str_detect(word, "^\\d+$")) %>%
  filter(nchar(word) > 2)

## The 20 most frequent words across all topics.
top_words <- news_tokens %>%
  count(word, sort = TRUE) %>%
  head(20)

safe_kable(top_words, caption = "Top 20 Words Across All Headlines")
Top 20 Words Across All Headlines
word n
trump 17
antifa 10
minnesota 10
federal 9
anti 7
ice 6
charged 5
feds 5
conspiracy 4
immigration 4
iran 4
senate 4
tied 4
alabama 3
block 3
charges 3
court 3
judge 3
operations 3
people 3
## Horizontal bar chart of the top 20 headline words. Bars are ordered by
## count and shaded from light to dark blue as the count increases.
ggplot(
  data = mutate(top_words, word = fct_reorder(word, n)),
  mapping = aes(x = n, y = word, fill = n)
) +
  geom_col(show.legend = FALSE) +
  scale_fill_gradient(low = "#a8d8ea", high = "#0077b6") +
  theme_minimal(base_size = 13) +
  labs(
    title    = "Top 20 Words in News Headlines",
    subtitle = "Trump, DHS, ICE, and Immigration Enforcement",
    x        = "Count",
    y        = NULL,
    caption  = "Source: NewsAPI"
  )

## Word frequencies for the cloud: keep only words seen at least twice.
word_freq <- news_tokens %>%
  count(word, sort = TRUE) %>%
  filter(n > 1)

## Fix the random seed so the cloud layout is reproducible between knits.
set.seed(42)

wordcloud(
  words = word_freq[["word"]],
  freq = word_freq[["n"]],
  min.freq = 1,
  max.words = 80,
  random.order = FALSE,
  colors = RColorBrewer::brewer.pal(8, "Dark2"),
  scale = c(3.5, 0.5)
)

## Add a heading to the base-graphics plot drawn above.
title("News Headline Word Cloud — Public Policy Topics")

## Web-safe AFINN import.
## The lexicon is read straight from its public tab-separated file instead of
## through tidytext::get_sentiments("afinn"), which can trigger
## textdata::menu() and break non-interactive R Markdown knitting.

afinn <- read_delim(
  file = "https://raw.githubusercontent.com/fnielsen/afinn/master/afinn/data/AFINN-en-165.txt",
  delim = "\t",
  col_names = c("word", "value"),
  show_col_types = FALSE
)

## Per-topic AFINN summary over the headline words found in the lexicon:
## number of matched words, mean score (3 decimals), and total score,
## ordered from the highest mean score to the lowest.
sentiment_afinn <- news_tokens %>%
  inner_join(afinn, by = "word") %>%
  group_by(query) %>%
  summarise(
    total_matched_words = n(),
    mean_sentiment = round(mean(value), digits = 3),
    sum_sentiment = sum(value),
    .groups = "drop"
  ) %>%
  arrange(desc(mean_sentiment))

safe_kable(sentiment_afinn, caption = "AFINN Sentiment Score by Topic")
AFINN Sentiment Score by Topic
query total_matched_words mean_sentiment sum_sentiment
Trump 19 -0.263 -5
DHS 22 -0.364 -8
Immigration Enforcement 29 -1.655 -48
ICE 28 -1.786 -50
## Bar chart of mean AFINN score per topic. Topics are ordered by score and
## bars are coloured by the sign of the score; the dashed line marks zero.
ggplot(
  data = mutate(
    sentiment_afinn,
    query = fct_reorder(query, mean_sentiment),
    sentiment_dir = if_else(mean_sentiment < 0, "Negative", "Positive")
  ),
  mapping = aes(x = mean_sentiment, y = query, fill = sentiment_dir)
) +
  geom_col(width = 0.6) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray40") +
  scale_fill_manual(values = c("Negative" = "#e74c3c", "Positive" = "#2ecc71")) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "top") +
  labs(
    title   = "Mean AFINN Sentiment Score by Topic",
    subtitle = "Higher scores indicate more positive headline language",
    x       = "Mean Sentiment Score",
    y       = NULL,
    fill    = NULL,
    caption = "Source: NewsAPI headlines and AFINN sentiment lexicon"
  )

## Bing sentiment is bundled with tidytext and should knit normally.

bing <- get_sentiments("bing")

## Count positive and negative Bing matches per topic and their difference.
## Each class is tallied directly instead of pivoting counts wider: a pivot
## only creates a column for a class that actually occurs, so a crawl with no
## positive (or no negative) matches would leave that column missing and the
## net_sentiment calculation would fail. Summing logical tests always yields
## both integer columns.
sentiment_bing <- news_tokens %>%
  inner_join(bing, by = "word") %>%
  group_by(query) %>%
  summarise(
    negative = sum(sentiment == "negative"),
    positive = sum(sentiment == "positive"),
    .groups = "drop"
  ) %>%
  mutate(net_sentiment = positive - negative)

sentiment_bing %>%
  safe_kable(caption = "Bing Sentiment Count by Topic")
Bing Sentiment Count by Topic
query negative positive net_sentiment
DHS 11 10 -1
ICE 14 2 -12
Immigration Enforcement 11 8 -3
Trump 8 15 7
## Faceted bar chart of the ten most frequent headline words in each Bing
## sentiment class. reorder_within()/scale_y_reordered() sort the bars
## independently inside each facet.
ggplot(
  data = news_tokens %>%
    inner_join(bing, by = "word") %>%
    count(word, sentiment, sort = TRUE) %>%
    group_by(sentiment) %>%
    slice_head(n = 10) %>%
    ungroup() %>%
    mutate(word = reorder_within(word, n, sentiment)),
  mapping = aes(x = n, y = word, fill = sentiment)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(sentiment), scales = "free_y") +
  scale_y_reordered() +
  scale_fill_manual(values = c("negative" = "#e74c3c", "positive" = "#2ecc71")) +
  theme_minimal(base_size = 12) +
  labs(
    title   = "Top Positive & Negative Words in Headlines",
    x       = "Count",
    y       = NULL,
    caption = "Source: NewsAPI headlines and Bing sentiment lexicon"
  )

## NRC is disabled for the web-knit version because tidytext::get_sentiments("nrc")
## can trigger the same non-interactive textdata download/menu issue.
## If your instructor requires NRC, download/cache it interactively first,
## then remove eval=FALSE from this chunk.
## NOTE(review): nothing in this file itself disables the code below; the
## eval=FALSE guard presumably lives in the R Markdown chunk header. Confirm
## before sourcing this as a plain script.

nrc <- get_sentiments("nrc")

## Share of each NRC emotion among a topic's emotion-word matches. The
## "positive"/"negative" polarity labels are excluded so only emotions remain.
## The result is ungrouped so later steps do not silently inherit the
## per-topic grouping.
emotion_nrc <- news_tokens %>%
  inner_join(nrc, by = "word") %>%
  filter(!sentiment %in% c("positive", "negative")) %>%
  count(query, sentiment) %>%
  group_by(query) %>%
  mutate(prop = n / sum(n)) %>%
  ungroup()

## Dodged bar chart comparing emotion proportions across topics.
ggplot(emotion_nrc, aes(x = sentiment, y = prop, fill = query)) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = percent_format()) +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title   = "NRC Emotion Proportions by Topic",
    x       = "Emotion",
    y       = "Proportion of Emotional Words",
    fill    = "Topic",
    caption = "Source: NewsAPI headlines and NRC emotion lexicon"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    axis.text.x = element_text(angle = 30, hjust = 1),
    legend.position = "top"
  )
## TF-IDF with each topic treated as one document: keep the 12
## highest-scoring words per topic (ties broken arbitrarily).
tfidf_words <- news_tokens %>%
  count(query, word) %>%
  bind_tf_idf(term = word, document = query, n = n) %>%
  group_by(query) %>%
  slice_max(tf_idf, n = 12, with_ties = FALSE) %>%
  ungroup()

## One facet per topic, bars sorted by TF-IDF inside each facet.
ggplot(
  data = mutate(tfidf_words, word = reorder_within(word, tf_idf, query)),
  mapping = aes(x = tf_idf, y = word, fill = query)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(query), scales = "free_y", ncol = 2) +
  scale_y_reordered() +
  scale_fill_brewer(palette = "Set1") +
  theme_minimal(base_size = 12) +
  labs(
    title    = "Top 12 TF-IDF Terms by Topic",
    subtitle = "Words most distinctive to each NewsAPI search topic",
    x        = "TF-IDF Score",
    y        = NULL,
    caption  = "Source: NewsAPI headlines"
  )

## Combine the AFINN summary, the Bing counts, and the headline counts into
## one table per topic. select() renames and orders the columns in one step.
summary_tbl <- sentiment_afinn %>%
  left_join(
    select(sentiment_bing, query, positive, negative, net_sentiment),
    by = "query"
  ) %>%
  left_join(headline_counts, by = "query") %>%
  select(
    Topic                = query,
    `Unique Headlines`   = n,
    `Words Matched`      = total_matched_words,
    `Mean AFINN`         = mean_sentiment,
    `AFINN Sum`          = sum_sentiment,
    Positive             = positive,
    Negative             = negative,
    `Net Bing Sentiment` = net_sentiment
  )

## Render the table, colouring the Mean AFINN column (column 4) green for
## non-negative scores and red for negative ones.
column_spec(
  safe_kable(
    summary_tbl,
    caption = "Sentiment Summary: All Topics",
    font_size = 7
  ),
  4,
  color = ifelse(summary_tbl[["Mean AFINN"]] >= 0, "darkgreen", "red")
)
Sentiment Summary: All Topics
Topic Unique Headlines Words Matched Mean AFINN AFINN Sum Positive Negative Net Bing Sentiment
Trump 17 19 -0.263 -5 15 8 7
DHS 17 22 -0.364 -8 10 11 -1
Immigration Enforcement 16 29 -1.655 -48 8 11 -3
ICE 19 28 -1.786 -50 2 14 -12
## Submission files.
##   Required: the raw data scraped from NewsAPI.
##   Optional but useful: the cleaned headlines and the sentiment summary.

## Write each table to its CSV file, pairing data and file name by position.
walk2(
  list(news_raw, news_clean, summary_tbl),
  c(
    "newsapi_raw_data_public_policy_topics.csv",
    "newsapi_clean_headlines_public_policy_topics.csv",
    "newsapi_sentiment_summary_public_policy_topics.csv"
  ),
  write_csv
)

cat("Export complete. Files saved to the RStudio Cloud project folder:", sep = "")
## Export complete. Files saved to the RStudio Cloud project folder:
## Print the exported file names as a dash-prefixed list. Each string starts
## with "\n" so every name begins on its own line.
## The "## " lines that follow each call are the captured console output of
## that call, not instructions.
cat("\n- newsapi_raw_data_public_policy_topics.csv")
## 
## - newsapi_raw_data_public_policy_topics.csv
cat("\n- newsapi_clean_headlines_public_policy_topics.csv")
## 
## - newsapi_clean_headlines_public_policy_topics.csv
cat("\n- newsapi_sentiment_summary_public_policy_topics.csv")
## 
## - newsapi_sentiment_summary_public_policy_topics.csv
## Emit a two-sentence findings summary as markdown text.
## With an empty summary table a fallback paragraph is printed; otherwise the
## topics with the highest and lowest mean AFINN score are named.
if (nrow(summary_tbl) == 0) {
  cat("## Two-Sentence Summary of Findings\n\n")
  cat("The sentiment summary table did not contain enough matched sentiment terms to automatically generate a finding. The headline and TF-IDF outputs should still be reviewed manually to identify the clearest topic-level patterns.")
} else {
  # First row after sorting by mean AFINN, descending and ascending.
  most_positive <- head(arrange(summary_tbl, desc(`Mean AFINN`)), 1)
  most_negative <- head(arrange(summary_tbl, `Mean AFINN`), 1)

  cat("## Two-Sentence Summary of Findings\n\n")

  # Sentence 1: highest- and lowest-scoring topics.
  cat(paste0(
    "Among the four public-policy news topics analyzed from NewsAPI headlines, ",
    most_positive$Topic,
    " had the highest average AFINN sentiment score, while ",
    most_negative$Topic,
    " had the lowest average AFINN sentiment score. "
  ))

  # Sentence 2: fixed interpretation of the TF-IDF results.
  cat(
    "The TF-IDF results show that each topic was associated with distinct headline language, suggesting that the topics differed not only in sentiment but also in the specific news themes driving the coverage."
  )
}

Two-Sentence Summary of Findings

Among the four public-policy news topics analyzed from NewsAPI headlines, Trump had the highest average AFINN sentiment score, while ICE had the lowest average AFINN sentiment score. The TF-IDF results show that each topic was associated with distinct headline language, suggesting that the topics differed not only in sentiment but also in the specific news themes driving the coverage.