## Install only the packages that are not already available, so re-running
## or knitting the document does not re-download everything each time.
required_packages <- c(
  "tidyverse", "tidytext", "jsonlite",
  "wordcloud", "RColorBrewer", "lubridate",
  "scales", "knitr", "kableExtra", "readr"
)
is_installed <- vapply(
  required_packages,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
missing_packages <- required_packages[!is_installed]
if (length(missing_packages) > 0) {
  ## Non-interactive sessions (e.g. knitting) may have no CRAN mirror
  ## configured; fall back to the cloud mirror only in that case.
  package_repos <- getOption("repos")
  if (is.null(package_repos) ||
      identical(unname(package_repos["CRAN"]), "@CRAN@")) {
    package_repos <- c(CRAN = "https://cloud.r-project.org")
  }
  install.packages(missing_packages, repos = package_repos)
}
library(tidyverse)
library(tidytext)
library(jsonlite)
library(wordcloud)
library(RColorBrewer)
library(lubridate)
library(scales)
library(knitr)
library(kableExtra)
library(readr)
## Render a data frame as a compact styled table.
##   data      : table to render (first argument, so it can be piped in).
##   caption   : optional table caption passed to kable().
##   digits    : number of digits passed to kable().
##   font_size : font size passed to kable_styling().
## booktabs, the striped/hover/condensed bootstrap options, and the
## scale_down/hold_position LaTeX options are always applied.
safe_kable <- function(data, caption = NULL, digits = 3, font_size = 8) {
  base_table <- kable(
    data,
    caption = caption,
    booktabs = TRUE,
    digits = digits
  )

  kable_styling(
    base_table,
    bootstrap_options = c("striped", "hover", "condensed"),
    latex_options = c("scale_down", "hold_position"),
    full_width = FALSE,
    font_size = font_size
  )
}
## Coerce `x` to character and truncate it on the right-hand side to at
## most `width` characters (default 70) using stringr::str_trunc().
short_text <- function(x, width = 70) {
  text <- as.character(x)
  stringr::str_trunc(text, width = width, side = "right")
}
## Read the NewsAPI key from the NEWSAPI_KEY environment variable instead of
## hard-coding it in this file, so the secret never ends up in the knitted
## document, a shared project, or version control.
## Set it once, e.g. add the line
##   NEWSAPI_KEY=your_key_here
## to ~/.Renviron and restart R, or run Sys.setenv(NEWSAPI_KEY = "...") in
## the console before knitting.
## NOTE(review): the key that was previously embedded here has been exposed
## and should be revoked/regenerated in the NewsAPI account settings.
api_key <- Sys.getenv("NEWSAPI_KEY", unset = "")
if (api_key %in% c("", "PASTE_YOUR_NEWSAPI_KEY_HERE")) {
  stop(
    "Please set the NEWSAPI_KEY environment variable to your NewsAPI key before knitting.",
    call. = FALSE
  )
}
## Topic set for the in-class NewsAPI webcrawl exercise.
##   query_label : clean display name used in tables/plots.
##   api_query   : actual NewsAPI search query.
topics <- tibble::tibble(
  query_label = c("Trump", "DHS", "ICE", "Immigration Enforcement"),
  api_query = c(
    "Trump",
    "\"Department of Homeland Security\" OR DHS",
    "\"Immigration and Customs Enforcement\" OR (ICE AND immigration)",
    "\"immigration enforcement\" OR \"border enforcement\""
  )
)

## Show the topic table, shortening long queries to 60 characters.
safe_kable(
  mutate(topics, api_query = short_text(api_query, 60)),
  caption = "NewsAPI Search Topics"
)
NewsAPI Search Topics
|
query_label
|
api_query
|
|
Trump
|
Trump
|
|
DHS
|
“Department of Homeland Security” OR DHS
|
|
ICE
|
“Immigration and Customs Enforcement” OR (ICE AND immigra…
|
|
Immigration Enforcement
|
“immigration enforcement” OR “border enforcement”
|
## Fetch the most recent English-language articles for one topic from the
## NewsAPI "everything" endpoint.
##   query_label : display name stored in the `query` column of the result.
##   api_query   : search expression sent to NewsAPI (URL-encoded here).
##   api_key     : NewsAPI key appended to the request URL.
##   page_size   : number of articles requested (default 20).
## Returns a tibble of articles with dots in column names replaced by
## underscores plus `query` and `api_query` columns. On any failure
## (network/HTTP error, non-"ok" status, no articles) it raises a single
## warning and returns an empty tibble so one bad topic does not abort
## the whole crawl.
fetch_news <- function(query_label, api_query, api_key, page_size = 20) {
  url <- paste0(
    "https://newsapi.org/v2/everything?",
    "q=", URLencode(api_query, reserved = TRUE),
    "&language=en",
    "&sortBy=publishedAt",
    ## as.integer() keeps large values out of scientific notation.
    "&pageSize=", as.integer(page_size),
    "&apiKey=", api_key
  )

  ## fromJSON() throws on HTTP/network errors, so convert those into a
  ## warning instead of letting them stop every remaining topic.
  response <- tryCatch(
    fromJSON(url, flatten = TRUE),
    error = function(e) {
      detail <- conditionMessage(e)
      ## The error text may echo the request URL; never leak the key.
      if (nzchar(api_key)) {
        detail <- gsub(api_key, "<redacted>", detail, fixed = TRUE)
      }
      warning(
        "NewsAPI request failed for topic: ", query_label,
        ". Message: ", detail,
        call. = FALSE
      )
      NULL
    }
  )
  if (is.null(response)) {
    return(tibble())
  }

  if (!is.null(response$status) && response$status != "ok") {
    warning(
      "NewsAPI request failed for topic: ", query_label,
      ". Message: ", response$message,
      call. = FALSE
    )
    return(tibble())
  }

  articles <- response$articles
  if (is.null(articles) || length(articles) == 0 || NROW(articles) == 0) {
    warning("No articles returned for topic: ", query_label, call. = FALSE)
    return(tibble())
  }

  as_tibble(articles) %>%
    ## flatten = TRUE yields names such as "source.name"; use underscores.
    rename_with(~ str_replace_all(.x, "\\.", "_")) %>%
    mutate(
      query = query_label,
      api_query = api_query
    )
}
## Fetch articles for every row of `topics` and stack them into one tibble.
## pmap_dfr() matches the `query_label` / `api_query` columns to the
## fetch_news() arguments by name; the key and page size are constant.
news_raw <- pmap_dfr(
  topics,
  fetch_news,
  api_key = api_key,
  page_size = 20
)

## Abort early when no topic returned any article.
if (nrow(news_raw) == 0) {
  stop("No news articles were returned. Check your API key, NewsAPI limit, or internet connection.")
}

glimpse(news_raw)
## Rows: 69
## Columns: 11
## $ author <chr> "ANI", NA, NA, "RTÉ News", "RTÉ News", "X.com", "Sameer Su…
## $ title <chr> "No nuclear weapon, conditional relief & open Hormuz: Vanc…
## $ description <chr> "US Vice President JD Vance described that the peace agree…
## $ url <chr> "https://economictimes.indiatimes.com/news/defence/no-nucl…
## $ urlToImage <chr> "https://img.etimg.com/thumb/msid-131786269,width-1200,hei…
## $ publishedAt <chr> "2026-06-17T03:46:27Z", "2026-06-17T03:45:37Z", "2026-06-1…
## $ content <chr> "US Vice President JD Vance described that the peace agree…
## $ source_id <chr> "the-times-of-india", NA, NA, "rte", "rte", NA, NA, NA, "t…
## $ source_name <chr> "The Times of India", "Fark.com", "CNA", "RTE", "RTE", "Fr…
## $ query <chr> "Trump", "Trump", "Trump", "Trump", "Trump", "Trump", "Tru…
## $ api_query <chr> "Trump", "Trump", "Trump", "Trump", "Trump", "Trump", "Tru…
## Required group submission item:
## export the raw data pulled from NewsAPI to a CSV file.
write_csv(
  news_raw,
  "newsapi_raw_data_public_policy_topics.csv"
)
cat(
  "Raw NewsAPI data exported to:",
  "newsapi_raw_data_public_policy_topics.csv"
)
## Raw NewsAPI data exported to: newsapi_raw_data_public_policy_topics.csv
## Clean the raw articles: drop rows without a title, parse the publication
## timestamp, fill in missing source names, normalise the headline text, and
## keep one row per (topic, cleaned headline).
news_clean <- news_raw %>%
  filter(!is.na(.data$title)) %>%
  mutate(
    pub_date = ymd_hms(.data$publishedAt, quiet = TRUE),
    pub_day = as.Date(pub_date),
    source_clean = if_else(
      !is.na(.data$source_name),
      as.character(.data$source_name),
      "Unknown Source"
    ),
    ## Strip a trailing " - Source Name" suffix. Whitespace is required on
    ## both sides of the dash so that hyphenated words ("rate-hike",
    ## "Trump-endorsed") no longer cause the rest of the headline to be cut.
    title_clean = str_remove(.data$title, "\\s+-\\s+[^-]+$"),
    ## Replace punctuation with spaces, collapse whitespace, lower-case.
    title_clean = str_squish(
      str_replace_all(title_clean, "[^[:alnum:][:space:]]", " ")
    ),
    title_clean = str_to_lower(title_clean)
  ) %>%
  distinct(query, title_clean, .keep_all = TRUE)

cat("Total unique headlines:", nrow(news_clean), "\n")
## Total unique headlines: 69
## Preview the first 12 cleaned headlines with shortened title and source.
news_clean %>%
  head(12) %>%
  mutate(
    title_display = short_text(title, 75),
    source_display = short_text(source_clean, 35)
  ) %>%
  select(query, title_display, source_display, pub_day) %>%
  safe_kable(caption = "Sample Cleaned Headlines", font_size = 7)
Sample Cleaned Headlines
|
query
|
title_display
|
source_display
|
pub_day
|
|
Trump
|
No nuclear weapon, conditional relief & open Hormuz: Vance outlines
3 pi…
|
The Times of India
|
2026-06-17
|
|
Trump
|
A little perspective from Mr. Global about how Stinky dun goofed so
badl…
|
Fark.com
|
2026-06-17
|
|
Trump
|
Vietnam maintains 2026 GDP target despite trade deficit, inflation
pressure
|
CNA
|
2026-06-17
|
|
Trump
|
Macron to close G7 with AI discussions, Trump dinner
|
RTE
|
2026-06-17
|
|
Trump
|
Macron to close G7 summit with AI discussions
|
RTE
|
2026-06-17
|
|
Trump
|
JD Vance Just EVISCERATED The Hosts of The View over their false claims
…
|
Freerepublic.com
|
2026-06-17
|
|
Trump
|
Meghan McCain trashes former co-stars on The View over ‘undisciplined’
J…
|
Dailymail.com
|
2026-06-17
|
|
Trump
|
The FDA Just Put Psilocybin and an MDMA-Like Drug on a 1-to-2 Month
Appr…
|
Medical Daily
|
2026-06-17
|
|
Trump
|
Decision-Day Guide: Warsh faces first big test as Fed Chair
|
The Times of India
|
2026-06-17
|
|
Trump
|
Kevin Warsh prepares for first Fed meeting as inflation hits 4%
|
Crypto Briefing
|
2026-06-17
|
|
Trump
|
Gold extends gains as rate-hike bets ease ahead of Fed verdict
|
BusinessLine
|
2026-06-17
|
|
Trump
|
Alabama US Senate: Voters decide tonight between Trump-endorsed Barry
Mo…
|
Slashdot.org
|
2026-06-17
|
## Number of unique headlines kept per topic, largest first.
headline_counts <- news_clean %>%
  count(query) %>%
  arrange(desc(n))

safe_kable(headline_counts, caption = "Unique Headline Count by Topic")
Unique Headline Count by Topic
|
query
|
n
|
|
ICE
|
19
|
|
DHS
|
17
|
|
Trump
|
17
|
|
Immigration Enforcement
|
16
|
## One row per headline word, without stop words, pure numbers, or words
## of two characters or fewer.
news_tokens <- news_clean %>%
  select(query, title_clean) %>%
  unnest_tokens(word, title_clean) %>%
  anti_join(stop_words, by = "word") %>%
  filter(
    nchar(word) > 2,
    !str_detect(word, "^\\d+$")
  )

## The 20 most frequent words across all topics.
top_words <- news_tokens %>%
  count(word, sort = TRUE) %>%
  head(20)

safe_kable(top_words, caption = "Top 20 Words Across All Headlines")
Top 20 Words Across All Headlines
|
word
|
n
|
|
trump
|
17
|
|
antifa
|
10
|
|
minnesota
|
10
|
|
federal
|
9
|
|
anti
|
7
|
|
ice
|
6
|
|
charged
|
5
|
|
feds
|
5
|
|
conspiracy
|
4
|
|
immigration
|
4
|
|
iran
|
4
|
|
senate
|
4
|
|
tied
|
4
|
|
alabama
|
3
|
|
block
|
3
|
|
charges
|
3
|
|
court
|
3
|
|
judge
|
3
|
|
operations
|
3
|
|
people
|
3
|
## Horizontal bar chart of the 20 most frequent headline words,
## ordered by count and shaded by count.
top_words %>%
  mutate(word = fct_reorder(word, n)) %>%
  ggplot(aes(x = n, y = word, fill = n)) +
  geom_col(show.legend = FALSE) +
  scale_fill_gradient(low = "#a8d8ea", high = "#0077b6") +
  theme_minimal(base_size = 13) +
  labs(
    title = "Top 20 Words in News Headlines",
    subtitle = "Trump, DHS, ICE, and Immigration Enforcement",
    caption = "Source: NewsAPI",
    x = "Count",
    y = NULL
  )

## Word frequencies for the cloud: keep words seen at least twice.
word_freq <- news_tokens %>%
  count(word, sort = TRUE) %>%
  filter(n >= 2)

## Fixed seed so the cloud layout is the same on every knit.
set.seed(42)
with(
  word_freq,
  wordcloud(
    words = word,
    freq = n,
    min.freq = 1,
    max.words = 80,
    random.order = FALSE,
    colors = brewer.pal(8, "Dark2"),
    scale = c(3.5, 0.5)
  )
)
title("News Headline Word Cloud — Public Policy Topics")

## Web-safe AFINN import: read the tab-separated lexicon straight from
## GitHub rather than calling tidytext::get_sentiments("afinn"), which can
## trigger textdata::menu() and break non-interactive R Markdown knitting.
afinn <- read_delim(
  "https://raw.githubusercontent.com/fnielsen/afinn/master/afinn/data/AFINN-en-165.txt",
  delim = "\t",
  col_names = c("word", "value"),
  show_col_types = FALSE
)

## Per-topic AFINN summary: number of matched words, mean score (rounded
## to 3 decimals) and total score, sorted from highest to lowest mean.
sentiment_afinn <- news_tokens %>%
  inner_join(afinn, by = "word") %>%
  group_by(query) %>%
  summarise(
    total_matched_words = n(),
    mean_sentiment = round(mean(value), 3),
    sum_sentiment = sum(value),
    .groups = "drop"
  ) %>%
  arrange(desc(mean_sentiment))

safe_kable(sentiment_afinn, caption = "AFINN Sentiment Score by Topic")
AFINN Sentiment Score by Topic
|
query
|
total_matched_words
|
mean_sentiment
|
sum_sentiment
|
|
Trump
|
19
|
-0.263
|
-5
|
|
DHS
|
22
|
-0.364
|
-8
|
|
Immigration Enforcement
|
29
|
-1.655
|
-48
|
|
ICE
|
28
|
-1.786
|
-50
|
## Bar chart of mean AFINN score per topic; bars are coloured by the sign
## of the mean and a dashed line marks zero.
sentiment_afinn %>%
  mutate(
    sentiment_dir = if_else(mean_sentiment >= 0, "Positive", "Negative"),
    query = fct_reorder(query, mean_sentiment)
  ) %>%
  ggplot(aes(x = mean_sentiment, y = query, fill = sentiment_dir)) +
  geom_col(width = 0.6) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray40") +
  scale_fill_manual(
    values = c("Positive" = "#2ecc71", "Negative" = "#e74c3c")
  ) +
  labs(
    title = "Mean AFINN Sentiment Score by Topic",
    subtitle = "Higher scores indicate more positive headline language",
    caption = "Source: NewsAPI headlines and AFINN sentiment lexicon",
    x = "Mean Sentiment Score",
    y = NULL,
    fill = NULL
  ) +
  ## theme() must follow theme_minimal() so the legend position sticks.
  theme_minimal(base_size = 13) +
  theme(legend.position = "top")

## Bing sentiment is bundled with tidytext and should knit normally.
bing <- get_sentiments("bing")

## Count negative and positive Bing words per topic and their difference.
## The counts are computed directly with summarise() so both columns always
## exist, even when no headline word matched one of the two classes (the
## previous count() + pivot_wider() version errored on the missing column).
sentiment_bing <- news_tokens %>%
  inner_join(bing, by = "word") %>%
  group_by(query) %>%
  summarise(
    negative = sum(sentiment == "negative"),
    positive = sum(sentiment == "positive"),
    .groups = "drop"
  ) %>%
  mutate(net_sentiment = positive - negative)

sentiment_bing %>%
  safe_kable(caption = "Bing Sentiment Count by Topic")
Bing Sentiment Count by Topic
|
query
|
negative
|
positive
|
net_sentiment
|
|
DHS
|
11
|
10
|
-1
|
|
ICE
|
14
|
2
|
-12
|
|
Immigration Enforcement
|
11
|
8
|
-3
|
|
Trump
|
8
|
15
|
7
|
## Ten most frequent Bing-matched words per sentiment class, shown as
## side-by-side bar charts with independently ordered y axes.
news_tokens %>%
  inner_join(bing, by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%
  group_by(sentiment) %>%
  slice_head(n = 10) %>%
  ungroup() %>%
  ## reorder_within() + scale_y_reordered() order words inside each facet.
  mutate(word = reorder_within(word, n, sentiment)) %>%
  ggplot(aes(x = n, y = word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ sentiment, scales = "free_y") +
  scale_fill_manual(
    values = c("positive" = "#2ecc71", "negative" = "#e74c3c")
  ) +
  scale_y_reordered() +
  theme_minimal(base_size = 12) +
  labs(
    title = "Top Positive & Negative Words in Headlines",
    caption = "Source: NewsAPI headlines and Bing sentiment lexicon",
    x = "Count",
    y = NULL
  )

## NRC is disabled for the web-knit version because tidytext::get_sentiments("nrc")
## can trigger the same non-interactive textdata download/menu issue.
## If your instructor requires NRC, download/cache it interactively first,
## then remove eval=FALSE from this chunk.
nrc <- get_sentiments("nrc")

## Share of each NRC emotion among a topic's emotion-tagged words
## (the generic "positive"/"negative" categories are excluded).
emotion_nrc <- news_tokens %>%
  inner_join(nrc, by = "word") %>%
  filter(!sentiment %in% c("positive", "negative")) %>%
  count(query, sentiment) %>%
  group_by(query) %>%
  mutate(prop = n / sum(n)) %>%
  ## Drop the grouping so later steps do not silently operate per topic.
  ungroup()

ggplot(emotion_nrc, aes(x = sentiment, y = prop, fill = query)) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = percent_format()) +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "NRC Emotion Proportions by Topic",
    x = "Emotion",
    y = "Proportion of Emotional Words",
    fill = "Topic",
    caption = "Source: NewsAPI headlines and NRC emotion lexicon"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    axis.text.x = element_text(angle = 30, hjust = 1),
    legend.position = "top"
  )
## TF-IDF with each topic treated as one document; keep the 12
## highest-scoring words per topic (ties broken arbitrarily).
tfidf_words <- news_tokens %>%
  count(query, word) %>%
  bind_tf_idf(word, query, n) %>%
  group_by(query) %>%
  slice_max(tf_idf, n = 12, with_ties = FALSE) %>%
  ungroup()

## One facet per topic, words ordered by TF-IDF inside each facet.
tfidf_words %>%
  mutate(word = reorder_within(word, tf_idf, query)) %>%
  ggplot(aes(x = tf_idf, y = word, fill = query)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ query, scales = "free_y", ncol = 2) +
  scale_fill_brewer(palette = "Set1") +
  scale_y_reordered() +
  theme_minimal(base_size = 12) +
  labs(
    title = "Top 12 TF-IDF Terms by Topic",
    subtitle = "Words most distinctive to each NewsAPI search topic",
    caption = "Source: NewsAPI headlines",
    x = "TF-IDF Score",
    y = NULL
  )

## Combine the AFINN summary, Bing counts, and headline counts into one
## presentation table; select() both renames and orders the columns.
summary_tbl <- sentiment_afinn %>%
  left_join(
    select(sentiment_bing, query, positive, negative, net_sentiment),
    by = "query"
  ) %>%
  left_join(headline_counts, by = "query") %>%
  select(
    Topic = query,
    `Unique Headlines` = n,
    `Words Matched` = total_matched_words,
    `Mean AFINN` = mean_sentiment,
    `AFINN Sum` = sum_sentiment,
    Positive = positive,
    Negative = negative,
    `Net Bing Sentiment` = net_sentiment
  )

## Colour the Mean AFINN column (4th) green when >= 0, red otherwise.
column_spec(
  safe_kable(
    summary_tbl,
    caption = "Sentiment Summary: All Topics",
    font_size = 7
  ),
  4,
  color = ifelse(summary_tbl$`Mean AFINN` >= 0, "darkgreen", "red")
)
Sentiment Summary: All Topics
|
Topic
|
Unique Headlines
|
Words Matched
|
Mean AFINN
|
AFINN Sum
|
Positive
|
Negative
|
Net Bing Sentiment
|
|
Trump
|
17
|
19
|
-0.263
|
-5
|
15
|
8
|
7
|
|
DHS
|
17
|
22
|
-0.364
|
-8
|
10
|
11
|
-1
|
|
Immigration Enforcement
|
16
|
29
|
-1.655
|
-48
|
8
|
11
|
-3
|
|
ICE
|
19
|
28
|
-1.786
|
-50
|
2
|
14
|
-12
|
## Submission files.
## Required: raw data scraped from NewsAPI.
## Optional but useful: cleaned headlines and sentiment summary.
## Each table is written to the CSV file at the same position.
walk2(
  list(news_raw, news_clean, summary_tbl),
  c(
    "newsapi_raw_data_public_policy_topics.csv",
    "newsapi_clean_headlines_public_policy_topics.csv",
    "newsapi_sentiment_summary_public_policy_topics.csv"
  ),
  write_csv
)
cat("Export complete. Files saved to the RStudio Cloud project folder:")
## Export complete. Files saved to the RStudio Cloud project folder:
## Print the raw-data export file name as a bullet on its own line.
cat("\n- newsapi_raw_data_public_policy_topics.csv")
##
## - newsapi_raw_data_public_policy_topics.csv
## Print the cleaned-headlines export file name as a bullet on its own line.
cat("\n- newsapi_clean_headlines_public_policy_topics.csv")
##
## - newsapi_clean_headlines_public_policy_topics.csv
## Print the sentiment-summary export file name as a bullet on its own line.
cat("\n- newsapi_sentiment_summary_public_policy_topics.csv")
##
## - newsapi_sentiment_summary_public_policy_topics.csv
## Emit the findings section as markdown text: a fallback note when the
## summary table is empty, otherwise two sentences naming the topics with
## the highest and lowest mean AFINN score.
if (nrow(summary_tbl) == 0) {
  cat("## Two-Sentence Summary of Findings\n\n")
  cat("The sentiment summary table did not contain enough matched sentiment terms to automatically generate a finding. The headline and TF-IDF outputs should still be reviewed manually to identify the clearest topic-level patterns.")
} else {
  ## First row after sorting descending / ascending by Mean AFINN.
  most_positive <- slice(arrange(summary_tbl, desc(`Mean AFINN`)), 1)
  most_negative <- slice(arrange(summary_tbl, `Mean AFINN`), 1)

  cat("## Two-Sentence Summary of Findings\n\n")
  cat(
    "Among the four public-policy news topics analyzed from NewsAPI headlines, ",
    most_positive$Topic,
    " had the highest average AFINN sentiment score, while ",
    most_negative$Topic,
    " had the lowest average AFINN sentiment score. ",
    sep = ""
  )
  cat(
    "The TF-IDF results show that each topic was associated with distinct headline language, suggesting that the topics differed not only in sentiment but also in the specific news themes driving the coverage."
  )
}
Two-Sentence Summary of Findings
Among the four public-policy news topics analyzed from NewsAPI
headlines, Trump had the highest average AFINN sentiment score, while
ICE had the lowest average AFINN sentiment score. The TF-IDF results
show that each topic was associated with distinct headline language,
suggesting that the topics differed not only in sentiment but also in
the specific news themes driving the coverage.