This notebook uses Quanteda, topicmodels, and lexicoder sentiment tools to analyze narrative patterns and thematic clusters across articles, and to compare them to the MAHA report. We assess document similarity, cluster themes, and explore sentiment and publication timing.
library(tidyverse)
library(lubridate)
library(quanteda)
library(topicmodels)
library(stopwords)
library(ggplot2)
library(stringr)
library(zoo)
articles <- read_csv("project-articles.csv", show_col_types = FALSE) %>%
filter(!is.na(published_date), published_date != "") %>%
mutate(
pub_date = parse_date_time(published_date, orders = c("ymd", "dmy", "mdy"), quiet = TRUE),
text = str_squish(str_to_lower(paste(title, author_name)))
) %>%
filter(!is.na(pub_date), text != "")
# Add MAHA report
maha_text <- read_file("MAHA-Report-text.txt") %>%
str_to_lower() %>% str_replace_all("[^[:alnum:] ]", " ") %>% str_squish()
combined <- c(articles$text, maha_text)
corp <- corpus(combined)
toks <- tokens(corp, remove_punct = TRUE, remove_numbers = TRUE) %>%
tokens_tolower() %>%
tokens_remove(stopwords("en"))
dfm_all <- dfm(toks) %>%
dfm_trim(min_termfreq = 5, termfreq_type = "count")
dfm_articles <- dfm_all[1:nrow(articles), ]
dfm_maha <- dfm_all[nrow(articles) + 1, ]
sim <- textstat_simil(dfm_articles, dfm_maha, method = "cosine")
articles$similarity_to_maha <- as.numeric(sim)
dtm <- convert(dfm_articles, to = "topicmodels")
row_has_tokens <- rowSums(as.matrix(dtm)) > 0
lda_model <- LDA(dtm[row_has_tokens, ], k = 13, method = "Gibbs", control = list(seed = 42))
topics_assigned <- rep(NA_integer_, nrow(articles))
topics_assigned[row_has_tokens] <- topics(lda_model)
articles$topic <- topics_assigned
top_terms <- terms(lda_model, 10) # top 10 words per topic
print(top_terms)
Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 Topic 6 Topic 7 Topic 8
[1,] "b" "b" "hunter" "big" "karenr" "vaccine" "health" "christina"
[2,] "lisa" "o'brien" "aerowenn" "jr" "new" "pfizer" "defense" "gursslin"
[3,] "p" "jennifer" "j" "$" "u.s" "fda" "children's" "food"
[4,] "medical" "week" "blood" "rfk" "pandemic" "safety" "public" "u.s"
[5,] "misinformation" "mary" "immunity" "fauci" "bill" "moderna" "team" "toxic"
[6,] "watch" "polly" "climate" "million" "gates" "joyce" "childrenb" "chemicals"
[7,] "doctors" "dangerous" "millions" "pharma" "say" "ghen" "now" "cancer"
[8,] "dr" "smart" "natural" "karenr" "global" "covid" "science" "epa"
[9,] "board" "just" "johnson" "tells" "world" "shot" "5g" "water"
[10,] "general" "calls" "lower" "billion" "flu" "exclusive" "freedom" "used"
Topic 9 Topic 10 Topic 11 Topic 12 Topic 13
[1,] "covid" "covid" "na" "court" "+"
[2,] "vaccine" "vaccines" "study" "mandate" "claudia"
[3,] "says" "cdc" "children" "new" "wheeler"
[4,] "kids" "data" "risk" "chd" "ai"
[5,] "get" "shots" "autism" "lawsuit" "google"
[6,] "mandates" "kids" "may" "media" "face"
[7,] "stefanie" "karenr" "shows" "biden" "surveillance"
[8,] "spear" "deaths" "vaccination" "censorship" "omicron"
[9,] "vaccinated" "people" "disease" "federal" "privacy"
[10,] "parents" "injuries" "covid-19" "case" "facial"
beta <- posterior(lda_model)$terms
hc <- hclust(dist(beta), method = "ward.D2")
topic_cluster <- cutree(hc, k = 4)
articles$cluster_group <- map_int(articles$topic, ~ topic_cluster[.x])
labels <- c(
"1" = "Child Health Emphasis",
"2" = "Pharma Critique & Safety",
"3" = "Freedom & Censorship",
"4" = "Institutional Distrust"
)
articles$cluster_label <- recode(as.character(articles$cluster_group), !!!labels)
toks_sent <- tokens(articles$text, remove_punct = TRUE, remove_numbers = TRUE) %>%
tokens_tolower() %>%
tokens_remove(stopwords("en"))
dfm_sent <- dfm(toks_sent)
data("data_dictionary_LSD2015")
dfm_dict <- dfm_lookup(dfm_sent, dictionary = data_dictionary_LSD2015)
# Compute sentiment scores from lexicon dictionary
# Assumes dfm_sent and dfm_dict have already been created
articles$sentiment_pos <- as.numeric(dfm_dict[, "positive"])
articles$sentiment_neg <- as.numeric(dfm_dict[, "negative"])
articles$sentiment <- articles$sentiment_pos - articles$sentiment_neg
articles %>%
group_by(cluster_label) %>%
summarise(
count = n(),
avg_sim = mean(similarity_to_maha, na.rm = TRUE),
avg_sentiment = mean(sentiment, na.rm = TRUE),
top_authors = paste(names(sort(table(author_name), decreasing = TRUE))[1:3], collapse = ", ")
) %>%
arrange(desc(avg_sim)) %>%
knitr::kable()
| cluster_label | count | avg_sim | avg_sentiment | top_authors |
|---|---|---|---|---|
| Child Health Emphasis | 1580 | 0.0657495 | -0.5348101 | Lisa P, Karenr, Aerowenn Hunter |
| Freedom & Censorship | 4774 | 0.0495052 | -0.6101801 | Karenr, Aerowenn Hunter, Christina Gursslin |
| Pharma Critique & Safety | 928 | 0.0371687 | -0.4784483 | Aerowenn Hunter, Karenr, Claudia Wheeler |
| Institutional Distrust | 831 | 0.0278566 | -0.4512635 | Claudia Wheeler, Karenr, Aerowenn Hunter |
ggplot(articles, aes(x = factor(topic), y = similarity_to_maha, fill = cluster_label)) +
geom_boxplot() +
labs(title = "Similarity to MAHA by Topic", x = "Topic", y = "Cosine Similarity") +
theme_minimal()
ggplot(articles, aes(x = factor(topic), y = sentiment, fill = cluster_label)) +
geom_boxplot() +
labs(title = "Sentiment by Topic", x = "Topic", y = "Lexicoder Sentiment") +
theme_minimal()