This notebook uses Quanteda, topicmodels, and lexicoder sentiment tools to analyze narrative patterns and thematic clusters across articles, and to compare them to the MAHA report. We assess document similarity, cluster themes, and explore sentiment and publication timing.

library(tidyverse)
library(lubridate)
library(quanteda)
library(topicmodels)
library(stopwords)
library(ggplot2)
library(stringr)
library(zoo)

1. Load and Preprocess

articles <- read_csv("project-articles.csv", show_col_types = FALSE) %>%
  filter(!is.na(published_date), published_date != "") %>%
  mutate(
    pub_date = parse_date_time(published_date, orders = c("ymd", "dmy", "mdy"), quiet = TRUE),
    text = str_squish(str_to_lower(paste(title, author_name)))
  ) %>%
  filter(!is.na(pub_date), text != "")

2. Tokenize and Create Corpus + DFM

# Add MAHA report
maha_text <- read_file("MAHA-Report-text.txt") %>%
  str_to_lower() %>% str_replace_all("[^[:alnum:] ]", " ") %>% str_squish()
combined <- c(articles$text, maha_text)

corp <- corpus(combined)
toks <- tokens(corp, remove_punct = TRUE, remove_numbers = TRUE) %>%
  tokens_tolower() %>%
  tokens_remove(stopwords("en"))

dfm_all <- dfm(toks) %>%
  dfm_trim(min_termfreq = 5, termfreq_type = "count")

dfm_articles <- dfm_all[1:nrow(articles), ]
dfm_maha <- dfm_all[nrow(articles) + 1, ]

3. Compute Cosine Similarity to MAHA

sim <- textstat_simil(dfm_articles, dfm_maha, method = "cosine")
articles$similarity_to_maha <- as.numeric(sim)

4. Topic Modeling with LDA

dtm <- convert(dfm_articles, to = "topicmodels")
row_has_tokens <- rowSums(as.matrix(dtm)) > 0
lda_model <- LDA(dtm[row_has_tokens, ], k = 13, method = "Gibbs", control = list(seed = 42))
topics_assigned <- rep(NA_integer_, nrow(articles))
topics_assigned[row_has_tokens] <- topics(lda_model)
articles$topic <- topics_assigned
top_terms <- terms(lda_model, 10)  # top 10 words per topic
print(top_terms)
      Topic 1          Topic 2     Topic 3    Topic 4   Topic 5    Topic 6     Topic 7      Topic 8    
 [1,] "b"              "b"         "hunter"   "big"     "karenr"   "vaccine"   "health"     "christina"
 [2,] "lisa"           "o'brien"   "aerowenn" "jr"      "new"      "pfizer"    "defense"    "gursslin" 
 [3,] "p"              "jennifer"  "j"        "$"       "u.s"      "fda"       "children's" "food"     
 [4,] "medical"        "week"      "blood"    "rfk"     "pandemic" "safety"    "public"     "u.s"      
 [5,] "misinformation" "mary"      "immunity" "fauci"   "bill"     "moderna"   "team"       "toxic"    
 [6,] "watch"          "polly"     "climate"  "million" "gates"    "joyce"     "childrenb"  "chemicals"
 [7,] "doctors"        "dangerous" "millions" "pharma"  "say"      "ghen"      "now"        "cancer"   
 [8,] "dr"             "smart"     "natural"  "karenr"  "global"   "covid"     "science"    "epa"      
 [9,] "board"          "just"      "johnson"  "tells"   "world"    "shot"      "5g"         "water"    
[10,] "general"        "calls"     "lower"    "billion" "flu"      "exclusive" "freedom"    "used"     
      Topic 9      Topic 10   Topic 11      Topic 12     Topic 13      
 [1,] "covid"      "covid"    "na"          "court"      "+"           
 [2,] "vaccine"    "vaccines" "study"       "mandate"    "claudia"     
 [3,] "says"       "cdc"      "children"    "new"        "wheeler"     
 [4,] "kids"       "data"     "risk"        "chd"        "ai"          
 [5,] "get"        "shots"    "autism"      "lawsuit"    "google"      
 [6,] "mandates"   "kids"     "may"         "media"      "face"        
 [7,] "stefanie"   "karenr"   "shows"       "biden"      "surveillance"
 [8,] "spear"      "deaths"   "vaccination" "censorship" "omicron"     
 [9,] "vaccinated" "people"   "disease"     "federal"    "privacy"     
[10,] "parents"    "injuries" "covid-19"    "case"       "facial"      

5. Cluster Topics + Label Themes

beta <- posterior(lda_model)$terms
hc <- hclust(dist(beta), method = "ward.D2")
topic_cluster <- cutree(hc, k = 4)
articles$cluster_group <- map_int(articles$topic, ~ topic_cluster[.x])

labels <- c(
  "1" = "Child Health Emphasis",
  "2" = "Pharma Critique & Safety",
  "3" = "Freedom & Censorship",
  "4" = "Institutional Distrust"
)
articles$cluster_label <- recode(as.character(articles$cluster_group), !!!labels)

6. Sentiment Analysis

toks_sent <- tokens(articles$text, remove_punct = TRUE, remove_numbers = TRUE) %>%
  tokens_tolower() %>%
  tokens_remove(stopwords("en"))
dfm_sent <- dfm(toks_sent)

data("data_dictionary_LSD2015")
dfm_dict <- dfm_lookup(dfm_sent, dictionary = data_dictionary_LSD2015)
# Compute sentiment scores from lexicon dictionary
# Assumes dfm_sent and dfm_dict have already been created

articles$sentiment_pos <- as.numeric(dfm_dict[, "positive"])
articles$sentiment_neg <- as.numeric(dfm_dict[, "negative"])
articles$sentiment     <- articles$sentiment_pos - articles$sentiment_neg

7. Summary by Cluster

articles %>%
  group_by(cluster_label) %>%
  summarise(
    count = n(),
    avg_sim = mean(similarity_to_maha, na.rm = TRUE),
    avg_sentiment = mean(sentiment, na.rm = TRUE),
    top_authors = paste(names(sort(table(author_name), decreasing = TRUE))[1:3], collapse = ", ")
  ) %>%
  arrange(desc(avg_sim)) %>%
  knitr::kable()
cluster_label count avg_sim avg_sentiment top_authors
Child Health Emphasis 1580 0.0657495 -0.5348101 Lisa P, Karenr, Aerowenn Hunter
Freedom & Censorship 4774 0.0495052 -0.6101801 Karenr, Aerowenn Hunter, Christina Gursslin
Pharma Critique & Safety 928 0.0371687 -0.4784483 Aerowenn Hunter, Karenr, Claudia Wheeler
Institutional Distrust 831 0.0278566 -0.4512635 Claudia Wheeler, Karenr, Aerowenn Hunter

8. Boxplots: Sentiment and Similarity

ggplot(articles, aes(x = factor(topic), y = similarity_to_maha, fill = cluster_label)) +
  geom_boxplot() +
  labs(title = "Similarity to MAHA by Topic", x = "Topic", y = "Cosine Similarity") +
  theme_minimal()

ggplot(articles, aes(x = factor(topic), y = sentiment, fill = cluster_label)) +
  geom_boxplot() +
  labs(title = "Sentiment by Topic", x = "Topic", y = "Lexicoder Sentiment") +
  theme_minimal()

LS0tDQp0aXRsZTogIlNlbWFudGljIFNpbWlsYXJpdHkgd2l0aCBNQUhBIChRdWFudGVkYSBXb3JrZmxvdykiDQpvdXRwdXQ6DQogIGh0bWxfbm90ZWJvb2s6DQogICAgY29kZV9mb2xkaW5nOiBoaWRlDQotLS0NCg0KVGhpcyBub3RlYm9vayB1c2VzICoqUXVhbnRlZGEqKiwgKip0b3BpY21vZGVscyoqLCBhbmQgKipsZXhpY29kZXINCnNlbnRpbWVudCoqIHRvb2xzIHRvIGFuYWx5emUgbmFycmF0aXZlIHBhdHRlcm5zIGFuZCB0aGVtYXRpYyBjbHVzdGVycw0KYWNyb3NzIGFydGljbGVzLCBhbmQgdG8gY29tcGFyZSB0aGVtIHRvIHRoZSAqKk1BSEEgcmVwb3J0KiouIFdlIGFzc2Vzcw0KZG9jdW1lbnQgc2ltaWxhcml0eSwgY2x1c3RlciB0aGVtZXMsIGFuZCBleHBsb3JlIHNlbnRpbWVudCBhbmQNCnB1YmxpY2F0aW9uIHRpbWluZy4NCg0KYGBge3Igc2V0dXB9DQpsaWJyYXJ5KHRpZHl2ZXJzZSkNCmxpYnJhcnkobHVicmlkYXRlKQ0KbGlicmFyeShxdWFudGVkYSkNCmxpYnJhcnkodG9waWNtb2RlbHMpDQpsaWJyYXJ5KHN0b3B3b3JkcykNCmxpYnJhcnkoZ2dwbG90MikNCmxpYnJhcnkoc3RyaW5ncikNCmxpYnJhcnkoem9vKQ0KYGBgDQoNCiMjIyAxLiBMb2FkIGFuZCBQcmVwcm9jZXNzDQoNCmBgYHtyfQ0KYXJ0aWNsZXMgPC0gcmVhZF9jc3YoInByb2plY3QtYXJ0aWNsZXMuY3N2Iiwgc2hvd19jb2xfdHlwZXMgPSBGQUxTRSkgJT4lDQogIGZpbHRlcighaXMubmEocHVibGlzaGVkX2RhdGUpLCBwdWJsaXNoZWRfZGF0ZSAhPSAiIikgJT4lDQogIG11dGF0ZSgNCiAgICBwdWJfZGF0ZSA9IHBhcnNlX2RhdGVfdGltZShwdWJsaXNoZWRfZGF0ZSwgb3JkZXJzID0gYygieW1kIiwgImRteSIsICJtZHkiKSwgcXVpZXQgPSBUUlVFKSwNCiAgICB0ZXh0ID0gc3RyX3NxdWlzaChzdHJfdG9fbG93ZXIocGFzdGUodGl0bGUsIGF1dGhvcl9uYW1lKSkpDQogICkgJT4lDQogIGZpbHRlcighaXMubmEocHViX2RhdGUpLCB0ZXh0ICE9ICIiKQ0KYGBgDQoNCi0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLQ0KDQojIyMgMi4gVG9rZW5pemUgYW5kIENyZWF0ZSBDb3JwdXMgKyBERk0NCg0KYGBge3J9DQojIEFkZCBNQUhBIHJlcG9ydA0KbWFoYV90ZXh0IDwtIHJlYWRfZmlsZSgiTUFIQS1SZXBvcnQtdGV4dC50eHQiKSAlPiUNCiAgc3RyX3RvX2xvd2VyKCkgJT4lIHN0cl9yZXBsYWNlX2FsbCgiW15bOmFsbnVtOl0gXSIsICIgIikgJT4lIHN0cl9zcXVpc2goKQ0KY29tYmluZWQgPC0gYyhhcnRpY2xlcyR0ZXh0LCBtYWhhX3RleHQpDQoNCmNvcnAgPC0gY29ycHVzKGNvbWJpbmVkKQ0KdG9rcyA8LSB0b2tlbnMoY29ycCwgcmVtb3ZlX3B1bmN0ID0gVFJVRSwgcmVtb3ZlX251bWJlcnMgPSBUUlVFKSAlPiUNCiAgdG9rZW5zX3RvbG93ZXIoKSAlPiUNCiAgdG9rZW5zX3JlbW92ZShzdG9wd29yZHMoImVuIikpDQoNCmRmbV9hbGwgPC0gZGZtKHRva3MpICU+JQ0KICBkZm1fdHJpbShtaW5fdGVybWZyZXEgPSA1LCB0ZXJtZnJlcV90eXBlID0gImNvdW50IikNCg0KZGZtX2FydGljbGVzIDwtIGRmbV9hbGxbMTpucm93KGFydGljbGVzKSwgXQ0KZGZtX21haGEgPC0gZGZtX2FsbFtucm93KGFydGljbGVzKSArIDEsIF0NCmBgYA0KDQotLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0NCg0KIyMjIDMuIENvbXB1dGUgQ29zaW5lIFNpbWlsYXJpdHkgdG8gTUFIQQ0KDQpgYGB7cn0NCnNpbSA8LSB0ZXh0c3RhdF9zaW1pbChkZm1fYXJ0aWNsZXMsIGRmbV9tYWhhLCBtZXRob2QgPSAiY29zaW5lIikNCmFydGljbGVzJHNpbWlsYXJpdHlfdG9fbWFoYSA8LSBhcy5udW1lcmljKHNpbSkNCmBgYA0KDQotLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0NCg0KIyMjIDQuIFRvcGljIE1vZGVsaW5nIHdpdGggTERBDQoNCmBgYHtyfQ0KZHRtIDwtIGNvbnZlcnQoZGZtX2FydGljbGVzLCB0byA9ICJ0b3BpY21vZGVscyIpDQpyb3dfaGFzX3Rva2VucyA8LSByb3dTdW1zKGFzLm1hdHJpeChkdG0pKSA+IDANCmxkYV9tb2RlbCA8LSBMREEoZHRtW3Jvd19oYXNfdG9rZW5zLCBdLCBrID0gMTMsIG1ldGhvZCA9ICJHaWJicyIsIGNvbnRyb2wgPSBsaXN0KHNlZWQgPSA0MikpDQp0b3BpY3NfYXNzaWduZWQgPC0gcmVwKE5BX2ludGVnZXJfLCBucm93KGFydGljbGVzKSkNCnRvcGljc19hc3NpZ25lZFtyb3dfaGFzX3Rva2Vuc10gPC0gdG9waWNzKGxkYV9tb2RlbCkNCmFydGljbGVzJHRvcGljIDwtIHRvcGljc19hc3NpZ25lZA0KYGBgDQoNCmBgYHtyfQ0KdG9wX3Rlcm1zIDwtIHRlcm1zKGxkYV9tb2RlbCwgMTApICAjIHRvcCAxMCB3b3JkcyBwZXIgdG9waWMNCnByaW50KHRvcF90ZXJtcykNCg0KYGBgDQoNCi0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLQ0KDQojIyMgNS4gQ2x1c3RlciBUb3BpY3MgKyBMYWJlbCBUaGVtZXMNCg0KYGBge3J9DQpiZXRhIDwtIHBvc3RlcmlvcihsZGFfbW9kZWwpJHRlcm1zDQpoYyA8LSBoY2x1c3QoZGlzdChiZXRhKSwgbWV0aG9kID0gIndhcmQuRDIiKQ0KdG9waWNfY2x1c3RlciA8LSBjdXRyZWUoaGMsIGsgPSA0KQ0KYXJ0aWNsZXMkY2x1c3Rlcl9ncm91cCA8LSBtYXBfaW50KGFydGljbGVzJHRvcGljLCB+IHRvcGljX2NsdXN0ZXJbLnhdKQ0KDQpsYWJlbHMgPC0gYygNCiAgIjEiID0gIkNoaWxkIEhlYWx0aCBFbXBoYXNpcyIsDQogICIyIiA9ICJQaGFybWEgQ3JpdGlxdWUgJiBTYWZldHkiLA0KICAiMyIgPSAiRnJlZWRvbSAmIENlbnNvcnNoaXAiLA0KICAiNCIgPSAiSW5zdGl0dXRpb25hbCBEaXN0cnVzdCINCikNCmFydGljbGVzJGNsdXN0ZXJfbGFiZWwgPC0gcmVjb2RlKGFzLmNoYXJhY3RlcihhcnRpY2xlcyRjbHVzdGVyX2dyb3VwKSwgISEhbGFiZWxzKQ0KYGBgDQoNCi0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLQ0KDQojIyMgNi4gU2VudGltZW50IEFuYWx5c2lzDQoNCmBgYHtyfQ0KdG9rc19zZW50IDwtIHRva2VucyhhcnRpY2xlcyR0ZXh0LCByZW1vdmVfcHVuY3QgPSBUUlVFLCByZW1vdmVfbnVtYmVycyA9IFRSVUUpICU+JQ0KICB0b2tlbnNfdG9sb3dlcigpICU+JQ0KICB0b2tlbnNfcmVtb3ZlKHN0b3B3b3JkcygiZW4iKSkNCmRmbV9zZW50IDwtIGRmbSh0b2tzX3NlbnQpDQoNCmRhdGEoImRhdGFfZGljdGlvbmFyeV9MU0QyMDE1IikNCmRmbV9kaWN0IDwtIGRmbV9sb29rdXAoZGZtX3NlbnQsIGRpY3Rpb25hcnkgPSBkYXRhX2RpY3Rpb25hcnlfTFNEMjAxNSkNCiMgQ29tcHV0ZSBzZW50aW1lbnQgc2NvcmVzIGZyb20gbGV4aWNvbiBkaWN0aW9uYXJ5DQojIEFzc3VtZXMgZGZtX3NlbnQgYW5kIGRmbV9kaWN0IGhhdmUgYWxyZWFkeSBiZWVuIGNyZWF0ZWQNCg0KYXJ0aWNsZXMkc2VudGltZW50X3BvcyA8LSBhcy5udW1lcmljKGRmbV9kaWN0WywgInBvc2l0aXZlIl0pDQphcnRpY2xlcyRzZW50aW1lbnRfbmVnIDwtIGFzLm51bWVyaWMoZGZtX2RpY3RbLCAibmVnYXRpdmUiXSkNCmFydGljbGVzJHNlbnRpbWVudCAgICAgPC0gYXJ0aWNsZXMkc2VudGltZW50X3BvcyAtIGFydGljbGVzJHNlbnRpbWVudF9uZWcNCmBgYA0KDQotLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0NCg0KIyMjIDcuIFN1bW1hcnkgYnkgQ2x1c3Rlcg0KDQpgYGB7cn0NCmFydGljbGVzICU+JQ0KICBncm91cF9ieShjbHVzdGVyX2xhYmVsKSAlPiUNCiAgc3VtbWFyaXNlKA0KICAgIGNvdW50ID0gbigpLA0KICAgIGF2Z19zaW0gPSBtZWFuKHNpbWlsYXJpdHlfdG9fbWFoYSwgbmEucm0gPSBUUlVFKSwNCiAgICBhdmdfc2VudGltZW50ID0gbWVhbihzZW50aW1lbnQsIG5hLnJtID0gVFJVRSksDQogICAgdG9wX2F1dGhvcnMgPSBwYXN0ZShuYW1lcyhzb3J0KHRhYmxlKGF1dGhvcl9uYW1lKSwgZGVjcmVhc2luZyA9IFRSVUUpKVsxOjNdLCBjb2xsYXBzZSA9ICIsICIpDQogICkgJT4lDQogIGFycmFuZ2UoZGVzYyhhdmdfc2ltKSkgJT4lDQogIGtuaXRyOjprYWJsZSgpDQpgYGANCg0KLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tDQoNCiMjIyA4LiBCb3hwbG90czogU2VudGltZW50IGFuZCBTaW1pbGFyaXR5DQoNCmBgYHtyIGZpZy5oZWlnaHQ9N30NCmdncGxvdChhcnRpY2xlcywgYWVzKHggPSBmYWN0b3IodG9waWMpLCB5ID0gc2ltaWxhcml0eV90b19tYWhhLCBmaWxsID0gY2x1c3Rlcl9sYWJlbCkpICsNCiAgZ2VvbV9ib3hwbG90KCkgKw0KICBsYWJzKHRpdGxlID0gIlNpbWlsYXJpdHkgdG8gTUFIQSBieSBUb3BpYyIsIHggPSAiVG9waWMiLCB5ID0gIkNvc2luZSBTaW1pbGFyaXR5IikgKw0KICB0aGVtZV9taW5pbWFsKCkNCmBgYA0KDQpgYGB7ciBmaWcuaGVpZ2h0PTd9DQpnZ3Bsb3QoYXJ0aWNsZXMsIGFlcyh4ID0gZmFjdG9yKHRvcGljKSwgeSA9IHNlbnRpbWVudCwgZmlsbCA9IGNsdXN0ZXJfbGFiZWwpKSArDQogIGdlb21fYm94cGxvdCgpICsNCiAgbGFicyh0aXRsZSA9ICJTZW50aW1lbnQgYnkgVG9waWMiLCB4ID0gIlRvcGljIiwgeSA9ICJMZXhpY29kZXIgU2VudGltZW50IikgKw0KICB0aGVtZV9taW5pbWFsKCkNCmBgYA0KDQo=