# Load libraries
library(tidyverse)
library(text2vec)
library(tm)
library(pheatmap)
library(dendextend)
# Load and clean MAHA report text
maha_text <- read_file("MAHA-Report-text.txt") %>%
str_to_lower() %>%
str_replace_all("[^[:alnum:] ]", " ") %>%
str_squish()
# Load and preprocess articles
articles <- read_csv("C:/Users/ERosa/Github/maha/project-articles.csv") %>%
mutate(
title = str_to_lower(title),
article_types = str_to_lower(replace_na(article_types, "")),
content_clean = str_squish(str_replace_all(paste(title, article_types), "[^[:alnum:] ]", " "))
)
Rows: 8113 Columns: 27── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (8): url, published_date, author_name, domain_name, thumbnail, title, article_types, author_details
dbl (18): total_shares, linking_domains, pinterest_shares, twitter_shares, total_facebook_shares, evergreen_score, total_reddit_engag...
lgl (1): author_image
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Combine content and tokenize
texts_all <- c(articles$content_clean, maha_text)
it_all <- itoken(texts_all, preprocessor = tolower, tokenizer = word_tokenizer)
# Create vocabulary and DTM
vocab <- create_vocabulary(it_all, stopwords = stopwords("en")) %>%
prune_vocabulary(term_count_min = 10, doc_proportion_max = 0.3) %>%
filter(nchar(term) >= 3)
dtm <- create_dtm(it_all, vocab_vectorizer(vocab))
# Split article/report matrices
n_articles <- nrow(articles)
dtm_articles <- dtm[1:n_articles, ]
dtm_maha <- dtm[n_articles + 1, , drop = FALSE]
# Similarity to MAHA
similarity_scores <- sim2(x = dtm_articles, y = dtm_maha, method = "cosine", norm = "l2") %>%
as.vector()
articles <- articles %>% mutate(similarity_to_maha = similarity_scores)
# Select top N
top_n <- 50
top_articles <- articles %>% arrange(desc(similarity_to_maha)) %>% slice(1:top_n)
top_indices <- which(articles$title %in% top_articles$title)
article_subset_dtm <- dtm_articles[top_indices, ]
# Pairwise similarity and clustering
pairwise_sim <- sim2(article_subset_dtm, method = "cosine", norm = "l2")
dist_matrix <- as.dist(1 - pairwise_sim)
hc <- hclust(dist_matrix, method = "average")
dend <- as.dendrogram(hc) %>% color_branches(k = 4)
# Add custom labels to dendrogram
labels(dend) <- articles$title[top_indices]
# Assign cluster membership
articles$cluster <- NA_integer_
articles$cluster[top_indices] <- cutree(hc, k = 4)
# Plot dendrogram
par(mar = c(10, 4, 4, 2)) # More space for rotated labels
plot(dend, cex = 0.5, las = 2,
main = "Hierarchical Clustering of Top Articles",
xlab = "", sub = "", ylab = "Dissimilarity")

# Plot heatmap
pheatmap(
mat = pairwise_sim,
clustering_distance_rows = dist_matrix,
clustering_distance_cols = dist_matrix,
main = "Pairwise Similarity Between Top Articles",
fontsize_row = 6,
fontsize_col = 6,
legend = TRUE,
display_numbers = FALSE
)

# Optional: Save result
# write_csv(articles %>% filter(!is.na(cluster)), "top_articles_with_clusters.csv")
LS0tDQp0aXRsZTogUGFpcndpc2UgU2ltaWxhcml0eQ0Kb3V0cHV0Og0KICBodG1sX25vdGVib29rOg0KICAgIGNvZGVfZm9sZGluZzogaGlkZQ0KZGF0ZTogMjAyNS0wNi0wMw0KLS0tDQoNCmBgYHtyfQ0KIyBMb2FkIGxpYnJhcmllcw0KbGlicmFyeSh0aWR5dmVyc2UpDQpsaWJyYXJ5KHRleHQydmVjKQ0KbGlicmFyeSh0bSkNCmxpYnJhcnkocGhlYXRtYXApDQpsaWJyYXJ5KGRlbmRleHRlbmQpDQoNCiMgTG9hZCBhbmQgY2xlYW4gTUFIQSByZXBvcnQgdGV4dA0KbWFoYV90ZXh0IDwtIHJlYWRfZmlsZSgiTUFIQS1SZXBvcnQtdGV4dC50eHQiKSAlPiUNCiAgc3RyX3RvX2xvd2VyKCkgJT4lDQogIHN0cl9yZXBsYWNlX2FsbCgiW15bOmFsbnVtOl0gXSIsICIgIikgJT4lDQogIHN0cl9zcXVpc2goKQ0KDQojIExvYWQgYW5kIHByZXByb2Nlc3MgYXJ0aWNsZXMNCmFydGljbGVzIDwtIHJlYWRfY3N2KCJDOi9Vc2Vycy9FUm9zYS9HaXRodWIvbWFoYS9wcm9qZWN0LWFydGljbGVzLmNzdiIpICU+JQ0KICBtdXRhdGUoDQogICAgdGl0bGUgPSBzdHJfdG9fbG93ZXIodGl0bGUpLA0KICAgIGFydGljbGVfdHlwZXMgPSBzdHJfdG9fbG93ZXIocmVwbGFjZV9uYShhcnRpY2xlX3R5cGVzLCAiIikpLA0KICAgIGNvbnRlbnRfY2xlYW4gPSBzdHJfc3F1aXNoKHN0cl9yZXBsYWNlX2FsbChwYXN0ZSh0aXRsZSwgYXJ0aWNsZV90eXBlcyksICJbXls6YWxudW06XSBdIiwgIiAiKSkNCiAgKQ0KDQojIENvbWJpbmUgY29udGVudCBhbmQgdG9rZW5pemUNCnRleHRzX2FsbCA8LSBjKGFydGljbGVzJGNvbnRlbnRfY2xlYW4sIG1haGFfdGV4dCkNCml0X2FsbCA8LSBpdG9rZW4odGV4dHNfYWxsLCBwcmVwcm9jZXNzb3IgPSB0b2xvd2VyLCB0b2tlbml6ZXIgPSB3b3JkX3Rva2VuaXplcikNCg0KIyBDcmVhdGUgdm9jYWJ1bGFyeSBhbmQgRFRNDQp2b2NhYiA8LSBjcmVhdGVfdm9jYWJ1bGFyeShpdF9hbGwsIHN0b3B3b3JkcyA9IHN0b3B3b3JkcygiZW4iKSkgJT4lDQogIHBydW5lX3ZvY2FidWxhcnkodGVybV9jb3VudF9taW4gPSAxMCwgZG9jX3Byb3BvcnRpb25fbWF4ID0gMC4zKSAlPiUNCiAgZmlsdGVyKG5jaGFyKHRlcm0pID49IDMpDQpkdG0gPC0gY3JlYXRlX2R0bShpdF9hbGwsIHZvY2FiX3ZlY3Rvcml6ZXIodm9jYWIpKQ0KDQojIFNwbGl0IGFydGljbGUvcmVwb3J0IG1hdHJpY2VzDQpuX2FydGljbGVzIDwtIG5yb3coYXJ0aWNsZXMpDQpkdG1fYXJ0aWNsZXMgPC0gZHRtWzE6bl9hcnRpY2xlcywgXQ0KZHRtX21haGEgPC0gZHRtW25fYXJ0aWNsZXMgKyAxLCAsIGRyb3AgPSBGQUxTRV0NCg0KIyBTaW1pbGFyaXR5IHRvIE1BSEENCnNpbWlsYXJpdHlfc2NvcmVzIDwtIHNpbTIoeCA9IGR0bV9hcnRpY2xlcywgeSA9IGR0bV9tYWhhLCBtZXRob2QgPSAiY29zaW5lIiwgbm9ybSA9ICJsMiIpICU+JQ0KICBhcy52ZWN0b3IoKQ0KYXJ0aWNsZXMgPC0gYXJ0aWNsZXMgJT4lIG11dGF0ZShzaW1pbGFyaXR5X3RvX21haGEgPSBzaW1pbGFyaXR5X3Njb3JlcykNCg0KIyBTZWxlY3QgdG9wIE4NCnRvcF9uIDwtIDUwDQp0b3BfYXJ0aWNsZXMgPC0gYXJ0aWNsZXMgJT4lIGFycmFuZ2UoZGVzYyhzaW1pbGFyaXR5X3RvX21haGEpKSAlPiUgc2xpY2UoMTp0b3BfbikNCnRvcF9pbmRpY2VzIDwtIHdoaWNoKGFydGljbGVzJHRpdGxlICVpbiUgdG9wX2FydGljbGVzJHRpdGxlKQ0KYXJ0aWNsZV9zdWJzZXRfZHRtIDwtIGR0bV9hcnRpY2xlc1t0b3BfaW5kaWNlcywgXQ0KDQojIFBhaXJ3aXNlIHNpbWlsYXJpdHkgYW5kIGNsdXN0ZXJpbmcNCnBhaXJ3aXNlX3NpbSA8LSBzaW0yKGFydGljbGVfc3Vic2V0X2R0bSwgbWV0aG9kID0gImNvc2luZSIsIG5vcm0gPSAibDIiKQ0KZGlzdF9tYXRyaXggPC0gYXMuZGlzdCgxIC0gcGFpcndpc2Vfc2ltKQ0KaGMgPC0gaGNsdXN0KGRpc3RfbWF0cml4LCBtZXRob2QgPSAiYXZlcmFnZSIpDQpkZW5kIDwtIGFzLmRlbmRyb2dyYW0oaGMpICU+JSBjb2xvcl9icmFuY2hlcyhrID0gNCkNCg0KIyBBZGQgY3VzdG9tIGxhYmVscyB0byBkZW5kcm9ncmFtDQpsYWJlbHMoZGVuZCkgPC0gYXJ0aWNsZXMkdGl0bGVbdG9wX2luZGljZXNdDQoNCiMgQXNzaWduIGNsdXN0ZXIgbWVtYmVyc2hpcA0KYXJ0aWNsZXMkY2x1c3RlciA8LSBOQV9pbnRlZ2VyXw0KYXJ0aWNsZXMkY2x1c3Rlclt0b3BfaW5kaWNlc10gPC0gY3V0cmVlKGhjLCBrID0gNCkNCg0KIyBQbG90IGRlbmRyb2dyYW0NCnBhcihtYXIgPSBjKDEwLCA0LCA0LCAyKSkgICMgTW9yZSBzcGFjZSBmb3Igcm90YXRlZCBsYWJlbHMNCnBsb3QoZGVuZCwgY2V4ID0gMC41LCBsYXMgPSAyLA0KICAgICBtYWluID0gIkhpZXJhcmNoaWNhbCBDbHVzdGVyaW5nIG9mIFRvcCBBcnRpY2xlcyIsDQogICAgIHhsYWIgPSAiIiwgc3ViID0gIiIsIHlsYWIgPSAiRGlzc2ltaWxhcml0eSIpDQoNCiMgUGxvdCBoZWF0bWFwDQpwaGVhdG1hcCgNCiAgbWF0ID0gcGFpcndpc2Vfc2ltLA0KICBjbHVzdGVyaW5nX2Rpc3RhbmNlX3Jvd3MgPSBkaXN0X21hdHJpeCwNCiAgY2x1c3RlcmluZ19kaXN0YW5jZV9jb2xzID0gZGlzdF9tYXRyaXgsDQogIG1haW4gPSAiUGFpcndpc2UgU2ltaWxhcml0eSBCZXR3ZWVuIFRvcCBBcnRpY2xlcyIsDQogIGZvbnRzaXplX3JvdyA9IDYsDQogIGZvbnRzaXplX2NvbCA9IDYsDQogIGxlZ2VuZCA9IFRSVUUsDQogIGRpc3BsYXlfbnVtYmVycyA9IEZBTFNFDQopDQoNCiMgT3B0aW9uYWw6IFNhdmUgcmVzdWx0DQojIHdyaXRlX2NzdihhcnRpY2xlcyAlPiUgZmlsdGVyKCFpcy5uYShjbHVzdGVyKSksICJ0b3BfYXJ0aWNsZXNfd2l0aF9jbHVzdGVycy5jc3YiKQ0KDQpgYGANCg==