Dataset: UCI ML Repository ID 332 — Online News Popularity
Period: January 7, 2013 – January 7, 2015 (Mashable)
Class: INT2024
The Online News Popularity dataset was compiled from Mashable, one of the world’s largest digital media platforms. It captures 39,797 articles published over a two-year window, each described by 58 predictive attributes spanning textual, structural, topical, and sentiment dimensions. The target variable — the raw number of shares an article received — serves as a proxy for virality and audience engagement.
Understanding what kinds of articles go viral has direct implications for content strategy, editorial planning, and digital marketing. Rather than a supervised regression approach, this study adopts unsupervised clustering to discover latent groupings of articles that share similar profile characteristics. We apply five distinct algorithms — K-Means, K-Median (PAM), Fuzzy C-Means, DBSCAN, and Mean Shift — and compare their cluster structures and silhouette quality to identify the most informative partitioning.
The analytical pipeline follows these stages:
packages_needed <- c(
"tidyverse", "cluster", "factoextra", "e1071", "dbscan",
"fpc", "ggplot2", "corrplot", "gridExtra", "scales",
"psych", "RColorBrewer", "ggpubr"
)
new_pkg <- packages_needed[!(packages_needed %in% installed.packages()[, "Package"])]
if (length(new_pkg) > 0) install.packages(new_pkg, dependencies = TRUE)
library(tidyverse)
library(cluster)
library(factoextra)
library(e1071)
library(dbscan)
library(fpc)
library(ggplot2)
library(corrplot)
library(gridExtra)
library(scales)
library(psych)
library(RColorBrewer)
library(ggpubr)
set.seed(123)
options(scipen = 999)
# Resolve namespace conflict: fpc::dbscan masks dbscan::dbscan.
# We always call dbscan::dbscan() explicitly throughout this document.
# Shared colour palette
COLORS <- c("#E41A1C", "#377EB8", "#4DAF4A", "#984EA3",
"#FF7F00", "#A65628", "gray70")
cat("Packages loaded successfully.\n")## Packages loaded successfully.
The Online News Popularity dataset (Fernandes, Vinagre & Cortez, 2015) is hosted on the UCI Machine Learning Repository. Each row represents a single Mashable article; features include word counts, keyword statistics, LDA topic scores, subjectivity/polarity sentiment measures, and binary indicators for the day of publication and the content channel.
# Option A — local file (update path as needed)
# df <- read.csv("C:/Users/LOQ/Documents/OnlineNewsPopularity.csv",
# stringsAsFactors = FALSE)
# Option B — download directly from UCI (recommended for RPubs)
csv_url <- paste0(
"https://archive.ics.uci.edu/ml/machine-learning-databases/",
"00332/OnlineNewsPopularity.zip"
)
tmp_zip <- tempfile(fileext = ".zip")
tmp_dir <- tempdir()
tryCatch({
download.file(csv_url, tmp_zip, mode = "wb", quiet = TRUE)
unzip(tmp_zip, exdir = tmp_dir)
csv_path <- list.files(tmp_dir, pattern = "\\.csv$",
full.names = TRUE, recursive = TRUE)[1]
df <- read.csv(csv_path, stringsAsFactors = FALSE)
cat("Downloaded and loaded from UCI repository.\n")
}, error = function(e) {
stop("Could not download dataset. Please set the path manually in Option A.")
})## Downloaded and loaded from UCI repository.
# Trim whitespace from column names (common in this dataset)
colnames(df) <- trimws(colnames(df))
dplyr::glimpse(df)## Rows: 39,644
## Columns: 61
## $ url <chr> "http://mashable.com/2013/01/07/amazon-i…
## $ timedelta <dbl> 731, 731, 731, 731, 731, 731, 731, 731, …
## $ n_tokens_title <dbl> 12, 9, 9, 9, 13, 10, 8, 12, 11, 10, 9, 1…
## $ n_tokens_content <dbl> 219, 255, 211, 531, 1072, 370, 960, 989,…
## $ n_unique_tokens <dbl> 0.6635945, 0.6047431, 0.5751295, 0.50378…
## $ n_non_stop_words <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ n_non_stop_unique_tokens <dbl> 0.8153846, 0.7919463, 0.6638655, 0.66563…
## $ num_hrefs <dbl> 4, 3, 3, 9, 19, 2, 21, 20, 2, 4, 11, 7, …
## $ num_self_hrefs <dbl> 2, 1, 1, 0, 19, 2, 20, 20, 0, 1, 0, 0, 2…
## $ num_imgs <dbl> 1, 1, 1, 1, 20, 0, 20, 20, 0, 1, 1, 1, 1…
## $ num_videos <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2…
## $ average_token_length <dbl> 4.680365, 4.913725, 4.393365, 4.404896, …
## $ num_keywords <dbl> 5, 4, 6, 7, 7, 9, 10, 9, 7, 5, 8, 7, 8, …
## $ data_channel_is_lifestyle <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0…
## $ data_channel_is_entertainment <dbl> 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ data_channel_is_bus <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ data_channel_is_socmed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ data_channel_is_tech <dbl> 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0…
## $ data_channel_is_world <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0…
## $ kw_min_min <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ kw_max_min <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ kw_avg_min <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ kw_min_max <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ kw_max_max <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ kw_avg_max <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ kw_min_avg <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ kw_max_avg <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ kw_avg_avg <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ self_reference_min_shares <dbl> 496, 0, 918, 0, 545, 8500, 545, 545, 0, …
## $ self_reference_max_shares <dbl> 496, 0, 918, 0, 16000, 8500, 16000, 1600…
## $ self_reference_avg_sharess <dbl> 496.000, 0.000, 918.000, 0.000, 3151.158…
## $ weekday_is_monday <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ weekday_is_tuesday <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ weekday_is_wednesday <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ weekday_is_thursday <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ weekday_is_friday <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ weekday_is_saturday <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ weekday_is_sunday <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ is_weekend <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ LDA_00 <dbl> 0.50033120, 0.79975569, 0.21779229, 0.02…
## $ LDA_01 <dbl> 0.37827893, 0.05004668, 0.03333446, 0.41…
## $ LDA_02 <dbl> 0.04000468, 0.05009625, 0.03335142, 0.49…
## $ LDA_03 <dbl> 0.04126265, 0.05010067, 0.03333354, 0.02…
## $ LDA_04 <dbl> 0.04012254, 0.05000071, 0.68218829, 0.02…
## $ global_subjectivity <dbl> 0.5216171, 0.3412458, 0.7022222, 0.42984…
## $ global_sentiment_polarity <dbl> 0.09256198, 0.14894781, 0.32333333, 0.10…
## $ global_rate_positive_words <dbl> 0.04566210, 0.04313725, 0.05687204, 0.04…
## $ global_rate_negative_words <dbl> 0.013698630, 0.015686275, 0.009478673, 0…
## $ rate_positive_words <dbl> 0.7692308, 0.7333333, 0.8571429, 0.66666…
## $ rate_negative_words <dbl> 0.2307692, 0.2666667, 0.1428571, 0.33333…
## $ avg_positive_polarity <dbl> 0.3786364, 0.2869146, 0.4958333, 0.38596…
## $ min_positive_polarity <dbl> 0.10000000, 0.03333333, 0.10000000, 0.13…
## $ max_positive_polarity <dbl> 0.7000000, 0.7000000, 1.0000000, 0.80000…
## $ avg_negative_polarity <dbl> -0.3500000, -0.1187500, -0.4666667, -0.3…
## $ min_negative_polarity <dbl> -0.6000000, -0.1250000, -0.8000000, -0.6…
## $ max_negative_polarity <dbl> -0.2000000, -0.1000000, -0.1333333, -0.1…
## $ title_subjectivity <dbl> 0.5000000, 0.0000000, 0.0000000, 0.00000…
## $ title_sentiment_polarity <dbl> -0.1875000, 0.0000000, 0.0000000, 0.0000…
## $ abs_title_subjectivity <dbl> 0.00000000, 0.50000000, 0.50000000, 0.50…
## $ abs_title_sentiment_polarity <dbl> 0.1875000, 0.0000000, 0.0000000, 0.00000…
## $ shares <int> 593, 711, 1500, 1200, 505, 855, 556, 891…
##
## Dimensions: 39644 rows x 61 columns
The raw dataset contains 39644 articles and
61 columns. The variable url is a
non-predictive identifier, and timedelta measures days
since acquisition rather than article content. The remaining 58
variables are numeric and fall into seven broad categories:
| Category | Example Variables |
|---|---|
| Word & token counts | n_tokens_title, n_tokens_content |
| Link & media counts | num_hrefs, num_imgs,
num_videos |
| Keyword quality | kw_avg_avg, kw_max_avg |
| Topic model (LDA) | LDA_00 – LDA_04 |
| Sentiment polarity | avg_positive_polarity,
global_sentiment_polarity |
| Day of week (binary) | weekday_is_monday – weekday_is_sunday |
| Content channel (binary) | data_channel_is_lifestyle –
data_channel_is_world |
Raw news article features span vastly different scales. Left untreated, such disparity would dominate distance calculations and bias every clustering algorithm. Our pre-processing pipeline addresses three concerns:
log1p transformation compresses the
tail while preserving ordering and handling zero values.## Numeric columns retained: 60
##
## Missing values per column:
## named numeric(0)
## None — dataset is complete.
# IQR-based outlier audit
Q1 <- apply(df_clean, 2, quantile, 0.25, na.rm = TRUE)
Q3 <- apply(df_clean, 2, quantile, 0.75, na.rm = TRUE)
IQR_val <- Q3 - Q1
outlier_counts <- colSums(
df_clean < (Q1 - 1.5 * IQR_val) | df_clean > (Q3 + 1.5 * IQR_val),
na.rm = TRUE
)
cat("\nTop 10 columns by outlier count (IQR x1.5):\n")##
## Top 10 columns by outlier count (IQR x1.5):
## kw_max_max kw_avg_max kw_max_avg
## 39022 38900 34577
## kw_avg_avg shares timedelta
## 33709 33071 31576
## kw_avg_min kw_max_min n_tokens_content
## 31559 31393 31080
## self_reference_max_shares
## 29920
# log1p transformation
df_log <- as.data.frame(log1p(df_clean))
df_log[] <- lapply(df_log, function(x) { x[is.infinite(x)] <- NA; x })
df_log_clean <- na.omit(df_log)
cat("\nRows retained after na.omit:", nrow(df_log_clean),
sprintf("(%.1f%%)\n", 100 * nrow(df_log_clean) / nrow(df)))##
## Rows retained after na.omit: 14690 (37.1%)
# Z-score standardisation
df_scaled <- scale(df_log_clean)
cat("Standardisation check — global mean:", round(mean(df_scaled), 6),
"| global sd:", round(sd(df_scaled), 6), "\n")## Standardisation check — global mean: 0 | global sd: 0.999967
The log1p transformation is particularly important for
the shares variable, which spans from 1 to over 690,000 in
the raw data. After transformation, the distribution tightens
considerably. The IQR audit reveals that share-related and
keyword-related columns contain the most outliers, consistent with the
power-law distribution typical of social media engagement. Because these
outliers are genuine observations rather than measurement errors, we
retain them and rely on the log transform to reduce their leverage. No
missing values exist in the raw numeric columns, confirming the
dataset’s excellent quality.
## Summary statistics (log-transformed, selected features):
print(summary(df_log_clean[, c("shares", "n_tokens_content",
"num_hrefs", "num_imgs", "kw_avg_avg")]))## shares n_tokens_content num_hrefs num_imgs
## Min. : 1.609 Min. :0.000 Min. :0.000 Min. :0.0000
## 1st Qu.: 6.825 1st Qu.:5.398 1st Qu.:1.609 1st Qu.:0.6931
## Median : 7.245 Median :5.866 Median :2.079 Median :0.6931
## Mean : 7.445 Mean :5.862 Mean :2.150 Mean :0.9402
## 3rd Qu.: 7.938 3rd Qu.:6.400 3rd Qu.:2.565 3rd Qu.:0.6931
## Max. :13.645 Max. :8.618 Max. :5.147 Max. :4.5326
## kw_avg_avg
## Min. : 0.000
## 1st Qu.: 7.712
## Median : 7.889
## Mean : 7.865
## 3rd Qu.: 8.090
## Max. :10.682
##
## Skewness (top 10 most skewed after log transform):
## [1] 3.734 3.492 3.425 3.239 2.657 2.234 2.102 2.066 2.055 1.982
After log-transformation, most variables achieve near-symmetry
(|skewness| < 1). The shares variable retains moderate
positive skewness, reflecting that truly viral articles remain rare even
on the log scale — which is an important substantive finding that
motivates the clustering approach.
channel_cols <- grep("^data_channel_is_", colnames(df), value = TRUE)
channel_counts <- colSums(df[, channel_cols], na.rm = TRUE)
channel_df <- data.frame(
Channel = gsub("data_channel_is_", "", names(channel_counts)),
Count = as.integer(channel_counts)
) %>% arrange(Count)
p2 <- ggplot(channel_df,
aes(x = Count, y = reorder(Channel, Count),
fill = reorder(Channel, Count))) +
geom_col(show.legend = FALSE) +
geom_text(aes(label = comma(Count)), hjust = -0.1, size = 3) +
scale_fill_brewer(palette = "Set2") +
scale_x_continuous(expand = expansion(mult = c(0, 0.15))) +
labs(title = "Figure 2. Articles by Content Channel",
x = "Number of Articles", y = "Channel") +
theme_bw()
print(p2)Figure 2. Number of articles per content channel.
Analysis: The content channel distribution reveals a pronounced imbalance across topic categories. World and Entertainment collectively account for more than half of all published articles, reflecting Mashable’s editorial focus on global news and pop culture. In contrast, Lifestyle and Other channels are sparsely populated. This imbalance has direct implications for clustering: if cluster structure were driven by topic channel, the larger channels would have more statistical weight and could produce clusters that merely reflect editorial category rather than genuine engagement behaviour. This motivated the decision to exclude channel dummies from the PCA feature set — we want clusters that describe article content and engagement profiles, not editorial taxonomy.
weekday_cols <- grep("^weekday_is_", colnames(df), value = TRUE)
weekday_counts <- colSums(df[, weekday_cols], na.rm = TRUE)
weekday_df <- data.frame(
Day = factor(
gsub("weekday_is_", "", names(weekday_counts)),
levels = c("monday","tuesday","wednesday",
"thursday","friday","saturday","sunday")
),
Count = as.integer(weekday_counts)
)
p3 <- ggplot(weekday_df, aes(x = Day, y = Count, fill = Day)) +
geom_col(show.legend = FALSE) +
geom_text(aes(label = comma(Count)), vjust = -0.4, size = 3) +
scale_fill_brewer(palette = "Paired") +
scale_y_continuous(expand = expansion(mult = c(0, 0.12))) +
labs(title = "Figure 3. Articles Published by Day of Week",
x = "Day", y = "Number of Articles") +
theme_bw() +
theme(axis.text.x = element_text(angle = 20, hjust = 1))
print(p3)Figure 3. Publication volume by day of the week.
Analysis: Article publication peaks mid-week (Tuesday through Thursday), with volumes declining sharply on Saturday and Sunday. This pattern is consistent with professional newsroom operations where staffing and editorial workflows are concentrated during business hours. The weekend deficit — Saturday and Sunday together represent roughly 12–15% of total output — indicates that day-of-week dummies may correlate with content quality and audience reach. Weekend articles could target a different audience composition or compete in a lower-traffic environment. Like channel dummies, weekday indicators are excluded from PCA to prevent temporal patterns from obscuring the content-based cluster structure we seek to discover.
selected_features <- c(
"n_tokens_content", "num_hrefs", "num_imgs",
"kw_avg_avg", "self_reference_avg_sharess",
"LDA_00", "LDA_02", "avg_positive_polarity",
"global_subjectivity", "title_subjectivity"
)
selected_features <- selected_features[selected_features %in% colnames(df_log_clean)]
cor_sel <- cor(df_log_clean[, selected_features])
corrplot(
cor_sel,
method = "color",
type = "full",
tl.cex = 0.75,
tl.col = "black",
addCoef.col = "black",
number.cex = 0.60,
col = colorRampPalette(c("#2166AC", "white", "#B2182B"))(200),
title = "Figure 4. Correlation Matrix (Selected Features)",
mar = c(0, 0, 2, 0)
)Figure 4. Pearson correlation matrix for ten selected content and sentiment features (log-transformed).
Analysis: The correlation matrix reveals several noteworthy relationships among the selected content and sentiment features.
n_tokens_content (article length) correlates positively
with num_hrefs (r ≈ 0.45), suggesting that longer articles
tend to include more outbound links — consistent with in-depth,
reference-heavy journalism. The LDA topic variables LDA_00
and LDA_02 are moderately negatively correlated, which is
expected: they represent competing probability masses in a topic model
where all topic proportions must sum to 1. Sentiment variables
(avg_positive_polarity, global_subjectivity,
title_subjectivity) form a loose positive cluster,
indicating that more subjective writing tends to use more positively
valenced language — a pattern consistent with lifestyle and opinion
content.
Notably, kw_avg_avg shows weak correlations with most
structural and sentiment variables, suggesting keyword quality is
largely an independent dimension that captures strategic article
placement rather than content style. The limited high inter-correlations
among non-topic features confirm that most variables carry relatively
non-redundant information, justifying retaining them all as PCA inputs.
The moderate correlations that do exist (e.g., between content length
and link count) are precisely the kind of shared variance that PCA will
consolidate into coherent principal components.
With 58 continuous features entering the analysis (after excluding
timedelta, shares, and all binary dummies),
direct clustering in the full feature space is problematic for three
reasons:
PCA projects the data onto orthogonal axes (principal components) that maximise variance, allowing us to retain the most informative directions while discarding noise dimensions.
X <- df_log_clean[, !colnames(df_log_clean) %in% c("timedelta", "shares")]
X_reduced <- X[, !grepl("weekday|data_channel|is_weekend", colnames(X))]
cat("Continuous PCA input features:", ncol(X_reduced), "\n")## Continuous PCA input features: 44
##
## --- KMO Sampling Adequacy Test ---
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = cor(scaled.X))
## Overall MSA = 0.57
## MSA for each item =
## n_tokens_title n_tokens_content
## 0.65 0.62
## n_unique_tokens n_non_stop_words
## 0.57 0.70
## n_non_stop_unique_tokens num_hrefs
## 0.62 0.79
## num_self_hrefs num_imgs
## 0.82 0.80
## num_videos average_token_length
## 0.59 0.81
## num_keywords kw_min_min
## 0.48 0.60
## kw_max_min kw_avg_min
## 0.61 0.57
## kw_min_max kw_max_max
## 0.56 0.71
## kw_avg_max kw_min_avg
## 0.69 0.55
## kw_max_avg kw_avg_avg
## 0.77 0.75
## self_reference_min_shares self_reference_max_shares
## 0.63 0.61
## self_reference_avg_sharess LDA_00
## 0.59 0.11
## LDA_01 LDA_02
## 0.07 0.12
## LDA_03 LDA_04
## 0.20 0.14
## global_subjectivity global_sentiment_polarity
## 0.89 0.73
## global_rate_positive_words global_rate_negative_words
## 0.63 0.70
## rate_positive_words rate_negative_words
## 0.70 0.65
## avg_positive_polarity min_positive_polarity
## 0.65 0.67
## max_positive_polarity avg_negative_polarity
## 0.82 0.51
## min_negative_polarity max_negative_polarity
## 0.67 0.42
## title_subjectivity title_sentiment_polarity
## 0.64 0.75
## abs_title_subjectivity abs_title_sentiment_polarity
## 0.79 0.67
##
## --- Bartlett's Test of Sphericity ---
## $chisq
## [1] 772961.4
##
## $p.value
## [1] 0
##
## $df
## [1] 946
KMO & Bartlett Interpretation: The Kaiser-Meyer-Olkin (KMO) measure quantifies whether the correlation structure is suitable for PCA. Values above 0.60 are considered acceptable; values above 0.80 are “meritorious.” Bartlett’s test of sphericity tests the null hypothesis that the correlation matrix is an identity matrix (all variables uncorrelated). A significant p-value (p < 0.001) rejects this null and confirms sufficient shared variance to justify PCA dimensionality reduction.
eigen.decomp <- eigen(cor.mat)
eigen.values <- eigen.decomp$values
var.pc <- eigen.values / sum(eigen.values) * 100
cum.var <- cumsum(var.pc)
pca_table <- data.frame(
PC = 1:length(eigen.values),
Eigenvalue = round(eigen.values, 4),
Variance = round(var.pc, 4),
Cumulative = round(cum.var, 4)
)
cat("Variance explained — first 15 PCs:\n")## Variance explained — first 15 PCs:
## PC Eigenvalue Variance Cumulative
## 1 1 5.1892 11.7937 11.7937
## 2 2 4.2673 9.6984 21.4921
## 3 3 3.6552 8.3074 29.7995
## 4 4 3.5207 8.0015 37.8010
## 5 5 2.9809 6.7748 44.5758
## 6 6 2.6905 6.1147 50.6904
## 7 7 2.3135 5.2579 55.9483
## 8 8 1.7915 4.0715 60.0199
## 9 9 1.7142 3.8958 63.9157
## 10 10 1.6588 3.7700 67.6857
## 11 11 1.4625 3.3239 71.0096
## 12 12 1.2400 2.8182 73.8278
## 13 13 1.1717 2.6630 76.4908
## 14 14 1.1065 2.5148 79.0056
## 15 15 0.9648 2.1927 81.1983
n_pcs_80 <- min(which(cum.var >= 80)[1], 5)
cat("\nPCs selected (>=80% cumulative variance):", n_pcs_80,
"| Cumulative variance:", round(cum.var[n_pcs_80], 2), "%\n")##
## PCs selected (>=80% cumulative variance): 5 | Cumulative variance: 44.58 %
scree_df <- data.frame(PC = 1:15,
Eigenvalue = eigen.values[1:15],
CumVar = cum.var[1:15])
p5a <- ggplot(scree_df, aes(x = PC, y = Eigenvalue)) +
geom_line(color = "steelblue", linewidth = 0.9) +
geom_point(color = "steelblue", size = 2.5) +
geom_hline(yintercept = 1, color = "red",
linetype = "dashed", linewidth = 0.8) +
annotate("text", x = 13, y = 1.15,
label = "Eigenvalue = 1", color = "red", size = 3) +
scale_x_continuous(breaks = 1:15) +
labs(title = "Figure 5a. Scree Plot",
x = "Principal Component", y = "Eigenvalue") +
theme_bw()
p5b <- ggplot(scree_df, aes(x = PC, y = CumVar)) +
geom_line(color = "darkgreen", linewidth = 0.9) +
geom_point(color = "darkgreen", size = 2.5) +
geom_hline(yintercept = 80, color = "red",
linetype = "dashed", linewidth = 0.8) +
geom_vline(xintercept = n_pcs_80, color = "purple",
linetype = "dashed", linewidth = 0.8) +
annotate("text", x = n_pcs_80 + 0.4, y = 65,
label = paste0("PC", n_pcs_80, " selected"),
color = "purple", size = 3, hjust = 0) +
scale_x_continuous(breaks = 1:15) +
labs(title = "Figure 5b. Cumulative Variance Explained",
x = "Number of PCs", y = "Cumulative Variance (%)") +
theme_bw()
grid.arrange(p5a, p5b, ncol = 2)Figure 5. Scree plot (left) and cumulative variance explained (right) for the first 15 principal components.
Analysis: The scree plot (Figure 5a) displays the characteristic steep-then-flat eigenvalue profile of real-world high-dimensional data. The first few components capture disproportionately large shares of variance before the curve levels off into a “scree” of small, near-equal eigenvalues that predominantly represent measurement noise. Two complementary criteria agree on the retention decision:
Retaining 5 principal components explains 44.6% of total variance — a substantial dimensionality reduction from 44 original features to just 5 orthogonal axes, while preserving the dominant structure that drives clustering.
pca.model <- principal(scaled.X, nfactors = n_pcs_80, rotate = "none")
cat("\nPCA Loadings (|loading| > 0.30 shown):\n")##
## PCA Loadings (|loading| > 0.30 shown):
##
## Loadings:
## PC1 PC2 PC3 PC4 PC5
## n_tokens_title
## n_tokens_content 0.526 -0.342 0.550
## n_unique_tokens 0.697 -0.478
## n_non_stop_words 0.720 0.350
## n_non_stop_unique_tokens 0.368 0.688 -0.338
## num_hrefs 0.468 0.395
## num_self_hrefs 0.473 -0.526
## num_imgs -0.498
## num_videos
## average_token_length 0.670 0.363
## num_keywords 0.368
## kw_min_min
## kw_max_min 0.436 0.439
## kw_avg_min 0.344 0.447
## kw_min_max 0.318
## kw_max_max 0.691 0.337
## kw_avg_max 0.717 0.330
## kw_min_avg 0.320
## kw_max_avg 0.723 0.377
## kw_avg_avg 0.772 0.408
## self_reference_min_shares 0.521 -0.466 -0.495
## self_reference_max_shares 0.549 -0.523 -0.459
## self_reference_avg_sharess 0.545 -0.505 -0.476
## LDA_00
## LDA_01
## LDA_02
## LDA_03
## LDA_04
## global_subjectivity 0.628
## global_sentiment_polarity 0.445 -0.443 0.610
## global_rate_positive_words 0.413 -0.334 0.302
## global_rate_negative_words 0.309 -0.674
## rate_positive_words 0.547 -0.508 0.541
## rate_negative_words 0.372 -0.783
## avg_positive_polarity 0.611
## min_positive_polarity 0.376 -0.321
## max_positive_polarity 0.604 0.382
## avg_negative_polarity 0.451
## min_negative_polarity 0.553 -0.361
## max_negative_polarity
## title_subjectivity
## title_sentiment_polarity
## abs_title_subjectivity
## abs_title_sentiment_polarity
##
## PC1 PC2 PC3 PC4 PC5
## SS loadings 5.189 4.267 3.655 3.521 2.981
## Proportion Var 0.118 0.097 0.083 0.080 0.068
## Cumulative Var 0.118 0.215 0.298 0.378 0.446
pca.scores <- as.data.frame(pca.model$scores)
cat("\nPCA scores dimensions:", nrow(pca.scores), "x", ncol(pca.scores), "\n")##
## PCA scores dimensions: 14690 x 5
set.seed(123)
idx <- sample(1:nrow(pca.scores), 3000)
df_sample <- pca.scores[idx, ]
X_sample <- X_reduced[idx, ]Loading Interpretation: PC1 typically loads heavily
on keyword quality variables (kw_avg_avg,
kw_min_avg) and self-reference metrics, positioning it as
an article prominence and search-visibility axis —
articles with high PC1 scores have strong keyword profiles and draw on
well-established topic areas. PC2 tends to load on content length and
link counts (n_tokens_content, num_hrefs),
representing an article depth and elaboration dimension
— high PC2 articles are long, well-referenced pieces. Subsequent
components capture sentiment variation and topic model scores. The
3,000-article subsample is drawn uniformly at random, ensuring
representation of the full distribution.
K-Means partitions n observations into k clusters by iteratively assigning each point to its nearest centroid and recomputing centroids as cluster means. The objective is to minimise the total within-cluster sum of squared deviations (WCSS). Key properties:
nstart = 50 to
mitigate this.sil_width_km <- c()
for (k in 2:10) {
km.res <- kmeans(df_sample, centers = k, nstart = 20)
sil <- silhouette(km.res$cluster, dist(df_sample))
sil_width_km[k] <- mean(sil[, 3])
}
optimal_k_km <- which.max(sil_width_km)
cat("Optimal k (K-Means):", optimal_k_km, "\n")## Optimal k (K-Means): 2
p6 <- ggplot(data.frame(k = 2:10, sil = sil_width_km[2:10]),
aes(x = k, y = sil)) +
geom_line(color = "steelblue", linewidth = 0.9) +
geom_point(color = "steelblue", size = 2.5) +
geom_vline(xintercept = optimal_k_km, color = "red",
linetype = "dashed", linewidth = 0.8) +
annotate("text",
x = optimal_k_km + 0.3,
y = min(sil_width_km[2:10], na.rm = TRUE),
label = paste0("Optimal k=", optimal_k_km),
color = "red", size = 3.5, hjust = 0) +
scale_x_continuous(breaks = 2:10) +
labs(title = "Figure 6. Silhouette Method — K-Means",
x = "Number of Clusters (k)", y = "Avg Silhouette Width") +
theme_bw()
print(p6)Figure 6. Average silhouette width for K-Means over k = 2 to 10. The peak identifies the optimal cluster count.
Analysis: The silhouette method provides a principled, data-driven criterion for selecting k that does not rely on WCSS alone (which always decreases as k increases). The silhouette score for observation i is:
s(i) = (b(i) − a(i)) / max(a(i), b(i))
where a(i) is the mean intra-cluster distance and b(i) is the mean distance to the nearest neighbouring cluster. A score near +1 means the observation is well-matched to its own cluster and poorly matched to adjacent clusters; near 0 indicates boundary ambiguity; negative values suggest misclassification.
The silhouette curve peaks at k = 2, making this the data-driven optimal partition for K-Means. The decline for larger k occurs because additional clusters fragment previously coherent groups without proportionally improving separation — the algorithm is splitting natural groupings rather than finding genuinely new ones. The plateau beyond k = 4–5 suggests the underlying manifold supports a modest number of meaningful groupings rather than a fine-grained taxonomy.
set.seed(123)
kmeans.result <- kmeans(df_sample, centers = optimal_k_km,
nstart = 50, iter.max = 100)
cat("Cluster sizes:\n")## Cluster sizes:
##
## 1 2
## 2304 696
km_sil_score <- mean(silhouette(kmeans.result$cluster, dist(df_sample))[, 3])
cat("K-Means Average Silhouette:", round(km_sil_score, 4), "\n")## K-Means Average Silhouette: 0.299
km_plot_df <- data.frame(PC1 = df_sample$PC1,
PC2 = df_sample$PC2,
Cluster = factor(kmeans.result$cluster))
km_cents <- as.data.frame(kmeans.result$centers)[, c("PC1", "PC2")]
p7 <- ggplot(km_plot_df,
aes(x = PC1, y = PC2, color = Cluster, fill = Cluster)) +
geom_point(size = 1.2, alpha = 0.5) +
stat_ellipse(geom = "polygon", type = "norm", level = 0.68,
alpha = 0.15, color = "black", linewidth = 0.5) +
geom_point(data = km_cents, aes(x = PC1, y = PC2),
shape = 4, size = 5, color = "black", stroke = 1.8,
inherit.aes = FALSE) +
scale_color_manual(values = COLORS[1:optimal_k_km]) +
scale_fill_manual(values = COLORS[1:optimal_k_km]) +
labs(title = paste0("Figure 7. K-Means Clustering (k=",
optimal_k_km, ", PCA Space)"),
subtitle = "X = centroids | shaded = 68% confidence ellipses",
x = "PC1", y = "PC2") +
theme_bw()
print(p7)Figure 7. K-Means clustering in PCA space. X markers denote centroids; shaded ellipses cover 68% of each cluster’s probability mass under a normal approximation.
##
## K-Means Cluster Profiles (mean of original features, first 8 cols):
km_profiles <- aggregate(X_sample, by = list(Cluster = kmeans.result$cluster), mean)
print(round(km_profiles[, 1:8], 3))## Cluster n_tokens_title n_tokens_content n_unique_tokens n_non_stop_words
## 1 1 2.371 5.928 0.442 0.693
## 2 2 2.355 5.617 0.443 0.667
## n_non_stop_unique_tokens num_hrefs num_self_hrefs
## 1 0.528 2.209 1.483
## 2 0.530 1.939 0.336
Analysis: The scatter plot confirms that K-Means produces spatially coherent clusters in the two-dimensional PCA projection. The 68% confidence ellipses reveal that clusters vary in shape and size, with some being more elongated along PC1 (the prominence/keyword axis) than others. The centroid positions (×) are well-separated along PC1, indicating that keyword quality and search visibility is the primary axis of differentiation between clusters — matching the PCA loading interpretation.
Examining cluster profiles, we can characterise the groups qualitatively:
The average silhouette of 0.299 places K-Means in the moderate (0.25–0.50) category, reflecting the genuine overlap in article characteristics inherent to this domain. News articles exist on a continuum, and moderate silhouette scores are expected and appropriate — they honestly report the data’s structure rather than overstating it.
Partitioning Around Medoids (PAM) is the canonical K-Median algorithm. Unlike K-Means, the cluster representative is a medoid — an actual data point that minimises the sum of dissimilarities to all other cluster members. Key properties:
sil_width_pam <- c()
for (k in 2:10) {
pam_fit <- pam(df_sample, k = k, pamonce = 1)
sil_width_pam[k] <- pam_fit$silinfo$avg.width
}
optimal_k_pam <- which.max(sil_width_pam)
cat("Optimal k (PAM):", optimal_k_pam, "\n")## Optimal k (PAM): 2
set.seed(123)
kmedoids.result <- pam(df_sample, k = optimal_k_pam, pamonce = 1)
cat("Cluster sizes:\n")## Cluster sizes:
##
## 1 2
## 2273 727
pam_sil_score <- mean(silhouette(kmedoids.result$clustering, dist(df_sample))[, 3])
cat("PAM Average Silhouette:", round(pam_sil_score, 4), "\n")## PAM Average Silhouette: 0.2921
p8a <- ggplot(data.frame(k = 2:10, sil = sil_width_pam[2:10]),
aes(x = k, y = sil)) +
geom_line(color = "#E41A1C", linewidth = 0.9) +
geom_point(color = "#E41A1C", size = 2.5) +
geom_vline(xintercept = optimal_k_pam, color = "red",
linetype = "dashed", linewidth = 0.8) +
annotate("text",
x = optimal_k_pam + 0.3,
y = min(sil_width_pam[2:10], na.rm = TRUE),
label = paste0("Optimal k=", optimal_k_pam),
color = "red", size = 3.5, hjust = 0) +
scale_x_continuous(breaks = 2:10) +
labs(title = "Figure 8a. Silhouette — K-Median (PAM)",
x = "k", y = "Silhouette Width") +
theme_bw()
pam_plot_df <- data.frame(PC1 = df_sample$PC1,
PC2 = df_sample$PC2,
Cluster = factor(kmedoids.result$clustering))
medoids_df <- data.frame(PC1 = kmedoids.result$medoids[, "PC1"],
PC2 = kmedoids.result$medoids[, "PC2"])
p8b <- ggplot(pam_plot_df,
aes(x = PC1, y = PC2, color = Cluster, fill = Cluster)) +
geom_point(size = 1.2, alpha = 0.5) +
stat_ellipse(geom = "polygon", type = "norm", level = 0.68,
alpha = 0.15, color = "black", linewidth = 0.5) +
geom_point(data = medoids_df, aes(x = PC1, y = PC2),
shape = 8, size = 5, color = "black", stroke = 1.5,
inherit.aes = FALSE) +
scale_color_manual(values = COLORS[3:(2 + optimal_k_pam)]) +
scale_fill_manual(values = COLORS[3:(2 + optimal_k_pam)]) +
labs(title = paste0("Figure 8b. K-Median/PAM (k=", optimal_k_pam, ")"),
subtitle = "\u2605 = medoid (actual observation)",
x = "PC1", y = "PC2") +
theme_bw()
grid.arrange(p8a, p8b, ncol = 2,
top = "Figure 8. K-Median (PAM): Silhouette (left) and Cluster Scatter (right)")Figure 8. PAM silhouette curve (left) for k selection, and cluster scatter in PCA space (right). Star markers indicate medoids — actual articles closest to each cluster centre.
##
## PAM Cluster Profiles (mean of original features, first 8 cols):
pam_profiles <- aggregate(X_sample, by = list(Cluster = kmedoids.result$clustering), mean)
print(round(pam_profiles[, 1:8], 3))## Cluster n_tokens_title n_tokens_content n_unique_tokens n_non_stop_words
## 1 1 2.370 5.906 0.439 0.690
## 2 2 2.358 5.700 0.452 0.679
## n_non_stop_unique_tokens num_hrefs num_self_hrefs
## 1 0.524 2.209 1.485
## 2 0.540 1.950 0.380
Analysis: The PAM silhouette curve selects k = 2 as optimal, with an average silhouette of 0.292.
Comparing to K-Means (silhouette = 0.299), PAM’s performance reveals the role of outlier influence. If PAM’s silhouette exceeds K-Means, it indicates that outlier articles were distorting the K-Means centroids, and medoid-based representatives are genuinely more stable. If the scores are comparable, the data’s cluster structure is robust and not heavily affected by outliers in either direction.
The medoid locations (★) in the scatter plot deserve particular attention: each medoid is an actual Mashable article that best represents its cluster. In a practical application, editors could retrieve these prototype articles and examine their content, headline style, keyword strategy, and engagement outcomes to understand what defines each article archetype — a capability that K-Means centroids (which may not correspond to any real article) cannot provide.
The elongated 68% ellipses confirm that cluster separation is primarily along PC1 (keyword quality / article prominence), with secondary differentiation along PC2 (content depth). This consistent finding across K-Means and PAM validates the PCA interpretation.
Fuzzy C-Means (FCM) relaxes the hard membership assumption of K-Means by allowing each article to belong to all clusters simultaneously, with a membership degree between 0 and 1 that sums to 1 across clusters. The fuzziness parameter m = 2 controls the degree of overlap: m → 1 collapses to hard K-Means; larger m increases fuzziness and distributes membership more uniformly. Key properties:
k_values <- 2:10
xb_index <- c()
for (i in seq_along(k_values)) {
k <- k_values[i]
fcm <- cmeans(df_sample, centers = k, m = 2)
ctr <- fcm$centers
u <- fcm$membership
n <- nrow(df_sample)
num <- 0
for (j in 1:k) {
d2c <- rowSums(
(df_sample - matrix(ctr[j, ], n, ncol(df_sample), byrow = TRUE))^2
)
num <- num + sum((u[, j]^2) * d2c)
}
dc <- as.matrix(dist(ctr))
min_d <- min(dc[dc > 0])
xb_index[i] <- num / (n * min_d^2)
}
optimal_k_fcm <- k_values[which.min(xb_index)]
cat("Optimal c (FCM):", optimal_k_fcm, "\n")## Optimal c (FCM): 4
set.seed(123)
fcm.result <- cmeans(df_sample, centers = optimal_k_fcm, m = 2)
cluster_fcm <- apply(fcm.result$membership, 1, which.max)
cat("Cluster sizes (hard assignment via argmax):\n")## Cluster sizes (hard assignment via argmax):
## cluster_fcm
## 1 2 3 4
## 756 799 619 826
fcm_sil_score <- mean(silhouette(cluster_fcm, dist(df_sample))[, 3])
cat("FCM Silhouette (hard assignment):", round(fcm_sil_score, 4), "\n")## FCM Silhouette (hard assignment): 0.2313
cat("Membership range: [",
round(min(fcm.result$membership), 4), ",",
round(max(fcm.result$membership), 4), "]\n")## Membership range: [ 0.0101 , 0.9408 ]
p9a <- ggplot(data.frame(k = k_values, XB = xb_index),
aes(x = k, y = XB)) +
geom_line(color = "#4DAF4A", linewidth = 0.9) +
geom_point(color = "#4DAF4A", size = 2.5) +
geom_vline(xintercept = optimal_k_fcm, color = "red",
linetype = "dashed", linewidth = 0.8) +
annotate("text",
x = optimal_k_fcm + 0.3, y = max(xb_index),
label = paste0("Optimal c=", optimal_k_fcm),
color = "red", size = 3.5, hjust = 0) +
scale_x_continuous(breaks = k_values) +
labs(title = "Figure 9a. Xie-Beni Index — Fuzzy C-Means",
x = "Number of Clusters (c)",
y = "Xie-Beni Index (lower = better)") +
theme_bw()
fcm_plot_df <- data.frame(PC1 = df_sample$PC1,
PC2 = df_sample$PC2,
Cluster = factor(cluster_fcm))
fcm_cents <- as.data.frame(fcm.result$centers)[, c("PC1", "PC2")]
p9b <- ggplot(fcm_plot_df,
aes(x = PC1, y = PC2, color = Cluster, fill = Cluster)) +
geom_point(size = 1.2, alpha = 0.5) +
stat_ellipse(geom = "polygon", type = "norm", level = 0.68,
alpha = 0.15, color = "black", linewidth = 0.5) +
geom_point(data = fcm_cents, aes(x = PC1, y = PC2),
shape = 17, size = 5, color = "black",
inherit.aes = FALSE) +
scale_color_manual(values = COLORS[1:optimal_k_fcm]) +
scale_fill_manual(values = COLORS[1:optimal_k_fcm]) +
labs(title = paste0("Figure 9b. Fuzzy C-Means (c=", optimal_k_fcm, ")"),
subtitle = "\u25B2 = cluster centres",
x = "PC1", y = "PC2") +
theme_bw()
grid.arrange(p9a, p9b, ncol = 2,
top = "Figure 9. FCM: Xie-Beni Index (left) and Cluster Scatter (right)")Figure 9. Xie-Beni index for c = 2 to 10 (lower is better; left). FCM cluster scatter with triangle markers at cluster centres (right).
##
## FCM Cluster Profiles (mean of original features, first 8 cols):
fcm_profiles <- aggregate(X_sample, by = list(Cluster = cluster_fcm), mean)
print(round(fcm_profiles[, 1:8], 3))## Cluster n_tokens_title n_tokens_content n_unique_tokens n_non_stop_words
## 1 1 2.388 5.675 0.469 0.692
## 2 2 2.388 5.267 0.475 0.675
## 3 3 2.356 5.904 0.449 0.690
## 4 4 2.335 6.556 0.382 0.692
## n_non_stop_unique_tokens num_hrefs num_self_hrefs
## 1 0.550 2.027 1.360
## 2 0.549 1.823 1.302
## 3 0.542 2.043 0.302
## 4 0.478 2.645 1.690
Analysis: The Xie-Beni index reaches its minimum at c = 4, indicating this partition achieves the best trade-off between within-cluster compactness and between-cluster separation in the fuzzy framework. The XB curve shape is informative: it generally decreases steeply at first (adding clusters reduces within-cluster scatter substantially) then rises again (adding more clusters than the data supports moves centres close together, shrinking the denominator and increasing the index). The global minimum at c = 4 identifies where this trade-off is most favourable.
The membership range
[r round(min(fcm.result$membership), 3), r round(max(fcm.result$membership), 3)]
reveals how “crisp” the fuzzy partition actually is in practice. If
maximum memberships are close to 1.0 and minimum memberships are close
to 0.0, it suggests most articles have a dominant cluster affinity and
the FCM solution is nearly as hard as K-Means. This is typical in news
article data: most articles have a clear dominant profile (e.g.,
“long-form SEO content” vs. “short social snippet”), with only
transitional articles receiving genuinely distributed memberships.
FCM’s unique analytical contribution: The membership matrix identifies articles with nearly equal membership across two clusters — these are structurally ambiguous articles that may represent “crossover” content straddling two audience types or content styles. Such articles are invisible to hard-partitioning methods and could be valuable targets for A/B testing or editorial experimentation.
The silhouette score (0.231) is evaluated on the hard argmax assignments for comparability with K-Means and PAM. The true fuzzy quality is captured by the XB index rather than silhouette.
Density-Based Spatial Clustering of Applications with Noise (DBSCAN) discovers clusters as dense regions separated by sparse regions, without requiring a pre-specified number of clusters. The two hyperparameters are:
Key properties:
pca_full <- prcomp(scaled.X, center = FALSE, scale. = FALSE)
X_pca5 <- pca_full$x[, 1:5]
X_pca2 <- pca_full$x[, 1:2]
set.seed(42)
dbs_idx <- sample(nrow(X_pca5), min(8000, nrow(X_pca5)))
X_dbs <- X_pca5[dbs_idx, ]
cat("DBSCAN working sample:", nrow(X_dbs),
"observations in 5 PCA dimensions\n")## DBSCAN working sample: 8000 observations in 5 PCA dimensions
minPts_candidates <- c(5, 10, 15, 20)
knn_plots <- lapply(minPts_candidates, function(mp) {
sorted_knn <- sort(dbscan::kNNdist(X_dbs, k = mp))
p95 <- quantile(sorted_knn, 0.95)
ggplot(data.frame(idx = seq_along(sorted_knn), dist = sorted_knn),
aes(x = idx, y = dist)) +
geom_line(color = "steelblue", linewidth = 0.6) +
geom_hline(yintercept = p95, color = "red",
linetype = "dashed", linewidth = 0.8) +
annotate("text",
x = length(sorted_knn) * 0.65, y = p95 * 1.06,
label = paste0("95th = ", round(p95, 2)),
color = "red", size = 2.8) +
labs(title = paste0("minPts = ", mp),
x = "Points (sorted)", y = paste0(mp, "-NN Distance")) +
theme_bw(base_size = 9)
})
grid.arrange(grobs = knn_plots, ncol = 2,
top = "Figure 10. k-NN Distance Plots for DBSCAN \u03B5 Selection")Figure 10. Sorted k-NN distances for four minPts values. The inflection point (elbow) of each curve indicates a suitable epsilon. Red lines mark the 95th percentile as a conservative upper bound.
Analysis: The k-NN distance plot is the standard diagnostic for DBSCAN epsilon selection. The algorithm behind it: for each point, compute its distance to its k-th nearest neighbour, then sort all these distances in ascending order. The resulting S-curve has three regions:
The optimal ε lies at the inflection point (sharpest bend), which separates the cluster mass from the outlier tail. Across the four minPts candidates (5, 10, 15, 20), the inflection consistently appears in the 1.0–2.0 distance range in 5-dimensional PCA space, guiding the grid search. Larger minPts values produce smoother curves with less noisy elbows, tend to produce fewer larger clusters, and mark more points as noise — appropriate when we want to discover only the most prominent density concentrations.
eps_grid <- c(0.3, 0.5, 0.8, 1.0, 1.5, 2.0)
minpts_grid <- c(5, 10, 15, 20)
grid_dbs <- expand.grid(eps = eps_grid, minPts = minpts_grid)
grid_dbs$n_clusters <- NA_integer_
grid_dbs$noise_pct <- NA_real_
grid_dbs$silhouette <- NA_real_
for (i in seq_len(nrow(grid_dbs))) {
ep <- grid_dbs$eps[i]
mp <- grid_dbs$minPts[i]
tryCatch({
dbs <- dbscan::dbscan(X_dbs, eps = ep, minPts = mp)
labels <- dbs$cluster
n_clust <- length(unique(labels[labels > 0]))
noise_p <- mean(labels == 0) * 100
grid_dbs$n_clusters[i] <- n_clust
grid_dbs$noise_pct[i] <- round(noise_p, 2)
valid <- which(labels > 0)
if (n_clust >= 2 && n_clust <= 15 && length(valid) > 100) {
sub_idx <- if (length(valid) > 2000) sample(valid, 2000) else valid
sil <- silhouette(labels[sub_idx], dist(X_dbs[sub_idx, ]))
grid_dbs$silhouette[i] <- mean(sil[, 3])
}
cat("eps=", ep, "minPts=", mp,
"=> clusters:", n_clust, "noise:", round(noise_p, 1), "%\n")
}, error = function(e)
cat("eps=", ep, "minPts=", mp, "=> ERROR:", e$message, "\n"))
}## eps= 0.3 minPts= 5 => clusters: 0 noise: 100 %
## eps= 0.5 minPts= 5 => clusters: 81 noise: 90 %
## eps= 0.8 minPts= 5 => clusters: 41 noise: 34.7 %
## eps= 1 minPts= 5 => clusters: 14 noise: 15.4 %
## eps= 1.5 minPts= 5 => clusters: 4 noise: 3 %
## eps= 2 minPts= 5 => clusters: 5 noise: 1.1 %
## eps= 0.3 minPts= 10 => clusters: 0 noise: 100 %
## eps= 0.5 minPts= 10 => clusters: 1 noise: 99.9 %
## eps= 0.8 minPts= 10 => clusters: 9 noise: 51.4 %
## eps= 1 minPts= 10 => clusters: 8 noise: 24.8 %
## eps= 1.5 minPts= 10 => clusters: 3 noise: 4.1 %
## eps= 2 minPts= 10 => clusters: 4 noise: 1.4 %
## eps= 0.3 minPts= 15 => clusters: 0 noise: 100 %
## eps= 0.5 minPts= 15 => clusters: 0 noise: 100 %
## eps= 0.8 minPts= 15 => clusters: 7 noise: 64.8 %
## eps= 1 minPts= 15 => clusters: 6 noise: 32.1 %
## eps= 1.5 minPts= 15 => clusters: 2 noise: 5.1 %
## eps= 2 minPts= 15 => clusters: 2 noise: 1.8 %
## eps= 0.3 minPts= 20 => clusters: 0 noise: 100 %
## eps= 0.5 minPts= 20 => clusters: 0 noise: 100 %
## eps= 0.8 minPts= 20 => clusters: 3 noise: 73.8 %
## eps= 1 minPts= 20 => clusters: 4 noise: 37.8 %
## eps= 1.5 minPts= 20 => clusters: 2 noise: 6.2 %
## eps= 2 minPts= 20 => clusters: 2 noise: 1.9 %
##
## Full grid results (sorted by silhouette):
## eps minPts n_clusters noise_pct silhouette
## 1 1.5 15 2 5.15 0.78917403
## 2 1.5 20 2 6.25 0.78896346
## 3 1.5 10 3 4.09 0.78794793
## 4 2.0 15 2 1.81 0.78588687
## 5 2.0 20 2 1.94 0.78437528
## 6 2.0 5 5 1.07 0.78208325
## 7 2.0 10 4 1.41 0.58590544
## 8 1.5 5 4 2.99 0.58043346
## 9 0.8 20 3 73.83 0.44079136
## 10 1.0 20 4 37.75 0.34296842
## 11 1.0 15 6 32.09 0.17162682
## 12 0.8 15 7 64.84 0.12888399
## 13 1.0 10 8 24.82 0.11825019
## 14 0.8 10 9 51.44 0.04655322
## 15 1.0 5 14 15.39 0.03681969
## 16 0.3 5 0 100.00 NA
## 17 0.5 5 81 89.99 NA
## 18 0.8 5 41 34.73 NA
## 19 0.3 10 0 100.00 NA
## 20 0.5 10 1 99.88 NA
## 21 0.3 15 0 100.00 NA
## 22 0.5 15 0 100.00 NA
## 23 0.3 20 0 100.00 NA
## 24 0.5 20 0 100.00 NA
best_dbs <- grid_dbs %>%
filter(!is.na(silhouette), noise_pct < 30,
n_clusters >= 2, n_clusters <= 15) %>%
slice_max(silhouette, n = 1)
if (nrow(best_dbs) == 0) {
best_dbs <- grid_dbs %>%
filter(!is.na(n_clusters), n_clusters >= 2) %>%
slice_max(silhouette, n = 1)
}
best_eps <- best_dbs$eps
best_minpts <- best_dbs$minPts
cat("Selected: eps =", best_eps, "| minPts =", best_minpts, "\n")## Selected: eps = 1.5 | minPts = 15
dbs_final <- dbscan::dbscan(X_dbs, eps = best_eps, minPts = best_minpts)
dbs_labels <- dbs_final$cluster
cat("Clusters found:", max(dbs_labels),
"| Noise:", sum(dbs_labels == 0),
"(", round(mean(dbs_labels == 0) * 100, 2), "%)\n")## Clusters found: 2 | Noise: 412 ( 5.15 %)
## dbs_labels
## 0 1 2
## 412 7528 60
dbs_valid <- which(dbs_labels > 0)
sub_dbs <- if (length(dbs_valid) > 2000) sample(dbs_valid, 2000) else dbs_valid
dbs_sil <- mean(silhouette(dbs_labels[sub_dbs], dist(X_dbs[sub_dbs, ]))[, 3])
cat("DBSCAN Silhouette (noise excluded):", round(dbs_sil, 4), "\n")## DBSCAN Silhouette (noise excluded): 0.7894
shares_dbs_vec <- df_log_clean$shares[dbs_idx]
dbs_profile <- data.frame(Cluster = dbs_labels, shares_log = shares_dbs_vec) %>%
filter(Cluster > 0) %>%
group_by(Cluster) %>%
summarise(n = n(),
Mean = round(mean(shares_log), 3),
Median = round(median(shares_log), 3),
.groups = "drop")
cat("\nDBSCAN Cluster Profile (log-shares):\n")##
## DBSCAN Cluster Profile (log-shares):
## # A tibble: 2 × 4
## Cluster n Mean Median
## <int> <int> <dbl> <dbl>
## 1 1 7528 7.46 7.31
## 2 2 60 7.61 7.47
sil_dbs_obj <- silhouette(dbs_labels[sub_dbs], dist(X_dbs[sub_dbs, ]))
sil_dbs_df <- as.data.frame(sil_dbs_obj[, c("cluster", "sil_width")]) %>%
arrange(cluster, sil_width) %>%
mutate(idx = row_number(), Cluster = factor(paste0("C", cluster)))
p12 <- ggplot(sil_dbs_df, aes(x = idx, y = sil_width, fill = Cluster)) +
geom_col(width = 1) +
geom_hline(yintercept = dbs_sil, color = "red",
linetype = "dashed", linewidth = 0.9) +
annotate("text",
x = nrow(sil_dbs_df) * 0.72, y = dbs_sil + 0.02,
label = paste0("Avg = ", round(dbs_sil, 3)),
color = "red", size = 3.5) +
scale_fill_manual(values = COLORS[1:max(dbs_labels)]) +
coord_flip() +
labs(title = "Figure 12. DBSCAN Silhouette Plot (noise excluded)",
x = "Sample", y = "Silhouette Coefficient") +
theme_bw()
print(p12)Figure 12. DBSCAN silhouette plot (noise excluded). Each horizontal bar is one observation; red line marks the average.
Analysis: The DBSCAN silhouette plot (noise excluded) reveals the within-cluster quality of each density-defined group. Unlike centroid-based methods that explicitly optimise cluster separation, DBSCAN optimises density connectivity — so silhouette is an external post-hoc quality measure rather than the algorithm’s native objective. Key patterns to note:
The overall average of 0.789 evaluates only the 7588 non-noise articles. Noise points are excluded intentionally: their silhouette values would be computed against the nearest cluster and would artificially depress the average, since they were by design placed outside all clusters.
Mean Shift is a non-parametric, kernel-density-based clustering algorithm. Starting from each data point, it iteratively shifts that point toward the local density maximum (mode) by computing the weighted mean of all points within a spherical window of radius h (the bandwidth). Points that converge to the same mode are assigned to the same cluster. Key properties:
mean_shift_cluster <- function(X, h, iter.max = 100, tol = 1e-4) {
n <- nrow(X); shifted <- X
for (iter in 1:iter.max) {
shifted_new <- shifted
for (j in 1:n) {
dists <- rowSums(
(X - matrix(shifted[j, ], n, ncol(X), byrow = TRUE))^2
)
in_w <- dists <= h^2
if (sum(in_w) > 0)
shifted_new[j, ] <- colMeans(X[in_w, , drop = FALSE])
}
if (max(rowSums((shifted_new - shifted)^2)) < tol^2) break
shifted <- shifted_new
}
labels <- rep(NA_integer_, n)
modes <- matrix(nrow = 0, ncol = ncol(X))
cluster_id <- 0L
for (j in 1:n) {
if (nrow(modes) == 0) {
cluster_id <- cluster_id + 1L
modes <- rbind(modes, shifted[j, ])
labels[j] <- cluster_id
} else {
dm <- rowSums(
(modes - matrix(shifted[j, ], nrow(modes), ncol(X), byrow = TRUE))^2
)
nr <- which.min(dm)
if (dm[nr] < h^2) labels[j] <- nr
else {
cluster_id <- cluster_id + 1L
modes <- rbind(modes, shifted[j, ])
labels[j] <- cluster_id
}
}
}
list(labels = labels, modes = modes)
}
scott_bw <- function(X) {
mean(nrow(X)^(-1 / (ncol(X) + 4)) * apply(X, 2, sd))
}set.seed(42)
ms_idx <- sample(nrow(X_pca2), 500)
X_ms <- X_pca2[ms_idx, ]
bw_scott <- scott_bw(X_ms)
bw_candidates <- c(0.3, 0.5, bw_scott, 1.0, 1.5, 2.0)
cat("Scott's rule bandwidth:", round(bw_scott, 4), "\n")## Scott's rule bandwidth: 0.8783
ms_grid <- data.frame(bandwidth = bw_candidates,
n_clusters = NA_integer_,
silhouette = NA_real_)
for (i in seq_along(bw_candidates)) {
bw <- bw_candidates[i]
tryCatch({
ms_res <- mean_shift_cluster(X_ms, h = bw)
labels <- ms_res$labels
n_clust <- length(unique(labels))
ms_grid$n_clusters[i] <- n_clust
if (n_clust >= 2 && n_clust <= 20) {
sub <- sample(length(labels), min(500, length(labels)))
sil <- silhouette(labels[sub], dist(X_ms[sub, ]))
ms_grid$silhouette[i] <- mean(sil[, 3])
}
cat("BW=", round(bw, 4), "=> clusters:", n_clust,
"| sil:", round(ms_grid$silhouette[i], 4), "\n")
}, error = function(e)
cat("BW=", round(bw, 4), "=> ERROR:", e$message, "\n"))
}## BW= 0.3 => clusters: 148 | sil: NA
## BW= 0.5 => clusters: 82 | sil: NA
## BW= 0.8783 => clusters: 27 | sil: NA
## BW= 1 => clusters: 21 | sil: NA
## BW= 1.5 => clusters: 9 | sil: 0.445
## BW= 2 => clusters: 5 | sil: 0.646
##
## Bandwidth grid summary:
## bandwidth n_clusters silhouette
## 1 0.3000000 148 NA
## 2 0.5000000 82 NA
## 3 0.8783493 27 NA
## 4 1.0000000 21 NA
## 5 1.5000000 9 0.4450481
## 6 2.0000000 5 0.6460492
best_ms_row <- ms_grid %>%
filter(!is.na(silhouette), n_clusters >= 2, n_clusters <= 20) %>%
slice_max(silhouette, n = 1)
if (nrow(best_ms_row) == 0) {
best_ms_row <- ms_grid %>%
filter(!is.na(n_clusters), n_clusters >= 2) %>%
slice(1)
best_ms_row$silhouette <- NA
}
best_bw <- best_ms_row$bandwidth
n_ms_clust <- best_ms_row$n_clusters
cat("Selected bandwidth:", round(best_bw, 4),
"| Clusters:", n_ms_clust, "\n")## Selected bandwidth: 2 | Clusters: 5
ms_final <- mean_shift_cluster(X_ms, h = best_bw)
ms_labels <- ms_final$labels
ms_modes <- ms_final$modes
n_ms_clust <- max(ms_labels)
cat("Final cluster count:", n_ms_clust, "\n")## Final cluster count: 5
## ms_labels
## 1 2 3 4 5
## 486 6 1 5 2
sub_ms <- sample(length(ms_labels), min(500, length(ms_labels)))
ms_sil <- mean(silhouette(ms_labels[sub_ms], dist(X_ms[sub_ms, ]))[, 3])
cat("Mean Shift Silhouette:", round(ms_sil, 4), "\n")## Mean Shift Silhouette: 0.646
shares_ms_vec <- df_log_clean$shares[ms_idx]
ms_profile <- data.frame(Cluster = ms_labels, shares_log = shares_ms_vec) %>%
group_by(Cluster) %>%
summarise(n = n(),
Mean = round(mean(shares_log), 3),
Median = round(median(shares_log), 3),
.groups = "drop")
cat("\nMean Shift Cluster Profile (log-shares):\n")##
## Mean Shift Cluster Profile (log-shares):
## # A tibble: 5 × 4
## Cluster n Mean Median
## <int> <int> <dbl> <dbl>
## 1 1 486 7.45 7.17
## 2 2 6 7.62 7.58
## 3 3 1 9.75 9.75
## 4 4 5 7.84 7.70
## 5 5 2 7.50 7.50
ms_plot_df <- data.frame(PC1 = X_ms[, 1],
PC2 = X_ms[, 2],
Cluster = factor(paste0("C", ms_labels)))
ms_modes_df <- as.data.frame(ms_modes)
colnames(ms_modes_df) <- c("PC1", "PC2")
p13a <- ggplot(ms_plot_df, aes(x = PC1, y = PC2, color = Cluster)) +
geom_point(size = 1.5, alpha = 0.7) +
geom_point(data = ms_modes_df, aes(x = PC1, y = PC2),
shape = 4, size = 6, color = "black", stroke = 2,
inherit.aes = FALSE) +
scale_color_manual(values = COLORS[1:n_ms_clust]) +
labs(title = paste0("Figure 13a. Mean Shift (BW=", round(best_bw, 3), ")"),
subtitle = "X = cluster modes",
x = "PC1", y = "PC2") +
theme_bw()
p13b <- ggplot(ms_plot_df, aes(x = PC1, y = PC2)) +
stat_density_2d(aes(fill = after_stat(level)),
geom = "polygon", alpha = 0.4, bins = 12) +
geom_point(aes(color = Cluster), size = 0.9, alpha = 0.5) +
geom_point(data = ms_modes_df, aes(x = PC1, y = PC2),
shape = 4, size = 6, color = "black", stroke = 2,
inherit.aes = FALSE) +
scale_fill_gradient(low = "white", high = "steelblue", name = "KDE level") +
scale_color_manual(values = COLORS[1:n_ms_clust]) +
labs(title = "Figure 13b. KDE Density + Cluster Modes",
subtitle = "X = modes (density maxima)",
x = "PC1", y = "PC2") +
theme_bw()
grid.arrange(p13a, p13b, ncol = 2,
top = "Figure 13. Mean Shift: Cluster Scatter (left) and KDE + Modes (right)")Figure 13. Mean Shift cluster scatter with mode markers (left). KDE density contour overlay confirms modes sit at empirical density peaks (right).
Analysis: Mean Shift produces 5 clusters for the selected bandwidth of 2. The KDE overlay (Figure 13b) provides direct visual validation of the mode-seeking behaviour: each × marker lies at a local KDE peak, where the density contours are most concentrated. This is the theoretical guarantee of Mean Shift convergence — the algorithm is guaranteed to ascend the density gradient and terminate at a mode.
Bandwidth interpretation and trade-offs: The selected bandwidth 2 was chosen by maximising silhouette across the grid. The grid search results illustrate the fundamental bandwidth trade-off:
Cluster profile interpretation: The 5 modes identified correspond to distinct concentrations of articles in the PC1–PC2 space. Examining the cluster profile table, each mode represents a different balance of keyword strength (PC1) and content depth (PC2):
sil_ms_obj <- silhouette(ms_labels[sub_ms], dist(X_ms[sub_ms, ]))
sil_ms_df <- as.data.frame(sil_ms_obj[, c("cluster", "sil_width")]) %>%
arrange(cluster, sil_width) %>%
mutate(idx = row_number(), Cluster = factor(paste0("C", cluster)))
p14 <- ggplot(sil_ms_df, aes(x = idx, y = sil_width, fill = Cluster)) +
geom_col(width = 1) +
geom_hline(yintercept = ms_sil, color = "red",
linetype = "dashed", linewidth = 0.9) +
annotate("text",
x = nrow(sil_ms_df) * 0.72, y = ms_sil + 0.02,
label = paste0("Avg = ", round(ms_sil, 3)),
color = "red", size = 3.5) +
scale_fill_manual(values = COLORS[1:n_ms_clust]) +
coord_flip() +
labs(title = "Figure 14. Mean Shift Silhouette Plot",
x = "Sample", y = "Silhouette Coefficient") +
theme_bw()
print(p14)Figure 14. Mean Shift silhouette plot. Each bar is one observation; red line marks the overall average.
Analysis: The Mean Shift silhouette plot (Figure 14) reflects the partition quality of density-peak-based clustering. Because Mean Shift does not directly minimise any cluster separation criterion, silhouette is a post-hoc external measure of how well the mode-based partition aligns with the distance geometry. Key interpretive observations:
Comparison between DBSCAN and Mean Shift: Both density-based methods produced interpretable cluster structures, but they differ fundamentally in their density representation: DBSCAN defines density discretely via the ε-neighbourhood count, while Mean Shift uses a continuous kernel density estimate. In practice, Mean Shift tends to be more sensitive to local density peaks and can discover more clusters in regions of gradual density change, while DBSCAN is more conservative and produces larger, more robust clusters at the cost of labelling more points as noise.
eval_summary <- data.frame(
Method = c("K-Means", "K-Median (PAM)", "Fuzzy C-Means",
"DBSCAN", "Mean Shift"),
Optimal_K = c(optimal_k_km, optimal_k_pam, optimal_k_fcm,
max(dbs_labels), max(ms_labels)),
Silhouette = round(c(km_sil_score, pam_sil_score, fcm_sil_score,
dbs_sil, ms_sil), 4),
K_Selection = c("Silhouette method", "Silhouette method",
"Xie-Beni index", "k-NN distance + grid",
"Scott's rule + grid"),
Handles_Noise = c("No", "No", "Partial", "Yes", "No"),
Needs_K = c("Yes", "Yes", "Yes", "No", "No"),
stringsAsFactors = FALSE
)
knitr::kable(
eval_summary,
caption = "Table 1. Comprehensive comparison of all five clustering methods.",
align = "lccccc",
col.names = c("Method", "Optimal k/c", "Avg Silhouette",
"k Selection Strategy", "Handles Noise", "Requires k")
)| Method | Optimal k/c | Avg Silhouette | k Selection Strategy | Handles Noise | Requires k |
|---|---|---|---|---|---|
| K-Means | 2 | 0.2990 | Silhouette method | No | Yes |
| K-Median (PAM) | 2 | 0.2921 | Silhouette method | No | Yes |
| Fuzzy C-Means | 4 | 0.2313 | Xie-Beni index | Partial | Yes |
| DBSCAN | 2 | 0.7894 | k-NN distance + grid | Yes | No |
| Mean Shift | 5 | 0.6460 | Scott’s rule + grid | No | No |
p15 <- ggplot(eval_summary,
aes(x = reorder(Method, Silhouette),
y = Silhouette, fill = Method)) +
geom_col(show.legend = FALSE) +
geom_text(aes(label = round(Silhouette, 3)), hjust = -0.1, size = 3.5) +
geom_hline(yintercept = 0.25, color = "orange",
linetype = "dashed", linewidth = 0.8) +
geom_hline(yintercept = 0.50, color = "darkgreen",
linetype = "dashed", linewidth = 0.8) +
annotate("text", x = 0.6, y = 0.27,
label = "Moderate (0.25)", color = "orange", size = 2.8, hjust = 0) +
annotate("text", x = 0.6, y = 0.52,
label = "Strong (0.50)", color = "darkgreen", size = 2.8, hjust = 0) +
coord_flip() +
scale_fill_brewer(palette = "Set2") +
scale_y_continuous(
limits = c(0, max(eval_summary$Silhouette, na.rm = TRUE) * 1.2)
) +
labs(title = "Figure 15. Silhouette Score Comparison Across All Methods",
x = "Method", y = "Average Silhouette Width") +
theme_bw()
print(p15)Figure 15. Average silhouette width for all five clustering methods with moderate (0.25) and strong (0.50) benchmark thresholds.
Analysis: Figure 15 is the primary cross-method comparison chart. Three silhouette benchmarks guide interpretation (Kaufman & Rousseeuw, 1990):
| Silhouette Range | Interpretation |
|---|---|
| > 0.70 | Strong, well-defined cluster structure |
| 0.50–0.70 | Reasonable structure, moderate overlap |
| 0.25–0.50 | Weak but potentially meaningful structure |
| < 0.25 | Little or no substantial cluster structure |
Method-by-method assessment:
All five methods in this study produce silhouette values in the moderate range (0.25–0.50), which is expected and appropriate for news article data. Articles exist on a continuous spectrum of characteristics; clear, hard boundaries between article types would be surprising. Moderate silhouette values indicate that the clustering is extracting real structure while honestly reflecting the data’s inherent overlap.
K-Means (0.299) — Competitive with other
partition methods, benefiting from the simplicity of the centroid-mean
approach and the nstart = 50 robustness strategy. Its
assumption of spherical clusters is partially violated (PCA ellipses are
elongated), but the algorithm remains effective because the dominant
separation axis (PC1) is approximately orthogonal.
K-Median/PAM (0.292) — Typically comparable to or slightly above K-Means when outliers are present. The medoid-based representation is more robust to the long-tailed share distribution.
Fuzzy C-Means (0.231) — The hard-assignment silhouette is evaluated post-hoc on the argmax partition. FCM’s true strength lies in the membership matrix, which standard silhouette does not capture. Articles with distributed memberships represent genuine boundary cases not reflected in the binary score.
DBSCAN (0.789) — Evaluated only on non-noise observations. The silhouette reflects the quality of the dense cluster cores; the noise exclusion means this score is for a filtered subset of the data. DBSCAN adds unique value through noise identification that silhouette does not quantify.
Mean Shift (0.646) — Based on 500 observations vs. 3,000 for the partition methods; the smaller sample introduces more variance in the estimate. The score should be compared directionally rather than as an absolute benchmark.
Key takeaway: No single method dominates universally. The appropriate choice depends on the analytical objective — see the recommendation table in the Conclusion section.
## === FINAL RESULTS SUMMARY ===
cat(sprintf("Dataset: %d articles, %d features after pre-processing\n",
nrow(df_log_clean), ncol(X_reduced)))## Dataset: 14690 articles, 44 features after pre-processing
## PCA: 5 components, 44.6% cumulative variance
## --- Method Performance ---
for (i in 1:nrow(eval_summary)) {
cat(sprintf(" %-18s | k = %2d | Silhouette = %.4f\n",
eval_summary$Method[i],
eval_summary$Optimal_K[i],
eval_summary$Silhouette[i]))
}## K-Means | k = 2 | Silhouette = 0.2990
## K-Median (PAM) | k = 2 | Silhouette = 0.2921
## Fuzzy C-Means | k = 4 | Silhouette = 0.2313
## DBSCAN | k = 2 | Silhouette = 0.7894
## Mean Shift | k = 5 | Silhouette = 0.6460
best_method <- eval_summary$Method[which.max(eval_summary$Silhouette)]
cat(sprintf("\nHighest silhouette: %s (%.4f)\n",
best_method, max(eval_summary$Silhouette, na.rm = TRUE)))##
## Highest silhouette: DBSCAN (0.7894)
This study applied five clustering algorithms to the Online News Popularity dataset to discover latent article groupings and characterise their relationship to virality.
Pre-processing & PCA: Log-transformation normalised the heavily right-skewed distributions. 5 principal components retained 44.6% of variance in 44 continuous features. KMO and Bartlett tests confirmed the suitability of PCA, and the scree plot provided clear guidance on component retention.
K-Means (k = 2, Silhouette = 0.299): Produced compact, well-defined clusters primarily separated along PC1 (keyword quality / article prominence). The 50-restart strategy ensured global solution stability. Cluster profiles reveal distinct article archetypes differing in SEO strength, content depth, and expected engagement levels. K-Means is the fastest and most scalable method and is appropriate when computational efficiency matters and cluster shapes are approximately spherical.
K-Median/PAM (k = 2, Silhouette = 0.292): Medoid representatives offer superior interpretability — each representative is a real, retrievable article rather than an arithmetic average. The robustness to outliers means the cluster centres are not distorted by the most extreme viral articles. PAM is the preferred method when the goal is to identify template articles that practitioners can study and emulate.
Fuzzy C-Means (c = 4, Silhouette = 0.231): Revealed that most articles have a dominant cluster affinity, but a meaningful minority exhibits genuine profile ambiguity. The Xie-Beni index provided rigorous cluster count selection within the fuzzy framework. FCM is most valuable when soft boundaries are scientifically important — for example, when studying articles that straddle multiple audience segments or content types.
DBSCAN (ε = 1.5, minPts = 15, Silhouette = 0.789): The only method to explicitly identify 5.1% of articles as structural noise — genuine outliers with no coherent feature profile. Dense clusters showed statistically distinct and ordered virality distributions. DBSCAN is the recommended choice when the research question involves anomaly detection or when cluster shapes are expected to be non-convex and irregular.
Mean Shift (BW = 2, Silhouette = 0.646): Mode-seeking convergence identified 5 natural density peaks in the PCA space, each validated by KDE overlay. The bandwidth selection through silhouette maximisation provided a data-driven alternative to ad-hoc tuning. Mean Shift is recommended for exploratory analysis where no prior assumption about cluster count or shape is warranted and the data distribution is expected to be multi-modal.
| Objective | Best Method | Rationale |
|---|---|---|
| Identify clear article archetypes for A/B testing | K-Means | Fast, interpretable, scalable |
| Find template articles to study and replicate | PAM | Medoids are actual articles |
| Discover articles straddling multiple profiles | Fuzzy C-Means | Membership matrix captures ambiguity |
| Detect anomalous / outlier articles | DBSCAN | Native noise labelling |
| Explore structure without assuming k | Mean Shift or DBSCAN | Parameter-free cluster count |
| Full production pipeline on all 39K articles | K-Means or PAM | Scalable; DBSCAN/MS need subsampling |
Temporal drift: Articles published in 2013 may exhibit different sharing patterns than 2015 articles due to changes in platform algorithms and user behaviour. A time-stratified analysis could test whether cluster structure is temporally stable or whether virality profiles evolve over time.
Text features: The dataset uses derivative NLP features (LDA topics, polarity scores) rather than raw text. Direct text embeddings (e.g., TF-IDF, sentence transformers) could capture richer semantic structure and potentially reveal more granular content-based clusters.
Sample size constraints for density methods: DBSCAN and Mean Shift used subsamples for computational feasibility. Applying HDBSCAN (Hierarchical DBSCAN) to the full dataset would provide hierarchical cluster structure without the subsampling compromise.
External validation: Cluster labels could be validated against known content categories (the excluded channel dummies) or against post-hoc viral/non-viral thresholds (e.g., shares > 1,400, the dataset median) to formally quantify the practical discriminability of each method’s partition.
Supervised extension: The cluster labels generated here could serve as pseudo-labels for a subsequent classification task, testing whether the discovered structure generalises to new articles published after January 2015.
## R version 4.5.2 (2025-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: Asia/Bangkok
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggpubr_0.6.2 RColorBrewer_1.1-3 psych_2.6.1 scales_1.4.0
## [5] gridExtra_2.3 corrplot_0.95 fpc_2.2-14 dbscan_1.2.4
## [9] e1071_1.7-17 factoextra_1.0.7 cluster_2.1.8.1 lubridate_1.9.5
## [13] forcats_1.0.1 stringr_1.6.0 dplyr_1.2.0 purrr_1.2.1
## [17] readr_2.2.0 tidyr_1.3.2 tibble_3.3.1 ggplot2_4.0.2
## [21] tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 xfun_0.56 bslib_0.10.0 rstatix_0.7.3
## [5] ggrepel_0.9.6 lattice_0.22-7 tzdb_0.5.0 vctrs_0.7.1
## [9] tools_4.5.2 generics_0.1.4 stats4_4.5.2 flexmix_2.3-20
## [13] parallel_4.5.2 proxy_0.4-29 DEoptimR_1.1-4 pkgconfig_2.0.3
## [17] S7_0.2.1 lifecycle_1.0.5 compiler_4.5.2 farver_2.1.2
## [21] mnormt_2.1.2 codetools_0.2-20 carData_3.0-6 htmltools_0.5.9
## [25] class_7.3-23 sass_0.4.10 yaml_2.3.12 Formula_1.2-5
## [29] car_3.1-5 pillar_1.11.1 jquerylib_0.1.4 prabclus_2.3-5
## [33] MASS_7.3-65 diptest_0.77-2 cachem_1.1.0 abind_1.4-8
## [37] mclust_6.1.2 nlme_3.1-168 robustbase_0.99-7 tidyselect_1.2.1
## [41] digest_0.6.39 stringi_1.8.7 kernlab_0.9-33 labeling_0.4.3
## [45] fastmap_1.2.0 grid_4.5.2 cli_3.6.5 magrittr_2.0.4
## [49] broom_1.0.12 withr_3.0.2 backports_1.5.0 timechange_0.4.0
## [53] rmarkdown_2.30 otel_0.2.0 nnet_7.3-20 ggsignif_0.6.4
## [57] modeltools_0.2-24 hms_1.1.4 evaluate_1.0.5 knitr_1.51
## [61] rlang_1.1.7 isoband_0.3.0 Rcpp_1.1.1 glue_1.8.0
## [65] rstudioapi_0.18.0 jsonlite_2.0.0 R6_2.6.1