knitr::opts_chunk$set(
echo = TRUE,
warning = FALSE,
message = FALSE,
fig.width = 7.5,
fig.height = 5.5,
dpi = 140
)
required_pkgs <- c(
"MASS", "mvtnorm", "mclust", "cluster", "ggplot2", "dplyr", "tidyr",
"purrr", "stringr", "tibble", "jsonlite", "readr", "glue", "patchwork"
)
to_install <- required_pkgs[!vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)]
if (length(to_install) > 0) {
install.packages(to_install, repos = "https://cloud.r-project.org")
}
invisible(lapply(required_pkgs, library, character.only = TRUE))
theme_set(theme_minimal(base_size = 12))
set.seed(1234)
This notebook provides a journal-quality synthetic benchmarking harness for comparing a determinant-based clustering method (Wilks-lambda style) against K-means. In this R version, the full generator suite, K-means evaluation, visual diagnostics, export utilities, and summary tables are implemented directly. The Wilks method is left as a plug-in adapter so that an external implementation can be inserted without changing the benchmark protocol.
The benchmark design varies: - cluster separation and overlap, - covariance heterogeneity and anisotropy, - class imbalance, - heavy tails and contamination, - a nonconvex stress test.
The dimensionality is deliberately restricted to \(d \in \{2,5,10\}\) so that the geometry remains interpretable.
K-means is a trace-based compactness criterion and is well aligned with approximately spherical clusters of similar scale. A Wilks-lambda style criterion is instead sensitive to generalized within-cluster scatter, and is therefore especially interesting when covariance geometry differs across clusters. The benchmark families below are chosen to isolate those distinctions.
if (!requireNamespace("graphicalExtremes", quietly = TRUE)) {
install.packages("graphicalExtremes")
}
safe_dir_create <- function(path) {
if (!dir.exists(path)) dir.create(path, recursive = TRUE, showWarnings = FALSE)
invisible(path)
}
make_sigma <- function(d, eigenvalues = NULL, angle = 0, random_rotation = FALSE, seed = NULL) {
stopifnot(d >= 1)
if (!is.null(seed)) {
old_seed <- if (exists(".Random.seed", envir = .GlobalEnv, inherits = FALSE)) .Random.seed else NULL
on.exit({
if (is.null(old_seed)) {
if (exists(".Random.seed", envir = .GlobalEnv, inherits = FALSE)) {
rm(.Random.seed, envir = .GlobalEnv)
}
} else {
assign(".Random.seed", old_seed, envir = .GlobalEnv)
}
}, add = TRUE)
set.seed(seed)
}
if (is.null(eigenvalues)) {
return(random_spd(d, seed = NULL))
}
stopifnot(length(eigenvalues) == d)
stopifnot(all(eigenvalues > 0))
if (d == 1) {
return(matrix(eigenvalues[1], nrow = 1, ncol = 1))
}
Q <- diag(d)
if (d >= 2 && abs(angle) > 0) {
rotation_2d <- matrix(
c(cos(angle), -sin(angle),
sin(angle), cos(angle)),
nrow = 2,
byrow = TRUE
)
Q[1:2, 1:2] <- rotation_2d
}
if (isTRUE(random_rotation)) {
base_spd <- graphicalExtremes::generate_random_spd_matrix(d)
eig <- eigen(base_spd, symmetric = TRUE)
Q <- eig$vectors
}
Q %*% diag(eigenvalues, d) %*% t(Q)
}
random_spd <- function(d, eig_low = 0.3, eig_high = 2.5, seed = NULL) {
if (!is.null(seed)) {
old_seed <- if (exists(".Random.seed", envir = .GlobalEnv, inherits = FALSE)) .Random.seed else NULL
on.exit({
if (is.null(old_seed)) {
if (exists(".Random.seed", envir = .GlobalEnv, inherits = FALSE)) {
rm(.Random.seed, envir = .GlobalEnv)
}
} else {
assign(".Random.seed", old_seed, envir = .GlobalEnv)
}
}, add = TRUE)
set.seed(seed)
}
Sigma <- graphicalExtremes::generate_random_spd_matrix(d)
diag_scale <- sqrt(diag(Sigma))
Sigma <- sweep(sweep(Sigma, 1, diag_scale, "/"), 2, diag_scale, "/")
eig <- eigen(Sigma, symmetric = TRUE)
target <- seq(eig_low, eig_high, length.out = d)
target <- sample(target, size = d, replace = FALSE)
eig$vectors %*% diag(target, d) %*% t(eig$vectors)
}
sample_multivariate_t <- function(n, mu, Sigma, df) {
d <- length(mu)
Z <- MASS::mvrnorm(n, mu = rep(0, d), Sigma = Sigma)
g <- sqrt(df / rchisq(n, df))
X <- sweep(Z, 1, g, "*")
X <- sweep(X, 2, mu, "+")
X
}
sample_skewed_like <- function(n, mu, Sigma, skew_strength = 1.5) {
d <- length(mu)
Z <- MASS::mvrnorm(n, mu = rep(0, d), Sigma = Sigma)
X <- Z
X[, 1] <- X[, 1] + skew_strength * abs(Z[, 1])
sweep(X, 2, mu, "+")
}
compute_pairwise_center_distance <- function(centers) {
if (nrow(centers) < 2) return(NA_real_)
as.numeric(min(dist(centers)))
}
bhattacharyya_distance_gaussian <- function(mu1, Sigma1, mu2, Sigma2) {
Sigma <- 0.5 * (Sigma1 + Sigma2)
diff <- matrix(mu1 - mu2, ncol = 1)
term1 <- 0.125 * t(diff) %*% solve(Sigma, diff)
term2 <- 0.5 * log(det(Sigma) / sqrt(det(Sigma1) * det(Sigma2)))
as.numeric(term1 + term2)
}
mean_pairwise_bhattacharyya <- function(mus, Sigmas) {
K <- length(mus)
if (K < 2) return(NA_real_)
vals <- c()
for (i in 1:(K-1)) {
for (j in (i+1):K) {
vals <- c(vals, bhattacharyya_distance_gaussian(mus[[i]], Sigmas[[i]], mus[[j]], Sigmas[[j]]))
}
}
mean(vals)
}
mode_int <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
Each generator returns: - X: feature matrix, -
y_true: true labels, - meta: scenario
metadata, - optionally distributional objects used to define
difficulty.
generate_gaussian_mixture <- function(n, d, centers, Sigmas, probs = NULL, seed = NULL, family = "gaussian") {
if (!is.null(seed)) set.seed(seed)
K <- nrow(centers)
if (is.null(probs)) probs <- rep(1 / K, K)
counts <- as.vector(rmultinom(1, n, probs))
X_parts <- vector("list", K)
y_parts <- vector("list", K)
mus <- vector("list", K)
Sigma_list <- vector("list", K)
for (k in seq_len(K)) {
X_parts[[k]] <- MASS::mvrnorm(counts[k], mu = centers[k, ], Sigma = Sigmas[[k]])
y_parts[[k]] <- rep(k, counts[k])
mus[[k]] <- centers[k, ]
Sigma_list[[k]] <- Sigmas[[k]]
}
X <- do.call(rbind, X_parts)
y <- unlist(y_parts)
idx <- sample(seq_len(nrow(X)))
X <- X[idx, , drop = FALSE]
y <- y[idx]
list(
X = X,
y_true = y,
meta = list(
family = family,
n = n,
d = d,
K = K,
probs = probs,
min_center_distance = compute_pairwise_center_distance(centers),
mean_pairwise_bhattacharyya = mean_pairwise_bhattacharyya(mus, Sigma_list)
),
centers = centers,
Sigmas = Sigmas
)
}
generate_contaminated_gaussian <- function(n, d, centers, Sigmas, contamination = 0.08,
inflation = 12, probs = NULL, seed = NULL) {
if (!is.null(seed)) set.seed(seed)
K <- nrow(centers)
if (is.null(probs)) probs <- rep(1 / K, K)
counts <- as.vector(rmultinom(1, n, probs))
X_parts <- vector("list", K)
y_parts <- vector("list", K)
for (k in seq_len(K)) {
nk <- counts[k]
is_contam <- rbinom(nk, size = 1, prob = contamination)
X_clean <- MASS::mvrnorm(sum(is_contam == 0), mu = centers[k, ], Sigma = Sigmas[[k]])
X_contam <- MASS::mvrnorm(sum(is_contam == 1), mu = centers[k, ], Sigma = inflation * Sigmas[[k]])
Xk <- rbind(X_clean, X_contam)
if (nrow(Xk) > 0) Xk <- Xk[sample(seq_len(nrow(Xk))), , drop = FALSE]
X_parts[[k]] <- Xk
y_parts[[k]] <- rep(k, nk)
}
X <- do.call(rbind, X_parts)
y <- unlist(y_parts)
idx <- sample(seq_len(nrow(X)))
X <- X[idx, , drop = FALSE]
y <- y[idx]
list(
X = X,
y_true = y,
meta = list(
family = "contaminated_gaussian",
n = n,
d = d,
K = K,
probs = probs,
contamination = contamination,
inflation = inflation,
min_center_distance = compute_pairwise_center_distance(centers)
),
centers = centers,
Sigmas = Sigmas
)
}
generate_student_t_mixture <- function(n, d, centers, Sigmas, df = 4, probs = NULL, seed = NULL) {
if (!is.null(seed)) set.seed(seed)
K <- nrow(centers)
if (is.null(probs)) probs <- rep(1 / K, K)
counts <- as.vector(rmultinom(1, n, probs))
X_parts <- vector("list", K)
y_parts <- vector("list", K)
for (k in seq_len(K)) {
X_parts[[k]] <- sample_multivariate_t(counts[k], mu = centers[k, ], Sigma = Sigmas[[k]], df = df)
y_parts[[k]] <- rep(k, counts[k])
}
X <- do.call(rbind, X_parts)
y <- unlist(y_parts)
idx <- sample(seq_len(nrow(X)))
X <- X[idx, , drop = FALSE]
y <- y[idx]
list(
X = X,
y_true = y,
meta = list(
family = "student_t",
n = n,
d = d,
K = K,
probs = probs,
df = df,
min_center_distance = compute_pairwise_center_distance(centers)
),
centers = centers,
Sigmas = Sigmas
)
}
generate_skewed_mixture <- function(n, d, centers, Sigmas, skew_strength = 1.5, probs = NULL, seed = NULL) {
if (!is.null(seed)) set.seed(seed)
K <- nrow(centers)
if (is.null(probs)) probs <- rep(1 / K, K)
counts <- as.vector(rmultinom(1, n, probs))
X_parts <- vector("list", K)
y_parts <- vector("list", K)
for (k in seq_len(K)) {
X_parts[[k]] <- sample_skewed_like(counts[k], mu = centers[k, ], Sigma = Sigmas[[k]], skew_strength = skew_strength)
y_parts[[k]] <- rep(k, counts[k])
}
X <- do.call(rbind, X_parts)
y <- unlist(y_parts)
idx <- sample(seq_len(nrow(X)))
X <- X[idx, , drop = FALSE]
y <- y[idx]
list(
X = X,
y_true = y,
meta = list(
family = "skewed_like",
n = n,
d = d,
K = K,
probs = probs,
skew_strength = skew_strength,
min_center_distance = compute_pairwise_center_distance(centers)
),
centers = centers,
Sigmas = Sigmas
)
}
generate_two_moons <- function(n, noise = 0.08, seed = NULL) {
if (!is.null(seed)) set.seed(seed)
n1 <- floor(n / 2)
n2 <- n - n1
t1 <- runif(n1, 0, pi)
t2 <- runif(n2, 0, pi)
X1 <- cbind(cos(t1), sin(t1))
X2 <- cbind(1 - cos(t2), -sin(t2) - 0.4)
X <- rbind(X1, X2) + matrix(rnorm(2 * n, sd = noise), ncol = 2)
y <- c(rep(1, n1), rep(2, n2))
idx <- sample(seq_len(n))
list(
X = X[idx, , drop = FALSE],
y_true = y[idx],
meta = list(
family = "two_moons",
n = n,
d = 2,
K = 2,
noise = noise
)
)
}
The configurations are chosen to cover easy, moderate, and difficult Gaussian settings, covariance heterogeneity, non-Gaussian departures, and a nonconvex stress test. Every 2D family has a dedicated visualization setup.
make_benchmarks <- function() {
benchmarks <- list()
# 2D Gaussian spherical difficulty ladder
benchmarks[["gaussian_spherical_easy_2d"]] <- list(
scenario = "gaussian_spherical_easy_2d",
d = 2, n = 450, K = 3, family = "gaussian_spherical", difficulty = "easy",
generator = function(seed) {
centers <- rbind(c(-4, 0), c(0, 4), c(4, 0))
Sigmas <- list(diag(2), diag(2), diag(2))
generate_gaussian_mixture(450, 2, centers, Sigmas, seed = seed, family = "gaussian_spherical")
}
)
benchmarks[["gaussian_spherical_moderate_2d"]] <- list(
scenario = "gaussian_spherical_moderate_2d",
d = 2, n = 450, K = 3, family = "gaussian_spherical", difficulty = "moderate",
generator = function(seed) {
centers <- rbind(c(-2.7, 0), c(0, 2.7), c(2.7, 0))
Sigmas <- list(diag(2), diag(2), diag(2))
generate_gaussian_mixture(450, 2, centers, Sigmas, seed = seed, family = "gaussian_spherical")
}
)
benchmarks[["gaussian_spherical_hard_2d"]] <- list(
scenario = "gaussian_spherical_hard_2d",
d = 2, n = 450, K = 3, family = "gaussian_spherical", difficulty = "hard",
generator = function(seed) {
centers <- rbind(c(-1.8, 0), c(0, 1.8), c(1.8, 0))
Sigmas <- list(diag(2), diag(2), diag(2))
generate_gaussian_mixture(450, 2, centers, Sigmas, seed = seed, family = "gaussian_spherical")
}
)
benchmarks[["gaussian_anisotropic_rotated_2d"]] <- list(
scenario = "gaussian_anisotropic_rotated_2d",
d = 2, n = 450, K = 3, family = "gaussian_anisotropic", difficulty = "moderate",
generator = function(seed) {
centers <- rbind(c(-3.5, -1.0), c(0, 3.5), c(3.5, -0.6))
Sigmas <- list(
make_sigma(2, c(2.8, 0.25), angle = 0.65),
make_sigma(2, c(2.1, 0.35), angle = -0.45),
make_sigma(2, c(1.7, 0.22), angle = 0.25)
)
generate_gaussian_mixture(450, 2, centers, Sigmas, seed = seed, family = "gaussian_anisotropic")
}
)
benchmarks[["gaussian_unequal_covariance_2d"]] <- list(
scenario = "gaussian_unequal_covariance_2d",
d = 2, n = 450, K = 3, family = "gaussian_unequal_covariance", difficulty = "hard",
generator = function(seed) {
centers <- rbind(c(-3.2, 0), c(0, 2.8), c(3.0, -0.5))
Sigmas <- list(
make_sigma(2, c(3.4, 0.3), angle = 0.35),
make_sigma(2, c(1.0, 0.5), angle = -0.8),
make_sigma(2, c(0.6, 0.18), angle = 0.1)
)
generate_gaussian_mixture(450, 2, centers, Sigmas, seed = seed, family = "gaussian_unequal_covariance")
}
)
benchmarks[["gaussian_imbalanced_2d"]] <- list(
scenario = "gaussian_imbalanced_2d",
d = 2, n = 500, K = 3, family = "gaussian_imbalanced", difficulty = "moderate",
generator = function(seed) {
centers <- rbind(c(-3.2, 0), c(0, 3.0), c(3.2, 0))
Sigmas <- list(diag(2), diag(2), diag(2))
generate_gaussian_mixture(500, 2, centers, Sigmas, probs = c(0.65, 0.25, 0.10), seed = seed, family = "gaussian_imbalanced")
}
)
benchmarks[["student_t_heavy_tail_2d"]] <- list(
scenario = "student_t_heavy_tail_2d",
d = 2, n = 450, K = 3, family = "student_t", difficulty = "hard",
generator = function(seed) {
centers <- rbind(c(-3.0, 0), c(0, 2.7), c(3.0, 0))
Sigmas <- list(diag(2), diag(2), diag(2))
generate_student_t_mixture(450, 2, centers, Sigmas, df = 4, seed = seed)
}
)
benchmarks[["skewed_like_2d"]] <- list(
scenario = "skewed_like_2d",
d = 2, n = 450, K = 3, family = "skewed_like", difficulty = "moderate",
generator = function(seed) {
centers <- rbind(c(-3.0, -0.7), c(0, 3.0), c(3.0, -0.4))
Sigmas <- list(diag(c(0.9, 0.5)), diag(c(0.8, 0.6)), diag(c(0.9, 0.45)))
generate_skewed_mixture(450, 2, centers, Sigmas, skew_strength = 1.35, seed = seed)
}
)
benchmarks[["contaminated_gaussian_2d"]] <- list(
scenario = "contaminated_gaussian_2d",
d = 2, n = 450, K = 3, family = "contaminated_gaussian", difficulty = "hard",
generator = function(seed) {
centers <- rbind(c(-3.2, 0), c(0, 2.8), c(3.2, 0))
Sigmas <- list(diag(2), diag(2), diag(2))
generate_contaminated_gaussian(450, 2, centers, Sigmas, contamination = 0.08, inflation = 14, seed = seed)
}
)
benchmarks[["two_moons_2d"]] <- list(
scenario = "two_moons_2d",
d = 2, n = 500, K = 2, family = "two_moons", difficulty = "stress_test",
generator = function(seed) {
generate_two_moons(500, noise = 0.09, seed = seed)
}
)
# Moderate dimensional quantitative cases
benchmarks[["gaussian_spherical_easy_5d"]] <- list(
scenario = "gaussian_spherical_easy_5d",
d = 5, n = 600, K = 3, family = "gaussian_spherical", difficulty = "easy",
generator = function(seed) {
centers <- rbind(
c(-4, 0, 0, 0, 0),
c(0, 4, 0, 0, 0),
c(4, 0, 0, 0, 0)
)
Sigmas <- list(diag(5), diag(5), diag(5))
generate_gaussian_mixture(600, 5, centers, Sigmas, seed = seed, family = "gaussian_spherical")
}
)
benchmarks[["gaussian_unequal_covariance_5d"]] <- list(
scenario = "gaussian_unequal_covariance_5d",
d = 5, n = 600, K = 3, family = "gaussian_unequal_covariance", difficulty = "hard",
generator = function(seed) {
centers <- rbind(
c(-3.5, 0, 0, 0, 0),
c(0, 3.2, 0, 0, 0),
c(3.2, 0, 0, 0, 0)
)
Sigmas <- list(
diag(c(3.0, 1.5, 0.6, 0.5, 0.5)),
diag(c(1.0, 0.9, 0.6, 0.6, 0.5)),
diag(c(0.6, 0.5, 0.5, 0.4, 0.3))
)
generate_gaussian_mixture(600, 5, centers, Sigmas, seed = seed, family = "gaussian_unequal_covariance")
}
)
benchmarks[["student_t_heavy_tail_5d"]] <- list(
scenario = "student_t_heavy_tail_5d",
d = 5, n = 600, K = 3, family = "student_t", difficulty = "hard",
generator = function(seed) {
centers <- rbind(
c(-3.2, 0, 0, 0, 0),
c(0, 3.2, 0, 0, 0),
c(3.2, 0, 0, 0, 0)
)
Sigmas <- list(diag(5), diag(5), diag(5))
generate_student_t_mixture(600, 5, centers, Sigmas, df = 5, seed = seed)
}
)
benchmarks[["gaussian_spherical_easy_10d"]] <- list(
scenario = "gaussian_spherical_easy_10d",
d = 10, n = 750, K = 3, family = "gaussian_spherical", difficulty = "easy",
generator = function(seed) {
centers <- rbind(
c(-4, 0, 0, 0, 0, 0, 0, 0, 0, 0),
c(0, 4, 0, 0, 0, 0, 0, 0, 0, 0),
c(4, 0, 0, 0, 0, 0, 0, 0, 0, 0)
)
Sigmas <- list(diag(10), diag(10), diag(10))
generate_gaussian_mixture(750, 10, centers, Sigmas, seed = seed, family = "gaussian_spherical")
}
)
benchmarks[["gaussian_unequal_covariance_10d"]] <- list(
scenario = "gaussian_unequal_covariance_10d",
d = 10, n = 750, K = 3, family = "gaussian_unequal_covariance", difficulty = "hard",
generator = function(seed) {
centers <- rbind(
c(-3.6, 0, 0, 0, 0, 0, 0, 0, 0, 0),
c(0, 3.3, 0, 0, 0, 0, 0, 0, 0, 0),
c(3.3, 0, 0, 0, 0, 0, 0, 0, 0, 0)
)
Sigmas <- list(
diag(c(3.2, 1.8, 1.1, 0.9, 0.8, 0.7, 0.7, 0.6, 0.5, 0.5)),
diag(c(1.2, 1.0, 0.9, 0.8, 0.8, 0.7, 0.7, 0.6, 0.6, 0.5)),
diag(c(0.7, 0.6, 0.6, 0.5, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3))
)
generate_gaussian_mixture(750, 10, centers, Sigmas, seed = seed, family = "gaussian_unequal_covariance")
}
)
benchmarks
}
BENCHMARKS <- make_benchmarks()
names(BENCHMARKS)
## [1] "gaussian_spherical_easy_2d" "gaussian_spherical_moderate_2d"
## [3] "gaussian_spherical_hard_2d" "gaussian_anisotropic_rotated_2d"
## [5] "gaussian_unequal_covariance_2d" "gaussian_imbalanced_2d"
## [7] "student_t_heavy_tail_2d" "skewed_like_2d"
## [9] "contaminated_gaussian_2d" "two_moons_2d"
## [11] "gaussian_spherical_easy_5d" "gaussian_unequal_covariance_5d"
## [13] "student_t_heavy_tail_5d" "gaussian_spherical_easy_10d"
## [15] "gaussian_unequal_covariance_10d"
dataset_to_tibble <- function(obj, scenario) {
X <- obj$X
d <- ncol(X)
out <- as_tibble(X)
names(out) <- paste0("x", seq_len(d))
out$y_true <- factor(obj$y_true)
out$scenario <- scenario
out
}
plot_dataset_2d <- function(obj, scenario, alpha = 0.75) {
dat <- dataset_to_tibble(obj, scenario)
ggplot(dat, aes(x = x1, y = x2, color = y_true)) +
geom_point(alpha = alpha, size = 1.7) +
coord_equal() +
labs(
title = scenario,
x = "x1", y = "x2", color = "True cluster"
) +
theme(legend.position = "bottom")
}
benchmarks_2d <- BENCHMARKS[vapply(BENCHMARKS, function(b) b$d == 2, logical(1))]
plots_2d <- imap(benchmarks_2d, function(b, nm) plot_dataset_2d(b$generator(seed = 2026), nm))
wrap_plots(plots_2d, ncol = 2)
The K-means baseline is fully implemented. The Wilks adapter is a placeholder into which an external implementation can be inserted.
fit_predict_kmeans <- function(X, K, nstart = 50, iter.max = 100) {
km <- kmeans(X, centers = K, nstart = nstart, iter.max = iter.max)
list(
cluster = km$cluster,
native_objective = km$tot.withinss
)
}
fit_predict_wilks <- function(X, K, ...) {
stop("Insert your external Wilks-lambda clustering implementation here and return a list with at least $cluster.")
}
METHODS <- list(
KMeans = fit_predict_kmeans
# Wilks = fit_predict_wilks
)
We use external metrics because the synthetic benchmarks come with true labels, and internal metrics as complementary diagnostics.
relabel_by_majority <- function(y_true, y_pred) {
map_df <- tibble(y_true = y_true, y_pred = y_pred) %>%
group_by(y_pred) %>%
summarise(mapped = mode_int(y_true), .groups = "drop")
out <- left_join(tibble(y_pred = y_pred), map_df, by = "y_pred")$mapped
as.integer(out)
}
misclassification_rate <- function(y_true, y_pred) {
y_hat <- relabel_by_majority(y_true, y_pred)
mean(y_hat != y_true)
}
calinski_harabasz_manual <- function(X, clusters) {
n <- nrow(X)
K <- length(unique(clusters))
if (K <= 1 || K >= n) return(NA_real_)
overall <- colMeans(X)
W <- 0
B <- 0
for (k in sort(unique(clusters))) {
Xk <- X[clusters == k, , drop = FALSE]
nk <- nrow(Xk)
if (nk <= 0) next
ck <- colMeans(Xk)
centered <- sweep(Xk, 2, ck, "-")
W <- W + sum(centered^2)
B <- B + nk * sum((ck - overall)^2)
}
(B / (K - 1)) / (W / (n - K))
}
davies_bouldin_manual <- function(X, clusters) {
cl <- sort(unique(clusters))
K <- length(cl)
if (K <= 1) return(NA_real_)
centers <- lapply(cl, function(k) colMeans(X[clusters == k, , drop = FALSE]))
S <- sapply(seq_along(cl), function(i) {
Xi <- X[clusters == cl[i], , drop = FALSE]
ci <- centers[[i]]
mean(sqrt(rowSums((sweep(Xi, 2, ci, "-"))^2)))
})
M <- matrix(NA_real_, K, K)
for (i in seq_len(K)) {
for (j in seq_len(K)) {
if (i != j) {
M[i, j] <- sqrt(sum((centers[[i]] - centers[[j]])^2))
}
}
}
R <- matrix(NA_real_, K, K)
for (i in seq_len(K)) {
for (j in seq_len(K)) {
if (i != j) {
R[i, j] <- (S[i] + S[j]) / M[i, j]
}
}
}
mean(apply(R, 1, max, na.rm = TRUE))
}
wilks_lambda_stat <- function(X, clusters, ridge = 1e-8) {
n <- nrow(X)
d <- ncol(X)
overall <- colMeans(X)
Tmat <- crossprod(sweep(X, 2, overall, "-"))
Wmat <- matrix(0, d, d)
for (k in sort(unique(clusters))) {
Xk <- X[clusters == k, , drop = FALSE]
ck <- colMeans(Xk)
centered <- sweep(Xk, 2, ck, "-")
Wmat <- Wmat + crossprod(centered)
}
Tmat <- Tmat + ridge * diag(d)
Wmat <- Wmat + ridge * diag(d)
det(Wmat) / det(Tmat)
}
compute_metrics <- function(X, y_true, y_pred, native_objective = NA_real_) {
sil <- tryCatch(cluster::silhouette(y_pred, dist(X)), error = function(e) NULL)
sil_mean <- if (is.null(sil)) NA_real_ else mean(sil[, "sil_width"])
tibble(
ARI = mclust::adjustedRandIndex(y_true, y_pred),
Misclassification = misclassification_rate(y_true, y_pred),
Silhouette = sil_mean,
Calinski_Harabasz = calinski_harabasz_manual(X, y_pred),
Davies_Bouldin = davies_bouldin_manual(X, y_pred),
Wilks_Lambda = wilks_lambda_stat(X, y_pred),
Native_Objective = native_objective
)
}
example_obj <- BENCHMARKS[["gaussian_unequal_covariance_2d"]]$generator(seed = 2027)
example_fit <- fit_predict_kmeans(example_obj$X, K = BENCHMARKS[["gaussian_unequal_covariance_2d"]]$K)
example_metrics <- compute_metrics(example_obj$X, example_obj$y_true, example_fit$cluster, example_fit$native_objective)
example_metrics
example_dat <- dataset_to_tibble(example_obj, "gaussian_unequal_covariance_2d")
example_dat$cluster_kmeans <- factor(example_fit$cluster)
p_true <- ggplot(example_dat, aes(x = x1, y = x2, color = y_true)) +
geom_point(alpha = 0.75, size = 1.8) +
coord_equal() +
labs(title = "True labels", color = "True") +
theme(legend.position = "bottom")
p_pred <- ggplot(example_dat, aes(x = x1, y = x2, color = cluster_kmeans)) +
geom_point(alpha = 0.75, size = 1.8) +
coord_equal() +
labs(title = "K-means partition", color = "Predicted") +
theme(legend.position = "bottom")
p_true + p_pred
run_benchmark <- function(benchmarks, methods, n_reps = 20, base_seed = 5000) {
results <- list()
row_id <- 1
for (scenario_name in names(benchmarks)) {
b <- benchmarks[[scenario_name]]
for (rep in seq_len(n_reps)) {
seed_data <- base_seed + 1000 * row_id + rep
obj <- b$generator(seed = seed_data)
X <- obj$X
y_true <- obj$y_true
for (method_name in names(methods)) {
fit <- methods[[method_name]](X, K = b$K)
y_pred <- fit$cluster
native_objective <- if (!is.null(fit$native_objective)) fit$native_objective else NA_real_
met <- compute_metrics(X, y_true, y_pred, native_objective)
met <- met %>%
mutate(
scenario = scenario_name,
family = b$family,
difficulty = b$difficulty,
d = b$d,
n = b$n,
K = b$K,
rep = rep,
seed_data = seed_data,
method = method_name
)
if (!is.null(obj$meta)) {
meta_vals <- obj$meta
if (!is.null(meta_vals$min_center_distance)) met$min_center_distance <- meta_vals$min_center_distance
if (!is.null(meta_vals$mean_pairwise_bhattacharyya)) met$mean_pairwise_bhattacharyya <- meta_vals$mean_pairwise_bhattacharyya
}
results[[length(results) + 1]] <- met
}
}
row_id <- row_id + 1
}
bind_rows(results) %>%
select(
scenario, family, difficulty, method, d, n, K, rep, seed_data,
everything()
)
}
results_tbl <- run_benchmark(BENCHMARKS, METHODS, n_reps = 15, base_seed = 7100)
results_tbl %>% glimpse()
## Rows: 225
## Columns: 18
## $ scenario <chr> "gaussian_spherical_easy_2d", "gaussian_sp…
## $ family <chr> "gaussian_spherical", "gaussian_spherical"…
## $ difficulty <chr> "easy", "easy", "easy", "easy", "easy", "e…
## $ method <chr> "KMeans", "KMeans", "KMeans", "KMeans", "K…
## $ d <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ n <dbl> 450, 450, 450, 450, 450, 450, 450, 450, 45…
## $ K <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, …
## $ rep <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,…
## $ seed_data <dbl> 8101, 8102, 8103, 8104, 8105, 8106, 8107, …
## $ ARI <dbl> 0.9934015, 0.9930235, 0.9864025, 0.9934712…
## $ Misclassification <dbl> 0.002222222, 0.002222222, 0.004444444, 0.0…
## $ Silhouette <dbl> 0.6590121, 0.6798893, 0.6892903, 0.6778953…
## $ Calinski_Harabasz <dbl> 1485.2314, 1616.5058, 1740.1043, 1606.8171…
## $ Davies_Bouldin <dbl> 0.4694331, 0.4383066, 0.4213721, 0.4385242…
## $ Wilks_Lambda <dbl> 0.02206226, 0.01703686, 0.01539706, 0.0180…
## $ Native_Objective <dbl> 978.0614, 822.6333, 814.8083, 891.0392, 81…
## $ min_center_distance <dbl> 5.656854, 5.656854, 5.656854, 5.656854, 5.…
## $ mean_pairwise_bhattacharyya <dbl> 5.333333, 5.333333, 5.333333, 5.333333, 5.…
summary_tbl <- results_tbl %>%
group_by(scenario, family, difficulty, method, d, n, K) %>%
summarise(
ARI_mean = mean(ARI, na.rm = TRUE),
ARI_sd = sd(ARI, na.rm = TRUE),
Misclassification_mean = mean(Misclassification, na.rm = TRUE),
Misclassification_sd = sd(Misclassification, na.rm = TRUE),
Silhouette_mean = mean(Silhouette, na.rm = TRUE),
Silhouette_sd = sd(Silhouette, na.rm = TRUE),
Calinski_Harabasz_mean = mean(Calinski_Harabasz, na.rm = TRUE),
Davies_Bouldin_mean = mean(Davies_Bouldin, na.rm = TRUE),
Wilks_Lambda_mean = mean(Wilks_Lambda, na.rm = TRUE),
Native_Objective_mean = mean(Native_Objective, na.rm = TRUE),
min_center_distance = mean(min_center_distance, na.rm = TRUE),
mean_pairwise_bhattacharyya = mean(mean_pairwise_bhattacharyya, na.rm = TRUE),
.groups = "drop"
)
family_summary_tbl <- results_tbl %>%
group_by(family, difficulty, method, d) %>%
summarise(
ARI_mean = mean(ARI, na.rm = TRUE),
Misclassification_mean = mean(Misclassification, na.rm = TRUE),
Silhouette_mean = mean(Silhouette, na.rm = TRUE),
Calinski_Harabasz_mean = mean(Calinski_Harabasz, na.rm = TRUE),
Davies_Bouldin_mean = mean(Davies_Bouldin, na.rm = TRUE),
Wilks_Lambda_mean = mean(Wilks_Lambda, na.rm = TRUE),
.groups = "drop"
)
summary_tbl
family_summary_tbl
ggplot(summary_tbl, aes(x = scenario, y = ARI_mean, fill = method)) +
geom_col(position = position_dodge(width = 0.8)) +
geom_errorbar(
aes(ymin = ARI_mean - ARI_sd, ymax = ARI_mean + ARI_sd),
position = position_dodge(width = 0.8),
width = 0.25
) +
coord_flip() +
labs(title = "External validity by scenario", x = NULL, y = "Mean ARI")
ggplot(summary_tbl, aes(x = scenario, y = Misclassification_mean, fill = method)) +
geom_col(position = position_dodge(width = 0.8)) +
geom_errorbar(
aes(ymin = pmax(0, Misclassification_mean - Misclassification_sd),
ymax = Misclassification_mean + Misclassification_sd),
position = position_dodge(width = 0.8),
width = 0.25
) +
coord_flip() +
labs(title = "Misclassification rate by scenario", x = NULL, y = "Mean misclassification rate")
internal_long <- summary_tbl %>%
select(scenario, method, Silhouette_mean, Calinski_Harabasz_mean, Davies_Bouldin_mean, Wilks_Lambda_mean) %>%
pivot_longer(cols = -c(scenario, method), names_to = "metric", values_to = "value")
ggplot(internal_long, aes(x = scenario, y = value, fill = method)) +
geom_col(position = position_dodge(width = 0.8)) +
facet_wrap(~ metric, scales = "free") +
coord_flip() +
labs(title = "Internal validity diagnostics", x = NULL, y = NULL)
Replace the placeholder function in Section 8 with your own Wilks implementation. It should return at least:
list(
cluster = integer_vector_of_cluster_assignments,
native_objective = optional_numeric_value
)
Then enable it inside METHODS, for example:
METHODS <- list(
KMeans = fit_predict_kmeans,
Wilks = fit_predict_wilks
)
After that, rerun Sections 12–14.
The export format is intentionally compact: each configuration is
saved as a single CSV with all replications stacked in the same file.
The replication index is stored in the rep column. This
avoids creating an excessive number of files.
export_generated_datasets <- function(benchmarks, n_reps = 3, output_dir = "/mnt/data/benchmark_exports", file_format = "csv",
base_seed = 9000) {
safe_dir_create(output_dir)
manifest_entries <- list()
for (scenario_name in names(benchmarks)) {
b <- benchmarks[[scenario_name]]
rep_tables <- list()
rep_meta <- list()
for (rep in seq_len(n_reps)) {
seed_data <- base_seed + which(names(benchmarks) == scenario_name) * 1000 + rep
obj <- b$generator(seed = seed_data)
dat <- as_tibble(obj$X)
names(dat) <- paste0("x", seq_len(ncol(obj$X)))
dat$y_true <- obj$y_true
dat$scenario <- scenario_name
dat$family <- b$family
dat$difficulty <- b$difficulty
dat$d <- b$d
dat$n <- nrow(obj$X)
dat$K <- b$K
dat$rep <- rep
dat$seed_data <- seed_data
rep_tables[[rep]] <- dat
rep_meta[[rep]] <- list(
rep = rep,
seed_data = seed_data,
scenario = scenario_name,
family = b$family,
difficulty = b$difficulty,
d = b$d,
n = nrow(obj$X),
K = b$K,
scenario_meta = obj$meta
)
}
all_reps_tbl <- bind_rows(rep_tables)
stem <- paste0(scenario_name, "__all_reps")
data_path <- file.path(output_dir, paste0(stem, ".", file_format))
meta_path <- file.path(output_dir, paste0(stem, ".metadata.json"))
if (tolower(file_format) == "csv") {
readr::write_csv(all_reps_tbl, data_path)
} else if (tolower(file_format) == "parquet") {
if (!requireNamespace("arrow", quietly = TRUE)) {
stop("Package 'arrow' is required for parquet export.")
}
arrow::write_parquet(all_reps_tbl, data_path)
} else {
stop("Unsupported file_format. Use 'csv' or 'parquet'.")
}
meta_json <- list(
scenario = scenario_name,
family = b$family,
difficulty = b$difficulty,
d = b$d,
n_reps = n_reps,
data_file = normalizePath(data_path, winslash = "/", mustWork = FALSE),
replications = rep_meta
)
writeLines(jsonlite::toJSON(meta_json, pretty = TRUE, auto_unbox = TRUE), con = meta_path)
manifest_entries[[length(manifest_entries) + 1]] <- tibble(
scenario = scenario_name,
family = b$family,
difficulty = b$difficulty,
dimension = b$d,
K = b$K,
n_reps = n_reps,
csv_path = normalizePath(data_path, winslash = "/", mustWork = FALSE),
metadata_path = normalizePath(meta_path, winslash = "/", mustWork = FALSE)
)
}
manifest_tbl <- bind_rows(manifest_entries)
manifest_path <- file.path(output_dir, "dataset_manifest.csv")
readr::write_csv(manifest_tbl, manifest_path)
manifest_tbl
}
EXPORT_DIR <- "/mnt/data/benchmark_exports"
manifest_tbl <- export_generated_datasets(
BENCHMARKS,
n_reps = 3,
output_dir = EXPORT_DIR,
file_format = "csv"
)
manifest_tbl
readr::write_csv(results_tbl, "/mnt/data/clustering_benchmark_results.csv")
readr::write_csv(summary_tbl, "/mnt/data/clustering_benchmark_summary.csv")
readr::write_csv(family_summary_tbl, "/mnt/data/clustering_benchmark_family_summary.csv")
c(
"/mnt/data/clustering_benchmark_results.csv",
"/mnt/data/clustering_benchmark_summary.csv",
"/mnt/data/clustering_benchmark_family_summary.csv"
)
## [1] "/mnt/data/clustering_benchmark_results.csv"
## [2] "/mnt/data/clustering_benchmark_summary.csv"
## [3] "/mnt/data/clustering_benchmark_family_summary.csv"
This section is useful when the notebook is run on a remote system and the user wants a single downloadable archive.
zip_output <- "/mnt/data/clustering_benchmark_bundle.zip"
paths_to_zip <- c(
list.files(EXPORT_DIR, full.names = TRUE),
"/mnt/data/clustering_benchmark_results.csv",
"/mnt/data/clustering_benchmark_summary.csv",
"/mnt/data/clustering_benchmark_family_summary.csv"
)
paths_to_zip <- paths_to_zip[file.exists(paths_to_zip)]
if (!requireNamespace("zip", quietly = TRUE)) {
install.packages("zip", repos = "https://cloud.r-project.org")
}
zip::zipr(zipfile = zip_output, files = paths_to_zip)
zip_output
## [1] "/mnt/data/clustering_benchmark_bundle.zip"
read_exported_scenario <- function(scenario_name, export_dir = "/mnt/data/benchmark_exports") {
manifest <- readr::read_csv(file.path(export_dir, "dataset_manifest.csv"), show_col_types = FALSE)
row <- manifest %>% filter(scenario == scenario_name)
if (nrow(row) == 0) stop("Scenario not found: ", scenario_name)
csv_file <- row$csv_path[1]
meta_file <- row$metadata_path[1]
if (!file.exists(csv_file)) csv_file <- file.path(export_dir, basename(csv_file))
if (!file.exists(meta_file)) meta_file <- file.path(export_dir, basename(meta_file))
list(
data = readr::read_csv(csv_file, show_col_types = FALSE),
data_by_rep = split(readr::read_csv(csv_file, show_col_types = FALSE), readr::read_csv(csv_file, show_col_types = FALSE)$rep),
metadata = jsonlite::fromJSON(meta_file)
)
}
obj_read <- read_exported_scenario("gaussian_spherical_easy_2d", export_dir = EXPORT_DIR)
head(obj_read$data)
obj_read$metadata$scenario
## [1] "gaussian_spherical_easy_2d"
The scenarios are designed so that: - spherical Gaussian cases act as baseline controls, - unequal covariance and anisotropic cases probe whether covariance-aware criteria can outperform trace-based compactness, - heavy-tailed and contaminated cases probe robustness, - the two-moons case is explicitly a stress test outside the native convex-elliptic regime.
This structure makes the benchmark scientifically defensible rather than merely illustrative.