knitr::opts_chunk$set(
  echo = TRUE,
  warning = FALSE,
  message = FALSE,
  fig.width = 7.5,
  fig.height = 5.5,
  dpi = 140
)

required_pkgs <- c(
  "MASS", "mvtnorm", "mclust", "cluster", "ggplot2", "dplyr", "tidyr",
  "purrr", "stringr", "tibble", "jsonlite", "readr", "glue", "patchwork"
)

to_install <- required_pkgs[!vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)]
if (length(to_install) > 0) {
  install.packages(to_install, repos = "https://cloud.r-project.org")
}

invisible(lapply(required_pkgs, library, character.only = TRUE))

theme_set(theme_minimal(base_size = 12))
set.seed(1234)

1. Purpose

This notebook provides a journal-quality synthetic benchmarking harness for comparing a determinant-based clustering method (Wilks-lambda style) against K-means. In this R version, the full generator suite, K-means evaluation, visual diagnostics, export utilities, and summary tables are implemented directly. The Wilks method is left as a plug-in adapter so that an external implementation can be inserted without changing the benchmark protocol.

The benchmark design varies: - cluster separation and overlap, - covariance heterogeneity and anisotropy, - class imbalance, - heavy tails and contamination, - a nonconvex stress test.

The dimensionality is deliberately restricted to \(d \in \{2,5,10\}\) so that the geometry remains interpretable.

2. Design principles

K-means is a trace-based compactness criterion and is well aligned with approximately spherical clusters of similar scale. A Wilks-lambda style criterion is instead sensitive to generalized within-cluster scatter, and is therefore especially interesting when covariance geometry differs across clusters. The benchmark families below are chosen to isolate those distinctions.

3. Utility functions

if (!requireNamespace("graphicalExtremes", quietly = TRUE)) {
  install.packages("graphicalExtremes")
}

safe_dir_create <- function(path) {
  if (!dir.exists(path)) dir.create(path, recursive = TRUE, showWarnings = FALSE)
  invisible(path)
}

make_sigma <- function(d, eigenvalues = NULL, angle = 0, random_rotation = FALSE, seed = NULL) {
  stopifnot(d >= 1)

  if (!is.null(seed)) {
    old_seed <- if (exists(".Random.seed", envir = .GlobalEnv, inherits = FALSE)) .Random.seed else NULL
    on.exit({
      if (is.null(old_seed)) {
        if (exists(".Random.seed", envir = .GlobalEnv, inherits = FALSE)) {
          rm(.Random.seed, envir = .GlobalEnv)
        }
      } else {
        assign(".Random.seed", old_seed, envir = .GlobalEnv)
      }
    }, add = TRUE)
    set.seed(seed)
  }

  if (is.null(eigenvalues)) {
    return(random_spd(d, seed = NULL))
  }

  stopifnot(length(eigenvalues) == d)
  stopifnot(all(eigenvalues > 0))

  if (d == 1) {
    return(matrix(eigenvalues[1], nrow = 1, ncol = 1))
  }

  Q <- diag(d)
  if (d >= 2 && abs(angle) > 0) {
    rotation_2d <- matrix(
      c(cos(angle), -sin(angle),
        sin(angle),  cos(angle)),
      nrow = 2,
      byrow = TRUE
    )
    Q[1:2, 1:2] <- rotation_2d
  }

  if (isTRUE(random_rotation)) {
    base_spd <- graphicalExtremes::generate_random_spd_matrix(d)
    eig <- eigen(base_spd, symmetric = TRUE)
    Q <- eig$vectors
  }

  Q %*% diag(eigenvalues, d) %*% t(Q)
}

random_spd <- function(d, eig_low = 0.3, eig_high = 2.5, seed = NULL) {
  if (!is.null(seed)) {
    old_seed <- if (exists(".Random.seed", envir = .GlobalEnv, inherits = FALSE)) .Random.seed else NULL
    on.exit({
      if (is.null(old_seed)) {
        if (exists(".Random.seed", envir = .GlobalEnv, inherits = FALSE)) {
          rm(.Random.seed, envir = .GlobalEnv)
        }
      } else {
        assign(".Random.seed", old_seed, envir = .GlobalEnv)
      }
    }, add = TRUE)
    set.seed(seed)
  }

  Sigma <- graphicalExtremes::generate_random_spd_matrix(d)
  diag_scale <- sqrt(diag(Sigma))
  Sigma <- sweep(sweep(Sigma, 1, diag_scale, "/"), 2, diag_scale, "/")
  eig <- eigen(Sigma, symmetric = TRUE)
  target <- seq(eig_low, eig_high, length.out = d)
  target <- sample(target, size = d, replace = FALSE)
  eig$vectors %*% diag(target, d) %*% t(eig$vectors)
}

sample_multivariate_t <- function(n, mu, Sigma, df) {
  d <- length(mu)
  Z <- MASS::mvrnorm(n, mu = rep(0, d), Sigma = Sigma)
  g <- sqrt(df / rchisq(n, df))
  X <- sweep(Z, 1, g, "*")
  X <- sweep(X, 2, mu, "+")
  X
}

sample_skewed_like <- function(n, mu, Sigma, skew_strength = 1.5) {
  d <- length(mu)
  Z <- MASS::mvrnorm(n, mu = rep(0, d), Sigma = Sigma)
  X <- Z
  X[, 1] <- X[, 1] + skew_strength * abs(Z[, 1])
  sweep(X, 2, mu, "+")
}

compute_pairwise_center_distance <- function(centers) {
  if (nrow(centers) < 2) return(NA_real_)
  as.numeric(min(dist(centers)))
}

bhattacharyya_distance_gaussian <- function(mu1, Sigma1, mu2, Sigma2) {
  Sigma <- 0.5 * (Sigma1 + Sigma2)
  diff <- matrix(mu1 - mu2, ncol = 1)
  term1 <- 0.125 * t(diff) %*% solve(Sigma, diff)
  term2 <- 0.5 * log(det(Sigma) / sqrt(det(Sigma1) * det(Sigma2)))
  as.numeric(term1 + term2)
}

mean_pairwise_bhattacharyya <- function(mus, Sigmas) {
  K <- length(mus)
  if (K < 2) return(NA_real_)
  vals <- c()
  for (i in 1:(K-1)) {
    for (j in (i+1):K) {
      vals <- c(vals, bhattacharyya_distance_gaussian(mus[[i]], Sigmas[[i]], mus[[j]], Sigmas[[j]]))
    }
  }
  mean(vals)
}

mode_int <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}

4. Benchmark generators

Each generator returns: - X: feature matrix, - y_true: true labels, - meta: scenario metadata, - optionally distributional objects used to define difficulty.

generate_gaussian_mixture <- function(n, d, centers, Sigmas, probs = NULL, seed = NULL, family = "gaussian") {
  if (!is.null(seed)) set.seed(seed)
  K <- nrow(centers)
  if (is.null(probs)) probs <- rep(1 / K, K)
  counts <- as.vector(rmultinom(1, n, probs))
  X_parts <- vector("list", K)
  y_parts <- vector("list", K)
  mus <- vector("list", K)
  Sigma_list <- vector("list", K)

  for (k in seq_len(K)) {
    X_parts[[k]] <- MASS::mvrnorm(counts[k], mu = centers[k, ], Sigma = Sigmas[[k]])
    y_parts[[k]] <- rep(k, counts[k])
    mus[[k]] <- centers[k, ]
    Sigma_list[[k]] <- Sigmas[[k]]
  }

  X <- do.call(rbind, X_parts)
  y <- unlist(y_parts)
  idx <- sample(seq_len(nrow(X)))
  X <- X[idx, , drop = FALSE]
  y <- y[idx]

  list(
    X = X,
    y_true = y,
    meta = list(
      family = family,
      n = n,
      d = d,
      K = K,
      probs = probs,
      min_center_distance = compute_pairwise_center_distance(centers),
      mean_pairwise_bhattacharyya = mean_pairwise_bhattacharyya(mus, Sigma_list)
    ),
    centers = centers,
    Sigmas = Sigmas
  )
}

generate_contaminated_gaussian <- function(n, d, centers, Sigmas, contamination = 0.08,
                                           inflation = 12, probs = NULL, seed = NULL) {
  if (!is.null(seed)) set.seed(seed)
  K <- nrow(centers)
  if (is.null(probs)) probs <- rep(1 / K, K)
  counts <- as.vector(rmultinom(1, n, probs))
  X_parts <- vector("list", K)
  y_parts <- vector("list", K)

  for (k in seq_len(K)) {
    nk <- counts[k]
    is_contam <- rbinom(nk, size = 1, prob = contamination)
    X_clean <- MASS::mvrnorm(sum(is_contam == 0), mu = centers[k, ], Sigma = Sigmas[[k]])
    X_contam <- MASS::mvrnorm(sum(is_contam == 1), mu = centers[k, ], Sigma = inflation * Sigmas[[k]])
    Xk <- rbind(X_clean, X_contam)
    if (nrow(Xk) > 0) Xk <- Xk[sample(seq_len(nrow(Xk))), , drop = FALSE]
    X_parts[[k]] <- Xk
    y_parts[[k]] <- rep(k, nk)
  }

  X <- do.call(rbind, X_parts)
  y <- unlist(y_parts)
  idx <- sample(seq_len(nrow(X)))
  X <- X[idx, , drop = FALSE]
  y <- y[idx]

  list(
    X = X,
    y_true = y,
    meta = list(
      family = "contaminated_gaussian",
      n = n,
      d = d,
      K = K,
      probs = probs,
      contamination = contamination,
      inflation = inflation,
      min_center_distance = compute_pairwise_center_distance(centers)
    ),
    centers = centers,
    Sigmas = Sigmas
  )
}

generate_student_t_mixture <- function(n, d, centers, Sigmas, df = 4, probs = NULL, seed = NULL) {
  if (!is.null(seed)) set.seed(seed)
  K <- nrow(centers)
  if (is.null(probs)) probs <- rep(1 / K, K)
  counts <- as.vector(rmultinom(1, n, probs))
  X_parts <- vector("list", K)
  y_parts <- vector("list", K)

  for (k in seq_len(K)) {
    X_parts[[k]] <- sample_multivariate_t(counts[k], mu = centers[k, ], Sigma = Sigmas[[k]], df = df)
    y_parts[[k]] <- rep(k, counts[k])
  }

  X <- do.call(rbind, X_parts)
  y <- unlist(y_parts)
  idx <- sample(seq_len(nrow(X)))
  X <- X[idx, , drop = FALSE]
  y <- y[idx]

  list(
    X = X,
    y_true = y,
    meta = list(
      family = "student_t",
      n = n,
      d = d,
      K = K,
      probs = probs,
      df = df,
      min_center_distance = compute_pairwise_center_distance(centers)
    ),
    centers = centers,
    Sigmas = Sigmas
  )
}

generate_skewed_mixture <- function(n, d, centers, Sigmas, skew_strength = 1.5, probs = NULL, seed = NULL) {
  if (!is.null(seed)) set.seed(seed)
  K <- nrow(centers)
  if (is.null(probs)) probs <- rep(1 / K, K)
  counts <- as.vector(rmultinom(1, n, probs))
  X_parts <- vector("list", K)
  y_parts <- vector("list", K)

  for (k in seq_len(K)) {
    X_parts[[k]] <- sample_skewed_like(counts[k], mu = centers[k, ], Sigma = Sigmas[[k]], skew_strength = skew_strength)
    y_parts[[k]] <- rep(k, counts[k])
  }

  X <- do.call(rbind, X_parts)
  y <- unlist(y_parts)
  idx <- sample(seq_len(nrow(X)))
  X <- X[idx, , drop = FALSE]
  y <- y[idx]

  list(
    X = X,
    y_true = y,
    meta = list(
      family = "skewed_like",
      n = n,
      d = d,
      K = K,
      probs = probs,
      skew_strength = skew_strength,
      min_center_distance = compute_pairwise_center_distance(centers)
    ),
    centers = centers,
    Sigmas = Sigmas
  )
}

generate_two_moons <- function(n, noise = 0.08, seed = NULL) {
  if (!is.null(seed)) set.seed(seed)
  n1 <- floor(n / 2)
  n2 <- n - n1
  t1 <- runif(n1, 0, pi)
  t2 <- runif(n2, 0, pi)

  X1 <- cbind(cos(t1), sin(t1))
  X2 <- cbind(1 - cos(t2), -sin(t2) - 0.4)

  X <- rbind(X1, X2) + matrix(rnorm(2 * n, sd = noise), ncol = 2)
  y <- c(rep(1, n1), rep(2, n2))
  idx <- sample(seq_len(n))
  list(
    X = X[idx, , drop = FALSE],
    y_true = y[idx],
    meta = list(
      family = "two_moons",
      n = n,
      d = 2,
      K = 2,
      noise = noise
    )
  )
}

5. Benchmark configuration list

The configurations are chosen to cover easy, moderate, and difficult Gaussian settings, covariance heterogeneity, non-Gaussian departures, and a nonconvex stress test. Every 2D family has a dedicated visualization setup.

make_benchmarks <- function() {
  benchmarks <- list()

  # 2D Gaussian spherical difficulty ladder
  benchmarks[["gaussian_spherical_easy_2d"]] <- list(
    scenario = "gaussian_spherical_easy_2d",
    d = 2, n = 450, K = 3, family = "gaussian_spherical", difficulty = "easy",
    generator = function(seed) {
      centers <- rbind(c(-4, 0), c(0, 4), c(4, 0))
      Sigmas <- list(diag(2), diag(2), diag(2))
      generate_gaussian_mixture(450, 2, centers, Sigmas, seed = seed, family = "gaussian_spherical")
    }
  )

  benchmarks[["gaussian_spherical_moderate_2d"]] <- list(
    scenario = "gaussian_spherical_moderate_2d",
    d = 2, n = 450, K = 3, family = "gaussian_spherical", difficulty = "moderate",
    generator = function(seed) {
      centers <- rbind(c(-2.7, 0), c(0, 2.7), c(2.7, 0))
      Sigmas <- list(diag(2), diag(2), diag(2))
      generate_gaussian_mixture(450, 2, centers, Sigmas, seed = seed, family = "gaussian_spherical")
    }
  )

  benchmarks[["gaussian_spherical_hard_2d"]] <- list(
    scenario = "gaussian_spherical_hard_2d",
    d = 2, n = 450, K = 3, family = "gaussian_spherical", difficulty = "hard",
    generator = function(seed) {
      centers <- rbind(c(-1.8, 0), c(0, 1.8), c(1.8, 0))
      Sigmas <- list(diag(2), diag(2), diag(2))
      generate_gaussian_mixture(450, 2, centers, Sigmas, seed = seed, family = "gaussian_spherical")
    }
  )

  benchmarks[["gaussian_anisotropic_rotated_2d"]] <- list(
    scenario = "gaussian_anisotropic_rotated_2d",
    d = 2, n = 450, K = 3, family = "gaussian_anisotropic", difficulty = "moderate",
    generator = function(seed) {
      centers <- rbind(c(-3.5, -1.0), c(0, 3.5), c(3.5, -0.6))
      Sigmas <- list(
        make_sigma(2, c(2.8, 0.25), angle = 0.65),
        make_sigma(2, c(2.1, 0.35), angle = -0.45),
        make_sigma(2, c(1.7, 0.22), angle = 0.25)
      )
      generate_gaussian_mixture(450, 2, centers, Sigmas, seed = seed, family = "gaussian_anisotropic")
    }
  )

  benchmarks[["gaussian_unequal_covariance_2d"]] <- list(
    scenario = "gaussian_unequal_covariance_2d",
    d = 2, n = 450, K = 3, family = "gaussian_unequal_covariance", difficulty = "hard",
    generator = function(seed) {
      centers <- rbind(c(-3.2, 0), c(0, 2.8), c(3.0, -0.5))
      Sigmas <- list(
        make_sigma(2, c(3.4, 0.3), angle = 0.35),
        make_sigma(2, c(1.0, 0.5), angle = -0.8),
        make_sigma(2, c(0.6, 0.18), angle = 0.1)
      )
      generate_gaussian_mixture(450, 2, centers, Sigmas, seed = seed, family = "gaussian_unequal_covariance")
    }
  )

  benchmarks[["gaussian_imbalanced_2d"]] <- list(
    scenario = "gaussian_imbalanced_2d",
    d = 2, n = 500, K = 3, family = "gaussian_imbalanced", difficulty = "moderate",
    generator = function(seed) {
      centers <- rbind(c(-3.2, 0), c(0, 3.0), c(3.2, 0))
      Sigmas <- list(diag(2), diag(2), diag(2))
      generate_gaussian_mixture(500, 2, centers, Sigmas, probs = c(0.65, 0.25, 0.10), seed = seed, family = "gaussian_imbalanced")
    }
  )

  benchmarks[["student_t_heavy_tail_2d"]] <- list(
    scenario = "student_t_heavy_tail_2d",
    d = 2, n = 450, K = 3, family = "student_t", difficulty = "hard",
    generator = function(seed) {
      centers <- rbind(c(-3.0, 0), c(0, 2.7), c(3.0, 0))
      Sigmas <- list(diag(2), diag(2), diag(2))
      generate_student_t_mixture(450, 2, centers, Sigmas, df = 4, seed = seed)
    }
  )

  benchmarks[["skewed_like_2d"]] <- list(
    scenario = "skewed_like_2d",
    d = 2, n = 450, K = 3, family = "skewed_like", difficulty = "moderate",
    generator = function(seed) {
      centers <- rbind(c(-3.0, -0.7), c(0, 3.0), c(3.0, -0.4))
      Sigmas <- list(diag(c(0.9, 0.5)), diag(c(0.8, 0.6)), diag(c(0.9, 0.45)))
      generate_skewed_mixture(450, 2, centers, Sigmas, skew_strength = 1.35, seed = seed)
    }
  )

  benchmarks[["contaminated_gaussian_2d"]] <- list(
    scenario = "contaminated_gaussian_2d",
    d = 2, n = 450, K = 3, family = "contaminated_gaussian", difficulty = "hard",
    generator = function(seed) {
      centers <- rbind(c(-3.2, 0), c(0, 2.8), c(3.2, 0))
      Sigmas <- list(diag(2), diag(2), diag(2))
      generate_contaminated_gaussian(450, 2, centers, Sigmas, contamination = 0.08, inflation = 14, seed = seed)
    }
  )

  benchmarks[["two_moons_2d"]] <- list(
    scenario = "two_moons_2d",
    d = 2, n = 500, K = 2, family = "two_moons", difficulty = "stress_test",
    generator = function(seed) {
      generate_two_moons(500, noise = 0.09, seed = seed)
    }
  )

  # Moderate dimensional quantitative cases
  benchmarks[["gaussian_spherical_easy_5d"]] <- list(
    scenario = "gaussian_spherical_easy_5d",
    d = 5, n = 600, K = 3, family = "gaussian_spherical", difficulty = "easy",
    generator = function(seed) {
      centers <- rbind(
        c(-4, 0, 0, 0, 0),
        c(0, 4, 0, 0, 0),
        c(4, 0, 0, 0, 0)
      )
      Sigmas <- list(diag(5), diag(5), diag(5))
      generate_gaussian_mixture(600, 5, centers, Sigmas, seed = seed, family = "gaussian_spherical")
    }
  )

  benchmarks[["gaussian_unequal_covariance_5d"]] <- list(
    scenario = "gaussian_unequal_covariance_5d",
    d = 5, n = 600, K = 3, family = "gaussian_unequal_covariance", difficulty = "hard",
    generator = function(seed) {
      centers <- rbind(
        c(-3.5, 0, 0, 0, 0),
        c(0, 3.2, 0, 0, 0),
        c(3.2, 0, 0, 0, 0)
      )
      Sigmas <- list(
        diag(c(3.0, 1.5, 0.6, 0.5, 0.5)),
        diag(c(1.0, 0.9, 0.6, 0.6, 0.5)),
        diag(c(0.6, 0.5, 0.5, 0.4, 0.3))
      )
      generate_gaussian_mixture(600, 5, centers, Sigmas, seed = seed, family = "gaussian_unequal_covariance")
    }
  )

  benchmarks[["student_t_heavy_tail_5d"]] <- list(
    scenario = "student_t_heavy_tail_5d",
    d = 5, n = 600, K = 3, family = "student_t", difficulty = "hard",
    generator = function(seed) {
      centers <- rbind(
        c(-3.2, 0, 0, 0, 0),
        c(0, 3.2, 0, 0, 0),
        c(3.2, 0, 0, 0, 0)
      )
      Sigmas <- list(diag(5), diag(5), diag(5))
      generate_student_t_mixture(600, 5, centers, Sigmas, df = 5, seed = seed)
    }
  )

  benchmarks[["gaussian_spherical_easy_10d"]] <- list(
    scenario = "gaussian_spherical_easy_10d",
    d = 10, n = 750, K = 3, family = "gaussian_spherical", difficulty = "easy",
    generator = function(seed) {
      centers <- rbind(
        c(-4, 0, 0, 0, 0, 0, 0, 0, 0, 0),
        c(0, 4, 0, 0, 0, 0, 0, 0, 0, 0),
        c(4, 0, 0, 0, 0, 0, 0, 0, 0, 0)
      )
      Sigmas <- list(diag(10), diag(10), diag(10))
      generate_gaussian_mixture(750, 10, centers, Sigmas, seed = seed, family = "gaussian_spherical")
    }
  )

  benchmarks[["gaussian_unequal_covariance_10d"]] <- list(
    scenario = "gaussian_unequal_covariance_10d",
    d = 10, n = 750, K = 3, family = "gaussian_unequal_covariance", difficulty = "hard",
    generator = function(seed) {
      centers <- rbind(
        c(-3.6, 0, 0, 0, 0, 0, 0, 0, 0, 0),
        c(0, 3.3, 0, 0, 0, 0, 0, 0, 0, 0),
        c(3.3, 0, 0, 0, 0, 0, 0, 0, 0, 0)
      )
      Sigmas <- list(
        diag(c(3.2, 1.8, 1.1, 0.9, 0.8, 0.7, 0.7, 0.6, 0.5, 0.5)),
        diag(c(1.2, 1.0, 0.9, 0.8, 0.8, 0.7, 0.7, 0.6, 0.6, 0.5)),
        diag(c(0.7, 0.6, 0.6, 0.5, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3))
      )
      generate_gaussian_mixture(750, 10, centers, Sigmas, seed = seed, family = "gaussian_unequal_covariance")
    }
  )

  benchmarks
}

BENCHMARKS <- make_benchmarks()
names(BENCHMARKS)

##  [1] "gaussian_spherical_easy_2d"      "gaussian_spherical_moderate_2d" 
##  [3] "gaussian_spherical_hard_2d"      "gaussian_anisotropic_rotated_2d"
##  [5] "gaussian_unequal_covariance_2d"  "gaussian_imbalanced_2d"         
##  [7] "student_t_heavy_tail_2d"         "skewed_like_2d"                 
##  [9] "contaminated_gaussian_2d"        "two_moons_2d"                   
## [11] "gaussian_spherical_easy_5d"      "gaussian_unequal_covariance_5d" 
## [13] "student_t_heavy_tail_5d"         "gaussian_spherical_easy_10d"    
## [15] "gaussian_unequal_covariance_10d"

6. Visualization helpers

dataset_to_tibble <- function(obj, scenario) {
  X <- obj$X
  d <- ncol(X)
  out <- as_tibble(X)
  names(out) <- paste0("x", seq_len(d))
  out$y_true <- factor(obj$y_true)
  out$scenario <- scenario
  out
}

plot_dataset_2d <- function(obj, scenario, alpha = 0.75) {
  dat <- dataset_to_tibble(obj, scenario)
  ggplot(dat, aes(x = x1, y = x2, color = y_true)) +
    geom_point(alpha = alpha, size = 1.7) +
    coord_equal() +
    labs(
      title = scenario,
      x = "x1", y = "x2", color = "True cluster"
    ) +
    theme(legend.position = "bottom")
}

7. Plot all 2D benchmark setups

benchmarks_2d <- BENCHMARKS[vapply(BENCHMARKS, function(b) b$d == 2, logical(1))]
plots_2d <- imap(benchmarks_2d, function(b, nm) plot_dataset_2d(b$generator(seed = 2026), nm))
wrap_plots(plots_2d, ncol = 2)

8. Clustering methods

The K-means baseline is fully implemented. The Wilks adapter is a placeholder into which an external implementation can be inserted.

fit_predict_kmeans <- function(X, K, nstart = 50, iter.max = 100) {
  km <- kmeans(X, centers = K, nstart = nstart, iter.max = iter.max)
  list(
    cluster = km$cluster,
    native_objective = km$tot.withinss
  )
}

fit_predict_wilks <- function(X, K, ...) {
  stop("Insert your external Wilks-lambda clustering implementation here and return a list with at least $cluster.")
}

METHODS <- list(
  KMeans = fit_predict_kmeans
  # Wilks = fit_predict_wilks
)

9. Validity metrics

We use external metrics because the synthetic benchmarks come with true labels, and internal metrics as complementary diagnostics.

relabel_by_majority <- function(y_true, y_pred) {
  map_df <- tibble(y_true = y_true, y_pred = y_pred) %>%
    group_by(y_pred) %>%
    summarise(mapped = mode_int(y_true), .groups = "drop")
  out <- left_join(tibble(y_pred = y_pred), map_df, by = "y_pred")$mapped
  as.integer(out)
}

misclassification_rate <- function(y_true, y_pred) {
  y_hat <- relabel_by_majority(y_true, y_pred)
  mean(y_hat != y_true)
}

calinski_harabasz_manual <- function(X, clusters) {
  n <- nrow(X)
  K <- length(unique(clusters))
  if (K <= 1 || K >= n) return(NA_real_)
  overall <- colMeans(X)
  W <- 0
  B <- 0
  for (k in sort(unique(clusters))) {
    Xk <- X[clusters == k, , drop = FALSE]
    nk <- nrow(Xk)
    if (nk <= 0) next
    ck <- colMeans(Xk)
    centered <- sweep(Xk, 2, ck, "-")
    W <- W + sum(centered^2)
    B <- B + nk * sum((ck - overall)^2)
  }
  (B / (K - 1)) / (W / (n - K))
}

davies_bouldin_manual <- function(X, clusters) {
  cl <- sort(unique(clusters))
  K <- length(cl)
  if (K <= 1) return(NA_real_)
  centers <- lapply(cl, function(k) colMeans(X[clusters == k, , drop = FALSE]))
  S <- sapply(seq_along(cl), function(i) {
    Xi <- X[clusters == cl[i], , drop = FALSE]
    ci <- centers[[i]]
    mean(sqrt(rowSums((sweep(Xi, 2, ci, "-"))^2)))
  })

  M <- matrix(NA_real_, K, K)
  for (i in seq_len(K)) {
    for (j in seq_len(K)) {
      if (i != j) {
        M[i, j] <- sqrt(sum((centers[[i]] - centers[[j]])^2))
      }
    }
  }
  R <- matrix(NA_real_, K, K)
  for (i in seq_len(K)) {
    for (j in seq_len(K)) {
      if (i != j) {
        R[i, j] <- (S[i] + S[j]) / M[i, j]
      }
    }
  }
  mean(apply(R, 1, max, na.rm = TRUE))
}

wilks_lambda_stat <- function(X, clusters, ridge = 1e-8) {
  n <- nrow(X)
  d <- ncol(X)
  overall <- colMeans(X)
  Tmat <- crossprod(sweep(X, 2, overall, "-"))
  Wmat <- matrix(0, d, d)
  for (k in sort(unique(clusters))) {
    Xk <- X[clusters == k, , drop = FALSE]
    ck <- colMeans(Xk)
    centered <- sweep(Xk, 2, ck, "-")
    Wmat <- Wmat + crossprod(centered)
  }
  Tmat <- Tmat + ridge * diag(d)
  Wmat <- Wmat + ridge * diag(d)
  det(Wmat) / det(Tmat)
}

compute_metrics <- function(X, y_true, y_pred, native_objective = NA_real_) {
  sil <- tryCatch(cluster::silhouette(y_pred, dist(X)), error = function(e) NULL)
  sil_mean <- if (is.null(sil)) NA_real_ else mean(sil[, "sil_width"])

  tibble(
    ARI = mclust::adjustedRandIndex(y_true, y_pred),
    Misclassification = misclassification_rate(y_true, y_pred),
    Silhouette = sil_mean,
    Calinski_Harabasz = calinski_harabasz_manual(X, y_pred),
    Davies_Bouldin = davies_bouldin_manual(X, y_pred),
    Wilks_Lambda = wilks_lambda_stat(X, y_pred),
    Native_Objective = native_objective
  )
}

10. Single-run example

example_obj <- BENCHMARKS[["gaussian_unequal_covariance_2d"]]$generator(seed = 2027)
example_fit <- fit_predict_kmeans(example_obj$X, K = BENCHMARKS[["gaussian_unequal_covariance_2d"]]$K)
example_metrics <- compute_metrics(example_obj$X, example_obj$y_true, example_fit$cluster, example_fit$native_objective)
example_metrics

example_dat <- dataset_to_tibble(example_obj, "gaussian_unequal_covariance_2d")
example_dat$cluster_kmeans <- factor(example_fit$cluster)

p_true <- ggplot(example_dat, aes(x = x1, y = x2, color = y_true)) +
  geom_point(alpha = 0.75, size = 1.8) +
  coord_equal() +
  labs(title = "True labels", color = "True") +
  theme(legend.position = "bottom")

p_pred <- ggplot(example_dat, aes(x = x1, y = x2, color = cluster_kmeans)) +
  geom_point(alpha = 0.75, size = 1.8) +
  coord_equal() +
  labs(title = "K-means partition", color = "Predicted") +
  theme(legend.position = "bottom")

p_true + p_pred

11. Benchmark runner

run_benchmark <- function(benchmarks, methods, n_reps = 20, base_seed = 5000) {
  results <- list()
  row_id <- 1

  for (scenario_name in names(benchmarks)) {
    b <- benchmarks[[scenario_name]]
    for (rep in seq_len(n_reps)) {
      seed_data <- base_seed + 1000 * row_id + rep
      obj <- b$generator(seed = seed_data)
      X <- obj$X
      y_true <- obj$y_true

      for (method_name in names(methods)) {
        fit <- methods[[method_name]](X, K = b$K)
        y_pred <- fit$cluster
        native_objective <- if (!is.null(fit$native_objective)) fit$native_objective else NA_real_

        met <- compute_metrics(X, y_true, y_pred, native_objective)
        met <- met %>%
          mutate(
            scenario = scenario_name,
            family = b$family,
            difficulty = b$difficulty,
            d = b$d,
            n = b$n,
            K = b$K,
            rep = rep,
            seed_data = seed_data,
            method = method_name
          )

        if (!is.null(obj$meta)) {
          meta_vals <- obj$meta
          if (!is.null(meta_vals$min_center_distance)) met$min_center_distance <- meta_vals$min_center_distance
          if (!is.null(meta_vals$mean_pairwise_bhattacharyya)) met$mean_pairwise_bhattacharyya <- meta_vals$mean_pairwise_bhattacharyya
        }

        results[[length(results) + 1]] <- met
      }
    }
    row_id <- row_id + 1
  }

  bind_rows(results) %>%
    select(
      scenario, family, difficulty, method, d, n, K, rep, seed_data,
      everything()
    )
}

12. Run the full benchmark

results_tbl <- run_benchmark(BENCHMARKS, METHODS, n_reps = 15, base_seed = 7100)
results_tbl %>% glimpse()

## Rows: 225
## Columns: 18
## $ scenario                    <chr> "gaussian_spherical_easy_2d", "gaussian_sp…
## $ family                      <chr> "gaussian_spherical", "gaussian_spherical"…
## $ difficulty                  <chr> "easy", "easy", "easy", "easy", "easy", "e…
## $ method                      <chr> "KMeans", "KMeans", "KMeans", "KMeans", "K…
## $ d                           <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ n                           <dbl> 450, 450, 450, 450, 450, 450, 450, 450, 45…
## $ K                           <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, …
## $ rep                         <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,…
## $ seed_data                   <dbl> 8101, 8102, 8103, 8104, 8105, 8106, 8107, …
## $ ARI                         <dbl> 0.9934015, 0.9930235, 0.9864025, 0.9934712…
## $ Misclassification           <dbl> 0.002222222, 0.002222222, 0.004444444, 0.0…
## $ Silhouette                  <dbl> 0.6590121, 0.6798893, 0.6892903, 0.6778953…
## $ Calinski_Harabasz           <dbl> 1485.2314, 1616.5058, 1740.1043, 1606.8171…
## $ Davies_Bouldin              <dbl> 0.4694331, 0.4383066, 0.4213721, 0.4385242…
## $ Wilks_Lambda                <dbl> 0.02206226, 0.01703686, 0.01539706, 0.0180…
## $ Native_Objective            <dbl> 978.0614, 822.6333, 814.8083, 891.0392, 81…
## $ min_center_distance         <dbl> 5.656854, 5.656854, 5.656854, 5.656854, 5.…
## $ mean_pairwise_bhattacharyya <dbl> 5.333333, 5.333333, 5.333333, 5.333333, 5.…

13. Summary tables

summary_tbl <- results_tbl %>%
  group_by(scenario, family, difficulty, method, d, n, K) %>%
  summarise(
    ARI_mean = mean(ARI, na.rm = TRUE),
    ARI_sd = sd(ARI, na.rm = TRUE),
    Misclassification_mean = mean(Misclassification, na.rm = TRUE),
    Misclassification_sd = sd(Misclassification, na.rm = TRUE),
    Silhouette_mean = mean(Silhouette, na.rm = TRUE),
    Silhouette_sd = sd(Silhouette, na.rm = TRUE),
    Calinski_Harabasz_mean = mean(Calinski_Harabasz, na.rm = TRUE),
    Davies_Bouldin_mean = mean(Davies_Bouldin, na.rm = TRUE),
    Wilks_Lambda_mean = mean(Wilks_Lambda, na.rm = TRUE),
    Native_Objective_mean = mean(Native_Objective, na.rm = TRUE),
    min_center_distance = mean(min_center_distance, na.rm = TRUE),
    mean_pairwise_bhattacharyya = mean(mean_pairwise_bhattacharyya, na.rm = TRUE),
    .groups = "drop"
  )

family_summary_tbl <- results_tbl %>%
  group_by(family, difficulty, method, d) %>%
  summarise(
    ARI_mean = mean(ARI, na.rm = TRUE),
    Misclassification_mean = mean(Misclassification, na.rm = TRUE),
    Silhouette_mean = mean(Silhouette, na.rm = TRUE),
    Calinski_Harabasz_mean = mean(Calinski_Harabasz, na.rm = TRUE),
    Davies_Bouldin_mean = mean(Davies_Bouldin, na.rm = TRUE),
    Wilks_Lambda_mean = mean(Wilks_Lambda, na.rm = TRUE),
    .groups = "drop"
  )

summary_tbl

family_summary_tbl

14. Journal-style figures

ggplot(summary_tbl, aes(x = scenario, y = ARI_mean, fill = method)) +
  geom_col(position = position_dodge(width = 0.8)) +
  geom_errorbar(
    aes(ymin = ARI_mean - ARI_sd, ymax = ARI_mean + ARI_sd),
    position = position_dodge(width = 0.8),
    width = 0.25
  ) +
  coord_flip() +
  labs(title = "External validity by scenario", x = NULL, y = "Mean ARI")

ggplot(summary_tbl, aes(x = scenario, y = Misclassification_mean, fill = method)) +
  geom_col(position = position_dodge(width = 0.8)) +
  geom_errorbar(
    aes(ymin = pmax(0, Misclassification_mean - Misclassification_sd),
        ymax = Misclassification_mean + Misclassification_sd),
    position = position_dodge(width = 0.8),
    width = 0.25
  ) +
  coord_flip() +
  labs(title = "Misclassification rate by scenario", x = NULL, y = "Mean misclassification rate")

internal_long <- summary_tbl %>%
  select(scenario, method, Silhouette_mean, Calinski_Harabasz_mean, Davies_Bouldin_mean, Wilks_Lambda_mean) %>%
  pivot_longer(cols = -c(scenario, method), names_to = "metric", values_to = "value")

ggplot(internal_long, aes(x = scenario, y = value, fill = method)) +
  geom_col(position = position_dodge(width = 0.8)) +
  facet_wrap(~ metric, scales = "free") +
  coord_flip() +
  labs(title = "Internal validity diagnostics", x = NULL, y = NULL)

15. Optional Wilks plug-in

Replace the placeholder function in Section 8 with your own Wilks implementation. It should return at least:

list(
  cluster = integer_vector_of_cluster_assignments,
  native_objective = optional_numeric_value
)

Then enable it inside METHODS, for example:

METHODS <- list(
  KMeans = fit_predict_kmeans,
  Wilks = fit_predict_wilks
)

After that, rerun Sections 12–14.

16. Export datasets locally

The export format is intentionally compact: each configuration is saved as a single CSV with all replications stacked in the same file. The replication index is stored in the rep column. This avoids creating an excessive number of files.

export_generated_datasets <- function(benchmarks, n_reps = 3, output_dir = "/mnt/data/benchmark_exports", file_format = "csv",
                                      base_seed = 9000) {
  safe_dir_create(output_dir)

  manifest_entries <- list()

  for (scenario_name in names(benchmarks)) {
    b <- benchmarks[[scenario_name]]
    rep_tables <- list()
    rep_meta <- list()

    for (rep in seq_len(n_reps)) {
      seed_data <- base_seed + which(names(benchmarks) == scenario_name) * 1000 + rep
      obj <- b$generator(seed = seed_data)
      dat <- as_tibble(obj$X)
      names(dat) <- paste0("x", seq_len(ncol(obj$X)))
      dat$y_true <- obj$y_true
      dat$scenario <- scenario_name
      dat$family <- b$family
      dat$difficulty <- b$difficulty
      dat$d <- b$d
      dat$n <- nrow(obj$X)
      dat$K <- b$K
      dat$rep <- rep
      dat$seed_data <- seed_data
      rep_tables[[rep]] <- dat

      rep_meta[[rep]] <- list(
        rep = rep,
        seed_data = seed_data,
        scenario = scenario_name,
        family = b$family,
        difficulty = b$difficulty,
        d = b$d,
        n = nrow(obj$X),
        K = b$K,
        scenario_meta = obj$meta
      )
    }

    all_reps_tbl <- bind_rows(rep_tables)
    stem <- paste0(scenario_name, "__all_reps")
    data_path <- file.path(output_dir, paste0(stem, ".", file_format))
    meta_path <- file.path(output_dir, paste0(stem, ".metadata.json"))

    if (tolower(file_format) == "csv") {
      readr::write_csv(all_reps_tbl, data_path)
    } else if (tolower(file_format) == "parquet") {
      if (!requireNamespace("arrow", quietly = TRUE)) {
        stop("Package 'arrow' is required for parquet export.")
      }
      arrow::write_parquet(all_reps_tbl, data_path)
    } else {
      stop("Unsupported file_format. Use 'csv' or 'parquet'.")
    }

    meta_json <- list(
      scenario = scenario_name,
      family = b$family,
      difficulty = b$difficulty,
      d = b$d,
      n_reps = n_reps,
      data_file = normalizePath(data_path, winslash = "/", mustWork = FALSE),
      replications = rep_meta
    )
    writeLines(jsonlite::toJSON(meta_json, pretty = TRUE, auto_unbox = TRUE), con = meta_path)

    manifest_entries[[length(manifest_entries) + 1]] <- tibble(
      scenario = scenario_name,
      family = b$family,
      difficulty = b$difficulty,
      dimension = b$d,
      K = b$K,
      n_reps = n_reps,
      csv_path = normalizePath(data_path, winslash = "/", mustWork = FALSE),
      metadata_path = normalizePath(meta_path, winslash = "/", mustWork = FALSE)
    )
  }

  manifest_tbl <- bind_rows(manifest_entries)
  manifest_path <- file.path(output_dir, "dataset_manifest.csv")
  readr::write_csv(manifest_tbl, manifest_path)
  manifest_tbl
}

17. Run the local export

EXPORT_DIR <- "/mnt/data/benchmark_exports"

manifest_tbl <- export_generated_datasets(
  BENCHMARKS,
  n_reps = 3,
  output_dir = EXPORT_DIR,
  file_format = "csv"
)

manifest_tbl

18. Export benchmark result tables locally

readr::write_csv(results_tbl, "/mnt/data/clustering_benchmark_results.csv")
readr::write_csv(summary_tbl, "/mnt/data/clustering_benchmark_summary.csv")
readr::write_csv(family_summary_tbl, "/mnt/data/clustering_benchmark_family_summary.csv")

c(
  "/mnt/data/clustering_benchmark_results.csv",
  "/mnt/data/clustering_benchmark_summary.csv",
  "/mnt/data/clustering_benchmark_family_summary.csv"
)

## [1] "/mnt/data/clustering_benchmark_results.csv"       
## [2] "/mnt/data/clustering_benchmark_summary.csv"       
## [3] "/mnt/data/clustering_benchmark_family_summary.csv"

19. Zip exported files

This section is useful when the notebook is run on a remote system and the user wants a single downloadable archive.

zip_output <- "/mnt/data/clustering_benchmark_bundle.zip"

paths_to_zip <- c(
  list.files(EXPORT_DIR, full.names = TRUE),
  "/mnt/data/clustering_benchmark_results.csv",
  "/mnt/data/clustering_benchmark_summary.csv",
  "/mnt/data/clustering_benchmark_family_summary.csv"
)

paths_to_zip <- paths_to_zip[file.exists(paths_to_zip)]

if (!requireNamespace("zip", quietly = TRUE)) {
  install.packages("zip", repos = "https://cloud.r-project.org")
}

zip::zipr(zipfile = zip_output, files = paths_to_zip)
zip_output

## [1] "/mnt/data/clustering_benchmark_bundle.zip"

20. R code for reading the exported files

read_exported_scenario <- function(scenario_name, export_dir = "/mnt/data/benchmark_exports") {
  manifest <- readr::read_csv(file.path(export_dir, "dataset_manifest.csv"), show_col_types = FALSE)
  row <- manifest %>% filter(scenario == scenario_name)
  if (nrow(row) == 0) stop("Scenario not found: ", scenario_name)

  csv_file <- row$csv_path[1]
  meta_file <- row$metadata_path[1]

  if (!file.exists(csv_file)) csv_file <- file.path(export_dir, basename(csv_file))
  if (!file.exists(meta_file)) meta_file <- file.path(export_dir, basename(meta_file))

  list(
    data = readr::read_csv(csv_file, show_col_types = FALSE),
    data_by_rep = split(readr::read_csv(csv_file, show_col_types = FALSE), readr::read_csv(csv_file, show_col_types = FALSE)$rep),
    metadata = jsonlite::fromJSON(meta_file)
  )
}

obj_read <- read_exported_scenario("gaussian_spherical_easy_2d", export_dir = EXPORT_DIR)
head(obj_read$data)

obj_read$metadata$scenario

## [1] "gaussian_spherical_easy_2d"

21. Remarks on interpretation

The scenarios are designed so that: - spherical Gaussian cases act as baseline controls, - unequal covariance and anisotropic cases probe whether covariance-aware criteria can outperform trace-based compactness, - heavy-tailed and contaminated cases probe robustness, - the two-moons case is explicitly a stress test outside the native convex-elliptic regime.

This structure makes the benchmark scientifically defensible rather than merely illustrative.

Benchmarking Harness for Wilks-Lambda Clustering vs K-means