1 Helper Functions

1.1 Evaluation Metric Function

calculate_metrics <- function(cluster_labels, data_scaled, true_labels = NULL, method_name = "Method") {
 
 # Remove noise points (-1 or 0) for metrics that can't handle them
 valid_idx <- cluster_labels > 0
 if(sum(valid_idx) < 2) {
   # If too few valid points, return NAs
   return(data.frame(
     Method = method_name,
     Silhouette = NA,
     Dunn_Index = NA,
     Calinski_Harabasz = NA,
     ARI = ifelse(is.null(true_labels), NA, NA),
     NMI = ifelse(is.null(true_labels), NA, NA)
   ))
 }
 
 # Silhouette Score
 sil <- tryCatch({
   mean(silhouette(cluster_labels[valid_idx], dist(data_scaled[valid_idx, ]))[, 3])
 }, error = function(e) NA)
 
 # Dunn Index
 dunn <- tryCatch({
   cluster.stats(dist(data_scaled[valid_idx, ]), cluster_labels[valid_idx])$dunn
 }, error = function(e) NA)
 
 # Calinski-Harabasz Index
 ch <- tryCatch({
   calinhara(data_scaled[valid_idx, ], cluster_labels[valid_idx])
 }, error = function(e) NA)
 
 # ARI
 ari <- ifelse(!is.null(true_labels) && length(unique(cluster_labels[valid_idx])) > 1,
               tryCatch(adjustedRandIndex(cluster_labels[valid_idx], true_labels[valid_idx]), error = function(e) NA),
               NA)
 
 # NMI 
 nmi <- ifelse(!is.null(true_labels) && requireNamespace("aricode", quietly = TRUE),
               tryCatch(aricode::NMI(cluster_labels[valid_idx], true_labels[valid_idx]), error = function(e) NA),
               NA)
 
 return(data.frame(
   Method = method_name,
   Silhouette = round(sil, 4),
   Dunn_Index = round(dunn, 4),
   Calinski_Harabasz = round(ch, 2),
   ARI = round(ari, 4),
   NMI = round(nmi, 4)
 ))
}

1.2 PCA Visualization

Since this only for visualization, we do not need to do assumption checking because the PCA will not go in into the methods.

plot_clusters_pca <- function(data_scaled, cluster_labels, method_name, 
                              true_labels = NULL, show_legend = TRUE) {
 
 # Perform PCA
 pca_res <- prcomp(data_scaled, center = FALSE, scale. = FALSE)
 pca_df <- data.frame(
   PC1 = pca_res$x[, 1],
   PC2 = pca_res$x[, 2],
   Cluster = as.factor(cluster_labels)
 )
 
 # Add true labels if provided
 if(!is.null(true_labels)) {
   pca_df$TrueLabel <- as.factor(true_labels)
 }
 
 # Base plot
 p <- ggplot(pca_df, aes(x = PC1, y = PC2, color = Cluster)) +
   geom_point(alpha = 0.6, size = 1.5) +
   labs(
     title = method_name,
     x = paste0("PC1 (", round(100*pca_res$sdev[1]^2/sum(pca_res$sdev^2), 1), "%)"),
     y = paste0("PC2 (", round(100*pca_res$sdev[2]^2/sum(pca_res$sdev^2), 1), "%)")
   ) +
   theme_minimal() +
   theme(
     plot.title = element_text(hjust = 0.5, size = 12, face = "bold"),
     legend.position = ifelse(show_legend, "right", "none")
   )
 
 # Use distinct colors
 n_clusters <- length(unique(cluster_labels[cluster_labels > 0]))
 if(any(cluster_labels == 0)) {
   # DBSCAN has noise (0), color as gray
   p <- p + scale_color_manual(values = c("gray70", scales::hue_pal()(n_clusters)),
                              name = "Cluster")
 } else {
   p <- p + scale_color_brewer(palette = "Set1", name = "Cluster")
 }
 
 return(p)
}
plot_all_methods <- function(data_scaled, results_list, true_labels = NULL, 
                             ncol = 3, add_true = TRUE) {
 
 plots <- list()
 
 for(i in seq_along(results_list)) {
   name <- names(results_list)[i]
   clusters <- results_list[[i]]
   plots[[name]] <- plot_clusters_pca(data_scaled, clusters, name, 
                                      true_labels = NULL, show_legend = FALSE)
 }
 
 # Add true labels plot if provided
 if(!is.null(true_labels) && add_true) {
   pca_res <- prcomp(data_scaled)
   p_true <- ggplot(data.frame(
     PC1 = pca_res$x[,1], 
     PC2 = pca_res$x[,2],
     Label = as.factor(true_labels)
   ), aes(x = PC1, y = PC2, color = Label)) +
     geom_point(alpha = 0.6, size = 1.5) +
     labs(title = "True Labels", x = "PC1", y = "PC2") +
     theme_minimal() +
     theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
     scale_color_brewer(palette = "Dark2")
   
   plots[["True Labels"]] <- p_true
 }
 
 # Arrange using gridExtra or patchwork
 do.call(gridExtra::grid.arrange, c(plots, ncol = ncol))
}

1.3 Find Knee

This being used to get the best K in Elbow method and eps for DBScan objectively.

find_knee <- function(x, y, plot = TRUE, xlabel = "X", ylabel = "Y", title = "Knee Detection") {
  # L-method: point farthest from line (first, last)
  n <- length(x)
  
  # Line endpoints
  x1 <- x[1]; y1 <- y[1]
  xn <- x[n]; yn <- y[n]
  
  # Calculate perpendicular distance from each point to line
  dx <- xn - x1
  dy <- yn - y1
  denominator <- sqrt(dx^2 + dy^2)
  
  if (denominator == 0) return(NULL)
  
  distances <- abs(dy * x - dx * y - x1*yn + xn*y1) / denominator
  knee_idx <- which.max(distances)
  knee_x <- x[knee_idx]
  knee_y <- y[knee_idx]
  
  if (plot) {
    plot(x, y, type = "b", pch = 19,
         xlab = xlabel, ylab = ylabel, main = title)
    abline(v = knee_x, h = knee_y, col = "red", lty = 2)
    points(knee_x, knee_y, col = "red", pch = 19, cex = 1.5)
    
    # Smart label positioning based on plot dimension
    y_range <- diff(range(y))
    x_range <- diff(range(x))
    
    # Adjust text position to avoid cropping
    y_offset <- ifelse(knee_y > mean(y), -0.15 * y_range, 0.15 * y_range)
    
    # Use legend or side margin instead of top-right
    is_kmeans_elbow <- length(x) <= 20 && max(x) <= 30

    legend_text <- sprintf("Knee: y=%.2f", knee_y)
    if (is_kmeans_elbow) {
        legend_text <- sprintf("Knee: k=%.0f, y=%.2f", knee_x, knee_y)
    }
    
    legend("bottomright", legend = legend_text, 
           col = "red", pch = 19, bty = "n", text.col = "red")
    
  }
  
  return(list(x = knee_x, y = knee_y, index = knee_idx))
}

2 Data Preprocessing

df <- read_xlsx("Dry_Bean_Dataset.xlsx")

cat("Is there any missing value in the data: ", any(is.na(df)))
## Is there any missing value in the data:  FALSE
true_labels <- df$Class
df_features <- df |> select(-c('Class'))

head(df_features)
## # A tibble: 6 × 16
##    Area Perimeter MajorAxisLength MinorAxisLength AspectRation Eccentricity
##   <dbl>     <dbl>           <dbl>           <dbl>        <dbl>        <dbl>
## 1 28395      610.            208.            174.         1.20        0.550
## 2 28734      638.            201.            183.         1.10        0.412
## 3 29380      624.            213.            176.         1.21        0.563
## 4 30008      646.            211.            183.         1.15        0.499
## 5 30140      620.            202.            190.         1.06        0.334
## 6 30279      635.            213.            182.         1.17        0.520
## # ℹ 10 more variables: ConvexArea <dbl>, EquivDiameter <dbl>, Extent <dbl>,
## #   Solidity <dbl>, roundness <dbl>, Compactness <dbl>, ShapeFactor1 <dbl>,
## #   ShapeFactor2 <dbl>, ShapeFactor3 <dbl>, ShapeFactor4 <dbl>

2.1 Checking the Skewness

moments::skewness(df_features)
##            Area       Perimeter MajorAxisLength MinorAxisLength    AspectRation 
##      2.95260553      1.62594431      1.35766564      2.23796387      0.58250919 
##    Eccentricity      ConvexArea   EquivDiameter          Extent        Solidity 
##     -1.06270680      2.94149690      1.94874282     -0.89524975     -2.54981207 
##       roundness     Compactness    ShapeFactor1    ShapeFactor2    ShapeFactor3 
##     -0.63567889      0.03711137     -0.53408168      0.30119273      0.24245420 
##    ShapeFactor4 
##     -2.75917879

Because the Area and ConvexArea have highly right-skewed (skewness > 2), while Solidity and ShapeFactor4 have highly left-skewed (skewness < -2). To make sure the statistical approach reliable for this dataset, we will apply log transformation, though left-skewed can’t directly used log transformation, we need to reflect them first (max(x) - x), then log.

feature_to_log <- c("Area", "ConvexArea", "Solidity", "ShapeFactor4")

skew_before <- data.frame(
 Feature = feature_to_log,
 Skewness_Before = c(
   skewness(df$Area),
   skewness(df$ConvexArea),
   skewness(df$Solidity),
   skewness(df$ShapeFactor4)
 )
)

p_area_before <- ggplot(df, aes(x = Area)) + 
 geom_histogram(aes(y = after_stat(density)), bins = 40, fill = "steelblue", alpha = 0.6) +
 geom_density(color = "darkred", linewidth = 1) +
 ggtitle(paste("Area (Before)\nSkewness:", round(skew_before$Skewness_Before[1], 2))) +
 theme_minimal() +
 theme(plot.title = element_text(size = 9))

p_convex_before <- ggplot(df, aes(x = ConvexArea)) + 
 geom_histogram(aes(y = after_stat(density)), bins = 40, fill = "steelblue", alpha = 0.6) +
 geom_density(color = "darkred", linewidth = 1) +
 ggtitle(paste("ConvexArea (Before)\nSkewness:", round(skew_before$Skewness_Before[2], 2))) +
 theme_minimal() +
 theme(plot.title = element_text(size = 9))

p_solid_before <- ggplot(df, aes(x = Solidity)) + 
 geom_histogram(aes(y = after_stat(density)), bins = 40, fill = "coral", alpha = 0.6) +
 geom_density(color = "darkred", linewidth = 1) +
 ggtitle(paste("Solidity (Before)\nSkewness:", round(skew_before$Skewness_Before[3], 2))) +
 theme_minimal() +
 theme(plot.title = element_text(size = 9))

p_shape4_before <- ggplot(df, aes(x = ShapeFactor4)) + 
 geom_histogram(aes(y = after_stat(density)), bins = 40, fill = "coral", alpha = 0.6) +
 geom_density(color = "darkred", linewidth = 1) +
 ggtitle(paste("ShapeFactor4 (Before)\nSkewness:", round(skew_before$Skewness_Before[4], 2))) +
 theme_minimal() +
 theme(plot.title = element_text(size = 9))

df_transformed <- df_features

df_transformed$Area_log <- log(df$Area)
df_transformed$ConvexArea_log <- log(df$ConvexArea)

# Left-skewed: Reflect then log (add small constant to avoid log(0))
df_transformed$Solidity_log <- log(max(df$Solidity) - df$Solidity + 0.0001)
df_transformed$ShapeFactor4_log <- log(max(df$ShapeFactor4) - df$ShapeFactor4 + 0.0001)

df_transformed <- df_transformed |>
  select(-any_of(feature_to_log))

skew_after <- data.frame(
 Feature = c("Area", "ConvexArea", "Solidity", "ShapeFactor4"),
 Skewness_After = c(
   skewness(df_transformed$Area_log),
   skewness(df_transformed$ConvexArea_log),
   skewness(df_transformed$Solidity_log),
   skewness(df_transformed$ShapeFactor4_log)
 )
)

p_area_after <- ggplot(df_transformed, aes(x = Area_log)) + 
 geom_histogram(aes(y = after_stat(density)), bins = 40, fill = "steelblue", alpha = 0.6) +
 geom_density(color = "darkgreen", linewidth = 1) +
 ggtitle(paste("Area (After Log)\nSkewness:", round(skew_after$Skewness_After[1], 2))) +
 theme_minimal() +
 theme(plot.title = element_text(size = 9))

p_convex_after <- ggplot(df_transformed, aes(x = ConvexArea_log)) + 
 geom_histogram(aes(y = after_stat(density)), bins = 40, fill = "steelblue", alpha = 0.6) +
 geom_density(color = "darkgreen", linewidth = 1) +
 ggtitle(paste("ConvexArea (After Log)\nSkewness:", round(skew_after$Skewness_After[2], 2))) +
 theme_minimal() +
 theme(plot.title = element_text(size = 9))

p_solid_after <- ggplot(df_transformed, aes(x = Solidity_log)) + 
 geom_histogram(aes(y = after_stat(density)), bins = 40, fill = "coral", alpha = 0.6) +
 geom_density(color = "darkgreen", linewidth = 1) +
 ggtitle(paste("Solidity (After Reflect+Log)\nSkewness:", round(skew_after$Skewness_After[3], 2))) +
 theme_minimal() +
 theme(plot.title = element_text(size = 9))

p_shape4_after <- ggplot(df_transformed, aes(x = ShapeFactor4_log)) + 
 geom_histogram(aes(y = after_stat(density)), bins = 40, fill = "coral", alpha = 0.6) +
 geom_density(color = "darkgreen", linewidth = 1) +
 ggtitle(paste("ShapeFactor4 (After Reflect+Log)\nSkewness:", round(skew_after$Skewness_After[4], 2))) +
 theme_minimal() +
 theme(plot.title = element_text(size = 9))

grid.arrange(p_area_before, p_convex_before, p_solid_before, p_shape4_before,
            p_area_after, p_convex_after, p_solid_after, p_shape4_after,
            ncol = 4, nrow = 2,
            top = "Distribution Comparison: Before (Top) vs After Transformation (Bottom)")

2.2 Scaling the dataset

Since the data have different units, we need to scale it, this is important since many distance based clustering like K-Means could easily be wrong if the units is different.

df_scaled <- df_transformed |>
  scale()

head(df_scaled)
##       Perimeter MajorAxisLength MinorAxisLength AspectRation Eccentricity
## [1,] -1.1432769       -1.306550      -0.6311299    -1.564995    -2.185640
## [2,] -1.0138866       -1.395860      -0.4344286    -1.969712    -3.685904
## [3,] -1.0787894       -1.252311      -0.5857131    -1.514236    -2.045261
## [4,] -0.9771793       -1.278778      -0.4392741    -1.741554    -2.742110
## [5,] -1.0973438       -1.380420      -0.2666536    -2.117915    -4.534862
## [6,] -1.0283110       -1.255410      -0.4616520    -1.670900    -2.505324
##      EquivDiameter    Extent roundness Compactness ShapeFactor1 ShapeFactor2
## [1,]    -1.0633015 0.2890768 1.4238148    1.839049    0.6807614     2.402085
## [2,]    -1.0441784 0.6974512 0.2310455    2.495358    0.3679534     3.100779
## [3,]    -1.0080470 0.5781740 1.2528189    1.764778    0.6031067     2.235009
## [4,]    -0.9733011 0.6712350 0.5150303    2.081639    0.4017030     2.514982
## [5,]    -0.9660443 0.4760028 1.8749235    2.765229    0.1182639     3.270862
## [6,]    -0.9584197 0.5287761 1.1856533    2.007054    0.4046609     2.411297
##      ShapeFactor3  Area_log ConvexArea_log Solidity_log ShapeFactor4_log
## [1,]     1.925653 -1.281118      -1.281521   -0.1989239       -1.3444085
## [2,]     2.689603 -1.252225      -1.243196    0.7362307       -1.0633613
## [3,]     1.841288 -1.198098      -1.200474   -0.4338196       -1.7861222
## [4,]     2.204169 -1.146608      -1.117380    1.8766187        0.6005286
## [5,]     3.013352 -1.135922      -1.141755   -0.9829096       -1.9532599
## [6,]     2.118057 -1.124720      -1.127196   -0.4164543       -2.0854647
plot_clusters_pca(df_scaled, true_labels, "True Label")

3 Clustering Methods

3.1 Finding the Best K

3.1.1 Elbow Method

k_range <- 2:10
wss <- sapply(k_range, function(k){
  kmeans(df_scaled, centers = k, nstart = 25)$tot.withinss
})
find_knee(
  x = k_range,
  y = wss,
  plot = TRUE,
  xlabel = "Number of Clusters (k)",
  ylab = "Within Sum of Squares",
  title = "Elbow Method"
)

## $x
## [1] 5
## 
## $y
## [1] 61281.24
## 
## $index
## [1] 4

The graph shows that the WSS drops sharply up to k=5, after which the decrease becomes marginal. Therefore, the inflection point is at k=5, making it the most optimal number of clusters.

3.1.2 Silhouette Method

avg_sil <- function(k){
  km <- kmeans(df_scaled, centers = k, nstart = 25)
  ss <- silhouette(km$cluster, dist(df_scaled))
  mean(ss[,3])
}
sil_values <- sapply(k_range, avg_sil)
plot(k_range, sil_values, type="b", pch=19,
     xlab="Number of Clusters",
     ylab="Silhouette Score",
     main="Silhouette Analysis")

Based on the silhouette analysis, the score is highest at k = 2 (±0.37) and decreases steadily afterward, suggesting that smaller cluster numbers yield better-separated clusters. This result will be cross-validated with the elbow method and gap statistics.

3.1.3 Gap Statistics

set.seed(123)
idx <- sample(nrow(df_scaled), 3000)
df_gap_sub <- df_scaled[idx, ]

gap_stat <- clusGap(df_gap_sub, FUN = kmeans, nstart = 10, K.max = 10, B = 20)
plot(gap_stat, main = "Gap Statistics")

gap_k <- maxSE(gap_stat$Tab[, "gap"], gap_stat$Tab[, "SE.sim"], method = "Tibs2001SEmax")
cat("Optimal k from Gap Statistics:", gap_k)
## Optimal k from Gap Statistics: 6

The three methods yield the following suggestions: - Elbow Method: k = 5, where the WSS curve shows a clear inflection point - Silhouette Analysis: k = 2, with the highest average silhouette score (±0.37) - Gap Statistics: k = 6, where the gap value plateaus

The silhouette method suggests k = 2, which is too simple to capture the structure of the dry bean dataset that contains 7 known varieties. Meanwhile, the elbow method and gap statistics are in close agreement at k = 5 and k = 6 respectively. Following Tibshirani’s rule, we select k = 5 as the final optimal number of clusters. It is the smallest k where the gap value is within one standard error of the maximum, and it aligns with the elbow inflection point. This choice balances statistical criteria with the known complexity of the dataset.

optimal_k = 5

3.2 K-Means

3.2.1 Evaluation

km_res <- kmeans(df_scaled, centers = optimal_k, nstart = 25)
table(km_res$cluster)
## 
##    1    2    3    4    5 
## 5846  524 2093 2115 3033
metrics_km <- calculate_metrics(km_res$cluster, df_scaled, true_labels, "K-Means")
metrics_km
##    Method Silhouette Dunn_Index Calinski_Harabasz    ARI   NMI
## 1 K-Means     0.3447     0.0152           8685.57 0.5648 0.617

3.2.2 PCA visualization

plot_clusters_pca(df_scaled, km_res$cluster, "K-Means Clustering")

3.3 K-Median

3.3.1 Evaluation

kmedian_res <- flexclust::kcca(
  df_scaled,
  k = optimal_k,
  family = kccaFamily("kmedians")
)
table(clusters(kmedian_res))
## 
##    1    2    3    4    5 
## 2088 3019  524 5780 2200
metrics_kmed <- calculate_metrics(clusters(kmedian_res), df_scaled, true_labels, "K-Median")
metrics_kmed
##     Method Silhouette Dunn_Index Calinski_Harabasz    ARI    NMI
## 1 K-Median     0.3403     0.0098            8613.7 0.5569 0.6114

3.3.2 PCA visualization

plot_clusters_pca(df_scaled, clusters(kmedian_res), "K-Median Clustering")

3.4 DBSCAN

3.4.1 Find the best Eps

This being achieve by selecting via the elbow (knee) point of the k-distance plot, where neighbor distances transition from dense plateau to steep ascent.

min_pts <- ncol(df_scaled) + 1
distances <- kNNdist(df_scaled, k = min_pts)
sorted_dist <- sort(distances)

eps_auto <- find_knee(
  x = 1:length(sorted_dist),
  y = sorted_dist,
  plot = TRUE,
  xlabel = "Points Sorted by Distance",
  ylabel = paste0(min_pts, "-NN Distance"),
  title = "kNN Distance Plot (DBSCAN eps selection)"
)$y

3.4.2 Evaluation

db_res <- dbscan(df_scaled, eps = eps_auto, minPts = min_pts)

table(db_res$cluster)
## 
##     0     1     2 
##   174 12944   493
metrics_db <- calculate_metrics(db_res$cluster, df_scaled, true_labels, "DBSCAN")
metrics_db
##   Method Silhouette Dunn_Index Calinski_Harabasz    ARI    NMI
## 1 DBSCAN     0.5138     0.0983           3253.78 0.0314 0.0859

3.4.3 Visualization

plot_clusters_pca(df_scaled, db_res$cluster, "DBSCAN Clustering")

3.5 Mean Shift

3.5.1 Find the Best Bandwidth

Bandwidth was estimated using a quantile-based approach similar to scikit-learn implementation. Pairwise distances were computed from a random subsample (n=500) of the scaled data for computational efficiency.

estimate_bandwidth <- function(X, quantile = 0.3, n_samples = 500) {
  n <- nrow(X)
  
  # Subsample
  if (n > n_samples) {
    idx <- sample(n, n_samples)
    X_sample <- X[idx, , drop = FALSE]
  } else {
    X_sample <- X
    n_samples <- n
  }
  
  # Compute pairwise distances (upper triangle only for efficiency)
  dists <- as.vector(dist(X_sample))
  
  # Remove zeros (diagonal)
  dists <- dists[dists > 0]
  
  # Bandwidth = quantile of distances
  bandwidth <- quantile(dists, probs = quantile)
  
  return(unname(bandwidth))
}

3.5.2 Evaluation

h = estimate_bandwidth(df_scaled)
ms_res <- meanShift(df_scaled, bandwidth = rep(h, ncol(df_scaled)))
table(ms_res$assignment)
## 
##     1     2 
## 11112  2499
metrics_ms <- calculate_metrics(ms_res$assignment, df_scaled, true_labels, "Mean Shift")
metrics_ms
##       Method Silhouette Dunn_Index Calinski_Harabasz    ARI    NMI
## 1 Mean Shift     0.3014     0.0137           4223.23 0.1102 0.1503

3.5.3 Visualization

plot_clusters_pca(df_scaled, ms_res$assignment, "Mean Shift Clustering")

3.6 Fuzzy C-Means

3.6.1 Evaluation

fcm_res <- cmeans(df_scaled, centers = optimal_k, m = 2)
table(fcm_res$cluster)
## 
##    1    2    3    4    5 
## 2130 2779 3657 2054 2991
metrics_fcm <- calculate_metrics(fcm_res$cluster, df_scaled, true_labels, "FCM")
metrics_fcm
##   Method Silhouette Dunn_Index Calinski_Harabasz    ARI    NMI
## 1    FCM      0.265     0.0128           7040.23 0.6568 0.6406

3.6.2 Visualization

plot_clusters_pca(df_scaled, fcm_res$cluster, "Fuzzy C-means Clustering")

3.7 Results Comparison

bind_rows(metrics_km, metrics_kmed, metrics_db, metrics_ms, metrics_fcm)
##       Method Silhouette Dunn_Index Calinski_Harabasz    ARI    NMI
## 1    K-Means     0.3447     0.0152           8685.57 0.5648 0.6170
## 2   K-Median     0.3403     0.0098           8613.70 0.5569 0.6114
## 3     DBSCAN     0.5138     0.0983           3253.78 0.0314 0.0859
## 4 Mean Shift     0.3014     0.0137           4223.23 0.1102 0.1503
## 5        FCM     0.2650     0.0128           7040.23 0.6568 0.6406
results_list <- list(
   "K-Means" = km_res$cluster,
   "K-Median" = clusters(kmedian_res),
   "DBSCAN" = db_res$cluster,
   "Mean Shift" = ms_res$assignment,
   "Fuzzy C-means" = fcm_res$cluster
)
plot_all_methods(df_scaled, results_list, true_labels, ncol = 3)