calculate_metrics <- function(cluster_labels, data_scaled, true_labels = NULL, method_name = "Method") {
# Remove noise points (-1 or 0) for metrics that can't handle them
valid_idx <- cluster_labels > 0
if(sum(valid_idx) < 2) {
# If too few valid points, return NAs
return(data.frame(
Method = method_name,
Silhouette = NA,
Dunn_Index = NA,
Calinski_Harabasz = NA,
ARI = ifelse(is.null(true_labels), NA, NA),
NMI = ifelse(is.null(true_labels), NA, NA)
))
}
# Silhouette Score
sil <- tryCatch({
mean(silhouette(cluster_labels[valid_idx], dist(data_scaled[valid_idx, ]))[, 3])
}, error = function(e) NA)
# Dunn Index
dunn <- tryCatch({
cluster.stats(dist(data_scaled[valid_idx, ]), cluster_labels[valid_idx])$dunn
}, error = function(e) NA)
# Calinski-Harabasz Index
ch <- tryCatch({
calinhara(data_scaled[valid_idx, ], cluster_labels[valid_idx])
}, error = function(e) NA)
# ARI
ari <- ifelse(!is.null(true_labels) && length(unique(cluster_labels[valid_idx])) > 1,
tryCatch(adjustedRandIndex(cluster_labels[valid_idx], true_labels[valid_idx]), error = function(e) NA),
NA)
# NMI
nmi <- ifelse(!is.null(true_labels) && requireNamespace("aricode", quietly = TRUE),
tryCatch(aricode::NMI(cluster_labels[valid_idx], true_labels[valid_idx]), error = function(e) NA),
NA)
return(data.frame(
Method = method_name,
Silhouette = round(sil, 4),
Dunn_Index = round(dunn, 4),
Calinski_Harabasz = round(ch, 2),
ARI = round(ari, 4),
NMI = round(nmi, 4)
))
}
Since this only for visualization, we do not need to do assumption checking because the PCA will not go in into the methods.
plot_clusters_pca <- function(data_scaled, cluster_labels, method_name,
true_labels = NULL, show_legend = TRUE) {
# Perform PCA
pca_res <- prcomp(data_scaled, center = FALSE, scale. = FALSE)
pca_df <- data.frame(
PC1 = pca_res$x[, 1],
PC2 = pca_res$x[, 2],
Cluster = as.factor(cluster_labels)
)
# Add true labels if provided
if(!is.null(true_labels)) {
pca_df$TrueLabel <- as.factor(true_labels)
}
# Base plot
p <- ggplot(pca_df, aes(x = PC1, y = PC2, color = Cluster)) +
geom_point(alpha = 0.6, size = 1.5) +
labs(
title = method_name,
x = paste0("PC1 (", round(100*pca_res$sdev[1]^2/sum(pca_res$sdev^2), 1), "%)"),
y = paste0("PC2 (", round(100*pca_res$sdev[2]^2/sum(pca_res$sdev^2), 1), "%)")
) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, size = 12, face = "bold"),
legend.position = ifelse(show_legend, "right", "none")
)
# Use distinct colors
n_clusters <- length(unique(cluster_labels[cluster_labels > 0]))
if(any(cluster_labels == 0)) {
# DBSCAN has noise (0), color as gray
p <- p + scale_color_manual(values = c("gray70", scales::hue_pal()(n_clusters)),
name = "Cluster")
} else {
p <- p + scale_color_brewer(palette = "Set1", name = "Cluster")
}
return(p)
}
plot_all_methods <- function(data_scaled, results_list, true_labels = NULL,
ncol = 3, add_true = TRUE) {
plots <- list()
for(i in seq_along(results_list)) {
name <- names(results_list)[i]
clusters <- results_list[[i]]
plots[[name]] <- plot_clusters_pca(data_scaled, clusters, name,
true_labels = NULL, show_legend = FALSE)
}
# Add true labels plot if provided
if(!is.null(true_labels) && add_true) {
pca_res <- prcomp(data_scaled)
p_true <- ggplot(data.frame(
PC1 = pca_res$x[,1],
PC2 = pca_res$x[,2],
Label = as.factor(true_labels)
), aes(x = PC1, y = PC2, color = Label)) +
geom_point(alpha = 0.6, size = 1.5) +
labs(title = "True Labels", x = "PC1", y = "PC2") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
scale_color_brewer(palette = "Dark2")
plots[["True Labels"]] <- p_true
}
# Arrange using gridExtra or patchwork
do.call(gridExtra::grid.arrange, c(plots, ncol = ncol))
}
This being used to get the best K in Elbow method and eps for DBScan objectively.
find_knee <- function(x, y, plot = TRUE, xlabel = "X", ylabel = "Y", title = "Knee Detection") {
# L-method: point farthest from line (first, last)
n <- length(x)
# Line endpoints
x1 <- x[1]; y1 <- y[1]
xn <- x[n]; yn <- y[n]
# Calculate perpendicular distance from each point to line
dx <- xn - x1
dy <- yn - y1
denominator <- sqrt(dx^2 + dy^2)
if (denominator == 0) return(NULL)
distances <- abs(dy * x - dx * y - x1*yn + xn*y1) / denominator
knee_idx <- which.max(distances)
knee_x <- x[knee_idx]
knee_y <- y[knee_idx]
if (plot) {
plot(x, y, type = "b", pch = 19,
xlab = xlabel, ylab = ylabel, main = title)
abline(v = knee_x, h = knee_y, col = "red", lty = 2)
points(knee_x, knee_y, col = "red", pch = 19, cex = 1.5)
# Smart label positioning based on plot dimension
y_range <- diff(range(y))
x_range <- diff(range(x))
# Adjust text position to avoid cropping
y_offset <- ifelse(knee_y > mean(y), -0.15 * y_range, 0.15 * y_range)
# Use legend or side margin instead of top-right
is_kmeans_elbow <- length(x) <= 20 && max(x) <= 30
legend_text <- sprintf("Knee: y=%.2f", knee_y)
if (is_kmeans_elbow) {
legend_text <- sprintf("Knee: k=%.0f, y=%.2f", knee_x, knee_y)
}
legend("bottomright", legend = legend_text,
col = "red", pch = 19, bty = "n", text.col = "red")
}
return(list(x = knee_x, y = knee_y, index = knee_idx))
}
df <- read_xlsx("Dry_Bean_Dataset.xlsx")
cat("Is there any missing value in the data: ", any(is.na(df)))
## Is there any missing value in the data: FALSE
true_labels <- df$Class
df_features <- df |> select(-c('Class'))
head(df_features)
## # A tibble: 6 × 16
## Area Perimeter MajorAxisLength MinorAxisLength AspectRation Eccentricity
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 28395 610. 208. 174. 1.20 0.550
## 2 28734 638. 201. 183. 1.10 0.412
## 3 29380 624. 213. 176. 1.21 0.563
## 4 30008 646. 211. 183. 1.15 0.499
## 5 30140 620. 202. 190. 1.06 0.334
## 6 30279 635. 213. 182. 1.17 0.520
## # ℹ 10 more variables: ConvexArea <dbl>, EquivDiameter <dbl>, Extent <dbl>,
## # Solidity <dbl>, roundness <dbl>, Compactness <dbl>, ShapeFactor1 <dbl>,
## # ShapeFactor2 <dbl>, ShapeFactor3 <dbl>, ShapeFactor4 <dbl>
moments::skewness(df_features)
## Area Perimeter MajorAxisLength MinorAxisLength AspectRation
## 2.95260553 1.62594431 1.35766564 2.23796387 0.58250919
## Eccentricity ConvexArea EquivDiameter Extent Solidity
## -1.06270680 2.94149690 1.94874282 -0.89524975 -2.54981207
## roundness Compactness ShapeFactor1 ShapeFactor2 ShapeFactor3
## -0.63567889 0.03711137 -0.53408168 0.30119273 0.24245420
## ShapeFactor4
## -2.75917879
Because the Area and ConvexArea have highly
right-skewed (skewness > 2), while Solidity and
ShapeFactor4 have highly left-skewed (skewness < -2). To
make sure the statistical approach reliable for this dataset, we will
apply log transformation, though left-skewed can’t directly used log
transformation, we need to reflect them first (max(x) - x),
then log.
feature_to_log <- c("Area", "ConvexArea", "Solidity", "ShapeFactor4")
skew_before <- data.frame(
Feature = feature_to_log,
Skewness_Before = c(
skewness(df$Area),
skewness(df$ConvexArea),
skewness(df$Solidity),
skewness(df$ShapeFactor4)
)
)
p_area_before <- ggplot(df, aes(x = Area)) +
geom_histogram(aes(y = after_stat(density)), bins = 40, fill = "steelblue", alpha = 0.6) +
geom_density(color = "darkred", linewidth = 1) +
ggtitle(paste("Area (Before)\nSkewness:", round(skew_before$Skewness_Before[1], 2))) +
theme_minimal() +
theme(plot.title = element_text(size = 9))
p_convex_before <- ggplot(df, aes(x = ConvexArea)) +
geom_histogram(aes(y = after_stat(density)), bins = 40, fill = "steelblue", alpha = 0.6) +
geom_density(color = "darkred", linewidth = 1) +
ggtitle(paste("ConvexArea (Before)\nSkewness:", round(skew_before$Skewness_Before[2], 2))) +
theme_minimal() +
theme(plot.title = element_text(size = 9))
p_solid_before <- ggplot(df, aes(x = Solidity)) +
geom_histogram(aes(y = after_stat(density)), bins = 40, fill = "coral", alpha = 0.6) +
geom_density(color = "darkred", linewidth = 1) +
ggtitle(paste("Solidity (Before)\nSkewness:", round(skew_before$Skewness_Before[3], 2))) +
theme_minimal() +
theme(plot.title = element_text(size = 9))
p_shape4_before <- ggplot(df, aes(x = ShapeFactor4)) +
geom_histogram(aes(y = after_stat(density)), bins = 40, fill = "coral", alpha = 0.6) +
geom_density(color = "darkred", linewidth = 1) +
ggtitle(paste("ShapeFactor4 (Before)\nSkewness:", round(skew_before$Skewness_Before[4], 2))) +
theme_minimal() +
theme(plot.title = element_text(size = 9))
df_transformed <- df_features
df_transformed$Area_log <- log(df$Area)
df_transformed$ConvexArea_log <- log(df$ConvexArea)
# Left-skewed: Reflect then log (add small constant to avoid log(0))
df_transformed$Solidity_log <- log(max(df$Solidity) - df$Solidity + 0.0001)
df_transformed$ShapeFactor4_log <- log(max(df$ShapeFactor4) - df$ShapeFactor4 + 0.0001)
df_transformed <- df_transformed |>
select(-any_of(feature_to_log))
skew_after <- data.frame(
Feature = c("Area", "ConvexArea", "Solidity", "ShapeFactor4"),
Skewness_After = c(
skewness(df_transformed$Area_log),
skewness(df_transformed$ConvexArea_log),
skewness(df_transformed$Solidity_log),
skewness(df_transformed$ShapeFactor4_log)
)
)
p_area_after <- ggplot(df_transformed, aes(x = Area_log)) +
geom_histogram(aes(y = after_stat(density)), bins = 40, fill = "steelblue", alpha = 0.6) +
geom_density(color = "darkgreen", linewidth = 1) +
ggtitle(paste("Area (After Log)\nSkewness:", round(skew_after$Skewness_After[1], 2))) +
theme_minimal() +
theme(plot.title = element_text(size = 9))
p_convex_after <- ggplot(df_transformed, aes(x = ConvexArea_log)) +
geom_histogram(aes(y = after_stat(density)), bins = 40, fill = "steelblue", alpha = 0.6) +
geom_density(color = "darkgreen", linewidth = 1) +
ggtitle(paste("ConvexArea (After Log)\nSkewness:", round(skew_after$Skewness_After[2], 2))) +
theme_minimal() +
theme(plot.title = element_text(size = 9))
p_solid_after <- ggplot(df_transformed, aes(x = Solidity_log)) +
geom_histogram(aes(y = after_stat(density)), bins = 40, fill = "coral", alpha = 0.6) +
geom_density(color = "darkgreen", linewidth = 1) +
ggtitle(paste("Solidity (After Reflect+Log)\nSkewness:", round(skew_after$Skewness_After[3], 2))) +
theme_minimal() +
theme(plot.title = element_text(size = 9))
p_shape4_after <- ggplot(df_transformed, aes(x = ShapeFactor4_log)) +
geom_histogram(aes(y = after_stat(density)), bins = 40, fill = "coral", alpha = 0.6) +
geom_density(color = "darkgreen", linewidth = 1) +
ggtitle(paste("ShapeFactor4 (After Reflect+Log)\nSkewness:", round(skew_after$Skewness_After[4], 2))) +
theme_minimal() +
theme(plot.title = element_text(size = 9))
grid.arrange(p_area_before, p_convex_before, p_solid_before, p_shape4_before,
p_area_after, p_convex_after, p_solid_after, p_shape4_after,
ncol = 4, nrow = 2,
top = "Distribution Comparison: Before (Top) vs After Transformation (Bottom)")
Since the data have different units, we need to scale it, this is important since many distance based clustering like K-Means could easily be wrong if the units is different.
df_scaled <- df_transformed |>
scale()
head(df_scaled)
## Perimeter MajorAxisLength MinorAxisLength AspectRation Eccentricity
## [1,] -1.1432769 -1.306550 -0.6311299 -1.564995 -2.185640
## [2,] -1.0138866 -1.395860 -0.4344286 -1.969712 -3.685904
## [3,] -1.0787894 -1.252311 -0.5857131 -1.514236 -2.045261
## [4,] -0.9771793 -1.278778 -0.4392741 -1.741554 -2.742110
## [5,] -1.0973438 -1.380420 -0.2666536 -2.117915 -4.534862
## [6,] -1.0283110 -1.255410 -0.4616520 -1.670900 -2.505324
## EquivDiameter Extent roundness Compactness ShapeFactor1 ShapeFactor2
## [1,] -1.0633015 0.2890768 1.4238148 1.839049 0.6807614 2.402085
## [2,] -1.0441784 0.6974512 0.2310455 2.495358 0.3679534 3.100779
## [3,] -1.0080470 0.5781740 1.2528189 1.764778 0.6031067 2.235009
## [4,] -0.9733011 0.6712350 0.5150303 2.081639 0.4017030 2.514982
## [5,] -0.9660443 0.4760028 1.8749235 2.765229 0.1182639 3.270862
## [6,] -0.9584197 0.5287761 1.1856533 2.007054 0.4046609 2.411297
## ShapeFactor3 Area_log ConvexArea_log Solidity_log ShapeFactor4_log
## [1,] 1.925653 -1.281118 -1.281521 -0.1989239 -1.3444085
## [2,] 2.689603 -1.252225 -1.243196 0.7362307 -1.0633613
## [3,] 1.841288 -1.198098 -1.200474 -0.4338196 -1.7861222
## [4,] 2.204169 -1.146608 -1.117380 1.8766187 0.6005286
## [5,] 3.013352 -1.135922 -1.141755 -0.9829096 -1.9532599
## [6,] 2.118057 -1.124720 -1.127196 -0.4164543 -2.0854647
plot_clusters_pca(df_scaled, true_labels, "True Label")
k_range <- 2:10
wss <- sapply(k_range, function(k){
kmeans(df_scaled, centers = k, nstart = 25)$tot.withinss
})
find_knee(
x = k_range,
y = wss,
plot = TRUE,
xlabel = "Number of Clusters (k)",
ylab = "Within Sum of Squares",
title = "Elbow Method"
)
## $x
## [1] 5
##
## $y
## [1] 61281.24
##
## $index
## [1] 4
The graph shows that the WSS drops sharply up to k=5, after which the decrease becomes marginal. Therefore, the inflection point is at k=5, making it the most optimal number of clusters.
avg_sil <- function(k){
km <- kmeans(df_scaled, centers = k, nstart = 25)
ss <- silhouette(km$cluster, dist(df_scaled))
mean(ss[,3])
}
sil_values <- sapply(k_range, avg_sil)
plot(k_range, sil_values, type="b", pch=19,
xlab="Number of Clusters",
ylab="Silhouette Score",
main="Silhouette Analysis")
Based on the silhouette analysis, the score is highest at k = 2 (±0.37) and decreases steadily afterward, suggesting that smaller cluster numbers yield better-separated clusters. This result will be cross-validated with the elbow method and gap statistics.
set.seed(123)
idx <- sample(nrow(df_scaled), 3000)
df_gap_sub <- df_scaled[idx, ]
gap_stat <- clusGap(df_gap_sub, FUN = kmeans, nstart = 10, K.max = 10, B = 20)
plot(gap_stat, main = "Gap Statistics")
gap_k <- maxSE(gap_stat$Tab[, "gap"], gap_stat$Tab[, "SE.sim"], method = "Tibs2001SEmax")
cat("Optimal k from Gap Statistics:", gap_k)
## Optimal k from Gap Statistics: 6
The three methods yield the following suggestions: - Elbow Method: k = 5, where the WSS curve shows a clear inflection point - Silhouette Analysis: k = 2, with the highest average silhouette score (±0.37) - Gap Statistics: k = 6, where the gap value plateaus
The silhouette method suggests k = 2, which is too simple to capture the structure of the dry bean dataset that contains 7 known varieties. Meanwhile, the elbow method and gap statistics are in close agreement at k = 5 and k = 6 respectively. Following Tibshirani’s rule, we select k = 5 as the final optimal number of clusters. It is the smallest k where the gap value is within one standard error of the maximum, and it aligns with the elbow inflection point. This choice balances statistical criteria with the known complexity of the dataset.
optimal_k = 5
km_res <- kmeans(df_scaled, centers = optimal_k, nstart = 25)
table(km_res$cluster)
##
## 1 2 3 4 5
## 5846 524 2093 2115 3033
metrics_km <- calculate_metrics(km_res$cluster, df_scaled, true_labels, "K-Means")
metrics_km
## Method Silhouette Dunn_Index Calinski_Harabasz ARI NMI
## 1 K-Means 0.3447 0.0152 8685.57 0.5648 0.617
plot_clusters_pca(df_scaled, km_res$cluster, "K-Means Clustering")
kmedian_res <- flexclust::kcca(
df_scaled,
k = optimal_k,
family = kccaFamily("kmedians")
)
table(clusters(kmedian_res))
##
## 1 2 3 4 5
## 2088 3019 524 5780 2200
metrics_kmed <- calculate_metrics(clusters(kmedian_res), df_scaled, true_labels, "K-Median")
metrics_kmed
## Method Silhouette Dunn_Index Calinski_Harabasz ARI NMI
## 1 K-Median 0.3403 0.0098 8613.7 0.5569 0.6114
plot_clusters_pca(df_scaled, clusters(kmedian_res), "K-Median Clustering")
This being achieve by selecting via the elbow (knee) point of the k-distance plot, where neighbor distances transition from dense plateau to steep ascent.
min_pts <- ncol(df_scaled) + 1
distances <- kNNdist(df_scaled, k = min_pts)
sorted_dist <- sort(distances)
eps_auto <- find_knee(
x = 1:length(sorted_dist),
y = sorted_dist,
plot = TRUE,
xlabel = "Points Sorted by Distance",
ylabel = paste0(min_pts, "-NN Distance"),
title = "kNN Distance Plot (DBSCAN eps selection)"
)$y
db_res <- dbscan(df_scaled, eps = eps_auto, minPts = min_pts)
table(db_res$cluster)
##
## 0 1 2
## 174 12944 493
metrics_db <- calculate_metrics(db_res$cluster, df_scaled, true_labels, "DBSCAN")
metrics_db
## Method Silhouette Dunn_Index Calinski_Harabasz ARI NMI
## 1 DBSCAN 0.5138 0.0983 3253.78 0.0314 0.0859
plot_clusters_pca(df_scaled, db_res$cluster, "DBSCAN Clustering")
Bandwidth was estimated using a quantile-based approach similar to scikit-learn implementation. Pairwise distances were computed from a random subsample (n=500) of the scaled data for computational efficiency.
estimate_bandwidth <- function(X, quantile = 0.3, n_samples = 500) {
n <- nrow(X)
# Subsample
if (n > n_samples) {
idx <- sample(n, n_samples)
X_sample <- X[idx, , drop = FALSE]
} else {
X_sample <- X
n_samples <- n
}
# Compute pairwise distances (upper triangle only for efficiency)
dists <- as.vector(dist(X_sample))
# Remove zeros (diagonal)
dists <- dists[dists > 0]
# Bandwidth = quantile of distances
bandwidth <- quantile(dists, probs = quantile)
return(unname(bandwidth))
}
h = estimate_bandwidth(df_scaled)
ms_res <- meanShift(df_scaled, bandwidth = rep(h, ncol(df_scaled)))
table(ms_res$assignment)
##
## 1 2
## 11112 2499
metrics_ms <- calculate_metrics(ms_res$assignment, df_scaled, true_labels, "Mean Shift")
metrics_ms
## Method Silhouette Dunn_Index Calinski_Harabasz ARI NMI
## 1 Mean Shift 0.3014 0.0137 4223.23 0.1102 0.1503
plot_clusters_pca(df_scaled, ms_res$assignment, "Mean Shift Clustering")
fcm_res <- cmeans(df_scaled, centers = optimal_k, m = 2)
table(fcm_res$cluster)
##
## 1 2 3 4 5
## 2130 2779 3657 2054 2991
metrics_fcm <- calculate_metrics(fcm_res$cluster, df_scaled, true_labels, "FCM")
metrics_fcm
## Method Silhouette Dunn_Index Calinski_Harabasz ARI NMI
## 1 FCM 0.265 0.0128 7040.23 0.6568 0.6406
plot_clusters_pca(df_scaled, fcm_res$cluster, "Fuzzy C-means Clustering")
bind_rows(metrics_km, metrics_kmed, metrics_db, metrics_ms, metrics_fcm)
## Method Silhouette Dunn_Index Calinski_Harabasz ARI NMI
## 1 K-Means 0.3447 0.0152 8685.57 0.5648 0.6170
## 2 K-Median 0.3403 0.0098 8613.70 0.5569 0.6114
## 3 DBSCAN 0.5138 0.0983 3253.78 0.0314 0.0859
## 4 Mean Shift 0.3014 0.0137 4223.23 0.1102 0.1503
## 5 FCM 0.2650 0.0128 7040.23 0.6568 0.6406
results_list <- list(
"K-Means" = km_res$cluster,
"K-Median" = clusters(kmedian_res),
"DBSCAN" = db_res$cluster,
"Mean Shift" = ms_res$assignment,
"Fuzzy C-means" = fcm_res$cluster
)
plot_all_methods(df_scaled, results_list, true_labels, ncol = 3)