Perkembangan teknologi informasi memungkinkan pemanfaatan data untuk mengelompokkan objek secara efisien, termasuk pada biji kacang yang memiliki beragam karakteristik morfologis. Pengelompokan manual dinilai kurang efektif karena memakan waktu dan rentan terhadap subjektivitas, sehingga diperlukan pendekatan otomatis berbasis data. Clustering sebagai metode unsupervised learning dapat digunakan untuk mengelompokkan data berdasarkan kemiripan, dengan berbagai algoritma seperti K-Means, K-Median, DBSCAN, Mean Shift, dan Fuzzy C-Means yang memiliki keunggulan masing-masing. Untuk menentukan metode terbaik, dilakukan evaluasi menggunakan metrik seperti Silhouette Score dan Dunn Index. Selanjutnya, Exploratory Data Analysis (EDA) digunakan untuk memahami karakteristik tiap cluster sehingga menghasilkan insight yang lebih bermakna.
library(tidyverse)
library(readxl)
library(flexclust)
library(dbscan)
library(meanShiftR)
library(e1071)
library(cluster)
library(fpc)
# Load the Dry Bean measurements from the local Excel workbook, then show the
# structure (column names and types) of the resulting tibble.
data <- readxl::read_excel(path = "D:/UAS tableau/Dry Bean Dataset.xlsx")
utils::str(data)
## tibble [13,611 × 17] (S3: tbl_df/tbl/data.frame)
## $ Area : num [1:13611] 28395 28734 29380 30008 30140 ...
## $ Perimeter : num [1:13611] 610 638 624 646 620 ...
## $ MajorAxisLength: num [1:13611] 208 201 213 211 202 ...
## $ MinorAxisLength: num [1:13611] 174 183 176 183 190 ...
## $ AspectRation : num [1:13611] 1.2 1.1 1.21 1.15 1.06 ...
## $ Eccentricity : num [1:13611] 0.55 0.412 0.563 0.499 0.334 ...
## $ ConvexArea : num [1:13611] 28715 29172 29690 30724 30417 ...
## $ EquivDiameter : num [1:13611] 190 191 193 195 196 ...
## $ Extent : num [1:13611] 0.764 0.784 0.778 0.783 0.773 ...
## $ Solidity : num [1:13611] 0.989 0.985 0.99 0.977 0.991 ...
## $ roundness : num [1:13611] 0.958 0.887 0.948 0.904 0.985 ...
## $ Compactness : num [1:13611] 0.913 0.954 0.909 0.928 0.971 ...
## $ ShapeFactor1 : num [1:13611] 0.00733 0.00698 0.00724 0.00702 0.0067 ...
## $ ShapeFactor2 : num [1:13611] 0.00315 0.00356 0.00305 0.00321 0.00366 ...
## $ ShapeFactor3 : num [1:13611] 0.834 0.91 0.826 0.862 0.942 ...
## $ ShapeFactor4 : num [1:13611] 0.999 0.998 0.999 0.994 0.999 ...
## $ Class : chr [1:13611] "SEKER" "SEKER" "SEKER" "SEKER" ...
# Per-column descriptive statistics of the raw table.
summary(object = data)
## Area Perimeter MajorAxisLength MinorAxisLength
## Min. : 20420 Min. : 524.7 Min. :183.6 Min. :122.5
## 1st Qu.: 36328 1st Qu.: 703.5 1st Qu.:253.3 1st Qu.:175.8
## Median : 44652 Median : 794.9 Median :296.9 Median :192.4
## Mean : 53048 Mean : 855.3 Mean :320.1 Mean :202.3
## 3rd Qu.: 61332 3rd Qu.: 977.2 3rd Qu.:376.5 3rd Qu.:217.0
## Max. :254616 Max. :1985.4 Max. :738.9 Max. :460.2
## AspectRation Eccentricity ConvexArea EquivDiameter
## Min. :1.025 Min. :0.2190 Min. : 20684 Min. :161.2
## 1st Qu.:1.432 1st Qu.:0.7159 1st Qu.: 36715 1st Qu.:215.1
## Median :1.551 Median :0.7644 Median : 45178 Median :238.4
## Mean :1.583 Mean :0.7509 Mean : 53768 Mean :253.1
## 3rd Qu.:1.707 3rd Qu.:0.8105 3rd Qu.: 62294 3rd Qu.:279.4
## Max. :2.430 Max. :0.9114 Max. :263261 Max. :569.4
## Extent Solidity roundness Compactness
## Min. :0.5553 Min. :0.9192 Min. :0.4896 Min. :0.6406
## 1st Qu.:0.7186 1st Qu.:0.9857 1st Qu.:0.8321 1st Qu.:0.7625
## Median :0.7599 Median :0.9883 Median :0.8832 Median :0.8013
## Mean :0.7497 Mean :0.9871 Mean :0.8733 Mean :0.7999
## 3rd Qu.:0.7869 3rd Qu.:0.9900 3rd Qu.:0.9169 3rd Qu.:0.8343
## Max. :0.8662 Max. :0.9947 Max. :0.9907 Max. :0.9873
## ShapeFactor1 ShapeFactor2 ShapeFactor3 ShapeFactor4
## Min. :0.002778 Min. :0.0005642 Min. :0.4103 Min. :0.9477
## 1st Qu.:0.005900 1st Qu.:0.0011535 1st Qu.:0.5814 1st Qu.:0.9937
## Median :0.006645 Median :0.0016935 Median :0.6420 Median :0.9964
## Mean :0.006564 Mean :0.0017159 Mean :0.6436 Mean :0.9951
## 3rd Qu.:0.007271 3rd Qu.:0.0021703 3rd Qu.:0.6960 3rd Qu.:0.9979
## Max. :0.010451 Max. :0.0036650 Max. :0.9748 Max. :0.9997
## Class
## Length:13611
## Class :character
## Mode :character
##
##
##
# Keep the recorded bean variety aside; it is not a clustering feature.
label <- data[["Class"]]

# Feature table without the label column, standardized column by column
# with the scale() defaults (centering and scaling both enabled).
df <- dplyr::select(data, -Class)
df_scaled <- scale(df)

# Preview the first rows of the standardized matrix.
head(df_scaled)
## Area Perimeter MajorAxisLength MinorAxisLength AspectRation
## [1,] -0.8407176 -1.1432769 -1.306550 -0.6311299 -1.564995
## [2,] -0.8291572 -1.0138866 -1.395860 -0.4344286 -1.969712
## [3,] -0.8071275 -1.0787894 -1.252311 -0.5857131 -1.514236
## [4,] -0.7857117 -0.9771793 -1.278778 -0.4392741 -1.741554
## [5,] -0.7812103 -1.0973438 -1.380420 -0.2666536 -2.117915
## [6,] -0.7764701 -1.0283110 -1.255410 -0.4616520 -1.670900
## Eccentricity ConvexArea EquivDiameter Extent Solidity roundness
## [1,] -2.185640 -0.8414197 -1.0633015 0.2890768 0.3675999 1.4238148
## [2,] -3.685904 -0.8260712 -1.0441784 0.6974512 -0.4628896 0.2310455
## [3,] -2.045261 -0.8086740 -1.0080470 0.5781740 0.5183978 1.2528189
## [4,] -2.742110 -0.7739468 -0.9733011 0.6712350 -2.2416847 0.5150303
## [5,] -4.534862 -0.7842575 -0.9660443 0.4760028 0.8047429 1.8749235
## [6,] -2.505324 -0.7781114 -0.9584197 0.5287761 0.5078901 1.1856533
## Compactness ShapeFactor1 ShapeFactor2 ShapeFactor3 ShapeFactor4
## [1,] 1.839049 0.6807614 2.402085 1.925653 0.8383402
## [2,] 2.495358 0.3679534 3.100779 2.689603 0.7711101
## [3,] 1.764778 0.6031067 2.235009 1.841288 0.9167215
## [4,] 2.081639 0.4017030 2.514982 2.204169 -0.1979782
## [5,] 2.765229 0.1182639 3.270862 3.013352 0.9396054
## [6,] 2.007054 0.4046609 2.411297 2.118057 0.9555730
# Fix the RNG state so the row sample and the randomly initialised algorithms
# below are reproducible. The calls must stay in this order, because each one
# consumes random numbers from the same stream.
set.seed(123)

# K-Means on the complete standardized data set, best of 25 random starts.
kmeans_res <- stats::kmeans(df_scaled, centers = 7, nstart = 25)

# Random subset of 500 rows on which every method is fitted and scored.
df_eval <- df_scaled[sample.int(nrow(df_scaled), size = 500), ]

# K-Means on the evaluation subset (default number of starts).
kmeans_eval <- stats::kmeans(df_eval, centers = 7)

# K-Medians
kmedian_eval <- flexclust::kcca(
  df_eval,
  k = 7,
  family = flexclust::kccaFamily("kmedians")
)

# DBSCAN. Both attached packages `dbscan` and `fpc` export dbscan(); `fpc` is
# attached last and therefore is the one that was being called (its argument
# is spelled `MinPts`). The namespace is spelled out to make that explicit.
dbscan_eval <- fpc::dbscan(df_eval, eps = 0.8, MinPts = 5)

# Mean Shift with the package defaults.
meanshift_eval <- meanShiftR::meanShift(as.matrix(df_eval))

# Fuzzy C-Means
fcm_eval <- e1071::cmeans(df_eval, centers = 7)
#' Score a hard clustering with the mean silhouette width and the Dunn index.
#'
#' @param data Numeric matrix or data frame with one observation per row;
#'   pairwise distances between rows are computed with `dist()`.
#' @param cluster_result Vector of cluster codes, one per row of `data`.
#' @return Named numeric vector `c(silhouette = , dunn = )`. Both entries are
#'   `NA` when the number of distinct clusters is not between 2 and n - 1,
#'   because the silhouette is not defined in that case.
evaluate_cluster <- function(data, cluster_result) {
  n_obs <- length(cluster_result)
  n_clusters <- length(unique(as.vector(cluster_result)))

  # silhouette() returns a bare NA outside 2 <= k <= n - 1, which would make
  # the column lookup below fail; report missing scores instead of erroring.
  if (n_clusters < 2L || n_clusters >= n_obs) {
    return(c(silhouette = NA_real_, dunn = NA_real_))
  }

  # The distance matrix is the expensive part: build it once and reuse it.
  d <- dist(data)

  sil <- cluster::silhouette(cluster_result, d)
  dunn <- fpc::cluster.stats(d, cluster_result)$dunn

  c(silhouette = mean(sil[, "sil_width"]), dunn = dunn)
}
# Score every method on the 500-row evaluation subset.

# KMeans
eval_kmeans <- evaluate_cluster(df_eval, kmeans_eval[["cluster"]])

# KMedian
eval_kmedian <- evaluate_cluster(df_eval, clusters(kmedian_eval))

# DBSCAN: label 0 is recoded to NA and those rows are left out, so this
# method is scored only on the rows that received a cluster label.
db_cluster <- dbscan_eval[["cluster"]]
is.na(db_cluster) <- db_cluster == 0
eval_dbscan <- local({
  is_clustered <- !is.na(db_cluster)
  evaluate_cluster(df_eval[is_clustered, ], db_cluster[is_clustered])
})

# MeanShift
eval_meanshift <- evaluate_cluster(df_eval, meanshift_eval[["assignment"]])

# FCM
eval_fcm <- evaluate_cluster(df_eval, fcm_eval[["cluster"]])

# One row per method, one column per validity index.
results <- data.frame(
  Method = c("KMeans", "KMedian", "DBSCAN", "MeanShift", "FCM"),
  Silhouette = c(
    eval_kmeans[["silhouette"]],
    eval_kmedian[["silhouette"]],
    eval_dbscan[["silhouette"]],
    eval_meanshift[["silhouette"]],
    eval_fcm[["silhouette"]]
  ),
  Dunn = c(
    eval_kmeans[["dunn"]],
    eval_kmedian[["dunn"]],
    eval_dbscan[["dunn"]],
    eval_meanshift[["dunn"]],
    eval_fcm[["dunn"]]
  )
)
print(results)
## Method Silhouette Dunn
## 1 KMeans 0.28067439 0.05208664
## 2 KMedian 0.26942319 0.04399881
## 3 DBSCAN 0.16236248 0.15919198
## 4 MeanShift -0.06771737 0.06588645
## 5 FCM 0.26789887 0.03192598
# Row of the comparison table with the highest mean silhouette width.
best_method <- results[which.max(results[["Silhouette"]]), ]
print(best_method)
## Method Silhouette Dunn
## 1 KMeans 0.2806744 0.05208664
# Attach the labels of the full-data K-Means fit (kmeans_res) to the raw
# records, then count the members of each cluster.
data[["Cluster"]] <- kmeans_res[["cluster"]]
table(data[["Cluster"]])
##
## 1 2 3 4 5 6 7
## 540 2481 3157 1769 3112 520 2032
# Mean of every numeric feature within each K-Means cluster.
# Only numeric columns are averaged: taking the mean of the character column
# `Class` yields nothing but NA (plus a warning per group), and
# summarise_all() is superseded by across().
cluster_summary <- data %>%
  group_by(Cluster) %>%
  summarise(across(where(is.numeric), mean))
print(cluster_summary)
## # A tibble: 7 × 17
## Cluster Area Perimeter MajorAxisLength MinorAxisLength AspectRation
## <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 65216. 1004. 386. 217. 1.79
## 2 2 74024. 1061. 394. 240. 1.65
## 3 3 31541. 658. 243. 165. 1.48
## 4 4 53677. 920. 374. 183. 2.04
## 5 5 44437. 794. 298. 190. 1.57
## 6 6 173668. 1586. 594. 375. 1.59
## 7 7 39393. 722. 249. 201. 1.24
## # ℹ 11 more variables: Eccentricity <dbl>, ConvexArea <dbl>,
## # EquivDiameter <dbl>, Extent <dbl>, Solidity <dbl>, roundness <dbl>,
## # Compactness <dbl>, ShapeFactor1 <dbl>, ShapeFactor2 <dbl>,
## # ShapeFactor3 <dbl>, ShapeFactor4 <dbl>
library(ggplot2)
# Project the standardized features onto the first two principal components
# and carry the K-Means label along as a factor for colouring.
pca <- prcomp(df_scaled)
pca_data <- data.frame(pca$x[, 1:2], Cluster = as.factor(data$Cluster))

# Scatter plot of the clusters in PCA space.
ggplot(data = pca_data, mapping = aes(x = PC1, y = PC2, colour = Cluster)) +
  geom_point() +
  ggtitle("Cluster KMeans")

# Number of records per cluster.
ggplot(data = data, mapping = aes(x = as.factor(Cluster))) +
  geom_bar(fill = "skyblue") +
  ggtitle("Distribusi Cluster")

# Distribution of Area within each cluster.
ggplot(data = data, mapping = aes(x = as.factor(Cluster), y = Area)) +
  geom_boxplot() +
  ggtitle("Area per Cluster")

# Distribution of Perimeter within each cluster.
ggplot(data = data, mapping = aes(x = as.factor(Cluster), y = Perimeter)) +
  geom_boxplot() +
  ggtitle("Perimeter per Cluster")