Penelitian ini menerapkan lima metode analisis clustering — K-Means, K-Medians (diimplementasikan dengan PAM, yaitu K-Medoids), DBSCAN, Mean Shift, dan Fuzzy C-Means — pada White Wine Quality Dataset dari UCI Machine Learning Repository. Dataset terdiri dari 4.898 sampel wine putih dengan 11 fitur kimiawi numerik.
Tujuan utama adalah mengelompokkan wine berdasarkan karakteristik kimia tanpa mempertimbangkan label kualitas, kemudian mengevaluasi korelasi antara cluster yang terbentuk dengan skor kualitas sebagai validasi eksternal.
# Packages used by the analysis; they are attached in this order.
packages <- c(
  "tidyverse",   # data manipulation & ggplot2
  "factoextra",  # clustering visualisation
  "cluster",     # silhouette, pam (k-medoids; labelled "K-Medians" in this report)
  "fpc",         # DBSCAN
  "meanShiftR",  # Mean Shift
  "e1071",       # Fuzzy C-Means & skewness/kurtosis
  "ggcorrplot",  # ggplot-based correlation plot
  "knitr",       # tidy tables
  "kableExtra",  # HTML table styling
  "dbscan"       # DBSCAN (more stable)
)

# Install only what is missing from the local library.
installed_pkgs <- rownames(installed.packages())
to_install <- packages[!packages %in% installed_pkgs]
if (length(to_install) > 0) {
  install.packages(to_install, repos = "https://cran.rstudio.com/")
}

# library() returns the attached-package vector for every call; invisible()
# keeps those ten listings out of the rendered report.
invisible(lapply(packages, library, character.only = TRUE))
# Read the semicolon-separated wine file; its first row holds the column names.
df_raw <- read.csv(file = "winequality-white.csv", header = TRUE, sep = ";")
cat("Dimensi data:", nrow(df_raw), "baris x", ncol(df_raw), "kolom\n")
## Dimensi data: 4898 baris x 12 kolom
## Nama kolom : fixed.acidity, volatile.acidity, citric.acid, residual.sugar, chlorides, free.sulfur.dioxide, total.sulfur.dioxide, density, pH, sulphates, alcohol, quality

# Keep quality aside for validation and leave it out of the clustering features.
quality_label <- df_raw[["quality"]]
df <- select(df_raw, -quality)
cat("\nFitur clustering (", ncol(df), "fitur):\n")
##
## Fitur clustering ( 11 fitur):
## fixed.acidity, volatile.acidity, citric.acid, residual.sugar, chlorides, free.sulfur.dioxide, total.sulfur.dioxide, density, pH, sulphates, alcohol
# Descriptive statistics: one row per feature, one column per statistic.
# across() names each result "<feature>_<Statistik>", which is split apart below.
# NOTE(review): e1071::kurtosis appears to report excess kurtosis (alcohol comes
# out negative in the rendered table) — confirm against the e1071 documentation.
desc_stats <- df %>%
  summarise(across(everything(), list(
    Mean     = ~round(mean(.), 3),
    Median   = ~round(median(.), 3),
    SD       = ~round(sd(.), 3),
    Min      = ~round(min(.), 3),
    Max      = ~round(max(.), 3),
    Skewness = ~round(e1071::skewness(.), 3),
    Kurtosis = ~round(e1071::kurtosis(.), 3)
  ))) %>%
  # The lookahead splits on the last underscore only, so the statistic name is
  # always the final piece.
  pivot_longer(everything(),
               names_to = c("Variabel", "Statistik"),
               names_sep = "_(?=[^_]+$)") %>%
  pivot_wider(names_from = Statistik, values_from = value)

# Render Tabel 1 with a dark header row (row 0 is the header).
desc_stats %>%
  kable(caption = "Tabel 1. Statistik Deskriptif Fitur White Wine Quality Dataset",
        align = c("l", rep("r", 7))) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = TRUE) %>%
  row_spec(0, bold = TRUE, background = "#1F4E79", color = "white")

| Variabel | Mean | Median | SD | Min | Max | Skewness | Kurtosis |
|---|---|---|---|---|---|---|---|
| fixed.acidity | 6.855 | 6.800 | 0.844 | 3.800 | 14.200 | 0.647 | 2.167 |
| volatile.acidity | 0.278 | 0.260 | 0.101 | 0.080 | 1.100 | 1.576 | 5.082 |
| citric.acid | 0.334 | 0.320 | 0.121 | 0.000 | 1.660 | 1.281 | 6.164 |
| residual.sugar | 6.391 | 5.200 | 5.072 | 0.600 | 65.800 | 1.076 | 3.462 |
| chlorides | 0.046 | 0.043 | 0.022 | 0.009 | 0.346 | 5.020 | 37.508 |
| free.sulfur.dioxide | 35.308 | 34.000 | 17.007 | 2.000 | 289.000 | 1.406 | 11.448 |
| total.sulfur.dioxide | 138.361 | 134.000 | 42.498 | 9.000 | 440.000 | 0.390 | 0.569 |
| density | 0.994 | 0.994 | 0.003 | 0.987 | 1.039 | 0.977 | 9.777 |
| pH | 3.188 | 3.180 | 0.151 | 2.720 | 3.820 | 0.458 | 0.528 |
| sulphates | 0.490 | 0.470 | 0.114 | 0.220 | 1.080 | 0.977 | 1.586 |
| alcohol | 10.514 | 10.400 | 1.231 | 8.000 | 14.200 | 0.487 | -0.700 |
## Total missing values: 0
## Baris duplikat : 937
# Long format (one row per feature/value pair) so every feature gets its own facet.
df_long <- df %>%
  pivot_longer(everything(), names_to = "variable", values_to = "value")

# Gambar 1: histogram per feature, each facet on its own axis scales.
ggplot(df_long, aes(x = value, fill = variable)) +
  geom_histogram(bins = 40, color = "white", alpha = 0.85) +
  facet_wrap(~variable, scales = "free", ncol = 3) +
  theme_minimal(base_size = 11) +
  theme(legend.position = "none",
        strip.text = element_text(face = "bold")) +
  labs(title = "Gambar 1. Distribusi Setiap Fitur",
       x = NULL, y = "Frekuensi")

# Gambar 2: boxplot per feature; the x labels are hidden because the facet
# strip already names the variable.
ggplot(df_long, aes(x = variable, y = value, fill = variable)) +
  geom_boxplot(outlier.size = 0.5, outlier.alpha = 0.4, alpha = 0.8) +
  facet_wrap(~variable, scales = "free", ncol = 3) +
  theme_minimal(base_size = 11) +
  theme(legend.position = "none",
        axis.text.x = element_blank(),
        strip.text = element_text(face = "bold")) +
  labs(title = "Gambar 2. Boxplot Setiap Fitur (Deteksi Outlier)",
       x = NULL, y = NULL)

# Count values outside the 1.5 * IQR fences for every feature.
outlier_df <- df %>%
  summarise(across(everything(), ~{
    q1 <- quantile(., 0.25)
    q3 <- quantile(., 0.75)
    fence <- 1.5 * (q3 - q1)
    sum(. < (q1 - fence) | . > (q3 + fence))
  })) %>%
  pivot_longer(everything(),
               names_to = "Variabel",
               values_to = "Jumlah Outlier") %>%
  mutate(`Persentase (%)` = round(`Jumlah Outlier` / nrow(df) * 100, 2))

outlier_df %>%
  kable(caption = "Tabel 2. Jumlah Outlier Per Fitur (Metode IQR)",
        align = c("l", "r", "r")) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE) %>%
  row_spec(0, bold = TRUE, background = "#1F4E79", color = "white")

| Variabel | Jumlah Outlier | Persentase (%) |
|---|---|---|
| fixed.acidity | 119 | 2.43 |
| volatile.acidity | 186 | 3.80 |
| citric.acid | 270 | 5.51 |
| residual.sugar | 7 | 0.14 |
| chlorides | 208 | 4.25 |
| free.sulfur.dioxide | 50 | 1.02 |
| total.sulfur.dioxide | 19 | 0.39 |
| density | 5 | 0.10 |
| pH | 75 | 1.53 |
| sulphates | 124 | 2.53 |
| alcohol | 0 | 0.00 |
# Gambar 3: Pearson correlation between the clustering features (lower triangle).
cor_matrix <- cor(df, method = "pearson")
ggcorrplot(cor_matrix,
           method = "square",
           type = "lower",
           lab = TRUE,
           lab_size = 2.8,
           colors = c("#6D9EC1", "white", "#E46726"),
           title = "Gambar 3. Matriks Korelasi Antar Fitur",
           ggtheme = theme_minimal())

# Gambar 4: number of samples per quality score, with the count above each bar.
data.frame(quality = quality_label) %>%
  ggplot(aes(x = factor(quality), fill = factor(quality))) +
  geom_bar(color = "white", alpha = 0.85) +
  geom_text(stat = "count", aes(label = after_stat(count)),
            vjust = -0.5, size = 3.5) +
  scale_fill_brewer(palette = "RdYlGn") +
  theme_minimal(base_size = 12) +
  theme(legend.position = "none") +
  labs(title = "Gambar 4. Distribusi Skor Quality Wine",
       x = "Quality Score", y = "Jumlah Sampel")

# Z-score standardisation; scale() returns a matrix with the same column names.
df_scaled <- scale(df)

# Tabel 3: check that every column now has mean 0 and SD 1.
# row.names = NULL drops the names carried over from colMeans(); otherwise
# kable prints the variable name twice and the three align entries no longer
# match the number of columns.
data.frame(
  Variabel = colnames(df_scaled),
  Mean_Setelah = round(colMeans(df_scaled), 5),
  SD_Setelah = round(apply(df_scaled, 2, sd), 5),
  row.names = NULL
) %>%
  kable(caption = "Tabel 3. Verifikasi Standardisasi Z-Score",
        align = c("l", "r", "r")) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE) %>%
  row_spec(0, bold = TRUE, background = "#1F4E79", color = "white")

| Variabel | Mean_Setelah | SD_Setelah |
|---|---|---|
| fixed.acidity | 0 | 1 |
| volatile.acidity | 0 | 1 |
| citric.acid | 0 | 1 |
| residual.sugar | 0 | 1 |
| chlorides | 0 | 1 |
| free.sulfur.dioxide | 0 | 1 |
| total.sulfur.dioxide | 0 | 1 |
| density | 0 | 1 |
| pH | 0 | 1 |
| sulphates | 0 | 1 |
| alcohol | 0 | 1 |
# Three diagnostics for the number of clusters. The seed is reset before each
# one so every plot is reproducible on its own.

# Gambar 5: elbow method (total within-cluster sum of squares for k = 1..10).
set.seed(42)
fviz_nbclust(df_scaled, kmeans, method = "wss", k.max = 10) +
  labs(title = "Gambar 5. Elbow Method — Penentuan K Optimal") +
  theme_minimal()

# Gambar 6: average silhouette width for k = 1..10.
set.seed(42)
fviz_nbclust(df_scaled, kmeans, method = "silhouette", k.max = 10) +
  labs(title = "Gambar 6. Silhouette Method — Penentuan K Optimal") +
  theme_minimal()

# Gambar 7: gap statistic with B = 50 reference sets. The clustering function
# is passed by its full argument name (FUNcluster) instead of relying on
# partial matching of `FUN`; nstart = 25 is forwarded to kmeans().
set.seed(42)
gap_stat <- clusGap(df_scaled, FUNcluster = kmeans, K.max = 8, B = 50, nstart = 25)
fviz_gap_stat(gap_stat) +
  labs(title = "Gambar 7. Gap Statistic — Penentuan K Optimal") +
  theme_minimal()
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = df_scaled, FUNcluster = kmeans, K.max = 8, B = 50, nstart = 25)
## B=50 simulated reference sets, k = 1..8; spaceH0="scaledPCA"
## --> Number of clusters (method 'firstmax'): 2
## logW E.logW gap SE.sim
## [1,] 8.598970 9.887272 1.288302 0.002749074
## [2,] 8.475417 9.803830 1.328413 0.002513543
## [3,] 8.429626 9.754108 1.324482 0.002489185
## [4,] 8.399495 9.717759 1.318264 0.002615777
## [5,] 8.364682 9.696385 1.331703 0.002506576
## [6,] 8.338565 9.676544 1.337979 0.002439820
## [7,] 8.314844 9.659474 1.344630 0.002313483
## [8,] 8.294947 9.643047 1.348099 0.002291627

# K_OPTIMAL is used by every later chunk, but its assignment is not part of
# this listing; the rendered output below reports 3. Define it only when it is
# not already set, so an existing definition elsewhere keeps precedence.
if (!exists("K_OPTIMAL")) {
  K_OPTIMAL <- 3
}
## K optimal yang dipilih: 3
# K-Means with K_OPTIMAL centres; 25 random starts, at most 100 iterations each.
set.seed(42)
km_result <- kmeans(df_scaled, centers = K_OPTIMAL, nstart = 25, iter.max = 100)
cat("Cluster sizes :", km_result$size, "\n")
## Cluster sizes : 1802 1628 1468
## Total WSS : 39055.05
## Between SS / Total SS : 27.5 %

# Silhouette widths on the standardised data; column 3 holds the per-point width.
sil_km <- silhouette(km_result$cluster, dist(df_scaled))
cat("Silhouette Score :", round(mean(sil_km[, 3]), 4), "\n")
## Silhouette Score : 0.1447
## cluster size ave.sil.width
## 1 1 1802 0.17
## 2 2 1628 0.12
## 3 3 1468 0.13

# Gambar 9: cluster scatter plot with convex hulls.
fviz_cluster(km_result, data = df_scaled,
             geom = "point", ellipse.type = "convex",
             palette = "jco", alpha = 0.4, pointsize = 0.8) +
  labs(title = paste("Gambar 9. Visualisasi Cluster K-Means (K =", K_OPTIMAL, ")")) +
  theme_minimal()

# Attach the cluster label and the held-out quality score to the unscaled
# features so the profile table is in original units.
df_km <- df %>% mutate(cluster = km_result$cluster, quality = quality_label)

# Tabel 4: mean of every feature (and of quality) per cluster.
df_km %>%
  group_by(cluster) %>%
  summarise(across(everything(), ~round(mean(.), 3)), .groups = "drop") %>%
  kable(caption = "Tabel 4. Profil Rata-rata Per Cluster (K-Means)",
        align = c("r", rep("r", ncol(df)+1))) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = TRUE) %>%
  row_spec(0, bold = TRUE, background = "#1F4E79", color = "white")

| cluster | fixed.acidity | volatile.acidity | citric.acid | residual.sugar | chlorides | free.sulfur.dioxide | total.sulfur.dioxide | density | pH | sulphates | alcohol | quality |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 6.960 | 0.282 | 0.363 | 11.124 | 0.055 | 46.246 | 172.124 | 0.997 | 3.155 | 0.497 | 9.488 | 5.608 |
| 2 | 6.221 | 0.277 | 0.288 | 3.374 | 0.040 | 31.543 | 122.018 | 0.992 | 3.307 | 0.516 | 11.161 | 6.155 |
| 3 | 7.429 | 0.274 | 0.350 | 3.928 | 0.041 | 26.057 | 115.040 | 0.993 | 3.096 | 0.452 | 11.057 | 5.903 |
# PAM (Partitioning Around Medoids, i.e. k-medoids) with K_OPTIMAL clusters on
# Euclidean distance. The report labels this method "K-Medians"; the labels in
# the runtime strings below are kept as they are.
set.seed(42)
pam_result <- pam(df_scaled, k = K_OPTIMAL, metric = "euclidean")
cat("Cluster sizes :", pam_result$clusinfo[, "size"], "\n")
## Cluster sizes : 1702 1548 1648

# Silhouette widths on the standardised data; column 3 holds the per-point width.
sil_pam <- silhouette(pam_result$clustering, dist(df_scaled))
cat("Silhouette Score :", round(mean(sil_pam[, 3]), 4), "\n")
## Silhouette Score : 0.125

# Gambar 10: silhouette plot per cluster.
fviz_silhouette(sil_pam) +
  labs(title = "Gambar 10. Silhouette Plot — K-Medians (PAM)") +
  theme_minimal()
## cluster size ave.sil.width
## 1 1 1702 0.15
## 2 2 1548 0.06
## 3 3 1648 0.15

# Gambar 11: cluster scatter plot with convex hulls.
fviz_cluster(pam_result, data = df_scaled,
             geom = "point", ellipse.type = "convex",
             palette = "jco", alpha = 0.4, pointsize = 0.8) +
  labs(title = paste("Gambar 11. Visualisasi Cluster K-Medians/PAM (K =", K_OPTIMAL, ")")) +
  theme_minimal()

# minPts for the DBSCAN steps that follow.
minPts_val <- 5
# k-nearest-neighbour distances, sorted ascending, to judge a suitable eps.
# kNNdist() may return a matrix (one column per neighbour); in that case only
# the last column (the k-th neighbour) is used.
knn_dist <- dbscan::kNNdist(df_scaled, k = minPts_val)
if (is.matrix(knn_dist)) {
  knn_sorted <- sort(knn_dist[, ncol(knn_dist)])
} else {
  knn_sorted <- sort(knn_dist)
}

# eps is defined before the plot so the reference line, the legend and the
# DBSCAN fit all use the same value instead of a repeated literal.
eps_val <- 1.5

# Gambar 12: kNN distance plot. par() is restored afterwards so the margin
# change does not leak into later base-graphics plots.
old_par <- par(mar = c(4, 4, 2, 1))
plot(knn_sorted, type = "l",
     main = "Gambar 12. kNN Distance Plot (Tuning eps DBSCAN)",
     xlab = "Titik (diurutkan)",
     ylab = paste0(minPts_val, "-NN Distance"),
     col = "steelblue", lwd = 1.5)
abline(h = eps_val, col = "red", lty = 2)
legend("topleft", legend = paste0("eps = ", eps_val), col = "red", lty = 2, bty = "n")
par(old_par)

db_result <- dbscan::dbscan(df_scaled, eps = eps_val, minPts = minPts_val)
cat("Distribusi cluster DBSCAN:\n")
## Distribusi cluster DBSCAN:
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## 957 3854 5 5 6 10 5 7 6 7 7 12 7 5 5
## Noise points : 957
## Jumlah cluster: 14

# Cluster 0 is noise. The silhouette is computed on non-noise points only, and
# only when at least two clusters remain; sil_db therefore may not exist.
non_noise <- db_result$cluster != 0
if (sum(non_noise) > 1 && length(unique(db_result$cluster[non_noise])) > 1) {
  sil_db <- silhouette(db_result$cluster[non_noise],
                       dist(df_scaled[non_noise, , drop = FALSE]))
  cat("Silhouette Score (non-noise):", round(mean(sil_db[, 3]), 4), "\n")
}
## Silhouette Score (non-noise): -0.1485

# Gambar 13: cluster scatter plot without hulls.
fviz_cluster(db_result, data = df_scaled,
             geom = "point", palette = "jco",
             alpha = 0.4, pointsize = 0.8, ellipse = FALSE) +
  labs(title = paste0("Gambar 13. Visualisasi Cluster DBSCAN (eps=",
                      eps_val, ", minPts=", minPts_val, ")")) +
  theme_minimal()

set.seed(42)
# Mean Shift is run on a random subset of n_ms rows of the standardised data.
n_ms <- 1000
idx_ms <- sample(nrow(df_scaled), n_ms)
df_ms_sub <- df_scaled[idx_ms, ]

# Same bandwidth (1.5) for every feature.
ms_result <- meanShiftR::meanShift(
  df_ms_sub,
  nNeighbors = 100,
  algorithm = "LINEAR",
  bandwidth = rep(1.5, ncol(df_ms_sub))
)
ms_labels <- ms_result$assignment
cat("Distribusi cluster Mean Shift (subset n =", n_ms, "):\n")
## Distribusi cluster Mean Shift (subset n = 1000 ):
## ms_labels
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 32 85 27 163 31 358 14 23 27 7 20 14 11 1 37 12 39 30 1 4
## 21 22 23 24 25 26
## 32 5 20 1 5 1
## Jumlah cluster: 26

# A silhouette needs at least two clusters; it is computed on the subset only.
if (length(unique(ms_labels)) > 1) {
  sil_ms <- silhouette(ms_labels, dist(df_ms_sub))
  cat("Silhouette Score:", round(mean(sil_ms[, 3]), 4), "\n")
}
## Silhouette Score: -0.0187

# Gambar 15: first two principal components of the subset, coloured by cluster.
# The data is already standardised, so prcomp() neither centres nor scales it.
pca_ms <- prcomp(df_ms_sub, center = FALSE, scale. = FALSE)
data.frame(PC1 = pca_ms$x[, 1],
           PC2 = pca_ms$x[, 2],
           cluster = factor(ms_labels)) %>%
  ggplot(aes(x = PC1, y = PC2, color = cluster)) +
  geom_point(alpha = 0.5, size = 0.9) +
  stat_ellipse(aes(group = cluster), type = "norm", level = 0.95) +
  # Brewer "Set1" provides at most 9 colours, but Mean Shift returned 26
  # clusters here; a viridis scale has no such limit, so every cluster is drawn.
  scale_color_viridis_d() +
  labs(title = paste0("Gambar 15. Visualisasi Cluster Mean Shift (subset n=", n_ms, ")"),
       color = "Cluster") +
  theme_minimal()

set.seed(42)
# Fuzzy C-Means with K_OPTIMAL centres and fuzzifier m = 2.
fcm_result <- e1071::cmeans(df_scaled,
                            centers = K_OPTIMAL,
                            iter.max = 100,
                            m = 2,
                            method = "cmeans")
cat("Cluster sizes (hard assignment):\n")
## Cluster sizes (hard assignment):
##
## 1 2 3
## 949 2027 1922
## Nilai objektif: 3.6409

# Fuzziness indices computed from the membership matrix:
#   PC = mean over rows of the sum of squared memberships,
#   PE = mean over rows of -sum(u * log(u)); 1e-10 guards against log(0).
PC <- sum(fcm_result$membership^2) / nrow(df_scaled)
PE <- -sum(fcm_result$membership * log(fcm_result$membership + 1e-10)) / nrow(df_scaled)
cat("\nPartition Coefficient (PC):", round(PC, 4), " [1=hard, 1/K=fully fuzzy]\n")
##
## Partition Coefficient (PC): 0.365 [1=hard, 1/K=fully fuzzy]
## Partition Entropy (PE): 1.05 [0=hard, log(K)=fully fuzzy]

# Silhouette on the hard (maximum-membership) assignment.
sil_fcm <- silhouette(fcm_result$cluster, dist(df_scaled))
cat("Silhouette Score :", round(mean(sil_fcm[, 3]), 4), "\n")
## Silhouette Score : 0.0986

# Tabel 5: membership degrees of the first five observations.
round(head(fcm_result$membership, 5), 4) %>%
  as.data.frame() %>%
  kable(caption = "Tabel 5. Contoh Membership Matrix FCM (5 Baris Pertama)",
        align = "r") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE) %>%
  row_spec(0, bold = TRUE, background = "#1F4E79", color = "white")

| 1 | 2 | 3 |
|---|---|---|
| 0.2310 | 0.2302 | 0.5388 |
| 0.4008 | 0.4004 | 0.1988 |
| 0.3599 | 0.3587 | 0.2815 |
| 0.1843 | 0.1831 | 0.6327 |
| 0.1843 | 0.1831 | 0.6327 |

# Gambar 16: silhouette plot per cluster.
fviz_silhouette(sil_fcm) +
  labs(title = "Gambar 16. Silhouette Plot — Fuzzy C-Means") +
  theme_minimal()
## cluster size ave.sil.width
## 1 1 949 0.07
## 2 2 2027 0.11
## 3 3 1922 0.10
# Gambar 17: first two principal components of the full standardised data,
# coloured by the hard Fuzzy C-Means assignment.
pca_all <- prcomp(df_scaled, center = FALSE, scale. = FALSE)
data.frame(PC1 = pca_all$x[, 1],
           PC2 = pca_all$x[, 2],
           cluster = factor(fcm_result$cluster)) %>%
  ggplot(aes(x = PC1, y = PC2, color = cluster)) +
  geom_point(alpha = 0.35, size = 0.7) +
  stat_ellipse(aes(group = cluster), type = "norm", level = 0.95) +
  scale_color_brewer(palette = "Set1") +
  labs(title = paste("Gambar 17. Visualisasi Cluster Fuzzy C-Means (K =", K_OPTIMAL, ")"),
       color = "Cluster") +
  theme_minimal()

# Mean silhouette width per method. DBSCAN is added only when its silhouette
# object exists, Mean Shift only when it produced more than one cluster.
# Note: the Mean Shift value is computed on the df_ms_sub subset and the DBSCAN
# value on non-noise points, while the other three use the full data.
sil_scores <- c(
  "K-Means" = round(mean(sil_km[, 3]), 4),
  "K-Medians (PAM)" = round(mean(sil_pam[, 3]), 4),
  "Fuzzy C-Means" = round(mean(sil_fcm[, 3]), 4)
)
if (exists("sil_db")) {
  sil_scores["DBSCAN"] <- round(mean(sil_db[, 3]), 4)
}
if (length(unique(ms_labels)) > 1) {
  sil_ms2 <- silhouette(ms_labels, dist(df_ms_sub))
  sil_scores["Mean Shift"] <- round(mean(sil_ms2[, 3]), 4)
}
comparison_df <- data.frame(
  Metode = names(sil_scores),
  Silhouette = sil_scores,
  row.names = NULL
)

# Tabel 6: methods sorted from best to worst; the first data row is highlighted.
comparison_df %>%
  arrange(desc(Silhouette)) %>%
  kable(caption = "Tabel 6. Perbandingan Silhouette Score Antar Metode",
        align = c("l", "r")) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE) %>%
  row_spec(0, bold = TRUE, background = "#1F4E79", color = "white") %>%
  row_spec(1, bold = TRUE, background = "#E8F5E9")

| Metode | Silhouette |
|---|---|
| K-Means | 0.1447 |
| K-Medians (PAM) | 0.1250 |
| Fuzzy C-Means | 0.0986 |
| Mean Shift | -0.0187 |
| DBSCAN | -0.1485 |

# Gambar 18: horizontal bar chart of the same scores.
comparison_df %>%
  ggplot(aes(x = reorder(Metode, Silhouette), y = Silhouette, fill = Metode)) +
  geom_col(width = 0.6, alpha = 0.85, color = "white") +
  # Put each label beyond the end of its bar: to the right for non-negative
  # scores, to the left for negative ones.
  geom_text(aes(label = round(Silhouette, 4),
                hjust = ifelse(Silhouette >= 0, -0.2, 1.2)),
            size = 4) +
  coord_flip() +
  scale_fill_brewer(palette = "Set2") +
  # Silhouette scores can be negative. A lower limit fixed at 0 removes those
  # bars from the plot, so the limits span from the smallest to the largest
  # score (always including 0) with 30% headroom for the labels.
  scale_y_continuous(limits = range(0, comparison_df$Silhouette) * 1.3) +
  theme_minimal(base_size = 12) +
  theme(legend.position = "none") +
  labs(title = "Gambar 18. Perbandingan Silhouette Score Antar Metode",
       x = "Metode Clustering", y = "Rata-rata Silhouette Score")
## Tabel kontingensi K-Means vs Quality:
## Quality
## Cluster 3 4 5 6 7 8 9
## 1 11 42 785 796 141 27 0
## 2 2 52 290 730 459 92 3
## 3 7 69 382 672 280 56 2
# Gambar 19: distribution of the held-out quality score within each K-Means cluster.
df_km %>%
  ggplot(aes(x = factor(cluster), y = quality, fill = factor(cluster))) +
  geom_boxplot(alpha = 0.75, outlier.size = 0.5) +
  scale_fill_brewer(palette = "Set1") +
  theme_minimal() +
  theme(legend.position = "none") +
  labs(title = "Gambar 19. Distribusi Quality per Cluster (K-Means)",
       x = "Cluster", y = "Quality Score")

# Append the cluster labels of the four methods fitted on the full data to the
# raw table. Mean Shift is not included here; it was fitted on a subset only.
df_hasil <- df_raw %>%
  mutate(
    cluster_kmeans = km_result$cluster,
    cluster_pam = pam_result$clustering,
    cluster_dbscan = db_result$cluster,
    cluster_fcm = fcm_result$cluster
  )

# One variable for the output path so the file written and the message agree.
out_path <- "wine_clustering_results.csv"
write.csv(df_hasil, out_path, row.names = FALSE)
cat("Hasil clustering disimpan ke: ", out_path, "\n", sep = "")
## Hasil clustering disimpan ke: wine_clustering_results.csv
chlorides (4.37%) dan distribusi yang right-skewed pada
beberapa fitur.

Kode ini dipublikasikan di RPubs sebagai bagian dari tugas Analisis Multivariat.