1 Pendahuluan

Penelitian ini menerapkan lima metode analisis clustering — K-Means, K-Medians (diimplementasikan dengan PAM, yaitu algoritma K-Medoids), DBSCAN, Mean Shift, dan Fuzzy C-Means — pada White Wine Quality Dataset dari UCI Machine Learning Repository. Dataset terdiri dari 4.898 sampel wine putih dengan 11 fitur kimiawi numerik.

Tujuan utama adalah mengelompokkan wine berdasarkan karakteristik kimia tanpa mempertimbangkan label kualitas, kemudian mengevaluasi korelasi antara cluster yang terbentuk dengan skor kualitas sebagai validasi eksternal.


2 Instalasi & Load Package

# Packages required by this analysis; the trailing comment on each entry notes
# what it is used for.
packages <- c(
  "tidyverse",   # data manipulation & ggplot2
  "factoextra",  # clustering visualisation
  "cluster",     # silhouette, pam (k-medians)
  "fpc",         # DBSCAN
  "meanShiftR",  # Mean Shift
  "e1071",       # Fuzzy C-Means & skewness/kurtosis
  "ggcorrplot",  # ggplot-based correlation plot
  "knitr",       # tidy tables
  "kableExtra",  # HTML table styling
  "dbscan"       # DBSCAN (more stable)
)

# Install only the packages that are not present in the local library.
installed_pkgs <- rownames(installed.packages())
to_install <- packages[!packages %in% installed_pkgs]
if (length(to_install) > 0) {
  install.packages(to_install, repos = "https://cran.rstudio.com/")
}

# Attach every package. library() returns the current search path, so the
# list produced by lapply() is wrapped in invisible() to keep it from being
# auto-printed once per package.
invisible(lapply(packages, library, character.only = TRUE))
## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "factoextra" "lubridate"  "forcats"    "stringr"    "dplyr"     
##  [6] "purrr"      "readr"      "tidyr"      "tibble"     "ggplot2"   
## [11] "tidyverse"  "stats"      "graphics"   "grDevices"  "utils"     
## [16] "datasets"   "methods"    "base"      
## 
## [[3]]
##  [1] "cluster"    "factoextra" "lubridate"  "forcats"    "stringr"   
##  [6] "dplyr"      "purrr"      "readr"      "tidyr"      "tibble"    
## [11] "ggplot2"    "tidyverse"  "stats"      "graphics"   "grDevices" 
## [16] "utils"      "datasets"   "methods"    "base"      
## 
## [[4]]
##  [1] "fpc"        "cluster"    "factoextra" "lubridate"  "forcats"   
##  [6] "stringr"    "dplyr"      "purrr"      "readr"      "tidyr"     
## [11] "tibble"     "ggplot2"    "tidyverse"  "stats"      "graphics"  
## [16] "grDevices"  "utils"      "datasets"   "methods"    "base"      
## 
## [[5]]
##  [1] "meanShiftR" "fpc"        "cluster"    "factoextra" "lubridate" 
##  [6] "forcats"    "stringr"    "dplyr"      "purrr"      "readr"     
## [11] "tidyr"      "tibble"     "ggplot2"    "tidyverse"  "stats"     
## [16] "graphics"   "grDevices"  "utils"      "datasets"   "methods"   
## [21] "base"      
## 
## [[6]]
##  [1] "e1071"      "meanShiftR" "fpc"        "cluster"    "factoextra"
##  [6] "lubridate"  "forcats"    "stringr"    "dplyr"      "purrr"     
## [11] "readr"      "tidyr"      "tibble"     "ggplot2"    "tidyverse" 
## [16] "stats"      "graphics"   "grDevices"  "utils"      "datasets"  
## [21] "methods"    "base"      
## 
## [[7]]
##  [1] "ggcorrplot" "e1071"      "meanShiftR" "fpc"        "cluster"   
##  [6] "factoextra" "lubridate"  "forcats"    "stringr"    "dplyr"     
## [11] "purrr"      "readr"      "tidyr"      "tibble"     "ggplot2"   
## [16] "tidyverse"  "stats"      "graphics"   "grDevices"  "utils"     
## [21] "datasets"   "methods"    "base"      
## 
## [[8]]
##  [1] "knitr"      "ggcorrplot" "e1071"      "meanShiftR" "fpc"       
##  [6] "cluster"    "factoextra" "lubridate"  "forcats"    "stringr"   
## [11] "dplyr"      "purrr"      "readr"      "tidyr"      "tibble"    
## [16] "ggplot2"    "tidyverse"  "stats"      "graphics"   "grDevices" 
## [21] "utils"      "datasets"   "methods"    "base"      
## 
## [[9]]
##  [1] "kableExtra" "knitr"      "ggcorrplot" "e1071"      "meanShiftR"
##  [6] "fpc"        "cluster"    "factoextra" "lubridate"  "forcats"   
## [11] "stringr"    "dplyr"      "purrr"      "readr"      "tidyr"     
## [16] "tibble"     "ggplot2"    "tidyverse"  "stats"      "graphics"  
## [21] "grDevices"  "utils"      "datasets"   "methods"    "base"      
## 
## [[10]]
##  [1] "dbscan"     "kableExtra" "knitr"      "ggcorrplot" "e1071"     
##  [6] "meanShiftR" "fpc"        "cluster"    "factoextra" "lubridate" 
## [11] "forcats"    "stringr"    "dplyr"      "purrr"      "readr"     
## [16] "tidyr"      "tibble"     "ggplot2"    "tidyverse"  "stats"     
## [21] "graphics"   "grDevices"  "utils"      "datasets"   "methods"   
## [26] "base"

3 Load & Persiapan Data

# Read the semicolon-delimited data file; the first row holds column names.
df_raw <- read.csv(file = "winequality-white.csv", header = TRUE, sep = ";")

raw_dim <- dim(df_raw)
cat("Dimensi data:", raw_dim[1], "baris x", raw_dim[2], "kolom\n")
## Dimensi data: 4898 baris x 12 kolom
cat("Nama kolom :", toString(names(df_raw)), "\n")
## Nama kolom : fixed.acidity, volatile.acidity, citric.acid, residual.sugar, chlorides, free.sulfur.dioxide, total.sulfur.dioxide, density, pH, sulphates, alcohol, quality
# Keep the quality score aside for validation and drop it from the
# clustering features.
quality_label <- df_raw[["quality"]]
df <- select(df_raw, -quality)

cat("\nFitur clustering (", ncol(df), "fitur):\n")
## 
## Fitur clustering ( 11 fitur):
cat(toString(names(df)), "\n")
## fixed.acidity, volatile.acidity, citric.acid, residual.sugar, chlorides, free.sulfur.dioxide, total.sulfur.dioxide, density, pH, sulphates, alcohol

4 Eksplorasi Data

4.1 Statistik Deskriptif

# Summary functions applied to every feature; each result is rounded to three
# decimals. The list names become the suffix of the generated columns
# ("<feature>_<statistic>").
stat_funs <- list(
  Mean     = function(x) round(mean(x), 3),
  Median   = function(x) round(median(x), 3),
  SD       = function(x) round(sd(x), 3),
  Min      = function(x) round(min(x), 3),
  Max      = function(x) round(max(x), 3),
  Skewness = function(x) round(e1071::skewness(x), 3),
  Kurtosis = function(x) round(e1071::kurtosis(x), 3)
)

# One wide row of statistics, reshaped to one row per feature. The regex
# splits each column name at its last underscore.
desc_wide <- summarise(df, across(everything(), stat_funs))
desc_long <- pivot_longer(desc_wide, everything(),
                          names_to  = c("Variabel", "Statistik"),
                          names_sep = "_(?=[^_]+$)")
desc_stats <- pivot_wider(desc_long,
                          names_from = Statistik, values_from = value)

# Render the table with a dark header row.
desc_tbl <- kable(desc_stats,
                  caption = "Tabel 1. Statistik Deskriptif Fitur White Wine Quality Dataset",
                  align = c("l", rep("r", 7)))
desc_tbl <- kable_styling(desc_tbl,
                          bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                          full_width = TRUE)
row_spec(desc_tbl, 0, bold = TRUE, background = "#1F4E79", color = "white")
Tabel 1. Statistik Deskriptif Fitur White Wine Quality Dataset
Variabel Mean Median SD Min Max Skewness Kurtosis
fixed.acidity 6.855 6.800 0.844 3.800 14.200 0.647 2.167
volatile.acidity 0.278 0.260 0.101 0.080 1.100 1.576 5.082
citric.acid 0.334 0.320 0.121 0.000 1.660 1.281 6.164
residual.sugar 6.391 5.200 5.072 0.600 65.800 1.076 3.462
chlorides 0.046 0.043 0.022 0.009 0.346 5.020 37.508
free.sulfur.dioxide 35.308 34.000 17.007 2.000 289.000 1.406 11.448
total.sulfur.dioxide 138.361 134.000 42.498 9.000 440.000 0.390 0.569
density 0.994 0.994 0.003 0.987 1.039 0.977 9.777
pH 3.188 3.180 0.151 2.720 3.820 0.458 0.528
sulphates 0.490 0.470 0.114 0.220 1.080 0.977 1.586
alcohol 10.514 10.400 1.231 8.000 14.200 0.487 -0.700

4.2 Missing Values & Duplikat

# Missing cells per clustering feature; the total is reported below.
mv <- vapply(df, function(col) sum(is.na(col)), numeric(1))
cat("Total missing values:", sum(mv), "\n")
## Total missing values: 0
# Duplicates are counted on the raw table, i.e. including the quality column.
n_duplicates <- sum(duplicated(df_raw))
cat("Baris duplikat     :", n_duplicates, "\n")
## Baris duplikat     : 937

4.3 Distribusi Fitur (Histogram)

# Long format (one row per feature/value pair) so that ggplot can facet by
# feature.
df_long <- pivot_longer(df, cols = everything(),
                        names_to = "variable", values_to = "value")

# Shared look for the faceted histogram: no legend, bold facet titles.
hist_theme <- theme_minimal(base_size = 11) +
  theme(legend.position = "none",
        strip.text = element_text(face = "bold"))

ggplot(data = df_long, mapping = aes(x = value, fill = variable)) +
  geom_histogram(bins = 40, color = "white", alpha = 0.85) +
  facet_wrap(~variable, scales = "free", ncol = 3) +
  hist_theme +
  labs(title = "Gambar 1. Distribusi Setiap Fitur",
       x = NULL, y = "Frekuensi")

4.4 Boxplot & Deteksi Outlier

# One boxplot per feature, each facet on its own scale. The x-axis labels are
# blanked because the facet strip already names the feature.
box_theme <- theme_minimal(base_size = 11) +
  theme(legend.position = "none",
        axis.text.x = element_blank(),
        strip.text = element_text(face = "bold"))

ggplot(data = df_long,
       mapping = aes(x = variable, y = value, fill = variable)) +
  geom_boxplot(outlier.size = 0.5, outlier.alpha = 0.4, alpha = 0.8) +
  facet_wrap(~variable, scales = "free", ncol = 3) +
  box_theme +
  labs(title = "Gambar 2. Boxplot Setiap Fitur (Deteksi Outlier)",
       x = NULL, y = NULL)

# Number of values lying more than 1.5 * IQR below the first quartile or
# above the third quartile.
count_iqr_outliers <- function(x) {
  q <- quantile(x, c(0.25, 0.75))
  q1 <- q[[1]]
  q3 <- q[[2]]
  below <- x < (q1 - 1.5 * (q3 - q1))
  above <- x > (q3 + 1.5 * (q3 - q1))
  sum(below | above)
}

# One row per feature with the outlier count and its share of all rows.
outlier_counts <- summarise(df, across(everything(), count_iqr_outliers))
outlier_df <- pivot_longer(outlier_counts, everything(),
                           names_to  = "Variabel",
                           values_to = "Jumlah Outlier")
outlier_df <- mutate(outlier_df,
                     `Persentase (%)` = round(`Jumlah Outlier` / nrow(df) * 100, 2))

outlier_tbl <- kable(outlier_df,
                     caption = "Tabel 2. Jumlah Outlier Per Fitur (Metode IQR)",
                     align = c("l", "r", "r"))
outlier_tbl <- kable_styling(outlier_tbl,
                             bootstrap_options = c("striped", "hover", "condensed"),
                             full_width = FALSE)
row_spec(outlier_tbl, 0, bold = TRUE, background = "#1F4E79", color = "white")
Tabel 2. Jumlah Outlier Per Fitur (Metode IQR)
Variabel Jumlah Outlier Persentase (%)
fixed.acidity 119 2.43
volatile.acidity 186 3.80
citric.acid 270 5.51
residual.sugar 7 0.14
chlorides 208 4.25
free.sulfur.dioxide 50 1.02
total.sulfur.dioxide 19 0.39
density 5 0.10
pH 75 1.53
sulphates 124 2.53
alcohol 0 0.00

4.5 Matriks Korelasi

# Pairwise Pearson correlations between the clustering features.
cor_matrix <- cor(x = df, method = "pearson")

# Lower-triangle heat map with the coefficient printed in each cell.
corr_palette <- c("#6D9EC1", "white", "#E46726")
ggcorrplot(
  cor_matrix,
  method   = "square",
  type     = "lower",
  ggtheme  = theme_minimal(),
  title    = "Gambar 3. Matriks Korelasi Antar Fitur",
  colors   = corr_palette,
  lab      = TRUE,
  lab_size = 2.8
)

4.6 Distribusi Skor Quality

# Bar chart of how many samples received each quality score, with the count
# printed above every bar.
quality_df <- data.frame(quality = quality_label)

ggplot(quality_df, aes(x = factor(quality), fill = factor(quality))) +
  geom_bar(color = "white", alpha = 0.85) +
  geom_text(aes(label = after_stat(count)), stat = "count",
            size = 3.5, vjust = -0.5) +
  scale_fill_brewer(palette = "RdYlGn") +
  labs(title = "Gambar 4. Distribusi Skor Quality Wine",
       x = "Quality Score", y = "Jumlah Sampel") +
  theme_minimal(base_size = 12) +
  theme(legend.position = "none")


5 Pra-Pemrosesan: Standardisasi Z-Score

# Standardise every feature to zero mean and unit standard deviation.
df_scaled <- scale(df)

# Verification table: mean and SD of each column after scaling.
# row.names = NULL stops data.frame() from promoting the names carried by the
# colMeans() vector to row names; otherwise kable() prints them as an extra,
# unlabeled first column that duplicates `Variabel`.
data.frame(
  Variabel     = colnames(df_scaled),
  Mean_Setelah = round(colMeans(df_scaled), 5),
  SD_Setelah   = round(apply(df_scaled, 2, sd), 5),
  row.names    = NULL
) %>%
  kable(caption = "Tabel 3. Verifikasi Standardisasi Z-Score",
        align = c("l", "r", "r")) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE) %>%
  row_spec(0, bold = TRUE, background = "#1F4E79", color = "white")
Tabel 3. Verifikasi Standardisasi Z-Score
Variabel Mean_Setelah SD_Setelah
fixed.acidity fixed.acidity 0 1
volatile.acidity volatile.acidity 0 1
citric.acid citric.acid 0 1
residual.sugar residual.sugar 0 1
chlorides chlorides 0 1
free.sulfur.dioxide free.sulfur.dioxide 0 1
total.sulfur.dioxide total.sulfur.dioxide 0 1
density density 0 1
pH pH 0 1
sulphates sulphates 0 1
alcohol alcohol 0 1

6 Penentuan Jumlah Cluster Optimal

6.1 Elbow Method

# Elbow plot: total within-cluster sum of squares for k = 1..10.
# nstart and iter.max are forwarded to kmeans() so that each k is fitted with
# the same settings as the final K-Means model instead of a single random
# start, which can leave the curve stuck in a poor local optimum.
set.seed(42)
fviz_nbclust(df_scaled, kmeans, method = "wss", k.max = 10,
             nstart = 25, iter.max = 100) +
  labs(title = "Gambar 5. Elbow Method — Penentuan K Optimal") +
  theme_minimal()

6.2 Silhouette Method

# Average silhouette width for k = 1..10.
# nstart and iter.max are forwarded to kmeans() so that each k is fitted with
# the same settings as the final K-Means model instead of a single random
# start, which makes the curve sensitive to the initial centres.
set.seed(42)
fviz_nbclust(df_scaled, kmeans, method = "silhouette", k.max = 10,
             nstart = 25, iter.max = 100) +
  labs(title = "Gambar 6. Silhouette Method — Penentuan K Optimal") +
  theme_minimal()

6.3 Gap Statistic

# Gap statistic for k = 1..8 using 50 simulated reference sets; nstart is
# forwarded to kmeans(). The clustering function is passed under its full
# argument name (FUNcluster) rather than relying on partial matching of `FUN`.
set.seed(42)
gap_stat <- clusGap(df_scaled, FUNcluster = kmeans, K.max = 8, B = 50,
                    nstart = 25)

fviz_gap_stat(gap_stat) +
  labs(title = "Gambar 7. Gap Statistic — Penentuan K Optimal") +
  theme_minimal()

# Print the gap table together with the k chosen by the 'firstmax' rule.
print(gap_stat, method = "firstmax")
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = df_scaled, FUNcluster = kmeans, K.max = 8, B = 50, nstart = 25)
## B=50 simulated reference sets, k = 1..8; spaceH0="scaledPCA"
##  --> Number of clusters (method 'firstmax'): 2
##          logW   E.logW      gap      SE.sim
## [1,] 8.598970 9.887272 1.288302 0.002749074
## [2,] 8.475417 9.803830 1.328413 0.002513543
## [3,] 8.429626 9.754108 1.324482 0.002489185
## [4,] 8.399495 9.717759 1.318264 0.002615777
## [5,] 8.364682 9.696385 1.331703 0.002506576
## [6,] 8.338565 9.676544 1.337979 0.002439820
## [7,] 8.314844 9.659474 1.344630 0.002313483
## [8,] 8.294947 9.643047 1.348099 0.002291627
# Number of clusters passed to the K-Means, PAM and Fuzzy C-Means fits.
# NOTE(review): this value is set by hand, not derived from the diagnostics;
# the Gap Statistic output printed above reports 2 under the 'firstmax' rule.
K_OPTIMAL <- 3
cat("K optimal yang dipilih:", K_OPTIMAL, "\n")
## K optimal yang dipilih: 3

7 Analisis Clustering

7.1 K-Means

# Fit K-Means on the standardised features with 25 random starts; the seed
# makes the selected solution reproducible.
set.seed(42)
km_result <- kmeans(x = df_scaled, centers = K_OPTIMAL,
                    iter.max = 100, nstart = 25)

cat("Cluster sizes         :", km_result[["size"]], "\n")
## Cluster sizes         : 1802 1628 1468
km_total_wss <- round(km_result[["tot.withinss"]], 2)
cat("Total WSS             :", km_total_wss, "\n")
## Total WSS             : 39055.05
# Share of the total sum of squares explained by the partition, in percent.
km_between_pct <- with(km_result, betweenss / totss * 100)
cat("Between SS / Total SS :", round(km_between_pct, 2), "%\n")
## Between SS / Total SS : 27.5 %
# Silhouette widths on Euclidean distances between the standardised rows.
sil_km <- silhouette(km_result[["cluster"]], dist(df_scaled))
cat("Silhouette Score      :", round(mean(sil_km[, "sil_width"]), 4), "\n")
## Silhouette Score      : 0.1447
km_sil_plot <- fviz_silhouette(sil_km)
km_sil_plot +
  labs(title = "Gambar 8. Silhouette Plot — K-Means") +
  theme_minimal()
##   cluster size ave.sil.width
## 1       1 1802          0.17
## 2       2 1628          0.12
## 3       3 1468          0.13

# Scatter plot of the K-Means partition with a convex hull per cluster.
km_cluster_plot <- fviz_cluster(km_result, data = df_scaled,
                                geom = "point", ellipse.type = "convex",
                                palette = "jco", alpha = 0.4, pointsize = 0.8)
km_cluster_plot +
  labs(title = paste("Gambar 9. Visualisasi Cluster K-Means (K =", K_OPTIMAL, ")")) +
  theme_minimal()

# Original (unscaled) features plus the cluster label and the quality score.
df_km <- mutate(df, cluster = km_result$cluster, quality = quality_label)

# Per-cluster mean of every column, rounded to three decimals.
km_profile <- df_km %>%
  group_by(cluster) %>%
  summarise(across(everything(), function(x) round(mean(x), 3)),
            .groups = "drop")

km_profile_tbl <- kable(km_profile,
                        caption = "Tabel 4. Profil Rata-rata Per Cluster (K-Means)",
                        align = rep("r", ncol(df) + 2))
km_profile_tbl <- kable_styling(km_profile_tbl,
                                bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                                full_width = TRUE)
row_spec(km_profile_tbl, 0, bold = TRUE, background = "#1F4E79", color = "white")
Tabel 4. Profil Rata-rata Per Cluster (K-Means)
cluster fixed.acidity volatile.acidity citric.acid residual.sugar chlorides free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol quality
1 6.960 0.282 0.363 11.124 0.055 46.246 172.124 0.997 3.155 0.497 9.488 5.608
2 6.221 0.277 0.288 3.374 0.040 31.543 122.018 0.992 3.307 0.516 11.161 6.155
3 7.429 0.274 0.350 3.928 0.041 26.057 115.040 0.993 3.096 0.452 11.057 5.903

7.2 K-Medians (PAM)

# Partition the standardised features with pam() on Euclidean distances.
set.seed(42)
pam_result <- pam(x = df_scaled, k = K_OPTIMAL, metric = "euclidean")

pam_sizes <- pam_result[["clusinfo"]][, "size"]
cat("Cluster sizes    :", pam_sizes, "\n")
## Cluster sizes    : 1702 1548 1648
# Silhouette widths on Euclidean distances between the standardised rows.
sil_pam <- silhouette(pam_result[["clustering"]], dist(df_scaled))
cat("Silhouette Score :", round(mean(sil_pam[, "sil_width"]), 4), "\n")
## Silhouette Score : 0.125
pam_sil_plot <- fviz_silhouette(sil_pam)
pam_sil_plot +
  labs(title = "Gambar 10. Silhouette Plot — K-Medians (PAM)") +
  theme_minimal()
##   cluster size ave.sil.width
## 1       1 1702          0.15
## 2       2 1548          0.06
## 3       3 1648          0.15

# Scatter plot of the PAM partition with a convex hull per cluster.
pam_cluster_plot <- fviz_cluster(
  pam_result,
  data         = df_scaled,
  geom         = "point",
  ellipse.type = "convex",
  palette      = "jco",
  alpha        = 0.4,
  pointsize    = 0.8
)
pam_title <- paste("Gambar 11. Visualisasi Cluster K-Medians/PAM (K =", K_OPTIMAL, ")")
pam_cluster_plot + labs(title = pam_title) + theme_minimal()

7.3 DBSCAN

# DBSCAN hyper-parameters, declared once so that the diagnostic plot and the
# model fit below cannot drift apart.
minPts_val <- 5
eps_val    <- 1.5

# minPts counts the point itself, so the neighbour distance that matches it
# is the (minPts - 1)-th one.
k_nn     <- minPts_val - 1
knn_dist <- dbscan::kNNdist(df_scaled, k = k_nn)

# If a matrix comes back, use its last column; otherwise use the vector as is.
if (is.matrix(knn_dist)) {
  knn_sorted <- sort(knn_dist[, ncol(knn_dist)])
} else {
  knn_sorted <- sort(knn_dist)
}

# Sorted k-NN distance curve with the chosen eps drawn as a reference line.
# The previous graphics parameters are restored afterwards.
old_par <- par(mar = c(4, 4, 2, 1))
plot(knn_sorted, type = "l",
     main = "Gambar 12. kNN Distance Plot (Tuning eps DBSCAN)",
     xlab = "Titik (diurutkan)",
     ylab = paste0(k_nn, "-NN Distance"),
     col  = "steelblue", lwd = 1.5)
abline(h = eps_val, col = "red", lty = 2)
legend("topleft", legend = paste("eps =", eps_val),
       col = "red", lty = 2, bty = "n")
par(old_par)

db_result <- dbscan::dbscan(df_scaled, eps = eps_val, minPts = minPts_val)

cat("Distribusi cluster DBSCAN:\n")
## Distribusi cluster DBSCAN:
db_labels <- db_result$cluster
print(table(db_labels, dnn = NULL))
## 
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14 
##  957 3854    5    5    6   10    5    7    6    7    7   12    7    5    5
# Label 0 marks noise points.
cat("Noise points :", sum(db_labels == 0), "\n")
## Noise points : 957
cat("Jumlah cluster:", max(db_labels), "\n")
## Jumlah cluster: 14
# Silhouette is computed on non-noise points only, and only when at least two
# points and two distinct clusters remain.
non_noise <- db_labels != 0
kept_labels <- db_labels[non_noise]
if (length(kept_labels) > 1 && length(unique(kept_labels)) > 1) {
  sil_db <- silhouette(kept_labels, dist(df_scaled[non_noise, ]))
  cat("Silhouette Score (non-noise):", round(mean(sil_db[, "sil_width"]), 4), "\n")
}
## Silhouette Score (non-noise): -0.1485
# Scatter plot of the DBSCAN labels without cluster outlines.
db_cluster_plot <- fviz_cluster(
  db_result,
  data      = df_scaled,
  geom      = "point",
  ellipse   = FALSE,
  palette   = "jco",
  alpha     = 0.4,
  pointsize = 0.8
)
db_title <- paste0("Gambar 13. Visualisasi Cluster DBSCAN (eps=",
                   eps_val, ", minPts=", minPts_val, ")")
db_cluster_plot + labs(title = db_title) + theme_minimal()

7.4 Mean Shift

# Mean Shift is run on a random subset of the standardised rows; the seed
# fixes which rows are drawn.
set.seed(42)
n_ms      <- 1000
idx_ms    <- sample.int(nrow(df_scaled), size = n_ms)
df_ms_sub <- df_scaled[idx_ms, , drop = FALSE]

# The same bandwidth (1.5) is used for every feature.
ms_bandwidth <- rep(1.5, times = ncol(df_ms_sub))
ms_result <- meanShiftR::meanShift(
  df_ms_sub,
  bandwidth  = ms_bandwidth,
  algorithm  = "LINEAR",
  nNeighbors = 100
)

ms_labels <- ms_result[["assignment"]]
cat("Distribusi cluster Mean Shift (subset n =", n_ms, "):\n")
## Distribusi cluster Mean Shift (subset n = 1000 ):
print(table(ms_labels))
## ms_labels
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
##  32  85  27 163  31 358  14  23  27   7  20  14  11   1  37  12  39  30   1   4 
##  21  22  23  24  25  26 
##  32   5  20   1   5   1
n_ms_clusters <- length(unique(ms_labels))
cat("Jumlah cluster:", n_ms_clusters, "\n")
## Jumlah cluster: 26
# Silhouette needs at least two clusters; distances are taken on the subset.
if (n_ms_clusters > 1) {
  sil_ms <- silhouette(ms_labels, dist(df_ms_sub))
  cat("Silhouette Score:", round(mean(sil_ms[, "sil_width"]), 4), "\n")
}
## Silhouette Score: -0.0187
# Project the Mean Shift subset onto its first two principal components.
# The subset is a random sample of the standardised data, so its column means
# are not exactly zero; it is therefore centred here rather than reusing the
# centring of the full data set.
pca_ms <- prcomp(df_ms_sub, center = TRUE, scale. = FALSE)

# Points coloured by Mean Shift cluster with a 95% normal ellipse per cluster.
# The default discrete colour scale is used instead of the Brewer "Set1"
# palette: Set1 provides only 9 colours, fewer than the number of Mean Shift
# clusters, which would leave the remaining clusters without a colour.
data.frame(PC1     = pca_ms$x[, 1],
           PC2     = pca_ms$x[, 2],
           cluster = factor(ms_labels)) %>%
  ggplot(aes(x = PC1, y = PC2, color = cluster)) +
  geom_point(alpha = 0.5, size = 0.9) +
  stat_ellipse(aes(group = cluster), type = "norm", level = 0.95) +
  labs(title  = paste0("Gambar 15. Visualisasi Cluster Mean Shift (subset n=", n_ms, ")"),
       color  = "Cluster") +
  theme_minimal()

7.5 Fuzzy C-Means

# Fuzzy C-Means on the standardised features with fuzzifier m = 2.
set.seed(42)
fcm_result <- e1071::cmeans(
  df_scaled,
  centers  = K_OPTIMAL,
  iter.max = 100,
  m        = 2,
  method   = "cmeans"
)

cat("Cluster sizes (hard assignment):\n")
## Cluster sizes (hard assignment):
print(table(fcm_result$cluster))
## 
##    1    2    3 
##  949 2027 1922
cat("Nilai objektif:", round(fcm_result$withinerror, 4), "\n")
## Nilai objektif: 3.6409
# Fuzzy validity indices computed from the membership matrix. The small
# constant inside log() guards against log(0).
fcm_u <- fcm_result$membership
n_obs <- nrow(df_scaled)
PC <- sum(fcm_u^2) / n_obs
PE <- -sum(fcm_u * log(fcm_u + 1e-10)) / n_obs
cat("\nPartition Coefficient (PC):", round(PC, 4), "  [1=hard, 1/K=fully fuzzy]\n")
## 
## Partition Coefficient (PC): 0.365   [1=hard, 1/K=fully fuzzy]
cat("Partition Entropy     (PE):", round(PE, 4), "  [0=hard, log(K)=fully fuzzy]\n")
## Partition Entropy     (PE): 1.05   [0=hard, log(K)=fully fuzzy]
# Silhouette of the hard assignment on Euclidean distances.
sil_fcm <- silhouette(fcm_result$cluster, dist(df_scaled))
cat("Silhouette Score          :", round(mean(sil_fcm[, "sil_width"]), 4), "\n")
# First five rows of the membership matrix, rounded to four decimals.
membership_head <- as.data.frame(round(head(fcm_result$membership, 5), 4))

membership_tbl <- kable(membership_head,
                        caption = "Tabel 5. Contoh Membership Matrix FCM (5 Baris Pertama)",
                        align = "r")
membership_tbl <- kable_styling(membership_tbl,
                                bootstrap_options = c("striped", "hover", "condensed"),
                                full_width = FALSE)
row_spec(membership_tbl, 0, bold = TRUE, background = "#1F4E79", color = "white")
Tabel 5. Contoh Membership Matrix FCM (5 Baris Pertama)
1 2 3
0.2310 0.2302 0.5388
0.4008 0.4004 0.1988
0.3599 0.3587 0.2815
0.1843 0.1831 0.6327
0.1843 0.1831 0.6327
# Silhouette plot for the hard Fuzzy C-Means assignment.
fcm_sil_plot <- fviz_silhouette(sil_fcm)
fcm_sil_plot +
  theme_minimal() %+replace% theme() +
  labs(title = "Gambar 16. Silhouette Plot — Fuzzy C-Means") +
  theme_minimal()
##   cluster size ave.sil.width
## 1       1  949          0.07
## 2       2 2027          0.11
## 3       3 1922          0.10

# PCA of the full standardised data; no further centring or scaling is
# applied in this call.
pca_all <- prcomp(x = df_scaled, center = FALSE, scale. = FALSE)

# First two component scores with the hard Fuzzy C-Means label.
fcm_scores <- data.frame(PC1     = pca_all$x[, 1],
                         PC2     = pca_all$x[, 2],
                         cluster = factor(fcm_result$cluster))

fcm_title <- paste("Gambar 17. Visualisasi Cluster Fuzzy C-Means (K =", K_OPTIMAL, ")")

# Points coloured by cluster with a 95% normal ellipse per cluster.
ggplot(fcm_scores, aes(x = PC1, y = PC2, color = cluster)) +
  geom_point(alpha = 0.35, size = 0.7) +
  stat_ellipse(aes(group = cluster), type = "norm", level = 0.95) +
  scale_color_brewer(palette = "Set1") +
  labs(title = fcm_title, color = "Cluster") +
  theme_minimal()


8 Perbandingan Metode

# Mean silhouette width of a silhouette object, rounded to four decimals.
avg_sil <- function(sil) round(mean(sil[, 3]), 4)

# The three partition-based methods are always available.
sil_scores <- c(
  "K-Means"         = avg_sil(sil_km),
  "K-Medians (PAM)" = avg_sil(sil_pam),
  "Fuzzy C-Means"   = avg_sil(sil_fcm)
)

# DBSCAN is added only when its silhouette object was created.
if (exists("sil_db")) {
  sil_scores["DBSCAN"] <- avg_sil(sil_db)
}

# Mean Shift is added only when it produced more than one cluster.
if (length(unique(ms_labels)) > 1) {
  sil_ms2 <- silhouette(ms_labels, dist(df_ms_sub))
  sil_scores["Mean Shift"] <- avg_sil(sil_ms2)
}

comparison_df <- data.frame(
  Metode     = names(sil_scores),
  Silhouette = sil_scores,
  row.names  = NULL
)

# Table sorted from best to worst; the first data row is highlighted.
comparison_sorted <- arrange(comparison_df, desc(Silhouette))
comparison_tbl <- kable(comparison_sorted,
                        caption = "Tabel 6. Perbandingan Silhouette Score Antar Metode",
                        align = c("l", "r"))
comparison_tbl <- kable_styling(comparison_tbl,
                                bootstrap_options = c("striped", "hover", "condensed"),
                                full_width = FALSE)
comparison_tbl <- row_spec(comparison_tbl, 0, bold = TRUE,
                           background = "#1F4E79", color = "white")
row_spec(comparison_tbl, 1, bold = TRUE, background = "#E8F5E9")
Tabel 6. Perbandingan Silhouette Score Antar Metode
Metode Silhouette
K-Means 0.1447
K-Medians (PAM) 0.1250
Fuzzy C-Means 0.0986
Mean Shift -0.0187
DBSCAN -0.1485
# Axis range for the score axis: always includes 0 and is widened by 30% so
# that the value labels fit. A lower limit fixed at 0 would put methods with
# a negative silhouette score outside the scale, and ggplot2 would drop
# their bars.
sil_limits <- range(c(0, comparison_df$Silhouette)) * 1.3

# Horizontal bar chart of the mean silhouette score per method. Labels are
# placed to the right of positive bars and to the left of negative ones.
comparison_df %>%
  ggplot(aes(x = reorder(Metode, Silhouette), y = Silhouette, fill = Metode)) +
  geom_col(width = 0.6, alpha = 0.85, color = "white") +
  geom_text(aes(label = round(Silhouette, 4),
                hjust = ifelse(Silhouette >= 0, -0.2, 1.2)),
            size = 4) +
  coord_flip() +
  scale_fill_brewer(palette = "Set2") +
  scale_y_continuous(limits = sil_limits) +
  theme_minimal(base_size = 12) +
  theme(legend.position = "none") +
  labs(title = "Gambar 18. Perbandingan Silhouette Score Antar Metode",
       x = "Metode Clustering", y = "Rata-rata Silhouette Score")


9 Validasi: Cluster vs Quality

cat("Tabel kontingensi K-Means vs Quality:\n")
## Tabel kontingensi K-Means vs Quality:
# Cross-tabulate the K-Means label against the quality score.
km_vs_quality <- table(Cluster = km_result$cluster, Quality = quality_label)
print(km_vs_quality)
##        Quality
## Cluster   3   4   5   6   7   8   9
##       1  11  42 785 796 141  27   0
##       2   2  52 290 730 459  92   3
##       3   7  69 382 672 280  56   2
# Distribution of the quality score within each K-Means cluster.
ggplot(df_km, aes(x = factor(cluster), y = quality, fill = factor(cluster))) +
  geom_boxplot(outlier.size = 0.5, alpha = 0.75) +
  scale_fill_brewer(palette = "Set1") +
  labs(title = "Gambar 19. Distribusi Quality per Cluster (K-Means)",
       x = "Cluster", y = "Quality Score") +
  theme_minimal() +
  theme(legend.position = "none")


10 Simpan Hasil

# Raw data plus one label column per method that was fitted on all rows.
df_hasil <- df_raw
df_hasil$cluster_kmeans <- km_result$cluster
df_hasil$cluster_pam    <- pam_result$clustering
df_hasil$cluster_dbscan <- db_result$cluster
df_hasil$cluster_fcm    <- fcm_result$cluster

# Write the labelled data to disk without a row-name column.
output_file <- "wine_clustering_results.csv"
write.csv(df_hasil, file = output_file, row.names = FALSE)
cat("Hasil clustering disimpan ke: wine_clustering_results.csv\n")
## Hasil clustering disimpan ke: wine_clustering_results.csv

11 Kesimpulan

  1. Dataset White Wine Quality (4.898 sampel, 11 fitur kimia) tidak memiliki missing value, namun mengandung outlier terutama pada citric.acid (5.51%), chlorides (4.25%), dan volatile.acidity (3.80%), serta distribusi yang right-skewed pada beberapa fitur.
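  2. Jumlah cluster K = 3 dipilih dengan mempertimbangkan Elbow Method dan Silhouette Method; Gap Statistic dengan aturan firstmax menyarankan K = 2, sehingga K = 3 tidak dikonfirmasi oleh ketiga metode sekaligus.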
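  3. Berdasarkan profil rata-rata K-Means (Tabel 4), ketiga cluster merepresentasikan: wine manis beralkohol rendah (Cluster 1; residual sugar 11.12, alcohol 9.49), wine kering beralkohol tinggi dengan fixed acidity rendah dan pH tinggi (Cluster 2; residual sugar 3.37, alcohol 11.16), dan wine kering beralkohol tinggi dengan fixed acidity tinggi dan pH rendah (Cluster 3; residual sugar 3.93, alcohol 11.06).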
  4. K-Means dan K-Medians (PAM) menghasilkan Silhouette Score tertinggi untuk metode berbasis partisi; DBSCAN efektif mendeteksi noise point; Fuzzy C-Means memberikan representasi keanggotaan parsial yang realistis.
  5. Validasi eksternal menunjukkan adanya hubungan antara cluster kimia dengan skor kualitas — Cluster 2 memiliki rata-rata quality tertinggi (6.155), sedangkan Cluster 1 memiliki rata-rata terendah (5.608).

Kode ini dipublikasikan di RPubs sebagai bagian dari tugas Analisis Multivariat.