1 Pendahuluan

Penelitian ini menerapkan lima metode analisis clustering — K-Means, K-Medians (diimplementasikan dengan PAM, yang secara teknis merupakan algoritma K-Medoids), DBSCAN, Mean Shift, dan Fuzzy C-Means — pada White Wine Quality Dataset dari UCI Machine Learning Repository. Dataset terdiri dari 4.898 sampel wine putih dengan 11 fitur kimiawi numerik.

Tujuan utama adalah mengelompokkan wine berdasarkan karakteristik kimia tanpa mempertimbangkan label kualitas, kemudian mengevaluasi korelasi antara cluster yang terbentuk dengan skor kualitas sebagai validasi eksternal.

2 Instalasi & Load Package

# Packages required by this analysis. Both fpc and dbscan are listed for
# DBSCAN; the DBSCAN calls further down use the dbscan:: prefix explicitly.
packages <- c(
  "tidyverse",   # data manipulation & ggplot2
  "factoextra",  # clustering visualisation
  "cluster",     # silhouette, pam
  "fpc",         # DBSCAN
  "meanShiftR",  # Mean Shift
  "e1071",       # Fuzzy C-Means & skewness/kurtosis
  "ggcorrplot",  # ggplot-based correlation plot
  "knitr",       # tidy tables
  "kableExtra",  # HTML table styling
  "dbscan"       # DBSCAN (more stable)
)

# Install only the packages that are missing from the local library.
installed_pkgs <- rownames(installed.packages())
to_install <- setdiff(packages, installed_pkgs)
if (length(to_install) > 0) {
  install.packages(to_install, repos = "https://cran.rstudio.com/")
}

# Attach every package. invisible() keeps lapply()'s return value (the list of
# attached packages after each library() call) out of the rendered report.
invisible(lapply(packages, library, character.only = TRUE))

## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "factoextra" "lubridate"  "forcats"    "stringr"    "dplyr"     
##  [6] "purrr"      "readr"      "tidyr"      "tibble"     "ggplot2"   
## [11] "tidyverse"  "stats"      "graphics"   "grDevices"  "utils"     
## [16] "datasets"   "methods"    "base"      
## 
## [[3]]
##  [1] "cluster"    "factoextra" "lubridate"  "forcats"    "stringr"   
##  [6] "dplyr"      "purrr"      "readr"      "tidyr"      "tibble"    
## [11] "ggplot2"    "tidyverse"  "stats"      "graphics"   "grDevices" 
## [16] "utils"      "datasets"   "methods"    "base"      
## 
## [[4]]
##  [1] "fpc"        "cluster"    "factoextra" "lubridate"  "forcats"   
##  [6] "stringr"    "dplyr"      "purrr"      "readr"      "tidyr"     
## [11] "tibble"     "ggplot2"    "tidyverse"  "stats"      "graphics"  
## [16] "grDevices"  "utils"      "datasets"   "methods"    "base"      
## 
## [[5]]
##  [1] "meanShiftR" "fpc"        "cluster"    "factoextra" "lubridate" 
##  [6] "forcats"    "stringr"    "dplyr"      "purrr"      "readr"     
## [11] "tidyr"      "tibble"     "ggplot2"    "tidyverse"  "stats"     
## [16] "graphics"   "grDevices"  "utils"      "datasets"   "methods"   
## [21] "base"      
## 
## [[6]]
##  [1] "e1071"      "meanShiftR" "fpc"        "cluster"    "factoextra"
##  [6] "lubridate"  "forcats"    "stringr"    "dplyr"      "purrr"     
## [11] "readr"      "tidyr"      "tibble"     "ggplot2"    "tidyverse" 
## [16] "stats"      "graphics"   "grDevices"  "utils"      "datasets"  
## [21] "methods"    "base"      
## 
## [[7]]
##  [1] "ggcorrplot" "e1071"      "meanShiftR" "fpc"        "cluster"   
##  [6] "factoextra" "lubridate"  "forcats"    "stringr"    "dplyr"     
## [11] "purrr"      "readr"      "tidyr"      "tibble"     "ggplot2"   
## [16] "tidyverse"  "stats"      "graphics"   "grDevices"  "utils"     
## [21] "datasets"   "methods"    "base"      
## 
## [[8]]
##  [1] "knitr"      "ggcorrplot" "e1071"      "meanShiftR" "fpc"       
##  [6] "cluster"    "factoextra" "lubridate"  "forcats"    "stringr"   
## [11] "dplyr"      "purrr"      "readr"      "tidyr"      "tibble"    
## [16] "ggplot2"    "tidyverse"  "stats"      "graphics"   "grDevices" 
## [21] "utils"      "datasets"   "methods"    "base"      
## 
## [[9]]
##  [1] "kableExtra" "knitr"      "ggcorrplot" "e1071"      "meanShiftR"
##  [6] "fpc"        "cluster"    "factoextra" "lubridate"  "forcats"   
## [11] "stringr"    "dplyr"      "purrr"      "readr"      "tidyr"     
## [16] "tibble"     "ggplot2"    "tidyverse"  "stats"      "graphics"  
## [21] "grDevices"  "utils"      "datasets"   "methods"    "base"      
## 
## [[10]]
##  [1] "dbscan"     "kableExtra" "knitr"      "ggcorrplot" "e1071"     
##  [6] "meanShiftR" "fpc"        "cluster"    "factoextra" "lubridate" 
## [11] "forcats"    "stringr"    "dplyr"      "purrr"      "readr"     
## [16] "tidyr"      "tibble"     "ggplot2"    "tidyverse"  "stats"     
## [21] "graphics"   "grDevices"  "utils"      "datasets"   "methods"   
## [26] "base"

3 Load & Persiapan Data

# Load the semicolon-delimited white-wine file and report its dimensions.
df_raw <- read.csv(file = "winequality-white.csv", header = TRUE, sep = ";")

raw_dim <- dim(df_raw)
cat("Dimensi data:", raw_dim[1], "baris x", raw_dim[2], "kolom\n")

## Dimensi data: 4898 baris x 12 kolom

# List every column of the raw data, comma-separated.
cat("Nama kolom :", toString(names(df_raw)), "\n")

## Nama kolom : fixed.acidity, volatile.acidity, citric.acid, residual.sugar, chlorides, free.sulfur.dioxide, total.sulfur.dioxide, density, pH, sulphates, alcohol, quality

# Keep the quality score aside for validation and drop it from the features
# used for clustering.
quality_label <- df_raw[["quality"]]
df <- select(df_raw, -quality)

n_features <- ncol(df)
cat("\nFitur clustering (", n_features, "fitur):\n")

## 
## Fitur clustering ( 11 fitur):

# List the clustering features, comma-separated.
cat(toString(names(df)), "\n")

## fixed.acidity, volatile.acidity, citric.acid, residual.sugar, chlorides, free.sulfur.dioxide, total.sulfur.dioxide, density, pH, sulphates, alcohol

4 Eksplorasi Data

4.1 Statistik Deskriptif

# Summary functions applied to every feature; each result is rounded to three
# decimals. The list names become the column suffixes that are split off below.
stat_funs <- list(
  Mean     = function(x) round(mean(x), 3),
  Median   = function(x) round(median(x), 3),
  SD       = function(x) round(sd(x), 3),
  Min      = function(x) round(min(x), 3),
  Max      = function(x) round(max(x), 3),
  Skewness = function(x) round(e1071::skewness(x), 3),
  Kurtosis = function(x) round(e1071::kurtosis(x), 3)
)

# across() produces one wide row with columns named <feature>_<statistic>.
# Splitting each name at its last underscore gives a long table, which is then
# spread so that every feature is a row and every statistic a column.
desc_wide <- summarise(df, across(everything(), stat_funs))
desc_long <- pivot_longer(desc_wide, everything(),
                          names_to  = c("Variabel", "Statistik"),
                          names_sep = "_(?=[^_]+$)")
desc_stats <- pivot_wider(desc_long,
                          names_from  = Statistik,
                          values_from = value)

# Render Table 1: left-aligned variable names, right-aligned numbers.
desc_table <- kable(desc_stats,
                    caption = "Tabel 1. Statistik Deskriptif Fitur White Wine Quality Dataset",
                    align = c("l", rep("r", 7)))
desc_table <- kable_styling(desc_table,
                            bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                            full_width = TRUE)
row_spec(desc_table, 0, bold = TRUE, background = "#1F4E79", color = "white")

Tabel 1. Statistik Deskriptif Fitur White Wine Quality Dataset
Variabel	Mean	Median	SD	Min	Max	Skewness	Kurtosis
fixed.acidity	6.855	6.800	0.844	3.800	14.200	0.647	2.167
volatile.acidity	0.278	0.260	0.101	0.080	1.100	1.576	5.082
citric.acid	0.334	0.320	0.121	0.000	1.660	1.281	6.164
residual.sugar	6.391	5.200	5.072	0.600	65.800	1.076	3.462
chlorides	0.046	0.043	0.022	0.009	0.346	5.020	37.508
free.sulfur.dioxide	35.308	34.000	17.007	2.000	289.000	1.406	11.448
total.sulfur.dioxide	138.361	134.000	42.498	9.000	440.000	0.390	0.569
density	0.994	0.994	0.003	0.987	1.039	0.977	9.777
pH	3.188	3.180	0.151	2.720	3.820	0.458	0.528
sulphates	0.490	0.470	0.114	0.220	1.080	0.977	1.586
alcohol	10.514	10.400	1.231	8.000	14.200	0.487	-0.700

4.2 Missing Values & Duplikat

# Count missing values per clustering feature and report the grand total.
mv <- colSums(is.na(df))
na_total <- sum(mv)
cat("Total missing values:", na_total, "\n")

## Total missing values: 0

# Duplicates are counted on the raw data, i.e. including the quality column.
n_duplicates <- sum(duplicated(df_raw))
cat("Baris duplikat     :", n_duplicates, "\n")

## Baris duplikat     : 937

4.3 Distribusi Fitur (Histogram)

# Long format (one row per feature/value pair) so that a single ggplot call can
# draw one facet per feature.
df_long <- pivot_longer(df, cols = everything(),
                        names_to = "variable", values_to = "value")

# One histogram per feature, each facet with its own axes.
hist_plot <- ggplot(df_long, aes(x = value, fill = variable)) +
  geom_histogram(bins = 40, color = "white", alpha = 0.85) +
  facet_wrap(~variable, scales = "free", ncol = 3) +
  labs(title = "Gambar 1. Distribusi Setiap Fitur",
       x = NULL, y = "Frekuensi") +
  theme_minimal(base_size = 11) +
  theme(strip.text = element_text(face = "bold"),
        legend.position = "none")
hist_plot

4.4 Boxplot & Deteksi Outlier

# One boxplot per feature; the x-axis labels are hidden because the facet
# strips already name the feature.
box_plot <- ggplot(df_long, aes(x = variable, y = value, fill = variable)) +
  geom_boxplot(alpha = 0.8, outlier.size = 0.5, outlier.alpha = 0.4) +
  facet_wrap(~variable, scales = "free", ncol = 3) +
  labs(title = "Gambar 2. Boxplot Setiap Fitur (Deteksi Outlier)",
       x = NULL, y = NULL) +
  theme_minimal(base_size = 11) +
  theme(strip.text = element_text(face = "bold"),
        axis.text.x = element_blank(),
        legend.position = "none")
box_plot

# Number of values lying outside the fences Q1 - 1.5*IQR and Q3 + 1.5*IQR.
count_iqr_outliers <- function(x) {
  q1 <- quantile(x, 0.25)
  q3 <- quantile(x, 0.75)
  fence <- 1.5 * (q3 - q1)
  sum(x < (q1 - fence) | x > (q3 + fence))
}

# One count per feature, reshaped to a row per feature, plus the share of all
# observations that the count represents.
outlier_counts <- summarise(df, across(everything(), count_iqr_outliers))
outlier_df <- outlier_counts %>%
  pivot_longer(everything(),
               names_to  = "Variabel",
               values_to = "Jumlah Outlier") %>%
  mutate(`Persentase (%)` = round(`Jumlah Outlier` / nrow(df) * 100, 2))

# Render Table 2.
outlier_table <- kable(outlier_df,
                       caption = "Tabel 2. Jumlah Outlier Per Fitur (Metode IQR)",
                       align = c("l", "r", "r"))
outlier_table <- kable_styling(outlier_table,
                               bootstrap_options = c("striped", "hover", "condensed"),
                               full_width = FALSE)
row_spec(outlier_table, 0, bold = TRUE, background = "#1F4E79", color = "white")

Tabel 2. Jumlah Outlier Per Fitur (Metode IQR)
Variabel	Jumlah Outlier	Persentase (%)
fixed.acidity	119	2.43
volatile.acidity	186	3.80
citric.acid	270	5.51
residual.sugar	7	0.14
chlorides	208	4.25
free.sulfur.dioxide	50	1.02
total.sulfur.dioxide	19	0.39
density	5	0.10
pH	75	1.53
sulphates	124	2.53
alcohol	0	0.00

4.5 Matriks Korelasi

# Pairwise Pearson correlations between the clustering features.
cor_matrix <- cor(x = df, method = "pearson")

# Lower-triangle heat map with the coefficients printed in each cell.
ggcorrplot(
  cor_matrix,
  type     = "lower",
  method   = "square",
  colors   = c("#6D9EC1", "white", "#E46726"),
  lab      = TRUE,
  lab_size = 2.8,
  ggtheme  = theme_minimal(),
  title    = "Gambar 3. Matriks Korelasi Antar Fitur"
)

4.6 Distribusi Skor Quality

# Bar chart of how many samples fall on each quality score, with the count
# printed above every bar.
quality_df <- data.frame(quality = quality_label)

ggplot(quality_df, aes(x = factor(quality), fill = factor(quality))) +
  geom_bar(color = "white", alpha = 0.85) +
  geom_text(aes(label = after_stat(count)), stat = "count",
            vjust = -0.5, size = 3.5) +
  scale_fill_brewer(palette = "RdYlGn") +
  labs(title = "Gambar 4. Distribusi Skor Quality Wine",
       x = "Quality Score", y = "Jumlah Sampel") +
  theme_minimal(base_size = 12) +
  theme(legend.position = "none")

5 Pra-Pemrosesan: Standardisasi Z-Score

# Standardise every feature to zero mean and unit standard deviation.
df_scaled <- scale(df)

# Verification table: per-feature mean and SD after scaling. colMeans() and
# apply() return named vectors, which data.frame() would turn into row names
# and kable() would render as a duplicate of the Variabel column, so the row
# names are dropped explicitly.
scaling_check <- data.frame(
  Variabel     = colnames(df_scaled),
  Mean_Setelah = round(colMeans(df_scaled), 5),
  SD_Setelah   = round(apply(df_scaled, 2, sd), 5),
  row.names    = NULL
)

scaling_check %>%
  kable(caption = "Tabel 3. Verifikasi Standardisasi Z-Score",
        align = c("l", "r", "r")) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE) %>%
  row_spec(0, bold = TRUE, background = "#1F4E79", color = "white")

Tabel 3. Verifikasi Standardisasi Z-Score
	Variabel	SD_Setelah
fixed.acidity	fixed.acidity	1
volatile.acidity	volatile.acidity	1
citric.acid	citric.acid	1
residual.sugar	residual.sugar	1
chlorides	chlorides	1
free.sulfur.dioxide	free.sulfur.dioxide	1
total.sulfur.dioxide	total.sulfur.dioxide	1
density	density	1
pH	pH	1
sulphates	sulphates	1
alcohol	alcohol	1

6 Penentuan Jumlah Cluster Optimal

6.1 Elbow Method

# Elbow plot: total within-cluster sum of squares for k = 1..10.
set.seed(42)
fviz_nbclust(x = df_scaled, FUNcluster = kmeans, method = "wss", k.max = 10) +
  theme_minimal() +
  labs(title = "Gambar 5. Elbow Method — Penentuan K Optimal")

6.2 Silhouette Method

# Average silhouette width of K-Means for k up to 10.
set.seed(42)
fviz_nbclust(x = df_scaled, FUNcluster = kmeans,
             method = "silhouette", k.max = 10) +
  theme_minimal() +
  labs(title = "Gambar 6. Silhouette Method — Penentuan K Optimal")

6.3 Gap Statistic

# Gap statistic for k = 1..8 with 50 reference data sets. The clustering
# function is passed under its full argument name (FUNcluster) instead of
# relying on partial matching of `FUN`; nstart = 25 is forwarded to kmeans().
set.seed(42)
gap_stat <- clusGap(df_scaled, FUNcluster = kmeans,
                    K.max = 8, B = 50, nstart = 25)

fviz_gap_stat(gap_stat) +
  labs(title = "Gambar 7. Gap Statistic — Penentuan K Optimal") +
  theme_minimal()

# Report the k chosen by the "firstmax" rule together with the gap table.
print(gap_stat, method = "firstmax")

## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = df_scaled, FUNcluster = kmeans, K.max = 8, B = 50, nstart = 25)
## B=50 simulated reference sets, k = 1..8; spaceH0="scaledPCA"
##  --> Number of clusters (method 'firstmax'): 2
##          logW   E.logW      gap      SE.sim
## [1,] 8.598970 9.887272 1.288302 0.002749074
## [2,] 8.475417 9.803830 1.328413 0.002513543
## [3,] 8.429626 9.754108 1.324482 0.002489185
## [4,] 8.399495 9.717759 1.318264 0.002615777
## [5,] 8.364682 9.696385 1.331703 0.002506576
## [6,] 8.338565 9.676544 1.337979 0.002439820
## [7,] 8.314844 9.659474 1.344630 0.002313483
## [8,] 8.294947 9.643047 1.348099 0.002291627

# Number of clusters used by K-Means, PAM and Fuzzy C-Means below. The value is
# fixed by hand rather than read from the diagnostics above.
K_OPTIMAL <- 3
cat(sprintf("K optimal yang dipilih: %s \n", K_OPTIMAL))

## K optimal yang dipilih: 3

7 Analisis Clustering

7.1 K-Means

# Fit K-Means on the standardised features: 25 random starts, at most 100
# iterations per start.
set.seed(42)
km_result <- kmeans(x = df_scaled, centers = K_OPTIMAL,
                    iter.max = 100, nstart = 25)

cat("Cluster sizes         :", km_result[["size"]], "\n")

## Cluster sizes         : 1802 1628 1468

cat("Total WSS             :", round(km_result[["tot.withinss"]], 2), "\n")

## Total WSS             : 39055.05

# Between-cluster sum of squares as a share of the total sum of squares.
between_share <- km_result[["betweenss"]] / km_result[["totss"]]
cat("Between SS / Total SS :", round(between_share * 100, 2), "%\n")

## Between SS / Total SS : 27.5 %

# Mean silhouette width over all observations, from dist() on the scaled data.
sil_km <- silhouette(km_result[["cluster"]], dist(df_scaled))
cat("Silhouette Score      :", round(mean(sil_km[, "sil_width"]), 4), "\n")

## Silhouette Score      : 0.1447

# Silhouette plot for the K-Means partition.
fviz_silhouette(sil_km) +
  theme_minimal() +
  labs(title = "Gambar 8. Silhouette Plot — K-Means")

##   cluster size ave.sil.width
## 1       1 1802          0.17
## 2       2 1628          0.12
## 3       3 1468          0.13

# Scatter plot of the K-Means clusters with a convex hull around each one.
km_title <- paste("Gambar 9. Visualisasi Cluster K-Means (K =", K_OPTIMAL, ")")

fviz_cluster(km_result, data = df_scaled,
             geom = "point", pointsize = 0.8, alpha = 0.4,
             ellipse.type = "convex", palette = "jco") +
  labs(title = km_title) +
  theme_minimal()

# Original-scale features with the K-Means label and the quality score added.
df_km <- mutate(df, cluster = km_result$cluster, quality = quality_label)

# Mean of every column within each cluster, rounded to three decimals.
km_profile <- df_km %>%
  group_by(cluster) %>%
  summarise(across(everything(), function(x) round(mean(x), 3)),
            .groups = "drop")

# Render Table 4; all columns (cluster, the features, quality) right-aligned.
km_profile_table <- kable(km_profile,
                          caption = "Tabel 4. Profil Rata-rata Per Cluster (K-Means)",
                          align = rep("r", ncol(df) + 2))
km_profile_table <- kable_styling(km_profile_table,
                                  bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                                  full_width = TRUE)
row_spec(km_profile_table, 0, bold = TRUE, background = "#1F4E79", color = "white")

Tabel 4. Profil Rata-rata Per Cluster (K-Means)
cluster	fixed.acidity	volatile.acidity	citric.acid	residual.sugar	chlorides	free.sulfur.dioxide	total.sulfur.dioxide	density	pH	sulphates	alcohol	quality
1	6.960	0.282	0.363	11.124	0.055	46.246	172.124	0.997	3.155	0.497	9.488	5.608
2	6.221	0.277	0.288	3.374	0.040	31.543	122.018	0.992	3.307	0.516	11.161	6.155
3	7.429	0.274	0.350	3.928	0.041	26.057	115.040	0.993	3.096	0.452	11.057	5.903

7.2 K-Medians (PAM)

# Partitioning Around Medoids on the standardised features, using the same
# number of clusters as K-Means.
set.seed(42)
pam_result <- pam(x = df_scaled, k = K_OPTIMAL, metric = "euclidean")

pam_sizes <- pam_result$clusinfo[, "size"]
cat("Cluster sizes    :", pam_sizes, "\n")

## Cluster sizes    : 1702 1548 1648

# Mean silhouette width of the PAM partition.
sil_pam <- silhouette(pam_result[["clustering"]], dist(df_scaled))
cat("Silhouette Score :", round(mean(sil_pam[, "sil_width"]), 4), "\n")

## Silhouette Score : 0.125

# Silhouette plot for the PAM partition.
fviz_silhouette(sil_pam) +
  theme_minimal() +
  labs(title = "Gambar 10. Silhouette Plot — K-Medians (PAM)")

##   cluster size ave.sil.width
## 1       1 1702          0.15
## 2       2 1548          0.06
## 3       3 1648          0.15

# Scatter plot of the PAM clusters with a convex hull around each one.
pam_title <- paste("Gambar 11. Visualisasi Cluster K-Medians/PAM (K =", K_OPTIMAL, ")")

fviz_cluster(pam_result, data = df_scaled,
             geom = "point", pointsize = 0.8, alpha = 0.4,
             ellipse.type = "convex", palette = "jco") +
  labs(title = pam_title) +
  theme_minimal()

7.3 DBSCAN

# DBSCAN tuning: plot the sorted k-nearest-neighbour distances and read eps off
# the knee of the curve. minPts counts the point itself, so the dbscan
# documentation pairs it with k = minPts - 1 neighbours.
minPts_val <- 5
knn_k      <- minPts_val - 1
knn_dist   <- dbscan::kNNdist(df_scaled, k = knn_k)

# Handle both return shapes: a plain vector of k-th neighbour distances, or a
# matrix whose last column holds them.
if (is.matrix(knn_dist)) {
  knn_sorted <- sort(knn_dist[, ncol(knn_dist)])
} else {
  knn_sorted <- sort(knn_dist)
}

# Tighten the margins for this plot only; the previous settings are restored
# afterwards so later base-graphics plots are unaffected.
old_par <- par(mar = c(4, 4, 2, 1))
plot(knn_sorted, type = "l",
     main = "Gambar 12. kNN Distance Plot (Tuning eps DBSCAN)",
     xlab = "Titik (diurutkan)",
     ylab = paste0(knn_k, "-NN Distance"),
     col  = "steelblue", lwd = 1.5)
# Reference line at the eps value chosen for DBSCAN.
abline(h = 1.5, col = "red", lty = 2)
legend("topleft", legend = "eps = 1.5", col = "red", lty = 2, bty = "n")
par(old_par)

# Run DBSCAN with the eps marked on the kNN distance plot.
eps_val   <- 1.5
db_result <- dbscan::dbscan(x = df_scaled, eps = eps_val, minPts = minPts_val)

cat("Distribusi cluster DBSCAN:\n")

## Distribusi cluster DBSCAN:

print(table(db_result$cluster))

## 
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14 
##  957 3854    5    5    6   10    5    7    6    7    7   12    7    5    5

# Label 0 marks noise points; the largest label is reported as the number of
# clusters.
db_labels <- db_result$cluster
cat("Noise points :", sum(db_labels == 0), "\n")

## Noise points : 957

cat("Jumlah cluster:", max(db_labels), "\n")

## Jumlah cluster: 14

# Silhouette is computed on the non-noise points only, and only when they span
# at least two clusters. sil_db stays undefined otherwise.
non_noise <- db_labels != 0
if (sum(non_noise) > 1) {
  if (length(unique(db_labels[non_noise])) > 1) {
    sil_db <- silhouette(db_labels[non_noise],
                         dist(df_scaled[non_noise, ]))
    cat("Silhouette Score (non-noise):", round(mean(sil_db[, "sil_width"]), 4), "\n")
  }
}

## Silhouette Score (non-noise): -0.1485

# Scatter plot of the DBSCAN result, drawn without ellipses.
db_title <- paste0("Gambar 13. Visualisasi Cluster DBSCAN (eps=",
                   eps_val, ", minPts=", minPts_val, ")")

fviz_cluster(db_result, data = df_scaled,
             geom = "point", pointsize = 0.8, alpha = 0.4,
             palette = "jco", ellipse = FALSE) +
  labs(title = db_title) +
  theme_minimal()

7.4 Mean Shift

# Mean Shift is run on a random subsample of n_ms rows of the scaled data.
set.seed(42)
n_ms      <- 1000
idx_ms    <- sample(nrow(df_scaled), size = n_ms)
df_ms_sub <- df_scaled[idx_ms, ]

# One bandwidth value per feature, all equal.
ms_bandwidth <- rep(1.5, ncol(df_ms_sub))
ms_result <- meanShiftR::meanShift(
  df_ms_sub,
  algorithm  = "LINEAR",
  nNeighbors = 100,
  bandwidth  = ms_bandwidth
)

# Cluster label assigned to each subsampled row.
ms_labels <- ms_result$assignment
cat("Distribusi cluster Mean Shift (subset n =", n_ms, "):\n")

## Distribusi cluster Mean Shift (subset n = 1000 ):

# Cluster sizes found by Mean Shift on the subsample.
print(table(ms_labels))

## ms_labels
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
##  32  85  27 163  31 358  14  23  27   7  20  14  11   1  37  12  39  30   1   4 
##  21  22  23  24  25  26 
##  32   5  20   1   5   1

n_ms_found <- length(unique(ms_labels))
cat("Jumlah cluster:", n_ms_found, "\n")

## Jumlah cluster: 26

# The silhouette is only computed when more than one cluster was found.
if (n_ms_found > 1) {
  sil_ms <- silhouette(ms_labels, dist(df_ms_sub))
  cat("Silhouette Score:", round(mean(sil_ms[, "sil_width"]), 4), "\n")
}

## Silhouette Score: -0.0187

# Project the Mean Shift subsample onto its first two principal components.
# Only the full df_scaled is centred; a row subsample has non-zero column
# means, so the PCA re-centres the data instead of assuming zero means.
pca_ms <- prcomp(df_ms_sub, center = TRUE, scale. = FALSE)

ms_plot_df <- data.frame(PC1     = pca_ms$x[, 1],
                         PC2     = pca_ms$x[, 2],
                         cluster = factor(ms_labels))

# Brewer's "Set1" palette has 9 colours. With more clusters the surplus levels
# would get no colour, so fall back to the evenly spaced hue scale.
ms_n_levels <- nlevels(ms_plot_df$cluster)
ms_colour_scale <- if (ms_n_levels <= 9) {
  scale_color_brewer(palette = "Set1")
} else {
  scale_color_hue()
}

# Ellipses are drawn only for clusters with at least four points; smaller
# groups cannot support an ellipse and would only produce warnings.
keep_ellipse_groups <- function(d) {
  group_size <- ave(seq_len(nrow(d)), d$cluster, FUN = length)
  d[group_size >= 4, , drop = FALSE]
}

ggplot(ms_plot_df, aes(x = PC1, y = PC2, color = cluster)) +
  geom_point(alpha = 0.5, size = 0.9) +
  stat_ellipse(data = keep_ellipse_groups,
               aes(group = cluster), type = "norm", level = 0.95) +
  ms_colour_scale +
  labs(title  = paste0("Gambar 15. Visualisasi Cluster Mean Shift (subset n=", n_ms, ")"),
       color  = "Cluster") +
  theme_minimal()

7.5 Fuzzy C-Means

# Fuzzy C-Means on the standardised features with fuzzifier m = 2 and the same
# number of clusters as K-Means.
set.seed(42)
fcm_result <- e1071::cmeans(
  x        = df_scaled,
  centers  = K_OPTIMAL,
  method   = "cmeans",
  m        = 2,
  iter.max = 100
)

cat("Cluster sizes (hard assignment):\n")

## Cluster sizes (hard assignment):

# Sizes of the clusters after assigning each row to its hard label.
print(table(fcm_result$cluster))

## 
##    1    2    3 
##  949 2027 1922

cat("Nilai objektif:", round(fcm_result$withinerror, 4), "\n")

## Nilai objektif: 3.6409

# Fuzziness indices computed from the membership matrix:
#   PC = mean over rows of the sum of squared memberships
#   PE = mean over rows of -sum(u * log(u)); 1e-10 guards against log(0)
fcm_memb <- fcm_result$membership
fcm_n    <- nrow(df_scaled)
PC <- sum(fcm_memb^2) / fcm_n
PE <- -sum(fcm_memb * log(fcm_memb + 1e-10)) / fcm_n
cat("\nPartition Coefficient (PC):", round(PC, 4), "  [1=hard, 1/K=fully fuzzy]\n")

## 
## Partition Coefficient (PC): 0.365   [1=hard, 1/K=fully fuzzy]

cat("Partition Entropy     (PE):", round(PE, 4), "  [0=hard, log(K)=fully fuzzy]\n")

## Partition Entropy     (PE): 1.05   [0=hard, log(K)=fully fuzzy]

# Mean silhouette width of the hard assignment.
sil_fcm <- silhouette(fcm_result$cluster, dist(df_scaled))
cat("Silhouette Score          :", round(mean(sil_fcm[, "sil_width"]), 4), "\n")

## Silhouette Score          : 0.0986

# First five rows of the membership matrix, rounded to four decimals.
memb_head <- as.data.frame(round(head(fcm_result$membership, 5), 4))

# Render Table 5.
memb_table <- kable(memb_head,
                    caption = "Tabel 5. Contoh Membership Matrix FCM (5 Baris Pertama)",
                    align = "r")
memb_table <- kable_styling(memb_table,
                            bootstrap_options = c("striped", "hover", "condensed"),
                            full_width = FALSE)
row_spec(memb_table, 0, bold = TRUE, background = "#1F4E79", color = "white")

Tabel 5. Contoh Membership Matrix FCM (5 Baris Pertama)
1	2	3
0.2310	0.2302	0.5388
0.4008	0.4004	0.1988
0.3599	0.3587	0.2815
0.1843	0.1831	0.6327
0.1843	0.1831	0.6327

# Silhouette plot for the hard Fuzzy C-Means assignment.
fviz_silhouette(sil_fcm) +
  theme_minimal() +
  labs(title = "Gambar 16. Silhouette Plot — Fuzzy C-Means")

##   cluster size ave.sil.width
## 1       1  949          0.07
## 2       2 2027          0.11
## 3       3 1922          0.10

# PCA of the full standardised data; the scores on the first two components
# are used as plotting coordinates.
pca_all <- prcomp(df_scaled, center = FALSE, scale. = FALSE)

fcm_plot_df <- data.frame(PC1     = pca_all$x[, 1],
                          PC2     = pca_all$x[, 2],
                          cluster = factor(fcm_result$cluster))
fcm_title <- paste("Gambar 17. Visualisasi Cluster Fuzzy C-Means (K =", K_OPTIMAL, ")")

# Points coloured by hard FCM label, with a 95% normal ellipse per cluster.
ggplot(fcm_plot_df, aes(x = PC1, y = PC2, color = cluster)) +
  geom_point(size = 0.7, alpha = 0.35) +
  stat_ellipse(aes(group = cluster), level = 0.95, type = "norm") +
  scale_color_brewer(palette = "Set1") +
  labs(title = fcm_title, color = "Cluster") +
  theme_minimal()

8 Perbandingan Metode

# Mean silhouette width (third column of a silhouette object), four decimals.
mean_sil <- function(sil) round(mean(sil[, 3]), 4)

# The three partitioning methods always have a score.
sil_scores <- c(
  "K-Means"         = mean_sil(sil_km),
  "K-Medians (PAM)" = mean_sil(sil_pam),
  "Fuzzy C-Means"   = mean_sil(sil_fcm)
)

# DBSCAN's silhouette exists only if the non-noise points formed more than one
# cluster.
if (exists("sil_db")) {
  sil_scores["DBSCAN"] <- mean_sil(sil_db)
}

# Mean Shift's silhouette is recomputed on the subsample when more than one
# cluster was found.
if (length(unique(ms_labels)) > 1) {
  sil_ms2 <- silhouette(ms_labels, dist(df_ms_sub))
  sil_scores["Mean Shift"] <- mean_sil(sil_ms2)
}

comparison_df <- data.frame(
  Metode     = names(sil_scores),
  Silhouette = sil_scores,
  row.names  = NULL
)

# Render Table 6 sorted from best to worst, with the top row highlighted.
comparison_sorted <- arrange(comparison_df, desc(Silhouette))
comparison_table <- kable(comparison_sorted,
                          caption = "Tabel 6. Perbandingan Silhouette Score Antar Metode",
                          align = c("l", "r"))
comparison_table <- kable_styling(comparison_table,
                                  bootstrap_options = c("striped", "hover", "condensed"),
                                  full_width = FALSE)
comparison_table <- row_spec(comparison_table, 0, bold = TRUE,
                             background = "#1F4E79", color = "white")
row_spec(comparison_table, 1, bold = TRUE, background = "#E8F5E9")

Tabel 6. Perbandingan Silhouette Score Antar Metode
Metode	Silhouette
K-Means	0.1447
K-Medians (PAM)	0.1250
Fuzzy C-Means	0.0986
Mean Shift	-0.0187
DBSCAN	-0.1485

# Horizontal bar chart of the mean silhouette score per method.
# Silhouette scores can be negative, so the value axis is extended below zero
# when needed; a lower limit fixed at 0 would drop the negative bars.
sil_range  <- range(comparison_df$Silhouette)
sil_limits <- c(min(0, sil_range[1] * 1.3), max(0, sil_range[2] * 1.3))

comparison_df %>%
  ggplot(aes(x = reorder(Metode, Silhouette), y = Silhouette, fill = Metode)) +
  geom_col(width = 0.6, alpha = 0.85, color = "white") +
  # Put each label just beyond the end of its bar: to the right of positive
  # bars and to the left of negative ones.
  geom_text(aes(label = round(Silhouette, 4),
                hjust = ifelse(Silhouette >= 0, -0.2, 1.2)),
            size = 4) +
  coord_flip() +
  scale_fill_brewer(palette = "Set2") +
  scale_y_continuous(limits = sil_limits) +
  theme_minimal(base_size = 12) +
  theme(legend.position = "none") +
  labs(title = "Gambar 18. Perbandingan Silhouette Score Antar Metode",
       x = "Metode Clustering", y = "Rata-rata Silhouette Score")

9 Validasi: Cluster vs Quality

cat("Tabel kontingensi K-Means vs Quality:\n")

## Tabel kontingensi K-Means vs Quality:

# Cross-tabulate the K-Means labels against the held-out quality scores.
km_vs_quality <- table(Cluster = km_result$cluster, Quality = quality_label)
print(km_vs_quality)

##        Quality
## Cluster   3   4   5   6   7   8   9
##       1  11  42 785 796 141  27   0
##       2   2  52 290 730 459  92   3
##       3   7  69 382 672 280  56   2

# Distribution of the quality score within each K-Means cluster.
ggplot(df_km, aes(x = factor(cluster), y = quality, fill = factor(cluster))) +
  geom_boxplot(outlier.size = 0.5, alpha = 0.75) +
  scale_fill_brewer(palette = "Set1") +
  labs(title = "Gambar 19. Distribusi Quality per Cluster (K-Means)",
       x = "Cluster", y = "Quality Score") +
  theme_minimal() +
  theme(legend.position = "none")

10 Simpan Hasil

# Attach the labels of the four methods that were run on the full data set to
# the raw table. Mean Shift is not included; it was fitted on a subsample.
df_hasil <- mutate(
  df_raw,
  cluster_kmeans = km_result$cluster,
  cluster_pam    = pam_result$clustering,
  cluster_dbscan = db_result$cluster,
  cluster_fcm    = fcm_result$cluster
)

# Write the labelled table to disk without a row-name column.
write.csv(df_hasil, file = "wine_clustering_results.csv", row.names = FALSE)
cat("Hasil clustering disimpan ke: wine_clustering_results.csv\n")

## Hasil clustering disimpan ke: wine_clustering_results.csv

11 Kesimpulan

Dataset White Wine Quality (4.898 sampel, 11 fitur kimia) tidak memiliki missing value, namun mengandung outlier terutama pada citric.acid (5.51%) dan chlorides (4.25%) serta distribusi yang right-skewed pada beberapa fitur.
Jumlah cluster K = 3 dipilih berdasarkan Elbow Method dan Silhouette Method; Gap Statistic dengan kriteria firstmax menyarankan K = 2, sehingga tidak sepenuhnya mengonfirmasi pilihan tersebut.
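Berdasarkan Tabel 4, ketiga cluster merepresentasikan: wine manis beralkohol rendah dengan kadar sulfur dioksida tinggi (Cluster 1), wine kering beralkohol tinggi dengan keasaman rendah (Cluster 2), dan wine kering beralkohol tinggi dengan keasaman lebih tinggi (Cluster 3).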
K-Means dan K-Medians (PAM) menghasilkan Silhouette Score tertinggi untuk metode berbasis partisi; DBSCAN efektif mendeteksi noise point; Fuzzy C-Means memberikan representasi keanggotaan parsial yang realistis.
Validasi eksternal menunjukkan adanya hubungan antara cluster kimia dengan skor kualitas — Cluster 2 memiliki rata-rata quality tertinggi (6.155), sedangkan Cluster 1 memiliki rata-rata terendah (5.608).

Kode ini dipublikasikan di RPubs sebagai bagian dari tugas Analisis Multivariat.

Implementasi Analisis Clustering pada White Wine Quality Dataset

Yazid Husain Abdurrahman (24031554076) - Kelas 2024D

29 June 2026