Import Data

df <- read.csv("wine-clustering.csv", stringsAsFactors = FALSE)

# cek struktur data
str(df)
## 'data.frame':    178 obs. of  13 variables:
##  $ Alcohol             : num  14.2 13.2 13.2 14.4 13.2 ...
##  $ Malic_Acid          : num  1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
##  $ Ash                 : num  2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
##  $ Ash_Alcanity        : num  15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
##  $ Magnesium           : int  127 100 101 113 118 112 96 121 97 98 ...
##  $ Total_Phenols       : num  2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
##  $ Flavanoids          : num  3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
##  $ Nonflavanoid_Phenols: num  0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
##  $ Proanthocyanins     : num  2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
##  $ Color_Intensity     : num  5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
##  $ Hue                 : num  1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
##  $ OD280               : num  3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
##  $ Proline             : int  1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...
head(df)
##   Alcohol Malic_Acid  Ash Ash_Alcanity Magnesium Total_Phenols Flavanoids
## 1   14.23       1.71 2.43         15.6       127          2.80       3.06
## 2   13.20       1.78 2.14         11.2       100          2.65       2.76
## 3   13.16       2.36 2.67         18.6       101          2.80       3.24
## 4   14.37       1.95 2.50         16.8       113          3.85       3.49
## 5   13.24       2.59 2.87         21.0       118          2.80       2.69
## 6   14.20       1.76 2.45         15.2       112          3.27       3.39
##   Nonflavanoid_Phenols Proanthocyanins Color_Intensity  Hue OD280 Proline
## 1                 0.28            2.29            5.64 1.04  3.92    1065
## 2                 0.26            1.28            4.38 1.05  3.40    1050
## 3                 0.30            2.81            5.68 1.03  3.17    1185
## 4                 0.24            2.18            7.80 0.86  3.45    1480
## 5                 0.39            1.82            4.32 1.04  2.93     735
## 6                 0.34            1.97            6.75 1.05  2.85    1450

EDA

# cek missing value
colSums(is.na(df))
##              Alcohol           Malic_Acid                  Ash 
##                    0                    0                    0 
##         Ash_Alcanity            Magnesium        Total_Phenols 
##                    0                    0                    0 
##           Flavanoids Nonflavanoid_Phenols      Proanthocyanins 
##                    0                    0                    0 
##      Color_Intensity                  Hue                OD280 
##                    0                    0                    0 
##              Proline 
##                    0
# statistik deskriptif
data_deskriptif <- psych::describe(df)
print(data_deskriptif)
##                      vars   n   mean     sd median trimmed    mad    min
## Alcohol                 1 178  13.00   0.81  13.05   13.01   1.01  11.03
## Malic_Acid              2 178   2.34   1.12   1.87    2.21   0.77   0.74
## Ash                     3 178   2.37   0.27   2.36    2.37   0.24   1.36
## Ash_Alcanity            4 178  19.49   3.34  19.50   19.42   3.04  10.60
## Magnesium               5 178  99.74  14.28  98.00   98.44  14.83  70.00
## Total_Phenols           6 178   2.30   0.63   2.36    2.29   0.75   0.98
## Flavanoids              7 178   2.03   1.00   2.13    2.02   1.24   0.34
## Nonflavanoid_Phenols    8 178   0.36   0.12   0.34    0.36   0.13   0.13
## Proanthocyanins         9 178   1.59   0.57   1.56    1.56   0.56   0.41
## Color_Intensity        10 178   5.06   2.32   4.69    4.83   2.24   1.28
## Hue                    11 178   0.96   0.23   0.96    0.96   0.24   0.48
## OD280                  12 178   2.61   0.71   2.78    2.63   0.77   1.27
## Proline                13 178 746.89 314.91 673.50  719.30 300.23 278.00
##                          max   range  skew kurtosis    se
## Alcohol                14.83    3.80 -0.05    -0.89  0.06
## Malic_Acid              5.80    5.06  1.02     0.22  0.08
## Ash                     3.23    1.87 -0.17     1.03  0.02
## Ash_Alcanity           30.00   19.40  0.21     0.40  0.25
## Magnesium             162.00   92.00  1.08     1.96  1.07
## Total_Phenols           3.88    2.90  0.09    -0.87  0.05
## Flavanoids              5.08    4.74  0.02    -0.91  0.07
## Nonflavanoid_Phenols    0.66    0.53  0.44    -0.68  0.01
## Proanthocyanins         3.58    3.17  0.51     0.47  0.04
## Color_Intensity        13.00   11.72  0.85     0.30  0.17
## Hue                     1.71    1.23  0.02    -0.40  0.02
## OD280                   4.00    2.73 -0.30    -1.11  0.05
## Proline              1680.00 1402.00  0.75    -0.31 23.60
# histogram
wine_long <- reshape2::melt(df)
## No id variables; using all as measure variables
ggplot(wine_long, aes(x = value)) +
  geom_histogram(bins = 20, fill = "skyblue", color = "black") +
  facet_wrap(~ variable, scales = "free") +
  theme_minimal()

# heatmap korelasi
cor_mat <- cor(df, use = "complete.obs")
round(cor_mat, 2)
##                      Alcohol Malic_Acid   Ash Ash_Alcanity Magnesium
## Alcohol                 1.00       0.09  0.21        -0.31      0.27
## Malic_Acid              0.09       1.00  0.16         0.29     -0.05
## Ash                     0.21       0.16  1.00         0.44      0.29
## Ash_Alcanity           -0.31       0.29  0.44         1.00     -0.08
## Magnesium               0.27      -0.05  0.29        -0.08      1.00
## Total_Phenols           0.29      -0.34  0.13        -0.32      0.21
## Flavanoids              0.24      -0.41  0.12        -0.35      0.20
## Nonflavanoid_Phenols   -0.16       0.29  0.19         0.36     -0.26
## Proanthocyanins         0.14      -0.22  0.01        -0.20      0.24
## Color_Intensity         0.55       0.25  0.26         0.02      0.20
## Hue                    -0.07      -0.56 -0.07        -0.27      0.06
## OD280                   0.07      -0.37  0.00        -0.28      0.07
## Proline                 0.64      -0.19  0.22        -0.44      0.39
##                      Total_Phenols Flavanoids Nonflavanoid_Phenols
## Alcohol                       0.29       0.24                -0.16
## Malic_Acid                   -0.34      -0.41                 0.29
## Ash                           0.13       0.12                 0.19
## Ash_Alcanity                 -0.32      -0.35                 0.36
## Magnesium                     0.21       0.20                -0.26
## Total_Phenols                 1.00       0.86                -0.45
## Flavanoids                    0.86       1.00                -0.54
## Nonflavanoid_Phenols         -0.45      -0.54                 1.00
## Proanthocyanins               0.61       0.65                -0.37
## Color_Intensity              -0.06      -0.17                 0.14
## Hue                           0.43       0.54                -0.26
## OD280                         0.70       0.79                -0.50
## Proline                       0.50       0.49                -0.31
##                      Proanthocyanins Color_Intensity   Hue OD280 Proline
## Alcohol                         0.14            0.55 -0.07  0.07    0.64
## Malic_Acid                     -0.22            0.25 -0.56 -0.37   -0.19
## Ash                             0.01            0.26 -0.07  0.00    0.22
## Ash_Alcanity                   -0.20            0.02 -0.27 -0.28   -0.44
## Magnesium                       0.24            0.20  0.06  0.07    0.39
## Total_Phenols                   0.61           -0.06  0.43  0.70    0.50
## Flavanoids                      0.65           -0.17  0.54  0.79    0.49
## Nonflavanoid_Phenols           -0.37            0.14 -0.26 -0.50   -0.31
## Proanthocyanins                 1.00           -0.03  0.30  0.52    0.33
## Color_Intensity                -0.03            1.00 -0.52 -0.43    0.32
## Hue                             0.30           -0.52  1.00  0.57    0.24
## OD280                           0.52           -0.43  0.57  1.00    0.31
## Proline                         0.33            0.32  0.24  0.31    1.00
ggcorrplot(
  cor_mat,
  hc.order = TRUE,
  type = "lower",
  lab = TRUE,
  lab_size = 2
) +
  theme(
    axis.text.x = element_text(size = 8),
    axis.text.y = element_text(size = 8)
  )
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the ggcorrplot package.
##   Please report the issue at <https://github.com/kassambara/ggcorrplot/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Preprocessing

# standardisasi
wine_scaled <- scale(df)
wine_scaled <- as.data.frame(wine_scaled)
wine_mat <- as.matrix(wine_scaled)

summary(wine_scaled)
##     Alcohol           Malic_Acid           Ash            Ash_Alcanity      
##  Min.   :-2.42739   Min.   :-1.4290   Min.   :-3.66881   Min.   :-2.663505  
##  1st Qu.:-0.78603   1st Qu.:-0.6569   1st Qu.:-0.57051   1st Qu.:-0.687199  
##  Median : 0.06083   Median :-0.4219   Median :-0.02375   Median : 0.001514  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.000000  
##  3rd Qu.: 0.83378   3rd Qu.: 0.6679   3rd Qu.: 0.69614   3rd Qu.: 0.600395  
##  Max.   : 2.25341   Max.   : 3.1004   Max.   : 3.14745   Max.   : 3.145637  
##    Magnesium       Total_Phenols        Flavanoids      Nonflavanoid_Phenols
##  Min.   :-2.0824   Min.   :-2.10132   Min.   :-1.6912   Min.   :-1.8630     
##  1st Qu.:-0.8221   1st Qu.:-0.88298   1st Qu.:-0.8252   1st Qu.:-0.7381     
##  Median :-0.1219   Median : 0.09569   Median : 0.1059   Median :-0.1756     
##  Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000     
##  3rd Qu.: 0.5082   3rd Qu.: 0.80672   3rd Qu.: 0.8467   3rd Qu.: 0.6078     
##  Max.   : 4.3591   Max.   : 2.53237   Max.   : 3.0542   Max.   : 2.3956     
##  Proanthocyanins    Color_Intensity        Hue               OD280        
##  Min.   :-2.06321   Min.   :-1.6297   Min.   :-2.08884   Min.   :-1.8897  
##  1st Qu.:-0.59560   1st Qu.:-0.7929   1st Qu.:-0.76540   1st Qu.:-0.9496  
##  Median :-0.06272   Median :-0.1588   Median : 0.03303   Median : 0.2371  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.62741   3rd Qu.: 0.4926   3rd Qu.: 0.71116   3rd Qu.: 0.7864  
##  Max.   : 3.47527   Max.   : 3.4258   Max.   : 3.29241   Max.   : 1.9554  
##     Proline       
##  Min.   :-1.4890  
##  1st Qu.:-0.7824  
##  Median :-0.2331  
##  Mean   : 0.0000  
##  3rd Qu.: 0.7561  
##  Max.   : 2.9631
# Visualisasi semua metode
pca_res <- prcomp(wine_scaled, center = TRUE, scale. = FALSE)
pca_df <- data.frame(PC1 = pca_res$x[, 1], PC2 = pca_res$x[, 2])

# Menentukan jumlah cluster awal
set.seed(123)

fviz_nbclust(wine_scaled, kmeans, method = "wss")

K-Means

set.seed(123)
km_res <- kmeans(wine_scaled, centers = 3, nstart = 25)

table(km_res$cluster)
## 
##  1  2  3 
## 51 62 65
km_res$centers
##      Alcohol Malic_Acid        Ash Ash_Alcanity   Magnesium Total_Phenols
## 1  0.1644436  0.8690954  0.1863726    0.5228924 -0.07526047   -0.97657548
## 2  0.8328826 -0.3029551  0.3636801   -0.6084749  0.57596208    0.88274724
## 3 -0.9234669 -0.3929331 -0.4931257    0.1701220 -0.49032869   -0.07576891
##    Flavanoids Nonflavanoid_Phenols Proanthocyanins Color_Intensity        Hue
## 1 -1.21182921           0.72402116     -0.77751312       0.9388902 -1.1615122
## 2  0.97506900          -0.56050853      0.57865427       0.1705823  0.4726504
## 3  0.02075402          -0.03343924      0.05810161      -0.8993770  0.4605046
##        OD280    Proline
## 1 -1.2887761 -0.4059428
## 2  0.7770551  1.1220202
## 3  0.2700025 -0.7517257
# visualisasi 
pca_km <- pca_df %>%
  mutate(cluster = factor(km_res$cluster))

ggplot(pca_km, aes(x = PC1, y = PC2, color = cluster)) +
  geom_point(size = 2) +
  labs(title = "Visualisasi K-Means") +
  theme_minimal()

# evaluasi silhouette
sil_km <- silhouette(km_res$cluster, dist(wine_scaled))
mean_km <- mean(sil_km[, 3])
mean_km
## [1] 0.2848589
plot(sil_km)

# K-Median

set.seed(123)
kmed_res <- kcca(wine_mat, k = 3, family = kccaFamily("kmedians"))
kmed_cluster <- clusters(kmed_res)

table(kmed_cluster)
## kmed_cluster
##  1  2  3 
## 65 63 50
# visualisasi PCA
pca_kmed <- pca_df %>%
  mutate(cluster = factor(kmed_cluster))

ggplot(pca_kmed, aes(x = PC1, y = PC2, color = cluster)) +
  geom_point(size = 2) +
  labs(title = "Visualisasi K-Median") +
  theme_minimal()

# evaluasi silhouette
sil_kmed <- silhouette(kmed_cluster, dist(wine_scaled))
mean_kmed <- mean(sil_kmed[, 3])
mean_kmed
## [1] 0.281791
plot(sil_kmed)

# DBSCAN

# coba beberapa eps
for (e in c(2.6, 2.8, 3.0, 3.2)) {
  cat("\nEPS =", e, "\n")
  res <- dbscan::dbscan(wine_mat, eps = e, minPts = 5)
  print(table(res$cluster))
}
## 
## EPS = 2.6 
## 
##   0   1 
##  19 159 
## 
## EPS = 2.8 
## 
##   0   1 
##  15 163 
## 
## EPS = 3 
## 
##   0   1 
##  11 167 
## 
## EPS = 3.2 
## 
##   0   1 
##   8 170
kNNdistplot(wine_mat, k = 4)
abline(h = 2.8, lty = 2, col = "red")

db_res <- dbscan::dbscan(wine_mat, eps = 2.8, minPts = 5)
table(db_res$cluster)
## 
##   0   1 
##  15 163
# visualisasi PCA
pca_db <- pca_df %>%
  mutate(cluster = factor(db_res$cluster))

ggplot(pca_db, aes(x = PC1, y = PC2, color = cluster)) +
  geom_point(size = 2) +
  labs(title = "Visualisasi DBSCAN") +
  theme_minimal()

# evaluasi silhouette hanya untuk non-noise
non_noise <- db_res$cluster != 0
mean_db <- NA

if (sum(non_noise) > 1 && length(unique(db_res$cluster[non_noise])) > 1) {
  sil_db <- silhouette(
    db_res$cluster[non_noise],
    dist(wine_scaled[non_noise, , drop = FALSE])
  )
  mean_db <- mean(sil_db[, 3])
  mean_db
  plot(sil_db)
} else {
  cat("Silhouette DBSCAN tidak dapat dihitung karena cluster non-noise kurang dari 2.\n")
}
## Silhouette DBSCAN tidak dapat dihitung karena cluster non-noise kurang dari 2.

Mean Shift

for (b in c(0.2, 0.5, 0.8, 1.0, 1.2, 1.5)) {
  cat("\nBandwidth =", b, "\n")
  res <- meanShift(
    wine_mat,
    bandwidth = rep(b, ncol(wine_mat)),
    algorithm = "KDTREE"
  )
  print(length(unique(as.vector(res$assignment))))
}
## 
## Bandwidth = 0.2 
## [1] 178
## 
## Bandwidth = 0.5 
## [1] 178
## 
## Bandwidth = 0.8 
## [1] 178
## 
## Bandwidth = 1 
## [1] 178
## 
## Bandwidth = 1.2 
## [1] 178
## 
## Bandwidth = 1.5 
## [1] 178
bw <- rep(1.2, ncol(wine_mat))

ms_res <- meanShift(
  wine_mat,
  bandwidth = bw,
  algorithm = "KDTREE"
)

ms_cluster <- as.vector(ms_res$assignment)
table(ms_cluster)
## ms_cluster
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
# visualisasi PCA
pca_ms <- pca_df %>%
  mutate(cluster = factor(ms_cluster))

ggplot(pca_ms, aes(x = PC1, y = PC2, color = cluster)) +
  geom_point(size = 2) +
  labs(title = "Visualisasi Mean Shift berdasarkan PCA") +
  theme_minimal() +
  theme(
    legend.position = "none"
  )

# evaluasi silhouette
mean_ms <- NA

if (length(unique(ms_cluster)) > 1) {
  sil_ms <- tryCatch(
    cluster::silhouette(as.integer(ms_cluster), dist(wine_scaled)),
    error = function(e) NULL
  )

  if (!is.null(sil_ms)) {
    sil_ms_num <- unclass(sil_ms)

    if (is.matrix(sil_ms_num) && ncol(sil_ms_num) >= 3) {
      mean_ms <- mean(sil_ms_num[, 3], na.rm = TRUE)
      print(mean_ms)
      plot(sil_ms)
    } else {
      cat("Silhouette Mean Shift tidak memiliki format yang sesuai.\n")
    }
  } else {
    cat("Silhouette Mean Shift tidak berhasil dihitung.\n")
  }
} else {
  cat("Silhouette Mean Shift tidak dapat dihitung karena hanya terbentuk 1 cluster.\n")
}
## Silhouette Mean Shift tidak memiliki format yang sesuai.

Fuzzy C-Means

set.seed(123)

fcm_res <- ppclust::fcm(
  x = wine_mat,
  centers = 3,
  m = 2
)

# ubah membership jadi hard cluster
fcm_cluster <- max.col(fcm_res$u, ties.method = "first")

table(fcm_cluster)
## fcm_cluster
##  1  2  3 
## 62 51 65
# visualisasi PCA
pca_fcm <- pca_df %>%
  mutate(cluster = factor(fcm_cluster))

ggplot(pca_fcm, aes(x = PC1, y = PC2, color = cluster)) +
  geom_point(size = 2) +
  labs(title = "Visualisasi Fuzzy C-Means") +
  theme_minimal()

# evaluasi silhouette
sil_fcm <- silhouette(fcm_cluster, dist(wine_scaled))
mean_fcm <- mean(sil_fcm[, 3])
mean_fcm
## [1] 0.2848589
plot(sil_fcm)

# Perbandingan Metode

hasil_eval <- data.frame(
  Metode = c("K-Means", "K-Median", "DBSCAN", "Mean Shift", "Fuzzy C-Means"),
  Silhouette = c(mean_km, mean_kmed, mean_db, mean_ms, mean_fcm)
)

hasil_eval %>%
  arrange(desc(Silhouette))
##          Metode Silhouette
## 1       K-Means  0.2848589
## 2 Fuzzy C-Means  0.2848589
## 3      K-Median  0.2817910
## 4        DBSCAN         NA
## 5    Mean Shift         NA