Deskripsi Tugas

Tugas ini bertujuan untuk menerapkan metode clustering pada dataset Wine dengan 13 fitur numerik. Proses dimulai dari preprocessing data, normalisasi, dan penentuan jumlah cluster optimal menggunakan metode Elbow dan Silhouette. Selanjutnya dilakukan clustering menggunakan K-Means, K-Medians, DBSCAN, Mean Shift, dan Fuzzy C-Means. Hasil clustering divisualisasikan dan dievaluasi menggunakan Silhouette Score, Dunn Index, dan Adjusted Rand Index (ARI) untuk menilai kualitas hasil clustering.

# Load the Wine dataset; the first row of the CSV holds the column names.
data <- read.csv("wine_dataset.csv", header = TRUE)

# Keep only the numeric columns.
# vapply() always yields a logical mask (sapply() can return a list for
# unusual inputs), and drop = FALSE keeps the result a data frame even when
# only one column is numeric.
data_numeric <- data[, vapply(data, is.numeric, logical(1)), drop = FALSE]

# Show the structure of the retained columns.
str(data_numeric)
## 'data.frame':    178 obs. of  14 variables:
##  $ Alcohol             : num  14.2 13.2 13.2 14.4 13.2 ...
##  $ Malic_acid          : num  1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
##  $ Ash                 : num  2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
##  $ Alcalinity          : num  15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
##  $ Magnesium           : int  127 100 101 113 118 112 96 121 97 98 ...
##  $ Total_phenols       : num  2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
##  $ Flavanoids          : num  3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
##  $ Nonflavanoid_phenols: num  0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
##  $ Proanthocyanins     : num  2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
##  $ Color_intensity     : num  5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
##  $ Hue                 : num  1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
##  $ OD280_OD315         : num  3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
##  $ Proline             : int  1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...
##  $ Class               : int  1 1 1 1 1 1 1 1 1 1 ...
# Preprocessing for clustering.
# data_numeric already holds the numeric columns selected right after the
# CSV was loaded, so the selection is not repeated here.

# Drop incomplete rows before splitting, so labels and features stay aligned.
data_numeric <- na.omit(data_numeric)

# Class is the reference label: keep it aside for external validation and
# exclude it from the clustering features. [[ ]] avoids the partial name
# matching that $ performs on data frames.
label <- data_numeric[["Class"]]
data_clustering <- data_numeric[
  , colnames(data_numeric) != "Class",
  drop = FALSE
]

# Standardize every feature (zero mean, unit variance) so that large-scale
# columns do not dominate the distance computations.
df <- scale(data_clustering)

# Summary statistics of the numeric columns, including Class.
describe(data_numeric)
##                      vars   n   mean     sd median trimmed    mad    min
## Alcohol                 1 178  13.00   0.81  13.05   13.01   1.01  11.03
## Malic_acid              2 178   2.34   1.12   1.87    2.21   0.77   0.74
## Ash                     3 178   2.37   0.27   2.36    2.37   0.24   1.36
## Alcalinity              4 178  19.49   3.34  19.50   19.42   3.04  10.60
## Magnesium               5 178  99.74  14.28  98.00   98.44  14.83  70.00
## Total_phenols           6 178   2.30   0.63   2.36    2.29   0.75   0.98
## Flavanoids              7 178   2.03   1.00   2.13    2.02   1.24   0.34
## Nonflavanoid_phenols    8 178   0.36   0.12   0.34    0.36   0.13   0.13
## Proanthocyanins         9 178   1.59   0.57   1.56    1.56   0.56   0.41
## Color_intensity        10 178   5.06   2.32   4.69    4.83   2.24   1.28
## Hue                    11 178   0.96   0.23   0.96    0.96   0.24   0.48
## OD280_OD315            12 178   2.61   0.71   2.78    2.63   0.77   1.27
## Proline                13 178 746.89 314.91 673.50  719.30 300.23 278.00
## Class                  14 178   1.94   0.78   2.00    1.92   1.48   1.00
##                          max   range  skew kurtosis    se
## Alcohol                14.83    3.80 -0.05    -0.89  0.06
## Malic_acid              5.80    5.06  1.02     0.22  0.08
## Ash                     3.23    1.87 -0.17     1.03  0.02
## Alcalinity             30.00   19.40  0.21     0.40  0.25
## Magnesium             162.00   92.00  1.08     1.96  1.07
## Total_phenols           3.88    2.90  0.09    -0.87  0.05
## Flavanoids              5.08    4.74  0.02    -0.91  0.07
## Nonflavanoid_phenols    0.66    0.53  0.44    -0.68  0.01
## Proanthocyanins         3.58    3.17  0.51     0.47  0.04
## Color_intensity        13.00   11.72  0.85     0.30  0.17
## Hue                     1.71    1.23  0.02    -0.40  0.02
## OD280_OD315             4.00    2.73 -0.30    -1.11  0.05
## Proline              1680.00 1402.00  0.75    -0.31 23.60
## Class                   3.00    2.00  0.11    -1.34  0.06
# Elbow method: total within-cluster sum of squares for k = 1..10.
# df is the standardized feature matrix built during preprocessing, so it is
# not recomputed here.
set.seed(123)
wss <- vapply(1:10, function(k) {
  # 20 random starts per k; kmeans() keeps the best of them.
  kmeans(df, centers = k, nstart = 20)$tot.withinss
}, numeric(1))

# Single-panel layout, then WSS against k.
par(mfrow = c(1, 1))
plot(
  seq_along(wss), wss,
  type = "b", pch = 19, frame = FALSE,
  xlab = "Number of clusters K",
  ylab = "Total within-clusters sum of squares",
  main = "Elbow Method"
)

# Average silhouette width of a k-means partition.
#
# Arguments:
#   k      Number of clusters. Must be at least 2, because silhouette widths
#          are undefined for a single cluster.
#   data   Numeric matrix or data frame to cluster. Defaults to the global
#          df, so existing calls of the form avg_sil(k) behave as before.
#   nstart Number of random starts handed to kmeans(). Defaults to 25.
#
# Returns:
#   A single numeric value: the mean silhouette width over all observations.
avg_sil <- function(k, data = df, nstart = 25) {
  if (k < 2) {
    stop(
      "avg_sil() requires k >= 2: silhouette widths are undefined ",
      "for a single cluster",
      call. = FALSE
    )
  }
  km_fit <- kmeans(data, centers = k, nstart = nstart)
  sil <- silhouette(km_fit$cluster, dist(data))
  # Select the width column by name instead of by position.
  mean(sil[, "sil_width"])
}
# Silhouette analysis: average silhouette width for k = 2..10.
k_values <- 2:10
# vapply() enforces exactly one numeric value per k.
avg_sil_values <- vapply(k_values, avg_sil, numeric(1))

# Single-panel layout, then average width against k.
par(mfrow = c(1, 1))
plot(
  k_values, avg_sil_values,
  type = "b", pch = 19, frame = FALSE,
  xlab = "Number of clusters K",
  ylab = "Average Silhouette Width",
  main = "Silhouette Analysis"
)

# Fit every clustering model on the standardized matrix df.
# Seed the RNG first so that the randomly initialised methods produce the
# same partition on every run.
set.seed(123)

# K-means with k = 3. Several random starts are used (as in the silhouette
# analysis) so the result is the best of 25 fits rather than one random
# initialisation that may be a poor local optimum.
km_res <- kmeans(df, centers = 3, nstart = 25)

# K-medians with k = 3.
kmed_res <- kcca(df, k = 3, family = kccaFamily("kmedians"))
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'

# DBSCAN.
# NOTE(review): eps = 0.7 and MinPts = 5 are fixed by hand; confirm they suit
# 13 standardized features (e.g. with a k-nearest-neighbour distance plot).
db_res <- dbscan(df, eps = 0.7, MinPts = 5)

# Mean shift with the function's default settings.
ms_res <- meanShift(df)

# Fuzzy C-means with 3 centers.
fcm_res <- cmeans(df, centers = 3, m = 2) # m = 2 is the fuzziness parameter
# Scatter plots of the standardized data, one per method, with points
# coloured by cluster assignment. df is a matrix, so plot() draws its first
# two columns against each other.

# K-means assignments.
plot(df, main = "K-means", col = km_res$cluster)

# K-medians assignments.
plot(df, main = "K-medians", col = clusters(kmed_res))

# DBSCAN assignments, shifted by one so that label 0 still maps to a colour.
plot(df, main = "DBSCAN (0 = Noise)", col = db_res$cluster + 1L)

# Mean shift assignments.
plot(df, main = "Mean Shift", col = ms_res$assignment)

# Fuzzy C-means hard assignments.
plot(df, main = "Fuzzy C-means", col = fcm_res$cluster)

# Reference labels from the Class column, for visual comparison.
plot(df, main = "Class", col = as.factor(label))

# Evaluation of the k-means partition.
# The pairwise distance matrix is needed by both the silhouette and
# cluster.stats(), so it is computed once and reused.
dist_df <- dist(df)

# Average silhouette width.
mean(silhouette(km_res$cluster, dist_df)[, "sil_width"])
## [1] 0.2848589

# Internal validation statistics.
stats <- cluster.stats(dist_df, km_res$cluster)
paste("Dunn Index:", stats$dunn)
## [1] "Dunn Index: 0.232256713904192"
paste("Within-cluster SS:", stats$within.cluster.ss)
## [1] "Within-cluster SS: 1270.74911531181"

# External validation: agreement between the clusters and the Class labels.
ari_score <- adjustedRandIndex(km_res$cluster, label)
print(paste("Adjusted Rand Index:", ari_score))
## [1] "Adjusted Rand Index: 0.897494981509321"

# Per-cluster mean of every original (unscaled) feature.
aggregate(data_clustering, by = list(cluster = km_res$cluster), FUN = mean)
##   cluster  Alcohol Malic_acid      Ash Alcalinity Magnesium Total_phenols
## 1       1 13.67677   1.997903 2.466290   17.46290 107.96774      2.847581
## 2       2 12.25092   1.897385 2.231231   20.06308  92.73846      2.247692
## 3       3 13.13412   3.307255 2.417647   21.24118  98.66667      1.683922
##   Flavanoids Nonflavanoid_phenols Proanthocyanins Color_intensity       Hue
## 1  3.0032258            0.2920968        1.922097        5.453548 1.0654839
## 2  2.0500000            0.3576923        1.624154        2.973077 1.0627077
## 3  0.8188235            0.4519608        1.145882        7.234706 0.6919608
##   OD280_OD315   Proline
## 1    3.163387 1100.2258
## 2    2.803385  510.1692
## 3    1.696667  619.0588