Tugas ini bertujuan untuk menerapkan metode clustering pada dataset Wine yang memiliki 13 fitur numerik. Proses dimulai dari preprocessing data dan normalisasi, kemudian dilanjutkan dengan penentuan jumlah cluster optimal menggunakan metode Elbow dan Silhouette. Selanjutnya, clustering dilakukan menggunakan K-Means, K-Medians, DBSCAN, Mean Shift, dan Fuzzy C-Means. Hasil clustering divisualisasikan, lalu kualitasnya dievaluasi menggunakan Silhouette Score, Dunn Index, dan Adjusted Rand Index (ARI).
# Load the Wine data set; the first row of the CSV holds the column names.
data <- read.csv("wine_dataset.csv", header = TRUE)
# Keep only the numeric columns. vapply() always yields a logical mask (even
# for a data frame with no columns), and drop = FALSE keeps the result a data
# frame when just one column qualifies.
data_numeric <- data[, vapply(data, is.numeric, logical(1)), drop = FALSE]
# Print the structure of the retained columns as a quick sanity check.
str(data_numeric)
## 'data.frame': 178 obs. of 14 variables:
## $ Alcohol : num 14.2 13.2 13.2 14.4 13.2 ...
## $ Malic_acid : num 1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
## $ Ash : num 2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
## $ Alcalinity : num 15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
## $ Magnesium : int 127 100 101 113 118 112 96 121 97 98 ...
## $ Total_phenols : num 2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
## $ Flavanoids : num 3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
## $ Nonflavanoid_phenols: num 0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
## $ Proanthocyanins : num 2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
## $ Color_intensity : num 5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
## $ Hue : num 1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
## $ OD280_OD315 : num 3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
## $ Proline : int 1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...
## $ Class : int 1 1 1 1 1 1 1 1 1 1 ...
# Numeric-only view of the raw data. vapply() guarantees a logical mask and
# drop = FALSE prevents a single matching column from collapsing to a vector.
data_numeric <- data[, vapply(data, is.numeric, logical(1)), drop = FALSE]
# Discard every row that contains a missing value.
data_numeric <- na.omit(data_numeric)
# Set the known class labels aside; they are not clustering features and are
# only used further down for the reference plot and the Adjusted Rand Index.
label <- data_numeric[["Class"]]
# Every column except Class is a clustering feature.
data_clustering <- data_numeric[, colnames(data_numeric) != "Class",
                                drop = FALSE]
# Standardise the features so that large-valued columns do not dominate
# distance computations.
df <- scale(data_clustering)
# Descriptive statistics for all numeric columns (Class included).
describe(data_numeric)
## vars n mean sd median trimmed mad min
## Alcohol 1 178 13.00 0.81 13.05 13.01 1.01 11.03
## Malic_acid 2 178 2.34 1.12 1.87 2.21 0.77 0.74
## Ash 3 178 2.37 0.27 2.36 2.37 0.24 1.36
## Alcalinity 4 178 19.49 3.34 19.50 19.42 3.04 10.60
## Magnesium 5 178 99.74 14.28 98.00 98.44 14.83 70.00
## Total_phenols 6 178 2.30 0.63 2.36 2.29 0.75 0.98
## Flavanoids 7 178 2.03 1.00 2.13 2.02 1.24 0.34
## Nonflavanoid_phenols 8 178 0.36 0.12 0.34 0.36 0.13 0.13
## Proanthocyanins 9 178 1.59 0.57 1.56 1.56 0.56 0.41
## Color_intensity 10 178 5.06 2.32 4.69 4.83 2.24 1.28
## Hue 11 178 0.96 0.23 0.96 0.96 0.24 0.48
## OD280_OD315 12 178 2.61 0.71 2.78 2.63 0.77 1.27
## Proline 13 178 746.89 314.91 673.50 719.30 300.23 278.00
## Class 14 178 1.94 0.78 2.00 1.92 1.48 1.00
## max range skew kurtosis se
## Alcohol 14.83 3.80 -0.05 -0.89 0.06
## Malic_acid 5.80 5.06 1.02 0.22 0.08
## Ash 3.23 1.87 -0.17 1.03 0.02
## Alcalinity 30.00 19.40 0.21 0.40 0.25
## Magnesium 162.00 92.00 1.08 1.96 1.07
## Total_phenols 3.88 2.90 0.09 -0.87 0.05
## Flavanoids 5.08 4.74 0.02 -0.91 0.07
## Nonflavanoid_phenols 0.66 0.53 0.44 -0.68 0.01
## Proanthocyanins 3.58 3.17 0.51 0.47 0.04
## Color_intensity 13.00 11.72 0.85 0.30 0.17
## Hue 1.71 1.23 0.02 -0.40 0.02
## OD280_OD315 4.00 2.73 -0.30 -1.11 0.05
## Proline 1680.00 1402.00 0.75 -0.31 23.60
## Class 3.00 2.00 0.11 -1.34 0.06
# Standardised feature matrix used by every clustering step below.
df <- scale(data_clustering)
# Fix the RNG seed so the random K-means initialisations are reproducible.
set.seed(123)
# Elbow method: total within-cluster sum of squares for K = 1..10, each fit
# keeping the best of 20 random starts. vapply() enforces exactly one numeric
# value per K instead of relying on sapply()'s type guessing.
wss <- vapply(1:10, function(k) {
  kmeans(df, centers = k, nstart = 20)$tot.withinss
}, numeric(1))
# Single-panel layout for the elbow curve.
par(mfrow = c(1, 1))
plot(1:10, wss, type = "b", pch = 19, frame = FALSE,
     xlab = "Number of clusters K",
     ylab = "Total within-clusters sum of squares",
     main = "Elbow Method")
#' Average silhouette width of a K-means partition.
#'
#' Clusters the global matrix `df` into `k` groups (best of 25 random starts)
#' and averages the per-observation silhouette widths computed on the
#' pairwise distances of `df`.
#'
#' @param k Number of clusters to fit.
#' @return A single numeric value: the mean silhouette width.
avg_sil <- function(k) {
  fit <- kmeans(df, centers = k, nstart = 25)
  # The third column of a silhouette object is named "sil_width".
  widths <- silhouette(fit$cluster, dist(df))[, "sil_width"]
  mean(widths)
}
# Candidate cluster counts; the range starts at 2, so K = 1 is never scored.
k_values <- 2:10
# Mean silhouette width for each candidate K. vapply() enforces exactly one
# numeric value per K instead of relying on sapply()'s type guessing.
avg_sil_values <- vapply(k_values, avg_sil, numeric(1))
# Single-panel layout for the silhouette curve.
par(mfrow = c(1, 1))
plot(k_values, avg_sil_values, type = "b", pch = 19, frame = FALSE,
     xlab = "Number of clusters K",
     ylab = "Average Silhouette Width",
     main = "Silhouette Analysis")
# Final K-means fit with K = 3. nstart = 25 keeps the best of 25 random
# initialisations, matching the multi-start fits used during model selection;
# a single random start can converge to a poor local optimum. Cluster numbers
# are arbitrary labels and may be permuted between runs.
km_res <- kmeans(df, centers = 3, nstart = 25)
# K-medians with K = 3, fitted through kcca() with its "kmedians" family.
kmed_res <- kcca(df, k = 3, family = kccaFamily("kmedians"))
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
# DBSCAN on the standardised data with eps = 0.7 and MinPts = 5; the plot
# title further down treats cluster label 0 as noise.
# NOTE(review): the `MinPts` spelling matches fpc::dbscan (dbscan::dbscan
# uses `minPts`) -- confirm which package is attached.
db_res <- dbscan(df, eps = 0.7, MinPts = 5)
# Mean shift with the function's default settings; the per-observation
# cluster ids are read from `ms_res$assignment` when plotting.
ms_res <- meanShift(df)
# Fuzzy C-means with 3 centres; `fcm_res$cluster` is used for plotting.
fcm_res <- cmeans(df, centers = 3, m = 2) # m = 2 is the fuzziness parameter
# One scatter plot per labelling, in a fixed order: the five clustering
# results first, then the known classes for reference. DBSCAN labels are
# shifted by one so that label 0 still maps to a drawable colour.
# NOTE(review): df has 13 columns; base plot() on a matrix appears to use only
# the first two as x/y -- confirm this is the intended view.
invisible(Map(
  function(groups, panel_title) plot(df, col = groups, main = panel_title),
  list(
    km_res$cluster,
    clusters(kmed_res),
    db_res$cluster + 1L,
    ms_res$assignment,
    fcm_res$cluster,
    as.factor(label)
  ),
  c("K-means", "K-medians", "DBSCAN (0 = Noise)", "Mean Shift",
    "Fuzzy C-means", "Class")
))
# Pairwise distances of the standardised data, computed once and shared by
# both validation measures below instead of being rebuilt for each call.
df_dist <- dist(df)
# Average silhouette width of the K-means partition ("sil_width" is the third
# column of a silhouette object).
mean(silhouette(km_res$cluster, df_dist)[, "sil_width"])
## [1] 0.2848589
# Internal validation statistics for the same partition.
stats <- cluster.stats(df_dist, km_res$cluster)
paste("Dunn Index:", stats$dunn)
## [1] "Dunn Index: 0.232256713904192"
paste("Within-cluster SS:", stats$within.cluster.ss)
## [1] "Within-cluster SS: 1270.74911531181"
# External validation: agreement between the K-means partition and the known
# class labels, measured by the Adjusted Rand Index.
ari_score <- adjustedRandIndex(
  km_res$cluster,
  label
)
print(
  paste("Adjusted Rand Index:", ari_score)
)
## [1] "Adjusted Rand Index: 0.897494981509321"
# Cluster profiles: mean of every original (unscaled) feature within each
# K-means cluster.
aggregate(
  x = data_clustering,
  by = list(cluster = km_res$cluster),
  FUN = mean
)
## cluster Alcohol Malic_acid Ash Alcalinity Magnesium Total_phenols
## 1 1 13.67677 1.997903 2.466290 17.46290 107.96774 2.847581
## 2 2 12.25092 1.897385 2.231231 20.06308 92.73846 2.247692
## 3 3 13.13412 3.307255 2.417647 21.24118 98.66667 1.683922
## Flavanoids Nonflavanoid_phenols Proanthocyanins Color_intensity Hue
## 1 3.0032258 0.2920968 1.922097 5.453548 1.0654839
## 2 2.0500000 0.3576923 1.624154 2.973077 1.0627077
## 3 0.8188235 0.4519608 1.145882 7.234706 0.6919608
## OD280_OD315 Proline
## 1 3.163387 1100.2258
## 2 2.803385 510.1692
## 3 1.696667 619.0588