#LOAD LIBRARY
library(tidyverse)
library(flexclust)
library(dbscan)
library(meanShiftR)
library(e1071)
library(cluster)
library(fpc)
library(mclust)
# DATA PREPARATION ----
# Read the raw wine measurements from the CSV in the working directory.
wine <- read.csv("Wine dataset.csv")
# Keep the known class aside; it is not used for fitting the clusterings.
label <- wine[["class"]]
# Feature table only: remove the class column before clustering.
wine_clean <- select(wine, -class)
# Centre and scale every column (scale() defaults), then convert the
# resulting matrix back into a data frame.
df <- as.data.frame(scale(wine_clean))
# Fix the RNG state so the randomly initialised methods are repeatable.
set.seed(123)
# DESCRIPTIVE STATISTICS ----
# Per-column summary of the unscaled features.
summary(wine_clean)
## Alcohol Malic.acid Ash Alcalinity.of.ash
## Min. :11.03 Min. :0.740 Min. :1.360 Min. :10.60
## 1st Qu.:12.36 1st Qu.:1.603 1st Qu.:2.210 1st Qu.:17.20
## Median :13.05 Median :1.865 Median :2.360 Median :19.50
## Mean :13.00 Mean :2.336 Mean :2.367 Mean :19.49
## 3rd Qu.:13.68 3rd Qu.:3.083 3rd Qu.:2.558 3rd Qu.:21.50
## Max. :14.83 Max. :5.800 Max. :3.230 Max. :30.00
## Magnesium Total.phenols Flavanoids Nonflavanoid.phenols
## Min. : 70.00 Min. :0.980 Min. :0.340 Min. :0.1300
## 1st Qu.: 88.00 1st Qu.:1.742 1st Qu.:1.205 1st Qu.:0.2700
## Median : 98.00 Median :2.355 Median :2.135 Median :0.3400
## Mean : 99.74 Mean :2.295 Mean :2.029 Mean :0.3619
## 3rd Qu.:107.00 3rd Qu.:2.800 3rd Qu.:2.875 3rd Qu.:0.4375
## Max. :162.00 Max. :3.880 Max. :5.080 Max. :0.6600
## Proanthocyanins Color.intensity Hue OD280.OD315.of.diluted.wines
## Min. :0.410 Min. : 1.280 Min. :0.4800 Min. :1.270
## 1st Qu.:1.250 1st Qu.: 3.220 1st Qu.:0.7825 1st Qu.:1.938
## Median :1.555 Median : 4.690 Median :0.9650 Median :2.780
## Mean :1.591 Mean : 5.058 Mean :0.9574 Mean :2.612
## 3rd Qu.:1.950 3rd Qu.: 6.200 3rd Qu.:1.1200 3rd Qu.:3.170
## Max. :3.580 Max. :13.000 Max. :1.7100 Max. :4.000
## Proline
## Min. : 278.0
## 1st Qu.: 500.5
## Median : 673.5
## Mean : 746.9
## 3rd Qu.: 985.0
## Max. :1680.0
# ELBOW METHOD ----
# Total within-cluster sum of squares (WSS) of k-means for each candidate K.
# The K at which the curve stops dropping sharply is the "elbow".
elbow_k <- 1:10
# vapply() instead of sapply(): the result is guaranteed to be a numeric
# vector with exactly one WSS value per K, or the call fails loudly.
wss <- vapply(elbow_k, function(k) {
  # nstart = 20 keeps the best of 20 random initialisations for this K.
  kmeans(df, centers = k, nstart = 20)$tot.withinss
}, numeric(1))
# Same K grid on the x-axis as was used to compute wss.
plot(elbow_k, wss, type = "b", pch = 19,
     xlab = "Number of clusters K",
     ylab = "WSS",
     main = "Elbow Method")
# SILHOUETTE ANALYSIS ----
# The pairwise distances do not depend on K, so compute them once instead of
# once per avg_sil() call.
sil_dist <- dist(df)

# Average silhouette width of a k-means partition of `df`.
#   k: number of clusters to fit (called below with k = 2..10).
# Returns a single numeric value: the mean of the per-observation
# silhouette widths.
avg_sil <- function(k) {
  # nstart = 25 keeps the best of 25 random initialisations.
  km <- kmeans(df, centers = k, nstart = 25)
  ss <- silhouette(km$cluster, sil_dist)
  # Select the width column by name rather than by position.
  mean(ss[, "sil_width"])
}

k_values <- 2:10
# vapply() enforces exactly one numeric score per K.
avg_sil_values <- vapply(k_values, avg_sil, numeric(1))
plot(k_values, avg_sil_values, type = "b", pch = 19,
     xlab = "Number of clusters K",
     ylab = "Silhouette Score",
     main = "Silhouette Analysis")
# CLUSTERING ----
# Fit five clustering methods on the standardised features in `df`.

# 1. K-means
# nstart = 25 (the same setting as the silhouette analysis) keeps the best of
# 25 random initialisations; a single start can stop in a poor local optimum.
# NOTE(review): the extra starts consume more random numbers, so results of
# the randomly initialised methods below may differ slightly from earlier runs.
km_res <- kmeans(df, centers = 3, nstart = 25)

# 2. K-median (flexclust::kcca with the "kmedians" family)
kmed_res <- kcca(df, k = 3, family = kccaFamily("kmedians"))

# 3. DBSCAN
# Use the eps read off the kNN distance plot (e.g. 4).
# minPts is set to the number of features plus one.
db_res <- dbscan::dbscan(df, eps = 4, minPts = ncol(df) + 1)

# 4. Mean Shift (package defaults; the function is given a matrix)
ms_res <- meanShift(as.matrix(df))

# 5. Fuzzy C-means with fuzzifier m = 2
fcm_res <- cmeans(df, centers = 3, m = 2)
# VISUAL COMPARISON ----
# Draw the six panels on a 2 x 3 grid. par() returns the previous settings,
# which are restored afterwards so later plots are not forced into this grid.
old_par <- par(mfrow = c(2, 3))
# Every panel shows the first two columns of df, coloured by cluster label.
plot(df[, 1:2], col = km_res$cluster, main = "K-means")
plot(df[, 1:2], col = clusters(kmed_res), main = "K-median")
# DBSCAN labels noise as 0; +1 shifts all labels so noise gets colour 1.
plot(df[, 1:2], col = db_res$cluster + 1, main = "DBSCAN")
plot(df[, 1:2], col = ms_res$assignment, main = "Mean Shift")
plot(df[, 1:2], col = fcm_res$cluster, main = "Fuzzy C-means")
# Map the class labels to 1..K so they are valid colour indices whatever
# their original coding (0-based integers, strings, factor).
plot(df[, 1:2], col = as.integer(factor(label)), main = "True Class")
par(old_par)
# K-MEANS VALIDATION ----
# External validity: adjusted Rand index (compare with the original class)
ari_km <- adjustedRandIndex(km_res$cluster, label)
# Internal validity: Dunn index, taken from the cluster.stats() result
stats <- cluster.stats(dist(df), km_res$cluster)
dunn_km <- stats[["dunn"]]
# Internal validity: mean silhouette width over all observations
sil_km <- mean(silhouette(km_res$cluster, dist(df))[, "sil_width"])
# Print the three scores: silhouette, Dunn, ARI
sil_km
## [1] 0.2848589
dunn_km
## [1] 0.2322567
ari_km
## [1] 0.897495
# SILHOUETTE FOR THE 5 METHODS ----
library(cluster)
# Distance matrix shared by every method that clusters all observations
d <- dist(df)

# Mean silhouette width of a hard clustering, or NA when it is undefined.
#   cl:   one cluster label per observation
#   dmat: "dist" object over the same observations, in the same order
# With fewer than two clusters, or with as many clusters as observations,
# there is no silhouette matrix to index, so NA is returned instead of
# failing on the column lookup. `dmat` is only evaluated when it is needed.
mean_sil <- function(cl, dmat) {
  cl_vec <- as.vector(cl)
  n_clusters <- length(unique(cl_vec))
  if (n_clusters < 2L || n_clusters >= length(cl_vec)) {
    return(NA_real_)
  }
  mean(silhouette(cl_vec, dmat)[, "sil_width"])
}

# --- 1. K-MEANS ---
sil_km <- mean_sil(km_res$cluster, d)
# --- 2. K-MEDIAN ---
sil_kmed <- mean_sil(clusters(kmed_res), d)
# --- 3. DBSCAN ---
db_cluster <- db_res$cluster
# Drop noise points (cluster = 0) before scoring
is_clustered <- db_cluster != 0
cluster_db <- db_cluster[is_clustered]
df_db <- df[is_clustered, , drop = FALSE]
# Distances are recomputed on the retained rows only; mean_sil() yields NA
# when fewer than two clusters remain.
sil_db <- mean_sil(cluster_db, dist(df_db))
# --- 4. MEAN SHIFT ---
sil_ms <- mean_sil(ms_res$assignment, d)
# --- 5. FUZZY C-MEANS ---
sil_fcm <- mean_sil(fcm_res$cluster, d)
# RESULTS
sil_results <- data.frame(
  Method = c("K-means", "K-median", "DBSCAN", "Mean Shift", "Fuzzy C-means"),
  Silhouette = c(sil_km, sil_kmed, sil_db, sil_ms, sil_fcm)
)
sil_results
## Method Silhouette
## 1 K-means 0.28485892
## 2 K-median 0.28179105
## 3 DBSCAN NA
## 4 Mean Shift -0.01536785
## 5 Fuzzy C-means 0.28485892