df <- read.csv("wine-clustering.csv", stringsAsFactors = FALSE)
# cek struktur data
str(df)
## 'data.frame': 178 obs. of 13 variables:
## $ Alcohol : num 14.2 13.2 13.2 14.4 13.2 ...
## $ Malic_Acid : num 1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
## $ Ash : num 2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
## $ Ash_Alcanity : num 15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
## $ Magnesium : int 127 100 101 113 118 112 96 121 97 98 ...
## $ Total_Phenols : num 2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
## $ Flavanoids : num 3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
## $ Nonflavanoid_Phenols: num 0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
## $ Proanthocyanins : num 2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
## $ Color_Intensity : num 5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
## $ Hue : num 1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
## $ OD280 : num 3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
## $ Proline : int 1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...
head(df)
## Alcohol Malic_Acid Ash Ash_Alcanity Magnesium Total_Phenols Flavanoids
## 1 14.23 1.71 2.43 15.6 127 2.80 3.06
## 2 13.20 1.78 2.14 11.2 100 2.65 2.76
## 3 13.16 2.36 2.67 18.6 101 2.80 3.24
## 4 14.37 1.95 2.50 16.8 113 3.85 3.49
## 5 13.24 2.59 2.87 21.0 118 2.80 2.69
## 6 14.20 1.76 2.45 15.2 112 3.27 3.39
## Nonflavanoid_Phenols Proanthocyanins Color_Intensity Hue OD280 Proline
## 1 0.28 2.29 5.64 1.04 3.92 1065
## 2 0.26 1.28 4.38 1.05 3.40 1050
## 3 0.30 2.81 5.68 1.03 3.17 1185
## 4 0.24 2.18 7.80 0.86 3.45 1480
## 5 0.39 1.82 4.32 1.04 2.93 735
## 6 0.34 1.97 6.75 1.05 2.85 1450
# cek missing value
colSums(is.na(df))
## Alcohol Malic_Acid Ash
## 0 0 0
## Ash_Alcanity Magnesium Total_Phenols
## 0 0 0
## Flavanoids Nonflavanoid_Phenols Proanthocyanins
## 0 0 0
## Color_Intensity Hue OD280
## 0 0 0
## Proline
## 0
# statistik deskriptif
data_deskriptif <- psych::describe(df)
print(data_deskriptif)
## vars n mean sd median trimmed mad min
## Alcohol 1 178 13.00 0.81 13.05 13.01 1.01 11.03
## Malic_Acid 2 178 2.34 1.12 1.87 2.21 0.77 0.74
## Ash 3 178 2.37 0.27 2.36 2.37 0.24 1.36
## Ash_Alcanity 4 178 19.49 3.34 19.50 19.42 3.04 10.60
## Magnesium 5 178 99.74 14.28 98.00 98.44 14.83 70.00
## Total_Phenols 6 178 2.30 0.63 2.36 2.29 0.75 0.98
## Flavanoids 7 178 2.03 1.00 2.13 2.02 1.24 0.34
## Nonflavanoid_Phenols 8 178 0.36 0.12 0.34 0.36 0.13 0.13
## Proanthocyanins 9 178 1.59 0.57 1.56 1.56 0.56 0.41
## Color_Intensity 10 178 5.06 2.32 4.69 4.83 2.24 1.28
## Hue 11 178 0.96 0.23 0.96 0.96 0.24 0.48
## OD280 12 178 2.61 0.71 2.78 2.63 0.77 1.27
## Proline 13 178 746.89 314.91 673.50 719.30 300.23 278.00
## max range skew kurtosis se
## Alcohol 14.83 3.80 -0.05 -0.89 0.06
## Malic_Acid 5.80 5.06 1.02 0.22 0.08
## Ash 3.23 1.87 -0.17 1.03 0.02
## Ash_Alcanity 30.00 19.40 0.21 0.40 0.25
## Magnesium 162.00 92.00 1.08 1.96 1.07
## Total_Phenols 3.88 2.90 0.09 -0.87 0.05
## Flavanoids 5.08 4.74 0.02 -0.91 0.07
## Nonflavanoid_Phenols 0.66 0.53 0.44 -0.68 0.01
## Proanthocyanins 3.58 3.17 0.51 0.47 0.04
## Color_Intensity 13.00 11.72 0.85 0.30 0.17
## Hue 1.71 1.23 0.02 -0.40 0.02
## OD280 4.00 2.73 -0.30 -1.11 0.05
## Proline 1680.00 1402.00 0.75 -0.31 23.60
# histogram
wine_long <- reshape2::melt(df)
## No id variables; using all as measure variables
ggplot(wine_long, aes(x = value)) +
geom_histogram(bins = 20, fill = "skyblue", color = "black") +
facet_wrap(~ variable, scales = "free") +
theme_minimal()
# heatmap korelasi
cor_mat <- cor(df, use = "complete.obs")
round(cor_mat, 2)
## Alcohol Malic_Acid Ash Ash_Alcanity Magnesium
## Alcohol 1.00 0.09 0.21 -0.31 0.27
## Malic_Acid 0.09 1.00 0.16 0.29 -0.05
## Ash 0.21 0.16 1.00 0.44 0.29
## Ash_Alcanity -0.31 0.29 0.44 1.00 -0.08
## Magnesium 0.27 -0.05 0.29 -0.08 1.00
## Total_Phenols 0.29 -0.34 0.13 -0.32 0.21
## Flavanoids 0.24 -0.41 0.12 -0.35 0.20
## Nonflavanoid_Phenols -0.16 0.29 0.19 0.36 -0.26
## Proanthocyanins 0.14 -0.22 0.01 -0.20 0.24
## Color_Intensity 0.55 0.25 0.26 0.02 0.20
## Hue -0.07 -0.56 -0.07 -0.27 0.06
## OD280 0.07 -0.37 0.00 -0.28 0.07
## Proline 0.64 -0.19 0.22 -0.44 0.39
## Total_Phenols Flavanoids Nonflavanoid_Phenols
## Alcohol 0.29 0.24 -0.16
## Malic_Acid -0.34 -0.41 0.29
## Ash 0.13 0.12 0.19
## Ash_Alcanity -0.32 -0.35 0.36
## Magnesium 0.21 0.20 -0.26
## Total_Phenols 1.00 0.86 -0.45
## Flavanoids 0.86 1.00 -0.54
## Nonflavanoid_Phenols -0.45 -0.54 1.00
## Proanthocyanins 0.61 0.65 -0.37
## Color_Intensity -0.06 -0.17 0.14
## Hue 0.43 0.54 -0.26
## OD280 0.70 0.79 -0.50
## Proline 0.50 0.49 -0.31
## Proanthocyanins Color_Intensity Hue OD280 Proline
## Alcohol 0.14 0.55 -0.07 0.07 0.64
## Malic_Acid -0.22 0.25 -0.56 -0.37 -0.19
## Ash 0.01 0.26 -0.07 0.00 0.22
## Ash_Alcanity -0.20 0.02 -0.27 -0.28 -0.44
## Magnesium 0.24 0.20 0.06 0.07 0.39
## Total_Phenols 0.61 -0.06 0.43 0.70 0.50
## Flavanoids 0.65 -0.17 0.54 0.79 0.49
## Nonflavanoid_Phenols -0.37 0.14 -0.26 -0.50 -0.31
## Proanthocyanins 1.00 -0.03 0.30 0.52 0.33
## Color_Intensity -0.03 1.00 -0.52 -0.43 0.32
## Hue 0.30 -0.52 1.00 0.57 0.24
## OD280 0.52 -0.43 0.57 1.00 0.31
## Proline 0.33 0.32 0.24 0.31 1.00
ggcorrplot(
cor_mat,
hc.order = TRUE,
type = "lower",
lab = TRUE,
lab_size = 2
) +
theme(
axis.text.x = element_text(size = 8),
axis.text.y = element_text(size = 8)
)
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the ggcorrplot package.
## Please report the issue at <https://github.com/kassambara/ggcorrplot/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Preprocessing
# standardisasi
wine_scaled <- scale(df)
wine_scaled <- as.data.frame(wine_scaled)
wine_mat <- as.matrix(wine_scaled)
summary(wine_scaled)
## Alcohol Malic_Acid Ash Ash_Alcanity
## Min. :-2.42739 Min. :-1.4290 Min. :-3.66881 Min. :-2.663505
## 1st Qu.:-0.78603 1st Qu.:-0.6569 1st Qu.:-0.57051 1st Qu.:-0.687199
## Median : 0.06083 Median :-0.4219 Median :-0.02375 Median : 0.001514
## Mean : 0.00000 Mean : 0.0000 Mean : 0.00000 Mean : 0.000000
## 3rd Qu.: 0.83378 3rd Qu.: 0.6679 3rd Qu.: 0.69614 3rd Qu.: 0.600395
## Max. : 2.25341 Max. : 3.1004 Max. : 3.14745 Max. : 3.145637
## Magnesium Total_Phenols Flavanoids Nonflavanoid_Phenols
## Min. :-2.0824 Min. :-2.10132 Min. :-1.6912 Min. :-1.8630
## 1st Qu.:-0.8221 1st Qu.:-0.88298 1st Qu.:-0.8252 1st Qu.:-0.7381
## Median :-0.1219 Median : 0.09569 Median : 0.1059 Median :-0.1756
## Mean : 0.0000 Mean : 0.00000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.5082 3rd Qu.: 0.80672 3rd Qu.: 0.8467 3rd Qu.: 0.6078
## Max. : 4.3591 Max. : 2.53237 Max. : 3.0542 Max. : 2.3956
## Proanthocyanins Color_Intensity Hue OD280
## Min. :-2.06321 Min. :-1.6297 Min. :-2.08884 Min. :-1.8897
## 1st Qu.:-0.59560 1st Qu.:-0.7929 1st Qu.:-0.76540 1st Qu.:-0.9496
## Median :-0.06272 Median :-0.1588 Median : 0.03303 Median : 0.2371
## Mean : 0.00000 Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.62741 3rd Qu.: 0.4926 3rd Qu.: 0.71116 3rd Qu.: 0.7864
## Max. : 3.47527 Max. : 3.4258 Max. : 3.29241 Max. : 1.9554
## Proline
## Min. :-1.4890
## 1st Qu.:-0.7824
## Median :-0.2331
## Mean : 0.0000
## 3rd Qu.: 0.7561
## Max. : 2.9631
# Visualisasi semua metode
pca_res <- prcomp(wine_scaled, center = TRUE, scale. = FALSE)
pca_df <- data.frame(PC1 = pca_res$x[, 1], PC2 = pca_res$x[, 2])
# Menentukan jumlah cluster awal
set.seed(123)
fviz_nbclust(wine_scaled, kmeans, method = "wss")
set.seed(123)
km_res <- kmeans(wine_scaled, centers = 3, nstart = 25)
table(km_res$cluster)
##
## 1 2 3
## 51 62 65
km_res$centers
## Alcohol Malic_Acid Ash Ash_Alcanity Magnesium Total_Phenols
## 1 0.1644436 0.8690954 0.1863726 0.5228924 -0.07526047 -0.97657548
## 2 0.8328826 -0.3029551 0.3636801 -0.6084749 0.57596208 0.88274724
## 3 -0.9234669 -0.3929331 -0.4931257 0.1701220 -0.49032869 -0.07576891
## Flavanoids Nonflavanoid_Phenols Proanthocyanins Color_Intensity Hue
## 1 -1.21182921 0.72402116 -0.77751312 0.9388902 -1.1615122
## 2 0.97506900 -0.56050853 0.57865427 0.1705823 0.4726504
## 3 0.02075402 -0.03343924 0.05810161 -0.8993770 0.4605046
## OD280 Proline
## 1 -1.2887761 -0.4059428
## 2 0.7770551 1.1220202
## 3 0.2700025 -0.7517257
# visualisasi
pca_km <- pca_df %>%
mutate(cluster = factor(km_res$cluster))
ggplot(pca_km, aes(x = PC1, y = PC2, color = cluster)) +
geom_point(size = 2) +
labs(title = "Visualisasi K-Means") +
theme_minimal()
# evaluasi silhouette
sil_km <- silhouette(km_res$cluster, dist(wine_scaled))
mean_km <- mean(sil_km[, 3])
mean_km
## [1] 0.2848589
plot(sil_km)
# K-Median
set.seed(123)
kmed_res <- kcca(wine_mat, k = 3, family = kccaFamily("kmedians"))
kmed_cluster <- clusters(kmed_res)
table(kmed_cluster)
## kmed_cluster
## 1 2 3
## 65 63 50
# visualisasi PCA
pca_kmed <- pca_df %>%
mutate(cluster = factor(kmed_cluster))
ggplot(pca_kmed, aes(x = PC1, y = PC2, color = cluster)) +
geom_point(size = 2) +
labs(title = "Visualisasi K-Median") +
theme_minimal()
# evaluasi silhouette
sil_kmed <- silhouette(kmed_cluster, dist(wine_scaled))
mean_kmed <- mean(sil_kmed[, 3])
mean_kmed
## [1] 0.281791
plot(sil_kmed)
# DBSCAN
# coba beberapa eps
for (e in c(2.6, 2.8, 3.0, 3.2)) {
cat("\nEPS =", e, "\n")
res <- dbscan::dbscan(wine_mat, eps = e, minPts = 5)
print(table(res$cluster))
}
##
## EPS = 2.6
##
## 0 1
## 19 159
##
## EPS = 2.8
##
## 0 1
## 15 163
##
## EPS = 3
##
## 0 1
## 11 167
##
## EPS = 3.2
##
## 0 1
## 8 170
kNNdistplot(wine_mat, k = 4)
abline(h = 2.8, lty = 2, col = "red")
db_res <- dbscan::dbscan(wine_mat, eps = 2.8, minPts = 5)
table(db_res$cluster)
##
## 0 1
## 15 163
# visualisasi PCA
pca_db <- pca_df %>%
mutate(cluster = factor(db_res$cluster))
ggplot(pca_db, aes(x = PC1, y = PC2, color = cluster)) +
geom_point(size = 2) +
labs(title = "Visualisasi DBSCAN") +
theme_minimal()
# evaluasi silhouette hanya untuk non-noise
non_noise <- db_res$cluster != 0
mean_db <- NA
if (sum(non_noise) > 1 && length(unique(db_res$cluster[non_noise])) > 1) {
sil_db <- silhouette(
db_res$cluster[non_noise],
dist(wine_scaled[non_noise, , drop = FALSE])
)
mean_db <- mean(sil_db[, 3])
mean_db
plot(sil_db)
} else {
cat("Silhouette DBSCAN tidak dapat dihitung karena cluster non-noise kurang dari 2.\n")
}
## Silhouette DBSCAN tidak dapat dihitung karena cluster non-noise kurang dari 2.
for (b in c(0.2, 0.5, 0.8, 1.0, 1.2, 1.5)) {
cat("\nBandwidth =", b, "\n")
res <- meanShift(
wine_mat,
bandwidth = rep(b, ncol(wine_mat)),
algorithm = "KDTREE"
)
print(length(unique(as.vector(res$assignment))))
}
##
## Bandwidth = 0.2
## [1] 178
##
## Bandwidth = 0.5
## [1] 178
##
## Bandwidth = 0.8
## [1] 178
##
## Bandwidth = 1
## [1] 178
##
## Bandwidth = 1.2
## [1] 178
##
## Bandwidth = 1.5
## [1] 178
bw <- rep(1.2, ncol(wine_mat))
ms_res <- meanShift(
wine_mat,
bandwidth = bw,
algorithm = "KDTREE"
)
ms_cluster <- as.vector(ms_res$assignment)
table(ms_cluster)
## ms_cluster
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# visualisasi PCA
pca_ms <- pca_df %>%
mutate(cluster = factor(ms_cluster))
ggplot(pca_ms, aes(x = PC1, y = PC2, color = cluster)) +
geom_point(size = 2) +
labs(title = "Visualisasi Mean Shift berdasarkan PCA") +
theme_minimal() +
theme(
legend.position = "none"
)
# evaluasi silhouette
mean_ms <- NA
if (length(unique(ms_cluster)) > 1) {
sil_ms <- tryCatch(
cluster::silhouette(as.integer(ms_cluster), dist(wine_scaled)),
error = function(e) NULL
)
if (!is.null(sil_ms)) {
sil_ms_num <- unclass(sil_ms)
if (is.matrix(sil_ms_num) && ncol(sil_ms_num) >= 3) {
mean_ms <- mean(sil_ms_num[, 3], na.rm = TRUE)
print(mean_ms)
plot(sil_ms)
} else {
cat("Silhouette Mean Shift tidak memiliki format yang sesuai.\n")
}
} else {
cat("Silhouette Mean Shift tidak berhasil dihitung.\n")
}
} else {
cat("Silhouette Mean Shift tidak dapat dihitung karena hanya terbentuk 1 cluster.\n")
}
## Silhouette Mean Shift tidak memiliki format yang sesuai.
set.seed(123)
fcm_res <- ppclust::fcm(
x = wine_mat,
centers = 3,
m = 2
)
# ubah membership jadi hard cluster
fcm_cluster <- max.col(fcm_res$u, ties.method = "first")
table(fcm_cluster)
## fcm_cluster
## 1 2 3
## 62 51 65
# visualisasi PCA
pca_fcm <- pca_df %>%
mutate(cluster = factor(fcm_cluster))
ggplot(pca_fcm, aes(x = PC1, y = PC2, color = cluster)) +
geom_point(size = 2) +
labs(title = "Visualisasi Fuzzy C-Means") +
theme_minimal()
# evaluasi silhouette
sil_fcm <- silhouette(fcm_cluster, dist(wine_scaled))
mean_fcm <- mean(sil_fcm[, 3])
mean_fcm
## [1] 0.2848589
plot(sil_fcm)
# Perbandingan Metode
hasil_eval <- data.frame(
Metode = c("K-Means", "K-Median", "DBSCAN", "Mean Shift", "Fuzzy C-Means"),
Silhouette = c(mean_km, mean_kmed, mean_db, mean_ms, mean_fcm)
)
hasil_eval %>%
arrange(desc(Silhouette))
## Metode Silhouette
## 1 K-Means 0.2848589
## 2 Fuzzy C-Means 0.2848589
## 3 K-Median 0.2817910
## 4 DBSCAN NA
## 5 Mean Shift NA