Analisis ini membandingkan lima metode clustering yaitu K-Means, K-Median, DBSCAN, Mean Shift, dan Fuzzy C-Means. Evaluasi dilakukan menggunakan satu metrik internal, yaitu silhouette score. Metode terbaik dipilih dari hasil evaluasi seluruh metode, lalu digunakan untuk analisis karakteristik cluster secara lebih rinci.
Dataset yang digunakan adalah Wholesale Customers.
Fokus clustering diarahkan pada perilaku pembelian pelanggan berdasarkan
variabel numerik transaksi. Variabel Channel dan
Region tidak dipakai sebagai fitur clustering, tetapi tetap
disimpan untuk membantu interpretasi hasil pada tahap EDA.
if (!require("tidyverse")) install.packages("tidyverse")
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyverse)
if (!require("cluster")) install.packages("cluster")
## Loading required package: cluster
## Warning: package 'cluster' was built under R version 4.5.3
library(cluster)
if (!require("factoextra")) install.packages("factoextra")
## Loading required package: factoextra
## Warning: package 'factoextra' was built under R version 4.5.3
## Welcome to factoextra!
## Want to learn more? See two factoextra-related books at https://www.datanovia.com/en/product/practical-guide-to-principal-component-methods-in-r/
library(factoextra)
if (!require("dbscan")) install.packages("dbscan")
## Loading required package: dbscan
## Warning: package 'dbscan' was built under R version 4.5.3
##
## Attaching package: 'dbscan'
## The following object is masked from 'package:stats':
##
## as.dendrogram
library(dbscan)
if (!require("flexclust")) install.packages("flexclust")
## Loading required package: flexclust
## Warning: package 'flexclust' was built under R version 4.5.3
library(flexclust)
if (!require("e1071")) install.packages("e1071")
## Loading required package: e1071
## Warning: package 'e1071' was built under R version 4.5.3
##
## Attaching package: 'e1071'
## The following object is masked from 'package:flexclust':
##
## bclust
## The following object is masked from 'package:ggplot2':
##
## element
library(e1071)
if (!require("meanShiftR")) install.packages("meanShiftR")
## Loading required package: meanShiftR
library(meanShiftR)
if (!require("fpc")) install.packages("fpc")
## Loading required package: fpc
## Warning: package 'fpc' was built under R version 4.5.3
##
## Attaching package: 'fpc'
## The following object is masked from 'package:dbscan':
##
## dbscan
library(fpc)
if (!require("corrplot")) install.packages("corrplot")
## Loading required package: corrplot
## corrplot 0.95 loaded
library(corrplot)
if (!require("gridExtra")) install.packages("gridExtra")
## Loading required package: gridExtra
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(gridExtra)
data <- read.csv("Wholesale customers data.csv")
head(data)
## Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicassen
## 1 2 3 12669 9656 7561 214 2674 1338
## 2 2 3 7057 9810 9568 1762 3293 1776
## 3 2 3 6353 8808 7684 2405 3516 7844
## 4 1 3 13265 1196 4221 6404 507 1788
## 5 2 3 22615 5410 7198 3915 1777 5185
## 6 2 3 9413 8259 5126 666 1795 1451
str(data)
## 'data.frame': 440 obs. of 8 variables:
## $ Channel : int 2 2 2 1 2 2 2 2 1 2 ...
## $ Region : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Fresh : int 12669 7057 6353 13265 22615 9413 12126 7579 5963 6006 ...
## $ Milk : int 9656 9810 8808 1196 5410 8259 3199 4956 3648 11093 ...
## $ Grocery : int 7561 9568 7684 4221 7198 5126 6975 9426 6192 18881 ...
## $ Frozen : int 214 1762 2405 6404 3915 666 480 1669 425 1159 ...
## $ Detergents_Paper: int 2674 3293 3516 507 1777 1795 3140 3321 1716 7425 ...
## $ Delicassen : int 1338 1776 7844 1788 5185 1451 545 2566 750 2098 ...
summary(data)
## Channel Region Fresh Milk
## Min. :1.000 Min. :1.000 Min. : 3 Min. : 55
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.: 3128 1st Qu.: 1533
## Median :1.000 Median :3.000 Median : 8504 Median : 3627
## Mean :1.323 Mean :2.543 Mean : 12000 Mean : 5796
## 3rd Qu.:2.000 3rd Qu.:3.000 3rd Qu.: 16934 3rd Qu.: 7190
## Max. :2.000 Max. :3.000 Max. :112151 Max. :73498
## Grocery Frozen Detergents_Paper Delicassen
## Min. : 3 Min. : 25.0 Min. : 3.0 Min. : 3.0
## 1st Qu.: 2153 1st Qu.: 742.2 1st Qu.: 256.8 1st Qu.: 408.2
## Median : 4756 Median : 1526.0 Median : 816.5 Median : 965.5
## Mean : 7951 Mean : 3071.9 Mean : 2881.5 Mean : 1524.9
## 3rd Qu.:10656 3rd Qu.: 3554.2 3rd Qu.: 3922.0 3rd Qu.: 1820.2
## Max. :92780 Max. :60869.0 Max. :40827.0 Max. :47943.0
colSums(is.na(data))
## Channel Region Fresh Milk
## 0 0 0 0
## Grocery Frozen Detergents_Paper Delicassen
## 0 0 0 0
sum(duplicated(data))
## [1] 0
data %>%
summarise(across(where(is.numeric),
list(mean = mean, median = median, sd = sd),
na.rm = TRUE))
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `across(...)`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
##
## # Previously
## across(a:b, mean, na.rm = TRUE)
##
## # Now
## across(a:b, \(x) mean(x, na.rm = TRUE))
## Channel_mean Channel_median Channel_sd Region_mean Region_median Region_sd
## 1 1.322727 1 0.4680516 2.543182 3 0.7742724
## Fresh_mean Fresh_median Fresh_sd Milk_mean Milk_median Milk_sd Grocery_mean
## 1 12000.3 8504 12647.33 5796.266 3627 7380.377 7951.277
## Grocery_median Grocery_sd Frozen_mean Frozen_median Frozen_sd
## 1 4755.5 9503.163 3071.932 1526 4854.673
## Detergents_Paper_mean Detergents_Paper_median Detergents_Paper_sd
## 1 2881.493 816.5 4767.854
## Delicassen_mean Delicassen_median Delicassen_sd
## 1 1524.87 965.5 2820.106
data_long <- data %>%
pivot_longer(cols = c(Fresh, Milk, Grocery, Frozen, Detergents_Paper, Delicassen),
names_to = "Variabel",
values_to = "Nilai")
ggplot(data_long, aes(x = Nilai)) +
geom_histogram(bins = 30, color = "black", fill = "steelblue") +
facet_wrap(~Variabel, scales = "free") +
theme_minimal() +
labs(title = "Distribusi Awal Variabel Numerik",
x = "Nilai",
y = "Frekuensi")
boxplot(data[, c("Fresh", "Milk", "Grocery", "Frozen", "Detergents_Paper", "Delicassen")],
las = 2,
col = "lightgoldenrod",
main = "Boxplot Variabel Numerik Awal",
ylab = "Nilai")
corr_data <- cor(data[, c("Fresh", "Milk", "Grocery", "Frozen", "Detergents_Paper", "Delicassen")])
corrplot(corr_data, method = "color", type = "lower", addCoef.col = "black", tl.col = "black")
Tahap preprocessing dilakukan agar data siap diproses oleh algoritma
clustering yang berbasis jarak. Langkah yang dipakai adalah memilih
fitur numerik utama, melakukan transformasi log1p() untuk
menstabilkan sebaran data, lalu melakukan standardisasi dengan
scale().
data_model <- data %>%
select(Fresh, Milk, Grocery, Frozen, Detergents_Paper, Delicassen)
data_log <- log1p(data_model)
data_scaled <- scale(data_log)
data_scaled <- as.data.frame(data_scaled)
summary(data_scaled)
## Fresh Milk Grocery Frozen
## Min. :-4.9955 Min. :-3.79061 Min. :-6.34796 Min. :-3.15553
## 1st Qu.:-0.4654 1st Qu.:-0.72733 1st Qu.:-0.69016 1st Qu.:-0.53991
## Median : 0.2146 Median : 0.06924 Median : 0.02255 Median : 0.02178
## Mean : 0.0000 Mean : 0.00000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.6829 3rd Qu.: 0.70237 3rd Qu.: 0.74829 3rd Qu.: 0.68107
## Max. : 1.9684 Max. : 2.85333 Max. : 2.69521 Max. : 2.89680
## Detergents_Paper Delicassen
## Min. :-3.16199 Min. :-4.0842
## 1st Qu.:-0.72523 1st Qu.:-0.5076
## Median :-0.05004 Median : 0.1566
## Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.86739 3rd Qu.: 0.6462
## Max. : 2.23767 Max. : 3.1737
data_scaled_long <- data_scaled %>%
pivot_longer(cols = everything(),
names_to = "Variabel",
values_to = "Nilai")
ggplot(data_scaled_long, aes(x = Nilai)) +
geom_histogram(bins = 30, color = "black", fill = "darkseagreen3") +
facet_wrap(~Variabel, scales = "free") +
theme_minimal() +
labs(title = "Distribusi Setelah Log Transformasi dan Scaling",
x = "Nilai Terstandarisasi",
y = "Frekuensi")
boxplot(data_scaled,
las = 2,
col = "lightblue",
main = "Boxplot Setelah Preprocessing")
set.seed(123)
hopkins_stat <- get_clust_tendency(data_scaled, n = nrow(data_scaled) - 1, graph = FALSE)
## Warning: Hopkins statistic uses the corrected formula (Wright 2022); results
## differ from legacy factoextra. Set options(factoextra.warn_hopkins = FALSE) to
## silence this warning.
hopkins_stat$hopkins_stat
## [1] 0.9972662
Nilai Hopkins yang mendekati 1 menunjukkan data cenderung memiliki struktur cluster.
Untuk metode yang membutuhkan jumlah cluster, nilai k
dicari terlebih dahulu menggunakan Elbow dan Silhouette. Nilai terbaik
dari bagian ini dipakai sebagai referensi jumlah cluster untuk K-Means,
K-Median, dan Fuzzy C-Means.
set.seed(123)
elbow_plot <- fviz_nbclust(data_scaled, kmeans, method = "wss") +
labs(title = "Elbow Method")
sil_plot <- fviz_nbclust(data_scaled, kmeans, method = "silhouette") +
labs(title = "Silhouette Method")
grid.arrange(elbow_plot, sil_plot, ncol = 2)
k_candidates <- 2:6
silhouette_kmeans <- sapply(k_candidates, function(k) {
km <- kmeans(data_scaled, centers = k, nstart = 25)
ss <- silhouette(km$cluster, dist(data_scaled))
mean(ss[, 3])
})
k_table <- data.frame(
k = k_candidates,
silhouette = silhouette_kmeans
)
k_table
## k silhouette
## 1 2 0.2903285
## 2 3 0.2591994
## 3 4 0.1885249
## 4 5 0.1921550
## 5 6 0.1937067
k_opt <- k_table$k[which.max(k_table$silhouette)]
k_opt
## [1] 2
set.seed(123)
kmeans_model <- kmeans(data_scaled, centers = k_opt, nstart = 25)
data$cluster_kmeans <- as.factor(kmeans_model$cluster)
fviz_cluster(kmeans_model,
data = data_scaled,
geom = "point",
ellipse.type = "convex",
main = "K-Means Clustering")
kmedian_model <- kcca(data_scaled,
k = k_opt,
family = kccaFamily("kmedians"))
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
kmedian_cluster <- predict(kmedian_model)
data$cluster_kmedians <- as.factor(kmedian_cluster)
fviz_cluster(list(data = data_scaled, cluster = kmedian_cluster),
geom = "point",
ellipse.type = "convex",
palette = "jco",
main = "K-Medians Clustering",
ggtheme = theme_minimal())
avg_silhouette <- function(x, cluster_labels) {
cluster_labels <- as.integer(as.factor(cluster_labels))
tab <- table(cluster_labels)
if (length(unique(cluster_labels)) < 2) return(NA_real_)
if (any(tab < 2)) return(NA_real_)
ss <- silhouette(cluster_labels, dist(x))
mean(ss[, 3], na.rm = TRUE)
}
minPts <- ncol(data_scaled) + 1
kdist <- sort(dbscan::kNNdist(data_scaled, k = minPts))
plot(kdist,
type = "l",
main = "k-Distance Plot untuk DBSCAN",
xlab = "Urutan Titik",
ylab = paste0("Jarak ke-", minPts, " Tetangga"))
eps_grid <- seq(
as.numeric(quantile(kdist, 0.50)),
as.numeric(quantile(kdist, 0.99)),
length.out = 20
)
eval_dbscan <- purrr::map_dfr(eps_grid, function(eps) {
model <- dbscan::dbscan(data_scaled, eps = eps, minPts = minPts)
cl <- model$cluster
tibble(
eps = eps,
n_cluster = length(setdiff(unique(cl), 0)),
noise = sum(cl == 0),
sil = avg_silhouette(data_scaled[cl != 0, , drop = FALSE], cl[cl != 0])
)
})
eval_dbscan_clean <- eval_dbscan %>%
filter(!is.na(sil), n_cluster >= 2)
if (nrow(eval_dbscan_clean) == 0) {
best_eps <- NA_real_
dbscan_model <- NULL
cluster_dbscan <- rep(0, nrow(data_scaled))
sil_dbscan <- NA_real_
message("DBSCAN tidak menemukan cluster valid. Coba ubah range eps.")
} else {
best_row <- eval_dbscan_clean[which.max(eval_dbscan_clean$sil), ]
best_eps <- best_row$eps
dbscan_model <- dbscan::dbscan(data_scaled, eps = best_eps, minPts = minPts)
cluster_dbscan <- dbscan_model$cluster
sil_dbscan <- best_row$sil
}
print(eval_dbscan)
## # A tibble: 20 × 4
## eps n_cluster noise sil
## <dbl> <int> <int> <dbl>
## 1 1.05 2 119 0.228
## 2 1.17 1 85 NA
## 3 1.29 1 69 NA
## 4 1.41 1 56 NA
## 5 1.53 1 42 NA
## 6 1.65 1 33 NA
## 7 1.76 1 24 NA
## 8 1.88 1 18 NA
## 9 2.00 1 16 NA
## 10 2.12 1 12 NA
## 11 2.24 1 7 NA
## 12 2.36 1 7 NA
## 13 2.48 1 4 NA
## 14 2.60 1 4 NA
## 15 2.72 1 4 NA
## 16 2.84 1 4 NA
## 17 2.96 1 3 NA
## 18 3.08 1 3 NA
## 19 3.20 1 3 NA
## 20 3.31 1 2 NA
print(eval_dbscan_clean)
## # A tibble: 1 × 4
## eps n_cluster noise sil
## <dbl> <int> <int> <dbl>
## 1 1.05 2 119 0.228
best_eps
## [1] 1.048916
sil_dbscan
## [1] 0.2279869
if (!is.null(dbscan_model)) {
fviz_cluster(dbscan_model,
data = data_scaled,
geom = "point",
ellipse.type = "convex",
main = "DBSCAN Clustering",
ggtheme = theme_minimal())
}
meanshift_model <- meanShift(as.matrix(data_scaled),
bandwidth = rep(1.2, ncol(data_scaled)))
data$cluster_meanshift <- as.factor(meanshift_model$assignment)
fviz_cluster(list(data = data_scaled,
cluster = as.factor(meanshift_model$assignment)),
geom = "point",
ellipse.type = "convex",
palette = "Set2",
main = "Mean Shift Clustering",
ggtheme = theme_minimal())
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning: Removed 327 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 43 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning in grid.Call.graphics(C_points, x$x, x$y, x$pch, x$size): unimplemented
## pch value '26'
## Warning in grid.Call.graphics(C_points, x$x, x$y, x$pch, x$size): unimplemented
## pch value '26'
## Warning in grid.Call.graphics(C_points, x$x, x$y, x$pch, x$size): unimplemented
## pch value '27'
## Warning in grid.Call.graphics(C_points, x$x, x$y, x$pch, x$size): unimplemented
## pch value '27'
## Warning in grid.Call.graphics(C_points, x$x, x$y, x$pch, x$size): unimplemented
## pch value '28'
## Warning in grid.Call.graphics(C_points, x$x, x$y, x$pch, x$size): unimplemented
## pch value '28'
## Warning in grid.Call.graphics(C_points, x$x, x$y, x$pch, x$size): unimplemented
## pch value '29'
## Warning in grid.Call.graphics(C_points, x$x, x$y, x$pch, x$size): unimplemented
## pch value '29'
## Warning in grid.Call.graphics(C_points, x$x, x$y, x$pch, x$size): unimplemented
## pch value '30'
## Warning in grid.Call.graphics(C_points, x$x, x$y, x$pch, x$size): unimplemented
## pch value '30'
## Warning in grid.Call.graphics(C_points, x$x, x$y, x$pch, x$size): unimplemented
## pch value '31'
## Warning in grid.Call.graphics(C_points, x$x, x$y, x$pch, x$size): unimplemented
## pch value '31'
matrix_data <- as.matrix(data_scaled)
fcm_model <- cmeans(matrix_data, centers = k_opt, m = 2)
data$cluster_fcm <- as.factor(fcm_model$cluster)
fviz_cluster(list(data = matrix_data, cluster = fcm_model$cluster),
geom = "point",
ellipse.type = "norm",
palette = "jco",
main = "Fuzzy C-Means Clustering",
ggtheme = theme_minimal())
Evaluasi dilakukan dengan silhouette score. Metode terbaik adalah metode dengan nilai silhouette tertinggi setelah mempertimbangkan hasil yang valid.
sil_kmeans <- avg_silhouette(data_scaled, kmeans_model$cluster)
sil_kmedians <- avg_silhouette(data_scaled, kmedian_cluster)
sil_meanshift <- avg_silhouette(data_scaled, meanshift_model$assignment)
sil_fcm <- avg_silhouette(data_scaled, fcm_model$cluster)
hasil_evaluasi <- data.frame(
Method = c("KMeans", "KMedians", "DBSCAN", "MeanShift", "FuzzyCMeans"),
Silhouette = c(sil_kmeans, sil_kmedians, sil_dbscan, sil_meanshift, sil_fcm)
)
hasil_evaluasi
## Method Silhouette
## 1 KMeans 0.2903285
## 2 KMedians 0.2835994
## 3 DBSCAN 0.2279869
## 4 MeanShift NA
## 5 FuzzyCMeans 0.2848371
best_method <- hasil_evaluasi %>%
mutate(Silhouette2 = ifelse(is.na(Silhouette), -Inf, Silhouette)) %>%
slice_max(order_by = Silhouette2, n = 1, with_ties = FALSE) %>%
pull(Method)
best_method
## [1] "KMeans"
ggplot(hasil_evaluasi, aes(x = reorder(Method, Silhouette), y = Silhouette)) +
geom_col(fill = "steelblue") +
coord_flip() +
theme_minimal() +
labs(title = "Perbandingan Silhouette Score",
x = "Metode",
y = "Silhouette Score")
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_col()`).
Metode terbaik dipilih otomatis dari tabel evaluasi. Setelah itu, analisis eksploratif difokuskan hanya pada cluster yang dihasilkan oleh metode tersebut.
if (best_method == "KMeans") {
data$cluster_best <- data$cluster_kmeans
} else if (best_method == "KMedians") {
data$cluster_best <- data$cluster_kmedians
} else if (best_method == "DBSCAN") {
data$cluster_best <- as.factor(cluster_dbscan)
} else if (best_method == "MeanShift") {
data$cluster_best <- data$cluster_meanshift
} else if (best_method == "FuzzyCMeans") {
data$cluster_best <- data$cluster_fcm
}
data_eda <- data %>%
mutate(cluster_best = as.factor(cluster_best))
if (best_method == "DBSCAN") {
data_eda <- data_eda %>% filter(as.character(cluster_best) != "0")
cat("Jumlah data noise:", sum(as.character(data$cluster_best) == "0"), "\n")
}
table(data_eda$cluster_best)
##
## 1 2
## 252 188
profil_cluster <- data_eda %>%
group_by(cluster_best) %>%
summarise(
n = n(),
Fresh = mean(Fresh),
Milk = mean(Milk),
Grocery = mean(Grocery),
Frozen = mean(Frozen),
Detergents_Paper = mean(Detergents_Paper),
Delicassen = mean(Delicassen),
.groups = "drop"
)
profil_cluster
## # A tibble: 2 × 8
## cluster_best n Fresh Milk Grocery Frozen Detergents_Paper Delicassen
## <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 252 13973. 2402. 2919. 3706. 492. 1038.
## 2 2 188 9356. 10346. 14697. 2222. 6085. 2177.
profil_long <- data_eda %>%
group_by(cluster_best) %>%
summarise(across(c(Fresh, Milk, Grocery, Frozen, Detergents_Paper, Delicassen),
mean),
.groups = "drop") %>%
pivot_longer(cols = -cluster_best,
names_to = "Variabel",
values_to = "RataRata")
ggplot(profil_long, aes(x = Variabel, y = RataRata, fill = cluster_best)) +
geom_col(position = "dodge") +
theme_minimal() +
labs(title = "Perbandingan Rata-rata Fitur per Cluster",
x = "Variabel",
y = "Rata-rata") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplot(data_eda, aes(x = cluster_best, y = Grocery, fill = cluster_best)) +
geom_boxplot() +
theme_minimal() +
labs(title = "Sebaran Grocery per Cluster",
x = "Cluster",
y = "Grocery")
channel_cluster <- data_eda %>%
group_by(cluster_best, Channel) %>%
summarise(n = n(), .groups = "drop")
region_cluster <- data_eda %>%
group_by(cluster_best, Region) %>%
summarise(n = n(), .groups = "drop")
channel_cluster
## # A tibble: 4 × 3
## cluster_best Channel n
## <fct> <int> <int>
## 1 1 1 244
## 2 1 2 8
## 3 2 1 54
## 4 2 2 134
region_cluster
## # A tibble: 6 × 3
## cluster_best Region n
## <fct> <int> <int>
## 1 1 1 48
## 2 1 2 28
## 3 1 3 176
## 4 2 1 29
## 5 2 2 19
## 6 2 3 140
ggplot(channel_cluster, aes(x = cluster_best, y = n, fill = factor(Channel))) +
geom_col(position = "dodge") +
theme_minimal() +
labs(title = "Distribusi Channel per Cluster",
x = "Cluster",
y = "Jumlah Data",
fill = "Channel")
ggplot(region_cluster, aes(x = cluster_best, y = n, fill = factor(Region))) +
geom_col(position = "dodge") +
theme_minimal() +
labs(title = "Distribusi Region per Cluster",
x = "Cluster",
y = "Jumlah Data",
fill = "Region")
if (best_method == "KMeans") {
fviz_cluster(kmeans_model,
data = data_scaled,
geom = "point",
ellipse.type = "convex",
main = "Visualisasi PCA - Cluster Terbaik (K-Means)")
} else if (best_method == "KMedians") {
fviz_cluster(list(data = data_scaled, cluster = kmedian_cluster),
geom = "point",
ellipse.type = "convex",
main = "Visualisasi PCA - Cluster Terbaik (K-Medians)")
} else if (best_method == "DBSCAN") {
fviz_cluster(dbscan_model,
data = data_scaled,
geom = "point",
ellipse.type = "convex",
main = "Visualisasi PCA - Cluster Terbaik (DBSCAN)")
} else if (best_method == "MeanShift") {
fviz_cluster(list(data = data_scaled, cluster = meanshift_model$assignment),
geom = "point",
ellipse.type = "convex",
main = "Visualisasi PCA - Cluster Terbaik (Mean Shift)")
} else if (best_method == "FuzzyCMeans") {
fviz_cluster(list(data = matrix_data, cluster = fcm_model$cluster),
geom = "point",
ellipse.type = "convex",
main = "Visualisasi PCA - Cluster Terbaik (Fuzzy C-Means)")
}