Pendahuluan

Analisis ini membandingkan lima metode clustering yaitu K-Means, K-Median, DBSCAN, Mean Shift, dan Fuzzy C-Means. Evaluasi dilakukan menggunakan satu metrik internal, yaitu silhouette score. Metode terbaik dipilih dari hasil evaluasi seluruh metode, lalu digunakan untuk analisis karakteristik cluster secara lebih rinci.

Dataset yang digunakan adalah Wholesale Customers. Fokus clustering diarahkan pada perilaku pembelian pelanggan berdasarkan variabel numerik transaksi. Variabel Channel dan Region tidak dipakai sebagai fitur clustering, tetapi tetap disimpan untuk membantu interpretasi hasil pada tahap EDA.

1. Load Library

if (!require("tidyverse")) install.packages("tidyverse")

## Loading required package: tidyverse

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(tidyverse)

if (!require("cluster")) install.packages("cluster")

## Loading required package: cluster

## Warning: package 'cluster' was built under R version 4.5.3

library(cluster)

if (!require("factoextra")) install.packages("factoextra")

## Loading required package: factoextra

## Warning: package 'factoextra' was built under R version 4.5.3

## Welcome to factoextra!

## Want to learn more? See two factoextra-related books at https://www.datanovia.com/en/product/practical-guide-to-principal-component-methods-in-r/

library(factoextra)

if (!require("dbscan")) install.packages("dbscan")

## Loading required package: dbscan

## Warning: package 'dbscan' was built under R version 4.5.3

## 
## Attaching package: 'dbscan'

## The following object is masked from 'package:stats':
## 
##     as.dendrogram

library(dbscan)

if (!require("flexclust")) install.packages("flexclust")

## Loading required package: flexclust

## Warning: package 'flexclust' was built under R version 4.5.3

library(flexclust)

if (!require("e1071")) install.packages("e1071")

## Loading required package: e1071

## Warning: package 'e1071' was built under R version 4.5.3

## 
## Attaching package: 'e1071'

## The following object is masked from 'package:flexclust':
## 
##     bclust

## The following object is masked from 'package:ggplot2':
## 
##     element

library(e1071)

if (!require("meanShiftR")) install.packages("meanShiftR")

## Loading required package: meanShiftR

library(meanShiftR)

if (!require("fpc")) install.packages("fpc")

## Loading required package: fpc

## Warning: package 'fpc' was built under R version 4.5.3

## 
## Attaching package: 'fpc'

## The following object is masked from 'package:dbscan':
## 
##     dbscan

library(fpc)

if (!require("corrplot")) install.packages("corrplot")

## Loading required package: corrplot

## corrplot 0.95 loaded

library(corrplot)

if (!require("gridExtra")) install.packages("gridExtra")

## Loading required package: gridExtra

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(gridExtra)

2. Import Data

# Load the Wholesale Customers CSV from the working directory and preview
# the first six rows.
data <- read.csv(file = "Wholesale customers data.csv")
head(data, n = 6L)

##   Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicassen
## 1       2      3 12669 9656    7561    214             2674       1338
## 2       2      3  7057 9810    9568   1762             3293       1776
## 3       2      3  6353 8808    7684   2405             3516       7844
## 4       1      3 13265 1196    4221   6404              507       1788
## 5       2      3 22615 5410    7198   3915             1777       5185
## 6       2      3  9413 8259    5126    666             1795       1451

str(data)

## 'data.frame':    440 obs. of  8 variables:
##  $ Channel         : int  2 2 2 1 2 2 2 2 1 2 ...
##  $ Region          : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Fresh           : int  12669 7057 6353 13265 22615 9413 12126 7579 5963 6006 ...
##  $ Milk            : int  9656 9810 8808 1196 5410 8259 3199 4956 3648 11093 ...
##  $ Grocery         : int  7561 9568 7684 4221 7198 5126 6975 9426 6192 18881 ...
##  $ Frozen          : int  214 1762 2405 6404 3915 666 480 1669 425 1159 ...
##  $ Detergents_Paper: int  2674 3293 3516 507 1777 1795 3140 3321 1716 7425 ...
##  $ Delicassen      : int  1338 1776 7844 1788 5185 1451 545 2566 750 2098 ...

summary(data)

##     Channel          Region          Fresh             Milk      
##  Min.   :1.000   Min.   :1.000   Min.   :     3   Min.   :   55  
##  1st Qu.:1.000   1st Qu.:2.000   1st Qu.:  3128   1st Qu.: 1533  
##  Median :1.000   Median :3.000   Median :  8504   Median : 3627  
##  Mean   :1.323   Mean   :2.543   Mean   : 12000   Mean   : 5796  
##  3rd Qu.:2.000   3rd Qu.:3.000   3rd Qu.: 16934   3rd Qu.: 7190  
##  Max.   :2.000   Max.   :3.000   Max.   :112151   Max.   :73498  
##     Grocery          Frozen        Detergents_Paper    Delicassen     
##  Min.   :    3   Min.   :   25.0   Min.   :    3.0   Min.   :    3.0  
##  1st Qu.: 2153   1st Qu.:  742.2   1st Qu.:  256.8   1st Qu.:  408.2  
##  Median : 4756   Median : 1526.0   Median :  816.5   Median :  965.5  
##  Mean   : 7951   Mean   : 3071.9   Mean   : 2881.5   Mean   : 1524.9  
##  3rd Qu.:10656   3rd Qu.: 3554.2   3rd Qu.: 3922.0   3rd Qu.: 1820.2  
##  Max.   :92780   Max.   :60869.0   Max.   :40827.0   Max.   :47943.0

3. EDA Awal

3.1 Cek Missing Value

colSums(is.na(data))

##          Channel           Region            Fresh             Milk 
##                0                0                0                0 
##          Grocery           Frozen Detergents_Paper       Delicassen 
##                0                0                0                0

3.2 Cek Duplikasi

sum(duplicated(data))

## [1] 0

3.3 Statistik Deskriptif Sederhana

# Mean, median and standard deviation of every numeric column, returned as a
# single row with columns named <variable>_<statistic>.
# na.rm = TRUE is set inside each summary function: forwarding extra arguments
# through across(...) is deprecated since dplyr 1.1.0.
data %>%
  summarise(across(where(is.numeric),
                   list(mean = function(x) mean(x, na.rm = TRUE),
                        median = function(x) median(x, na.rm = TRUE),
                        sd = function(x) sd(x, na.rm = TRUE))))

## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `across(...)`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
## 
##   # Previously
##   across(a:b, mean, na.rm = TRUE)
## 
##   # Now
##   across(a:b, \(x) mean(x, na.rm = TRUE))

##   Channel_mean Channel_median Channel_sd Region_mean Region_median Region_sd
## 1     1.322727              1  0.4680516    2.543182             3 0.7742724
##   Fresh_mean Fresh_median Fresh_sd Milk_mean Milk_median  Milk_sd Grocery_mean
## 1    12000.3         8504 12647.33  5796.266        3627 7380.377     7951.277
##   Grocery_median Grocery_sd Frozen_mean Frozen_median Frozen_sd
## 1         4755.5   9503.163    3071.932          1526  4854.673
##   Detergents_Paper_mean Detergents_Paper_median Detergents_Paper_sd
## 1              2881.493                   816.5            4767.854
##   Delicassen_mean Delicassen_median Delicassen_sd
## 1         1524.87             965.5      2820.106

3.4 Visualisasi Distribusi Awal

# Long format: one row per (customer, spending variable) pair, so that each
# variable can be drawn in its own facet.
data_long <- pivot_longer(
  data,
  cols = c(Fresh, Milk, Grocery, Frozen, Detergents_Paper, Delicassen),
  names_to = "Variabel",
  values_to = "Nilai"
)

# One 30-bin histogram per variable; scales are free because the variables
# have very different ranges.
data_long %>%
  ggplot(aes(x = Nilai)) +
  geom_histogram(bins = 30, colour = "black", fill = "steelblue") +
  facet_wrap(vars(Variabel), scales = "free") +
  theme_minimal() +
  labs(
    title = "Distribusi Awal Variabel Numerik",
    x = "Nilai",
    y = "Frekuensi"
  )

# Side-by-side boxplots of the raw spending columns (axis labels rotated).
boxplot(
  data[c("Fresh", "Milk", "Grocery", "Frozen", "Detergents_Paper", "Delicassen")],
  las = 2,
  col = "lightgoldenrod",
  main = "Boxplot Variabel Numerik Awal",
  ylab = "Nilai"
)

3.5 Korelasi Antar Variabel

# Pairwise correlation matrix of the six spending columns (cor() defaults),
# drawn as a lower-triangle colour map with the coefficients printed in black.
corr_data <- cor(
  data[c("Fresh", "Milk", "Grocery", "Frozen", "Detergents_Paper", "Delicassen")]
)
corrplot(
  corr_data,
  method = "color",
  type = "lower",
  addCoef.col = "black",
  tl.col = "black"
)

4. Preprocessing

Tahap preprocessing dilakukan agar data siap diproses oleh algoritma clustering yang berbasis jarak. Langkah yang dipakai adalah memilih fitur numerik utama, melakukan transformasi log1p() untuk menstabilkan sebaran data, lalu melakukan standardisasi dengan scale().

# Keep only the six spending columns as clustering features.
data_model <- select(
  data,
  Fresh, Milk, Grocery, Frozen, Detergents_Paper, Delicassen
)

# log(1 + x) to reduce right skew.
data_log <- log1p(data_model)

# Centre and scale every column, then convert the matrix returned by scale()
# back into a data frame.
data_scaled <- as.data.frame(scale(data_log))

summary(data_scaled)

##      Fresh              Milk             Grocery             Frozen        
##  Min.   :-4.9955   Min.   :-3.79061   Min.   :-6.34796   Min.   :-3.15553  
##  1st Qu.:-0.4654   1st Qu.:-0.72733   1st Qu.:-0.69016   1st Qu.:-0.53991  
##  Median : 0.2146   Median : 0.06924   Median : 0.02255   Median : 0.02178  
##  Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.00000  
##  3rd Qu.: 0.6829   3rd Qu.: 0.70237   3rd Qu.: 0.74829   3rd Qu.: 0.68107  
##  Max.   : 1.9684   Max.   : 2.85333   Max.   : 2.69521   Max.   : 2.89680  
##  Detergents_Paper     Delicassen     
##  Min.   :-3.16199   Min.   :-4.0842  
##  1st Qu.:-0.72523   1st Qu.:-0.5076  
##  Median :-0.05004   Median : 0.1566  
##  Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.86739   3rd Qu.: 0.6462  
##  Max.   : 2.23767   Max.   : 3.1737

# Long format of the transformed features for faceted plotting.
data_scaled_long <- pivot_longer(
  data_scaled,
  cols = everything(),
  names_to = "Variabel",
  values_to = "Nilai"
)

# Histograms of the log-transformed, standardised features.
data_scaled_long %>%
  ggplot(aes(x = Nilai)) +
  geom_histogram(bins = 30, colour = "black", fill = "darkseagreen3") +
  facet_wrap(vars(Variabel), scales = "free") +
  theme_minimal() +
  labs(
    title = "Distribusi Setelah Log Transformasi dan Scaling",
    x = "Nilai Terstandarisasi",
    y = "Frekuensi"
  )

# Boxplots of the same features after preprocessing.
boxplot(
  data_scaled,
  main = "Boxplot Setelah Preprocessing",
  col = "lightblue",
  las = 2
)

5. Uji Kelayakan Clustering

# Seed the RNG before estimating clustering tendency so the value is
# repeatable.
set.seed(123)
# Hopkins statistic on the preprocessed features; graph = FALSE skips the
# dissimilarity plot.
# NOTE(review): n = nrow(data_scaled) - 1 samples almost every observation;
# the statistic is usually computed on a small subsample - confirm this
# choice is intended.
hopkins_stat <- get_clust_tendency(data_scaled, n = nrow(data_scaled) - 1, graph = FALSE)

## Warning: Hopkins statistic uses the corrected formula (Wright 2022); results
## differ from legacy factoextra. Set options(factoextra.warn_hopkins = FALSE) to
## silence this warning.

hopkins_stat$hopkins_stat

## [1] 0.9972662

Nilai Hopkins yang mendekati 1 menunjukkan data cenderung memiliki struktur cluster.

6. Menentukan Jumlah Cluster Awal untuk Metode yang Membutuhkan K

Untuk metode yang membutuhkan jumlah cluster, nilai k dicari terlebih dahulu menggunakan Elbow dan Silhouette. Nilai terbaik dari bagian ini dipakai sebagai referensi jumlah cluster untuk K-Means, K-Median, dan Fuzzy C-Means.

# Seed once for this section; the k-means runs below consume the RNG stream in
# this order, so the statements must not be reordered.
set.seed(123)
# Elbow curve: total within-cluster sum of squares for each candidate k.
elbow_plot <- fviz_nbclust(data_scaled, kmeans, method = "wss") +
  labs(title = "Elbow Method")

# Average silhouette width for each candidate k.
sil_plot <- fviz_nbclust(data_scaled, kmeans, method = "silhouette") +
  labs(title = "Silhouette Method")

# Show both diagnostics side by side.
grid.arrange(elbow_plot, sil_plot, ncol = 2)

# Candidate numbers of clusters to compare.
k_candidates <- 2:6

# The pairwise distance matrix does not depend on k, so it is computed once
# instead of once per candidate.
dist_scaled <- dist(data_scaled)

# Average silhouette width of a 25-start k-means fit for each candidate k.
# vapply() guarantees a numeric vector with one value per candidate.
silhouette_kmeans <- vapply(k_candidates, function(k) {
  km <- kmeans(data_scaled, centers = k, nstart = 25)
  ss <- silhouette(km$cluster, dist_scaled)
  mean(ss[, 3])
}, numeric(1))

# One row per candidate k with its average silhouette width.
k_table <- data.frame(
  k = k_candidates,
  silhouette = silhouette_kmeans
)

k_table

##   k silhouette
## 1 2  0.2903285
## 2 3  0.2591994
## 3 4  0.1885249
## 4 5  0.1921550
## 5 6  0.1937067

# Choose the k with the highest average silhouette width (first one on ties).
k_opt <- with(k_table, k[which.max(silhouette)])
k_opt

## [1] 2

7. Clustering dengan 5 Metode

7.1 K-Means

# K-Means with k_opt clusters and 25 random starts; seeded for repeatability.
set.seed(123)
kmeans_model <- kmeans(x = data_scaled, centers = k_opt, nstart = 25)

# Keep the labels next to the original observations.
data[["cluster_kmeans"]] <- as.factor(kmeans_model$cluster)

# Cluster plot with a convex hull around each cluster.
fviz_cluster(
  object = kmeans_model,
  data = data_scaled,
  geom = "point",
  ellipse.type = "convex",
  main = "K-Means Clustering"
)

7.2 K-Median

# K-Medians fit via flexclust::kcca() using the "kmedians" family, with the
# same number of clusters as K-Means.
# NOTE(review): no set.seed() directly precedes this call, so the result
# presumably depends on the RNG state left by the preceding chunks - confirm.
kmedian_model <- kcca(data_scaled,
                      k = k_opt,
                      family = kccaFamily("kmedians"))

## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'

## Also defined by 'kernlab'

## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'

## Also defined by 'kernlab'

# Cluster label of every training observation under the K-Medians fit.
kmedian_cluster <- predict(kmedian_model)
data[["cluster_kmedians"]] <- as.factor(kmedian_cluster)

# fviz_cluster() accepts a list(data, cluster) for models it has no method for.
fviz_cluster(
  object = list(data = data_scaled, cluster = kmedian_cluster),
  geom = "point",
  ellipse.type = "convex",
  palette = "jco",
  main = "K-Medians Clustering",
  ggtheme = theme_minimal()
)

7.3 DBSCAN

# Average silhouette width of a labelled partition.
#
# x              Numeric data, one observation per row; passed to dist().
# cluster_labels One label per row of x; recoded to consecutive integers.
#
# Returns NA_real_ when there are fewer than two distinct clusters or when any
# cluster has fewer than two members; otherwise the mean of the third column
# (silhouette width) of the silhouette() result.
avg_silhouette <- function(x, cluster_labels) {
  codes <- as.integer(as.factor(cluster_labels))
  sizes <- table(codes)

  # A silhouette comparison is only reported for at least two clusters, none
  # of which is a singleton.
  degenerate <- length(unique(codes)) < 2 || any(sizes < 2)
  if (degenerate) {
    return(NA_real_)
  }

  widths <- silhouette(codes, dist(x))[, 3]
  mean(widths, na.rm = TRUE)
}

# Heuristic for minPts: number of features + 1.
minPts <- ncol(data_scaled) + 1

# DBSCAN's minPts counts the point itself, while a k-nearest-neighbour
# distance does not, so the matching neighbour rank is k = minPts - 1.
knn_k <- minPts - 1

# Sorted distance from every observation to its knn_k-th nearest neighbour.
kdist <- sort(dbscan::kNNdist(data_scaled, k = knn_k))

# k-distance curve; the "knee" suggests a reasonable eps.
plot(kdist,
     type = "l",
     main = "k-Distance Plot untuk DBSCAN",
     xlab = "Urutan Titik",
     ylab = paste0("Jarak ke-", knn_k, " Tetangga"))

# 20 evenly spaced eps candidates between the median and the 99th percentile
# of the k-distances.
eps_grid <- seq(
  as.numeric(quantile(kdist, 0.50)),
  as.numeric(quantile(kdist, 0.99)),
  length.out = 20
)

# Fit DBSCAN once per eps candidate and collect one summary row per fit:
# eps, number of clusters (label 0 = noise is not counted), number of noise
# points, and the average silhouette width of the non-noise points only.
eval_dbscan <- dplyr::bind_rows(lapply(eps_grid, function(eps_value) {
  labels <- dbscan::dbscan(data_scaled, eps = eps_value, minPts = minPts)$cluster
  assigned <- labels != 0

  tibble(
    eps = eps_value,
    n_cluster = length(setdiff(unique(labels), 0)),
    noise = sum(!assigned),
    sil = avg_silhouette(data_scaled[assigned, , drop = FALSE], labels[assigned])
  )
}))

# Keep only the eps values that gave a defined silhouette and at least two
# clusters.
eval_dbscan_clean <- filter(eval_dbscan, !is.na(sil), n_cluster >= 2)

if (nrow(eval_dbscan_clean) > 0) {
  # Refit at the eps with the highest silhouette (first one on ties).
  best_row <- eval_dbscan_clean[which.max(eval_dbscan_clean$sil), ]
  best_eps <- best_row$eps

  dbscan_model <- dbscan::dbscan(data_scaled, eps = best_eps, minPts = minPts)
  cluster_dbscan <- dbscan_model$cluster
  sil_dbscan <- best_row$sil
} else {
  # No usable partition: mark every point as noise and leave the score
  # undefined so DBSCAN cannot win the comparison.
  message("DBSCAN tidak menemukan cluster valid. Coba ubah range eps.")
  dbscan_model <- NULL
  best_eps <- NA_real_
  sil_dbscan <- NA_real_
  cluster_dbscan <- rep(0, nrow(data_scaled))
}

print(eval_dbscan)

## # A tibble: 20 × 4
##      eps n_cluster noise    sil
##    <dbl>     <int> <int>  <dbl>
##  1  1.05         2   119  0.228
##  2  1.17         1    85 NA    
##  3  1.29         1    69 NA    
##  4  1.41         1    56 NA    
##  5  1.53         1    42 NA    
##  6  1.65         1    33 NA    
##  7  1.76         1    24 NA    
##  8  1.88         1    18 NA    
##  9  2.00         1    16 NA    
## 10  2.12         1    12 NA    
## 11  2.24         1     7 NA    
## 12  2.36         1     7 NA    
## 13  2.48         1     4 NA    
## 14  2.60         1     4 NA    
## 15  2.72         1     4 NA    
## 16  2.84         1     4 NA    
## 17  2.96         1     3 NA    
## 18  3.08         1     3 NA    
## 19  3.20         1     3 NA    
## 20  3.31         1     2 NA

print(eval_dbscan_clean)

## # A tibble: 1 × 4
##     eps n_cluster noise   sil
##   <dbl>     <int> <int> <dbl>
## 1  1.05         2   119 0.228

best_eps

## [1] 1.048916

sil_dbscan

## [1] 0.2279869

# Plot the selected DBSCAN fit; skipped when no valid fit was found
# (dbscan_model is NULL in that case).
if (!is.null(dbscan_model)) {
  fviz_cluster(dbscan_model,
               data = data_scaled,
               geom = "point",
               ellipse.type = "convex",
               main = "DBSCAN Clustering",
               ggtheme = theme_minimal())
}

7.4 Mean Shift

# Mean Shift on the preprocessed features with the same fixed bandwidth (1.2)
# for every feature.
# NOTE(review): the bandwidth is hard-coded rather than tuned; the palette
# warnings emitted by the plot below show it yields more than 8 clusters -
# confirm this value is intended.
meanshift_model <- meanShift(as.matrix(data_scaled),
                             bandwidth = rep(1.2, ncol(data_scaled)))

# Keep the labels next to the original observations.
data$cluster_meanshift <- as.factor(meanshift_model$assignment)

# Cluster plot of the Mean Shift assignment.
fviz_cluster(list(data = data_scaled,
                  cluster = as.factor(meanshift_model$assignment)),
             geom = "point",
             ellipse.type = "convex",
             palette = "Set2",
             main = "Mean Shift Clustering",
             ggtheme = theme_minimal())

## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

## Warning: Removed 327 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 43 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning in grid.Call.graphics(C_points, x$x, x$y, x$pch, x$size): unimplemented
## pch value '26'
## Warning in grid.Call.graphics(C_points, x$x, x$y, x$pch, x$size): unimplemented
## pch value '26'

## Warning in grid.Call.graphics(C_points, x$x, x$y, x$pch, x$size): unimplemented
## pch value '27'
## Warning in grid.Call.graphics(C_points, x$x, x$y, x$pch, x$size): unimplemented
## pch value '27'

## Warning in grid.Call.graphics(C_points, x$x, x$y, x$pch, x$size): unimplemented
## pch value '28'
## Warning in grid.Call.graphics(C_points, x$x, x$y, x$pch, x$size): unimplemented
## pch value '28'

## Warning in grid.Call.graphics(C_points, x$x, x$y, x$pch, x$size): unimplemented
## pch value '29'
## Warning in grid.Call.graphics(C_points, x$x, x$y, x$pch, x$size): unimplemented
## pch value '29'

## Warning in grid.Call.graphics(C_points, x$x, x$y, x$pch, x$size): unimplemented
## pch value '30'
## Warning in grid.Call.graphics(C_points, x$x, x$y, x$pch, x$size): unimplemented
## pch value '30'

## Warning in grid.Call.graphics(C_points, x$x, x$y, x$pch, x$size): unimplemented
## pch value '31'
## Warning in grid.Call.graphics(C_points, x$x, x$y, x$pch, x$size): unimplemented
## pch value '31'

7.5 Fuzzy C-Means

# cmeans() is given the features as a numeric matrix.
matrix_data <- as.matrix(data_scaled)

# Fuzzy C-Means with k_opt centres and fuzzifier m = 2; the hard label stored
# by cmeans() in $cluster is kept next to the original observations.
fcm_model <- cmeans(x = matrix_data, centers = k_opt, m = 2)
data[["cluster_fcm"]] <- as.factor(fcm_model$cluster)

# Cluster plot of the hard assignment with normal-theory ellipses.
fviz_cluster(
  object = list(data = matrix_data, cluster = fcm_model$cluster),
  geom = "point",
  ellipse.type = "norm",
  palette = "jco",
  main = "Fuzzy C-Means Clustering",
  ggtheme = theme_minimal()
)

8. Evaluasi Seluruh Metode

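Evaluasi dilakukan dengan silhouette score. Metode terbaik adalah metode dengan nilai silhouette tertinggi di antara hasil yang valid. Nilai silhouette suatu metode dicatat sebagai NA apabila hasil clustering-nya memiliki kurang dari dua cluster atau terdapat cluster yang hanya berisi satu anggota, sehingga metode tersebut tidak ikut dipilih. Untuk DBSCAN, silhouette dihitung hanya dari titik yang bukan noise, sehingga nilainya perlu dibandingkan secara hati-hati dengan metode lain yang menggunakan seluruh data.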

# Average silhouette width of each partition on the preprocessed features.
# sil_dbscan was already computed while selecting eps.
sil_fcm <- avg_silhouette(data_scaled, fcm_model$cluster)
sil_meanshift <- avg_silhouette(data_scaled, meanshift_model$assignment)
sil_kmedians <- avg_silhouette(data_scaled, kmedian_cluster)
sil_kmeans <- avg_silhouette(data_scaled, kmeans_model$cluster)

# Comparison table, one row per method; NA marks a score that is undefined.
hasil_evaluasi <- data.frame(
  Method = c(
    "KMeans",
    "KMedians",
    "DBSCAN",
    "MeanShift",
    "FuzzyCMeans"
  ),
  Silhouette = c(
    sil_kmeans,
    sil_kmedians,
    sil_dbscan,
    sil_meanshift,
    sil_fcm
  )
)

hasil_evaluasi

##        Method Silhouette
## 1      KMeans  0.2903285
## 2    KMedians  0.2835994
## 3      DBSCAN  0.2279869
## 4   MeanShift         NA
## 5 FuzzyCMeans  0.2848371

# Name of the method with the highest silhouette. Undefined scores are ranked
# as -Inf so they can never win; ties resolve to a single row.
best_method <- hasil_evaluasi %>%
  mutate(Silhouette2 = replace(Silhouette, is.na(Silhouette), -Inf)) %>%
  slice_max(order_by = Silhouette2, n = 1, with_ties = FALSE) %>%
  pull(Method)

best_method

## [1] "KMeans"

# Horizontal bar chart of the scores, methods ordered by silhouette.
hasil_evaluasi %>%
  ggplot(aes(y = Silhouette, x = reorder(Method, Silhouette))) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  theme_minimal() +
  labs(
    title = "Perbandingan Silhouette Score",
    x = "Metode",
    y = "Silhouette Score"
  )

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_col()`).

9. EDA Berdasarkan Metode Terbaik

Metode terbaik dipilih otomatis dari tabel evaluasi. Setelah itu, analisis eksploratif difokuskan hanya pada cluster yang dihasilkan oleh metode tersebut.

# Copy the winning method's labels into cluster_best. The guard keeps the
# column untouched if best_method is not one of the five known names.
if (best_method %in% c("KMeans", "KMedians", "DBSCAN", "MeanShift", "FuzzyCMeans")) {
  data$cluster_best <- switch(
    best_method,
    KMeans = data$cluster_kmeans,
    KMedians = data$cluster_kmedians,
    DBSCAN = as.factor(cluster_dbscan),
    MeanShift = data$cluster_meanshift,
    FuzzyCMeans = data$cluster_fcm
  )
}

# Working copy for the cluster-level EDA, with the label stored as a factor.
data_eda <- mutate(data, cluster_best = as.factor(cluster_best))

# DBSCAN labels noise as "0": report how many points that is and exclude
# them from the profiles.
if (best_method == "DBSCAN") {
  cat("Jumlah data noise:", sum(as.character(data$cluster_best) == "0"), "\n")
  data_eda <- filter(data_eda, as.character(cluster_best) != "0")
}

# Cluster sizes.
table(data_eda$cluster_best)

## 
##   1   2 
## 252 188

9.1 Profil Rata-rata Tiap Cluster

# Size of each cluster and the mean of every spending variable (original
# units) within it.
profil_cluster <- data_eda %>%
  group_by(cluster_best) %>%
  summarise(
    n = n(),
    across(
      c(Fresh, Milk, Grocery, Frozen, Detergents_Paper, Delicassen),
      mean
    ),
    .groups = "drop"
  )

profil_cluster

## # A tibble: 2 × 8
##   cluster_best     n  Fresh   Milk Grocery Frozen Detergents_Paper Delicassen
##   <fct>        <int>  <dbl>  <dbl>   <dbl>  <dbl>            <dbl>      <dbl>
## 1 1              252 13973.  2402.   2919.  3706.             492.      1038.
## 2 2              188  9356. 10346.  14697.  2222.            6085.      2177.

9.2 Visualisasi Karakteristik Cluster

# Per-cluster means in long format: one row per (cluster, variable) pair.
cluster_means <- summarise(
  group_by(data_eda, cluster_best),
  across(
    c(Fresh, Milk, Grocery, Frozen, Detergents_Paper, Delicassen),
    mean
  ),
  .groups = "drop"
)
profil_long <- pivot_longer(
  cluster_means,
  cols = -cluster_best,
  names_to = "Variabel",
  values_to = "RataRata"
)

# Grouped bars: mean of each variable, one bar per cluster.
profil_long %>%
  ggplot(aes(x = Variabel, y = RataRata, fill = cluster_best)) +
  geom_col(position = "dodge") +
  theme_minimal() +
  labs(
    title = "Perbandingan Rata-rata Fitur per Cluster",
    x = "Variabel",
    y = "Rata-rata"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Distribution of Grocery spending within each cluster.
data_eda %>%
  ggplot(aes(x = cluster_best, y = Grocery, fill = cluster_best)) +
  geom_boxplot() +
  theme_minimal() +
  labs(
    title = "Sebaran Grocery per Cluster",
    x = "Cluster",
    y = "Grocery"
  )

9.3 Interpretasi Channel dan Region per Cluster

# Number of customers for every cluster x Channel combination.
channel_cluster <- summarise(
  group_by(data_eda, cluster_best, Channel),
  n = n(),
  .groups = "drop"
)

# Number of customers for every cluster x Region combination.
region_cluster <- summarise(
  group_by(data_eda, cluster_best, Region),
  n = n(),
  .groups = "drop"
)

channel_cluster

## # A tibble: 4 × 3
##   cluster_best Channel     n
##   <fct>          <int> <int>
## 1 1                  1   244
## 2 1                  2     8
## 3 2                  1    54
## 4 2                  2   134

region_cluster

## # A tibble: 6 × 3
##   cluster_best Region     n
##   <fct>         <int> <int>
## 1 1                 1    48
## 2 1                 2    28
## 3 1                 3   176
## 4 2                 1    29
## 5 2                 2    19
## 6 2                 3   140

# Customers per cluster, split by Channel (treated as a category).
channel_cluster %>%
  ggplot(aes(x = cluster_best, y = n, fill = factor(Channel))) +
  geom_col(position = "dodge") +
  theme_minimal() +
  labs(
    title = "Distribusi Channel per Cluster",
    x = "Cluster",
    y = "Jumlah Data",
    fill = "Channel"
  )

# Customers per cluster, split by Region (treated as a category).
region_cluster %>%
  ggplot(aes(x = cluster_best, y = n, fill = factor(Region))) +
  geom_col(position = "dodge") +
  theme_minimal() +
  labs(
    title = "Distribusi Region per Cluster",
    x = "Cluster",
    y = "Jumlah Data",
    fill = "Region"
  )

9.4 Visualisasi PCA dari Metode Terbaik

# Draw the cluster plot of whichever method won the comparison. Only the
# matching branch is evaluated; an unknown method name draws nothing.
switch(
  best_method,
  KMeans = fviz_cluster(
    kmeans_model,
    data = data_scaled,
    geom = "point",
    ellipse.type = "convex",
    main = "Visualisasi PCA - Cluster Terbaik (K-Means)"
  ),
  KMedians = fviz_cluster(
    list(data = data_scaled, cluster = kmedian_cluster),
    geom = "point",
    ellipse.type = "convex",
    main = "Visualisasi PCA - Cluster Terbaik (K-Medians)"
  ),
  DBSCAN = fviz_cluster(
    dbscan_model,
    data = data_scaled,
    geom = "point",
    ellipse.type = "convex",
    main = "Visualisasi PCA - Cluster Terbaik (DBSCAN)"
  ),
  MeanShift = fviz_cluster(
    list(data = data_scaled, cluster = meanshift_model$assignment),
    geom = "point",
    ellipse.type = "convex",
    main = "Visualisasi PCA - Cluster Terbaik (Mean Shift)"
  ),
  FuzzyCMeans = fviz_cluster(
    list(data = matrix_data, cluster = fcm_model$cluster),
    geom = "point",
    ellipse.type = "convex",
    main = "Visualisasi PCA - Cluster Terbaik (Fuzzy C-Means)"
  )
)

Implementasi dan Perbandingan Clustering Metode K-Means, K-Medians, DBSCAN, Mean Shift, dan Fuzzy C-Means pada Data Wholesale Customers untuk Segmentasi Pelanggan

Siti Noerhalizah (24031554073), Gita Nurani (24031554122)

2026-04-14