Analisis pada modul ini bertujuan untuk mengeksplorasi data serta menerapkan beberapa metode clustering untuk mengidentifikasi pola pengelompokan pada data. Metode yang digunakan meliputi K-Modes, Fuzzy C-Means, Mean Shift, dan DBSCAN. Keempat metode tersebut digunakan untuk membandingkan hasil clustering berdasarkan karakteristik masing-masing metode. Dataset yang digunakan adalah Wholesale Customers Dataset, yang memuat informasi terkait pengeluaran tahunan pelanggan pada berbagai kategori produk seperti Fresh, Milk, Grocery, Frozen, Detergents_Paper, dan Delicassen.

Link Dataset: https://www.kaggle.com/datasets/binovi/wholesale-customers-data-set

1. Load Library

library(klaR)
library(e1071)
library(meanShiftR)
library(dbscan)
library(cluster)
library(tidyverse)
library(ggplot2)
library(knitr)
library(dplyr)

2. Load Dataset

# Read the wholesale-customers CSV from the current working directory and
# preview the first six rows to confirm the columns were parsed as expected.
data <- read.csv(file = "Wholesale customers data.csv")
head(data, n = 6L)
##   Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicassen
## 1       2      3 12669 9656    7561    214             2674       1338
## 2       2      3  7057 9810    9568   1762             3293       1776
## 3       2      3  6353 8808    7684   2405             3516       7844
## 4       1      3 13265 1196    4221   6404              507       1788
## 5       2      3 22615 5410    7198   3915             1777       5185
## 6       2      3  9413 8259    5126    666             1795       1451

2.1 Statistika Deskriptif

# Keep only the numeric columns and print a per-column summary
# (min, quartiles, median, mean, max).
# vapply() pins the predicate result to exactly one logical per column, so the
# column filter stays type-stable even when the data frame has no columns
# (sapply() would return an empty list there, which cannot be used to subset).
is_numeric_col <- vapply(data, is.numeric, logical(1))
data_numeric <- data[is_numeric_col]
summary_stats <- summary(data_numeric)
print(summary_stats)
##     Channel          Region          Fresh             Milk      
##  Min.   :1.000   Min.   :1.000   Min.   :     3   Min.   :   55  
##  1st Qu.:1.000   1st Qu.:2.000   1st Qu.:  3128   1st Qu.: 1533  
##  Median :1.000   Median :3.000   Median :  8504   Median : 3627  
##  Mean   :1.323   Mean   :2.543   Mean   : 12000   Mean   : 5796  
##  3rd Qu.:2.000   3rd Qu.:3.000   3rd Qu.: 16934   3rd Qu.: 7190  
##  Max.   :2.000   Max.   :3.000   Max.   :112151   Max.   :73498  
##     Grocery          Frozen        Detergents_Paper    Delicassen     
##  Min.   :    3   Min.   :   25.0   Min.   :    3.0   Min.   :    3.0  
##  1st Qu.: 2153   1st Qu.:  742.2   1st Qu.:  256.8   1st Qu.:  408.2  
##  Median : 4756   Median : 1526.0   Median :  816.5   Median :  965.5  
##  Mean   : 7951   Mean   : 3071.9   Mean   : 2881.5   Mean   : 1524.9  
##  3rd Qu.:10656   3rd Qu.: 3554.2   3rd Qu.: 3922.0   3rd Qu.: 1820.2  
##  Max.   :92780   Max.   :60869.0   Max.   :40827.0   Max.   :47943.0

3. Visualisasi Data Awal

# One box per column, drawn on the raw (unscaled) values.
boxplot(x = data, main = "Boxplot Data Awal", col = "lightblue")

4. Preprocessing

# Count the missing values in each column.
na_mask <- is.na(data)
colSums(na_mask)
##          Channel           Region            Fresh             Milk 
##                0                0                0                0 
##          Grocery           Frozen Detergents_Paper       Delicassen 
##                0                0                0                0
# Standardise every column to zero mean and unit standard deviation; the
# result is a numeric matrix that the distance-based methods below consume.
# NOTE(review): Channel and Region only take the values 1-2 and 1-3 in the
# summary above, so they look like category codes, yet they are scaled here
# like the spending columns -- confirm this is intended.
data_scaled <- scale(data, center = TRUE, scale = TRUE)

5. Clustering

5.1 K-Modes

# Build a categorical copy of the data for K-Modes: each spending column is
# cut into three equal-width intervals labelled Low / Medium / High.
# Channel and Region are carried over unchanged.
spend_cols <- c(
  "Fresh", "Milk", "Grocery", "Frozen", "Detergents_Paper", "Delicassen"
)
bin_labels <- c("Low", "Medium", "High")

data_kmodes <- data
data_kmodes[spend_cols] <- lapply(
  data[spend_cols],
  function(col) cut(col, breaks = 3, labels = bin_labels)
)

# Fix the RNG state so the clustering result is reproducible.
set.seed(123)

# K-Modes with 3 modes and at most 10 iterations; keep the per-row labels.
kmodes_model <- kmodes(data_kmodes, modes = 3, iter.max = 10)
kmodes_cluster <- kmodes_model[["cluster"]]

# Cluster sizes.
table(kmodes_cluster)
## kmodes_cluster
##   1   2   3 
##  36 118 286

5.2 Fuzzy C-Means

# Fuzzy C-Means on the standardised data with 3 centres and fuzzifier m = 2.
# The seed is fixed first so the run is reproducible.
set.seed(123)

fcm_model <- cmeans(x = data_scaled, centers = 3, m = 2)
# Hard cluster label per row, as returned in the model's `cluster` element.
fcm_cluster <- fcm_model[["cluster"]]

# Cluster sizes.
table(fcm_cluster)
## fcm_cluster
##   1   2   3 
##  96 208 136

5.3 Mean Shift

# Mean Shift on the standardised data using the package defaults.
# NOTE(review): the recorded run yields 61 clusters, one with 352 rows and
# most of the others singletons -- the default bandwidth may need tuning.
ms_model <- meanShift(queryData = data_scaled)
ms_cluster <- ms_model[["assignment"]]

# Cluster sizes.
table(ms_cluster)
## ms_cluster
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
## 352   1   9   1   2   4   1   5   1   1   1   1   1   1   1   2   1   1   1   2 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
##   1   1   1   1   1   1   1   1   1   1   2   1   2   2   2   4   1   1   1   1 
##  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
##   1   1   1   1   1   3   1   2   1   1   1   1   1   1   1   1   1   1   1   1 
##  61 
##   1

5.4 DBSCAN

# DBSCAN on the standardised data: neighbourhood radius 0.7, at least 5 points
# to form a dense region.
db_model <- dbscan(x = data_scaled, eps = 0.7, minPts = 5)
# Per-row labels; in the dbscan package label 0 marks noise points.
db_cluster <- db_model[["cluster"]]

# Cluster sizes (including the noise label 0).
table(db_cluster)
## db_cluster
##   0   1   2   3   4   5 
## 143  66 162   6  43  20

6. Evaluasi (Silhouette Score)

# Mean silhouette width of the K-Modes labels.
# NOTE(review): the labels were fitted on the binned categorical data, but the
# silhouette uses Euclidean distances on the standardised numeric data --
# confirm this mismatch is acceptable for the comparison.
sil_kmodes <- silhouette(x = kmodes_cluster, dist = dist(data_scaled))
mean(sil_kmodes[, "sil_width"])
## [1] 0.2226291
# Mean silhouette width of the Fuzzy C-Means hard labels, using Euclidean
# distances on the standardised data.
sil_fcm <- silhouette(x = fcm_cluster, dist = dist(data_scaled))
mean(sil_fcm[, "sil_width"])
## [1] 0.3116016
# Mean silhouette width of the DBSCAN labels, using Euclidean distances on the
# standardised data.
# NOTE(review): rows labelled 0 (noise in the dbscan package) are passed in
# like any other cluster, so they count towards this score -- consider
# excluding them before computing the silhouette.
sil_db <- silhouette(x = db_cluster, dist = dist(data_scaled))
mean(sil_db[, "sil_width"])
## [1] 0.1231232

7. Visualisasi

# Project the standardised data onto its first two principal components so
# every method's labels can be drawn on the same 2-D scatter plot.
pca <- prcomp(data_scaled)
pca_data <- data.frame(pca[["x"]][, c("PC1", "PC2")])

# Attach each method's labels as a factor so the plots colour by discrete
# cluster rather than on a continuous scale.
pca_data[["KModes"]] <- as.factor(kmodes_cluster)
pca_data[["FCM"]] <- as.factor(fcm_cluster)
pca_data[["MeanShift"]] <- as.factor(ms_cluster)
pca_data[["DBSCAN"]] <- as.factor(db_cluster)

7.1 Visualisasi K-Modes

# First two principal components, coloured by K-Modes cluster.
ggplot(data = pca_data, mapping = aes(x = PC1, y = PC2, colour = KModes)) +
  geom_point(size = 2) +
  labs(title = "K-Modes (PCA)") +
  theme_minimal()

7.2 Visualisasi Fuzzy C-Means

# First two principal components, coloured by Fuzzy C-Means cluster.
ggplot(data = pca_data, mapping = aes(x = PC1, y = PC2, colour = FCM)) +
  geom_point(size = 2) +
  labs(title = "Fuzzy C-Means (PCA)") +
  theme_minimal()

7.3 Visualisasi Mean Shift

# First two principal components, coloured by Mean Shift cluster.
ggplot(data = pca_data, mapping = aes(x = PC1, y = PC2, colour = MeanShift)) +
  geom_point(size = 2) +
  labs(title = "Mean Shift (PCA)") +
  theme_minimal()

7.4 Visualisasi DBSCAN

# First two principal components, coloured by DBSCAN label (0 = noise).
ggplot(data = pca_data, mapping = aes(x = PC1, y = PC2, colour = DBSCAN)) +
  geom_point(size = 2) +
  labs(title = "DBSCAN (PCA)") +
  theme_minimal()

8. Model Terbaik

# Pick the method with the highest mean silhouette width on the standardised
# data. The Euclidean distance matrix is built once and shared by all three
# silhouette computations instead of being recomputed for every method.
# Only K-Modes, Fuzzy C-Means and DBSCAN are compared; Mean Shift is not part
# of this ranking.
dist_scaled <- dist(data_scaled)

# Mean silhouette width (third column of the silhouette matrix) for one
# vector of cluster labels.
mean_sil_width <- function(cluster_ids) {
  mean(silhouette(cluster_ids, dist_scaled)[, 3])
}

sil_kmodes_val <- mean_sil_width(kmodes_cluster)
sil_fcm_val <- mean_sil_width(fcm_cluster)
# NOTE(review): DBSCAN label 0 (noise) is scored like a regular cluster here.
sil_db_val <- mean_sil_width(db_cluster)

scores <- c(
  KModes = sil_kmodes_val,
  FCM = sil_fcm_val,
  DBSCAN = sil_db_val
)

# which.max() keeps the element name, which identifies the winning method.
best_model <- names(which.max(scores))
best_score <- max(scores)
cat("Model terbaik adalah:", best_model, "\n")
## Model terbaik adalah: FCM
# Report the winning score; end with a newline (as the preceding cat() does)
# so later console output does not run onto the same line.
cat("Dengan nilai silhouette score:", best_score, "\n")
## Dengan nilai silhouette score: 0.3116016