Analisis pada modul ini bertujuan untuk mengeksplorasi data serta menerapkan beberapa metode clustering untuk mengidentifikasi pola pengelompokan pada data. Metode yang digunakan meliputi K-Modes, Fuzzy C-Means, Mean Shift, dan DBSCAN. Keempat metode tersebut digunakan untuk membandingkan hasil clustering berdasarkan karakteristik masing-masing metode. Dataset yang digunakan adalah Wholesale Customers Dataset yang memuat informasi terkait pengeluaran tahunan pelanggan pada berbagai kategori produk seperti Fresh, Milk, Grocery, Frozen, Detergents_Paper, dan Delicassen.
Link Dataset: https://www.kaggle.com/datasets/binovi/wholesale-customers-data-set
library(klaR)
library(e1071)
library(meanShiftR)
library(dbscan)
library(cluster)
library(tidyverse)
library(ggplot2)
library(knitr)
library(dplyr)
# Load the wholesale customers CSV from the working directory and preview
# its first six rows.
csv_path <- "Wholesale customers data.csv"
data <- read.csv(file = csv_path)
head(data, n = 6L)
## Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicassen
## 1 2 3 12669 9656 7561 214 2674 1338
## 2 2 3 7057 9810 9568 1762 3293 1776
## 3 2 3 6353 8808 7684 2405 3516 7844
## 4 1 3 13265 1196 4221 6404 507 1788
## 5 2 3 22615 5410 7198 3915 1777 5185
## 6 2 3 9413 8259 5126 666 1795 1451
# Descriptive statistics for the numeric columns.
# vapply() guarantees one logical per column; sapply() could silently
# return a different type (e.g. an empty list) on unusual input.
is_numeric_col <- vapply(data, is.numeric, logical(1))
data_numeric <- data[is_numeric_col]
summary_stats <- summary(data_numeric)
print(summary_stats)
## Channel Region Fresh Milk
## Min. :1.000 Min. :1.000 Min. : 3 Min. : 55
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.: 3128 1st Qu.: 1533
## Median :1.000 Median :3.000 Median : 8504 Median : 3627
## Mean :1.323 Mean :2.543 Mean : 12000 Mean : 5796
## 3rd Qu.:2.000 3rd Qu.:3.000 3rd Qu.: 16934 3rd Qu.: 7190
## Max. :2.000 Max. :3.000 Max. :112151 Max. :73498
## Grocery Frozen Detergents_Paper Delicassen
## Min. : 3 Min. : 25.0 Min. : 3.0 Min. : 3.0
## 1st Qu.: 2153 1st Qu.: 742.2 1st Qu.: 256.8 1st Qu.: 408.2
## Median : 4756 Median : 1526.0 Median : 816.5 Median : 965.5
## Mean : 7951 Mean : 3071.9 Mean : 2881.5 Mean : 1524.9
## 3rd Qu.:10656 3rd Qu.: 3554.2 3rd Qu.: 3922.0 3rd Qu.: 1820.2
## Max. :92780 Max. :60869.0 Max. :40827.0 Max. :47943.0
# Box plot of every column to inspect spread and outliers in the raw data.
boxplot(
  data,
  main = "Boxplot Data Awal",
  col = "lightblue"
)
# Count missing values per column.
missing_mask <- is.na(data)
colSums(missing_mask)
## Channel Region Fresh Milk
## 0 0 0 0
## Grocery Frozen Detergents_Paper Delicassen
## 0 0 0 0
# Standardized copy of the data (every column centered and scaled); used by
# the distance-based methods further down.
data_scaled <- scale(data)

# Categorical copy for K-Modes: each spending column is split into three
# equal-width bins labelled Low / Medium / High. Channel and Region are
# left as they are.
spend_cols <- c(
  "Fresh", "Milk", "Grocery",
  "Frozen", "Detergents_Paper", "Delicassen"
)
level_labels <- c("Low", "Medium", "High")
data_kmodes <- data
data_kmodes[spend_cols] <- lapply(
  data[spend_cols],
  function(spend) cut(spend, breaks = 3, labels = level_labels)
)
# K-Modes on the discretized data, three clusters, at most ten iterations.
# The seed is fixed beforehand so the run is reproducible.
set.seed(123)
kmodes_model <- kmodes(data = data_kmodes, modes = 3, iter.max = 10)
# Cluster label for each row, then the size of each cluster.
kmodes_cluster <- kmodes_model[["cluster"]]
table(kmodes_cluster)
## kmodes_cluster
## 1 2 3
## 36 118 286
# Fuzzy C-Means on the standardized data: three centers, fuzzifier m = 2.
# The seed is fixed beforehand so the run is reproducible.
set.seed(123)
fcm_model <- cmeans(x = data_scaled, centers = 3, m = 2)
# Hard cluster label for each row, then the size of each cluster.
fcm_cluster <- fcm_model[["cluster"]]
table(fcm_cluster)
## fcm_cluster
## 1 2 3
## 96 208 136
# Mean Shift on the standardized data using the package defaults.
# NOTE(review): the recorded run below split the data into 61 clusters,
# most of them singletons; the default bandwidth probably needs tuning
# before these labels are interpreted -- confirm.
ms_model <- meanShift(queryData = data_scaled)
# Cluster assignment for each row, then the size of each cluster.
ms_cluster <- ms_model[["assignment"]]
table(ms_cluster)
## ms_cluster
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 352 1 9 1 2 4 1 5 1 1 1 1 1 1 1 2 1 1 1 2
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2 4 1 1 1 1
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 1 1 1 1 1 3 1 2 1 1 1 1 1 1 1 1 1 1 1 1
## 61
## 1
# DBSCAN on the standardized data with neighbourhood radius 0.7 and a
# minimum of 5 points per dense region.
db_model <- dbscan(x = data_scaled, eps = 0.7, minPts = 5)
# Cluster label for each row; per the dbscan documentation label 0 marks
# noise points rather than a real cluster.
db_cluster <- db_model[["cluster"]]
table(db_cluster)
## db_cluster
## 0 1 2 3 4 5
## 143 66 162 6 43 20
# Average silhouette width of the K-Modes labels, measured on Euclidean
# distances between the standardized rows.
# NOTE(review): K-Modes was fitted on the discretized data, so this scores
# it in a different space than it was clustered in -- confirm this is
# intended.
scaled_dist <- dist(data_scaled)
sil_kmodes <- silhouette(kmodes_cluster, scaled_dist)
mean(sil_kmodes[, "sil_width"])
## [1] 0.2226291
# Average silhouette width of the Fuzzy C-Means hard labels, measured on
# Euclidean distances between the standardized rows.
scaled_dist <- dist(data_scaled)
sil_fcm <- silhouette(fcm_cluster, scaled_dist)
mean(sil_fcm[, "sil_width"])
## [1] 0.3116016
# Average silhouette width of the DBSCAN labels, measured on Euclidean
# distances between the standardized rows.
# NOTE(review): label 0 (noise) is passed through unchanged, so noise
# points are scored as if they formed a cluster; consider excluding them
# before comparing this value with the other methods.
scaled_dist <- dist(data_scaled)
sil_db <- silhouette(db_cluster, scaled_dist)
mean(sil_db[, "sil_width"])
## [1] 0.1231232
# Principal component analysis of the standardized data; the first two
# components are kept as plotting coordinates.
pca <- prcomp(data_scaled)
pca_data <- data.frame(pca$x[, c("PC1", "PC2")])

# Attach each method's cluster labels as a factor column so they can be
# mapped to colour in the scatter plots.
cluster_labels <- list(
  KModes = kmodes_cluster,
  FCM = fcm_cluster,
  MeanShift = ms_cluster,
  DBSCAN = db_cluster
)
for (method_name in names(cluster_labels)) {
  pca_data[[method_name]] <- as.factor(cluster_labels[[method_name]])
}
# Shared scatter plot of the first two principal components; each method
# below only adds its own colour mapping and title.
pca_scatter <- ggplot(pca_data, aes(x = PC1, y = PC2)) +
  geom_point(size = 2) +
  theme_minimal()

# K-Modes clusters
pca_scatter +
  aes(color = KModes) +
  ggtitle("K-Modes (PCA)")

# Fuzzy C-Means clusters
pca_scatter +
  aes(color = FCM) +
  ggtitle("Fuzzy C-Means (PCA)")

# Mean Shift clusters
pca_scatter +
  aes(color = MeanShift) +
  ggtitle("Mean Shift (PCA)")

# DBSCAN clusters
pca_scatter +
  aes(color = DBSCAN) +
  ggtitle("DBSCAN (PCA)")
# Compare the methods by average silhouette width and report the best one.
# The pairwise distance matrix is computed once and reused, instead of
# being rebuilt for every method.
scaled_dist <- dist(data_scaled)

# Average silhouette width of one label vector on the shared distances.
mean_silhouette <- function(labels) {
  mean(silhouette(labels, scaled_dist)[, "sil_width"])
}

sil_kmodes_val <- mean_silhouette(kmodes_cluster)
sil_fcm_val <- mean_silhouette(fcm_cluster)
# NOTE(review): DBSCAN label 0 (noise) is still scored as a cluster here.
sil_db_val <- mean_silhouette(db_cluster)

# Mean Shift is not part of this comparison.
scores <- c(
  KModes = sil_kmodes_val,
  FCM = sil_fcm_val,
  DBSCAN = sil_db_val
)
best_model <- names(which.max(scores))
best_score <- max(scores)
cat("Model terbaik adalah:", best_model, "\n")
## Model terbaik adalah: FCM
# End the last message with a newline so later console output starts on
# its own line.
cat("Dengan nilai silhouette score:", best_score, "\n")
## Dengan nilai silhouette score: 0.3116016