1. Load Library

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(flexclust)
## Warning: package 'flexclust' was built under R version 4.5.3
library(dbscan)
## Warning: package 'dbscan' was built under R version 4.5.3
## 
## Attaching package: 'dbscan'
## 
## The following object is masked from 'package:stats':
## 
##     as.dendrogram
library(meanShiftR)
library(e1071)
## Warning: package 'e1071' was built under R version 4.5.3
## 
## Attaching package: 'e1071'
## 
## The following object is masked from 'package:flexclust':
## 
##     bclust
## 
## The following object is masked from 'package:ggplot2':
## 
##     element
library(cluster)
## Warning: package 'cluster' was built under R version 4.5.3
library(fpc)
## Warning: package 'fpc' was built under R version 4.5.3
## 
## Attaching package: 'fpc'
## 
## The following object is masked from 'package:dbscan':
## 
##     dbscan
library(psych)
## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

2. Import Dataset

# Load the customer transaction data from a CSV file in the working directory
# and preview its first six rows.
df <- read.csv(file = "Customer_Transactions.csv")
head(df, n = 6L)
##   customer_id age gender   country annual_income spending_score num_purchases
## 1           1  37   Male   Germany         85886             14            18
## 2           2  40   Male     India         41041              4            10
## 3           3  69 Female Australia        143869             59            39
## 4           4  30   Male        UK         87261             45            34
## 5           5  69 Female        UK        110678             40            38
## 6           6  28   Male Australia         90330              2             7
##   avg_purchase_value membership_years website_visits_per_month
## 1              41.20                6                       20
## 2              31.73                4                       29
## 3              65.96               12                       26
## 4              51.87               12                        7
## 5              59.64               13                       16
## 6              35.63                1                       19
##   cart_abandon_rate churned                              feedback_text
## 1              0.95       0           Very satisfied with my purchase.
## 2              0.21       0          Good quality and value for money.
## 3              0.08       0                Excellent customer service.
## 4              0.61       0          Good quality and value for money.
## 5              0.49       0                Excellent customer service.
## 6              0.81       1 The product did not match the description.
##   last_purchase_date
## 1         2025-06-22
## 2         2025-10-17
## 3         2025-07-01
## 4         2025-08-17
## 5         2025-06-21
## 6         2024-10-25

3. Deskripsi Data

# Per-column descriptive statistics (n, mean, sd, median, skew, ...).
# The psych namespace is spelled out so the call does not depend on which
# attached package currently provides `describe`.
psych::describe(df)
##                          vars     n     mean       sd   median  trimmed
## customer_id                 1 10000  5000.50  2886.90  5000.50  5000.50
## age                         2 10000    44.05    15.40    44.00    44.05
## gender*                     3 10000     1.51     0.50     2.00     1.51
## country*                    4 10000     6.16     3.06     6.00     6.28
## annual_income               5 10000 86067.68 38986.79 78339.50 83870.92
## spending_score              6 10000    50.92    28.75    51.00    51.04
## num_purchases               7 10000    22.58    10.16    22.00    22.27
## avg_purchase_value          8 10000    47.45    11.21    46.99    47.25
## membership_years            9 10000     6.34     4.68     6.00     6.13
## website_visits_per_month   10 10000    15.58     8.66    16.00    15.59
## cart_abandon_rate          11 10000     0.50     0.29     0.51     0.50
## churned                    12 10000     0.11     0.31     0.00     0.01
## feedback_text*             13 10000     6.90     3.42     7.00     6.82
## last_purchase_date*        14 10000   529.12   122.20   554.00   553.83
##                               mad      min       max     range  skew kurtosis
## customer_id               3706.50     1.00  10000.00   9999.00  0.00    -1.20
## age                         19.27    18.00     70.00     52.00  0.00    -1.21
## gender*                      0.00     1.00      2.00      1.00 -0.03    -2.00
## country*                     4.45     1.00     10.00      9.00 -0.14    -1.31
## annual_income            40833.03 20028.00 179960.00 159932.00  0.47    -0.70
## spending_score              35.58     1.00    100.00     99.00 -0.03    -1.17
## num_purchases               11.86     1.00     49.00     48.00  0.24    -0.90
## avg_purchase_value          11.55    16.75     83.27     66.52  0.16    -0.32
## membership_years             5.93     0.00     15.00     15.00  0.29    -1.16
## website_visits_per_month    11.86     1.00     30.00     29.00 -0.01    -1.21
## cart_abandon_rate            0.37     0.00      1.00      1.00 -0.01    -1.19
## churned                      0.00     0.00      1.00      1.00  2.51     4.30
## feedback_text*               4.45     1.00     12.00     11.00  0.29    -1.37
## last_purchase_date*         74.13     1.00    654.00    653.00 -2.32     5.76
##                              se
## customer_id               28.87
## age                        0.15
## gender*                    0.00
## country*                   0.03
## annual_income            389.87
## spending_score             0.29
## num_purchases              0.10
## avg_purchase_value         0.11
## membership_years           0.05
## website_visits_per_month   0.09
## cart_abandon_rate          0.00
## churned                    0.00
## feedback_text*             0.03
## last_purchase_date*        1.22

4. Seleksi Variabel

# Keep only the numeric columns that feed the clustering steps; the id,
# gender, country, feedback text and purchase date columns are dropped.
# NOTE(review): `churned` is a 0/1 flag; confirm it is meant to be a
# clustering feature rather than an outcome to profile clusters against.
clustering_vars <- c(
  "age",
  "annual_income",
  "spending_score",
  "num_purchases",
  "avg_purchase_value",
  "membership_years",
  "website_visits_per_month",
  "cart_abandon_rate",
  "churned"
)

df <- df %>%
  select(all_of(clustering_vars))

5. Cek Missing Value

# Number of missing values in each column.
df %>%
  is.na() %>%
  colSums()
##                      age            annual_income           spending_score 
##                        0                        0                        0 
##            num_purchases       avg_purchase_value         membership_years 
##                        0                        0                        0 
## website_visits_per_month        cart_abandon_rate                  churned 
##                        0                        0                        0

6. Tabel Missing Value

library(tidyverse)

# Same NA counts as above, reshaped to a long tibble with one row per
# variable (columns: variable, missing_values).
na_per_column <- summarise(
  df,
  across(everything(), function(col) sum(is.na(col)))
)

missing_table <- pivot_longer(
  na_per_column,
  cols = everything(),
  names_to = "variable",
  values_to = "missing_values"
)

print(missing_table)
## # A tibble: 9 × 2
##   variable                 missing_values
##   <chr>                             <int>
## 1 age                                   0
## 2 annual_income                         0
## 3 spending_score                        0
## 4 num_purchases                         0
## 5 avg_purchase_value                    0
## 6 membership_years                      0
## 7 website_visits_per_month              0
## 8 cart_abandon_rate                     0
## 9 churned                               0

7. Handling Missing Value (Median Imputation)

# Replace every NA in a column with that column's median (computed with NAs
# removed). Values that are not missing are passed through unchanged.
impute_median <- function(col) {
  ifelse(is.na(col), median(col, na.rm = TRUE), col)
}

df <- df %>%
  mutate(across(everything(), impute_median))

8. Normalisasi Data

# Standardise every column: subtract the column mean and divide by the column
# standard deviation. The result is a numeric matrix.
df_scaled <- scale(df, center = TRUE, scale = TRUE)

# Summary of the scaled values
summary(df_scaled)
##       age            annual_income     spending_score      num_purchases     
##  Min.   :-1.690773   Min.   :-1.6939   Min.   :-1.736289   Min.   :-2.12286  
##  1st Qu.:-0.846873   1st Qu.:-0.7880   1st Qu.:-0.866826   1st Qu.:-0.84379  
##  Median :-0.002973   Median :-0.1982   Median : 0.002636   Median :-0.05667  
##  Mean   : 0.000000   Mean   : 0.0000   Mean   : 0.000000   Mean   : 0.00000  
##  3rd Qu.: 0.840927   3rd Qu.: 0.7567   3rd Qu.: 0.837320   3rd Qu.: 0.82884  
##  Max.   : 1.684827   Max.   : 2.4083   Max.   : 1.706783   Max.   : 2.59986  
##  avg_purchase_value membership_years   website_visits_per_month
##  Min.   :-2.73940   Min.   :-1.35462   Min.   :-1.68429        
##  1st Qu.:-0.70922   1st Qu.:-0.92733   1st Qu.:-0.87554        
##  Median :-0.04082   Median :-0.07275   Median : 0.04874        
##  Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.00000        
##  3rd Qu.: 0.68112   3rd Qu.: 0.78183   3rd Qu.: 0.85750        
##  Max.   : 3.19675   Max.   : 1.85006   Max.   : 1.66625        
##  cart_abandon_rate     churned       
##  Min.   :-1.74740   Min.   :-0.3497  
##  1st Qu.:-0.87582   1st Qu.:-0.3497  
##  Median : 0.03062   Median :-0.3497  
##  Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.86734   3rd Qu.:-0.3497  
##  Max.   : 1.73892   Max.   : 2.8589

9. Sampling Data

# Fix the RNG seed so the subsample (and the random starts of the clustering
# calls that follow) are reproducible.
set.seed(123)

# Draw 500 distinct row indices and keep those rows of the scaled matrix.
# sample.int() avoids the `1:nrow(x)` idiom, which silently yields c(1, 0)
# for an empty input; for the existing data it returns the same draw as
# sample(1:nrow(df_scaled), 500).
sample_rows <- sample.int(nrow(df_scaled), size = 500)
df_sample <- df_scaled[sample_rows, ]
cat("Jumlah data:", nrow(df_sample), "\n")
## Jumlah data: 500
cat("Jumlah variabel:", ncol(df_sample))
## Jumlah variabel: 9

10. Penentuan Jumlah Cluster (Elbow Method)

# Elbow method: total within-cluster sum of squares of a k-means fit for
# k = 1..10 (20 random starts each). vapply() is used instead of sapply() so
# the result is guaranteed to be a numeric vector with one value per k.
wss <- vapply(
  1:10,
  function(k) {
    kmeans(df_sample, centers = k, nstart = 20)$tot.withinss
  },
  numeric(1)
)

# Plot WSS against k; the "elbow" is read off this curve visually.
plot(1:10, wss,
  type = "b", pch = 19,
  xlab = "Jumlah Cluster",
  ylab = "WSS",
  main = "Elbow Method"
)

11. Penentuan Jumlah Cluster (Silhouette Method)

# Pairwise distances between sampled rows. They do not depend on k, so they
# are computed once here rather than inside every avg_sil() call.
sil_dist <- dist(df_sample)

# Average silhouette width of a k-means fit with `k` clusters.
#
# k: number of clusters (an integer >= 2).
# d: distance object for the rows of df_sample; defaults to dist(df_sample)
#    so that avg_sil(k) still works when called with k alone.
# Returns a single numeric value: the mean of the silhouette widths.
avg_sil <- function(k, d = dist(df_sample)) {
  km <- kmeans(df_sample, centers = k, nstart = 25)
  ss <- silhouette(km$cluster, d)
  # Column 3 of a silhouette object holds the per-observation width.
  mean(ss[, 3])
}

k_values <- 2:10

# vapply() enforces one numeric value per candidate k.
sil_values <- vapply(k_values, avg_sil, numeric(1), d = sil_dist)

plot(k_values, sil_values,
  type = "b", pch = 19,
  xlab = "Jumlah Cluster",
  ylab = "Silhouette",
  main = "Silhouette Method"
)

12. Menentukan Nilai K Optimal

# Choose the candidate k with the highest average silhouette width
# (which.max() returns the first maximum if there are ties).
best_idx <- which.max(sil_values)
k <- k_values[best_idx]

print(k)
## [1] 3

13. Clustering K-Means

# Final k-means fit on the sampled data with the selected k and 25 random
# starts.
km_res <- stats::kmeans(x = df_sample, centers = k, nstart = 25)

14. Clustering K-Median

# K-medians clustering via flexclust's kcca() using the "kmedians" family,
# with the same number of clusters as the k-means fit.
kmedians_family <- kccaFamily("kmedians")
kmed_res <- kcca(df_sample, k = k, family = kmedians_family)
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'

15. Penentuan Parameter DBSCAN (kNN Distance Plot)

# k-NN distance plot used to choose eps for DBSCAN. The red horizontal line
# marks 1.5, the eps value passed to the DBSCAN call later in this file.
# NOTE(review): k = 5 is used here while DBSCAN is run with MinPts = 5; the
# dbscan package suggests k = minPts - 1 -- confirm which is intended.
dbscan::kNNdistplot(df_sample, k = 5)
abline(h = 1.5, col = "red", lwd = 2)
title(main = "kNN Distance Plot untuk Penentuan eps DBSCAN")

16. Clustering DBSCAN

# DBSCAN on the sampled data with eps = 1.5 and MinPts = 5.
# Both the `dbscan` and `fpc` packages export a function named dbscan(), and
# the bare name resolves to whichever package was attached last. The `MinPts`
# argument spelling belongs to fpc's version, so the namespace is pinned here
# to keep the result independent of library() order.
db_res <- fpc::dbscan(df_sample, eps = 1.5, MinPts = 5)

# Cluster sizes; label 0 is used for noise points.
table(db_res$cluster)
## 
##   0   1   2   3   4   5   6   7   8 
## 197 254   4   4  22   5   5   4   5

17. PCA & Clustering untuk Mean Shift

# Project the sampled data onto its first two principal components and run
# mean shift on that 2-D representation.
pca_fit <- prcomp(df_sample)
df_pca <- pca_fit$x[, 1:2]

ms_res <- meanShiftR::meanShift(
  as.matrix(df_pca),
  bandwidth = rep(1.8, 2)
)

# Cluster label assigned to each sampled row, and the cluster sizes.
ms_cluster <- ms_res$assignment
table(ms_cluster)
## ms_cluster
##   1   2   3   4   5   6   7   8 
## 114 120  57  56  58  82   8   5

18. Clustering Fuzzy C-Means

# Fuzzy c-means (e1071) with k centers and fuzziness exponent m = 2.
fcm_res <- e1071::cmeans(x = df_sample, centers = k, m = 2)

19. Perhitungan Distance

# Euclidean distance objects reused by the evaluation steps:
# one on the full standardised feature space, one on the 2-D PCA scores.
dist_sample <- stats::dist(df_sample, method = "euclidean")

dist_pca <- stats::dist(df_pca, method = "euclidean")

20. Evaluasi Silhouette Score

# Mean silhouette width of a labelling `labels` under distance object `d`.
mean_sil_width <- function(labels, d) {
  widths <- silhouette(labels, d)[, "sil_width"]
  mean(widths)
}

# Average silhouette for the three partitioning methods, all measured on the
# full-feature distances.
sil_km <- mean_sil_width(km_res$cluster, dist_sample)

sil_kmed <- mean_sil_width(clusters(kmed_res), dist_sample)

sil_fcm <- mean_sil_width(fcm_res$cluster, dist_sample)

21. Evaluasi Silhouette DBSCAN

# Average silhouette width for DBSCAN, computed on clustered points only.
# Rows labelled 0 (noise) are excluded, so this score covers fewer rows than
# the scores of the other methods.
db_cluster <- db_res$cluster
valid <- db_cluster != 0

# A silhouette needs at least two clusters; otherwise the score is NA.
if (length(unique(db_cluster[valid])) > 1) {
  # Reuse the distances already stored in dist_sample instead of recomputing
  # dist(df_sample); the values are identical.
  d <- as.matrix(dist_sample)

  # The sub-matrix is a full square matrix, so it is passed through the
  # `dmatrix` argument that silhouette() provides for that form.
  sil_db <- mean(
    silhouette(db_cluster[valid], dmatrix = d[valid, valid])[, 3]
  )
} else {
  sil_db <- NA
}

22. Evaluasi Silhouette Mean Shift

# Average silhouette width for mean shift, or NA when only one cluster was
# found.
# NOTE(review): this uses the 2-D PCA distances, whereas the other methods
# are scored on dist_sample -- confirm the scores are meant to be compared.
ms_n_clusters <- length(unique(ms_cluster))

sil_ms <- if (ms_n_clusters > 1) {
  mean(silhouette(ms_cluster, dist_pca)[, 3])
} else {
  NA
}

23. Evaluasi Dunn Index

# Dunn index of a labelling `labels` under distance object `d`, taken from
# the `dunn` element of fpc's cluster.stats() result.
dunn_index <- function(labels, d) {
  cluster.stats(d, labels)[["dunn"]]
}

dunn_km <- dunn_index(km_res$cluster, dist_sample)

dunn_kmed <- dunn_index(clusters(kmed_res), dist_sample)

dunn_fcm <- dunn_index(fcm_res$cluster, dist_sample)

24. Evaluasi Dunn DBSCAN

# Dunn index for DBSCAN on the non-noise rows only; NA unless at least two
# clusters remain after dropping noise.
dunn_db <- if (length(unique(db_cluster[valid])) > 1) {
  clustered_rows <- df_sample[valid, ]
  cluster.stats(dist(clustered_rows), db_cluster[valid])$dunn
} else {
  NA
}

25. Evaluasi Dunn Mean Shift

# Dunn index for mean shift on the PCA distances; NA when mean shift produced
# a single cluster.
dunn_ms <- if (length(unique(ms_cluster)) > 1) {
  cluster.stats(dist_pca, ms_cluster)$dunn
} else {
  NA
}

26. Tabel Perbandingan Evaluasi

# Side-by-side comparison of the five methods: one row per method, with its
# average silhouette width and Dunn index.
method_names <- c(
  "K-Means", "K-Median", "DBSCAN", "Mean Shift", "Fuzzy C-Means"
)
silhouette_scores <- c(sil_km, sil_kmed, sil_db, sil_ms, sil_fcm)
dunn_scores <- c(dunn_km, dunn_kmed, dunn_db, dunn_ms, dunn_fcm)

evaluation_table <- data.frame(
  Method = method_names,
  Silhouette = silhouette_scores,
  Dunn_Index = dunn_scores
)

print(evaluation_table)
##          Method  Silhouette Dunn_Index
## 1       K-Means  0.19969745 0.12514841
## 2      K-Median  0.14808636 0.11195335
## 3        DBSCAN -0.06687743 0.10178209
## 4    Mean Shift  0.12205498 0.01079951
## 5 Fuzzy C-Means  0.09009458 0.11405901

27. Output Evaluasi

# Print the evaluation metrics as plain text, one line per method.
# First section: average silhouette width of each method.
cat(" SILHOUETTE SCORE \n")
##  SILHOUETTE SCORE
# The leading "\n" argument inserts a blank line under the section title.
cat("\n","K-Means :", sil_km, "\n")
## 
##  K-Means : 0.1996974
cat(" K-Median :", sil_kmed, "\n")
##  K-Median : 0.1480864
cat(" DBSCAN :", sil_db, "\n")
##  DBSCAN : -0.06687743
cat(" Mean Shift :", sil_ms, "\n")
##  Mean Shift : 0.122055
cat(" Fuzzy C-Means :", sil_fcm, "\n")
##  Fuzzy C-Means : 0.09009458
# Second section: Dunn index of each method, in the same order.
cat("\n DUNN INDEX \n")
## 
##  DUNN INDEX
cat("\n","K-Means :", dunn_km, "\n")
## 
##  K-Means : 0.1251484
cat(" K-Median :", dunn_kmed, "\n")
##  K-Median : 0.1119533
cat(" DBSCAN :", dunn_db, "\n")
##  DBSCAN : 0.1017821
cat(" Mean Shift :", dunn_ms, "\n")
##  Mean Shift : 0.01079951
cat(" Fuzzy C-Means :", dunn_fcm, "\n")
##  Fuzzy C-Means : 0.114059

28. Interpretasi Hasil K-Means

# Profile the k-means clusters: attach each row's cluster label to the
# sampled data and average every column within each cluster. The values are
# on the standardised scale of df_sample, not in the original units.
df_result <- as.data.frame(df_sample)

df_result[["Cluster_KMeans"]] <- km_res$cluster

aggregate(
  x = df_result,
  by = list(df_result[["Cluster_KMeans"]]),
  FUN = mean
)
##   Group.1         age annual_income spending_score num_purchases
## 1       1 -0.41148714    -0.3353013     -0.1843589    -0.7893346
## 2       2  0.05495137     0.1761706     -0.8293726    -0.6182522
## 3       3  0.43874655     0.3458068      0.3121649     0.8820570
##   avg_purchase_value membership_years website_visits_per_month
## 1         -0.3581701       -0.7206377               0.04874458
## 2         -0.3146092       -0.4474470              -0.24987118
## 3          0.4709752        0.7896036              -0.03633183
##   cart_abandon_rate    churned Cluster_KMeans
## 1       -0.06537142 -0.3497459              1
## 2        0.59004267  2.8589324              2
## 3        0.12776508 -0.3497459              3