1. LOAD LIBRARY

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## Warning: package 'readr' was built under R version 4.5.3
## Warning: package 'forcats' was built under R version 4.5.3
## Warning: package 'lubridate' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(flexclust)
## Warning: package 'flexclust' was built under R version 4.5.3
library(dbscan)
## Warning: package 'dbscan' was built under R version 4.5.3
## 
## Attaching package: 'dbscan'
## 
## The following object is masked from 'package:stats':
## 
##     as.dendrogram
library(meanShiftR)
library(e1071)
## Warning: package 'e1071' was built under R version 4.5.3
## 
## Attaching package: 'e1071'
## 
## The following object is masked from 'package:flexclust':
## 
##     bclust
## 
## The following object is masked from 'package:ggplot2':
## 
##     element
library(cluster)
## Warning: package 'cluster' was built under R version 4.5.3
library(fpc)
## Warning: package 'fpc' was built under R version 4.5.3
## 
## Attaching package: 'fpc'
## 
## The following object is masked from 'package:dbscan':
## 
##     dbscan
library(psych)
## Warning: package 'psych' was built under R version 4.5.3
## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

2. LOAD DATA

# Read the raw customer transaction table and preview its first rows.
df <- read.csv(file = "Customer_Transactions.csv")
head(x = df)
##   customer_id age gender   country annual_income spending_score num_purchases
## 1           1  37   Male   Germany         85886             14            18
## 2           2  40   Male     India         41041              4            10
## 3           3  69 Female Australia        143869             59            39
## 4           4  30   Male        UK         87261             45            34
## 5           5  69 Female        UK        110678             40            38
## 6           6  28   Male Australia         90330              2             7
##   avg_purchase_value membership_years website_visits_per_month
## 1              41.20                6                       20
## 2              31.73                4                       29
## 3              65.96               12                       26
## 4              51.87               12                        7
## 5              59.64               13                       16
## 6              35.63                1                       19
##   cart_abandon_rate churned                              feedback_text
## 1              0.95       0           Very satisfied with my purchase.
## 2              0.21       0          Good quality and value for money.
## 3              0.08       0                Excellent customer service.
## 4              0.61       0          Good quality and value for money.
## 5              0.49       0                Excellent customer service.
## 6              0.81       1 The product did not match the description.
##   last_purchase_date
## 1         2025-06-22
## 2         2025-10-17
## 3         2025-07-01
## 4         2025-08-17
## 5         2025-06-21
## 6         2024-10-25
# Per-column descriptive statistics (n, mean, sd, median, skew, ...).
psych::describe(df)
##                          vars     n     mean       sd   median  trimmed
## customer_id                 1 10000  5000.50  2886.90  5000.50  5000.50
## age                         2 10000    44.05    15.40    44.00    44.05
## gender*                     3 10000     1.51     0.50     2.00     1.51
## country*                    4 10000     6.16     3.06     6.00     6.28
## annual_income               5 10000 86067.68 38986.79 78339.50 83870.92
## spending_score              6 10000    50.92    28.75    51.00    51.04
## num_purchases               7 10000    22.58    10.16    22.00    22.27
## avg_purchase_value          8 10000    47.45    11.21    46.99    47.25
## membership_years            9 10000     6.34     4.68     6.00     6.13
## website_visits_per_month   10 10000    15.58     8.66    16.00    15.59
## cart_abandon_rate          11 10000     0.50     0.29     0.51     0.50
## churned                    12 10000     0.11     0.31     0.00     0.01
## feedback_text*             13 10000     6.90     3.42     7.00     6.82
## last_purchase_date*        14 10000   529.12   122.20   554.00   553.83
##                               mad      min       max     range  skew kurtosis
## customer_id               3706.50     1.00  10000.00   9999.00  0.00    -1.20
## age                         19.27    18.00     70.00     52.00  0.00    -1.21
## gender*                      0.00     1.00      2.00      1.00 -0.03    -2.00
## country*                     4.45     1.00     10.00      9.00 -0.14    -1.31
## annual_income            40833.03 20028.00 179960.00 159932.00  0.47    -0.70
## spending_score              35.58     1.00    100.00     99.00 -0.03    -1.17
## num_purchases               11.86     1.00     49.00     48.00  0.24    -0.90
## avg_purchase_value          11.55    16.75     83.27     66.52  0.16    -0.32
## membership_years             5.93     0.00     15.00     15.00  0.29    -1.16
## website_visits_per_month    11.86     1.00     30.00     29.00 -0.01    -1.21
## cart_abandon_rate            0.37     0.00      1.00      1.00 -0.01    -1.19
## churned                      0.00     0.00      1.00      1.00  2.51     4.30
## feedback_text*               4.45     1.00     12.00     11.00  0.29    -1.37
## last_purchase_date*         74.13     1.00    654.00    653.00 -2.32     5.76
##                              se
## customer_id               28.87
## age                        0.15
## gender*                    0.00
## country*                   0.03
## annual_income            389.87
## spending_score             0.29
## num_purchases              0.10
## avg_purchase_value         0.11
## membership_years           0.05
## website_visits_per_month   0.09
## cart_abandon_rate          0.00
## churned                    0.00
## feedback_text*             0.03
## last_purchase_date*        1.22

3. PREPROCESSING

# Keep only the numeric columns that are fed to the clustering algorithms.
df <- df[, c(
  "age",
  "annual_income",
  "spending_score",
  "num_purchases",
  "avg_purchase_value",
  "membership_years",
  "website_visits_per_month",
  "cart_abandon_rate",
  "churned"
), drop = FALSE]

Cek missing value

# Number of missing values in each column.
df %>%
  is.na() %>%
  colSums()
##                      age            annual_income           spending_score 
##                        0                        0                        0 
##            num_purchases       avg_purchase_value         membership_years 
##                        0                        0                        0 
## website_visits_per_month        cart_abandon_rate                  churned 
##                        0                        0                        0

Isi missing value dengan median

# Replace any NA in a column with that column's median (computed without NAs).
df <- mutate(
  df,
  across(
    everything(),
    function(col) ifelse(is.na(col), median(col, na.rm = TRUE), col)
  )
)

Scaling (WAJIB)

# Center each column on its mean and divide by its standard deviation so that
# no single feature dominates the distance computations; then preview.
df_scaled <- scale(df, center = TRUE, scale = TRUE)
head(df_scaled)
##             age annual_income spending_score num_purchases avg_purchase_value
## [1,] -0.4573808   -0.00465994     -1.2841684    -0.4502324         -0.5575169
## [2,] -0.2626347   -1.15492141     -1.6319534    -1.2373521         -1.4026073
## [3,]  1.6199115    1.48258748      0.2808642     1.6159567          1.6520330
## [4,] -0.9117885    0.03060842     -0.2060348     1.1240069          0.3946599
## [5,]  1.6199115    0.63124779     -0.3799273     1.5175667          1.0880445
## [6,] -1.0416193    0.10932739     -1.7015104    -1.5325220         -1.0545764
##      membership_years website_visits_per_month cart_abandon_rate    churned
## [1,]       -0.0727462               0.51088801        1.56460082 -0.3497459
## [2,]       -0.5000366               1.55071072       -1.01526969 -0.3497459
## [3,]        1.2091251               1.20410315       -1.46849019 -0.3497459
## [4,]        1.2091251              -0.99107814        0.37925491 -0.3497459
## [5,]        1.4227703               0.04874458       -0.03910247 -0.3497459
## [6,]       -1.1409723               0.39535215        1.07651721  2.8589324
# Draw a reproducible random subsample of rows for the clustering steps.
# seq_len() replaces 1:nrow() (which yields c(1, 0) for an empty table), the
# sample size is capped at the number of available rows so sample() cannot
# fail on a smaller data set, and drop = FALSE keeps the result a matrix.
set.seed(123)
df_sample <- df_scaled[
  sample(seq_len(nrow(df_scaled)), min(500, nrow(df_scaled))),
  ,
  drop = FALSE
]

4. MENENTUKAN JUMLAH CLUSTER (K)

Elbow Method

# Elbow method: total within-cluster sum of squares of k-means for k = 1..10.
# vapply() guarantees exactly one numeric value per k (sapply() would silently
# change its return type on unexpected results).
wss <- vapply(
  1:10,
  function(k) kmeans(df_sample, centers = k, nstart = 20)$tot.withinss,
  numeric(1)
)

# Plot WSS against k; the "elbow" of the curve suggests the number of clusters.
plot(seq_along(wss), wss,
  type = "b", pch = 19,
  xlab = "Jumlah Cluster",
  ylab = "WSS",
  main = "Elbow Method"
)

Silhouette Method

# Average silhouette width of a k-means partition.
#
# Arguments:
#   k    - number of clusters, passed to kmeans() as `centers`.
#   data - numeric matrix or data frame to cluster. Defaults to the global
#          `df_sample`, so existing avg_sil(k) calls behave as before.
#
# Returns the mean silhouette width as a numeric scalar, or NA_real_ when
# silhouette() does not return a silhouette object (it returns NA when the
# partition has a single cluster), instead of failing on the column lookup.
avg_sil <- function(k, data = df_sample) {
  km <- kmeans(data, centers = k, nstart = 25)

  ss <- silhouette(km$cluster, dist(data))

  if (!inherits(ss, "silhouette")) {
    return(NA_real_)
  }

  # Column 3 of a silhouette object holds the per-observation widths.
  mean(ss[, 3])
}

# Candidate cluster counts (silhouette needs at least two clusters).
k_values <- 2:10

# Average silhouette width for every candidate k; vapply() enforces one
# numeric value per k instead of relying on sapply()'s type guessing.
sil_values <- vapply(k_values, avg_sil, numeric(1))

plot(k_values, sil_values,
  type = "b", pch = 19,
  xlab = "Jumlah Cluster",
  ylab = "Silhouette",
  main = "Silhouette Method"
)

Pilih jumlah cluster

# Pick the k with the highest average silhouette width and print it.
(k <- k_values[which.max(sil_values)])
## [1] 3

— 1. K-MEANS —

# K-means with the selected k, restarted 25 times; then show cluster sizes.
km_res <- stats::kmeans(x = df_sample, centers = k, nstart = 25)
table(km_res[["cluster"]])
## 
##   1   2   3 
## 215  65 220

2. K-Median

# K-medians clustering: kcca() with the "kmedians" family and the selected k.
# The messages that follow are the recorded console output of this call.
kmed_res <- kcca(df_sample, k = k, family = kccaFamily("kmedians"))
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
# Cluster sizes of the k-medians solution.
table(clusters(kmed_res))
## 
##   1   2   3 
## 164 166 170

3. DBSCAN

# DBSCAN.
# Both attached packages `dbscan` and `fpc` export a dbscan() function; the
# bare call only reached fpc's version because fpc was attached last. The
# calls are namespace-qualified so the result no longer depends on the order
# of the library() calls (`MinPts` is the argument name used by fpc).

# k-nearest-neighbour distance plot used to read off a value for eps.
dbscan::kNNdistplot(df_sample, k = 5)
abline(h = 1.5, col = "red", lwd = 2) # chosen eps marked on the curve
title("kNN Distance Plot untuk Penentuan eps DBSCAN")

db_res <- fpc::dbscan(df_sample, eps = 1.5, MinPts = 5)
table(db_res$cluster)
## 
##   0   1   2   3   4   5   6   7   8 
## 197 254   4   4  22   5   5   4   5

4. Mean Shift

# Mean shift on the first two principal components.
# The re-standardized copy is stored in its own object. Overwriting
# `df_sample` here would change the data underneath the k-means, k-medians and
# DBSCAN results fitted earlier, so their later evaluation would be computed
# on different distances than the ones they were fitted on.
df_sample_ms <- scale(df_sample)

# Scores of the sample on PC1 and PC2.
df_pca <- prcomp(df_sample_ms)$x[, 1:2]

ms_res <- meanShiftR::meanShift(
  as.matrix(df_pca),
  bandwidth = c(1.8, 1.8) # one bandwidth per principal component
)

# Cluster label of every observation and the resulting cluster sizes.
ms_cluster <- ms_res$assignment
table(ms_cluster)
## ms_cluster
##   1   2   3   4   5   6 
## 139 124  72  62  98   5

5. Fuzzy C-Means

# Fuzzy c-means with the selected k and fuzzifier m = 2; then tabulate the
# `cluster` component of the result.
fcm_res <- e1071::cmeans(x = df_sample, centers = k, m = 2)
table(fcm_res[["cluster"]])
## 
##   1   2   3 
## 224 234  42

6. EVALUASI CLUSTER

Silhouette

# Pairwise distances: full feature space and the 2-D PCA space.
dist_sample <- dist(df_sample)
dist_pca <- dist(df_pca)

# Mean silhouette width ("sil_width" is the third column of a silhouette
# object) for the three partitions that assign every observation.
sil_km <- mean(silhouette(km_res$cluster, dist_sample)[, "sil_width"])
sil_kmed <- mean(silhouette(clusters(kmed_res), dist_sample)[, "sil_width"])
sil_fcm <- mean(silhouette(fcm_res$cluster, dist_sample)[, "sil_width"])

# DBSCAN: observations labelled 0 are excluded before scoring, and the score
# is only defined when more than one cluster remains.
db_cluster <- db_res$cluster
valid <- db_cluster != 0
sil_db <- if (length(unique(db_cluster[valid])) > 1) {
  d <- as.matrix(dist_sample)
  mean(silhouette(db_cluster[valid], d[valid, valid])[, "sil_width"])
} else {
  NA
}

Mean Shift Silhouette

# Mean silhouette width of the mean-shift partition, measured in the PCA space
# it was fitted on; NA when only a single cluster was found.
sil_ms <- if (length(unique(ms_cluster)) > 1) {
  mean(silhouette(ms_cluster, dist_pca)[, "sil_width"])
} else {
  NA
}

Dunn Index

# Dunn index of the k-means, k-medians and fuzzy c-means partitions, all on
# the full-feature distance matrix.
dunn_km <- fpc::cluster.stats(
  d = dist_sample,
  clustering = km_res$cluster
)[["dunn"]]

dunn_kmed <- fpc::cluster.stats(
  d = dist_sample,
  clustering = clusters(kmed_res)
)[["dunn"]]

dunn_fcm <- fpc::cluster.stats(
  d = dist_sample,
  clustering = fcm_res$cluster
)[["dunn"]]

DBSCAN Dunn

# Dunn index for DBSCAN on the observations not labelled 0; NA unless more
# than one cluster remains.
dunn_db <- if (length(unique(db_cluster[valid])) > 1) {
  cluster.stats(
    dist(df_sample[valid, , drop = FALSE]),
    db_cluster[valid]
  )[["dunn"]]
} else {
  NA
}

Mean Shift Dunn

# Dunn index of the mean-shift partition in PCA space; NA for a single cluster.
dunn_ms <- if (length(unique(ms_cluster)) > 1) {
  cluster.stats(dist_pca, ms_cluster)[["dunn"]]
} else {
  NA
}

8. HASIL AKHIR

# Print the average silhouette width of every clustering method, followed by
# the header of the Dunn index section. Lines starting with "##" are the
# recorded console output of the statement above them.
cat("=== SILHOUETTE SCORE ===\n")
## === SILHOUETTE SCORE ===
cat("K-Means :", sil_km, "\n")
## K-Means : 0.1928314
cat("K-Median :", sil_kmed, "\n")
## K-Median : 0.1469764
cat("DBSCAN :", sil_db, "\n")
## DBSCAN : -0.06534474
cat("Mean Shift :", sil_ms, "\n")
## Mean Shift : 0.1602052
cat("Fuzzy C-Means :", sil_fcm, "\n")
## Fuzzy C-Means : 0.08175724
cat("\n=== DUNN INDEX ===\n")
## 
## === DUNN INDEX ===
# Print the Dunn index of every clustering method. Lines starting with "##"
# are the recorded console output of the statement above them.
cat("K-Means :", dunn_km, "\n")
## K-Means : 0.1258425
cat("K-Median :", dunn_kmed, "\n")
## K-Median : 0.1132191
cat("DBSCAN :", dunn_db, "\n")
## DBSCAN : 0.103836
cat("Mean Shift :", dunn_ms, "\n")
## Mean Shift : 0.006113718
cat("Fuzzy C-Means :", dunn_fcm, "\n")
## Fuzzy C-Means : 0.1025311

9. TAMBAHAN (INSIGHT)

# Attach the k-means label to the sampled observations and report the mean of
# every column within each cluster.
df_result <- as.data.frame(df_sample)
df_result[["Cluster_KMeans"]] <- km_res$cluster
aggregate(
  x = df_result,
  by = list(df_result[["Cluster_KMeans"]]),
  FUN = mean
)
##   Group.1         age annual_income spending_score num_purchases
## 1       1 -0.42437007    -0.3618976     -0.1312588    -0.7238402
## 2       2  0.03094258     0.1435942     -0.7601738    -0.5603930
## 3       3  0.40558317     0.3112471      0.3528725     0.8729600
##   avg_purchase_value membership_years website_visits_per_month
## 1         -0.3617111       -0.6819857               0.07703681
## 2         -0.3191820       -0.4158334              -0.22464580
## 3          0.4477942        0.7893459              -0.00891335
##   cart_abandon_rate    churned Cluster_KMeans
## 1        -0.1677274 -0.3861689              1
## 2         0.4782262  2.5843613              2
## 3         0.0226213 -0.3861689              3
# Echo the average silhouette width of each method once more. Lines starting
# with "##" are the recorded console output of the statement above them.
sil_km
## [1] 0.1928314
sil_kmed
## [1] 0.1469764
sil_db
## [1] -0.06534474
sil_ms
## [1] 0.1602052
sil_fcm
## [1] 0.08175724