1. LOAD LIBRARY
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## Warning: package 'readr' was built under R version 4.5.3
## Warning: package 'forcats' was built under R version 4.5.3
## Warning: package 'lubridate' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(flexclust)
## Warning: package 'flexclust' was built under R version 4.5.3
library(dbscan)
## Warning: package 'dbscan' was built under R version 4.5.3
##
## Attaching package: 'dbscan'
##
## The following object is masked from 'package:stats':
##
## as.dendrogram
library(meanShiftR)
library(e1071)
## Warning: package 'e1071' was built under R version 4.5.3
##
## Attaching package: 'e1071'
##
## The following object is masked from 'package:flexclust':
##
## bclust
##
## The following object is masked from 'package:ggplot2':
##
## element
library(cluster)
## Warning: package 'cluster' was built under R version 4.5.3
library(fpc)
## Warning: package 'fpc' was built under R version 4.5.3
##
## Attaching package: 'fpc'
##
## The following object is masked from 'package:dbscan':
##
## dbscan
library(psych)
## Warning: package 'psych' was built under R version 4.5.3
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
2. LOAD DATA
# Read the raw customer transaction table from the working directory and
# preview its first six rows.
df <- read.csv(file = "Customer_Transactions.csv")
head(df, n = 6L)
## customer_id age gender country annual_income spending_score num_purchases
## 1 1 37 Male Germany 85886 14 18
## 2 2 40 Male India 41041 4 10
## 3 3 69 Female Australia 143869 59 39
## 4 4 30 Male UK 87261 45 34
## 5 5 69 Female UK 110678 40 38
## 6 6 28 Male Australia 90330 2 7
## avg_purchase_value membership_years website_visits_per_month
## 1 41.20 6 20
## 2 31.73 4 29
## 3 65.96 12 26
## 4 51.87 12 7
## 5 59.64 13 16
## 6 35.63 1 19
## cart_abandon_rate churned feedback_text
## 1 0.95 0 Very satisfied with my purchase.
## 2 0.21 0 Good quality and value for money.
## 3 0.08 0 Excellent customer service.
## 4 0.61 0 Good quality and value for money.
## 5 0.49 0 Excellent customer service.
## 6 0.81 1 The product did not match the description.
## last_purchase_date
## 1 2025-06-22
## 2 2025-10-17
## 3 2025-07-01
## 4 2025-08-17
## 5 2025-06-21
## 6 2024-10-25
# Per-column descriptive statistics (n, mean, sd, quantile-type summaries,
# skew, kurtosis, se) via psych.
psych::describe(df)
## vars n mean sd median trimmed
## customer_id 1 10000 5000.50 2886.90 5000.50 5000.50
## age 2 10000 44.05 15.40 44.00 44.05
## gender* 3 10000 1.51 0.50 2.00 1.51
## country* 4 10000 6.16 3.06 6.00 6.28
## annual_income 5 10000 86067.68 38986.79 78339.50 83870.92
## spending_score 6 10000 50.92 28.75 51.00 51.04
## num_purchases 7 10000 22.58 10.16 22.00 22.27
## avg_purchase_value 8 10000 47.45 11.21 46.99 47.25
## membership_years 9 10000 6.34 4.68 6.00 6.13
## website_visits_per_month 10 10000 15.58 8.66 16.00 15.59
## cart_abandon_rate 11 10000 0.50 0.29 0.51 0.50
## churned 12 10000 0.11 0.31 0.00 0.01
## feedback_text* 13 10000 6.90 3.42 7.00 6.82
## last_purchase_date* 14 10000 529.12 122.20 554.00 553.83
## mad min max range skew kurtosis
## customer_id 3706.50 1.00 10000.00 9999.00 0.00 -1.20
## age 19.27 18.00 70.00 52.00 0.00 -1.21
## gender* 0.00 1.00 2.00 1.00 -0.03 -2.00
## country* 4.45 1.00 10.00 9.00 -0.14 -1.31
## annual_income 40833.03 20028.00 179960.00 159932.00 0.47 -0.70
## spending_score 35.58 1.00 100.00 99.00 -0.03 -1.17
## num_purchases 11.86 1.00 49.00 48.00 0.24 -0.90
## avg_purchase_value 11.55 16.75 83.27 66.52 0.16 -0.32
## membership_years 5.93 0.00 15.00 15.00 0.29 -1.16
## website_visits_per_month 11.86 1.00 30.00 29.00 -0.01 -1.21
## cart_abandon_rate 0.37 0.00 1.00 1.00 -0.01 -1.19
## churned 0.00 0.00 1.00 1.00 2.51 4.30
## feedback_text* 4.45 1.00 12.00 11.00 0.29 -1.37
## last_purchase_date* 74.13 1.00 654.00 653.00 -2.32 5.76
## se
## customer_id 28.87
## age 0.15
## gender* 0.00
## country* 0.03
## annual_income 389.87
## spending_score 0.29
## num_purchases 0.10
## avg_purchase_value 0.11
## membership_years 0.05
## website_visits_per_month 0.09
## cart_abandon_rate 0.00
## churned 0.00
## feedback_text* 0.03
## last_purchase_date* 1.22
3. PREPROCESSING
# Keep only the numeric columns used as clustering features; the id,
# gender, country, feedback text and purchase date columns are dropped.
df <- dplyr::select(
  df,
  dplyr::all_of(c(
    "age",
    "annual_income",
    "spending_score",
    "num_purchases",
    "avg_purchase_value",
    "membership_years",
    "website_visits_per_month",
    "cart_abandon_rate",
    "churned"
  ))
)
Cek missing value
# Number of missing values in each feature column.
vapply(df, function(column) sum(is.na(column)), numeric(1))
## age annual_income spending_score
## 0 0 0
## num_purchases avg_purchase_value membership_years
## 0 0 0
## website_visits_per_month cart_abandon_rate churned
## 0 0 0
Scaling (WAJIB)
# Standardise every feature (centre on the column mean, divide by the column
# standard deviation) so that no variable dominates the Euclidean distances.
df_scaled <- scale(df, center = TRUE, scale = TRUE)
head(df_scaled, n = 6L)
## age annual_income spending_score num_purchases avg_purchase_value
## [1,] -0.4573808 -0.00465994 -1.2841684 -0.4502324 -0.5575169
## [2,] -0.2626347 -1.15492141 -1.6319534 -1.2373521 -1.4026073
## [3,] 1.6199115 1.48258748 0.2808642 1.6159567 1.6520330
## [4,] -0.9117885 0.03060842 -0.2060348 1.1240069 0.3946599
## [5,] 1.6199115 0.63124779 -0.3799273 1.5175667 1.0880445
## [6,] -1.0416193 0.10932739 -1.7015104 -1.5325220 -1.0545764
## membership_years website_visits_per_month cart_abandon_rate churned
## [1,] -0.0727462 0.51088801 1.56460082 -0.3497459
## [2,] -0.5000366 1.55071072 -1.01526969 -0.3497459
## [3,] 1.2091251 1.20410315 -1.46849019 -0.3497459
## [4,] 1.2091251 -0.99107814 0.37925491 -0.3497459
## [5,] 1.4227703 0.04874458 -0.03910247 -0.3497459
## [6,] -1.1409723 0.39535215 1.07651721 2.8589324
# Draw a reproducible random subsample of the scaled data for clustering.
# The sample size is capped at the number of available rows so the draw does
# not fail on a smaller table, and drop = FALSE keeps the result a matrix
# even if only one row is selected.
set.seed(123)
sample_size <- min(500L, nrow(df_scaled))
sample_rows <- sample.int(nrow(df_scaled), sample_size)
df_sample <- df_scaled[sample_rows, , drop = FALSE]
4. MENENTUKAN JUMLAH CLUSTER (K)
Elbow Method
# Elbow method: total within-cluster sum of squares of k-means for
# 1 to 10 clusters on the sampled data, plotted against the cluster count.
wss <- vapply(
  1:10,
  function(n_clusters) {
    fit <- kmeans(df_sample, centers = n_clusters, nstart = 20)
    fit$tot.withinss
  },
  numeric(1)
)
plot(
  x = seq_along(wss), y = wss,
  type = "b", pch = 19,
  xlab = "Jumlah Cluster",
  ylab = "WSS",
  main = "Elbow Method"
)

Silhouette Method
#' Average silhouette width of a k-means partition
#'
#' Fits k-means with 25 random starts and returns the mean silhouette width
#' of the resulting clustering.
#'
#' @param k Number of clusters, passed to `kmeans()` as `centers`.
#' @param data Numeric matrix to cluster. Defaults to the global `df_sample`,
#'   which keeps the original one-argument call `avg_sil(k)` working.
#' @param dist_mat Dissimilarities between the rows of `data`. Defaults to
#'   `dist(data)`; pass a precomputed object to avoid rebuilding the distance
#'   matrix for every value of `k`.
#' @return A single numeric value: the mean of the silhouette widths.
avg_sil <- function(k, data = df_sample, dist_mat = dist(data)) {
  km <- kmeans(data, centers = k, nstart = 25)
  ss <- cluster::silhouette(km$cluster, dist_mat)
  mean(ss[, "sil_width"])
}
# Evaluate the average silhouette width for every candidate cluster count
# and plot the resulting curve.
k_values <- 2:10
sil_values <- vapply(k_values, avg_sil, numeric(1))
plot(
  x = k_values, y = sil_values,
  type = "b", pch = 19,
  xlab = "Jumlah Cluster",
  ylab = "Silhouette",
  main = "Silhouette Method"
)

Pilih jumlah cluster
# Use the candidate with the highest average silhouette width as the number
# of clusters for the methods below, and show the chosen value.
k <- k_values[which.max(sil_values)]
print(k)
## [1] 3
— 1. K-MEANS —
# Fit k-means with the selected k (25 random starts) and tabulate how many
# sampled observations fall in each cluster.
km_res <- kmeans(x = df_sample, centers = k, nstart = 25)
table(km_res[["cluster"]])
##
## 1 2 3
## 215 65 220
3. DBSCAN
# DBSCAN on the sampled data.
# Both the dbscan and fpc packages export a function called dbscan(); which
# one a bare call reaches depends on the order of the library() calls. The
# MinPts spelling used here belongs to fpc, so the namespaces are written out
# to make the call independent of attach order.
db_eps <- 1.5  # single source for the reference line and the fit

# k-nearest-neighbour distance plot; the red line marks the chosen eps.
# NOTE(review): k is kept at 5 as in the original; the dbscan documentation
# suggests k = minPts - 1 for this plot -- confirm which is intended.
dbscan::kNNdistplot(df_sample, k = 5)
abline(h = db_eps, col = "red", lwd = 2)
title("kNN Distance Plot untuk Penentuan eps DBSCAN")

db_res <- fpc::dbscan(df_sample, eps = db_eps, MinPts = 5)
# Cluster 0 collects the points DBSCAN labels as noise.
table(db_res$cluster)
##
## 0 1 2 3 4 5 6 7 8
## 197 254 4 4 22 5 5 4 5
4. Mean Shift
# Mean shift on the first two principal components of the sampled data.
# The re-standardised copy is stored under its own name: overwriting
# df_sample here would change the matrix that k-means and DBSCAN were fitted
# on, so every later distance computed from df_sample would no longer match
# those fits.
df_sample_std <- scale(df_sample)
df_pca <- prcomp(df_sample_std)$x[, 1:2]
ms_res <- meanShiftR::meanShift(
  as.matrix(df_pca),
  bandwidth = c(1.8, 1.8)  # one bandwidth per principal component
)
ms_cluster <- ms_res$assignment
table(ms_cluster)
## ms_cluster
## 1 2 3 4 5 6
## 139 124 72 62 98 5
5. Fuzzy C-Means
# Fuzzy c-means with the selected k and fuzzifier m = 2; the table counts
# observations by their hard cluster assignment.
fcm_res <- e1071::cmeans(x = df_sample, centers = k, m = 2)
table(fcm_res[["cluster"]])
##
## 1 2 3
## 224 234 42
6. EVALUASI CLUSTER
Silhouette
# Average silhouette width for each clustering method.
# The distance objects are built once and reused below.
dist_sample <- dist(df_sample)
dist_pca <- dist(df_pca)

sil_km <- mean(silhouette(km_res$cluster, dist_sample)[, 3])
# NOTE(review): kmed_res is not created anywhere in the visible code;
# presumably it is the K-Median fit from an earlier step -- confirm it is
# defined before this line runs.
sil_kmed <- mean(silhouette(clusters(kmed_res), dist_sample)[, 3])
sil_fcm <- mean(silhouette(fcm_res$cluster, dist_sample)[, 3])

# DBSCAN: label 0 marks noise points, which are excluded before scoring.
db_cluster <- db_res$cluster
valid <- db_cluster != 0
if (length(unique(db_cluster[valid])) > 1) {
  # Reuse dist_sample instead of recomputing the pairwise distances.
  d <- as.matrix(dist_sample)
  sil_db <- mean(
    silhouette(db_cluster[valid], d[valid, valid])[, 3]
  )
} else {
  # Silhouette is undefined with fewer than two clusters.
  sil_db <- NA
}
Mean Shift Silhouette
# Average silhouette width for mean shift, measured in the PCA space the
# method was fitted on. Reuses the precomputed dist_pca rather than
# rebuilding the distance matrix; NA when fewer than two clusters exist.
if (length(unique(ms_cluster)) > 1) {
  sil_ms <- mean(silhouette(ms_cluster, dist_pca)[, 3])
} else {
  sil_ms <- NA
}
Dunn Index
# Dunn index of the k-means, K-Median and fuzzy c-means partitions, all
# evaluated on the same distance object.
dunn_km <- fpc::cluster.stats(
  d = dist_sample,
  clustering = km_res$cluster
)$dunn
dunn_kmed <- fpc::cluster.stats(
  d = dist_sample,
  clustering = clusters(kmed_res)
)$dunn
dunn_fcm <- fpc::cluster.stats(
  d = dist_sample,
  clustering = fcm_res$cluster
)$dunn
DBSCAN Dunn
# Dunn index for DBSCAN on the non-noise points only; NA when fewer than two
# clusters remain.
dunn_db <- if (length(unique(db_cluster[valid])) > 1) {
  cluster.stats(
    dist(df_sample[valid, ]),
    db_cluster[valid]
  )$dunn
} else {
  NA
}
Mean Shift Dunn
# Dunn index for mean shift in PCA space; NA when only one cluster was found.
dunn_ms <- if (length(unique(ms_cluster)) > 1) {
  cluster.stats(dist_pca, ms_cluster)$dunn
} else {
  NA
}
8. HASIL AKHIR
# Print the silhouette score and Dunn index of every method, one line each.
# The `##` lines are the rendered console output of the statement above them.
cat("=== SILHOUETTE SCORE ===\n")
## === SILHOUETTE SCORE ===
cat("K-Means :", sil_km, "\n")
## K-Means : 0.1928314
cat("K-Median :", sil_kmed, "\n")
## K-Median : 0.1469764
cat("DBSCAN :", sil_db, "\n")
## DBSCAN : -0.06534474
cat("Mean Shift :", sil_ms, "\n")
## Mean Shift : 0.1602052
cat("Fuzzy C-Means :", sil_fcm, "\n")
## Fuzzy C-Means : 0.08175724
cat("\n=== DUNN INDEX ===\n")
##
## === DUNN INDEX ===
cat("K-Means :", dunn_km, "\n")
## K-Means : 0.1258425
cat("K-Median :", dunn_kmed, "\n")
## K-Median : 0.1132191
cat("DBSCAN :", dunn_db, "\n")
## DBSCAN : 0.103836
cat("Mean Shift :", dunn_ms, "\n")
## Mean Shift : 0.006113718
cat("Fuzzy C-Means :", dunn_fcm, "\n")
## Fuzzy C-Means : 0.1025311
9. TAMBAHAN (INSIGHT)
# Profile the k-means clusters: attach the cluster label to the sampled
# data and average every column within each cluster.
df_result <- as.data.frame(df_sample)
df_result[["Cluster_KMeans"]] <- km_res$cluster
aggregate(
  x = df_result,
  by = list(df_result[["Cluster_KMeans"]]),
  FUN = mean
)
## Group.1 age annual_income spending_score num_purchases
## 1 1 -0.42437007 -0.3618976 -0.1312588 -0.7238402
## 2 2 0.03094258 0.1435942 -0.7601738 -0.5603930
## 3 3 0.40558317 0.3112471 0.3528725 0.8729600
## avg_purchase_value membership_years website_visits_per_month
## 1 -0.3617111 -0.6819857 0.07703681
## 2 -0.3191820 -0.4158334 -0.22464580
## 3 0.4477942 0.7893459 -0.00891335
## cart_abandon_rate churned Cluster_KMeans
## 1 -0.1677274 -0.3861689 1
## 2 0.4782262 2.5843613 2
## 3 0.0226213 -0.3861689 3
# Echo each method's average silhouette score once more; the `##` lines are
# the rendered console output of the expression above them.
sil_km
## [1] 0.1928314
sil_kmed
## [1] 0.1469764
sil_db
## [1] -0.06534474
sil_ms
## [1] 0.1602052
sil_fcm
## [1] 0.08175724