1. Load Library
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(flexclust)
## Warning: package 'flexclust' was built under R version 4.5.3
library(dbscan)
## Warning: package 'dbscan' was built under R version 4.5.3
##
## Attaching package: 'dbscan'
##
## The following object is masked from 'package:stats':
##
## as.dendrogram
library(meanShiftR)
library(e1071)
## Warning: package 'e1071' was built under R version 4.5.3
##
## Attaching package: 'e1071'
##
## The following object is masked from 'package:flexclust':
##
## bclust
##
## The following object is masked from 'package:ggplot2':
##
## element
library(cluster)
## Warning: package 'cluster' was built under R version 4.5.3
library(fpc)
## Warning: package 'fpc' was built under R version 4.5.3
##
## Attaching package: 'fpc'
##
## The following object is masked from 'package:dbscan':
##
## dbscan
library(psych)
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
2. Import Dataset
# Load the customer transaction table from the working directory and show
# its first rows.
df <- read.csv(file = "Customer_Transactions.csv")
head(x = df)
## customer_id age gender country annual_income spending_score num_purchases
## 1 1 37 Male Germany 85886 14 18
## 2 2 40 Male India 41041 4 10
## 3 3 69 Female Australia 143869 59 39
## 4 4 30 Male UK 87261 45 34
## 5 5 69 Female UK 110678 40 38
## 6 6 28 Male Australia 90330 2 7
## avg_purchase_value membership_years website_visits_per_month
## 1 41.20 6 20
## 2 31.73 4 29
## 3 65.96 12 26
## 4 51.87 12 7
## 5 59.64 13 16
## 6 35.63 1 19
## cart_abandon_rate churned feedback_text
## 1 0.95 0 Very satisfied with my purchase.
## 2 0.21 0 Good quality and value for money.
## 3 0.08 0 Excellent customer service.
## 4 0.61 0 Good quality and value for money.
## 5 0.49 0 Excellent customer service.
## 6 0.81 1 The product did not match the description.
## last_purchase_date
## 1 2025-06-22
## 2 2025-10-17
## 3 2025-07-01
## 4 2025-08-17
## 5 2025-06-21
## 6 2024-10-25
3. Deskripsi Data
# Descriptive statistics for every column. In the printed table, names that
# carry a trailing `*` belong to the non-numeric columns.
psych::describe(df)
## vars n mean sd median trimmed
## customer_id 1 10000 5000.50 2886.90 5000.50 5000.50
## age 2 10000 44.05 15.40 44.00 44.05
## gender* 3 10000 1.51 0.50 2.00 1.51
## country* 4 10000 6.16 3.06 6.00 6.28
## annual_income 5 10000 86067.68 38986.79 78339.50 83870.92
## spending_score 6 10000 50.92 28.75 51.00 51.04
## num_purchases 7 10000 22.58 10.16 22.00 22.27
## avg_purchase_value 8 10000 47.45 11.21 46.99 47.25
## membership_years 9 10000 6.34 4.68 6.00 6.13
## website_visits_per_month 10 10000 15.58 8.66 16.00 15.59
## cart_abandon_rate 11 10000 0.50 0.29 0.51 0.50
## churned 12 10000 0.11 0.31 0.00 0.01
## feedback_text* 13 10000 6.90 3.42 7.00 6.82
## last_purchase_date* 14 10000 529.12 122.20 554.00 553.83
## mad min max range skew kurtosis
## customer_id 3706.50 1.00 10000.00 9999.00 0.00 -1.20
## age 19.27 18.00 70.00 52.00 0.00 -1.21
## gender* 0.00 1.00 2.00 1.00 -0.03 -2.00
## country* 4.45 1.00 10.00 9.00 -0.14 -1.31
## annual_income 40833.03 20028.00 179960.00 159932.00 0.47 -0.70
## spending_score 35.58 1.00 100.00 99.00 -0.03 -1.17
## num_purchases 11.86 1.00 49.00 48.00 0.24 -0.90
## avg_purchase_value 11.55 16.75 83.27 66.52 0.16 -0.32
## membership_years 5.93 0.00 15.00 15.00 0.29 -1.16
## website_visits_per_month 11.86 1.00 30.00 29.00 -0.01 -1.21
## cart_abandon_rate 0.37 0.00 1.00 1.00 -0.01 -1.19
## churned 0.00 0.00 1.00 1.00 2.51 4.30
## feedback_text* 4.45 1.00 12.00 11.00 0.29 -1.37
## last_purchase_date* 74.13 1.00 654.00 653.00 -2.32 5.76
## se
## customer_id 28.87
## age 0.15
## gender* 0.00
## country* 0.03
## annual_income 389.87
## spending_score 0.29
## num_purchases 0.10
## avg_purchase_value 0.11
## membership_years 0.05
## website_visits_per_month 0.09
## cart_abandon_rate 0.00
## churned 0.00
## feedback_text* 0.03
## last_purchase_date* 1.22
4. Seleksi Variabel
# Keep only the numeric columns used by the rest of the analysis; the id,
# categorical, free-text and date columns are dropped.
# NOTE(review): `churned` looks like a binary outcome flag -- confirm it is
# meant to be a clustering feature rather than a label.
df <- dplyr::select(
  df,
  dplyr::all_of(c(
    "age",
    "annual_income",
    "spending_score",
    "num_purchases",
    "avg_purchase_value",
    "membership_years",
    "website_visits_per_month",
    "cart_abandon_rate",
    "churned"
  ))
)
5. Cek Missing Value
# Number of missing values in each column.
vapply(df, function(column) sum(is.na(column)), numeric(1))
## age annual_income spending_score
## 0 0 0
## num_purchases avg_purchase_value membership_years
## 0 0 0
## website_visits_per_month cart_abandon_rate churned
## 0 0 0
6. Tabel Missing Value
library(tidyverse)
# Long-format table: one row per column with its count of missing values.
missing_table <- tibble(
  variable = names(df),
  missing_values = unname(
    vapply(df, function(column) sum(is.na(column)), integer(1))
  )
)
missing_table
## # A tibble: 9 × 2
## variable missing_values
## <chr> <int>
## 1 age 0
## 2 annual_income 0
## 3 spending_score 0
## 4 num_purchases 0
## 5 avg_purchase_value 0
## 6 membership_years 0
## 7 website_visits_per_month 0
## 8 cart_abandon_rate 0
## 9 churned 0
8. Normalisasi Data
# Standardise every column: subtract the column mean and divide by the
# column standard deviation.
df_scaled <- scale(df, center = TRUE, scale = TRUE)
# Summary of the scaled data
summary(df_scaled)
## age annual_income spending_score num_purchases
## Min. :-1.690773 Min. :-1.6939 Min. :-1.736289 Min. :-2.12286
## 1st Qu.:-0.846873 1st Qu.:-0.7880 1st Qu.:-0.866826 1st Qu.:-0.84379
## Median :-0.002973 Median :-0.1982 Median : 0.002636 Median :-0.05667
## Mean : 0.000000 Mean : 0.0000 Mean : 0.000000 Mean : 0.00000
## 3rd Qu.: 0.840927 3rd Qu.: 0.7567 3rd Qu.: 0.837320 3rd Qu.: 0.82884
## Max. : 1.684827 Max. : 2.4083 Max. : 1.706783 Max. : 2.59986
## avg_purchase_value membership_years website_visits_per_month
## Min. :-2.73940 Min. :-1.35462 Min. :-1.68429
## 1st Qu.:-0.70922 1st Qu.:-0.92733 1st Qu.:-0.87554
## Median :-0.04082 Median :-0.07275 Median : 0.04874
## Mean : 0.00000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.68112 3rd Qu.: 0.78183 3rd Qu.: 0.85750
## Max. : 3.19675 Max. : 1.85006 Max. : 1.66625
## cart_abandon_rate churned
## Min. :-1.74740 Min. :-0.3497
## 1st Qu.:-0.87582 1st Qu.:-0.3497
## Median : 0.03062 Median :-0.3497
## Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.86734 3rd Qu.:-0.3497
## Max. : 1.73892 Max. : 2.8589
9. Sampling Data
# Draw a reproducible random subset of 500 rows from the scaled data.
# seq_len() is used instead of 1:nrow() so an empty input cannot produce the
# index vector c(1, 0), and drop = FALSE guarantees the subset stays a matrix.
set.seed(123)
df_sample <- df_scaled[sample(seq_len(nrow(df_scaled)), 500), , drop = FALSE]
cat("Jumlah data:", nrow(df_sample), "\n")
## Jumlah data: 500
cat("Jumlah variabel:", ncol(df_sample))
## Jumlah variabel: 9
10. Penentuan Jumlah Cluster (Elbow Method)
# Elbow method: total within-cluster sum of squares of k-means for
# k = 1..10. vapply() pins the result to one numeric value per k, so an
# unexpected return shape fails loudly instead of silently changing type.
wss <- vapply(
  1:10,
  function(k) kmeans(df_sample, centers = k, nstart = 20)$tot.withinss,
  numeric(1)
)
plot(
  1:10, wss,
  type = "b", pch = 19,
  xlab = "Jumlah Cluster",
  ylab = "WSS",
  main = "Elbow Method"
)

11. Penentuan Jumlah Cluster (Silhouette Method)
#' Average silhouette width of a k-means partition
#'
#' @param k Number of clusters passed to `kmeans()`.
#' @param data Numeric matrix or data frame to cluster. Defaults to the
#'   global `df_sample`, so existing `avg_sil(k)` calls keep working.
#' @param d Dissimilarities between the rows of `data`. Defaults to
#'   `dist(data)`; pass a precomputed object to avoid recomputing it on
#'   every call.
#' @return Mean silhouette width as a numeric scalar, or `NA_real_` when
#'   `silhouette()` does not return a silhouette matrix (it returns `NA`
#'   when there is only one cluster).
avg_sil <- function(k, data = df_sample, d = dist(data)) {
  km <- kmeans(data, centers = k, nstart = 25)
  ss <- cluster::silhouette(km$cluster, d)
  # silhouette() yields a bare NA instead of a matrix for degenerate
  # partitions; indexing that by column would raise an error.
  if (!is.matrix(ss)) {
    return(NA_real_)
  }
  mean(ss[, "sil_width"])
}
# Average silhouette width for each candidate number of clusters.
# vapply() enforces exactly one numeric value per k.
k_values <- 2:10
sil_values <- vapply(k_values, avg_sil, numeric(1))
plot(
  k_values, sil_values,
  type = "b", pch = 19,
  xlab = "Jumlah Cluster",
  ylab = "Silhouette",
  main = "Silhouette Method"
)

12. Menentukan Nilai K Optimal
# Select the candidate k with the highest average silhouette width (the
# first one if several tie) and print it.
k <- local({
  best <- which.max(sil_values)
  k_values[best]
})
k
## [1] 3
13. Clustering K-Means
# Final k-means partition of the sample using the selected k.
km_res <- kmeans(df_sample, centers = k, nstart = 25)
# K-Median partition with the same k. `kmed_res` is read by the silhouette
# and Dunn evaluations further down (through clusters()), but no statement in
# this file created it, so those steps failed on a fresh run.
kmed_res <- flexclust::kcca(
  df_sample,
  k = k,
  family = flexclust::kccaFamily("kmedians")
)
15. Penentuan Parameter DBSCAN (kNN Distance Plot)
# kNN distance plot used to choose eps for DBSCAN; the red horizontal line
# marks the candidate value eps = 1.5.
dbscan::kNNdistplot(df_sample, k = 5)
abline(h = 1.5, lwd = 2, col = "red")
title(main = "kNN Distance Plot untuk Penentuan eps DBSCAN")

16. Clustering DBSCAN
# DBSCAN on the sample. Both the dbscan and fpc packages export a function
# named dbscan(), so an unqualified call depends on which package was attached
# last. The call is pinned to fpc::dbscan(), the version that takes `MinPts`.
db_res <- fpc::dbscan(df_sample, eps = 1.5, MinPts = 5)
# Cluster sizes; label 0 is treated as noise by the evaluation steps below.
table(db_res$cluster)
##
## 0 1 2 3 4 5 6 7 8
## 197 254 4 4 22 5 5 4 5
17. PCA & Clustering untuk Mean Shift
# Project the sample onto its first two principal components, run mean shift
# on that 2-D representation, and tabulate the resulting cluster sizes.
df_pca <- prcomp(df_sample)$x[, c("PC1", "PC2")]
ms_res <- meanShiftR::meanShift(
  df_pca,
  bandwidth = rep(1.8, times = 2)
)
ms_cluster <- ms_res[["assignment"]]
table(ms_cluster)
## ms_cluster
## 1 2 3 4 5 6 7 8
## 114 120 57 56 58 82 8 5
18. Clustering Fuzzy C-Means
# Fuzzy c-means on the sample with the selected k and fuzzifier m = 2.
fcm_res <- e1071::cmeans(x = df_sample, centers = k, m = 2)
19. Perhitungan Distance
# Pairwise Euclidean distances: on the full feature space of the sample and
# on its two-component PCA projection.
dist_sample <- stats::dist(df_sample, method = "euclidean")
dist_pca <- stats::dist(df_pca, method = "euclidean")
20. Evaluasi Silhouette Score
# Mean silhouette width of the k-means, K-Median and fuzzy c-means
# partitions, all measured on the full-feature distances.
sil_km <- mean(silhouette(km_res$cluster, dist_sample)[, "sil_width"])
sil_kmed <- mean(silhouette(clusters(kmed_res), dist_sample)[, "sil_width"])
sil_fcm <- mean(silhouette(fcm_res$cluster, dist_sample)[, "sil_width"])
21. Evaluasi Silhouette DBSCAN
# Mean silhouette width for DBSCAN, computed on the non-noise points only
# (label 0 is excluded). Stays NA unless at least two clusters remain.
db_cluster <- db_res$cluster
valid <- db_cluster != 0
sil_db <- NA
if (length(unique(db_cluster[valid])) > 1) {
  d <- as.matrix(dist(df_sample))
  sil_db <- mean(
    silhouette(db_cluster[valid], d[valid, valid])[, "sil_width"]
  )
}
22. Evaluasi Silhouette Mean Shift
# Mean silhouette width for mean shift, measured on the PCA distances it was
# fitted on; NA when only a single cluster was found.
sil_ms <- if (length(unique(ms_cluster)) > 1) {
  mean(silhouette(ms_cluster, dist_pca)[, "sil_width"])
} else {
  NA
}
23. Evaluasi Dunn Index
# Dunn index of the k-means, K-Median and fuzzy c-means partitions on the
# full-feature distances.
dunn_km <- fpc::cluster.stats(
  d = dist_sample,
  clustering = km_res$cluster
)$dunn
dunn_kmed <- fpc::cluster.stats(
  d = dist_sample,
  clustering = clusters(kmed_res)
)$dunn
dunn_fcm <- fpc::cluster.stats(
  d = dist_sample,
  clustering = fcm_res$cluster
)$dunn
24. Evaluasi Dunn DBSCAN
# Dunn index for DBSCAN on the non-noise points only; NA unless at least two
# clusters remain after removing noise.
dunn_db <- if (length(unique(db_cluster[valid])) > 1) {
  fpc::cluster.stats(
    d = stats::dist(df_sample[valid, , drop = FALSE]),
    clustering = db_cluster[valid]
  )$dunn
} else {
  NA
}
25. Evaluasi Dunn Mean Shift
# Dunn index for mean shift on the PCA distances; NA when only a single
# cluster was found.
dunn_ms <- NA
if (length(unique(ms_cluster)) > 1) {
  dunn_ms <- fpc::cluster.stats(d = dist_pca, clustering = ms_cluster)$dunn
}
26. Tabel Perbandingan Evaluasi
# Side-by-side comparison of the five methods: one row per method with its
# mean silhouette width and Dunn index.
evaluation_table <- data.frame(
  Method = c(
    "K-Means",
    "K-Median",
    "DBSCAN",
    "Mean Shift",
    "Fuzzy C-Means"
  ),
  Silhouette = c(sil_km, sil_kmed, sil_db, sil_ms, sil_fcm),
  Dunn_Index = c(dunn_km, dunn_kmed, dunn_db, dunn_ms, dunn_fcm),
  stringsAsFactors = FALSE
)
evaluation_table
## Method Silhouette Dunn_Index
## 1 K-Means 0.19969745 0.12514841
## 2 K-Median 0.14808636 0.11195335
## 3 DBSCAN -0.06687743 0.10178209
## 4 Mean Shift 0.12205498 0.01079951
## 5 Fuzzy C-Means 0.09009458 0.11405901
27. Output Evaluasi
# Plain-text report of the evaluation metrics. Each cat() call prints one
# line: first the silhouette score of every method, then the Dunn index.
cat(" SILHOUETTE SCORE \n")
## SILHOUETTE SCORE
cat("\n","K-Means :", sil_km, "\n")
##
## K-Means : 0.1996974
cat(" K-Median :", sil_kmed, "\n")
## K-Median : 0.1480864
cat(" DBSCAN :", sil_db, "\n")
## DBSCAN : -0.06687743
cat(" Mean Shift :", sil_ms, "\n")
## Mean Shift : 0.122055
cat(" Fuzzy C-Means :", sil_fcm, "\n")
## Fuzzy C-Means : 0.09009458
# Dunn index section of the report.
cat("\n DUNN INDEX \n")
##
## DUNN INDEX
cat("\n","K-Means :", dunn_km, "\n")
##
## K-Means : 0.1251484
cat(" K-Median :", dunn_kmed, "\n")
## K-Median : 0.1119533
cat(" DBSCAN :", dunn_db, "\n")
## DBSCAN : 0.1017821
cat(" Mean Shift :", dunn_ms, "\n")
## Mean Shift : 0.01079951
cat(" Fuzzy C-Means :", dunn_fcm, "\n")
## Fuzzy C-Means : 0.114059
28. Interpretasi Hasil K-Means
# Attach the k-means labels to the sampled rows and report the per-cluster
# mean of every column. The values are on the standardised scale, because
# the sample was drawn from the scaled data.
df_result <- as.data.frame(df_sample)
df_result[["Cluster_KMeans"]] <- km_res$cluster
aggregate(
  x = df_result,
  by = list(df_result[["Cluster_KMeans"]]),
  FUN = mean
)
## Group.1 age annual_income spending_score num_purchases
## 1 1 -0.41148714 -0.3353013 -0.1843589 -0.7893346
## 2 2 0.05495137 0.1761706 -0.8293726 -0.6182522
## 3 3 0.43874655 0.3458068 0.3121649 0.8820570
## avg_purchase_value membership_years website_visits_per_month
## 1 -0.3581701 -0.7206377 0.04874458
## 2 -0.3146092 -0.4474470 -0.24987118
## 3 0.4709752 0.7896036 -0.03633183
## cart_abandon_rate churned Cluster_KMeans
## 1 -0.06537142 -0.3497459 1
## 2 0.59004267 2.8589324 2
## 3 0.12776508 -0.3497459 3