library(cluster)
library(factoextra)
library(clusterCrit)
library(dplyr)
library(ggplot2)
library(tidyr)
library(scales)
library(RColorBrewer)

0.1 Input Data

mall = read.csv("C:/Users/Imam/Downloads/data kaggle tpg/Mall_Customers.csv")
mall
mall_data <- mall[, c("Age", "Annual.Income..k..", "Spending.Score..1.100.")]
summary(mall_data)
##       Age        Annual.Income..k.. Spending.Score..1.100.
##  Min.   :18.00   Min.   : 15.00     Min.   : 1.00         
##  1st Qu.:28.75   1st Qu.: 41.50     1st Qu.:34.75         
##  Median :36.00   Median : 61.50     Median :50.00         
##  Mean   :38.85   Mean   : 60.56     Mean   :50.20         
##  3rd Qu.:49.00   3rd Qu.: 78.00     3rd Qu.:73.00         
##  Max.   :70.00   Max.   :137.00     Max.   :99.00
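
The three variables sit on quite different scales (Age 18-70, annual income 15-137 thousand, spending score 1-99), which is why they are standardized before the distance matrix is built in the next section. A quick completeness check can also be run first; a minimal sketch over the three columns selected above:

# Confirm the selected columns are numeric and contain no missing values
str(mall_data)
colSums(is.na(mall_data))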

1 Hierarchical Clustering

1.1 Standardization & Distance Matrix

mall_scaled <- scale(mall_data)
dist_mall <- dist(mall_scaled, method = "euclidean")
head(dist_mall,10)
##  [1] 1.6327138 1.2804744 1.4996118 0.8632819 1.4508077 1.7198871 2.1520375
##  [8] 3.5134279 1.5087491 3.5731991
hc_ward <- hclust(dist_mall, method = "ward.D2")
hc_ward
## 
## Call:
## hclust(d = dist_mall, method = "ward.D2")
## 
## Cluster method   : ward.D2 
## Distance         : euclidean 
## Number of objects: 200
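
As a sanity check on the standardization step above, each column of mall_scaled should now have mean approximately 0 and standard deviation 1; a minimal sketch:

# Column means should be ~0 and standard deviations ~1 after scaling
round(colMeans(mall_scaled), 3)
round(apply(mall_scaled, 2, sd), 3)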

1.2 Dendrogram

fviz_dend(hc_ward, k = 4, rect = TRUE, 
          main = "Dendrogram - Hierarchical Clustering (Ward.D2)")

1.3 Clustering Results

clusters_hc <- cutree(hc_ward, k = 4)
table(clusters_hc)
## clusters_hc
##  1  2  3  4 
## 67 66 39 28
head(data.frame(Observasi = 1:length(clusters_hc),
                Cluster = clusters_hc), 10)
# Helper to compute internal validation metrics for a partition:
# average silhouette width, Calinski-Harabasz index, and Dunn index
eval_cluster <- function(data_scaled, cluster, dist_matrix) {
  sil <- mean(silhouette(cluster, dist_matrix)[, 3])
  ch <- intCriteria(traj = as.matrix(data_scaled),
                    part = cluster, crit = "Calinski_Harabasz")$calinski_harabasz
  dunn <- intCriteria(traj = as.matrix(data_scaled),
                      part = cluster, crit = "Dunn")$dunn
  return(data.frame(Silhouette = sil,
                    Calinski_Harabasz = ch,
                    Dunn = dunn))
}
eval_cluster(mall_scaled, clusters_hc, dist_mall)

1.4 Cluster Characteristics

# Combine the clustering result with the original (already standardized) data
data_clustered <- data.frame(mall_scaled, cluster = factor(clusters_hc))

# Compute the mean of each variable for each cluster
cluster_summary <- data_clustered %>%
  group_by(cluster) %>%
  summarise(across(where(is.numeric), mean, .names = "{.col}"))

cluster_summary
  • Cluster 1: Older than average, low income, and rarely shops.

  • Cluster 2: Younger, low income, and fairly active shoppers.

  • Cluster 3: Young, high income, and very active shoppers.

  • Cluster 4: Older, high income, and rarely shops.
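
If these profiles are to be used downstream, the cluster numbers can be attached to the data as descriptive labels. A minimal sketch, where the label text and its mapping to cluster numbers are assumptions taken from the interpretation above:

# Hypothetical segment labels based on the interpretation above;
# the ordering assumes clusters 1-4 match the descriptions as listed
data_clustered$segment <- factor(clusters_hc, levels = 1:4,
                                 labels = c("Older, low income, low spending",
                                            "Younger, low income, moderate spending",
                                            "Young, high income, high spending",
                                            "Older, high income, low spending"))
table(data_clustered$segment)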

1.5 Clustering Visualization

# Reshape to long format so it can be plotted
cluster_summary_long <- cluster_summary %>%
  pivot_longer(-cluster, names_to = "Variabel", values_to = "Rata_rata")
cluster_summary_long

1.5.1 Heatmap

ggplot(cluster_summary_long, aes(x = Variabel, y = cluster, fill = Rata_rata)) +
  geom_tile(color = "white") +
  scale_fill_gradientn(colors = c("#56B1F7", "white", "#FF6B6B")) +
  labs(title = "Heatmap Karakteristik Cluster Mall Customers",
       x = "Variabel", y = "Cluster") +
  theme_minimal(base_size = 12) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

1.5.2 Barplot

ggplot(cluster_summary_long, aes(x = Variabel, y = Rata_rata, fill = cluster)) +
  geom_bar(stat = "identity", position = "dodge") +
  scale_fill_brewer(palette = "Set2") +
  labs(title = "Bar Plot Karakteristik Cluster Mall Customers",
       x = "Variabel", y = "Rata-rata Nilai") +
  theme_minimal(base_size = 12) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

2 Non-Hierarchical Clustering (K-Means)

2.1 Determining the Optimal Number of Clusters

fviz_nbclust(mall_scaled, kmeans, method = "wss") +
  labs(title = "Elbow Method - Mall Customers")

Based on the Elbow method, the decline in within-cluster sum of squares starts to slow at k = 4, so the optimal number of clusters according to the Elbow method is k = 4. In other words, four clusters are enough to capture the variation in the data without adding unnecessary clusters.
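
As a cross-check on the plot, the same curve can be computed directly by running k-means over a range of k and extracting the total within-cluster sum of squares (the range 1:10 here is an arbitrary choice):

# Total within-cluster sum of squares for k = 1..10
set.seed(123)
wss <- sapply(1:10, function(k) {
  kmeans(mall_scaled, centers = k, nstart = 25)$tot.withinss
})
data.frame(k = 1:10, tot_withinss = wss)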

fviz_nbclust(mall_scaled, kmeans, method = "silhouette") +
  labs(title = "Silhouette Method - Mall Customers")

Based on the silhouette method, the optimal number of clusters is k = 8.
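
The same idea can be checked by hand: compute the average silhouette width for each candidate k and look for the maximum. A minimal sketch, reusing the distance matrix from section 1.1:

# Average silhouette width for k = 2..10 (silhouette is undefined for k = 1)
set.seed(123)
avg_sil <- sapply(2:10, function(k) {
  km <- kmeans(mall_scaled, centers = k, nstart = 25)
  mean(silhouette(km$cluster, dist_mall)[, 3])
})
data.frame(k = 2:10, avg_silhouette = avg_sil)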

2.2 Applying the K-Means Algorithm

set.seed(123)
kmeans_result <- kmeans(mall_scaled, centers = 4, nstart = 25)
kmeans_result
## K-means clustering with 4 clusters of sizes 38, 65, 57, 40
## 
## Cluster means:
##           Age Annual.Income..k.. Spending.Score..1.100.
## 1  0.03711223          0.9876366             -1.1857814
## 2  1.08344244         -0.4893373             -0.3961802
## 3 -0.96008279         -0.7827991              0.3910484
## 4 -0.42773261          0.9724070              1.2130414
## 
## Clustering vector:
##   [1] 3 3 3 3 3 3 2 3 2 3 2 3 2 3 2 3 3 3 2 3 3 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2
##  [38] 3 2 3 2 3 2 3 2 3 2 3 3 3 2 3 3 2 2 2 2 2 3 2 2 3 2 2 2 3 2 2 3 3 2 2 2 2
##  [75] 2 3 2 2 3 2 2 3 2 2 3 2 2 3 3 2 2 3 2 2 3 3 2 3 2 3 3 2 2 3 2 3 2 2 2 2 2
## [112] 3 1 3 3 3 2 2 2 2 3 1 4 4 1 4 1 4 2 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4
## [149] 1 4 1 4 1 4 1 4 1 4 1 4 2 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1
## [186] 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4
## 
## Within cluster sum of squares by cluster:
## [1] 44.01863 74.83280 61.43215 23.91544
##  (between_SS / total_SS =  65.8 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

2.3 Clustering Results

table(kmeans_result$cluster)
## 
##  1  2  3  4 
## 38 65 57 40
head(data.frame(Observasi = 1:length(kmeans_result$cluster),
                Cluster = kmeans_result$cluster), 10)

2.4 Clustering Evaluation

cluster_km <- kmeans_result$cluster
dist_matrix <- dist(mall_scaled, method = "euclidean")

sil <- mean(silhouette(cluster_km, dist_matrix)[, 3])
ch <- intCriteria(traj = as.matrix(mall_scaled),
                  part = cluster_km,
                  crit = "Calinski_Harabasz")$calinski_harabasz
dunn <- intCriteria(traj = as.matrix(mall_scaled),
                    part = cluster_km,
                    crit = "Dunn")$dunn

eval_cluster_kmeans <- data.frame(Silhouette = sil,
                                  Calinski_Harabasz = ch,
                                  Dunn = dunn)
eval_cluster_kmeans
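
Since eval_cluster() was already defined in section 1.3, the same three metrics can also be obtained in a single call:

# Equivalent to the block above, reusing the helper from section 1.3
eval_cluster(mall_scaled, cluster_km, dist_matrix)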

2.5 Cluster Characteristics

data_kmeans <- data.frame(mall_data, cluster = factor(cluster_km))

kmeans_summary <- data_kmeans %>%
  group_by(cluster) %>%
  summarise(across(where(is.numeric), mean, .names = "{.col}"))

kmeans_summary

2.6 Clustering Visualization

fviz_cluster(kmeans_result, data = mall_scaled,
             palette = "jco", geom = "point",
             main = "K-Means Clustering Results (4 Clusters)")

heat_data <- kmeans_summary %>%
  pivot_longer(-cluster, names_to = "Variabel", values_to = "Rata_Rata")
heat_data

2.6.1 Heatmap

ggplot(heat_data, aes(x = Variabel, y = cluster, fill = Rata_Rata)) +
  geom_tile(color = "white") +
  scale_fill_gradientn(colors = c("#56B1F7", "white", "#FF6B6B")) +
  labs(title = "Heatmap Karakteristik Cluster",
       x = "Variabel", y = "Cluster") +
  theme_minimal(base_size = 12) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

2.6.2 Barplot

ggplot(heat_data, aes(x = Variabel, y = Rata_Rata, fill = cluster)) +
  geom_bar(stat = "identity", position = "dodge") +
  scale_fill_brewer(palette = "Set2") +
  labs(title = "Barplot of Cluster Characteristics",
       x = "Variable", y = "Mean Value") +
  theme_minimal(base_size = 12) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

  • Cluster 1 → middle-aged to older, the highest income, the lowest spending.

  • Cluster 2 → the oldest, moderate income, low-to-moderate spending.

  • Cluster 3 → the youngest, the lowest income, moderate-to-high spending.

  • Cluster 4 → young to middle-aged, high income, the highest spending.

3 Comparison of Clustering Results

comparison <- data.frame(
  Observasi = 1:nrow(mall_data),   
  Hierarchical = clusters_hc,
  KMeans = kmeans_result$cluster
)

head(comparison, 10)
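
Beyond inspecting rows one by one, a cross-tabulation shows how the two partitions overlap, and an external index such as the Rand index (available in clusterCrit) summarizes their agreement in a single number; a minimal sketch:

# How the hierarchical and k-means labels co-occur
table(Hierarchical = comparison$Hierarchical, KMeans = comparison$KMeans)

# Agreement between the two partitions (values near 1 mean high agreement)
extCriteria(as.integer(clusters_hc),
            as.integer(kmeans_result$cluster),
            crit = "Rand")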