## Warning: package 'factoextra' was built under R version 4.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
## Warning: package 'clusterCrit' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Warning: package 'scales' was built under R version 4.4.3
## Age Annual.Income..k.. Spending.Score..1.100.
## Min. :18.00 Min. : 15.00 Min. : 1.00
## 1st Qu.:28.75 1st Qu.: 41.50 1st Qu.:34.75
## Median :36.00 Median : 61.50 Median :50.00
## Mean :38.85 Mean : 60.56 Mean :50.20
## 3rd Qu.:49.00 3rd Qu.: 78.00 3rd Qu.:73.00
## Max. :70.00 Max. :137.00 Max. :99.00
## [1] 1.6327138 1.2804744 1.4996118 0.8632819 1.4508077 1.7198871 2.1520375
## [8] 3.5134279 1.5087491 3.5731991
##
## Call:
## hclust(d = dist_mall, method = "ward.D2")
##
## Cluster method : ward.D2
## Distance : euclidean
## Number of objects: 200
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## clusters_hc
## 1 2 3 4
## 67 66 39 28
eval_cluster <- function(data_scaled, cluster, dist_matrix) {
sil <- mean(silhouette(cluster, dist_matrix)[, 3])
ch <- intCriteria(traj = as.matrix(data_scaled),
part = cluster, crit = "Calinski_Harabasz")$calinski_harabasz
dunn <- intCriteria(traj = as.matrix(data_scaled),
part = cluster, crit = "Dunn")$dunn
return(data.frame(Silhouette = sil,
Calinski_Harabasz = ch,
Dunn = dunn))
}
eval_cluster(mall_scaled, clusters_hc, dist_mall)# Gabungkan data hasil clustering dengan data asli (yang sudah distandardisasi)
data_clustered <- data.frame(mall_scaled, cluster = factor(clusters_hc))
# Hitung rata-rata setiap variabel untuk masing-masing cluster
cluster_summary <- data_clustered %>%
group_by(cluster) %>%
summarise(across(where(is.numeric), mean, .names = "{.col}"))
cluster_summaryCluster 1: Usia lebih tua dari rata-rata, pendapatan rendah, dan jarang berbelanja.
Cluster 2: Usia lebih muda, pendapatan rendah, dan cukup aktif berbelanja.
Cluster 3: Usia muda, pendapatan tinggi, dan sangat aktif berbelanja.
Cluster 4: Usia lebih tua, pendapatan tinggi, dan jarang berbelanja.
# Ubah ke format long biar bisa dibuat grafik
cluster_summary_long <- cluster_summary %>%
pivot_longer(-cluster, names_to = "Variabel", values_to = "Rata_rata")
cluster_summary_longggplot(cluster_summary_long, aes(x = Variabel, y = cluster, fill = Rata_rata)) +
geom_tile(color = "white") +
scale_fill_gradientn(colors = c("#56B1F7", "white", "#FF6B6B")) +
labs(title = "Heatmap Karakteristik Cluster Mall Customers",
x = "Variabel", y = "Cluster") +
theme_minimal(base_size = 12) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))ggplot(cluster_summary_long, aes(x = Variabel, y = Rata_rata, fill = cluster)) +
geom_bar(stat = "identity", position = "dodge") +
scale_fill_brewer(palette = "Set2") +
labs(title = "Bar Plot Karakteristik Cluster Mall Customers",
x = "Variabel", y = "Rata-rata Nilai") +
theme_minimal(base_size = 12) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Berdasarkan metode Elbow, penurunan mulai melambat di k = 4.
Jumlah cluster optimal menurut Elbow Method adalah k = 4. Artinya,
memilih 4 cluster sudah cukup untuk menjelaskan variasi data tanpa
menambah cluster berlebihan.
fviz_nbclust(mall_scaled, kmeans, method = "silhouette") +
labs(title = "Silhouette Method - Mall Customers")
Berdasarkan metode silhouette, jumlah cluster optimal di
k=8.
## K-means clustering with 4 clusters of sizes 38, 65, 57, 40
##
## Cluster means:
## Age Annual.Income..k.. Spending.Score..1.100.
## 1 0.03711223 0.9876366 -1.1857814
## 2 1.08344244 -0.4893373 -0.3961802
## 3 -0.96008279 -0.7827991 0.3910484
## 4 -0.42773261 0.9724070 1.2130414
##
## Clustering vector:
## [1] 3 3 3 3 3 3 2 3 2 3 2 3 2 3 2 3 3 3 2 3 3 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2
## [38] 3 2 3 2 3 2 3 2 3 2 3 3 3 2 3 3 2 2 2 2 2 3 2 2 3 2 2 2 3 2 2 3 3 2 2 2 2
## [75] 2 3 2 2 3 2 2 3 2 2 3 2 2 3 3 2 2 3 2 2 3 3 2 3 2 3 3 2 2 3 2 3 2 2 2 2 2
## [112] 3 1 3 3 3 2 2 2 2 3 1 4 4 1 4 1 4 2 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4
## [149] 1 4 1 4 1 4 1 4 1 4 1 4 2 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4 1
## [186] 4 1 4 1 4 1 4 1 4 1 4 1 4 1 4
##
## Within cluster sum of squares by cluster:
## [1] 44.01863 74.83280 61.43215 23.91544
## (between_SS / total_SS = 65.8 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
##
## 1 2 3 4
## 38 65 57 40
cluster_km <- kmeans_result$cluster
dist_matrix <- dist(mall_scaled, method = "euclidean")
sil <- mean(silhouette(cluster_km, dist_matrix)[, 3])
ch <- intCriteria(traj = as.matrix(mall_scaled),
part = cluster_km,
crit = "Calinski_Harabasz")$calinski_harabasz
dunn <- intCriteria(traj = as.matrix(mall_scaled),
part = cluster_km,
crit = "Dunn")$dunn
eval_cluster_kmeans <- data.frame(Silhouette = sil,
Calinski_Harabasz = ch,
Dunn = dunn)
eval_cluster_kmeansdata_kmeans <- data.frame(mall_data, cluster = factor(cluster_km))
kmeans_summary <- data_kmeans %>%
group_by(cluster) %>%
summarise(across(where(is.numeric), mean, .names = "{.col}"))
kmeans_summaryfviz_cluster(kmeans_result, data = mall_scaled,
palette = "jco", geom = "point",
main = "Visualisasi Hasil K-Means Clustering (4 Cluster)")heat_data <- kmeans_summary %>%
pivot_longer(-cluster, names_to = "Variabel", values_to = "Rata_Rata")
heat_dataggplot(heat_data, aes(x = Variabel, y = cluster, fill = Rata_Rata)) +
geom_tile(color = "white") +
scale_fill_gradientn(colors = c("#56B1F7", "white", "#FF6B6B")) +
labs(title = "Heatmap Karakteristik Cluster",
x = "Variabel", y = "Cluster") +
theme_minimal(base_size = 12) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))ggplot(heat_data, aes(x = Variabel, y = Rata_Rata, fill = cluster)) +
geom_bar(stat = "identity", position = "dodge") +
scale_fill_brewer(palette = "Set2") +
labs(title = "Barplot Karakteristik Cluster",
x = "Variabel", y = "Rata-rata Nilai") +
theme_minimal(base_size = 12) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))Cluster 1 → Age sedang-tua, Income paling tinggi, Spending paling rendah.
Cluster 2 → Age paling tua, Income sedang, Spending rendah-sedang
Cluster 3 → Age paling muda, Income paling rendah, Spending sedang-tinggi.
Cluster 4 → Age muda-sedang, Income tinggi, Spending paling tinggi