library(cluster)
## Warning: package 'cluster' was built under R version 4.5.3
library(mclust)
## Warning: package 'mclust' was built under R version 4.5.3
## Package 'mclust' version 6.1.2
## Type 'citation("mclust")' for citing this R package in publications.
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.1
# Raw input: 10 observations measured on four numeric variables (X1-X4).
data <- data.frame(
  X1 = c(11, 6, 9, 3, 4, 15, 9, 13, 4, 13),
  X2 = c(5, 3, 7, 7, 12, 8, 12, 9, 11, 7),
  X3 = c(7, 11, 11, 7, 7, 4, 12, 11, 14, 3),
  X4 = c(6, 5, 8, 12, 12, 8, 6, 14, 6, 12)
)
# Echo the table so it appears in the rendered output.
data
## X1 X2 X3 X4
## 1 11 5 7 6
## 2 6 3 11 5
## 3 9 7 11 8
## 4 3 7 7 12
## 5 4 12 7 12
## 6 15 8 4 8
## 7 9 12 12 6
## 8 13 9 11 14
## 9 4 11 14 6
## 10 13 7 3 12
# Standardise each column (subtract its mean, divide by its standard
# deviation) and convert the resulting matrix back to a data frame.
data_sc <- as.data.frame(scale(data, center = TRUE, scale = TRUE))
# Echo the standardised table.
data_sc
## X1 X2 X3 X4
## 1 0.53538263 -1.04699461 -0.4692942 -0.8838069
## 2 -0.62849265 -1.72247501 0.6349275 -1.1885678
## 3 0.06983252 -0.37151422 0.6349275 -0.2742849
## 4 -1.32681782 -0.37151422 -0.4692942 0.9447590
## 5 -1.09404276 1.31718677 -0.4692942 0.9447590
## 6 1.46648285 -0.03377402 -1.2974606 -0.2742849
## 7 0.06983252 1.31718677 0.9109829 -0.8838069
## 8 1.00093274 0.30396618 0.6349275 1.5542810
## 9 -1.09404276 0.97944657 1.4630938 -0.8838069
## 10 1.00093274 -0.37151422 -1.5735160 0.9447590
# DBSCAN
# Non-Scaling
library(dbscan)
## Warning: package 'dbscan' was built under R version 4.5.3
##
## Attaching package: 'dbscan'
## The following object is masked from 'package:stats':
##
## as.dendrogram
# Diagnostic k-NN distance plot used to choose eps. dbscan's minPts counts the
# point itself, so the plot that matches minPts = 2 uses k = minPts - 1 = 1
# (the previous k = 3 corresponded to minPts = 4, not the value used below).
kNNdistplot(data, k = 1)
# Density-based clustering on the raw (unscaled) data. Printing the fit shows
# the size of each cluster; label 0 is reserved for noise points.
(dbclust <- dbscan(data, eps = 5, minPts = 2))
## DBSCAN clustering for 10 objects.
## Parameters: eps = 5, minPts = 2
## Using euclidean distances and borderpoints = TRUE
## The clustering contains 1 cluster(s) and 8 noise points.
##
## 0 1
## 8 2
##
## Available fields: cluster, eps, minPts, metric, borderPoints
# Cluster label assigned to each observation (0 = noise).
dbclust$cluster
## [1] 0 0 0 0 0 1 0 0 0 1
# Copy of the raw data with the DBSCAN label attached as a factor, so that
# plots treat the label as a discrete category.
result_db <- data.frame(data, cluster = as.factor(dbclust$cluster))
result_db
## X1 X2 X3 X4 cluster
## 1 11 5 7 6 0
## 2 6 3 11 5 0
## 3 9 7 11 8 0
## 4 3 7 7 12 0
## 5 4 12 7 12 0
## 6 15 8 4 8 1
## 7 9 12 12 6 0
## 8 13 9 11 14 0
## 9 4 11 14 6 0
## 10 13 7 3 12 1
# Visualisation: X1 vs X2 scatter, points coloured by DBSCAN label.
ggplot(data = result_db, mapping = aes(x = X1, y = X2, colour = cluster)) +
  geom_point(size = 4) +
  labs(title = "DBSCAN Clustering") +
  theme_minimal()
Terbentuk 1 cluster (label 1).
Terdapat 8 observasi yang dianggap noise/outlier (label 0).
Data belum di-scaling sehingga jarak antar titik masih dipengaruhi oleh skala asli tiap variabel. Variabel dengan sebaran nilai lebih besar (seperti X1) lebih dominan dalam perhitungan jarak, sehingga banyak observasi dianggap terlalu jauh untuk dikelompokkan.
Hanya observasi 6 dan 10 yang cukup dekat (jarak Euclidean ≈ 4,69, lebih kecil dari eps = 5) untuk membentuk cluster; 8 observasi lainnya menjadi noise.
# Scaling
# Diagnostic k-NN distance plot used to choose eps on the standardised data.
# dbscan's minPts counts the point itself, so the plot that matches
# minPts = 2 uses k = minPts - 1 = 1 (the previous k = 3 corresponded to
# minPts = 4, not the value used below).
kNNdistplot(data_sc, k = 1)
# Density-based clustering on the standardised data. Printing the fit shows
# the size of each cluster; label 0 is reserved for noise points.
(db_clustsc <- dbscan(data_sc, eps = 1.5, minPts = 2))
## DBSCAN clustering for 10 objects.
## Parameters: eps = 1.5, minPts = 2
## Using euclidean distances and borderpoints = TRUE
## The clustering contains 2 cluster(s) and 6 noise points.
##
## 0 1 2
## 6 2 2
##
## Available fields: cluster, eps, minPts, metric, borderPoints
# Cluster label assigned to each observation (0 = noise).
db_clustsc$cluster
## [1] 0 0 0 0 0 1 2 0 2 1
# Copy of the standardised data with the DBSCAN label attached as a factor,
# so that plots treat the label as a discrete category.
result_dbsc <- data.frame(data_sc, cluster = as.factor(db_clustsc$cluster))
result_dbsc
## X1 X2 X3 X4 cluster
## 1 0.53538263 -1.04699461 -0.4692942 -0.8838069 0
## 2 -0.62849265 -1.72247501 0.6349275 -1.1885678 0
## 3 0.06983252 -0.37151422 0.6349275 -0.2742849 0
## 4 -1.32681782 -0.37151422 -0.4692942 0.9447590 0
## 5 -1.09404276 1.31718677 -0.4692942 0.9447590 0
## 6 1.46648285 -0.03377402 -1.2974606 -0.2742849 1
## 7 0.06983252 1.31718677 0.9109829 -0.8838069 2
## 8 1.00093274 0.30396618 0.6349275 1.5542810 0
## 9 -1.09404276 0.97944657 1.4630938 -0.8838069 2
## 10 1.00093274 -0.37151422 -1.5735160 0.9447590 1
# Visualisation: standardised X1 vs X2 scatter, coloured by DBSCAN label.
ggplot(data = result_dbsc, mapping = aes(x = X1, y = X2, colour = cluster)) +
  geom_point(size = 4) +
  labs(title = "DBSCAN Clustering (Scaling)") +
  theme_minimal()
Terbentuk 2 cluster (label 1 dan 2).
Terdapat 6 observasi noise (label 0).
Setelah di-scaling, semua variabel mempunyai skala yang sama sehingga kontribusinya terhadap jarak antar titik lebih seimbang. Akibatnya, DBSCAN dapat menemukan cluster baru yang sebelumnya tidak terdeteksi.
Cluster pertama: observasi 6 dan 10.
Cluster kedua: observasi 7 dan 9.
Enam observasi lainnya masih menjadi noise karena belum memenuhi syarat kepadatan minimum.
# Model-based clustering (Gaussian mixture) on the raw, unscaled data with
# the number of components fixed at 3.
gmm_model <- Mclust(data, G = 3)
# Echo the fitted object (prints the selected model name and its components).
gmm_model
## 'Mclust' model object: (VEI,3)
##
## Available components:
## [1] "call" "data" "modelName" "n"
## [5] "d" "G" "BIC" "loglik"
## [9] "df" "bic" "icl" "hypvol"
## [13] "parameters" "z" "classification" "uncertainty"
# Print the fit summary for the unscaled model: selected covariance model,
# log-likelihood, n, df, BIC, ICL and the cluster-size table.
summary(gmm_model)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VEI (diagonal, equal shape) model with 3 components:
##
## log-likelihood n df BIC ICL
## -92.53632 10 20 -231.1243 -231.3989
##
## Clustering table:
## 1 2 3
## 3 5 2
# Hard component assignment of each observation from the fitted mixture.
gmm_model[["classification"]]
## [1] 1 1 1 2 2 2 3 2 3 2
# Visualisation: X1 vs X2 scatter, coloured by the predicted component.
gmm_viz <- predict(gmm_model)[["classification"]]
ggplot(data, aes(x = X1, y = X2, colour = as.factor(gmm_viz))) +
  geom_point() +
  labs(colour = "cluster") +
  theme_bw()
Model terbaik yang dipilih Mclust tanpa scaling adalah VEI, yaitu matriks kovarians diagonal dengan volume yang berbeda antar cluster tetapi bentuk (shape) yang sama.
Data terbagi menjadi 3 kelompok (berukuran 3, 5, dan 2 observasi) berdasarkan kemiripan antar observasi. Karena data belum di-scaling, variabel yang skalanya lebih besar lebih dominan dalam pembentukan cluster.
# Model-based clustering (Gaussian mixture) on the standardised data with
# the number of components fixed at 3.
gmm_modelsc <- Mclust(data_sc, G = 3)
# Echo the fitted object (prints the selected model name and its components).
gmm_modelsc
## 'Mclust' model object: (VII,3)
##
## Available components:
## [1] "call" "data" "modelName" "n"
## [5] "d" "G" "BIC" "loglik"
## [9] "df" "bic" "icl" "hypvol"
## [13] "parameters" "z" "classification" "uncertainty"
# Print the fit summary for the scaled model: selected covariance model,
# log-likelihood, n, df, BIC, ICL and the cluster-size table.
summary(gmm_modelsc)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VII (spherical, varying volume) model with 3 components:
##
## log-likelihood n df BIC ICL
## -45.5799 10 17 -130.3037 -130.8176
##
## Clustering table:
## 1 2 3
## 3 5 2
# Hard component assignment of each observation from the scaled-data fit.
gmm_modelsc[["classification"]]
## [1] 1 1 1 2 2 2 3 2 3 2
# Visualisation: standardised X1 vs X2, coloured by the predicted component.
gmm_vizsc <- predict(gmm_modelsc)[["classification"]]
ggplot(data_sc, aes(x = X1, y = X2, colour = as.factor(gmm_vizsc))) +
  geom_point() +
  labs(colour = "cluster") +
  theme_bw()
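Model terbaik berubah menjadi VII (kovarians spherical dengan volume yang berbeda antar cluster).
Nilai BIC pada data scaling (-130,30) lebih tinggi dibanding non-scaling (-231,12); namun, karena log-likelihood bergantung pada skala data, kedua nilai ini tidak dapat dibandingkan secara langsung sebagai ukuran kualitas model.
Hasil non-scaling dan scaling menunjukkan komposisi anggota cluster yang sama, yang berarti scaling tidak mengubah struktur pengelompokan data pada metode ini.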
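Metode spectral clustering dilakukan melalui tahapan: (1) membentuk similarity matrix dengan kernel Gaussian exp(-d²) dari jarak Euclidean antar observasi, (2) melakukan dekomposisi spektral (eigen) pada matriks tersebut, dan (3) menjalankan k-means pada k = 3 eigenvector pertama.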
# Spectral clustering on the raw (unscaled) data.
# Fix the RNG state so the random k-means initialisation is repeatable.
set.seed(123)
# Number of clusters, also the number of leading eigenvectors retained.
k <- 3
# Gaussian-kernel similarities exp(-d^2) from pairwise Euclidean distances
# (kept as a "dist" object, i.e. lower triangle only).
similarity_matrix <- exp(-(dist(data)^2))
# Eigen-decomposition of the full symmetric matrix form.
# NOTE(review): a dist object stores no diagonal, so its matrix form carries
# 0 (not exp(0) = 1) for self-similarity - confirm this is intended.
eigen_result <- eigen(as.matrix(similarity_matrix))
# Embed each observation using the first k eigenvectors, then cluster the
# embedded points with k-means.
k_eigenvectors <- eigen_result[["vectors"]][, seq_len(k)]
km_keigenvec <- kmeans(k_eigenvectors, centers = k)
km_keigenvec
## K-means clustering with 3 clusters of sizes 2, 2, 6
##
## Cluster means:
## [,1] [,2] [,3]
## 1 -3.988606e-08 -2.520552e-21 -7.037361e-01
## 2 -7.071068e-01 1.355843e-16 3.951340e-08
## 3 -1.118308e-14 -2.357023e-01 -1.853613e-02
##
## Clustering vector:
## [1] 1 3 1 3 3 2 3 3 3 2
##
## Within cluster sum of squares by cluster:
## [1] 2.115304e-05 3.091525e-15 6.740949e-01
## (between_SS / total_SS = 71.5 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Visualisation: X1 vs X2 scatter, coloured by spectral-clustering label.
ggplot(data, aes(x = X1, y = X2, colour = as.factor(km_keigenvec$cluster))) +
  geom_point() +
  labs(colour = "Cluster") +
  theme_bw()
Nilai between_SS / total_SS sebesar 71,5% menunjukkan bahwa pemisahan cluster sudah cukup baik.
Pada data non-scaling, terbentuk satu cluster besar (6 observasi) dan dua cluster kecil (masing-masing 2 observasi), yang berarti sebagian besar observasi mempunyai kemiripan yang tinggi sehingga tergabung dalam satu kelompok. Karena data belum di-scaling, similarity matrix masih dipengaruhi oleh perbedaan skala antar variabel.
# Spectral clustering on the standardised data.
# NOTE(review): no set.seed() in this section, so the k-means start depends
# on the RNG state left by whatever ran before it.
# Number of clusters, also the number of leading eigenvectors retained.
k <- 3
# Gaussian-kernel similarities exp(-d^2) from pairwise Euclidean distances
# (kept as a "dist" object, i.e. lower triangle only).
similarity_matrixsc <- exp(-(dist(data_sc)^2))
# Eigen-decomposition of the full symmetric matrix form.
# NOTE(review): a dist object stores no diagonal, so its matrix form carries
# 0 (not exp(0) = 1) for self-similarity - confirm this is intended.
eigen_resultsc <- eigen(as.matrix(similarity_matrixsc))
# Embed each observation using the first k eigenvectors, then cluster the
# embedded points with k-means.
k_eigenvectorsc <- eigen_resultsc[["vectors"]][, seq_len(k)]
km_keigenvecsc <- kmeans(k_eigenvectorsc, centers = k)
km_keigenvecsc
## K-means clustering with 3 clusters of sizes 5, 2, 3
##
## Cluster means:
## [,1] [,2] [,3]
## 1 -0.2239956 0.1606721 0.06362491
## 2 -0.2805540 -0.5363057 0.35444753
## 3 -0.2958270 -0.1408755 -0.46054236
##
## Clustering vector:
## [1] 3 3 3 1 1 2 1 1 1 2
##
## Within cluster sum of squares by cluster:
## [1] 0.596682971 0.006082274 0.054735006
## (between_SS / total_SS = 71.2 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Visualisation: standardised X1 vs X2, coloured by spectral-clustering label.
ggplot(data_sc, aes(x = X1, y = X2, colour = as.factor(km_keigenvecsc$cluster))) +
  geom_point() +
  labs(colour = "Cluster") +
  theme_bw()
Nilai between_SS / total_SS sebesar 71,2% hampir sama dengan hasil non-scaling (71,5%), sehingga kualitas pemisahannya relatif setara.
Scaling menghasilkan distribusi ukuran cluster yang lebih seimbang (5, 2, dan 3 observasi) dibanding non-scaling, karena semua variabel mempunyai kontribusi yang sama dalam perhitungan jarak dan similarity matrix.
Metode DBSCAN menghasilkan banyak noise sehingga kurang optimal untuk dataset ini. Setelah di-scaling, DBSCAN dapat membentuk dua cluster kecil, tetapi sebagian besar data masih menjadi noise.
Metode Model-Based Clustering menghasilkan tiga cluster yang stabil, baik pada data non-scaling maupun scaling, yang berarti metode ini cukup tahan terhadap perubahan skala data.
Hasil Spectral Clustering lebih baik setelah scaling, dan strukturnya menjadi mirip dengan hasil Model-Based Clustering. Dengan demikian, Model-Based Clustering dan Spectral Clustering lebih cocok digunakan untuk dataset ini dibandingkan DBSCAN.
X1 dan X2 dipakai untuk visualisasi karena scatterplot hanya dapat menampilkan dua variabel sekaligus, tetapi proses clustering tetap menggunakan semua variabel (X1–X4).