library(cluster)
library(kmed)  # not actually used below; pam() comes from the cluster package
# Input data
data <- data.frame(
X1 = c(11,6,9,3,4,15,9,13,4,13),
X2 = c(5,3,7,7,12,8,12,9,11,7),
X3 = c(7,11,11,7,7,4,12,11,14,3),
X4 = c(6,5,8,12,12,8,6,14,6,12)
)
# WITHOUT SCALING
# Compute the Manhattan distance matrix
(dist_matrix <- dist(data, method = "manhattan"))
## 1 2 3 4 5 6 7 8 9
## 2 12
## 3 10 10
## 4 16 18 14
## 5 20 22 18 6
## 6 12 24 14 20 22
## 7 14 14 8 22 16 20
## 8 18 22 12 18 18 16 16
## 9 20 14 14 18 14 26 8 22
## 10 14 26 16 14 18 8 24 12 30
# Hierarchical clustering (complete linkage)
(hc_complete <- hclust(dist_matrix, method = "complete"))
##
## Call:
## hclust(d = dist_matrix, method = "complete")
##
## Cluster method : complete
## Distance : manhattan
## Number of objects: 10
# Plot dendrogram
plot(hc_complete,
     main = "Dendrogram (Complete Linkage - Manhattan Distance)",
     xlab = "Observation",
     ylab = "Distance")
rect.hclust(hc_complete, k = 3, border = "red")
(cluster_hc <- cutree(hc_complete, k = 3))
## [1] 1 1 1 2 2 3 1 3 2 3
# WITH SCALING
datasc <- scale(data, center = TRUE, scale = TRUE)
# Compute the Manhattan distance matrix on the standardized data
(dist_matrixsc <- dist(datasc, method = "manhattan"))
##           1        2        3        4        5        6        7        8        9
## 2  3.248338
## 3  2.854774 2.963569
## 4  4.366247 5.286835 3.719916
## 5  5.822173 6.742761 5.175842 1.921476
## 6  3.382009 6.630347 3.666779 5.178251 5.958697
## 7  4.210009 4.318803 2.574278 6.294194 4.372718 5.565577
## 8  5.358821 6.398715 3.435147 4.716975 4.821940 4.564244 4.658464
## 9  5.588255 4.300399 3.952524 5.344690 4.098694 6.943823 2.053726 6.036710
## 10 4.073818 7.322157 4.358588 3.431972 4.887898 2.298390 6.932866 3.493446 8.311112
# Hierarchical clustering (complete linkage)
(hc_completesc <- hclust(dist_matrixsc, method = "complete"))
##
## Call:
## hclust(d = dist_matrixsc, method = "complete")
##
## Cluster method : complete
## Distance : manhattan
## Number of objects: 10
# Plot dendrogram
plot(hc_completesc,
     main = "Dendrogram (Complete Linkage - Manhattan Distance)",
     sub = "Scaled data",
     xlab = "Observation",
     ylab = "Distance")
rect.hclust(hc_completesc, k = 3, border = "blue")
(cluster_hcsc <- cutree(hc_completesc, k = 3))
## [1] 1 1 1 2 2 3 1 3 1 3
table(cluster_hc, cluster_hcsc)
## cluster_hcsc
## cluster_hc 1 2 3
## 1 4 0 0
## 2 1 2 0
## 3 0 0 3
Hierarchical clustering produces three clusters on both the unscaled and the scaled data. Most cluster memberships stay the same, but scaling moves one observation: observation 9 leaves the cluster containing observations 4 and 5 and joins the cluster containing observations 1, 2, 3, and 7, because the distance calculation becomes more balanced.
Scaling helps produce more representative clusters because it keeps any single variable from dominating the distance computations; a quick way to inspect that dominance is shown below.
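As a rough check (a minimal base-R sketch; it does not change the clustering itself), the per-variable spread shows how unevenly each variable would contribute to the unscaled Manhattan distances:
# Spread of each variable before scaling; a larger spread means a larger
# contribution to the unscaled distances
apply(data, 2, sd)
apply(data, 2, function(x) diff(range(x)))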
# k = 3
set.seed(123)
# non-scaling
(km <- kmeans(data, centers = 3, nstart = 25))
## K-means clustering with 3 clusters of sizes 2, 4, 4
##
## Cluster means:
## X1 X2 X3 X4
## 1 3.5 9.50 7.00 12.00
## 2 13.0 7.25 6.25 10.00
## 3 7.0 8.25 12.00 6.25
##
## Clustering vector:
## [1] 2 3 3 1 1 2 3 2 3 2
##
## Within cluster sum of squares by cluster:
## [1] 13.0 95.5 79.5
## (between_SS / total_SS = 59.1 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
kmclust <- km$cluster
# with scaling
km.sc <- kmeans(datasc, centers = 3, nstart = 25)
(kmclust.sc <- km.sc$cluster)
## [1] 3 1 1 2 2 3 1 2 1 3
table(kmclust, kmclust.sc)
## kmclust.sc
## kmclust 1 2 3
## 1 0 2 0
## 2 0 1 3
## 3 4 0 0
The cluster assignments change because scaling changes the distances between observations.
On the unscaled data, variables with a wider range have a disproportionately large influence on how the clusters form.
After scaling, every variable contributes equally, so the relative closeness of observations changes and some observations move to a different cluster.
For example, observation 8 is grouped with observations 1, 6, and 10 before scaling, but after scaling it is grouped with observations 4 and 5, a cluster with different characteristics.
K-means produces three clusters on both the unscaled and the scaled data, but scaling affects the result because k-means is driven entirely by distances.
With scaling, the clustering becomes more balanced: every variable contributes equally to the centroids and to the distances between observations. The two solutions can be compared observation by observation, as in the sketch below.
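A small comparison table (a sketch using only objects already created above) makes the reassignments explicit; cluster labels are arbitrary, so compare group compositions rather than the label numbers themselves.
# Per-observation comparison of the unscaled and scaled k-means solutions
data.frame(obs = seq_len(nrow(data)), unscaled = kmclust, scaled = kmclust.sc)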
set.seed(123)
# non-scaling
(kmed <- pam(data, k = 3))
## Medoids:
## ID X1 X2 X3 X4
## [1,] 3 9 7 11 8
## [2,] 5 4 12 7 12
## [3,] 10 13 7 3 12
## Clustering vector:
## [1] 1 1 1 2 2 3 1 1 1 3
## Objective function:
## build swap
## 4.12209 4.12209
##
## Available components:
## [1] "medoids" "id.med" "clustering" "objective" "isolation"
## [6] "clusinfo" "silinfo" "diss" "call" "data"
(kmed_clust <- kmed$clustering)
## [1] 1 1 1 2 2 3 1 1 1 3
kmed$medoids
## X1 X2 X3 X4
## [1,] 9 7 11 8
## [2,] 4 12 7 12
## [3,] 13 7 3 12
# scaling
(kmed_sc <- pam(datasc, k = 3))
## Medoids:
## ID X1 X2 X3 X4
## [1,] 3 0.06983252 -0.3715142 0.6349275 -0.2742849
## [2,] 5 -1.09404276 1.3171868 -0.4692942 0.9447590
## [3,] 10 1.00093274 -0.3715142 -1.5735160 0.9447590
## Clustering vector:
## [1] 1 1 1 2 2 3 1 1 1 3
## Objective function:
## build swap
## 1.239476 1.239476
##
## Available components:
## [1] "medoids" "id.med" "clustering" "objective" "isolation"
## [6] "clusinfo" "silinfo" "diss" "call" "data"
(kmed_clust_sc <- kmed_sc$clustering)
## [1] 1 1 1 2 2 3 1 1 1 3
kmed_sc$medoids
## X1 X2 X3 X4
## [1,] 0.06983252 -0.3715142 0.6349275 -0.2742849
## [2,] -1.09404276 1.3171868 -0.4692942 0.9447590
## [3,] 1.00093274 -0.3715142 -1.5735160 0.9447590
table(kmed_clust, kmed_clust_sc)
## kmed_clust_sc
## kmed_clust 1 2 3
## 1 6 0 0
## 2 0 2 0
## 3 0 0 2
K-medoids produces three stable clusters on both the unscaled and the scaled data. Every observation keeps the same cluster membership after scaling, which indicates that the cluster structure is fairly robust to the scale of the variables.
The objective function is smaller on the scaled data (about 1.24 versus 4.12), but the two values are not directly comparable: standardized variables have a smaller spread, so the average distance from each observation to its medoid shrinks as well. The objective itself is just that average distance, as the sketch below illustrates.
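A minimal sketch (assuming pam()'s default Euclidean dissimilarity) that recomputes the objective by hand from objects created above:
# Average distance of each observation to the medoid of its own cluster;
# this should be close to the reported swap objective for the unscaled fit (~4.12)
X <- as.matrix(data)
d_to_medoid <- sapply(seq_len(nrow(X)), function(i) {
  sqrt(sum((X[i, ] - kmed$medoids[kmed$clustering[i], ])^2))
})
mean(d_to_medoid)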
# Non-Scaling
pam_non <- pam(data, k = 3)
pam_non$clustering
## [1] 1 1 1 2 2 3 1 1 1 3
pam_non$medoids # Medoid
## X1 X2 X3 X4
## [1,] 9 7 11 8
## [2,] 4 12 7 12
## [3,] 13 7 3 12
result_pam_non <- data
result_pam_non$cluster <- as.factor(pam_non$clustering)
result_pam_non
## X1 X2 X3 X4 cluster
## 1 11 5 7 6 1
## 2 6 3 11 5 1
## 3 9 7 11 8 1
## 4 3 7 7 12 2
## 5 4 12 7 12 2
## 6 15 8 4 8 3
## 7 9 12 12 6 1
## 8 13 9 11 14 1
## 9 4 11 14 6 1
## 10 13 7 3 12 3
# Scaling
pam_sc <- pam(datasc, k = 3)
pam_sc$clustering
## [1] 1 1 1 2 2 3 1 1 1 3
pam_sc$medoids # Medoid
## X1 X2 X3 X4
## [1,] 0.06983252 -0.3715142 0.6349275 -0.2742849
## [2,] -1.09404276 1.3171868 -0.4692942 0.9447590
## [3,] 1.00093274 -0.3715142 -1.5735160 0.9447590
result_pam_sc <- as.data.frame(datasc)  # scale() returns a matrix, so convert to a data frame first
result_pam_sc$cluster <- as.factor(pam_sc$clustering)
result_pam_sc
##            X1          X2         X3         X4 cluster
## 1   0.5353826   -1.046995 -0.4692942 -0.8838069       1
## 2  -0.6284926   -1.722475  0.6349275  -1.188568       1
## 3  0.06983252  -0.3715142  0.6349275 -0.2742849       1
## 4   -1.326818  -0.3715142 -0.4692942   0.944759       2
## 5   -1.094043    1.317187 -0.4692942   0.944759       2
## 6    1.466483 -0.03377402  -1.297461 -0.2742849       3
## 7  0.06983252    1.317187  0.9109829 -0.8838069       1
## 8    1.000933   0.3039662  0.6349275   1.554281       1
## 9   -1.094043   0.9794466   1.463094 -0.8838069       1
## 10   1.000933  -0.3715142  -1.573516   0.944759       3
PAM produces three clusters with identical memberships on the unscaled and the scaled data, which shows that for this dataset the cluster structure is consistent and not sensitive to the scale of the variables.
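As a final check (a sketch; silinfo is a component that pam() already returns, as listed in the output above), the average silhouette width summarizes how well separated the clusters are, with values closer to 1 indicating a clearer structure:
# Average silhouette width of the unscaled and scaled PAM solutions
pam_non$silinfo$avg.width
pam_sc$silinfo$avg.width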