Load Library

library(cluster)
library(kmed)

Input Data

# Example data set: 10 observations measured on four numeric variables.
data <- as.data.frame(
  list(
    X1 = c(11, 6, 9, 3, 4, 15, 9, 13, 4, 13),
    X2 = c(5, 3, 7, 7, 12, 8, 12, 9, 11, 7),
    X3 = c(7, 11, 11, 7, 7, 4, 12, 11, 14, 3),
    X4 = c(6, 5, 8, 12, 12, 8, 6, 14, 6, 12)
  )
)

1. HCLUST

Non-Scaling

# Without scaling
# Pairwise Manhattan distances between the rows of the raw data.
dist_matrix <- dist(data, method = "manhattan")
dist_matrix
##     1  2  3  4  5  6  7  8  9
## 2  12                        
## 3  10 10                     
## 4  16 18 14                  
## 5  20 22 18  6               
## 6  12 24 14 20 22            
## 7  14 14  8 22 16 20         
## 8  18 22 12 18 18 16 16      
## 9  20 14 14 18 14 26  8 22   
## 10 14 26 16 14 18  8 24 12 30
# Hierarchical clustering with complete linkage on the Manhattan distances.
hc_complete <- hclust(dist_matrix, method = "complete")
hc_complete
## 
## Call:
## hclust(d = dist_matrix, method = "complete")
## 
## Cluster method   : complete 
## Distance         : manhattan 
## Number of objects: 10
# Draw the dendrogram, then outline the 3-cluster cut in red.
plot(
  hc_complete,
  main = "Dendrogram (Complete Linkage - Manhattan Distance)",
  xlab = "Observasi",
  ylab = "Jarak"
)
rect.hclust(hc_complete, k = 3, border = "red")

# Cut the tree into 3 groups and show the membership of each observation.
cluster_hc <- cutree(hc_complete, k = 3)
cluster_hc
##  [1] 1 1 1 2 2 3 1 3 2 3

Scaling

# With scaling
# Standardize every column; scale() centers and scales by default.
datasc <- scale(data)

# Pairwise Manhattan distances between the standardized rows.
dist_matrixsc <- dist(datasc, method = "manhattan")
dist_matrixsc
##           1        2        3        4        5        6        7        8
## 2  3.248338                                                               
## 3  2.854774 2.963569                                                      
## 4  4.366247 5.286835 3.719916                                             
## 5  5.822173 6.742761 5.175842 1.921476                                    
## 6  3.382009 6.630347 3.666779 5.178251 5.958697                           
## 7  4.210009 4.318803 2.574278 6.294194 4.372718 5.565577                  
## 8  5.358821 6.398715 3.435147 4.716975 4.821940 4.564244 4.658464         
## 9  5.588255 4.300399 3.952524 5.344690 4.098694 6.943823 2.053726 6.036710
## 10 4.073818 7.322157 4.358588 3.431972 4.887898 2.298390 6.932866 3.493446
##           9
## 2          
## 3          
## 4          
## 5          
## 6          
## 7          
## 8          
## 9          
## 10 8.311112
# Hierarchical clustering with complete linkage on the scaled distances.
hc_completesc <- hclust(dist_matrixsc, method = "complete")
hc_completesc
## 
## Call:
## hclust(d = dist_matrixsc, method = "complete")
## 
## Cluster method   : complete 
## Distance         : manhattan 
## Number of objects: 10
# Draw the dendrogram for the scaled data, then outline the 3-cluster cut
# in blue.
plot(
  hc_completesc,
  main = "Dendrogram (Complete Linkage - Manhattan Distance)",
  sub = "Scaling",
  xlab = "Observasi",
  ylab = "Jarak"
)
rect.hclust(hc_completesc, k = 3, border = "blue")

# Cut the tree into 3 groups and show the membership of each observation.
cluster_hcsc <- cutree(hc_completesc, k = 3)
cluster_hcsc
##  [1] 1 1 1 2 2 3 1 3 1 3

Perbandingan

# Cross-tabulate unscaled (rows) against scaled (columns) hclust labels.
table(cluster_hc, cluster_hcsc, dnn = c("cluster_hc", "cluster_hcsc"))
##           cluster_hcsc
## cluster_hc 1 2 3
##          1 4 0 0
##          2 1 2 0
##          3 0 0 3

Hierarchical clustering berhasil membentuk tiga cluster baik pada data non-scaling maupun scaling. Sebagian besar anggota cluster tetap konsisten; hanya observasi 9 yang berpindah (dari cluster 2 ke cluster 1) setelah scaling, karena setiap variabel kini berkontribusi secara seimbang dalam perhitungan jarak.

Penggunaan scaling membantu menghasilkan cluster yang lebih representatif karena mengurangi dominasi variabel tertentu dalam proses pengelompokan data.

2. K-MEANS

Non-Scaling

# K-means with k = 3 on the raw (unscaled) data.
# The seed is fixed before the call so the 25 random starts are repeatable.
set.seed(123)
km <- kmeans(data, centers = 3, nstart = 25)
km
## K-means clustering with 3 clusters of sizes 2, 4, 4
## 
## Cluster means:
##     X1   X2    X3    X4
## 1  3.5 9.50  7.00 12.00
## 2 13.0 7.25  6.25 10.00
## 3  7.0 8.25 12.00  6.25
## 
## Clustering vector:
##  [1] 2 3 3 1 1 2 3 2 3 2
## 
## Within cluster sum of squares by cluster:
## [1] 13.0 95.5 79.5
##  (between_SS / total_SS =  59.1 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# Keep the cluster label of every observation from the unscaled fit.
kmclust <- km[["cluster"]]

Scaling

# K-means with k = 3 on the standardized data.
# No seed is set here: the random starts use whatever RNG state is current
# (in document order, the state left after set.seed(123) and the earlier
# kmeans() call).
km.sc <- kmeans(datasc, centers = 3, nstart = 25)
kmclust.sc <- km.sc[["cluster"]]
kmclust.sc
##  [1] 3 1 1 2 2 3 1 2 1 3

Perbandingan

# Cross-tabulate unscaled (rows) against scaled (columns) k-means labels.
table(kmclust, kmclust.sc, dnn = c("kmclust", "kmclust.sc"))
##        kmclust.sc
## kmclust 1 2 3
##       1 0 2 0
##       2 0 1 3
##       3 4 0 0

Perubahan cluster terjadi karena scaling mengubah perhitungan jarak antar observasi.

Pada data non-scaling, variabel dengan rentang lebih besar memiliki pengaruh lebih dominan terhadap pembentukan cluster.

Setelah scaling:

seluruh variabel memiliki kontribusi yang setara, sehingga kedekatan antar observasi berubah, dan beberapa observasi berpindah cluster.

Contohnya:

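observasi 8 sebelumnya berada di cluster 2 (non-scaling) bersama observasi 1, 6, dan 10. Setelah scaling, observasi 8 bergabung dengan observasi 4 dan 5 (cluster 2 versi scaling), sedangkan observasi 1, 6, dan 10 membentuk cluster 3.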

K-Means berhasil membentuk tiga cluster baik pada data non-scaling maupun scaling. Namun, scaling mempengaruhi hasil pengelompokan karena metode K-Means sangat sensitif terhadap jarak.

Dengan scaling, proses clustering menjadi lebih seimbang karena seluruh variabel memiliki kontribusi yang sama dalam pembentukan centroid dan perhitungan jarak antar data.

3. K-Medoid

Non-Scaling

# K-medoids (PAM) with k = 3 on the raw data.
# NOTE(review): the seed is kept from the original; pam() does not appear to
# use random starts, so it is likely unnecessary here -- confirm.
set.seed(123)

kmed <- pam(data, k = 3)
kmed
## Medoids:
##      ID X1 X2 X3 X4
## [1,]  3  9  7 11  8
## [2,]  5  4 12  7 12
## [3,] 10 13  7  3 12
## Clustering vector:
##  [1] 1 1 1 2 2 3 1 1 1 3
## Objective function:
##   build    swap 
## 4.12209 4.12209 
## 
## Available components:
##  [1] "medoids"    "id.med"     "clustering" "objective"  "isolation" 
##  [6] "clusinfo"   "silinfo"    "diss"       "call"       "data"
# Cluster label assigned to each observation by the unscaled fit.
kmed_clust <- kmed[["clustering"]]
kmed_clust
##  [1] 1 1 1 2 2 3 1 1 1 3
# Coordinates of the three medoid observations.
kmed[["medoids"]]
##      X1 X2 X3 X4
## [1,]  9  7 11  8
## [2,]  4 12  7 12
## [3,] 13  7  3 12

Scaling

# K-medoids (PAM) with k = 3 on the standardized data.
kmed_sc <- pam(datasc, k = 3)
kmed_sc
## Medoids:
##      ID          X1         X2         X3         X4
## [1,]  3  0.06983252 -0.3715142  0.6349275 -0.2742849
## [2,]  5 -1.09404276  1.3171868 -0.4692942  0.9447590
## [3,] 10  1.00093274 -0.3715142 -1.5735160  0.9447590
## Clustering vector:
##  [1] 1 1 1 2 2 3 1 1 1 3
## Objective function:
##    build     swap 
## 1.239476 1.239476 
## 
## Available components:
##  [1] "medoids"    "id.med"     "clustering" "objective"  "isolation" 
##  [6] "clusinfo"   "silinfo"    "diss"       "call"       "data"
# Cluster label assigned to each observation by the scaled fit.
kmed_clust_sc <- kmed_sc[["clustering"]]
kmed_clust_sc
##  [1] 1 1 1 2 2 3 1 1 1 3
# Medoid coordinates, expressed in standardized units.
kmed_sc[["medoids"]]
##               X1         X2         X3         X4
## [1,]  0.06983252 -0.3715142  0.6349275 -0.2742849
## [2,] -1.09404276  1.3171868 -0.4692942  0.9447590
## [3,]  1.00093274 -0.3715142 -1.5735160  0.9447590

Perbandingan

# Cross-tabulate unscaled (rows) against scaled (columns) k-medoids labels.
table(kmed_clust, kmed_clust_sc, dnn = c("kmed_clust", "kmed_clust_sc"))
##           kmed_clust_sc
## kmed_clust 1 2 3
##          1 6 0 0
##          2 0 2 0
##          3 0 0 2

Metode K-Medoids berhasil membentuk tiga cluster yang stabil baik pada data non-scaling maupun scaling. Seluruh anggota cluster tetap sama setelah scaling, yang menunjukkan bahwa struktur data cukup konsisten.

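Nilai objective function pada data scaling (1,24) memang lebih kecil daripada pada data non-scaling (4,12), tetapi kedua nilai ini tidak dapat dibandingkan secara langsung karena dihitung pada satuan yang berbeda (data asli vs. data terstandarisasi). Oleh karena itu, nilai yang lebih kecil tersebut tidak berarti cluster menjadi lebih kompak; keanggotaan cluster justru identik pada kedua kasus.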

4. PAM

Non-Scaling

# PAM with k = 3 on the raw (unscaled) data.
pam_non <- pam(data, k = 3)
pam_non[["clustering"]]
##  [1] 1 1 1 2 2 3 1 1 1 3
pam_non[["medoids"]]  # medoid coordinates
##      X1 X2 X3 X4
## [1,]  9  7 11  8
## [2,]  4 12  7 12
## [3,] 13  7  3 12
# Copy the data and append the cluster labels as a factor column.
result_pam_non <- data
result_pam_non[["cluster"]] <- as.factor(pam_non[["clustering"]])
result_pam_non
##    X1 X2 X3 X4 cluster
## 1  11  5  7  6       1
## 2   6  3 11  5       1
## 3   9  7 11  8       1
## 4   3  7  7 12       2
## 5   4 12  7 12       2
## 6  15  8  4  8       3
## 7   9 12 12  6       1
## 8  13  9 11 14       1
## 9   4 11 14  6       1
## 10 13  7  3 12       3

Scaling

# PAM with k = 3 on the standardized data.
pam_sc <- pam(datasc, k = 3)
pam_sc$clustering
##  [1] 1 1 1 2 2 3 1 1 1 3
pam_sc$medoids # Medoid
##               X1         X2         X3         X4
## [1,]  0.06983252 -0.3715142  0.6349275 -0.2742849
## [2,] -1.09404276  1.3171868 -0.4692942  0.9447590
## [3,]  1.00093274 -0.3715142 -1.5735160  0.9447590
# `datasc` comes from scale() and is a matrix, not a data frame. Using `$<-`
# on a matrix coerces it to a list ("Coercing LHS to a list"), which loses
# the row/column layout. Convert to a data frame before adding the column.
result_pam_sc <- as.data.frame(datasc)
result_pam_sc$cluster <- as.factor(pam_sc$clustering)
result_pam_sc
# Expected result: a 10 x 5 data frame (X1-X4 standardized, plus cluster).
# Values below are rounded to 3 decimals for readability; re-knit the
# document to regenerate the exact console output.
#        X1     X2     X3     X4 cluster
# 1   0.535 -1.047 -0.469 -0.884       1
# 2  -0.628 -1.722  0.635 -1.189       1
# 3   0.070 -0.372  0.635 -0.274       1
# 4  -1.327 -0.372 -0.469  0.945       2
# 5  -1.094  1.317 -0.469  0.945       2
# 6   1.466 -0.034 -1.297 -0.274       3
# 7   0.070  1.317  0.911 -0.884       1
# 8   1.001  0.304  0.635  1.554       1
# 9  -1.094  0.979  1.463 -0.884       1
# 10  1.001 -0.372 -1.574  0.945       3

Metode PAM berhasil membentuk tiga cluster dengan hasil yang stabil pada data non-scaling maupun scaling. Seluruh anggota cluster tetap sama setelah scaling, sehingga menunjukkan bahwa struktur data cukup konsisten dan tidak sensitif terhadap perubahan skala variabel.