Clustering

Dataset

Dataset: https://www.kaggle.com/datasets/valakhorasani/gym-members-exercise-dataset

Notebook ini membandingkan 5 algoritma clustering pada dataset gym members: K-means, K-medians, DBSCAN, Mean Shift, dan Fuzzy C-means. Data diproses menggunakan PCA sebelum clustering untuk mereduksi dimensi dan menghilangkan multikolinearitas antar fitur.

library(psych)
library(tidyverse)
library(flexclust)
library(dbscan)
library(meanShiftR)
library(e1071)
library(cluster)
library(fpc)
library(mclust)
library(factoextra)
library(gridExtra)

Load dan Eksplorasi Data

# Import Data
data <- read.csv("gym_members_exercise_tracking.csv")
knitr::kable(head(data))

Age	Gender	Weight..kg.	Height..m.	Max_BPM	Avg_BPM	Resting_BPM	Session_Duration..hours.	Calories_Burned	Workout_Type	Fat_Percentage	Water_Intake..liters.	Workout_Frequency..days.week.	Experience_Level	BMI
56	Male	88.3	1.71	180	157	60	1.69	1313	Yoga	12.6	3.5	4	3	30.20
46	Female	74.9	1.53	179	151	66	1.30	883	HIIT	33.9	2.1	4	2	32.00
32	Female	68.1	1.66	167	122	54	1.11	677	Cardio	33.4	2.3	4	2	24.71
25	Male	53.2	1.70	190	164	56	0.59	532	Strength	28.8	2.1	3	1	18.41
38	Male	46.1	1.79	188	158	68	0.64	556	Strength	29.2	2.8	3	1	14.39
56	Female	58.0	1.68	168	156	74	1.59	1116	HIIT	15.5	2.7	5	3	20.55

# Dimensi Data
cat("Dimensi:", nrow(data), "x", ncol(data), "\n")

## Dimensi: 973 x 15

# Struktur dan Tipe Data
str(data)

## 'data.frame':    973 obs. of  15 variables:
##  $ Age                          : int  56 46 32 25 38 56 36 40 28 28 ...
##  $ Gender                       : chr  "Male" "Female" "Female" "Male" ...
##  $ Weight..kg.                  : num  88.3 74.9 68.1 53.2 46.1 ...
##  $ Height..m.                   : num  1.71 1.53 1.66 1.7 1.79 1.68 1.72 1.51 1.94 1.84 ...
##  $ Max_BPM                      : int  180 179 167 190 188 168 174 189 185 169 ...
##  $ Avg_BPM                      : int  157 151 122 164 158 156 169 141 127 136 ...
##  $ Resting_BPM                  : int  60 66 54 56 68 74 73 64 52 64 ...
##  $ Session_Duration..hours.     : num  1.69 1.3 1.11 0.59 0.64 1.59 1.49 1.27 1.03 1.08 ...
##  $ Calories_Burned              : num  1313 883 677 532 556 ...
##  $ Workout_Type                 : chr  "Yoga" "HIIT" "Cardio" "Strength" ...
##  $ Fat_Percentage               : num  12.6 33.9 33.4 28.8 29.2 15.5 21.3 30.6 28.9 29.7 ...
##  $ Water_Intake..liters.        : num  3.5 2.1 2.3 2.1 2.8 2.7 2.3 1.9 2.6 2.7 ...
##  $ Workout_Frequency..days.week.: int  4 4 4 3 3 5 3 3 4 3 ...
##  $ Experience_Level             : int  3 2 2 1 1 3 2 2 2 1 ...
##  $ BMI                          : num  30.2 32 24.7 18.4 14.4 ...

summary(data)

##       Age           Gender           Weight..kg.       Height..m.   
##  Min.   :18.00   Length:973         Min.   : 40.00   Min.   :1.500  
##  1st Qu.:28.00   Class :character   1st Qu.: 58.10   1st Qu.:1.620  
##  Median :40.00   Mode  :character   Median : 70.00   Median :1.710  
##  Mean   :38.68                      Mean   : 73.85   Mean   :1.723  
##  3rd Qu.:49.00                      3rd Qu.: 86.00   3rd Qu.:1.800  
##  Max.   :59.00                      Max.   :129.90   Max.   :2.000  
##     Max_BPM         Avg_BPM       Resting_BPM    Session_Duration..hours.
##  Min.   :160.0   Min.   :120.0   Min.   :50.00   Min.   :0.500           
##  1st Qu.:170.0   1st Qu.:131.0   1st Qu.:56.00   1st Qu.:1.040           
##  Median :180.0   Median :143.0   Median :62.00   Median :1.260           
##  Mean   :179.9   Mean   :143.8   Mean   :62.22   Mean   :1.256           
##  3rd Qu.:190.0   3rd Qu.:156.0   3rd Qu.:68.00   3rd Qu.:1.460           
##  Max.   :199.0   Max.   :169.0   Max.   :74.00   Max.   :2.000           
##  Calories_Burned  Workout_Type       Fat_Percentage  Water_Intake..liters.
##  Min.   : 303.0   Length:973         Min.   :10.00   Min.   :1.500        
##  1st Qu.: 720.0   Class :character   1st Qu.:21.30   1st Qu.:2.200        
##  Median : 893.0   Mode  :character   Median :26.20   Median :2.600        
##  Mean   : 905.4                      Mean   :24.98   Mean   :2.627        
##  3rd Qu.:1076.0                      3rd Qu.:29.30   3rd Qu.:3.100        
##  Max.   :1783.0                      Max.   :35.00   Max.   :3.700        
##  Workout_Frequency..days.week. Experience_Level      BMI       
##  Min.   :2.000                 Min.   :1.00     Min.   :12.32  
##  1st Qu.:3.000                 1st Qu.:1.00     1st Qu.:20.11  
##  Median :3.000                 Median :2.00     Median :24.16  
##  Mean   :3.322                 Mean   :1.81     Mean   :24.91  
##  3rd Qu.:4.000                 3rd Qu.:2.00     3rd Qu.:28.56  
##  Max.   :5.000                 Max.   :3.00     Max.   :49.84

colSums(is.na(data))

##                           Age                        Gender 
##                             0                             0 
##                   Weight..kg.                    Height..m. 
##                             0                             0 
##                       Max_BPM                       Avg_BPM 
##                             0                             0 
##                   Resting_BPM      Session_Duration..hours. 
##                             0                             0 
##               Calories_Burned                  Workout_Type 
##                             0                             0 
##                Fat_Percentage         Water_Intake..liters. 
##                             0                             0 
## Workout_Frequency..days.week.              Experience_Level 
##                             0                             0 
##                           BMI 
##                             0

Preprocessing

1. Hitung Missing Value

colSums(is.na(data))

##                           Age                        Gender 
##                             0                             0 
##                   Weight..kg.                    Height..m. 
##                             0                             0 
##                       Max_BPM                       Avg_BPM 
##                             0                             0 
##                   Resting_BPM      Session_Duration..hours. 
##                             0                             0 
##               Calories_Burned                  Workout_Type 
##                             0                             0 
##                Fat_Percentage         Water_Intake..liters. 
##                             0                             0 
## Workout_Frequency..days.week.              Experience_Level 
##                             0                             0 
##                           BMI 
##                             0

2. Hanya Menggunakan Fitur Numerik

data_num   <- data[, sapply(data, is.numeric)]
cat("Fitur numerik yang digunakan:", ncol(data_num), "\n")

## Fitur numerik yang digunakan: 13

3. Mengganti Nama Fitur

colnames(data_num) <- paste0("X", 1:ncol(data_num))
knitr::kable(head(data_num))

X1	X2	X3	X4	X5	X6	X7	X8	X9	X10	X11	X12	X13
56	88.3	1.71	180	157	60	1.69	1313	12.6	3.5	4	3	30.20
46	74.9	1.53	179	151	66	1.30	883	33.9	2.1	4	2	32.00
32	68.1	1.66	167	122	54	1.11	677	33.4	2.3	4	2	24.71
25	53.2	1.70	190	164	56	0.59	532	28.8	2.1	3	1	18.41
38	46.1	1.79	188	158	68	0.64	556	29.2	2.8	3	1	14.39
56	58.0	1.68	168	156	74	1.59	1116	15.5	2.7	5	3	20.55

Assumptions

1. Corrrelation

Masing-masing variabel memiliki korelasi dan tidak bernilai 0

cor(data_num)

##               X1           X2           X3            X4            X5
## X1   1.000000000 -0.036339635 -0.027837495 -0.0170725970  0.0359691433
## X2  -0.036339635  1.000000000  0.365321203  0.0570611305  0.0097174780
## X3  -0.027837495  0.365321203  1.000000000 -0.0176598843 -0.0147762881
## X4  -0.017072597  0.057061130 -0.017659884  1.0000000000 -0.0397514432
## X5   0.035969143  0.009717478 -0.014776288 -0.0397514432  1.0000000000
## X6   0.004353714 -0.032138091 -0.005089864  0.0366474807  0.0596355022
## X7  -0.019911904 -0.013665561 -0.010205897  0.0100509814  0.0160144382
## X8  -0.154678760  0.095443473  0.086348051  0.0020900159  0.3396586672
## X9   0.002370051 -0.225511640 -0.235520936 -0.0090557315 -0.0073016551
## X10  0.041528359  0.394275710  0.393532902  0.0316206428 -0.0029106374
## X11  0.008055163 -0.011769328 -0.011269883 -0.0290990657 -0.0106807977
## X12 -0.018675927  0.003378528 -0.010266611  0.0005448337 -0.0008881572
## X13 -0.013691370  0.853157690 -0.159468750  0.0671052310  0.0216054995
##               X6           X7           X8           X9          X10
## X1   0.004353714 -0.019911904 -0.154678760  0.002370051  0.041528359
## X2  -0.032138091 -0.013665561  0.095443473 -0.225511640  0.394275710
## X3  -0.005089864 -0.010205897  0.086348051 -0.235520936  0.393532902
## X4   0.036647481  0.010050981  0.002090016 -0.009055731  0.031620643
## X5   0.059635502  0.016014438  0.339658667 -0.007301655 -0.002910637
## X6   1.000000000 -0.016648808  0.016517951 -0.016834389  0.007725998
## X7  -0.016648808  1.000000000  0.908140376 -0.581519771  0.283410977
## X8   0.016517951  0.908140376  1.000000000 -0.597615248  0.356930683
## X9  -0.016834389 -0.581519771 -0.597615248  1.000000000 -0.588682834
## X10  0.007725998  0.283410977  0.356930683 -0.588682834  1.000000000
## X11 -0.007966891  0.644140366  0.576150125 -0.537059548  0.238562571
## X12  0.001757585  0.764768119  0.694129448 -0.654362613  0.304103549
## X13 -0.032542632 -0.006492647  0.059760826 -0.119257760  0.213696572
##              X11           X12          X13
## X1   0.008055163 -0.0186759269 -0.013691370
## X2  -0.011769328  0.0033785279  0.853157690
## X3  -0.011269883 -0.0102666112 -0.159468750
## X4  -0.029099066  0.0005448337  0.067105231
## X5  -0.010680798 -0.0008881572  0.021605500
## X6  -0.007966891  0.0017575852 -0.032542632
## X7   0.644140366  0.7647681189 -0.006492647
## X8   0.576150125  0.6941294479  0.059760826
## X9  -0.537059548 -0.6543626129 -0.119257760
## X10  0.238562571  0.3041035494  0.213696572
## X11  1.000000000  0.8370787094  0.001644974
## X12  0.837078709  1.0000000000  0.016031073
## X13  0.001644974  0.0160310726  1.000000000

corrplot::corrplot(cor(data_num), tl.col = "black", tl.srt = 45, tl.cex = 0.5)

2. MSA

KMO/MSA memilki nilai > 0.5

# Check MSA
r <- cor(data_num)
KMO(r)

## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = r)
## Overall MSA =  0.47
## MSA for each item = 
##   X1   X2   X3   X4   X5   X6   X7   X8   X9  X10  X11  X12  X13 
## 0.02 0.35 0.16 0.63 0.06 0.25 0.50 0.49 0.86 0.75 0.80 0.79 0.29

# Delete X1
data <- data_num[-1]
r <- cor(data)
KMO(r)

## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = r)
## Overall MSA =  0.54
## MSA for each item = 
##   X2   X3   X4   X5   X6   X7   X8   X9  X10  X11  X12  X13 
## 0.36 0.16 0.60 0.09 0.40 0.59 0.58 0.87 0.82 0.80 0.80 0.30

# Delete X5
data <- data[-4]
r <- cor(data)
KMO(r)

## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = r)
## Overall MSA =  0.61
## MSA for each item = 
##   X2   X3   X4   X6   X7   X8   X9  X10  X11  X12  X13 
## 0.36 0.16 0.54 0.19 0.74 0.75 0.87 0.83 0.80 0.79 0.30

# Delete X3
data <- data[-2]
r <- cor(data)
KMO(r)

## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = r)
## Overall MSA =  0.72
## MSA for each item = 
##   X2   X4   X6   X7   X8   X9  X10  X11  X12  X13 
## 0.51 0.53 0.19 0.74 0.75 0.85 0.73 0.80 0.78 0.50

# Delete X6
data <- data[-3]
r <- cor(data)
KMO(r)

## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = r)
## Overall MSA =  0.72
## MSA for each item = 
##   X2   X4   X7   X8   X9  X10  X11  X12  X13 
## 0.51 0.56 0.74 0.75 0.85 0.73 0.80 0.79 0.50

3. Bartlett Test

Dengan syarat hasil p-value < 0.05

bartlett.test(data)

## 
##  Bartlett test of homogeneity of variances
## 
## data:  data
## Bartlett's K-squared = 53948, df = 8, p-value < 2.2e-16

Principal Component Analysis

1. Standarisasi

data_scale <- scale(data)
r = cov(data_scale)

2. Eigen Value dan Vector

pc <- eigen(r)
eigen_value <- pc$values
eigen_vector <- pc$vectors
print(eigen_value)

## [1] 3.97241779 2.01436367 0.99301511 0.82277961 0.55528999 0.30782414 0.14308312
## [8] 0.11605509 0.07517148

print(eigen_vector)

##               [,1]        [,2]        [,3]        [,4]         [,5]
##  [1,] -0.099617240 -0.65940372 -0.06538785 -0.14384144 -0.008399245
##  [2,] -0.005696882 -0.09097688  0.99424603  0.02099783  0.044865893
##  [3,] -0.440116953  0.15513574  0.04407065 -0.18577363 -0.440966682
##  [4,] -0.435887810  0.07399296  0.02110284 -0.12288184 -0.571382876
##  [5,]  0.408796215  0.08320719  0.02566220 -0.36646008 -0.141992160
##  [6,] -0.271758072 -0.29072163 -0.04086506  0.73988720  0.001629442
##  [7,] -0.400789286  0.15495238 -0.02254232 -0.23976481  0.589730005
##  [8,] -0.446212227  0.14249754  0.01358778 -0.16818811  0.329125078
##  [9,] -0.075090869 -0.62578718 -0.04231333 -0.40218961  0.026782003
##               [,6]        [,7]          [,8]         [,9]
##  [1,] -0.002632990  0.35104856  0.6091448359  0.189544735
##  [2,] -0.001620776  0.02530538  0.0008635083 -0.008245426
##  [3,] -0.067321913 -0.10161042 -0.1160223900  0.723191661
##  [4,] -0.073596872  0.24264945  0.0301603762 -0.630316021
##  [5,] -0.796334024 -0.12359334  0.1450612238 -0.019569357
##  [6,] -0.509513725 -0.14481995 -0.1091713262  0.005232390
##  [7,] -0.308731214  0.50252965 -0.2472519228  0.024420993
##  [8,] -0.008113606 -0.63480224  0.4634801799 -0.161059984
##  [9,]  0.030146211 -0.33935681 -0.5528674818 -0.129434537

3. Varians dan Kumulatif

sumvar <- sum(eigen_value)

propvar <- eigen_value / sumvar
cumvar  <- cumsum(propvar)

# Tampilkan tabel
pca_table <- data.frame(
  PC = paste0("PC", 1:length(eigen_value)),
  eigen_value = eigen_value,
  propvar = propvar * 100,
  cumulative = cumvar * 100
)

print(pca_table)

##    PC eigen_value    propvar cumulative
## 1 PC1  3.97241779 44.1379754   44.13798
## 2 PC2  2.01436367 22.3818185   66.51979
## 3 PC3  0.99301511 11.0335012   77.55330
## 4 PC4  0.82277961  9.1419956   86.69529
## 5 PC5  0.55528999  6.1698887   92.86518
## 6 PC6  0.30782414  3.4202683   96.28545
## 7 PC7  0.14308312  1.5898125   97.87526
## 8 PC8  0.11605509  1.2895010   99.16476
## 9 PC9  0.07517148  0.8352387  100.00000

4. Mengambil Total PCA >= 80%

n_pc <- which(cumvar >= 0.80)[1]

cat("Jumlah PC:", n_pc, "\n")

## Jumlah PC: 4

cat("Total varians:", round(cumvar[n_pc]*100, 2), "%\n")

## Total varians: 86.7 %

# Visualisasi
par(mfrow = c(1, 2), mar = c(5, 4, 4, 2))

# Scree Plot 
plot(propvar * 100, type = "b", pch = 19, col = "steelblue", lwd = 2,
     xlab = "Principal Component", ylab = "Varians (%)",
     main = "Scree Plot", frame = FALSE)

abline(v = n_pc, col = "red", lty = 2)
legend("topright", legend = paste("PC ke-", n_pc),
       col = "red", lty = 2, bty = "n")

# Kumulatif Varians 
plot(cumvar * 100, type = "b", pch = 19, col = "darkorange", lwd = 2,
     xlab = "Jumlah PC", ylab = "Kumulatif Varians (%)",
     main = "Kumulatif Varians", frame = FALSE, ylim = c(0, 100))

abline(h = 80, col = "red", lty = 2)
legend("bottomright", legend = "80% threshold",
       col = "red", lty = 2, bty = "n")

5. Hasil PCA

scores <- data_scale %*% eigen_vector

# Ambil sesuai n_pc
scores_PC <- scores

head(scores_PC)

##            [,1]       [,2]        [,3]         [,4]       [,5]       [,6]
## [1,] -3.5548957 -0.8833618 -0.08595574  0.517010543 -0.1447401  0.4177351
## [2,]  0.3043330 -0.1523520 -0.06144534 -1.843146738  0.3344867 -0.8900295
## [3,]  0.8742913  0.5933339 -1.08535378 -0.907137050  0.9490050 -0.9337736
## [4,]  2.7352589  0.8658108  0.90771702  0.475913884  1.0044088  0.2816337
## [5,]  2.4215596  1.1754000  0.74487711  1.564798773  0.8615264 -0.3968399
## [6,] -2.7419137  1.5571772 -0.95281782  0.007667743  0.9010904  0.4243947
##             [,7]         [,8]        [,9]
## [1,] -0.41012131 -0.009206248 -0.19661338
## [2,] -0.21756741 -0.337534476 -0.03979506
## [3,] -0.15051135  0.064842298  0.12866605
## [4,]  0.44619488 -0.104668701 -0.45596013
## [5,]  0.35883015 -0.107400094 -0.38509153
## [6,]  0.09407731 -0.125485492 -0.01599479

6. Visualisasi

pca_res <- prcomp(data_scale, center = FALSE, scale. = FALSE)

fviz_pca_var(pca_res,
             axes = c(1, 2),
             col.var = "contrib",
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE) +
  labs(title = "PCA — Kontribusi Variabel ke PC1 & PC2") +
  theme_minimal(base_size = 11)

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
##   Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Penentuan Jumlah Cluster Optimal

# Elbow Method (WSS)
wss <- sapply(1:10, function(k) {
  kmeans(scores_PC, centers = k, nstart = 20, iter.max = 100)$tot.withinss
})

# Silhouette Method 
k_values <- 2:10
avg_sil_values <- sapply(k_values, function(k) {
  km <- kmeans(scores_PC, centers = k, nstart = 25, iter.max = 100)
  mean(silhouette(km$cluster, dist(scores_PC))[, 3])
})

# Best K 
best_k <- k_values[which.max(avg_sil_values)]
cat("K optimal (Silhouette):", best_k, "\n")

## K optimal (Silhouette): 2

par(mfrow = c(1, 2), mar = c(5, 4, 4, 2))

plot(1:10, wss, type = "b", pch = 19, col = "steelblue", lwd = 2,
     xlab = "K", ylab = "Within-Cluster SS", main = "Elbow Method", frame = FALSE)
abline(v = 3, col = "red", lty = 2)
legend("topright", legend = "K = 3", col = "red", lty = 2, bty = "n")

plot(k_values, avg_sil_values, type = "b", pch = 19, col = "darkorange", lwd = 2,
     xlab = "K", ylab = "Avg Silhouette Width", main = "Silhouette Analysis", frame = FALSE)
abline(v = best_k, col = "red", lty = 2)
legend("topright", legend = paste("K =", best_k), col = "red", lty = 2, bty = "n")

Clustering (K = 3)

6.1 K-Means

# 1. K-means
km_res <- kmeans(scores_PC, centers = 3)
km_res

## K-means clustering with 3 clusters of sizes 220, 194, 559
## 
## Cluster means:
##         [,1]       [,2]        [,3]        [,4]        [,5]        [,6]
## 1  0.2062301 -2.0987082 -0.05572376 -0.04717566 -0.05202715 -0.11970994
## 2 -3.3264314  0.4091485  0.04811984  0.20967794  0.14273609  0.24260474
## 3  1.0732685  0.6839732  0.00523073 -0.05420192 -0.02906052 -0.03708253
##          [,7]        [,8]        [,9]
## 1  0.04820349  0.07247970  0.02118630
## 2 -0.09185405 -0.03885619  0.03256173
## 3  0.01290683 -0.01504013 -0.01963857
## 
## Clustering vector:
##   [1] 2 3 3 3 3 2 3 3 1 1 1 3 1 1 1 3 3 3 3 3 1 3 3 1 3 3 3 3 2 3 1 3 3 3 2 1 3
##  [38] 1 3 3 3 3 1 3 1 3 3 1 3 1 3 2 1 1 3 1 3 3 2 3 3 3 2 3 2 3 2 3 2 1 2 3 3 3
##  [75] 1 3 3 3 1 2 3 2 3 3 3 3 1 3 1 3 2 2 1 3 1 3 1 1 3 2 1 3 3 1 3 2 3 2 3 3 3
## [112] 3 2 2 3 3 3 2 1 3 3 1 1 1 2 1 3 2 1 2 3 3 2 1 3 1 3 3 3 2 1 1 1 3 3 2 3 3
## [149] 1 3 3 1 2 3 1 3 3 3 3 3 1 1 1 2 2 3 2 3 3 3 1 3 2 3 1 3 2 3 3 3 1 2 2 1 2
## [186] 3 2 1 3 3 3 3 3 1 3 3 3 2 3 3 3 2 3 3 2 2 1 3 3 2 3 3 2 3 3 3 3 3 3 1 3 2
## [223] 3 2 1 2 1 3 3 2 3 3 3 2 1 3 1 2 3 3 1 1 3 3 1 3 3 3 3 3 1 1 3 3 3 3 2 1 3
## [260] 1 3 1 3 3 1 3 3 2 3 2 3 1 3 3 2 1 2 3 2 1 3 3 3 1 3 2 1 1 1 1 3 1 1 3 3 3
## [297] 3 3 1 1 1 2 1 2 3 3 3 2 3 2 2 1 1 3 3 2 2 3 3 3 3 2 3 3 3 3 3 3 3 3 2 3 3
## [334] 3 3 1 3 3 1 3 1 2 3 3 1 1 3 3 2 3 3 3 3 3 3 1 2 1 3 1 3 1 3 3 2 3 3 2 3 1
## [371] 1 3 3 2 3 1 1 3 3 1 2 1 1 3 1 3 3 3 1 3 3 2 3 3 3 2 2 3 2 3 3 3 3 3 2 1 1
## [408] 3 1 3 3 3 2 3 1 3 3 3 3 2 1 2 2 1 2 3 2 1 2 2 2 3 3 3 3 3 1 3 2 3 2 3 2 2
## [445] 3 1 1 3 1 3 3 3 3 3 3 2 1 2 1 3 3 3 3 1 3 1 3 1 3 3 1 3 2 3 3 2 1 3 3 3 1
## [482] 1 3 3 2 2 1 3 3 3 3 3 2 3 1 1 1 2 3 3 3 3 3 3 3 3 3 1 3 3 3 2 2 2 3 3 3 1
## [519] 2 3 3 3 3 3 1 3 1 3 1 2 2 3 3 3 1 3 2 3 2 1 3 3 2 3 3 3 3 1 3 3 3 1 3 3 1
## [556] 3 3 2 3 3 3 1 1 3 1 3 3 3 3 3 3 1 2 3 1 3 3 3 3 3 3 3 3 3 2 3 3 2 1 1 3 2
## [593] 3 1 1 2 3 3 3 3 3 2 3 1 3 2 3 3 2 2 2 2 3 3 1 2 3 3 1 3 3 2 2 3 3 3 3 2 3
## [630] 3 2 1 3 3 1 3 3 3 2 1 1 3 3 1 1 1 2 1 1 2 3 3 3 3 3 3 3 3 3 3 3 1 3 2 3 3
## [667] 3 3 2 2 3 3 2 2 3 3 1 3 1 3 3 1 3 3 3 3 3 1 3 3 3 3 2 3 2 3 1 1 3 3 3 3 3
## [704] 3 3 3 1 3 3 3 1 2 2 3 1 1 3 3 1 2 3 1 3 3 1 3 3 3 2 1 3 3 2 1 1 2 2 1 2 1
## [741] 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 2 3 3 1 3 3 1 3 3 3 2 3 3 3 1 2 1 3
## [778] 2 3 2 3 3 3 3 2 1 3 3 1 3 2 3 3 3 3 3 3 2 3 3 3 2 3 3 2 3 1 3 3 3 2 2 1 3
## [815] 3 1 3 3 3 2 3 3 3 3 3 3 1 3 3 3 2 2 2 1 3 2 3 3 1 1 2 3 3 3 1 2 1 3 2 1 3
## [852] 3 1 3 1 1 3 2 3 3 3 3 3 2 2 1 2 3 1 3 2 3 3 3 3 1 2 2 3 2 3 2 3 3 1 2 3 3
## [889] 3 1 3 1 2 3 3 3 2 2 1 3 1 3 2 3 3 1 2 1 3 3 2 1 1 1 1 3 1 3 3 3 2 3 2 3 1
## [926] 3 1 2 3 3 2 3 2 1 3 3 1 1 3 3 3 3 2 2 3 1 3 3 2 1 3 3 3 3 2 1 3 2 3 1 3 3
## [963] 2 3 2 3 2 3 2 3 2 1 1
## 
## Within cluster sum of squares by cluster:
## [1] 1200.8401  646.8118 2800.9395
##  (between_SS / total_SS =  46.9 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

# Tambahin cluster ke data asli
data_clustered <- as.data.frame(scores_PC) # Convert matrix to data frame
data_clustered$cluster <- km_res$cluster

# Distribusi cluster
cat("Distribusi cluster:\n")

## Distribusi cluster:

print(table(data_clustered$cluster))

## 
##   1   2   3 
## 220 194 559

cluster1 <- subset(data_clustered, cluster == 1)
cluster2 <- subset(data_clustered, cluster == 2)
cluster3 <- subset(data_clustered, cluster == 3)

View(head(cluster1, 10)) # Display first 10 rows for consistency
View(head(cluster2, 10))
View(head(cluster3, 10))

6.2 K-Medians

# 2. K-Medians
kmed_res <- kcca(scores_PC, k = 3, family = kccaFamily("kmedians"))

## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'

## Also defined by 'kernlab'

## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'

## Also defined by 'kernlab'

# Distribusi cluster
cat("Distribusi cluster:\n")

## Distribusi cluster:

print(table(clusters(kmed_res)))

## 
##   1   2   3 
## 465 195 313

# Tambahin ke data asli
data_clustered <- as.data.frame(scores_PC) # Convert matrix to data frame
data_clustered$cluster <- clusters(kmed_res)

# Pisahin per cluster
cluster1 <- subset(data_clustered, cluster == 1)
cluster2 <- subset(data_clustered, cluster == 2)
cluster3 <- subset(data_clustered, cluster == 3)

cat("\nCluster 1:\n")

## 
## Cluster 1:

print(head(cluster1, 10))

##             V1          V2          V3         V4         V5         V6
## 2   0.30433297 -0.15235202 -0.06144534 -1.8431467  0.3344867 -0.8900295
## 3   0.87429126  0.59333389 -1.08535378 -0.9071370  0.9490050 -0.9337736
## 7  -1.09953969  0.59237179 -0.40355561 -0.4058413 -1.3721097  0.6724907
## 8   0.67324078 -0.06226236  0.84815889 -1.4832362 -0.1884369  0.0331401
## 9   0.13336756 -2.16208453  0.20775515 -1.0409090  1.1462374 -0.5862504
## 10  1.10306715 -1.55716701 -1.08058122 -0.2987946 -0.2768027 -0.4632111
## 12 -0.07206057  0.86870092 -0.37886144  1.8397674  0.0934775 -0.4561576
## 13 -0.74293531 -3.12782710  1.06823435 -1.0903531 -0.1441856  0.4613117
## 14  0.50925990 -1.27403500 -0.21939956  1.4662842 -0.3097752 -1.0905298
## 15 -1.10660014 -2.72993738  1.19091307 -0.3264263 -0.7911292 -0.7004210
##            V7           V8          V9 cluster
## 2  -0.2175674 -0.337534476 -0.03979506       1
## 3  -0.1505114  0.064842298  0.12866605       1
## 7   0.1557596  0.147587929 -0.66282561       1
## 8  -0.6262349 -0.125276356 -0.17479947       1
## 9   0.4650611  0.845288799  0.19750622       1
## 10  0.5484607  0.098413666  0.16422097       1
## 12 -0.5479242 -0.006869135 -0.22705949       1
## 13 -0.4695494 -0.279702954 -0.34837504       1
## 14  0.5362785  0.158909882  0.47563148       1
## 15 -0.1009853  0.463198794 -0.35452841       1

cat("\nCluster 2:\n")

## 
## Cluster 2:

print(head(cluster2, 10))

##           V1         V2          V3           V4          V5          V6
## 1  -3.554896 -0.8833618 -0.08595574  0.517010543 -0.14474009  0.41773511
## 6  -2.741914  1.5571772 -0.95281782  0.007667743  0.90109035  0.42439468
## 29 -3.414256 -0.1816066  0.43733348  0.657762898  0.96908399 -0.08376031
## 35 -3.755421 -0.4620553  0.49190353  0.723291560  0.95389667  0.42800434
## 52 -3.606637 -0.2392797 -1.38845327  0.517492975  0.83064715  0.09562245
## 56 -2.011343 -2.8808307 -0.04260298 -1.717790838 -0.46980599  0.05465696
## 59 -2.333756  1.2384319  0.20441681  0.061580913  0.07959266  0.44424174
## 63 -4.057361 -0.9416178  0.14112459  0.056716432  0.46952921  0.04547263
## 65 -3.640339  1.5365387 -1.59407375 -0.602063163 -0.28785926  0.04745706
## 67 -4.717246 -0.6868947 -0.40204045 -0.336071735 -0.50297643 -0.27893693
##             V7           V8          V9 cluster
## 1  -0.41012131 -0.009206248 -0.19661338       2
## 6   0.09407731 -0.125485492 -0.01599479       2
## 29  0.18273295  0.123449267  0.27820907       2
## 35  0.19176065 -0.288313475 -0.36587600       2
## 52  0.03510367 -0.156739461  0.23625443       2
## 56  0.22822079 -0.658257308 -0.53873671       2
## 59 -0.43275356  0.236731147  0.05161049       2
## 63 -0.04163305 -0.544244323 -0.07429049       2
## 65  0.21498934 -0.139253393  0.11062017       2
## 67  0.03856095 -0.654421805 -0.26048216       2

cat("\nCluster 3:\n")

## 
## Cluster 3:

print(head(cluster3, 10))

##           V1         V2        V3          V4         V5          V6
## 4  2.7352589  0.8658108 0.9077170  0.47591388  1.0044088  0.28163368
## 5  2.4215596  1.1754000 0.7448771  1.56479877  0.8615264 -0.39683991
## 11 1.2347599 -4.1550594 0.3322366  0.21627066  0.1912985  0.95463267
## 16 2.0848219  1.7915295 0.2926179  0.83424236 -0.7615915  0.77050990
## 18 1.9295951  2.2877142 0.4258162 -0.48889019 -0.6214933  0.08119026
## 20 0.8981528 -0.3838146 0.7549208 -0.24343218 -1.5880251  0.74160733
## 22 1.1481069 -0.5599490 1.4635949  1.32296430 -0.9304952 -0.01415540
## 23 1.8663454 -0.2563679 0.6652213 -0.01041138 -1.1373642 -0.43723259
## 24 1.4498074 -3.8214515 1.1497064  0.55509396 -0.1825644 -0.40311925
## 27 2.5897498 -0.1101889 1.0253227  1.32856447  0.2892521  0.30054425
##              V7           V8          V9 cluster
## 4   0.446194877 -0.104668701 -0.45596013       3
## 5   0.358830154 -0.107400094 -0.38509153       3
## 11 -0.325319019 -0.386198682  0.01671153       3
## 16  0.006225267  0.014232110  0.18182402       3
## 18  0.693720253 -0.008768853 -0.01106649       3
## 20  0.079105115 -0.222125215  0.05443074       3
## 22 -0.042395950 -0.068111396 -0.06601022       3
## 23 -0.232020122 -0.017410138  0.02962104       3
## 24 -0.162337303  0.175866809 -0.28928185       3
## 27 -0.169641308  0.172133142 -0.03537376       3

6.3 DBSCAN

# eps=2.0 dipilih agar DBSCAN menghasilkan cluster yang bermakna
db_res <- dbscan(scores_PC, eps = 2.0, MinPts = 5)
cat("Distribusi cluster (−1 = noise):\n")

## Distribusi cluster (−1 = noise):

print(table(db_res$cluster))

## 
##   0   1 
##   1 972

cat("\nJumlah noise:", sum(db_res$cluster == 0), "\n")

## 
## Jumlah noise: 1

6.4 Mean Shift

ms_res <- meanShift(scores_PC)
cat("Jumlah cluster yang terbentuk:", length(unique(ms_res$assignment)), "\n")

## Jumlah cluster yang terbentuk: 213

cat("Distribusi cluster:\n")

## Distribusi cluster:

print(table(ms_res$assignment))

## 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
## 101  50  68   8   4  14  66   2   1   1   1   1   4  11  31   9   8   2  11   1 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
##   4  19   1   9  10   1   3   3   1   3  18  32   1   1  31   3  17   3   5   1 
##  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
##   5   6   2   1   8   4   3   6   7   6   5   1   1   1  15   3   3   1   2   1 
##  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
##   1   2   1   2   4   1  19   1   9   1   6   6   9   1   1   2   1  11   1   1 
##  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
##   2   8   6   4   2   1   1   1   5   1   1   1   5   1   5   1   1   6   1   4 
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
##   1   1   1   1  10   5   6   9   6   1   2   2   1   2  15   1   1   1   1   2 
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
##   3   1   8   1   2   1   4   1   1   1   1   1   1   2   1   1   1   1   1   1 
## 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 
##   3   1   1   1   1   2   2   1   1   2   1   1   1   1   1   1   1   1   1   1 
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 
##   1   1   1   2   2   2   2   1   1   3   1   1   4   1   2   2   2   1   2   1 
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 
##   1   1   2   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 201 202 203 204 205 206 207 208 209 210 211 212 213 
##   1   1   1   1   1   1   1   1   1   1   1   1   1

6.5 Fuzzy C-means

fcm_res <- cmeans(scores_PC, centers = 3, m = 2, iter.max = 100)
cat("Distribusi cluster (hard assignment):\n")

## Distribusi cluster (hard assignment):

print(table(fcm_res$cluster))

## 
##   1   2   3 
## 213 323 437

cat("\nContoh membership degree (5 baris pertama):\n")

## 
## Contoh membership degree (5 baris pertama):

print(round(head(fcm_res$membership, 5), 3))

##          1     2     3
## [1,] 0.837 0.084 0.079
## [2,] 0.133 0.442 0.425
## [3,] 0.101 0.433 0.466
## [4,] 0.093 0.428 0.479
## [5,] 0.112 0.422 0.467

Visualisasi Hasil Clustering

# --- Visualization Comparison ---
par(mfrow = c(2, 3), mar = c(4, 4, 2, 1))

# K-Means
plot(scores_PC,
     col = km_res$cluster,
     pch = 16,
     main = "K-Means")

# K-Medians
plot(scores_PC,
     col = clusters(kmed_res),
     pch = 16,
     main = "K-Medians")

# DBSCAN
plot(scores_PC,
     col = db_res$cluster + 1L,
     pch = 16,
     main = "DBSCAN (0 = Noise)")

# Mean Shift
plot(scores_PC,
     col = ms_res$assignment,
     pch = 16,
     main = "Mean Shift")

# Fuzzy C-Means
plot(scores_PC,
     col = fcm_res$cluster,
     pch = 16,
     main = "Fuzzy C-Means")

Evaluasi Metrik

dist_matrix <- dist(scores_PC)

eval_clustering <- function(labels, data_dist, name) {
  valid <- labels > 0
  n_cl  <- length(unique(labels[valid]))
  if (n_cl < 2) return(data.frame(Method=name, K=n_cl, Silhouette=NA, Dunn=NA))

  sil   <- mean(silhouette(labels[valid], as.dist(as.matrix(data_dist)[valid, valid]))[, 3])
  stats <- cluster.stats(data_dist, labels)
  data.frame(Method=name, K=n_cl, Silhouette=round(sil,4), Dunn=round(stats$dunn,4))
}

eval_results <- rbind(
  eval_clustering(km_res$cluster,     dist_matrix, "K-means"),
  eval_clustering(clusters(kmed_res), dist_matrix, "K-medians"),
  eval_clustering(db_res$cluster,     dist_matrix, "DBSCAN"),
  eval_clustering(ms_res$assignment,  dist_matrix, "Mean Shift"),
  eval_clustering(fcm_res$cluster,    dist_matrix, "Fuzzy C-means")
)

print(eval_results, row.names = FALSE)

##         Method   K Silhouette   Dunn
##        K-means   3     0.2961 0.1152
##      K-medians   3     0.1952 0.0692
##         DBSCAN   1         NA     NA
##     Mean Shift 213    -0.0856 0.0738
##  Fuzzy C-means   3     0.2425 0.0688

cat("\nMetode terbaik:", eval_results$Method[which.max(eval_results$Silhouette)], "\n")

## 
## Metode terbaik: K-means

Silhouette Plot (K-Means)

fviz_silhouette(silhouette(km_res$cluster, dist_matrix)) +
  labs(title = "Silhouette Plot - K-means") +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(face = "bold", hjust = 0.5))

##   cluster size ave.sil.width
## 1       1  220          0.22
## 2       2  194          0.49
## 3       3  559          0.26