Dataset: https://www.kaggle.com/datasets/valakhorasani/gym-members-exercise-dataset
Notebook ini membandingkan 5 algoritma clustering pada dataset gym members: K-means, K-medians, DBSCAN, Mean Shift, dan Fuzzy C-means. Data diproses menggunakan PCA sebelum clustering untuk mereduksi dimensi dan menghilangkan multikolinearitas antar fitur.
library(psych)
library(tidyverse)
library(flexclust)
library(dbscan)
library(meanShiftR)
library(e1071)
library(cluster)
library(fpc)
library(mclust)
library(factoextra)
library(gridExtra)
# Import Data
data <- read.csv("gym_members_exercise_tracking.csv")
knitr::kable(head(data))
| Age | Gender | Weight..kg. | Height..m. | Max_BPM | Avg_BPM | Resting_BPM | Session_Duration..hours. | Calories_Burned | Workout_Type | Fat_Percentage | Water_Intake..liters. | Workout_Frequency..days.week. | Experience_Level | BMI |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 56 | Male | 88.3 | 1.71 | 180 | 157 | 60 | 1.69 | 1313 | Yoga | 12.6 | 3.5 | 4 | 3 | 30.20 |
| 46 | Female | 74.9 | 1.53 | 179 | 151 | 66 | 1.30 | 883 | HIIT | 33.9 | 2.1 | 4 | 2 | 32.00 |
| 32 | Female | 68.1 | 1.66 | 167 | 122 | 54 | 1.11 | 677 | Cardio | 33.4 | 2.3 | 4 | 2 | 24.71 |
| 25 | Male | 53.2 | 1.70 | 190 | 164 | 56 | 0.59 | 532 | Strength | 28.8 | 2.1 | 3 | 1 | 18.41 |
| 38 | Male | 46.1 | 1.79 | 188 | 158 | 68 | 0.64 | 556 | Strength | 29.2 | 2.8 | 3 | 1 | 14.39 |
| 56 | Female | 58.0 | 1.68 | 168 | 156 | 74 | 1.59 | 1116 | HIIT | 15.5 | 2.7 | 5 | 3 | 20.55 |
# Dimensi Data
cat("Dimensi:", nrow(data), "x", ncol(data), "\n")
## Dimensi: 973 x 15
# Struktur dan Tipe Data
str(data)
## 'data.frame': 973 obs. of 15 variables:
## $ Age : int 56 46 32 25 38 56 36 40 28 28 ...
## $ Gender : chr "Male" "Female" "Female" "Male" ...
## $ Weight..kg. : num 88.3 74.9 68.1 53.2 46.1 ...
## $ Height..m. : num 1.71 1.53 1.66 1.7 1.79 1.68 1.72 1.51 1.94 1.84 ...
## $ Max_BPM : int 180 179 167 190 188 168 174 189 185 169 ...
## $ Avg_BPM : int 157 151 122 164 158 156 169 141 127 136 ...
## $ Resting_BPM : int 60 66 54 56 68 74 73 64 52 64 ...
## $ Session_Duration..hours. : num 1.69 1.3 1.11 0.59 0.64 1.59 1.49 1.27 1.03 1.08 ...
## $ Calories_Burned : num 1313 883 677 532 556 ...
## $ Workout_Type : chr "Yoga" "HIIT" "Cardio" "Strength" ...
## $ Fat_Percentage : num 12.6 33.9 33.4 28.8 29.2 15.5 21.3 30.6 28.9 29.7 ...
## $ Water_Intake..liters. : num 3.5 2.1 2.3 2.1 2.8 2.7 2.3 1.9 2.6 2.7 ...
## $ Workout_Frequency..days.week.: int 4 4 4 3 3 5 3 3 4 3 ...
## $ Experience_Level : int 3 2 2 1 1 3 2 2 2 1 ...
## $ BMI : num 30.2 32 24.7 18.4 14.4 ...
summary(data)
## Age Gender Weight..kg. Height..m.
## Min. :18.00 Length:973 Min. : 40.00 Min. :1.500
## 1st Qu.:28.00 Class :character 1st Qu.: 58.10 1st Qu.:1.620
## Median :40.00 Mode :character Median : 70.00 Median :1.710
## Mean :38.68 Mean : 73.85 Mean :1.723
## 3rd Qu.:49.00 3rd Qu.: 86.00 3rd Qu.:1.800
## Max. :59.00 Max. :129.90 Max. :2.000
## Max_BPM Avg_BPM Resting_BPM Session_Duration..hours.
## Min. :160.0 Min. :120.0 Min. :50.00 Min. :0.500
## 1st Qu.:170.0 1st Qu.:131.0 1st Qu.:56.00 1st Qu.:1.040
## Median :180.0 Median :143.0 Median :62.00 Median :1.260
## Mean :179.9 Mean :143.8 Mean :62.22 Mean :1.256
## 3rd Qu.:190.0 3rd Qu.:156.0 3rd Qu.:68.00 3rd Qu.:1.460
## Max. :199.0 Max. :169.0 Max. :74.00 Max. :2.000
## Calories_Burned Workout_Type Fat_Percentage Water_Intake..liters.
## Min. : 303.0 Length:973 Min. :10.00 Min. :1.500
## 1st Qu.: 720.0 Class :character 1st Qu.:21.30 1st Qu.:2.200
## Median : 893.0 Mode :character Median :26.20 Median :2.600
## Mean : 905.4 Mean :24.98 Mean :2.627
## 3rd Qu.:1076.0 3rd Qu.:29.30 3rd Qu.:3.100
## Max. :1783.0 Max. :35.00 Max. :3.700
## Workout_Frequency..days.week. Experience_Level BMI
## Min. :2.000 Min. :1.00 Min. :12.32
## 1st Qu.:3.000 1st Qu.:1.00 1st Qu.:20.11
## Median :3.000 Median :2.00 Median :24.16
## Mean :3.322 Mean :1.81 Mean :24.91
## 3rd Qu.:4.000 3rd Qu.:2.00 3rd Qu.:28.56
## Max. :5.000 Max. :3.00 Max. :49.84
colSums(is.na(data))
## Age Gender
## 0 0
## Weight..kg. Height..m.
## 0 0
## Max_BPM Avg_BPM
## 0 0
## Resting_BPM Session_Duration..hours.
## 0 0
## Calories_Burned Workout_Type
## 0 0
## Fat_Percentage Water_Intake..liters.
## 0 0
## Workout_Frequency..days.week. Experience_Level
## 0 0
## BMI
## 0
colSums(is.na(data))
## Age Gender
## 0 0
## Weight..kg. Height..m.
## 0 0
## Max_BPM Avg_BPM
## 0 0
## Resting_BPM Session_Duration..hours.
## 0 0
## Calories_Burned Workout_Type
## 0 0
## Fat_Percentage Water_Intake..liters.
## 0 0
## Workout_Frequency..days.week. Experience_Level
## 0 0
## BMI
## 0
data_num <- data[, sapply(data, is.numeric)]
cat("Fitur numerik yang digunakan:", ncol(data_num), "\n")
## Fitur numerik yang digunakan: 13
colnames(data_num) <- paste0("X", 1:ncol(data_num))
knitr::kable(head(data_num))
| X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | X9 | X10 | X11 | X12 | X13 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 56 | 88.3 | 1.71 | 180 | 157 | 60 | 1.69 | 1313 | 12.6 | 3.5 | 4 | 3 | 30.20 |
| 46 | 74.9 | 1.53 | 179 | 151 | 66 | 1.30 | 883 | 33.9 | 2.1 | 4 | 2 | 32.00 |
| 32 | 68.1 | 1.66 | 167 | 122 | 54 | 1.11 | 677 | 33.4 | 2.3 | 4 | 2 | 24.71 |
| 25 | 53.2 | 1.70 | 190 | 164 | 56 | 0.59 | 532 | 28.8 | 2.1 | 3 | 1 | 18.41 |
| 38 | 46.1 | 1.79 | 188 | 158 | 68 | 0.64 | 556 | 29.2 | 2.8 | 3 | 1 | 14.39 |
| 56 | 58.0 | 1.68 | 168 | 156 | 74 | 1.59 | 1116 | 15.5 | 2.7 | 5 | 3 | 20.55 |
Masing-masing variabel memiliki korelasi dan tidak bernilai 0
cor(data_num)
## X1 X2 X3 X4 X5
## X1 1.000000000 -0.036339635 -0.027837495 -0.0170725970 0.0359691433
## X2 -0.036339635 1.000000000 0.365321203 0.0570611305 0.0097174780
## X3 -0.027837495 0.365321203 1.000000000 -0.0176598843 -0.0147762881
## X4 -0.017072597 0.057061130 -0.017659884 1.0000000000 -0.0397514432
## X5 0.035969143 0.009717478 -0.014776288 -0.0397514432 1.0000000000
## X6 0.004353714 -0.032138091 -0.005089864 0.0366474807 0.0596355022
## X7 -0.019911904 -0.013665561 -0.010205897 0.0100509814 0.0160144382
## X8 -0.154678760 0.095443473 0.086348051 0.0020900159 0.3396586672
## X9 0.002370051 -0.225511640 -0.235520936 -0.0090557315 -0.0073016551
## X10 0.041528359 0.394275710 0.393532902 0.0316206428 -0.0029106374
## X11 0.008055163 -0.011769328 -0.011269883 -0.0290990657 -0.0106807977
## X12 -0.018675927 0.003378528 -0.010266611 0.0005448337 -0.0008881572
## X13 -0.013691370 0.853157690 -0.159468750 0.0671052310 0.0216054995
## X6 X7 X8 X9 X10
## X1 0.004353714 -0.019911904 -0.154678760 0.002370051 0.041528359
## X2 -0.032138091 -0.013665561 0.095443473 -0.225511640 0.394275710
## X3 -0.005089864 -0.010205897 0.086348051 -0.235520936 0.393532902
## X4 0.036647481 0.010050981 0.002090016 -0.009055731 0.031620643
## X5 0.059635502 0.016014438 0.339658667 -0.007301655 -0.002910637
## X6 1.000000000 -0.016648808 0.016517951 -0.016834389 0.007725998
## X7 -0.016648808 1.000000000 0.908140376 -0.581519771 0.283410977
## X8 0.016517951 0.908140376 1.000000000 -0.597615248 0.356930683
## X9 -0.016834389 -0.581519771 -0.597615248 1.000000000 -0.588682834
## X10 0.007725998 0.283410977 0.356930683 -0.588682834 1.000000000
## X11 -0.007966891 0.644140366 0.576150125 -0.537059548 0.238562571
## X12 0.001757585 0.764768119 0.694129448 -0.654362613 0.304103549
## X13 -0.032542632 -0.006492647 0.059760826 -0.119257760 0.213696572
## X11 X12 X13
## X1 0.008055163 -0.0186759269 -0.013691370
## X2 -0.011769328 0.0033785279 0.853157690
## X3 -0.011269883 -0.0102666112 -0.159468750
## X4 -0.029099066 0.0005448337 0.067105231
## X5 -0.010680798 -0.0008881572 0.021605500
## X6 -0.007966891 0.0017575852 -0.032542632
## X7 0.644140366 0.7647681189 -0.006492647
## X8 0.576150125 0.6941294479 0.059760826
## X9 -0.537059548 -0.6543626129 -0.119257760
## X10 0.238562571 0.3041035494 0.213696572
## X11 1.000000000 0.8370787094 0.001644974
## X12 0.837078709 1.0000000000 0.016031073
## X13 0.001644974 0.0160310726 1.000000000
corrplot::corrplot(cor(data_num), tl.col = "black", tl.srt = 45, tl.cex = 0.5)
KMO/MSA memilki nilai > 0.5
# Check MSA
r <- cor(data_num)
KMO(r)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = r)
## Overall MSA = 0.47
## MSA for each item =
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13
## 0.02 0.35 0.16 0.63 0.06 0.25 0.50 0.49 0.86 0.75 0.80 0.79 0.29
# Delete X1
data <- data_num[-1]
r <- cor(data)
KMO(r)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = r)
## Overall MSA = 0.54
## MSA for each item =
## X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13
## 0.36 0.16 0.60 0.09 0.40 0.59 0.58 0.87 0.82 0.80 0.80 0.30
# Delete X5
data <- data[-4]
r <- cor(data)
KMO(r)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = r)
## Overall MSA = 0.61
## MSA for each item =
## X2 X3 X4 X6 X7 X8 X9 X10 X11 X12 X13
## 0.36 0.16 0.54 0.19 0.74 0.75 0.87 0.83 0.80 0.79 0.30
# Delete X3
data <- data[-2]
r <- cor(data)
KMO(r)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = r)
## Overall MSA = 0.72
## MSA for each item =
## X2 X4 X6 X7 X8 X9 X10 X11 X12 X13
## 0.51 0.53 0.19 0.74 0.75 0.85 0.73 0.80 0.78 0.50
# Delete X6
data <- data[-3]
r <- cor(data)
KMO(r)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = r)
## Overall MSA = 0.72
## MSA for each item =
## X2 X4 X7 X8 X9 X10 X11 X12 X13
## 0.51 0.56 0.74 0.75 0.85 0.73 0.80 0.79 0.50
Dengan syarat hasil p-value < 0.05
bartlett.test(data)
##
## Bartlett test of homogeneity of variances
##
## data: data
## Bartlett's K-squared = 53948, df = 8, p-value < 2.2e-16
data_scale <- scale(data)
r = cov(data_scale)
pc <- eigen(r)
eigen_value <- pc$values
eigen_vector <- pc$vectors
print(eigen_value)
## [1] 3.97241779 2.01436367 0.99301511 0.82277961 0.55528999 0.30782414 0.14308312
## [8] 0.11605509 0.07517148
print(eigen_vector)
## [,1] [,2] [,3] [,4] [,5]
## [1,] -0.099617240 -0.65940372 -0.06538785 -0.14384144 -0.008399245
## [2,] -0.005696882 -0.09097688 0.99424603 0.02099783 0.044865893
## [3,] -0.440116953 0.15513574 0.04407065 -0.18577363 -0.440966682
## [4,] -0.435887810 0.07399296 0.02110284 -0.12288184 -0.571382876
## [5,] 0.408796215 0.08320719 0.02566220 -0.36646008 -0.141992160
## [6,] -0.271758072 -0.29072163 -0.04086506 0.73988720 0.001629442
## [7,] -0.400789286 0.15495238 -0.02254232 -0.23976481 0.589730005
## [8,] -0.446212227 0.14249754 0.01358778 -0.16818811 0.329125078
## [9,] -0.075090869 -0.62578718 -0.04231333 -0.40218961 0.026782003
## [,6] [,7] [,8] [,9]
## [1,] -0.002632990 0.35104856 0.6091448359 0.189544735
## [2,] -0.001620776 0.02530538 0.0008635083 -0.008245426
## [3,] -0.067321913 -0.10161042 -0.1160223900 0.723191661
## [4,] -0.073596872 0.24264945 0.0301603762 -0.630316021
## [5,] -0.796334024 -0.12359334 0.1450612238 -0.019569357
## [6,] -0.509513725 -0.14481995 -0.1091713262 0.005232390
## [7,] -0.308731214 0.50252965 -0.2472519228 0.024420993
## [8,] -0.008113606 -0.63480224 0.4634801799 -0.161059984
## [9,] 0.030146211 -0.33935681 -0.5528674818 -0.129434537
sumvar <- sum(eigen_value)
propvar <- eigen_value / sumvar
cumvar <- cumsum(propvar)
# Tampilkan tabel
pca_table <- data.frame(
PC = paste0("PC", 1:length(eigen_value)),
eigen_value = eigen_value,
propvar = propvar * 100,
cumulative = cumvar * 100
)
print(pca_table)
## PC eigen_value propvar cumulative
## 1 PC1 3.97241779 44.1379754 44.13798
## 2 PC2 2.01436367 22.3818185 66.51979
## 3 PC3 0.99301511 11.0335012 77.55330
## 4 PC4 0.82277961 9.1419956 86.69529
## 5 PC5 0.55528999 6.1698887 92.86518
## 6 PC6 0.30782414 3.4202683 96.28545
## 7 PC7 0.14308312 1.5898125 97.87526
## 8 PC8 0.11605509 1.2895010 99.16476
## 9 PC9 0.07517148 0.8352387 100.00000
n_pc <- which(cumvar >= 0.80)[1]
cat("Jumlah PC:", n_pc, "\n")
## Jumlah PC: 4
cat("Total varians:", round(cumvar[n_pc]*100, 2), "%\n")
## Total varians: 86.7 %
# Visualisasi
par(mfrow = c(1, 2), mar = c(5, 4, 4, 2))
# Scree Plot
plot(propvar * 100, type = "b", pch = 19, col = "steelblue", lwd = 2,
xlab = "Principal Component", ylab = "Varians (%)",
main = "Scree Plot", frame = FALSE)
abline(v = n_pc, col = "red", lty = 2)
legend("topright", legend = paste("PC ke-", n_pc),
col = "red", lty = 2, bty = "n")
# Kumulatif Varians
plot(cumvar * 100, type = "b", pch = 19, col = "darkorange", lwd = 2,
xlab = "Jumlah PC", ylab = "Kumulatif Varians (%)",
main = "Kumulatif Varians", frame = FALSE, ylim = c(0, 100))
abline(h = 80, col = "red", lty = 2)
legend("bottomright", legend = "80% threshold",
col = "red", lty = 2, bty = "n")
scores <- data_scale %*% eigen_vector
# Ambil sesuai n_pc
scores_PC <- scores
head(scores_PC)
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] -3.5548957 -0.8833618 -0.08595574 0.517010543 -0.1447401 0.4177351
## [2,] 0.3043330 -0.1523520 -0.06144534 -1.843146738 0.3344867 -0.8900295
## [3,] 0.8742913 0.5933339 -1.08535378 -0.907137050 0.9490050 -0.9337736
## [4,] 2.7352589 0.8658108 0.90771702 0.475913884 1.0044088 0.2816337
## [5,] 2.4215596 1.1754000 0.74487711 1.564798773 0.8615264 -0.3968399
## [6,] -2.7419137 1.5571772 -0.95281782 0.007667743 0.9010904 0.4243947
## [,7] [,8] [,9]
## [1,] -0.41012131 -0.009206248 -0.19661338
## [2,] -0.21756741 -0.337534476 -0.03979506
## [3,] -0.15051135 0.064842298 0.12866605
## [4,] 0.44619488 -0.104668701 -0.45596013
## [5,] 0.35883015 -0.107400094 -0.38509153
## [6,] 0.09407731 -0.125485492 -0.01599479
pca_res <- prcomp(data_scale, center = FALSE, scale. = FALSE)
fviz_pca_var(pca_res,
axes = c(1, 2),
col.var = "contrib",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE) +
labs(title = "PCA — Kontribusi Variabel ke PC1 & PC2") +
theme_minimal(base_size = 11)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
## Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Elbow Method (WSS)
wss <- sapply(1:10, function(k) {
kmeans(scores_PC, centers = k, nstart = 20, iter.max = 100)$tot.withinss
})
# Silhouette Method
k_values <- 2:10
avg_sil_values <- sapply(k_values, function(k) {
km <- kmeans(scores_PC, centers = k, nstart = 25, iter.max = 100)
mean(silhouette(km$cluster, dist(scores_PC))[, 3])
})
# Best K
best_k <- k_values[which.max(avg_sil_values)]
cat("K optimal (Silhouette):", best_k, "\n")
## K optimal (Silhouette): 2
par(mfrow = c(1, 2), mar = c(5, 4, 4, 2))
plot(1:10, wss, type = "b", pch = 19, col = "steelblue", lwd = 2,
xlab = "K", ylab = "Within-Cluster SS", main = "Elbow Method", frame = FALSE)
abline(v = 3, col = "red", lty = 2)
legend("topright", legend = "K = 3", col = "red", lty = 2, bty = "n")
plot(k_values, avg_sil_values, type = "b", pch = 19, col = "darkorange", lwd = 2,
xlab = "K", ylab = "Avg Silhouette Width", main = "Silhouette Analysis", frame = FALSE)
abline(v = best_k, col = "red", lty = 2)
legend("topright", legend = paste("K =", best_k), col = "red", lty = 2, bty = "n")
# 1. K-means
km_res <- kmeans(scores_PC, centers = 3)
km_res
## K-means clustering with 3 clusters of sizes 220, 194, 559
##
## Cluster means:
## [,1] [,2] [,3] [,4] [,5] [,6]
## 1 0.2062301 -2.0987082 -0.05572376 -0.04717566 -0.05202715 -0.11970994
## 2 -3.3264314 0.4091485 0.04811984 0.20967794 0.14273609 0.24260474
## 3 1.0732685 0.6839732 0.00523073 -0.05420192 -0.02906052 -0.03708253
## [,7] [,8] [,9]
## 1 0.04820349 0.07247970 0.02118630
## 2 -0.09185405 -0.03885619 0.03256173
## 3 0.01290683 -0.01504013 -0.01963857
##
## Clustering vector:
## [1] 2 3 3 3 3 2 3 3 1 1 1 3 1 1 1 3 3 3 3 3 1 3 3 1 3 3 3 3 2 3 1 3 3 3 2 1 3
## [38] 1 3 3 3 3 1 3 1 3 3 1 3 1 3 2 1 1 3 1 3 3 2 3 3 3 2 3 2 3 2 3 2 1 2 3 3 3
## [75] 1 3 3 3 1 2 3 2 3 3 3 3 1 3 1 3 2 2 1 3 1 3 1 1 3 2 1 3 3 1 3 2 3 2 3 3 3
## [112] 3 2 2 3 3 3 2 1 3 3 1 1 1 2 1 3 2 1 2 3 3 2 1 3 1 3 3 3 2 1 1 1 3 3 2 3 3
## [149] 1 3 3 1 2 3 1 3 3 3 3 3 1 1 1 2 2 3 2 3 3 3 1 3 2 3 1 3 2 3 3 3 1 2 2 1 2
## [186] 3 2 1 3 3 3 3 3 1 3 3 3 2 3 3 3 2 3 3 2 2 1 3 3 2 3 3 2 3 3 3 3 3 3 1 3 2
## [223] 3 2 1 2 1 3 3 2 3 3 3 2 1 3 1 2 3 3 1 1 3 3 1 3 3 3 3 3 1 1 3 3 3 3 2 1 3
## [260] 1 3 1 3 3 1 3 3 2 3 2 3 1 3 3 2 1 2 3 2 1 3 3 3 1 3 2 1 1 1 1 3 1 1 3 3 3
## [297] 3 3 1 1 1 2 1 2 3 3 3 2 3 2 2 1 1 3 3 2 2 3 3 3 3 2 3 3 3 3 3 3 3 3 2 3 3
## [334] 3 3 1 3 3 1 3 1 2 3 3 1 1 3 3 2 3 3 3 3 3 3 1 2 1 3 1 3 1 3 3 2 3 3 2 3 1
## [371] 1 3 3 2 3 1 1 3 3 1 2 1 1 3 1 3 3 3 1 3 3 2 3 3 3 2 2 3 2 3 3 3 3 3 2 1 1
## [408] 3 1 3 3 3 2 3 1 3 3 3 3 2 1 2 2 1 2 3 2 1 2 2 2 3 3 3 3 3 1 3 2 3 2 3 2 2
## [445] 3 1 1 3 1 3 3 3 3 3 3 2 1 2 1 3 3 3 3 1 3 1 3 1 3 3 1 3 2 3 3 2 1 3 3 3 1
## [482] 1 3 3 2 2 1 3 3 3 3 3 2 3 1 1 1 2 3 3 3 3 3 3 3 3 3 1 3 3 3 2 2 2 3 3 3 1
## [519] 2 3 3 3 3 3 1 3 1 3 1 2 2 3 3 3 1 3 2 3 2 1 3 3 2 3 3 3 3 1 3 3 3 1 3 3 1
## [556] 3 3 2 3 3 3 1 1 3 1 3 3 3 3 3 3 1 2 3 1 3 3 3 3 3 3 3 3 3 2 3 3 2 1 1 3 2
## [593] 3 1 1 2 3 3 3 3 3 2 3 1 3 2 3 3 2 2 2 2 3 3 1 2 3 3 1 3 3 2 2 3 3 3 3 2 3
## [630] 3 2 1 3 3 1 3 3 3 2 1 1 3 3 1 1 1 2 1 1 2 3 3 3 3 3 3 3 3 3 3 3 1 3 2 3 3
## [667] 3 3 2 2 3 3 2 2 3 3 1 3 1 3 3 1 3 3 3 3 3 1 3 3 3 3 2 3 2 3 1 1 3 3 3 3 3
## [704] 3 3 3 1 3 3 3 1 2 2 3 1 1 3 3 1 2 3 1 3 3 1 3 3 3 2 1 3 3 2 1 1 2 2 1 2 1
## [741] 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 2 3 3 1 3 3 1 3 3 3 2 3 3 3 1 2 1 3
## [778] 2 3 2 3 3 3 3 2 1 3 3 1 3 2 3 3 3 3 3 3 2 3 3 3 2 3 3 2 3 1 3 3 3 2 2 1 3
## [815] 3 1 3 3 3 2 3 3 3 3 3 3 1 3 3 3 2 2 2 1 3 2 3 3 1 1 2 3 3 3 1 2 1 3 2 1 3
## [852] 3 1 3 1 1 3 2 3 3 3 3 3 2 2 1 2 3 1 3 2 3 3 3 3 1 2 2 3 2 3 2 3 3 1 2 3 3
## [889] 3 1 3 1 2 3 3 3 2 2 1 3 1 3 2 3 3 1 2 1 3 3 2 1 1 1 1 3 1 3 3 3 2 3 2 3 1
## [926] 3 1 2 3 3 2 3 2 1 3 3 1 1 3 3 3 3 2 2 3 1 3 3 2 1 3 3 3 3 2 1 3 2 3 1 3 3
## [963] 2 3 2 3 2 3 2 3 2 1 1
##
## Within cluster sum of squares by cluster:
## [1] 1200.8401 646.8118 2800.9395
## (between_SS / total_SS = 46.9 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Tambahin cluster ke data asli
data_clustered <- as.data.frame(scores_PC) # Convert matrix to data frame
data_clustered$cluster <- km_res$cluster
# Distribusi cluster
cat("Distribusi cluster:\n")
## Distribusi cluster:
print(table(data_clustered$cluster))
##
## 1 2 3
## 220 194 559
cluster1 <- subset(data_clustered, cluster == 1)
cluster2 <- subset(data_clustered, cluster == 2)
cluster3 <- subset(data_clustered, cluster == 3)
View(head(cluster1, 10)) # Display first 10 rows for consistency
View(head(cluster2, 10))
View(head(cluster3, 10))
# 2. K-Medians
kmed_res <- kcca(scores_PC, k = 3, family = kccaFamily("kmedians"))
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
# Distribusi cluster
cat("Distribusi cluster:\n")
## Distribusi cluster:
print(table(clusters(kmed_res)))
##
## 1 2 3
## 465 195 313
# Tambahin ke data asli
data_clustered <- as.data.frame(scores_PC) # Convert matrix to data frame
data_clustered$cluster <- clusters(kmed_res)
# Pisahin per cluster
cluster1 <- subset(data_clustered, cluster == 1)
cluster2 <- subset(data_clustered, cluster == 2)
cluster3 <- subset(data_clustered, cluster == 3)
cat("\nCluster 1:\n")
##
## Cluster 1:
print(head(cluster1, 10))
## V1 V2 V3 V4 V5 V6
## 2 0.30433297 -0.15235202 -0.06144534 -1.8431467 0.3344867 -0.8900295
## 3 0.87429126 0.59333389 -1.08535378 -0.9071370 0.9490050 -0.9337736
## 7 -1.09953969 0.59237179 -0.40355561 -0.4058413 -1.3721097 0.6724907
## 8 0.67324078 -0.06226236 0.84815889 -1.4832362 -0.1884369 0.0331401
## 9 0.13336756 -2.16208453 0.20775515 -1.0409090 1.1462374 -0.5862504
## 10 1.10306715 -1.55716701 -1.08058122 -0.2987946 -0.2768027 -0.4632111
## 12 -0.07206057 0.86870092 -0.37886144 1.8397674 0.0934775 -0.4561576
## 13 -0.74293531 -3.12782710 1.06823435 -1.0903531 -0.1441856 0.4613117
## 14 0.50925990 -1.27403500 -0.21939956 1.4662842 -0.3097752 -1.0905298
## 15 -1.10660014 -2.72993738 1.19091307 -0.3264263 -0.7911292 -0.7004210
## V7 V8 V9 cluster
## 2 -0.2175674 -0.337534476 -0.03979506 1
## 3 -0.1505114 0.064842298 0.12866605 1
## 7 0.1557596 0.147587929 -0.66282561 1
## 8 -0.6262349 -0.125276356 -0.17479947 1
## 9 0.4650611 0.845288799 0.19750622 1
## 10 0.5484607 0.098413666 0.16422097 1
## 12 -0.5479242 -0.006869135 -0.22705949 1
## 13 -0.4695494 -0.279702954 -0.34837504 1
## 14 0.5362785 0.158909882 0.47563148 1
## 15 -0.1009853 0.463198794 -0.35452841 1
cat("\nCluster 2:\n")
##
## Cluster 2:
print(head(cluster2, 10))
## V1 V2 V3 V4 V5 V6
## 1 -3.554896 -0.8833618 -0.08595574 0.517010543 -0.14474009 0.41773511
## 6 -2.741914 1.5571772 -0.95281782 0.007667743 0.90109035 0.42439468
## 29 -3.414256 -0.1816066 0.43733348 0.657762898 0.96908399 -0.08376031
## 35 -3.755421 -0.4620553 0.49190353 0.723291560 0.95389667 0.42800434
## 52 -3.606637 -0.2392797 -1.38845327 0.517492975 0.83064715 0.09562245
## 56 -2.011343 -2.8808307 -0.04260298 -1.717790838 -0.46980599 0.05465696
## 59 -2.333756 1.2384319 0.20441681 0.061580913 0.07959266 0.44424174
## 63 -4.057361 -0.9416178 0.14112459 0.056716432 0.46952921 0.04547263
## 65 -3.640339 1.5365387 -1.59407375 -0.602063163 -0.28785926 0.04745706
## 67 -4.717246 -0.6868947 -0.40204045 -0.336071735 -0.50297643 -0.27893693
## V7 V8 V9 cluster
## 1 -0.41012131 -0.009206248 -0.19661338 2
## 6 0.09407731 -0.125485492 -0.01599479 2
## 29 0.18273295 0.123449267 0.27820907 2
## 35 0.19176065 -0.288313475 -0.36587600 2
## 52 0.03510367 -0.156739461 0.23625443 2
## 56 0.22822079 -0.658257308 -0.53873671 2
## 59 -0.43275356 0.236731147 0.05161049 2
## 63 -0.04163305 -0.544244323 -0.07429049 2
## 65 0.21498934 -0.139253393 0.11062017 2
## 67 0.03856095 -0.654421805 -0.26048216 2
cat("\nCluster 3:\n")
##
## Cluster 3:
print(head(cluster3, 10))
## V1 V2 V3 V4 V5 V6
## 4 2.7352589 0.8658108 0.9077170 0.47591388 1.0044088 0.28163368
## 5 2.4215596 1.1754000 0.7448771 1.56479877 0.8615264 -0.39683991
## 11 1.2347599 -4.1550594 0.3322366 0.21627066 0.1912985 0.95463267
## 16 2.0848219 1.7915295 0.2926179 0.83424236 -0.7615915 0.77050990
## 18 1.9295951 2.2877142 0.4258162 -0.48889019 -0.6214933 0.08119026
## 20 0.8981528 -0.3838146 0.7549208 -0.24343218 -1.5880251 0.74160733
## 22 1.1481069 -0.5599490 1.4635949 1.32296430 -0.9304952 -0.01415540
## 23 1.8663454 -0.2563679 0.6652213 -0.01041138 -1.1373642 -0.43723259
## 24 1.4498074 -3.8214515 1.1497064 0.55509396 -0.1825644 -0.40311925
## 27 2.5897498 -0.1101889 1.0253227 1.32856447 0.2892521 0.30054425
## V7 V8 V9 cluster
## 4 0.446194877 -0.104668701 -0.45596013 3
## 5 0.358830154 -0.107400094 -0.38509153 3
## 11 -0.325319019 -0.386198682 0.01671153 3
## 16 0.006225267 0.014232110 0.18182402 3
## 18 0.693720253 -0.008768853 -0.01106649 3
## 20 0.079105115 -0.222125215 0.05443074 3
## 22 -0.042395950 -0.068111396 -0.06601022 3
## 23 -0.232020122 -0.017410138 0.02962104 3
## 24 -0.162337303 0.175866809 -0.28928185 3
## 27 -0.169641308 0.172133142 -0.03537376 3
# eps=2.0 dipilih agar DBSCAN menghasilkan cluster yang bermakna
db_res <- dbscan(scores_PC, eps = 2.0, MinPts = 5)
cat("Distribusi cluster (−1 = noise):\n")
## Distribusi cluster (−1 = noise):
print(table(db_res$cluster))
##
## 0 1
## 1 972
cat("\nJumlah noise:", sum(db_res$cluster == 0), "\n")
##
## Jumlah noise: 1
ms_res <- meanShift(scores_PC)
cat("Jumlah cluster yang terbentuk:", length(unique(ms_res$assignment)), "\n")
## Jumlah cluster yang terbentuk: 213
cat("Distribusi cluster:\n")
## Distribusi cluster:
print(table(ms_res$assignment))
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 101 50 68 8 4 14 66 2 1 1 1 1 4 11 31 9 8 2 11 1
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 4 19 1 9 10 1 3 3 1 3 18 32 1 1 31 3 17 3 5 1
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 5 6 2 1 8 4 3 6 7 6 5 1 1 1 15 3 3 1 2 1
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 1 2 1 2 4 1 19 1 9 1 6 6 9 1 1 2 1 11 1 1
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
## 2 8 6 4 2 1 1 1 5 1 1 1 5 1 5 1 1 6 1 4
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
## 1 1 1 1 10 5 6 9 6 1 2 2 1 2 15 1 1 1 1 2
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
## 3 1 8 1 2 1 4 1 1 1 1 1 1 2 1 1 1 1 1 1
## 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
## 3 1 1 1 1 2 2 1 1 2 1 1 1 1 1 1 1 1 1 1
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## 1 1 1 2 2 2 2 1 1 3 1 1 4 1 2 2 2 1 2 1
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
## 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 201 202 203 204 205 206 207 208 209 210 211 212 213
## 1 1 1 1 1 1 1 1 1 1 1 1 1
fcm_res <- cmeans(scores_PC, centers = 3, m = 2, iter.max = 100)
cat("Distribusi cluster (hard assignment):\n")
## Distribusi cluster (hard assignment):
print(table(fcm_res$cluster))
##
## 1 2 3
## 213 323 437
cat("\nContoh membership degree (5 baris pertama):\n")
##
## Contoh membership degree (5 baris pertama):
print(round(head(fcm_res$membership, 5), 3))
## 1 2 3
## [1,] 0.837 0.084 0.079
## [2,] 0.133 0.442 0.425
## [3,] 0.101 0.433 0.466
## [4,] 0.093 0.428 0.479
## [5,] 0.112 0.422 0.467
# --- Visualization Comparison ---
par(mfrow = c(2, 3), mar = c(4, 4, 2, 1))
# K-Means
plot(scores_PC,
col = km_res$cluster,
pch = 16,
main = "K-Means")
# K-Medians
plot(scores_PC,
col = clusters(kmed_res),
pch = 16,
main = "K-Medians")
# DBSCAN
plot(scores_PC,
col = db_res$cluster + 1L,
pch = 16,
main = "DBSCAN (0 = Noise)")
# Mean Shift
plot(scores_PC,
col = ms_res$assignment,
pch = 16,
main = "Mean Shift")
# Fuzzy C-Means
plot(scores_PC,
col = fcm_res$cluster,
pch = 16,
main = "Fuzzy C-Means")
dist_matrix <- dist(scores_PC)
eval_clustering <- function(labels, data_dist, name) {
valid <- labels > 0
n_cl <- length(unique(labels[valid]))
if (n_cl < 2) return(data.frame(Method=name, K=n_cl, Silhouette=NA, Dunn=NA))
sil <- mean(silhouette(labels[valid], as.dist(as.matrix(data_dist)[valid, valid]))[, 3])
stats <- cluster.stats(data_dist, labels)
data.frame(Method=name, K=n_cl, Silhouette=round(sil,4), Dunn=round(stats$dunn,4))
}
eval_results <- rbind(
eval_clustering(km_res$cluster, dist_matrix, "K-means"),
eval_clustering(clusters(kmed_res), dist_matrix, "K-medians"),
eval_clustering(db_res$cluster, dist_matrix, "DBSCAN"),
eval_clustering(ms_res$assignment, dist_matrix, "Mean Shift"),
eval_clustering(fcm_res$cluster, dist_matrix, "Fuzzy C-means")
)
print(eval_results, row.names = FALSE)
## Method K Silhouette Dunn
## K-means 3 0.2961 0.1152
## K-medians 3 0.1952 0.0692
## DBSCAN 1 NA NA
## Mean Shift 213 -0.0856 0.0738
## Fuzzy C-means 3 0.2425 0.0688
cat("\nMetode terbaik:", eval_results$Method[which.max(eval_results$Silhouette)], "\n")
##
## Metode terbaik: K-means
fviz_silhouette(silhouette(km_res$cluster, dist_matrix)) +
labs(title = "Silhouette Plot - K-means") +
theme_minimal(base_size = 12) +
theme(plot.title = element_text(face = "bold", hjust = 0.5))
## cluster size ave.sil.width
## 1 1 220 0.22
## 2 2 194 0.49
## 3 3 559 0.26