Pendahuluan
Dataset ObesityDataSet berisi data kebiasaan makan,
aktivitas fisik, dan kondisi fisik 2.111 individu
dengan 17 variabel (16 fitur + 1 label target). Tujuan
analisis:
- Scaling (standardisasi) variabel numerik
- Clustering menggunakan K-Means dan Hierarchical
Clustering
- Validasi hasil clustering (Silhouette, Dunn,
Connectivity)
- Membandingkan metode terbaik
Kolom NObeyesdad dihapus sebelum clustering karena
analisis bersifat unsupervised.
Load Library dan Data
library(tidyverse)
library(cluster)
library(factoextra)
library(NbClust)
library(clValid)
library(ggdendro)
library(gridExtra)
library(knitr)
library(scales)
library(ggpubr)
df_raw <- read.csv("ObesityDataSet_raw_and_data_sinthetic.csv", stringsAsFactors = TRUE)
cat("Dimensi:", nrow(df_raw), "x", ncol(df_raw), "\n")
## Dimensi: 2111 x 17
head(df_raw)
## Gender Age Height Weight family_history_with_overweight FAVC FCVC NCP
## 1 Female 21 1.62 64.0 yes no 2 3
## 2 Female 21 1.52 56.0 yes no 3 3
## 3 Male 23 1.80 77.0 yes no 2 3
## 4 Male 27 1.80 87.0 no no 3 3
## 5 Male 22 1.78 89.8 no no 2 1
## 6 Male 29 1.62 53.0 no yes 2 3
## CAEC SMOKE CH2O SCC FAF TUE CALC MTRANS
## 1 Sometimes no 2 no 0 1 no Public_Transportation
## 2 Sometimes yes 3 yes 3 0 Sometimes Public_Transportation
## 3 Sometimes no 2 no 2 1 Frequently Public_Transportation
## 4 Sometimes no 2 no 2 0 Frequently Walking
## 5 Sometimes no 2 no 0 0 Sometimes Public_Transportation
## 6 Sometimes no 2 no 0 0 Sometimes Automobile
## NObeyesdad
## 1 Normal_Weight
## 2 Normal_Weight
## 3 Normal_Weight
## 4 Overweight_Level_I
## 5 Overweight_Level_II
## 6 Normal_Weight
str(df_raw)
## 'data.frame': 2111 obs. of 17 variables:
## $ Gender : Factor w/ 2 levels "Female","Male": 1 1 2 2 2 2 1 2 2 2 ...
## $ Age : num 21 21 23 27 22 29 23 22 24 22 ...
## $ Height : num 1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
## $ Weight : num 64 56 77 87 89.8 53 55 53 64 68 ...
## $ family_history_with_overweight: Factor w/ 2 levels "no","yes": 2 2 2 1 1 1 2 1 2 2 ...
## $ FAVC : Factor w/ 2 levels "no","yes": 1 1 1 1 1 2 2 1 2 2 ...
## $ FCVC : num 2 3 2 3 2 2 3 2 3 2 ...
## $ NCP : num 3 3 3 3 1 3 3 3 3 3 ...
## $ CAEC : Factor w/ 4 levels "Always","Frequently",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ SMOKE : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
## $ CH2O : num 2 3 2 2 2 2 2 2 2 2 ...
## $ SCC : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
## $ FAF : num 0 3 2 2 0 0 1 3 1 1 ...
## $ TUE : num 1 0 1 0 0 0 0 0 1 1 ...
## $ CALC : Factor w/ 4 levels "Always","Frequently",..: 3 4 2 2 4 4 4 4 2 3 ...
## $ MTRANS : Factor w/ 5 levels "Automobile","Bike",..: 4 4 4 5 4 1 3 4 4 4 ...
## $ NObeyesdad : Factor w/ 7 levels "Insufficient_Weight",..: 2 2 2 6 7 2 2 2 2 2 ...
na_count <- colSums(is.na(df_raw))
cat("Missing value per kolom:\n"); print(na_count)
## Missing value per kolom:
## Gender Age
## 0 0
## Height Weight
## 0 0
## family_history_with_overweight FAVC
## 0 0
## FCVC NCP
## 0 0
## CAEC SMOKE
## 0 0
## CH2O SCC
## 0 0
## FAF TUE
## 0 0
## CALC MTRANS
## 0 0
## NObeyesdad
## 0
cat("Total:", sum(na_count), "\n")
## Total: 0
Pra-Pemrosesan Data
Encoding Variabel Kategorikal
df_encoded <- df_raw %>%
mutate(
Gender = ifelse(Gender == "Male", 1, 0),
family_history_with_overweight = ifelse(family_history_with_overweight == "yes", 1, 0),
FAVC = ifelse(FAVC == "yes", 1, 0),
SMOKE = ifelse(SMOKE == "yes", 1, 0),
SCC = ifelse(SCC == "yes", 1, 0),
CAEC = case_when(CAEC == "no" ~ 0, CAEC == "Sometimes" ~ 1,
CAEC == "Frequently" ~ 2, CAEC == "Always" ~ 3),
CALC = case_when(CALC == "no" ~ 0, CALC == "Sometimes" ~ 1,
CALC == "Frequently" ~ 2, CALC == "Always" ~ 3),
MTRANS = case_when(
MTRANS == "Walking" ~ 1, MTRANS == "Bike" ~ 2,
MTRANS == "Motorbike" ~ 3, MTRANS == "Public_Transportation" ~ 4,
MTRANS == "Automobile" ~ 5)
) %>%
select(-NObeyesdad)
cat("Dimensi setelah encoding:", nrow(df_encoded), "x", ncol(df_encoded), "\n")
## Dimensi setelah encoding: 2111 x 16
head(df_encoded)
## Gender Age Height Weight family_history_with_overweight FAVC FCVC NCP CAEC
## 1 0 21 1.62 64.0 1 0 2 3 1
## 2 0 21 1.52 56.0 1 0 3 3 1
## 3 1 23 1.80 77.0 1 0 2 3 1
## 4 1 27 1.80 87.0 0 0 3 3 1
## 5 1 22 1.78 89.8 0 0 2 1 1
## 6 1 29 1.62 53.0 0 1 2 3 1
## SMOKE CH2O SCC FAF TUE CALC MTRANS
## 1 0 2 0 0 1 0 4
## 2 1 3 1 3 0 1 4
## 3 0 2 0 2 1 2 4
## 4 0 2 0 2 0 2 1
## 5 0 2 0 0 0 1 4
## 6 0 2 0 0 0 1 5
Scaling (Z-Score)
df_scaled <- as.data.frame(scale(df_encoded))
cat("Mean tiap kolom (harus ~0):\n"); round(colMeans(df_scaled), 4)
## Mean tiap kolom (harus ~0):
## Gender Age
## 0 0
## Height Weight
## 0 0
## family_history_with_overweight FAVC
## 0 0
## FCVC NCP
## 0 0
## CAEC SMOKE
## 0 0
## CH2O SCC
## 0 0
## FAF TUE
## 0 0
## CALC MTRANS
## 0 0
cat("SD tiap kolom (harus ~1):\n"); round(apply(df_scaled, 2, sd), 4)
## SD tiap kolom (harus ~1):
## Gender Age
## 1 1
## Height Weight
## 1 1
## family_history_with_overweight FAVC
## 1 1
## FCVC NCP
## 1 1
## CAEC SMOKE
## 1 1
## CH2O SCC
## 1 1
## FAF TUE
## 1 1
## CALC MTRANS
## 1 1
vars_show <- c("Age", "Height", "Weight", "FCVC")
df_viz <- bind_rows(
df_encoded[, vars_show] %>% pivot_longer(everything(), names_to = "Variabel", values_to = "Nilai") %>% mutate(Status = "Sebelum Scaling"),
df_scaled[, vars_show] %>% pivot_longer(everything(), names_to = "Variabel", values_to = "Nilai") %>% mutate(Status = "Setelah Scaling")
)
ggplot(df_viz, aes(x = Nilai, fill = Status)) +
geom_histogram(bins = 30, alpha = 0.7, color = "white") +
facet_grid(Status ~ Variabel, scales = "free") +
scale_fill_manual(values = c("Sebelum Scaling" = "#E74C3C", "Setelah Scaling" = "#2ECC71")) +
labs(title = "Distribusi Variabel: Sebelum vs Sesudah Scaling",
x = "Nilai", y = "Frekuensi") +
theme_minimal(base_size = 11) +
theme(legend.position = "none", strip.text = element_text(face = "bold"))

Menentukan Jumlah Cluster Optimal
Metode Elbow
set.seed(123)
fviz_nbclust(df_scaled, kmeans, method = "wss", k.max = 10) +
geom_vline(xintercept = 3, linetype = "dashed", color = "red", linewidth = 0.8) +
labs(title = "Metode Elbow", x = "Jumlah Cluster (k)", y = "Total Within Sum of Squares") +
theme_minimal()

Metode Silhouette
set.seed(123)
fviz_nbclust(df_scaled, kmeans, method = "silhouette", k.max = 10) +
labs(title = "Metode Silhouette", x = "Jumlah Cluster (k)", y = "Silhouette Score Rata-rata") +
theme_minimal()
K-Means Clustering
k_optimal <- 3
set.seed(123)
kmeans_result <- kmeans(df_scaled, centers = k_optimal, nstart = 25, iter.max = 100)
cat("Ukuran tiap cluster:\n"); print(kmeans_result$size)
## Ukuran tiap cluster:
## [1] 517 932 662
cat("Between SS / Total SS:", round(kmeans_result$betweenss / kmeans_result$totss * 100, 2), "%\n")
## Between SS / Total SS: 18.15 %
fviz_cluster(
kmeans_result,
data = df_scaled,
geom = "point",
ellipse.type = "convex",
ggtheme = theme_minimal()
)

df_encoded$Cluster_KMeans <- as.factor(kmeans_result$cluster)
profil_kmeans <- df_encoded %>%
group_by(Cluster_KMeans) %>%
summarise(n = n(),
Usia_rata2 = round(mean(Age), 2), TB_rata2 = round(mean(Height), 3),
BB_rata2 = round(mean(Weight), 2), FCVC_rata2 = round(mean(FCVC), 2),
FAF_rata2 = round(mean(FAF), 2), TUE_rata2 = round(mean(TUE), 2),
CH2O_rata2 = round(mean(CH2O), 2))
kable(profil_kmeans,
caption = "Profil Rata-rata per Cluster K-Means",
col.names = c("Cluster","n","Usia","Tinggi","Berat","Sayur","Aktivitas","Teknologi","Air"))
Profil Rata-rata per Cluster K-Means
| 1 |
517 |
20.81 |
1.638 |
57.61 |
2.41 |
1.14 |
0.78 |
1.78 |
| 2 |
932 |
25.12 |
1.767 |
93.90 |
2.25 |
1.17 |
0.65 |
2.11 |
| 3 |
662 |
25.90 |
1.660 |
98.93 |
2.67 |
0.68 |
0.58 |
2.03 |
df_scaled_cluster <- df_scaled
df_scaled_cluster$Cluster <- as.factor(kmeans_result$cluster)
profil_z <- df_scaled_cluster %>%
group_by(Cluster) %>%
summarise(across(where(is.numeric), mean)) %>%
pivot_longer(-Cluster, names_to = "Variabel", values_to = "Z_Score")
ggplot(profil_z, aes(x = Variabel, y = Cluster, fill = Z_Score)) +
geom_tile(color = "white", linewidth = 0.5) +
geom_text(aes(label = round(Z_Score, 2)), size = 3) +
scale_fill_gradient2(low = "#2980B9", mid = "white", high = "#E74C3C", midpoint = 0) +
labs(title = "Heatmap Profil Cluster K-Means (Z-Score)",
subtitle = "Merah = di atas rata-rata | Biru = di bawah rata-rata") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 9))

Hierarchical Clustering
set.seed(123)
n_sample <- min(500, nrow(df_scaled))
idx_sample <- sample(1:nrow(df_scaled), n_sample, replace = FALSE)
df_hclust <- df_scaled[idx_sample, ]
dist_matrix <- dist(df_hclust, method = "euclidean")
hclust_ward <- hclust(dist_matrix, method = "ward.D2")
cat("Jumlah observasi:", n_sample, "| Linkage: ward.D2 | Jarak: Euclidean\n")
## Jumlah observasi: 500 | Linkage: ward.D2 | Jarak: Euclidean
plot(hclust_ward,
main = paste("Dendrogram Hierarchical Clustering (Ward's, n =", n_sample, ")"),
xlab = "Observasi", ylab = "Height", labels = FALSE, hang = -1)
abline(h = 30, col = "red", lty = 2, lwd = 2)
legend("topright", legend = "Garis potong (k=3)", col = "red", lty = 2, lwd = 2, bty = "n")

hclust_cluster <- cutree(hclust_ward, k = k_optimal)
cat("Distribusi cluster HC:\n"); print(table(hclust_cluster))
## Distribusi cluster HC:
## hclust_cluster
## 1 2 3
## 161 85 254
fviz_cluster(
list(data = df_hclust, cluster = hclust_cluster),
geom = "point",
ellipse.type = "convex"
) +
theme_minimal() +
labs(
title = paste("Hierarchical Clustering - Ward's (k =", k_optimal, ")"),
subtitle = "Reduksi dimensi dengan PCA"
)

df_hclust_profil <- df_encoded[idx_sample, ]
df_hclust_profil$Cluster_HC <- as.factor(hclust_cluster)
profil_hc <- df_hclust_profil %>%
group_by(Cluster_HC) %>%
summarise(n = n(),
Usia_rata2 = round(mean(Age), 2), TB_rata2 = round(mean(Height), 3),
BB_rata2 = round(mean(Weight), 2), FCVC_rata2 = round(mean(FCVC), 2),
FAF_rata2 = round(mean(FAF), 2), TUE_rata2 = round(mean(TUE), 2),
CH2O_rata2 = round(mean(CH2O), 2))
kable(profil_hc,
caption = "Profil Rata-rata per Cluster Hierarchical",
col.names = c("Cluster","n","Usia","Tinggi","Berat","Sayur","Aktivitas","Teknologi","Air"))
Profil Rata-rata per Cluster Hierarchical
| 1 |
161 |
21.20 |
1.675 |
63.46 |
2.35 |
1.24 |
0.71 |
1.90 |
| 2 |
85 |
23.15 |
1.699 |
123.15 |
3.00 |
0.78 |
0.62 |
2.23 |
| 3 |
254 |
26.64 |
1.719 |
89.96 |
2.28 |
0.94 |
0.64 |
2.02 |
Validasi Cluster
Silhouette Score
sil_kmeans <- silhouette(kmeans_result$cluster, dist(df_scaled))
avg_sil_km <- mean(sil_kmeans[, 3])
sil_hc <- silhouette(hclust_cluster, dist_matrix)
avg_sil_hc <- mean(sil_hc[, 3])
cat("Silhouette K-Means :", round(avg_sil_km, 4), "\n")
## Silhouette K-Means : 0.1002
cat("Silhouette Hierarchical :", round(avg_sil_hc, 4), "\n")
## Silhouette Hierarchical : 0.0605
cat("Pemenang:", ifelse(avg_sil_km > avg_sil_hc, "K-Means", "Hierarchical"), "\n")
## Pemenang: K-Means
p1 <- fviz_silhouette(sil_kmeans, print.summary = FALSE) +
labs(title = paste("Silhouette K-Means =", round(avg_sil_km, 4))) +
theme_minimal() + theme(legend.position = "bottom")
p2 <- fviz_silhouette(sil_hc, print.summary = FALSE) +
labs(title = paste("Silhouette Hierarchical =", round(avg_sil_hc, 4))) +
theme_minimal() + theme(legend.position = "bottom")
grid.arrange(p1, p2, ncol = 2)

Dunn Index & Connectivity
val_km <- clValid(as.matrix(df_scaled[idx_sample, ]), k_optimal,
clMethods = "kmeans", validation = "internal")
val_hc <- clValid(as.matrix(df_hclust), k_optimal,
clMethods = "hierarchical", validation = "internal")
cat("--- K-Means ---\n"); summary(val_km)
## --- K-Means ---
##
## Clustering Methods:
## kmeans
##
## Cluster sizes:
## 3
##
## Validation Measures:
## 3
##
## kmeans Connectivity 87.7278
## Dunn 0.1744
## Silhouette 0.2102
##
## Optimal Scores:
##
## Score Method Clusters
## Connectivity 87.7278 kmeans 3
## Dunn 0.1744 kmeans 3
## Silhouette 0.2102 kmeans 3
cat("--- Hierarchical ---\n"); summary(val_hc)
## --- Hierarchical ---
##
## Clustering Methods:
## hierarchical
##
## Cluster sizes:
## 3
##
## Validation Measures:
## 3
##
## hierarchical Connectivity 7.7849
## Dunn 0.5450
## Silhouette 0.4401
##
## Optimal Scores:
##
## Score Method Clusters
## Connectivity 7.7849 hierarchical 3
## Dunn 0.5450 hierarchical 3
## Silhouette 0.4401 hierarchical 3
Ringkasan Validasi
tabel_validasi <- data.frame(
Metrik = c("Silhouette ↑", "Dunn Index ↑", "Connectivity ↓"),
Keterangan = c("Kohesivitas & separasi (lebih tinggi lebih baik)",
"Jarak antar-cluster vs ukuran (lebih tinggi lebih baik)",
"Ketetanggaan antar-cluster (lebih rendah lebih baik)"),
KMeans = c(round(avg_sil_km, 4), round(measures(val_km)[2], 4), round(measures(val_km)[1], 4)),
Hierarchical = c(round(avg_sil_hc, 4), round(measures(val_hc)[2], 4), round(measures(val_hc)[1], 4))
)
kable(tabel_validasi,
caption = "Perbandingan Indeks Validasi: K-Means vs Hierarchical",
col.names = c("Metrik", "Keterangan", "K-Means", "Hierarchical"))
Perbandingan Indeks Validasi: K-Means vs Hierarchical
| Silhouette ↑ |
Kohesivitas & separasi (lebih tinggi lebih
baik) |
0.1002 |
0.0605 |
| Dunn Index ↑ |
Jarak antar-cluster vs ukuran (lebih tinggi lebih
baik) |
0.1744 |
0.5450 |
| Connectivity ↓ |
Ketetanggaan antar-cluster (lebih rendah lebih
baik) |
87.7278 |
7.7849 |
df_compare <- data.frame(
Metode = c("K-Means", "Hierarchical"),
Silhouette = c(avg_sil_km, avg_sil_hc)
)
ggplot(df_compare, aes(x = Metode, y = Silhouette, fill = Metode)) +
geom_col(width = 0.5, show.legend = FALSE) +
geom_text(aes(label = round(Silhouette, 4)), vjust = -0.5, size = 5, fontface = "bold") +
scale_fill_manual(values = c("K-Means" = "#3498DB", "Hierarchical" = "#E67E22")) +
scale_y_continuous(limits = c(0, max(df_compare$Silhouette) * 1.2)) +
labs(title = "Perbandingan Silhouette Score",
subtitle = "Nilai lebih tinggi = cluster lebih baik",
x = "Metode", y = "Silhouette Score") +
theme_minimal(base_size = 13)

Kesimpulan
metode_terbaik <- ifelse(avg_sil_km >= avg_sil_hc, "K-Means", "Hierarchical Clustering")
cat("==============================================\n")
## ==============================================
cat(" KESIMPULAN ANALISIS CLUSTERING\n")
## KESIMPULAN ANALISIS CLUSTERING
cat("==============================================\n")
## ==============================================
cat("Dataset :", nrow(df_raw), "obs x", ncol(df_raw), "variabel\n")
## Dataset : 2111 obs x 17 variabel
cat("k optimal:", k_optimal, "\n\n")
## k optimal: 3
cat("Silhouette K-Means :", round(avg_sil_km, 4), "\n")
## Silhouette K-Means : 0.1002
cat("Silhouette Hierarchical :", round(avg_sil_hc, 4), "\n")
## Silhouette Hierarchical : 0.0605
cat("Metode terbaik :", metode_terbaik, "\n\n")
## Metode terbaik : K-Means
cat("Profil Cluster (K-Means):\n")
## Profil Cluster (K-Means):
for (i in 1:k_optimal) {
b <- profil_kmeans[profil_kmeans$Cluster_KMeans == i, ]
cat(sprintf(" Cluster %d (n=%d): BB=%.1f kg, Aktivitas=%.2f, Sayur=%.2f\n",
i, b$n, b$BB_rata2, b$FAF_rata2, b$FCVC_rata2))
}
## Cluster 1 (n=517): BB=57.6 kg, Aktivitas=1.14, Sayur=2.41
## Cluster 2 (n=932): BB=93.9 kg, Aktivitas=1.17, Sayur=2.25
## Cluster 3 (n=662): BB=98.9 kg, Aktivitas=0.68, Sayur=2.67