Pendahuluan

Dataset ObesityDataSet berisi data kebiasaan makan, aktivitas fisik, dan kondisi fisik 2.111 individu dengan 17 variabel (16 fitur + 1 label target). Tujuan analisis:

  1. Scaling (standardisasi) variabel numerik
  2. Clustering menggunakan K-Means dan Hierarchical Clustering
  3. Validasi hasil clustering (Silhouette, Dunn, Connectivity)
  4. Membandingkan metode terbaik

Kolom NObeyesdad dihapus sebelum clustering karena analisis bersifat unsupervised.


Load Library dan Data

library(tidyverse)
library(cluster)
library(factoextra)
library(NbClust)
library(clValid)
library(ggdendro)
library(gridExtra)
library(knitr)
library(scales)
library(ggpubr)
df_raw <- read.csv("ObesityDataSet_raw_and_data_sinthetic.csv", stringsAsFactors = TRUE)
cat("Dimensi:", nrow(df_raw), "x", ncol(df_raw), "\n")
## Dimensi: 2111 x 17
head(df_raw)
##   Gender Age Height Weight family_history_with_overweight FAVC FCVC NCP
## 1 Female  21   1.62   64.0                            yes   no    2   3
## 2 Female  21   1.52   56.0                            yes   no    3   3
## 3   Male  23   1.80   77.0                            yes   no    2   3
## 4   Male  27   1.80   87.0                             no   no    3   3
## 5   Male  22   1.78   89.8                             no   no    2   1
## 6   Male  29   1.62   53.0                             no  yes    2   3
##        CAEC SMOKE CH2O SCC FAF TUE       CALC                MTRANS
## 1 Sometimes    no    2  no   0   1         no Public_Transportation
## 2 Sometimes   yes    3 yes   3   0  Sometimes Public_Transportation
## 3 Sometimes    no    2  no   2   1 Frequently Public_Transportation
## 4 Sometimes    no    2  no   2   0 Frequently               Walking
## 5 Sometimes    no    2  no   0   0  Sometimes Public_Transportation
## 6 Sometimes    no    2  no   0   0  Sometimes            Automobile
##            NObeyesdad
## 1       Normal_Weight
## 2       Normal_Weight
## 3       Normal_Weight
## 4  Overweight_Level_I
## 5 Overweight_Level_II
## 6       Normal_Weight
str(df_raw)
## 'data.frame':    2111 obs. of  17 variables:
##  $ Gender                        : Factor w/ 2 levels "Female","Male": 1 1 2 2 2 2 1 2 2 2 ...
##  $ Age                           : num  21 21 23 27 22 29 23 22 24 22 ...
##  $ Height                        : num  1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
##  $ Weight                        : num  64 56 77 87 89.8 53 55 53 64 68 ...
##  $ family_history_with_overweight: Factor w/ 2 levels "no","yes": 2 2 2 1 1 1 2 1 2 2 ...
##  $ FAVC                          : Factor w/ 2 levels "no","yes": 1 1 1 1 1 2 2 1 2 2 ...
##  $ FCVC                          : num  2 3 2 3 2 2 3 2 3 2 ...
##  $ NCP                           : num  3 3 3 3 1 3 3 3 3 3 ...
##  $ CAEC                          : Factor w/ 4 levels "Always","Frequently",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ SMOKE                         : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
##  $ CH2O                          : num  2 3 2 2 2 2 2 2 2 2 ...
##  $ SCC                           : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
##  $ FAF                           : num  0 3 2 2 0 0 1 3 1 1 ...
##  $ TUE                           : num  1 0 1 0 0 0 0 0 1 1 ...
##  $ CALC                          : Factor w/ 4 levels "Always","Frequently",..: 3 4 2 2 4 4 4 4 2 3 ...
##  $ MTRANS                        : Factor w/ 5 levels "Automobile","Bike",..: 4 4 4 5 4 1 3 4 4 4 ...
##  $ NObeyesdad                    : Factor w/ 7 levels "Insufficient_Weight",..: 2 2 2 6 7 2 2 2 2 2 ...
na_count <- colSums(is.na(df_raw))
cat("Missing value per kolom:\n"); print(na_count)
## Missing value per kolom:
##                         Gender                            Age 
##                              0                              0 
##                         Height                         Weight 
##                              0                              0 
## family_history_with_overweight                           FAVC 
##                              0                              0 
##                           FCVC                            NCP 
##                              0                              0 
##                           CAEC                          SMOKE 
##                              0                              0 
##                           CH2O                            SCC 
##                              0                              0 
##                            FAF                            TUE 
##                              0                              0 
##                           CALC                         MTRANS 
##                              0                              0 
##                     NObeyesdad 
##                              0
cat("Total:", sum(na_count), "\n")
## Total: 0

Pra-Pemrosesan Data

Encoding Variabel Kategorikal

df_encoded <- df_raw %>%
  mutate(
    Gender = ifelse(Gender == "Male", 1, 0),
    family_history_with_overweight = ifelse(family_history_with_overweight == "yes", 1, 0),
    FAVC  = ifelse(FAVC  == "yes", 1, 0),
    SMOKE = ifelse(SMOKE == "yes", 1, 0),
    SCC   = ifelse(SCC   == "yes", 1, 0),
    CAEC = case_when(CAEC == "no" ~ 0, CAEC == "Sometimes" ~ 1,
                     CAEC == "Frequently" ~ 2, CAEC == "Always" ~ 3),
    CALC = case_when(CALC == "no" ~ 0, CALC == "Sometimes" ~ 1,
                     CALC == "Frequently" ~ 2, CALC == "Always" ~ 3),
    MTRANS = case_when(
      MTRANS == "Walking" ~ 1, MTRANS == "Bike" ~ 2,
      MTRANS == "Motorbike" ~ 3, MTRANS == "Public_Transportation" ~ 4,
      MTRANS == "Automobile" ~ 5)
  ) %>%
  select(-NObeyesdad)

cat("Dimensi setelah encoding:", nrow(df_encoded), "x", ncol(df_encoded), "\n")
## Dimensi setelah encoding: 2111 x 16
head(df_encoded)
##   Gender Age Height Weight family_history_with_overweight FAVC FCVC NCP CAEC
## 1      0  21   1.62   64.0                              1    0    2   3    1
## 2      0  21   1.52   56.0                              1    0    3   3    1
## 3      1  23   1.80   77.0                              1    0    2   3    1
## 4      1  27   1.80   87.0                              0    0    3   3    1
## 5      1  22   1.78   89.8                              0    0    2   1    1
## 6      1  29   1.62   53.0                              0    1    2   3    1
##   SMOKE CH2O SCC FAF TUE CALC MTRANS
## 1     0    2   0   0   1    0      4
## 2     1    3   1   3   0    1      4
## 3     0    2   0   2   1    2      4
## 4     0    2   0   2   0    2      1
## 5     0    2   0   0   0    1      4
## 6     0    2   0   0   0    1      5

Scaling (Z-Score)

df_scaled <- as.data.frame(scale(df_encoded))
cat("Mean tiap kolom (harus ~0):\n"); round(colMeans(df_scaled), 4)
## Mean tiap kolom (harus ~0):
##                         Gender                            Age 
##                              0                              0 
##                         Height                         Weight 
##                              0                              0 
## family_history_with_overweight                           FAVC 
##                              0                              0 
##                           FCVC                            NCP 
##                              0                              0 
##                           CAEC                          SMOKE 
##                              0                              0 
##                           CH2O                            SCC 
##                              0                              0 
##                            FAF                            TUE 
##                              0                              0 
##                           CALC                         MTRANS 
##                              0                              0
cat("SD tiap kolom (harus ~1):\n"); round(apply(df_scaled, 2, sd), 4)
## SD tiap kolom (harus ~1):
##                         Gender                            Age 
##                              1                              1 
##                         Height                         Weight 
##                              1                              1 
## family_history_with_overweight                           FAVC 
##                              1                              1 
##                           FCVC                            NCP 
##                              1                              1 
##                           CAEC                          SMOKE 
##                              1                              1 
##                           CH2O                            SCC 
##                              1                              1 
##                            FAF                            TUE 
##                              1                              1 
##                           CALC                         MTRANS 
##                              1                              1
vars_show <- c("Age", "Height", "Weight", "FCVC")

df_viz <- bind_rows(
  df_encoded[, vars_show] %>% pivot_longer(everything(), names_to = "Variabel", values_to = "Nilai") %>% mutate(Status = "Sebelum Scaling"),
  df_scaled[, vars_show]  %>% pivot_longer(everything(), names_to = "Variabel", values_to = "Nilai") %>% mutate(Status = "Setelah Scaling")
)

ggplot(df_viz, aes(x = Nilai, fill = Status)) +
  geom_histogram(bins = 30, alpha = 0.7, color = "white") +
  facet_grid(Status ~ Variabel, scales = "free") +
  scale_fill_manual(values = c("Sebelum Scaling" = "#E74C3C", "Setelah Scaling" = "#2ECC71")) +
  labs(title = "Distribusi Variabel: Sebelum vs Sesudah Scaling",
       x = "Nilai", y = "Frekuensi") +
  theme_minimal(base_size = 11) +
  theme(legend.position = "none", strip.text = element_text(face = "bold"))


Menentukan Jumlah Cluster Optimal

Metode Elbow

set.seed(123)
fviz_nbclust(df_scaled, kmeans, method = "wss", k.max = 10) +
  geom_vline(xintercept = 3, linetype = "dashed", color = "red", linewidth = 0.8) +
  labs(title = "Metode Elbow", x = "Jumlah Cluster (k)", y = "Total Within Sum of Squares") +
  theme_minimal()

Metode Silhouette

set.seed(123)
fviz_nbclust(df_scaled, kmeans, method = "silhouette", k.max = 10) +
  labs(title = "Metode Silhouette", x = "Jumlah Cluster (k)", y = "Silhouette Score Rata-rata") +
  theme_minimal()

K-Means Clustering

k_optimal <- 3
set.seed(123)
kmeans_result <- kmeans(df_scaled, centers = k_optimal, nstart = 25, iter.max = 100)

cat("Ukuran tiap cluster:\n"); print(kmeans_result$size)
## Ukuran tiap cluster:
## [1] 517 932 662
cat("Between SS / Total SS:", round(kmeans_result$betweenss / kmeans_result$totss * 100, 2), "%\n")
## Between SS / Total SS: 18.15 %
fviz_cluster(
  kmeans_result,
  data = df_scaled,
  geom = "point",
  ellipse.type = "convex",
  ggtheme = theme_minimal()
)

df_encoded$Cluster_KMeans <- as.factor(kmeans_result$cluster)

profil_kmeans <- df_encoded %>%
  group_by(Cluster_KMeans) %>%
  summarise(n = n(),
            Usia_rata2 = round(mean(Age), 2),    TB_rata2   = round(mean(Height), 3),
            BB_rata2   = round(mean(Weight), 2),  FCVC_rata2 = round(mean(FCVC), 2),
            FAF_rata2  = round(mean(FAF), 2),     TUE_rata2  = round(mean(TUE), 2),
            CH2O_rata2 = round(mean(CH2O), 2))

kable(profil_kmeans,
      caption = "Profil Rata-rata per Cluster K-Means",
      col.names = c("Cluster","n","Usia","Tinggi","Berat","Sayur","Aktivitas","Teknologi","Air"))
Profil Rata-rata per Cluster K-Means
Cluster n Usia Tinggi Berat Sayur Aktivitas Teknologi Air
1 517 20.81 1.638 57.61 2.41 1.14 0.78 1.78
2 932 25.12 1.767 93.90 2.25 1.17 0.65 2.11
3 662 25.90 1.660 98.93 2.67 0.68 0.58 2.03
df_scaled_cluster        <- df_scaled
df_scaled_cluster$Cluster <- as.factor(kmeans_result$cluster)

profil_z <- df_scaled_cluster %>%
  group_by(Cluster) %>%
  summarise(across(where(is.numeric), mean)) %>%
  pivot_longer(-Cluster, names_to = "Variabel", values_to = "Z_Score")

ggplot(profil_z, aes(x = Variabel, y = Cluster, fill = Z_Score)) +
  geom_tile(color = "white", linewidth = 0.5) +
  geom_text(aes(label = round(Z_Score, 2)), size = 3) +
  scale_fill_gradient2(low = "#2980B9", mid = "white", high = "#E74C3C", midpoint = 0) +
  labs(title = "Heatmap Profil Cluster K-Means (Z-Score)",
       subtitle = "Merah = di atas rata-rata | Biru = di bawah rata-rata") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 9))


Hierarchical Clustering

set.seed(123)
n_sample   <- min(500, nrow(df_scaled))
idx_sample <- sample(1:nrow(df_scaled), n_sample, replace = FALSE)
df_hclust  <- df_scaled[idx_sample, ]

dist_matrix <- dist(df_hclust, method = "euclidean")
hclust_ward <- hclust(dist_matrix, method = "ward.D2")

cat("Jumlah observasi:", n_sample, "| Linkage: ward.D2 | Jarak: Euclidean\n")
## Jumlah observasi: 500 | Linkage: ward.D2 | Jarak: Euclidean
plot(hclust_ward,
     main = paste("Dendrogram Hierarchical Clustering (Ward's, n =", n_sample, ")"),
     xlab = "Observasi", ylab = "Height", labels = FALSE, hang = -1)
abline(h = 30, col = "red", lty = 2, lwd = 2)
legend("topright", legend = "Garis potong (k=3)", col = "red", lty = 2, lwd = 2, bty = "n")

hclust_cluster <- cutree(hclust_ward, k = k_optimal)
cat("Distribusi cluster HC:\n"); print(table(hclust_cluster))
## Distribusi cluster HC:
## hclust_cluster
##   1   2   3 
## 161  85 254
fviz_cluster(
  list(data = df_hclust, cluster = hclust_cluster),
  geom = "point",
  ellipse.type = "convex"
) +
  theme_minimal() +
  labs(
    title = paste("Hierarchical Clustering - Ward's (k =", k_optimal, ")"),
    subtitle = "Reduksi dimensi dengan PCA"
  )

df_hclust_profil            <- df_encoded[idx_sample, ]
df_hclust_profil$Cluster_HC <- as.factor(hclust_cluster)

profil_hc <- df_hclust_profil %>%
  group_by(Cluster_HC) %>%
  summarise(n = n(),
            Usia_rata2 = round(mean(Age), 2),    TB_rata2   = round(mean(Height), 3),
            BB_rata2   = round(mean(Weight), 2),  FCVC_rata2 = round(mean(FCVC), 2),
            FAF_rata2  = round(mean(FAF), 2),     TUE_rata2  = round(mean(TUE), 2),
            CH2O_rata2 = round(mean(CH2O), 2))

kable(profil_hc,
      caption = "Profil Rata-rata per Cluster Hierarchical",
      col.names = c("Cluster","n","Usia","Tinggi","Berat","Sayur","Aktivitas","Teknologi","Air"))
Profil Rata-rata per Cluster Hierarchical
Cluster n Usia Tinggi Berat Sayur Aktivitas Teknologi Air
1 161 21.20 1.675 63.46 2.35 1.24 0.71 1.90
2 85 23.15 1.699 123.15 3.00 0.78 0.62 2.23
3 254 26.64 1.719 89.96 2.28 0.94 0.64 2.02

Validasi Cluster

Silhouette Score

sil_kmeans <- silhouette(kmeans_result$cluster, dist(df_scaled))
avg_sil_km <- mean(sil_kmeans[, 3])

sil_hc     <- silhouette(hclust_cluster, dist_matrix)
avg_sil_hc <- mean(sil_hc[, 3])

cat("Silhouette K-Means      :", round(avg_sil_km, 4), "\n")
## Silhouette K-Means      : 0.1002
cat("Silhouette Hierarchical :", round(avg_sil_hc, 4), "\n")
## Silhouette Hierarchical : 0.0605
cat("Pemenang:", ifelse(avg_sil_km > avg_sil_hc, "K-Means", "Hierarchical"), "\n")
## Pemenang: K-Means
p1 <- fviz_silhouette(sil_kmeans, print.summary = FALSE) +
  labs(title = paste("Silhouette K-Means =", round(avg_sil_km, 4))) +
  theme_minimal() + theme(legend.position = "bottom")

p2 <- fviz_silhouette(sil_hc, print.summary = FALSE) +
  labs(title = paste("Silhouette Hierarchical =", round(avg_sil_hc, 4))) +
  theme_minimal() + theme(legend.position = "bottom")

grid.arrange(p1, p2, ncol = 2)

Dunn Index & Connectivity

val_km <- clValid(as.matrix(df_scaled[idx_sample, ]), k_optimal,
                  clMethods = "kmeans",       validation = "internal")
val_hc <- clValid(as.matrix(df_hclust),       k_optimal,
                  clMethods = "hierarchical", validation = "internal")

cat("--- K-Means ---\n");      summary(val_km)
## --- K-Means ---
## 
## Clustering Methods:
##  kmeans 
## 
## Cluster sizes:
##  3 
## 
## Validation Measures:
##                            3
##                             
## kmeans Connectivity  87.7278
##        Dunn           0.1744
##        Silhouette     0.2102
## 
## Optimal Scores:
## 
##              Score   Method Clusters
## Connectivity 87.7278 kmeans 3       
## Dunn          0.1744 kmeans 3       
## Silhouette    0.2102 kmeans 3
cat("--- Hierarchical ---\n"); summary(val_hc)
## --- Hierarchical ---
## 
## Clustering Methods:
##  hierarchical 
## 
## Cluster sizes:
##  3 
## 
## Validation Measures:
##                                 3
##                                  
## hierarchical Connectivity  7.7849
##              Dunn          0.5450
##              Silhouette    0.4401
## 
## Optimal Scores:
## 
##              Score  Method       Clusters
## Connectivity 7.7849 hierarchical 3       
## Dunn         0.5450 hierarchical 3       
## Silhouette   0.4401 hierarchical 3

Ringkasan Validasi

tabel_validasi <- data.frame(
  Metrik       = c("Silhouette ↑", "Dunn Index ↑", "Connectivity ↓"),
  Keterangan   = c("Kohesivitas & separasi (lebih tinggi lebih baik)",
                   "Jarak antar-cluster vs ukuran (lebih tinggi lebih baik)",
                   "Ketetanggaan antar-cluster (lebih rendah lebih baik)"),
  KMeans       = c(round(avg_sil_km, 4), round(measures(val_km)[2], 4), round(measures(val_km)[1], 4)),
  Hierarchical = c(round(avg_sil_hc, 4), round(measures(val_hc)[2], 4), round(measures(val_hc)[1], 4))
)

kable(tabel_validasi,
      caption = "Perbandingan Indeks Validasi: K-Means vs Hierarchical",
      col.names = c("Metrik", "Keterangan", "K-Means", "Hierarchical"))
Perbandingan Indeks Validasi: K-Means vs Hierarchical
Metrik Keterangan K-Means Hierarchical
Silhouette ↑ Kohesivitas & separasi (lebih tinggi lebih baik) 0.1002 0.0605
Dunn Index ↑ Jarak antar-cluster vs ukuran (lebih tinggi lebih baik) 0.1744 0.5450
Connectivity ↓ Ketetanggaan antar-cluster (lebih rendah lebih baik) 87.7278 7.7849
df_compare <- data.frame(
  Metode     = c("K-Means", "Hierarchical"),
  Silhouette = c(avg_sil_km, avg_sil_hc)
)

ggplot(df_compare, aes(x = Metode, y = Silhouette, fill = Metode)) +
  geom_col(width = 0.5, show.legend = FALSE) +
  geom_text(aes(label = round(Silhouette, 4)), vjust = -0.5, size = 5, fontface = "bold") +
  scale_fill_manual(values = c("K-Means" = "#3498DB", "Hierarchical" = "#E67E22")) +
  scale_y_continuous(limits = c(0, max(df_compare$Silhouette) * 1.2)) +
  labs(title = "Perbandingan Silhouette Score",
       subtitle = "Nilai lebih tinggi = cluster lebih baik",
       x = "Metode", y = "Silhouette Score") +
  theme_minimal(base_size = 13)


Kesimpulan

metode_terbaik <- ifelse(avg_sil_km >= avg_sil_hc, "K-Means", "Hierarchical Clustering")

cat("==============================================\n")
## ==============================================
cat("  KESIMPULAN ANALISIS CLUSTERING\n")
##   KESIMPULAN ANALISIS CLUSTERING
cat("==============================================\n")
## ==============================================
cat("Dataset :", nrow(df_raw), "obs x", ncol(df_raw), "variabel\n")
## Dataset : 2111 obs x 17 variabel
cat("k optimal:", k_optimal, "\n\n")
## k optimal: 3
cat("Silhouette K-Means      :", round(avg_sil_km, 4), "\n")
## Silhouette K-Means      : 0.1002
cat("Silhouette Hierarchical :", round(avg_sil_hc, 4), "\n")
## Silhouette Hierarchical : 0.0605
cat("Metode terbaik          :", metode_terbaik, "\n\n")
## Metode terbaik          : K-Means
cat("Profil Cluster (K-Means):\n")
## Profil Cluster (K-Means):
for (i in 1:k_optimal) {
  b <- profil_kmeans[profil_kmeans$Cluster_KMeans == i, ]
  cat(sprintf("  Cluster %d (n=%d): BB=%.1f kg, Aktivitas=%.2f, Sayur=%.2f\n",
              i, b$n, b$BB_rata2, b$FAF_rata2, b$FCVC_rata2))
}
##   Cluster 1 (n=517): BB=57.6 kg, Aktivitas=1.14, Sayur=2.41
##   Cluster 2 (n=932): BB=93.9 kg, Aktivitas=1.17, Sayur=2.25
##   Cluster 3 (n=662): BB=98.9 kg, Aktivitas=0.68, Sayur=2.67