Pendahuluan

Dataset ObesityDataSet berisi data kebiasaan makan, aktivitas fisik, dan kondisi fisik 2.111 individu dengan 17 variabel (16 fitur + 1 label target). Tujuan analisis:

Scaling (standardisasi) variabel numerik
Clustering menggunakan K-Means dan Hierarchical Clustering
Validasi hasil clustering (Silhouette, Dunn, Connectivity)
Membandingkan metode terbaik

Kolom NObeyesdad dihapus sebelum clustering karena analisis bersifat unsupervised.

Load Library dan Data

library(tidyverse)
library(cluster)
library(factoextra)
library(NbClust)
library(clValid)
library(ggdendro)
library(gridExtra)
library(knitr)
library(scales)
library(ggpubr)

df_raw <- read.csv("ObesityDataSet_raw_and_data_sinthetic.csv", stringsAsFactors = TRUE)
cat("Dimensi:", nrow(df_raw), "x", ncol(df_raw), "\n")

## Dimensi: 2111 x 17

head(df_raw)

##   Gender Age Height Weight family_history_with_overweight FAVC FCVC NCP
## 1 Female  21   1.62   64.0                            yes   no    2   3
## 2 Female  21   1.52   56.0                            yes   no    3   3
## 3   Male  23   1.80   77.0                            yes   no    2   3
## 4   Male  27   1.80   87.0                             no   no    3   3
## 5   Male  22   1.78   89.8                             no   no    2   1
## 6   Male  29   1.62   53.0                             no  yes    2   3
##        CAEC SMOKE CH2O SCC FAF TUE       CALC                MTRANS
## 1 Sometimes    no    2  no   0   1         no Public_Transportation
## 2 Sometimes   yes    3 yes   3   0  Sometimes Public_Transportation
## 3 Sometimes    no    2  no   2   1 Frequently Public_Transportation
## 4 Sometimes    no    2  no   2   0 Frequently               Walking
## 5 Sometimes    no    2  no   0   0  Sometimes Public_Transportation
## 6 Sometimes    no    2  no   0   0  Sometimes            Automobile
##            NObeyesdad
## 1       Normal_Weight
## 2       Normal_Weight
## 3       Normal_Weight
## 4  Overweight_Level_I
## 5 Overweight_Level_II
## 6       Normal_Weight

str(df_raw)

## 'data.frame':    2111 obs. of  17 variables:
##  $ Gender                        : Factor w/ 2 levels "Female","Male": 1 1 2 2 2 2 1 2 2 2 ...
##  $ Age                           : num  21 21 23 27 22 29 23 22 24 22 ...
##  $ Height                        : num  1.62 1.52 1.8 1.8 1.78 1.62 1.5 1.64 1.78 1.72 ...
##  $ Weight                        : num  64 56 77 87 89.8 53 55 53 64 68 ...
##  $ family_history_with_overweight: Factor w/ 2 levels "no","yes": 2 2 2 1 1 1 2 1 2 2 ...
##  $ FAVC                          : Factor w/ 2 levels "no","yes": 1 1 1 1 1 2 2 1 2 2 ...
##  $ FCVC                          : num  2 3 2 3 2 2 3 2 3 2 ...
##  $ NCP                           : num  3 3 3 3 1 3 3 3 3 3 ...
##  $ CAEC                          : Factor w/ 4 levels "Always","Frequently",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ SMOKE                         : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
##  $ CH2O                          : num  2 3 2 2 2 2 2 2 2 2 ...
##  $ SCC                           : Factor w/ 2 levels "no","yes": 1 2 1 1 1 1 1 1 1 1 ...
##  $ FAF                           : num  0 3 2 2 0 0 1 3 1 1 ...
##  $ TUE                           : num  1 0 1 0 0 0 0 0 1 1 ...
##  $ CALC                          : Factor w/ 4 levels "Always","Frequently",..: 3 4 2 2 4 4 4 4 2 3 ...
##  $ MTRANS                        : Factor w/ 5 levels "Automobile","Bike",..: 4 4 4 5 4 1 3 4 4 4 ...
##  $ NObeyesdad                    : Factor w/ 7 levels "Insufficient_Weight",..: 2 2 2 6 7 2 2 2 2 2 ...

na_count <- colSums(is.na(df_raw))
cat("Missing value per kolom:\n"); print(na_count)

## Missing value per kolom:

##                         Gender                            Age 
##                              0                              0 
##                         Height                         Weight 
##                              0                              0 
## family_history_with_overweight                           FAVC 
##                              0                              0 
##                           FCVC                            NCP 
##                              0                              0 
##                           CAEC                          SMOKE 
##                              0                              0 
##                           CH2O                            SCC 
##                              0                              0 
##                            FAF                            TUE 
##                              0                              0 
##                           CALC                         MTRANS 
##                              0                              0 
##                     NObeyesdad 
##                              0

cat("Total:", sum(na_count), "\n")

## Total: 0

Pra-Pemrosesan Data

Encoding Variabel Kategorikal

df_encoded <- df_raw %>%
  mutate(
    Gender = ifelse(Gender == "Male", 1, 0),
    family_history_with_overweight = ifelse(family_history_with_overweight == "yes", 1, 0),
    FAVC  = ifelse(FAVC  == "yes", 1, 0),
    SMOKE = ifelse(SMOKE == "yes", 1, 0),
    SCC   = ifelse(SCC   == "yes", 1, 0),
    CAEC = case_when(CAEC == "no" ~ 0, CAEC == "Sometimes" ~ 1,
                     CAEC == "Frequently" ~ 2, CAEC == "Always" ~ 3),
    CALC = case_when(CALC == "no" ~ 0, CALC == "Sometimes" ~ 1,
                     CALC == "Frequently" ~ 2, CALC == "Always" ~ 3),
    MTRANS = case_when(
      MTRANS == "Walking" ~ 1, MTRANS == "Bike" ~ 2,
      MTRANS == "Motorbike" ~ 3, MTRANS == "Public_Transportation" ~ 4,
      MTRANS == "Automobile" ~ 5)
  ) %>%
  select(-NObeyesdad)

cat("Dimensi setelah encoding:", nrow(df_encoded), "x", ncol(df_encoded), "\n")

## Dimensi setelah encoding: 2111 x 16

head(df_encoded)

##   Gender Age Height Weight family_history_with_overweight FAVC FCVC NCP CAEC
## 1      0  21   1.62   64.0                              1    0    2   3    1
## 2      0  21   1.52   56.0                              1    0    3   3    1
## 3      1  23   1.80   77.0                              1    0    2   3    1
## 4      1  27   1.80   87.0                              0    0    3   3    1
## 5      1  22   1.78   89.8                              0    0    2   1    1
## 6      1  29   1.62   53.0                              0    1    2   3    1
##   SMOKE CH2O SCC FAF TUE CALC MTRANS
## 1     0    2   0   0   1    0      4
## 2     1    3   1   3   0    1      4
## 3     0    2   0   2   1    2      4
## 4     0    2   0   2   0    2      1
## 5     0    2   0   0   0    1      4
## 6     0    2   0   0   0    1      5

Scaling (Z-Score)

df_scaled <- as.data.frame(scale(df_encoded))
cat("Mean tiap kolom (harus ~0):\n"); round(colMeans(df_scaled), 4)

## Mean tiap kolom (harus ~0):

##                         Gender                            Age 
##                              0                              0 
##                         Height                         Weight 
##                              0                              0 
## family_history_with_overweight                           FAVC 
##                              0                              0 
##                           FCVC                            NCP 
##                              0                              0 
##                           CAEC                          SMOKE 
##                              0                              0 
##                           CH2O                            SCC 
##                              0                              0 
##                            FAF                            TUE 
##                              0                              0 
##                           CALC                         MTRANS 
##                              0                              0

cat("SD tiap kolom (harus ~1):\n"); round(apply(df_scaled, 2, sd), 4)

## SD tiap kolom (harus ~1):

##                         Gender                            Age 
##                              1                              1 
##                         Height                         Weight 
##                              1                              1 
## family_history_with_overweight                           FAVC 
##                              1                              1 
##                           FCVC                            NCP 
##                              1                              1 
##                           CAEC                          SMOKE 
##                              1                              1 
##                           CH2O                            SCC 
##                              1                              1 
##                            FAF                            TUE 
##                              1                              1 
##                           CALC                         MTRANS 
##                              1                              1

vars_show <- c("Age", "Height", "Weight", "FCVC")

df_viz <- bind_rows(
  df_encoded[, vars_show] %>% pivot_longer(everything(), names_to = "Variabel", values_to = "Nilai") %>% mutate(Status = "Sebelum Scaling"),
  df_scaled[, vars_show]  %>% pivot_longer(everything(), names_to = "Variabel", values_to = "Nilai") %>% mutate(Status = "Setelah Scaling")
)

ggplot(df_viz, aes(x = Nilai, fill = Status)) +
  geom_histogram(bins = 30, alpha = 0.7, color = "white") +
  facet_grid(Status ~ Variabel, scales = "free") +
  scale_fill_manual(values = c("Sebelum Scaling" = "#E74C3C", "Setelah Scaling" = "#2ECC71")) +
  labs(title = "Distribusi Variabel: Sebelum vs Sesudah Scaling",
       x = "Nilai", y = "Frekuensi") +
  theme_minimal(base_size = 11) +
  theme(legend.position = "none", strip.text = element_text(face = "bold"))

Menentukan Jumlah Cluster Optimal

Metode Elbow

set.seed(123)
fviz_nbclust(df_scaled, kmeans, method = "wss", k.max = 10) +
  geom_vline(xintercept = 3, linetype = "dashed", color = "red", linewidth = 0.8) +
  labs(title = "Metode Elbow", x = "Jumlah Cluster (k)", y = "Total Within Sum of Squares") +
  theme_minimal()

Metode Silhouette

set.seed(123)
fviz_nbclust(df_scaled, kmeans, method = "silhouette", k.max = 10) +
  labs(title = "Metode Silhouette", x = "Jumlah Cluster (k)", y = "Silhouette Score Rata-rata") +
  theme_minimal()

K-Means Clustering

k_optimal <- 3
set.seed(123)
kmeans_result <- kmeans(df_scaled, centers = k_optimal, nstart = 25, iter.max = 100)

cat("Ukuran tiap cluster:\n"); print(kmeans_result$size)

## Ukuran tiap cluster:

## [1] 517 932 662

cat("Between SS / Total SS:", round(kmeans_result$betweenss / kmeans_result$totss * 100, 2), "%\n")

## Between SS / Total SS: 18.15 %

fviz_cluster(
  kmeans_result,
  data = df_scaled,
  geom = "point",
  ellipse.type = "convex",
  ggtheme = theme_minimal()
)

df_encoded$Cluster_KMeans <- as.factor(kmeans_result$cluster)

profil_kmeans <- df_encoded %>%
  group_by(Cluster_KMeans) %>%
  summarise(n = n(),
            Usia_rata2 = round(mean(Age), 2),    TB_rata2   = round(mean(Height), 3),
            BB_rata2   = round(mean(Weight), 2),  FCVC_rata2 = round(mean(FCVC), 2),
            FAF_rata2  = round(mean(FAF), 2),     TUE_rata2  = round(mean(TUE), 2),
            CH2O_rata2 = round(mean(CH2O), 2))

kable(profil_kmeans,
      caption = "Profil Rata-rata per Cluster K-Means",
      col.names = c("Cluster","n","Usia","Tinggi","Berat","Sayur","Aktivitas","Teknologi","Air"))

Profil Rata-rata per Cluster K-Means
Cluster	n	Usia	Tinggi	Berat	Sayur	Aktivitas	Teknologi	Air
1	517	20.81	1.638	57.61	2.41	1.14	0.78	1.78
2	932	25.12	1.767	93.90	2.25	1.17	0.65	2.11
3	662	25.90	1.660	98.93	2.67	0.68	0.58	2.03

df_scaled_cluster        <- df_scaled
df_scaled_cluster$Cluster <- as.factor(kmeans_result$cluster)

profil_z <- df_scaled_cluster %>%
  group_by(Cluster) %>%
  summarise(across(where(is.numeric), mean)) %>%
  pivot_longer(-Cluster, names_to = "Variabel", values_to = "Z_Score")

ggplot(profil_z, aes(x = Variabel, y = Cluster, fill = Z_Score)) +
  geom_tile(color = "white", linewidth = 0.5) +
  geom_text(aes(label = round(Z_Score, 2)), size = 3) +
  scale_fill_gradient2(low = "#2980B9", mid = "white", high = "#E74C3C", midpoint = 0) +
  labs(title = "Heatmap Profil Cluster K-Means (Z-Score)",
       subtitle = "Merah = di atas rata-rata | Biru = di bawah rata-rata") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 9))

Hierarchical Clustering

set.seed(123)
n_sample   <- min(500, nrow(df_scaled))
idx_sample <- sample(1:nrow(df_scaled), n_sample, replace = FALSE)
df_hclust  <- df_scaled[idx_sample, ]

dist_matrix <- dist(df_hclust, method = "euclidean")
hclust_ward <- hclust(dist_matrix, method = "ward.D2")

cat("Jumlah observasi:", n_sample, "| Linkage: ward.D2 | Jarak: Euclidean\n")

## Jumlah observasi: 500 | Linkage: ward.D2 | Jarak: Euclidean

plot(hclust_ward,
     main = paste("Dendrogram Hierarchical Clustering (Ward's, n =", n_sample, ")"),
     xlab = "Observasi", ylab = "Height", labels = FALSE, hang = -1)
abline(h = 30, col = "red", lty = 2, lwd = 2)
legend("topright", legend = "Garis potong (k=3)", col = "red", lty = 2, lwd = 2, bty = "n")

hclust_cluster <- cutree(hclust_ward, k = k_optimal)
cat("Distribusi cluster HC:\n"); print(table(hclust_cluster))

## Distribusi cluster HC:

## hclust_cluster
##   1   2   3 
## 161  85 254

fviz_cluster(
  list(data = df_hclust, cluster = hclust_cluster),
  geom = "point",
  ellipse.type = "convex"
) +
  theme_minimal() +
  labs(
    title = paste("Hierarchical Clustering - Ward's (k =", k_optimal, ")"),
    subtitle = "Reduksi dimensi dengan PCA"
  )

df_hclust_profil            <- df_encoded[idx_sample, ]
df_hclust_profil$Cluster_HC <- as.factor(hclust_cluster)

profil_hc <- df_hclust_profil %>%
  group_by(Cluster_HC) %>%
  summarise(n = n(),
            Usia_rata2 = round(mean(Age), 2),    TB_rata2   = round(mean(Height), 3),
            BB_rata2   = round(mean(Weight), 2),  FCVC_rata2 = round(mean(FCVC), 2),
            FAF_rata2  = round(mean(FAF), 2),     TUE_rata2  = round(mean(TUE), 2),
            CH2O_rata2 = round(mean(CH2O), 2))

kable(profil_hc,
      caption = "Profil Rata-rata per Cluster Hierarchical",
      col.names = c("Cluster","n","Usia","Tinggi","Berat","Sayur","Aktivitas","Teknologi","Air"))

Profil Rata-rata per Cluster Hierarchical
Cluster	n	Usia	Tinggi	Berat	Sayur	Aktivitas	Teknologi	Air
1	161	21.20	1.675	63.46	2.35	1.24	0.71	1.90
2	85	23.15	1.699	123.15	3.00	0.78	0.62	2.23
3	254	26.64	1.719	89.96	2.28	0.94	0.64	2.02

Validasi Cluster

Silhouette Score

sil_kmeans <- silhouette(kmeans_result$cluster, dist(df_scaled))
avg_sil_km <- mean(sil_kmeans[, 3])

sil_hc     <- silhouette(hclust_cluster, dist_matrix)
avg_sil_hc <- mean(sil_hc[, 3])

cat("Silhouette K-Means      :", round(avg_sil_km, 4), "\n")

## Silhouette K-Means      : 0.1002

cat("Silhouette Hierarchical :", round(avg_sil_hc, 4), "\n")

## Silhouette Hierarchical : 0.0605

cat("Pemenang:", ifelse(avg_sil_km > avg_sil_hc, "K-Means", "Hierarchical"), "\n")

## Pemenang: K-Means

p1 <- fviz_silhouette(sil_kmeans, print.summary = FALSE) +
  labs(title = paste("Silhouette K-Means =", round(avg_sil_km, 4))) +
  theme_minimal() + theme(legend.position = "bottom")

p2 <- fviz_silhouette(sil_hc, print.summary = FALSE) +
  labs(title = paste("Silhouette Hierarchical =", round(avg_sil_hc, 4))) +
  theme_minimal() + theme(legend.position = "bottom")

grid.arrange(p1, p2, ncol = 2)

Dunn Index & Connectivity

val_km <- clValid(as.matrix(df_scaled[idx_sample, ]), k_optimal,
                  clMethods = "kmeans",       validation = "internal")
val_hc <- clValid(as.matrix(df_hclust),       k_optimal,
                  clMethods = "hierarchical", validation = "internal")

cat("--- K-Means ---\n");      summary(val_km)

## --- K-Means ---

## 
## Clustering Methods:
##  kmeans 
## 
## Cluster sizes:
##  3 
## 
## Validation Measures:
##                            3
##                             
## kmeans Connectivity  87.7278
##        Dunn           0.1744
##        Silhouette     0.2102
## 
## Optimal Scores:
## 
##              Score   Method Clusters
## Connectivity 87.7278 kmeans 3       
## Dunn          0.1744 kmeans 3       
## Silhouette    0.2102 kmeans 3

cat("--- Hierarchical ---\n"); summary(val_hc)

## --- Hierarchical ---

## 
## Clustering Methods:
##  hierarchical 
## 
## Cluster sizes:
##  3 
## 
## Validation Measures:
##                                 3
##                                  
## hierarchical Connectivity  7.7849
##              Dunn          0.5450
##              Silhouette    0.4401
## 
## Optimal Scores:
## 
##              Score  Method       Clusters
## Connectivity 7.7849 hierarchical 3       
## Dunn         0.5450 hierarchical 3       
## Silhouette   0.4401 hierarchical 3

Ringkasan Validasi

tabel_validasi <- data.frame(
  Metrik       = c("Silhouette ↑", "Dunn Index ↑", "Connectivity ↓"),
  Keterangan   = c("Kohesivitas & separasi (lebih tinggi lebih baik)",
                   "Jarak antar-cluster vs ukuran (lebih tinggi lebih baik)",
                   "Ketetanggaan antar-cluster (lebih rendah lebih baik)"),
  KMeans       = c(round(avg_sil_km, 4), round(measures(val_km)[2], 4), round(measures(val_km)[1], 4)),
  Hierarchical = c(round(avg_sil_hc, 4), round(measures(val_hc)[2], 4), round(measures(val_hc)[1], 4))
)

kable(tabel_validasi,
      caption = "Perbandingan Indeks Validasi: K-Means vs Hierarchical",
      col.names = c("Metrik", "Keterangan", "K-Means", "Hierarchical"))

Perbandingan Indeks Validasi: K-Means vs Hierarchical
Metrik	Keterangan	K-Means	Hierarchical
Silhouette ↑	Kohesivitas & separasi (lebih tinggi lebih baik)	0.1002	0.0605
Dunn Index ↑	Jarak antar-cluster vs ukuran (lebih tinggi lebih baik)	0.1744	0.5450
Connectivity ↓	Ketetanggaan antar-cluster (lebih rendah lebih baik)	87.7278	7.7849

df_compare <- data.frame(
  Metode     = c("K-Means", "Hierarchical"),
  Silhouette = c(avg_sil_km, avg_sil_hc)
)

ggplot(df_compare, aes(x = Metode, y = Silhouette, fill = Metode)) +
  geom_col(width = 0.5, show.legend = FALSE) +
  geom_text(aes(label = round(Silhouette, 4)), vjust = -0.5, size = 5, fontface = "bold") +
  scale_fill_manual(values = c("K-Means" = "#3498DB", "Hierarchical" = "#E67E22")) +
  scale_y_continuous(limits = c(0, max(df_compare$Silhouette) * 1.2)) +
  labs(title = "Perbandingan Silhouette Score",
       subtitle = "Nilai lebih tinggi = cluster lebih baik",
       x = "Metode", y = "Silhouette Score") +
  theme_minimal(base_size = 13)

Kesimpulan

metode_terbaik <- ifelse(avg_sil_km >= avg_sil_hc, "K-Means", "Hierarchical Clustering")

cat("==============================================\n")

## ==============================================

cat("  KESIMPULAN ANALISIS CLUSTERING\n")

##   KESIMPULAN ANALISIS CLUSTERING

cat("==============================================\n")

## ==============================================

cat("Dataset :", nrow(df_raw), "obs x", ncol(df_raw), "variabel\n")

## Dataset : 2111 obs x 17 variabel

cat("k optimal:", k_optimal, "\n\n")

## k optimal: 3

cat("Silhouette K-Means      :", round(avg_sil_km, 4), "\n")

## Silhouette K-Means      : 0.1002

cat("Silhouette Hierarchical :", round(avg_sil_hc, 4), "\n")

## Silhouette Hierarchical : 0.0605

cat("Metode terbaik          :", metode_terbaik, "\n\n")

## Metode terbaik          : K-Means

cat("Profil Cluster (K-Means):\n")

## Profil Cluster (K-Means):

for (i in 1:k_optimal) {
  b <- profil_kmeans[profil_kmeans$Cluster_KMeans == i, ]
  cat(sprintf("  Cluster %d (n=%d): BB=%.1f kg, Aktivitas=%.2f, Sayur=%.2f\n",
              i, b$n, b$BB_rata2, b$FAF_rata2, b$FCVC_rata2))
}

##   Cluster 1 (n=517): BB=57.6 kg, Aktivitas=1.14, Sayur=2.41
##   Cluster 2 (n=932): BB=93.9 kg, Aktivitas=1.17, Sayur=2.25
##   Cluster 3 (n=662): BB=98.9 kg, Aktivitas=0.68, Sayur=2.67

Estimation of Obesity Levels Based On Eating Habits and Physical Condition

Muhamad Hambali

2026-06-05

Pendahuluan

Load Library dan Data

Pra-Pemrosesan Data

Encoding Variabel Kategorikal

Scaling (Z-Score)

Menentukan Jumlah Cluster Optimal

Metode Elbow

Metode Silhouette

K-Means Clustering

Hierarchical Clustering

Validasi Cluster

Silhouette Score

Dunn Index & Connectivity

Ringkasan Validasi

Kesimpulan