Modul 3 Clustering

Persiapan dan Library

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.5.3

## Warning: package 'ggplot2' was built under R version 4.5.3

## Warning: package 'readr' was built under R version 4.5.3

## Warning: package 'dplyr' was built under R version 4.5.3

## Warning: package 'lubridate' was built under R version 4.5.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.1     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(cluster)

## Warning: package 'cluster' was built under R version 4.5.3

library(factoextra)

## Warning: package 'factoextra' was built under R version 4.5.3

## Welcome to factoextra!
## Want to learn more? See two factoextra-related books at https://www.datanovia.com/en/product/practical-guide-to-principal-component-methods-in-r/

library(dbscan)

## Warning: package 'dbscan' was built under R version 4.5.3

## 
## Attaching package: 'dbscan'
## 
## The following object is masked from 'package:stats':
## 
##     as.dendrogram

library(fpc)

## Warning: package 'fpc' was built under R version 4.5.3

## 
## Attaching package: 'fpc'
## 
## The following object is masked from 'package:dbscan':
## 
##     dbscan

library(ppclust)

## Warning: package 'ppclust' was built under R version 4.5.3

## 
## Attaching package: 'ppclust'
## 
## The following object is masked from 'package:fpc':
## 
##     plotcluster

library(meanShiftR)
library(gridExtra)

## Warning: package 'gridExtra' was built under R version 4.5.3

## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine

Load Dataset

hawks_data <- read.csv(file.choose())

hawks_clean <- hawks_data %>%
  select(Wing, Weight, Tail, Hallux) %>%
  drop_na()

hawks_scaled <- scale(hawks_clean)

Implementasi 5 Metode

k_range <- 2:7
wss     <- sapply(k_range, function(k) kmeans(hawks_scaled, k, nstart=25)$tot.withinss)
sil_avg <- sapply(k_range, function(k) mean(silhouette(kmeans(hawks_scaled,k,nstart=25)$cluster, dist(hawks_scaled))[,3]))
k_val       <- k_range[which.max(sil_avg)]
cat("K optimal:", k_val, "\n")

## K optimal: 3

p1 <- ggplot(data.frame(k=k_range, WSS=wss), aes(k,WSS)) +
  geom_line(color="#2196F3",size=1.1) + geom_point(color="#F44336",size=3) +
  geom_vline(xintercept=k_val, linetype="dashed") +
  labs(title="Elbow Method", x="k", y="WSS") + theme_minimal()

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

p2 <- ggplot(data.frame(k=k_range, Sil=sil_avg), aes(k,Sil)) +
  geom_line(color="#4CAF50",size=1.1) + geom_point(color="#FF9800",size=3) +
  geom_vline(xintercept=k_val, linetype="dashed") +
  labs(title="Silhouette Method", x="k", y="Avg Silhouette") + theme_minimal()

grid.arrange(p1, p2, ncol=2, top="Visualisasi Elbow Method dan Silhouette Method")

# A. K-Means
set.seed(123)
km_res <- kmeans(hawks_scaled, centers = k_val, nstart = 25)

# B. K-Medians
pam_res <- pam(hawks_scaled, k = k_val)

# C. DBSCAN
db_res <- dbscan::dbscan(hawks_scaled, eps = 0.5, minPts = 5)

# D. MeanShift
ms_res <- meanShift(hawks_scaled)

# E. Fuzzy C-Means
fcm_res <- fcm(hawks_scaled, centers = k_val)

Evaluasi Menggunakan Silhoutte Score

calc_sil <- function(data, clusters) {
  if(length(unique(clusters)) > 1) {
    ss <- silhouette(clusters, dist(data))
    return(mean(ss[, 3]))
  } else return(0)
}

results <- data.frame(
  Metode = c("K-Means", "K-Medians (PAM)", "DBSCAN", "Fuzzy C-Means", "Meanshift"),
  Silhouette = c(
    calc_sil(hawks_scaled, km_res$cluster),
    calc_sil(hawks_scaled, pam_res$clustering),
    calc_sil(hawks_scaled, db_res$cluster),
    calc_sil(hawks_scaled, fcm_res$cluster),
    calc_sil(hawks_scaled, ms_res$cluster)
  )
)

print("--- Hasil Evaluasi Silhouette Score ---")

## [1] "--- Hasil Evaluasi Silhouette Score ---"

print(results)

##            Metode Silhouette
## 1         K-Means  0.7565214
## 2 K-Medians (PAM)  0.4625866
## 3          DBSCAN  0.6546952
## 4   Fuzzy C-Means  0.6703255
## 5       Meanshift  0.0000000

Pilih Metode Terbaik

best_cluster <- km_res$cluster
hawks_final <- hawks_clean %>% mutate(Cluster = as.factor(best_cluster))

summary_stats <- hawks_final %>%
  group_by(Cluster) %>%
  summarise(across(everything(), list(mean = mean, sd = sd)))
print(summary_stats)

## # A tibble: 3 × 9
##   Cluster Wing_mean Wing_sd Weight_mean Weight_sd Tail_mean Tail_sd Hallux_mean
##   <fct>       <dbl>   <dbl>       <dbl>     <dbl>     <dbl>   <dbl>       <dbl>
## 1 1            192.    27.1        176.      94.7      153.    21.5        14.5
## 2 2            382.    33.9       1086.     204.       223.    14.7        30.8
## 3 3            253.   129.         533.     755.       172.    55.0       105. 
## # ℹ 1 more variable: Hallux_sd <dbl>

fviz_cluster(km_res, data = hawks_scaled,
             palette = "jco", 
             geom = "point",
             ellipse.type = "convex", 
             ggtheme = theme_minimal(),
             main = "Visualisasi Cluster Terbaik (K-Means)")

hawks_final %>%
  pivot_longer(cols = -Cluster, names_to = "Feature", values_to = "Value") %>%
  ggplot(aes(x = Cluster, y = Value, fill = Cluster)) +
  geom_boxplot() +
  facet_wrap(~Feature, scales = "free") +
  theme_minimal() +
  labs(title = "Distribusi Fitur Fisik per Cluster")

Kesimpulan

Berdasarkan hasil perbandingan lima metode clustering, algoritma K-Means memberikan performa optimal dengan nilai Silhouette Score tertinggi, yang menunjukkan pemisahan kelompok yang solid berdasarkan fitur morfologi burung. Proses ini berhasil mengidentifikasi tiga kelompok utama tanpa bantuan label biner: cluster dengan ukuran fisik terbesar (representasi Red-tailed), cluster ukuran terkecil (Sharp-shinned), dan kelompok menengah (Cooper’s). Standarisasi data terbukti krusial dalam menyeimbangkan skala variabel seperti Weight dan Hallux, sehingga algoritma mampu mengelompokkan data secara akurat. Secara keseluruhan, analisis ini membuktikan bahwa dimensi fisik seperti panjang sayap, berat, dan ekor merupakan prediktor taksonomi yang sangat kuat dalam membedakan spesies Hawk secara otomatis