# ============================================================
# CLUSTERING DATASET SEISMIC BUMPS
# 5 Metode: K-Means, K-Medians, DBSCAN, Mean Shift, Fuzzy C-Means
# Dataset: seismic-bumps.arff
# ============================================================
# ── 1. INSTALL & LOAD LIBRARY ──────────────────────────────
packages <- c("foreign", "cluster", "dbscan", "ks", "e1071",
"factoextra", "ggplot2", "dplyr", "gridExtra",
"clusterCrit", "scales")
install_if_missing <- function(pkg) {
if (!requireNamespace(pkg, quietly = TRUE)) {
install.packages(pkg, repos = "https://cloud.r-project.org")
}
}
invisible(lapply(packages, install_if_missing))
invisible(lapply(packages, library, character.only = TRUE))
## Warning: package 'cluster' was built under R version 4.5.3
## Warning: package 'dbscan' was built under R version 4.5.3
##
## Attaching package: 'dbscan'
## The following object is masked from 'package:stats':
##
## as.dendrogram
## Warning: package 'ks' was built under R version 4.5.3
## Warning: package 'e1071' was built under R version 4.5.3
## Warning: package 'factoextra' was built under R version 4.5.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.5.3
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:e1071':
##
## element
## Welcome to factoextra!
## Want to learn more? See two factoextra-related books at https://www.datanovia.com/en/product/practical-guide-to-principal-component-methods-in-r/
## Warning: package 'dplyr' was built under R version 4.5.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Warning: package 'gridExtra' was built under R version 4.5.3
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
## Warning: package 'clusterCrit' was built under R version 4.5.3
## Warning: package 'scales' was built under R version 4.5.3
cat("✔ Semua library berhasil dimuat.\n\n")
## ✔ Semua library berhasil dimuat.
# ── 2. LOAD DATA ────────────────────────────────────────────
# Ganti path sesuai lokasi file Anda
data_raw <- read.arff("seismic-bumps.arff")
cat("=== INFO DATASET ===\n")
## === INFO DATASET ===
cat("Jumlah baris :", nrow(data_raw), "\n")
## Jumlah baris : 2584
cat("Jumlah kolom :", ncol(data_raw), "\n")
## Jumlah kolom : 19
cat("Nama kolom :", paste(names(data_raw), collapse = ", "), "\n\n")
## Nama kolom : seismic, seismoacoustic, shift, genergy, gpuls, gdenergy, gdpuls, ghazard, nbumps, nbumps2, nbumps3, nbumps4, nbumps5, nbumps6, nbumps7, nbumps89, energy, maxenergy, class
cat("Distribusi kelas:\n")
## Distribusi kelas:
print(table(data_raw$class))
##
## 0 1
## 2414 170
# ── 3. PRA-PEMROSESAN ───────────────────────────────────────
# Pilih hanya fitur numerik (buang variabel kategorik & class)
cat_cols <- c("seismic", "seismoacoustic", "shift", "ghazard", "class")
num_cols <- setdiff(names(data_raw), cat_cols)
data_num <- data_raw[, num_cols]
# Hapus kolom dengan varians nol
zero_var <- sapply(data_num, function(x) var(x, na.rm = TRUE) == 0)
data_num <- data_num[, !zero_var]
cat("\nKolom numerik yang digunakan:\n")
##
## Kolom numerik yang digunakan:
cat(paste(names(data_num), collapse = ", "), "\n\n")
## genergy, gpuls, gdenergy, gdpuls, nbumps, nbumps2, nbumps3, nbumps4, nbumps5, energy, maxenergy
# Standardisasi (Z-score)
data_scaled <- scale(data_num)
# Reduksi ke 2 dimensi (PCA) untuk visualisasi
pca_result <- prcomp(data_scaled, center = FALSE, scale. = FALSE)
pca_df <- as.data.frame(pca_result$x[, 1:2])
colnames(pca_df) <- c("PC1", "PC2")
var_exp <- round(summary(pca_result)$importance[2, 1:2] * 100, 2)
cat(sprintf("Variansi dijelaskan PC1=%.2f%%, PC2=%.2f%%\n\n",
var_exp[1], var_exp[2]))
## Variansi dijelaskan PC1=32.52%, PC2=19.85%
# Label kelas asli untuk referensi
true_labels <- as.integer(as.character(data_raw$class))
# ── 4. FUNGSI BANTU ─────────────────────────────────────────
eval_cluster <- function(data_mat, clusters, method_name) {
# Buang noise (label -1 / 0 pada DBSCAN)
valid <- clusters > 0
if (sum(valid) < 10) {
cat(method_name, ": terlalu sedikit titik valid untuk evaluasi\n")
return(invisible(NULL))
}
k <- length(unique(clusters[valid]))
if (k < 2) {
cat(method_name, ": hanya 1 cluster, silhouette tidak dihitung\n")
return(invisible(NULL))
}
sil <- silhouette(clusters[valid], dist(data_mat[valid, ]))
cat(sprintf("%-20s | k=%d | Avg Silhouette = %.4f\n",
method_name, k, mean(sil[, 3])))
}
plot_cluster <- function(df2d, clusters, title, true_lab = NULL) {
df_plot <- df2d
df_plot$Cluster <- factor(clusters)
p <- ggplot(df_plot, aes(PC1, PC2, color = Cluster)) +
geom_point(alpha = 0.5, size = 1) +
labs(title = title,
subtitle = sprintf("k=%d cluster", length(unique(clusters[clusters > 0]))),
x = sprintf("PC1 (%.1f%%)", var_exp[1]),
y = sprintf("PC2 (%.1f%%)", var_exp[2])) +
theme_minimal(base_size = 10) +
theme(legend.position = "bottom",
plot.title = element_text(face = "bold"))
return(p)
}
# ============================================================
# METODE 1 : K-MEANS
# ============================================================
cat("\n===== METODE 1: K-MEANS =====\n")
##
## ===== METODE 1: K-MEANS =====
set.seed(42)
k_opt <- 3 # Anda bisa mengubah nilai ini
km_result <- kmeans(data_scaled, centers = k_opt,
nstart = 25, iter.max = 100)
cat("Cluster sizes :", km_result$size, "\n")
## Cluster sizes : 428 2144 12
cat("Within-SS total:", round(km_result$tot.withinss, 2), "\n")
## Within-SS total: 16932.79
eval_cluster(data_scaled, km_result$cluster, "K-Means")
## K-Means | k=3 | Avg Silhouette = 0.5156
pca_df$kmeans <- km_result$cluster
p1 <- plot_cluster(pca_df, km_result$cluster, "K-Means Clustering")
# ============================================================
# METODE 2 : K-MEDIANS (via pam dari package cluster)
# ============================================================
cat("\n===== METODE 2: K-MEDIANS (PAM) =====\n")
##
## ===== METODE 2: K-MEDIANS (PAM) =====
# K-Medians = PAM (Partitioning Around Medoids) + manhattan distance
# Menggunakan sampel karena PAM O(n^2)
set.seed(42)
n_sample <- min(nrow(data_scaled), 800)
idx_sample <- sample(nrow(data_scaled), n_sample)
pam_result <- pam(data_scaled[idx_sample, ],
k = k_opt,
metric = "manhattan") # inilah yang membuat ini K-Medians
# Assign semua baris ke medoid terdekat (manhattan)
assign_to_medoid <- function(data, medoids) {
apply(data, 1, function(row) {
dists <- apply(medoids, 1, function(m) sum(abs(row - m)))
which.min(dists)
})
}
medoids_coord <- data_scaled[idx_sample, ][pam_result$medoids, ]
kmed_clusters <- assign_to_medoid(data_scaled, medoids_coord)
cat("Cluster sizes :", table(kmed_clusters), "\n")
## Cluster sizes : 2584
eval_cluster(data_scaled, kmed_clusters, "K-Medians (PAM)")
## K-Medians (PAM) : hanya 1 cluster, silhouette tidak dihitung
pca_df$kmedians <- kmed_clusters
p2 <- plot_cluster(pca_df, kmed_clusters, "K-Medians Clustering")
# ============================================================
# METODE 3 : DBSCAN
# ============================================================
cat("\n===== METODE 3: DBSCAN =====\n")
##
## ===== METODE 3: DBSCAN =====
# Pilih eps menggunakan k-distance plot (k=5)
kNNdist_val <- sort(kNNdist(data_scaled, k = 5))
# Estimasi eps otomatis: siku kurva k-distance
# Ambil titik di mana kenaikan tiba-tiba (persentil ke-95)
eps_est <- quantile(kNNdist_val, 0.95)
cat(sprintf("Estimasi eps (otomatis, p95) : %.4f\n", eps_est))
## Estimasi eps (otomatis, p95) : 2.1824
db_result <- dbscan(data_scaled,
eps = eps_est,
minPts = 10)
n_cluster_db <- length(unique(db_result$cluster[db_result$cluster > 0]))
n_noise <- sum(db_result$cluster == 0)
cat(sprintf("Jumlah cluster : %d\n", n_cluster_db))
## Jumlah cluster : 2
cat(sprintf("Titik noise : %d (%.1f%%)\n",
n_noise, n_noise / nrow(data_scaled) * 100))
## Titik noise : 109 (4.2%)
eval_cluster(data_scaled, db_result$cluster, "DBSCAN")
## DBSCAN | k=2 | Avg Silhouette = 0.4943
pca_df$dbscan <- db_result$cluster
p3 <- plot_cluster(pca_df, db_result$cluster, "DBSCAN Clustering")
# ============================================================
# METODE 4 : MEAN SHIFT
# ============================================================
cat("\n===== METODE 4: MEAN SHIFT =====\n")
##
## ===== METODE 4: MEAN SHIFT =====
# Mean Shift memerlukan estimasi bandwidth
# Gunakan PCA 2D agar lebih cepat & visualisasi lebih jelas
set.seed(42)
n_ms <- min(nrow(pca_df), 1000) # subsample untuk kecepatan
idx_ms <- sample(nrow(pca_df), n_ms)
pca_sub <- as.matrix(pca_df[idx_ms, c("PC1","PC2")])
# Estimasi bandwidth (Silverman's rule via ks::hpi)
bw_ms <- hpi(pca_sub)
cat(sprintf("Bandwidth Mean Shift (hpi): %.4f\n", bw_ms))
## Bandwidth Mean Shift (hpi): 0.0553
# Jalankan Mean Shift menggunakan kernel density gradient
ms_kde <- kde(pca_sub, H = diag(c(bw_ms^2, bw_ms^2)))
# Assign setiap titik ke mode density tertinggi (simplified: pakai k-means pada output mode)
# Untuk implementasi penuh Mean Shift di R, kita gunakan ms() dari package meanShiftR bila ada
# Jika tidak ada, kita simulasi dengan iterasi manual sederhana
mean_shift_iter <- function(X, bandwidth, max_iter = 50, tol = 1e-4) {
n <- nrow(X)
X_shifted <- X
for (iter in 1:max_iter) {
X_new <- X_shifted
for (i in 1:n) {
diffs <- sweep(X_shifted, 2, X_shifted[i, ])
w <- exp(-rowSums(diffs^2) / (2 * bandwidth^2))
X_new[i, ] <- colSums(X_shifted * w) / sum(w)
}
shift <- max(abs(X_new - X_shifted))
X_shifted <- X_new
if (shift < tol) break
}
X_shifted
}
cat("Menjalankan Mean Shift (mungkin memerlukan beberapa menit)...\n")
## Menjalankan Mean Shift (mungkin memerlukan beberapa menit)...
ms_shifted <- mean_shift_iter(pca_sub, bandwidth = bw_ms * 2)
# Cluster berdasarkan kedekatan titik akhir (hierarchical cut)
dist_shifted <- dist(ms_shifted)
hc_ms <- hclust(dist_shifted, method = "complete")
ms_cut_height <- quantile(hc_ms$height, 0.98)
ms_labels <- cutree(hc_ms, h = ms_cut_height)
# Extend ke seluruh dataset
full_ms_labels <- integer(nrow(pca_df))
pca_full_mat <- as.matrix(pca_df[, c("PC1","PC2")])
for (i in 1:nrow(pca_full_mat)) {
dists <- apply(pca_sub, 1, function(p)
sum((pca_full_mat[i, ] - p)^2))
full_ms_labels[i] <- ms_labels[which.min(dists)]
}
n_ms_clusters <- length(unique(full_ms_labels))
cat(sprintf("Jumlah cluster Mean Shift : %d\n", n_ms_clusters))
## Jumlah cluster Mean Shift : 21
eval_cluster(as.matrix(pca_df[, c("PC1","PC2")]), full_ms_labels, "Mean Shift")
## Mean Shift | k=21 | Avg Silhouette = 0.4860
pca_df$meanshift <- full_ms_labels
p4 <- plot_cluster(pca_df, full_ms_labels, "Mean Shift Clustering")
# ============================================================
# METODE 5 : FUZZY C-MEANS
# ============================================================
cat("\n===== METODE 5: FUZZY C-MEANS =====\n")
##
## ===== METODE 5: FUZZY C-MEANS =====
set.seed(42)
fcm_result <- cmeans(data_scaled,
centers = k_opt,
iter.max = 100,
m = 2, # fuzzifier (nilai 2 adalah standar)
method = "cmeans",
dist = "euclidean")
# Hard assignment: ambil cluster dengan keanggotaan tertinggi
fcm_clusters <- apply(fcm_result$membership, 1, which.max)
cat("Cluster sizes :", table(fcm_clusters), "\n")
## Cluster sizes : 449 1449 686
cat("Objective (within):", round(fcm_result$withinerror, 4), "\n")
## Objective (within): 3.4929
eval_cluster(data_scaled, fcm_clusters, "Fuzzy C-Means")
## Fuzzy C-Means | k=3 | Avg Silhouette = 0.2857
# Tampilkan derajat keanggotaan rata-rata per cluster
cat("\nRata-rata membership per cluster:\n")
##
## Rata-rata membership per cluster:
for (c in 1:k_opt) {
avg_mem <- round(mean(fcm_result$membership[, c]), 4)
cat(sprintf(" Cluster %d: %.4f\n", c, avg_mem))
}
## Cluster 1: 0.1650
## Cluster 2: 0.5381
## Cluster 3: 0.2969
pca_df$fcm <- fcm_clusters
p5 <- plot_cluster(pca_df, fcm_clusters, "Fuzzy C-Means Clustering")
# ── 5. GABUNGKAN PLOT ───────────────────────────────────────
cat("\n===== MENYIMPAN VISUALISASI =====\n")
##
## ===== MENYIMPAN VISUALISASI =====
png("clustering_results.png", width = 1600, height = 1100, res = 120)
grid.arrange(p1, p2, p3, p4, p5,
ncol = 3,
top = "Perbandingan 5 Metode Clustering — Seismic Bumps Dataset")
dev.off()
## png
## 2
cat("✔ Plot disimpan ke: clustering_results.png\n")
## ✔ Plot disimpan ke: clustering_results.png
# ── 6. RINGKASAN EVALUASI ───────────────────────────────────
cat("\n")
cat("============================================================\n")
## ============================================================
cat("RINGKASAN EVALUASI SEMUA METODE\n")
## RINGKASAN EVALUASI SEMUA METODE
cat("============================================================\n")
## ============================================================
cat(sprintf("%-20s | %-8s | %s\n", "Metode", "k-cluster", "Avg Silhouette"))
## Metode | k-cluster | Avg Silhouette
cat(rep("-", 50), "\n", sep = "")
## --------------------------------------------------
eval_summary <- function(data_mat, clusters, name) {
valid <- clusters > 0
k <- length(unique(clusters[valid]))
if (sum(valid) < 10 || k < 2) {
cat(sprintf("%-20s | %-8s | %s\n", name, k, "N/A"))
return(invisible(NULL))
}
sil_avg <- mean(silhouette(clusters[valid], dist(data_mat[valid, ]))[, 3])
cat(sprintf("%-20s | %-8d | %.4f\n", name, k, sil_avg))
}
# Gunakan subset PCA untuk kecepatan silhouette
set.seed(1)
eval_idx <- sample(nrow(data_scaled), min(500, nrow(data_scaled)))
eval_data <- as.matrix(pca_df[eval_idx, c("PC1","PC2")])
eval_summary(eval_data, km_result$cluster[eval_idx], "K-Means")
## K-Means | 3 | 0.6939
eval_summary(eval_data, kmed_clusters[eval_idx], "K-Medians")
## K-Medians | 1 | N/A
eval_summary(eval_data, db_result$cluster[eval_idx], "DBSCAN")
## DBSCAN | 2 | 0.5666
eval_summary(eval_data, full_ms_labels[eval_idx], "Mean Shift")
## Mean Shift | 19 | 0.5010
eval_summary(eval_data, fcm_clusters[eval_idx], "Fuzzy C-Means")
## Fuzzy C-Means | 3 | 0.2868
cat("============================================================\n")
## ============================================================
cat("\n✔ Selesai! Semua metode clustering berhasil dijalankan.\n")
##
## ✔ Selesai! Semua metode clustering berhasil dijalankan.
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this: