# ============================================================
# CLUSTERING DATASET SEISMIC BUMPS
# 5 Metode: K-Means, K-Medians, DBSCAN, Mean Shift, Fuzzy C-Means
# Dataset: seismic-bumps.arff
# ============================================================

# ── 1. INSTALL & LOAD LIBRARY ──────────────────────────────
packages <- c("foreign", "cluster", "dbscan", "ks", "e1071",
              "factoextra", "ggplot2", "dplyr", "gridExtra",
              "clusterCrit", "scales")

install_if_missing <- function(pkg) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg, repos = "https://cloud.r-project.org")
  }
}

invisible(lapply(packages, install_if_missing))
invisible(lapply(packages, library, character.only = TRUE))
## Warning: package 'cluster' was built under R version 4.5.3
## Warning: package 'dbscan' was built under R version 4.5.3
## 
## Attaching package: 'dbscan'
## The following object is masked from 'package:stats':
## 
##     as.dendrogram
## Warning: package 'ks' was built under R version 4.5.3
## Warning: package 'e1071' was built under R version 4.5.3
## Warning: package 'factoextra' was built under R version 4.5.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.5.3
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:e1071':
## 
##     element
## Welcome to factoextra!
## Want to learn more? See two factoextra-related books at https://www.datanovia.com/en/product/practical-guide-to-principal-component-methods-in-r/
## Warning: package 'dplyr' was built under R version 4.5.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Warning: package 'gridExtra' was built under R version 4.5.3
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
## Warning: package 'clusterCrit' was built under R version 4.5.3
## Warning: package 'scales' was built under R version 4.5.3
cat("✔ Semua library berhasil dimuat.\n\n")
## ✔ Semua library berhasil dimuat.
# ── 2. LOAD DATA ────────────────────────────────────────────
# Ganti path sesuai lokasi file Anda
data_raw <- read.arff("seismic-bumps.arff")

cat("=== INFO DATASET ===\n")
## === INFO DATASET ===
cat("Jumlah baris  :", nrow(data_raw), "\n")
## Jumlah baris  : 2584
cat("Jumlah kolom  :", ncol(data_raw), "\n")
## Jumlah kolom  : 19
cat("Nama kolom    :", paste(names(data_raw), collapse = ", "), "\n\n")
## Nama kolom    : seismic, seismoacoustic, shift, genergy, gpuls, gdenergy, gdpuls, ghazard, nbumps, nbumps2, nbumps3, nbumps4, nbumps5, nbumps6, nbumps7, nbumps89, energy, maxenergy, class
cat("Distribusi kelas:\n")
## Distribusi kelas:
print(table(data_raw$class))
## 
##    0    1 
## 2414  170
# ── 3. PRA-PEMROSESAN ───────────────────────────────────────
# Pilih hanya fitur numerik (buang variabel kategorik & class)
cat_cols  <- c("seismic", "seismoacoustic", "shift", "ghazard", "class")
num_cols  <- setdiff(names(data_raw), cat_cols)

data_num  <- data_raw[, num_cols]

# Hapus kolom dengan varians nol
zero_var  <- sapply(data_num, function(x) var(x, na.rm = TRUE) == 0)
data_num  <- data_num[, !zero_var]
cat("\nKolom numerik yang digunakan:\n")
## 
## Kolom numerik yang digunakan:
cat(paste(names(data_num), collapse = ", "), "\n\n")
## genergy, gpuls, gdenergy, gdpuls, nbumps, nbumps2, nbumps3, nbumps4, nbumps5, energy, maxenergy
# Standardisasi (Z-score)
data_scaled <- scale(data_num)

# Reduksi ke 2 dimensi (PCA) untuk visualisasi
pca_result  <- prcomp(data_scaled, center = FALSE, scale. = FALSE)
pca_df      <- as.data.frame(pca_result$x[, 1:2])
colnames(pca_df) <- c("PC1", "PC2")

var_exp <- round(summary(pca_result)$importance[2, 1:2] * 100, 2)
cat(sprintf("Variansi dijelaskan PC1=%.2f%%, PC2=%.2f%%\n\n",
            var_exp[1], var_exp[2]))
## Variansi dijelaskan PC1=32.52%, PC2=19.85%
# Label kelas asli untuk referensi
true_labels <- as.integer(as.character(data_raw$class))


# ── 4. FUNGSI BANTU ─────────────────────────────────────────
eval_cluster <- function(data_mat, clusters, method_name) {
  # Buang noise (label -1 / 0 pada DBSCAN)
  valid <- clusters > 0
  if (sum(valid) < 10) {
    cat(method_name, ": terlalu sedikit titik valid untuk evaluasi\n")
    return(invisible(NULL))
  }
  k <- length(unique(clusters[valid]))
  if (k < 2) {
    cat(method_name, ": hanya 1 cluster, silhouette tidak dihitung\n")
    return(invisible(NULL))
  }
  sil  <- silhouette(clusters[valid], dist(data_mat[valid, ]))
  cat(sprintf("%-20s | k=%d | Avg Silhouette = %.4f\n",
              method_name, k, mean(sil[, 3])))
}

plot_cluster <- function(df2d, clusters, title, true_lab = NULL) {
  df_plot <- df2d
  df_plot$Cluster <- factor(clusters)
  p <- ggplot(df_plot, aes(PC1, PC2, color = Cluster)) +
    geom_point(alpha = 0.5, size = 1) +
    labs(title = title,
         subtitle = sprintf("k=%d cluster", length(unique(clusters[clusters > 0]))),
         x = sprintf("PC1 (%.1f%%)", var_exp[1]),
         y = sprintf("PC2 (%.1f%%)", var_exp[2])) +
    theme_minimal(base_size = 10) +
    theme(legend.position = "bottom",
          plot.title = element_text(face = "bold"))
  return(p)
}


# ============================================================
# METODE 1 : K-MEANS
# ============================================================
cat("\n===== METODE 1: K-MEANS =====\n")
## 
## ===== METODE 1: K-MEANS =====
set.seed(42)
k_opt <- 3   # Anda bisa mengubah nilai ini

km_result <- kmeans(data_scaled, centers = k_opt,
                    nstart = 25, iter.max = 100)

cat("Cluster sizes  :", km_result$size, "\n")
## Cluster sizes  : 428 2144 12
cat("Within-SS total:", round(km_result$tot.withinss, 2), "\n")
## Within-SS total: 16932.79
eval_cluster(data_scaled, km_result$cluster, "K-Means")
## K-Means              | k=3 | Avg Silhouette = 0.5156
pca_df$kmeans <- km_result$cluster
p1 <- plot_cluster(pca_df, km_result$cluster, "K-Means Clustering")


# ============================================================
# METODE 2 : K-MEDIANS  (via pam dari package cluster)
# ============================================================
cat("\n===== METODE 2: K-MEDIANS (PAM) =====\n")
## 
## ===== METODE 2: K-MEDIANS (PAM) =====
# K-Medians = PAM (Partitioning Around Medoids) + manhattan distance
# Menggunakan sampel karena PAM O(n^2)
set.seed(42)
n_sample <- min(nrow(data_scaled), 800)
idx_sample <- sample(nrow(data_scaled), n_sample)

pam_result <- pam(data_scaled[idx_sample, ],
                  k = k_opt,
                  metric = "manhattan")   # inilah yang membuat ini K-Medians

# Assign semua baris ke medoid terdekat (manhattan)
assign_to_medoid <- function(data, medoids) {
  apply(data, 1, function(row) {
    dists <- apply(medoids, 1, function(m) sum(abs(row - m)))
    which.min(dists)
  })
}
medoids_coord <- data_scaled[idx_sample, ][pam_result$medoids, ]
kmed_clusters <- assign_to_medoid(data_scaled, medoids_coord)

cat("Cluster sizes  :", table(kmed_clusters), "\n")
## Cluster sizes  : 2584
eval_cluster(data_scaled, kmed_clusters, "K-Medians (PAM)")
## K-Medians (PAM) : hanya 1 cluster, silhouette tidak dihitung
pca_df$kmedians <- kmed_clusters
p2 <- plot_cluster(pca_df, kmed_clusters, "K-Medians Clustering")


# ============================================================
# METODE 3 : DBSCAN
# ============================================================
cat("\n===== METODE 3: DBSCAN =====\n")
## 
## ===== METODE 3: DBSCAN =====
# Pilih eps menggunakan k-distance plot (k=5)
kNNdist_val <- sort(kNNdist(data_scaled, k = 5))

# Estimasi eps otomatis: siku kurva k-distance
# Ambil titik di mana kenaikan tiba-tiba (persentil ke-95)
eps_est <- quantile(kNNdist_val, 0.95)
cat(sprintf("Estimasi eps (otomatis, p95) : %.4f\n", eps_est))
## Estimasi eps (otomatis, p95) : 2.1824
db_result <- dbscan(data_scaled,
                    eps    = eps_est,
                    minPts = 10)

n_cluster_db <- length(unique(db_result$cluster[db_result$cluster > 0]))
n_noise      <- sum(db_result$cluster == 0)
cat(sprintf("Jumlah cluster : %d\n", n_cluster_db))
## Jumlah cluster : 2
cat(sprintf("Titik noise    : %d (%.1f%%)\n",
            n_noise, n_noise / nrow(data_scaled) * 100))
## Titik noise    : 109 (4.2%)
eval_cluster(data_scaled, db_result$cluster, "DBSCAN")
## DBSCAN               | k=2 | Avg Silhouette = 0.4943
pca_df$dbscan <- db_result$cluster
p3 <- plot_cluster(pca_df, db_result$cluster, "DBSCAN Clustering")


# ============================================================
# METODE 4 : MEAN SHIFT
# ============================================================
cat("\n===== METODE 4: MEAN SHIFT =====\n")
## 
## ===== METODE 4: MEAN SHIFT =====
# Mean Shift memerlukan estimasi bandwidth
# Gunakan PCA 2D agar lebih cepat & visualisasi lebih jelas
set.seed(42)
n_ms <- min(nrow(pca_df), 1000)   # subsample untuk kecepatan
idx_ms <- sample(nrow(pca_df), n_ms)
pca_sub <- as.matrix(pca_df[idx_ms, c("PC1","PC2")])

# Estimasi bandwidth (Silverman's rule via ks::hpi)
bw_ms <- hpi(pca_sub)
cat(sprintf("Bandwidth Mean Shift (hpi): %.4f\n", bw_ms))
## Bandwidth Mean Shift (hpi): 0.0553
# Jalankan Mean Shift menggunakan kernel density gradient
ms_kde <- kde(pca_sub, H = diag(c(bw_ms^2, bw_ms^2)))

# Assign setiap titik ke mode density tertinggi (simplified: pakai k-means pada output mode)
# Untuk implementasi penuh Mean Shift di R, kita gunakan ms() dari package meanShiftR bila ada
# Jika tidak ada, kita simulasi dengan iterasi manual sederhana

mean_shift_iter <- function(X, bandwidth, max_iter = 50, tol = 1e-4) {
  n <- nrow(X)
  X_shifted <- X
  for (iter in 1:max_iter) {
    X_new <- X_shifted
    for (i in 1:n) {
      diffs  <- sweep(X_shifted, 2, X_shifted[i, ])
      w      <- exp(-rowSums(diffs^2) / (2 * bandwidth^2))
      X_new[i, ] <- colSums(X_shifted * w) / sum(w)
    }
    shift <- max(abs(X_new - X_shifted))
    X_shifted <- X_new
    if (shift < tol) break
  }
  X_shifted
}

cat("Menjalankan Mean Shift (mungkin memerlukan beberapa menit)...\n")
## Menjalankan Mean Shift (mungkin memerlukan beberapa menit)...
ms_shifted <- mean_shift_iter(pca_sub, bandwidth = bw_ms * 2)

# Cluster berdasarkan kedekatan titik akhir (hierarchical cut)
dist_shifted <- dist(ms_shifted)
hc_ms        <- hclust(dist_shifted, method = "complete")
ms_cut_height <- quantile(hc_ms$height, 0.98)
ms_labels    <- cutree(hc_ms, h = ms_cut_height)

# Extend ke seluruh dataset
full_ms_labels <- integer(nrow(pca_df))
pca_full_mat   <- as.matrix(pca_df[, c("PC1","PC2")])
for (i in 1:nrow(pca_full_mat)) {
  dists <- apply(pca_sub, 1, function(p)
    sum((pca_full_mat[i, ] - p)^2))
  full_ms_labels[i] <- ms_labels[which.min(dists)]
}

n_ms_clusters <- length(unique(full_ms_labels))
cat(sprintf("Jumlah cluster Mean Shift : %d\n", n_ms_clusters))
## Jumlah cluster Mean Shift : 21
eval_cluster(as.matrix(pca_df[, c("PC1","PC2")]), full_ms_labels, "Mean Shift")
## Mean Shift           | k=21 | Avg Silhouette = 0.4860
pca_df$meanshift <- full_ms_labels
p4 <- plot_cluster(pca_df, full_ms_labels, "Mean Shift Clustering")


# ============================================================
# METODE 5 : FUZZY C-MEANS
# ============================================================
cat("\n===== METODE 5: FUZZY C-MEANS =====\n")
## 
## ===== METODE 5: FUZZY C-MEANS =====
set.seed(42)
fcm_result <- cmeans(data_scaled,
                     centers = k_opt,
                     iter.max = 100,
                     m = 2,          # fuzzifier (nilai 2 adalah standar)
                     method = "cmeans",
                     dist = "euclidean")

# Hard assignment: ambil cluster dengan keanggotaan tertinggi
fcm_clusters <- apply(fcm_result$membership, 1, which.max)

cat("Cluster sizes  :", table(fcm_clusters), "\n")
## Cluster sizes  : 449 1449 686
cat("Objective (within):", round(fcm_result$withinerror, 4), "\n")
## Objective (within): 3.4929
eval_cluster(data_scaled, fcm_clusters, "Fuzzy C-Means")
## Fuzzy C-Means        | k=3 | Avg Silhouette = 0.2857
# Tampilkan derajat keanggotaan rata-rata per cluster
cat("\nRata-rata membership per cluster:\n")
## 
## Rata-rata membership per cluster:
for (c in 1:k_opt) {
  avg_mem <- round(mean(fcm_result$membership[, c]), 4)
  cat(sprintf("  Cluster %d: %.4f\n", c, avg_mem))
}
##   Cluster 1: 0.1650
##   Cluster 2: 0.5381
##   Cluster 3: 0.2969
pca_df$fcm <- fcm_clusters
p5 <- plot_cluster(pca_df, fcm_clusters, "Fuzzy C-Means Clustering")


# ── 5. GABUNGKAN PLOT ───────────────────────────────────────
cat("\n===== MENYIMPAN VISUALISASI =====\n")
## 
## ===== MENYIMPAN VISUALISASI =====
png("clustering_results.png", width = 1600, height = 1100, res = 120)
grid.arrange(p1, p2, p3, p4, p5,
             ncol = 3,
             top = "Perbandingan 5 Metode Clustering — Seismic Bumps Dataset")
dev.off()
## png 
##   2
cat("✔ Plot disimpan ke: clustering_results.png\n")
## ✔ Plot disimpan ke: clustering_results.png
# ── 6. RINGKASAN EVALUASI ───────────────────────────────────
cat("\n")
cat("============================================================\n")
## ============================================================
cat("RINGKASAN EVALUASI SEMUA METODE\n")
## RINGKASAN EVALUASI SEMUA METODE
cat("============================================================\n")
## ============================================================
cat(sprintf("%-20s | %-8s | %s\n", "Metode", "k-cluster", "Avg Silhouette"))
## Metode               | k-cluster | Avg Silhouette
cat(rep("-", 50), "\n", sep = "")
## --------------------------------------------------
eval_summary <- function(data_mat, clusters, name) {
  valid <- clusters > 0
  k     <- length(unique(clusters[valid]))
  if (sum(valid) < 10 || k < 2) {
    cat(sprintf("%-20s | %-8s | %s\n", name, k, "N/A"))
    return(invisible(NULL))
  }
  sil_avg <- mean(silhouette(clusters[valid], dist(data_mat[valid, ]))[, 3])
  cat(sprintf("%-20s | %-8d | %.4f\n", name, k, sil_avg))
}

# Gunakan subset PCA untuk kecepatan silhouette
set.seed(1)
eval_idx <- sample(nrow(data_scaled), min(500, nrow(data_scaled)))
eval_data <- as.matrix(pca_df[eval_idx, c("PC1","PC2")])

eval_summary(eval_data, km_result$cluster[eval_idx],   "K-Means")
## K-Means              | 3        | 0.6939
eval_summary(eval_data, kmed_clusters[eval_idx],       "K-Medians")
## K-Medians            | 1        | N/A
eval_summary(eval_data, db_result$cluster[eval_idx],   "DBSCAN")
## DBSCAN               | 2        | 0.5666
eval_summary(eval_data, full_ms_labels[eval_idx],      "Mean Shift")
## Mean Shift           | 19       | 0.5010
eval_summary(eval_data, fcm_clusters[eval_idx],        "Fuzzy C-Means")
## Fuzzy C-Means        | 3        | 0.2868
cat("============================================================\n")
## ============================================================
cat("\n✔ Selesai! Semua metode clustering berhasil dijalankan.\n")
## 
## ✔ Selesai! Semua metode clustering berhasil dijalankan.

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this: