library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(skimr)
## Warning: package 'skimr' was built under R version 4.5.3
library(janitor)
## Warning: package 'janitor' was built under R version 4.5.3
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(knitr)
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.5.3
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.5.3
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.5.3
## Welcome to factoextra!
## Want to learn more? See two factoextra-related books at https://www.datanovia.com/en/product/practical-guide-to-principal-component-methods-in-r/
library(cluster)
## Warning: package 'cluster' was built under R version 4.5.3
library(dbscan)
## Warning: package 'dbscan' was built under R version 4.5.3
## 
## Attaching package: 'dbscan'
## 
## The following object is masked from 'package:stats':
## 
##     as.dendrogram
library(fpc)
## Warning: package 'fpc' was built under R version 4.5.3
## 
## Attaching package: 'fpc'
## 
## The following object is masked from 'package:dbscan':
## 
##     dbscan
library(LPCM)
## Warning: package 'LPCM' was built under R version 4.5.3
## 
## Attaching package: 'LPCM'
## 
## The following object is masked from 'package:lubridate':
## 
##     ms
library(dplyr)
library(e1071)
## Warning: package 'e1071' was built under R version 4.5.3
## 
## Attaching package: 'e1071'
## 
## The following object is masked from 'package:ggplot2':
## 
##     element

Tahap 1: Load Data & Inspeksi Awal

1.1 Load Dataset

df <- read_excel("Dry_Bean_Dataset.xlsx")

cat("Dataset berhasil dimuat\n")
## Dataset berhasil dimuat
cat("Jumlah baris  :", nrow(df), "\n")
## Jumlah baris  : 13611
cat("Jumlah kolom  :", ncol(df), "\n")
## Jumlah kolom  : 17
cat("Total sel     :", nrow(df) * ncol(df), "\n")
## Total sel     : 231387

1.2 Preview Data

df %>%
  head(10) %>%
  kable(caption = "10 Baris Pertama Dataset") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE,
                font_size = 12)
10 Baris Pertama Dataset
Area Perimeter MajorAxisLength MinorAxisLength AspectRation Eccentricity ConvexArea EquivDiameter Extent Solidity roundness Compactness ShapeFactor1 ShapeFactor2 ShapeFactor3 ShapeFactor4 Class
28395 610.291 208.1781 173.8887 1.197191 0.5498122 28715 190.1411 0.7639225 0.9888560 0.9580271 0.9133578 0.0073315 0.0031473 0.8342224 0.9987239 SEKER
28734 638.018 200.5248 182.7344 1.097357 0.4117853 29172 191.2728 0.7839681 0.9849856 0.8870336 0.9538608 0.0069787 0.0035636 0.9098505 0.9984303 SEKER
29380 624.110 212.8261 175.9311 1.209713 0.5627273 29690 193.4109 0.7781132 0.9895588 0.9478495 0.9087742 0.0072439 0.0030477 0.8258706 0.9990661 SEKER
30008 645.884 210.5580 182.5165 1.153638 0.4986160 30724 195.4671 0.7826813 0.9766957 0.9039364 0.9283288 0.0070167 0.0032146 0.8617944 0.9941988 SEKER
30140 620.134 201.8479 190.2793 1.060798 0.3336797 30417 195.8965 0.7730980 0.9908933 0.9848771 0.9705155 0.0066970 0.0036650 0.9419004 0.9991661 SEKER
30279 634.927 212.5606 181.5102 1.171067 0.5204007 30600 196.3477 0.7756885 0.9895098 0.9438518 0.9237260 0.0070201 0.0031528 0.8532696 0.9992358 SEKER
30477 670.033 211.0502 184.0391 1.146768 0.4894779 30970 196.9886 0.7624015 0.9840814 0.8530799 0.9333736 0.0069249 0.0032420 0.8711862 0.9990487 SEKER
30519 629.727 212.9968 182.7372 1.165590 0.5137596 30847 197.1243 0.7706818 0.9893669 0.9671092 0.9254804 0.0069792 0.0031583 0.8565140 0.9983446 SEKER
30685 635.681 213.5341 183.1571 1.165852 0.5140809 31044 197.6597 0.7715615 0.9884358 0.9542398 0.9256585 0.0069589 0.0031515 0.8568437 0.9989530 SEKER
30834 631.934 217.2278 180.8975 1.200834 0.5536422 31120 198.1390 0.7836828 0.9908098 0.9702782 0.9121254 0.0070451 0.0030080 0.8319728 0.9990611 SEKER

1.3 Tipe Data Tiap Kolom

tipe_data <- data.frame(
  Kolom    = names(df),
  Tipe     = sapply(df, class),
  row.names = NULL
)

tipe_data %>%
  kable(caption = "Tipe Data Tiap Kolom") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE)
Tipe Data Tiap Kolom
Kolom Tipe
Area numeric
Perimeter numeric
MajorAxisLength numeric
MinorAxisLength numeric
AspectRation numeric
Eccentricity numeric
ConvexArea numeric
EquivDiameter numeric
Extent numeric
Solidity numeric
roundness numeric
Compactness numeric
ShapeFactor1 numeric
ShapeFactor2 numeric
ShapeFactor3 numeric
ShapeFactor4 numeric
Class character

1.4 Statistik Deskriptif

df %>%
  select(-Class) %>%
  summary() %>%
  kable(caption = "Statistik Deskriptif Fitur Numerik") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                font_size = 11)
Statistik Deskriptif Fitur Numerik
Area Perimeter MajorAxisLength MinorAxisLength AspectRation Eccentricity ConvexArea EquivDiameter Extent Solidity roundness Compactness ShapeFactor1 ShapeFactor2 ShapeFactor3 ShapeFactor4
Min. : 20420 Min. : 524.7 Min. :183.6 Min. :122.5 Min. :1.025 Min. :0.2190 Min. : 20684 Min. :161.2 Min. :0.5553 Min. :0.9192 Min. :0.4896 Min. :0.6406 Min. :0.002778 Min. :0.0005642 Min. :0.4103 Min. :0.9477
1st Qu.: 36328 1st Qu.: 703.5 1st Qu.:253.3 1st Qu.:175.8 1st Qu.:1.432 1st Qu.:0.7159 1st Qu.: 36715 1st Qu.:215.1 1st Qu.:0.7186 1st Qu.:0.9857 1st Qu.:0.8321 1st Qu.:0.7625 1st Qu.:0.005900 1st Qu.:0.0011535 1st Qu.:0.5814 1st Qu.:0.9937
Median : 44652 Median : 794.9 Median :296.9 Median :192.4 Median :1.551 Median :0.7644 Median : 45178 Median :238.4 Median :0.7599 Median :0.9883 Median :0.8832 Median :0.8013 Median :0.006645 Median :0.0016935 Median :0.6420 Median :0.9964
Mean : 53048 Mean : 855.3 Mean :320.1 Mean :202.3 Mean :1.583 Mean :0.7509 Mean : 53768 Mean :253.1 Mean :0.7497 Mean :0.9871 Mean :0.8733 Mean :0.7999 Mean :0.006564 Mean :0.0017159 Mean :0.6436 Mean :0.9951
3rd Qu.: 61332 3rd Qu.: 977.2 3rd Qu.:376.5 3rd Qu.:217.0 3rd Qu.:1.707 3rd Qu.:0.8105 3rd Qu.: 62294 3rd Qu.:279.4 3rd Qu.:0.7869 3rd Qu.:0.9900 3rd Qu.:0.9169 3rd Qu.:0.8343 3rd Qu.:0.007271 3rd Qu.:0.0021703 3rd Qu.:0.6960 3rd Qu.:0.9979
Max. :254616 Max. :1985.4 Max. :738.9 Max. :460.2 Max. :2.430 Max. :0.9114 Max. :263261 Max. :569.4 Max. :0.8662 Max. :0.9947 Max. :0.9907 Max. :0.9873 Max. :0.010451 Max. :0.0036650 Max. :0.9748 Max. :0.9997
skim(df)
Data summary
Name df
Number of rows 13611
Number of columns 17
_______________________
Column type frequency:
character 1
numeric 16
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Class 0 1 4 8 0 7 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Area 0 1 53048.28 29324.10 20420.00 36328.00 44652.00 61332.00 254616.00 ▇▂▁▁▁
Perimeter 0 1 855.28 214.29 524.74 703.52 794.94 977.21 1985.37 ▇▆▁▁▁
MajorAxisLength 0 1 320.14 85.69 183.60 253.30 296.88 376.50 738.86 ▇▆▂▁▁
MinorAxisLength 0 1 202.27 44.97 122.51 175.85 192.43 217.03 460.20 ▇▇▁▁▁
AspectRation 0 1 1.58 0.25 1.02 1.43 1.55 1.71 2.43 ▂▇▅▂▁
Eccentricity 0 1 0.75 0.09 0.22 0.72 0.76 0.81 0.91 ▁▁▂▇▇
ConvexArea 0 1 53768.20 29774.92 20684.00 36714.50 45178.00 62294.00 263261.00 ▇▂▁▁▁
EquivDiameter 0 1 253.06 59.18 161.24 215.07 238.44 279.45 569.37 ▇▆▁▁▁
Extent 0 1 0.75 0.05 0.56 0.72 0.76 0.79 0.87 ▁▁▅▇▂
Solidity 0 1 0.99 0.00 0.92 0.99 0.99 0.99 0.99 ▁▁▁▁▇
roundness 0 1 0.87 0.06 0.49 0.83 0.88 0.92 0.99 ▁▁▂▇▇
Compactness 0 1 0.80 0.06 0.64 0.76 0.80 0.83 0.99 ▂▅▇▂▁
ShapeFactor1 0 1 0.01 0.00 0.00 0.01 0.01 0.01 0.01 ▁▃▇▃▁
ShapeFactor2 0 1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ▇▇▇▃▁
ShapeFactor3 0 1 0.64 0.10 0.41 0.58 0.64 0.70 0.97 ▂▇▇▃▁
ShapeFactor4 0 1 1.00 0.00 0.95 0.99 1.00 1.00 1.00 ▁▁▁▁▇

1.5 Jumlah Nilai Unik Tiap Kolom

nilai_unik <- data.frame(
  Kolom       = names(df),
  Nilai_Unik  = sapply(df, n_distinct),
  row.names   = NULL
)

nilai_unik %>%
  kable(caption = "Jumlah Nilai Unik per Kolom") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE)
Jumlah Nilai Unik per Kolom
Kolom Nilai_Unik
Area 12011
Perimeter 13416
MajorAxisLength 13543
MinorAxisLength 13543
AspectRation 13543
Eccentricity 13543
ConvexArea 12066
EquivDiameter 12011
Extent 13535
Solidity 13526
roundness 13543
Compactness 13543
ShapeFactor1 13543
ShapeFactor2 13543
ShapeFactor3 13543
ShapeFactor4 13543
Class 7

1.6 Distribusi Kelas

dist_kelas <- df %>%
  count(Class, name = "Jumlah") %>%
  arrange(desc(Jumlah)) %>%
  mutate(Persentase = percent(Jumlah / sum(Jumlah), accuracy = 0.1))

dist_kelas %>%
  kable(caption = "Distribusi Data per Kelas") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")
Distribusi Data per Kelas
Class Jumlah Persentase
DERMASON 3546 26.1%
SIRA 2636 19.4%
SEKER 2027 14.9%
HOROZ 1928 14.2%
CALI 1630 12.0%
BARBUNYA 1322 9.7%
BOMBAY 522 3.8%
warna <- c("#2ecc71","#3498db","#e74c3c","#f39c12","#9b59b6","#1abc9c","#e67e22")

# Bar chart
p1 <- ggplot(dist_kelas, aes(x = reorder(Class, -Jumlah),
                             y = Jumlah, fill = Class)) +
  geom_col(width = 0.7, show.legend = FALSE) +
  geom_text(aes(label = paste0(Jumlah, "\n", Persentase)),
            vjust = -0.3, size = 3.5, fontface = "bold") +
  scale_fill_manual(values = warna) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.15))) +
  labs(title = "Distribusi Jumlah Data per Kelas",
       x = "Kelas", y = "Jumlah") +
  theme_minimal(base_size = 13) +
  theme(plot.title   = element_text(face = "bold", hjust = 0.5),
        axis.text.x  = element_text(face = "bold"))

# Pie chart
p2 <- ggplot(dist_kelas, aes(x = "", y = Jumlah, fill = Class)) +
  geom_col(width = 1, color = "white", linewidth = 0.5) +
  coord_polar(theta = "y") +
  geom_text(aes(label = paste0(Class, "\n", Persentase)),
            position = position_stack(vjust = 0.5),
            size = 3.5, fontface = "bold", color = "white") +
  scale_fill_manual(values = warna) +
  labs(title = "Proporsi Kelas") +
  theme_void(base_size = 13) +
  theme(plot.title  = element_text(face = "bold", hjust = 0.5),
        legend.position = "none")

gridExtra::grid.arrange(p1, p2, ncol = 2)

Tahap 2: Cek & Tangani Missing Values

2.1 Cek Missing Values

missing_df <- data.frame(
  Kolom          = names(df),
  Missing_Count  = sapply(df, function(x) sum(is.na(x))),
  Missing_Persen = sapply(df, function(x) 
    percent(mean(is.na(x)), accuracy = 0.01)),
  row.names = NULL
)

missing_df %>%
  kable(caption = "Ringkasan Missing Values per Kolom") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  row_spec(which(missing_df$Missing_Count > 0),
           background = "#fde8e8", color = "#c0392b")
Ringkasan Missing Values per Kolom
Kolom Missing_Count Missing_Persen
Area 0 0.00%
Perimeter 0 0.00%
MajorAxisLength 0 0.00%
MinorAxisLength 0 0.00%
AspectRation 0 0.00%
Eccentricity 0 0.00%
ConvexArea 0 0.00%
EquivDiameter 0 0.00%
Extent 0 0.00%
Solidity 0 0.00%
roundness 0 0.00%
Compactness 0 0.00%
ShapeFactor1 0 0.00%
ShapeFactor2 0 0.00%
ShapeFactor3 0 0.00%
ShapeFactor4 0 0.00%
Class 0 0.00%
cat("\nTotal missing values:", sum(is.na(df)), "\n")
## 
## Total missing values: 0

2.2 Cek Data Duplikat

n_duplikat <- sum(duplicated(df))
cat("Jumlah baris duplikat:", n_duplikat, "\n")
## Jumlah baris duplikat: 68
if (n_duplikat > 0) {
  df <- df %>% distinct()
  cat("Duplikat dihapus. Shape baru:", nrow(df), "x", ncol(df), "\n")
} else {
  cat("Tidak ada duplikat\n")
}
## Duplikat dihapus. Shape baru: 13543 x 17

2.3 Kesimpulan Kualitas Data

kesimpulan <- data.frame(
  Aspek      = c("Shape awal", "Total missing values",
                 "Data duplikat", "Jumlah kelas", "Fitur numerik"),
  Hasil      = c(
    paste0(nrow(df), " baris × ", ncol(df), " kolom"),
    paste0(sum(is.na(df)), " (tidak ada)"),
    paste0(n_duplikat,     " (tidak ada)"),
    paste0(n_distinct(df$Class), " kelas"),
    paste0(ncol(select(df, -Class)), " kolom")
  ),
  Status     = c("oke", "oke", "oke", "oke", "oke")
)

kesimpulan %>%
  kable(caption = "Kesimpulan Kualitas Data") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
  column_spec(3, color = "green", bold = TRUE)
Kesimpulan Kualitas Data
Aspek Hasil Status
Shape awal 13543 baris × 17 kolom oke
Total missing values 0 (tidak ada) oke
Data duplikat 68 (tidak ada) oke
Jumlah kelas 7 kelas oke
Fitur numerik 16 kolom oke

#3. Deteksi & Tangani Outlier ##3.1 Setup: Pisahkan Fitur Numerik

df_numerik <- df %>% select(-Class)
fitur_numerik <- names(df_numerik)

cat("Jumlah fitur numerik:", length(fitur_numerik), "\n")
## Jumlah fitur numerik: 16
cat("Fitur:", paste(fitur_numerik, collapse = ", "), "\n")
## Fitur: Area, Perimeter, MajorAxisLength, MinorAxisLength, AspectRation, Eccentricity, ConvexArea, EquivDiameter, Extent, Solidity, roundness, Compactness, ShapeFactor1, ShapeFactor2, ShapeFactor3, ShapeFactor4

3.2 Deteksi Outlier: Metode IQR (Boxplot Rule)

iqr_summary <- df_numerik %>%
  pivot_longer(everything(), names_to = "Fitur", values_to = "Nilai") %>%
  group_by(Fitur) %>%
  summarise(
    Q1          = quantile(Nilai, 0.25),
    Q3          = quantile(Nilai, 0.75),
    IQR         = Q3 - Q1,
    Lower_Bound = Q1 - 1.5 * IQR,
    Upper_Bound = Q3 + 1.5 * IQR,
    N_Outlier   = sum(Nilai < (Q1 - 1.5 * IQR) | Nilai > (Q3 + 1.5 * IQR)),
    Pct_Outlier = round(N_Outlier / n() * 100, 2),
    .groups = "drop"
  ) %>%
  arrange(desc(N_Outlier))

iqr_summary %>%
  kable(
    caption = "Deteksi Outlier per Fitur (IQR)",
    digits  = 4,
    col.names = c("Fitur", "Q1", "Q3", "IQR",
                  "Batas Bawah", "Batas Atas",
                  "N Outlier", "% Outlier")
  ) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE,
    font_size  = 12
  ) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
  row_spec(
    which(iqr_summary$Pct_Outlier >= 5),
    background = "#fde8e8", color = "#c0392b"
  )
Deteksi Outlier per Fitur (IQR)
Fitur Q1 Q3 IQR Batas Bawah Batas Atas N Outlier % Outlier
Eccentricity 0.7151 0.8097 0.0945 0.5734 0.9515 833 6.15
Solidity 0.9857 0.9900 0.0043 0.9792 0.9965 774 5.72
ShapeFactor4 0.9937 0.9979 0.0042 0.9875 1.0041 760 5.61
MinorAxisLength 175.8864 217.2454 41.3590 113.8478 279.2840 567 4.19
Area 36282.5000 61382.0000 25099.5000 -1366.7500 99031.2500 551 4.07
ConvexArea 36673.0000 62360.0000 25687.0000 -1857.5000 100890.5000 549 4.05
ShapeFactor1 0.0059 0.0073 0.0014 0.0038 0.0093 533 3.94
EquivDiameter 214.9333 279.5604 64.6271 117.9927 376.5010 526 3.88
Perimeter 703.2300 977.1465 273.9165 292.3553 1388.0212 500 3.69
AspectRation 1.4307 1.7039 0.2733 1.0208 2.1138 485 3.58
MajorAxisLength 253.0868 376.3125 123.2257 68.2483 561.1510 379 2.80
Extent 0.7187 0.7868 0.0681 0.6166 0.8890 271 2.00
ShapeFactor3 0.5825 0.6963 0.1138 0.4118 0.8671 202 1.49
Compactness 0.7632 0.8345 0.0712 0.6564 0.9413 124 0.92
roundness 0.8334 0.9170 0.0836 0.7080 1.0425 98 0.72
ShapeFactor2 0.0012 0.0022 0.0010 -0.0004 0.0037 0 0.00

3.3 Deteksi Outlier: Metode Z-Score

zscore_summary <- df_numerik %>%
  pivot_longer(everything(), names_to = "Fitur", values_to = "Nilai") %>%
  group_by(Fitur) %>%
  summarise(
    N_Outlier_Z   = sum(abs(scale(Nilai)) > 3),
    Pct_Outlier_Z = round(N_Outlier_Z / n() * 100, 2),
    .groups = "drop"
  ) %>%
  arrange(desc(N_Outlier_Z))

zscore_summary %>%
  kable(
    caption   = "Deteksi Outlier per Fitur (Z-score)",
    col.names = c("Fitur", "N Outlier", "% Outlier")
  ) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE,
    font_size  = 12
  ) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")
Deteksi Outlier per Fitur (Z-score)
Fitur N Outlier % Outlier
MinorAxisLength 506 3.74
Area 483 3.57
ConvexArea 482 3.56
EquivDiameter 465 3.43
Perimeter 404 2.98
MajorAxisLength 316 2.33
ShapeFactor4 239 1.76
Solidity 237 1.75
Extent 139 1.03
Eccentricity 125 0.92
roundness 75 0.55
ShapeFactor1 53 0.39
AspectRation 21 0.16
ShapeFactor3 8 0.06
ShapeFactor2 5 0.04
Compactness 1 0.01

3.4 Ringkasan Level Baris

outlier_mask <- df_numerik

for (col in fitur_numerik) {
  q1  <- quantile(df_numerik[[col]], 0.25)
  q3  <- quantile(df_numerik[[col]], 0.75)
  iqr <- q3 - q1
  outlier_mask[[col]] <- df_numerik[[col]] < (q1 - 1.5 * iqr) |
    df_numerik[[col]] > (q3 + 1.5 * iqr)
}

baris_outlier <- rowSums(outlier_mask) > 0
n_baris_outlier <- sum(baris_outlier)
n_baris_bersih  <- nrow(df) - n_baris_outlier

ringkasan_baris <- data.frame(
  Kondisi    = c("Baris dengan outlier (≥1 fitur)", "Baris bersih", "Total"),
  Jumlah     = c(n_baris_outlier, n_baris_bersih, nrow(df)),
  Persentase = percent(
    c(n_baris_outlier, n_baris_bersih, nrow(df)) / nrow(df),
    accuracy = 0.01
  )
)

ringkasan_baris %>%
  kable(caption = "Ringkasan Outlier Level Baris (IQR)") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover"),
    full_width = FALSE
  ) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
  row_spec(1, background = "#fde8e8", color = "#c0392b") %>%
  row_spec(2, background = "#eafaf1", color = "#1e8449")
Ringkasan Outlier Level Baris (IQR)
Kondisi Jumlah Persentase
Baris dengan outlier (≥1 fitur) 3004 22.18%
Baris bersih 10539 77.82%
Total 13543 100.00%

3.5 Visualisasi Boxplot per Fitur

warna_box <- c("#2ecc71","#3498db","#e74c3c","#f39c12","#9b59b6",
               "#1abc9c","#e67e22","#34495e","#16a085","#8e44ad",
               "#2980b9","#c0392b","#27ae60","#d35400","#7f8c8d","#2c3e50")

df_long <- df_numerik %>%
  pivot_longer(everything(), names_to = "Fitur", values_to = "Nilai") %>%
  mutate(Fitur = factor(Fitur, levels = fitur_numerik))

ggplot(df_long, aes(x = Fitur, y = Nilai, fill = Fitur)) +
  geom_boxplot(
    outlier.shape  = 16,
    outlier.size   = 0.8,
    outlier.alpha  = 0.4,
    outlier.colour = "#e74c3c",
    show.legend    = FALSE
  ) +
  scale_fill_manual(values = warna_box) +
  facet_wrap(~Fitur, scales = "free", ncol = 4) +
  labs(
    title    = "Boxplot Tiap Fitur",
    subtitle = "Skala bebas per fitur (free scale)",
    x = NULL, y = "Nilai"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    plot.title    = element_text(face = "bold", hjust = 0.5, size = 14),
    plot.subtitle = element_text(hjust = 0.5, color = "gray50"),
    axis.text.x   = element_blank(),
    axis.ticks.x  = element_blank(),
    strip.text    = element_text(face = "bold", size = 10)
  )

3.6 Visualisasi: Bar Chart % Outlier per Fitur

iqr_plot <- iqr_summary %>%
  mutate(
    Fitur = reorder(Fitur, Pct_Outlier),
    Warna = ifelse(Pct_Outlier >= 5, "#e74c3c", "#3498db")
  )

ggplot(iqr_plot, aes(x = Fitur, y = Pct_Outlier, fill = Warna)) +
  geom_col(width = 0.7, show.legend = FALSE) +
  geom_hline(yintercept = 5, linetype = "dashed",
             color = "#f39c12", linewidth = 1) +
  geom_text(aes(label = paste0(Pct_Outlier, "%")),
            hjust = -0.15, size = 3.5, fontface = "bold") +
  scale_fill_identity() +
  scale_y_continuous(
    expand = expansion(mult = c(0, 0.2)),
    labels = function(x) paste0(x, "%")
  ) +
  coord_flip() +
  annotate("text", x = 1, y = 5.3,
           label = "Batas 5%", color = "#f39c12",
           size = 3.5, fontface = "italic") +
  labs(
    title    = "Persentase Outlier per Fitur (Metode IQR)",
    subtitle = "Merah = outlier ≥ 5% | Biru = outlier < 5%",
    x = "Fitur", y = "% Outlier"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title    = element_text(face = "bold", hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5, color = "gray50"),
    panel.grid.major.y = element_blank()
  )

3.7 Penanganan Outlier: Winsorizing (Capping IQR)

df_clean <- df  

for (col in fitur_numerik) {
  q1  <- quantile(df_clean[[col]], 0.25)
  q3  <- quantile(df_clean[[col]], 0.75)
  iqr <- q3 - q1
  lower <- q1 - 1.5 * iqr
  upper <- q3 + 1.5 * iqr
  
  df_clean[[col]] <- pmin(pmax(df_clean[[col]], lower), upper)
}

cat("Winsorizing selesai!\n")
## Winsorizing selesai!
cat("Shape df_clean:", nrow(df_clean), "x", ncol(df_clean), "\n")
## Shape df_clean: 13543 x 17
cat("Kolom Class tetap ada:", "Class" %in% names(df_clean), "\n")
## Kolom Class tetap ada: TRUE
outlier_setelah <- df_clean %>%
  select(-Class) %>%
  pivot_longer(everything(), names_to = "Fitur", values_to = "Nilai") %>%
  group_by(Fitur) %>%
  summarise(
    N_Outlier   = sum(Nilai < (quantile(Nilai, 0.25) - 1.5 * IQR(Nilai)) |
                        Nilai > (quantile(Nilai, 0.75) + 1.5 * IQR(Nilai))),
    Pct_Outlier = round(N_Outlier / n() * 100, 2),
    .groups = "drop"
  )

outlier_setelah %>%
  kable(
    caption   = "Verifikasi Outlier Setelah Winsorizing",
    col.names = c("Fitur", "N Outlier", "% Outlier")
  ) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE,
    font_size  = 12
  ) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
  row_spec(
    which(outlier_setelah$N_Outlier == 0),
    background = "#eafaf1", color = "#1e8449"
  )
Verifikasi Outlier Setelah Winsorizing
Fitur N Outlier % Outlier
Area 0 0
AspectRation 0 0
Compactness 0 0
ConvexArea 0 0
Eccentricity 0 0
EquivDiameter 0 0
Extent 0 0
MajorAxisLength 0 0
MinorAxisLength 0 0
Perimeter 0 0
ShapeFactor1 0 0
ShapeFactor2 0 0
ShapeFactor3 0 0
ShapeFactor4 0 0
Solidity 0 0
roundness 0 0

3.8 Kesimpulan Tahap 3

fitur_tinggi <- iqr_summary %>%
  filter(Pct_Outlier >= 5) %>%
  pull(Fitur)

kesimpulan_3 <- data.frame(
  Aspek  = c(
    "Metode deteksi",
    "Fitur dengan outlier tertinggi",
    "Baris terdeteksi outlier (IQR)",
    "Metode penanganan",
    "Shape data setelah penanganan",
    "Status"
  ),
  Hasil  = c(
    "IQR (1.5×IQR) & Z-Score (|z| > 3)",
    paste(fitur_tinggi, collapse = ", "),
    paste0(n_baris_outlier, " baris (", round(n_baris_outlier/nrow(df)*100,2), "%)"),
    "Winsorizing / Capping ke batas IQR",
    paste0(nrow(df_clean), " baris × ", ncol(df_clean), " kolom"),
    "Selesai — lanjut ke Tahap 4 EDA"
  ),
  Status = c("info", "perlu cek", "perlu cek", "wajib", "oke", "oke")
)

kesimpulan_3 %>%
  kable(caption = "Kesimpulan Tahap 3: Deteksi & Penanganan Outlier") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover"),
    full_width = FALSE
  ) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
  row_spec(which(kesimpulan_3$Status == "oke"),
           background = "#eafaf1", color = "#1e8449") %>%
  row_spec(which(kesimpulan_3$Status == "perlu cek"),
           background = "#fef9e7", color = "#b7770d") %>%
  column_spec(3, bold = TRUE)
Kesimpulan Tahap 3: Deteksi & Penanganan Outlier
Aspek Hasil Status
Metode deteksi IQR (1.5×IQR) & Z-Score (&#124;z&#124; > 3) info
Fitur dengan outlier tertinggi Eccentricity, Solidity, ShapeFactor4 perlu cek
Baris terdeteksi outlier (IQR) 3004 baris (22.18%) perlu cek
Metode penanganan Winsorizing / Capping ke batas IQR wajib
Shape data setelah penanganan 13543 baris × 17 kolom oke
Status Selesai — lanjut ke Tahap 4 EDA oke

Tahap 4: Exploratory Data Analysis (EDA)

4.1 Setup: Siapkan Data Numerik Bersih

df_numerik_clean <- df_clean %>% select(-Class)
fitur_numerik    <- names(df_numerik_clean)

cat("Jumlah fitur numerik:", length(fitur_numerik), "\n")
## Jumlah fitur numerik: 16
cat("Fitur:", paste(fitur_numerik, collapse = ", "), "\n")
## Fitur: Area, Perimeter, MajorAxisLength, MinorAxisLength, AspectRation, Eccentricity, ConvexArea, EquivDiameter, Extent, Solidity, roundness, Compactness, ShapeFactor1, ShapeFactor2, ShapeFactor3, ShapeFactor4

4.2 Statistik Deskriptif (Setelah Winsorizing)

stat_desc <- df_numerik_clean %>%
  pivot_longer(everything(), names_to = "Fitur", values_to = "Nilai") %>%
  group_by(Fitur) %>%
  summarise(
    Min    = round(min(Nilai), 2),
    Q1     = round(quantile(Nilai, 0.25), 2),
    Median = round(median(Nilai), 2),
    Mean   = round(mean(Nilai), 2),
    Q3     = round(quantile(Nilai, 0.75), 2),
    Max    = round(max(Nilai), 2),
    SD     = round(sd(Nilai), 2),
    .groups = "drop"
  )

stat_desc %>%
  kable(caption = "Statistik Deskriptif Fitur Numerik (Setelah Winsorizing)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE, font_size = 12) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")
Statistik Deskriptif Fitur Numerik (Setelah Winsorizing)
Fitur Min Q1 Median Mean Q3 Max SD
Area 20420.00 36282.50 44580.00 50166.95 61382.00 99031.25 18864.23
AspectRation 1.02 1.43 1.55 1.58 1.70 2.11 0.24
Compactness 0.66 0.76 0.80 0.80 0.83 0.94 0.06
ConvexArea 20684.00 36673.00 45122.00 50869.02 62360.00 100890.50 19253.93
Eccentricity 0.57 0.72 0.76 0.75 0.81 0.91 0.08
EquivDiameter 161.24 214.93 238.25 249.47 279.56 376.50 47.51
Extent 0.62 0.72 0.76 0.75 0.79 0.87 0.05
MajorAxisLength 183.60 253.09 296.40 318.31 376.31 561.15 80.41
MinorAxisLength 122.51 175.89 192.49 198.67 217.25 279.28 32.83
Perimeter 524.74 703.23 793.90 847.33 977.15 1388.02 189.36
ShapeFactor1 0.00 0.01 0.01 0.01 0.01 0.01 0.00
ShapeFactor2 0.00 0.00 0.00 0.00 0.00 0.00 0.00
ShapeFactor3 0.41 0.58 0.64 0.64 0.70 0.87 0.10
ShapeFactor4 0.99 0.99 1.00 1.00 1.00 1.00 0.00
Solidity 0.98 0.99 0.99 0.99 0.99 0.99 0.00
roundness 0.71 0.83 0.88 0.87 0.92 0.99 0.06

4.3 Distribusi Tiap Fitur (Histogram)

warna_hist <- c("#2ecc71","#3498db","#e74c3c","#f39c12","#9b59b6",
                "#1abc9c","#e67e22","#34495e","#16a085","#8e44ad",
                "#2980b9","#c0392b","#27ae60","#d35400","#7f8c8d","#2c3e50")

df_long_clean <- df_numerik_clean %>%
  pivot_longer(everything(), names_to = "Fitur", values_to = "Nilai") %>%
  mutate(Fitur = factor(Fitur, levels = fitur_numerik))

ggplot(df_long_clean, aes(x = Nilai, fill = Fitur)) +
  geom_histogram(bins = 40, color = "white", alpha = 0.85, show.legend = FALSE) +
  geom_freqpoly(bins = 40, color = "black", linewidth = 0.5, show.legend = FALSE) +
  scale_fill_manual(values = warna_hist) +
  facet_wrap(~Fitur, scales = "free", ncol = 4) +
  labs(
    title    = "Distribusi Tiap Fitur Numerik (Setelah Winsorizing)",
    subtitle = "Skala bebas per fitur",
    x = "Nilai", y = "Frekuensi"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    plot.title    = element_text(face = "bold", hjust = 0.5, size = 14),
    plot.subtitle = element_text(hjust = 0.5, color = "gray50"),
    strip.text    = element_text(face = "bold", size = 10)
  )

4.4 Distribusi Fitur per Kelas

df_long_kelas <- df_clean %>%
  pivot_longer(-Class, names_to = "Fitur", values_to = "Nilai") %>%
  mutate(Fitur = factor(Fitur, levels = fitur_numerik))

warna_kelas <- c("#2ecc71","#3498db","#e74c3c","#f39c12",
                 "#9b59b6","#1abc9c","#e67e22")

ggplot(df_long_kelas, aes(x = Class, y = Nilai, fill = Class)) +
  geom_violin(trim = TRUE, alpha = 0.7, show.legend = FALSE) +
  geom_boxplot(width = 0.15, fill = "white",
               outlier.shape = NA, show.legend = FALSE) +
  scale_fill_manual(values = warna_kelas) +
  facet_wrap(~Fitur, scales = "free_y", ncol = 4) +
  labs(
    title    = "Distribusi Fitur per Kelas (Violin dan Boxplot)",
    subtitle = "Distribusi nilai fitur untuk tiap kelas kacang",
    x = "Kelas", y = "Nilai"
  ) +
  theme_minimal(base_size = 10) +
  theme(
    plot.title    = element_text(face = "bold", hjust = 0.5, size = 13),
    plot.subtitle = element_text(hjust = 0.5, color = "gray50"),
    strip.text    = element_text(face = "bold", size = 9),
    axis.text.x   = element_text(angle = 30, hjust = 1, size = 8)
  )

4.5 Matriks Korelasi Antar Fitur

cor_matrix <- cor(df_numerik_clean, method = "pearson")

cor_matrix %>%
  round(2) %>%
  kable(caption = "Matriks Korelasi Pearson Antar Fitur") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = TRUE, font_size = 10) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")
Matriks Korelasi Pearson Antar Fitur
Area Perimeter MajorAxisLength MinorAxisLength AspectRation Eccentricity ConvexArea EquivDiameter Extent Solidity roundness Compactness ShapeFactor1 ShapeFactor2 ShapeFactor3 ShapeFactor4
Area 1.00 0.99 0.95 0.92 0.37 0.39 1.00 1.00 0.00 -0.32 -0.54 -0.39 -0.89 -0.77 -0.39 -0.50
Perimeter 0.99 1.00 0.97 0.88 0.44 0.44 0.99 0.99 -0.04 -0.37 -0.62 -0.45 -0.86 -0.81 -0.45 -0.52
MajorAxisLength 0.95 0.97 1.00 0.77 0.59 0.58 0.95 0.96 -0.09 -0.33 -0.64 -0.60 -0.76 -0.88 -0.60 -0.57
MinorAxisLength 0.92 0.88 0.77 1.00 0.00 0.02 0.92 0.92 0.14 -0.23 -0.28 -0.01 -0.99 -0.49 -0.02 -0.33
AspectRation 0.37 0.44 0.59 0.00 1.00 0.96 0.37 0.38 -0.36 -0.30 -0.78 -0.99 0.01 -0.84 -0.98 -0.52
Eccentricity 0.39 0.44 0.58 0.02 0.96 1.00 0.39 0.39 -0.33 -0.34 -0.75 -0.98 0.01 -0.87 -0.99 -0.54
ConvexArea 1.00 0.99 0.95 0.92 0.37 0.39 1.00 1.00 0.00 -0.33 -0.54 -0.39 -0.89 -0.77 -0.40 -0.51
EquivDiameter 1.00 0.99 0.96 0.92 0.38 0.39 1.00 1.00 0.00 -0.31 -0.54 -0.40 -0.90 -0.78 -0.40 -0.50
Extent 0.00 -0.04 -0.09 0.14 -0.36 -0.33 0.00 0.00 1.00 0.21 0.34 0.35 -0.14 0.23 0.34 0.15
Solidity -0.32 -0.37 -0.33 -0.23 -0.30 -0.34 -0.33 -0.31 0.21 1.00 0.65 0.33 0.18 0.38 0.34 0.60
roundness -0.54 -0.62 -0.64 -0.28 -0.78 -0.75 -0.54 -0.54 0.34 0.65 1.00 0.78 0.25 0.79 0.77 0.53
Compactness -0.39 -0.45 -0.60 -0.01 -0.99 -0.98 -0.39 -0.40 0.35 0.33 0.78 1.00 -0.01 0.87 1.00 0.55
ShapeFactor1 -0.89 -0.86 -0.76 -0.99 0.01 0.01 -0.89 -0.90 -0.14 0.18 0.25 -0.01 1.00 0.47 -0.01 0.30
ShapeFactor2 -0.77 -0.81 -0.88 -0.49 -0.84 -0.87 -0.77 -0.78 0.23 0.38 0.79 0.87 0.47 1.00 0.87 0.61
ShapeFactor3 -0.39 -0.45 -0.60 -0.02 -0.98 -0.99 -0.40 -0.40 0.34 0.34 0.77 1.00 -0.01 0.87 1.00 0.55
ShapeFactor4 -0.50 -0.52 -0.57 -0.33 -0.52 -0.54 -0.51 -0.50 0.15 0.60 0.53 0.55 0.30 0.61 0.55 1.00

4.6 Visualisasi Heatmap Korelasi

ggcorrplot(
  cor_matrix,
  method     = "square",
  type       = "lower",
  lab        = TRUE,
  lab_size   = 3,
  colors     = c("#e74c3c", "white", "#2ecc71"),
  title      = "Heatmap Korelasi Antar Fitur (Pearson)",
  ggtheme    = theme_minimal(base_size = 11),
  tl.cex     = 10,
  outline.col = "white"
) +
  theme(
    plot.title      = element_text(face = "bold", hjust = 0.5, size = 13),
    legend.position = "right"
  )
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the ggcorrplot package.
##   Please report the issue at <https://github.com/kassambara/ggcorrplot/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

4.7 Identifikasi Pasangan Fitur Berkorelasi Tinggi

cor_df <- as.data.frame(as.table(cor_matrix)) %>%
  rename(Fitur_1 = Var1, Fitur_2 = Var2, Korelasi = Freq) %>%
  filter(as.character(Fitur_1) < as.character(Fitur_2)) %>%
  mutate(Korelasi = round(Korelasi, 4),
         Abs_Kor  = abs(Korelasi)) %>%
  arrange(desc(Abs_Kor))

korelasi_tinggi <- cor_df %>% filter(Abs_Kor >= 0.8)

korelasi_tinggi %>%
  select(Fitur_1, Fitur_2, Korelasi) %>%
  kable(caption = "Pasangan Fitur dengan Korelasi Tinggi (|r| ≥ 0.80)") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE, font_size = 12) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
  row_spec(which(abs(korelasi_tinggi$Korelasi) >= 0.95),
           background = "#fde8e8", color = "#c0392b") %>%
  row_spec(which(abs(korelasi_tinggi$Korelasi) >= 0.80 &
                   abs(korelasi_tinggi$Korelasi) < 0.95),
           background = "#fef9e7", color = "#b7770d")
Pasangan Fitur dengan Korelasi Tinggi (|r| ≥ 0.80)
Fitur_1 Fitur_2 Korelasi
Area ConvexArea 0.9999
Compactness ShapeFactor3 0.9989
Area EquivDiameter 0.9972
ConvexArea EquivDiameter 0.9970
AspectRation Compactness -0.9909
Eccentricity ShapeFactor3 -0.9907
EquivDiameter Perimeter 0.9905
MinorAxisLength ShapeFactor1 -0.9877
ConvexArea Perimeter 0.9874
Area Perimeter 0.9866
AspectRation ShapeFactor3 -0.9842
Compactness Eccentricity -0.9840
MajorAxisLength Perimeter 0.9746
EquivDiameter MajorAxisLength 0.9589
AspectRation Eccentricity 0.9560
ConvexArea MajorAxisLength 0.9529
Area MajorAxisLength 0.9526
EquivDiameter MinorAxisLength 0.9175
Area MinorAxisLength 0.9172
ConvexArea MinorAxisLength 0.9168
EquivDiameter ShapeFactor1 -0.9013
Area ShapeFactor1 -0.8913
ConvexArea ShapeFactor1 -0.8904
MajorAxisLength ShapeFactor2 -0.8844
MinorAxisLength Perimeter 0.8784
ShapeFactor2 ShapeFactor3 0.8715
Compactness ShapeFactor2 0.8679
Eccentricity ShapeFactor2 -0.8662
Perimeter ShapeFactor1 -0.8602
AspectRation ShapeFactor2 -0.8444
Perimeter ShapeFactor2 -0.8079
cat("\nTotal pasangan berkorelasi tinggi (|r| ≥ 0.80):", nrow(korelasi_tinggi), "\n")
## 
## Total pasangan berkorelasi tinggi (|r| ≥ 0.80): 31

4.8 Kesimpulan EDA

n_kor_tinggi <- nrow(korelasi_tinggi)
n_kor_sangat_tinggi <- sum(abs(korelasi_tinggi$Korelasi) >= 0.95)

kesimpulan_4 <- data.frame(
  Aspek  = c(
    "Jumlah fitur yang dianalisis",
    "Pasangan korelasi tinggi (|r| ≥ 0.80)",
    "Pasangan korelasi sangat tinggi (|r| ≥ 0.95)",
    "Fitur terkorelasi tertinggi",
    "Rekomendasi"
  ),
  Hasil  = c(
    paste0(length(fitur_numerik), " fitur numerik"),
    paste0(n_kor_tinggi, " pasangan"),
    paste0(n_kor_sangat_tinggi, " pasangan"),
    paste0(cor_df$Fitur_1[1], " — ", cor_df$Fitur_2[1],
           " (r = ", cor_df$Korelasi[1], ")"),
    "Pertimbangkan PCA atau drop fitur redundan"
  ),
  Status = c("oke", "perlu cek", "wajib", "info", "wajib")
)

kesimpulan_4 %>%
  kable(caption = "Kesimpulan Tahap 4: EDA") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
  row_spec(which(kesimpulan_4$Status == "oke"),
           background = "#eafaf1", color = "#1e8449") %>%
  row_spec(which(kesimpulan_4$Status == "perlu cek"),
           background = "#fef9e7", color = "#b7770d") %>%
  row_spec(which(kesimpulan_4$Status == "wajib"),
           background = "#fde8e8", color = "#c0392b") %>%
  column_spec(3, bold = TRUE)
Kesimpulan Tahap 4: EDA
Aspek Hasil Status
Jumlah fitur yang dianalisis 16 fitur numerik oke
Pasangan korelasi tinggi (&#124;r&#124; ≥ 0.80) 31 pasangan perlu cek
Pasangan korelasi sangat tinggi (&#124;r&#124; ≥ 0.95) 17 pasangan wajib
Fitur terkorelasi tertinggi Area — ConvexArea (r = 0.9999) info
Rekomendasi Pertimbangkan PCA atau drop fitur redundan wajib

Tahap 5: Feature Selection / Reduksi Dimensi (Opsional)

5.1 Strategi: Drop Fitur Redundan

fitur_drop <- c()

for (i in seq_len(nrow(korelasi_tinggi))) {
  f1 <- as.character(korelasi_tinggi$Fitur_1[i])
  f2 <- as.character(korelasi_tinggi$Fitur_2[i])
  if (!(f1 %in% fitur_drop) && !(f2 %in% fitur_drop)) {
    fitur_drop <- c(fitur_drop, f2)
  }
}

fitur_terpilih <- setdiff(fitur_numerik, fitur_drop)

cat("Fitur yang redundan:\n")
## Fitur yang redundan:
cat(paste(" -", fitur_drop, collapse = "\n"), "\n\n")
##  - ConvexArea
##  - ShapeFactor3
##  - EquivDiameter
##  - Compactness
##  - ShapeFactor1
##  - Perimeter
##  - Eccentricity
##  - MajorAxisLength
##  - MinorAxisLength
##  - ShapeFactor2
cat("Fitur yang dipertahankan:\n")
## Fitur yang dipertahankan:
cat(paste(" +", fitur_terpilih, collapse = "\n"), "\n\n")
##  + Area
##  + AspectRation
##  + Extent
##  + Solidity
##  + roundness
##  + ShapeFactor4
cat("Jumlah fitur sebelum seleksi:", length(fitur_numerik), "\n")
## Jumlah fitur sebelum seleksi: 16
cat("Jumlah fitur setelah seleksi:", length(fitur_terpilih), "\n")
## Jumlah fitur setelah seleksi: 6
tabel_seleksi <- data.frame(
  Fitur  = fitur_numerik,
  Status = ifelse(fitur_numerik %in% fitur_drop, "Drop (Redundan)", "Dipertahankan")
)

tabel_seleksi %>%
  kable(caption = "Status Fitur Setelah Feature Selection") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE, font_size = 12) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
  row_spec(which(tabel_seleksi$Status == "Drop (Redundan)"),
           background = "#fde8e8", color = "#c0392b") %>%
  row_spec(which(tabel_seleksi$Status == "Dipertahankan"),
           background = "#eafaf1", color = "#1e8449")
Status Fitur Setelah Feature Selection
Fitur Status
Area Dipertahankan
Perimeter Drop (Redundan)
MajorAxisLength Drop (Redundan)
MinorAxisLength Drop (Redundan)
AspectRation Dipertahankan
Eccentricity Drop (Redundan)
ConvexArea Drop (Redundan)
EquivDiameter Drop (Redundan)
Extent Dipertahankan
Solidity Dipertahankan
roundness Dipertahankan
Compactness Drop (Redundan)
ShapeFactor1 Drop (Redundan)
ShapeFactor2 Drop (Redundan)
ShapeFactor3 Drop (Redundan)
ShapeFactor4 Dipertahankan

5.2 PCA: Analisis Komponen Utama

df_seleksi <- df_clean %>% select(all_of(fitur_terpilih))

pca_result <- prcomp(df_seleksi, center = TRUE, scale. = TRUE)

var_explained <- summary(pca_result)$importance
var_df <- data.frame(
  PC              = paste0("PC", 1:ncol(var_explained)),
  SD              = round(var_explained[1, ], 4),
  Proporsi_Var    = round(var_explained[2, ], 4),
  Kumulatif_Var   = round(var_explained[3, ], 4)
)

var_df %>%
  kable(caption = "Variansi yang Dijelaskan oleh Tiap Komponen Utama (PCA)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE, font_size = 12) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
  row_spec(which(var_df$Kumulatif_Var >= 0.90)[1],
           background = "#eafaf1", color = "#1e8449")
Variansi yang Dijelaskan oleh Tiap Komponen Utama (PCA)
PC SD Proporsi_Var Kumulatif_Var
PC1 PC1 1.7837 0.5302 0.5302
PC2 PC2 1.0328 0.1778 0.7080
PC3 PC3 0.8542 0.1216 0.8296
PC4 PC4 0.7082 0.0836 0.9132
PC5 PC5 0.6681 0.0744 0.9876
PC6 PC6 0.2727 0.0124 1.0000

5.3 Scree Plot PCA

fviz_eig(
  pca_result,
  addlabels = TRUE,
  ylim      = c(0, max(var_explained[2, ] * 100) + 5),
  barfill   = "#3498db",
  barcolor  = "white",
  linecolor = "#e74c3c",
  main      = "Variansi Tiap Komponen Utama (PCA)"
) +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(face = "bold", hjust = 0.5))

5.4 Biplot PCA

fviz_pca_biplot(
  pca_result,
  geom.ind   = "point",
  col.ind    = df_clean$Class,
  palette    = c("#2ecc71","#3498db","#e74c3c","#f39c12",
                 "#9b59b6","#1abc9c","#e67e22"),
  addEllipses = TRUE,
  ellipse.level = 0.90,
  col.var    = "black",
  repel      = TRUE,
  legend.title = "Kelas",
  title      = "Fitur & Distribusi Kelas (PCA)"
) +
  theme_minimal(base_size = 11) +
  theme(plot.title = element_text(face = "bold", hjust = 0.5))
## Ignoring unknown labels:
## • linetype : "Kelas"
## Warning: The shape palette can deal with a maximum of 6 discrete values because more
## than 6 becomes difficult to discriminate
## ℹ you have requested 7 values. Consider specifying shapes manually if you need
##   that many of them.
## Warning: Removed 2636 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

5.5 Keputusan: Pilih Pendekatan

n_pc_90 <- min(which(var_df$Kumulatif_Var >= 0.90))

cat("PC yang dibutuhkan untuk menjelaskan >= 90% variansi:", n_pc_90, "\n")
## PC yang dibutuhkan untuk menjelaskan >= 90% variansi: 4
cat("Jumlah fitur setelah seleksi manual:", length(fitur_terpilih), "\n\n")
## Jumlah fitur setelah seleksi manual: 6
if (n_pc_90 < length(fitur_terpilih)) {
  cat("Rekomendasi: Gunakan PCA dengan", n_pc_90, "komponen untuk clustering\n")
} else {
  cat("Rekomendasi: Gunakan seleksi manual (", length(fitur_terpilih), "fitur) — PCA tidak memberikan reduksi yang signifikan\n")
}
## Rekomendasi: Gunakan PCA dengan 4 komponen untuk clustering
pc_scores <- as.data.frame(pca_result$x[, 1:n_pc_90])
pc_scores$Class <- df_clean$Class

cat("Dimensi data PC scores:", nrow(pc_scores), "baris x",
    ncol(pc_scores) - 1, "komponen + kolom Class\n")
## Dimensi data PC scores: 13543 baris x 4 komponen + kolom Class

Tahap 6: Pisahkan Kolom Label (Class)

6.1 Pisahkan Fitur dan Label

X_clustering <- df_clean %>% select(all_of(fitur_terpilih))
y_label      <- df_clean$Class

cat("Data untuk Clustering\n")
## Data untuk Clustering
cat("\nShape fitur (X):", nrow(X_clustering), "x", ncol(X_clustering), "\n")
## 
## Shape fitur (X): 13543 x 6
cat("Panjang label (y):", length(y_label), "\n")
## Panjang label (y): 13543
cat("Label Class tersedia:", n_distinct(y_label), "kelas unik\n")
## Label Class tersedia: 7 kelas unik
cat("Nama kelas:", paste(unique(y_label), collapse = ", "), "\n\n")
## Nama kelas: SEKER, BARBUNYA, BOMBAY, CALI, HOROZ, SIRA, DERMASON
tabel_pisah <- data.frame(
  Objek    = c("X_clustering", "y_label"),
  Isi      = c(
    paste0(ncol(X_clustering), " fitur numerik terpilih"),
    "Kolom Class"
  ),
  Dipakai_Saat = c(
    "Training clustering",
    "Evaluasi hasil clustering"
  ),
  Dimensi  = c(
    paste0(nrow(X_clustering), " x ", ncol(X_clustering)),
    paste0(length(y_label), " baris")
  )
)

tabel_pisah %>%
  kable(caption = "Pemisahan Fitur dan Label untuk Clustering") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE, font_size = 12) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
  row_spec(1, background = "#eafaf1", color = "#1e8449") %>%
  row_spec(2, background = "#fef9e7", color = "#b7770d")
Pemisahan Fitur dan Label untuk Clustering
Objek Isi Dipakai_Saat Dimensi
X_clustering 6 fitur numerik terpilih Training clustering 13543 x 6
y_label Kolom Class Evaluasi hasil clustering 13543 baris

Tahap 7: Normalisasi / Standarisasi Fitur

7.1 Alasan Normalisasi

range_sebelum <- X_clustering %>%
  pivot_longer(everything(), names_to = "Fitur", values_to = "Nilai") %>%
  group_by(Fitur) %>%
  summarise(
    Min    = round(min(Nilai), 2),
    Max    = round(max(Nilai), 2),
    Range  = round(max(Nilai) - min(Nilai), 2),
    Mean   = round(mean(Nilai), 2),
    SD     = round(sd(Nilai), 2),
    .groups = "drop"
  ) %>%
  arrange(desc(Range))

range_sebelum %>%
  kable(caption = "Range Nilai Fitur Sebelum Normalisasi") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE, font_size = 12) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
  row_spec(1:3, background = "#fde8e8", color = "#c0392b")
Range Nilai Fitur Sebelum Normalisasi
Fitur Min Max Range Mean SD
Area 20420.00 99031.25 78611.25 50166.95 18864.23
AspectRation 1.02 2.11 1.09 1.58 0.24
roundness 0.71 0.99 0.28 0.87 0.06
Extent 0.62 0.87 0.25 0.75 0.05
Solidity 0.98 0.99 0.02 0.99 0.00
ShapeFactor4 0.99 1.00 0.01 1.00 0.00
cat("\nFitur dengan range terbesar:",
    range_sebelum$Fitur[1], "(range =", range_sebelum$Range[1], ")\n")
## 
## Fitur dengan range terbesar: Area (range = 78611.25 )
cat("Fitur dengan range terkecil:",
    range_sebelum$Fitur[nrow(range_sebelum)],
    "(range =", range_sebelum$Range[nrow(range_sebelum)], ")\n")
## Fitur dengan range terkecil: ShapeFactor4 (range = 0.01 )
cat("\nPerbedaan skala yang besar akan mendominasi perhitungan jarak pada clustering\n")
## 
## Perbedaan skala yang besar akan mendominasi perhitungan jarak pada clustering

7.2 Standardisasi Z-score (StandardScaler)

X_scaled_zscore <- X_clustering %>%
  mutate(across(everything(), ~ (. - mean(.)) / sd(.)))

zscore_check <- X_scaled_zscore %>%
  pivot_longer(everything(), names_to = "Fitur", values_to = "Nilai") %>%
  group_by(Fitur) %>%
  summarise(
    Mean_Baru = round(mean(Nilai), 6),
    SD_Baru   = round(sd(Nilai), 6),
    Min_Baru  = round(min(Nilai), 4),
    Max_Baru  = round(max(Nilai), 4),
    .groups   = "drop"
  )

zscore_check %>%
  kable(caption = "Verifikasi Hasil Z-score Standardization") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE, font_size = 12) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
  row_spec(which(abs(zscore_check$Mean_Baru) < 1e-4 & abs(zscore_check$SD_Baru - 1) < 1e-4),
           background = "#eafaf1", color = "#1e8449")
Verifikasi Hasil Z-score Standardization
Fitur Mean_Baru SD_Baru Min_Baru Max_Baru
Area 0 1 -1.5769 2.5903
AspectRation 0 1 -2.3166 2.2397
Extent 0 1 -2.7860 2.4199
ShapeFactor4 0 1 -2.4066 1.3271
Solidity 0 1 -2.3219 2.0201
roundness 0 1 -2.8469 2.0014
cat("\nMean semua fitur (harus ~0):",
    round(mean(as.matrix(X_scaled_zscore)), 6), "\n")
## 
## Mean semua fitur (harus ~0): 0
cat("SD semua fitur (harus ~1):",
    round(mean(apply(X_scaled_zscore, 2, sd)), 6), "\n")
## SD semua fitur (harus ~1): 1

7.3 Min-Max Normalization (MinMaxScaler)

X_scaled_minmax <- X_clustering %>%
  mutate(across(everything(), ~ (. - min(.)) / (max(.) - min(.))))

minmax_check <- X_scaled_minmax %>%
  pivot_longer(everything(), names_to = "Fitur", values_to = "Nilai") %>%
  group_by(Fitur) %>%
  summarise(
    Min_Baru = round(min(Nilai), 6),
    Max_Baru = round(max(Nilai), 6),
    Mean_Baru = round(mean(Nilai), 4),
    .groups = "drop"
  )

minmax_check %>%
  kable(caption = "Verifikasi Hasil Min-Max Normalization (Rentang [0,1])") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE, font_size = 12) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
  row_spec(which(minmax_check$Min_Baru == 0 & minmax_check$Max_Baru == 1),
           background = "#eafaf1", color = "#1e8449")
Verifikasi Hasil Min-Max Normalization (Rentang [0,1])
Fitur Min_Baru Max_Baru Mean_Baru
Area 0 1 0.3784
AspectRation 0 1 0.5084
Extent 0 1 0.5352
ShapeFactor4 0 1 0.6446
Solidity 0 1 0.5348
roundness 0 1 0.5872

7.4 Perbandingan Distribusi: Sebelum vs Sesudah Normalisasi

fitur_sampel <- fitur_terpilih[1:min(6, length(fitur_terpilih))]

df_vis <- bind_rows(
  X_clustering %>%
    select(all_of(fitur_sampel)) %>%
    pivot_longer(everything(), names_to = "Fitur", values_to = "Nilai") %>%
    mutate(Kondisi = "Sebelum Normalisasi"),

  X_scaled_zscore %>%
    select(all_of(fitur_sampel)) %>%
    pivot_longer(everything(), names_to = "Fitur", values_to = "Nilai") %>%
    mutate(Kondisi = "Z-score"),

  X_scaled_minmax %>%
    select(all_of(fitur_sampel)) %>%
    pivot_longer(everything(), names_to = "Fitur", values_to = "Nilai") %>%
    mutate(Kondisi = "Min-Max")
) %>%
  mutate(
    Kondisi = factor(Kondisi,
                     levels = c("Sebelum Normalisasi", "Z-score", "Min-Max")),
    Fitur   = factor(Fitur, levels = fitur_sampel)
  )

ggplot(df_vis, aes(x = Nilai, fill = Kondisi)) +
  geom_histogram(bins = 40, color = "white", alpha = 0.8) +
  scale_fill_manual(values = c("#e74c3c", "#3498db", "#2ecc71")) +
  facet_grid(Kondisi ~ Fitur, scales = "free") +
  labs(
    title    = "PeSebelum vs Sesudah Normalisasi",
    subtitle = paste0("Ditampilkan ", length(fitur_sampel),
                      " fitur pertama (skala bebas per panel)"),
    x = "Nilai", y = "Frekuensi",
    fill = "Metode"
  ) +
  theme_minimal(base_size = 10) +
  theme(
    plot.title    = element_text(face = "bold", hjust = 0.5, size = 13),
    plot.subtitle = element_text(hjust = 0.5, color = "gray50"),
    strip.text    = element_text(face = "bold", size = 9),
    legend.position = "bottom"
  )

7.5 Pilih Metode Normalisasi untuk Clustering

X_final <- X_scaled_zscore
cat("Data Final untuk Clustering\n")
## Data Final untuk Clustering
cat("\nMetode normalisasi: Z-score (StandardScaler)\n")
## 
## Metode normalisasi: Z-score (StandardScaler)
cat("Shape X_final:", nrow(X_final), "x", ncol(X_final), "\n")
## Shape X_final: 13543 x 6
cat("Kolom:", paste(names(X_final), collapse = ", "), "\n\n")
## Kolom: Area, AspectRation, Extent, Solidity, roundness, ShapeFactor4
cat("Statistik X_final:\n")
## Statistik X_final:
print(round(sapply(X_final, function(x) c(mean=mean(x), sd=sd(x), min=min(x), max=max(x))), 4))
##         Area AspectRation  Extent Solidity roundness ShapeFactor4
## mean  0.0000       0.0000  0.0000   0.0000    0.0000       0.0000
## sd    1.0000       1.0000  1.0000   1.0000    1.0000       1.0000
## min  -1.5769      -2.3166 -2.7860  -2.3219   -2.8469      -2.4066
## max   2.5903       2.2397  2.4199   2.0201    2.0014       1.3271

8.1 Penentuan Jumlah Kluster Optimal

8.1.1 Metode Elbow (WSS)

set.seed(42)
wss_values <- sapply(1:10, function(k) {
  kmeans(X_final, centers = k, nstart = 10, iter.max = 100)$tot.withinss
})

wss_df <- data.frame(k = 1:10, WSS = wss_values)

delta_wss  <- diff(wss_values)        
pct_drop   <- abs(delta_wss) / wss_values[-length(wss_values)] * 100
elbow_k    <- which(pct_drop < 10)[1] + 1   
elbow_k    <- if (is.na(elbow_k)) 7 else elbow_k 

ggplot(wss_df, aes(x = k, y = WSS)) +
  geom_line(color = "#3498db", linewidth = 1.2) +
  geom_point(color = "#e74c3c", size = 4) +
  geom_vline(xintercept = elbow_k, linetype = "dashed",
             color = "#e74c3c", linewidth = 0.8) +
  annotate("text", x = elbow_k + 0.35, y = max(wss_values) * 0.85,
           label = paste0("k = ", elbow_k, " (siku)"),
           color = "#e74c3c", size = 4, fontface = "italic") +
  scale_x_continuous(breaks = 1:10) +
  labs(
    title    = "Metode Elbow — Penentuan Jumlah Kluster Optimal (K-Means)",
    subtitle = "Cari titik 'siku' di mana WSS mulai melandai",
    x = "Jumlah Kluster (k)", y = "Total Within-Cluster SS (WSS)"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title    = element_text(face = "bold", hjust = 0.5, size = 13),
    plot.subtitle = element_text(hjust = 0.5, color = "gray50")
  )

8.1.2 Metode Silhouette

set.seed(42)
samp_idx_sil <- sample(nrow(X_final), min(2000, nrow(X_final)))
X_sil_sample <- X_final[samp_idx_sil, ]

sil_values_km <- sapply(2:8, function(k) {
  km  <- kmeans(X_final, centers = k, nstart = 10, iter.max = 100)
  sil <- silhouette(km$cluster[samp_idx_sil], dist(X_sil_sample))
  mean(sil[, 3])
})

sil_df_km <- data.frame(k = 2:8, Silhouette = sil_values_km)
best_k_km <- sil_df_km$k[which.max(sil_df_km$Silhouette)]

ggplot(sil_df_km, aes(x = k, y = Silhouette)) +
  geom_line(color = "#2ecc71", linewidth = 1.2) +
  geom_point(aes(color = k == best_k_km), size = 4, show.legend = FALSE) +
  scale_color_manual(values = c("FALSE" = "#2ecc71", "TRUE" = "#e74c3c")) +
  geom_vline(xintercept = best_k_km, linetype = "dashed",
             color = "#e74c3c", linewidth = 0.8) +
  annotate("text", x = best_k_km + 0.3,
           y = max(sil_values_km) * 0.97,
           label = paste0("k = ", best_k_km, " (optimal)"),
           color = "#e74c3c", size = 4, fontface = "italic") +
  scale_x_continuous(breaks = 2:8) +
  labs(
    title    = "Metode Silhouette — Penentuan Jumlah Kluster Optimal (K-Means)",
    subtitle = "Skor lebih tinggi = kluster lebih kohesif & terpisah",
    x = "Jumlah Kluster (k)", y = "Rata-rata Silhouette Score"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title    = element_text(face = "bold", hjust = 0.5, size = 13),
    plot.subtitle = element_text(hjust = 0.5, color = "gray50")
  )

cat("k optimal berdasarkan Silhouette:", best_k_km, "\n")
## k optimal berdasarkan Silhouette: 2

8.1.3 Keputusan k

k_optimal <- 7

cat(" Keputusan: k =", k_optimal, "\n")
##  Keputusan: k = 7
cat(" Alasan   : Sesuai jumlah kelas asli (7 jenis kacang)\n")
##  Alasan   : Sesuai jumlah kelas asli (7 jenis kacang)
cat("            + dikonfirmasi Elbow & Silhouette\n")
##             + dikonfirmasi Elbow & Silhouette

8.2 Training K-Means

set.seed(42)

km_result <- kmeans(
  X_final,
  centers   = k_optimal,
  nstart    = 25,      
  iter.max  = 300,
  algorithm = "Hartigan-Wong"
)

km_labels <- km_result$cluster

cat("K-Means selesai!\n\n")
## K-Means selesai!
cat("Algoritma          : Hartigan-Wong\n")
## Algoritma          : Hartigan-Wong
cat("Metrik jarak       : Euclidean (L2 norm)\n")
## Metrik jarak       : Euclidean (L2 norm)
cat("Pusat kluster      : Mean (rata-rata koordinat)\n")
## Pusat kluster      : Mean (rata-rata koordinat)
cat("Jumlah kluster     :", nrow(km_result$centers), "\n")
## Jumlah kluster     : 7
cat("Iterasi konvergen  :", km_result$iter, "\n")
## Iterasi konvergen  : 6
cat("Total WSS          :", round(km_result$tot.withinss, 2), "\n")
## Total WSS          : 25231.11
cat("Between-SS/Total-SS:",
    round(km_result$betweenss / km_result$totss * 100, 2), "%\n\n")
## Between-SS/Total-SS: 68.95 %
cat("Distribusi anggota per kluster:\n")
## Distribusi anggota per kluster:
print(table(km_labels))
## km_labels
##    1    2    3    4    5    6    7 
## 2788 1408 2722 2484 1091 1179 1871

8.3 Evaluasi Internal K-Means

8.3.1 Silhouette Plot

set.seed(42)
samp_km    <- sample(nrow(X_final), min(2000, nrow(X_final)))
sil_km     <- silhouette(km_labels[samp_km], dist(X_final[samp_km, ]))
avg_sil_km <- mean(sil_km[, 3])

fviz_silhouette(sil_km,
                palette       = "jco",
                ggtheme       = theme_minimal(base_size = 11),
                print.summary = FALSE) +
  geom_hline(yintercept = avg_sil_km, linetype = "dashed",
             color = "#e74c3c", linewidth = 0.8) +
  labs(
    title    = "K-Means Clustering (Silhouette)",
    subtitle = paste0("Rata-rata Silhouette Score = ", round(avg_sil_km, 4),
                      " | Range: -1 (buruk) hingga 1 (sempurna)")
  ) +
  theme(
    plot.title    = element_text(face = "bold", hjust = 0.5, size = 13),
    plot.subtitle = element_text(hjust = 0.5, color = "gray50")
  )

8.3.2 Tabel Metrik Internal

eval_km <- data.frame(
  Metrik     = c("Silhouette Score",
                 "Between-SS / Total-SS (%)",
                 "Total Within-SS",
                 "Iterasi Konvergen"),
  Nilai      = c(round(avg_sil_km, 4),
                 round(km_result$betweenss / km_result$totss * 100, 2),
                 round(km_result$tot.withinss, 2),
                 km_result$iter),
  Keterangan = c("Mendekati 1 = kluster padat & terpisah",
                 "Semakin tinggi = pemisahan kluster lebih baik",
                 "Semakin kecil = kluster lebih kompak",
                 "Cepat konvergen = solusi stabil")
)

eval_km %>%
  kable(caption = "Evaluasi Internal — K-Means Clustering") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE, font_size = 12) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
  row_spec(1, background = "#eafaf1", color = "#1e8449")
Evaluasi Internal — K-Means Clustering
Metrik Nilai Keterangan
Silhouette Score 0.2688 Mendekati 1 = kluster padat & terpisah
Between-SS / Total-SS (%) 68.9500 Semakin tinggi = pemisahan kluster lebih baik
Total Within-SS 25231.1100 Semakin kecil = kluster lebih kompak
Iterasi Konvergen 6.0000 Cepat konvergen = solusi stabil

8.4 Visualisasi Hasil K-Means

8.4.1 Cluster Plot (PCA 2D)

warna_kluster <- c("#2ecc71","#3498db","#e74c3c","#f39c12",
                   "#9b59b6","#1abc9c","#e67e22")

fviz_cluster(
  km_result,
  data         = X_final,
  palette      = warna_kluster,
  geom         = "point",
  ellipse.type = "convex",
  alpha        = 0.4,
  pointsize    = 1,
  ggtheme      = theme_minimal(base_size = 12)
) +
  labs(
    title    = "K-Means Clustering (PCA 2D)",
    subtitle = paste0("k = ", k_optimal, "Proyeksi ke 2 PC pertama"),
    color = "Kluster", fill = "Kluster", shape = "Kluster"
  ) +
  theme(
    plot.title    = element_text(face = "bold", hjust = 0.5, size = 13),
    plot.subtitle = element_text(hjust = 0.5, color = "gray50")
  )

8.4.2 Distribusi Anggota per Kluster

dist_km <- data.frame(
  Kluster = paste0("K", sort(unique(km_labels))),
  Jumlah  = as.integer(table(km_labels))
) %>%
  mutate(
    Persentase = percent(Jumlah / sum(Jumlah), accuracy = 0.1),
    Kluster    = reorder(Kluster, -Jumlah)
  )

ggplot(dist_km, aes(x = Kluster, y = Jumlah, fill = Kluster)) +
  geom_col(width = 0.7, show.legend = FALSE) +
  geom_text(aes(label = paste0(Jumlah, "\n", Persentase)),
            vjust = -0.3, size = 3.8, fontface = "bold") +
  scale_fill_manual(values = warna_kluster) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.18))) +
  labs(title = "Distribusi Anggota per Kluster — K-Means",
       x = "Kluster", y = "Jumlah Anggota") +
  theme_minimal(base_size = 12) +
  theme(plot.title  = element_text(face = "bold", hjust = 0.5),
        axis.text.x = element_text(face = "bold"))

8.4.3 Heatmap Profil Centroid

centroid_km <- as.data.frame(km_result$centers) %>%
  mutate(Kluster = paste0("K", 1:k_optimal)) %>%
  pivot_longer(-Kluster, names_to = "Fitur", values_to = "Nilai")

ggplot(centroid_km, aes(x = Fitur, y = Kluster, fill = Nilai)) +
  geom_tile(color = "white", linewidth = 0.5) +
  geom_text(aes(label = round(Nilai, 2)), size = 3.2, fontface = "bold") +
  scale_fill_gradient2(low = "#e74c3c", mid = "white", high = "#2ecc71",
                       midpoint = 0, name = "Z-score") +
  labs(
    title    = "Profil Centroid K-Means (Z-score)",
    subtitle = "Centroid = rata-rata koordinat semua titik di kluster | Merah = rendah | Hijau = tinggi",
    x = "Fitur", y = "Kluster"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    plot.title    = element_text(face = "bold", hjust = 0.5, size = 13),
    plot.subtitle = element_text(hjust = 0.5, color = "gray50"),
    axis.text.x   = element_text(angle = 35, hjust = 1, face = "bold")
  )

8.5 Evaluasi Eksternal K-Means

purity_score <- function(labels_true, labels_pred) {
  tbl <- table(labels_true, labels_pred)
  sum(apply(tbl, 2, max)) / length(labels_true)
}

purity_km <- purity_score(y_label, km_labels)

table(Kluster = km_labels, Kelas_Asli = y_label) %>%
  as.data.frame.matrix() %>%
  rownames_to_column("Kluster") %>%
  mutate(Kluster = paste0("K", Kluster)) %>%
  kable(caption = "Tabel Kontingensi — K-Means vs Kelas Asli") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE, font_size = 12) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")
Tabel Kontingensi — K-Means vs Kelas Asli
Kluster BARBUNYA BOMBAY CALI DERMASON HOROZ SEKER SIRA
K1 52 0 16 1407 15 95 1203
K2 76 127 710 40 296 4 155
K3 55 0 21 1456 24 52 1114
K4 3 1 2 583 0 1846 49
K5 943 0 15 52 20 30 31
K6 18 2 65 8 1036 0 50
K7 175 392 801 0 469 0 34
cat("\nPurity Score K-Means:", round(purity_km, 4),
    paste0("(", round(purity_km * 100, 2), "%)"), "\n")
## 
## Purity Score K-Means: 0.6054 (60.54%)

8.6 Kesimpulan K-Means

data.frame(
  Aspek  = c("Algoritma", "Pusat Kluster", "Metrik Jarak",
             "Jumlah Kluster", "Silhouette Score",
             "Between-SS / Total-SS", "Purity Score", "Status"),
  Hasil  = c("K-Means (Hartigan-Wong)",
             "Mean / rata-rata koordinat (bukan titik data aktual)",
             "Euclidean (L2 norm)",
             paste0(k_optimal, " kluster"),
             round(avg_sil_km, 4),
             paste0(round(km_result$betweenss / km_result$totss * 100, 2), "%"),
             round(purity_km, 4),
             "lanjut ke Tahap K-Median")
) %>%
  kable(caption = "Kesimpulan Tahap 8: K-Means Clustering") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
  row_spec(8, background = "#eafaf1", color = "#1e8449")
Kesimpulan Tahap 8: K-Means Clustering
Aspek Hasil
Algoritma K-Means (Hartigan-Wong)
Pusat Kluster Mean / rata-rata koordinat (bukan titik data aktual)
Metrik Jarak Euclidean (L2 norm)
Jumlah Kluster 7 kluster
Silhouette Score 0.2688
Between-SS / Total-SS 68.95%
Purity Score 0.6054
Status lanjut ke Tahap K-Median

Tahap 9: K-Median Clustering

9.1 Penentuan Jumlah Kluster Optimal

9.1.1 Metode Elbow (Total Dissimilarity PAM)

set.seed(42)
dissim_values <- sapply(2:8, function(k) {
  cl <- clara(X_final, k = k, metric = "manhattan", samples = 200, pamLike = TRUE)
  cl$objective
})

dissim_df <- data.frame(k = 2:8, Dissimilarity = dissim_values)

ggplot(dissim_df, aes(x = k, y = Dissimilarity)) +
  geom_line(color = "#9b59b6", linewidth = 1.2) +
  geom_point(color = "#e74c3c", size = 4) +
  geom_vline(xintercept = 7, linetype = "dashed",
             color = "#e74c3c", linewidth = 0.8) +
  annotate("text", x = 7.3,
           y = max(dissim_values) * 0.95,
           label = "k = 7 (elbow)", color = "#e74c3c",
           size = 4, fontface = "italic") +
  scale_x_continuous(breaks = 2:8) +
  labs(
    title    = "Metode Elbow — Penentuan Jumlah Kluster Optimal (K-Median/PAM)",
    subtitle = "Total Dissimilarity Manhattan — titik siku = k optimal",
    x = "Jumlah Kluster (k)", y = "Total Dissimilarity (swap)"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title    = element_text(face = "bold", hjust = 0.5, size = 13),
    plot.subtitle = element_text(hjust = 0.5, color = "gray50")
  )

9.1.2 Metode Silhouette

set.seed(42)

sil_values_pam <- sapply(2:8, function(k) {
  cl <- clara(X_final, k = k, metric = "manhattan", samples = 200, pamLike = TRUE)
  mean(cl$silinfo$avg.width)
})

sil_df_pam <- data.frame(k = 2:8, Silhouette = sil_values_pam)
best_k_pam <- sil_df_pam$k[which.max(sil_df_pam$Silhouette)]

ggplot(sil_df_pam, aes(x = k, y = Silhouette)) +
  geom_line(color = "#9b59b6", linewidth = 1.2) +
  geom_point(aes(color = k == best_k_pam), size = 4, show.legend = FALSE) +
  scale_color_manual(values = c("FALSE" = "#9b59b6", "TRUE" = "#e74c3c")) +
  geom_vline(xintercept = best_k_pam, linetype = "dashed",
             color = "#e74c3c", linewidth = 0.8) +
  annotate("text", x = best_k_pam + 0.3,
           y = max(sil_values_pam) * 0.97,
           label = paste0("k = ", best_k_pam, " (optimal)"),
           color = "#e74c3c", size = 4, fontface = "italic") +
  scale_x_continuous(breaks = 2:8) +
  labs(
    title    = "Metode Silhouette — Penentuan Jumlah Kluster Optimal (K-Median/PAM)",
    subtitle = "Skor lebih tinggi = kluster lebih kohesif & terpisah",
    x = "Jumlah Kluster (k)", y = "Rata-rata Silhouette Score"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title    = element_text(face = "bold", hjust = 0.5, size = 13),
    plot.subtitle = element_text(hjust = 0.5, color = "gray50")
  )

cat("k optimal berdasarkan Silhouette (K-Median/PAM):", best_k_pam, "\n")
## k optimal berdasarkan Silhouette (K-Median/PAM): 2

9.2 Training K-Median (PAM)

set.seed(42)

pam_result <- clara(
  X_final,
  k       = k_optimal,
  metric  = "manhattan",  
  samples = 200,          
  pamLike = TRUE,         
  stand   = FALSE         
)

pam_labels <- pam_result$clustering

cat("K-Median (CLARA) selesai!\n\n")
## K-Median (CLARA) selesai!
cat("Algoritma          : CLARA (Clustering Large Applications)\n")
## Algoritma          : CLARA (Clustering Large Applications)
cat("Setara dengan      : PAM / K-Median untuk dataset besar\n")
## Setara dengan      : PAM / K-Median untuk dataset besar
cat("Metrik jarak       : Manhattan (L1 norm) — sesuai PPT K-Median\n")
## Metrik jarak       : Manhattan (L1 norm) — sesuai PPT K-Median
cat("Pusat kluster      : Medoid (objek data aktual, bukan rata-rata)\n")
## Pusat kluster      : Medoid (objek data aktual, bukan rata-rata)
cat("Jumlah kluster     :", k_optimal, "\n")
## Jumlah kluster     : 7
cat("Objective          :", round(pam_result$objective, 4), "\n\n")
## Objective          : 2.6052
cat("Distribusi anggota per kluster:\n")
## Distribusi anggota per kluster:
print(table(pam_labels))
## pam_labels
##    1    2    3    4    5    6    7 
## 1956 3287 2695 1183  829 2051 1542

9.3 Evaluasi Internal K-Median

9.3.1 Silhouette Plot

set.seed(42)
samp_pam    <- sample(nrow(X_final), min(2000, nrow(X_final)))
sil_pam     <- silhouette(pam_labels[samp_pam], dist(X_final[samp_pam, ], method = "manhattan"))
avg_sil_pam <- mean(sil_pam[, 3])

fviz_silhouette(sil_pam,
                palette       = "jco",
                ggtheme       = theme_minimal(base_size = 11),
                print.summary = FALSE) +
  geom_hline(yintercept = avg_sil_pam, linetype = "dashed",
             color = "#9b59b6", linewidth = 0.8) +
  labs(
    title    = "Silhouette Plot — K-Median (PAM) Clustering",
    subtitle = paste0("Rata-rata Silhouette Score = ", round(avg_sil_pam, 4),
                      " | Manhattan distance")
  ) +
  theme(
    plot.title    = element_text(face = "bold", hjust = 0.5, size = 13),
    plot.subtitle = element_text(hjust = 0.5, color = "gray50")
  )

9.3.2 Tabel Metrik Internal

eval_pam <- data.frame(
  Metrik     = c("Silhouette Score (↑ lebih baik)",
                 "Objective Function (swap) (↓ lebih baik)",
                 "Metrik Jarak",
                 "Representasi Pusat Kluster"),
  Nilai      = c(round(avg_sil_pam, 4),
                 round(pam_result$objective, 4),
                 "Manhattan (L1 norm)",
                 "Medoid (objek data aktual)"),
  Keterangan = c("Mendekati 1 = kluster padat & terpisah",
                 "Semakin kecil = kluster lebih kompak",
                 "Sum of absolute differences — robust terhadap outlier",
                 "Lebih interpretatif dari centroid K-Means")
)

eval_pam %>%
  kable(caption = "Evaluasi Internal — K-Median (PAM) Clustering") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE, font_size = 12) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
  row_spec(1, background = "#f5eef8", color = "#6c3483")
Evaluasi Internal — K-Median (PAM) Clustering
Metrik Nilai Keterangan
Silhouette Score (↑ lebih baik) 0.2415 Mendekati 1 = kluster padat & terpisah
Objective Function (swap) (↓ lebih baik) 2.6052 Semakin kecil = kluster lebih kompak
Metrik Jarak Manhattan (L1 norm) Sum of absolute differences — robust terhadap outlier
Representasi Pusat Kluster Medoid (objek data aktual) Lebih interpretatif dari centroid K-Means

9.4 Visualisasi Hasil K-Median

9.4.1 Cluster Plot (PCA 2D)

fviz_cluster(
  pam_result,
  data         = X_final,
  palette      = warna_kluster,
  geom         = "point",
  ellipse.type = "convex",
  alpha        = 0.4,
  pointsize    = 1,
  ggtheme      = theme_minimal(base_size = 12)
) +
  labs(
    title    = "Visualisasi K-Median (PAM) Clustering (PCA 2D)",
    subtitle = paste0("k = ", k_optimal,
                      " kluster | Manhattan distance | Proyeksi ke 2 PC pertama"),
    color = "Kluster", fill = "Kluster", shape = "Kluster"
  ) +
  theme(
    plot.title    = element_text(face = "bold", hjust = 0.5, size = 13),
    plot.subtitle = element_text(hjust = 0.5, color = "gray50")
  )

9.4.2 Distribusi Anggota per Kluster

dist_pam <- data.frame(
  Kluster = paste0("K", sort(unique(pam_labels))),
  Jumlah  = as.integer(table(pam_labels))
) %>%
  mutate(
    Persentase = percent(Jumlah / sum(Jumlah), accuracy = 0.1),
    Kluster    = reorder(Kluster, -Jumlah)
  )

ggplot(dist_pam, aes(x = Kluster, y = Jumlah, fill = Kluster)) +
  geom_col(width = 0.7, show.legend = FALSE) +
  geom_text(aes(label = paste0(Jumlah, "\n", Persentase)),
            vjust = -0.3, size = 3.8, fontface = "bold") +
  scale_fill_manual(values = warna_kluster) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.18))) +
  labs(title = "Distribusi Anggota per Kluster — K-Median (PAM)",
       x = "Kluster", y = "Jumlah Anggota") +
  theme_minimal(base_size = 12) +
  theme(plot.title  = element_text(face = "bold", hjust = 0.5),
        axis.text.x = element_text(face = "bold"))

9.4.3 Heatmap Profil Medoid

medoid_df <- as.data.frame(pam_result$medoids) %>%
  mutate(Kluster = paste0("K", 1:k_optimal)) %>%
  pivot_longer(-Kluster, names_to = "Fitur", values_to = "Nilai")

ggplot(medoid_df, aes(x = Fitur, y = Kluster, fill = Nilai)) +
  geom_tile(color = "white", linewidth = 0.5) +
  geom_text(aes(label = round(Nilai, 2)), size = 3.2, fontface = "bold") +
  scale_fill_gradient2(low = "#e74c3c", mid = "white", high = "#9b59b6",
                       midpoint = 0, name = "Z-score") +
  labs(
    title    = "Profil Medoid K-Median/PAM (Z-score)",
    subtitle = "Medoid = objek data aktual paling representatif | Merah = rendah | Ungu = tinggi",
    x = "Fitur", y = "Kluster"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    plot.title    = element_text(face = "bold", hjust = 0.5, size = 13),
    plot.subtitle = element_text(hjust = 0.5, color = "gray50"),
    axis.text.x   = element_text(angle = 35, hjust = 1, face = "bold")
  )

9.5 Evaluasi Eksternal K-Median

purity_pam <- purity_score(y_label, pam_labels)

table(Kluster = pam_labels, Kelas_Asli = y_label) %>%
  as.data.frame.matrix() %>%
  rownames_to_column("Kluster") %>%
  mutate(Kluster = paste0("K", Kluster)) %>%
  kable(caption = "Tabel Kontingensi — K-Median (PAM) vs Kelas Asli") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE, font_size = 12) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")
Tabel Kontingensi — K-Median (PAM) vs Kelas Asli
Kluster BARBUNYA BOMBAY CALI DERMASON HOROZ SEKER SIRA
K1 0 8 1 244 0 1685 18
K2 49 29 13 1961 17 271 947
K3 118 19 24 1242 36 49 1207
K4 74 123 635 38 203 4 106
K5 750 0 2 32 18 16 11
K6 319 340 897 21 179 2 293
K7 12 3 58 8 1407 0 54
cat("\nPurity Score K-Median (PAM):", round(purity_pam, 4),
    paste0("(", round(purity_pam * 100, 2), "%)"), "\n")
## 
## Purity Score K-Median (PAM): 0.6333 (63.33%)

9.6 Kesimpulan K-Median

data.frame(
  Aspek  = c("Algoritma", "Pusat Kluster", "Metrik Jarak",
             "Jumlah Kluster", "Silhouette Score",
             "Objective (swap)", "Purity Score", "Status"),
  Hasil  = c("K-Median via CLARA (Clustering Large Applications)",
             "Medoid (objek data aktual — bukan rata-rata)",
             "Manhattan (L1 norm) — sesuai PPT",
             paste0(k_optimal, " kluster"),
             round(avg_sil_pam, 4),
             round(pam_result$objective, 4),
             round(purity_pam, 4),
             "lanjut ke Tahap Perbandingan")
) %>%
  kable(caption = "Kesimpulan Tahap 9: K-Median (PAM) Clustering") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
  row_spec(8, background = "#f5eef8", color = "#6c3483")
Kesimpulan Tahap 9: K-Median (PAM) Clustering
Aspek Hasil
Algoritma K-Median via CLARA (Clustering Large Applications)
Pusat Kluster Medoid (objek data aktual — bukan rata-rata)
Metrik Jarak Manhattan (L1 norm) — sesuai PPT
Jumlah Kluster 7 kluster
Silhouette Score 0.2415
Objective (swap) 2.6052
Purity Score 0.6333
Status lanjut ke Tahap Perbandingan

Tahap 10: DBSCAN Clustering

10.1 Penentuan Parameter eps (k-distance plot)

set.seed(42)
samp_db <- sample(nrow(X_final), min(3000, nrow(X_final)))
X_db_sample <- X_final[samp_db, ]

minPts <- ncol(X_final) * 2  
kNNdistplot(X_db_sample, k = minPts)
abline(h = 1.5, col = "red", lty = 2, lwd = 2)
title(main = "k-distance Plot — Penentuan Nilai eps DBSCAN",
      sub  = paste0("Garis merah = kandidat eps | minPts = ", minPts))

10.2 Training DBSCAN

set.seed(42)

eps_val    <- 1.5
minPts_val <- ncol(X_final) * 2

db_result <- dbscan::dbscan(X_final, eps = eps_val, minPts = minPts_val)
db_labels <- db_result$cluster  # 0 = noise

cat("DBSCAN selesai!\n\n")
## DBSCAN selesai!
cat("Parameter eps     :", eps_val, "\n")
## Parameter eps     : 1.5
cat("Parameter minPts  :", minPts_val, "\n")
## Parameter minPts  : 12
cat("Jumlah kluster    :", max(db_labels), "\n")
## Jumlah kluster    : 1
cat("Titik noise (0)   :", sum(db_labels == 0), "\n\n")
## Titik noise (0)   : 0
cat("Distribusi label:\n")
## Distribusi label:
print(table(db_labels))
## db_labels
##     1 
## 13543

10.3 Evaluasi Internal DBSCAN

set.seed(42)
non_noise_idx <- which(db_labels != 0)

if (length(unique(db_labels[non_noise_idx])) >= 2) {
  samp_db2   <- sample(non_noise_idx, min(2000, length(non_noise_idx)))
  sil_db     <- silhouette(db_labels[samp_db2], dist(X_final[samp_db2, ]))
  avg_sil_db <- mean(sil_db[, 3])
  
  fviz_silhouette(sil_db,
                  palette       = "jco",
                  ggtheme       = theme_minimal(base_size = 11),
                  print.summary = FALSE) +
    geom_hline(yintercept = avg_sil_db, linetype = "dashed",
               color = "#e74c3c", linewidth = 0.8) +
    labs(
      title    = "Silhouette Plot — DBSCAN Clustering",
      subtitle = paste0("Rata-rata Silhouette Score = ", round(avg_sil_db, 4),
                        " | Noise points dikecualikan")
    ) +
    theme(plot.title    = element_text(face = "bold", hjust = 0.5, size = 13),
          plot.subtitle = element_text(hjust = 0.5, color = "gray50"))
} else {
  avg_sil_db <- NA
  cat("Tidak cukup kluster untuk menghitung Silhouette.\n")
}
## Tidak cukup kluster untuk menghitung Silhouette.

10.4 Visualisasi DBSCAN (PCA 2D)

pca_2d <- as.data.frame(prcomp(X_final, center = TRUE, scale. = TRUE)$x[, 1:2])
pca_2d$Kluster <- factor(ifelse(db_labels == 0, "Noise", paste0("K", db_labels)))

warna_db <- c("Noise" = "gray70",
              setNames(c("#2ecc71","#3498db","#e74c3c","#f39c12",
                         "#9b59b6","#1abc9c","#e67e22","#34495e"),
                       paste0("K", 1:8)))

ggplot(pca_2d, aes(x = PC1, y = PC2, color = Kluster)) +
  geom_point(alpha = 0.4, size = 0.8) +
  scale_color_manual(values = warna_db) +
  labs(
    title    = "Visualisasi DBSCAN Clustering (PCA 2D)",
    subtitle = paste0("eps = ", eps_val, " | minPts = ", minPts_val,
                      " | Kluster = ", max(db_labels),
                      " | Noise = ", sum(db_labels == 0), " titik"),
    color = "Label"
  ) +
  theme_minimal(base_size = 12) +
  theme(plot.title    = element_text(face = "bold", hjust = 0.5, size = 13),
        plot.subtitle = element_text(hjust = 0.5, color = "gray50"))

10.5 Evaluasi Eksternal DBSCAN

non_noise <- db_labels != 0
purity_db <- purity_score(y_label[non_noise], db_labels[non_noise])

cat("Purity Score DBSCAN (non-noise):", round(purity_db, 4),
    paste0("(", round(purity_db * 100, 2), "%)"), "\n")
## Purity Score DBSCAN (non-noise): 0.2618 (26.18%)
cat("Proporsi noise:", round(mean(db_labels == 0) * 100, 2), "%\n")
## Proporsi noise: 0 %

Tahap 11: Mean Shift Clustering

11.1 Training Mean Shift

set.seed(42)

samp_ms <- sample(nrow(X_final), min(800, nrow(X_final)))
X_ms    <- as.matrix(X_final[samp_ms, ])

ms_result <- ms(X_ms, h = 1.5, scaled = FALSE)

ms_labels_samp <- ms_result$cluster.label
n_kluster_ms   <- length(unique(ms_labels_samp))

cat("Mean Shift selesai!\n\n")
## Mean Shift selesai!
cat("Bandwidth (h)  :", 1.5, "\n")
## Bandwidth (h)  : 1.5
cat("Jumlah kluster :", n_kluster_ms, "\n\n")
## Jumlah kluster : 1
cat("Distribusi label (subsample):\n")
## Distribusi label (subsample):
print(table(ms_labels_samp))
## ms_labels_samp
##   1 
## 800

11.2 Assign Label ke Full Data (nearest mode)

mean_shift <- function(X, h, max_iter = 100, tol = 1e-4) {
  n <- nrow(X)
  modes <- X
  
  for (iter in 1:max_iter) {
    modes_new <- modes
    for (i in 1:n) {
      dists   <- rowSums((X - matrix(modes[i, ], n, ncol(X), byrow = TRUE))^2)
      weights <- exp(-dists / (2 * h^2))
      modes_new[i, ] <- colSums(X * weights) / sum(weights)
    }
    shift <- max(sqrt(rowSums((modes_new - modes)^2)))
    modes <- modes_new
    if (shift < tol) break
  }
  return(modes)
}

assign_clusters <- function(modes, tol = 0.5) {
  n      <- nrow(modes)
  labels <- rep(0, n)
  k      <- 0
  
  for (i in 1:n) {
    if (labels[i] == 0) {
      k <- k + 1
      labels[i] <- k
    }
    if (i < n) {
      for (j in (i+1):n) {
        if (labels[j] == 0) {
          dist_ij <- sqrt(sum((modes[i, ] - modes[j, ])^2))
          if (dist_ij < tol) labels[j] <- labels[i]
        }
      }
    }
  }
  return(labels)
}

assign_to_mode <- function(X, centers) {
  apply(X, 1, function(xi) {
    dists <- apply(centers, 1, function(m) sqrt(sum((xi - m)^2)))
    which.min(dists)
  })
}

cat("Fungsi mean_shift, assign_clusters, assign_to_mode berhasil didefinisikan.\n")
## Fungsi mean_shift, assign_clusters, assign_to_mode berhasil didefinisikan.
set.seed(42)
samp_ms <- sample(nrow(X_final), min(500, nrow(X_final)))
X_ms    <- as.matrix(X_final[samp_ms, ])

cat("Menjalankan Mean Shift dengan h = 0.8...\n")
## Menjalankan Mean Shift dengan h = 0.8...
final_modes    <- mean_shift(X_ms, h = 0.8, max_iter = 50, tol = 1e-3)
ms_labels_samp <- assign_clusters(final_modes, tol = 0.3)  

n_kluster_ms <- length(unique(ms_labels_samp))
cat("Jumlah kluster ditemukan:", n_kluster_ms, "\n")
## Jumlah kluster ditemukan: 3
print(table(ms_labels_samp))
## ms_labels_samp
##   1   2   3 
## 435  41  24
if (n_kluster_ms == 1) {
  cat("\nMasih 1 kluster, mencoba h = 0.5...\n")
  final_modes    <- mean_shift(X_ms, h = 0.5, max_iter = 50, tol = 1e-3)
  ms_labels_samp <- assign_clusters(final_modes, tol = 0.2)
  n_kluster_ms   <- length(unique(ms_labels_samp))
  cat("Jumlah kluster ditemukan:", n_kluster_ms, "\n")
  print(table(ms_labels_samp))
}

mode_centers <- t(sapply(unique(ms_labels_samp), function(k) {
  colMeans(X_ms[ms_labels_samp == k, , drop = FALSE])
}))

ms_labels_full <- assign_to_mode(as.matrix(X_final), mode_centers)
cat("\nDistribusi label full data:\n")
## 
## Distribusi label full data:
print(table(ms_labels_full))
## ms_labels_full
##     1     2     3 
## 10220  2102  1221

11.3 Evaluasi Internal Mean Shift

set.seed(42)
samp_ms2 <- sample(nrow(X_final), min(2000, nrow(X_final)))

n_kluster_full <- length(unique(ms_labels_full))
cat("Jumlah kluster unik di full data:", n_kluster_full, "\n")
## Jumlah kluster unik di full data: 3
if (n_kluster_full >= 2) {
  sil_ms     <- silhouette(ms_labels_full[samp_ms2], dist(X_final[samp_ms2, ]))
  avg_sil_ms <- mean(sil_ms[, 3])
  
  fviz_silhouette(sil_ms,
                  palette       = "jco",
                  ggtheme       = theme_minimal(base_size = 11),
                  print.summary = FALSE) +
    geom_hline(yintercept = avg_sil_ms, linetype = "dashed",
               color = "#e74c3c", linewidth = 0.8) +
    labs(
      title    = "Silhouette Plot — Mean Shift Clustering",
      subtitle = paste0("Rata-rata Silhouette Score = ", round(avg_sil_ms, 4))
    ) +
    theme(plot.title    = element_text(face = "bold", hjust = 0.5, size = 13),
          plot.subtitle = element_text(hjust = 0.5, color = "gray50"))
  
} else {
  avg_sil_ms <- NA
  cat("Hanya 1 kluster terbentuk.\n")
}

11.4 Visualisasi Mean Shift (PCA 2D)

pca_2d_ms <- as.data.frame(prcomp(X_final, center = TRUE, scale. = TRUE)$x[, 1:2])
pca_2d_ms$Kluster <- factor(paste0("K", ms_labels_full))

warna_ms <- setNames(
  c("#2ecc71","#3498db","#e74c3c","#f39c12","#9b59b6",
    "#1abc9c","#e67e22","#34495e","#16a085","#8e44ad"),
  paste0("K", 1:10)
)

ggplot(pca_2d_ms, aes(x = PC1, y = PC2, color = Kluster)) +
  geom_point(alpha = 0.4, size = 0.8) +
  scale_color_manual(values = warna_ms) +
  labs(
    title    = "Visualisasi Mean Shift Clustering (PCA 2D)",
    subtitle = paste0("Bandwidth h = 1.5 | ",
                      n_kluster_ms, " kluster ditemukan otomatis"),
    color = "Kluster"
  ) +
  theme_minimal(base_size = 12) +
  theme(plot.title    = element_text(face = "bold", hjust = 0.5, size = 13),
        plot.subtitle = element_text(hjust = 0.5, color = "gray50"))

11.5 Visualisasi (PCA 2D)

pca_2d_ms <- as.data.frame(prcomp(X_final, center = TRUE, scale. = TRUE)$x[, 1:2])
pca_2d_ms$Kluster <- factor(paste0("K", ms_labels_full))

warna_ms <- setNames(
  c("#2ecc71","#3498db","#e74c3c","#f39c12","#9b59b6",
    "#1abc9c","#e67e22","#34495e","#16a085","#8e44ad"),
  paste0("K", 1:10)
)

ggplot(pca_2d_ms, aes(x = PC1, y = PC2, color = Kluster)) +
  geom_point(alpha = 0.4, size = 0.8) +
  scale_color_manual(values = warna_ms) +
  labs(
    title    = "Visualisasi Mean Shift Clustering (PCA 2D)",
    subtitle = paste0("Bandwidth h = 1.5"),
    color = "Kluster"
  ) +
  theme_minimal(base_size = 12) +
  theme(plot.title    = element_text(face = "bold", hjust = 0.5, size = 13),
        plot.subtitle = element_text(hjust = 0.5, color = "gray50"))

11.6 Evaluasi Eksternal

purity_ms <- purity_score(y_label, ms_labels_full)
cat("Purity Score Mean Shift:", round(purity_ms, 4),
    paste0("(", round(purity_ms * 100, 2), "%)"), "\n")
## Purity Score Mean Shift: 0.3939 (39.39%)

Tahap 12: Fuzzy C-Means Clustering

##12.1 Training Fuzzy C-means

## 12.1 Training Fuzzy C-Means
library(e1071)

# Definisikan ulang jika session di-restart
k_optimal <- 7

set.seed(42)

fcm_result <- cmeans(
  as.matrix(X_final),
  centers  = k_optimal,
  iter.max = 1000,
  m        = 2,
  dist     = "euclidean",
  method   = "cmeans"
)

fcm_labels     <- fcm_result$cluster
fcm_membership <- fcm_result$membership

cat("Fuzzy C-Means selesai!\n\n")
## Fuzzy C-Means selesai!
cat("Fuzzifier (m)    : 2\n")
## Fuzzifier (m)    : 2
cat("Metrik jarak     : Euclidean\n")
## Metrik jarak     : Euclidean
cat("Jumlah kluster   :", k_optimal, "\n")
## Jumlah kluster   : 7
cat("Withiness (obj)  :", round(fcm_result$withinerror, 4), "\n\n")
## Withiness (obj)  : 0.7123
cat("Distribusi label hard:\n")
## Distribusi label hard:
print(table(fcm_labels))
## fcm_labels
##    1    2    3    4    5    6    7 
## 2702 2747 1208 1489 1824 2307 1266

12.2 Visualisasi Matriks Keanggotaan Fuzzy (200 sampel)

library(tidyverse)

set.seed(42)
samp_fcm <- sample(nrow(fcm_membership), 200)

membership_long <- as.data.frame(fcm_membership[samp_fcm, ]) %>%
  mutate(Observasi = row_number()) %>%
  pivot_longer(-Observasi,
               names_to  = "Kluster",
               values_to = "Keanggotaan") %>%
  mutate(Kluster = paste0("K", Kluster))

ggplot(membership_long,
       aes(x = Observasi, y = Keanggotaan, fill = Kluster)) +
  geom_area(alpha = 0.8, position = "stack") +
  scale_fill_manual(values = c("#2ecc71","#3498db","#e74c3c",
                               "#f39c12","#9b59b6","#1abc9c","#e67e22")) +
  labs(
    title    = "Matriks Keanggotaan Fuzzy C-Means (200 sampel)",
    subtitle = "Setiap observasi memiliki derajat keanggotaan di semua kluster",
    x = "Observasi (sampel)", y = "Derajat Keanggotaan",
    fill = "Kluster"
  ) +
  theme_minimal(base_size = 12) +
  theme(plot.title    = element_text(face = "bold", hjust = 0.5, size = 13),
        plot.subtitle = element_text(hjust = 0.5, color = "gray50"))

12.3 Evaluasi Internal — Silhouette

set.seed(42)
samp_fcm2   <- sample(nrow(X_final), min(2000, nrow(X_final)))
sil_fcm     <- silhouette(fcm_labels[samp_fcm2], dist(X_final[samp_fcm2, ]))
avg_sil_fcm <- mean(sil_fcm[, 3])

fviz_silhouette(sil_fcm,
                palette       = "jco",
                ggtheme       = theme_minimal(base_size = 11),
                print.summary = FALSE) +
  geom_hline(yintercept = avg_sil_fcm, linetype = "dashed",
             color = "#e74c3c", linewidth = 0.8) +
  labs(
    title    = "Silhouette Plot — Fuzzy C-Means Clustering",
    subtitle = paste0("Rata-rata Silhouette Score = ", round(avg_sil_fcm, 4),
                      " | m = 2 (fuzzifier)")
  ) +
  theme(plot.title    = element_text(face = "bold", hjust = 0.5, size = 13),
        plot.subtitle = element_text(hjust = 0.5, color = "gray50"))

12.4 Visualisasi Cluster Plot (PCA 2D)

pca_2d_fcm <- as.data.frame(
  prcomp(X_final, center = TRUE, scale. = TRUE)$x[, 1:2]
)
pca_2d_fcm$Kluster <- factor(paste0("K", fcm_labels))

ggplot(pca_2d_fcm, aes(x = PC1, y = PC2, color = Kluster)) +
  geom_point(alpha = 0.4, size = 0.8) +
  scale_color_manual(values = c("#2ecc71","#3498db","#e74c3c",
                                "#f39c12","#9b59b6","#1abc9c","#e67e22")) +
  stat_ellipse(type = "convex", alpha = 0.12, geom = "polygon",
               aes(fill = Kluster), show.legend = FALSE) +
  scale_fill_manual(values  = c("#2ecc71","#3498db","#e74c3c",
                                "#f39c12","#9b59b6","#1abc9c","#e67e22")) +
  labs(
    title    = "Visualisasi Fuzzy C-Means Clustering (PCA 2D)",
    subtitle = paste0("k = ", k_optimal, " | m = 2 | Label = hard assignment"),
    color = "Kluster"
  ) +
  theme_minimal(base_size = 12) +
  theme(plot.title    = element_text(face = "bold", hjust = 0.5, size = 13),
        plot.subtitle = element_text(hjust = 0.5, color = "gray50"))
## Unrecognized ellipse type
## Unrecognized ellipse type
## Unrecognized ellipse type
## Unrecognized ellipse type
## Unrecognized ellipse type
## Unrecognized ellipse type
## Unrecognized ellipse type

12.5 Heatmap Centroid Fuzzy C-Means

centroid_fcm <- as.data.frame(fcm_result$centers) %>%
  mutate(Kluster = paste0("K", 1:k_optimal)) %>%
  pivot_longer(-Kluster, names_to = "Fitur", values_to = "Nilai")

ggplot(centroid_fcm, aes(x = Fitur, y = Kluster, fill = Nilai)) +
  geom_tile(color = "white", linewidth = 0.5) +
  geom_text(aes(label = round(Nilai, 2)), size = 3.2, fontface = "bold") +
  scale_fill_gradient2(low = "#e74c3c", mid = "white", high = "#3498db",
                       midpoint = 0, name = "Z-score") +
  labs(
    title    = "Profil Centroid Fuzzy C-Means (Z-score)",
    subtitle = "Centroid fuzzy = rata-rata berbobot keanggotaan | Merah = rendah | Biru = tinggi",
    x = "Fitur", y = "Kluster"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    plot.title    = element_text(face = "bold", hjust = 0.5, size = 13),
    plot.subtitle = element_text(hjust = 0.5, color = "gray50"),
    axis.text.x   = element_text(angle = 35, hjust = 1, face = "bold")
  )

12.6 Evaluasi Eksternal — Purity Score

purity_score <- function(labels_true, labels_pred) {
  tbl <- table(labels_true, labels_pred)
  sum(apply(tbl, 2, max)) / length(labels_true)
}

purity_fcm <- purity_score(y_label, fcm_labels)

table(Kluster = fcm_labels, Kelas_Asli = y_label) %>%
  as.data.frame.matrix() %>%
  rownames_to_column("Kluster") %>%
  mutate(Kluster = paste0("K", Kluster)) %>%
  kable(caption = "Tabel Kontingensi — Fuzzy C-Means vs Kelas Asli") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE, font_size = 12) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white")
Tabel Kontingensi — Fuzzy C-Means vs Kelas Asli
Kluster BARBUNYA BOMBAY CALI DERMASON HOROZ SEKER SIRA
K1 35 0 13 1505 16 77 1056
K2 41 0 19 1482 9 114 1082
K3 798 0 70 46 81 18 195
K4 203 147 701 48 278 5 107
K5 184 369 756 0 434 0 81
K6 3 4 2 451 0 1813 34
K7 58 2 69 14 1042 0 81
cat("\nPurity Score Fuzzy C-Means:", round(purity_fcm, 4),
    paste0("(", round(purity_fcm * 100, 2), "%)"), "\n")
## 
## Purity Score Fuzzy C-Means: 0.5979 (59.79%)

12.7 Kesimpulan Fuzzy C-Means

data.frame(
  Aspek  = c("Algoritma", "Pusat Kluster", "Metrik Jarak",
             "Fuzzifier (m)", "Jumlah Kluster",
             "Silhouette Score", "Purity Score", "Status"),
  Hasil  = c("Fuzzy C-Means (cmeans)",
             "Centroid fuzzy (rata-rata berbobot keanggotaan)",
             "Euclidean",
             "2 (standar)",
             paste0(k_optimal, " kluster"),
             round(avg_sil_fcm, 4),
             round(purity_fcm, 4),
             "Selesai — lanjut ke Tahap 13 Perbandingan")
) %>%
  kable(caption = "Kesimpulan Tahap 12: Fuzzy C-Means Clustering") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  row_spec(0, bold = TRUE, background = "#2c3e50", color = "white") %>%
  row_spec(8, background = "#eafaf1", color = "#1e8449")
Kesimpulan Tahap 12: Fuzzy C-Means Clustering
Aspek Hasil
Algoritma Fuzzy C-Means (cmeans)
Pusat Kluster Centroid fuzzy (rata-rata berbobot keanggotaan)
Metrik Jarak Euclidean
Fuzzifier (m) 2 (standar)
Jumlah Kluster 7 kluster
Silhouette Score 0.253
Purity Score 0.5979
Status Selesai — lanjut ke Tahap 13 Perbandingan